Bright
Bright

Reputation: 13

How to find rolling top 3 values in a column by group?

A data frame has 3 columns

-----------------------------------------
|    Id    |    Country    |    Date    |
-----------------------------------------

The 3 columns record the travel history of the person.

3 more columns need to be created representing the rolling top 3 countries this person (ID) has travelled to the most often before the date on the row.

(If tie appears for 2 countries, the latest travelled country has the precedence.)

    mydata <- data.frame(ID = c('A1B1', 'A1B1', 'A1B1', 'A1B1', 'A1B1', 'A1B1', 'A1B1', 'A1B1', 'A2B2', 'A2B2', 'A2B2', 'A2B2', 'A2B2', 'A2B2'), 
                         Country = c('Japan', 'USA', 'USA', 'USA', 'Germany', 'Germany', 'Japan', 'France', 'UK', 'Spain', 'Spain', 'UK', 'UK', 'Brazil'), 
                         Date = as.Date(c('2010/01/02', '2010/04/18', '2011/03/22', '2011/11/23', '2012/05/09', '2012/09/11', '2014/01/06', '2015/12/11', '2010/04/03', '2010/05/11', '2011/05/01', '2012/03/01', '2013/01/03', '2014/01/04')))

    # final data should look like below
    
    #ID    Country  Date          Pref1   Pref2   Pref3
    #A1B1  Japan    2010-01-02    NA      NA      NA
    #A1B1  USA      2010-04-18    Japan   NA      NA
    #A1B1  USA      2011-03-22    USA     Japan   NA
    #A1B1  USA      2011-11-23    USA     Japan   NA
    #A1B1  Germany  2012-05-09    USA     Japan   NA
    #A1B1  Germany  2012-09-11    USA     Germany Japan
    #A1B1  Japan    2014-01-06    USA     Germany Japan
    #A1B1  France   2015-12-11    USA     Japan   Germany
    #A2B2  UK       2010-04-03    NA      NA      NA
    #A2B2  Spain    2010-05-11    UK      NA      NA
    #A2B2  Spain    2011-05-01    Spain   UK      NA
    #A2B2  UK       2012-03-01    Spain   UK      NA
    #A2B2  UK       2013-01-03    UK      Spain   NA
    #A2B2  Brazil   2014-01-04    UK      Spain   NA

Q. How to create the last 3 columns for rolling top 3 countries in counts by ID?

Upvotes: 1

Views: 141

Answers (4)

hello_friend
hello_friend

Reputation: 5788

Here's a messy Base R solution:

  rlln_rnk_df <- do.call("rbind", lapply(split(mydata, mydata$ID), function(x){
      y <- do.call("rbind", lapply(seq_len(nrow(x)), function(i){
          tmp <- x[x$Date <= x$Date[i],]
          tmp1 <- cbind(head(tmp[order(tmp$Date, decreasing = TRUE),], 1), 
                        rnk = t(names(sort(table(tmp$Country), decreasing = TRUE))))
          tmp1 <- setNames(tmp1, c(names(tmp), paste0("rnk.", 1:(ncol(tmp1) - ncol(tmp)))))
          tmp1[,setdiff(paste0("rnk.", 1:(length(unique(mydata$Country)))), names(tmp1))] <- NA_character_
          tmp1
          }
        )
      )
      z <- y[order(y$Date),]
      cbind(ID = z$ID, Country = z$Country, Date = z$Date,
                 z[match(z$Date, z$Date[2:nrow(z)]), (grep("rnk", names(z), value = TRUE))])
    }
  )
)
df_clean <- data.frame(rlln_rnk_df[, colSums(is.na(rlln_rnk_df)) < nrow(rlln_rnk_df)],
                       row.names = NULL)

Upvotes: 0

pseudospin
pseudospin

Reputation: 2767

I think this does it. I've included the mydata here as I think there was a typo in one of the dates.

mydata <- data.frame(ID = c('A1B1', 'A1B1', 'A1B1', 'A1B1', 'A1B1', 'A1B1', 'A1B1', 'A1B1', 'A2B2', 'A2B2', 'A2B2', 'A2B2', 'A2B2', 'A2B2'), 
  Country = c('Japan', 'USA', 'USA', 'USA', 'Germany', 'Germany', 'Japan', 'France', 'UK', 'Spain', 'Spain', 'UK', 'UK', 'Brazil'), 
  Date = as.Date(c('2010/01/02', '2010/04/18', '2011/03/22', '2011/11/23', '2012/05/09', '2012/09/11', '2014/01/06', '2015/12/11', '2010/04/03', '2010/05/11', '2011/05/01', '2012/03/01', '2013/01/03', '2014/01/04')))

library(data.table)
setDT(mydata)
mydata[order(Date), `:=`(num_v = seq_len(.N), last_v = Date), .(ID, Country)]
x <- mydata[
  mydata[, CJ(Country = unique(Country), Date = unique(Date)), ID], 
  on=c('ID', 'Country', 'Date'), roll=Inf]
x[, `:=`(num_v = shift(num_v), last_v = shift(last_v)), .(ID, Country)]
x[is.na(num_v), Country := NA]
y <- x[, 
  .SD[order(-num_v, -last_v)][1:3, .(Pref = paste0('Pref',1:3), Country)],
  .(ID, Date)]
dcast(y, ID+Date~Pref, value.var = 'Country')

#>       ID       Date Pref1   Pref2   Pref3
#>  1: A1B1 2010-01-02  <NA>    <NA>    <NA>
#>  2: A1B1 2010-04-18 Japan    <NA>    <NA>
#>  3: A1B1 2011-03-22   USA   Japan    <NA>
#>  4: A1B1 2011-11-23   USA   Japan    <NA>
#>  5: A1B1 2012-05-09   USA   Japan    <NA>
#>  6: A1B1 2012-09-11   USA Germany   Japan
#>  7: A1B1 2014-01-06   USA Germany   Japan
#>  8: A1B1 2015-12-11   USA   Japan Germany
#>  9: A2B2 2010-04-03  <NA>    <NA>    <NA>
#> 10: A2B2 2010-05-11    UK    <NA>    <NA>
#> 11: A2B2 2011-05-01 Spain      UK    <NA>
#> 12: A2B2 2012-03-01 Spain      UK    <NA>
#> 13: A2B2 2013-01-03    UK   Spain    <NA>
#> 14: A2B2 2014-01-04    UK   Spain    <NA>

You can join back on the Country from the original mydata if you need it.

Upvotes: 0

Ronak Shah
Ronak Shah

Reputation: 388982

Here is a way taking last 3 unique countries at each row for each ID.

library(dplyr)

mydata %>%
  group_by(ID) %>%
  mutate(data = purrr::map(row_number(), ~{
    un_country <- Country[seq_len(.x - 1)]
    if(.x == 1) un_country <- NA
    else  un_country <- names(sort(table(un_country), decreasing = TRUE))[1:3]
    data.frame(t(un_country[1:3]))
  })) %>%
  tidyr::unnest_wider(data)
  
#    ID    Country Date       X1    X2      X3   
#   <chr> <chr>   <date>     <chr> <chr>   <chr>
# 1 A1B1  Japan   2010-01-02 NA    NA      NA   
# 2 A1B1  USA     2010-04-18 Japan NA      NA   
# 3 A1B1  USA     2011-03-22 Japan USA     NA   
# 4 A1B1  USA     2011-11-23 USA   Japan   NA   
# 5 A1B1  Germany 2011-05-09 USA   Japan   NA   
# 6 A1B1  Germany 2012-09-11 USA   Germany Japan
# 7 A1B1  Japan   2014-01-06 USA   Germany Japan
# 8 A1B1  France  2015-12-11 USA   Germany Japan
# 9 A2B2  UK      2010-04-03 NA    NA      NA   
#10 A2B2  Spain   2010-05-11 UK    NA      NA   
#11 A2B2  Spain   2011-05-01 Spain UK      NA   
#12 A2B2  UK      2012-03-01 Spain UK      NA   
#13 A2B2  UK      2013-01-03 Spain UK      NA   
#14 A2B2  Brazil  2014-01-04 UK    Spain   NA   

Upvotes: 2

Ryan John
Ryan John

Reputation: 1430

This isn't a super clean answer. Hopefully it helps you gets you close.

library(readr)
df <- readr::read_table(
"ID           Country     Date
A1B1         Japan       2010-01-02
A1B1         USA         2010-04-18
A1B1         USA         2011-03-22
A1B1         USA         2011-11-23
A1B1         Germany     2012-05-09
A1B1         Germany     2012-09-11
A1B1         Japan       2014-01-06
A1B1         France      2015-12-11
A2B2         UK          2010-04-03
A2B2         Spain       2010-05-11
A2B2         Spain       2011-05-01
A2B2         UK          2012-03-01
A3B2         UK          2013-01-03
A3B2         Brazil      2014-01-04")
df 

library(tidyverse)
rankings <- df %>%
  group_by(ID, Country) %>%
  summarise(obs = n(),
            last_dt = max(Date)) %>%
  arrange(ID,-obs, desc(last_dt)) %>%
  mutate(rank = 1:n()) %>% print() %>%
  filter(rank <= 3) %>% 
  pivot_wider(
    names_from = rank,
    values_from = Country,
    names_prefix = "rank_",
    id_cols = ID
  ) %>% print()
#> `summarise()` regrouping output by 'ID' (override with `.groups` argument)
#> # A tibble: 8 x 5
#> # Groups:   ID [3]
#>   ID    Country   obs last_dt     rank
#>   <chr> <chr>   <int> <date>     <int>
#> 1 A1B1  USA         3 2011-11-23     1
#> 2 A1B1  Japan       2 2014-01-06     2
#> 3 A1B1  Germany     2 2012-09-11     3
#> 4 A1B1  France      1 2015-12-11     4
#> 5 A2B2  UK          2 2012-03-01     1
#> 6 A2B2  Spain       2 2011-05-01     2
#> 7 A3B2  Brazil      1 2014-01-04     1
#> 8 A3B2  UK          1 2013-01-03     2
#> # A tibble: 3 x 4
#> # Groups:   ID [3]
#>   ID    rank_1 rank_2 rank_3 
#>   <chr> <chr>  <chr>  <chr>  
#> 1 A1B1  USA    Japan  Germany
#> 2 A2B2  UK     Spain  <NA>   
#> 3 A3B2  Brazil UK     <NA>

df %>% left_join(rankings, by = "ID")
#> # A tibble: 14 x 6
#>    ID    Country Date       rank_1 rank_2 rank_3 
#>    <chr> <chr>   <date>     <chr>  <chr>  <chr>  
#>  1 A1B1  Japan   2010-01-02 USA    Japan  Germany
#>  2 A1B1  USA     2010-04-18 USA    Japan  Germany
#>  3 A1B1  USA     2011-03-22 USA    Japan  Germany
#>  4 A1B1  USA     2011-11-23 USA    Japan  Germany
#>  5 A1B1  Germany 2012-05-09 USA    Japan  Germany
#>  6 A1B1  Germany 2012-09-11 USA    Japan  Germany
#>  7 A1B1  Japan   2014-01-06 USA    Japan  Germany
#>  8 A1B1  France  2015-12-11 USA    Japan  Germany
#>  9 A2B2  UK      2010-04-03 UK     Spain  <NA>   
#> 10 A2B2  Spain   2010-05-11 UK     Spain  <NA>   
#> 11 A2B2  Spain   2011-05-01 UK     Spain  <NA>   
#> 12 A2B2  UK      2012-03-01 UK     Spain  <NA>   
#> 13 A3B2  UK      2013-01-03 Brazil UK     <NA>   
#> 14 A3B2  Brazil  2014-01-04 Brazil UK     <NA>

Created on 2020-08-29 by the reprex package (v0.3.0)

Upvotes: 0

Related Questions