Reputation: 13
A data frame has 3 columns
-----------------------------------------
| Id | Country | Date |
-----------------------------------------
The 3 columns record the travel history of the person.
3 more columns need to be created representing the rolling top 3 countries this person (ID) has travelled to the most often before the date on the row.
(If tie appears for 2 countries, the latest travelled country has the precedence.)
mydata <- data.frame(ID = c('A1B1', 'A1B1', 'A1B1', 'A1B1', 'A1B1', 'A1B1', 'A1B1', 'A1B1', 'A2B2', 'A2B2', 'A2B2', 'A2B2', 'A2B2', 'A2B2'),
Country = c('Japan', 'USA', 'USA', 'USA', 'Germany', 'Germany', 'Japan', 'France', 'UK', 'Spain', 'Spain', 'UK', 'UK', 'Brazil'),
Date = as.Date(c('2010/01/02', '2010/04/18', '2011/03/22', '2011/11/23', '2012/05/09', '2012/09/11', '2014/01/06', '2015/12/11', '2010/04/03', '2010/05/11', '2011/05/01', '2012/03/01', '2013/01/03', '2014/01/04')))
# final data should look like below
#ID Country Date Pref1 Pref2 Pref3
#A1B1 Japan 2010-01-02 NA NA NA
#A1B1 USA 2010-04-18 Japan NA NA
#A1B1 USA 2011-03-22 USA Japan NA
#A1B1 USA 2011-11-23 USA Japan NA
#A1B1 Germany 2012-05-09 USA Japan NA
#A1B1 Germany 2012-09-11 USA Germany Japan
#A1B1 Japan 2014-01-06 USA Germany Japan
#A1B1 France 2015-12-11 USA Japan Germany
#A2B2 UK 2010-04-03 NA NA NA
#A2B2 Spain 2010-05-11 UK NA NA
#A2B2 Spain 2011-05-01 Spain UK NA
#A2B2 UK 2012-03-01 Spain UK NA
#A2B2 UK 2013-01-03 UK Spain NA
#A2B2 Brazil 2014-01-04 UK Spain NA
Upvotes: 1
Views: 141
Reputation: 5788
Here's a messy Base R solution:
rlln_rnk_df <- do.call("rbind", lapply(split(mydata, mydata$ID), function(x){
y <- do.call("rbind", lapply(seq_len(nrow(x)), function(i){
tmp <- x[x$Date <= x$Date[i],]
tmp1 <- cbind(head(tmp[order(tmp$Date, decreasing = TRUE),], 1),
rnk = t(names(sort(table(tmp$Country), decreasing = TRUE))))
tmp1 <- setNames(tmp1, c(names(tmp), paste0("rnk.", 1:(ncol(tmp1) - ncol(tmp)))))
tmp1[,setdiff(paste0("rnk.", 1:(length(unique(mydata$Country)))), names(tmp1))] <- NA_character_
tmp1
}
)
)
z <- y[order(y$Date),]
cbind(ID = z$ID, Country = z$Country, Date = z$Date,
z[match(z$Date, z$Date[2:nrow(z)]), (grep("rnk", names(z), value = TRUE))])
}
)
)
df_clean <- data.frame(rlln_rnk_df[, colSums(is.na(rlln_rnk_df)) < nrow(rlln_rnk_df)],
row.names = NULL)
Upvotes: 0
Reputation: 2767
I think this does it. I've included the mydata
here as I think there was a typo in one of the dates.
mydata <- data.frame(ID = c('A1B1', 'A1B1', 'A1B1', 'A1B1', 'A1B1', 'A1B1', 'A1B1', 'A1B1', 'A2B2', 'A2B2', 'A2B2', 'A2B2', 'A2B2', 'A2B2'),
Country = c('Japan', 'USA', 'USA', 'USA', 'Germany', 'Germany', 'Japan', 'France', 'UK', 'Spain', 'Spain', 'UK', 'UK', 'Brazil'),
Date = as.Date(c('2010/01/02', '2010/04/18', '2011/03/22', '2011/11/23', '2012/05/09', '2012/09/11', '2014/01/06', '2015/12/11', '2010/04/03', '2010/05/11', '2011/05/01', '2012/03/01', '2013/01/03', '2014/01/04')))
library(data.table)
setDT(mydata)
mydata[order(Date), `:=`(num_v = seq_len(.N), last_v = Date), .(ID, Country)]
x <- mydata[
mydata[, CJ(Country = unique(Country), Date = unique(Date)), ID],
on=c('ID', 'Country', 'Date'), roll=Inf]
x[, `:=`(num_v = shift(num_v), last_v = shift(last_v)), .(ID, Country)]
x[is.na(num_v), Country := NA]
y <- x[,
.SD[order(-num_v, -last_v)][1:3, .(Pref = paste0('Pref',1:3), Country)],
.(ID, Date)]
dcast(y, ID+Date~Pref, value.var = 'Country')
#> ID Date Pref1 Pref2 Pref3
#> 1: A1B1 2010-01-02 <NA> <NA> <NA>
#> 2: A1B1 2010-04-18 Japan <NA> <NA>
#> 3: A1B1 2011-03-22 USA Japan <NA>
#> 4: A1B1 2011-11-23 USA Japan <NA>
#> 5: A1B1 2012-05-09 USA Japan <NA>
#> 6: A1B1 2012-09-11 USA Germany Japan
#> 7: A1B1 2014-01-06 USA Germany Japan
#> 8: A1B1 2015-12-11 USA Japan Germany
#> 9: A2B2 2010-04-03 <NA> <NA> <NA>
#> 10: A2B2 2010-05-11 UK <NA> <NA>
#> 11: A2B2 2011-05-01 Spain UK <NA>
#> 12: A2B2 2012-03-01 Spain UK <NA>
#> 13: A2B2 2013-01-03 UK Spain <NA>
#> 14: A2B2 2014-01-04 UK Spain <NA>
You can join back on the Country
from the original mydata
if you need it.
Upvotes: 0
Reputation: 388982
Here is a way taking last 3 unique countries at each row for each ID
.
library(dplyr)
mydata %>%
group_by(ID) %>%
mutate(data = purrr::map(row_number(), ~{
un_country <- Country[seq_len(.x - 1)]
if(.x == 1) un_country <- NA
else un_country <- names(sort(table(un_country), decreasing = TRUE))[1:3]
data.frame(t(un_country[1:3]))
})) %>%
tidyr::unnest_wider(data)
# ID Country Date X1 X2 X3
# <chr> <chr> <date> <chr> <chr> <chr>
# 1 A1B1 Japan 2010-01-02 NA NA NA
# 2 A1B1 USA 2010-04-18 Japan NA NA
# 3 A1B1 USA 2011-03-22 Japan USA NA
# 4 A1B1 USA 2011-11-23 USA Japan NA
# 5 A1B1 Germany 2011-05-09 USA Japan NA
# 6 A1B1 Germany 2012-09-11 USA Germany Japan
# 7 A1B1 Japan 2014-01-06 USA Germany Japan
# 8 A1B1 France 2015-12-11 USA Germany Japan
# 9 A2B2 UK 2010-04-03 NA NA NA
#10 A2B2 Spain 2010-05-11 UK NA NA
#11 A2B2 Spain 2011-05-01 Spain UK NA
#12 A2B2 UK 2012-03-01 Spain UK NA
#13 A2B2 UK 2013-01-03 Spain UK NA
#14 A2B2 Brazil 2014-01-04 UK Spain NA
Upvotes: 2
Reputation: 1430
This isn't a super clean answer. Hopefully it helps you gets you close.
library(readr)
df <- readr::read_table(
"ID Country Date
A1B1 Japan 2010-01-02
A1B1 USA 2010-04-18
A1B1 USA 2011-03-22
A1B1 USA 2011-11-23
A1B1 Germany 2012-05-09
A1B1 Germany 2012-09-11
A1B1 Japan 2014-01-06
A1B1 France 2015-12-11
A2B2 UK 2010-04-03
A2B2 Spain 2010-05-11
A2B2 Spain 2011-05-01
A2B2 UK 2012-03-01
A3B2 UK 2013-01-03
A3B2 Brazil 2014-01-04")
df
library(tidyverse)
rankings <- df %>%
group_by(ID, Country) %>%
summarise(obs = n(),
last_dt = max(Date)) %>%
arrange(ID,-obs, desc(last_dt)) %>%
mutate(rank = 1:n()) %>% print() %>%
filter(rank <= 3) %>%
pivot_wider(
names_from = rank,
values_from = Country,
names_prefix = "rank_",
id_cols = ID
) %>% print()
#> `summarise()` regrouping output by 'ID' (override with `.groups` argument)
#> # A tibble: 8 x 5
#> # Groups: ID [3]
#> ID Country obs last_dt rank
#> <chr> <chr> <int> <date> <int>
#> 1 A1B1 USA 3 2011-11-23 1
#> 2 A1B1 Japan 2 2014-01-06 2
#> 3 A1B1 Germany 2 2012-09-11 3
#> 4 A1B1 France 1 2015-12-11 4
#> 5 A2B2 UK 2 2012-03-01 1
#> 6 A2B2 Spain 2 2011-05-01 2
#> 7 A3B2 Brazil 1 2014-01-04 1
#> 8 A3B2 UK 1 2013-01-03 2
#> # A tibble: 3 x 4
#> # Groups: ID [3]
#> ID rank_1 rank_2 rank_3
#> <chr> <chr> <chr> <chr>
#> 1 A1B1 USA Japan Germany
#> 2 A2B2 UK Spain <NA>
#> 3 A3B2 Brazil UK <NA>
df %>% left_join(rankings, by = "ID")
#> # A tibble: 14 x 6
#> ID Country Date rank_1 rank_2 rank_3
#> <chr> <chr> <date> <chr> <chr> <chr>
#> 1 A1B1 Japan 2010-01-02 USA Japan Germany
#> 2 A1B1 USA 2010-04-18 USA Japan Germany
#> 3 A1B1 USA 2011-03-22 USA Japan Germany
#> 4 A1B1 USA 2011-11-23 USA Japan Germany
#> 5 A1B1 Germany 2012-05-09 USA Japan Germany
#> 6 A1B1 Germany 2012-09-11 USA Japan Germany
#> 7 A1B1 Japan 2014-01-06 USA Japan Germany
#> 8 A1B1 France 2015-12-11 USA Japan Germany
#> 9 A2B2 UK 2010-04-03 UK Spain <NA>
#> 10 A2B2 Spain 2010-05-11 UK Spain <NA>
#> 11 A2B2 Spain 2011-05-01 UK Spain <NA>
#> 12 A2B2 UK 2012-03-01 UK Spain <NA>
#> 13 A3B2 UK 2013-01-03 Brazil UK <NA>
#> 14 A3B2 Brazil 2014-01-04 Brazil UK <NA>
Created on 2020-08-29 by the reprex package (v0.3.0)
Upvotes: 0