Srm Murty
Srm Murty

Reputation: 135

How to get previous two years max value in R

Hi I have data frame as

How to create column max_value with max from last 2 years max value

dt <- 
structure(list(Name = c("A", "A", "A", "A", "A", "A", "A", "B", 
"B", "B", "B", "B", "B"), year = c(2012L, 2012L, 2013L, 2014L, 
2015L, 2016L, 2017L, 2012L, 2013L, 2013L, 2014L, 2015L, 2016L
), value = c(22L, 99L, 12L, 1L, 23L, 40L, 12L, 12L, 33L, 40L, 
NA, 20L, 20L), max_value = c(NA, NA, 99L, 99L, 12L, 23L, 40L, 
NA, 12L, 12L, 40L, 40L, 20L)), .Names = c("Name", "year", "value", 
"max_value"), row.names = c(NA, -13L), class = c("data.table", 
"data.frame"))

Name   year   value   *max_value*
A      2012    22        NA
A      2012    99        NA
A      2013    12        99
A      2014    01        99
A      2015    23        12
A      2016    40        23
A      2017    12        40
B      2012    12        NA
B      2013    33        12
B      2013    40        12
B      2014    NA        40
B      2015    20        40
B      2016    20        20

Thanks in advance

Upvotes: 4

Views: 410

Answers (6)

shs
shs

Reputation: 3909

The complication here is that the years have different numbers of entries. This can be worked around by grouping, then summarizing to get a within-year maximum. Next use slide_int() to get the max of the two prior years, to ultimately join this yearly data back to the original frame:

library(tidyverse)
library(slider)

sol <- dt %>%
  group_by(Name, year) %>%
  summarize(max_val = max(value, na.rm = T)) %>% 
  mutate(slide_max = slide_int(lag(max_val), max, 
                               .before = 1, na.rm = TRUE, .complete = T)) %>%
  select(-max_val) %>%
  right_join(dt, by = c("Name", "year"))

sol
#> # A tibble: 13 × 5
#> # Groups:   Name [2]
#>    Name   year slide_max value max_value
#>    <chr> <int>     <int> <int>     <int>
#>  1 A      2012        NA    22        NA
#>  2 A      2012        NA    99        NA
#>  3 A      2013        99    12        99
#>  4 A      2014        99     1        99
#>  5 A      2015        12    23        12
#>  6 A      2016        23    40        23
#>  7 A      2017        40    12        40
#>  8 B      2012        NA    12        NA
#>  9 B      2013        12    33        12
#> 10 B      2013        12    40        12
#> 11 B      2014        40    NA        40
#> 12 B      2015        40    20        40
#> 13 B      2016        20    20        20

identical(sol$slide_max, sol$max_value)
#> [1] TRUE

Upvotes: 2

Wimpel
Wimpel

Reputation: 27792

Here is an other data.table approach, using a self-join by .EACHI

library(data.table)
# temporary rowwise id
setDT(dt)[, id := .I]
# set key
setkey(dt, id)
# self join, set infinite values back to NA
dt[dt, max_val2 := {
  dt[Name == i.Name & year >= (i.year - 2) & year < i.year, max(value, na.rm = TRUE)]
}, by = .EACHI][is.infinite(max_val2), max_val2 := NA][, id := NULL]

    Name year value max_value max_val2
 1:    A 2012    22        NA       NA
 2:    A 2012    99        NA       NA
 3:    A 2013    12        99       99
 4:    A 2014     1        99       99
 5:    A 2015    23        12       12
 6:    A 2016    40        23       23
 7:    A 2017    12        40       40
 8:    B 2012    12        NA       NA
 9:    B 2013    33        12       12
10:    B 2013    40        12       12
11:    B 2014    NA        40       40
12:    B 2015    20        40       NA
13:    B 2016    20        20       NA

Upvotes: 2

Eric Fail
Eric Fail

Reputation: 7958

Here's what I got so far. The slider package is par of the tidyverse,

library(slider)
dt %>% group_by(Name) %>% 
       mutate(slide_max = slide_dbl(lag(value), max, .before = 2, na.rm = TRUE, .complete = T))
# A tibble: 13 x 5
# Groups:   Name [2]
   Name   year value max_value slide_max
   <chr> <int> <int>     <int>     <dbl>
 1 A      2012    22        NA        NA
 2 A      2012    99        NA        NA
 3 A      2013    12        99        99
 4 A      2014     1        99        99
 5 A      2015    23        12        99
 6 A      2016    40        23        23
 7 A      2017    12        40        40
 8 B      2012    12        NA        NA
 9 B      2013    33        12        NA
10 B      2013    40        12        33
11 B      2014    NA        40        40
12 B      2015    20        40        40
13 B      2016    20        20        40

Upvotes: 1

Matt Jewett
Matt Jewett

Reputation: 3379

Here is a base R solution using mapply.

df <- data.frame(Name = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "B", "B"),
                 year = c(2012, 2012, 2013, 2014, 2015, 2016, 2017, 2012, 2013, 2013, 2014, 2015, 2016),
                 value = c(22, 99, 12, 1, 23, 40, 12, 12, 33, 40, NA, 20, 20),
                 stringsAsFactors = FALSE)

max.vals <- mapply(function(x, y){
                     vals <- df[df$year %in% c(x-2,x-1) & df$Name == y,"value"]
                     max.val <- ifelse(length(vals) > 0, max(vals, na.rm = TRUE), NA)
                     max.val <- list(y,x,max.val)
                     names(max.val) <- c("Name","year","max_value")
                     return(max.val)
                   },
                  unique(df[,c("Name","year")])$year,
                  unique(df[,c("Name","year")])$Name
                 ) 

max.vals <- as.data.frame(t(max.vals),stringsAsFactors = FALSE)

df <- merge(df, max.vals)

Upvotes: 0

lmo
lmo

Reputation: 38520

Here is a method with data.table using aggregation, a two dimensional shift, apply, and a join.

library(data.table)
dt[dt[, .(mx=max(value)), by=c("Name", "year")
      ][, .(year,
            max_val=apply(matrix(unlist(shift(mx, 1:2)), ncol=2), 1, max, na.rm=TRUE)),
        by=Name],
    on=c("Name", "year")][is.infinite(max_val), max_val := NA][]

The first line calculates the maximum value by year and name. The second line, for each name, the year and using apply the maximum of the two lagged years (using shift(mx, 1:2)) are returned, dropping NA values. This results in warnings for every line that has 2 NA values, and a -Inf is returned in that place. I had to manually convert the output of shift into a matrix, in order to feed it to apply, which is not ideal. The resulting data.table is joined onto the original using name and year as IDs. Finally, the -Inf values are replaced with NA in the final line and the result printed with [].

This returns

    Name year value max_value max_val
 1:    A 2012    22        NA      NA
 2:    A 2012    99        NA      NA
 3:    A 2013    12        99      99
 4:    A 2014     1        99      99
 5:    A 2015    23        12      12
 6:    A 2016    40        23      23
 7:    A 2017    12        40      40
 8:    B 2012    12        NA      NA
 9:    B 2013    33        12      12
10:    B 2013    40        12      12
11:    B 2014    NA        40      40
12:    B 2015    20        40      40
13:    B 2016    20        20      20

data

dt <- 
structure(list(Name = c("A", "A", "A", "A", "A", "A", "A", "B", 
"B", "B", "B", "B", "B"), year = c(2012L, 2012L, 2013L, 2014L, 
2015L, 2016L, 2017L, 2012L, 2013L, 2013L, 2014L, 2015L, 2016L
), value = c(22L, 99L, 12L, 1L, 23L, 40L, 12L, 12L, 33L, 40L, 
NA, 20L, 20L), max_value = c(NA, NA, 99L, 99L, 12L, 23L, 40L, 
NA, 12L, 12L, 40L, 40L, 20L)), .Names = c("Name", "year", "value", 
"max_value"), row.names = c(NA, -13L), class = c("data.table", 
"data.frame"))

Upvotes: 1

Matt
Matt

Reputation: 994

Use by:

> by(dat$value, dat$year, function(x) max(x))
dat$year: 2012
[1] 99
------------------------------------------------------------ 
dat$year: 2013
[1] 40
------------------------------------------------------------ 
dat$year: 2014
[1] NA
------------------------------------------------------------ 
dat$year: 2015
[1] 23
------------------------------------------------------------ 
dat$year: 2016
[1] 40
------------------------------------------------------------ 
dat$year: 2017
[1] 12

EDIT: Misunderstood the question at first. This should be what you want:

Assign the results to a data.frame:

> dat1=by(dat$value, dat$year, function(x) max(x))
> data.frame("max"=dat1[1:length(dat1)])
     max
2012  99
2013  40
2014  NA
2015  23
2016  40
2017  12

Make a new data frame to hold biannual maximum and loop to compare years:

bi_max=data.frame("max"=nrow(dat_max))
for(i in 1:nrow(dat_max)){
  bi_max[i,]=max(dat_max$max[i], dat_max$max[i-1], na.rm=T)
}
rownames(bi_max)=rownames(dat_max)

The final result:

> bi_max
     max
2012  99
2013  99
2014  40
2015  23
2016  40
2017  40

Upvotes: -1

Related Questions