Aravindh Rajan
Aravindh Rajan

Reputation: 109

create a new column in R by summing up specific columns based on multiple conditions

I am trying to create a new column called period from the dataframe below

structure(list(fw01 = c(21, 21, 22.8, 21.4, 18.7, 18.1, 14.3, 
24.4, 22.8, 19.2, 17.8, 16.4, 17.3, 15.2, 10.4, 10.4, 14.7, 32.4, 
30.4, 33.9, 21.5, 15.5, 15.2, 13.3, 19.2, 27.3, 26, 30.4, 15.8, 
19.7, 15, 21.4), fw02 = c(6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 
8, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 8, 6, 8, 4), 
   fw03 = c(160, 160, 108, 258, 360, 225, 360, 146.7, 140.8, 
   167.6, 167.6, 275.8, 275.8, 275.8, 472, 460, 440, 78.7, 75.7, 
   71.1, 120.1, 318, 304, 350, 400, 79, 120.3, 95.1, 351, 145, 
   301, 121), fw04 = c(110, 110, 93, 110, 175, 105, 245, 62, 
   95, 123, 123, 180, 180, 180, 205, 215, 230, 66, 52, 65, 97, 
   150, 150, 245, 175, 66, 91, 113, 264, 175, 335, 109), fw05 = c(3.9, 
   3.9, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92, 
   3.07, 3.07, 3.07, 2.93, 3, 3.23, 4.08, 4.93, 4.22, 3.7, 2.76, 
   3.15, 3.73, 3.08, 4.08, 4.43, 3.77, 4.22, 3.62, 3.54, 4.11
   ), fw06 = c(2.62, 2.875, 2.32, 3.215, 3.44, 3.46, 3.57, 3.19, 
   3.15, 3.44, 3.44, 4.07, 3.73, 3.78, 5.25, 5.424, 5.345, 2.2, 
   1.615, 1.835, 2.465, 3.52, 3.435, 3.84, 3.845, 1.935, 2.14, 
   1.513, 3.17, 2.77, 3.57, 2.78), fw07 = c(16.46, 17.02, 18.61, 
   19.44, 17.02, 20.22, 15.84, 20, 22.9, 18.3, 18.9, 17.4, 17.6, 
   18, 17.98, 17.82, 17.42, 19.47, 18.52, 19.9, 20.01, 16.87, 
   17.3, 15.41, 17.05, 18.9, 16.7, 16.9, 14.5, 15.5, 14.6, 18.6
   ), fw08 = c(0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 
   0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1), fw09 = c(1, 
   1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 
   0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1), fw10 = c(4, 4, 4, 3, 
   3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 
   3, 3, 4, 5, 5, 5, 5, 5, 4), fw11 = c(4, 4, 1, 1, 2, 1, 4, 
   2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2, 2, 4, 2, 1, 
   2, 2, 4, 6, 8, 2)), class = "data.frame", row.names = c(NA, 
-32L))

these are the periods that I have defined based on the requirement

p1 <- c("fw01","fw02","fw03","fw04")
p2 <- c("fw05","fw06","fw07","fw08")
p3 <- c("fw09","fw10","fw11","fw12","fw13")
p4 <- c("fw14","fw15","fw16","fw17")
p5 <- c("fw18","fw19","fw20","fw21")
p6 <- c("fw22","fw23","fw24","fw25","fw26")
p7 <- c("fw27","fw28","fw29","fw30")
p8 <- c("fw31","fw32","fw33","fw34")
p9 <- c("fw35","fw36","fw37","fw38","fw39")
p10 <- c("fw40","fw41","fw42","fw43")
p11 <- c("fw44","fw45","fw46","fw47")
p12 <- c("fw48","fw49","fw50","fw51","fw52")

my requirement is that the period column should be a summation of the weeks of 1st period i.e fw01+fw02+fw03+fw04 if the week number in the last column of the table belongs belongs to 2nd period and so on. In the above example, the last column is week 11(fw11) that belongs to 3rd period which is p3 as per the vectors defined. in this case i want the period column to be a summation of the weeks in period2 which is fw05+fw06+fw07+fw08. For this I coded 2 different logic

1st approach is using dplyr::case_when

dplyr::mutate(df2,
                     prev_per = case_when(rev(names(df2))[1] %in% p2 ~ fw01+fw02+fw03+fw04,
                                        rev(names(df2))[1] %in% p3 ~ fw05+fw06+fw07+fw08,
                                        rev(names(df2))[1] %in% p4 ~ fw09+fw10+fw11+fw12+fw13,
                                        rev(names(df2))[1] %in% p5 ~ fw14+fw15+fw16+fw17,
                                        rev(names(df2))[1] %in% p6 ~ fw18+fw19+fw20+fw21,
                                        rev(names(df2))[1] %in% p7 ~ fw22+fw23+fw24+fw25+fw26,
                                        rev(names(df2))[1] %in% p8 ~ fw27+fw28+fw29+fw30,
                                        rev(names(df2))[1] %in% p9 ~ fw31+fw32+fw33+fw34,
                                        rev(names(df2))[1] %in% p10 ~ fw35+fw36+fw37+fw38+fw39,
                                        rev(names(df2))[1] %in% p11 ~ fw40+fw41+fw42+fw43,
                                        rev(names(df2))[1] %in% p12 ~ fw44+fw45+fw46+fw47))

the problem with the above is, the other columns are not there in the data frame at all. ideally the condition gets satisfied in the 2nd case_when itself, but the operation is not breaking which leads to an error saying fw12 not found

sample of the 2nd approach is using ifelse function from base package.

df7<- dplyr::mutate(df2,
                    prev_per = ifelse(rev(names(df2))[1] %in% p2, fw01+fw02+fw03+fw04,
                               ifelse(rev(names(df2))[1] %in% p3, fw05+fw06+fw07+fw08,
                               ifelse(rev(names(df2))[1] %in% p4, fw09+fw10+fw11+fw12+fw13))))

here, the operation is breaking properly when the condition is satisfied, but its returning the same number in all the rows of the column period as below

structure(list(fw01 = c(21, 21, 22.8, 21.4, 18.7, 18.1, 14.3, 
24.4, 22.8, 19.2, 17.8, 16.4, 17.3, 15.2, 10.4, 10.4, 14.7, 32.4, 
30.4, 33.9, 21.5, 15.5, 15.2, 13.3, 19.2, 27.3, 26, 30.4, 15.8, 
19.7, 15, 21.4), fw02 = c(6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 
8, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 8, 6, 8, 4), 
    fw03 = c(160, 160, 108, 258, 360, 225, 360, 146.7, 140.8, 
    167.6, 167.6, 275.8, 275.8, 275.8, 472, 460, 440, 78.7, 75.7, 
    71.1, 120.1, 318, 304, 350, 400, 79, 120.3, 95.1, 351, 145, 
    301, 121), fw04 = c(110, 110, 93, 110, 175, 105, 245, 62, 
    95, 123, 123, 180, 180, 180, 205, 215, 230, 66, 52, 65, 97, 
    150, 150, 245, 175, 66, 91, 113, 264, 175, 335, 109), fw05 = c(3.9, 
    3.9, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92, 
    3.07, 3.07, 3.07, 2.93, 3, 3.23, 4.08, 4.93, 4.22, 3.7, 2.76, 
    3.15, 3.73, 3.08, 4.08, 4.43, 3.77, 4.22, 3.62, 3.54, 4.11
    ), fw06 = c(2.62, 2.875, 2.32, 3.215, 3.44, 3.46, 3.57, 3.19, 
    3.15, 3.44, 3.44, 4.07, 3.73, 3.78, 5.25, 5.424, 5.345, 2.2, 
    1.615, 1.835, 2.465, 3.52, 3.435, 3.84, 3.845, 1.935, 2.14, 
    1.513, 3.17, 2.77, 3.57, 2.78), fw07 = c(16.46, 17.02, 18.61, 
    19.44, 17.02, 20.22, 15.84, 20, 22.9, 18.3, 18.9, 17.4, 17.6, 
    18, 17.98, 17.82, 17.42, 19.47, 18.52, 19.9, 20.01, 16.87, 
    17.3, 15.41, 17.05, 18.9, 16.7, 16.9, 14.5, 15.5, 14.6, 18.6
    ), fw08 = c(0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 
    0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1), fw09 = c(1, 
    1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 
    0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1), fw10 = c(4, 4, 4, 3, 
    3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 
    3, 3, 4, 5, 5, 5, 5, 5, 4), fw11 = c(4, 4, 1, 1, 2, 1, 4, 
    2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2, 2, 4, 2, 1, 
    2, 2, 4, 6, 8, 2), prev_per = c(22.98, 22.98, 22.98, 22.98, 
    22.98, 22.98, 22.98, 22.98, 22.98, 22.98, 22.98, 22.98, 22.98, 
    22.98, 22.98, 22.98, 22.98, 22.98, 22.98, 22.98, 22.98, 22.98, 
    22.98, 22.98, 22.98, 22.98, 22.98, 22.98, 22.98, 22.98, 22.98, 
    22.98)), class = "data.frame", row.names = c(NA, -32L))

Please suggest how to fix this issue.

Upvotes: 1

Views: 66

Answers (2)

AHart
AHart

Reputation: 448

One, and hopefully more general, alternative is to reshape the data and sum within a column identified by the period. Here I do that by creating a key, merging it with the reshaped data, and then summing within the period that has the highest 'cutpoint'.

# Create a 'period' key
  pkey <-
    tibble(
      perID = c('p1','p2','p3','p4'),
      varlist = list(
        c("fw01","fw02","fw03","fw04"),
        c("fw05","fw06","fw07","fw08"),
        c("fw09","fw10","fw11","fw12","fw13"),
        c("fw14","fw15","fw16","fw17")
      )
    ) %>%
    unnest(cols = varlist) %>%
    # now create end-of-period indicator
    group_by(perID) %>%
    mutate(
      cutpt = if_else(varlist == max(varlist), 1,0)
    )

# Period sums w/i rows
  df %>%
    # unique ids
    rownames_to_column('rID') %>%
    # stack the data
    pivot_longer(
      -rID,
      names_to = 'varlist'
    ) %>%
    # merge the period key
    left_join(pkey) %>%
    group_by(rID) %>%
    # period as sum in period w/last cut point
    mutate(
      period = sum(value[perID == max(perID[cutpt == 1])])
    ) %>%
    # clean and widen
    select(rID,varlist,value,period) %>%
    pivot_wider(
      id_cols = c(rID,period),
      names_from = varlist,
      values_from = value
    ) 

Longer than the prior solution, but perhaps more widely applicable.

Upvotes: 0

AHart
AHart

Reputation: 448

For a truly awful solution, you can take advantage of the fact that case_when ignores NULL inputs and if without an else will return NULL.

So move the conditions in an if and follow with TRUE to force the evaluation.

df %>%
  mutate(
    prev_per = case_when(
      if (rev(names(df))[1] %in% p2) TRUE ~ fw01+fw02+fw03+fw04,
      if (rev(names(df))[1] %in% p3) TRUE ~ fw05+fw06+fw07+fw08,
      if (rev(names(df))[1] %in% p4) TRUE ~ fw09+fw10+fw11+fw12+fw13,
      if (rev(names(df))[1] %in% p5) TRUE ~ fw14+fw15+fw16+fw17,
      if (rev(names(df))[1] %in% p6) TRUE ~ fw18+fw19+fw20+fw21
    )
  )

I hate this answer, but it seems to work. Output:

   fw01 fw02  fw03 fw04 fw05  fw06  fw07 fw08 fw09 fw10 fw11 prev_per
1  21.0    6 160.0  110 3.90 2.620 16.46    0    1    4    4   22.980
2  21.0    6 160.0  110 3.90 2.875 17.02    0    1    4    4   23.795
3  22.8    4 108.0   93 3.85 2.320 18.61    1    1    4    1   25.780
4  21.4    6 258.0  110 3.08 3.215 19.44    1    0    3    1   26.735
5  18.7    8 360.0  175 3.15 3.440 17.02    0    0    3    2   23.610

Upvotes: 1

Related Questions