Reputation: 684

Accumulate values from one column based on keys of another column in r

I have a dataframe where I need to create a new column of values using the strings in Issue_key column and form the strings of values from the Issue_description column.

Issue_Description_needed is what I am seeking. I have used the lag and summarise functions from dplyr but it's not the correct approach.

input <- data.frame(
  
  Issue_Key = c("KLM391",
                "KLM391.02",
                "KLM391.01",
                "KLM391.04",
                "KLM391.01.0999993",
                "KLM391.01.0999993.898",
                "KLM391.01.0999993.898.asds",
                "KLM391.01.0999993.898.tyut",
                "KLM391.02.0999993",
                "KLM391.02.0999996",
                "KLM391.04.0999991",
                "KLM391.04.0999998"),
  
  Issue_description = c("L",
                        "M2",
                        "M1",
                        "M4",
                        "O123",
                        "P1234",
                        "Q12345",
                        "Q67809",
                        "XYq12",
                        "46525",
                        "4hrh4",
                        "fg785"),
  
  Issue_description_needed = c(
    "L",
    "L.M2",
    "L.M1",
    "L.M4",
    "L.M1.O123",
    "L.M1.O123.P1234",
    "L.M1.O123.P1234.Q12345",
    "L.M1.O123.P1234.Q67809",
    "L.M2.XYq12",
    "L.M2.46525",
    "L.M4.4hrh4",
    "L.M4.fg785"
  )
)

Output:

                    Issue_Key Issue_description    Issue_description_needed
1                      KLM391                 L                           L
2                   KLM391.01                M1                        L.M1
3           KLM391.01.0999993              O123                   L.M1.O123
4       KLM391.01.0999993.898             P1234             L.M1.O123.P1234
5  KLM391.01.0999993.898.asds            Q12345      L.M1.O123.P1234.Q12345
6  KLM391.01.0999993.898.tyut            Q67809      L.M1.O123.P1234.Q67809
7                   KLM391.02                M2                        L.M2
8           KLM391.02.0999993             XYq12                  L.M2.XYq12
9           KLM391.02.0999996             46525                  L.M2.46525
10                  KLM391.04                M4                        L.M4
11          KLM391.04.0999991             4hrh4                  L.M4.4hrh4
12          KLM391.04.0999998             fg785                  L.M4.fg785

I have searched the forum with best of my knowledge:

Upvotes: 1

Answers (3)

CaseebRamos

Reputation: 684

I used @r2evans response and formulated this answer which solves my problem at a considerable speed but doesn't solve fully (continue question here Having issues accumulating values from one column based on keys in another column):

fun <- function(st) strcapture("(.*)[.][^.]+$", st, list(L=""))$L

input <- input %>%
  mutate(K = fun(Issue_Key))



while (TRUE) {
  input <- left_join(input, select(input, Issue_Key, iss = Issue_description), by = c("K" = "Issue_Key")) %>%
    mutate(
      Issue_description= if_else(is.na(iss), Issue_description,
                          if_else(Issue_Key == K, Issue_description, paste(iss, Issue_description, sep = "."))),
      K = fun(K)) %>%
    select(-iss)
  if (all(is.na(input$K))) break
}



input$K <- NULL



# Update the PROJ_NAME column to include repeated parts for each unique PROJ_NAME
input$Issue_description <- sapply(strsplit(as.character(input$Issue_description), "\\."), function(x) {
  unique_parts <- unique(x)
  paste(unique_parts, collapse = ".")
})



# Print the updated dataframe
print(input)

1                      KLM391                      L
2                   KLM391.02                   L.M2
3                   KLM391.01                   L.M1
4                   KLM391.04                   L.M4
5           KLM391.01.0999993              L.M1.O123
6       KLM391.01.0999993.898        L.M1.O123.P1234
7  KLM391.01.0999993.898.asds L.M1.O123.P1234.Q12345
8  KLM391.01.0999993.898.tyut L.M1.O123.P1234.Q67809
9           KLM391.02.0999993             L.M2.XYq12
10          KLM391.02.0999996             L.M2.46525
11          KLM391.04.0999991             L.M4.4hrh4
12          KLM391.04.0999998             L.M4.fg785

Upvotes: 0

Onyambu

Reputation: 79208

You could write a function to do the substitution/replacement. Since the replacement is non-unique ie 0999993 is to be replaced with both O123 and XYq12 we write two functions.

my_sub <- function(x, y){
  fn <- function(x, y){
    idx <- !grepl(".", x, fixed = TRUE)
    if(!is.null(nms<-names(x))) idx <- idx & is.na(nms) 
    if(any(idx)){
      names(x)[idx] <- y[idx]
      x[!idx] <- Reduce(\(i,j)gsub(j, '', i, fixed = TRUE), x[idx], init = x[!idx])
      x[!idx] <- trimws(x[!idx], whitespace = '[.]')
      Recall(x, y)
    }
    else x
  }
  r <- fn(x, y)
  z <- mapply(gsub, r, names(r), x, USE.NAMES = FALSE)
  Reduce(\(i,j)gsub(r[j], y[j], i), seq(x), init=z)
}

cbind(input[1:2],new = my_sub(input[,1], input[, 2]))
#>                     Issue_Key Issue_description                    new
#> 1                      KLM391                 L                      L
#> 2                   KLM391.02                M2                   L.M2
#> 3                   KLM391.01                M1                   L.M1
#> 4                   KLM391.04                M4                   L.M4
#> 5           KLM391.01.0999993              O123              L.M1.O123
#> 6       KLM391.01.0999993.898             P1234        L.M1.O123.P1234
#> 7  KLM391.01.0999993.898.asds            Q12345 L.M1.O123.P1234.Q12345
#> 8  KLM391.01.0999993.898.tyut            Q67809 L.M1.O123.P1234.Q67809
#> 9           KLM391.02.0999993             XYq12             L.M2.XYq12
#> 10          KLM391.02.0999996             46525             L.M2.46525
#> 11          KLM391.04.0999991             4hrh4             L.M4.4hrh4
#> 12          KLM391.04.0999998             fg785             L.M4.fg785

^{Created on 2023-07-11 with reprex v2.0.2}

Upvotes: 1

LMc

Reputation: 18632

Using some tidyverse tools:

library(dplyr)
library(stringr)
library(purrr)

input |>
  mutate(split = str_split(Issue_Key, "\\."),
         needed = map(split, ~ str_flatten(Issue_description[match(accumulate(.x, paste, sep = "."), Issue_Key)], "."))) |>
  select(-split)

How it works

First we create a new list column split that splits up Issue_Key delimited by ".".
We iterate over this list column using map and for each row we do the following. Using row 5 as an example:

.x on row 5 resolves to c("KLM391", "01", "0999993") and we use accumulate to paste this into these substrings: c("KLM391", "KLM391.01", "KLM391.01.0999993").
Next we use match to get the row indicies in Issue_Key that matches c("KLM391", "KLM391.01", "KLM391.01.0999993"), which would be c(1, 3, 5).
Finally, we use these indices to simply look up Issue_description (e.g., Issue_description[c(1, 3, 5)]) and collapse everything into one string using str_flatten again delimited by ".".

Output

                    Issue_Key Issue_description Issue_description_needed                 needed
1                      KLM391                 L                        L                      L
2                   KLM391.02                M2                     L.M2                   L.M2
3                   KLM391.01                M1                     L.M1                   L.M1
4                   KLM391.04                M4                     L.M4                   L.M4
5           KLM391.01.0999993              O123                L.M1.O123              L.M1.O123
6       KLM391.01.0999993.898             P1234          L.M1.O123.P1234        L.M1.O123.P1234
7  KLM391.01.0999993.898.asds            Q12345   L.M1.O123.P1234.Q12345 L.M1.O123.P1234.Q12345
8  KLM391.01.0999993.898.tyut            Q67809   L.M1.O123.P1234.Q67809 L.M1.O123.P1234.Q67809
9           KLM391.02.0999993             XYq12               L.M2.XYq12             L.M2.XYq12
10          KLM391.02.0999996             46525               L.M2.46525             L.M2.46525
11          KLM391.04.0999991             4hrh4               L.M4.4hrh4             L.M4.4hrh4
12          KLM391.04.0999998             fg785               L.M4.fg785             L.M4.fg785

Upvotes: 2

Accumulate values from one column based on keys of another column in r

Answers (3)

Related Questions