BleepBloop
BleepBloop

Reputation: 29

R: Make a new column filled from value in a row

I have multiple dataframes all structured as below but with thousands of observations.

(df <- data.frame(
col1 = c("Elem_A", "String", "String", "String", "Elem_A", "String", "String", "Elem_A", "String", "String", "String", "String"), 
col2 = c("DOI_1", "String", "String", "String", "DOI_2", "String", "String", "DOI_3", "String", "String", "String", "String")))

#>     col1   col2
#> 1 Elem_A  DOI_1
#> 2 String String
#> 3 String String
#> 4 String String
#> 5 Elem_A  DOI_2
#> 6 String String
#> 7 String String
#> 8 Elem_A  DOI_3
#> 9 String String
#> 10 String String
#> 11 String String
#> 12 String String

I am wanting to structure it as below, pulling specifically the value that begins with "DOI" into a new column and filling that value down until it reaches the row with the next "DOI" value.

(df <- data.frame(
col1 = c("Elem_A", "String", "String", "String", 
    "Elem_A", "String", "String", "Elem_A", "String", "String", "String", "String", "String", "String", "String"), 
col2 = c("DOI_1", 
    "String", "String", "String", "DOI_2", "String", "String", 
    "DOI_3", "String", "String", "String", "String", "String", "String", "String"), 
col3 = c("DOI_1", "DOI_1", "DOI_1", "DOI_1", 
    "DOI_2", "DOI_2", "DOI_2", "DOI_3", "DOI_3", "DOI_3", "DOI_3", "DOI_3")))

#>     col1   col2  col3
#> 1 Elem_A  DOI_1 DOI_1
#> 2 String String DOI_1
#> 3 String String DOI_1
#> 4 String String DOI_1
#> 5 Elem_A  DOI_2 DOI_2
#> 6 String String DOI_2
#> 7 String String DOI_2
#> 8 Elem_A  DOI_3 DOI_3
#> 9 String String DOI_3
#> 10 String String DOI_3
#> 11 String String DOI_3
#> 12 String String DOI_3

I was thinking I should somehow incorporate str_detect but the issue is that sometimes "DOI" is also the beginning of some of the "Strings" values within the same column that the "DOI" values are in.

Upvotes: 1

Views: 74

Answers (2)

Mohamed Desouky
Mohamed Desouky

Reputation: 4425

With Base R way

s <- unlist(gregexpr("DOI_\\d+" , df$col2))

df$col3 <- unlist(Map(\(x,y) rep(df$col2[x] ,length.out = y + 1) ,
                  which(s > -1) , rle(s)$lengths[which(rle(s)$values == -1)]))
  • output
     col1   col2  col3
1  Elem_A  DOI_1 DOI_1
2  String String DOI_1
3  String String DOI_1
4  String String DOI_1
5  Elem_A  DOI_2 DOI_2
6  String String DOI_2
7  String String DOI_2
8  Elem_A  DOI_3 DOI_3
9  String String DOI_3
10 String String DOI_3
11 String String DOI_3
12 String String DOI_3

Upvotes: 0

akrun
akrun

Reputation: 887221

We can use str_detect with case_when/ifelse to retrieve the row element and then use fill to fill the NA values with the previous non-NA

library(dplyr)
library(tidyr)
library(stringr)
df <- df %>%
   mutate(col3 = case_when(str_detect(col2, "DOI_") ~ col2)) %>% 
   fill(col3)

-output

df
 col1   col2  col3
1  Elem_A  DOI_1 DOI_1
2  String String DOI_1
3  String String DOI_1
4  String String DOI_1
5  Elem_A  DOI_2 DOI_2
6  String String DOI_2
7  String String DOI_2
8  Elem_A  DOI_3 DOI_3
9  String String DOI_3
10 String String DOI_3
11 String String DOI_3
12 String String DOI_3

If 'DOI_\\d+' is a substring, then use str_extract to extract the substring

df <- df %>%
        mutate(col3 = str_extract(col2, "DOI_\\d+")) %>%
        fill(col3)

-output

df
 col1   col2  col3
1  Elem_A  DOI_1 DOI_1
2  String String DOI_1
3  String String DOI_1
4  String String DOI_1
5  Elem_A  DOI_2 DOI_2
6  String String DOI_2
7  String String DOI_2
8  Elem_A  DOI_3 DOI_3
9  String String DOI_3
10 String String DOI_3
11 String String DOI_3
12 String String DOI_3

Upvotes: 1

Related Questions