dylanjm
dylanjm

Reputation: 2101

Place regex match from one column in to first column with NA by row

I have a list of corrupted file-paths where in some cases the last directory got melded in with the name of the file (e.g. instead of DIR/analysis/filename.csv it's DIR/analysisfilename.csv). I have created a DF split on / and need to place extract analysis from the file-name and place it in the first preceding column that has a NA value.

Is there a way to do this in R?

Here is a small example with my current input and desired output

library(tidyverse)

dat <- structure(list(a = c("MOX", "MOX", "MOX", "MOX", "MOX", 
                            "MOX", "MOX", "MOX", "MOX", "MOX"), 
                      b = c("FFTF", "FFTF", "FFTF", "FFTF", "JOYO", 
                            "JOYO", "JOYO", "JOYO", "JOYO", "JOYO"), 
                      c = c("FO-2", "FO-2", "FO-2", "FO-2", "MK-I", 
                            "MK-I", "MK-I", "B14", "B14", "B14"), 
                      d = c("L09", "L09", "L09", "L09", NA, NA, NA, 
                            "PTM001", "PTM001", "PTM001"), 
                      e = c(NA_character_, NA_character_, NA_character_, 
                            NA_character_, NA_character_, NA_character_, 
                            NA_character_, NA_character_, NA_character_, 
                            NA_character_), 
                      f = c("analysisL09_1D_sampleJ", "analysisL09_1D_sampleH", 
                            "analysisL09_Multi-App_2DRZ", "analysisL09_2DRZ_new_bubble_gb_lim_DiffCoeff4_GrainGrowth", 
                            "analysisMK-I_50MW_new_bubble_gb_lim", "analysisMK-I_75MW_new_bubble_gb_lim", 
                            "analysisMK-I_50MW_old_bubble_gb_lim", "analysisB14_PTM001_1D_sample3_noAm", 
                            "analysisB14_PTM001_1D_sample1", "analysisB14_PTM001_1D_sample2")), 
                 row.names = c(NA, -10L), class = c("tbl_df", "tbl", "data.frame"))

Current Output

dat
#> # A tibble: 10 x 6
#>    a     b     c     d      e     f                                             
#>    <chr> <chr> <chr> <chr>  <chr> <chr>                                         
#>  1 MOX   FFTF  FO-2  L09    <NA>  analysisL09_1D_sampleJ                        
#>  2 MOX   FFTF  FO-2  L09    <NA>  analysisL09_1D_sampleH                        
#>  3 MOX   FFTF  FO-2  L09    <NA>  analysisL09_Multi-App_2DRZ                    
#>  4 MOX   FFTF  FO-2  L09    <NA>  analysisL09_2DRZ_new_bubble_gb_lim_DiffCoeff4…
#>  5 MOX   JOYO  MK-I  <NA>   <NA>  analysisMK-I_50MW_new_bubble_gb_lim           
#>  6 MOX   JOYO  MK-I  <NA>   <NA>  analysisMK-I_75MW_new_bubble_gb_lim           
#>  7 MOX   JOYO  MK-I  <NA>   <NA>  analysisMK-I_50MW_old_bubble_gb_lim           
#>  8 MOX   JOYO  B14   PTM001 <NA>  analysisB14_PTM001_1D_sample3_noAm            
#>  9 MOX   JOYO  B14   PTM001 <NA>  analysisB14_PTM001_1D_sample1                 
#> 10 MOX   JOYO  B14   PTM001 <NA>  analysisB14_PTM001_1D_sample2
dat2 <- structure(list(a = c("MOX", "MOX", "MOX", "MOX", "MOX", 
                            "MOX", "MOX", "MOX", "MOX", "MOX"), 
                      b = c("FFTF", "FFTF", "FFTF", "FFTF", "JOYO", 
                            "JOYO", "JOYO", "JOYO", "JOYO", "JOYO"), 
                      c = c("FO-2", "FO-2", "FO-2", "FO-2", "MK-I", 
                            "MK-I", "MK-I", "B14", "B14", "B14"), 
                      d = c("L09", "L09", "L09", "L09", "analysis", "analysis", "analysis", 
                            "PTM001", "PTM001", "PTM001"), 
                      e = c("analysis", "analysis", "analysis", 
                            "analysis", NA_character_, NA_character_, 
                            NA_character_, "analysis", "analysis", 
                            "analysis"), 
                      f = c("L09_1D_sampleJ", "L09_1D_sampleH", 
                            "L09_Multi-App_2DRZ", "L09_2DRZ_new_bubble_gb_lim_DiffCoeff4_GrainGrowth", 
                            "MK-I_50MW_new_bubble_gb_lim", "MK-I_75MW_new_bubble_gb_lim", 
                            "MK-I_50MW_old_bubble_gb_lim", "B14_PTM001_1D_sample3_noAm", 
                            "B14_PTM001_1D_sample1", "B14_PTM001_1D_sample2")), 
                 row.names = c(NA, -10L), class = c("tbl_df", "tbl", "data.frame"))

Desired Output

dat2
#> # A tibble: 10 x 6
#>    a     b     c     d        e        f                                        
#>    <chr> <chr> <chr> <chr>    <chr>    <chr>                                    
#>  1 MOX   FFTF  FO-2  L09      analysis L09_1D_sampleJ                           
#>  2 MOX   FFTF  FO-2  L09      analysis L09_1D_sampleH                           
#>  3 MOX   FFTF  FO-2  L09      analysis L09_Multi-App_2DRZ                       
#>  4 MOX   FFTF  FO-2  L09      analysis L09_2DRZ_new_bubble_gb_lim_DiffCoeff4_Gr…
#>  5 MOX   JOYO  MK-I  analysis <NA>     MK-I_50MW_new_bubble_gb_lim              
#>  6 MOX   JOYO  MK-I  analysis <NA>     MK-I_75MW_new_bubble_gb_lim              
#>  7 MOX   JOYO  MK-I  analysis <NA>     MK-I_50MW_old_bubble_gb_lim              
#>  8 MOX   JOYO  B14   PTM001   analysis B14_PTM001_1D_sample3_noAm               
#>  9 MOX   JOYO  B14   PTM001   analysis B14_PTM001_1D_sample1                    
#> 10 MOX   JOYO  B14   PTM001   analysis B14_PTM001_1D_sample2

Created on 2020-03-05 by the reprex package (v0.3.0)

Upvotes: 1

Views: 47

Answers (1)

Ben
Ben

Reputation: 30474

Here is another approach. First, would filter to include data that definitively includes analysis in f, then remove it. Next would make the data longer. For each filepath, would put analysis in the first missing NA value. Finally make wide again.

library(tidyverse)

dat %>%
  dplyr::filter(grepl("analysis", f)) %>%
  mutate(f = str_remove(f, "analysis")) %>%
  mutate(rn = row_number()) %>%
  pivot_longer(cols = -rn, names_to = "path", values_to = "value") %>%
  group_by(rn) %>%
  mutate(value = if_else(min(which(is.na(value))) == row_number(), "analysis", value)) %>%
  pivot_wider(id_cols = rn, names_from = path, values_from = value)

In base R you could do something similar like this as well:

dat$f <- sub("analysis", "", dat$f)
mat <- as.matrix(dat)
mat[cbind(seq(nrow(mat)), max.col(is.na(mat), "first"))] <- "analysis"
as.data.frame(mat)

Output

# A tibble: 10 x 7
# Groups:   rn [10]
      rn a     b     c     d        e        f                                                
   <int> <chr> <chr> <chr> <chr>    <chr>    <chr>                                            
 1     1 MOX   FFTF  FO-2  L09      analysis L09_1D_sampleJ                                   
 2     2 MOX   FFTF  FO-2  L09      analysis L09_1D_sampleH                                   
 3     3 MOX   FFTF  FO-2  L09      analysis L09_Multi-App_2DRZ                               
 4     4 MOX   FFTF  FO-2  L09      analysis L09_2DRZ_new_bubble_gb_lim_DiffCoeff4_GrainGrowth
 5     5 MOX   JOYO  MK-I  analysis NA       MK-I_50MW_new_bubble_gb_lim                      
 6     6 MOX   JOYO  MK-I  analysis NA       MK-I_75MW_new_bubble_gb_lim                      
 7     7 MOX   JOYO  MK-I  analysis NA       MK-I_50MW_old_bubble_gb_lim                      
 8     8 MOX   JOYO  B14   PTM001   analysis B14_PTM001_1D_sample3_noAm                       
 9     9 MOX   JOYO  B14   PTM001   analysis B14_PTM001_1D_sample1                            
10    10 MOX   JOYO  B14   PTM001   analysis B14_PTM001_1D_sample2 

Upvotes: 1

Related Questions