Reputation: 2101
I have a list of corrupted file-paths where in some cases the last directory got melded in with the name of the file (e.g. instead of DIR/analysis/filename.csv
it's DIR/analysisfilename.csv
). I have created a DF split on /
and need to place extract analysis
from the file-name and place it in the first preceding column that has a NA
value.
Is there a way to do this in R?
Here is a small example with my current input and desired output
library(tidyverse)
dat <- structure(list(a = c("MOX", "MOX", "MOX", "MOX", "MOX",
"MOX", "MOX", "MOX", "MOX", "MOX"),
b = c("FFTF", "FFTF", "FFTF", "FFTF", "JOYO",
"JOYO", "JOYO", "JOYO", "JOYO", "JOYO"),
c = c("FO-2", "FO-2", "FO-2", "FO-2", "MK-I",
"MK-I", "MK-I", "B14", "B14", "B14"),
d = c("L09", "L09", "L09", "L09", NA, NA, NA,
"PTM001", "PTM001", "PTM001"),
e = c(NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_,
NA_character_),
f = c("analysisL09_1D_sampleJ", "analysisL09_1D_sampleH",
"analysisL09_Multi-App_2DRZ", "analysisL09_2DRZ_new_bubble_gb_lim_DiffCoeff4_GrainGrowth",
"analysisMK-I_50MW_new_bubble_gb_lim", "analysisMK-I_75MW_new_bubble_gb_lim",
"analysisMK-I_50MW_old_bubble_gb_lim", "analysisB14_PTM001_1D_sample3_noAm",
"analysisB14_PTM001_1D_sample1", "analysisB14_PTM001_1D_sample2")),
row.names = c(NA, -10L), class = c("tbl_df", "tbl", "data.frame"))
Current Output
dat
#> # A tibble: 10 x 6
#> a b c d e f
#> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 MOX FFTF FO-2 L09 <NA> analysisL09_1D_sampleJ
#> 2 MOX FFTF FO-2 L09 <NA> analysisL09_1D_sampleH
#> 3 MOX FFTF FO-2 L09 <NA> analysisL09_Multi-App_2DRZ
#> 4 MOX FFTF FO-2 L09 <NA> analysisL09_2DRZ_new_bubble_gb_lim_DiffCoeff4…
#> 5 MOX JOYO MK-I <NA> <NA> analysisMK-I_50MW_new_bubble_gb_lim
#> 6 MOX JOYO MK-I <NA> <NA> analysisMK-I_75MW_new_bubble_gb_lim
#> 7 MOX JOYO MK-I <NA> <NA> analysisMK-I_50MW_old_bubble_gb_lim
#> 8 MOX JOYO B14 PTM001 <NA> analysisB14_PTM001_1D_sample3_noAm
#> 9 MOX JOYO B14 PTM001 <NA> analysisB14_PTM001_1D_sample1
#> 10 MOX JOYO B14 PTM001 <NA> analysisB14_PTM001_1D_sample2
dat2 <- structure(list(a = c("MOX", "MOX", "MOX", "MOX", "MOX",
"MOX", "MOX", "MOX", "MOX", "MOX"),
b = c("FFTF", "FFTF", "FFTF", "FFTF", "JOYO",
"JOYO", "JOYO", "JOYO", "JOYO", "JOYO"),
c = c("FO-2", "FO-2", "FO-2", "FO-2", "MK-I",
"MK-I", "MK-I", "B14", "B14", "B14"),
d = c("L09", "L09", "L09", "L09", "analysis", "analysis", "analysis",
"PTM001", "PTM001", "PTM001"),
e = c("analysis", "analysis", "analysis",
"analysis", NA_character_, NA_character_,
NA_character_, "analysis", "analysis",
"analysis"),
f = c("L09_1D_sampleJ", "L09_1D_sampleH",
"L09_Multi-App_2DRZ", "L09_2DRZ_new_bubble_gb_lim_DiffCoeff4_GrainGrowth",
"MK-I_50MW_new_bubble_gb_lim", "MK-I_75MW_new_bubble_gb_lim",
"MK-I_50MW_old_bubble_gb_lim", "B14_PTM001_1D_sample3_noAm",
"B14_PTM001_1D_sample1", "B14_PTM001_1D_sample2")),
row.names = c(NA, -10L), class = c("tbl_df", "tbl", "data.frame"))
Desired Output
dat2
#> # A tibble: 10 x 6
#> a b c d e f
#> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 MOX FFTF FO-2 L09 analysis L09_1D_sampleJ
#> 2 MOX FFTF FO-2 L09 analysis L09_1D_sampleH
#> 3 MOX FFTF FO-2 L09 analysis L09_Multi-App_2DRZ
#> 4 MOX FFTF FO-2 L09 analysis L09_2DRZ_new_bubble_gb_lim_DiffCoeff4_Gr…
#> 5 MOX JOYO MK-I analysis <NA> MK-I_50MW_new_bubble_gb_lim
#> 6 MOX JOYO MK-I analysis <NA> MK-I_75MW_new_bubble_gb_lim
#> 7 MOX JOYO MK-I analysis <NA> MK-I_50MW_old_bubble_gb_lim
#> 8 MOX JOYO B14 PTM001 analysis B14_PTM001_1D_sample3_noAm
#> 9 MOX JOYO B14 PTM001 analysis B14_PTM001_1D_sample1
#> 10 MOX JOYO B14 PTM001 analysis B14_PTM001_1D_sample2
Created on 2020-03-05 by the reprex package (v0.3.0)
Upvotes: 1
Views: 47
Reputation: 30474
Here is another approach. First, would filter to include data that definitively includes analysis
in f
, then remove it. Next would make the data longer. For each filepath, would put analysis
in the first missing NA
value. Finally make wide again.
library(tidyverse)
dat %>%
dplyr::filter(grepl("analysis", f)) %>%
mutate(f = str_remove(f, "analysis")) %>%
mutate(rn = row_number()) %>%
pivot_longer(cols = -rn, names_to = "path", values_to = "value") %>%
group_by(rn) %>%
mutate(value = if_else(min(which(is.na(value))) == row_number(), "analysis", value)) %>%
pivot_wider(id_cols = rn, names_from = path, values_from = value)
In base R you could do something similar like this as well:
dat$f <- sub("analysis", "", dat$f)
mat <- as.matrix(dat)
mat[cbind(seq(nrow(mat)), max.col(is.na(mat), "first"))] <- "analysis"
as.data.frame(mat)
Output
# A tibble: 10 x 7
# Groups: rn [10]
rn a b c d e f
<int> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 MOX FFTF FO-2 L09 analysis L09_1D_sampleJ
2 2 MOX FFTF FO-2 L09 analysis L09_1D_sampleH
3 3 MOX FFTF FO-2 L09 analysis L09_Multi-App_2DRZ
4 4 MOX FFTF FO-2 L09 analysis L09_2DRZ_new_bubble_gb_lim_DiffCoeff4_GrainGrowth
5 5 MOX JOYO MK-I analysis NA MK-I_50MW_new_bubble_gb_lim
6 6 MOX JOYO MK-I analysis NA MK-I_75MW_new_bubble_gb_lim
7 7 MOX JOYO MK-I analysis NA MK-I_50MW_old_bubble_gb_lim
8 8 MOX JOYO B14 PTM001 analysis B14_PTM001_1D_sample3_noAm
9 9 MOX JOYO B14 PTM001 analysis B14_PTM001_1D_sample1
10 10 MOX JOYO B14 PTM001 analysis B14_PTM001_1D_sample2
Upvotes: 1