How to extract numeric data from data with duplicate column names？

Question

There are repeated column names in my original data. I only want to extract the key information, such as "ansemis" and "mafruit", keep one row of column names, and the rest of the rows are the extracted data. However, since my original data is missing some "masec(n)" data, not every column is the same data (as shown in Figure 1).

I want to extract the data in this table as shown in Figure 2.

At the same time, I have 77 csv files of the same format (named 1_modrapport, 2_modrapport, 3_modrapport, 4_modrapport...77_modrapport in sequence). I want to summarize the extraction results of each csv, and insert a vertical row "soil_ref" named with the previous number of *_modrapport 1,2,3,4,5,... 77.

raw_data<-structure(list(P_usm = c("001_Pal_IRR1_N0", "P_usm", "001_Pal_IRR1_N0", 
                             "P_usm", "001_Pal_IRR1_N0", "P_usm", "001_Pal_IRR1_N0", "P_usm", 
                             "001_Pal_IRR1_N0"), wlieu = c("87_073_v3test", "wlieu", "87_073_v3test", 
                                                           "wlieu", "87_073_v3test", "wlieu", "87_073_v3test", "wlieu", 
                                                           "87_073_v3test"), ansemis = c("1980", "ansemis", "1981", "ansemis", 
                                                                                         "1982", "ansemis", "1983", "ansemis", "1984"), CNgrain = c("7.7690000000000001", 
                                                                                                                                                    "CNgrain", "6.4790000000000001", "CNgrain", "7.3739999999999997", 
                                                                                                                                                    "CNgrain", "6.5549999999999997", "CNgrain", "6.5449999999999999"
                                                                                         ), `masec(n)` = c("6.9470000000000001", "masec(n)", "7.7850000000000001", 
                                                                                                           "mafruit", "2.8279999999999998", "mafruit", "3.355", "mafruit", 
                                                                                                           "3.3410000000000002"), mafruit = c("2.3639999999999999", "mafruit", 
                                                                                                                                              "3.1230000000000002", NA, NA, NA, NA, NA, NA)), class = c("tbl_df", 
                                                                                                                                                                                                        "tbl", "data.frame"), row.names = c(NA, -9L))

My code:

library(dplyr)
library(readr)

process_csv_file <- function(file_path) {
  raw_data <- read_csv(file_path, col_types = cols(.default = "c"))
  column_names <- names(raw_data)
  num_rows <- nrow(raw_data)
  data_list <- lapply(seq(1, num_rows, by = 2), function(i) {
    if (i + 1 <= num_rows) {

      data_row <- raw_data[i + 1, ]

      tibble(
        P_usm = data_row$P_usm,
        ansemis = data_row$ansemis,
        mafruit = data_row$mafruit
      )
    }
  })
  
  data_combined <- bind_rows(data_list)
  
  return(data_combined)
}

path <- "C:/MyJavaSTICS/01_grid/Output_Results/MGIPallador_Results"

csv_files <- list.files(path, pattern = "*.csv", full.names = TRUE)

all_data <- lapply(csv_files, process_csv_file)
final_data <- bind_rows(all_data)
final_data_cleaned <- final_data %>%
  mutate(across(everything(), as.character))

print(head(final_data_cleaned))

write_csv(final_data_cleaned, "C:/MyJavaSTICS/01_grid/Output_Results/MGIPallador_Results/combined_data.csv")

This is the final format I want to have, soil_ref will be from 1 to 77 instead of 1 to 3 after summarizing all data together. How should I adjust my code? Thanks in advance for your help!

Jon Spring · Accepted Answer

Here's an approach where I group the rows into pairs, reshape longer, use each pair's header row to specify what column the data belongs in, remove the header rows, and then reshape wide again.

# Add the headers as a first row, so the first group can be 
#   treated the same as the others.
rbind(colnames(raw_data), raw_data) |>

  # Use whatever test is most reliable to distinguish header row vs data row
  mutate(header_row = P_usm == "P_usm", row_grp = cumsum(header_row)) |>

  pivot_longer(-(header_row:row_grp)) |>
  mutate(name = value[header_row], .by = c(row_grp, name)) |>
  filter(!header_row) |>
  pivot_wider(names_from = name, values_from = value)

Result

# A tibble: 5 × 9
  header_row row_grp P_usm           wlieu         ansemis CNgrain            `masec(n)`     mafruit `NA` 
                                                             
1 FALSE            1 001_Pal_IRR1_N0 87_073_v3test 1980    7.7690000000000001 6.94700000000… 2.3639… NA   
2 FALSE            2 001_Pal_IRR1_N0 87_073_v3test 1981    6.4790000000000001 7.78500000000… 3.1230… NA   
3 FALSE            3 001_Pal_IRR1_N0 87_073_v3test 1982    7.3739999999999997 NA             2.8279… NA   
4 FALSE            4 001_Pal_IRR1_N0 87_073_v3test 1983    6.5549999999999997 NA             3.355   NA   
5 FALSE            5 001_Pal_IRR1_N0 87_073_v3test 1984    6.5449999999999999 NA             3.3410… NA

How to extract numeric data from data with duplicate column names？

Answers (2)

Related Questions