Nightowl
Nightowl

Reputation: 101

How to extract numeric data from data with duplicate column names?

There are repeated column names in my original data. I only want to extract the key information, such as "ansemis" and "mafruit", keep one row of column names, and the rest of the rows are the extracted data. However, since my original data is missing some "masec(n)" data, not every column is the same data (as shown in Figure 1).

enter image description here

I want to extract the data in this table as shown in Figure 2.

enter image description here

At the same time, I have 77 csv files of the same format (named 1_modrapport, 2_modrapport, 3_modrapport, 4_modrapport...77_modrapport in sequence). I want to summarize the extraction results of each csv, and insert a vertical row "soil_ref" named with the previous number of *_modrapport 1,2,3,4,5,... 77.

raw_data<-structure(list(P_usm = c("001_Pal_IRR1_N0", "P_usm", "001_Pal_IRR1_N0", 
                             "P_usm", "001_Pal_IRR1_N0", "P_usm", "001_Pal_IRR1_N0", "P_usm", 
                             "001_Pal_IRR1_N0"), wlieu = c("87_073_v3test", "wlieu", "87_073_v3test", 
                                                           "wlieu", "87_073_v3test", "wlieu", "87_073_v3test", "wlieu", 
                                                           "87_073_v3test"), ansemis = c("1980", "ansemis", "1981", "ansemis", 
                                                                                         "1982", "ansemis", "1983", "ansemis", "1984"), CNgrain = c("7.7690000000000001", 
                                                                                                                                                    "CNgrain", "6.4790000000000001", "CNgrain", "7.3739999999999997", 
                                                                                                                                                    "CNgrain", "6.5549999999999997", "CNgrain", "6.5449999999999999"
                                                                                         ), `masec(n)` = c("6.9470000000000001", "masec(n)", "7.7850000000000001", 
                                                                                                           "mafruit", "2.8279999999999998", "mafruit", "3.355", "mafruit", 
                                                                                                           "3.3410000000000002"), mafruit = c("2.3639999999999999", "mafruit", 
                                                                                                                                              "3.1230000000000002", NA, NA, NA, NA, NA, NA)), class = c("tbl_df", 
                                                                                                                                                                                                        "tbl", "data.frame"), row.names = c(NA, -9L))

My code:

library(dplyr)
library(readr)

process_csv_file <- function(file_path) {
  raw_data <- read_csv(file_path, col_types = cols(.default = "c"))
  column_names <- names(raw_data)
  num_rows <- nrow(raw_data)
  data_list <- lapply(seq(1, num_rows, by = 2), function(i) {
    if (i + 1 <= num_rows) {

      data_row <- raw_data[i + 1, ]

      tibble(
        P_usm = data_row$P_usm,
        ansemis = data_row$ansemis,
        mafruit = data_row$mafruit
      )
    }
  })
  
  data_combined <- bind_rows(data_list)
  
  return(data_combined)
}

path <- "C:/MyJavaSTICS/01_grid/Output_Results/MGIPallador_Results"

csv_files <- list.files(path, pattern = "*.csv", full.names = TRUE)

all_data <- lapply(csv_files, process_csv_file)
final_data <- bind_rows(all_data)
final_data_cleaned <- final_data %>%
  mutate(across(everything(), as.character))

print(head(final_data_cleaned))

write_csv(final_data_cleaned, "C:/MyJavaSTICS/01_grid/Output_Results/MGIPallador_Results/combined_data.csv")

This is the final format I want to have, soil_ref will be from 1 to 77 instead of 1 to 3 after summarizing all data together. How should I adjust my code? Thanks in advance for your help!

enter image description here

Upvotes: 1

Views: 75

Answers (2)

LMc
LMc

Reputation: 18712

library(dplyr)
library(janitor)
library(purrr)


split(raw_data[-1, ], rep(seq(nrow(raw_data) / 2), each = 2)) %>%
  map_dfr(\(x) row_to_names(x, 1) %>% remove_empty(which = "cols")) %>%
  bind_rows(raw_data[1,], .) %>% 
  mutate(soil_ref = cumsum(duplicated(ansemis)) + 1, P_usm, ansemis, mafruit, 
         .by = P_usm, .keep = "none") %>% 
  mutate(across(everything(), type.convert))

This assumes that every other row is a header. The last mutate() statement converts columns to their appropriate type, which is why ansemis is integer, mafruit is double, etc.

Output

  P_usm           ansemis mafruit soil_ref
  <chr>             <int>   <dbl>    <int>
1 001_Pal_IRR1_N0    1980    2.36        1
2 001_Pal_IRR1_N0    1981    3.12        1
3 001_Pal_IRR1_N0    1982    2.83        1
4 001_Pal_IRR1_N0    1983    3.36        1
5 001_Pal_IRR1_N0    1984    3.34        1

Upvotes: 1

Jon Spring
Jon Spring

Reputation: 66880

Here's an approach where I group the rows into pairs, reshape longer, use each pair's header row to specify what column the data belongs in, remove the header rows, and then reshape wide again.

# Add the headers as a first row, so the first group can be 
#   treated the same as the others.
rbind(colnames(raw_data), raw_data) |>

  # Use whatever test is most reliable to distinguish header row vs data row
  mutate(header_row = P_usm == "P_usm", row_grp = cumsum(header_row)) |>

  pivot_longer(-(header_row:row_grp)) |>
  mutate(name = value[header_row], .by = c(row_grp, name)) |>
  filter(!header_row) |>
  pivot_wider(names_from = name, values_from = value)

Result

# A tibble: 5 × 9
  header_row row_grp P_usm           wlieu         ansemis CNgrain            `masec(n)`     mafruit `NA` 
  <lgl>        <int> <chr>           <chr>         <chr>   <chr>              <chr>          <chr>   <chr>
1 FALSE            1 001_Pal_IRR1_N0 87_073_v3test 1980    7.7690000000000001 6.94700000000… 2.3639… NA   
2 FALSE            2 001_Pal_IRR1_N0 87_073_v3test 1981    6.4790000000000001 7.78500000000… 3.1230… NA   
3 FALSE            3 001_Pal_IRR1_N0 87_073_v3test 1982    7.3739999999999997 NA             2.8279… NA   
4 FALSE            4 001_Pal_IRR1_N0 87_073_v3test 1983    6.5549999999999997 NA             3.355   NA   
5 FALSE            5 001_Pal_IRR1_N0 87_073_v3test 1984    6.5449999999999999 NA             3.3410… NA 

Upvotes: 1

Related Questions