megmac
megmac

Reputation: 509

Combining columns based on partial column name

I was hoping there would be an easier/faster/cleaner way of doing what I want because right now this is super convoluted:

I have column names of :

"OTS_SM0_1","OTS_SM0_2","OTS_SM0_3","OTS_SM0_4","OTS_SM0_5","OTS_SM0_6",
"OTS_SM0_7","OTS_SM0_8","OTS_SM0_9",
"OTS_SM1_x1_4","OTS_SM1_x1_6","OTS_SM1_x1_7","OTS_SM1_x1_8",
"OTS_SM1_x2_4","OTS_SM1_x2_6","OTS_SM1_x2_7","OTS_SM1_x2_8",
"OTS_SM1_x3_4","OTS_SM1_x3_6","OTS_SM1_x3_7","OTS_SM1_x3_8",
"OTS_SM1_x4_4","OTS_SM1_x4_6","OTS_SM1_x4_7","OTS_SM1_x4_8",
"OTS_SM1_x5_4","OTS_SM1_x5_6","OTS_SM1_x5_7","OTS_SM1_x5_8",
"OTS_SM1_x6_4","OTS_SM1_x6_6","OTS_SM1_x6_7","OTS_SM1_x6_8",
"OTS_SM1_x7_4","OTS_SM1_x7_6","OTS_SM1_x7_7","OTS_SM1_x7_8",
"OTS_SM1_x8_4","OTS_SM1_x8_6","OTS_SM1_x8_7","OTS_SM1_x8_8",
"OTS_SM1_x9_4","OTS_SM1_x9_6","OTS_SM1_x9_7","OTS_SM1_x9_8",
"OTS_SM2_x1","OTS_SM2_x2","OTS_SM2_x3","OTS_SM2_x4","OTS_SM2_x5",
"OTS_SM2_x6","OTS_SM2_x7","OTS_SM2_x8","OTS_SM2_x9"

And I need to concatenate their entries into one column based on name. These are the desired names to combine to:

OTS_SM0 OTS_SM1_x1  OTS_SM1_x2  OTS_SM1_x3  OTS_SM1_x4  OTS_SM1_x5  OTS_SM1_x6  
OTS_SM1_x7  OTS_SM1_x8  OTS_SM1_x9  OTS_SM2_x1  OTS_SM2_x2  OTS_SM2_x3  OTS_SM2_x4  
OTS_SM2_x5  OTS_SM2_x6  OTS_SM2_x7  OTS_SM2_x8  OTS_SM2_x9

However the catch is that these names will not always be the same only the OTS_SM part will remain constant and the number of columns to combine will change as well as their indices within the data frame.

My current solution is:

columnnames <- c("OTS_SM0_1","OTS_SM0_2","OTS_SM0_3","OTS_SM0_4","OTS_SM0_5","OTS_SM0_6","OTS_SM0_7","OTS_SM0_8","OTS_SM0_9",
"OTS_SM1_x1_4","OTS_SM1_x1_6","OTS_SM1_x1_7","OTS_SM1_x1_8","OTS_SM1_x2_4","OTS_SM1_x2_6","OTS_SM1_x2_7","OTS_SM1_x2_8","OTS_SM1_x3_4",
"OTS_SM1_x3_6","OTS_SM1_x3_7","OTS_SM1_x3_8","OTS_SM1_x4_4","OTS_SM1_x4_6","OTS_SM1_x4_7","OTS_SM1_x4_8","OTS_SM1_x5_4","OTS_SM1_x5_6",
"OTS_SM1_x5_7","OTS_SM1_x5_8","OTS_SM1_x6_4","OTS_SM1_x6_6","OTS_SM1_x6_7","OTS_SM1_x6_8","OTS_SM1_x7_4","OTS_SM1_x7_6","OTS_SM1_x7_7",
"OTS_SM1_x7_8","OTS_SM1_x8_4","OTS_SM1_x8_6","OTS_SM1_x8_7","OTS_SM1_x8_8","OTS_SM1_x9_4","OTS_SM1_x9_6","OTS_SM1_x9_7","OTS_SM1_x9_8",
"OTS_SM2_x1","OTS_SM2_x2","OTS_SM2_x3","OTS_SM2_x4","OTS_SM2_x5","OTS_SM2_x6","OTS_SM2_x7","OTS_SM2_x8","OTS_SM2_x9")

names1_index = grep('^(?!.*x).*OTS_SM', columnnames, perl=TRUE)
names1 = columnnames[names1_index]
names1 = substring(names1, 1, 7)
names2_index = grep("OTS_SM.*_x", columnnames)
names2 = columnnames[names2_index]
names2 = substring(names2, 1, 10)

which gives the output like this:

> names1
"OTS_SM0" "OTS_SM0" "OTS_SM0" "OTS_SM0" "OTS_SM0" 
"OTS_SM0" "OTS_SM0" "OTS_SM0" "OTS_SM0"
> names2
"OTS_SM1_x1" "OTS_SM1_x1" "OTS_SM1_x1" "OTS_SM1_x1" "OTS_SM1_x2" 
"OTS_SM1_x2" "OTS_SM1_x2" "OTS_SM1_x2" "OTS_SM1_x3" "OTS_SM1_x3" 
"OTS_SM1_x3" "OTS_SM1_x3" "OTS_SM1_x4" "OTS_SM1_x4" "OTS_SM1_x4" 
"OTS_SM1_x4" "OTS_SM1_x5" "OTS_SM1_x5" "OTS_SM1_x5" "OTS_SM1_x5" 
"OTS_SM1_x6" "OTS_SM1_x6" "OTS_SM1_x6" "OTS_SM1_x6" "OTS_SM1_x7" 
"OTS_SM1_x7" "OTS_SM1_x7" "OTS_SM1_x7" "OTS_SM1_x8" "OTS_SM1_x8"
"OTS_SM1_x8" "OTS_SM1_x8" "OTS_SM1_x9" "OTS_SM1_x9" "OTS_SM1_x9"
"OTS_SM1_x9" "OTS_SM2_x1" "OTS_SM2_x2" "OTS_SM2_x3" "OTS_SM2_x4" 
"OTS_SM2_x5" "OTS_SM2_x6" "OTS_SM2_x7" "OTS_SM2_x8" "OTS_SM2_x9"

So for example for the name1 variable in the dataframe DF:

   OTS_SM0_1 OTS_SM0_2 OTS_SM0_3 OTS_SM0_4 OTS_SM0_5 OTS_SM0_6 OTS_SM0_7 OTS_SM0_8 OTS_SM0_9        
   <chr>     <chr>     <chr>     <chr>     <chr>     <chr>     <chr>     <chr>     <chr>            
 1 0         0         0         0         0         0         0         0         None of the above
 2 Facebook  0         0         0         0         0         0         0         0                
 3 0         0         0         0         0         0         0         0         None of the above
 4 Facebook  Instagram Twitter   Snapchat  Pinterest 0         Tik Tok   0         0                
 5 0         0         0         0         0         LinkedIn  0         0         0                
 6 Facebook  0         0         0         Pinterest 0         0         0         0                
 7 Facebook  Instagram 0         0         0         0         0         0         0                
 8 Facebook  Instagram 0         0         Pinterest 0         Tik Tok   0         0                
 9 NA        NA        NA        NA        NA        NA        NA        NA        NA               
10 NA        NA        NA        NA        NA        NA        NA        NA        NA 

I would then combine same name column indices:

unique_names1 <- unique(names1)
for (i in 1:length(unique_names1)){
  combine1= DF[,grep(unique_names1[i],colnames(DF))]
  NewCol1 <- do.call(paste, c(combine1[], sep = ";"))
  NewCol1 <- str_remove_all(NewCol1,";NA")
  NewCol1 <- str_remove_all(NewCol1,"NA;")
  NewCol1 <- str_remove_all(NewCol1,";0")
  NewCol1 <- str_remove_all(NewCol1,"0;")
  DF <- cbind(DF,NewCol1)
}
NewCol1
   [1] "None of the above"        "Facebook"                                                             
   [3] "None of the above"        "Facebook;Instagram;Twitter;Snapchat;Pinterest;Tik Tok"
   [5] "LinkedIn"                 "Facebook;Pinterest"                                         
   [7] "Facebook;Instagram"       "Facebook;Instagram;Pinterest;Tik Tok"                        
   [9] "NA"                       "NA"                                                                   

Which is then obviously renamed to "OTS_SM0" using some more fun indexing. As well as removing the original columns.

Upvotes: 0

Views: 669

Answers (2)

ekoam
ekoam

Reputation: 8844

Suppose your dataframe DF looks like this

  OTS_SM0_1 OTS_SM0_2 OTS_SM0_3 OTS_SM0_4 OTS_SM0_5 OTS_SM0_6 OTS_SM0_7 OTS_SM0_8 OTS_SM0_9 OTS_SM1_x1_4 OTS_SM1_x1_6 OTS_SM1_x1_7 OTS_SM1_x1_8 OTS_SM1_x2_4 OTS_SM1_x2_6 OTS_SM1_x2_7 OTS_SM1_x2_8 OTS_SM1_x3_4 OTS_SM1_x3_6 OTS_SM1_x3_7 OTS_SM1_x3_8 OTS_SM1_x4_4 OTS_SM1_x4_6 OTS_SM1_x4_7 OTS_SM1_x4_8 OTS_SM1_x5_4 OTS_SM1_x5_6 OTS_SM1_x5_7 OTS_SM1_x5_8 OTS_SM1_x6_4 OTS_SM1_x6_6 OTS_SM1_x6_7 OTS_SM1_x6_8 OTS_SM1_x7_4 OTS_SM1_x7_6 OTS_SM1_x7_7 OTS_SM1_x7_8 OTS_SM1_x8_4 OTS_SM1_x8_6 OTS_SM1_x8_7 OTS_SM1_x8_8 OTS_SM1_x9_4 OTS_SM1_x9_6 OTS_SM1_x9_7 OTS_SM1_x9_8 OTS_SM2_x1 OTS_SM2_x2 OTS_SM2_x3 OTS_SM2_x4 OTS_SM2_x5 OTS_SM2_x6 OTS_SM2_x7 OTS_SM2_x8 OTS_SM2_x9
1         1         2         3         4         5         6         7         8         9           10           11           12           13           14           15           16           17           18           19           20           21           22           23           24           25           26           27           28           29           30           31           32           33           34           35           36           37           38           39           40           41           42           43           44           45         46         47         48         49         50         51         52         53         54
2         2         4         6         8        10        12        14        16        18           20           22           24           26           28           30           32           34           36           38           40           42           44           46           48           50           52           54           56           58           60           62           64           66           68           70           72           74           76           78           80           82           84           86           88           90         92         94         96         98        100        102        104        106        108

Here is a dplyr approach. We use id to preserve row relationships. After the first pivot, we remove the characters followed by the last "_" only if they are digits. Last, we summarize the rows into a single cell for each group of id and variable (name in this case) and convert the dataframe from long to wide.

library(dplyr)
library(tidyr)

DF %>%
  mutate(id = row_number()) %>%
  pivot_longer(-id) %>%
  group_by(id, name = sub("(_\\d+)?$", "", name)) %>%
  summarize(value = paste0(value, collapse = ";"), .groups = "drop") %>%
  pivot_wider() %>%
  select(-id)

Output (I used a different print method to show you all the columns. The default print method may render something different on your screen, but the underlying object is the same.)

                 OTS_SM0  OTS_SM1_x1  OTS_SM1_x2  OTS_SM1_x3  OTS_SM1_x4  OTS_SM1_x5  OTS_SM1_x6  OTS_SM1_x7  OTS_SM1_x8  OTS_SM1_x9 OTS_SM2_x1 OTS_SM2_x2 OTS_SM2_x3 OTS_SM2_x4 OTS_SM2_x5 OTS_SM2_x6 OTS_SM2_x7 OTS_SM2_x8 OTS_SM2_x9
1      1;2;3;4;5;6;7;8;9 10;11;12;13 14;15;16;17 18;19;20;21 22;23;24;25 26;27;28;29 30;31;32;33 34;35;36;37 38;39;40;41 42;43;44;45         46         47         48         49         50         51         52         53         54
2 2;4;6;8;10;12;14;16;18 20;22;24;26 28;30;32;34 36;38;40;42 44;46;48;50 52;54;56;58 60;62;64;66 68;70;72;74 76;78;80;82 84;86;88;90         92         94         96         98        100        102        104        106        108

However, if you want to do some analysis of the variables, having values nested in a single cell is usually not helpful. Perhaps you would also like to consider a slightly different representation of the data.

library(dplyr)
library(tidyr)

DF %>%
  rename_with(~sub("(_\\d+)?$", "`\\1", .)) %>%
  mutate(id = row_number()) %>%
  pivot_longer(-id, names_to = c(".value", NA), names_pattern = "(.+)`(_\\d+)?")

, which gives

   id OTS_SM0 OTS_SM1_x1 OTS_SM1_x2 OTS_SM1_x3 OTS_SM1_x4 OTS_SM1_x5 OTS_SM1_x6 OTS_SM1_x7 OTS_SM1_x8 OTS_SM1_x9 OTS_SM2_x1 OTS_SM2_x2 OTS_SM2_x3 OTS_SM2_x4 OTS_SM2_x5 OTS_SM2_x6 OTS_SM2_x7 OTS_SM2_x8 OTS_SM2_x9
1   1       1         10         14         18         22         26         30         34         38         42         46         47         48         49         50         51         52         53         54
2   1       2         11         15         19         23         27         31         35         39         43         NA         NA         NA         NA         NA         NA         NA         NA         NA
3   1       3         12         16         20         24         28         32         36         40         44         NA         NA         NA         NA         NA         NA         NA         NA         NA
4   1       4         13         17         21         25         29         33         37         41         45         NA         NA         NA         NA         NA         NA         NA         NA         NA
5   1       5         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA
6   1       6         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA
7   1       7         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA
8   1       8         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA
9   1       9         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA
10  2       2         20         28         36         44         52         60         68         76         84         92         94         96         98        100        102        104        106        108
11  2       4         22         30         38         46         54         62         70         78         86         NA         NA         NA         NA         NA         NA         NA         NA         NA
12  2       6         24         32         40         48         56         64         72         80         88         NA         NA         NA         NA         NA         NA         NA         NA         NA
13  2       8         26         34         42         50         58         66         74         82         90         NA         NA         NA         NA         NA         NA         NA         NA         NA
14  2      10         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA
15  2      12         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA
16  2      14         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA
17  2      16         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA
18  2      18         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA

Data

structure(list(OTS_SM0_1 = c(1, 2), OTS_SM0_2 = c(2, 4), OTS_SM0_3 = c(3, 
6), OTS_SM0_4 = c(4, 8), OTS_SM0_5 = c(5, 10), OTS_SM0_6 = c(6, 
12), OTS_SM0_7 = c(7, 14), OTS_SM0_8 = c(8, 16), OTS_SM0_9 = c(9, 
18), OTS_SM1_x1_4 = c(10, 20), OTS_SM1_x1_6 = c(11, 22), OTS_SM1_x1_7 = c(12, 
24), OTS_SM1_x1_8 = c(13, 26), OTS_SM1_x2_4 = c(14, 28), OTS_SM1_x2_6 = c(15, 
30), OTS_SM1_x2_7 = c(16, 32), OTS_SM1_x2_8 = c(17, 34), OTS_SM1_x3_4 = c(18, 
36), OTS_SM1_x3_6 = c(19, 38), OTS_SM1_x3_7 = c(20, 40), OTS_SM1_x3_8 = c(21, 
42), OTS_SM1_x4_4 = c(22, 44), OTS_SM1_x4_6 = c(23, 46), OTS_SM1_x4_7 = c(24, 
48), OTS_SM1_x4_8 = c(25, 50), OTS_SM1_x5_4 = c(26, 52), OTS_SM1_x5_6 = c(27, 
54), OTS_SM1_x5_7 = c(28, 56), OTS_SM1_x5_8 = c(29, 58), OTS_SM1_x6_4 = c(30, 
60), OTS_SM1_x6_6 = c(31, 62), OTS_SM1_x6_7 = c(32, 64), OTS_SM1_x6_8 = c(33, 
66), OTS_SM1_x7_4 = c(34, 68), OTS_SM1_x7_6 = c(35, 70), OTS_SM1_x7_7 = c(36, 
72), OTS_SM1_x7_8 = c(37, 74), OTS_SM1_x8_4 = c(38, 76), OTS_SM1_x8_6 = c(39, 
78), OTS_SM1_x8_7 = c(40, 80), OTS_SM1_x8_8 = c(41, 82), OTS_SM1_x9_4 = c(42, 
84), OTS_SM1_x9_6 = c(43, 86), OTS_SM1_x9_7 = c(44, 88), OTS_SM1_x9_8 = c(45, 
90), OTS_SM2_x1 = c(46, 92), OTS_SM2_x2 = c(47, 94), OTS_SM2_x3 = c(48, 
96), OTS_SM2_x4 = c(49, 98), OTS_SM2_x5 = c(50, 100), OTS_SM2_x6 = c(51, 
102), OTS_SM2_x7 = c(52, 104), OTS_SM2_x8 = c(53, 106), OTS_SM2_x9 = c(54, 
108)), row.names = c(NA, -2L), class = "data.frame")

Upvotes: 1

divibisan
divibisan

Reputation: 12155

The iris dataset is a good approximation of your problem:

head(iris)
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1          5.1         3.5          1.4         0.2  setosa
2          4.9         3.0          1.4         0.2  setosa
3          4.7         3.2          1.3         0.2  setosa
4          4.6         3.1          1.5         0.2  setosa
5          5.0         3.6          1.4         0.2  setosa
6          5.4         3.9          1.7         0.4  setosa

We can do this pretty easily in tidyverse. First we convert to long-form so the names are easy to work with. Then, we truncate the names to the core you want to use for the combined columns. Then we spread it back to wide form with the new, more limited number of columns.

There's one other step needed: pivot_wider ensures each row is unique, so the new "Sepal" and "Petal" columns each contain a list of the values for each Species. Since you one one row for each (you're just concatinating), we use unnest to take those list columns and convert them into multiple rows:

library(tidyverse)
iris %>%
    pivot_longer(-Species) %>%
    mutate(name = gsub('\\..*', '', name)) %>%
    pivot_wider(names_from = 'name', values_from = 'value', values_fn = list) %>%
    unnest(cols = c('Sepal', 'Petal'))

# A tibble: 300 × 3
   Species Sepal Petal
   <fct>   <dbl> <dbl>
 1 setosa    5.1   1.4
 2 setosa    3.5   0.2
 3 setosa    4.9   1.4
 4 setosa    3     0.2
 5 setosa    4.7   1.3
 6 setosa    3.2   0.2
 7 setosa    4.6   1.5
 8 setosa    3.1   0.2
 9 setosa    5     1.4
10 setosa    3.6   0.2
# … with 290 more rows

If you really don't want to have to specify the column names in unnest you can use unnest(cols = colnames(.)) to apply it to all, or just leave out the cols argument (though that will give a warning and might break in the future),

Upvotes: 0

Related Questions