Combining columns based on partial column name

Question

I was hoping there would be an easier/faster/cleaner way of doing what I want because right now this is super convoluted:

I have column names of :

"OTS_SM0_1","OTS_SM0_2","OTS_SM0_3","OTS_SM0_4","OTS_SM0_5","OTS_SM0_6",
"OTS_SM0_7","OTS_SM0_8","OTS_SM0_9",
"OTS_SM1_x1_4","OTS_SM1_x1_6","OTS_SM1_x1_7","OTS_SM1_x1_8",
"OTS_SM1_x2_4","OTS_SM1_x2_6","OTS_SM1_x2_7","OTS_SM1_x2_8",
"OTS_SM1_x3_4","OTS_SM1_x3_6","OTS_SM1_x3_7","OTS_SM1_x3_8",
"OTS_SM1_x4_4","OTS_SM1_x4_6","OTS_SM1_x4_7","OTS_SM1_x4_8",
"OTS_SM1_x5_4","OTS_SM1_x5_6","OTS_SM1_x5_7","OTS_SM1_x5_8",
"OTS_SM1_x6_4","OTS_SM1_x6_6","OTS_SM1_x6_7","OTS_SM1_x6_8",
"OTS_SM1_x7_4","OTS_SM1_x7_6","OTS_SM1_x7_7","OTS_SM1_x7_8",
"OTS_SM1_x8_4","OTS_SM1_x8_6","OTS_SM1_x8_7","OTS_SM1_x8_8",
"OTS_SM1_x9_4","OTS_SM1_x9_6","OTS_SM1_x9_7","OTS_SM1_x9_8",
"OTS_SM2_x1","OTS_SM2_x2","OTS_SM2_x3","OTS_SM2_x4","OTS_SM2_x5",
"OTS_SM2_x6","OTS_SM2_x7","OTS_SM2_x8","OTS_SM2_x9"

And I need to concatenate their entries into one column based on name. These are the desired names to combine to:

OTS_SM0 OTS_SM1_x1  OTS_SM1_x2  OTS_SM1_x3  OTS_SM1_x4  OTS_SM1_x5  OTS_SM1_x6  
OTS_SM1_x7  OTS_SM1_x8  OTS_SM1_x9  OTS_SM2_x1  OTS_SM2_x2  OTS_SM2_x3  OTS_SM2_x4  
OTS_SM2_x5  OTS_SM2_x6  OTS_SM2_x7  OTS_SM2_x8  OTS_SM2_x9

However the catch is that these names will not always be the same only the OTS_SM part will remain constant and the number of columns to combine will change as well as their indices within the data frame.

My current solution is:

columnnames <- c("OTS_SM0_1","OTS_SM0_2","OTS_SM0_3","OTS_SM0_4","OTS_SM0_5","OTS_SM0_6","OTS_SM0_7","OTS_SM0_8","OTS_SM0_9",
"OTS_SM1_x1_4","OTS_SM1_x1_6","OTS_SM1_x1_7","OTS_SM1_x1_8","OTS_SM1_x2_4","OTS_SM1_x2_6","OTS_SM1_x2_7","OTS_SM1_x2_8","OTS_SM1_x3_4",
"OTS_SM1_x3_6","OTS_SM1_x3_7","OTS_SM1_x3_8","OTS_SM1_x4_4","OTS_SM1_x4_6","OTS_SM1_x4_7","OTS_SM1_x4_8","OTS_SM1_x5_4","OTS_SM1_x5_6",
"OTS_SM1_x5_7","OTS_SM1_x5_8","OTS_SM1_x6_4","OTS_SM1_x6_6","OTS_SM1_x6_7","OTS_SM1_x6_8","OTS_SM1_x7_4","OTS_SM1_x7_6","OTS_SM1_x7_7",
"OTS_SM1_x7_8","OTS_SM1_x8_4","OTS_SM1_x8_6","OTS_SM1_x8_7","OTS_SM1_x8_8","OTS_SM1_x9_4","OTS_SM1_x9_6","OTS_SM1_x9_7","OTS_SM1_x9_8",
"OTS_SM2_x1","OTS_SM2_x2","OTS_SM2_x3","OTS_SM2_x4","OTS_SM2_x5","OTS_SM2_x6","OTS_SM2_x7","OTS_SM2_x8","OTS_SM2_x9")

names1_index = grep('^(?!.*x).*OTS_SM', columnnames, perl=TRUE)
names1 = columnnames[names1_index]
names1 = substring(names1, 1, 7)
names2_index = grep("OTS_SM.*_x", columnnames)
names2 = columnnames[names2_index]
names2 = substring(names2, 1, 10)

which gives the output like this:

> names1
"OTS_SM0" "OTS_SM0" "OTS_SM0" "OTS_SM0" "OTS_SM0" 
"OTS_SM0" "OTS_SM0" "OTS_SM0" "OTS_SM0"
> names2
"OTS_SM1_x1" "OTS_SM1_x1" "OTS_SM1_x1" "OTS_SM1_x1" "OTS_SM1_x2" 
"OTS_SM1_x2" "OTS_SM1_x2" "OTS_SM1_x2" "OTS_SM1_x3" "OTS_SM1_x3" 
"OTS_SM1_x3" "OTS_SM1_x3" "OTS_SM1_x4" "OTS_SM1_x4" "OTS_SM1_x4" 
"OTS_SM1_x4" "OTS_SM1_x5" "OTS_SM1_x5" "OTS_SM1_x5" "OTS_SM1_x5" 
"OTS_SM1_x6" "OTS_SM1_x6" "OTS_SM1_x6" "OTS_SM1_x6" "OTS_SM1_x7" 
"OTS_SM1_x7" "OTS_SM1_x7" "OTS_SM1_x7" "OTS_SM1_x8" "OTS_SM1_x8"
"OTS_SM1_x8" "OTS_SM1_x8" "OTS_SM1_x9" "OTS_SM1_x9" "OTS_SM1_x9"
"OTS_SM1_x9" "OTS_SM2_x1" "OTS_SM2_x2" "OTS_SM2_x3" "OTS_SM2_x4" 
"OTS_SM2_x5" "OTS_SM2_x6" "OTS_SM2_x7" "OTS_SM2_x8" "OTS_SM2_x9"

So for example for the name1 variable in the dataframe DF:

   OTS_SM0_1 OTS_SM0_2 OTS_SM0_3 OTS_SM0_4 OTS_SM0_5 OTS_SM0_6 OTS_SM0_7 OTS_SM0_8 OTS_SM0_9        
                                                       
 1 0         0         0         0         0         0         0         0         None of the above
 2 Facebook  0         0         0         0         0         0         0         0                
 3 0         0         0         0         0         0         0         0         None of the above
 4 Facebook  Instagram Twitter   Snapchat  Pinterest 0         Tik Tok   0         0                
 5 0         0         0         0         0         LinkedIn  0         0         0                
 6 Facebook  0         0         0         Pinterest 0         0         0         0                
 7 Facebook  Instagram 0         0         0         0         0         0         0                
 8 Facebook  Instagram 0         0         Pinterest 0         Tik Tok   0         0                
 9 NA        NA        NA        NA        NA        NA        NA        NA        NA               
10 NA        NA        NA        NA        NA        NA        NA        NA        NA

I would then combine same name column indices:

unique_names1 <- unique(names1)
for (i in 1:length(unique_names1)){
  combine1= DF[,grep(unique_names1[i],colnames(DF))]
  NewCol1 <- do.call(paste, c(combine1[], sep = ";"))
  NewCol1 <- str_remove_all(NewCol1,";NA")
  NewCol1 <- str_remove_all(NewCol1,"NA;")
  NewCol1 <- str_remove_all(NewCol1,";0")
  NewCol1 <- str_remove_all(NewCol1,"0;")
  DF <- cbind(DF,NewCol1)
}

NewCol1
   [1] "None of the above"        "Facebook"                                                             
   [3] "None of the above"        "Facebook;Instagram;Twitter;Snapchat;Pinterest;Tik Tok"
   [5] "LinkedIn"                 "Facebook;Pinterest"                                         
   [7] "Facebook;Instagram"       "Facebook;Instagram;Pinterest;Tik Tok"                        
   [9] "NA"                       "NA"

Which is then obviously renamed to "OTS_SM0" using some more fun indexing. As well as removing the original columns.

ekoam · Accepted Answer

Suppose your dataframe DF looks like this

  OTS_SM0_1 OTS_SM0_2 OTS_SM0_3 OTS_SM0_4 OTS_SM0_5 OTS_SM0_6 OTS_SM0_7 OTS_SM0_8 OTS_SM0_9 OTS_SM1_x1_4 OTS_SM1_x1_6 OTS_SM1_x1_7 OTS_SM1_x1_8 OTS_SM1_x2_4 OTS_SM1_x2_6 OTS_SM1_x2_7 OTS_SM1_x2_8 OTS_SM1_x3_4 OTS_SM1_x3_6 OTS_SM1_x3_7 OTS_SM1_x3_8 OTS_SM1_x4_4 OTS_SM1_x4_6 OTS_SM1_x4_7 OTS_SM1_x4_8 OTS_SM1_x5_4 OTS_SM1_x5_6 OTS_SM1_x5_7 OTS_SM1_x5_8 OTS_SM1_x6_4 OTS_SM1_x6_6 OTS_SM1_x6_7 OTS_SM1_x6_8 OTS_SM1_x7_4 OTS_SM1_x7_6 OTS_SM1_x7_7 OTS_SM1_x7_8 OTS_SM1_x8_4 OTS_SM1_x8_6 OTS_SM1_x8_7 OTS_SM1_x8_8 OTS_SM1_x9_4 OTS_SM1_x9_6 OTS_SM1_x9_7 OTS_SM1_x9_8 OTS_SM2_x1 OTS_SM2_x2 OTS_SM2_x3 OTS_SM2_x4 OTS_SM2_x5 OTS_SM2_x6 OTS_SM2_x7 OTS_SM2_x8 OTS_SM2_x9
1         1         2         3         4         5         6         7         8         9           10           11           12           13           14           15           16           17           18           19           20           21           22           23           24           25           26           27           28           29           30           31           32           33           34           35           36           37           38           39           40           41           42           43           44           45         46         47         48         49         50         51         52         53         54
2         2         4         6         8        10        12        14        16        18           20           22           24           26           28           30           32           34           36           38           40           42           44           46           48           50           52           54           56           58           60           62           64           66           68           70           72           74           76           78           80           82           84           86           88           90         92         94         96         98        100        102        104        106        108

Here is a dplyr approach. We use id to preserve row relationships. After the first pivot, we remove the characters followed by the last "_" only if they are digits. Last, we summarize the rows into a single cell for each group of id and variable (name in this case) and convert the dataframe from long to wide.

library(dplyr)
library(tidyr)

DF %>%
  mutate(id = row_number()) %>%
  pivot_longer(-id) %>%
  group_by(id, name = sub("(_\d+)?$", "", name)) %>%
  summarize(value = paste0(value, collapse = ";"), .groups = "drop") %>%
  pivot_wider() %>%
  select(-id)

Output (I used a different print method to show you all the columns. The default print method may render something different on your screen, but the underlying object is the same.)

                 OTS_SM0  OTS_SM1_x1  OTS_SM1_x2  OTS_SM1_x3  OTS_SM1_x4  OTS_SM1_x5  OTS_SM1_x6  OTS_SM1_x7  OTS_SM1_x8  OTS_SM1_x9 OTS_SM2_x1 OTS_SM2_x2 OTS_SM2_x3 OTS_SM2_x4 OTS_SM2_x5 OTS_SM2_x6 OTS_SM2_x7 OTS_SM2_x8 OTS_SM2_x9
1      1;2;3;4;5;6;7;8;9 10;11;12;13 14;15;16;17 18;19;20;21 22;23;24;25 26;27;28;29 30;31;32;33 34;35;36;37 38;39;40;41 42;43;44;45         46         47         48         49         50         51         52         53         54
2 2;4;6;8;10;12;14;16;18 20;22;24;26 28;30;32;34 36;38;40;42 44;46;48;50 52;54;56;58 60;62;64;66 68;70;72;74 76;78;80;82 84;86;88;90         92         94         96         98        100        102        104        106        108

However, if you want to do some analysis of the variables, having values nested in a single cell is usually not helpful. Perhaps you would also like to consider a slightly different representation of the data.

library(dplyr)
library(tidyr)

DF %>%
  rename_with(~sub("(_\d+)?$", "`\1", .)) %>%
  mutate(id = row_number()) %>%
  pivot_longer(-id, names_to = c(".value", NA), names_pattern = "(.+)`(_\d+)?")

, which gives

   id OTS_SM0 OTS_SM1_x1 OTS_SM1_x2 OTS_SM1_x3 OTS_SM1_x4 OTS_SM1_x5 OTS_SM1_x6 OTS_SM1_x7 OTS_SM1_x8 OTS_SM1_x9 OTS_SM2_x1 OTS_SM2_x2 OTS_SM2_x3 OTS_SM2_x4 OTS_SM2_x5 OTS_SM2_x6 OTS_SM2_x7 OTS_SM2_x8 OTS_SM2_x9
1   1       1         10         14         18         22         26         30         34         38         42         46         47         48         49         50         51         52         53         54
2   1       2         11         15         19         23         27         31         35         39         43         NA         NA         NA         NA         NA         NA         NA         NA         NA
3   1       3         12         16         20         24         28         32         36         40         44         NA         NA         NA         NA         NA         NA         NA         NA         NA
4   1       4         13         17         21         25         29         33         37         41         45         NA         NA         NA         NA         NA         NA         NA         NA         NA
5   1       5         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA
6   1       6         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA
7   1       7         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA
8   1       8         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA
9   1       9         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA
10  2       2         20         28         36         44         52         60         68         76         84         92         94         96         98        100        102        104        106        108
11  2       4         22         30         38         46         54         62         70         78         86         NA         NA         NA         NA         NA         NA         NA         NA         NA
12  2       6         24         32         40         48         56         64         72         80         88         NA         NA         NA         NA         NA         NA         NA         NA         NA
13  2       8         26         34         42         50         58         66         74         82         90         NA         NA         NA         NA         NA         NA         NA         NA         NA
14  2      10         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA
15  2      12         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA
16  2      14         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA
17  2      16         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA
18  2      18         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA         NA

Data

structure(list(OTS_SM0_1 = c(1, 2), OTS_SM0_2 = c(2, 4), OTS_SM0_3 = c(3, 
6), OTS_SM0_4 = c(4, 8), OTS_SM0_5 = c(5, 10), OTS_SM0_6 = c(6, 
12), OTS_SM0_7 = c(7, 14), OTS_SM0_8 = c(8, 16), OTS_SM0_9 = c(9, 
18), OTS_SM1_x1_4 = c(10, 20), OTS_SM1_x1_6 = c(11, 22), OTS_SM1_x1_7 = c(12, 
24), OTS_SM1_x1_8 = c(13, 26), OTS_SM1_x2_4 = c(14, 28), OTS_SM1_x2_6 = c(15, 
30), OTS_SM1_x2_7 = c(16, 32), OTS_SM1_x2_8 = c(17, 34), OTS_SM1_x3_4 = c(18, 
36), OTS_SM1_x3_6 = c(19, 38), OTS_SM1_x3_7 = c(20, 40), OTS_SM1_x3_8 = c(21, 
42), OTS_SM1_x4_4 = c(22, 44), OTS_SM1_x4_6 = c(23, 46), OTS_SM1_x4_7 = c(24, 
48), OTS_SM1_x4_8 = c(25, 50), OTS_SM1_x5_4 = c(26, 52), OTS_SM1_x5_6 = c(27, 
54), OTS_SM1_x5_7 = c(28, 56), OTS_SM1_x5_8 = c(29, 58), OTS_SM1_x6_4 = c(30, 
60), OTS_SM1_x6_6 = c(31, 62), OTS_SM1_x6_7 = c(32, 64), OTS_SM1_x6_8 = c(33, 
66), OTS_SM1_x7_4 = c(34, 68), OTS_SM1_x7_6 = c(35, 70), OTS_SM1_x7_7 = c(36, 
72), OTS_SM1_x7_8 = c(37, 74), OTS_SM1_x8_4 = c(38, 76), OTS_SM1_x8_6 = c(39, 
78), OTS_SM1_x8_7 = c(40, 80), OTS_SM1_x8_8 = c(41, 82), OTS_SM1_x9_4 = c(42, 
84), OTS_SM1_x9_6 = c(43, 86), OTS_SM1_x9_7 = c(44, 88), OTS_SM1_x9_8 = c(45, 
90), OTS_SM2_x1 = c(46, 92), OTS_SM2_x2 = c(47, 94), OTS_SM2_x3 = c(48, 
96), OTS_SM2_x4 = c(49, 98), OTS_SM2_x5 = c(50, 100), OTS_SM2_x6 = c(51, 
102), OTS_SM2_x7 = c(52, 104), OTS_SM2_x8 = c(53, 106), OTS_SM2_x9 = c(54, 
108)), row.names = c(NA, -2L), class = "data.frame")

Combining columns based on partial column name

Answers (2)

Related Questions