nik
nik

Reputation: 2584

how to count and remove similar strings across columns

I have a data with many columns . for example this is with three columns

df<-structure(list(V1 = structure(c(5L, 1L, 7L, 3L, 2L, 4L, 6L, 6L
), .Label = c("CPSIAAAIAAVNALHGR", "DLNYCFSGMSDHR", "FPEHELIVDPQR", 
"IADPDAVKPDDWDEDAPSK", "LWADHGVQACFGR", "WGEAGAEYVVESTGVFTTMEK", 
"YYVTIIDAPGHR"), class = "factor"), V2 = structure(c(5L, 2L, 
7L, 3L, 4L, 6L, 1L, 1L), .Label = c("", "CPSIAAAIAAVNALHGR", 
"GCITIIGGGDTATCCAK", "HVGPGVLSMANAGPNTNGSQFFICTIK", "LLELGPKPEVAQQTR", 
"MVCCSAWSEDHPICNLFTCGFDR", "YYVTIIDAPGHR"), class = "factor"), 
    V3 = structure(c(4L, 3L, 2L, 4L, 3L, 1L, 1L, 1L), .Label = c("", 
    "AVCMLSNTTAIAEAWAR", "DLNYCFSGMSDHR", "FPEHELIVDPQR"), class = "factor")), .Names = c("V1", 
"V2", "V3"), class = "data.frame", row.names = c(NA, -8L))

-The first column, we don't look at any other column, we just count how many strings there are and keep the unique one

This continues for as many columns as we have

for example for this data, we will have the following

 Column 1              Column 2                    Column 3
LWADHGVQACFGR
CPSIAAAIAAVNALHGR     LLELGPKPEVAQQTR              AVCMLSNTTAIAEAWAR
YYVTIIDAPGHR          GCITIIGGGDTATCCAK 
FPEHELIVDPQR          HVGPGVLSMANAGPNTNGSQFFICTIK   
DLNYCFSGMSDHR         MVCCSAWSEDHPICNLFTCGFDR   
IADPDAVKPDDWDEDAPSK     
WGEAGAEYVVESTGVFTTMEK   

Upvotes: 0

Views: 169

Answers (2)

Sotos
Sotos

Reputation: 51592

Here is a solution via tidyverse,

library(tidyverse)

df1 <- df %>% 
 gather(var, string) %>% 
 filter(string != '' & !duplicated(string)) %>% 
 group_by(var) %>% 
 mutate(cnt = seq(n())) %>% 
 spread(var, string) %>%
 select(-cnt)

Which gives

# A tibble: 7 x 4
    cnt                    V1                          V2                V3
* <int>                 <chr>                       <chr>             <chr>
1     1         LWADHGVQACFGR             LLELGPKPEVAQQTR AVCMLSNTTAIAEAWAR
2     2     CPSIAAAIAAVNALHGR           GCITIIGGGDTATCCAK              <NA>
3     3          YYVTIIDAPGHR HVGPGVLSMANAGPNTNGSQFFICTIK              <NA>
4     4          FPEHELIVDPQR     MVCCSAWSEDHPICNLFTCGFDR              <NA>
5     5         DLNYCFSGMSDHR                        <NA>              <NA>
6     6   IADPDAVKPDDWDEDAPSK                        <NA>              <NA>
7     7 WGEAGAEYVVESTGVFTTMEK                        <NA>              <NA>

You can use colSums to get the number of strings,

colSums(!is.na(df1))
#V1 V2 V3 
# 7  4  1 

A similar approach via base R, that would save the strings in a list would be,

df[] <- lapply(df, as.character)
d1 <- stack(df)
d1 <- d1[d1$values != '' & !duplicated(d1$values),]
l1 <- unstack(d1, values ~ ind)

lengths(l1)
#V1 V2 V3 
# 7  4  1

Upvotes: 1

www
www

Reputation: 39174

A base R solution. df2 is the final output.

# Convert to character
L1 <- lapply(df, as.character)
# Get unique string
L2 <- lapply(L1, unique)
# Remove ""
L3 <- lapply(L2, function(vec){vec <- vec[!(vec %in% "")]})

# Use for loop to remove non-unique string from previous columns
for (i in 2:length(L3)){
  previous_vec <- unlist(L3[1:(i - 1)])
  current_vec <- L3[[i]]
  L3[[i]] <- current_vec[!(current_vec %in% previous_vec)]
}

# Get the maximum column length
max_num <- max(sapply(L3, length))

# Append "" to each column
L4 <- lapply(L3, function(vec){vec <- c(vec, rep("", max_num - length(vec)))})

# Convert L4 to a data frame
df2 <- as.data.frame(do.call(cbind, L4))

Upvotes: 1

Related Questions