Silvia
Silvia

Reputation: 55

From two data frames, is there any way to count unique values of a column in R?

From a tsv file, I have two tables.

For example,

table 1: V_one

readId V_segment
abc IGHV3-18-F(2695)
ghi IGHV3-9-F(2840)

and

table 2: V_two

readId V_segment
aaa IGHV3-9-F(2714),IGHV3-12-F(2656)
bbb IGHV3-10-F(2666),IGHV3-18-F(2666)

And I need to create another table where, for each unique value in the column "V_segment" of these new two tables, I get a count of the times every unique value appears in the column "V_segment" in the tsv file.

For example:

V_segment Count
IGHV3-18-F 3
IGHV3-9-F 15
IGHV3-9-F,IGHV3-12-F 4
IGHV3-10-F,IGHV3-18-F 7

I tried this code:

df <- read_tsv('file.tsv') # tsv file with lots of columns
segmV <- data.frame(df$readId, df$V_segment, stringsAsFactors = FALSE) # create a data frame with two columns of the tsv file
V_one <- segmV[!grepl(",", df$V_segment),] # table 1
V_two <- segmV[grepl(",", df$V_segment),] # table 2

v1 <- V_one %>%
      group_by(V_one$V_segment) %>%
      mutate(Count = n_distinct(V_one$V_segment))

But it's not working, since (for this) I need to ignore the information in the parenthesis.

> dput(head(V_one, n=10))
structure(list(readId = c("abc", "def", 
"ghi", "jkl", "mno", "pqr", "stu", "vwy", "zab", "cde"), 
    V_segment = c("IGHV3-18-F(2695)", "IGHV3-18-F(2782)", 
    "IGHV3-18-F(2772)", "IGHV3-18-F(1952)", "IGHV3-15-F(2792)", 
    "IGHV3-18-F(2558)", "IGHV3-9-F(2831)", "IGHV3-18-F(2221)", 
    "IGHV3-18-F(2812)", "IGHV3-15-F(2791)")), row.names = c(1L, 
2L, 4L, 9L, 11L, 14L, 17L, 19L, 24L, 25L), class = "data.frame")

> dput(head(V_two, n=10))
structure(list(readId = c("aaa", "bbb", "ccc", "ddd", "eee", "fff", 
"ggg", "hhh", "iii", "jjj"), 
    V_segment = c("IGHV3-10-F(2429),IGHV3-12-F(2429)", 
    "IGHV3-9-F(2714),IGHV3-12-F(2656)", "IGHV1-16-ORF(2920),IGHV1-19-F(2920)", 
    "IGHV3-17-F(2512),IGHV3-18-F(2512)", "IGHV3-10-F(2666),IGHV3-18-F(2666)", 
    "IGHV1-8-F(2901),IGHV1-21-F(2814)", "IGHV3-10-F(2685),IGHV3-18-F(2685)", 
    "IGHV3-12-F(2801),IGHV3-20-F(2743)", "IGHV3-20-F(2754),IGHV3-10-F(2725)", 
    "IGHV3-10-F(2714),IGHV3-18-F(2714)")), row.names = c(3L, 
5L, 6L, 7L, 8L, 10L, 12L, 13L, 15L, 16L), class = "data.frame")

> dput(head(V_table3, n=10))
structure(list(readId = c("aaa", "bbb", "ccc", "ddd", "eee", "fff", 
 "ggg", "hhh", "iii", "jjj"), 
        V_segment = c("IGHV3-10-F(242,9),IGHV3-12-F(242,9)", 
        "IGHV3-9-F(271,4),IGHV3-12-F(265,6)", "IGHV1-16-ORF(292,0),IGHV1-19-F(2920)", 
        "IGHV3-17-F(251,2),IGHV3-18-F(2512)", "IGHV3-10-F(2666),IGHV3-18-F(2666)", 
        "IGHV1-8-F(290,1),IGHV1-21-F(281,4)", "IGHV3-10-F(2685),IGHV3-18-F(2685)", 
        "IGHV3-12-F(2801),IGHV3-20-F(274,3)", "IGHV3-20-F(2754),IGHV3-10-F(272,5)", 
        "IGHV3-10-F(2714),IGHV3-18-F(271,4)")), row.names = c(3L, 
    5L, 6L, 7L, 8L, 10L, 12L, 13L, 15L, 16L), class = "data.frame")

Could you help me please?

Thank you in advance!

Upvotes: 2

Views: 158

Answers (3)

Friede
Friede

Reputation: 7979

This is basically the answer given by @BigFinger with a few modifications: (1) Copy and paste colnames from V_one to V_two, since there is a difference: V_Segment and V_segment. (2) Create a new object V. This allows us to have a look at it if we like to. (3) Convert the table to data.frame and set names.

Based on your desired output I would recommend to do:

names(V_two) = names(V_one)
V = rbind(V_one, V_two)
V$V_Segment = gsub("\\([0-9]+\\)", "", V$V_Segment)
as.data.frame(table(V$V_Segment)) |> setNames(c("V_Segment", "Count"))

which gives

                 V_Segment Count
1  IGHV1-16-ORF,IGHV1-19-F     1
2     IGHV1-8-F,IGHV1-21-F     1
3    IGHV3-10-F,IGHV3-12-F     1
4    IGHV3-10-F,IGHV3-18-F     3
5    IGHV3-12-F,IGHV3-20-F     1
6               IGHV3-15-F     2
7    IGHV3-17-F,IGHV3-18-F     1
8               IGHV3-18-F     7
9    IGHV3-20-F,IGHV3-10-F     1
10               IGHV3-9-F     1
11    IGHV3-9-F,IGHV3-12-F     1

Data:

V_one = structure(list(readId = c("abc", "def", 
                                  "ghi", "jkl", "mno", "pqr", "stu", "vwy", "zab", "cde"), 
                       V_Segment = c("IGHV3-18-F(2695)", "IGHV3-18-F(2782)", 
                                     "IGHV3-18-F(2772)", "IGHV3-18-F(1952)", "IGHV3-15-F(2792)", 
                                     "IGHV3-18-F(2558)", "IGHV3-9-F(2831)", "IGHV3-18-F(2221)", 
                                     "IGHV3-18-F(2812)", "IGHV3-15-F(2791)")), 
                  row.names = c(1L, 2L, 4L, 9L, 11L, 14L, 17L, 19L, 24L, 25L), class = "data.frame") 

V_two = structure(list(readId = c("aaa", "bbb", "ccc", "ddd", "eee", "fff", 
                          "ggg", "hhh", "iii", "jjj"), 
               V_segment = c("IGHV3-10-F(2429),IGHV3-12-F(2429)", 
                             "IGHV3-9-F(2714),IGHV3-12-F(2656)", "IGHV1-16-ORF(2920),IGHV1-19-F(2920)", 
                             "IGHV3-17-F(2512),IGHV3-18-F(2512)", "IGHV3-10-F(2666),IGHV3-18-F(2666)", 
                             "IGHV1-8-F(2901),IGHV1-21-F(2814)", "IGHV3-10-F(2685),IGHV3-18-F(2685)", 
                             "IGHV3-12-F(2801),IGHV3-20-F(2743)", "IGHV3-20-F(2754),IGHV3-10-F(2725)", 
                             "IGHV3-10-F(2714),IGHV3-18-F(2714)")), 
               row.names = c(3L, 5L, 6L, 7L, 8L, 10L, 12L, 13L, 15L, 16L), class = "data.frame")

Upvotes: 1

Hoel
Hoel

Reputation: 729

library(dplyr)

bind_rows(mget(ls(pattern = "V_"))) %>%  
  count(V_segment)

# A tibble: 4 × 2
  V_segment                             n
  <chr>                             <int>
1 IGHV3-10-F(2666),IGHV3-18-F(2666)     1
2 IGHV3-18-F(2695)                      1
3 IGHV3-9-F(2714),IGHV3-12-F(2656)      1
4 IGHV3-9-F(2840)                       1

Individual segments:

bind_rows(mget(ls(pattern = "V_"))) %>% 
  mutate(across(V_segment, ~ str_remove_all(.x, "\\(.*?\\)"))) %>%  
  count(V_segment)

# A tibble: 4 × 2
  V_segment                 n
  <chr>                 <int>
1 IGHV3-10-F,IGHV3-18-F     1
2 IGHV3-18-F                1
3 IGHV3-9-F                 1
4 IGHV3-9-F,IGHV3-12-F      1

With the provided sample data:

V_one <- structure(list(readId = c("abc", "def", 
                          "ghi", "jkl", "mno", "pqr", "stu", "vwy", "zab", "cde"), 
               V_Segment = c("IGHV3-18-F(2695)", "IGHV3-18-F(2782)", 
                             "IGHV3-18-F(2772)", "IGHV3-18-F(1952)", "IGHV3-15-F(2792)", 
                             "IGHV3-18-F(2558)", "IGHV3-9-F(2831)", "IGHV3-18-F(2221)", 
                             "IGHV3-18-F(2812)", "IGHV3-15-F(2791)")), row.names = c(1L, 
                                                                                     2L, 4L, 9L, 11L, 14L, 17L, 19L, 24L, 25L), class = "data.frame") %>% 
  as_tibble()

V_one %>%  
  mutate(V_Segment = str_remove(V_Segment, "\\(.*?\\)")) %>%
  count(V_Segment)

# A tibble: 3 × 2
  V_Segment      n
  <chr>      <int>
1 IGHV3-15-F     2
2 IGHV3-18-F     7
3 IGHV3-9-F      1

Upvotes: 2

BigFinger
BigFinger

Reputation: 1043

This is an alternative solution:

V_one = data.frame(
    readId = c("abc", "ghi"),
    V_segment = c("IGHV3-18-F(2695)", "IGHV3-9-F(2840)")
)

V_two = data.frame(
    readId = c("def", "jkl"),
    V_segment = c("IGHV3-9-F(2714),IGHV3-12-F(2656)", "IGHV3-10-F(2666),IGHV3-18-F(2666)")
)

table(gsub("\\([0-9]+\\)","",rbind(V_one,V_two)$V_segment))

Result:

IGHV3-10-F,IGHV3-18-F            IGHV3-18-F             IGHV3-9-F  IGHV3-9-F,IGHV3-12-F 
                    1                     1                     1                     1 

Upvotes: 0

Related Questions