Julie W.
Julie W.

Reputation: 21

How to make a frequency table from a data frame in R

The data frame is like this: enter image description here

header: system
Row 1:  00000000000000000503_0
Row 2:  00000000000000000503_1
Row 3:  00000000000000000503_2
Row 4:  00000000000000000503_3
Row 5:  000000000000000004e7_0
Row 6:  000000000000000004e7_1
Row 7:  00000000000000000681_0
Row 8:  00000000000000000681_1
Row 9:  00000000000000000681_2

I want to generate a frequency table with the quantities of the code before string "_" such that:

"00000000000000000503" appears 4 times, "000000000000000004e7" appears 2 times, and so on.

How do I do this in R?

Upvotes: 1

Views: 251

Answers (5)

akrun
akrun

Reputation: 887831

An option with str_remove and group_by

library(stringr)
library(dplyr)
df %>%
    group_by(V3 = str_remove(V3, "_\\d+$")) %>% 
    summarise(n = n())
# A tibble: 3 x 2
#  V3                       n
#  <chr>                <int>
#1 000000000000000004e7     2
#2 00000000000000000503     4
#3 00000000000000000681     3

Or in base R with table and trimws

table(trimws(df$V3, whitespace = "_[0-9]+"))

data

df <- structure(list(V1 = c("Row", "Row", "Row", "Row", "Row", "Row", 
"Row", "Row", "Row"), V2 = c("1:", "2:", "3:", "4:", "5:", "6:", 
"7:", "8:", "9:"), V3 = c("00000000000000000503_0", "00000000000000000503_1", 
"00000000000000000503_2", "00000000000000000503_3", "000000000000000004e7_0", 
"000000000000000004e7_1", "00000000000000000681_0", "00000000000000000681_1", 
"00000000000000000681_2")), class = "data.frame", row.names = c(NA, 
-9L))

Upvotes: 1

Sonali J
Sonali J

Reputation: 68

A tidyverse answer would be

my_data <-  mydata %>% 
  mutate_if(is.factor, as.character) %>%
  mutate(system = gsub('_[^_]*$', '', system)) %>%
  group_by(system) %>%
  count() %>%
  ungroup()
my_data

Upvotes: 1

akaDrHouse
akaDrHouse

Reputation: 2250

Another option using the stringr library that is included in tidyverse

> library(tidyverse)
> mydata <- data.frame(system = c("00000000000000000503_0",
                      "00000000000000000503_1",
                      "00000000000000000503_2",
                      "00000000000000000503_3",
                      "000000000000000004e7_0",
                      "000000000000000004e7_1",
                      "00000000000000000681_0",
                      "00000000000000000681_1",
                      "00000000000000000681_2"))
> mydata
                  system
1 00000000000000000503_0
2 00000000000000000503_1
3 00000000000000000503_2
4 00000000000000000503_3
5 000000000000000004e7_0
6 000000000000000004e7_1
7 00000000000000000681_0
8 00000000000000000681_1
9 00000000000000000681_2
> # Split data using str_split
> mydata$leftside <- sapply(mydata$system, function(x) unlist(str_split(x, "_"))[1]) #split string by the "_" and take first piece
> mydata$rightside <- sapply(mydata$system, function(x) unlist(str_split(x, "_"))[2]) #split string by the "_" and take second piece
> 
> mydata
                  system             leftside rightside
1 00000000000000000503_0 00000000000000000503         0
2 00000000000000000503_1 00000000000000000503         1
3 00000000000000000503_2 00000000000000000503         2
4 00000000000000000503_3 00000000000000000503         3
5 000000000000000004e7_0 000000000000000004e7         0
6 000000000000000004e7_1 000000000000000004e7         1
7 00000000000000000681_0 00000000000000000681         0
8 00000000000000000681_1 00000000000000000681         1
9 00000000000000000681_2 00000000000000000681         2

> # alternative tabulate fuction than base::table(). Can Provide nicer options.
> xtabs(data = mydata, formula = ~leftside)
leftside
000000000000000004e7 00000000000000000503 00000000000000000681 
                   2                    4                    3 

Upvotes: 1

NelsonGon
NelsonGon

Reputation: 13319

A dplyr-tidyr alternative:

df %>% 
  tidyr::separate(V3, c("target", "non_target")) %>% 
   count(target)
# A tibble: 3 x 2
  target                   n
  <chr>                <int>
1 000000000000000004e7     2
2 00000000000000000503     4
3 00000000000000000681     3

With base:

table(sapply(strsplit(df$system, "_"),"[[", 1))

Data:

df <- structure(list(V1 = c("Row", "Row", "Row", "Row", "Row", "Row", 
"Row", "Row", "Row"), V2 = c("1:", "2:", "3:", "4:", "5:", "6:", 
"7:", "8:", "9:"), V3 = c("00000000000000000503_0", "00000000000000000503_1", 
"00000000000000000503_2", "00000000000000000503_3", "000000000000000004e7_0", 
"000000000000000004e7_1", "00000000000000000681_0", "00000000000000000681_1", 
"00000000000000000681_2")), class = "data.frame", row.names = c(NA, 
-9L))

Upvotes: 1

Ronak Shah
Ronak Shah

Reputation: 389235

Remove everything after underscore and use table to count frequency

table(sub("_.*", "", data$col1))
#Also
#table(sub("(.*)_.*", "\\1", data$col1))

#000000000000000004e7 00000000000000000503 00000000000000000681 
#                   2                    4                    3 

If final output needs to be a dataframe use stack

stack(table(sub("_.*", "", data$col1)))

#  values                  ind
#1      2 000000000000000004e7
#2      4 00000000000000000503
#3      3 00000000000000000681

data

data <- structure(list(col1 = structure(c(3L, 4L, 5L, 6L, 1L, 2L, 7L, 
8L, 9L), .Label = c("000000000000000004e7_0", "000000000000000004e7_1", 
"00000000000000000503_0", "00000000000000000503_1", 
"00000000000000000503_2", 
"00000000000000000503_3", "00000000000000000681_0", 
"00000000000000000681_1", 
"00000000000000000681_2"), class = "factor")), class = "data.frame", 
row.names = c(NA, -9L))

Upvotes: 1

Related Questions