Reputation: 21
The data frame is like this: enter image description here
header: system
Row 1: 00000000000000000503_0
Row 2: 00000000000000000503_1
Row 3: 00000000000000000503_2
Row 4: 00000000000000000503_3
Row 5: 000000000000000004e7_0
Row 6: 000000000000000004e7_1
Row 7: 00000000000000000681_0
Row 8: 00000000000000000681_1
Row 9: 00000000000000000681_2
I want to generate a frequency table with the quantities of the code before string "_" such that:
"00000000000000000503" appears 4 times, "000000000000000004e7" appears 2 times, and so on.
How do I do this in R?
Upvotes: 1
Views: 251
Reputation: 887831
An option with str_remove
and group_by
library(stringr)
library(dplyr)
df %>%
group_by(V3 = str_remove(V3, "_\\d+$")) %>%
summarise(n = n())
# A tibble: 3 x 2
# V3 n
# <chr> <int>
#1 000000000000000004e7 2
#2 00000000000000000503 4
#3 00000000000000000681 3
Or in base R
with table
and trimws
table(trimws(df$V3, whitespace = "_[0-9]+"))
df <- structure(list(V1 = c("Row", "Row", "Row", "Row", "Row", "Row",
"Row", "Row", "Row"), V2 = c("1:", "2:", "3:", "4:", "5:", "6:",
"7:", "8:", "9:"), V3 = c("00000000000000000503_0", "00000000000000000503_1",
"00000000000000000503_2", "00000000000000000503_3", "000000000000000004e7_0",
"000000000000000004e7_1", "00000000000000000681_0", "00000000000000000681_1",
"00000000000000000681_2")), class = "data.frame", row.names = c(NA,
-9L))
Upvotes: 1
Reputation: 68
A tidyverse answer would be
my_data <- mydata %>%
mutate_if(is.factor, as.character) %>%
mutate(system = gsub('_[^_]*$', '', system)) %>%
group_by(system) %>%
count() %>%
ungroup()
my_data
Upvotes: 1
Reputation: 2250
Another option using the stringr
library that is included in tidyverse
> library(tidyverse)
> mydata <- data.frame(system = c("00000000000000000503_0",
"00000000000000000503_1",
"00000000000000000503_2",
"00000000000000000503_3",
"000000000000000004e7_0",
"000000000000000004e7_1",
"00000000000000000681_0",
"00000000000000000681_1",
"00000000000000000681_2"))
> mydata
system
1 00000000000000000503_0
2 00000000000000000503_1
3 00000000000000000503_2
4 00000000000000000503_3
5 000000000000000004e7_0
6 000000000000000004e7_1
7 00000000000000000681_0
8 00000000000000000681_1
9 00000000000000000681_2
> # Split data using str_split
> mydata$leftside <- sapply(mydata$system, function(x) unlist(str_split(x, "_"))[1]) #split string by the "_" and take first piece
> mydata$rightside <- sapply(mydata$system, function(x) unlist(str_split(x, "_"))[2]) #split string by the "_" and take second piece
>
> mydata
system leftside rightside
1 00000000000000000503_0 00000000000000000503 0
2 00000000000000000503_1 00000000000000000503 1
3 00000000000000000503_2 00000000000000000503 2
4 00000000000000000503_3 00000000000000000503 3
5 000000000000000004e7_0 000000000000000004e7 0
6 000000000000000004e7_1 000000000000000004e7 1
7 00000000000000000681_0 00000000000000000681 0
8 00000000000000000681_1 00000000000000000681 1
9 00000000000000000681_2 00000000000000000681 2
> # alternative tabulate fuction than base::table(). Can Provide nicer options.
> xtabs(data = mydata, formula = ~leftside)
leftside
000000000000000004e7 00000000000000000503 00000000000000000681
2 4 3
Upvotes: 1
Reputation: 13319
A dplyr
-tidyr
alternative:
df %>%
tidyr::separate(V3, c("target", "non_target")) %>%
count(target)
# A tibble: 3 x 2
target n
<chr> <int>
1 000000000000000004e7 2
2 00000000000000000503 4
3 00000000000000000681 3
With base
:
table(sapply(strsplit(df$system, "_"),"[[", 1))
Data:
df <- structure(list(V1 = c("Row", "Row", "Row", "Row", "Row", "Row",
"Row", "Row", "Row"), V2 = c("1:", "2:", "3:", "4:", "5:", "6:",
"7:", "8:", "9:"), V3 = c("00000000000000000503_0", "00000000000000000503_1",
"00000000000000000503_2", "00000000000000000503_3", "000000000000000004e7_0",
"000000000000000004e7_1", "00000000000000000681_0", "00000000000000000681_1",
"00000000000000000681_2")), class = "data.frame", row.names = c(NA,
-9L))
Upvotes: 1
Reputation: 389235
Remove everything after underscore and use table
to count frequency
table(sub("_.*", "", data$col1))
#Also
#table(sub("(.*)_.*", "\\1", data$col1))
#000000000000000004e7 00000000000000000503 00000000000000000681
# 2 4 3
If final output needs to be a dataframe use stack
stack(table(sub("_.*", "", data$col1)))
# values ind
#1 2 000000000000000004e7
#2 4 00000000000000000503
#3 3 00000000000000000681
data
data <- structure(list(col1 = structure(c(3L, 4L, 5L, 6L, 1L, 2L, 7L,
8L, 9L), .Label = c("000000000000000004e7_0", "000000000000000004e7_1",
"00000000000000000503_0", "00000000000000000503_1",
"00000000000000000503_2",
"00000000000000000503_3", "00000000000000000681_0",
"00000000000000000681_1",
"00000000000000000681_2"), class = "factor")), class = "data.frame",
row.names = c(NA, -9L))
Upvotes: 1