Reputation: 55
Edited to add: Sample data is now present--thanks to @LMc
Hi, folks. I've got a dataframe like this, filtered from IMDb:
# 20-row sample of the filtered IMDb data, bound to `subgenres` so the
# snippets below run as-is. IMDb marks missing values with the literal text
# \N, which has to be written "\\N" in R source ("\N" is not a valid escape).
subgenres <- structure(list(tconst = c("tt0003419", "tt0003419", "tt0004013",
"tt0005231", "tt0005231", "tt0005615", "tt0005615", "tt0005772",
"tt0005951", "tt0005951", "tt0006434", "tt0006434", "tt0006554",
"tt0006820", "tt0007111", "tt0008826", "tt0010323", "tt0010323",
"tt0010323", "tt0010323"), primaryTitle = c("The Student of Prague",
"The Student of Prague", "The Ghost Breaker", "The Hound of the Baskervilles",
"The Hound of the Baskervilles", "Life Without Soul", "Life Without Soul",
"Mortmain", "Satan's Rhapsody", "Satan's Rhapsody", "Black Orchids",
"Black Orchids", "The Crimson Stain Mystery", "Homunculus, 1. Teil",
"A Night of Horror", "Alraune", "The Cabinet of Dr. Caligari",
"The Cabinet of Dr. Caligari", "The Cabinet of Dr. Caligari",
"The Cabinet of Dr. Caligari"), startYear = structure(c(-20819,
-20819, -20454, -20089, -20089, -20089, -20089, -20089, -19358,
-19358, -19358, -19358, -19724, -19724, -19358, -18628, -18263,
-18263, -18263, -18263), class = "Date"), runtimeMinutes = c("85",
"85", "60", "50", "50", "70", "70", "\\N", "55", "55", "50",
"50", "\\N", "69", "56", "80", "76", "76", "76", "76"), decade = c(1910,
1910, 1910, 1910, 1910, 1910, 1910, 1910, 1910, 1910, 1910, 1910,
1910, 1910, 1910, 1910, 1920, 1920, 1920, 1920), genre = c("Drama",
"Fantasy", "Adventure", "Mystery", "Crime", "Drama", "Sci-Fi",
"Drama", "Fantasy", "Drama", "Drama", "Drama", "Mystery", "Sci-Fi",
"Drama", "Sci-Fi", "Thriller", "Mystery", "Mystery", "Thriller"
), rating = c(6.5, 6.5, 5.2, 3.1, 3.1, 6.6, 6.6, 5.8, 6.8, 6.8,
4.8, 4.8, 6.9, 6.1, 6.1, 5.5, 8.1, 8.1, 8.1, 8.1), numVotes = c(2063,
2063, 36, 40, 40, 53, 53, 23, 719, 719, 18, 18, 18, 91, 20, 51,
62119, 62119, 62119, 62119)), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame"))
What I want to get is a dataframe of counts sorted by decade and genre, something like this:
decade | genre | count |
---|---|---|
1910 | Drama | 15 |
1920 | Drama | 27 |
1930 | Drama | 32 |
... | ... | ... |
1910 | Fantasy | 12 |
1920 | Fantasy | 23 |
1930 | Fantasy | 41 |
... | ... | ... |
...and so on, through each group. I've tried several things, including this:
# Attempt: group by decade and genre, then count the rows in each group
subgenres %>%
group_by(decade,genre) %>%
summarise(count=n())
But I get this error:
subgenres %>% group_by(decade) %>% summarise(count=n())
Error: 'format_error' is not an exported object from 'namespace:cli'
Error in count(., decade, genre) : object 'decade' not found
I can count each subgenres$genre, but I don't know how to nest groups and I'm not sure what my keyword search should be to do the research myself. I thought it was a type problem, but changes there didn't seem to work, either.
tibble [6,809 × 8] (S3: tbl_df/tbl/data.frame)
$ tconst : chr [1:6809] "tt0003419" "tt0003419" "tt0004013" "tt0005231" ...
$ primaryTitle : chr [1:6809] "The Student of Prague" "The Student of Prague" "The Ghost Breaker" "The Hound of the Baskervilles" ...
$ startYear : Date[1:6809], format: "1913-01-01" "1913-01-01" "1914-01-01" "1915-01-01" ...
$ runtimeMinutes: chr [1:6809] "85" "85" "60" "50" ...
$ decade : num [1:6809] 1910 1910 1910 1910 1910 1910 1910 1910 1910 1910 ...
$ genre : chr [1:6809] "Drama" "Fantasy" "Adventure" "Mystery" ...
$ rating : num [1:6809] 6.5 6.5 5.2 3.1 3.1 6.6 6.6 5.8 6.8 6.8 ...
$ numVotes : num [1:6809] 2063 2063 36 40 40 ...
Any insight is greatly appreciated!
Upvotes: 0
Views: 32
Reputation: 18662
Using your posted data and code the following both work:
library(dplyr)

# Count rows per decade/genre pair. `.groups = "drop"` returns an ungrouped
# tibble; without it the result stays grouped by `decade` and dplyr prints a
# "summarise() has grouped output by 'decade'" message.
subgenres %>%
  group_by(decade, genre) %>%
  summarise(count = n(), .groups = "drop")

# dplyr >= 1.1.0: per-operation grouping, so the result is never left grouped
subgenres %>%
  summarise(count = n(), .by = c(decade, genre))
When I insert the `+` into the code, I am able to somewhat replicate your error:
# Deliberately broken: the `+` after each pipe is what triggers the error that
# follows. R reads `+ group_by(...)` as a unary-plus call, so the pipe hands
# the data to `+` and group_by() is evaluated without a data frame.
df %>% +
group_by(decade,genre) %>% +
summarise(count=n())
Error in group_by(decade, genre) : object 'decade' not found
Output
decade genre count
<dbl> <chr> <int>
1 1910 Drama 7
2 1910 Fantasy 2
3 1910 Adventure 1
4 1910 Mystery 2
5 1910 Crime 1
6 1910 Sci-Fi 3
7 1920 Thriller 2
8 1920 Mystery 2
Data
# dput()-style literal of the 20-row sample; assign it to `subgenres` to run
# the code above. "\\N" is the escaped form of the literal text \N that
# appears in runtimeMinutes.
structure(list(tconst = c("tt0003419", "tt0003419", "tt0004013",
"tt0005231", "tt0005231", "tt0005615", "tt0005615", "tt0005772",
"tt0005951", "tt0005951", "tt0006434", "tt0006434", "tt0006554",
"tt0006820", "tt0007111", "tt0008826", "tt0010323", "tt0010323",
"tt0010323", "tt0010323"), primaryTitle = c("The Student of Prague",
"The Student of Prague", "The Ghost Breaker", "The Hound of the Baskervilles",
"The Hound of the Baskervilles", "Life Without Soul", "Life Without Soul",
"Mortmain", "Satan's Rhapsody", "Satan's Rhapsody", "Black Orchids",
"Black Orchids", "The Crimson Stain Mystery", "Homunculus, 1. Teil",
"A Night of Horror", "Alraune", "The Cabinet of Dr. Caligari",
"The Cabinet of Dr. Caligari", "The Cabinet of Dr. Caligari",
"The Cabinet of Dr. Caligari"), startYear = structure(c(-20819,
-20819, -20454, -20089, -20089, -20089, -20089, -20089, -19358,
-19358, -19358, -19358, -19724, -19724, -19358, -18628, -18263,
-18263, -18263, -18263), class = "Date"), runtimeMinutes = c("85",
"85", "60", "50", "50", "70", "70", "\\N", "55", "55", "50",
"50", "\\N", "69", "56", "80", "76", "76", "76", "76"), decade = c(1910,
1910, 1910, 1910, 1910, 1910, 1910, 1910, 1910, 1910, 1910, 1910,
1910, 1910, 1910, 1910, 1920, 1920, 1920, 1920), genre = c("Drama",
"Fantasy", "Adventure", "Mystery", "Crime", "Drama", "Sci-Fi",
"Drama", "Fantasy", "Drama", "Drama", "Drama", "Mystery", "Sci-Fi",
"Drama", "Sci-Fi", "Thriller", "Mystery", "Mystery", "Thriller"
), rating = c(6.5, 6.5, 5.2, 3.1, 3.1, 6.6, 6.6, 5.8, 6.8, 6.8,
4.8, 4.8, 6.9, 6.1, 6.1, 5.5, 8.1, 8.1, 8.1, 8.1), numVotes = c(2063,
2063, 36, 40, 40, 53, 53, 23, 719, 719, 18, 18, 18, 91, 20, 51,
62119, 62119, 62119, 62119)), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame"))
Upvotes: 1