Reputation: 21
I am reading the book Text Mining with R: A Tidy Approach by Julia Silge & David Robinson to try to find the difference between two works, and not the three in the original book, how can I draw a similar graph with ggplot?
In the original book:
austen <- austen_books() %>%
select(-book) %>%
mutate(author = "Jane Austen")
bronte <- gutenberg_download(c(1260, 768, 969, 9182, 767)) %>%
select(-gutenberg_id) %>%
mutate(author = "Brontë Sisters")
hgwells <- gutenberg_download(c(35, 36, 5230, 159)) %>%
select(-gutenberg_id) %>%
mutate(author = "H.G. Wells")
comparison_df <- books %>%
add_count(author, wt = n, name = "total_word") %>%
mutate(proportion = n / total_word) %>%
select(-total_word, -n) %>%
pivot_wider(names_from = author, values_from = proportion,
values_fill = list(proportion = 0)) %>%
pivot_longer(3:4, names_to = "other", values_to = "proportion")
comparison_df
#> # A tibble: 56,002 x 4
#> word `Jane Austen` other proportion
#> <chr> <dbl> <chr> <dbl>
#> 1 miss 0.00855 Brontë Sisters 0.00342
#> 2 miss 0.00855 H.G. Wells 0.000120
#> 3 time 0.00615 Brontë Sisters 0.00424
#> 4 time 0.00615 H.G. Wells 0.00682
#> 5 fanny 0.00449 Brontë Sisters 0.0000438
#> 6 fanny 0.00449 H.G. Wells 0
#> # ... with 5.6e+04 more rows
But what if I just want to compare two works?Just like austen and bronte.
comparison_df %>%
filter(proportion > 1 / 1e5) %>%
ggplot(aes(proportion, `Jane Austen`)) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(aes(color = abs(`Jane Austen` - proportion)),
alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
scale_x_log10(labels = label_percent()) +
scale_y_log10(labels = label_percent()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
facet_wrap(~ other) +
guides(color = FALSE)
How can I modify the code above here?
Upvotes: 1
Views: 72
Reputation: 76402
Here is a complete, reproducible solution. The books data wrangling code is a copy&paste of or based on Text Mining with R: A Tidy Approach, Julia Silge & David Robinson.
suppressPackageStartupMessages({
library(dplyr)
library(tidyr)
library(tidytext)
library(stringr)
library(gutenbergr)
library(janeaustenr)
library(ggplot2)
library(scales)
})
data(stop_words)
austen <- austen_books() %>%
select(-book)
bronte <- gutenberg_download(c(1260, 768, 969, 9182, 767)) %>%
select(-gutenberg_id)
#> Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
#> Using mirror http://aleph.gutenberg.org
hgwells <- gutenberg_download(c(35, 36, 5230, 159)) %>%
select(-gutenberg_id)
bind_rows(
austen %>% mutate(author = "Jane Austen"),
bronte %>% mutate(author = "Brontë Sisters"),
hgwells %>% mutate(author = "H.G. Wells")
) %>%
unnest_tokens(word, text) %>%
anti_join(stop_words, by = "word") %>%
mutate(word = str_extract(word, "[a-z']+")) %>%
count(author, word, sort = TRUE) %>%
add_count(author, name = "total_word") %>%
mutate(proportion = n / total_word) %>%
select(-total_word, -n) %>%
pivot_wider(names_from = author, values_from = proportion,
values_fill = list(proportion = 0)) %>%
pivot_longer(3:4, names_to = "other", values_to = "proportion") %>%
#
# to filter author solves the question's problem
# also filter Jane Austen's values to avoid warnings, log10 was giving
# Warning messages:
# 1: Transformation introduced infinite values in continuous y-axis
# 2: Transformation introduced infinite values in continuous y-axis
# 3: Removed 18761 rows containing missing values (geom_point).
#
# I have separated the filters to make the code clearer
# but they can be combined as only one
#
filter(proportion > 1/1e5, `Jane Austen` > 1/1e5) %>%
filter(other == "Brontë Sisters") %>%
#
ggplot(aes(proportion, `Jane Austen`)) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(aes(color = abs(`Jane Austen` - proportion)),
alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
scale_x_log10(labels = label_percent()) +
scale_y_log10(labels = label_percent()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
xlab(label = "Brontë Sisters") +
guides(color = "none")
#> Warning: Removed 1 rows containing missing values (geom_text).
Created on 2022-05-14 by the reprex package (v2.0.1)
Upvotes: 3