Reputation: 435
I want to get a data.frame like the one below, but including all years for every topic. The one I made counts the number of items per year for each topic, but when a topic has no items in some year, no row is created for that topic/year combination, so the line is blank there in the final graph. Could anyone please tell me how to add the missing years with Count == 0 for the topics that have no value?
# Example data (dput output): item counts per topic and year.
#   Topic - factor with 12 levels; the levels are in alphabetical order
#           ("Topic 1", "Topic 10", "Topic 11", "Topic 12", "Topic 2", ...).
#   Year  - factor with levels "2011" to "2019".
#   Count - integer number of items for that Topic/Year pair.
# The frame has 96 rows rather than 12 * 9 = 108: Topic/Year combinations
# without any item are absent (e.g. "Topic 10" has no 2011 or 2014 row).
dtd2 <- structure(list(Topic = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L,
11L, 11L, 11L, 11L, 11L, 12L, 12L, 12L, 12L, 12L, 12L, 12L), .Label = c("Topic 1",
"Topic 10", "Topic 11", "Topic 12", "Topic 2", "Topic 3", "Topic 4",
"Topic 5", "Topic 6", "Topic 7", "Topic 8", "Topic 9"), class = "factor"),
Year = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 2L,
3L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 1L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 6L, 7L, 8L,
9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 2L, 3L, 4L, 5L, 6L, 7L, 8L), .Label = c("2011",
"2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019"
), class = "factor"), Count = c(3L, 3L, 3L, 5L, 5L, 11L,
17L, 14L, 4L, 1L, 1L, 4L, 2L, 3L, 9L, 4L, 2L, 1L, 3L, 4L,
5L, 18L, 23L, 19L, 15L, 1L, 5L, 6L, 8L, 11L, 17L, 7L, 1L,
3L, 6L, 4L, 20L, 21L, 18L, 12L, 3L, 1L, 1L, 2L, 5L, 5L, 11L,
5L, 2L, 1L, 1L, 2L, 2L, 5L, 7L, 23L, 9L, 1L, 1L, 2L, 3L,
6L, 4L, 9L, 8L, 1L, 1L, 6L, 2L, 3L, 3L, 1L, 3L, 2L, 5L, 7L,
11L, 11L, 28L, 11L, 2L, 1L, 2L, 2L, 5L, 6L, 5L, 16L, 3L,
4L, 2L, 2L, 7L, 6L, 8L, 6L)), row.names = c(NA, -96L), class = "data.frame")
# Timeline of item counts: one coloured line with points per topic.
# Topic/Year combinations that have no row in dtd2 are simply not drawn.
ggplot(
  data = dtd2,
  mapping = aes(x = Year, y = Count, colour = Topic, group = Topic)
) +
  geom_point() +
  geom_line() +
  labs(x = "Year", y = NULL, title = "Timeline")
Upvotes: 1
Views: 286
Reputation: 4150
A time series approach could be
library(tidyverse)
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following object is masked from 'package:base':
#>
#> date
library(tsibble)
#>
#> Attaching package: 'tsibble'
#> The following objects are masked from 'package:lubridate':
#>
#> interval, new_interval
#> The following object is masked from 'package:dplyr':
#>
#> id
# Example data from the question (dput output), repeated so that this
# snippet runs on its own.
#   Topic - factor with 12 levels ("Topic 1", "Topic 10", ..., "Topic 9").
#   Year  - factor with levels "2011" to "2019".
#   Count - integer number of items for that Topic/Year pair.
# Only 96 of the 12 * 9 = 108 Topic/Year combinations are present.
dtd2 <- structure(list(Topic = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L,
11L, 11L, 11L, 11L, 11L, 12L, 12L, 12L, 12L, 12L, 12L, 12L), .Label = c("Topic 1",
"Topic 10", "Topic 11", "Topic 12", "Topic 2", "Topic 3", "Topic 4",
"Topic 5", "Topic 6", "Topic 7", "Topic 8", "Topic 9"), class = "factor"),
Year = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 2L,
3L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 1L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 6L, 7L, 8L,
9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 2L, 3L, 4L, 5L, 6L, 7L, 8L), .Label = c("2011",
"2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019"
), class = "factor"), Count = c(3L, 3L, 3L, 5L, 5L, 11L,
17L, 14L, 4L, 1L, 1L, 4L, 2L, 3L, 9L, 4L, 2L, 1L, 3L, 4L,
5L, 18L, 23L, 19L, 15L, 1L, 5L, 6L, 8L, 11L, 17L, 7L, 1L,
3L, 6L, 4L, 20L, 21L, 18L, 12L, 3L, 1L, 1L, 2L, 5L, 5L, 11L,
5L, 2L, 1L, 1L, 2L, 2L, 5L, 7L, 23L, 9L, 1L, 1L, 2L, 3L,
6L, 4L, 9L, 8L, 1L, 1L, 6L, 2L, 3L, 3L, 1L, 3L, 2L, 5L, 7L,
11L, 11L, 28L, 11L, 2L, 1L, 2L, 2L, 5L, 6L, 5L, 16L, 3L,
4L, 2L, 2L, 7L, 6L, 8L, 6L)), row.names = c(NA, -96L), class = "data.frame")
# Build a complete Topic x year table of counts with tsibble.
# Result: a tibble with columns Topic, year (factor) and Count (integer) in
# which every Topic has one row per year; Topic/year combinations that are
# missing from dtd2 get Count = 0.
tsibble2 <- dtd2 %>%
  # Year is a factor: go through character to recover 2011, 2012, ... instead
  # of the underlying integer codes 1, 2, ...
  mutate(Year = as.integer(as.character(Year))) %>%
  # Use the integer year itself as the index so the series advances one year
  # per step. Jan-1 dates are 365 or 366 days apart, so a Date index would be
  # treated as daily data and gap-filled day by day.
  as_tsibble(index = Year, key = Topic) %>%
  # .full = TRUE pads every Topic to the overall first and last year, not just
  # to that Topic's own range; the inserted rows get Count = 0L.
  fill_gaps(Count = 0L, .full = TRUE) %>%
  as_tibble() %>%
  # A factor year gives a discrete x axis with one tick per year.
  transmute(Topic, year = factor(Year), Count)
# Plot the gap-filled counts: one coloured line with points per topic,
# with the year factor on the x axis.
ggplot(
  data = tsibble2,
  mapping = aes(x = year, y = Count, color = Topic, group = Topic)
) +
  geom_line() +
  geom_point()
Created on 2020-01-08 by the reprex package (v0.3.0)
Upvotes: 1
Reputation: 388982
We can use `complete` from `tidyr` to add the missing years and fill their `Count` values with 0.
# Cross every Topic with every Year that occurs in the data; combinations
# that have no row in dtd2 are added with Count filled in as 0 instead of NA.
tidyr::complete(
  dtd2,
  Topic,
  Year = unique(Year),
  fill = list(Count = 0)
)
#A tibble: 108 x 3
# Topic Year Count
# <fct> <fct> <dbl>
# 1 Topic 1 2011 3
# 2 Topic 1 2012 3
# 3 Topic 1 2013 3
# 4 Topic 1 2014 5
# 5 Topic 1 2015 5
# 6 Topic 1 2016 11
# 7 Topic 1 2017 17
# 8 Topic 1 2018 14
# 9 Topic 1 2019 4
#10 Topic 10 2011 0
# … with 98 more rows
and then pass the completed data to `ggplot2`, so that the lines are connected across the years that were missing:
library(ggplot2)
# Complete the Topic x Year grid, then plot it.
# Topic and Year are factors, so complete() expands over all of their levels;
# combinations without a row in dtd2 are added with Count = 0L (integer, so
# the Count column keeps its type).
# The result is stored in a variable instead of being piped: only ggplot2 is
# attached here, and ggplot2 does not provide `%>%`.
dtd2_complete <- tidyr::complete(dtd2, Topic, Year, fill = list(Count = 0L))

ggplot(
  dtd2_complete,
  aes(x = Year, y = Count, colour = Topic, group = Topic)
) +
  geom_point() +
  geom_line() +
  labs(x = "Year", y = NULL, title = "Timeline")
Upvotes: 2
Reputation: 887153
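We can use `expand` from `tidyr` to build every Topic/Year combination, join the original counts back on, and replace the missing counts with 0. The `Topic` levels are also put in natural order ("Topic 2" before "Topic 10") with `gtools::mixedsort`: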
library(dplyr)
library(tidyr)
library(ggplot2)
# Put the Topic levels in natural order ("Topic 1", "Topic 2", ...,
# "Topic 12") before anything else, so that both sides of the join below
# carry the same factor levels and the ordering survives into the legend.
dtd2_ordered <- dtd2 %>%
  mutate(Topic = factor(Topic, levels = gtools::mixedsort(levels(Topic))))

dtd2_ordered %>%
  # All Topic x Year combinations (factors expand over their full level set).
  expand(Topic, Year) %>%
  # Bring the observed counts back; combinations with no items get NA.
  # Keys are spelled out so the join does not depend on column-name guessing.
  left_join(dtd2_ordered, by = c("Topic", "Year")) %>%
  # Missing combination means zero items; 0L keeps Count an integer column.
  mutate(Count = replace_na(Count, 0L)) %>%
  ggplot(aes(x = Year, y = Count, colour = Topic, group = Topic)) +
  geom_point() +
  geom_line() +
  labs(x = "Year", y = NULL, title = "Timeline")
Upvotes: 1