Reputation: 347
I would like to produce a graph that shows the cumulative number of species I have recorded through time, preferably with ggplot
and geom_line
. So it would be something like: the first day I had one species, day 1 equals 1 species. 2nd day I detected 2 new species, day 2 should equal 3 species. If on day 5 I detected another new species, day five should equal 4 species, and so on.
It's probably an easy thing, but I couldn't find an example on the forum, probably because I am not using the right search terms. What I would like from my data is something like this (don't take the numbers as a reference: my total number of species is only 15, not 52 as in this plot!):
You can use this subset of my dataset as a reproducible example:
# Reproducible example data: a tibble (classes "tbl_df", "tbl", "data.frame")
# with 108 rows and two columns.
#   date        - Date column, written here as the underlying numeric day
#                 counts with class "Date".
#   speciesname - character column holding the species recorded on that row.
# Rows are not in chronological order, and the same species appears on many
# dates.
dataset <- structure(list(date = structure(c(18496, 18496, 18497, 18497,
18498, 18498, 18498, 18499, 18499, 18500, 18501, 18504, 18505,
18505, 18506, 18506, 18506, 18506, 18506, 18507, 18507, 18507,
18507, 18507, 18509, 18483, 18484, 18484, 18486, 18486, 18487,
18487, 18488, 18488, 18489, 18490, 18490, 18492, 18493, 18494,
18519, 18519, 18520, 18518, 18518, 18518, 18496, 18496, 18496,
18499, 18500, 18501, 18504, 18506, 18506, 18506, 18506, 18508,
18484, 18486, 18490, 18491, 18491, 18492, 18494, 18495, 18482,
18484, 18486, 18486, 18486, 18486, 18496, 18498, 18499, 18501,
18504, 18505, 18507, 18507, 18509, 18482, 18483, 18484, 18486,
18487, 18488, 18488, 18488, 18489, 18489, 18490, 18492, 18493,
18495, 18498, 18499, 18504, 18504, 18504, 18505, 18505, 18506,
18506, 18507, 18507, 18507, 18507), class = "Date"), speciesname = c("Grey Heron",
"Grey Heron", "Common Sandpiper", "Grey Heron", "Green Sandpiper",
"Grey Heron", "Greater Flamingo", "Grey Heron", "Night Heron",
"Night Heron", "Common Sandpiper", "Grey Heron", "Common Sandpiper",
"Grey Heron", "Grey Heron", "Grey Heron", "Grey Heron", "Grey Heron",
"Dotterel", "Grey Heron", "Grey Heron", "Ortolan Bunting", "Ortolan Bunting",
"Spotted Flycatcher", "Grey Heron", "Grey Heron", "Grey Heron",
"Nightjar", "Grey Heron", "Green Sandpiper", "Common Sandpiper",
"Grey Heron", "Grey Heron", "Grey Heron", "Snipe", "Common Sandpiper",
"Common Sandpiper", "Grey Heron", "Night Heron", "Dunlin", "Snipe",
"Grey Heron", "Grey Heron", "Grey Heron", "Grey Heron", "Grey Heron",
"Night Heron", "Night Heron", "Grey Heron", "Green Sandpiper",
"Robin", "Green Sandpiper", "Ortolan Bunting", "Grey Heron",
"Dotterel", "Whimbrel", "Dotterel", "Grey Heron", "Common Sandpiper",
"Grey Heron", "Common Sandpiper", "Grey Heron", "Night Heron",
"Night Heron", "Green Sandpiper", "Little Bittern", "Grey Heron",
"Common Sandpiper", "Grey Heron", "Grey Heron", "Night Heron",
"Grey Heron", "Ortolan Bunting", "Common Sandpiper", "Grey Heron",
"Night Heron", "Night Heron", "Grey Heron", "Grey Heron", "Grey Heron",
"Grey Heron", "Common Sandpiper", "Grey Heron", "Grey Heron",
"Grey Heron", "Western Swamphen", "Common Sandpiper", "Grey Heron",
"Night Heron", "Grey Heron", "Whimbrel", "Greater Flamingo",
"Night Heron", "Grey Heron", "Grey Heron", "Grey Heron", "Grey Heron",
"Night Heron", "Ortolan Bunting", "Night Heron", "Night Heron",
"Night Heron", "Common Sandpiper", "Grey Heron", "Night Heron",
"Grey Heron", "Night Heron", "Grey Heron")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -108L))
Thanks a lot!!
Upvotes: 1
Views: 252
Reputation: 4184
Base R with env as dict:
# Put the records in chronological order so that "first time a species is
# seen" follows the calendar rather than the original row order.
dataset <- dataset[order(dataset$date), ]

# Running number of distinct species seen so far, one value per record.
# `duplicated()` is FALSE the first time a name occurs and TRUE afterwards, so
# the cumulative sum of its negation goes up by one at every first detection
# and stays flat on repeat sightings. This replaces a row-by-row loop that kept
# the already-seen names in an environment and listed it on every iteration.
# The result is stored as double, matching the column the loop produced.
dataset$speciesname_u <- as.numeric(cumsum(!duplicated(dataset$speciesname)))
# Pass the date column through lubridate's year-month-day parser before
# plotting on a date axis (the example data already stores it as Date).
dataset$date <- lubridate::ymd(dataset$date)
library(ggplot2)
# Line and points of the running species count, one point per record.
ggplot(dataset, aes(x = date, y = speciesname_u)) +
  geom_line() +
  geom_point() +
  # `date_breaks` is the documented argument for a break-width string such as
  # "10 days"; `breaks` is documented for explicit positions or a function.
  scale_x_date(date_labels = "%d-%b-%Y", date_breaks = "10 days") +
  theme_bw()
# Or aggregate by day and take the maximum: `summarise()` collapses the data
# to one row per date holding that day's highest running count, so each date
# is drawn exactly once instead of once per record.
library(dplyr)
dataset %>%
  group_by(date) %>%
  summarise(spec_u = max(speciesname_u)) %>%
  ggplot(aes(x = date, y = spec_u)) +
  geom_line() +
  geom_point() +
  # `date_breaks` is the documented argument for a break-width string.
  scale_x_date(date_labels = "%d-%b-%Y", date_breaks = "10 days") +
  theme_bw()
Upvotes: 1
Reputation: 39613
Try this approach. First, you can use dplyr
functions to count the number of species on each day, and after that you can compute the cumulative sum across days using cumsum()
with mutate()
. The good thing is that you can integrate the whole process into one pipeline with dplyr
and ggplot2
. Here is the code using the data you provided:
library(ggplot2)
library(dplyr)
# Code for data process
# Cumulative number of distinct species detected up to each observation day.
# Counting rows per day with n() would accumulate records, not species, so
# only the first record of each species is counted.
dataset %>%
  # Chronological order, so the first row of a species is its first detection
  arrange(date) %>%
  # TRUE only on the first record of each species
  mutate(new_species = !duplicated(speciesname)) %>%
  group_by(date) %>%
  # Number of species seen for the first time on each day (0 if none)
  summarise(N = sum(new_species)) %>%
  ungroup() %>%
  mutate(Cum = cumsum(N)) %>%
  # Plot
  ggplot(aes(x = date, y = Cum, group = 1)) +
  geom_line() +
  geom_point() +
  # `date_breaks` is the documented argument for a break-width string
  scale_x_date(
    date_labels = "%d-%m-%Y",
    date_breaks = "3 days",
    expand = c(0.01, 0)
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90))
Output:
And if more customization is needed:
# Code for data process 2
# Same cumulative species curve with bold axis text and a descriptive y label.
# Only the first record of each species is counted, so the curve matches the
# "cumulative number of species" label instead of accumulating records.
dataset %>%
  # Chronological order, so the first row of a species is its first detection
  arrange(date) %>%
  # TRUE only on the first record of each species
  mutate(new_species = !duplicated(speciesname)) %>%
  group_by(date) %>%
  # Number of species seen for the first time on each day (0 if none)
  summarise(N = sum(new_species)) %>%
  ungroup() %>%
  mutate(Cum = cumsum(N)) %>%
  # Plot
  ggplot(aes(x = date, y = Cum, group = 1)) +
  geom_line() +
  geom_point() +
  # `date_breaks` is the documented argument for a break-width string
  scale_x_date(
    date_labels = "%d-%b-%Y",
    date_breaks = "3 days",
    expand = c(0.01, 0)
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90),
    axis.text = element_text(color = "black", face = "bold"),
    axis.title = element_text(color = "black", face = "bold")
  ) +
  ylab("Cumulative number of species discovered in XYZ place")
Output:
For unique number of species each day we can use n_distinct()
:
# Code for data process 3
# Cumulative species curve built with n_distinct(). Taking n_distinct() over
# all of a day's records and then accumulating would count a species again on
# every day it is seen, so it is applied only to first detections.
dataset %>%
  # Chronological order, so the first row of a species is its first detection
  arrange(date) %>%
  # TRUE only on the first record of each species
  mutate(new_species = !duplicated(speciesname)) %>%
  group_by(date) %>%
  # Distinct species first detected on each day (0 when there are none)
  summarise(N = n_distinct(speciesname[new_species])) %>%
  ungroup() %>%
  mutate(Cum = cumsum(N)) %>%
  # Plot
  ggplot(aes(x = date, y = Cum, group = 1)) +
  geom_line() +
  geom_point() +
  # `date_breaks` is the documented argument for a break-width string
  scale_x_date(
    date_labels = "%d-%b-%Y",
    date_breaks = "3 days",
    expand = c(0.01, 0)
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90),
    axis.text = element_text(color = "black", face = "bold"),
    axis.title = element_text(color = "black", face = "bold")
  ) +
  ylab("Cumulative number of species discovered in XYZ place")
Output:
Upvotes: 1
Reputation: 378
If you want to find the cumulative sum of each species then you can do this:-
# Records per species per day, followed by a running total of those records
# within each species. The result stays grouped by speciesname.
df <- dataset %>%
  group_by(date, speciesname) %>%
  summarise(number_of_species = n()) %>%
  # Regroup by species alone so the running total is computed per species
  group_by(speciesname) %>%
  mutate(cumsum = cumsum(number_of_species))
# A tibble: 67 x 4
# Groups: speciesname [15]
date speciesname number_of_species cumsum
<date> <chr> <int> <int>
1 2020-08-08 Common Sandpiper 1 1
2 2020-08-08 Grey Heron 1 1
3 2020-08-09 Grey Heron 2 3
4 2020-08-10 Common Sandpiper 2 3
5 2020-08-10 Grey Heron 2 5
6 2020-08-10 Nightjar 1 1
7 2020-08-12 Green Sandpiper 1 1
8 2020-08-12 Grey Heron 6 11
9 2020-08-12 Night Heron 1 1
10 2020-08-13 Common Sandpiper 1 4
Then time to plot:-
# One coloured line (with points) per species, showing its running record
# count over time.
ggplot(
  df,
  aes(x = date, y = cumsum, group = speciesname, colour = speciesname)
) +
  geom_line() +
  geom_point() +
  labs(
    title = "Cumulative sum of species",
    subtitle = "From 08/08/2020 to 15/09/2020"
  )
Which can give you this:-
Upvotes: 0