Reputation: 347

Cumulative sum of a qualitative variable in ggplot

I would like to plot the cumulative number of species I have recorded through time, preferably with ggplot and geom_line. For example: on the first day I recorded one species, so day 1 equals 1 species. On the 2nd day I detected 2 new species, so day 2 should equal 3 species. If on day 5 I detected another new species, day 5 should equal 4 species, and so on.

It's probably an easy thing, but I couldn't find an example in the forum, probably because I am not using the right search terms. What I would like to get from my data is something like this (don't take the numbers as a reference; my total number of species is only 15, not 52 like in this plot!):

You can use this subset of my dataset as a reproducible example:

# Reproducible sample data (dput output): a 108-row tibble with two columns,
# `date` (class "Date", stored as day counts) and `speciesname` (character).
# Rows are not in chronological order.
dataset <- structure(list(date = structure(c(18496, 18496, 18497, 18497, 
18498, 18498, 18498, 18499, 18499, 18500, 18501, 18504, 18505, 
18505, 18506, 18506, 18506, 18506, 18506, 18507, 18507, 18507, 
18507, 18507, 18509, 18483, 18484, 18484, 18486, 18486, 18487, 
18487, 18488, 18488, 18489, 18490, 18490, 18492, 18493, 18494, 
18519, 18519, 18520, 18518, 18518, 18518, 18496, 18496, 18496, 
18499, 18500, 18501, 18504, 18506, 18506, 18506, 18506, 18508, 
18484, 18486, 18490, 18491, 18491, 18492, 18494, 18495, 18482, 
18484, 18486, 18486, 18486, 18486, 18496, 18498, 18499, 18501, 
18504, 18505, 18507, 18507, 18509, 18482, 18483, 18484, 18486, 
18487, 18488, 18488, 18488, 18489, 18489, 18490, 18492, 18493, 
18495, 18498, 18499, 18504, 18504, 18504, 18505, 18505, 18506, 
18506, 18507, 18507, 18507, 18507), class = "Date"), speciesname = c("Grey Heron", 
"Grey Heron", "Common Sandpiper", "Grey Heron", "Green Sandpiper", 
"Grey Heron", "Greater Flamingo", "Grey Heron", "Night Heron", 
"Night Heron", "Common Sandpiper", "Grey Heron", "Common Sandpiper", 
"Grey Heron", "Grey Heron", "Grey Heron", "Grey Heron", "Grey Heron", 
"Dotterel", "Grey Heron", "Grey Heron", "Ortolan Bunting", "Ortolan Bunting", 
"Spotted Flycatcher", "Grey Heron", "Grey Heron", "Grey Heron", 
"Nightjar", "Grey Heron", "Green Sandpiper", "Common Sandpiper", 
"Grey Heron", "Grey Heron", "Grey Heron", "Snipe", "Common Sandpiper", 
"Common Sandpiper", "Grey Heron", "Night Heron", "Dunlin", "Snipe", 
"Grey Heron", "Grey Heron", "Grey Heron", "Grey Heron", "Grey Heron", 
"Night Heron", "Night Heron", "Grey Heron", "Green Sandpiper", 
"Robin", "Green Sandpiper", "Ortolan Bunting", "Grey Heron", 
"Dotterel", "Whimbrel", "Dotterel", "Grey Heron", "Common Sandpiper", 
"Grey Heron", "Common Sandpiper", "Grey Heron", "Night Heron", 
"Night Heron", "Green Sandpiper", "Little Bittern", "Grey Heron", 
"Common Sandpiper", "Grey Heron", "Grey Heron", "Night Heron", 
"Grey Heron", "Ortolan Bunting", "Common Sandpiper", "Grey Heron", 
"Night Heron", "Night Heron", "Grey Heron", "Grey Heron", "Grey Heron", 
"Grey Heron", "Common Sandpiper", "Grey Heron", "Grey Heron", 
"Grey Heron", "Western Swamphen", "Common Sandpiper", "Grey Heron", 
"Night Heron", "Grey Heron", "Whimbrel", "Greater Flamingo", 
"Night Heron", "Grey Heron", "Grey Heron", "Grey Heron", "Grey Heron", 
"Night Heron", "Ortolan Bunting", "Night Heron", "Night Heron", 
"Night Heron", "Common Sandpiper", "Grey Heron", "Night Heron", 
"Grey Heron", "Night Heron", "Grey Heron")), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -108L))

Thanks a lot!!

Upvotes: 1

Answers (3)

polkas

Reputation: 4184

Base R with env as dict:


# Sort chronologically so that "first time seen" means the earliest date.
dataset <- dataset[order(dataset$date), ]

# Environment used as a hash set (map/dict) of the species seen so far.
finda <- new.env(hash = TRUE, parent = emptyenv())

# Work on plain vectors: pulling the column out once and filling a
# preallocated result avoids indexing the tibble cell by cell in the loop.
species <- as.character(dataset$speciesname)
species_count <- numeric(length(species))
cc <- 0 # running number of distinct species

for (i in seq_along(species)) {
  ss <- species[[i]]
  # exists() is a direct lookup in the environment, whereas
  # `ss %in% ls(finda)` lists and sorts every stored key on each iteration.
  # inherits = FALSE restricts the lookup to `finda` itself.
  if (!exists(ss, envir = finda, inherits = FALSE)) {
    assign(ss, TRUE, envir = finda)
    cc <- cc + 1
  }
  species_count[i] <- cc
}

# Cumulative number of distinct species recorded up to and including each row.
# Vectorised equivalent: cumsum(!duplicated(dataset$speciesname))
dataset$speciesname_u <- species_count

library(ggplot2)
library(dplyr)

# Should be a no-op when `date` is already of class Date (as in the sample
# data); kept so the column is guaranteed to suit scale_x_date().
dataset$date <- lubridate::ymd(dataset$date)

# Variant 1: one point per record, using the running count stored on each row.
ggplot(dataset, aes(x = date, y = speciesname_u)) +
  geom_line() +
  geom_point() +
  scale_x_date(date_labels = "%d-%b-%Y", breaks = "10 days") +
  theme_bw()

# Variant 2: aggregate to one row per day and keep the day's maximum.
# summarise() (rather than mutate()) actually collapses the rows, so each day
# is drawn once instead of once per record.
dataset %>%
  group_by(date) %>%
  summarise(spec_u = max(speciesname_u)) %>%
  ungroup() %>%
  ggplot(aes(x = date, y = spec_u)) +
  geom_line() +
  geom_point() +
  scale_x_date(date_labels = "%d-%b-%Y", breaks = "10 days") +
  theme_bw()

Upvotes: 1

Duck

Reputation: 39613

Try this approach. You can first use dplyr functions to count the number of species records for each day, and after that compute the cumulative sum across days using cumsum() inside mutate(). The good thing is that you can integrate the whole process in one pipeline with dplyr and ggplot2. Here is the code using the data you provided:

library(ggplot2)
library(dplyr)
# Data processing and plot in a single pipeline
dataset %>%
  # Number of rows (records) logged on each date, one row per date
  count(date, name = "N") %>%
  # Running total of those daily counts
  mutate(Cum = cumsum(N)) %>%
  # Plot: group = 1 joins all dates into a single line
  ggplot(aes(x = date, y = Cum, group = 1)) +
  geom_line() +
  geom_point() +
  scale_x_date(
    date_labels = "%d-%m-%Y",
    breaks = "3 days",
    expand = c(0.01, 0)
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90))

Output:

And if more customization is needed:

# Data processing and plot, with extra axis styling
dataset %>%
  # Number of rows (records) logged on each date, one row per date
  count(date, name = "N") %>%
  # Running total of those daily counts
  mutate(Cum = cumsum(N)) %>%
  # Plot: group = 1 joins all dates into a single line
  ggplot(aes(x = date, y = Cum, group = 1)) +
  geom_line() +
  geom_point() +
  scale_x_date(
    date_labels = "%d-%b-%Y",
    breaks = "3 days",
    expand = c(0.01, 0)
  ) +
  theme_bw() +
  # Vertical date labels; bold black text for axis labels and titles
  theme(
    axis.text.x = element_text(angle = 90),
    axis.text = element_text(color = "black", face = "bold"),
    axis.title = element_text(color = "black", face = "bold")
  ) +
  labs(y = "Cumulative number of species discovered in XYZ place")

Output:

For unique number of species each day we can use n_distinct():

#Code for data process 3
dataset %>%
  # Chronological order, so the first occurrence of a species is its
  # earliest sighting
  arrange(date) %>%
  # Flag the first sighting of every species. Summing per-day distinct counts
  # directly would count a species again on every day it is seen.
  mutate(first_seen = !duplicated(speciesname)) %>%
  group_by(date) %>%
  # Number of species discovered (seen for the first time) on each day;
  # days without a new species get 0 and stay in the plot
  summarise(N = n_distinct(speciesname[first_seen])) %>%
  ungroup() %>%
  # Running total = number of different species discovered so far
  mutate(Cum = cumsum(N)) %>%
  #Plot
  ggplot(aes(x = date, y = Cum, group = 1)) +
  geom_line() +
  geom_point() +
  scale_x_date(
    date_labels = "%d-%b-%Y",
    breaks = "3 days",
    expand = c(0.01, 0)
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90),
    axis.text = element_text(color = "black", face = "bold"),
    axis.title = element_text(color = "black", face = "bold")
  ) +
  ylab("Cumulative number of species discovered in XYZ place")

Output:

Upvotes: 1

metaltoaster

Reputation: 378

If you want to find the cumulative sum of each species then you can do this:-

# Records per (date, species) pair, followed by a running total of those
# records within each species. count() returns its rows ordered by date and
# species, so each per-species cumulative sum runs chronologically. The result
# stays grouped by speciesname.
df <- dataset %>%
  count(date, speciesname, name = "number_of_species") %>%
  group_by(speciesname) %>%
  mutate(cumsum = cumsum(number_of_species))



# A tibble: 67 x 4
# Groups:   speciesname [15]
   date       speciesname      number_of_species cumsum
   <date>     <chr>                        <int>  <int>
 1 2020-08-08 Common Sandpiper                 1      1
 2 2020-08-08 Grey Heron                       1      1
 3 2020-08-09 Grey Heron                       2      3
 4 2020-08-10 Common Sandpiper                 2      3
 5 2020-08-10 Grey Heron                       2      5
 6 2020-08-10 Nightjar                         1      1
 7 2020-08-12 Green Sandpiper                  1      1
 8 2020-08-12 Grey Heron                       6     11
 9 2020-08-12 Night Heron                      1      1
10 2020-08-13 Common Sandpiper                 1      4

Then time to plot:-

# One coloured line (with points) per species, showing its running total of
# records against the date.
ggplot(
  df,
  aes(x = date, y = cumsum, group = speciesname, colour = speciesname)
) +
  geom_line() +
  geom_point() +
  labs(
    title = "Cumulative sum of species",
    subtitle = "From 08/08/2020 to 15/09/2020"
  )

Which can give you this:-

Upvotes: 0

Cumulative sum of a qualitative variable in ggplot

Answers (3)

Related Questions