syre
syre

Reputation: 982

Count individuals appearing at bracketed frequencies across cumulative intervals of discrete time

I have a dataset of events occurring at spatial coordinates over time (years). I wish to get a sense of the ergodicity of locations over time, that is the extent to which events repeat at the same locations. (The volume of events per location is indifferent.)

In order to do so, I have opted to count the years in which locations are active within intervals of several years, from 1 to the interval length. (Locations may activate in discontinuous years). Then I look at how the count of such locations increases as intervals accumulate up to the entire period. I provide a toy example below, but my actual dataset is much larger, with larger intervals.

I wish to know whether the logic and programming are sound and whether they can be simplified/improved.

#Toy dataset of 100 events at 60 point locations over 10 years from 1990 to 1999.
df1 <- structure(list(year = c("1997", "1996", "1998", "1995", "1997", "1995", "1995", "1997", "1997", "1999", "1997", "1999", "1996", "1997", "1997", "1991", "1996", "1992", "1993", "1999", "1997", "1995", "1997", "1996", "1997", "1997", "1991", "1997", "1997", "1998", "1999", "1996", "1997", "1998", "1996", "1998", "1997", "1999", "1999", "1995", "1998", "1996", "1996", "1997", "1995", "1996", "1995", "1995", "1994", "1997", "1998", "1999", "1999", "1999", "1995", "1999", "1999", "1999", "1999", "1997", "1999", "1994", "1994", "1996", "1994", "1999", "1999", "1993", "1997", "1999", "1999", "1998", "1999", "1998", "1999", "1997", "1999", "1996", "1996", "1996", "1999", "1996", "1995", "1995", "1997", "1996", "1999", "1999", "1997", "1994", "1995", "1995", "1999", "1996", "1998", "1997", "1997", "1995", "1999", "1998"), lng = c(127.107, 126.855, 126.896, 126.908, 126.951, 126.994, 126.833, 126.855, 126.939, 126.65, 127.01, 127.107, 126.864, 127.107, 126.65, 126.864, 126.951, 127.01, 127.139, 127.107, 127.492, 127.113, 127.119, 126.831, 127.009, 127.139, 127.047, 127.124, 126.732, 127.047, 126.908, 127.197, 126.903, 127.077, 126.951, 127.096, 126.972, 126.831, 127.047, 127.035, 126.85, 126.722, 126.932, 127.124, 126.872, 127.035, 127.069, 126.678, 127.124, 126.65, 126.887, 127.139, 127.047, 127.04, 126.855, 127.047, 127.01, 127.01, 126.841, 127.018, 126.937, 126.952, 127.047, 126.738, 126.831, 126.935, 126.871, 126.911, 126.707, 127.107, 126.972, 127.01, 126.937, 127.5, 126.968, 127.113, 127.035, 126.865, 126.831, 127.054, 127.035, 127.199, 126.722, 127.518, 126.98, 127.12, 126.972, 126.874, 126.965, 127.009, 127.518, 126.873, 126.722, 126.848, 127.009, 127, 127.113, 126.937, 127.107, 126.951), lat = c(37.5145, 37.5165, 37.5264, 37.5638, 37.3164, 37.4264, 37.5243, 37.5165, 37.5124, 37.4637, 37.3038, 37.5145, 37.4785, 37.5145, 37.4637, 37.4785, 37.3926, 37.3038, 37.434, 37.5145, 37.2298, 36.9921, 37.3827, 37.1995, 37.2911, 37.4449, 37.5172, 37.5301, 37.447, 37.5172, 37.5638, 37.5384, 37.5683, 37.1498, 37.3926, 37.5256, 37.2572, 37.3219, 37.6688, 37.2861, 37.551, 37.507, 37.3865, 37.5301, 37.4793, 37.2861, 37.205, 37.4094, 37.5301, 37.4637, 37.4954, 37.434, 37.5172, 37.5744, 37.5165, 37.5172, 37.3038, 37.3038, 37.2876, 37.5891, 37.5791, 37.4784, 37.5172, 37.5369, 37.3219, 37.3617, 37.3117, 37.388, 37.35, 37.5145, 37.2572, 37.3038, 37.5791, 37.2642, 37.3447, 36.9921, 37.2861, 37.2913, 37.3219, 37.5819, 37.2861, 37.6977, 37.507, 37.4138, 37.2965, 37.3004, 37.2572, 37.3618, 37.5384, 37.2911, 37.4138, 37.4612, 37.507, 37.534, 37.2911, 37.45, 36.9921, 37.5791, 37.5145, 37.3926)), row.names = c(NA, -100L), class = c("data.table", "data.frame"))

library(dplyr)
library(tidyr) #for the unite function

#Cross-tabulate years and unique locations
df2 <- df1 %>% distinct(year,lng,lat) %>% unite(coord, c("lng", "lat"), sep=";")
df2 <- table(df2$coord, df2$year) %>% t %>% as.data.frame(stringsAsFactors=F)
colnames(df2) <- c("year", "coord", "freq")

#Quantify ergodicity: Bin by 2-year interval & count active years in current and previous 
#intervals. Start with frequencies of 1 or 2.
df2 %>% filter(year<1992 & freq!=0) %>% group_by(coord) %>% mutate(freq=sum(freq)) %>% 
  filter(freq<3) %>% distinct(coord) %>% nrow
#[1] 2 (2 distinct locations are active for 1 or 2 years in 1990-1991)
df2 %>% filter(year<1994 & freq!=0) %>% group_by(coord) %>% mutate(freq=sum(freq)) %>% 
  filter(freq<3) %>% distinct(coord) %>% nrow
#[1] 5 (5 distinct locations are active for 1 or 2 years in 1990-1993)
df2 %>% filter(year<1996 & freq!=0) %>% group_by(coord) %>% mutate(freq=sum(freq)) %>% 
  filter(freq3) %>% distinct(coord) %>% nrow
#[1] 22
df2 %>% filter(year<1998 & freq!=0) %>% group_by(coord) %>% mutate(freq=sum(freq)) %>% 
  filter(freq<3) %>% distinct(coord) %>% nrow
#[1] 46
df2 %>% filter(year<2000 & freq!=0) %>% group_by(coord) %>% mutate(freq=sum(freq)) %>% 
  filter(freq<3) %>% distinct(coord) %>% nrow
#[1] 53

After 1-2, I move on to frequency brackets of 3-4, etc. until the 10-year period of analysis is covered. Predictably, frequencies of 3-4 would not be found in the first interval of 2 years, etc. The desired outcome is a table with frequency brackets as rows and cumulative intervals as columns, filled in with location counts.

EDIT: I have implemented nested loops for frequency brackets and cumulative intervals:

#Quantify ergodicity: Bin by 2-year interval & count active years in current and previous 
#intervals
df3 <- matrix(ncol=5, nrow=5) #build output matrix
colnames(df3) <- paste0(1990, "-", 1990+2*1:5-1) #cumulative intervals
rownames(df3) <- paste0(2*(1:5-1)+1, "-", 2*1:5) #frequency brackets
for (i in 1:5) {
  for (j in 1:5) {
  df3[i,j] <- df2 %>% filter(year<=1990+2*j-1 & freq!=0) %>% group_by(coord) %>% 
    mutate(freq=sum(freq)) %>% filter(freq>=2*(i-1)+1 & freq<=2*i) %>% distinct(coord) %>% 
    nrow
  }
}

df3
#            90-91 90-93 90-95 90-97 90-99
#1-2 years       2     5    22    46    53
#3-4 years       0     0     0     1     7
#5-6 years       0     0     0     0     0
#7-8 years       0     0     0     0     0
#9-10 years      0     0     0     0     0

(With more data, the diagonal and upper triangle tend to fill up, while the lower triangle comprises 0s by construction.)

Upvotes: 0

Views: 72

Answers (1)

pyg
pyg

Reputation: 832

I think this achieves your desired output. Uses functions from the dplyr, tidyr, and purrr packages.

suppressPackageStartupMessages(library("dplyr"))
library("tidyr")
library("purrr")

# input data
df1 <- structure(list(year = c("1997", "1996", "1998", "1995", "1997", "1995", "1995", "1997", "1997", "1999", "1997", "1999", "1996", "1997", "1997", "1991", "1996", "1992", "1993", "1999", "1997", "1995", "1997", "1996", "1997", "1997", "1991", "1997", "1997", "1998", "1999", "1996", "1997", "1998", "1996", "1998", "1997", "1999", "1999", "1995", "1998", "1996", "1996", "1997", "1995", "1996", "1995", "1995", "1994", "1997", "1998", "1999", "1999", "1999", "1995", "1999", "1999", "1999", "1999", "1997", "1999", "1994", "1994", "1996", "1994", "1999", "1999", "1993", "1997", "1999", "1999", "1998", "1999", "1998", "1999", "1997", "1999", "1996", "1996", "1996", "1999", "1996", "1995", "1995", "1997", "1996", "1999", "1999", "1997", "1994", "1995", "1995", "1999", "1996", "1998", "1997", "1997", "1995", "1999", "1998"), lng = c(127.107, 126.855, 126.896, 126.908, 126.951, 126.994, 126.833, 126.855, 126.939, 126.65, 127.01, 127.107, 126.864, 127.107, 126.65, 126.864, 126.951, 127.01, 127.139, 127.107, 127.492, 127.113, 127.119, 126.831, 127.009, 127.139, 127.047, 127.124, 126.732, 127.047, 126.908, 127.197, 126.903, 127.077, 126.951, 127.096, 126.972, 126.831, 127.047, 127.035, 126.85, 126.722, 126.932, 127.124, 126.872, 127.035, 127.069, 126.678, 127.124, 126.65, 126.887, 127.139, 127.047, 127.04, 126.855, 127.047, 127.01, 127.01, 126.841, 127.018, 126.937, 126.952, 127.047, 126.738, 126.831, 126.935, 126.871, 126.911, 126.707, 127.107, 126.972, 127.01, 126.937, 127.5, 126.968, 127.113, 127.035, 126.865, 126.831, 127.054, 127.035, 127.199, 126.722, 127.518, 126.98, 127.12, 126.972, 126.874, 126.965, 127.009, 127.518, 126.873, 126.722, 126.848, 127.009, 127, 127.113, 126.937, 127.107, 126.951), lat = c(37.5145, 37.5165, 37.5264, 37.5638, 37.3164, 37.4264, 37.5243, 37.5165, 37.5124, 37.4637, 37.3038, 37.5145, 37.4785, 37.5145, 37.4637, 37.4785, 37.3926, 37.3038, 37.434, 37.5145, 37.2298, 36.9921, 37.3827, 37.1995, 37.2911, 37.4449, 37.5172, 37.5301, 37.447, 37.5172, 37.5638, 37.5384, 37.5683, 37.1498, 37.3926, 37.5256, 37.2572, 37.3219, 37.6688, 37.2861, 37.551, 37.507, 37.3865, 37.5301, 37.4793, 37.2861, 37.205, 37.4094, 37.5301, 37.4637, 37.4954, 37.434, 37.5172, 37.5744, 37.5165, 37.5172, 37.3038, 37.3038, 37.2876, 37.5891, 37.5791, 37.4784, 37.5172, 37.5369, 37.3219, 37.3617, 37.3117, 37.388, 37.35, 37.5145, 37.2572, 37.3038, 37.5791, 37.2642, 37.3447, 36.9921, 37.2861, 37.2913, 37.3219, 37.5819, 37.2861, 37.6977, 37.507, 37.4138, 37.2965, 37.3004, 37.2572, 37.3618, 37.5384, 37.2911, 37.4138, 37.4612, 37.507, 37.534, 37.2911, 37.45, 36.9921, 37.5791, 37.5145, 37.3926)), row.names = c(NA, -100L), class = c("data.table", "data.frame"))

# cross-tabulate years and unique locations
df2 <- df1 %>%
  distinct(year, lng, lat) %>%
  unite(coord, c("lng", "lat"), sep=";") %>%
  count(year, coord, name = "freq") %>%
  complete(year, coord, fill = list(freq = 0L))

# quantify ergodicity: Bin by 2-year interval & count active years in current and previous intervals
output <- c(1990+2*1:5-1) %>% # the upper limits of the year ranges
  purrr::set_names(~paste0("1990-", .x)) %>%
  map_dfr( # filter df2 using this year range then combine into one data frame
    .f = ~ {
      df2 %>%
        filter(year <= .x & freq > 0)
    },
    .id = "year_range"
  ) %>%
  group_by(year_range, coord) %>%
  summarise(total_freq = sum(freq), .groups = "drop") %>%
  mutate(freq_range = cut( # cut the frequences into bins ("1 to 2", "3 to 4", etc). note that the data type of this column will be factors. can change this later, if desired
    total_freq,
    breaks = seq(from = 0, to = 10, by = 2))
  ) %>%
  count(year_range, freq_range)

# can reshape this as desired
output
#> # A tibble: 7 × 3
#>   year_range freq_range     n
#>   <chr>      <fct>      <int>
#> 1 1990-1991  (0,2]          2
#> 2 1990-1993  (0,2]          5
#> 3 1990-1995  (0,2]         22
#> 4 1990-1997  (0,2]         46
#> 5 1990-1997  (2,4]          1
#> 6 1990-1999  (0,2]         53
#> 7 1990-1999  (2,4]          7

Created on 2021-11-10 by the reprex package (v2.0.1)

Upvotes: 1

Related Questions