Reputation: 143
In a monitoring scheme each species (A, B, ...) is counted at least twice in each area (a1, a2, ...). For the final result the rows from the sample with the highest total count need to be selected for each area and species.
Example data:
data_joined <- data.frame("species" = c("A","A","A","A","A","A","B","B","B","B"),
"area" = c("a1","a1","a1","a1","a1","a2","a1","a1","a2","a2"),
"sample_nr" = c(1,1,1,1,2,2,1,1,2,2),
"count" = c(1,1,1,1,6,1,1,1,3,3))
My current solution is pasted below. However, the loop is very slow on the original data which contains about 18,000 rows. I can imagine there are much faster and elegant solutions. The original data are in sf format and include geometries which need to be kept after selection.
i_list <- list() # empty list
for (i in unique(data_joined$area)) # all areas that are in the data
{
loop_i_data <- data_joined[data_joined$area == i,] # select data for area i
j_list <- list()
for(j in unique(data_joined$species)) # all species that are in the data
{
loop_j_data <- loop_i_data[loop_i_data$species == j,] # select data of species j in area i
max_select <- which.max(
c(sum(loop_j_data[loop_j_data$sample_nr == "1",]$count, na.rm = TRUE), # sum first count
sum(loop_j_data[loop_j_data$sample_nr == "2",]$count, na.rm = TRUE), # sum second count
sum(loop_j_data[loop_j_data$sample_nr == "3",]$count, na.rm = TRUE), # sum third count
sum(loop_j_data[loop_j_data$sample_nr == "4",]$count, na.rm = TRUE), # sum fourth count
sum(loop_j_data[loop_j_data$sample_nr == "5",]$count, na.rm = TRUE), # sum fifth count
sum(loop_j_data[loop_j_data$sample_nr == "6",]$count, na.rm = TRUE), # sum sixth count
sum(loop_j_data[loop_j_data$sample_nr == "7",]$count, na.rm = TRUE))) # sum seventh count
j_list[[j]] <- loop_j_data[loop_j_data$sample_nr == max_select,] # add maximum count occasion to list
}
i_list[[i]] <- do.call(rbind, j_list)
}
data_final = do.call(rbind, i_list) # rbind all data
row.names(data_final) <- NULL
data_final
Upvotes: 0
Views: 234
Reputation: 388817
Using dplyr
, we can find the sum
of count
for each species
, area
and sample_nr
and select all the rows with max count in each species
and area
.
library(dplyr)
data_joined %>%
group_by(species, area, sample_nr) %>%
summarise(n = sum(count)) %>%
slice(which.max(n)) %>%
left_join(data_joined) %>%
select(-n)
# species area sample_nr count
# <fct> <fct> <dbl> <dbl>
#1 A a1 2 6
#2 A a2 2 1
#3 B a1 1 1
#4 B a1 1 1
#5 B a2 2 3
#6 B a2 2 3
Upvotes: 2