Reputation: 95
I have a dataframe containing information about local elections for municipalities and I would like to keep in my dataset only information related to the two best-voted candidates by municipality.
My dataframe was created using information like this:
Candidates <- c('Liza S.', 'John R.', 'Peter F.', 'Ignacio R.', 'Hanna F.', 'Anthony E.', 'Andrew W.')`
Municipality <- ('A', 'A', 'A', 'B', 'B', 'B', 'B')
Votes <- c(100, 120, 140, 110, 335, 225, 400)
data <- data.frame(Candidates, Municipality, Votes)
And look like this:
Candidates Municipality Votes
Liza S. A 100
John R. A 120
Peter F. A 140
Ignacio R. B 110
Hanna F. B 335
Anthony E. B 225
Andrew W. B 400
I want that my dataframe look like this:
Candidates Municipality Votes
John R. A 120
Peter F A 140
Hanna F. B 335
Andrew W. B 400
Any idea of how to accomplish this?
Upvotes: 2
Views: 75
Reputation: 72613
You could just subset with ave()
by Municipality-specific (rev
ersed) order()
.
dat[ave(dat$Votes, dat$Municipality, FUN=function(x) rev(order(x))) <= 2, ]
# Candidates Municipality Votes
# 2 John R. A 120
# 3 Peter F. A 140
# 5 Hanna F. B 335
# 7 Andrew W. B 400
Data:
dat <- structure(list(Candidates = structure(c(6L, 5L, 7L, 4L, 3L, 2L,
1L), .Label = c("Andrew W.", "Anthony E.", "Hanna F.", "Ignacio R.",
"John R.", "Liza S.", "Peter F."), class = "factor"), Municipality = structure(c(1L,
1L, 1L, 2L, 2L, 2L, 2L), .Label = c("A", "B"), class = "factor"),
Votes = c(100, 120, 140, 110, 335, 225, 400)), class = "data.frame", row.names = c(NA,
-7L))
Upvotes: 2
Reputation: 173793
In base R:
do.call("rbind",
lapply(split.data.frame(data, Municipality),
function(x) x[rev(order(x$Votes))[1:2],]))
# Candidates Municipality Votes
# A.3 Peter F. A 140
# A.2 John R. A 120
# B.7 Andrew W. B 400
# B.5 Hanna F. B 335
Upvotes: 1
Reputation: 1261
Here's a way using dplyr package;
library(dplyr)
# Creating data
Candidates <- c('Liza S.', 'John R.', 'Peter F.', 'Ignacio R.', 'Hanna F.', 'Anthony E.', 'Andrew W.')
Municipality <- c('A', 'A', 'A', 'B', 'B', 'B', 'B')
Votes <- c(100, 120, 140, 110, 335, 225, 400)
data <- data.frame(Candidates, Municipality, Votes)
data %>%
# Grouping by Municipality
group_by(Municipality) %>%
# Sorting the data desc by Votes
arrange(desc(Votes)) %>%
# Getting first and second highest values for each group
slice(1:2) %>%
ungroup() %>%
# # A tibble: 4 x 3
# # Groups: Municipality [2]
# Candidates Municipality Votes
# <fct> <fct> <dbl>
# Peter F. A 140
# John R. A 120
# Andrew W. B 400
# Hanna F. B 335
Upvotes: 1
Reputation: 417
You can use the dplyr
package for this:
library(dplyr)
data %>%
group_by(Municipality) %>%
top_n(n = 2, wt = Votes)
# A tibble: 4 x 3
# Groups: Municipality [2]
Candidates Municipality Votes
<fct> <fct> <dbl>
1 John R. A 120
2 Peter F. A 140
3 Hanna F. B 335
4 Andrew W. B 400
Upvotes: 3