Reputation: 5
R version 3.4.2
I am trying to create 3 new variables according to conditions based on other variables of the same dataframe. I managed to achieve what I intended, but it requires several lines of code to produce an output that I think other methods (possibly using dplyr) might deliver much effortlessly.
Here is a reproductible example:
city <- c("London", "London", "Leeds","Leeds", "Leeds", "Nottingham", "Glasgow", "Belfast", "Belfast", "Oxford", "Oxford", "Southampton", "Aberdeen", "Bath", "Bath", "Bath", "Preston", "Preston", "Liverpool", "Derby","Hereford")
transport <- c("cars", "scooters", "cars", "scooters", "bikes", "cars", "scooters", "cars", "bikes", "scooters", "bikes", "bikes", "scooters", "cars", "scooters", "bikes", "scooters", "bikes", "bikes", "cars", "bikes")
number <- c("153", "21", "267", "87", "13", "95", "17", "199", "8", "34", "5", "23", "40", "142", "79", "28", "37", "22", "19", "83", "23")
df <- data.frame(city, transport, number)
What I want is to know the percentage of each type of transportation in each city, like so:
> df
city transport number pct.cars pct.scooters pct.bikes
1 London cars 153 87.93 12.07 0.00
2 London scooters 21 87.93 12.07 0.00
3 Leeds cars 267 72.75 23.71 3.54
4 Leeds scooters 87 72.75 23.71 3.54
5 Leeds bikes 13 72.75 23.71 3.54
6 Nottingham cars 95 100.00 0.00 0.00
7 Glasgow scooters 17 0.00 100.00 0.00
8 Belfast cars 199 96.14 0.00 3.86
9 Belfast bikes 8 96.14 0.00 3.86
10 Oxford scooters 34 0.00 87.18 12.82
11 Oxford bikes 5 0.00 87.18 12.82
12 Southampton bikes 23 0.00 0.00 100.00
13 Aberdeen scooters 40 0.00 100.00 0.00
14 Bath cars 142 57.03 31.73 11.24
15 Bath scooters 79 57.03 31.73 11.24
16 Bath bikes 28 57.03 31.73 11.24
17 Preston scooters 37 0.00 62.71 37.29
18 Preston bikes 22 0.00 62.71 37.29
19 Liverpool bikes 19 0.00 0.00 100.00
20 Derby cars 83 100.00 0.00 0.00
21 Hereford bikes 23 0.00 0.00 100.00
The code that produces the data frame above is the following:
df <- tbl_df(df) %>%
mutate(., pct.cars = rep(as.numeric(0), length.out = length(df$city)),
pct.scooters = rep(as.numeric(0), length.out = length(df$city)),
pct.bikes = rep(as.numeric(0), length.out = length(df$city)))
for (i in 1:nrow(df)) {
cur_city <- df$city[i]
n_cars <- df$number[df$city == cur_city & df$transport == "cars"]
n_scooters <- df$number[df$city == cur_city & df$transport == "scooters"]
n_bikes <- df$number[df$city == cur_city & df$transport == "bikes"]
if (length(n_cars) == 1 & length(n_scooters) < 1 & length(n_bikes) < 1) {
# case: there are no scooters nor bikes
df$pct.cars[i] <- 100
df$pct.scooters[i] <- 0
df$pct.bikes[i] <- 0
} else if (length(n_cars) < 1 & length(n_scooters) == 1 & length(n_bikes) == 1) {
# case: there are no cars
df$pct.cars[i] <- 0
df$pct.scooters[i] <- (n_scooters/(n_scooters + n_bikes))*100
df$pct.bikes[i] <- (n_bikes/(n_scooters + n_bikes))*100
} else if (length(n_cars) == 1 & length(n_scooters) == 1 & length(n_bikes) < 1) {
# case: there are no bikes
df$pct.cars[i] <- (n_cars/(n_cars + n_scooters))*100
df$pct.scooters[i] <- (n_scooters/(n_cars + n_scooters))*100
df$pct.bikes[i] <- 0
} else if (length(n_cars) == 1 & length(n_scooters) < 1 & length(n_bikes) == 1) {
# case: there are no scooters
df$pct.cars[i] <- (n_cars/(n_cars + n_bikes))*100
df$pct.scooters[i] <- 0
df$pct.bikes[i] <- (n_bikes/(n_cars + n_bikes))*100
} else if (length(n_cars) < 1 & length(n_scooters) == 1 & length(n_bikes) < 1) {
# case: there are no cars nor bikes
df$pct.cars[i] <- 0
df$pct.scooters[i] <- 100
df$pct.bikes[i] <- 0
} else if (length(n_cars) < 1 & length(n_scooters) < 1 & length(n_bikes) == 1) {
# case: there are no cars nor scooters
df$pct.cars[i] <- 0
df$pct.scooters[i] <- 0
df$pct.bikes[i] <- 100
} else if (length(n_cars) == 1 & length(n_scooters) == 1 & length(n_bikes) == 1 ) {
# case: there are cars, scooters & bikes
df$pct.cars[i] <- (n_cars/(n_cars + n_scooters + n_bikes))*100
df$pct.scooters[i] <- (n_scooters/(n_cars + n_scooters + n_bikes))*100
df$pct.bikes[i] <- (n_bikes/(n_cars + n_scooters + n_bikes))*100
}
}
If somebody has got an easier solution or suggestion (possibly using dplyr), it would be greatly appreciated. Thank you in advance!
Upvotes: 0
Views: 93
Reputation: 50668
How about something like this?
df %>%
group_by(city) %>%
mutate(
number = as.numeric(as.character(number)),
ntot = sum(number),
percentage = number / ntot * 100,
id = paste0("perc.", transport)) %>%
spread(id, percentage) %>%
replace(., is.na(.), 0) %>%
select(-ntot) %>%
ungroup()
# city transport number perc.bikes perc.cars perc.scooters
# <fct> <fct> <dbl> <dbl> <dbl> <dbl>
# 1 Aberdeen scooters 40.0 0 0 100
# 2 Bath bikes 28.0 11.2 0 0
# 3 Bath cars 142 0 57.0 0
# 4 Bath scooters 79.0 0 0 31.7
# 5 Belfast bikes 8.00 3.86 0 0
# 6 Belfast cars 199 0 96.1 0
# 7 Derby cars 83.0 0 100 0
# 8 Glasgow scooters 17.0 0 0 100
# 9 Hereford bikes 23.0 100 0 0
#10 Leeds bikes 13.0 3.54 0 0
Or if you want to fill entries across all percentage columns:
df %>%
group_by(city) %>%
mutate(
number = as.numeric(as.character(number)),
ntot = sum(number),
percentage = number / ntot * 100,
id = paste0("perc.", transport)) %>%
spread(id, percentage) %>%
select(-ntot) %>%
fill(perc.bikes, perc.cars, perc.scooters, .direction = "up") %>%
fill(perc.bikes, perc.cars, perc.scooters, .direction = "down") %>%
replace(., is.na(.), 0) %>%
ungroup()
## A tibble: 21 x 6
# city transport number perc.bikes perc.cars perc.scooters
# <fct> <fct> <dbl> <dbl> <dbl> <dbl>
# 1 Aberdeen scooters 40.0 0 0 100
# 2 Bath bikes 28.0 11.2 57.0 31.7
# 3 Bath cars 142 11.2 57.0 31.7
# 4 Bath scooters 79.0 11.2 57.0 31.7
# 5 Belfast bikes 8.00 3.86 96.1 0
# 6 Belfast cars 199 3.86 96.1 0
# 7 Derby cars 83.0 0 100 0
# 8 Glasgow scooters 17.0 0 0 100
# 9 Hereford bikes 23.0 100 0 0
#10 Leeds bikes 13.0 3.54 72.8 23.7
## ... with 11 more rows
If you just want a table with percentages per city, you can do this:
df %>%
group_by(city) %>%
mutate(
number = as.numeric(as.character(number)),
number = number / sum(ntot) * 100,
transport = paste0("perc.", transport)) %>%
spread(transport, number, fill = 0)
ungroup()
## A tibble: 13 x 4
# city perc.bikes perc.cars perc.scooters
# <fct> <dbl> <dbl> <dbl>
# 1 Aberdeen 0 0 100
# 2 Bath 11.2 57.0 31.7
# 3 Belfast 3.86 96.1 0
# 4 Derby 0 100 0
# 5 Glasgow 0 0 100
# 6 Hereford 100 0 0
# 7 Leeds 3.54 72.8 23.7
# 8 Liverpool 100 0 0
# 9 London 0 87.9 12.1
#10 Nottingham 0 100 0
#11 Oxford 12.8 0 87.2
#12 Preston 37.3 0 62.7
#13 Southampton 100 0 0
Upvotes: 1
Reputation: 93813
Sometimes table/matrix operations can present a simple solution:
df$number <- as.numeric(as.character(df$number))
prop.table(xtabs(number ~ city + transport, data=df), 1)
# transport
#city bikes cars scooters
# Aberdeen 0.00000000 0.00000000 1.00000000
# Bath 0.11244980 0.57028112 0.31726908
# Belfast 0.03864734 0.96135266 0.00000000
# Derby 0.00000000 1.00000000 0.00000000
# Glasgow 0.00000000 0.00000000 1.00000000
# Hereford 1.00000000 0.00000000 0.00000000
# Leeds 0.03542234 0.72752044 0.23705722
# Liverpool 1.00000000 0.00000000 0.00000000
# London 0.00000000 0.87931034 0.12068966
# Nottingham 0.00000000 1.00000000 0.00000000
# Oxford 0.12820513 0.00000000 0.87179487
# Preston 0.37288136 0.00000000 0.62711864
# Southampton 1.00000000 0.00000000 0.00000000
This also gives you the flexibility to find the proportion of a type of transport in each city, just by changing one value:
prop.table(xtabs(number ~ city + transport, data=df), 2)
Upvotes: 1