Reputation: 177
I am working on a cross sectional dataset. I want to create a new column, named "initial", that will contain initial values of another column. More specifically, for each country, the column initial takes the value of another column called "ratio" for the first year for which the data is available and will take the value 0 for all remaining years. Sample code:
country <- c(rep(c("A","B","C","D"),each=5))
year <- c(1980:1984, 1980: 1984, 1980:1984, 1980:1984)
ratio <- runif(n = 20, min = 0.20, max = 0.40)
mydata <- data.frame(country, year, ratio)
mydata$ratio[[1]] <- NA
mydata$ratio[6:7] <- NA
mydata$ratio[16:18] <- NA
The output I want to obtain looks like this:
Is there a way of doing this in R preferably using the dplyr package?
Thanks very much in advance!
Upvotes: 0
Views: 168
Reputation: 389175
You can use match
to get index of first non-NA value in the group.
library(dplyr)
mydata %>%
group_by(country) %>%
mutate(initial = replace(ratio, -match(TRUE, !is.na(ratio)), 0)) %>%
ungroup
# country year ratio initial
# <chr> <int> <dbl> <dbl>
# 1 A 1980 NA 0
# 2 A 1981 0.268 0.268
# 3 A 1982 0.302 0
# 4 A 1983 0.263 0
# 5 A 1984 0.222 0
# 6 B 1980 NA 0
# 7 B 1981 NA 0
# 8 B 1982 0.397 0.397
# 9 B 1983 0.265 0
#10 B 1984 0.249 0
#11 C 1980 0.302 0.302
#12 C 1981 0.219 0
#13 C 1982 0.339 0
#14 C 1983 0.228 0
#15 C 1984 0.393 0
#16 D 1980 NA 0
#17 D 1981 NA 0
#18 D 1982 NA 0
#19 D 1983 0.303 0.303
#20 D 1984 0.218 0
Upvotes: 1
Reputation: 196
library(tidyverse)
mydata2 <- mydata %>%
group_by(country) %>%
filter(!is.na(ratio)) %>%
mutate(year_rank = rank(year)) %>%
mutate(initial = if_else(year_rank == 1, ratio, 0)) %>%
right_join(., mydata, by = c('country', 'year', 'ratio')) %>%
replace_na(list(initial = '0')) %>%
arrange(country, year) %>%
select(-year_rank)
Upvotes: 2
Reputation: 125268
Making use of dplyr::first
you could do:
library(dplyr)
mydata %>%
group_by(country) %>%
mutate(initial = first(ratio[!is.na(ratio)]),
initial = ifelse(is.na(ratio) | ratio != initial, 0, initial)) %>%
ungroup()
#> # A tibble: 20 × 4
#> country year ratio initial
#> <chr> <int> <dbl> <dbl>
#> 1 A 1980 NA 0
#> 2 A 1981 0.387 0.387
#> 3 A 1982 0.257 0
#> 4 A 1983 0.366 0
#> 5 A 1984 0.328 0
#> 6 B 1980 NA 0
#> 7 B 1981 NA 0
#> 8 B 1982 0.227 0.227
#> 9 B 1983 0.331 0
#> 10 B 1984 0.341 0
#> 11 C 1980 0.292 0.292
#> 12 C 1981 0.344 0
#> 13 C 1982 0.387 0
#> 14 C 1983 0.251 0
#> 15 C 1984 0.292 0
#> 16 D 1980 NA 0
#> 17 D 1981 NA 0
#> 18 D 1982 NA 0
#> 19 D 1983 0.295 0.295
#> 20 D 1984 0.312 0
DATA
set.seed(42)
country <- c(rep(c("A","B","C","D"),each=5))
year <- c(1980:1984, 1980: 1984, 1980:1984, 1980:1984)
ratio <- runif(n = 20, min = 0.20, max = 0.40)
mydata <- data.frame(country, year, ratio)
mydata$ratio[[1]] <- NA
mydata$ratio[6:7] <- NA
mydata$ratio[16:18] <- NA
Upvotes: 2
Reputation: 2977
This might be improved but I get your expected output:
library(dplyr)
mydata %>%
group_by(country) %>%
filter(!is.na(ratio)) %>%
filter(year == min(year)) %>%
rename(initial = ratio) %>%
full_join(., mydata) %>%
mutate(initial = ifelse(is.na(initial), 0, initial)) %>%
arrange(country, year) %>%
relocate(initial, .after = last_col())
Output:
country year ratio initial
<chr> <int> <dbl> <dbl>
1 A 1980 NA 0
2 A 1981 0.341 0.341
3 A 1982 0.330 0
4 A 1983 0.219 0
5 A 1984 0.269 0
6 B 1980 NA 0
7 B 1981 NA 0
8 B 1982 0.365 0.365
9 B 1983 0.210 0
10 B 1984 0.334 0
11 C 1980 0.284 0.284
12 C 1981 0.251 0
13 C 1982 0.358 0
14 C 1983 0.288 0
15 C 1984 0.261 0
16 D 1980 NA 0
17 D 1981 NA 0
18 D 1982 NA 0
19 D 1983 0.252 0.252
20 D 1984 0.301 0
Upvotes: 1