user3227641
user3227641

Reputation: 177

creating a new column that takes initial values of another column in R

I am working on a cross sectional dataset. I want to create a new column, named "initial", that will contain initial values of another column. More specifically, for each country, the column initial takes the value of another column called "ratio" for the first year for which the data is available and will take the value 0 for all remaining years. Sample code:

country <- c(rep(c("A","B","C","D"),each=5))
year <- c(1980:1984, 1980: 1984, 1980:1984, 1980:1984)
ratio <- runif(n = 20, min = 0.20, max = 0.40)
mydata <- data.frame(country, year, ratio)
mydata$ratio[[1]] <- NA
mydata$ratio[6:7] <- NA
mydata$ratio[16:18] <- NA

The output I want to obtain looks like this:

enter image description here

Is there a way of doing this in R preferably using the dplyr package?

Thanks very much in advance!

Upvotes: 0

Views: 168

Answers (4)

Ronak Shah
Ronak Shah

Reputation: 389175

You can use match to get index of first non-NA value in the group.

library(dplyr)

mydata %>%
  group_by(country) %>%
  mutate(initial = replace(ratio, -match(TRUE, !is.na(ratio)), 0)) %>%
  ungroup

#  country  year  ratio initial
#   <chr>   <int>  <dbl>   <dbl>
# 1 A        1980 NA       0    
# 2 A        1981  0.268   0.268
# 3 A        1982  0.302   0    
# 4 A        1983  0.263   0    
# 5 A        1984  0.222   0    
# 6 B        1980 NA       0    
# 7 B        1981 NA       0    
# 8 B        1982  0.397   0.397
# 9 B        1983  0.265   0    
#10 B        1984  0.249   0    
#11 C        1980  0.302   0.302
#12 C        1981  0.219   0    
#13 C        1982  0.339   0    
#14 C        1983  0.228   0    
#15 C        1984  0.393   0    
#16 D        1980 NA       0    
#17 D        1981 NA       0    
#18 D        1982 NA       0    
#19 D        1983  0.303   0.303
#20 D        1984  0.218   0    

Upvotes: 1

forhad
forhad

Reputation: 196

library(tidyverse)

mydata2 <- mydata %>% 
  group_by(country) %>% 
  filter(!is.na(ratio)) %>% 
  mutate(year_rank = rank(year)) %>% 
  mutate(initial = if_else(year_rank == 1, ratio, 0)) %>% 
  right_join(., mydata, by = c('country', 'year', 'ratio')) %>% 
  replace_na(list(initial = '0')) %>% 
  arrange(country, year) %>% 
  select(-year_rank)

Upvotes: 2

stefan
stefan

Reputation: 125268

Making use of dplyr::first you could do:


library(dplyr)

mydata %>% 
  group_by(country) %>% 
  mutate(initial = first(ratio[!is.na(ratio)]),
         initial = ifelse(is.na(ratio) | ratio != initial, 0, initial)) %>% 
  ungroup()
#> # A tibble: 20 × 4
#>    country  year  ratio initial
#>    <chr>   <int>  <dbl>   <dbl>
#>  1 A        1980 NA       0    
#>  2 A        1981  0.387   0.387
#>  3 A        1982  0.257   0    
#>  4 A        1983  0.366   0    
#>  5 A        1984  0.328   0    
#>  6 B        1980 NA       0    
#>  7 B        1981 NA       0    
#>  8 B        1982  0.227   0.227
#>  9 B        1983  0.331   0    
#> 10 B        1984  0.341   0    
#> 11 C        1980  0.292   0.292
#> 12 C        1981  0.344   0    
#> 13 C        1982  0.387   0    
#> 14 C        1983  0.251   0    
#> 15 C        1984  0.292   0    
#> 16 D        1980 NA       0    
#> 17 D        1981 NA       0    
#> 18 D        1982 NA       0    
#> 19 D        1983  0.295   0.295
#> 20 D        1984  0.312   0

DATA

set.seed(42)

country <- c(rep(c("A","B","C","D"),each=5))
year <- c(1980:1984, 1980: 1984, 1980:1984, 1980:1984)
ratio <- runif(n = 20, min = 0.20, max = 0.40)
mydata <- data.frame(country, year, ratio)
mydata$ratio[[1]] <- NA
mydata$ratio[6:7] <- NA
mydata$ratio[16:18] <- NA

Upvotes: 2

Paul
Paul

Reputation: 2977

This might be improved but I get your expected output:

library(dplyr)

mydata %>% 
  group_by(country) %>% 
  filter(!is.na(ratio)) %>% 
  filter(year == min(year)) %>% 
  rename(initial = ratio) %>% 
  full_join(., mydata) %>% 
  mutate(initial = ifelse(is.na(initial), 0, initial)) %>% 
  arrange(country, year) %>% 
  relocate(initial, .after = last_col())

Output:

  country  year  ratio initial
   <chr>   <int>  <dbl>   <dbl>
 1 A        1980 NA       0    
 2 A        1981  0.341   0.341
 3 A        1982  0.330   0    
 4 A        1983  0.219   0    
 5 A        1984  0.269   0    
 6 B        1980 NA       0    
 7 B        1981 NA       0    
 8 B        1982  0.365   0.365
 9 B        1983  0.210   0    
10 B        1984  0.334   0    
11 C        1980  0.284   0.284
12 C        1981  0.251   0    
13 C        1982  0.358   0    
14 C        1983  0.288   0    
15 C        1984  0.261   0    
16 D        1980 NA       0    
17 D        1981 NA       0    
18 D        1982 NA       0    
19 D        1983  0.252   0.252
20 D        1984  0.301   0   

Upvotes: 1

Related Questions