Reputation: 63
I have a data.table where every row is an individual in different countries. How can I regress x on y separately for each country, and store the regression coefficient(s) for each country in the original data.table.
Lets say, the data.table Im speaking of is as follows to which I want to add two columns (intercept and slope of the regression I explained above)
set.seed(1)
dt = data.table(x=runif(40), y=runif(40), country=rep(1:2,20))
Thanks
Upvotes: 1
Views: 410
Reputation: 460
I didn't realize that you want to add to the original table. In this case, use :=
in j
as follows (the original answer, which returns only the summary, is kept at the end).
dt[,
c('slope', 'intercept') := {
fit <- lm(y ~ x, data = .SD)
list(fit$coefficients[1], fit$coefficients[2])
},
by = country]
dt[]
## x y country slope intercept
## 1: 0.26550866 0.82094629 1 0.6275887 -0.2019328
## 2: 0.37212390 0.64706019 2 0.5042928 0.1252771
## 3: 0.57285336 0.78293276 1 0.6275887 -0.2019328
## 4: 0.90820779 0.55303631 2 0.5042928 0.1252771
## 5: 0.20168193 0.52971958 1 0.6275887 -0.2019328
## 6: 0.89838968 0.78935623 2 0.5042928 0.1252771
## 7: 0.94467527 0.02333120 1 0.6275887 -0.2019328
## 8: 0.66079779 0.47723007 2 0.5042928 0.1252771
## 9: 0.62911404 0.73231374 1 0.6275887 -0.2019328
## 10: 0.06178627 0.69273156 2 0.5042928 0.1252771
## 11: 0.20597457 0.47761962 1 0.6275887 -0.2019328
## 12: 0.17655675 0.86120948 2 0.5042928 0.1252771
## 13: 0.68702285 0.43809711 1 0.6275887 -0.2019328
## 14: 0.38410372 0.24479728 2 0.5042928 0.1252771
## 15: 0.76984142 0.07067905 1 0.6275887 -0.2019328
## 16: 0.49769924 0.09946616 2 0.5042928 0.1252771
## 17: 0.71761851 0.31627171 1 0.6275887 -0.2019328
## 18: 0.99190609 0.51863426 2 0.5042928 0.1252771
## 19: 0.38003518 0.66200508 1 0.6275887 -0.2019328
## 20: 0.77744522 0.40683019 2 0.5042928 0.1252771
## 21: 0.93470523 0.91287592 1 0.6275887 -0.2019328
## 22: 0.21214252 0.29360337 2 0.5042928 0.1252771
## 23: 0.65167377 0.45906573 1 0.6275887 -0.2019328
## 24: 0.12555510 0.33239467 2 0.5042928 0.1252771
## 25: 0.26722067 0.65087047 1 0.6275887 -0.2019328
## 26: 0.38611409 0.25801678 2 0.5042928 0.1252771
## 27: 0.01339033 0.47854525 1 0.6275887 -0.2019328
## 28: 0.38238796 0.76631067 2 0.5042928 0.1252771
## 29: 0.86969085 0.08424691 1 0.6275887 -0.2019328
## 30: 0.34034900 0.87532133 2 0.5042928 0.1252771
## 31: 0.48208012 0.33907294 1 0.6275887 -0.2019328
## 32: 0.59956583 0.83944035 2 0.5042928 0.1252771
## 33: 0.49354131 0.34668349 1 0.6275887 -0.2019328
## 34: 0.18621760 0.33377493 2 0.5042928 0.1252771
## 35: 0.82737332 0.47635125 1 0.6275887 -0.2019328
## 36: 0.66846674 0.89219834 2 0.5042928 0.1252771
## 37: 0.79423986 0.86433947 1 0.6275887 -0.2019328
## 38: 0.10794363 0.38998954 2 0.5042928 0.1252771
## 39: 0.72371095 0.77732070 1 0.6275887 -0.2019328
## 40: 0.41127443 0.96061800 2 0.5042928 0.1252771
## x y country slope intercept
This is a perfect avenue for making use of the flexible j
expression in data.table
. You can put anything in j
as long as it returns a list.
dt[,
{
fit <- lm(y ~ x, data = .SD)
list(intercept = fit$coefficients[1], slope = fit$coefficients[2])
},
by = country]
# country intercept slope
#1: 1 0.6276 -0.2019
#2: 2 0.5043 0.1253
Upvotes: 3
Reputation: 63
Here is how the whole thing can be done, extending Drumy's code:
set.seed(1)
dt = data.table(x=runif(40), y=runif(40), country=rep(1:2,20))
dt= merge(dt, dt[,
{
fit <- lm(y ~ x, data = .SD)
list(intercept = fit$coefficients[1], slope = fit$coefficients[2])
},
by = country], by="country")
Upvotes: 1
Reputation: 123783
Using some core tidyverse
packages along with broom
this can be achieved like so:
library(tidyverse)
library(broom)
library(data.table)
set.seed(1)
dt = data.table(x=runif(40), y=runif(40), country=rep(1:2,20))
dt %>%
group_by(country) %>%
nest() %>%
mutate(mod = map(data, ~ lm(y ~ x, data = .x)),
mod_tidy = map(mod, broom::tidy)) %>%
unnest(mod_tidy) %>%
select(country, data, term, estimate) %>%
pivot_wider(names_from = "term", values_from = "estimate") %>%
rename(intercept = 3, slope = 4) %>%
unnest(data) %>%
as.data.table()
#> country x y intercept slope
#> 1: 1 0.26550866 0.82094629 0.6275887 -0.2019328
#> 2: 1 0.57285336 0.78293276 0.6275887 -0.2019328
#> 3: 1 0.20168193 0.52971958 0.6275887 -0.2019328
#> 4: 1 0.94467527 0.02333120 0.6275887 -0.2019328
#> 5: 1 0.62911404 0.73231374 0.6275887 -0.2019328
#> 6: 1 0.20597457 0.47761962 0.6275887 -0.2019328
#> 7: 1 0.68702285 0.43809711 0.6275887 -0.2019328
#> 8: 1 0.76984142 0.07067905 0.6275887 -0.2019328
#> 9: 1 0.71761851 0.31627171 0.6275887 -0.2019328
#> 10: 1 0.38003518 0.66200508 0.6275887 -0.2019328
#> 11: 1 0.93470523 0.91287592 0.6275887 -0.2019328
#> 12: 1 0.65167377 0.45906573 0.6275887 -0.2019328
#> 13: 1 0.26722067 0.65087047 0.6275887 -0.2019328
#> 14: 1 0.01339033 0.47854525 0.6275887 -0.2019328
#> 15: 1 0.86969085 0.08424691 0.6275887 -0.2019328
#> 16: 1 0.48208012 0.33907294 0.6275887 -0.2019328
#> 17: 1 0.49354131 0.34668349 0.6275887 -0.2019328
#> 18: 1 0.82737332 0.47635125 0.6275887 -0.2019328
#> 19: 1 0.79423986 0.86433947 0.6275887 -0.2019328
#> 20: 1 0.72371095 0.77732070 0.6275887 -0.2019328
#> 21: 2 0.37212390 0.64706019 0.5042928 0.1252771
#> 22: 2 0.90820779 0.55303631 0.5042928 0.1252771
#> 23: 2 0.89838968 0.78935623 0.5042928 0.1252771
#> 24: 2 0.66079779 0.47723007 0.5042928 0.1252771
#> 25: 2 0.06178627 0.69273156 0.5042928 0.1252771
#> 26: 2 0.17655675 0.86120948 0.5042928 0.1252771
#> 27: 2 0.38410372 0.24479728 0.5042928 0.1252771
#> 28: 2 0.49769924 0.09946616 0.5042928 0.1252771
#> 29: 2 0.99190609 0.51863426 0.5042928 0.1252771
#> 30: 2 0.77744522 0.40683019 0.5042928 0.1252771
#> 31: 2 0.21214252 0.29360337 0.5042928 0.1252771
#> 32: 2 0.12555510 0.33239467 0.5042928 0.1252771
#> 33: 2 0.38611409 0.25801678 0.5042928 0.1252771
#> 34: 2 0.38238796 0.76631067 0.5042928 0.1252771
#> 35: 2 0.34034900 0.87532133 0.5042928 0.1252771
#> 36: 2 0.59956583 0.83944035 0.5042928 0.1252771
#> 37: 2 0.18621760 0.33377493 0.5042928 0.1252771
#> 38: 2 0.66846674 0.89219834 0.5042928 0.1252771
#> 39: 2 0.10794363 0.38998954 0.5042928 0.1252771
#> 40: 2 0.41127443 0.96061800 0.5042928 0.1252771
#> country x y intercept slope
Created on 2020-04-10 by the reprex package (v0.3.0)
Upvotes: 0