Reputation: 154
I have two dataframes - a dataframe of 7 bins, specifying the limits and name of each bin (called FJX_bins) and a frame of wavelength-sigma pairs (test_spectra). I want to create a new variable in Test_Spectra entitled bin_number based on the bin limits in the FJX_bins
data. The dput
of the two is included below.
It would be relatively easy to brute force this with mutate
and case_when
, but the key here is that I would like the solution to be extensible to an arbitrary number of bins. My feeling is that there is likely to be some sort of apply
method in dplyr
that could be of use here, but all I could think to do was use a for loop, as shown here:
df <- test_spectra %>%
mutate(bin_number = case_when(
for(ii in 1:nrow(FJX_bins)){
Wavelength >= FJX_bins$Lambda_Start[ii] & Wavelength < FJX_bins$Lambda_End[ii] ~
FJX_bins$Bin_Number[ii]}
))
This strategy fails, and throws the error
Case 1 (
for (ii in 1:nrow(FJX_bins)) {...
) must be a two-sided formula, not a NULL
Is there a way to use dplyr
to solve this question? Or do I need to step back and look at something like apply
and cut
? I'd rather stick within a dplyr
framework for other reasons, but could go outside of it, too.
Thanks
FJX_bins <- structure(list(Bin_Number = 1:7, Lambda_Start = c(289, 298.25,
307.45, 312.45, 320.3, 345, 412.45), Lambda_End = c(298.25, 307.45,
312.45, 320.3, 345, 412.45, 850)), row.names = c(NA, -7L), class = c("tbl_df",
"tbl", "data.frame"), spec = structure(list(cols = structure(list(
Bin_Number = structure(list(), class = c("collector_integer",
"collector")), Lambda_Start = structure(list(), class = c("collector_double",
"collector")), Lambda_End = structure(list(), class = c("collector_double",
"collector")), Effective_Lambda = structure(list(), class = c("collector_integer",
"collector"))), .Names = c("Bin_Number", "Lambda_Start",
"Lambda_End", "Effective_Lambda")), default = structure(list(), class = c("collector_guess",
"collector"))), .Names = c("cols", "default"), class = "col_spec"), .Names = c("Bin_Number",
"Lambda_Start", "Lambda_End"))
test_spectra <- structure(list(Wavelength = c(289L, 290L, 291L, 292L, 293L, 294L,
295L, 296L, 297L, 298L, 299L, 300L, 289L, 290L, 291L, 292L, 293L,
294L, 295L, 296L, 297L, 298L, 299L, 300L, 289L, 290L, 291L, 292L,
293L, 294L, 295L, 296L, 297L, 298L, 299L, 300L, 289L, 290L, 291L,
292L, 293L, 294L, 295L, 296L, 297L, 298L, 299L, 300L), Sigma = c(3.97790085259898e-20,
3.88773011066234e-20, 3.77170497723194e-20, 3.63990173255768e-20,
3.53611020195826e-20, 3.39379425027765e-20, 3.24540998352932e-20,
3.08629426249589e-20, 2.93243925380076e-20, 2.80431593390348e-20,
2.64345023340469e-20, 2.49597804268261e-20, 4.79587956800083e-20,
4.67040607723134e-20, 4.5134283789068e-20, 4.32731814710643e-20,
4.13196812361237e-20, 3.93856298421813e-20, 3.77050786831795e-20,
3.62340670271797e-20, 3.49404344374885e-20, 3.36066462681245e-20,
3.20871974271263e-20, 3.03438697547602e-20, 5.27803299371575e-20,
5.12475486084599e-20, 4.99112054163632e-20, 4.86399784101602e-20,
4.73236079731255e-20, 4.56798834656559e-20, 4.36887241590191e-20,
4.13697643104457e-20, 3.89697643104457e-20, 3.66909671059429e-20,
3.46634646072095e-20, 3.28648835305714e-20, 5.71590756444018e-20,
5.57618648066173e-20, 5.44949261656802e-20, 5.33110977304272e-20,
5.21177991137917e-20, 5.07478142704849e-20, 4.9100984463428e-20,
4.70660943398542e-20, 4.47661068638463e-20, 4.24314737804269e-20,
4.02176301884806e-20, 3.82570654305878e-20)), row.names = c(NA,
-48L), class = c("tbl_df", "tbl", "data.frame"), .Names = c("Wavelength",
"Sigma"))
Upvotes: 4
Views: 4505
Reputation: 1312
In case you don't find a dplyr
solution:
library(data.table)
setDT(test_spectra)
setDT(FJX_bins)
test_spectra[FJX_bins,
bin_number := i.Bin_Number,
on = .(Wavelength >= Lambda_Start, Wavelength < Lambda_End)]
This makes a non-equi join with the bins table and sets the bin number accordingly.
Upvotes: 3
Reputation: 181
with dplyr:
To create bin number as a factor
library(dplyr)
Test_Spectra <- mutate(test_spectra,
bin = cut(Wavelength, breaks = c(FJX_bins$Lambda_Start, 850),
labels = FJX_bins$Bin_Number, right = F))
Or to create bin number as a character variable
Test_Spectra <- mutate(test_spectra,
bin = as.character(cut(Wavelength,
breaks = c(FJX_bins$Lambda_Start, 850),
labels = FJX_bins$Bin_Number, right = F)))
Upvotes: 4
Reputation: 35187
fuzzyjoin
implements dplyr
range/interval joins:
library(fuzzyjoin)
interval_left_join(
FJX_bins,
test_spectra,
by = c('Wavelength' = 'Lambda_Start', 'Wavelength' = 'Lambda_End')
)
# A tibble: 52 x 5 Wavelength Sigma Bin_Number Lambda_Start Lambda_End <int> <dbl> <int> <dbl> <dbl> 1 289 3.98e-20 1 289 298. 2 290 3.89e-20 1 289 298. 3 291 3.77e-20 1 289 298. 4 292 3.64e-20 1 289 298. 5 293 3.54e-20 1 289 298. 6 294 3.39e-20 1 289 298. 7 295 3.25e-20 1 289 298. 8 296 3.09e-20 1 289 298. 9 297 2.93e-20 1 289 298. 10 298 2.80e-20 1 289 298. # … with 42 more rows
Upvotes: 4