Reputation: 172
I'm trying to add a new column to my df that is simply my function hardfunct applied to 'values' where the row is 'hardness'. I would then like that value to fill all the rows in that column for the matching 'site' and 'dates'. How do I fill the rest of the rows? I've tried using summarise, rowwise and mutate. Sample data is below.
site=c(rep("River A",4),rep("River B",4))
dates=as.Date(c("01/01/2001","01/01/2001","01/01/2001","01/01/2001","05/08/2001","05/08/2001","05/08/2001","05/08/2001"), format = "%m/%d/%Y")
param=c("lead","hardness","mercury","cadmium","lead","hardness","mercury","cadmium")
value=c("0.2","45","0.9","1.2","0.5","1800","0.6","0.8")
df=data.frame(site,param,dates,value)
hardfunct=function(x){
if (x>=400) {
print(400)
} else if (x<=25) {
print(25)
} else {
return(x)}
}
#######Trying to use group_by and mutate
df %>% group_by(site,dates) %>%
mutate(New_Hardness=sapply(df[df$param=="hardness","value"],hardfunct))
site param dates value New_Hardness
River A lead 1/1/2001 0.2 45
River A hardness 1/1/2001 45 45
River A mercury 1/1/2001 0.9 45
River A cadmium 1/1/2001 1.2 45
River B lead 5/8/2001 0.5 400
River B hardness 5/8/2001 1800 400
River B mercury 5/8/2001 0.6 400
River B cadmium 5/8/2001 0.8 400
Upvotes: 1
Views: 215
Reputation: 76683
In base R, you can use the split/apply/combine strategy.
Note that the pmax
and pmin
idea is @Frank's.
sp <- split(df, list(df$site, df$dates))
sp <- sp[sapply(sp, function(x) nrow(x) != 0)]
newdf <- lapply(sp, function(DF) {
DF$New_Hardness <- pmax(25, pmin(400, DF$value[DF$param == "hardness"]))
DF
})
rm(sp) # tidy up
newdf <- do.call(rbind, newdf)
row.names(newdf) <- NULL
newdf
# site param dates value New_Hardness
#1 River A lead 2001-01-01 0.2 45
#2 River A hardness 2001-01-01 45.0 45
#3 River A mercury 2001-01-01 0.9 45
#4 River A cadmium 2001-01-01 1.2 45
#5 River B lead 2001-05-08 0.5 400
#6 River B hardness 2001-05-08 1800.0 400
#7 River B mercury 2001-05-08 0.6 400
#8 River B cadmium 2001-05-08 0.8 400
Upvotes: 1
Reputation: 16121
site=c(rep("River A",4),rep("River B",4))
dates=as.Date(c("01/01/2001","01/01/2001","01/01/2001","01/01/2001","05/08/2001","05/08/2001","05/08/2001","05/08/2001"), format = "%m/%d/%Y")
param=c("lead","hardness","mercury","cadmium","lead","hardness","mercury","cadmium")
value=c("0.2","45","0.9","1.2","0.5","1800","0.6","0.8")
df=data.frame(site,param,dates,value, stringsAsFactors = F)
hardfunct=function(x){
if (x>=400) {
return(400)
} else if (x<=25) {
return(25)
} else {
return(x)}
}
library(dplyr)
df %>%
group_by(site, dates) %>%
mutate(New = hardfunct(as.numeric(value[param == "hardness"]))) %>%
ungroup()
# # A tibble: 8 x 5
# site param dates value New
# <chr> <chr> <date> <chr> <dbl>
# 1 River A lead 2001-01-01 0.2 45
# 2 River A hardness 2001-01-01 45 45
# 3 River A mercury 2001-01-01 0.9 45
# 4 River A cadmium 2001-01-01 1.2 45
# 5 River B lead 2001-05-08 0.5 400
# 6 River B hardness 2001-05-08 1800 400
# 7 River B mercury 2001-05-08 0.6 400
# 8 River B cadmium 2001-05-08 0.8 400
Note that you have to change the print
in your function to return
, otherwise you also a get a printed value before your dataframe output.
Also note that you need to have character variables and not factors, because the as.numeric
applied to factors will give you different numbers that what you expect.
Upvotes: 2