Reputation: 5897
I am working with the R programming language.
I have the following data:
library(GA)
library(dplyr)
var_1 = rnorm(1000,10,10)
var_2 = rnorm(1000,5,5)
var_3 = rnorm(1000, 1,1)
goal = rnorm(1000,100,100)
my_data = data.frame(var_1, var_2, var_3, goal)
I wrote the following function that randomly splits this data into 3 different groups and evaluates a "fitness value" ("total_mean") of these groups based on the percentage of the data within each group that is less than some randomly assigned percentile:
#define fitness function
fitness <- function(x) {
x1 = x[1]
x2 = x[2]
x3 = x[3]
x4 = x[4]
x5 = x[5]
x6 = x[6]
x7 = x[7]
x8 = x[8]
x9 = x[9]
#bin data according to random criteria
train_data <- my_data %>% mutate(cat = ifelse(var_1 <= x1 & var_2 <= x2 & var_3 <= x3, "a", ifelse( var_1 <= x4 & var_2 <= x5 & var_3 <= x6, "b", "c")))
train_data$cat = as.factor(train_data$cat)
#new splits
a_table = train_data %>%
filter(cat == "a") %>%
select(var_1, var_2, var_3, goal, cat)
b_table = train_data %>%
filter(cat == "b") %>%
select(var_1, var_2, var_3, goal, cat)
c_table = train_data %>%
filter(cat == "c") %>%
select(var_1, var_2, var_3, goal, cat)
x7 = runif(1,0, 1)
x8 = runif(1, 0, 1)
x9 = runif(1, 0, 1)
#calculate quantile ("quant") for each bin
table_a = data.frame(a_table%>% group_by(cat) %>%
mutate(quant = quantile(goal, prob = x7)))
table_b = data.frame(b_table%>% group_by(cat) %>%
mutate(quant = quantile(goal, prob = x8)))
table_c = data.frame(c_table%>% group_by(cat) %>%
mutate(quant = quantile(goal, prob = x9)))
#create a new variable ("diff") that measures if the quantile is bigger tha the value of "c1"
table_a$diff = ifelse(table_a$quant > table_a$goal,1,0)
table_b$diff = ifelse(table_b$quant > table_b$goal,1,0)
table_c$diff = ifelse(table_c$quant > table_c$goal,1,0)
#group all tables
final_table = rbind(table_a, table_b, table_c)
# calculate the total mean : this is what needs to be optimized
total_mean = mean(final_table$diff)
n_row_a = nrow(table_a)
n_row_b = nrow(table_b)
n_row_c = nrow(table_c)
return(total_mean)
}
I was able to then optimize this function using the Genetic Algorithm in R:
GA <- ga(type = "real-valued",
fitness = fitness,
lower = c(min(var_1), min(var_2), min(var_3), min(var_1), min(var_2), min(var_3), 0,0,0), upper = c(max(var_1), max(var_2), max(var_3), max(var_1), max(var_2), max(var_3), 1,1,1),
popSize = 50, maxiter = 10, run = 10)
My Question: I would now like to add some "constraints" to this function that prevents the arguments of this function from taking certain values and also prevents the splits made by this function having 0 rows. My logic being that these constraints will work by assigning the returned value of the function as "NaN":
if (n_row_a < 1 | n_row_b < 1 | n_row_c <1 | x4 < x1 | x5 < x2 | x6 < x3){
total_mean <- NaN
}
I tried to add these constraints to the above function:
#define fitness function
fitness <- function(x) {
x1 = x[1]
x2 = x[2]
x3 = x[3]
x4 = x[4]
x5 = x[5]
x6 = x[6]
x7 = x[7]
x8 = x[8]
x9 = x[9]
#bin data according to random criteria
train_data <- my_data %>% mutate(cat = ifelse(var_1 <= x1 & var_2 <= x2 & var_3 <= x3, "a", ifelse( var_1 <= x4 & var_2 <= x5 & var_3 <= x6, "b", "c")))
train_data$cat = as.factor(train_data$cat)
#new splits
a_table = train_data %>%
filter(cat == "a") %>%
select(var_1, var_2, var_3, goal, cat)
b_table = train_data %>%
filter(cat == "b") %>%
select(var_1, var_2, var_3, goal, cat)
c_table = train_data %>%
filter(cat == "c") %>%
select(var_1, var_2, var_3, goal, cat)
x7 = runif(1,0, 1)
x8 = runif(1, 0, 1)
x9 = runif(1, 0, 1)
#calculate quantile ("quant") for each bin
table_a = data.frame(a_table%>% group_by(cat) %>%
mutate(quant = quantile(goal, prob = x7)))
table_b = data.frame(b_table%>% group_by(cat) %>%
mutate(quant = quantile(goal, prob = x8)))
table_c = data.frame(c_table%>% group_by(cat) %>%
mutate(quant = quantile(goal, prob = x9)))
#create a new variable ("diff") that measures if the quantile is bigger tha the value of "c1"
table_a$diff = ifelse(table_a$quant > table_a$goal,1,0)
table_b$diff = ifelse(table_b$quant > table_b$goal,1,0)
table_c$diff = ifelse(table_c$quant > table_c$goal,1,0)
#group all tables
final_table = rbind(table_a, table_b, table_c)
# calculate the total mean : this is what needs to be optimized
total_mean = mean(final_table$diff)
n_row_a = nrow(table_a)
n_row_b = nrow(table_b)
n_row_c = nrow(table_c)
if (n_row_a < 1 | n_row_b < 1 | n_row_c <1 | x4 < x1 | x5 < x2 | x6 < x3){
total_mean <- NaN
}
return(total_mean)
}
My Problem: However, now the constraints do not seem to be respected:
GA <- ga(type = "real-valued",
fitness = fitness,
lower = c(min(var_1), min(var_2), min(var_3), min(var_1), min(var_2), min(var_3), 0,0,0), upper = c(max(var_1), max(var_2), max(var_3), max(var_1), max(var_2), max(var_3), 1,1,1),
popSize = 50, maxiter = 1000, run = 100)
# output
> GA@solution
x1 x2 x3 x4 x5 x6 x7 x8 x9
[1,] 24 -5.3 4.4 38 12 -1.6 0.88 0.23 0.99
[2,] 21 -5.3 4.4 38 12 -1.6 0.88 0.23 0.99
As we can see here, X6 is less than X3 - it appears that these constraints were not respected. These solutions should have never been returned.
Can someone please show me how to correctly specify these constraints in my function?
Note: You can check to see whether all the solutions obey the constraints at once:
test = data.frame(GA@population)
test$constraint = ifelse(test$X4 > test$X1 & test$X5 > test$X2 & test$X6 > test$X3, "YES", "NO")
table(test$constraint)
Upvotes: 0
Views: 597
Reputation: 5232
I don't understand background of this problem, but there are few things you could improve and few you need to fix.
fitness function
fitness <- function(x, var_data, goal){
# points
x1 <- x[1:3]
x2 <- x[4:6]
x3 <- x[7:9]
# constraint 1
if(any(x1 > x2)) return(-Inf)
# grouping vector
split_var <- apply(
X = var_data,
MARGIN = 1,
FUN = function(y) if(all(y <= x1)) "a" else if (all(y <= x2)) "b" else "c"
)
# constraint 2
if(length(unique(split_var)) < 3) return(-Inf)
# for each group percentage of values that are lower than given quantile
value <- mapply(
FUN = function(x, y) mean(x < quantile(x, prob = y)),
split(goal, split_var), x3
)
# final fitness value
- mean(value) # minus because of minimization
}
You can adapt your goal of fitness function somewhat easier and faster. You shoudn't sample for x7,x8,x9 because those are your parameters and you change them randomly within fitness function. Also you are not calculating average percentage (you don't calculate percentages for each group). Keep in mind that this can be extremely biased towards creating many small groups and one big. Because GA::ga
performs maximization you need to multiply by -1 to get minimization process.
optimization call
# var data matrix
var_data <- cbind(var_1, var_2, var_3)
# column ranges
limits <- apply(var_data, 2, range)
GA <- ga(
type = "real-valued",
fitness = fitness,
lower = c(rep(limits[1,], 2), 0, 0, 0),
upper = c(rep(limits[2,], 2), 1, 1, 1),
popSize = 50,
maxiter = 1000,
run = 50,
var_data = var_data,
goal = goal
)
Upvotes: 2