Reputation: 51
I'm finding working with the arule package a bit tricky. I'm using the apriori algorithm to find association rules; something similar to an example in the arules documentation.
data("AdultUCI")
dim(AdultUCI)
AdultUCI[1:2,]
#Ignore everything from here to the last two lines, this is just data preparation
## remove attributes
AdultUCI[["fnlwgt"]] <- NULL
AdultUCI[["education-num"]] <- NULL
## map metric attributes
AdultUCI[[ "age"]] <- ordered(cut(AdultUCI[[ "age"]], c(15,25,45,65,100)),
labels = c("Young", "Middle-aged", "Senior", "Old"))
AdultUCI[[ "hours-per-week"]] <- ordered(cut(AdultUCI[[ "hours-per-week"]],
c(0,25,40,60,168)),
labels = c("Part-time", "Full-time", "Over-time", "Workaholic"))
AdultUCI[[ "capital-gain"]] <- ordered(cut(AdultUCI[[ "capital-gain"]],
c(-Inf,0,median(AdultUCI[[ "capital-gain"]][AdultUCI[[ "capital-gain"]]>0]),
Inf)), labels = c("None", "Low", "High"))
AdultUCI[[ "capital-loss"]] <- ordered(cut(AdultUCI[[ "capital-loss"]],
c(-Inf,0, median(AdultUCI[[ "capital-loss"]][AdultUCI[[ "capital-loss"]]>0]),
Inf)), labels = c("None", "Low", "High"))
#resume reading here
rules <- apriori(AdultUCI, parameter=list(support=0.6, confidence=0.75, minlen=4))
inspect(rules)
Which returns the following four rules
lhs rhs support confidence lift
1 {race=White,
capital-gain=None,
native-country=United-States} => {capital-loss=None} 0.680398 0.9457029 0.9920537
2 {race=White,
capital-loss=None,
native-country=United-States} => {capital-gain=None} 0.680398 0.9083504 0.9901500
3 {race=White,
capital-gain=None,
capital-loss=None} => {native-country=United-States} 0.680398 0.9189249 1.0239581
4 {capital-gain=None,
capital-loss=None,
native-country=United-States} => {race=White} 0.680398 0.8730100 1.0210133
I must be missing something: how do you find the rows in the source data that match an lhs rule using just arules functions?
Is there an easy way to build an SQL WHERE clause from the lhs(rules)?
Thanks
Upvotes: 1
Views: 5091
Reputation: 708
As for the first question, transactions supporting may be found using this function (should work faster, than the one from the other response):
supp_trans_ids = function(items, transactions){
# makes a logical matrix showing which set of items in rows are fully contains in transactions on rows
tmp = is.subset(items, transactions)
tmp2 = lapply(
seq_len(nrow(tmp)),
# 'which' alone would leave a name for each index, which is a complete rule (and would use a lot of memory therefore)
function(i) {
t = which(tmp[i,])
names(t) = NULL
t
}
)
# to easily idenfify sets of items
names(tmp2) = rownames(tmp)
tmp2
}
Now, you may find which transactions support each rule's lhs with:
AdultUCI_trans = as(AdultUCI, 'transactions')
trans_supporting = supp_trans_ids(lhs(rules), AdultUCI_trans)
e.g.
> str(trans_supporting)
List of 4
$ {race=White,capital-gain=None,native-country=United-States} : int [1:35140] 2 3 6 8 13 17 18 19 20 21 ...
$ {race=White,capital-loss=None,native-country=United-States} : int [1:36585] 1 2 3 6 8 9 10 13 17 18 ...
$ {race=White,capital-gain=None,capital-loss=None} : int [1:36164] 2 3 6 8 13 17 18 19 20 21 ...
$ {capital-gain=None,capital-loss=None,native-country=United-States}: int [1:38066] 2 3 4 6 8 11 13 14 17 18 ...
And data you may find with:
AdultUCI_trans[trans_supporting[[1]]] # transactions supporting
AdultUCI[trans_supporting[[1]],] # data on which these transactions are based
Upvotes: 0
Reputation: 7714
This answer is based in the following answer: https://stats.stackexchange.com/questions/21340/finding-suitable-rules-for-new-data-using-arules. The solution is very slow, i´m not sure if will work for large aplications.
library(arules)
rules <- apriori(AdultUCI, parameter=list(support=0.4, confidence=0.75, minlen=4))
inspect(rules)
rec <- function(rules, data, iter){
basket <- data[iter]
rulesMatchLHS <- is.subset(rules@lhs,basket)
suitableRules <- rulesMatchLHS & !(is.subset(rules@rhs,basket))
rules <- sort(rules[rulesMatchLHS], decreasing=TRUE, by="lift")
as(head(rules, 1), "data.frame")
}
recom_loop <- function(rules, data){
temp <- lapply(seq_along(data), function(x) rec(rules, data, x))
temp <- do.call("rbind", temp)
recom <- gsub(".*=> |\\{|\\}", "", temp$rules)
as.data.frame(cbind(as(data, "data.frame"), recom))
}
trans <- as(AdultUCI, "transactions")
recom <- recom_loop(rules, trans[1:50])
Here is some example output:
head(recom)
transactionID
1 1
2 2
3 3
4 4
5 5
6 6
items
1 {age=Middle-aged,workclass=State-gov,education=Bachelors,marital-status=Never-married,occupation=Adm-clerical,relationship=Not-in-family,race=White,sex=Male,capital-gain=Low,capital-loss=None,hours-per-week=Full-time,native-country=United-States,income=small}
2 {age=Senior,workclass=Self-emp-not-inc,education=Bachelors,marital-status=Married-civ-spouse,occupation=Exec-managerial,relationship=Husband,race=White,sex=Male,capital-gain=None,capital-loss=None,hours-per-week=Part-time,native-country=United-States,income=small}
3 {age=Middle-aged,workclass=Private,education=HS-grad,marital-status=Divorced,occupation=Handlers-cleaners,relationship=Not-in-family,race=White,sex=Male,capital-gain=None,capital-loss=None,hours-per-week=Full-time,native-country=United-States,income=small}
4 {age=Senior,workclass=Private,education=11th,marital-status=Married-civ-spouse,occupation=Handlers-cleaners,relationship=Husband,race=Black,sex=Male,capital-gain=None,capital-loss=None,hours-per-week=Full-time,native-country=United-States,income=small}
5 {age=Middle-aged,workclass=Private,education=Bachelors,marital-status=Married-civ-spouse,occupation=Prof-specialty,relationship=Wife,race=Black,sex=Female,capital-gain=None,capital-loss=None,hours-per-week=Full-time,native-country=Cuba,income=small}
6 {age=Middle-aged,workclass=Private,education=Masters,marital-status=Married-civ-spouse,occupation=Exec-managerial,relationship=Wife,race=White,sex=Female,capital-gain=None,capital-loss=None,hours-per-week=Full-time,native-country=United-States,income=small}
recom
1 race=White
2 race=White
3 race=White
4 race=White
5 race=White
6 capital-gain=None
Upvotes: 3