Reputation: 33
I have the following dataset, where I have a 1 to indicate a food type was put out and a 0 to indicate it wasn't. I know that the most common number of food types put out is 2, but I'd love to know what combination is the most common.
Here is a sample of the dataset
structure(list(Type_SunflowerSeeds = c(1L, 1L, 1L, 0L, 0L), Type_SafflowerSeeds = c(0L,
0L, 0L, 0L, 0L), Type_Nyjer = c(0L, 0L, 0L, 0L, 0L), Type_EconMix = c(1L,
1L, 0L, 1L, 1L), Type_PremMix = c(0L, 0L, 0L, 0L, 0L), Type_Grains = c(0L,
0L, 0L, 0L, 0L), Type_Nuts = c(0L, 0L, 1L, 1L, 0L), Type_Suet = c(1L,
0L, 0L, 0L, 0L), Type_SugarWater = c(1L, 0L, 0L, 0L, 1L), Type_FruitOrJams = c(0L,
0L, 0L, 1L, 0L), Type_Mealworms = c(0L, 0L, 0L, 0L, 0L), Type_Corn = c(0L,
0L, 0L, 0L, 0L), Type_BarkOrPeanutButter = c(0L, 0L, 0L, 0L,
0L), Type_Scraps = c(1L, 1L, 1L, 1L, 0L), Type_Bread = c(0L,
0L, 0L, 0L, 0L), Type_Other = c(0L, 0L, 0L, 0L, 0L), total = c(5,
3, 3, 4, 2)), row.names = c(NA, 5L), class = "data.frame")
I would like to know the most common pairwise combination as well as the most common three-way combination.
So the output would look like this for pairwise and similar for the three-way combo:
type1 type2 number_of_times
1 Type_SugarWater Type_EconMix 351
2 Type_SunflowerSeeds Type_SugarWater 335
Upvotes: 0
Views: 657
Reputation: 46888
You can use arules which is geared from this kind of analysis. You can read more about some of its uses here
So this is your data:
df = structure(list(Type_SunflowerSeeds = c(1L, 1L, 1L, 0L, 0L), Type_SafflowerSeeds = c(0L,
0L, 0L, 0L, 0L), Type_Nyjer = c(0L, 0L, 0L, 0L, 0L), Type_EconMix = c(1L,
1L, 0L, 1L, 1L), Type_PremMix = c(0L, 0L, 0L, 0L, 0L), Type_Grains = c(0L,
0L, 0L, 0L, 0L), Type_Nuts = c(0L, 0L, 1L, 1L, 0L), Type_Suet = c(1L,
0L, 0L, 0L, 0L), Type_SugarWater = c(1L, 0L, 0L, 0L, 1L), Type_FruitOrJams = c(0L,
0L, 0L, 1L, 0L), Type_Mealworms = c(0L, 0L, 0L, 0L, 0L), Type_Corn = c(0L,
0L, 0L, 0L, 0L), Type_BarkOrPeanutButter = c(0L, 0L, 0L, 0L,
0L), Type_Scraps = c(1L, 1L, 1L, 1L, 0L), Type_Bread = c(0L,
0L, 0L, 0L, 0L), Type_Other = c(0L, 0L, 0L, 0L, 0L), total = c(5,
3, 3, 4, 2)), row.names = c(NA, 5L), class = "data.frame")
We make it a matrix and convert it to a transactions
object, i omit the last column because you don't need total:
library(arules)
m = as(as.matrix(df[,-ncol(df)]),"transactions")
summary(m)
#gives you a lot of information about this data
# now we get a co-occurence matrix
counts = crossTable(m)
To get the the data frame you stated, you need to use dplyr
and tidyr
:
# convert to data.frame
counts[upper.tri(counts)]=NA
diag(counts)=NA
data.frame(counts) %>%
# add rownames as item1
tibble::rownames_to_column("item1") %>%
# make it long format, like you wanted
pivot_longer(-item1,names_to="item2") %>%
# remove rows where item1 == item2
filter(!is.na(value)) %>%
# sort
arrange(desc(value))
# A tibble: 120 x 3
item1 item2 value
<chr> <chr> <int>
1 Type_Scraps Type_SunflowerSeeds 3
2 Type_Scraps Type_EconMix 3
3 Type_EconMix Type_SunflowerSeeds 2
4 Type_SugarWater Type_EconMix 2
The above can be simplified by using apriori
in arules:
# number of combinations
N = 2
# create apriori object
rules = apriori(m,parameter=list(maxlen=N,minlen=N,conf =0.01,support=0.01))
gi <- generatingItemsets(rules)
d <- which(duplicated(gi))
rules = sort(rules[-d])
# output results
data.frame(
lhs=labels(lhs(rules)),
rhs=labels(rhs(rules)),
count=quality(rules)$count)
lhs rhs count
1 {Type_SunflowerSeeds} {Type_Scraps} 3
2 {Type_EconMix} {Type_Scraps} 3
3 {Type_SugarWater} {Type_EconMix} 2
4 {Type_Nuts} {Type_Scraps} 2
5 {Type_SunflowerSeeds} {Type_EconMix} 2
6 {Type_FruitOrJams} {Type_Nuts} 1
For occurrence of 3, simply change N above to 3.
Upvotes: 1
Reputation: 897
If I'm understanding your question correctly and you're looking for how frequent two of the same categories are 1 in the same row (e.g. pairwise like @M-- asked), here's how I've done it in the past. I'm sure there's a more graceful way of going about it though :D
library(dplyr)
library(tidyr)
test.df <- structure(list(Type_SunflowerSeeds = c(1L, 1L, 1L, 0L, 0L, 1L,
1L, 0L, 0L, 0L), Type_SafflowerSeeds = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L), Type_Nyjer = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), Type_EconMix = c(1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L,
0L, 0L), Type_PremMix = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L), Type_Grains = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
Type_Nuts = c(0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), Type_Suet = c(1L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L), Type_SugarWater = c(1L,
0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 1L), Type_FruitOrJams = c(0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), Type_Mealworms = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Type_Corn = c(0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L), Type_BarkOrPeanutButter = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Type_Scraps = c(1L,
1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), Type_Bread = c(0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Type_Other = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L), total = c(5, 3, 3, 4, 2, 3,
3, 1, 1, 2)), row.names = c(NA, 10L), class = "data.frame")
test.df %>%
mutate(food.id = 1:n()) %>%
gather(key = "type1", value = "val", -food.id, -total) %>% #create an ID column for each row
filter(val==1) %>%
select(food.id, type1) %>% #now we have a data.frame with one column for food.id and
# one column for every food.type it is associated with
left_join( # this left join is essentially doing the same thing we did before
test.df %>%
mutate(food.id = 1:n()) %>%
gather(key = "type2", value = "val", -food.id, -total) %>%
filter(val==1) %>%
select(food.id, type2),
by = c("food.id") # now we're matching each food with all of its associated types
) %>%
mutate(type1.n = as.numeric(factor(type1)), # quick way of making sure we're not counting duplicates
# (e.g. if type1 = Type_SunflowerSeeds and type2 = Type_SafflowerSeeds, that's the same if they were switched)
type2.n = as.numeric(factor(type2))) %>%
filter(type1 > type2) %>% # this filter step takes care of the flip flopping issue
group_by(type1, type2) %>%
summarise( #finally, count the combinations/pairwise values
n.times = n()
) %>%
ungroup() %>%
arrange(desc(n.times), type1, type2)
With the output of:
type1 type2 n.times
<chr> <chr> <int>
1 Type_Scraps Type_EconMix 3
2 Type_SugarWater Type_EconMix 3
3 Type_SunflowerSeeds Type_EconMix 3
4 Type_SunflowerSeeds Type_Scraps 3
5 Type_SunflowerSeeds Type_SugarWater 3
6 Type_Scraps Type_Nuts 2
7 Type_SugarWater Type_Suet 2
8 Type_SunflowerSeeds Type_Suet 2
9 Type_FruitOrJams Type_EconMix 1
10 Type_Nuts Type_EconMix 1
11 Type_Nuts Type_FruitOrJams 1
12 Type_Scraps Type_FruitOrJams 1
13 Type_Suet Type_EconMix 1
14 Type_Suet Type_Scraps 1
15 Type_SugarWater Type_PremMix 1
16 Type_SugarWater Type_Scraps 1
17 Type_SunflowerSeeds Type_Nuts 1
To extend this and do a three-way combination count, you can follow this code. I've also added some additional comments to walk-through what's going on:
# create a baseline data.frame with food.id and every food type that it matches
food.type.long.df <- test.df %>%
mutate(food.id = 1:n()) %>%
gather(key = "type1", value = "val", -food.id, -total) %>%
filter(val==1) %>%
select(food.id, type1) %>%
arrange(food.id)
# join the baseline data.frame to itself to see all possible combinations of food types
# note: this includes repeated types like type1=Type_Corn and type2=Type_Corn
# this also includes rows where the types are simply flip-flopped types
# ex. Row 2 is type1=Type_SunflowerSeeds and type2 = Type_EconMix
# but Row 6 is type1=Type_EconMix and type2 = Type_SunflowerSeeds - we don't want to count this combinations twice
food.2types.df <- food.type.long.df %>%
left_join(
select(food.type.long.df, food.id, type2 = type1),
by = "food.id"
) %>%
arrange(food.id)
# let's add the third type as well; as with before, the same issues are in this df but we'll fix the duplicates
# and flip flops later
food.3types.df <- food.2types.df %>%
left_join(
select(food.type.long.df, food.id, type3 = type1),
by = "food.id"
) %>%
arrange(food.id)
food.3types.df.fixed <- food.3types.df %>%
distinct() %>%
mutate(type1.n = as.numeric(factor(type1)), # assign each type1 a number (in alphabetical order)
type2.n = as.numeric(factor(type2)), # assign each type2 a number (in alphabetical order)
type3.n = as.numeric(factor(type3))) %>% # assign each type3 a number (in alphabetical order)
filter(type1 > type2) %>% # to remove duplicates and flip-flopped rows for types 1 and 2, use a strict inequality
filter(type2 > type3) # to remove duplicates and flip-flopped rows for types 2 and 3, use a strict inequality
food.3type.combination.count <- food.3types.df.fixed %>%
group_by(type1, type2, type3) %>% # group by all three types you want to count
summarise(
n.times = n()
) %>%
ungroup() %>%
arrange(desc(n.times), type1, type2, type3)
With the output:
type1 type2 type3 n.times
<chr> <chr> <chr> <int>
1 Type_SunflowerSeeds Type_Scraps Type_EconMix 2
2 Type_SunflowerSeeds Type_SugarWater Type_EconMix 2
3 Type_SunflowerSeeds Type_SugarWater Type_Suet 2
4 Type_Nuts Type_FruitOrJams Type_EconMix 1
5 Type_Scraps Type_FruitOrJams Type_EconMix 1
6 Type_Scraps Type_Nuts Type_EconMix 1
7 Type_Scraps Type_Nuts Type_FruitOrJams 1
8 Type_Suet Type_Scraps Type_EconMix 1
9 Type_SugarWater Type_Scraps Type_EconMix 1
10 Type_SugarWater Type_Suet Type_EconMix 1
11 Type_SugarWater Type_Suet Type_Scraps 1
12 Type_SunflowerSeeds Type_Scraps Type_Nuts 1
13 Type_SunflowerSeeds Type_Suet Type_EconMix 1
14 Type_SunflowerSeeds Type_Suet Type_Scraps 1
15 Type_SunflowerSeeds Type_SugarWater Type_Scraps 1
Upvotes: 2