Reputation: 47
I have a dataframe with 50 medition techniques, and 4 different categories they can be put into. Each person can put one technique into multiple categories. I want to count the occurences of each time a technique is selected for a specific category (1,2,3 or 4) and create a new dataframe with these added up.
Example of current dataframe:
Techniques_1 | Techniques_2 | Techniques_3 | Techniques_4 | Techniques_5 | Techniques_6 |
---|---|---|---|---|---|
1,3 | 2 | 4 | 2,3 | 1 | 1 |
2,3 | 3 | 4 | 2,3 | 1 | 1 |
2 | 2 | 3 | 3 | 1,2 | 1,2,3 |
1 | 3 | 3 | 1 | 2 | 2 |
2 | 2,3 | 2,3 | 1,2,3 | 1 | 1,2 |
Desired new dataframe:
Category_Count | Technique_1 | Technique_2 | Technique_3 | Technique_4 | Technique_5 | Technique_6 |
---|---|---|---|---|---|---|
Category_1 | 2 | 0 | 0 | 2 | 4 | 4 |
Category_2 | 3 | 3 | 1 | 3 | 2 | 3 |
Categroy_3 | 2 | 3 | 3 | 4 | 0 | 1 |
Category_4 | 0 | 0 | 2 | 0 | 0 | 0 |
Here is my data set, the counts are different from my example as they
were made up. Had a go at using dput, hopefully this is correct:
Med_Tech_Structure <- structure(list(Techniques_1 = c("2", "2", "2",
NA, "1", "1"),
Techniques_2 = c("2", "2", "2", NA, "1,2", "2"), Techniques_3 =
c("2",
"2", "2", NA, "1", "1"), Techniques_4 = c("2", "2", "2",
NA, "1", "2"), Techniques_5 = c("2,3", "4", "3", NA, "4",
"1"), Techniques_6 = c("3", "3", "3", NA, "4", "3"), Techniques_7 =
c("2",
"2", "2", NA, "1", "1"), Techniques_8 = c("1", "4", "1,3",
NA, "4", "2"), Techniques_9 = c("2", "2", "2", NA, "2", "2"
), Techniques_10 = c("1", "4", "1", NA, "2", "1"), Techniques_11 =
c("2",
"4", "1,2", NA, "4", "4"), Techniques_12 = c("1", "4", "2",
NA, "4", "4"), Techniques_13 = c("2,3", "4", "1,2", NA, "4",
"4"), Techniques_14 = c("2", "4", "1,2", NA, "2", "4"), Techniques_15
= c("2",
"4", "2,3", NA, "4", "4"), Techniques_16 = c("1", "1", "3",
NA, "1", "1"), Techniques_17 = c("3", "3", "3", NA, "4",
"4"), Techniques_18 = c("2", "4", "3", NA, "1", "1"), Techniques_19 =
c("2",
"2", "2", NA, "2", "2"), Techniques_20 = c("2", "4", "1",
NA, "4", "4"), Techniques_21 = c("1,2", "4", "1,2", NA, "4",
"4"), Techniques_22 = c("1,2", "1", "1,2", NA, "2", "4"),
Techniques_23 = c("2", "4", "2", NA, "2", "4"), Techniques_24 =
c("2",
"2", "2", NA, "2", "2"), Techniques_25 = c("2", "4", "2",
NA, "4", "4"), Techniques_26 = c("1,2", "4", "1,3", NA, "4",
"3"), Techniques_27 = c("2", "4", "3", NA, "1", "1,2"), Techniques_28
= c("2",
"4", "2", NA, "3", "4"), Techniques_29 = c("1,2", "4", "1,2",
NA, "4", "4"), Techniques_30 = c("1,2", "4", "2", NA, "4",
"4"), Techniques_31 = c("2", "4", "2,3", NA, "1,2", "1"),
Techniques_32 = c("2,3", "4", "1,3", NA, "4", "4"), Techniques_33 =
c("1,2",
"4", "1,2", NA, "4", "4"), Techniques_34 = c("1,2", "4",
"2,3", NA, "4", "4"), Techniques_35 = c("1,2", "4", "2,3",
NA, "4", "4"), Techniques_36 = c("2", "4", "2,3", NA, "4",
"4"), Techniques_37 = c("1,2", "4", "2,3", NA, "4", "4"),
Techniques_38 = c("1", "1", "2", NA, "1", "1"), Techniques_39 =
c("1,2",
"4", "2", NA, "1", "4"), Techniques_40 = c("1,2", "4", "2",
NA, "4", "4"), Techniques_41 = c("1,2", "4", "2", NA, "4",
"4"), Techniques_42 = c("1,2", "4", "1,2", NA, "4", "4"),
Techniques_43 = c("1,2,3", "4", "1,2", NA, "4", "4"), Techniques_44 =
c("1,2,3",
"3", "1,2,3", NA, "4", "4"), Techniques_45 = c("1,2", "1",
"2,3", NA, "4", "4"), Techniques_46 = c("1", "4", "1,2",
NA, "1,2", "4"), Techniques_47 = c("2", "4", "2", NA, "4",
"4"), Techniques_48 = c("1", "2", "2", NA, "1", "2"), Techniques_49 =
c("2",
"4", "2", NA, "1", "4"), Techniques_50 = c("1,2", "4", "2,3",
NA, "4", "4")), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
CurrentFrame <-
data.frame(Techniques_1 = c("1,3", "2,3", "2", "1", "2"),
Techniques_2 = c("2", "3", "2", "3", "2,3"),
Techniques_3 = c("4", "4", "3", "3", "2,3"),
Techniques_4 = c("2,3", "2,3", "3", "1", "1,2,3"),
Techniques_5 = c("1", "1", "1,2", "2", "1"),
Techniques_6 = c("1", "1", "1,2,3", "2", "1,2"),
stringsAsFactors = FALSE)
dput(CurrentFrame)
dput
structure(list(Techniques_1 = c("1,3", "2,3", "2", "1", "2"),
Techniques_2 = c("2", "3", "2", "3", "2,3"), Techniques_3 = c("4",
"4", "3", "3", "2,3"), Techniques_4 = c("2,3", "2,3", "3",
"1", "1,2,3"), Techniques_5 = c("1", "1", "1,2", "2", "1"
), Techniques_6 = c("1", "1", "1,2,3", "2", "1,2")), class =
"data.frame", row.names = c(NA,
-5L))
Upvotes: 2
Views: 85
Reputation: 27742
a data.table
approach
library(data.table)
# convert to data.table
setDT(CurrentFrame)
# melt to long format
DT <- melt(CurrentFrame, measure.vars = names(CurrentFrame))
# split comma separated values to new row
DT <- DT[, .(value = unlist(strsplit(value, ","))), by = "variable"]
# cast to wide again, use langth as aggregate function
dcast(DT, paste0("Category_", value) ~ variable, value.var = "value", fun.aggregate = length)
# value Techniques_1 Techniques_2 Techniques_3 Techniques_4 Techniques_5 Techniques_6
# 1: Category_1 2 0 0 2 4 4
# 2: Category_2 3 3 1 3 2 3
# 3: Category_3 2 3 3 4 0 1
# 4: Category_4 0 0 2 0 0 0
Upvotes: 2
Reputation: 7540
An alternative approach:
df <- data.frame(
Techniques_1 = c("1,3", "2,3", "2", "1", "2"),
Techniques_2 = c("2", "3", "2", "3", "2,3"),
Techniques_3 = c("4", "4", "3", "3", "2,3"),
Techniques_4 = c("2,3", "2,3", "3", "1", "1,2,3"),
Techniques_5 = c("1", "1", "1,2", "2", "1"),
Techniques_6 = c("1", "1", "1,2,3", "2", "1,2"),
stringsAsFactors = FALSE
)
library(tidyverse)
count_selected <- function(x){
df |>
mutate(across(everything(), ~str_count(., x))) |>
summarise(across(everything(), sum))
}
map_dfr(c("1", "2", "3", "4"), count_selected)
#> Techniques_1 Techniques_2 Techniques_3 Techniques_4 Techniques_5 Techniques_6
#> 1 2 0 0 2 4 4
#> 2 3 3 1 3 2 3
#> 3 2 3 3 4 0 1
#> 4 0 0 2 0 0 0
Created on 2022-05-12 by the reprex package (v2.0.1)
Upvotes: 1
Reputation: 1284
#load data.frame
dt <- data.frame(Techniques_1 = c("1,3", "2,3", "2", "1", "2"),
Techniques_2 = c("2", "3", "2", "3", "2,3"),
Techniques_3 = c("4", "4", "3", "3", "2,3"),
Techniques_4 = c("2,3", "2,3", "3", "1", "1,2,3"),
Techniques_5 = c("1", "1", "1,2", "2", "1"),
Techniques_6 = c("1", "1", "1,2,3", "2", "1,2"),
stringsAsFactors = FALSE)
#create list to bind results to
result <- list()
#create a vector of categories
categories <- 1:4
#loop through categories
for(i in categories){
#find occurayce of techniques in a category with the use of grepl
occurance_count <- lapply(dt, FUN = function(x) length(grepl(i, x)[grepl(i, x) == TRUE]))
#convert from list to data.frame
occurance_count <- data.frame(occurance_count)
#create data.frame to bind results to
temporary <- data.frame(Category_Count = paste0("Category_", i))
#add occurance counts to data.frame
temporary <- cbind(temporary, occurance_count)
#bind data.frame to list
result[[i]] <- temporary
}
#from list to data.frame
result <- do.call(rbind, result)
result
Category_Count Techniques_1 Techniques_2 Techniques_3 Techniques_4 Techniques_5 Techniques_6
1 Category_1 2 0 0 2 4 4
2 Category_2 3 3 1 3 2 3
3 Category_3 2 3 3 4 0 1
4 Category_4 0 0 2 0 0 0
Upvotes: 1
Reputation: 17279
One option for a solution.
CurrentFrame <-
structure(list(Techniques_1 = c("1,3", "2,3", "2", "1", "2"),
Techniques_2 = c("2", "3", "2", "3", "2,3"),
Techniques_3 = c("4", "4", "3", "3", "2,3"),
Techniques_4 = c("2,3", "2,3", "3", "1", "1,2,3"),
Techniques_5 = c("1", "1", "1,2", "2", "1"),
Techniques_6 = c("1", "1", "1,2,3", "2", "1,2")),
class = "data.frame",
row.names = c(NA, -5L))
# or use the base R pipe...I'm not on that version yet though.
library(magrittr)
count_category <- function(x){
# separate each string into distinct values
category <- unlist(strsplit(x, ","))
# converting to a factor allows us to tabulate zero counts
category <- factor(category,
c("1", "2", "3", "4"))
# make a single column data frame
data.frame(table(category))$Freq
}
lapply(CurrentFrame,
count_category) %>%
do.call("cbind", .) %>%
as.data.frame()
#> Techniques_1 Techniques_2 Techniques_3 Techniques_4 Techniques_5 Techniques_6
#> 1 2 0 0 2 4 4
#> 2 3 3 1 3 2 3
#> 3 2 3 3 4 0 1
#> 4 0 0 2 0 0 0
Upvotes: 1