R_L_Med
R_L_Med

Reputation: 47

Creating column totals of individual numbers from strings of numbers in R

I have a dataframe with 50 medition techniques, and 4 different categories they can be put into. Each person can put one technique into multiple categories. I want to count the occurences of each time a technique is selected for a specific category (1,2,3 or 4) and create a new dataframe with these added up.

Example of current dataframe:

Techniques_1 Techniques_2 Techniques_3 Techniques_4 Techniques_5 Techniques_6
1,3 2 4 2,3 1 1
2,3 3 4 2,3 1 1
2 2 3 3 1,2 1,2,3
1 3 3 1 2 2
2 2,3 2,3 1,2,3 1 1,2

Desired new dataframe:

Category_Count Technique_1 Technique_2 Technique_3 Technique_4 Technique_5 Technique_6
Category_1 2 0 0 2 4 4
Category_2 3 3 1 3 2 3
Categroy_3 2 3 3 4 0 1
Category_4 0 0 2 0 0 0
    Here is my data set, the counts are different from my example as they 
were made up. Had a go at using dput, hopefully this is correct:

Med_Tech_Structure <- structure(list(Techniques_1 = c("2", "2", "2", 
NA, "1", "1"), 
Techniques_2 = c("2", "2", "2", NA, "1,2", "2"), Techniques_3 = 
c("2", 
"2", "2", NA, "1", "1"), Techniques_4 = c("2", "2", "2", 
NA, "1", "2"), Techniques_5 = c("2,3", "4", "3", NA, "4", 
"1"), Techniques_6 = c("3", "3", "3", NA, "4", "3"), Techniques_7 = 
c("2", 
"2", "2", NA, "1", "1"), Techniques_8 = c("1", "4", "1,3", 
NA, "4", "2"), Techniques_9 = c("2", "2", "2", NA, "2", "2"
), Techniques_10 = c("1", "4", "1", NA, "2", "1"), Techniques_11 = 
c("2", 
"4", "1,2", NA, "4", "4"), Techniques_12 = c("1", "4", "2", 
NA, "4", "4"), Techniques_13 = c("2,3", "4", "1,2", NA, "4", 
"4"), Techniques_14 = c("2", "4", "1,2", NA, "2", "4"), Techniques_15 
= c("2", 
"4", "2,3", NA, "4", "4"), Techniques_16 = c("1", "1", "3", 
NA, "1", "1"), Techniques_17 = c("3", "3", "3", NA, "4", 
"4"), Techniques_18 = c("2", "4", "3", NA, "1", "1"), Techniques_19 = 
c("2", 
"2", "2", NA, "2", "2"), Techniques_20 = c("2", "4", "1", 
NA, "4", "4"), Techniques_21 = c("1,2", "4", "1,2", NA, "4", 
"4"), Techniques_22 = c("1,2", "1", "1,2", NA, "2", "4"), 
Techniques_23 = c("2", "4", "2", NA, "2", "4"), Techniques_24 = 
c("2", 
"2", "2", NA, "2", "2"), Techniques_25 = c("2", "4", "2", 
NA, "4", "4"), Techniques_26 = c("1,2", "4", "1,3", NA, "4", 
"3"), Techniques_27 = c("2", "4", "3", NA, "1", "1,2"), Techniques_28 
= c("2", 
"4", "2", NA, "3", "4"), Techniques_29 = c("1,2", "4", "1,2", 
NA, "4", "4"), Techniques_30 = c("1,2", "4", "2", NA, "4", 
"4"), Techniques_31 = c("2", "4", "2,3", NA, "1,2", "1"), 
Techniques_32 = c("2,3", "4", "1,3", NA, "4", "4"), Techniques_33 = 
c("1,2", 
"4", "1,2", NA, "4", "4"), Techniques_34 = c("1,2", "4", 
"2,3", NA, "4", "4"), Techniques_35 = c("1,2", "4", "2,3", 
NA, "4", "4"), Techniques_36 = c("2", "4", "2,3", NA, "4", 
"4"), Techniques_37 = c("1,2", "4", "2,3", NA, "4", "4"), 
Techniques_38 = c("1", "1", "2", NA, "1", "1"), Techniques_39 = 
c("1,2", 
"4", "2", NA, "1", "4"), Techniques_40 = c("1,2", "4", "2", 
NA, "4", "4"), Techniques_41 = c("1,2", "4", "2", NA, "4", 
"4"), Techniques_42 = c("1,2", "4", "1,2", NA, "4", "4"), 
Techniques_43 = c("1,2,3", "4", "1,2", NA, "4", "4"), Techniques_44 = 
c("1,2,3", 
"3", "1,2,3", NA, "4", "4"), Techniques_45 = c("1,2", "1", 
"2,3", NA, "4", "4"), Techniques_46 = c("1", "4", "1,2", 
NA, "1,2", "4"), Techniques_47 = c("2", "4", "2", NA, "4", 
"4"), Techniques_48 = c("1", "2", "2", NA, "1", "2"), Techniques_49 = 
c("2", 
"4", "2", NA, "1", "4"), Techniques_50 = c("1,2", "4", "2,3", 
NA, "4", "4")), row.names = c(NA, -6L), class = c("tbl_df", 
"tbl", "data.frame"))
    CurrentFrame <- 
    data.frame(Techniques_1 = c("1,3", "2,3", "2", "1", "2"), 
             Techniques_2 = c("2", "3", "2", "3", "2,3"), 
             Techniques_3 = c("4", "4", "3", "3", "2,3"), 
             Techniques_4 = c("2,3", "2,3", "3", "1", "1,2,3"), 
             Techniques_5 = c("1", "1", "1,2", "2", "1"), 
             Techniques_6 = c("1", "1", "1,2,3", "2", "1,2"), 
             stringsAsFactors = FALSE)

    dput(CurrentFrame)

Console output from dput

    structure(list(Techniques_1 = c("1,3", "2,3", "2", "1", "2"), 
    Techniques_2 = c("2", "3", "2", "3", "2,3"), Techniques_3 = c("4", 
    "4", "3", "3", "2,3"), Techniques_4 = c("2,3", "2,3", "3", 
    "1", "1,2,3"), Techniques_5 = c("1", "1", "1,2", "2", "1"
    ), Techniques_6 = c("1", "1", "1,2,3", "2", "1,2")), class = 
    "data.frame", row.names = c(NA, 
    -5L))

Upvotes: 2

Views: 85

Answers (4)

Wimpel
Wimpel

Reputation: 27742

a data.table approach

library(data.table)
# convert to data.table
setDT(CurrentFrame)
# melt to long format
DT <- melt(CurrentFrame, measure.vars = names(CurrentFrame))
# split comma separated values to new row
DT <- DT[, .(value = unlist(strsplit(value, ","))), by = "variable"]
# cast to wide again, use langth as aggregate function
dcast(DT, paste0("Category_", value) ~ variable, value.var = "value", fun.aggregate = length)
#         value Techniques_1 Techniques_2 Techniques_3 Techniques_4 Techniques_5 Techniques_6
# 1: Category_1            2            0            0            2            4            4
# 2: Category_2            3            3            1            3            2            3
# 3: Category_3            2            3            3            4            0            1
# 4: Category_4            0            0            2            0            0            0

Upvotes: 2

Carl
Carl

Reputation: 7540

An alternative approach:

df <- data.frame(
  Techniques_1 = c("1,3", "2,3", "2", "1", "2"),
  Techniques_2 = c("2", "3", "2", "3", "2,3"),
  Techniques_3 = c("4", "4", "3", "3", "2,3"),
  Techniques_4 = c("2,3", "2,3", "3", "1", "1,2,3"),
  Techniques_5 = c("1", "1", "1,2", "2", "1"),
  Techniques_6 = c("1", "1", "1,2,3", "2", "1,2"),
  stringsAsFactors = FALSE
)

library(tidyverse)

count_selected <- function(x){
  df |> 
  mutate(across(everything(), ~str_count(., x))) |> 
  summarise(across(everything(), sum))
}

map_dfr(c("1", "2", "3", "4"), count_selected)
#>   Techniques_1 Techniques_2 Techniques_3 Techniques_4 Techniques_5 Techniques_6
#> 1            2            0            0            2            4            4
#> 2            3            3            1            3            2            3
#> 3            2            3            3            4            0            1
#> 4            0            0            2            0            0            0

Created on 2022-05-12 by the reprex package (v2.0.1)

Upvotes: 1

maarvd
maarvd

Reputation: 1284

#load data.frame
dt <- data.frame(Techniques_1 = c("1,3", "2,3", "2", "1", "2"), 
             Techniques_2 = c("2", "3", "2", "3", "2,3"), 
             Techniques_3 = c("4", "4", "3", "3", "2,3"), 
             Techniques_4 = c("2,3", "2,3", "3", "1", "1,2,3"), 
             Techniques_5 = c("1", "1", "1,2", "2", "1"), 
             Techniques_6 = c("1", "1", "1,2,3", "2", "1,2"), 
             stringsAsFactors = FALSE)

#create list to bind results to
result <- list()

#create a vector of categories
categories <- 1:4

#loop through categories
for(i in categories){
  #find occurayce of techniques in a category with the use of grepl
  occurance_count <- lapply(dt, FUN = function(x) length(grepl(i, x)[grepl(i, x) == TRUE]))
  
  #convert from list to data.frame
  occurance_count <- data.frame(occurance_count)
  
  #create data.frame to bind results to
  temporary <- data.frame(Category_Count = paste0("Category_", i))
  
  #add occurance counts to data.frame
  temporary <- cbind(temporary, occurance_count)
  
  #bind data.frame to list
  result[[i]] <- temporary
}

#from list to data.frame
result <- do.call(rbind, result)
result
  Category_Count Techniques_1 Techniques_2 Techniques_3 Techniques_4 Techniques_5 Techniques_6
1     Category_1            2            0            0            2            4            4
2     Category_2            3            3            1            3            2            3
3     Category_3            2            3            3            4            0            1
4     Category_4            0            0            2            0            0            0

Upvotes: 1

Benjamin
Benjamin

Reputation: 17279

One option for a solution.

CurrentFrame <- 
  structure(list(Techniques_1 = c("1,3", "2,3", "2", "1", "2"), 
                 Techniques_2 = c("2", "3", "2", "3", "2,3"), 
                 Techniques_3 = c("4", "4", "3", "3", "2,3"), 
                 Techniques_4 = c("2,3", "2,3", "3", "1", "1,2,3"), 
                 Techniques_5 = c("1", "1", "1,2", "2", "1"), 
                 Techniques_6 = c("1", "1", "1,2,3", "2", "1,2")), 
                 class = "data.frame", 
                 row.names = c(NA, -5L))

# or use the base R pipe...I'm not on that version yet though.  
library(magrittr)

count_category <- function(x){
  # separate each string into distinct values
  category <- unlist(strsplit(x, ","))
  # converting to a factor allows us to tabulate zero counts
  category <- factor(category, 
                     c("1", "2", "3", "4"))
  # make a single column data frame
  data.frame(table(category))$Freq
}

lapply(CurrentFrame, 
       count_category) %>% 
  do.call("cbind", .) %>% 
  as.data.frame()
#>   Techniques_1 Techniques_2 Techniques_3 Techniques_4 Techniques_5 Techniques_6
#> 1            2            0            0            2            4            4
#> 2            3            3            1            3            2            3
#> 3            2            3            3            4            0            1
#> 4            0            0            2            0            0            0

Upvotes: 1

Related Questions