Reputation: 361
I have a data frame with 4 columns that contain job titles. For each column, I want to create a new column (category1, category2, category3, category4) which assign each job a category 1-10 depending on which words the job titles contain (e.g. if the job title contains the words "frontend", "ui", "ux" then the column category1 should say 1). I managed to do the categorizing manually for each column using the following code but want to do it at the same time for all 4 columns. Any help is appreciated!
data_rel$category1 <-
ifelse(grepl("frontend|ui|ux", data$job4_clean),1, ifelse(grepl("backend", data$job4_clean),2, ifelse(grepl("fullstack", data$job4_clean),3, ifelse(grepl("entwickler|development|application|developer|software",data$job4_clean),4, ifelse(grepl("data|analytics|machine|programmer|ml|engineer|engineering|programmer|learning",data$job4_clean),5, ifelse(grepl("research|teaching|akademischer|researcher",data$job4_clean),6, ifelse(grepl("project|manager|product|consultant|consulting",data$job4_clean),7, ifelse(grepl("it|security|technical|tech", data$job4_clean),8, ifelse(grepl("margketing|sales|media|saas|business|commerce|support|development|digital|markeing|graphic|designer|graphics|design",data$job4_clean),9, ifelse(grepl("founder|ceo|partner|chief|executive|cto",data$job4_clean),10,NA))))))))))
data_rel <- structure(list(job1 = c("phd fellow", "java developer intern",
"optical engineer", " dwh bi engineer", " software engineer",
"software developer", "data engineer", "application software engineer",
"software developer", " web developer", "web developer", "web developer",
"software engineer", "software engineer", " es computer", "associate software engineer",
"fullstack ios developer", "technical delivery manager project manager",
"software architect", "software developer"), job2 = c("research scientist",
"analytics analyst", " developer", " data ml engineer", "graduate teaching assistant",
"software developer", "machine learning engineer", "akademischer mitarbeiter machine learning and analytics",
"backend develope", "lead php developer", "php system analytic software specialist",
"webcreater", "data engineer", "software engineer", "assistant network administrator",
"frontend engineer", "application infrastructor lead", "software engineer",
"application developer", "software developer"), job3 = c("data scientist",
"machine learning engineer", "application developer associate manager",
NA, "co founder cto", NA, NA, NA, NA, NA, "lead php sugarcrm developer",
" php developer", "data analysing researcher ", NA, "application developer consultance",
"manager l1 ui frontend ", " software architect", "software engineering manager solution architect",
"software developer consultance", "ai developer"), job4 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, "software architect development lead",
"team leader", NA, NA, " application development specialist",
" associate experience technology", NA, " software developer",
"fullstack developer productowner", NA)), row.names = c(NA, -20L
), class = c("tbl_df", "tbl", "data.frame"))
Upvotes: 1
Views: 74
Reputation: 383
Expanding on @zx8754's comment, you can put your code in a function, use lapply
to apply to each column, and finish up with do.call
to combine the columns as a data.frame
again.
get_level <- function(col) {
ifelse(grepl("frontend|ui|ux", col), 1,
ifelse(grepl("backend", col), 2,
ifelse(grepl("fullstack", col), 3,
ifelse(grepl("entwickler|development|application|developer|software",col), 4,
ifelse(grepl("data|analytics|machine|programmer|ml|engineer|engineering|programmer|learning",col), 5,
ifelse(grepl("research|teaching|akademischer|researcher",col), 6,
ifelse(grepl("project|manager|product|consultant|consulting",col), 7,
ifelse(grepl("it|security|technical|tech", col), 8,
ifelse(grepl("margketing|sales|media|saas|business|commerce|support|development|digital|markeing|graphic|designer|graphics|design", col), 9,
ifelse(grepl("founder|ceo|partner|chief|executive|cto", col), 10,
NA))))))))))
}
cats <- lapply(data_rel, get_level)
cats <- do.call(cbind.data.frame, cats)
names(cats) <- paste0("category", seq_along(data_rel))
Upvotes: 1
Reputation: 39727
You can store the jobs in a vector an then iterate over them with sapply
using grepl
and match
to get the job numbers.
jobs <- c("frontend|ui|ux"
, "backend"
, "fullstack"
, "entwickler|development|application|developer|software"
, "data|analytics|machine|programmer|ml|engineer|engineering|programmer|learning"
, "research|teaching|akademischer|researcher"
, "project|manager|product|consultant|consulting"
, "it|security|technical|tech"
, "margketing|sales|media|saas|business|commerce|support|development|digital|markeing|graphic|designer|graphics|design"
, "founder|ceo|partner|chief|executive|cto")
sapply(data_rel, function(x) apply(sapply(jobs, grepl, x), 1, match, x=TRUE))
# job1 job2 job3 job4
# [1,] NA 6 5 NA
# [2,] 4 5 5 NA
# [3,] 5 4 4 NA
# [4,] 5 5 NA NA
# [5,] 4 6 10 NA
# [6,] 4 4 NA NA
# [7,] 5 5 NA NA
# [8,] 4 5 NA NA
# [9,] 4 2 NA NA
#[10,] 4 4 NA NA
#[11,] 4 4 4 4
#[12,] 4 NA 4 NA
#[13,] 4 5 5 NA
#[14,] 4 4 NA NA
#[15,] NA NA 4 4
#[16,] 4 1 1 8
#[17,] 3 4 4 NA
#[18,] 7 4 4 4
#[19,] 4 4 4 3
#[20,] 4 4 4 NA
Upvotes: 0
Reputation: 389315
You can write a function with case_when
-
library(dplyr)
change_category <- function(x) {
case_when(grepl("frontend|ui|ux", x) ~ 1L,
grepl("backend", x) ~ 2L,
grepl("fullstack", x) ~ 3L,
grepl("entwickler|development|application|developer|software",x) ~ 4L,
grepl("data|analytics|machine|programmer|ml|engineer|engineering|programmer|learning",x) ~5L,
grepl("research|teaching|akademischer|researcher",x) ~ 6L,
grepl("project|manager|product|consultant|consulting", x) ~ 7L,
grepl("it|security|technical|tech", x) ~ 8L,
grepl("margketing|sales|media|saas|business|commerce|support|development|digital|markeing|graphic|designer|graphics|design",x) ~ 9L,
grepl("founder|ceo|partner|chief|executive|cto",x) ~10L)
}
and apply it with across
-
data_rel %>% mutate(across(.fns = change_category, .names = '{col}_category'))
Upvotes: 0
Reputation: 21938
We can use map_dfc
from purrr
package to iterate over each column of our data set detecting the category and bind the resulting column column-wise. I used case_when
instead of ifelse
for a slightly nicer-looked output:
library(dplyr)
library(purrr)
data_rel %>%
map_dfc(~ case_when(
grepl("frontend|ui|ux", .x) ~ 1,
grepl("backend", .x) ~ 2,
grepl("fullstack", .x) ~ 3,
grepl("entwickler|development|application|developer|software", .x) ~ 4,
grepl("data|analytics|machine|programmer|ml|engineer|engineering|programmer|learning", .x) ~ 5,
grepl("research|teaching|akademischer|researcher", .x) ~ 6,
grepl("project|manager|product|consultant|consulting", .x) ~ 7,
grepl("it|security|technical|tech", .x) ~ 8,
grepl("margketing|sales|media|saas|business|commerce|support|development|digital|markeing|graphic|designer|graphics|design", .x) ~ 9,
grepl("founder|ceo|partner|chief|executive|cto", .x) ~ 10,
TRUE ~ as.numeric(NA)
))
# A tibble: 20 x 4
job1 job2 job3 job4
<dbl> <dbl> <dbl> <dbl>
1 NA 6 5 NA
2 4 5 5 NA
3 5 4 4 NA
4 5 5 NA NA
5 4 6 10 NA
6 4 4 NA NA
7 5 5 NA NA
8 4 5 NA NA
9 4 2 NA NA
10 4 4 NA NA
11 4 4 4 4
12 4 NA 4 NA
13 4 5 5 NA
14 4 4 NA NA
15 NA NA 4 4
16 4 1 1 8
17 3 4 4 NA
18 7 4 4 4
19 4 4 4 3
20 4 4 4 NA
or in base R:
cbind(sapply(data_rel, function(x) {
ifelse(grepl("frontend|ui|ux", x),1,
ifelse(grepl("backend", x),2,
ifelse(grepl("fullstack", x),3,
ifelse(grepl("entwickler|development|application|developer|software", x),4,
ifelse(grepl("data|analytics|machine|programmer|ml|engineer|engineering|programmer|learning", x),5,
ifelse(grepl("research|teaching|akademischer|researcher", x),6,
ifelse(grepl("project|manager|product|consultant|consulting", x),7,
ifelse(grepl("it|security|technical|tech", x),8,
ifelse(grepl("margketing|sales|media|saas|business|commerce|support|development|digital|markeing|graphic|designer|graphics|design", x),9,
ifelse(grepl("founder|ceo|partner|chief|executive|cto", x),10,NA))))))))))
}))
Upvotes: 0