Reputation: 44351
I have data in a table in which one cell in every row is a multiline string, formatted a bit like a document with references at the end. For example, one of those strings looks like:
item A...1
item B...2
item C...3
item D...2
1=foo
2=bar
3=baz
My eventual goal is to extract foo/bar/baz into columns and count the matching items. So for the above, I'd end up with a row including:
foo | bar | baz
----+-----+----
  1 |   2 |   1
I tried to start by extracting the "reference" mappings, as a nested data.table looking like this:
code | reason
-----+-------
   1 | foo
   2 | bar
   3 | baz
Here's how I tried to do it, using data.table and stringr.
encounter_alerts[, whys := lapply(
  str_extract_all(text, regex('^[0-9].*$', multiline = TRUE)),
  FUN = function(s) {
    fread(text = s, sep = '=', header = FALSE, col.names = c('code', 'reason'))
  }
)]
I am very confused by the error message I get when I try to do this:
Error in fread(text = s, sep = "=", header = FALSE, col.names = c("code", :
file not found: 1=foo
I am explicitly using text rather than file, so I'm not sure how it's trying to interpret the line of text as a filename!
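In case it helps narrow things down, one thing I can check (just a guess on my part that a length-1 vector might be handled differently) is whether any rows yield only a single matched reference line:
# Guess: flag rows whose extracted reference lines collapse to a single element
encounter_alerts[, n_refs := lengths(
  str_extract_all(text, regex('^[0-9].*$', multiline = TRUE))
)]
encounter_alerts[n_refs <= 1, text]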
When I test this with a single row, it seems to work fine:
> fread(text = str_extract_all(encounter_alerts[989]$text, regex('^[0-9].*$', multiline = TRUE))[[1]], sep = '=', header = FALSE, col.names = c('code', 'reason'))
   code reason
1:    1    foo
2:    2    bar
What am I doing wrong? Is there a better way to do this?
Thanks!
Upvotes: 3
Views: 323
Reputation: 44351
Thanks so much to @prosoitos for helping with this. Here's the final code I ended up using, based heavily on the accepted answer. It's a mix of different packages and so forth, which I hope to clean up eventually, but deadlines happen...
get_code_reason_mapping <- function(alert_text) {
  # Pull out the "code=reason" reference lines and split them into two columns
  alert_text %>%
    str_extract_all(regex('^[0-9]=(.*)$', multiline = TRUE)) %>%
    unlist() %>%
    str_split_fixed("=", 2) %>%
    as.data.table() %>%
    setnames(c('code', 'reason'))
}
encounter_alerts$code_reason_mapping <- map(encounter_alerts$alert_text, get_code_reason_mapping)
get_why_codes <- function(alert_text) {
  # Take the last character (the code digit) of each matching item line
  alert_text %>%
    str_extract_all(regex('[/n][0-9e][0-9>][0-9]$', multiline = TRUE)) %>%
    unlist() %>%
    str_sub(-1) %>%
    as.data.table() %>%
    setnames(c('code'))
}
encounter_alerts$why_codes <- map(encounter_alerts$alert_text, get_why_codes)
get_code_counts <- function(df1, df2) {
  # Join codes to their reasons, count each reason, and reshape to wide
  left_join(df1, df2) %>%
    count(reason) %>%
    spread(reason, n)
}
code_counts <- map2_df(encounter_alerts$code_reason_mapping, encounter_alerts$why_codes, get_code_counts)
code_counts[is.na(code_counts)] <- 0
code_counts
Upvotes: 0
Reputation: 7447
Note: Edited after reading comments
From your comment, I tried to reproduce what I understand your data might look like.
library(tidyverse)
df <- tibble(
strings = c("item A...1
item B...2
item C...3
item D...2
1=foo
2=bar
3=baz",
"item A...2
item B...2
item C...3
item D...1
1=toto
2=foo
3=lala",
"item A...3
item B...3
item C...3
item D...1
1=tutu
3=ttt")
)
get_ref <- function(string) {
  # Keep the "code=reason" lines and split them into a two-column tibble
  string %>%
    str_split("\n") %>%
    unlist() %>%
    str_subset("=") %>%
    str_split_fixed("=", 2) %>%
    as_tibble() %>%
    rename(code = V1, reason = V2)
}
list1 <- map(df$strings, get_ref)
get_value <- function(string) {
  # Keep the "item ..." lines and strip everything up to the trailing code
  string %>%
    str_split("\n") %>%
    unlist() %>%
    str_subset("\\.\\.\\.") %>%
    str_replace_all(".*\\.\\.\\.", "") %>%
    as_tibble() %>%
    rename(code = value)
}
list2 <- map(df$strings, get_value)
get_result <- function(df1, df2) {
  # Match item codes to their reasons, count each reason, and spread to wide
  left_join(df1, df2) %>%
    count(reason) %>%
    spread(reason, n)
}
result <- map2_df(list1, list2, get_result)
result[is.na(result)] <- 0
result
# A tibble: 3 x 7
    bar   baz   foo  lala  toto   ttt  tutu
  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1     2     1     1     0     0     0     0
2     0     0     2     1     1     0     0
3     0     0     0     0     0     3     1
Upvotes: 2
Reputation: 76
There's probably a nicer way to do this, but here's a solution that doesn't require any additional libraries (beyond stringr, which you're already using).
sample_str <- 'item A...1
item B...2
item C...3
item D...2
1=foo
2=bar
3=baz'
lines <- stringr::str_split(sample_str, '\n', simplify = T)
extracted_strs <- lines[stringr::str_detect(lines, '^\\d=\\w+$')]
dfs_list <- lapply(extracted_strs, function(x) {
  # Split "n=word" into its parts and build a one-column data frame
  # named after the word, holding the code as an integer
  str_parts <- stringr::str_split(x, '=', simplify = TRUE)
  df_args <- list()
  df_args[[str_parts[2]]] <- as.integer(str_parts[1])
  df_args[['stringsAsFactors']] <- FALSE
  do.call(data.frame, df_args)
})
df <- do.call(cbind, dfs_list)
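For reference, my reading of the code above (not an actual run) is that df ends up as a single wide row, with each reason name as a column holding its code:
df
#   foo bar baz
# 1   1   2   3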
Upvotes: 0
Reputation: 1443
Using stringr and dplyr, you can do this easily:
library(stringr)
library(dplyr)
v <- as.data.frame(c("item A...1",
                     "item B...2",
                     "item C...3",
                     "item D...2"))
colnames(v) <- "items"

matching <- c("1",
              "2",
              "3")
Mapping <- read.table(text="code reason
1 foo
2 bar
3 baz
", header = T)
## Answer
df1 <- v %>%
  mutate(code = str_extract(v$items, str_c(matching, collapse = "|")))

str(df1)
str(Mapping)

df1$code <- as.numeric(df1$code)
df1 <- left_join(df1, Mapping)
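If you then want the per-reason counts from the desired output in the question, a possible final step (borrowing the count()/spread() approach from the other answers; spread() needs tidyr, which I'm assuming here) would be:
library(tidyr)

df1 %>%
  count(reason) %>%
  spread(reason, n)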
Please have a look.
Upvotes: 0