Becky
Becky

Reputation: 39

R: Extract alphabetical string from inside of quotation mark in dataframe

I have a dataframe

data.frame(string = c('["jewelry","tailor","Jewelry"]', '["apple","banana","orange"]')) 

I want to create three columns and two rows... I want to have the dataframe to look like

data.frame(string1 = c('jewelry','apple'), string2=c('tailor','banana'), string3=c('jewelry','orange'))

Upvotes: 1

Views: 75

Answers (4)

Onyambu
Onyambu

Reputation: 79318

This looks like a valid python/json list.

Using reticulate:

library(tidyverse)

df1%>%
  rowwise() %>%
  transmute(string=list(reticulate::py_eval(string)))%>%
  unnest_wider(string, names_sep = '')

#> # A tibble: 2 × 3
#>   string1 string2 string3
#>   <chr>   <chr>   <chr>  
#> 1 jewelry tailor  Jewelry
#> 2 apple   banana  orange

using jsonlite:

a <- jsonlite::fromJSON(paste('[', paste(df1$string, collapse = ','), ']'))
setNames(data.frame(a), paste0('string', seq(ncol(a))))

#>   string1 string2 string3
#> 1 jewelry  tailor Jewelry
#> 2   apple  banana  orange

or even:

d <- do.call(rbind, lapply(df1$string, jsonlite::fromJSON))
setNames(data.frame(d), paste0('string', seq(ncol(d))))

  string1 string2 string3
1 jewelry  tailor Jewelry
2   apple  banana  orange

Upvotes: 0

margusl
margusl

Reputation: 17524

jsonlite through a list column and unnest_wider :

library(dplyr)
library(tidyr)
library(purrr)
library(jsonlite)

df <- data.frame(string = c('["jewelry","tailor","Jewelry"]', 
                            '["apple","banana","orange"]')) 
                            
df %>%  
  mutate(string = map(string, ~ parse_json(.x))) %>% 
  unnest_wider(string, names_sep = "")
#> # A tibble: 2 × 3
#>   string1 string2 string3
#>   <chr>   <chr>   <chr>  
#> 1 jewelry tailor  Jewelry
#> 2 apple   banana  orange

Created on 2023-02-13 with reprex v2.0.2

Upvotes: 0

DPH
DPH

Reputation: 4344

a optional tidyverse approach if you know the max number of columns before hand and have no gaps that matter in terms of ordering or column assignment (rows with less then 3 columns concerning this example):

# dummy data
myDf <- data.frame(string = c('["jewelry","tailor","Jewelry"]', '["apple","banana","orange"]')) 


library(dplyr)
library(tidyr)

myDf %>% 
    # select column to split, new column names and regex with capture groups (parts between brakets
    tidyr::extract(string, into = c("a", "b", "c"), regex = '"(\\w*)","(\\w*)","(\\w*)"')

        a      b       c
1 jewelry tailor Jewelry
2   apple banana  orange

Upvotes: 0

akrun
akrun

Reputation: 887651

In base R, we could remove the brackets, and use read.csv to read the column into a data.frame

read.csv(text = gsub('\\[|\\]|"', "", df1$string),
    header = FALSE, col.names =  paste0("string", 1:3))

-output

  string1 string2 string3
1 jewelry  tailor Jewelry
2   apple  banana  orange

Or using tidyverse

library(dplyr)
library(stringr)
library(tidyr)
df1 %>%
   mutate(string = str_remove_all(string, '\\[|\\]|"')) %>% 
   separate_wider_delim(string, delim = ',', 
     names = c("string1", "string2", "string3"))

-output

# A tibble: 2 × 3
  string1 string2 string3
  <chr>   <chr>   <chr>  
1 jewelry tailor  Jewelry
2 apple   banana  orange 

data

df1 <- data.frame(string = c('["jewelry","tailor","Jewelry"]', 
     '["apple","banana","orange"]')) 

Upvotes: 1

Related Questions