user1471980
user1471980

Reputation: 10626

create data frame from nested entries

I have a data frame test like this:

dput(test)
structure(list(X = 1L, entityId = structure(1L, .Label = "HOST-123", class = "factor"), 
    displayName = structure(1L, .Label = "server1", class = "factor"), 
    discoveredName = structure(1L, .Label = "server1", class = "factor"), 
    firstSeenTimestamp = 1593860000000, lastSeenTimestamp = 1603210000000, 
    tags = structure(1L, .Label = "c(\"CONTEXTLESS\", \"CONTEXTLESS\", \"CONTEXTLESS\", \"CONTEXTLESS\", \"CONTEXTLESS\", \"CONTEXTLESS\", \"CONTEXTLESS\", \"CONTEXTLESS\"), c(\"app1\", \"client\", \"org\", \"app1\", \"DATA_CENTER\", \"PURPOSE\", \"REGION\", \"Test\"), c(NA, \"NONE\", \"Host:Environment:test123\", \"111\", \"222\", \"GENERAL\", \"444\", \"555\")", class = "factor")), .Names = c("X", 
"entityId", "displayName", "discoveredName", "firstSeenTimestamp", 
"lastSeenTimestamp", "tags"), class = "data.frame", row.names = c(NA, 
-1L))

There is a column called tags which should become a dataframe. I need to get rid of the first row in tags (which keep saying CONTEXTLESS, expand the second column in tags(make them columns. Lastly I need to insert the 3rd column values in tags under each expanded columns.

For example in needs to look like this:

structure(list(entityId = structure(1L, .Label = "HOST-123", class = "factor"), 
    displayName = structure(1L, .Label = "server1", class = "factor"), 
    discoveredName = structure(1L, .Label = "server1", class = "factor"), 
    firstSeenTimestamp = 1593860000000, lastSeenTimestamp = 1603210000000, 
    app1 = NA, client = structure(1L, .Label = "None", class = "factor"), 
    org = structure(1L, .Label = "Host:Environment:test123", class = "factor"), 
    app1.1 = 111L, data_center = 222L, purppose = structure(1L, .Label = "general", class = "factor"), 
    region = 444L, test = 555L), .Names = c("entityId", "displayName", 
"discoveredName", "firstSeenTimestamp", "lastSeenTimestamp", 
"app1", "client", "org", "app1.1", "data_center", "purppose", 
"region", "test"), class = "data.frame", row.names = c(NA, -1L
))

I need to remove the 1st vector that keeps saying "contextless", add the second vector the columns. Each 2nd vector value should be a column name. Last vector should be values of the newly added columns.

Upvotes: 2

Views: 163

Answers (2)

ekoam
ekoam

Reputation: 8844

Here is a tidyverse approach

library(dplyr)
library(tidyr)

str2dataframe <- function(txt, keep = "all") {
  # If you can confirm that all vectors are of the same length, then we can make them into columns of a data.frame
  out <- eval(parse(text = paste0("data.frame(", as.character(txt),")")))
  # rename columns as X1, X2, ...
  nms <- make.names(seq_along(out), unique = TRUE)
  if (keep == "all")
    keep <- nms
  `names<-`(out, nms)[, keep]
}

df %>% 
  mutate(
    tags = lapply(tags, str2dataframe, -1L), 
    tags = lapply(tags, function(d) within(d, X2 <- make.unique(X2)))
  ) %>% 
  unnest(tags) %>% 
  pivot_wider(names_from = "X2", values_from = "X3")

df looks like this

> df
  X entityId displayName discoveredName firstSeenTimestamp lastSeenTimestamp
1 1 HOST-123     server1        server1        1.59386e+12       1.60321e+12
                                                                                                                                                                                                                                                                                         tags
1 c("CONTEXTLESS", "CONTEXTLESS", "CONTEXTLESS", "CONTEXTLESS", "CONTEXTLESS", "CONTEXTLESS", "CONTEXTLESS", "CONTEXTLESS"), c("app1", "client", "org", "app1", "DATA_CENTER", "PURPOSE", "REGION", "Test"), c(NA, "NONE", "Host:Environment:test123", "111", "222", "GENERAL", "444", "555")

Output looks like this

# A tibble: 1 x 14
      X entityId displayName discoveredName firstSeenTimestamp lastSeenTimestamp app1  client org                      app1.1 DATA_CENTER PURPOSE REGION Test 
  <int> <fct>    <fct>       <fct>                       <dbl>             <dbl> <chr> <chr>  <chr>                    <chr>  <chr>       <chr>   <chr>  <chr>
1     1 HOST-123 server1     server1             1593860000000     1603210000000 NA    NONE   Host:Environment:test123 111    222         GENERAL 444    555  

Upvotes: 1

IRTFM
IRTFM

Reputation: 263342

If you are willing to drop the first "row" of garbage and then do a ittle cleanup of the parse-side-effects, then this might be a good place to start:

read.table(text=gsub("\\),", ")\n", test$tags[1]), sep=",", skip=1, #drops line
                      header=TRUE)

  c.app1 client                       org app1 DATA_CENTER  PURPOSE REGION Test.
1   c(NA   NONE  Host:Environment:test123  111         222  GENERAL    444  555)

The read.table function uses the scan function which doesn't know that "c(" and ")" are meaningful. The other alternative might be to try eval(parse(text= .)) (which would know that they are enclosing vectors) on the the second and third lines, but I couldn't see a clean way to do that. I initially tried to separate the lines using strsplit, but that caused me to loose the parens.

Here's a stab at some cleanup via that addition of some more gsub operations:

read.table(text=gsub("c\\(|\\)","", # gets rid of enclosing "c(" and ")"
                     gsub("\\),", "\n", # inserts line breaks
                                 test$tags[1])), 
                                 sep=",",     #lets commas be parsed
                                 skip=1,      #drops line
                                 header=TRUE) # converts to colnames

  app1 client                       org app1.1 DATA_CENTER  PURPOSE REGION Test
1   NA   NONE  Host:Environment:test123    111         222  GENERAL    444  555

The reason for the added ".1" in the second instance of app1 is that R colnames in dataframes need to be unique unless you override that with check.names=FALSE

Upvotes: 1

Related Questions