bulbanos
bulbanos

Reputation: 41

Expanding a row into multiple rows on splitting string

I have this data.table and I would like to generate all possible 'prefixes' in the chain.

user_id         v_chain                  nr_v       root_v_chain           last_v
616905      3 -> 8 -> 16 -> 17 -> 25        5     3 -> 8 -> 16 -> 17        25

This should become

user_id         v_chain                  nr_v       root_v_chain           last_v
616905      3 -> 8                          5     3 -> 8 -> 16 -> 17        25
616905      3 -> 8 -> 16                    5     3 -> 8 -> 16 -> 17        25
616905      3 -> 8 -> 16 -> 17              5     3 -> 8 -> 16 -> 17        25
616905      3 -> 8 -> 16 -> 17 -> 25        5     3 -> 8 -> 16 -> 17        25

How do I iterate over the whole chain and generate new rows?

Data

dd <- read.table(stringsAsFactors = FALSE, header = TRUE,
text = "user_id         v_chain                  nr_v       root_v_chain           last_v
616905      '3 -> 8 -> 16 -> 17 -> 25'        5     '3 -> 8 -> 16 -> 17'        25")

Upvotes: 3

Views: 127

Answers (6)

talat
talat

Reputation: 70336

Another data.table approach, using a custom function:

library(data.table)
f <- function(x, sep = " -> ") {
 Reduce(function(...) paste(..., sep=sep), tstrsplit(x, sep, fixed=TRUE), accumulate=TRUE)[-1L]
}

setDT(dt)[, list(v_chain = f(v_chain)), by = setdiff(names(dt), "v_chain")]
#   user_id nr_v       root_v_chain last_v                  v_chain
#1:  616905    5 3 -> 8 -> 16 -> 17     25                   3 -> 8
#2:  616905    5 3 -> 8 -> 16 -> 17     25             3 -> 8 -> 16
#3:  616905    5 3 -> 8 -> 16 -> 17     25       3 -> 8 -> 16 -> 17
#4:  616905    5 3 -> 8 -> 16 -> 17     25 3 -> 8 -> 16 -> 17 -> 25

Or you could use a join:

tmp <- dt[, list(v_chain = f(v_chain)), by=user_id]
dt <- tmp[dt[,-"v_chain",with=FALSE], , on = "user_id"]

Just a note that this works without modification for different users etc, for example, if your data was

> dt
#   user_id                  v_chain nr_v             root_v_chain last_v
#1:       1 3 -> 8 -> 16 -> 17 -> 25    5 3 -> 8 -> 16 -> 17 -> 25     25
#2:       2    1 -> 5 -> 3 -> 4 -> 2    5    1 -> 5 -> 3 -> 4 -> 2     25

Then it would produce:

dt[, list(v_chain = f(v_chain)), by=setdiff(names(dt), "v_chain")]
#   user_id nr_v             root_v_chain last_v                  v_chain
#1:       1    5 3 -> 8 -> 16 -> 17 -> 25     25                   3 -> 8
#2:       1    5 3 -> 8 -> 16 -> 17 -> 25     25             3 -> 8 -> 16
#3:       1    5 3 -> 8 -> 16 -> 17 -> 25     25       3 -> 8 -> 16 -> 17
#4:       1    5 3 -> 8 -> 16 -> 17 -> 25     25 3 -> 8 -> 16 -> 17 -> 25
#5:       2    5    1 -> 5 -> 3 -> 4 -> 2     25                   1 -> 5
#6:       2    5    1 -> 5 -> 3 -> 4 -> 2     25              1 -> 5 -> 3
#7:       2    5    1 -> 5 -> 3 -> 4 -> 2     25         1 -> 5 -> 3 -> 4
#8:       2    5    1 -> 5 -> 3 -> 4 -> 2     25    1 -> 5 -> 3 -> 4 -> 2

Upvotes: 3

Tony DiFranco
Tony DiFranco

Reputation: 843

using regex and data.table:

dt <- data.table(
    user_id = 616905,
    v_chain = '3 -> 8 -> 16 -> 17 -> 25',
    nr_v = 5,
    root_v_chain = '3 -> 8 -> 16 -> 17',
    last_v = 25
)
dt <- merge(
    dt,
    # use regex to find ending position of each sub-chain and merge this back with the original data.table assuming that "user_id" is your unique key
    dt[, lapply(gregexpr('(?<=->\\s)\\d+', v_chain, perl = TRUE), function(x) Reduce("+", list(x, attr(x, "match.length"), - 1))), by = .(user_id)], 
    by = "user_id"
)
# perform the substring operation and remove the temp column using data.table syntax
dt[, `:=` (v_chain = substr(v_chain, 1, V1), V1 = NULL)]
dt
   user_id                  v_chain nr_v       root_v_chain last_v
1:  616905                   3 -> 8    5 3 -> 8 -> 16 -> 17     25
2:  616905             3 -> 8 -> 16    5 3 -> 8 -> 16 -> 17     25
3:  616905       3 -> 8 -> 16 -> 17    5 3 -> 8 -> 16 -> 17     25
4:  616905 3 -> 8 -> 16 -> 17 -> 25    5 3 -> 8 -> 16 -> 17     25

Upvotes: 0

akuiper
akuiper

Reputation: 215127

A three stages data.table solution:

library(data.table)
setDT(dd)[, v_chain := list(strsplit(v_chain, "->"))]
         [, v_chain := list(lapply(v_chain, function(ele) lapply(2:length(ele), function(i) paste0(ele[1:i], collapse = "->"))))]
         [, .(v_chain = unlist(v_chain)), .(user_id, nr_v, root_v_chain, last_v)]
   user_id nr_v       root_v_chain last_v                  v_chain
1:  616905    5 3 -> 8 -> 16 -> 17     25                  3 -> 8 
2:  616905    5 3 -> 8 -> 16 -> 17     25            3 -> 8 -> 16 
3:  616905    5 3 -> 8 -> 16 -> 17     25      3 -> 8 -> 16 -> 17 
4:  616905    5 3 -> 8 -> 16 -> 17     25 3 -> 8 -> 16 -> 17 -> 25

You can also use dplyr and tidyr package:

library(dplyr); library(tidyr);
dd %>% mutate(v_chain = strsplit(v_chain, "->")) %>% 
       mutate(v_chain = lapply(v_chain, function(ele) 
              sapply(2:length(ele), function(i) paste0(ele[1:i], collapse = "->")))) %>% 
       unnest(v_chain)
Source: local data frame [4 x 5]

  user_id  nr_v       root_v_chain last_v                  v_chain
    (int) (int)              (chr)  (int)                    (chr)
1  616905     5 3 -> 8 -> 16 -> 17     25                  3 -> 8 
2  616905     5 3 -> 8 -> 16 -> 17     25            3 -> 8 -> 16 
3  616905     5 3 -> 8 -> 16 -> 17     25      3 -> 8 -> 16 -> 17 
4  616905     5 3 -> 8 -> 16 -> 17     25 3 -> 8 -> 16 -> 17 -> 25

Upvotes: 0

lmo
lmo

Reputation: 38520

I think the following should work, although you will have to adjust your user IDs:

# create a vector of all of the V-chains, you would use olddf$v_chain
 temp <- c("3 -> 8 -> 16 -> 17 -> 25", "2 -> 6 -> 10 -> 12 -> 20")
# name it with user IDs
names(temp) <- c("user1", "user2") # you would use olddf$user_id

# get the chains
tempList <- lapply(strsplit(temp, split=" -> "), 
                   function(i) sapply(2:length(i), 
                                      function(j) paste(i[1:j], collapse=" -> ")))
# make it a data.frame
tempdf <- do.call(data.frame, tempList)
# melt the data.frame to long format
library(data.table)
tempdf <- melt(tempdf, measure.vars=list("user1", "user2"), 
               value.name="v_chain", variable.name="user_id")

Now, just merge this data.frame onto your original:

finaldf <- merge(olddf, tempdf, by=user_id)

Upvotes: 1

Jason Morgan
Jason Morgan

Reputation: 2330

Perhaps this will help:

s <- "3 -> 8 -> 16 -> 17 -> 25"
x <- strsplit(s, " -> ")[[1]]
n <- 2:length(x)
sapply(n, function(i) paste(x[1:i], collapse=" -> "))

Which produces what you have in the desired column:

[1] "3 -> 8"                   "3 -> 8 -> 16"             "3 -> 8 -> 16 -> 17"      
[4] "3 -> 8 -> 16 -> 17 -> 25"

Upvotes: 0

elevendollar
elevendollar

Reputation: 1204

You could use following code to get all prefixes

c_chain_split <- unlist(strsplit(dd$v_chain, split = ' '))

for(i in seq(3, length(c_chain_split),2)){
  paste(c_chain_split[1:i], collapse = ' ')
}

Then build the rows with c() and rbind() them together.

Upvotes: 0

Related Questions