Reputation: 41
I have this data.table and I would like to generate all possible 'prefixes' in the chain.
user_id v_chain nr_v root_v_chain last_v
616905 3 -> 8 -> 16 -> 17 -> 25 5 3 -> 8 -> 16 -> 17 25
This should become
user_id v_chain nr_v root_v_chain last_v
616905 3 -> 8 5 3 -> 8 -> 16 -> 17 25
616905 3 -> 8 -> 16 5 3 -> 8 -> 16 -> 17 25
616905 3 -> 8 -> 16 -> 17 5 3 -> 8 -> 16 -> 17 25
616905 3 -> 8 -> 16 -> 17 -> 25 5 3 -> 8 -> 16 -> 17 25
How do I iterate over the whole chain and generate new rows?
Data
dd <- read.table(stringsAsFactors = FALSE, header = TRUE,
text = "user_id v_chain nr_v root_v_chain last_v
616905 '3 -> 8 -> 16 -> 17 -> 25' 5 '3 -> 8 -> 16 -> 17' 25")
Upvotes: 3
Views: 127
Reputation: 70336
Another data.table
approach, using a custom function:
library(data.table)
f <- function(x, sep = " -> ") {
Reduce(function(...) paste(..., sep=sep), tstrsplit(x, sep, fixed=TRUE), accumulate=TRUE)[-1L]
}
setDT(dt)[, list(v_chain = f(v_chain)), by = setdiff(names(dt), "v_chain")]
# user_id nr_v root_v_chain last_v v_chain
#1: 616905 5 3 -> 8 -> 16 -> 17 25 3 -> 8
#2: 616905 5 3 -> 8 -> 16 -> 17 25 3 -> 8 -> 16
#3: 616905 5 3 -> 8 -> 16 -> 17 25 3 -> 8 -> 16 -> 17
#4: 616905 5 3 -> 8 -> 16 -> 17 25 3 -> 8 -> 16 -> 17 -> 25
Or you could use a join:
tmp <- dt[, list(v_chain = f(v_chain)), by=user_id]
dt <- tmp[dt[,-"v_chain",with=FALSE], , on = "user_id"]
Just a note that this works without modification for different users etc, for example, if your data was
> dt
# user_id v_chain nr_v root_v_chain last_v
#1: 1 3 -> 8 -> 16 -> 17 -> 25 5 3 -> 8 -> 16 -> 17 -> 25 25
#2: 2 1 -> 5 -> 3 -> 4 -> 2 5 1 -> 5 -> 3 -> 4 -> 2 25
Then it would produce:
dt[, list(v_chain = f(v_chain)), by=setdiff(names(dt), "v_chain")]
# user_id nr_v root_v_chain last_v v_chain
#1: 1 5 3 -> 8 -> 16 -> 17 -> 25 25 3 -> 8
#2: 1 5 3 -> 8 -> 16 -> 17 -> 25 25 3 -> 8 -> 16
#3: 1 5 3 -> 8 -> 16 -> 17 -> 25 25 3 -> 8 -> 16 -> 17
#4: 1 5 3 -> 8 -> 16 -> 17 -> 25 25 3 -> 8 -> 16 -> 17 -> 25
#5: 2 5 1 -> 5 -> 3 -> 4 -> 2 25 1 -> 5
#6: 2 5 1 -> 5 -> 3 -> 4 -> 2 25 1 -> 5 -> 3
#7: 2 5 1 -> 5 -> 3 -> 4 -> 2 25 1 -> 5 -> 3 -> 4
#8: 2 5 1 -> 5 -> 3 -> 4 -> 2 25 1 -> 5 -> 3 -> 4 -> 2
Upvotes: 3
Reputation: 843
using regex and data.table:
dt <- data.table(
user_id = 616905,
v_chain = '3 -> 8 -> 16 -> 17 -> 25',
nr_v = 5,
root_v_chain = '3 -> 8 -> 16 -> 17',
last_v = 25
)
dt <- merge(
dt,
# use regex to find ending position of each sub-chain and merge this back with the original data.table assuming that "user_id" is your unique key
dt[, lapply(gregexpr('(?<=->\\s)\\d+', v_chain, perl = TRUE), function(x) Reduce("+", list(x, attr(x, "match.length"), - 1))), by = .(user_id)],
by = "user_id"
)
# perform the substring operation and remove the temp column using data.table syntax
dt[, `:=` (v_chain = substr(v_chain, 1, V1), V1 = NULL)]
dt
user_id v_chain nr_v root_v_chain last_v
1: 616905 3 -> 8 5 3 -> 8 -> 16 -> 17 25
2: 616905 3 -> 8 -> 16 5 3 -> 8 -> 16 -> 17 25
3: 616905 3 -> 8 -> 16 -> 17 5 3 -> 8 -> 16 -> 17 25
4: 616905 3 -> 8 -> 16 -> 17 -> 25 5 3 -> 8 -> 16 -> 17 25
Upvotes: 0
Reputation: 215127
A three stages data.table
solution:
library(data.table)
setDT(dd)[, v_chain := list(strsplit(v_chain, "->"))]
[, v_chain := list(lapply(v_chain, function(ele) lapply(2:length(ele), function(i) paste0(ele[1:i], collapse = "->"))))]
[, .(v_chain = unlist(v_chain)), .(user_id, nr_v, root_v_chain, last_v)]
user_id nr_v root_v_chain last_v v_chain
1: 616905 5 3 -> 8 -> 16 -> 17 25 3 -> 8
2: 616905 5 3 -> 8 -> 16 -> 17 25 3 -> 8 -> 16
3: 616905 5 3 -> 8 -> 16 -> 17 25 3 -> 8 -> 16 -> 17
4: 616905 5 3 -> 8 -> 16 -> 17 25 3 -> 8 -> 16 -> 17 -> 25
You can also use dplyr
and tidyr
package:
library(dplyr); library(tidyr);
dd %>% mutate(v_chain = strsplit(v_chain, "->")) %>%
mutate(v_chain = lapply(v_chain, function(ele)
sapply(2:length(ele), function(i) paste0(ele[1:i], collapse = "->")))) %>%
unnest(v_chain)
Source: local data frame [4 x 5]
user_id nr_v root_v_chain last_v v_chain
(int) (int) (chr) (int) (chr)
1 616905 5 3 -> 8 -> 16 -> 17 25 3 -> 8
2 616905 5 3 -> 8 -> 16 -> 17 25 3 -> 8 -> 16
3 616905 5 3 -> 8 -> 16 -> 17 25 3 -> 8 -> 16 -> 17
4 616905 5 3 -> 8 -> 16 -> 17 25 3 -> 8 -> 16 -> 17 -> 25
Upvotes: 0
Reputation: 38520
I think the following should work, although you will have to adjust your user IDs:
# create a vector of all of the V-chains, you would use olddf$v_chain
temp <- c("3 -> 8 -> 16 -> 17 -> 25", "2 -> 6 -> 10 -> 12 -> 20")
# name it with user IDs
names(temp) <- c("user1", "user2") # you would use olddf$user_id
# get the chains
tempList <- lapply(strsplit(temp, split=" -> "),
function(i) sapply(2:length(i),
function(j) paste(i[1:j], collapse=" -> ")))
# make it a data.frame
tempdf <- do.call(data.frame, tempList)
# melt the data.frame to long format
library(data.table)
tempdf <- melt(tempdf, measure.vars=list("user1", "user2"),
value.name="v_chain", variable.name="user_id")
Now, just merge this data.frame onto your original:
finaldf <- merge(olddf, tempdf, by=user_id)
Upvotes: 1
Reputation: 2330
Perhaps this will help:
s <- "3 -> 8 -> 16 -> 17 -> 25"
x <- strsplit(s, " -> ")[[1]]
n <- 2:length(x)
sapply(n, function(i) paste(x[1:i], collapse=" -> "))
Which produces what you have in the desired column:
[1] "3 -> 8" "3 -> 8 -> 16" "3 -> 8 -> 16 -> 17"
[4] "3 -> 8 -> 16 -> 17 -> 25"
Upvotes: 0
Reputation: 1204
You could use following code to get all prefixes
c_chain_split <- unlist(strsplit(dd$v_chain, split = ' '))
for(i in seq(3, length(c_chain_split),2)){
paste(c_chain_split[1:i], collapse = ' ')
}
Then build the rows with c()
and rbind()
them together.
Upvotes: 0