Reputation: 267
I have a data.frame
with 2291 rows and 4 columns, and I want to pick those rows whose column 3 match with column 2 of the next row, and start again from the next matched row and end until the matching goes on until it stops.
I tried using a for
loop from 1:nrow(df)
, but this is not exactly accurate as i
(I think) doesn't really start from the point of matched row.
My current code is like this:
test <- NULL
x <- c()
y <- c()
for(i in 1:nrow(df)){
if(df[i,3]==df[i+1,2]){
x <- df[i,]
y <- df[i+1,]
i = i+1 #stuck at this
}
test <- rbind(test,x,y)
}
Sample data looks like this:
X 2670000 3750000 C
X 3830000 8680000 E3
X 8680000 10120000 E1-A
X 10120000 11130079 D
X 11170079 11810079 E3
X 11810079 12810079 E2-A
X 12810079 13530079 E3
X 13530079 14050079 E3
X 14050079 15330079 A
X 15330079 16810079 E2-A
X 16810079 17690079 E2-A
What I want is:
X 3830000 8680000 E3
X 8680000 10120000 E1-A
X 10120000 11130079 D
X 11170079 11810079 E3
X 11810079 12810079 E2-A
X 12810079 13530079 E3
X 13530079 14050079 E3
X 14050079 15330079 A
X 15330079 16810079 E2-A
X 16810079 17690079 E2-A
I'm actually interested in the column 4 values. After such a condition when
df[i,3]
is not equal to df[i+1,2]
, can the code be updated to store the column 4 values in vectors?
For example: The result for this sample would be:
vector_1
"E3" "E1-A" "D"
vector_2
"E3" "E2-A" "E3" "E3" "A" "E2-A" "E2-A"
What I get so far is:
X 3830000 8680000 E3
X 8680000 10120000 E1-A
X 8680000 10120000 E1-A
X 10120000 11130079 D
X 8680000 10120000 E1-A
X 10120000 11130079 D
X 11170079 11810079 E3
X 11810079 12810079 E2-A
X 11810079 12810079 E2-A
X 12810079 13530079 E3
If I go from row 1 to the last row of my df, I want to keep adding column 4 values in a vector as long as column 3 of i
matches column 2 of i+1
. Once that condition breaks, the next time the same condition is met, I want to keep storing the column 4 values again.
Thank you!
Upvotes: 8
Views: 423
Reputation: 47300
You can do it this way :
# indices of the changes
changed <- c(head(df[[3]],-1) != df[-1,2])
# split by cumsum of those
split_ <- split(df[-1,],cumsum(changed))
split_
# $`1`
# V1 V2 V3 V4
# 2 X 3830000 8680000 E3
# 3 X 8680000 10120000 E1-A
# 4 X 10120000 11130079 D
#
# $`2`
# V1 V2 V3 V4
# 5 X 11170079 11810079 E3
# 6 X 11810079 12810079 E2-A
# 7 X 12810079 13530079 E3
# 8 X 13530079 14050079 E3
# 9 X 14050079 15330079 A
# 10 X 15330079 16810079 E2-A
# 11 X 16810079 17690079 E2-A
# Then to get your vectors :
lapply(split_,`[[`,4)
# $`1`
# [1] "E3" "E1-A" "D"
#
# $`2`
# [1] "E3" "E2-A" "E3" "E3" "A" "E2-A" "E2-A"
data
df <- read.table(text = "
X 2670000 3750000 C
X 3830000 8680000 E3
X 8680000 10120000 E1-A
X 10120000 11130079 D
X 11170079 11810079 E3
X 11810079 12810079 E2-A
X 12810079 13530079 E3
X 13530079 14050079 E3
X 14050079 15330079 A
X 15330079 16810079 E2-A
X 16810079 17690079 E2-A
",stringsAsFactors=F)
Upvotes: 3
Reputation: 5281
Here is a function that does the tasks.
# Results
generate(df)
# [[1]]
# X1 X2 X3 X4
# 2 X 3830000 8680000 E3
# 3 X 8680000 10120000 E1-A
# 4 X 10120000 11130079 D
#
# [[2]]
# X1 X2 X3 X4
# 5 X 11170079 11810079 E3
# 6 X 11810079 12810079 E2-A
# 7 X 12810079 13530079 E3
# 8 X 13530079 14050079 E3
# 9 X 14050079 15330079 A
# 10 X 15330079 16810079 E2-A
# 11 X 16810079 17690079 E2-A
To only have the vectors in question, you can either modify the function generate
or simply
out <- generate(df)
vec <- lapply(seq_along(out), function (k) out[[k]]$X4)
vec
# [[1]]
# [1] "E3" "E1-A" "D"
#
# [[2]]
# [1] "E3" "E2-A" "E3" "E3" "A" "E2-A" "E2-A"
finder <- function (k, dt) {
dt <- dt[k:nrow(dt),]
index <- NA_integer_
col2 <- as.integer(.subset2(dt,2L))
col3_first <- as.integer(.subset2(dt,3L))[1]
index <- which(col2 == col3_first)[1L] +k-1
index
}
getIndex <- function (k, dt) {
curr <- finder(k, dt)
if (curr == nrow(dt) || is.na(curr)) return (curr)
out <- c(k, curr)
s <- k
while (!is.na(curr)) {
s <- s+1
curr <- finder(s, dt)
out <- c(out, curr)
}
out <- out[!is.na(out)]
out
}
generate <- function (dt) {
rows <- 1:nrow(dt)
skip <- integer(0)
counter <- 0L
res <- list()
for (k in rows) {
if (k %in% skip) next
index <- getIndex(k, dt)
if (is.na(index) || length(index) == 1) next
skip <- c(skip, index)
counter <- counter + 1L
res[[counter]] <- df[index,]
}
res
}
For a given row number k
finder
will yield the first n
(> k
) such that df[k,3] == df[i,2]
getIndex
will yield the longest increasing sequence (k=)k_0;k_1;...;k_n
such that
df[k_i,3] == df[k_i+1,2]
for all i = 1;...;n
Finally, generate
yields the sequence from getIndex
for all rows, however if a given row k
is contained in a previous sequence, then it skips that row.
Data
structure(list(X1 = c("X", "X", "X", "X", "X", "X", "X", "X",
"X", "X", "X"), X2 = c("2670000", "3830000", "8680000", "10120000",
"11170079", "11810079", "12810079", "13530079", "14050079", "15330079",
"16810079"), X3 = c("3750000", "8680000", "10120000", "11130079",
"11810079", "12810079", "13530079", "14050079", "15330079", "16810079",
"17690079"), X4 = c("C", "E3", "E1-A", "D", "E3", "E2-A", "E3",
"E3", "A", "E2-A", "E2-A")), class = "data.frame", row.names = c(NA,
-11L))
Upvotes: 1
Reputation: 1067
UPDATE: I realized that I was missing some conditions I updated the code
If you just want to keep the result of the 4th column you can try this:
df <- read.table(text = "
X 2670000 3750000 C
X 3830000 8680000 E3
X 8680000 10120000 E1-A
X 10120000 11130079 D
X 11170079 11810079 E3
X 11810079 12810079 E2-A
X 12810079 13530079 E3
X 13530079 14050079 E3
X 14050079 15330079 A
X 15330079 16810079 E2-A
X 16810079 17690079 E2-A
",stringsAsFactors=F)
eq=c(-1,df$V3[1:(nrow(df)-1)]-df$V2[2:nrow(df)])
storeV3 <- list()
vector_id = 0
for(i in 1:length(eq)){
if(eq[i] == 0)
storeV3[[paste0("vector_",vector_id)]]=c(storeV3[[paste0("vector_",vector_id)]],df$V4[i])
if(eq[i] != 0){
if(i> 1 && eq[i-1] == 0){
vector_id = vector_id + 1
storeV3[[paste0("vector_",vector_id)]]=c(storeV3[[paste0("vector_",vector_id)]],df$V4[i])
}
else if(i < length(eq) && eq[i+1] == 0)
storeV3[[paste0("vector_",vector_id)]]=c(storeV3[[paste0("vector_",vector_id)]],df$V4[i])
}
}
The trick I use is to first create a vector to know the indices of the matches and then store the value of V3
in lists of vectors which indices are updated on the fly. This gives me:
> storeV3
$vector_0
[1] "E3" "E1-A" "D"
$vector_1
[1] "E3" "E2-A" "E3" "E3" "A" "E2-A" "E2-A"
Upvotes: 2
Reputation: 267
I made a scrappy code like this, seems to work:
for(i in 1:nrow(df)){
if(df[i,3]==df[i+1,2]){
next
x <- df[i,]
y <- df[i+1,]
i = i+1
}
test <- rbind(test,x,y)
}
test <- test[!duplicated(test),]
v <- list()
j <- 0
for(i in 1:nrow(test)){
if(test[i,3]!=test[i+1,2]){
v[[i]] <- test[(j+1):i,4]
j <- i
next
}
}
v <- v[!is.na(v)]
Upvotes: -3
Reputation: 315
Instead of for loop use repeat. It's much easier this way. First create a duplicate column of 2nd column from 2nd row onwards and compare. I used your sample data and got desired results -
data = read.csv('clipboard', sep = "", header = F, stringsAsFactors = F)
data$v22 = c(data$V2[2:nrow(data)],0)
data
V1 V2 V3 V4 v22
1 X 2670000 3750000 C 3830000
2 X 3830000 8680000 E3 8680000
3 X 8680000 10120000 E1-A 10120000
4 X 10120000 11130079 D 11170079
5 X 11170079 11810079 E3 11810079
6 X 11810079 12810079 E2-A 12810079
7 X 12810079 13530079 E3 13530079
8 X 13530079 14050079 E3 14050079
9 X 14050079 15330079 A 15330079
10 X 15330079 16810079 E2-A 16810079
11 X 16810079 17690079 E2-A 0
find first match
i = 0
repeat {
i = i+1
if (data$V3[i] == data$v22[i]){
cat(paste('First Match at row',i))
break
}
}
Now save additional match rows and keep looking after match breaks and save additional match values
df = data.frame(NULL)
xstart = c(i)
xend = NULL
repeat{
if (data$V3[i] == data$v22[i]){
df = rbind(df,data[i,])
i = i+1
} else { xend = c(xend,i)
if (i < nrow(data)){
repeat {
i = i +1
if (data$V3[i] == data$v22[i]){
cat(paste('Next Match at row',i))
xstart = c(xstart,i)
break
}
}
}
}
if (i == nrow(data)){
break
}
}
print result
if (length(xend) < length(xstart)) {xend = c(xend,nrow(data))}
for (l in 1:length(xend)){
print(data$V4[xstart[l]:xend[l]])
}
[1] "E3" "E1-A" "D"
[1] "E3" "E2-A" "E3" "E3" "A" "E2-A" "E2-A"
Upvotes: 2
Reputation: 502
An easy way is to use the lead
function from the dplyr package.
lead(x, n = 1L, default = NA, order_by = NULL, ...)
Find the "next" or "previous" values in a vector. Useful for comparing values ahead of or behind the current values.
This also allows you to avoid the for-loop entirely. Since you haven't named your columns in the question, I'll use another example:
library(dplyr)
df <- data.frame(a = 1:5, b = c(2, 999, 4, 5, 999))
print(df) # In this example, we want to keep the 1st, 3rd, and 4th rows.
a b
1 1 2
2 2 999
3 3 4
4 4 5
5 5 999
matching_df <- df[df$b == dplyr::lead(df$a, 1, default = FALSE), ]
print(matching_df)
a b
1 1 2
3 3 4
4 4 5
non_matching_df <- df[df$b != dplyr::lead(df$a, 1, default = FALSE), ]
print(non_matching_df)
a b
2 2 999
5 5 999
Upvotes: 3