nik
nik

Reputation: 2584

Count the runs of consecutive values in each row and the gaps between those runs

My original data is like this

# Example input: a data frame with a single factor column `V` and seven rows.
# Every level is a comma-separated list of tokens such as "267 B361".
# The integer codes give the row order; `labels` lists the levels in their
# stored order, so row 1 is the 4th label, row 2 the 5th, and so on.
df <- data.frame(
  V = factor(
    c(4L, 5L, 3L, 7L, 6L, 2L, 1L),
    levels = 1:7,
    labels = c(
      "132 B26,172 B27,107 B57,104 B59,137 B60,133 B61,103 B62,134 B63,177 B100,123 B133,184 B168,109 B197,103 B198,173 B202,157 B203,143 B266,62 B342,62 B354,92 B355,195 B368,164 B370,52 B468,74 B469,71 B484,98 B494,66 B502,63 B601,133 B622",
      "135A,510A,511A,60 B23,67 B24,70 B25,95 B26,122 B27,123 B27,109 B60",
      "25A,28 B55,31 B56,45 B57,43 B58,5 B59,47 B59,6 B60,69 B60,66 B61",
      "267 B361,786 B363,543 B392",
      "563 B202,983 B360",
      "8 B1,12 B35,10 B71,9 B154,51 B179",
      "91 B26,117 B27,117 B28,102 B29,47 B31,96 B63,78 B64,133 B65,117 B66,121 B66,112 B67,127 B100"
    )
  )
)

Thanks to @akrun, I can extract the numbers with this call:

# Build a one-column data frame whose `v1` entry for each row of `df` is the
# comma-separated list of every number that directly follows an upper-case
# letter in `df$V` (e.g. "B361" contributes 361). Requires stringr.
# vapply() with a character(1) template replaces sapply() so that `v1` is
# always a character vector, whatever the number of rows in `df`.
Newdf <- data.frame(
  v1 = vapply(
    str_extract_all(df$V, "(?<=[A-Z])\\d+"),
    toString,
    character(1)
  ),
  stringsAsFactors = FALSE
)

From this output, I then want to count the runs of consecutive numbers in each row:

row 1 does not have

row 2 does not have

row 3 has 1 consecutive 55,56,57,58,59,59,60,60,61

row 4 has two consecutive 26,27, 28, 29 and 63,64,65,66,66,67

row 5 does not

row 6 has 1

row 7 has 6: (26,27) (59,60,61,62,63) (197,198) (202,203) (354,355) (468,469). Then I want to add one column showing, for each run, the difference between its last value and the first value of the next run,

#for example (26,27) and (59,60,61,62,63)  is 59-27= 32
#(59,60,61,62,63) and (197,198) is 197-63=134
#(197,198)  and (202,203) is 202-198= 4
#(202,203) and (354,355) is 354-203= 151
#(354,355) and (468,469) is 468-355= 113

So my output will be like this

            V2              V3
            0               0
            0               0
            1               0
            2               34
            0               0
            1               0
            6            32,134,4,151,113

Upvotes: 1

Views: 95

Answers (1)

akrun
akrun

Reputation: 887511

We could try

library(stringr)
library(data.table)
# Locate every run of two or more consecutive integers in `x`.
#
# `x` is de-duplicated and sorted first, so a repeated value (e.g. 66, 66)
# can neither break a run nor be mistaken for a run of its own.
# Returns a list with two equally long numeric vectors:
#   start - first value of each run
#   end   - last value of each run
find_runs <- function(x) {
  vals <- sort(unique(x))
  if (length(vals) < 2L) {
    # Fewer than two distinct values can never form a run.
    return(list(start = vals[0L], end = vals[0L]))
  }
  # A new run id begins wherever the step to the next value is not exactly 1.
  run_id <- cumsum(c(TRUE, diff(vals) != 1))
  starts <- vals[!duplicated(run_id)]
  ends <- vals[!duplicated(run_id, fromLast = TRUE)]
  # Groups made of a single value are not runs.
  is_run <- ends > starts
  list(start = starts[is_run], end = ends[is_run])
}

# Numbers that directly follow an upper-case letter, per row, in ascending
# order (duplicates are kept here; find_runs() removes them itself).
lst1 <- lapply(str_extract_all(df$V, "(?<=[A-Z])\\d+"), as.numeric)
lst1 <- lapply(lst1, sort)

# Both result columns are derived from the same run detection so that they
# can never disagree about what counts as a run.
runs <- lapply(lst1, find_runs)

# V2: number of runs of consecutive values in each row.
V2 <- vapply(runs, function(r) length(r$start), integer(1))

# V3: for rows with at least two runs, the gaps between neighbouring runs
# (start of the next run minus end of the previous one) as one
# comma-separated string; "0" for every other row.
i1 <- V2 > 1
V3 <- rep("0", length(V2))
V3[i1] <- vapply(
  runs[i1],
  function(r) toString(r$start[-1L] - r$end[-length(r$end)]),
  character(1)
)

d1 <- data.frame(V2, V3, stringsAsFactors = FALSE)
d1
#  V2                   V3
#1  0                    0
#2  0                    0
#3  1                    0
#4  2                   34
#5  0                    0
#6  1                    0
#7  6 32, 134, 4, 151, 113

Upvotes: 1

Related Questions