ber0.0
ber0.0

Reputation: 11

extract specific pattern from string in R

I would like to extract a series of number (the middle 2 digits from first and first 4 digits from latter) from the following imported data to create a new list as Date.

 1. 251502 091412  
 2. 81502 091549  
 3. 591601 011207  
 4. 591601 011213  
 5. 591502 1026025  
 6. 591601 011203  
 7. 591601 011206  
 8. 631601 120302

I wrote the following code for a test run and it does well:

x <- c("251502 091412", "591502 1026025")
t = do.call(rbind, strsplit(x, " "))
t1 = substring(rev(t[,1]), 3, 4) 
t2 = substring(t[, 2], 1, 4)
t = paste("20", t1, t2, sep = "")
ar <- as.Date(t, "%Y%m%d")

However, when I imported bigger set of data, if failed to apply. Is there anything that I missed?

----------------------------UPDATE----------------------------------------

I guess I did not ask the question clearly. What I wanted to do is to extract the numbers from the UPC and transfer them to date. The code from above works fine if I entered the UPC manually, but it failed to compile while I imported the UPC.

> sample <- read.csv("~/Documents/Kokko/sample/sample.csv")
> sample
   sell.date Qty            UPC  Price
1    1/12/16   1  251502 091412 139.04
2    1/12/16   1   81502 091549 318.00
3    1/13/16   1  591601 011207 998.00
4    1/13/16   1  591601 011213 338.00
5    1/13/16   1 591502 1026025 131.60
6    1/13/16   1  591601 011203 338.00
7    1/13/16   1  591601 011206 998.00
8    1/13/16   1  631601 120302 298.00
9    1/13/16   1  631601 010523 278.00
10   1/13/16   1  591601 011212 338.00
11   1/13/16   1  591601 122121 378.00
12   1/13/16   1  691502 082036 369.60
13   1/13/16   1  951502 072421 658.00
14   1/14/16   1  591601 011207 998.00
15   1/14/16   1  591601 011212 338.00
16   2/13/16   1  631601 120302 298.00
17            NA                    NA
18            NA                    NA
> 
> library(data.table)
> d <- fread("~/Documents/Kokko/sample/sample.csv", select = c("UPC"))
> x <- as.character(d)
> x
[1] "c(\"251502 091412\", \"81502 091549\", \"591601 011207\", \"591601    011213\", \"591502 1026025\", \"591601 011203\", \"591601 011206\", \"631601 120302\", \"631601 010523\", \"591601 011212\", \"591601 122121\", \"691502 082036\", \"951502 072421\", \"591601 011207\", \"591601 011212\", \"631601 120302\", \"\", \"\")"
> t = do.call(rbind, strsplit(x, " "))
> t1 = substring(rev(t[,1]), 3, 4) 
> t2 = substring(t[, 2], 1, 4)
> t = paste("20", t1, t2, sep = "")
> ar <- as.Date(t, "%Y%m%d")
> ar
[1] NA`

Upvotes: 0

Views: 85

Answers (2)

alistaire
alistaire

Reputation: 43334

You need to index the first number from the end of the first number, as they're not all the same length:

# read in data as lines of text
x <- readLines(textConnection(' 1. 251502 091412  
                                2. 81502 091549  
                                3. 591601 011207  
                                4. 591601 011213  
                                5. 591502 1026025  
                                6. 591601 011203  
                                7. 591601 011206  
                                8. 631601 120302'))

# split on 1+ whitespace characters
t <- do.call(rbind, strsplit(x, '\\s+'))

# index t1 from the end of the string using `nchar`
t1 <- substr(t[,3], nchar(t[,3])-3, nchar(t[,3])-2) 
t2 <- substr(t[,4], 1, 4)
t <- paste0(t1, t2)           # simplified with `paste0` for `sep = ""`, chop "20" and
ar <- as.Date(t, "%y%m%d")    # use `%y` instead of `%Y` here

ar
# [1] "2015-09-14" "2015-09-15" "2016-01-12" "2016-01-12" "2015-10-26" "2016-01-12" "2016-01-12"
# [8] "2016-12-03"

Upvotes: 2

akrun
akrun

Reputation: 886968

We could do this in a single step using sub

as.Date(sub("^.+(.{2}).{2}\\s+(.{4}).*", "\\1\\2", 
                      sample$UPC), "%y%m%d")
#[1] "2015-09-14" "2015-09-15" "2016-01-12" "2016-01-12" "2015-10-26"
#[6] "2016-01-12" "2016-01-12" "2016-12-03" "2016-01-05" "2016-01-12"
#[11] "2016-12-21" "2015-08-20" "2015-07-24" "2016-01-12" "2016-01-12"
#[16] "2016-12-03" NA           NA          

data

sample <- structure(list(sell.date = c("1/12/16", "1/12/16", "1/13/16", 
"1/13/16", "1/13/16", "1/13/16", "1/13/16", "1/13/16", "1/13/16", 
"1/13/16", "1/13/16", "1/13/16", "1/13/16", "1/14/16", "1/14/16", 
"2/13/16", "", ""), Qty = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, NA, NA), UPC = c("251502 091412", 
"81502 091549", "591601 011207", "591601 011213", "591502 1026025", 
"591601 011203", "591601 011206", "631601 120302", "631601 010523", 
"591601 011212", "591601 122121", "691502 082036", "951502 072421", 
"591601 011207", "591601 011212", "631601 120302", "", ""), Price = c(139.04, 
318, 998, 338, 131.6, 338, 998, 298, 278, 338, 378, 369.6, 658, 
998, 338, 298, NA, NA)), .Names = c("sell.date", "Qty", "UPC", 
"Price"), class = "data.frame", row.names = c("1", "2", "3", 
"4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", 
"16", "17", "18"))

Upvotes: 0

Related Questions