Reputation: 31
I am trying to remove the lat/long coordinates in the string below to place into two separate columns "lat" and "long" in R. I haven't had much luck using separate in dplyr. Any help would be most appreciated.
data_clean <- c("7205 MILLS CIVIC PKWY\nWEST DES MOINES 50266\n(41.561342, -93.806489)",
"305 AIRPORT RD\nAMES 50010\n(42.001123, -93.61365)",
"210 EAST TOWER PARK DR\nWATERLOO 50702\n(42.456362, -92.352552)")
data_clean_df <- as.data.frame(data_clean)
Upvotes: 2
Views: 125
Reputation: 101189
Here is a base R solution using gsub()
df <- data.frame(data_clean = gsub("(.*)\n.*","\\1",data_clean),
lat = gsub(".*?\\((.*),.*","\\1",data_clean),
lon = gsub(".*,(.*)\\)","\\1",data_clean))
such that
data_clean lat lon
1 7205 MILLS CIVIC PKWY\nWEST DES MOINES 50266 41.561342 -93.806489
2 305 AIRPORT RD\nAMES 50010 42.001123 -93.61365
3 210 EAST TOWER PARK DR\nWATERLOO 50702 42.456362 -92.352552
Upvotes: 1
Reputation: 388862
We can use tidyr::extract
dividing the data in data_clean
into 3 groups.
library(dplyr)
library(tidyr)
data_clean_df %>%
mutate(data_clean = gsub('\n', '', data_clean)) %>%
extract(data_clean, into = c('address', 'lat', 'lon'),
regex = '(.*)\\((.*),\\s+(.*)\\)', convert = TRUE)
# data_clean lat lon
#1 7205 MILLS CIVIC PKWY\nWEST DES MOINES 50266 41.5613 -93.8065
#2 305 AIRPORT RD\nAMES 50010 42.0011 -93.6137
#3 210 EAST TOWER PARK DR\nWATERLOO 50702 42.4564 -92.3526
Upvotes: 2
Reputation: 8110
Another option if you just want to pull out lat and long:
library(tidyverse)
data_clean <- c("7205 MILLS CIVIC PKWY\nWEST DES MOINES 50266\n(41.561342, -93.806489)",
"305 AIRPORT RD\nAMES 50010\n(42.001123, -93.61365)",
"210 EAST TOWER PARK DR\nWATERLOO 50702\n(42.456362, -92.352552)")
data_clean_df <- as.data.frame(data_clean, stringsAsFactors = F)
data_clean_df %>%
mutate(lat = str_extract(data_clean, "(?<=\\().*?(?=,)"),
long = str_extract(data_clean, paste0("(?<=", lat, ",\\s).*?(?=\\))")))
#> data_clean
#> 1 7205 MILLS CIVIC PKWY\nWEST DES MOINES 50266\n(41.561342, -93.806489)
#> 2 305 AIRPORT RD\nAMES 50010\n(42.001123, -93.61365)
#> 3 210 EAST TOWER PARK DR\nWATERLOO 50702\n(42.456362, -92.352552)
#> lat long
#> 1 41.561342 -93.806489
#> 2 42.001123 -93.61365
#> 3 42.456362 -92.352552
Upvotes: 1
Reputation: 8107
library(dplyr)
library(tidyr)
library(stringr)
data_clean_df %>%
separate(data_clean, into = c("a", "b", "c"), sep = "\n") %>%
mutate(c = str_remove_all(c, "\\(|\\)")) %>%
separate(c, c("lat", "lon"), sep = ", ", convert = TRUE)
a b lat lon
1 7205 MILLS CIVIC PKWY WEST DES MOINES 50266 41.56134 -93.80649
2 305 AIRPORT RD AMES 50010 42.00112 -93.61365
3 210 EAST TOWER PARK DR WATERLOO 50702 42.45636 -92.35255
Upvotes: 1