Reputation: 841
I have a data frame as below:
## Please copy following text in your clipboard (do not copy this line)
hid ,mid ,aprps,astart ,aend ,ax ,ay ,exph
10001,1000101,3 ,2012-01-01 00:00:00,2012-01-01 00:00:00,475465.6,1272272,41.55607
10001,1000101,3 ,2012-01-01 00:00:00,2012-01-01 08:00:00,475465.6,1272272,41.55607
10001,1000101,4 ,2012-01-01 08:00:00,2012-01-01 08:15:00,475465.6,1272272,41.55607
10001,1000101,3 ,2012-01-01 08:15:00,2012-01-01 09:15:00,475465.6,1272272,41.55607
10001,1000101,4 ,2012-01-01 09:15:00,2012-01-01 09:30:00,475465.6,1272272,41.55607
10001,1000101,3 ,2012-01-01 09:30:00,2012-01-02 00:00:00,475465.6,1272272,41.55607
10001,1000102,3 ,2012-01-01 00:00:00,2012-01-01 00:00:00,475465.6,1272272,41.55607
10001,1000102,3 ,2012-01-01 00:00:00,2012-01-01 07:30:00,475465.6,1272272,41.55607
10001,1000102,4 ,2012-01-01 07:30:00,2012-01-01 07:50:00,475465.6,1272272,41.55607
10001,1000102,1 ,2012-01-01 07:50:00,2012-01-01 11:00:00,475465.6,1272272,41.55607
10001,1000102,4 ,2012-01-01 11:00:00,2012-01-01 11:20:00,475465.6,1272272,41.55607
10001,1000102,3 ,2012-01-01 11:20:00,2012-01-01 14:00:00,475465.6,1272272,41.55607
10001,1000102,4 ,2012-01-01 14:00:00,2012-01-01 14:20:00,475465.6,1272272,41.55607
10001,1000102,1 ,2012-01-01 14:20:00,2012-01-01 17:00:00,475465.6,1272272,41.55607
10001,1000102,4 ,2012-01-01 17:00:00,2012-01-01 17:20:00,475465.6,1272272,41.55607
10001,1000102,3 ,2012-01-01 17:20:00,2012-01-02 00:00:00,475465.6,1272272,41.55607
10001,1000103,3 ,2012-01-01 00:00:00,2012-01-01 00:00:00,475465.6,1272272,41.55607
10001,1000103,3 ,2012-01-01 00:00:00,2012-01-01 08:00:00,475465.6,1272272,41.55607
10001,1000103,4 ,2012-01-01 08:00:00,2012-01-01 12:00:00,475465.6,1272272,41.55607
10001,1000103,3 ,2012-01-01 12:00:00,2012-01-01 13:00:00,475465.6,1272272,41.55607
10001,1000103,4 ,2012-01-01 13:00:00,2012-01-01 19:00:00,475465.6,1272272,41.55607
10001,1000103,3 ,2012-01-01 19:00:00,2012-01-01 20:00:00,475465.6,1272272,41.55607
10001,1000103,4 ,2012-01-01 20:00:00,2012-01-01 23:00:00,475465.6,1272272,41.55607
10001,1000103,3 ,2012-01-01 23:00:00,2012-01-02 00:00:00,475465.6,1272272,41.55607
10001,1000104,3 ,2012-01-01 00:00:00,2012-01-01 00:00:00,475465.6,1272272,41.55607
10001,1000104,3 ,2012-01-01 00:00:00,2012-01-01 07:00:00,475465.6,1272272,41.55607
10001,1000104,4 ,2012-01-01 07:00:00,2012-01-01 07:30:00,473548.0,1279171,41.55607
10001,1000104,2 ,2012-01-01 07:30:00,2012-01-01 10:00:00,473548.0,1279171,41.55607
10001,1000104,4 ,2012-01-01 10:00:00,2012-01-01 10:30:00,475465.6,1272272,41.55607
10001,1000104,3 ,2012-01-01 10:30:00,2012-01-01 17:30:00,475465.6,1272272,41.55607
10001,1000104,4 ,2012-01-01 17:30:00,2012-01-01 17:45:00,484869.7,1270558,41.55607
10001,1000104,2 ,2012-01-01 17:45:00,2012-01-01 21:30:00,484869.7,1270558,41.55607
10001,1000104,4 ,2012-01-01 21:30:00,2012-01-01 21:45:00,475465.6,1272272,41.55607
10001,1000104,3 ,2012-01-01 21:45:00,2012-01-02 00:00:00,475465.6,1272272,41.55607
## Do not copy this line
You can copy above text and import as df
using {psych}
package:
install.packages("psych")
library(psych)
# Please copy above text and run following
df <- read.clipboard(header=TRUE, sep=",")
What I need to obtain from the df
are:
exph
in two pairs of rows, which are extracted at aprps==4
and previous lineaprps==4
, repeat it by group of mid
exph
and corresponded hid
in list or data frameTo make it out, I am currently using following scripts based on two loops:
library(tidyverse)
calc <- function(i) {
## Extract records by "mid" excluding the first records
temp <<- df %>% filter(mid==i) %>% filter(row_number()>1)
## Extract row number of "aprps==4"
r.aprps <- which(temp$aprps==4)
## Repeat operation by two pairs of rows based on "r.aprps"
for (j in 1:length(r.aprps)) {
## Extract movement
temp2 <<- temp[c((r.aprps[j]-1):r.aprps[j]),]
## Other operations in actual data set (jsut put example)
exp <- data.frame(mid=unique(temp2$mid),expsum=sum(temp2$exph))
## Store PPA in list
if (lp==1 & j==1) {
df.exp <<- exp
} else {
df.exp <<- rbind(df.exp,exp)
}
}
}
## Set loop conditions
list.mid <- unique(df$mid)
nloop <- length(list.mid)
## Initialize df.exp
df.exp <- data.frame(matrix(vector(),0,2,
dimnames=list(c(),c("mid","expsum"))),
stringsAsFactors=F)
## Loop to store PPA in list
for (lp in 1:nloop) {
calc(list.mid[lp])
}
However, as actual data frame df
contains around 40,000 records and actual operation contains more complicated calculations, it takes more than 30 hours. I was trying to find the way to shorten the operation and now trying to apply map
function from purrr
to store each operation in a nested data frame, not to replace variables every time in loop operation.
Following scripts are the ones that I am trying to build, however it cannot reach desired output.
## Store df by mid into list
nest <- df %>% group_by(mid) %>% nest()
## Extract row number with "aprps==4"
nest2 <- nest %>% mutate(row.aprps4=map(data,~which(.$aprps==4)))
## Obtain row numbers to extract by movement
nest3 <- nest2 %>% mutate(row.aprps4_1=map(data,~data.frame(rm1=which(.$aprps==4)-1)),
row.aprps4_2=map(data,~data.frame(rm1=which(.$aprps==4))))
## How to extract two pairs of records based on row.aprps4_1 and row.aprps4_1 and store sum of exph?
Some trials:
# It works but cannot extract records using two variables (row.aprps4_1 and .._2)
nest3 %>% mutate(move=map2(data,row.aprps4_1,~filter(.x,seq_len(nrow(.x))%in%.y)))
# Using pmap to specify range of filtering by two variables but does not work
nest4 %>% pmap(data,row.move1,row.move2,~filter(..1,seq_len(nrow(..1))%in%..2))
# Using double map function instead of double loop but does not work
pmap(nest4$data,nest4$row.move1,nest4$row.move2,~filter(..1,seq_len(nrow(..1))%in%c(..2:..3)))
Do you have any suggestion to make the operation fasten?
I prefer to use map
function to learn about it, however other alternatives are also welcomed.
I also found this post similar to this issue but could not solve the issue how to extract two rows based on dynamic variable r.aprpr4_1
and _2
.
===== UPDATE: ISSUE SOLVED =====
I could solve the issue by following scripts:
## Convert df into nested data frame by `mid`
nest <- df %>% group_by(mid) %>% nest()
## Obtain row numbers to extract aprps==4
nest2 <- nest %>% mutate(r=map(data,~which(.$aprps==4)))
## Split r and expand record
nest3 <- nest2 %>% unnest(r,.drop=FALSE)
## Extract pairs of movement
nest4 <- nest3 %>% mutate(pair=map2(data,r,~filter(.x,seq_len(nrow(.x))%in%c((.y-1):.y)))) %>% dplyr::select(mid,pair)
The points were:
unnest()
to expand each records by extracted vectors from aprps==4
(cannot apply .x%in%.y
where .y
has more than two length )mutate
is necessary to apply map2
(codes such asnest3 %>% map2(a,b,~f(.x,.y...))
is not accepted)A lot of thanks for following posts to get this solution:
Split delimited strings in a column and insert as new rows
Upvotes: 1
Views: 1076
Reputation: 107567
Since you mention other alternatives are also welcomed, consider base R. Several issues derive from your initial (non-purr) setup:
One of the biggest issue of original code is using rbind
inside a loop which leads to excessive copying in memory as explained in this SO thread, Replace rbind in for-loop with lapply? (2nd circle of hell) and Patrick Burn's R Internal - Circle 2: Growing Objects. To resolve, build a list of data frames that is appended outside of loop.
The repeated use of scoping assignment, <<-
, to affect the global environment from inside a local function appears to be unneeded, especially since temp objects are replaced with each loop so only last iteration will maintain. Often this operator is discouraged as it becomes tough to debug since global variables are adjusted. Functions are best handled when one object is returned.
You initialize an empty data frame, df.exp
before calling calc()
but overwrite it inside the loop with <<-
. Usually, after assigning an empty matrix or data frame, one assigns by rows inside loop but this is not done.
Looping through unique()
values can be replaced with by()
or split()
which also avoids using dplyr::filter()
inside function. By the way, there are performance challenges of using pipes, %>%
inside loops.
Rather than for
loop, use the apply family to build a list of objects after iteration such as lapply
which avoids the bookkeeping of for
loops which needs to initialize an empty list and assign elements to it (though there is nothing wrong with doing this approach). Also, in this way you avoid use of <<-
within function.
Base R (using by
, lapply
, and do.call
)
calc <- function(sub) {
## Extract records by "mid" excluding the first records
temp <- sub[2:nrow(temp),]
## Extract row number of "aprps==4"
r.aprps <- which(temp$aprps==4)
## Store exp dataframes in list
subdf_list <- lapply(1:length(r.aprps), function(j) {
## Extract movement by two pairs of rows based on "r.aprps"
temp2 <- temp[c((r.aprps[j]-1):r.aprps[j]),]
## Other operations in actual data set (just put example)
exp <- data.frame(mid=unique(temp2$mid), expsum=sum(temp2$exph))
return(exp)
})
df.exp <- do.call(rbind, subdf_list)
return(df.exp)
}
## subset by mid and pass subsets to calc()
df_list <- by(df, df$mid, calc)
## append all in final object
final_df <- do.call(rbind, df_list)
Because base::rbind.data.frame
has some disadvantages, consider third-party packages as replacement of do.call(rbind, ...)
such as dplyr::bind_rows()
and data.table::rbindlist()
.
df.exp <- dplyr::bind_rows(subdf_list)
...
final_df <- dplyr::bind_rows(df_list)
df.exp <- data.table::rbindlist(subdf_list)
...
final_df <- data.table::rbindlist(df_list)
Upvotes: 1