Reputation: 556
I have the following df (dput()
at the end of question):
link person type ID1 bin
1311752 793482 514677 departure 347 36900
1020897 66154 514677 arrival 379 36900
1376554 858880 514679 departure 374 42300
1374956 858880 514679 arrival 374 46800
1567933 992830 514680 arrival 140 31500
1568804 992830 514680 departure 140 32400
726546 87723 514681 departure 42 25200
726547 481929 514681 arrival 233 27900
723875 481929 514681 departure 233 45000
1396595 87723 514681 arrival 42 45900
What I want is to create a new df, where I see which person
goes from (type=="departure"
) which zone (ID1
) to (type=="arrival"
) which zone (ID1
) and at which departure bin
.
As can be seen in the last 4 rows of the example above, it is possible that the same person
has multiple arrival
and departure
. It is important therefore that the two, which are closest to one another are combined (therefore smallest delta bin
).
Therefore the new df should look something like this:
person departure arrival bin
514677 347 379 36900
514679 374 374 42300
514681 42 233 25200
514681 233 42 45000
...
As you can see, the person 514681
travels back and forth between 42 and 233.
I guess you can say that I want to aggregate the data based on the person
.
I am not quite sure how this can be accomplished and would be very happy for some help!
structure(list(link = c("793482", "66154", "858880", "858880",
"992830", "992830", "858880", "858880", "514143", "87723", "481929",
"481929", "87723", "87723", "964586", "87723", "1013453", "pt_8591034",
"973606", "pt_8591034", "481929", "481929", "87723", "87723",
"399738", "399738", "87723", "738635", "481929", "738635", "481929",
"738635", "301867", "89053", "92917", "92917", "981899", "981899",
"89053", "92917", "89053", "89053", "699715", "699715", "89053",
"452176", "738635", "452176", "738635", "641738", "712438", "699715",
"215822", "621953", "634264", "362138", "634264", "634264", "763815",
"763815", "631191", "631191", "763815", "763815", "634264", "634264",
"10826", "10826", "634264", "999977", "1046826", "1046826", "634264",
"999977", "848815", "848815", "999977", "999977", "848815", "848815",
"999977", "261239", "993653", "287772", "763815", "763815", "287772",
"494898", "824071", "594222", "494898", "494898", "452642", "452642",
"494898", "494898", "452642", "494898", "787840", "442285"),
person = c(514677L, 514677L, 514679L, 514679L, 514680L, 514680L,
514680L, 514680L, 514680L, 514681L, 514681L, 514681L, 514681L,
514682L, 514682L, 514683L, 514683L, 514684L, 514684L, 514684L,
514684L, 514684L, 514684L, 514685L, 514685L, 514685L, 514685L,
514688L, 514688L, 514688L, 514688L, 514690L, 514690L, 514707L,
514707L, 514707L, 514707L, 514707L, 514707L, 514708L, 514708L,
514709L, 514709L, 514709L, 514709L, 514710L, 514710L, 514710L,
514710L, 514711L, 514711L, 514733L, 514733L, 514757L, 514757L,
514757L, 514757L, 514759L, 514759L, 514759L, 514759L, 514759L,
514759L, 514759L, 514759L, 514760L, 514760L, 514760L, 514760L,
514774L, 514774L, 514774L, 514774L, 514774L, 514774L, 514774L,
514774L, 514774L, 514774L, 514774L, 514774L, 514782L, 514782L,
514786L, 514786L, 514786L, 514786L, 514802L, 514802L, 514802L,
514802L, 514803L, 514803L, 514803L, 514803L, 514803L, 514803L,
514804L, 514804L, 514805L), type = c("departure", "arrival",
"departure", "arrival", "arrival", "departure", "arrival",
"departure", "arrival", "departure", "arrival", "departure",
"arrival", "departure", "arrival", "departure", "arrival",
"arrival", "departure", "departure", "arrival", "departure",
"arrival", "departure", "arrival", "departure", "arrival",
"departure", "arrival", "departure", "arrival", "departure",
"arrival", "departure", "arrival", "departure", "arrival",
"departure", "arrival", "departure", "arrival", "departure",
"arrival", "departure", "arrival", "departure", "arrival",
"arrival", "departure", "departure", "arrival", "departure",
"arrival", "departure", "arrival", "departure", "arrival",
"departure", "arrival", "departure", "arrival", "departure",
"arrival", "departure", "arrival", "departure", "arrival",
"departure", "arrival", "departure", "arrival", "departure",
"arrival", "departure", "arrival", "departure", "arrival",
"departure", "arrival", "departure", "arrival", "departure",
"arrival", "departure", "arrival", "departure", "arrival",
"departure", "arrival", "departure", "arrival", "departure",
"arrival", "departure", "arrival", "departure", "arrival",
"departure", "arrival", "departure"), ID1 = c(347L, 379L,
374L, 374L, 140L, 140L, 374L, 374L, 401L, 42L, 233L, 233L,
42L, 42L, 496L, 42L, 235L, 172L, 231L, 172L, 233L, 233L,
42L, 42L, 267L, 267L, 42L, 58L, 233L, 58L, 233L, 58L, 292L,
96L, 264L, 264L, 26L, 26L, 96L, 264L, 96L, 96L, 266L, 266L,
96L, 57L, 58L, 57L, 58L, 123L, 55L, 266L, 77L, 412L, 407L,
410L, 407L, 407L, 297L, 297L, 169L, 169L, 297L, 297L, 407L,
407L, 566L, 566L, 407L, 403L, 380L, 380L, 407L, 403L, 436L,
436L, 403L, 403L, 436L, 436L, 403L, 545L, 324L, 448L, 297L,
297L, 448L, 209L, 152L, 205L, 209L, 209L, 204L, 204L, 209L,
209L, 204L, 209L, 383L, 181L), bin = c(36900, 36900, 42300,
46800, 31500, 32400, 34200, 61200, 72000, 25200, 27900, 45000,
45900, 27900, 31500, 30600, 33300, 56700, 56700, 60300, 61200,
63900, 65700, 28800, 32400, 75600, 77400, 24300, 26100, 45900,
47700, 51300, 53100, 27900, 31500, 36900, 38700, 41400, 42300,
62100, 63900, 15300, 16200, 45900, 47700, 35100, 35100, 67500,
67500, 63900, 65700, 45900, 46800, 63000, 64800, 90000, 90900,
27000, 28800, 45000, 46800, 63000, 67500, 68400, 72000, 48600,
50400, 53100, 55800, 37800, 38700, 40500, 41400, 47700, 48600,
51300, 52200, 62100, 63000, 63900, 64800, 22500, 24300, 19800,
20700, 54000, 57600, 21600, 22500, 55800, 56700, 36900, 38700,
51300, 52200, 62100, 63900, 30600, 34200, 33300)), row.names = c(1311752L,
1020897L, 1376554L, 1374956L, 1567933L, 1568804L, 1378922L, 1378604L,
796332L, 1396610L, 726547L, 723875L, 1396595L, 1398179L, 1513826L,
1396821L, 27568L, 1607272L, 1542798L, 1606758L, 721147L, 720198L,
1397549L, 1398033L, 621908L, 626763L, 1397225L, 1184210L, 729418L,
1185149L, 728038L, 1183207L, 465692L, 1426554L, 1460666L, 1457289L,
1551709L, 1550925L, 1429217L, 1461183L, 1427235L, 1428930L, 1117203L,
1119814L, 1431191L, 683362L, 1183236L, 684338L, 1184602L, 990483L,
1142535L, 1125893L, 288323L, 947577L, 966966L, 544664L, 969372L,
964351L, 1230180L, 1231933L, 960341L, 960438L, 1238195L, 1241438L,
965853L, 964667L, 105034L, 106179L, 966724L, 1586724L, 72120L,
76232L, 969048L, 1587092L, 1360314L, 1361152L, 1585499L, 1586919L,
1361549L, 1359631L, 1587109L, 378600L, 1573640L, 428095L, 1236840L,
1235995L, 430374L, 757402L, 1331227L, 916001L, 760182L, 757620L,
686066L, 685765L, 759010L, 760140L, 687455L, 758617L, 1287055L,
676520L), class = "data.frame")
Upvotes: 1
Views: 54
Reputation: 1593
With dplyr
, this could be something like this:
library(dplyr)
data %>%
group_by(person) %>%
summarize(departure = ID1[type == "departure"][1],
arrival = ID1[type == "arrival"][1],
bin = bin[type == "departure"][1])
Here a refined version that tries to satisfy the additional logic. Since the problem is not too clear, I am quite certain that it is not yet 100% fine.
data %>%
arrange(person, bin) %>%
group_by(person) %>%
mutate(pair = rep(row_number(), each = 2, length.out = n())) %>%
group_by(person, pair) %>%
summarize(departure = ID1[type == "departure"][1],
arrival = ID1[type == "arrival"][1],
bin = bin[type == "departure"][1])
Upvotes: 1
Reputation: 4358
In base R we can split the data and only keep the person/bin combinations that have more than 1 row (meaning there is an arrival and a departure) then extract the data.
splitdata <- split(df,with(df,paste0(person,bin)))
splitdata <- splitdata[sapply(splitdata,nrow)>1]
do.call(rbind,
lapply(splitdata, function(x)
data.frame(
person = x[1,2],
departure = x$ID1[x$type == "departure"],
arrival = x$ID1[x$type == "arrival"],
bin = x[1,5])))
output:
person departure arrival bin
51467736900 514677 347 379 36900
51468456700 514684 231 172 56700
51471035100 514710 57 58 35100
51471067500 514710 58 57 67500
Upvotes: 0