user1471980
user1471980

Reputation: 10626

Joining two data frames in R based on a common column name

I have two data frames like this:

dput(x)

structure(list(ICTO = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), .Label = "ICTO-6335", class = "factor"), Application = structure(c(5L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), .Label = c("AUS-PSOFT", 
"DBA-GL-ORA-PRD", "JPN-PSOFT", "LDN-PSOFT", "LNBCV_GL", "NYBCV_GL", 
"NYK-PSOFT", "SGBCV_GL", "SNG-PSOFT", "02-PEOPLESOFT", "11-SLR-PROC", 
"AP-CIT-BATCH-STATUS", "FCIT-GARS", "GBL-EXPENSE", "GLAD", "HRDMART-MAINT", 
"MISC-PSOFT", "NYK-LATE", "NYK-WKND", "REP_PSOFT"), class = "factor"), 
    Group = structure(c(58L, 58L, 58L, 58L, 58L, 58L, 58L, 58L, 
    58L, 58L), .Label = c("AUS-AP", "AUS-CHF", "AUS-CHK", "AUS-DATE", 
    "AUS-DE", "AUS-DST", "AUS-ESS", "AUS-GL", "AUS-GLI", "AUS-GLR", 
    "AUS-LATE", "AUS-SL", "AUS-SLI", "AUS-SLR", "AUS-SM", "AUS-SMR", 
    "JPN-AM", "JPN-AP", "JPN-CHF", "JPN-CHK", "JPN-DE", "JPN-GL", 
    "JPN-GLI", "JPN-GLR", "JPN-SL", "JPN-SLI", "JPN-SLR", "LDN-AP", 
    "LDN-CHF", "LDN-ESS", "LDN-GBM", "LDN-GL", "LDN-GL-BUD", 
    "LDN-GL-CPM", "LDN-GL-CPM-FULL", "LDN-GL-EIT", "LDN-GL-ITR", 
    "LDN-GLR", "LDN-PSOFT", "LDN-SMR", "NYK-AM", "NYK-AP", "NYK-BO", 
    "NYK-BRANCH", "NYK-CHF", "NYK-ESS", "NYK-GBM", "NYK-GL", 
    "NYK-GL-BUD", "NYK-GL-BUD-HC", "NYK-GL-FOR", "NYK-GLR", "NYK-SM", 
    "NYK-SMR", "PDCGL06", "PDCGL30", "PNYPSGL1", "RFCS", "SGP-GLR", 
    "SNG-AM", "SNG-AP", "SNG-BOK", "SNG-CHF", "SNG-CHK", "SNG-DE", 
    "SNG-GBM", "SNG-GL", "SNG-GL-BUD", "SNG-GLI", "SNG-GLR", 
    "SNG-MAS", "SNG-SHB", "SNG-SL", "SNG-SLI", "SNG-SLR", "SNG-SM", 
    "SNG-SMR", "TIS", "LNBCV", "NYBCV", "NYK-WKND-MAINT", "RECYCLE_APPSERV", 
    "RECYCLE_WEBSERV", "SGBCV", "02-REP-PEOPLESOFT", "11-001-HOUSEKEEP", 
    "11-001-RCL-CHK", "11-SLR-PROC-AU", "11-SLR-PROC-HK", "11-SLR-PROC-IN", 
    "11-SLR-PROC-INT", "11-SLR-PROC-JL", "11-SLR-PROC-KR", "11-SLR-PROC-SG", 
    "11-SLR-REG-RPT", "AUS", "BREAK-GLASS", "CLOAKWARE", "CONV", 
    "EMAIL-ALERT-MONITOR", "FCIT-GLI-GARS", "GLAD-AUS", "GLAD-LON", 
    "GLAD-NYK", "HKG", "HRDMART-MON", "JPN", "LDN", "LedgerLastFeed", 
    "LON_PEOPLESOFT", "NYK", "NYK-LATE", "RECYCLE_PRCSSKED", 
    "SGP", "SGS60A-080", "SPD", "SYNCH-PROD-DR"), class = "factor"), 
    JobName = c("EXBCV06D", "EXBCV06D", "EXBCV06D", "EXBCV06D", 
    "EXBCV06D", "EXBCV06D", "EXBCV06D", "EXBCV06D", "EXBCV06D", 
    "EXBCV06D"), Date = c(120820L, 120817L, 120816L, 120815L, 
    120814L, 120813L, 120810L, 120809L, 120808L, 120807L), Status = structure(c(2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Ended Not OK", 
    "Ended OK", "Executing", "Wait Condition", "Wait Resource"
    ), class = "factor"), StartTime = c(20120821015845, 20120819024725, 
    20120817010722, 20120816010512, 20120815013233, 20120814005343, 
    20120811004005, 20120810004613, 20120809012701, 20120808005116
    ), EndTime = c(20120821015854, 20120819024734, 20120817010733, 
    20120816010521, 20120815013242, 20120814005354, 20120811004015, 
    20120810004623, 20120809012710, 20120808005126), ElapseSecond = c(9L, 
    9L, 11L, 9L, 9L, 11L, 10L, 10L, 9L, 10L)), .Names = c("ICTO", 
"Application", "Group", "JobName", "Date", "Status", "StartTime", 
"EndTime", "ElapseSecond"), row.names = 2689:2698, class = "data.frame")

dput(y)

structure(list(JobName = c("XAPSJCDC0D", "XHPSJCD0HD", "XSPSJCD03D", 
"EXBCV06D", "EXESS120D", "EXGL008D", "EXGL027D", "EXGL028D", 
"EXGL035D", "EXGL042S"), EntryDesc = structure(c(59L, 60L, 61L, 
64L, 53L, 71L, 37L, 70L, 35L, 41L), .Label = c("AFINA FEED", 
"Arrival of All Australia Feeds", "Arrival of All Japan Feeds", 
"Arrival of All Singapore Feeds", "Arrival of Endur Feed", "Basel II Balance Sheet Extract - Pacific", 
"Billing Manager Feed", "BOK Reg Reports Availability", "CD GL Balance Extract J11 AYE to CARAT", 
"CD GL Balance Extract SGP to CARAT", "CD Taiwan GL Extract to SYSTEX", 
"CIF Affiliate Feed", "End of Endur Feed Processing", "End of Spectal BDLite Feed Processing", 
"FTP Carat LCYBS Daily Extract", "FX Shredder Currency upload", 
"GFX FXOps Interface", "GL Balance Extract A48 to CARAT", "GLOBAL MONEY MARKET FEED", 
"Glosub interface", "GMI Feed Load", "Inspire Journal Feed", 
"Intellimatch Feed Sent", "Intellimatch Feed Sent - Australia", 
"Intellimatch Feed Sent - Japan", "Intellimatch Feed Sent - Singapore", 
"Ledger Available - Australia", "Ledger Available - HK/KR/SG", 
"Ledger Available - Japan", "Load GERS Feed", "LOAD GERS FEED", 
"Load of the VATSET Feed file to staging", "Loan IQ feed", "MAS MERP Reports Availability", 
"MONTHLY SUMMARISED JOURNAL FEED", "MyHR feed for HRMS and HR4U (prev. Headcount feed)", 
"NTPA-LOAD TO STAGING USD CCY", "NY NTPA Journal Feed", "OLD WORLD 80 ps_tipsj", 
"OLD WORLD 80 ps_tipzs", "OPC IT - Arrival of GMI Feed", "Opera Exchange Rate Extract - AUS", 
"Opera Exchange Rate Extract - SNG", "PCIT - Arrival of Spectral Feed", 
"Peoplesoft - Basel II Balance Sheet Extract - NY", "Peoplesoft - BDLite Extract", 
"Peoplesoft - End of GMI Feed Processing", "Peoplesoft - End of NTPA GLI Feed Processing", 
"Peoplesoft - FSR fcdb transactions delivered", "Peoplesoft - FSR fclonae delivered", 
"Peoplesoft - FSR gmmbal delivered", "Peoplesoft - FTP Phase II Completion", 
"PeopleSoft - FTP Phase II Completion", "Peoplesoft - FX Rates feed to Opera", 
"Peoplesoft - GL Extract feed for Hong Kong to CARAT", "Peoplesoft - GL Extract feed for India to CARAT", 
"Peoplesoft - GL Extract for Korea Branch to CARAT", "Peoplesoft - NYK Alt YE Close", 
"Poets-GLI Feed to Peoplesoft For AUS", "Poets-GLI Feed to Peoplesoft For HKG", 
"Poets-GLI Feed to Peoplesoft For SNG", "PROCESS INPUT FEED FROM FEPS GE", 
"Project Accounting upload", "Reporting Server Available", "Run application engine to process Endur feed.", 
"SL Period Balance Extract for T15 FTP", "SL YTD Balance Extract for T14 FTP", 
"SL YTD Balance Extract for T15 FTP", "SPECTRAL Feeds", "SPHERE FEED UPLOAD", 
"SUMMIT LOAD TO STAGING", "TPW Sub-ledger extract ftp to CARAT", 
"Peoplesoft - BDLite Extract ", "Peoplesoft - End of GMI Feed Processing "
), class = "factor"), SLAType = structure(c(3L, 3L, 3L, 1L, 3L, 
3L, 3L, 3L, 3L, 2L), .Label = c("DDA", "Milestone", "OLA"), class = "factor"), 
    EntryType = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L), .Label = "Automated", class = "factor"), Active = structure(c(1L, 
    2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("OK", "ON"
    ), class = "factor"), LastRun = structure(c(1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L), .Label = c("2012/08/01", "2012/09/06", 
    " 2012/10/08", " 2012/10/10", " 2012/10/12", " 2012/10/15"
    ), class = "factor"), DataCenter = structure(c(2L, 2L, 2L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("PNYSHCTM07", "PSGSHCTM03"
    ), class = "factor"), ProviderReg = structure(c(2L, 2L, 2L, 
    1L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("Americas IT View", 
    "Asia Pacific IT View", "EMEA IT View"), class = "factor"), 
    ProviderDiv = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L), .Label = c("RF&CS IT", "BO IT"), class = "factor"), 
    ProviderSubDiv = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 1L), .Label = c("CFO IT - Product Control (KGK)", 
    "CFO IT – Financial Accounting (KGX)", "CFO IT - Financial Reporting [KGFX]", 
    "CFO IT ? Financial Accounting (KGX)"), class = "factor"), 
    ReceiverReg = structure(c(2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 
    3L, 3L), .Label = c("Americas Business View", "Asia Pacific Business View", 
    "EMEA Business View"), class = "factor"), ReceiverDiv = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Finance", 
    "Back Office"), class = "factor"), ReceiverSubDiv = structure(c(2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L), .Label = c("CFO IT – Financial Accounting (KGX)", 
    "Financial Accounting", "Product Control", "CFO - Financial Reporting", 
    "CFO IT ? Financial Accounting (KGX)"), class = "factor"), 
    Service = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 
    3L), .Label = c("Accounting Reporting", "Ledger Processing", 
    "Product Control", "Regional Financial Accounting"), class = "factor"), 
    ICTO = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
    ), .Label = "ICTO-6335", class = "factor"), SLAHour = c(4, 
    4, 4, 8.3, 7.3, 3, 3, 3, 4, 4), TargetDate = c(-1L, -1L, 
    -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L)), .Names = c("JobName", 
"EntryDesc", "SLAType", "EntryType", "Active", "LastRun", "DataCenter", 
"ProviderReg", "ProviderDiv", "ProviderSubDiv", "ReceiverReg", 
"ReceiverDiv", "ReceiverSubDiv", "Service", "ICTO", "SLAHour", 
"TargetDate"), row.names = c(NA, 10L), class = "data.frame")

I am doing this:

    xx<-merge(x, y, all.x=TRUE)

For example, the output looks like this:

head(subset(xx, JobName=="EXBCV06D"),10)
       JobName      ICTO Application Group   Date   Status    StartTime      EndTime ElapseSecond                  EntryDesc SLAType SLAHour TargetDate
35076 EXBCV06D ICTO-6335    LNBCV_GL  RFCS 120417 Ended OK 2.012042e+13 2.012042e+13            9 Reporting Server Available     DDA     8.3         -1
35077 EXBCV06D ICTO-6335    LNBCV_GL  RFCS 120417 Ended OK 2.012042e+13 2.012042e+13            9 Reporting Server Available     DDA     8.3         -1
35078 EXBCV06D ICTO-6335    LNBCV_GL  RFCS 120417 Ended OK 2.012042e+13 2.012042e+13            9 Reporting Server Available     DDA     8.3         -1
35079 EXBCV06D ICTO-6335    LNBCV_GL  RFCS 120417 Ended OK 2.012042e+13 2.012042e+13            9 Reporting Server Available     DDA     8.3         -1
35080 EXBCV06D ICTO-6335    LNBCV_GL  RFCS 120419 Ended OK 2.012042e+13 2.012042e+13            9 Reporting Server Available     DDA     8.3         -1
35081 EXBCV06D ICTO-6335    LNBCV_GL  RFCS 120419 Ended OK 2.012042e+13 2.012042e+13            9 Reporting Server Available     DDA     8.3         -1
35082 EXBCV06D ICTO-6335    LNBCV_GL  RFCS 120419 Ended OK 2.012042e+13 2.012042e+13            9 Reporting Server Available     DDA     8.3         -1
35083 EXBCV06D ICTO-6335    LNBCV_GL  RFCS 120419 Ended OK 2.012042e+13 2.012042e+13            9 Reporting Server Available     DDA     8.3         -1
35084 EXBCV06D ICTO-6335    LNBCV_GL  RFCS 120412 Ended OK 2.012041e+13 2.012041e+13            9 Reporting Server Available     DDA     8.3         -1
35085 EXBCV06D ICTO-6335    LNBCV_GL  RFCS 120412 Ended OK 2.012041e+13 2.012041e+13            9 Reporting Server Available     DDA     8.3         -1

I am seeing the same JobName 4 times for the same date:

JobName  Date   
EXBCV06D 120417 
EXBCV06D 120417 
EXBCV06D 120417 
EXBCV06D 120417

There should be only one line for each combination of JobName and Date. For example, there should only be

EXBCV06D 120417

not four of these.

Is this the right way to do this? It looks like I am getting multiple rows with the same JobName for the same Date. I need to have one row per JobName for each Date.

Upvotes: 1

Views: 263

Answers (1)

Justin
Justin

Reputation: 43255

using merge:

xy <- merge(x, y, by='JobName')

However, this merge will give you results you aren't expecting, since there is only one unique JobName in all of the x data, so you'll lose all the other rows in y. If you want to keep them, you can add all.y=TRUE, but then you'll have many rows with an NA date.

I'd suggest reading ?merge carefully and creating a small example data set. The data set should reproduce the "problem" you're seeing, and you should also supply the expected output. Without knowing that, it's hard to help much more.

Upvotes: 3

Related Questions