Reputation: 487
I'd like to concatenate the rows that share a duplicated `ScanNum` value.
Here is part of my data frame:
structure(list(UniprotID = c("P06493", "P06493", "P06493", "P06493",
"P16591", "Q7Z460", "Q7Z460", "Q7Z460", "Q7Z460", "Q7Z460", "P16591",
"P11802", "P09651", "P09651", "P22830", "P46734", "Q00535", "P09651",
"P63261", "P68032", "P06493", "Q9UKI8", "P63261", "P68032", "Q9NVU7",
"P06239", "P06239", "Q00535", "P06239", "P11802", "Q13164", "P06493",
"Q9UKI8", "P06239", "Q00535", "P06239", "Q09428", "O96017", "Q14289",
"O96017", "P06493", "Q9UKI8", "Q9NWZ3", "P06239", "O43318", "O43318",
"P06493", "P50613", "Q9BQI3", "Q86UE8"), Description = c("Cyclin-dependent kinase 1 OS=Homo sapiens OX=9606 GN=CDK1 PE=1 SV=3",
"Cyclin-dependent kinase 1 OS=Homo sapiens OX=9606 GN=CDK1 PE=1 SV=3",
"Cyclin-dependent kinase 1 OS=Homo sapiens OX=9606 GN=CDK1 PE=1 SV=3",
"Cyclin-dependent kinase 1 OS=Homo sapiens OX=9606 GN=CDK1 PE=1 SV=3",
"Tyrosine-protein kinase Fer OS=Homo sapiens OX=9606 GN=FER PE=1 SV=2",
"CLIP-associating protein 1 OS=Homo sapiens OX=9606 GN=CLASP1 PE=1 SV=1",
"CLIP-associating protein 1 OS=Homo sapiens OX=9606 GN=CLASP1 PE=1 SV=1",
"CLIP-associating protein 1 OS=Homo sapiens OX=9606 GN=CLASP1 PE=1 SV=1",
"CLIP-associating protein 1 OS=Homo sapiens OX=9606 GN=CLASP1 PE=1 SV=1",
"CLIP-associating protein 1 OS=Homo sapiens OX=9606 GN=CLASP1 PE=1 SV=1",
"Tyrosine-protein kinase Fer OS=Homo sapiens OX=9606 GN=FER PE=1 SV=2",
"Cyclin-dependent kinase 4 OS=Homo sapiens OX=9606 GN=CDK4 PE=1 SV=2",
"Heterogeneous nuclear ribonucleoprotein A1 OS=Homo sapiens OX=9606 GN=HNRNPA1 PE=1 SV=5",
"Heterogeneous nuclear ribonucleoprotein A1 OS=Homo sapiens OX=9606 GN=HNRNPA1 PE=1 SV=5",
"Ferrochelatase, mitochondrial OS=Homo sapiens OX=9606 GN=FECH PE=1 SV=2",
"Dual specificity mitogen-activated protein kinase kinase 3 OS=Homo sapiens OX=9606 GN=MAP2K3 PE=1 SV=2",
"Cyclin-dependent-like kinase 5 OS=Homo sapiens OX=9606 GN=CDK5 PE=1 SV=3",
"Heterogeneous nuclear ribonucleoprotein A1 OS=Homo sapiens OX=9606 GN=HNRNPA1 PE=1 SV=5",
"Actin, cytoplasmic 2 OS=Homo sapiens OX=9606 GN=ACTG1 PE=1 SV=1",
"Actin, alpha cardiac muscle 1 OS=Homo sapiens OX=9606 GN=ACTC1 PE=1 SV=1",
"Cyclin-dependent kinase 1 OS=Homo sapiens OX=9606 GN=CDK1 PE=1 SV=3",
"Serine/threonine-protein kinase tousled-like 1 OS=Homo sapiens OX=9606 GN=TLK1 PE=1 SV=2",
"Actin, cytoplasmic 2 OS=Homo sapiens OX=9606 GN=ACTG1 PE=1 SV=1",
"Actin, alpha cardiac muscle 1 OS=Homo sapiens OX=9606 GN=ACTC1 PE=1 SV=1",
"Protein SDA1 homolog OS=Homo sapiens OX=9606 GN=SDAD1 PE=1 SV=3",
"Tyrosine-protein kinase Lck OS=Homo sapiens OX=9606 GN=LCK PE=1 SV=6",
"Tyrosine-protein kinase Lck OS=Homo sapiens OX=9606 GN=LCK PE=1 SV=6",
"Cyclin-dependent-like kinase 5 OS=Homo sapiens OX=9606 GN=CDK5 PE=1 SV=3",
"Tyrosine-protein kinase Lck OS=Homo sapiens OX=9606 GN=LCK PE=1 SV=6",
"Cyclin-dependent kinase 4 OS=Homo sapiens OX=9606 GN=CDK4 PE=1 SV=2",
"Mitogen-activated protein kinase 7 OS=Homo sapiens OX=9606 GN=MAPK7 PE=1 SV=2",
"Cyclin-dependent kinase 1 OS=Homo sapiens OX=9606 GN=CDK1 PE=1 SV=3",
"Serine/threonine-protein kinase tousled-like 1 OS=Homo sapiens OX=9606 GN=TLK1 PE=1 SV=2",
"Tyrosine-protein kinase Lck OS=Homo sapiens OX=9606 GN=LCK PE=1 SV=6",
"Cyclin-dependent-like kinase 5 OS=Homo sapiens OX=9606 GN=CDK5 PE=1 SV=3",
"Tyrosine-protein kinase Lck OS=Homo sapiens OX=9606 GN=LCK PE=1 SV=6",
"ATP-binding cassette sub-family C member 8 OS=Homo sapiens OX=9606 GN=ABCC8 PE=1 SV=6",
"Serine/threonine-protein kinase Chk2 OS=Homo sapiens OX=9606 GN=CHEK2 PE=1 SV=1",
"Protein-tyrosine kinase 2-beta OS=Homo sapiens OX=9606 GN=PTK2B PE=1 SV=2",
"Serine/threonine-protein kinase Chk2 OS=Homo sapiens OX=9606 GN=CHEK2 PE=1 SV=1",
"Cyclin-dependent kinase 1 OS=Homo sapiens OX=9606 GN=CDK1 PE=1 SV=3",
"Serine/threonine-protein kinase tousled-like 1 OS=Homo sapiens OX=9606 GN=TLK1 PE=1 SV=2",
"Interleukin-1 receptor-associated kinase 4 OS=Homo sapiens OX=9606 GN=IRAK4 PE=1 SV=1",
"Tyrosine-protein kinase Lck OS=Homo sapiens OX=9606 GN=LCK PE=1 SV=6",
"Mitogen-activated protein kinase kinase kinase 7 OS=Homo sapiens OX=9606 GN=MAP3K7 PE=1 SV=1",
"Mitogen-activated protein kinase kinase kinase 7 OS=Homo sapiens OX=9606 GN=MAP3K7 PE=1 SV=1",
"Cyclin-dependent kinase 1 OS=Homo sapiens OX=9606 GN=CDK1 PE=1 SV=3",
"Cyclin-dependent kinase 7 OS=Homo sapiens OX=9606 GN=CDK7 PE=1 SV=1",
"Eukaryotic translation initiation factor 2-alpha kinase 1 OS=Homo sapiens OX=9606 GN=EIF2AK1 PE=1 SV=2",
"Serine/threonine-protein kinase tousled-like 2 OS=Homo sapiens OX=9606 GN=TLK2 PE=1 SV=2"
), Gene.name = c("CDK1", "CDK1", "CDK1", "CDK1", "FER", "CLASP1",
"CLASP1", "CLASP1", "CLASP1", "CLASP1", "FER", "CDK4", "HNRNPA1",
"HNRNPA1", "FECH", "MAP2K3", "CDK5", "HNRNPA1", "ACTG1", "ACTC1",
"CDK1", "TLK1", "ACTG1", "ACTC1", "SDAD1", "LCK", "LCK", "CDK5",
"LCK", "CDK4", "MAPK7", "CDK1", "TLK1", "LCK", "CDK5", "LCK",
"ABCC8", "CHEK2", "PTK2B", "CHEK2", "CDK1", "TLK1", "IRAK4",
"LCK", "MAP3K7", "MAP3K7", "CDK1", "CDK7", "EIF2AK1", "TLK2"),
Sequence = c("R.HKTTGQVVAMK(982.466)K.I", "R.HKTTGQVVAMK(982.466)K.I",
"R.HKTTGQVVAMK(982.466)K.I", "R.HKTTGQVVAMK(982.466)K.I",
"K.TSVAVK(982.466)TCK.E", "R.VNALKK(982.466).I", "R.VNALKK(982.466).I",
"R.VNALK(982.466)K.I", "R.VNALKK(982.466).I", "R.VNALKK(982.466).I",
"K.TSVAVK(982.466)TCK.E", "K.ARDPHSGHFVALK(982.466)SVR.V",
"R.NQGGY(982.466)GGSSSSSSYGSGR.R", "R.NQGGY(982.466)GGSSSSSSYGSGR.R",
"R.TPK(982.466)IQEQYR.R", "R.HAQSGTIMAVK(982.466)R.I", "K.NRETHEIVALK(982.466)R.V",
"R.NQGGY(982.466)GGSSSSSSYGSGR.R", "K.DSY(982.466)VGDEAQSKR.G",
"K.DSY(982.466)VGDEAQSKR.G", "K.TTGQVVAMKK(982.466).I", "R.YAAVK(982.466)IHQLNK.S",
"K.DSY(982.466)VGDEAQSKR.G", "K.DSY(982.466)VGDEAQSKR.G",
"K.AMK(982.466)VLK.K", "K.VAVK(982.466)SLK.Q", "K.VAVK(982.466)SLK.Q",
"K.NRETHEIVALK(982.466)R.V", "K.VAVK(982.466)SLK.Q", "K.ARDPHSGHFVALK(982.466)SVR.V",
"R.LTGQQVAIKK(982.466).I", "K.TTGQVVAMKK(982.466).I", "R.YAAVK(982.466)IHQLNK.S",
"K.VAVK(982.466)SLK.Q", "K.NRETHEIVALK(982.466)R.V", "K.VAVK(982.466)SLK.Q",
"K.GIK(982.466)LLK.L", "K.KVAIK(982.466)IISK.R", "K.INVAVK(982.466)TCK.K",
"K.KVAIK(982.466)IISK.R", "K.TTGQVVAMKK(982.466).I", "R.YAAVK(982.466)IHQLNK.S",
"K.GYVNNTTVAVKK(982.466).L", "K.VAVK(982.466)SLK.Q", "R.AKDVAIK(982.466)QIESESER.K",
"R.AKDVAIK(982.466)QIESESER.K", "K.TTGQVVAMK(982.466)K.I",
"R.DKNTNQIVAIK(982.466)K.I", "R.NKLDGQYYAIK(982.466)K.I",
"R.YVAVK(982.466)IHQLNK.N"), `m/z_126.127725_int` = c(7328,
1431, 0, 0, 0, 1534, 1208, 0, 0, 0, 0, 5472, 0, 0, 0, 0,
3059, 0, 0, 0, 14694, 0, 0, 0, 767, 15399, 8508, 5963, 3329,
2850, 866, 15159, 0, 12952, 3607, 61261, 1594, 0, 0, 0, 9174,
0, 0, 4064, 0, 0, 4193, 3903, 0, 0), `m/z_127.12476_int` = c(22305,
4867, 2166, 3183, 1615, 9900, 6436, 1924, 4641, 3176, 568,
10705, 0, 0, 1373, 689, 11166, 0, 0, 0, 35789, 2580, 0, 0,
5881, 63064, 30110, 18335, 10285, 5732, 1249, 42999, 1566,
39681, 9785, 309388, 5557, 2419, 1038, 3424, 29050, 722,
1554, 12719, 1443, 0, 12181, 10057, 0, 2435), `m/z_128.134433_int` = c(38137,
8048, 5042, 5280, 5324, 22723, 20533, 5320, 5032, 4471, 813,
12294, 863, 0, 2362, 1407, 13618, 648, 733, 733, 82205, 11746,
1359, 1359, 7196, 166646, 75239, 23451, 30788, 6175, 4969,
70456, 5618, 106209, 11896, 829224, 8316, 2921, 3481, 5204,
51919, 1575, 6209, 39754, 4444, 3658, 24940, 32154, 1757,
6020), `m/z_129.131468_int` = c(44762, 7626, 6014, 8076,
11264, 52091, 63456, 13223, 11973, 10541, 2274, 12982, 3369,
1938, 5093, 7320, 16850, 3051, 4353, 4353, 83011, 25283,
2897, 2897, 15137, 176041, 83912, 24140, 30193, 9100, 13435,
81335, 9670, 105429, 15821, 819311, 12094, 7961, 5593, 13966,
54175, 4243, 11926, 49495, 8842, 7331, 24976, 28836, 5722,
14175), `m/z_130.141141_int` = c(46636, 10425, 7086, 8641,
11370, 85939, 81372, 18722, 22222, 17278, 2397, 16696, 4024,
4826, 8287, 18216, 13907, 5872, 4442, 4442, 82328, 38189,
4520, 4520, 22714, 182513, 80678, 25336, 33127, 10046, 25467,
77154, 14168, 129888, 17157, 880050, 13502, 14193, 7167,
20157, 48899, 7369, 16091, 46048, 12467, 10887, 27694, 21979,
8712, 19013), `m/z_131.138176_int` = c(49103, 9367, 9452,
11609, 9746, 85046, 99942, 27284, 27647, 22801, 5214, 15570,
13161, 12293, 17222, 38651, 16360, 15486, 11286, 11286, 80727,
37110, 10795, 10795, 30313, 194256, 87209, 26696, 36470,
13323, 36787, 70568, 13075, 128171, 16578, 805814, 18556,
25095, 10181, 31390, 54114, 9680, 15058, 56991, 18002, 11603,
26753, 17995, 17081, 22651), TMT_purity = c("0.98141234715268",
"0.71134850965744001", "0.76128382110317905", "0.76128382110317905",
"1", "0.78702255963842904", "0.78170688482709405", "0.776974521760607",
"1", "0.76160370785582798", "1", "0.64272765210635596", "0.90646438991621103",
"0.82319643556607203", "0.58148349410262401", "1", "0.95294631885274494",
"0.91291708141626005", "0.698686479445912", "0.698686479445912",
"0.96955605239368403", "0.79561280886225205", "0.64177968168606403",
"0.64177968168606403", "0.88734015495342999", "0.88733946625779203",
"0.93493384401468704", "1", "1", "0.61903261519569497", "0.73288251651566405",
"1", "0.88757994170849897", "0.91888430409069299", "0.94640973341271395",
"0.91492692770042205", "0.72126051188328899", "0.32216956233298499",
"1", "0.75795632756268905", "0.961165374625497", "0.56361697494671903",
"1", "0.84217317911923095", "0.76607291679043199", "0.94119959458560598",
"0.97979179200421396", "0.81295038316780099", "0.67927222063109804",
"0.81685860457191595"), `Signal-noise` = c(40.21, 9.04, 7.52,
10.02, 10.51, 39.74, 29.91, 16.62, 18.66, 14.05, 3.42, 16.39,
5.42, 4.65, 8.06, 16.31, 18.58, 6.21, 5.53, 5.53, 74.27,
23.62, 5, 5, 19.45, 110.03, 69.9, 29.31, 32.85, 11.91, 17.31,
24.61, 9.46, 25.76, 16.86, 26.44, 12.86, 12.31, 6.77, 14.35,
25.33, 5.41, 11.65, 24.88, 9.96, 7.64, 25.44, 23.6, 8.06,
10.43), ScanNum = c(9809, 10035, 10254, 10269, 10521, 10567,
10597, 10716, 10807, 10816, 11002, 11031, 11056, 11061, 11064,
11085, 11194, 11288, 11314, 11314, 11320, 11322, 11326, 11326,
11330, 11340, 11361, 11412, 11423, 11432, 11454, 11549, 11553,
11601, 11640, 11698, 11720, 11726, 11750, 11757, 11775, 11802,
11840, 11886, 11928, 11935, 11996, 12004, 12011, 12016),
CState = c(4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 5, 4, 4, 4, 4,
4, 4, 4, 4, 3, 4, 4, 4, 3, 4, 3, 4, 2, 5, 4, 3, 4, 3, 4,
4, 3, 4, 3, 4, 3, 4, 3, 3, 5, 4, 3, 4, 4, 4), Filename = c("Y20210222-09",
"Y20210222-09", "Y20210222-09", "Y20210222-09", "Y20210222-09",
"Y20210222-09", "Y20210222-09", "Y20210222-09", "Y20210222-09",
"Y20210222-09", "Y20210222-09", "Y20210222-09", "Y20210222-09",
"Y20210222-09", "Y20210222-09", "Y20210222-09", "Y20210222-09",
"Y20210222-09", "Y20210222-09", "Y20210222-09", "Y20210222-09",
"Y20210222-09", "Y20210222-09", "Y20210222-09", "Y20210222-09",
"Y20210222-09", "Y20210222-09", "Y20210222-09", "Y20210222-09",
"Y20210222-09", "Y20210222-09", "Y20210222-09", "Y20210222-09",
"Y20210222-09", "Y20210222-09", "Y20210222-09", "Y20210222-09",
"Y20210222-09", "Y20210222-09", "Y20210222-09", "Y20210222-09",
"Y20210222-09", "Y20210222-09", "Y20210222-09", "Y20210222-09",
"Y20210222-09", "Y20210222-09", "Y20210222-09", "Y20210222-09",
"Y20210222-09"), sequence1 = c("HKTTGQVVAMK(982.466)K", "HKTTGQVVAMK(982.466)K",
"HKTTGQVVAMK(982.466)K", "HKTTGQVVAMK(982.466)K", "TSVAVK(982.466)TCK",
"VNALKK(982.466)", "VNALKK(982.466)", "VNALK(982.466)K",
"VNALKK(982.466)", "VNALKK(982.466)", "TSVAVK(982.466)TCK",
"ARDPHSGHFVALK(982.466)SVR", "NQGGY(982.466)GGSSSSSSYGSGR",
"NQGGY(982.466)GGSSSSSSYGSGR", "TPK(982.466)IQEQYR", "HAQSGTIMAVK(982.466)R",
"NRETHEIVALK(982.466)R", "NQGGY(982.466)GGSSSSSSYGSGR", "DSY(982.466)VGDEAQSKR",
"DSY(982.466)VGDEAQSKR", "TTGQVVAMKK(982.466)", "YAAVK(982.466)IHQLNK",
"DSY(982.466)VGDEAQSKR", "DSY(982.466)VGDEAQSKR", "AMK(982.466)VLK",
"VAVK(982.466)SLK", "VAVK(982.466)SLK", "NRETHEIVALK(982.466)R",
"VAVK(982.466)SLK", "ARDPHSGHFVALK(982.466)SVR", "LTGQQVAIKK(982.466)",
"TTGQVVAMKK(982.466)", "YAAVK(982.466)IHQLNK", "VAVK(982.466)SLK",
"NRETHEIVALK(982.466)R", "VAVK(982.466)SLK", "GIK(982.466)LLK",
"KVAIK(982.466)IISK", "INVAVK(982.466)TCK", "KVAIK(982.466)IISK",
"TTGQVVAMKK(982.466)", "YAAVK(982.466)IHQLNK", "GYVNNTTVAVKK(982.466)",
"VAVK(982.466)SLK", "AKDVAIK(982.466)QIESESER", "AKDVAIK(982.466)QIESESER",
"TTGQVVAMK(982.466)K", "DKNTNQIVAIK(982.466)K", "NKLDGQYYAIK(982.466)K",
"YVAVK(982.466)IHQLNK"), Mod.or.not = c("Y", "Y", "Y", "Y",
"Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y",
"Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y",
"Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y",
"Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y"), Mod.position.in.pep = c(11L,
11L, 11L, 11L, 6L, 6L, 6L, 5L, 6L, 6L, 6L, 13L, 5L, 5L, 3L,
11L, 11L, 5L, 3L, 3L, 10L, 5L, 3L, 3L, 3L, 4L, 4L, 11L, 4L,
13L, 10L, 10L, 5L, 4L, 11L, 4L, 3L, 5L, 6L, 5L, 10L, 5L,
12L, 4L, 7L, 7L, 9L, 11L, 11L, 5L), Mod.sequence = c("HKTTGQVVAMKK",
"HKTTGQVVAMKK", "HKTTGQVVAMKK", "HKTTGQVVAMKK", "TSVAVKTCK",
"VNALKK", "VNALKK", "VNALKK", "VNALKK", "VNALKK", "TSVAVKTCK",
"ARDPHSGHFVALKSVR", "NQGGYGGSSSSSSYGSGR", "NQGGYGGSSSSSSYGSGR",
"TPKIQEQYR", "HAQSGTIMAVKR", "NRETHEIVALKR", "NQGGYGGSSSSSSYGSGR",
"DSYVGDEAQSKR", "DSYVGDEAQSKR", "TTGQVVAMKK", "YAAVKIHQLNK",
"DSYVGDEAQSKR", "DSYVGDEAQSKR", "AMKVLK", "VAVKSLK", "VAVKSLK",
"NRETHEIVALKR", "VAVKSLK", "ARDPHSGHFVALKSVR", "LTGQQVAIKK",
"TTGQVVAMKK", "YAAVKIHQLNK", "VAVKSLK", "NRETHEIVALKR", "VAVKSLK",
"GIKLLK", "KVAIKIISK", "INVAVKTCK", "KVAIKIISK", "TTGQVVAMKK",
"YAAVKIHQLNK", "GYVNNTTVAVKK", "VAVKSLK", "AKDVAIKQIESESER",
"AKDVAIKQIESESER", "TTGQVVAMKK", "DKNTNQIVAIKK", "NKLDGQYYAIKK",
"YVAVKIHQLNK"), start.position = c(23L, 23L, 23L, 23L, 586L,
342L, 342L, 342L, 342L, 342L, 586L, 23L, 353L, 353L, 116L,
83L, 23L, 353L, 53L, 53L, 25L, 481L, 53L, 53L, 266L, 270L,
270L, 23L, 270L, 23L, 76L, 25L, 481L, 270L, 23L, 270L, 505L,
245L, 452L, 245L, 25L, 481L, 203L, 270L, 57L, 57L, 25L, 31L,
186L, 487L), pos.in.protein = c(33, 33, 33, 33, 591, 347,
347, 346, 347, 347, 591, 35, 357, 357, 118, 93, 33, 357,
55, 55, 34, 485, 55, 55, 268, 273, 273, 33, 273, 35, 85,
34, 485, 273, 33, 273, 507, 249, 457, 249, 34, 485, 214,
273, 63, 63, 33, 41, 196, 491), Mod.site = c("P06493_33",
"P06493_33", "P06493_33", "P06493_33", "P16591_591", "Q7Z460_347",
"Q7Z460_347", "Q7Z460_346", "Q7Z460_347", "Q7Z460_347", "P16591_591",
"P11802_35", "P09651_357", "P09651_357", "P22830_118", "P46734_93",
"Q00535_33", "P09651_357", "P63261_55", "P68032_55", "P06493_34",
"Q9UKI8_485", "P63261_55", "P68032_55", "Q9NVU7_268", "P06239_273",
"P06239_273", "Q00535_33", "P06239_273", "P11802_35", "Q13164_85",
"P06493_34", "Q9UKI8_485", "P06239_273", "Q00535_33", "P06239_273",
"Q09428_507", "O96017_249", "Q14289_457", "O96017_249", "P06493_34",
"Q9UKI8_485", "Q9NWZ3_214", "P06239_273", "O43318_63", "O43318_63",
"P06493_33", "P50613_41", "Q9BQI3_196", "Q86UE8_491")), row.names = c(NA,
-50L), class = c("tbl_df", "tbl", "data.frame"))
For example, the following two rows are duplicates in terms of `ScanNum` (both are 11314):
structure(list(UniprotID = c("P63261", "P68032"), Description = c("Actin, cytoplasmic 2 OS=Homo sapiens OX=9606 GN=ACTG1 PE=1 SV=1",
"Actin, alpha cardiac muscle 1 OS=Homo sapiens OX=9606 GN=ACTC1 PE=1 SV=1"
), Gene.name = c("ACTG1", "ACTC1"), Sequence = c("K.DSY(982.466)VGDEAQSKR.G",
"K.DSY(982.466)VGDEAQSKR.G"), `m/z_126.127725_int` = c(0, 0),
`m/z_127.12476_int` = c(0, 0), `m/z_128.134433_int` = c(733,
733), `m/z_129.131468_int` = c(4353, 4353), `m/z_130.141141_int` = c(4442,
4442), `m/z_131.138176_int` = c(11286, 11286), TMT_purity = c("0.698686479445912",
"0.698686479445912"), `Signal-noise` = c(5.53, 5.53), ScanNum = c(11314,
11314), CState = c(4, 4), Filename = c("Y20210222-09", "Y20210222-09"
), sequence1 = c("DSY(982.466)VGDEAQSKR", "DSY(982.466)VGDEAQSKR"
), Mod.or.not = c("Y", "Y"), Mod.position.in.pep = c(3L,
3L), Mod.sequence = c("DSYVGDEAQSKR", "DSYVGDEAQSKR"), start.position = c(53L,
53L), pos.in.protein = c(55, 55), Mod.site = c("P63261_55",
"P68032_55")), row.names = c(NA, -2L), class = c("tbl_df",
"tbl", "data.frame"))
So, I'd like to concatenate these two rows into one, which should look like the following. Basically, keep all the unique values in the new row, and separate those values with `;` if they are different.
structure(list(X = 2L, UniprotID = "P68032;P63261", Description = "Actin, alpha cardiac muscle 1 OS=Homo sapiens OX=9606 GN=ACTC1 PE=1 SV=1",
Gene.name = "ACTC1", Sequence = "K.DSY(982.466)VGDEAQSKR.G",
m.z_126.127725_int = 0L, m.z_127.12476_int = 0L, m.z_128.134433_int = 733L,
m.z_129.131468_int = 4353L, m.z_130.141141_int = 4442L, m.z_131.138176_int = 11286L,
TMT_purity = 0.698686479, Signal.noise = 5.53, ScanNum = 11314L,
CState = 4L, Filename = "Y20210222-09", sequence1 = "DSY(982.466)VGDEAQSKR",
Mod.or.not = "Y", Mod.position.in.pep = 3L, Mod.sequence = "DSYVGDEAQSKR",
start.position = 53L, pos.in.protein = 55L, Mod.site = "P68032_55; P63261_55"), class = "data.frame", row.names = c(NA,
-1L))
Upvotes: 0
Views: 126
Reputation: 79271
Try this:
library(dplyr)

# Within each ScanNum group, replace every column with a single string made
# of all the group's values joined by ",", then drop the now-identical rows.
# Values are not de-duplicated before joining, so a group whose rows agree
# still repeats them (e.g. "0,0"), and every non-grouping column becomes
# character.
df %>%
  group_by(ScanNum) %>%
  mutate(
    across(
      everything(),
      function(col) paste(col, collapse = ",")
    )
  ) %>%
  distinct()
UniprotID Description Gene.name Sequence `m/z_126.127725~ `m/z_127.12476_~ `m/z_128.134433~ `m/z_129.131468~ `m/z_130.141141~ `m/z_131.138176~ TMT_purity `Signal-noise` ScanNum CState Filename sequence1
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
1 P06493 Cyclin-dep~ CDK1 R.HKTTG~ 7328 22305 38137 44762 46636 49103 0.9814123~ 40.21 9809 4 Y202102~ HKTTGQVV~
2 P06493 Cyclin-dep~ CDK1 R.HKTTG~ 1431 4867 8048 7626 10425 9367 0.7113485~ 9.04 10035 4 Y202102~ HKTTGQVV~
3 P06493 Cyclin-dep~ CDK1 R.HKTTG~ 0 2166 5042 6014 7086 9452 0.7612838~ 7.52 10254 4 Y202102~ HKTTGQVV~
4 P06493 Cyclin-dep~ CDK1 R.HKTTG~ 0 3183 5280 8076 8641 11609 0.7612838~ 10.02 10269 4 Y202102~ HKTTGQVV~
5 P16591 Tyrosine-p~ FER K.TSVAV~ 0 1615 5324 11264 11370 9746 1 10.51 10521 3 Y202102~ TSVAVK(9~
6 Q7Z460 CLIP-assoc~ CLASP1 R.VNALK~ 1534 9900 22723 52091 85939 85046 0.7870225~ 39.74 10567 3 Y202102~ VNALKK(9~
7 Q7Z460 CLIP-assoc~ CLASP1 R.VNALK~ 1208 6436 20533 63456 81372 99942 0.7817068~ 29.91 10597 3 Y202102~ VNALKK(9~
8 Q7Z460 CLIP-assoc~ CLASP1 R.VNALK~ 0 1924 5320 13223 18722 27284 0.7769745~ 16.62 10716 3 Y202102~ VNALK(98~
9 Q7Z460 CLIP-assoc~ CLASP1 R.VNALK~ 0 4641 5032 11973 22222 27647 1 18.66 10807 3 Y202102~ VNALKK(9~
10 Q7Z460 CLIP-assoc~ CLASP1 R.VNALK~ 0 3176 4471 10541 17278 22801 0.7616037~ 14.05 10816 3 Y202102~ VNALKK(9~
11 P16591 Tyrosine-p~ FER K.TSVAV~ 0 568 813 2274 2397 5214 1 3.42 11002 3 Y202102~ TSVAVK(9~
12 P11802 Cyclin-dep~ CDK4 K.ARDPH~ 5472 10705 12294 12982 16696 15570 0.6427276~ 16.39 11031 5 Y202102~ ARDPHSGH~
13 P09651 Heterogene~ HNRNPA1 R.NQGGY~ 0 0 863 3369 4024 13161 0.9064643~ 5.42 11056 4 Y202102~ NQGGY(98~
14 P09651 Heterogene~ HNRNPA1 R.NQGGY~ 0 0 0 1938 4826 12293 0.8231964~ 4.65 11061 4 Y202102~ NQGGY(98~
15 P22830 Ferrochela~ FECH R.TPK(9~ 0 1373 2362 5093 8287 17222 0.5814834~ 8.06 11064 4 Y202102~ TPK(982.~
16 P46734 Dual speci~ MAP2K3 R.HAQSG~ 0 689 1407 7320 18216 38651 1 16.31 11085 4 Y202102~ HAQSGTIM~
17 Q00535 Cyclin-dep~ CDK5 K.NRETH~ 3059 11166 13618 16850 13907 16360 0.9529463~ 18.58 11194 4 Y202102~ NRETHEIV~
18 P09651 Heterogene~ HNRNPA1 R.NQGGY~ 0 0 648 3051 5872 15486 0.9129170~ 6.21 11288 4 Y202102~ NQGGY(98~
19 P63261,P~ Actin, cyt~ ACTG1,AC~ K.DSY(9~ 0,0 0,0 733,733 4353,4353 4442,4442 11286,11286 0.6986864~ 5.53,5.53 11314 4,4 Y202102~ DSY(982.~
20 P06493 Cyclin-dep~ CDK1 K.TTGQV~ 14694 35789 82205 83011 82328 80727 0.9695560~ 74.27 11320 3 Y202102~ TTGQVVAM~
21 Q9UKI8 Serine/thr~ TLK1 R.YAAVK~ 0 2580 11746 25283 38189 37110 0.7956128~ 23.62 11322 4 Y202102~ YAAVK(98~
22 P63261,P~ Actin, cyt~ ACTG1,AC~ K.DSY(9~ 0,0 0,0 1359,1359 2897,2897 4520,4520 10795,10795 0.6417796~ 5,5 11326 4,4 Y202102~ DSY(982.~
23 Q9NVU7 Protein SD~ SDAD1 K.AMK(9~ 767 5881 7196 15137 22714 30313 0.8873401~ 19.45 11330 3 Y202102~ AMK(982.~
24 P06239 Tyrosine-p~ LCK K.VAVK(~ 15399 63064 166646 176041 182513 194256 0.8873394~ 110.03 11340 4 Y202102~ VAVK(982~
25 P06239 Tyrosine-p~ LCK K.VAVK(~ 8508 30110 75239 83912 80678 87209 0.9349338~ 69.9 11361 3 Y202102~ VAVK(982~
26 Q00535 Cyclin-dep~ CDK5 K.NRETH~ 5963 18335 23451 24140 25336 26696 1 29.31 11412 4 Y202102~ NRETHEIV~
27 P06239 Tyrosine-p~ LCK K.VAVK(~ 3329 10285 30788 30193 33127 36470 1 32.85 11423 2 Y202102~ VAVK(982~
28 P11802 Cyclin-dep~ CDK4 K.ARDPH~ 2850 5732 6175 9100 10046 13323 0.6190326~ 11.91 11432 5 Y202102~ ARDPHSGH~
29 Q13164 Mitogen-ac~ MAPK7 R.LTGQQ~ 866 1249 4969 13435 25467 36787 0.7328825~ 17.31 11454 4 Y202102~ LTGQQVAI~
30 P06493 Cyclin-dep~ CDK1 K.TTGQV~ 15159 42999 70456 81335 77154 70568 1 24.61 11549 3 Y202102~ TTGQVVAM~
31 Q9UKI8 Serine/thr~ TLK1 R.YAAVK~ 0 1566 5618 9670 14168 13075 0.8875799~ 9.46 11553 4 Y202102~ YAAVK(98~
32 P06239 Tyrosine-p~ LCK K.VAVK(~ 12952 39681 106209 105429 129888 128171 0.9188843~ 25.76 11601 3 Y202102~ VAVK(982~
33 Q00535 Cyclin-dep~ CDK5 K.NRETH~ 3607 9785 11896 15821 17157 16578 0.9464097~ 16.86 11640 4 Y202102~ NRETHEIV~
34 P06239 Tyrosine-p~ LCK K.VAVK(~ 61261 309388 829224 819311 880050 805814 0.9149269~ 26.44 11698 4 Y202102~ VAVK(982~
35 Q09428 ATP-bindin~ ABCC8 K.GIK(9~ 1594 5557 8316 12094 13502 18556 0.7212605~ 12.86 11720 3 Y202102~ GIK(982.~
36 O96017 Serine/thr~ CHEK2 K.KVAIK~ 0 2419 2921 7961 14193 25095 0.3221695~ 12.31 11726 4 Y202102~ KVAIK(98~
37 Q14289 Protein-ty~ PTK2B K.INVAV~ 0 1038 3481 5593 7167 10181 1 6.77 11750 3 Y202102~ INVAVK(9~
38 O96017 Serine/thr~ CHEK2 K.KVAIK~ 0 3424 5204 13966 20157 31390 0.7579563~ 14.35 11757 4 Y202102~ KVAIK(98~
39 P06493 Cyclin-dep~ CDK1 K.TTGQV~ 9174 29050 51919 54175 48899 54114 0.9611653~ 25.33 11775 3 Y202102~ TTGQVVAM~
40 Q9UKI8 Serine/thr~ TLK1 R.YAAVK~ 0 722 1575 4243 7369 9680 0.5636169~ 5.41 11802 4 Y202102~ YAAVK(98~
41 Q9NWZ3 Interleuki~ IRAK4 K.GYVNN~ 0 1554 6209 11926 16091 15058 1 11.65 11840 3 Y202102~ GYVNNTTV~
42 P06239 Tyrosine-p~ LCK K.VAVK(~ 4064 12719 39754 49495 46048 56991 0.8421731~ 24.88 11886 3 Y202102~ VAVK(982~
43 O43318 Mitogen-ac~ MAP3K7 R.AKDVA~ 0 1443 4444 8842 12467 18002 0.7660729~ 9.96 11928 5 Y202102~ AKDVAIK(~
44 O43318 Mitogen-ac~ MAP3K7 R.AKDVA~ 0 0 3658 7331 10887 11603 0.9411995~ 7.64 11935 4 Y202102~ AKDVAIK(~
45 P06493 Cyclin-dep~ CDK1 K.TTGQV~ 4193 12181 24940 24976 27694 26753 0.9797917~ 25.44 11996 3 Y202102~ TTGQVVAM~
46 P50613 Cyclin-dep~ CDK7 R.DKNTN~ 3903 10057 32154 28836 21979 17995 0.8129503~ 23.6 12004 4 Y202102~ DKNTNQIV~
47 Q9BQI3 Eukaryotic~ EIF2AK1 R.NKLDG~ 0 0 1757 5722 8712 17081 0.6792722~ 8.06 12011 4 Y202102~ NKLDGQYY~
48 Q86UE8 Serine/thr~ TLK2 R.YVAVK~ 0 2435 6020 14175 19013 22651 0.8168586~ 10.43 12016 4 Y202102~ YVAVK(98~
# ... with 6 more variables: Mod.or.not <chr>, Mod.position.in.pep <chr>, Mod.sequence <chr>, start.position <chr>, pos.in.protein <chr>, Mod.site <chr>
Upvotes: 1
Reputation: 161155
If `dat2` is your second example (with 2 rows), then
library(dplyr)

# Produce one row per ScanNum:
#   - numeric columns (integer or double) keep the first value in the group;
#   - every other column keeps its unique values joined by ";".
# The result is piped into str() to show the resulting column types.
dat2 %>%
  group_by(ScanNum) %>%
  summarize(
    across(
      everything(),
      function(col) {
        if (is.numeric(col)) {
          col[1]
        } else {
          paste(unique(col), collapse = ";")
        }
      }
    )
  ) %>%
  str()
# tibble [1 x 22] (S3: tbl_df/tbl/data.frame)
# $ ScanNum : num 11314
# $ UniprotID : chr "P63261;P68032"
# $ Description : chr "Actin, cytoplasmic 2 OS=Homo sapiens OX=9606 GN=ACTG1 PE=1 SV=1;Actin, alpha cardiac muscle 1 OS=Homo sapiens O"| __truncated__
# $ Gene.name : chr "ACTG1;ACTC1"
# $ Sequence : chr "K.DSY(982.466)VGDEAQSKR.G"
# $ m/z_126.127725_int : num 0
# $ m/z_127.12476_int : num 0
# $ m/z_128.134433_int : num 733
# $ m/z_129.131468_int : num 4353
# $ m/z_130.141141_int : num 4442
# $ m/z_131.138176_int : num 11286
# $ TMT_purity : chr "0.698686479445912"
# $ Signal-noise : num 5.53
# $ CState : num 4
# $ Filename : chr "Y20210222-09"
# $ sequence1 : chr "DSY(982.466)VGDEAQSKR"
# $ Mod.or.not : chr "Y"
# $ Mod.position.in.pep: int 3
# $ Mod.sequence : chr "DSYVGDEAQSKR"
# $ start.position : int 53
# $ pos.in.protein : num 55
# $ Mod.site : chr "P63261_55;P68032_55"
Note that I'm grouping by `ScanNum` (it's what you said was duplicated), and for any column that is `numeric`, I arbitrarily take the first value found. Strings are a little more robust in that we combine the unique values.
If you need to group by more variables, add them to `group_by()`. A warning, though: grouping by `integer` is perfectly safe, but grouping by floating-point (`numeric`) may pose some issues with high-precision data; for references, see Why are these numbers not equal?, Is floating point math broken?, and https://en.wikipedia.org/wiki/IEEE_754.
Upvotes: 2