Pan
Pan

Reputation: 213

lists match with dataframe

I want to match lists with dataframe

> lists
    [[1]]
    [1] "SURFSKINTEMP=6" "MODIS_LST=1"   

    [[2]]
    [1] "TOTCO=13"    "MODIS_LST=1"

    [[3]]
    [1] "TOTCO=6"     "MODIS_LST=1"

    [[4]]
    [1] "TOTO3=15"    "MODIS_LST=1"

    [[5]]
    [1] "TOTH2OVAP=6" "MODIS_LST=1"

    [[6]]
    [1] "TOTH2OVAP=1" "MODIS_LST=1"

and the test dataframe

 > test
      LONGITUDE LATITUDE DATE_START   DATE_END FLAG SURFSKINTEMP SURFAIRTEMP TOTH2OVAP TOTO3 TOTCO TOTCH4 OLR_ARIS CLROLR_ARIS OLR_NOAA MODIS_LST
    1     118.5    -11.5 2014-12-30 2015-01-06    2            6           6         6    16    13     13       10          10       10         1

in above example. data.frame match 3 lists

[1] "SURFSKINTEMP=6" "MODIS_LST=1" 
[1] "TOTCO=13"    "MODIS_LST=1"  
[1] "TOTH2OVAP=6" "MODIS_LST=1"

so the match count is 3, matching ratio is 3/6 = 50%.

my expect will add two columns(MATCH_COUNT MATCH_RATIO), like:

  LONGITUDE LATITUDE DATE_START   DATE_END FLAG SURFSKINTEMP SURFAIRTEMP TOTH2OVAP TOTO3 TOTCO TOTCH4  OLR_ARIS CLROLR_ARIS OLR_NOAA MODIS_LST  MATCH_COUNT  MATCH_RATIO          
1     118.5    -11.5 2014-12-30 2015-01-06    2            6           6         6    16    13     13        10          10       10         1            3          0.5

the list and test dataframe I used is:

lists <- list(c("SURFSKINTEMP=6", "MODIS_LST=1"), c("TOTCO=13", "MODIS_LST=1"
), c("TOTCO=6", "MODIS_LST=1"), c("TOTO3=15", "MODIS_LST=1"), 
    c("TOTH2OVAP=6", "MODIS_LST=1"), c("TOTH2OVAP=1", "MODIS_LST=1"
    ))



test <- structure(list(LONGITUDE = 118.5, LATITUDE = -11.5, DATE_START = structure(1419897600, class = c("POSIXct", 
"POSIXt")), DATE_END = structure(1420502400, class = c("POSIXct", 
"POSIXt")), FLAG = 2, SURFSKINTEMP = 6L, SURFAIRTEMP = 6L, TOTH2OVAP = 6L, 
    TOTO3 = 16L, TOTCO = 13L, TOTCH4 = 13L, OLR_ARIS = 10L, CLROLR_ARIS = 10L, 
    OLR_NOAA = 10L, MODIS_LST = 1L), .Names = c("LONGITUDE", 
"LATITUDE", "DATE_START", "DATE_END", "FLAG", "SURFSKINTEMP", 
"SURFAIRTEMP", "TOTH2OVAP", "TOTO3", "TOTCO", "TOTCH4", "OLR_ARIS", 
"CLROLR_ARIS", "OLR_NOAA", "MODIS_LST"), row.names = 1L, class = "data.frame")

How to implement it, thanks

and for multi-rows

> test1
  LONGITUDE LATITUDE DATE_START   DATE_END FLAG SURFSKINTEMP SURFAIRTEMP TOTH2OVAP TOTO3 TOTCO TOTCH4 OLR_ARIS CLROLR_ARIS OLR_NOAA MODIS_LST
1     118.5    -11.5 2014-12-30 2015-01-06    2            6           6         6    16    13     13       10          10       10         1
2     118.5    -11.5 2014-12-31 2015-01-07    2            1           6         1    16     6     14        4           4       10         1
3     118.5    -11.5 2015-01-01 2015-01-08    2           16           6        17    16     8      6        4           4       10         2

my expect is

  LONGITUDE LATITUDE DATE_START   DATE_END FLAG SURFSKINTEMP SURFAIRTEMP TOTH2OVAP TOTO3 TOTCO TOTCH4 OLR_ARIS CLROLR_ARIS OLR_NOAA MODIS_LST  MATCH_COUNT  MATCH_RATIO
1     118.5    -11.5 2014-12-30 2015-01-06    2            6           6         6    16    13     13       10          10       10         1            3          0.5
2     118.5    -11.5 2014-12-31 2015-01-07    2            1           6         1    16     6     14        4           4       10         1            2        0.333
3     118.5    -11.5 2015-01-01 2015-01-08    2           16           6        17    16     8      6        4           4       10         2            0            0

used data here is

test1 <- structure(list(LONGITUDE = c(118.5, 118.5, 118.5), LATITUDE = c(-11.5,                                                            
-11.5, -11.5), DATE_START = structure(c(1419897600, 1419984000,                                                                   
1420070400), class = c("POSIXct", "POSIXt")), DATE_END = structure(c(1420502400,                                                 
1420588800, 1420675200), class = c("POSIXct", "POSIXt")), FLAG = c(2,                                                             
2, 2), SURFSKINTEMP = c(6L, 1L, 16L), SURFAIRTEMP = c(6L, 6L,                     
6L), TOTH2OVAP = c(6L, 1L, 17L), TOTO3 = c(16L, 16L, 16L), TOTCO = c(13L,         
6L, 8L), TOTCH4 = c(13L, 14L, 6L), OLR_ARIS = c(10L, 4L, 4L),                     
    CLROLR_ARIS = c(10L, 4L, 4L), OLR_NOAA = c(10L, 10L, 10L),                   
    MODIS_LST = c(1L, 1L, 2L)), .Names = c("LONGITUDE", "LATITUDE",              
"DATE_START", "DATE_END", "FLAG", "SURFSKINTEMP", "SURFAIRTEMP",                 
"TOTH2OVAP", "TOTO3", "TOTCO", "TOTCH4", "OLR_ARIS", "CLROLR_ARIS",              
"OLR_NOAA", "MODIS_LST"), row.names = c(NA, 3L), class = "data.frame") 

Upvotes: 1

Views: 64

Answers (1)

akrun
akrun

Reputation: 886948

We could paste the column names of 'test' with the single row of 'test', check whether all the elements in each list element are there in the pasted vector to get a logical vector. Create the column by getting the sum and mean of the logical vector

v1 <- paste(names(test), unlist(test), sep="=")
i1 <- sapply(lists, function(x) all(x %in% v1))
test[c('MATCH_COUNT', 'MATCH_RATIO')] <- list(sum(i1), mean(i1))
test
#  LONGITUDE LATITUDE          DATE_START            DATE_END FLAG SURFSKINTEMP SURFAIRTEMP TOTH2OVAP TOTO3 TOTCO TOTCH4 OLR_ARIS CLROLR_ARIS OLR_NOAA MODIS_LST
#1     118.5    -11.5 2014-12-30 05:30:00 2015-01-06 05:30:00    2            6           6         6    16    13     13       10          10       10         1
#  MATCH_COUNT MATCH_RATIO
#1           3         0.5

If the dataset 'test' have greater than 1 row, then

m1 <- matrix(paste(names(test1)[col(test1)], unlist(test1), sep="="), ncol = ncol(test1))
i2 <- sapply(lists, function(x) apply(m1, 1, function(y) all(x %in% y)))
test1[c('MATCH_COUNT', 'MATCH_RATIO')] <- list(rowSums(i2), rowMeans(i2))

Upvotes: 1

Related Questions