ash
ash

Reputation: 107

Compare two text files and if the second file has a row which contains both the columns of first file delete that row

I have two files as shown below. file1 has two columns and file2 has varying number of columns depending on the row. I want to compare both files and if $1 and $2 both of file1 is in a row of file2, I want to delete that row. Also file2 is comma separated. How can this be done using awk? or any other text processing tools?

file1

5052 5051  
4952 4951  

file2

         2001,       5052,       7001,       5051,       1000  
         2002,       5052,       7001,       1500,       2500  
         2003,       5051,       3500,       4500,       4952  
         2004,       4952,       4999,       4500,       4951  
        

Expected output:

         2002,       5052,       7001,       1500,       2500  
         2003,       5051,       3500,       4500,       4952  
      

I have tried below awk code but did not run.

awk 'NR==FNR{A[$1]=$1;A[$2]=$2; next} {if ($0=A[$1] && $0=A[$2]){next} else {print $0}' file1 file2 >> test.inp

Upvotes: 1

Views: 306

Answers (3)

Ed Morton
Ed Morton

Reputation: 203219

This assumes the pairs in file1 never have the same value in both fields:

$ cat tst.awk
NR==FNR {
    pairs1[NR] = $1
    pairs2[NR] = $2
    next
}
{
    orig = $0
    gsub(/[[:space:],]+/," ")
    delete vals
    for (i=1; i<=NF; i++) {
        vals[$i]
    }
    for (nr in pairs1) {
        if ( (pairs1[nr] in vals) && (pairs2[nr] in vals) ) {
            next
        }
    }
    print orig
}

$ awk -f tst.awk file1 file2
         2002,       5052,       7001,       1500,       2500
         2003,       5051,       3500,       4500,       4952

Upvotes: 2

tshiono
tshiono

Reputation: 22012

Would you please try the following:

awk 'NR==FNR {
    a[$1] = $2                                  # save $1 as key, and $2 as value
    next                                        # skip the following codes for file1
}
{
    len = split(gensub("^ +", "", 1), ary, " *, *")
                                                # remove leading spaces and split on commas,
                                                # then assign ary to the field values of file2
    matched = 0                                 # reset the flag
    for (i = 1; i <= len; i++) {                # loop over the fields of file2
        if (ary[i] in a) {                      # if $1 in file1 is found
            for (j = 1; j <= len; j++) {        # then go into a deeper loop to search $2 in file1
                if (a[ary[i]] == ary[j]) {      # if $2 is also found in the same record
                    matched = 1                 # then set the flag
                    break                       # and exit from the inner loop
                }
            }
        }
    }
    if (!matched) print                         # print the line unless "matched"
}' file1 file2

The result for the provided example is all records except the line:

     4651,       4651,       4652,       4752,       4751

Upvotes: 0

Raman Sailopal
Raman Sailopal

Reputation: 12867

awk 'NR==FNR { map[$1]="1";map1[$2]="1";next } {lin=gensub(" ","","g",$0);split(lin,map3,",");ok=1;for (i in map3) { if (map1[map3[i]]==1 || map[map3[i]]==1 ) { ok=0 } } if (ok==1) { print $0 } }' file1 file2

Explanation:

awk 'NR==FNR {                                   # Process the first file
               map[$1]="";                       # Set up two arrays, one for 
                                                   the first space delimited 
                                                   field and the other for 
                                                   the second
               map1[$2]="";
               next                              # Skip to the next record
              } 
              {
               lin=gensub(" ","","g",$0);        # Process the second file and remove 
                                                   all spaces from the the line 
                                                   putting the result in a variable 
                                                   lin
               split(lin,map3,",");              # Split the variable lin into the 
                                                   array map3 based on commas as the 
                                                   separator
               ok=1;                             # Initialise a variable
               for (i in map3) { 
                 if (map1[map3[i]] || map[map3[i]]) { 
                    ok=0                         # Loop through each entry in the map3 
                                                   array (on the line) and check if it 
                                                   exists in map1 or map. If it does 
                                                   exist, set ok to 0
                 } 
               } 
               if (ok==1) { 
                 print $0                         # Only if the variable ok is 1, 
                                                    print
               } 
              }' file1 file2

Upvotes: 0

Related Questions