Reputation: 565
I have an issue with formatting the output of the script below.
I have duplicate lines across several files (SHORT_LIST.a, SHORT_LIST.b, SHORT_LIST.c here, but there can be many, many more).
The line "test1" exists in all three files, as does the line "sample".
The line "test" exists in two files, and more than once in one of them; I'd like it reported just once per file name.
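For reference, input along these lines reproduces the behaviour (the contents here are illustrative, not the real data):
$ head SHORT_LIST.{a,b,c}
==> SHORT_LIST.a <==
test1
sample
/path/to/file
testa
==> SHORT_LIST.b <==
test1
sample
test
test
test
==> SHORT_LIST.c <==
test1
sample
test
/path/to/file
testa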
function check_duplicates {
    awk 'END {
        for (R in rec) {
            # split out the SHORT_LIST files
            n = split(rec[R], t, "/SHORT_LIST")
            #printf n dup[n]
            count = 0
            if (n > 2)
                dup[n] = dup[n] ? dup[n] RS sprintf(R, rec[R]) :
                    sprintf("\t%-20s %s ", R, rec[R]);
        }
        for (D in dup) {
            ((count++))
            printf "%s\n \n", d
            printf count " ). Duplicate record(s) found in the following files: " dup[D]
        }
    }
    {
        # build an array named rec (short for record), indexed by
        # the content of the current record ($0), concatenating
        # the filenames separated by / as values
        rec[$0] = rec[$0] ? rec[$0] "\n \t" FILENAME : FILENAME
    }' $SITEFILES
}
check_duplicates
Current output below:
Duplicate records found in the following files:
1 ). Duplicate record(s) found in the following files: test1
SHORT_LIST.a
SHORT_LIST.b
SHORT_LIST.c
sample
2 ). Duplicate record(s) found in the following files: test
SHORT_LIST.c
SHORT_LIST.b
SHORT_LIST.b
SHORT_LIST.b
3 ). Duplicate record(s) found in the following files: /path/to/file
SHORT_LIST.a
SHORT_LIST.c
testa
Desired output below:
Duplicate records found in the following files:
1 ). Duplicate record(s) found in the following files: test1
SHORT_LIST.a
SHORT_LIST.b
SHORT_LIST.c
2 ). Duplicate record(s) found in the following files: sample
SHORT_LIST.a
SHORT_LIST.b
SHORT_LIST.c
3 ). Duplicate record(s) found in the following files: test
SHORT_LIST.c
SHORT_LIST.b
4 ). Duplicate record(s) found in the following files: /path/to/file
SHORT_LIST.a
SHORT_LIST.c
5 ). Duplicate record(s) found in the following files: testa
SHORT_LIST.a
SHORT_LIST.c
Any suggestions would be greatly appreciated; I'm having trouble with this level of AWK.
Upvotes: 0
Views: 306
Reputation: 565
I split it out to report duplicates across multiple files and duplicates within the same file separately, and I also added handling to ignore comment lines; you could do the same for whitespace, etc.
Thanks so much to @karakfa, your answer was amazing.
function check_duplicates {
    # Check for duplicates across multiple files.
    awk '
        # ignore comment lines
        /#/ { next }
        {
            # record each distinct line once per file, building a
            # RS-separated list of the files it appears in
            if (!((FILENAME, $0) in a)) {
                a[FILENAME, $0]
                dups[$0] = $0 in dups ? (dups[$0] RS FILENAME) : FILENAME
                count[$0]++
            }
        }
        # print records found in more than one file
        END {
            for (k in dups)
                if (count[k] > 1) {
                    print "\n\n\tDuplicate line found: " k "\n\tIn the following file(s)"
                    print dups[k]
                }
            printf "\n"
        }' $SITEFILES

    # Check each single file for duplicates.
    awk '
        # ignore comment lines
        /#/ { next }
        # start a fresh count for each input file
        FNR == 1 { split("", b) }
        {
            # report a repeated line only once per file
            if (++b[$0] == 2) {
                print "\n\n\tDuplicate line found: " $0 "\n\tIn the following file"
                print FILENAME
            }
        }' $SITEFILES
}
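A minimal way to drive it, assuming SITEFILES holds the list of files to scan (these file names are illustrative):
SITEFILES="SHORT_LIST.a SHORT_LIST.b SHORT_LIST.c"
check_duplicates
To also skip blank lines, per the note about whitespace above, a pattern like NF == 0 { next } could sit alongside the comment filter.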
Upvotes: 0
Reputation: 67507
You can follow this template and fix the output format as desired:
$ awk -f dups.awk fa fb fc
dups for : /path/to/file in files
fa fc
dups for : test in files
fa fb fc
dups for : sample in files
fa fb fc
no dups in
fc
$ cat dups.awk
FNR==1 { files[FILENAME] }
{
    if ((FILENAME, $0) in a)
        dupsInFile[FILENAME]
    else {
        a[FILENAME, $0]
        dups[$0] = $0 in dups ? (dups[$0] FS FILENAME) : FILENAME
        count[$0]++
    }
}
END {
    for (k in dups)
        if (count[k] > 1) {
            print "dups for : " k " in files"
            print dups[k]
        }
    for (f in dupsInFile) delete files[f]
    print "no dups in"
    for (f in files) printf "%s", f FS
    printf "\n"
}
where
$ head f{a,b,c}
==> fa <==
test
test
test1
sample
/path/to/file
==> fb <==
test
test
sample
==> fc <==
test
sample
/path/to/file
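If you want the numbered layout from the question, the END block could be adapted along these lines (a sketch reusing the same dups and count arrays; tweak the formatting to taste):
END {
    n = 0
    for (k in dups)
        if (count[k] > 1) {
            printf "%d ). Duplicate record(s) found in the following files: %s\n", ++n, k
            # dups[k] is the FS-separated file list built above
            m = split(dups[k], f, FS)
            for (i = 1; i <= m; i++)
                print "\t" f[i]
        }
}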
P.S. Always provide sample input.
Upvotes: 2
Reputation: 37278
Maybe something like
awk '{print FILENAME "\t" $0}' $SITEFILES \
| sort \
| uniq -c -f1 \
| awk '{if ($1 + .0 != 1) print $0}'
will get you started
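One caveat, as an assumption about the intent: sort orders the lines by filename first, so identical records from different files may not land next to each other for uniq to count. Sorting on the content (second field onward) instead groups them:
awk '{print FILENAME "\t" $0}' $SITEFILES \
| sort -k2 \
| uniq -c -f1 \
| awk '{if ($1 + .0 != 1) print $0}'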
Without small sample data sets, it's not practical to do more on this.
IHTH
Upvotes: 1