Reputation: 37
I have 3 files 1.csv and 2.csv and 3.csv
1.csv
TELECOM_DEVELOPMENT_AFGHANISTAN_COMPANY,AFGHANISTAN,Alphanumeric_A_MSISDN_blocking,1
CABLE&WIRELESS_BARBADOS,BARBADOS,Alphanumeric_A_MSISDN_blocking,791
SIMINN_ICELAND_TELECOM,ICELAND,Alphanumeric_A_MSISDN_blocking,109373
CABLE&WIRELESS_SEYCHELLES,SEYCHELLES,Alphanumeric_A_MSISDN_blocking,2
CABLE&WIRELESS_JAMAICA,JAMAICA,Alphanumeric_A_MSISDN_blocking,85
2.csv
SIMINN_ICELAND_TELECOM,ICELAND,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),7795
CABLE&WIRELESS_SEYCHELLES,SEYCHELLES,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),638
3.csv:
TELECOM_DEVELOPMENT_AFGHANISTAN_COMPANY,AFGHANISTAN,Calling_Party_Address_Blocking,79
CABLE&WIRELESS_BARBADOS,BARBADOS,Calling_Party_Address_Blocking,30
MOBILKOM_LIECHTENSTEIN,LIECHTENSTEIN,Calling_Party_Address_Blocking,6
SYNIVERSE_ANSI,UNITED_STATES,Calling_Party_Address_Blocking,12
I want to merge files so that it will print output file as below
TELECOM_DEVELOPMENT_AFGHANISTAN_COMPANY,AFGHANISTAN,Alphanumeric_A_MSISDN_blocking,1,NA,NA,Calling_Party_Address_Blocking,79
CABLE&WIRELESS_BARBADOS,BARBADOS,Alphanumeric_A_MSISDN_blocking,791,NA,NA,Calling_Party_Address_Blocking,30
SIMINN_ICELAND_TELECOM,ICELAND,Alphanumeric_A_MSISDN_blocking,109373,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),7795,NA,NA
CABLE&WIRELESS_SEYCHELLES,SEYCHELLES,Alphanumeric_A_MSISDN_blocking,2,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),638,NA,NA
CABLE&WIRELESS_JAMAICA,JAMAICA,Alphanumeric_A_MSISDN_blocking,85,NA,NA,NA,NA
MOBILKOM_LIECHTENSTEIN,LIECHTENSTEIN,NA,NA,NA,NA,Calling_Party_Address_Blocking,6
SYNIVERSE_ANSI,UNITED_STATES,NA,NA,NA,NA,Calling_Party_Address_Blocking,12
My Script:
awk ' BEGIN { FS = OFS ="," }
FNR==NR {
a[$1 FS $2] = $3 FS $4;
next
}
{
print $0, (($1 FS $2) in a?a[$1 FS $2]:"NA,NA")
delete a[$1 FS $2]
}
END {
for(left in a) print left,"NA,NA", a[left]
}' 1.csv 2.csv 3.csv
Output:
SIMINN_ICELAND_TELECOM,ICELAND,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),7795,Alphanumeric_A_MSISDN_blocking,109373
CABLE&WIRELESS_SEYCHELLES,SEYCHELLES,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),638,Alphanumeric_A_MSISDN_blocking,2
TELECOM_DEVELOPMENT_AFGHANISTAN_COMPANY,AFGHANISTAN,Calling_Party_Address_Blocking,79,Alphanumeric_A_MSISDN_blocking,1
CABLE&WIRELESS_BARBADOS,BARBADOS,Calling_Party_Address_Blocking,30,Alphanumeric_A_MSISDN_blocking,791
MOBILKOM_LIECHTENSTEIN,LIECHTENSTEIN,Calling_Party_Address_Blocking,6,NA,NA
SYNIVERSE_ANSI,UNITED_STATES,Calling_Party_Address_Blocking,12,NA,NA
CABLE&WIRELESS_JAMAICA,JAMAICA,NA,NA,Alphanumeric_A_MSISDN_blocking,85
As you can see my output is mismanaged, please suggest solution.
Upvotes: 1
Views: 131
Reputation: 74625
Here's how you could do it:
BEGIN { FS = "," }
FNR==1 { ++file }
{
a[$1,$2,file] = $3 FS $4
++seen[$1,$2]
}
END {
for (j in seen) {
split(j, b, SUBSEP)
s = b[1] FS b[2]
for (i=1; i<=file; ++i) {
s = s FS (j SUBSEP i in a ? a[j,i] : "NA" FS "NA")
}
print s
}
}
Previously it looked like 1.csv was the master record and that 2.csv and 3.csv only contained additional data for existing records.
Every time a new file begins, increment the file
counter. Add every record found to the array a
, using first two columns and the file counter as a key. Add the first two columns to the seen
array so the full list of keys is known.
In the end block, loop through the seen array. Merge together each record, using "NA" if the part of the record is missing.
Testing it out:
$ awk -f merge.awk 1.csv 2.csv 3.csv
SYNIVERSE_ANSI,UNITED_STATES,NA,NA,NA,NA,Calling_Party_Address_Blocking,12
TELECOM_DEVELOPMENT_AFGHANISTAN_COMPANY,AFGHANISTAN,Alphanumeric_A_MSISDN_blocking,1,NA,NA,Calling_Party_Address_Blocking,79
MOBILKOM_LIECHTENSTEIN,LIECHTENSTEIN,NA,NA,NA,NA,Calling_Party_Address_Blocking,6
CABLE&WIRELESS_JAMAICA,JAMAICA,Alphanumeric_A_MSISDN_blocking,85,NA,NA,NA,NA
SIMINN_ICELAND_TELECOM,ICELAND,Alphanumeric_A_MSISDN_blocking,109373,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),7795,NA,NA
CABLE&WIRELESS_BARBADOS,BARBADOS,Alphanumeric_A_MSISDN_blocking,791,NA,NA,Calling_Party_Address_Blocking,30
CABLE&WIRELESS_SEYCHELLES,SEYCHELLES,Alphanumeric_A_MSISDN_blocking,2,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),638,NA,NA
Note that the ordering of the output has changed because it depends on the order in which the keys appear in seen
. To impose a strict order would be an additional step and I'm not sure whether it is required.
Upvotes: 1
Reputation: 75488
#!/usr/bin/awk -f
BEGIN { FS = "," }
function include_missing( key) {
append = ""
for (i = field_count[file_index] - 2; i > 0; --i) {
append = append ",NA"
}
for (i = 1; i <= k; ++i) {
key = keys[i]
if (validity[key] < file_index) {
data[key] = data[key] append
++validity[key]
}
}
}
{
sub(/[ \t\r]*$/, "")
key = $1 FS $2
if (FNR == 1) {
if (file_index) {
include_missing()
}
field_count[++file_index] = NF
}
if (file_index == 1) {
keys[++k] = key
}
++validity[key]
data[key] = data[key] substr($0, length(key) + 1)
}
END {
include_missing()
for (i = 1; i <= k; ++i) {
key = keys[i]
print key data[key]
}
}
Usage:
awk -f script.awk file1 file2 fil3
Output:
TELECOM_DEVELOPMENT_AFGHANISTAN_COMPANY,AFGHANISTAN,Alphanumeric_A_MSISDN_blocking,1,NA,NA,Calling_Party_Address_Blocking,79
CABLE&WIRELESS_BARBADOS,BARBADOS,Alphanumeric_A_MSISDN_blocking,791,NA,NA,Calling_Party_Address_Blocking,30
SIMINN_ICELAND_TELECOM,ICELAND,Alphanumeric_A_MSISDN_blocking,109373,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),7795,Calling_Party_Address_Blocking,1160
CABLE&WIRELESS_SEYCHELLES,SEYCHELLES,Alphanumeric_A_MSISDN_blocking,2,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),638,NA,NA
CABLE&WIRELESS_JAMAICA,JAMAICA,Alphanumeric_A_MSISDN_blocking,85,NA,NA,Calling_Party_Address_Blocking,1
With GNU Awk it can also be simpler:
#!/usr/bin/awk -f
BEGIN { FS = "," }
{
sub(/[ \t\r]*$/, "")
key = $1 FS $2
if (ARGIND == 1) {
keys[++k] = key
}
++validity[key]
data[key] = data[key] substr($0, length(key) + 1)
}
ENDFILE {
append = ""
for (i = NF - 2; i > 0; --i) {
append = append ",NA"
}
for (i = 1; i <= k; ++i) {
key = keys[i]
if (validity[key] < ARGIND) {
data[key] = data[key] append
++validity[key]
}
}
}
END {
for (i = 1; i <= k; ++i) {
key = keys[i]
print key data[key]
}
}
Usage:
gawk -f script.awk file1 file2 file3
Upvotes: 0