Reputation: 77
I need to use "FPAT" or the equivalent function "patsplit" of gawk. But it seems the installed version of gawk is 3.1.5 on our CentOs server.
I tried updating gawk with these commands:
yum update gawk;
And the server showed: "No Packages marked for Update"
I also tried reinstalling gawk with:
yum install gawk;
server output: "Package gawk-3.1.5-15.el5.x86_64 already installed and latest version "
Where i need gawk 4.0 or above to use those FPAT OR patsplit. And why i need to use them? well i am trying to process a CSV file, and it seems the CSV file has optional quotation and embedded comma.
Example:
From a csv row like this:
this,is,a,"csv,with,embedded coma"
I need to split the fields like this:
this
is
a
"csv,with,embedded comma"
And here is the gawk code:
awk '{patsplit("this,is,a,\"csv,with,embedded comma\"",a,"([^,]*)|(\"([^\"]|\"\")+\"[^,]*)",seps); for(i=0;i<length(a);i++) print a[i];}';
Can anyone help me on this please ?
Upvotes: 0
Views: 2193
Reputation: 463
if you have access to yum, which should be called dnf now, see
then you could just run as a normal user
git clone https://git.savannah.gnu.org/git/gawk.git
cd gawk
./configure
make
sudo make install
then your question of a tmp.txt input file of
this,is,a,"csv,with,embedded coma"
is easily converted by
gawk '{patsplit("this,is,a,\"csv,with,embedded comma\"",a,"([^,]*)|(\"([^\"]|\"\")+\"[^,]*)",seps); for(i=0;i<length(a);i++) print a[i];}' tmp.txt
you may have issue compiling gawk, you should address those if they pop up.
Upvotes: 0
Reputation: 576
Here is a pure GAWK solution:
{ # Split on double quotes to handle lines like "this, this, or this".
printf("LINE: '%s'\nFIELDS:", $0)
n = split($0,q,/"/)
f = 0
}
n == 1 { # If n is 1, there are no double quotes on the line.
n = split($0,c,/,/)
for (i = 1; i <= n; i++) {
printf(" %d='%s'", i, c[i])
}
printf("\n")
next
}
{ # There are "strings"; the EVEN entries in q are the quoted strings.
for (i = 1; i <= n; i++) {
if (0 == and(i,1)) { # i is EVEN: This is a double-quoted string.
printf(" %d='\"%s\"'", ++f, q[i])
continue
}
if (0 == length(q[i])) { # First/last field is a quoted string.
continue
}
if (q[i] == ",") {
# First/last field empty, or comma between two quoted strings.
if (i == 1 || i == n) { # First/last field empty
printf(" %d=''", ++f)
}
continue
}
# Remove commas before/after a quoted string then split on commas.
sub(/^,/,"",q[i])
sub(/,$/,"",q[i])
m = split(q[i],cq,/,/)
for (j = 1; j <= m; j++) {
printf(" %d='%s'", ++f, cq[j])
}
}
printf("\n")
}
With this input:
This is one,23,$9.32,Another string.
Line 2,234,$88.34,Blah blah
"This is another",763,$0.00,"trouble, or not?"
"This is, perhaps, trouble too...",763,$0.00,"trouble, or not?"
2,"This is, perhaps, trouble too...",763,"trouble, or not?"
3,,"number, number","well?"
,,,
"1,one","2,two","3,three","4,four"
",commas,","no commas",",,,,,",
,"Fields 1 and 4 are empty","But 2 and 3 are not",
This output is produced:
LINE: 'This is one,23,$9.32,Another string.'
FIELDS: 1='This is one' 2='23' 3='$9.32' 4='Another string.'
LINE: 'Line 2,234,$88.34,Blah blah'
FIELDS: 1='Line 2' 2='234' 3='$88.34' 4='Blah blah'
LINE: '"This is another",763,$0.00,"trouble, or not?"'
FIELDS: 1='"This is another"' 2='763' 3='$0.00' 4='"trouble, or not?"'
LINE: '"This is, perhaps, trouble too...",763,$0.00,"trouble, or not?"'
FIELDS: 1='"This is, perhaps, trouble too..."' 2='763' 3='$0.00' 4='"trouble, or not?"'
LINE: '2,"This is, perhaps, trouble too...",763,"trouble, or not?"'
FIELDS: 1='2' 2='"This is, perhaps, trouble too..."' 3='763' 4='"trouble, or not?"'
LINE: '3,,"number, number","well?"'
FIELDS: 1='3' 2='' 3='"number, number"' 4='"well?"'
LINE: ',,,'
FIELDS: 1='' 2='' 3='' 4=''
LINE: '"1,one","2,two","3,three","4,four"'
FIELDS: 1='"1,one"' 2='"2,two"' 3='"3,three"' 4='"4,four"'
LINE: '",commas,","no commas",",,,,,",'
FIELDS: 1='",commas,"' 2='"no commas"' 3='",,,,,"' 4=''
LINE: ',"Fields 1 and 4 are empty","But 2 and 3 are not",'
FIELDS: 1='' 2='"Fields 1 and 4 are empty"' 3='"But 2 and 3 are not"' 4=''
Upvotes: 0
Reputation: 563
Try using csvquote in your pipeline to make the data easy for awk to interpret. This is a script I wrote that replaces the commas inside quoted fields with nonprinting characters, and then restores them.
So if your awk command looked like this originally:
awk -F, '{print $3 "," $5}' inputfile.csv
... it can be made to work with csv quoted separators like this:
csvquote inputfile.csv | awk -F, '{print $3 "," $5}' | csvquote -u
For code and more documentation, see https://github.com/dbro/csvquote
Upvotes: 2
Reputation: 204310
The simplest thing to do is convert the commas outside of the quotes to something else before you do your real processing. For example:
$ cat file
this,is,a,"csv,with,embedded coma",and,here,"is,another",one
and,here,"is,another,line"
$
$ awk 'BEGIN{FS=OFS="\""}{for (i=1;i<=NF;i+=2) gsub(/,/,";",$i)}1' file
this;is;a;"csv,with,embedded coma";and;here;"is,another";one
and;here;"is,another,line"
If you don't like ";"s as field separators, pick something else like a control character or here's an example using newlines as the FSs and blank lines as the RSs:
$ awk 'BEGIN{FS=OFS="\""; ORS="\n\n"}{for (i=1;i<=NF;i+=2) gsub(/,/,"\n",$i)}1' file
this
is
a
"csv,with,embedded coma"
and
here
"is,another"
one
and
here
"is,another,line"
$ awk 'BEGIN{FS=OFS="\""; ORS="\n\n"}{for (i=1;i<=NF;i+=2) gsub(/,/,"\n",$i)}1' file |
awk -F'\n' -v RS= '{for (i=1;i<=NF;i++) print NR,i,"<" $i ">"}'
1 1 <this>
1 2 <is>
1 3 <a>
1 4 <"csv,with,embedded coma">
1 5 <and>
1 6 <here>
1 7 <"is,another">
1 8 <one>
2 1 <and>
2 2 <here>
2 3 <"is,another,line">
It only gets tricky if you have embedded newlines or embedded escaped double quotes.
Upvotes: 1
Reputation: 195209
I think we could use match() to get the fields.
here are the codes:
awk '{ $0=$0","
while($0) {
match($0,/ *"[^"]*" *,|[^,]*,/)
field=substr($0,RSTART,RLENGTH)
gsub(/,$/,"",field)
print field
$0=substr($0,RLENGTH+1)
}}' file
test with your input example:
kent$ echo 'this,is,a,"csv,with,embedded coma"'|awk '{
$0=$0","
while($0) {
match($0,/ *"[^"]*" *,|[^,]*,/)
field=substr($0,RSTART,RLENGTH)
gsub(/,$/,"",field)
print field
$0=substr($0,RLENGTH+1)
}}'
this
is
a
"csv,with,embedded coma"
Upvotes: 1