Reputation: 449
I have files that look like this -- already sorted by year (inside years sorted by id, which appears to be equivalent to strict sorting by id, but this may not always apply).
ID,COU,YEA, VOT
1,USA,2000,1
2,USA,2000,0
3,USA,2001,1
4,USA,2003,2
5,USA,2003,0
I would like to rewrite them like this (ids for year N after 1999 in column 2N-1, corresponding votes in column 2N):
2000 IDS, VOTE, 2001 IDS, VOTE, 2002 IDS, VOTE, 2003 IDS, VOTE
1,1,3,1, , ,4,2
2,0, , , , ,5,0
I don't know how to do it. My basic thinking with awk
was:
But there are two problems:
The only working idea I have is, to create 20 unneeded files and glue them with paste
which I have never used, but which seems suitable, according to man
on my system.
Upvotes: 1
Views: 47
Reputation: 133538
With respect and permission of glenn jackman I am taking his suggested code the only thing I am trying to add here is get maximum and minimum year in awk
's variable itself and NOT calculating it inside main block of awk
program, since OP confirmed that Input_file is sorted by year. Answers by Glenn and Ed sir are awesome, just thought to add a variant here.
BTW we could use awk
in stead of using tail
and head
s in variables too here :)
awk -v max=$(tail -1 Input_file | cut -d, -f3) -v min=$(head -2 Input_file | tail -1 | cut -d, -f3) '
BEGIN { FS = "," }
NR > 1 {
year=$3
count[year]++
id[year, count[year]] = $1
vote[year, count[year]] = $4
if (count[year] > maxCount) maxCount = count[year]
}
END {
sep = ""
for (y=min; y<=max; y++) {
printf "%s%d ID,VOTE", sep, y
sep = ","
}
print ""
for (i=1; i<=maxCount; i++) {
sep = ""
for (y=min; y<=max; y++) {
printf "%s%s,%s", sep, id[y, i], vote[y, i]
sep = ","
}
print ""
}
}' Input_file
Upvotes: 2
Reputation: 203665
This is functionally the same as @Glenn's and no better than it in any way so his should remain the accepted answer but I came up with it before looking at his and thought it might be useful to post it anyway to show some small alternatives in style and implementation details:
$ cat tst.awk
BEGIN { FS=OFS="," }
NR == 1 { next }
{
id = $1
year = $3
votes = $4
if ( ++numYears == 1 ) {
begYear = year
}
endYear = year
yearIds[year,++numIds[year]] = id
yearVotes[year,numIds[year]] = votes
maxIds = (numIds[year] > maxIds ? numIds[year] : maxIds)
}
END {
for (year=begYear; year<=endYear; year++) {
printf "%s IDS%sVOTE%s", year, OFS, (year<endYear ? OFS : ORS)
}
for (idNr=1; idNr<=maxIds; idNr++) {
for (year=begYear; year<=endYear; year++) {
id = votes = " "
if ( (year,idNr) in yearIds ) {
id = yearIds[year,idNr]
votes = yearVotes[year,idNr]
}
printf "%s%s%s%s", id, OFS, votes, (year<endYear ? OFS : ORS)
}
}
}
$ awk -f tst.awk file
2000 IDS,VOTE,2001 IDS,VOTE,2002 IDS,VOTE,2003 IDS,VOTE
1,1,3,1, , ,4,2
2,0, , , , ,5,0
Upvotes: 3
Reputation: 246847
The key is to use multidimensional arrays
BEGIN {FS = ","}
NR == 2 {minYear = maxYear = $3}
NR > 1 {
year=$3
count[year]++
id[year, count[year]] = $1
vote[year, count[year]] = $4
if (year < minYear) minYear = year
if (year > maxYear) maxYear = year
if (count[year] > maxCount) maxCount = count[year]
}
END {
sep = ""
for (y=minYear; y<=maxYear; y++) {
printf "%s%d ID,VOTE", sep, y
sep = ","
}
print ""
for (i=1; i<=maxCount; i++) {
sep = ""
for (y=minYear; y<=maxYear; y++) {
printf "%s%s,%s", sep, id[y, i], vote[y, i]
sep = ","
}
print ""
}
}
Then,
$ awk -f transpose.awk input_file
2000 ID,VOTE,2001 ID,VOTE,2002 ID,VOTE,2003 ID,VOTE
1,1,3,1,,,4,2
2,0,,,,,5,0
If you really want hte spaces in the output, change the last printf
to
printf "%s%s,%s", sep,
((y, i) in id ? id[y, i] : " "),
((y, i) in vote ? vote[y, i] : " ")
Upvotes: 4