Reputation: 449

Writing from lines into columns based on third column

I have files that look like this -- already sorted by year (inside years sorted by id, which appears to be equivalent to strict sorting by id, but this may not always apply).

ID,COU,YEA, VOT 
1,USA,2000,1
2,USA,2000,0
3,USA,2001,1
4,USA,2003,2
5,USA,2003,0

I would like to rewrite them like this (ids for year N after 1999 in column 2N-1, corresponding votes in column 2N):

2000 IDS, VOTE, 2001 IDS, VOTE, 2002 IDS, VOTE, 2003 IDS, VOTE
1,1,3,1, , ,4,2
2,0, , , , ,5,0

I don't know how to do it. My basic thinking with awk was:

if $3 == 2000, { print $1, $4 }
if $3 == 2001, { print " "," ", $1, $4 } etc

But there are two problems:

this way the columns for years other than 2000 would start with a lot of empty lines
I have found no intelligent way to generalise the print command, so I would have to write 20 if-statements

The only working idea I have is, to create 20 unneeded files and glue them with paste which I have never used, but which seems suitable, according to man on my system.

Upvotes: 1

Answers (3)

RavinderSingh13

Reputation: 133538

With respect and permission of glenn jackman I am taking his suggested code the only thing I am trying to add here is get maximum and minimum year in awk's variable itself and NOT calculating it inside main block of awk program, since OP confirmed that Input_file is sorted by year. Answers by Glenn and Ed sir are awesome, just thought to add a variant here.

BTW we could use awk in stead of using tail and heads in variables too here :)

awk -v max=$(tail -1 Input_file | cut -d, -f3) -v min=$(head -2 Input_file | tail -1 | cut -d, -f3) '
BEGIN { FS = "," }
NR > 1 {
    year=$3
    count[year]++
    id[year, count[year]] = $1
    vote[year, count[year]] = $4
    if (count[year] > maxCount) maxCount = count[year]
}
END {
    sep = ""
    for (y=min; y<=max; y++) {
        printf "%s%d ID,VOTE", sep, y
        sep = ","
    }
    print ""
    for (i=1; i<=maxCount; i++) {
        sep = ""
        for (y=min; y<=max; y++) {
            printf "%s%s,%s", sep, id[y, i], vote[y, i]
            sep = ","
        }
        print ""
    }
}' Input_file

Upvotes: 2

Ed Morton

Reputation: 203665

This is functionally the same as @Glenn's and no better than it in any way so his should remain the accepted answer but I came up with it before looking at his and thought it might be useful to post it anyway to show some small alternatives in style and implementation details:

$ cat tst.awk
BEGIN { FS=OFS="," }
NR == 1 { next }
{
    id    = $1
    year  = $3
    votes = $4

    if ( ++numYears == 1 ) {
        begYear = year
    }
    endYear = year

    yearIds[year,++numIds[year]] = id
    yearVotes[year,numIds[year]] = votes
    maxIds = (numIds[year] > maxIds ? numIds[year] : maxIds)
}
END {
    for (year=begYear; year<=endYear; year++) {
        printf "%s IDS%sVOTE%s", year, OFS, (year<endYear ? OFS : ORS)
    }

    for (idNr=1; idNr<=maxIds; idNr++) {
        for (year=begYear; year<=endYear; year++) {
            id = votes = " "
            if ( (year,idNr) in yearIds ) {
                id = yearIds[year,idNr]
                votes = yearVotes[year,idNr]
            }
            printf "%s%s%s%s", id, OFS, votes, (year<endYear ? OFS : ORS)
        }
    }
}

$ awk -f tst.awk file
2000 IDS,VOTE,2001 IDS,VOTE,2002 IDS,VOTE,2003 IDS,VOTE
1,1,3,1, , ,4,2
2,0, , , , ,5,0

Upvotes: 3

glenn jackman

Reputation: 246847

The key is to use multidimensional arrays

BEGIN {FS = ","}
NR == 2 {minYear = maxYear = $3}
NR > 1 {
    year=$3
    count[year]++
    id[year, count[year]] = $1
    vote[year, count[year]] = $4

    if (year < minYear) minYear = year
    if (year > maxYear) maxYear = year
    if (count[year] > maxCount) maxCount = count[year]
}
END {
    sep = ""
    for (y=minYear; y<=maxYear; y++) {
        printf "%s%d ID,VOTE", sep, y
        sep = ","
    }
    print ""
    for (i=1; i<=maxCount; i++) {
        sep = ""
        for (y=minYear; y<=maxYear; y++) {
            printf "%s%s,%s", sep, id[y, i], vote[y, i]
            sep = ","
        }
        print ""
    }
}

Then,

$ awk -f transpose.awk input_file
2000 ID,VOTE,2001 ID,VOTE,2002 ID,VOTE,2003 ID,VOTE
1,1,3,1,,,4,2
2,0,,,,,5,0

If you really want hte spaces in the output, change the last printf to

            printf "%s%s,%s", sep, 
                ((y, i) in id   ? id[y, i]   : " "),
                ((y, i) in vote ? vote[y, i] : " ")

Upvotes: 4

Writing from lines into columns based on third column

Answers (3)

Related Questions