chiappa
chiappa

Reputation: 1345

Merging rows based on cell contents in a CSV file

I am trying to merge rows with a matching first cell in a CSV file. So that the following cells are placed within their rightful columns based on matching strings.

I have a file with the following contents:

item,pieces,color,last order
"apples","4 pieces"
"apples","red color"
"apples","last ordered 2 hours ago"
"mangos","1 piece"
"mangos","last ordered 1 day ago"
"carrots","10 pieces"
"carrots","orange color"

Which then should be merged into the following:

item,pieces,color,last order
"apples","4 pieces","red color","last ordered 2 hours ago"
"mangos","1 piece","","last ordered 1 day ago"
"carrots","10 pieces","orange color",""

The code I have used for this:

awk '{ printf "%s", $0; if (NR % 3 == 0) print ""; else printf "," }' file.csv

This method of merging three rows at a time worked with a little manual editing for as long as all items had the three pieces of data "pieces", "color" & "last order".

However, this is not working as different items have different sets of data.

Upvotes: 0

Views: 114

Answers (4)

Michael Back
Michael Back

Reputation: 1871

The following implementation works so long as all our items are grouped together in blocks of lines within the input file. The implementation caches fields for a single output record and prints that when it sees a different item (or alternatively when the script END's).

awk -F, -v OFS=, '
    NR==1 { f1=$1; f2=$2; f3=$3; f4=$4 }
    FNR==1 { next }
    f1 != $1 {
        print f1, f2, f3, f4
        f1=$1
        f2=f3=f4="\"\""
    }
    $2 ~ /piece/      { f2 = $2; next }
    $2 ~ /color/      { f3 = $2; next }
    $2 ~ /last order/ { f4 = $2; next }
    END { print f1, f2, f3, f4 }
' file.csv

If items are not grouped, we have to cache the whole file(s)... In the following, f basically stores/represents the file as a table. The k array stores the the line number for the output record of a given item so that when we print, our output records will come out in the input file's order.

awk -F, -v OFS=, '
    BEGIN { n = 0; split("", k); split("", f) }
    NR==1 { f[++n,1]=$1; f[n,2]=$2; f[n,3]=$3; f4[n,4]=$4 }
    FNR==1 { next }
    !($1 in k) {
        k[$1] = ++n
        f[n,1]=$1
        f[n,2]=f[n,3]=f[n,4]="\"\""
    }
    $2 ~ /piece/      { f[k[$1],2] = $2; next }
    $2 ~ /color/      { f[k[$1],3] = $2; next }
    $2 ~ /last order/ { f[k[$1],4] = $2; next }
    END {
        for (i=1; i<=n; ++i)
            print f[i,1], f[i,2], f[i,3], f[i,4]
    }
' file.csv

Notes:

  1. The NR/FNR dance in both implementations is an attempt at handling the header line and multiple input files (and we may or may not care about handling multiple file input).

  2. The latter implementation permits overwriting of an item's fields... If we don't want that behavior, we would have to change the logic -- the changes to the logic would be about the lines that have the regex comparisons on input field $2.

Upvotes: 0

Ed Morton
Ed Morton

Reputation: 203607

$ cat tst.awk
BEGIN { FS=OFS="," }
{ gsub(/"/,"") }
NR==1 {
    print
    sub(/s,/,",")
    numTags = split($0,tags)
    next
}
$1 != prev {
    if ( prev != "" ) {
         prt()
    }
    prev=$1
}
{
    tag = tags[1]
    tag2val[tag] = $1
    for (tagNr=2; tagNr<=numTags; tagNr++) {
        tag = tags[tagNr]
        if ( index($2,tag) ) {
            tag2val[tag] = $2
            next
        }
    }
}
END { prt() }

function prt(   tagNr,tag,val) {
    for (tagNr=1; tagNr<=numTags; tagNr++) {
        tag = tags[tagNr]
        val = tag2val[tag]
        printf "\"%s\"%s", val, (tagNr<numTags ? OFS : ORS)
    }
    delete tag2val
}

$ awk -f tst.awk file
item,pieces,color,last order
"apples","4 pieces","red color","last ordered 2 hours ago"
"mangos","1 piece","","last ordered 1 day ago"
"carrots","10 pieces","orange color",""

Upvotes: 2

anubhava
anubhava

Reputation: 785186

You may try this awk:

awk 'BEGIN {FS=OFS=","} NR == 1 {print; next} item != $1 {if (item != "") print item, pieces, color, order; item = $1; pieces = $2; color = order = "\"\""; next} {if ($2 ~ /color/) color = $2; else order = $2} END {print item, pieces, color, order}' file

item,pieces,color,last order
"apples","4 pieces","red color","last ordered 2 hours ago"
"mangos","1 piece","","last ordered 1 day ago"
"carrots","10 pieces","orange color",""

A more readable version:

awk 'BEGIN {
   FS = OFS = ","
}
NR == 1 {
   print
   next
}
item != $1 {
   if (item != "")
      print item, pieces, color, order
   item = $1
   pieces = $2
   color = order = "\"\""
   next
}
{
   if ($2 ~ /color/)
      color = $2
   else
      order = $2
}
END {
   print item, pieces, color, order
}' file

Upvotes: 2

Freeman
Freeman

Reputation: 12728

check this out :

awk -F, '{ if (f == $1) { for (c=0; c <length($1) + length(FS); c++) printf " "; print $2 FS $3 } else { print $0 } } { f = $1 }' yourfile.csv

Upvotes: -1

Related Questions