714
714

Reputation: 13

Split one column into rows

I have a large tab-delimited file containing 80 or so columns which looks like this:

184     
2       
P   2853263 4998463
SS      
AG0001-C        
T/T      C/C      A/A
AG0002-C        
T/T      C/C      A/T   
AG0003-C        
T/T      C/C      A/A   
AG0004-C         
T/T      C/C      T/A

I want to substitute the slash Characters ("\") to a new line so that the contents of one column are split into two rows like so:

184     
2       
P   2853263 4998463
SS      
AG0001-C        
T        C         A
T        C         A
AG0002-C        
T        C         A
T        C         T
AG0003-C         
T        C         A
T        C         A
AG0004-C        
T        C         T
T        C         A

Upvotes: 1

Views: 175

Answers (4)

potong
potong

Reputation: 58351

This might work for you (GNU sed):

sed '/\//!b;h;s|/.||g;G;s|./||g' file

For lines containing a / copy the line. Remove the / and the following character. Append the copied line and remove the character's before any /'s.

Upvotes: 0

konsolebox
konsolebox

Reputation: 75458

For an input like this (no initial tabs at left of first column):

184
2
P   2853263 4998463
SS
AG0001-C
T/T C/C A/A
AG0002-C
T/T C/C A/T
AG0003-C
T/T C/C A/A
AG0004-C
T/T C/C T/A

This script should work with Mawk:

#!/usr/bin/awk -f

NR <= 4 || NR % 2 { print; next; }
{
    rows = 0
    for (i = 1; i <= NF; ++i) {
        count = split($i, b, /\//)
        if (count > rows) {
            rows = count
        }
        for (j = 1; j <= count; ++j) {
            key = i "|" j
            a[key] = b[j]
        }
    }
    for (i = 1; i <= rows; ++i) {
        key = 1 "|" i
        printf("%s", a[key])
        for (j = 2; j <= NF; ++j) {
            key = j "|" i
            printf("\t%s", a[key])
        }
        print ""
    }
    for (i in a) {
        delete a[i]
    }
}

Output:

184
2
P   2853263 4998463
SS
AG0001-C
T   C   A
T   C   A
AG0002-C
T   C   A
T   C   T
AG0003-C
T   C   A
T   C   A
AG0004-C
T   C   T
T   C   A

It should work even with varying formats like this:

184
2
P   2853263 4998463
SS
AG0001-C
A/A/C/X/Y/Z T/T C/C A/A A/A/C/X A/A/B   A/A/C/X/Y
AG0002-C
T/T C/C A/T
AG0003-C
T/T C/C A/A
AG0004-C
T/T C/C T/A

Output:

184
2
P   2853263 4998463
SS
AG0001-C
A   T   C   A   A   A   A
A   T   C   A   A   A   A
C               C   B   C
X               X       X
Y                       Y
Z                       
AG0002-C
T   C   A
T   C   T
AG0003-C
T   C   A
T   C   A
AG0004-C
T   C   T
T   C   A

For an input with tabs on the left:

    184
    2
    P   2853263 4998463
    SS
    AG0001-C
    T/T C/C A/A
    AG0002-C
    T/T C/C A/T
    AG0003-C
    T/T C/C A/A
    AG0004-C
    T/T C/C T/A

This code

#!/usr/bin/awk -f

NR <= 4 || NR % 2 { print; next; }
{
    rows = 0
    for (i = 1; i <= NF; ++i) {
        count = split($i, b, /\//)
        if (count > rows) {
            rows = count
        }
        for (j = 1; j <= count; ++j) {
            key = i "|" j
            a[key] = b[j]
        }
    }
    for (i = 1; i <= rows; ++i) {
        for (j = 1; j <= NF; ++j) {
            key = j "|" i
            printf("\t%s", a[key])
        }
        print ""
    }
    for (i in a) {
        delete a[i]
    }
}

Would give an output of

    184
    2
    P   2853263 4998463
    SS
    AG0001-C
    T   C   A
    T   C   A
    AG0002-C
    T   C   A
    T   C   T
    AG0003-C
    T   C   A
    T   C   A
    AG0004-C
    T   C   T
    T   C   A

Upvotes: 3

Chris Seymour
Chris Seymour

Reputation: 85765

A GNU awk solution:

$ awk '/[/]/{print $1,$3,$6;print $2,$4,$6;next}1' FS='/| +' OFS='\t' file
184
2
P   2853263 4998463
SS
AG0001-C
T       C       A
T       C       A
AG0002-C
T       C       T
T       C       T
AG0003-C
T       C       A
T       C       A
AG0004-C
T       C       A
T       C       A

Upvotes: 2

devnull
devnull

Reputation: 123448

Using sed:

$ sed -e "s|/|\t|g" -e "s/\([^\t]*\t[^\t]*\t[^\t]*\)\t\(.*\)/\1\n\2/" inputfile
184
2
P   2853263 4998463
SS
AG0001-C
T   T   C   
C   A   A   
AG0002-C
T   T   C   
C   A   T   
AG0003-C
T   T   C   
C   A   A   
AG0004-C
T   T   C   
C   T   A   

Upvotes: 1

Related Questions