user5811454
user5811454

Reputation:

Python not writing headers

I am splitting a very large CSV file with this Python program, based on the second column, "ParentID". I recently switched the output file mode from "w" to "a" because of the large number of output files and the per-process limit on open files. After that change, the header was written every time a row was appended, instead of only once at the top of each file.

I then added a flag, setting `write_header = True` at the start and `write_header = False` after the header is written, but now the header is only written to the first file... I have over 29,000 files.

#!/usr/bin/env python3
import binascii
import csv
import os.path
import sys
from tkinter.filedialog import askopenfilename, askdirectory
from tkinter.simpledialog import askinteger

def split_csv_file(f, dst_dir, keyfunc):
    """Split the CSV rows read from *f* into one file per key in *dst_dir*.

    The first row of *f* is taken as the header. For every later row,
    ``keyfunc(row)`` gives the output file name; that file is opened in
    append mode and only the first column of the row is written to it.

    NOTE(review): ``write_header`` is a single flag shared by all output
    files and is cleared after the first header is written, so only the
    first output file opened receives the header row.
    """
    csv_reader = csv.reader(f)
    header = next(csv_reader)
    write_header = True  # one flag for the whole run, not one per output file
    csv_writers = {}
    for row in csv_reader:
        k = keyfunc(row)
        # The file is re-opened in append mode for each row and closed again
        # when the ``with`` block ends.
        with open(os.path.join(dst_dir, k), mode='a', newline='') as output:
            writer = csv.writer(output)
            # Runs at most once overall: the flag is cleared inside the body.
            while write_header:
                writer.writerow(header)
                write_header = False
            # The writer is stored per key but only used on the next line.
            csv_writers[k] = writer
            # row[0:1] keeps only the first column of the row.
            csv_writers[k].writerow(row[0:1])

def get_args_from_cli():
    """Read the input file name, column number and destination directory
    from the command line.

    Returns a ``(input_filename, column, dst_dir)`` tuple taken from
    ``sys.argv[1]``, ``sys.argv[2]`` (converted to ``int``) and
    ``sys.argv[3]``.
    """
    argv = sys.argv
    return (argv[1], int(argv[2]), argv[3])

def get_args_from_gui():
    """Ask for the input file, column number and destination directory
    using dialogs.

    Shows, in order, a file-open dialog, an integer prompt and a
    directory chooser, and returns their results as an
    ``(input_filename, column, dst_dir)`` tuple.
    """
    accepted_types = (('TXT','.txt'),('CSV', '.csv'))
    source_path = askopenfilename(
        filetypes=accepted_types,
        title='Select CSV Input File')
    column_number = askinteger('Choose Table Column', 'Table column')
    target_dir = askdirectory(title='Select Destination Directory')
    return (source_path, column_number, target_dir)

if __name__ == '__main__':
    argc = len(sys.argv)
    # Only two invocations are accepted: no arguments (dialogs) or exactly
    # three arguments (input file, column number, destination directory).
    if argc not in (1, 4):
        raise Exception("Invalid number of arguments")
    read_args = get_args_from_gui if argc == 1 else get_args_from_cli
    input_filename, column, dst_dir = read_args()
    with open(input_filename, mode='r', newline='') as f:
        # The column number is 1-based; the output file is named after its value.
        split_csv_file(f, dst_dir, lambda r: r[column-1]+'.txt')
        # if the column has funky values resulting in invalid filenames
        # replace the line from above with:
        # split_csv_file(f, dst_dir, lambda r: binascii.b2a_hex(r[column-1].encode('utf-8')).decode('utf-8')+'.csv')

Here is an example of the file being split..

<option value=''>Choose SubGroup</option>, ParentID
<option value='/1990-Accord-DX-Glass-s/37918.htm'>Glass</option>,Accord1990DX422F22A1BodyHardwareBackGlass
<option value='/1990-Accord-DX-Glass-s/37919.htm'>Glass</option>,Accord1990DX422F22A1BodyHardwareBackGlass
<option value='/1990-Accord-DX-Reveal-Moldings-s/69090.htm'>Reveal Moldings</option>,Accord1990DX422F22A1BodyHardwareBackGlass
<option value='/1990-Accord-DX-Reveal-Moldings-s/69091.htm'>Reveal Moldings</option>,Accord1990DX422F22A1BodyHardwareBackGlass
<option value='/1990-Accord-DX-Center-s/10331.htm'>Center</option>,Accord1990DX422F22A1BodyHardwareConsole
<option value='/1990-Accord-DX-Cowl-s/16006.htm'>Cowl</option>,Accord1990DX422F22A1BodyHardwareCowl
<option value='/1990-Accord-DX-Exterior-Trim-s/26889.htm'>Exterior Trim</option>,Accord1990DX422F22A1BodyHardwareFender
<option value='/1990-Accord-DX-Exterior-Trim-s/26890.htm'>Exterior Trim</option>,Accord1990DX422F22A1BodyHardwareFender

How can I get the header to write ONLY once on each output file?

Upvotes: 1

Views: 403

Answers (1)

Martijn Pieters
Martijn Pieters

Reputation: 1122292

You are setting write_header to false the first time you write a header. Thus only the first file you open gains that header.

Track what files have a header set in a set:

def split_csv_file(f, dst_dir, keyfunc):
    """Split the CSV rows read from *f* into one file per key in *dst_dir*.

    The first row of *f* is the header. For every later row,
    ``keyfunc(row)`` gives the output file name. Each output file gets the
    header once, the first time it is written to during this call,
    followed by the first column of every row that maps to it.

    Args:
        f: Open text file (or any iterable of lines) holding the CSV data.
        dst_dir: Directory in which the output files are created.
        keyfunc: Callable mapping a parsed row (list of strings) to the
            name of the output file for that row.

    Files are opened in append mode and closed again after each row, so at
    most one output file is open at a time. Files left over from an
    earlier run are appended to and receive the header again.
    """
    csv_reader = csv.reader(f)
    header = next(csv_reader, None)
    if header is None:
        # Empty input: there is no header and nothing to split.
        return
    # Keys whose output file has already received the header in this call.
    header_written = set()
    for row in csv_reader:
        k = keyfunc(row)
        with open(os.path.join(dst_dir, k), mode='a', newline='') as output:
            writer = csv.writer(output)
            if k not in header_written:
                writer.writerow(header)
                header_written.add(k)
            # Must stay inside the ``with`` block: once it exits the file
            # is closed and writing raises ValueError.
            writer.writerow(row[0:1])

You may want to investigate keeping your files open longer by tracking when you last wrote to each one and closing the ones that have gone longest without a write. That would take a custom class that transparently tracks open files as you request them by key, which is more work than fits in an answer.

Upvotes: 2

Related Questions