xlsxwriter consuming too much memory and process gets killed

Question

I'm using xlsxwriter python package to export data from a PostgreSQL database to excel in a django project. I've implemented a Django command to do this, but the problem is that there are more than 4 millions of records of data and writting the file consumes all my RAM, and the process gets killed.

Log:

[export_user_data_to_excel]> Generating excel file with:
     3913616 Instagram publications 
     1250156 Instagram hashtags  
     513124 Twitter publications 
     127912 Twitter hashtags
Killed

I've tried with a parameter called 'constant_memory' but it doesn't seem to make a difference. Here is the method that writes the excel file:

def write_to_excel_perf(filename, instagram_publications, instagram_tags, twitter_publications, twitter_tags, instance):
    """
    Export the current queryset to an excel file in xlsx format.
    Optimized for low memory consumption and better performance
    http://xlsxwriter.readthedocs.io/working_with_memory.html#memory-perf
    """
    logger.info("[write_to_excel_perf]> Openning Workbook..")
    book = xlsxwriter.Workbook(filename, {'constant_memory': True})
    if 'instagram' in instance:
        logger.info("[write_to_excel_perf]> Writting Instagram publications..")
        sheet = book.add_worksheet('Instagram Media')
        # Adding media page
        titles = ["Type", "City", "Date", "Instagram Id", "Instagram URL", "caption", "likes",
                  "author", "location id", "location name", "lat", "lng"]
        i = 0
        for title in titles:
            sheet.write(0, i, title)
            i += 1
        row_index = 1
        # We improve the performance making sure that we query by related data using select_related
        # and prefetch_related when needed
        instagram_publications = instagram_publications.select_related('location__spot__city', 'author', 'location')
        for el in instagram_publications:
            # ["Type", "Date", "Instagram Id", "Instagram URL", "caption", "likes", "author", "author_profile",
            #      "location id", "location name", "lat", "lng"]
            mediaType = 'Photo' if el.mediaType == '1' else 'Video'
            city = el.location.spot.city.name if el.location is not None and el.location.spot.city is not None else "Undefined"
            publication_date = el.publication_date.strftime("%d/%m/%Y %H:%M")
            username = el.author.username if el.author is not None else "Undefined"
            location_id = el.location.instagramID if el.location is not None else "Undefined"
            location_name = el.location.name if el.location is not None else "Undefined"
            location_lat = el.location.position.y if el.location is not None else "Undefined"
            location_lng = el.location.position.x if el.location is not None else "Undefined"

            row = [mediaType, city, publication_date, el.instagramID, el.instagram_url, el.caption, el.likes,
                   username, location_id, location_name, location_lat,
                   location_lng]
            column_index = 0
            for value in row:
                sheet.write(row_index, column_index, value)
                column_index += 1
            row_index += 1

        # Adding tag page
        sheet = book.add_worksheet('Instagram Tags')
        titles = ["Hashtag", "Quantity"]
        i = 0
        for title in titles:
            sheet.write(0, i, title)
            i += 1
        row_index = 1
        if instagram_tags is not None:
            logger.info("[write_to_excel_perf]> Writting Instagram hashtags..")
            for el in instagram_tags:
                hashtag_id = el.get('hashtag__id')
                label = Hashtag.objects.get(id=hashtag_id).label
                sheet.write(row_index, 0, label)
                sheet.write(row_index, 1, el.get('count'))
                row_index += 1
        else:
            sheet.write(1, 0, "No hashtags in query")

    if 'twitter' in instance:
        # TwitterPublication
        logger.info("[write_to_excel_perf]> Writting Twitter publications..")
        sheet = book.add_worksheet('Twitter Media')

        titles = ["City", "Date", "Twitter Id", "Twitter URL", "caption", "likes",
                  "author", "lat", "lng"]
        i = 0
        for title in titles:
            sheet.write(0, i, title)
            i += 1
        row_index = 1

        twitter_publications = twitter_publications.select_related('location__spot__city', 'author', 'location')
        for el in twitter_publications:
            city = el.location.spot.city.name if el.location is not None and el.location.spot.city is not None else "Undefined"
            publication_date = el.publication_date.strftime("%d/%m/%Y %H:%M")
            username = el.author.username if el.author is not None else "Undefined"
            location_lat = el.location.position.y if el.location is not None else "Undefined"
            location_lng = el.location.position.x if el.location is not None else "Undefined"

            row = [city, publication_date, el.twitterID, el.twitter_url, el.caption, el.likes,
                   username, location_lat, location_lng]
            column_index = 0
            for value in row:
                sheet.write(row_index, column_index, value)
                column_index += 1
            row_index += 1

        # Adding tag page
        sheet = book.add_worksheet('Twitter Tags')
        titles = ["Hashtag", "Quantity"]
        i = 0
        for title in titles:
            sheet.write(0, i, title)
            i += 1
        row_index = 1
        if twitter_tags is not None:
            logger.info("[write_to_excel_perf]> Writting Twitter hashtags..")
            for el in twitter_tags:
                hashtag_id = el.get('hashtag__id')
                label = TwitterHashtag.objects.get(id=hashtag_id).label
                sheet.write(row_index, 0, label)
                sheet.write(row_index, 1, el.get('count'))
                row_index += 1
        else:
            sheet.write(1, 0, "No hashtags in query")

    book.close()

    logger.info("[write_to_excel_perf]> Export file generated sucessfully.")
    return book

jmcnamara · Accepted Answer

I've tried with a parameter called constant_memory but it doesn't seem to make a difference.

It should do. As shown in the XlsxWriter Documentation the constant_memory option keeps the memory usage constant and small.

So if it doesn't make a difference to your application then maybe the issue isn't with XlsxWriter and something else is consuming the memory.

Can you verify that by commenting out all calls to worksheet.write() and running the test again.

xlsxwriter consuming too much memory and process gets killed

Answers (1)

Related Questions