Reputation: 540
I'm using xlsxwriter python package to export data from a PostgreSQL database to excel in a django project. I've implemented a Django command to do this, but the problem is that there are more than 4 millions of records of data and writting the file consumes all my RAM, and the process gets killed.
Log:
[export_user_data_to_excel]> Generating excel file with:
3913616 Instagram publications
1250156 Instagram hashtags
513124 Twitter publications
127912 Twitter hashtags
Killed
I've tried with a parameter called 'constant_memory' but it doesn't seem to make a difference. Here is the method that writes the excel file:
def write_to_excel_perf(filename, instagram_publications, instagram_tags, twitter_publications, twitter_tags, instance):
"""
Export the current queryset to an excel file in xlsx format.
Optimized for low memory consumption and better performance
http://xlsxwriter.readthedocs.io/working_with_memory.html#memory-perf
"""
logger.info("[write_to_excel_perf]> Openning Workbook..")
book = xlsxwriter.Workbook(filename, {'constant_memory': True})
if 'instagram' in instance:
logger.info("[write_to_excel_perf]> Writting Instagram publications..")
sheet = book.add_worksheet('Instagram Media')
# Adding media page
titles = ["Type", "City", "Date", "Instagram Id", "Instagram URL", "caption", "likes",
"author", "location id", "location name", "lat", "lng"]
i = 0
for title in titles:
sheet.write(0, i, title)
i += 1
row_index = 1
# We improve the performance making sure that we query by related data using select_related
# and prefetch_related when needed
instagram_publications = instagram_publications.select_related('location__spot__city', 'author', 'location')
for el in instagram_publications:
# ["Type", "Date", "Instagram Id", "Instagram URL", "caption", "likes", "author", "author_profile",
# "location id", "location name", "lat", "lng"]
mediaType = 'Photo' if el.mediaType == '1' else 'Video'
city = el.location.spot.city.name if el.location is not None and el.location.spot.city is not None else "Undefined"
publication_date = el.publication_date.strftime("%d/%m/%Y %H:%M")
username = el.author.username if el.author is not None else "Undefined"
location_id = el.location.instagramID if el.location is not None else "Undefined"
location_name = el.location.name if el.location is not None else "Undefined"
location_lat = el.location.position.y if el.location is not None else "Undefined"
location_lng = el.location.position.x if el.location is not None else "Undefined"
row = [mediaType, city, publication_date, el.instagramID, el.instagram_url, el.caption, el.likes,
username, location_id, location_name, location_lat,
location_lng]
column_index = 0
for value in row:
sheet.write(row_index, column_index, value)
column_index += 1
row_index += 1
# Adding tag page
sheet = book.add_worksheet('Instagram Tags')
titles = ["Hashtag", "Quantity"]
i = 0
for title in titles:
sheet.write(0, i, title)
i += 1
row_index = 1
if instagram_tags is not None:
logger.info("[write_to_excel_perf]> Writting Instagram hashtags..")
for el in instagram_tags:
hashtag_id = el.get('hashtag__id')
label = Hashtag.objects.get(id=hashtag_id).label
sheet.write(row_index, 0, label)
sheet.write(row_index, 1, el.get('count'))
row_index += 1
else:
sheet.write(1, 0, "No hashtags in query")
if 'twitter' in instance:
# TwitterPublication
logger.info("[write_to_excel_perf]> Writting Twitter publications..")
sheet = book.add_worksheet('Twitter Media')
titles = ["City", "Date", "Twitter Id", "Twitter URL", "caption", "likes",
"author", "lat", "lng"]
i = 0
for title in titles:
sheet.write(0, i, title)
i += 1
row_index = 1
twitter_publications = twitter_publications.select_related('location__spot__city', 'author', 'location')
for el in twitter_publications:
city = el.location.spot.city.name if el.location is not None and el.location.spot.city is not None else "Undefined"
publication_date = el.publication_date.strftime("%d/%m/%Y %H:%M")
username = el.author.username if el.author is not None else "Undefined"
location_lat = el.location.position.y if el.location is not None else "Undefined"
location_lng = el.location.position.x if el.location is not None else "Undefined"
row = [city, publication_date, el.twitterID, el.twitter_url, el.caption, el.likes,
username, location_lat, location_lng]
column_index = 0
for value in row:
sheet.write(row_index, column_index, value)
column_index += 1
row_index += 1
# Adding tag page
sheet = book.add_worksheet('Twitter Tags')
titles = ["Hashtag", "Quantity"]
i = 0
for title in titles:
sheet.write(0, i, title)
i += 1
row_index = 1
if twitter_tags is not None:
logger.info("[write_to_excel_perf]> Writting Twitter hashtags..")
for el in twitter_tags:
hashtag_id = el.get('hashtag__id')
label = TwitterHashtag.objects.get(id=hashtag_id).label
sheet.write(row_index, 0, label)
sheet.write(row_index, 1, el.get('count'))
row_index += 1
else:
sheet.write(1, 0, "No hashtags in query")
book.close()
logger.info("[write_to_excel_perf]> Export file generated sucessfully.")
return book
Upvotes: 1
Views: 3615
Reputation: 41644
I've tried with a parameter called
constant_memory
but it doesn't seem to make a difference.
It should do. As shown in the XlsxWriter Documentation the constant_memory
option keeps the memory usage constant and small.
So if it doesn't make a difference to your application then maybe the issue isn't with XlsxWriter and something else is consuming the memory.
Can you verify that by commenting out all calls to worksheet.write()
and running the test again.
Upvotes: 2