Reputation: 23
I have a program with a function that needs to open big pickle files (a few GB), look at the resulting dictionary (dict), and return a partial view of it (a few elements). Curiously, the large amount of data loaded by the function remains in memory after the function returns. So I ran a few tests with the following code:
import numpy as np
def test_mem_1():
    """Allocate one large NumPy array, pause for inspection, then return.

    The array is bound only to a local name and is never released
    explicitly inside the function.

    Returns:
        int: the constant 4 (a placeholder result).
    """
    n_values = 2 ** 27  # 2**27 eight-byte floats -> about 1 GB
    big_array = np.random.random(n_values)
    # Block here so the process's memory use can be read off externally.
    input("Data generated, awaiting input to continue")
    return 4
def test_mem_2():
    """Build a dict of dicts of small NumPy arrays, pause, then drop it.

    The outer dict has 100,000 integer keys; each maps to an inner dict
    with 10 integer keys, and each of those holds a 100-element array.

    Returns:
        int: the constant 4 (a placeholder result).
    """
    outer_keys = list(range(100000))
    levels = list(range(10))
    # 100,000 keys x 10 levels x 100 eight-byte floats ~= 800 MB of array data.
    table = {
        key: {level: np.random.random(100) for level in levels}
        for key in outer_keys
    }
    # Block here so the process's memory use can be read off externally.
    input("Data generated, awaiting input to continue")
    # Rebind the local name so it no longer refers to the nested dict.
    table = None
    return 4
if __name__ == "__main__":
    # Run both experiments back to back, pausing after each one so the
    # process's memory use can be checked externally at every stage.
    a = test_mem_1()
    input("Tested mem 1, continue to test mem 2")
    # Memory usage falls from 995 MB inside test_mem_2 to 855 MB when returned
    a = test_mem_2()
    input("Finished")
    # No explicit exit() here: the script ends on its own, and the exit()
    # helper is injected by the `site` module for interactive sessions.
When running this experiment, the first test allocates 1 GB, and this data is freed as soon as the function returns. In contrast, the second test (working with a dict) first allocates 995 MB; then, when the function returns, only 140 MB are freed (leaving a memory footprint of 855 MB after test_mem_2). What is happening here? How can I free this memory?
P.S.
I have tried releasing the data in test_mem_2 in several ways: doing nothing, using "del", assigning a new dict to the name, and (as in this example) assigning None to the reference.
Upvotes: 2
Views: 1903
Reputation: 4401
Answer after comment discussion.
Memory management is handled by Python itself, using garbage collection. Normally you should not touch this at all: garbage collection is automagic in Python. Unless you actually have a good reason to mess with it, don't.
However, you can force garbage collection, which can be useful if you are dealing with a resource-limited system, for example.
I've combined your code with a function to get the memory usage which I've shamelessly stolen from this excellent answer, and I implemented the most basic garbage collection...
By running `loopity()` multiple times I've not had it crash yet.
Note that I did add a `data = None` in `test_mem_1()`.
file: memleak.py
import numpy as np
import sys
import gc
import tracemalloc
import linecache
import os
# Start tracing memory allocations right away; the snapshots taken further
# down with tracemalloc.take_snapshot() need tracing to be active.
tracemalloc.start()
def display_top(snapshot, key_type='lineno', limit=3, where=''):
    """Print the biggest allocation sites found in a tracemalloc snapshot.

    Adapted from https://stackoverflow.com/a/45679009/9267296, with the
    old-style string formatting replaced by f-strings.

    Args:
        snapshot: object providing ``filter_traces`` and ``statistics``,
            e.g. the result of ``tracemalloc.take_snapshot()``.
        key_type: grouping key forwarded to ``snapshot.statistics``.
        limit: how many of the top entries are listed individually; the
            remaining entries are summarised on a single line.
        where: label printed in the header; omitted when it equals ''.

    Returns:
        None. The report is written to stdout.
    """
    rule = '======================================================================'

    # Header.
    # NOTE(review): the source listing lost its indentation; the closing
    # rule is assumed to be printed unconditionally - confirm.
    print(rule)
    if where != '':
        print(f'Printing stats:\n {where}')
    print(rule)

    # Exclude frames from the import machinery and frames of unknown origin.
    filtered = snapshot.filter_traces((
        tracemalloc.Filter(False, '<frozen importlib._bootstrap>'),
        tracemalloc.Filter(False, '<unknown>'),
    ))
    ranked = filtered.statistics(key_type)
    head, tail = ranked[:limit], ranked[limit:]

    print(f'Top {limit} lines')
    for rank, entry in enumerate(head, start=1):
        origin = entry.traceback[0]
        # Shorten '/path/to/module/file.py' to 'module/file.py'.
        short_path = os.sep.join(origin.filename.split(os.sep)[-2:])
        kib = entry.size / 1024
        print(f'#{rank}: {short_path}:{origin.lineno}: {kib:.1f} KiB')
        source_line = linecache.getline(origin.filename, origin.lineno).strip()
        if source_line:
            print(f' {source_line}')

    # Everything past the first `limit` entries is reported as one figure.
    if tail:
        tail_bytes = sum(entry.size for entry in tail)
        print(f'{len(tail)} other: {tail_bytes / 1024:.1f} KiB')

    total_bytes = sum(entry.size for entry in ranked)
    print()
    print(f'=====> Total allocated size: {total_bytes / 1024:.1f} KiB')
    print()
def test_mem_1():
    """Allocate one large NumPy array and report traced memory at each step.

    A tracemalloc report is printed on entry, after the allocation, after
    the array's name is rebound to None, and after an explicit
    ``gc.collect()``.

    Returns:
        int: the constant 4 (a placeholder result).
    """
    def report(stage):
        # Take a fresh snapshot and print it under the given label.
        display_top(tracemalloc.take_snapshot(), where=stage)

    report('test_mem_1: start')
    n_values = 2 ** 27  # 2**27 eight-byte floats -> about 1 GB
    big_array = np.random.random(n_values)
    report('test_mem_1: data generated')
    input('Data generated, awaiting input to continue')

    # Rebind the local name so it no longer refers to the array.
    big_array = None
    report('test_mem_1: data == None')

    gc.collect()
    report('test_mem_1: gc collected')
    return 4
def test_mem_2():
    """Build a dict of dicts of small arrays and report memory at each step.

    A tracemalloc report is printed on entry, after the key lists are
    built, after the nested dict is filled, after its name is rebound to
    None, and after an explicit ``gc.collect()``.

    Returns:
        int: the constant 4 (a placeholder result).
    """
    def report(stage):
        # Take a fresh snapshot and print it under the given label.
        display_top(tracemalloc.take_snapshot(), where=stage)

    report('test_mem_2: start')
    outer_keys = list(range(100000))
    levels = list(range(10))
    report('test_mem_2: lists generated')

    # 100,000 keys x 10 levels x 100 eight-byte floats ~= 800 MB of array data.
    table = {}
    for key in outer_keys:
        table[key] = {level: np.random.random(100) for level in levels}
    report('test_mem_2: np data generated')
    input('Data generated, awaiting input to continue')

    # Rebind the local name so it no longer refers to the nested dict.
    table = None
    report('test_mem_2: data == None')

    gc.collect()
    report('test_mem_2: gc collected')
    return 4
def loopity():
    """Run both memory experiments repeatedly, reporting between steps.

    After each full round the user is prompted with 'Finished'; pressing
    Enter on an empty line starts another round, any other input stops.
    """
    def report(stage):
        # Take a fresh snapshot and print it under the given label.
        display_top(tracemalloc.take_snapshot(), where=stage)

    while True:
        report('loopity: start')
        outcome = test_mem_1()
        report('loopity: test_mem_1 done')
        input('Tested mem 1, continue to test mem 2')
        outcome = test_mem_2()
        report('loopity: test_mem_2 done')
        # Only an empty answer keeps the loop going.
        if input('Finished') != '':
            break
# Script entry point: run the interactive experiment loop only when this
# file is executed directly, not when it is imported.
if __name__ == '__main__':
    loopity()
This is the output from a Windows box running Python 3.8.10 (don't ask):
======================================================================
Printing stats:
loopity: start
======================================================================
Top 3 lines
#1: .\memleak.py:93: 0.1 KiB
def loopity():
#2: .\memleak.py:69: 0.1 KiB
def test_mem_2():
#3: .\memleak.py:53: 0.1 KiB
def test_mem_1():
1 other: 0.1 KiB
=====> Total allocated size: 0.5 KiB
======================================================================
Printing stats:
test_mem_1: start
======================================================================
Top 3 lines
#1: lib\linecache.py:137: 8.3 KiB
lines = fp.readlines()
#2: .\memleak.py:39: 1.2 KiB
line = linecache.getline(frame.filename, frame.lineno).strip()
#3: lib\tracemalloc.py:509: 1.2 KiB
statistics.sort(reverse=True, key=Statistic._sort_key)
59 other: 20.4 KiB
=====> Total allocated size: 31.1 KiB
======================================================================
Printing stats:
test_mem_1: data generated
======================================================================
Top 3 lines
#1: .\memleak.py:56: 1048576.3 KiB
data = np.random.random((2**27)) #1 GB
#2: lib\linecache.py:137: 63.9 KiB
lines = fp.readlines()
#3: lib\tracemalloc.py:65: 3.8 KiB
return (self.size, self.count, self.traceback)
59 other: 26.3 KiB
=====> Total allocated size: 1048670.3 KiB
Data generated, awaiting input to continue
======================================================================
Printing stats:
test_mem_1: data == None
======================================================================
Top 3 lines
#1: lib\linecache.py:137: 63.8 KiB
lines = fp.readlines()
#2: lib\tracemalloc.py:532: 5.8 KiB
traces = _get_traces()
#3: lib\tracemalloc.py:65: 3.9 KiB
return (self.size, self.count, self.traceback)
66 other: 25.2 KiB
=====> Total allocated size: 98.6 KiB
======================================================================
Printing stats:
test_mem_1: gc collected
======================================================================
Top 3 lines
#1: lib\linecache.py:137: 63.8 KiB
lines = fp.readlines()
#2: .\memleak.py:39: 1.2 KiB
line = linecache.getline(frame.filename, frame.lineno).strip()
#3: lib\tracemalloc.py:509: 1.2 KiB
statistics.sort(reverse=True, key=Statistic._sort_key)
56 other: 19.0 KiB
=====> Total allocated size: 85.3 KiB
======================================================================
Printing stats:
loopity: test_mem_1 done
======================================================================
Top 3 lines
#1: lib\linecache.py:137: 63.8 KiB
lines = fp.readlines()
#2: lib\tracemalloc.py:65: 3.7 KiB
return (self.size, self.count, self.traceback)
#3: lib\tracemalloc.py:185: 2.8 KiB
self._frames = tuple(reversed(frames))
70 other: 22.9 KiB
=====> Total allocated size: 93.2 KiB
Tested mem 1, continue to test mem 2
======================================================================
Printing stats:
test_mem_2: start
======================================================================
Top 3 lines
#1: lib\linecache.py:137: 63.8 KiB
lines = fp.readlines()
#2: lib\tracemalloc.py:65: 4.6 KiB
return (self.size, self.count, self.traceback)
#3: lib\tracemalloc.py:532: 4.5 KiB
traces = _get_traces()
71 other: 26.8 KiB
=====> Total allocated size: 99.7 KiB
======================================================================
Printing stats:
test_mem_2: lists generated
======================================================================
Top 3 lines
#1: .\memleak.py:72: 3508.7 KiB
keys = list(range(100000))
#2: lib\linecache.py:137: 63.8 KiB
lines = fp.readlines()
#3: lib\tracemalloc.py:532: 9.2 KiB
traces = _get_traces()
73 other: 31.6 KiB
=====> Total allocated size: 3613.3 KiB
======================================================================
Printing stats:
test_mem_2: np data generated
======================================================================
Top 3 lines
#1: .\memleak.py:80: 911719.1 KiB
data[k][lvl] = np.random.random(100) #1'000'000 X 8 = 800 MB
#2: .\memleak.py:78: 11370.0 KiB
data[k] = {}
#3: .\memleak.py:72: 3508.7 KiB
keys = list(range(100000))
71 other: 96.4 KiB
=====> Total allocated size: 926694.2 KiB
Data generated, awaiting input to continue
======================================================================
Printing stats:
test_mem_2: data == None
======================================================================
Top 3 lines
#1: .\memleak.py:72: 3508.7 KiB
keys = list(range(100000))
#2: lib\linecache.py:137: 63.8 KiB
lines = fp.readlines()
#3: .\memleak.py:80: 5.7 KiB
data[k][lvl] = np.random.random(100) #1'000'000 X 8 = 800 MB
75 other: 37.6 KiB
=====> Total allocated size: 3615.8 KiB
======================================================================
Printing stats:
test_mem_2: gc collected
======================================================================
Top 3 lines
#1: .\memleak.py:72: 3508.7 KiB
keys = list(range(100000))
#2: lib\linecache.py:137: 63.8 KiB
lines = fp.readlines()
#3: .\memleak.py:80: 5.5 KiB
data[k][lvl] = np.random.random(100) #1'000'000 X 8 = 800 MB
60 other: 22.0 KiB
=====> Total allocated size: 3600.0 KiB
======================================================================
Printing stats:
loopity: test_mem_2 done
======================================================================
Top 3 lines
#1: lib\linecache.py:137: 63.8 KiB
lines = fp.readlines()
#2: .\memleak.py:80: 5.5 KiB
data[k][lvl] = np.random.random(100) #1'000'000 X 8 = 800 MB
#3: lib\tracemalloc.py:65: 3.9 KiB
return (self.size, self.count, self.traceback)
73 other: 26.4 KiB
=====> Total allocated size: 99.7 KiB
Finished
Upvotes: 3