Reputation: 11100
I'm often working with large data files which I need to access over the network (mostly via NFS, but sometimes also via CIFS). For performance reasons it would be good to cache these files on the local harddrive to minimize network use.
So basically I'm looking for a file-object which automatically takes care of the local caching, something along these lines:
import CachedFileObject as cfo
cfo.set_local_cache_dir("/tmp")
handle = cfo.open("/nfs/server1/bigdatafile.nc", "r") # copy file to /tmp, open the copy
# do stuff with the filehandle
del handle # delete the local copy
I really only need this for reading files. If there should be an easy way to also get/implement file creation (or even writing), that would be a bonus.
Any ideas are greatly appreciated
Upvotes: 1
Views: 5408
Reputation: 1767
There is a simple solution: copy the full file from the remote side on open, and — when the file was opened for writing — copy it back in full on close:
import os
import shutil
from tempfile import mkstemp
class CachedFileObject(object):
    """File-like wrapper that caches a remote file on the local disk.

    ``open()`` copies the remote file into ``cache_dir`` and opens the
    local copy; all further I/O (``read``, ``write``, ``seek``, ...) is
    delegated to that local file.  ``close()`` syncs the local copy back
    to the remote path when the file was opened writable ('w', 'a' or
    '+'), then deletes the local copy.  Also usable as a context manager.
    """

    def __init__(self, cache_dir="/tmp"):
        self.cache_dir = cache_dir      # where local cache copies are created
        self.local_file = None          # open file object for the local copy
        self.local_path = None          # path of the local copy
        self.remote_path = None         # original (remote) path
        self.mode = None                # mode the file was opened with

    def open(self, path, mode="r", buffering=-1):
        """Copy *path* into the cache dir and open the local copy.

        Raises ValueError if already open, or if *path* does not exist
        and read access was requested.  Returns ``self``.
        """
        if self.local_file and not self.local_file.closed:
            raise ValueError("Already open")
        fd, self.local_path = mkstemp(dir=self.cache_dir)
        # mkstemp gives us a low-level fd; we only need the unique path,
        # so close the fd and reopen with the caller's mode/buffering.
        os.close(fd)
        try:
            if "r" in mode and not os.path.exists(path):
                raise ValueError("No such remote file")
            if os.path.exists(path):
                # remote file exists: populate the local cache copy
                self._cache_remote(path, self.local_path)
            self.local_file = open(self.local_path, mode=mode, buffering=buffering)
            self.mode = mode
            self.remote_path = path
        except Exception:
            # don't leak the temp file when opening fails
            os.unlink(self.local_path)
            self.local_path = None
            raise
        return self

    def close(self):
        """Close the local copy, sync it back if writable, delete it.

        A no-op when the file was never opened or is already closed,
        so calling close() twice is safe.
        """
        if self.local_file is None:
            return
        self.local_file.close()
        try:
            if set("wa+").intersection(set(self.mode)):
                # had write access: push the cached copy back to the remote side
                self._sync_remote(self.remote_path, self.local_path)
        finally:
            os.unlink(self.local_path)
            # reset state so a later close() is a no-op and the object
            # can be reused for another open()
            self.local_file = None
            self.local_path = None

    def __enter__(self):
        # context-manager support: ``with CachedFileObject(...) as f: ...``
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # always clean up the local copy, even on exceptions
        self.close()
        return False

    def _cache_remote(self, remote_path, local_path):
        # simple cp: remote -> local cache
        shutil.copy(remote_path, local_path)

    def _sync_remote(self, remote_path, local_path):
        # simple cp: local cache -> remote
        shutil.copy(local_path, remote_path)

    def __getattr__(self, attr):
        # Delegate unknown attributes (read, write, seek, ...) to the
        # local file.  Look up local_file via __dict__ to avoid infinite
        # recursion when the attribute itself is missing (e.g. copy/pickle).
        local_file = self.__dict__.get("local_file")
        if local_file is not None:
            return getattr(local_file, attr)
        raise ValueError("File is not opened")
The created object behaves like a regular file object, and simply copies/syncs the data on open/close.
Usage:
f = CachedFileObject(cache_dir="/your/tmp/dir")
f.open("/path/to/remote/file")
# ..your f.read()'s here..
f.close()
Upvotes: 3
Reputation: 27734
I would use the operating system for file caching. NFS mounts can be set to cache with -o fsc
and SMB mounts already have some caching on by default.
Upvotes: 3