lalatnayak
lalatnayak

Reputation: 170

Creating a tar stream in memory from multiple file byte streams

I'm trying to create a tar stream in memory, add files to it, and then save it to S3. But there is an issue: the files inside the tar have zero size. Can anyone please advise? Code snippet below:

def tar_and_upload(bucket, keys, dest_bucket):
    """Bundle S3 objects into an in-memory .tar.gz and upload it to S3.

    Args:
        bucket: Bucket the source objects are read from.
        keys: Iterable of object keys to fetch from ``bucket``.
        dest_bucket: Bucket the finished archive is written to.

    Returns:
        None. A failure during the upload is logged and swallowed; errors
        raised while downloading or archiving are not caught here.

    NOTE(review): this is the version that produces zero-size members. The
    TarInfo is built from a name only (its size is never set) and addfile()
    is handed raw bytes instead of a file-like object.
    """
    s3 = boto3.client('s3')
    # The whole archive is accumulated in memory, not on disk.
    file_obj = io.BytesIO()
    tar_file_obj = tarfile.open(mode = "w:gz", fileobj=file_obj)    
    response = {}  # never used below
    for key in keys:
        obj = s3.get_object(Bucket=bucket, Key=key)
        _bytes = obj["Body"].read()
        # Only the last path segment of the key is kept as the member name.
        _file_name = key.split("/")[-1]
        # Size is left unset and the payload is bytes, not a file object.
        tar_file_obj.addfile(tarfile.TarInfo(_file_name), _bytes)
    # Close before reading the buffer so the archive is complete.
    tar_file_obj.close()
    try:
        # Random UUID as the object name.
        obj_name = "{}.tar.gz".format(str(uuid.uuid4()))
        s3.put_object(Body=file_obj.getvalue(), Bucket=dest_bucket, Key=obj_name)
    except Exception as e:
        logging.error("Can't save tar to S3", exc_info=True)
        return

Upvotes: 0

Views: 1318

Answers (2)

Mohammad Hanan
Mohammad Hanan

Reputation: 21

def tar_and_upload(bucket, keys, dest_bucket):
    """Bundle S3 objects into an in-memory .tar.gz and upload it to S3.

    Each object is downloaded in full, added to the archive under the last
    path segment of its key, and stamped with the object's last-modified
    time. The archive is then uploaded under a random ``<uuid4>.tar.gz`` key.

    Args:
        bucket: Bucket the source objects are read from.
        keys: Iterable of object keys to fetch from ``bucket``.
        dest_bucket: Bucket the finished archive is written to.

    Returns:
        None. A failure during the upload is logged and swallowed; errors
        raised while downloading or archiving propagate to the caller.
    """
    s3 = boto3.client('s3')
    file_obj = io.BytesIO()
    # The context manager closes the archive on success and releases the
    # gzip writer if a download raises part-way through.
    with tarfile.open(mode="w:gz", fileobj=file_obj) as tar_file_obj:
        for key in keys:
            obj = s3.get_object(Bucket=bucket, Key=key)
            _bytes = obj["Body"].read()
            info = tarfile.TarInfo(key.split("/")[-1])
            # addfile() copies exactly info.size bytes, so it must be set;
            # use the length actually read rather than trusting a header.
            info.size = len(_bytes)
            # The GetObject response already carries LastModified, so no
            # extra HeadObject round trip is needed per key.
            info.mtime = obj["LastModified"].timestamp()
            tar_file_obj.addfile(info, io.BytesIO(_bytes))
    obj_name = "{}.tar.gz".format(str(uuid.uuid4()))
    try:
        s3.put_object(Body=file_obj.getvalue(), Bucket=dest_bucket, Key=obj_name)
    except Exception:
        # Best-effort upload: record the traceback and return quietly.
        logging.error("Can't save tar to S3", exc_info=True)

For others looking to do the same with S3 objects.

Upvotes: 1

lalatnayak
lalatnayak

Reputation: 170

Okay, apparently when adding byte streams to a tar archive, we need to explicitly specify the size. Sample code:

import tarfile
import uuid
import io
import os

def tar_and_upload(src_dir="images", dest_dir="temp"):
    """Pack the regular files of a directory into a .tar.gz written to disk.

    The archive is built in memory and then saved under a random
    ``<uuid4>.tar.gz`` name inside ``dest_dir``. The name of each archived
    file and the name of the resulting archive are printed.

    Args:
        src_dir: Directory whose files are archived (not recursive).
        dest_dir: Directory the archive is written to; created if missing.

    Returns:
        None. A failure while writing the archive is printed, not raised.
    """
    file_obj = io.BytesIO()
    # The context manager closes the archive even if reading a file fails.
    with tarfile.open(mode="w:gz", fileobj=file_obj) as tar_file_obj:
        # Sorted so the member order does not depend on the filesystem.
        for filename in sorted(os.listdir(src_dir)):
            file_path = os.path.join(src_dir, filename)
            # open() raises on directories, so only regular files are added.
            if not os.path.isfile(file_path):
                continue
            print(filename)
            with open(file_path, "rb") as f:
                _bytes = f.read()
            tar_info = tarfile.TarInfo(filename)
            # addfile() copies exactly tar_info.size bytes; the default of 0
            # would produce empty members.
            tar_info.size = len(_bytes)
            tar_file_obj.addfile(tar_info, io.BytesIO(_bytes))
    obj_name = "{}.tar.gz".format(str(uuid.uuid4()))
    object_path = os.path.join(dest_dir, obj_name)
    try:
        os.makedirs(dest_dir, exist_ok=True)
        with open(object_path, "wb") as f:
            f.write(file_obj.getvalue())
        print(obj_name)
    except OSError as e:
        print(str(e))

# Build the archive only when this file is run as a script, not on import.
if __name__ == "__main__":
    tar_and_upload()

Upvotes: 0

Related Questions