Murilo Sitonio
Murilo Sitonio

Reputation: 307

How to repair a PDF file that was transmitted with a wrong MIME type

I have a service A (Flask) that transmits a file to service B (Django) using Python's requests library.

from typing import TYPE_CHECKING
import magic

if TYPE_CHECKING:
    from werkzeug.datastructures import FileStorage

    from backendssc.access.viewer_context import ViewerContext


def multipartify(data: DictData, file: FileStorage) -> DictData:
    """Build a multipart payload from plain form fields plus one uploaded file.

    Each entry of ``data`` is wrapped as ``(None, value)``. The upload is
    stored under the ``"file"`` key as ``(filename, stream, mime_type)``,
    where the MIME type is sniffed from the first 2048 bytes of the upload.

    Sniffing reads from the very stream that is later handed over as the
    file body, so the stream is rewound to where it was before the read.
    Without the rewind, whoever consumes the stream afterwards starts at
    byte 2048 and the transmitted file is missing its head.

    Args:
        data: Mapping of form field names to their values.
        file: The uploaded file; its stream must support ``tell``/``seek``.

    Returns:
        A new dict containing the wrapped fields and the ``"file"`` entry.
    """
    converted = {key: (None, value) for key, value in data.items()}

    stream = file.stream
    start = stream.tell()
    try:
        head = stream.read(2048)
    finally:
        # Restore the read position so the full content is still available
        # to the consumer of ``stream``.
        stream.seek(start)

    converted["file"] = (  # type: ignore
        file.filename,
        stream,
        magic.from_buffer(head, mime=True),
    )
    return converted


@retry_send_request()
def send_request(
    method: str,
    url: str,
    queries: Optional[dict] = None,
    body: Optional[dict] = None,
    file: Optional[dict] = None,
    api_token: Optional[str] = None,
    jwt_token: Optional[str] = None,
    extra_headers: Optional[Dict[str, Any]] = None,
    tries: int = 1,
    exp_backoff: int = 1,
) -> Response:
    """Issue a single HTTP request and return the raw response.

    Headers come from ``attach_api_auth_data`` (fed with the URL, both
    tokens and any ``extra_headers`` as keyword arguments); ``queries`` are
    merged into the URL by ``add_url_params``. ``body`` is passed to
    ``requests`` as ``json=`` and ``file`` as ``files=``.

    ``tries`` and ``exp_backoff`` are not read inside this function.
    NOTE(review): presumably they are consumed by the
    ``retry_send_request`` decorator - confirm.
    """
    header_kwargs = extra_headers or {}
    headers = attach_api_auth_data(
        url=url,
        jwt_token=jwt_token,
        api_token=api_token,
        **header_kwargs,
    )

    # Only the URL actually sent carries the query string.
    target_url = add_url_params(url, queries or {})

    return requests.request(
        method=method,
        url=target_url,
        json=body,
        headers=headers,
        files=file,
    )


def create(
    self,
    *,
    body: EvidenceLockerCreateRequestBody,
    file: FileStorage,
    viewer_context: ViewerContext,
    raise_for_status: bool = False,
) -> Tuple[EvidenceLockerCreateApiResponse, bool]:
    """POST the request body and the uploaded file to ``self.url``.

    The form fields are ``body.as_dict(remove_null=True)`` extended with the
    viewer's ``ip_address`` and ``user_agent``. They are combined with
    ``file`` by ``multipartify`` and sent through ``send_request``,
    authenticated with the viewer's JWT.

    Args:
        body: Request model providing the form fields.
        file: The upload to transmit.
        viewer_context: Supplies ``ip_address``, ``user_agent`` and
            ``jwt_token``.
        raise_for_status: When true, call ``response.raise_for_status()``
            before decoding the response.

    Returns:
        A ``(decoded JSON body, response.ok)`` tuple.
    """
    form_fields = dict(body.as_dict(remove_null=True))
    form_fields["ip_address"] = viewer_context.ip_address
    form_fields["user_agent"] = viewer_context.user_agent

    multipart = multipartify(data=form_fields, file=file)

    reply = send_request(
        method="POST",
        url=self.url,
        file=multipart,
        jwt_token=viewer_context.jwt_token,
    )
    if raise_for_status:
        reply.raise_for_status()

    decoded = reply.json()
    return decoded, reply.ok

If I remove the MIME type from the file tuple, i.e. ...

converted["file"] = (file.filename, file.stream)

...then the file can be downloaded back from s3 without problems.

I'm trying to repair the files that were already transmitted with the MIME type, so that people don't need to re-upload them.

I've tried several libraries and tools, such as ConvertAPI, pikepdf, Ghostscript, and iLovePDF (and similar services), but have had no luck so far.

Any ideas?

I've tried to repair the files myself without success, and I'm looking for help with repairing them.

Upvotes: 0

Views: 46

Answers (0)

Related Questions