Reading .msg attachments from a .msg file

Question

I am trying to go through a large number of files in my folder and read them into a dataframe ready to do some NLP on them. Some are .pdf, .xlsx, .txt, etc. However, I am having the biggest issue with some documents which are .msg. I have been able to use the extract-msg package with moderate success, however it is really struggling to process emails which have attachments, specifically if those attachments happen to be emails themselves.

I have built this function, and it works for when an attachment is anything but an email, however it does not process the email that has an email attachment:

import os
import extract_msg
import pandas as pd
import pdfplumber
from docx import Document
import re
import glob

# Function to process emails
def process_msg(file_path, input_folder):
    try:
        msg = extract_msg.openMsg(file_path)
        email_content = {
            "sender": msg.sender,
            "subject": msg.subject,
            "received_time": msg.receivedTime,
            "body": msg.body or msg.htmlBody or "No content available"
        }
        
        attachments_data = [] 
        attachments_folder = os.path.join(input_folder, "attachments")
        os.makedirs(attachments_folder, exist_ok=True)
        
        for att in msg.attachments:
            att_name = get_unique_filename(sanitize_filename(att.name), input_folder)
            att_ext = re.search(r"\.(\w+)$", att_name)
            att_ext = att_ext.group(1).lower() if att_ext else None
            
            saved_path = os.path.join(attachments_folder, att_name)
            att.save(customPath=attachments_folder, customFilename=att_name)
            
            if att_ext == "msg":
                attachment_content  = process_msg(saved_path, input_folder)
                attachments_data.append({
                    "type": "email",
                    "name": att_name,
                    "content": attachment_content
                })

            # This specifically removes .png and .jpg files from the attachments folder.
            else:
                print(f"Unsupported attachment type: {att_name}")
                os.remove(saved_path)
                attachments_data.append({
                    "type": "unsupported",
                    "name": att_name,
                    "content": f"Unsupported file type: {att_ext}"
                })

        return {
            "email_content": email_content,
            "attachments": attachments_data
        }    
    
    except Exception as e:
        print(f"Error processing email {file_path}: {e}")
        return {
            "email_content": "Error processing email",
            "attachments": [],
            "error": str(e)
        }

Ultimately, I would like a table that resembles this:

Doc ID	Data	File path	Attachment	ID of Attachment
1	Email 1	Filepath 1	N
2	Word1	Filepath 2	Y	Email 1
3	PDF1	Filepath 3	N
4	Email 2	Filepath 4	Y	Email 1
5	Email 3	Filepath 5	Y	Email 2

Where Email 1 had 2 attachments, a Word document (word 1) and an email attachment (email 2). Also email 2 had an attachment (Email 3).

Reading .msg attachments from a .msg file

Answers (1)

Related Questions