esh_08
esh_08

Reputation: 21

Showing HTML preview of .msg file

I'm attempting to display an HTML preview of a .msg file, using Java for my backend. However, when I use Apache POI to read the .msg file, I only receive the content as either plain text or RTF. The RTF does contain the HTML content, but I'm struggling to parse it back into HTML for the email preview on my website. I attempted to parse the RTF content using Apache Tika, but it was unsuccessful. Currently, I am using the method detailed in this Stack Overflow thread, but it does not parse all of the HTML tags. Any suggestions on how to resolve this issue?

Here is the code I am using:

    /**
     * Extracts the HTML that is encapsulated in an RTF body by applying a fixed
     * sequence of regular-expression rewrites to each line.
     *
     * <p>The input is split on runs of CR/LF. Every line has its
     * {@code \*\htmltag} groups unwrapped, its {@code \htmlrtf} regions and a few
     * other control words removed or replaced, its grouping braces dropped and its
     * {@code \\uN} / {@code \'hh} escapes turned into numeric character references.
     * Lines that end up containing only whitespace are discarded; the rest are
     * joined with {@code "\r\n"}.
     *
     * <p>This is a best-effort textual conversion, not an RTF parser: control words
     * that are not listed below are left in the output untouched.
     *
     * @param rtfText the RTF text to convert; may be {@code null}
     * @return the converted text starting at the first occurrence of {@code "<html"},
     *         or {@code null} when {@code rtfText} is {@code null} or the converted
     *         text contains no {@code "<html"}
     */
    public static String rtfToHtml(String rtfText)
    {
        if (rtfText == null)
        {
            return null;
        }

        StringBuilder sb = new StringBuilder();

        for (String line : rtfText.split("[\\r\\n]+"))
        {
            // The order of the rewrites matters: later rules rely on what earlier ones removed.
            String converted = line
                // Unwrap {\*\htmltagN ...} groups, keeping only their payload.
                .replaceAll("\\{\\\\\\*\\\\[m]?htmltag[\\d]*([^}]*)\\}", "$1")
                // Keep text between \htmlrtf0 ... \htmlrtf, drop text between \htmlrtf ... \htmlrtf0.
                .replaceAll("\\\\htmlrtf0([^\\\\]*)\\\\htmlrtf", "$1")
                .replaceAll("\\\\htmlrtf \\{(.*)\\}\\\\htmlrtf0", "$1")
                .replaceAll("\\\\htmlrtf (.*)\\\\htmlrtf0", "")
                .replaceAll("\\\\htmlrtf[0]?", "")
                // Hyperlink fields: drop the instruction part, keep the visible result text.
                .replaceAll("\\\\field\\{\\\\\\*\\\\fldinst\\{[^}]*\\}\\}", "")
                .replaceAll("\\{\\\\fldrslt\\\\cf1\\\\ul([^}]*)\\}", "$1")
                .replaceAll("\\\\htmlbase", "")
                // Matches the literal text \*\bkmkstart BM_ (the previous literal contained
                // the Java escape \b, i.e. a backspace character, and so never matched).
                .replaceAll("\\\\\\*\\\\bkmkstart BM_", "")
                // Layout control words -> plain whitespace.
                .replaceAll("\\\\par", "\n")
                .replaceAll("\\\\tab", "\t")
                .replaceAll("\\\\line", "\n")
                .replaceAll("\\\\page", "\n\n")
                .replaceAll("\\\\sect", "\n\n")
                // Typographic control words -> hexadecimal character references.
                .replaceAll("\\\\emdash", "&#x2014;")
                .replaceAll("\\\\endash", "&#x2013;")
                .replaceAll("\\\\emspace", "&#x2003;")
                .replaceAll("\\\\enspace", "&#x2002;")
                .replaceAll("\\\\qmspace", "&#x2005;")
                .replaceAll("\\\\bullet", "&#x2022;")
                .replaceAll("\\\\lquote", "&#x2018;")
                .replaceAll("\\\\rquote", "&#x2019;")
                .replaceAll("\\\\ldblquote", "&#x201C;")
                .replaceAll("\\\\rdblquote", "&#x201D;")
                // Table control words -> simple text separators.
                .replaceAll("\\\\row", "\n")
                .replaceAll("\\\\cell", "|")
                .replaceAll("\\\\nestcell", "|")
                // Drop unescaped grouping braces, then unescape \{ and \}.
                .replaceAll("([^\\\\])\\{", "$1")
                .replaceAll("([^\\\\])}", "$1")
                .replaceAll("[\\\\](\\{)", "$1")
                .replaceAll("[\\\\](})", "$1")
                // \\uN (decimal) and \'hh (hex) escapes -> numeric character references.
                .replaceAll("\\\\u([0-9]{2,5})", "&#$1;")
                .replaceAll("\\\\'([0-9A-Fa-f]{2})", "&#x$1;")
                // "cid:name@host" -> "name"
                .replaceAll("\"cid:(.*)@.*\"", "\"$1\"")
                .replaceAll(" {2,}", " ")
                // Second clean-up pass for any \htmlrtf markers still present.
                .replaceAll("\\\\htmlrtf[1]?(.*)\\\\htmlrtf0", "")
                .replaceAll("\\\\htmlrtf[01]?", "");

            // Skip lines that contain nothing but whitespace after conversion.
            if (!converted.replaceAll("\\s+", "").isEmpty())
            {
                sb.append(converted).append("\r\n");
            }
        }

        String html = sb.toString();

        // Everything before the opening <html tag is discarded.
        int index = html.indexOf("<html");
        return index != -1 ? html.substring(index) : null;
    }
// Open the .msg file, load it as a MAPIMessage and convert its RTF body.
// try-with-resources closes the stream even if loading or conversion throws.
try (InputStream inputStream = new FileInputStream(file))
{
    MAPIMessage msgMessage = new MAPIMessage(inputStream);
    return rtfToHtml(msgMessage.getRtfBody());
}

I also tried using Apache Tika, but it was not able to parse the content and returned empty HTML:

    /**
     * Parses the given RTF text with Tika's {@code AutoDetectParser} and returns
     * whatever the {@code BodyContentHandler} collected, as a string.
     *
     * <p>NOTE(review): {@code BodyContentHandler} appears to collect only the
     * character content of the document body, so the result is presumably plain
     * text rather than markup. If HTML output is required, a markup-producing
     * handler (e.g. Tika's {@code ToHTMLContentHandler}) is probably needed
     * instead. Confirm against the Tika documentation.
     *
     * @param rtfString the RTF text to parse; it is encoded as UTF-8 bytes before parsing
     * @return the string content collected by the handler
     * @throws TikaException if Tika fails to parse the input
     * @throws IOException if reading the input stream fails
     * @throws SAXException if the content handler reports an error
     */
    public String convertUsingApacheTika(String rtfString) throws TikaException, IOException, SAXException
    {
        // -1 is the write limit passed to the handler.
        BodyContentHandler handler = new BodyContentHandler(-1);
        Metadata metadata = new Metadata();
        AutoDetectParser parser = new AutoDetectParser();

        // Close the stream deterministically, even when parsing throws.
        try (InputStream stream = TikaInputStream.get(rtfString.getBytes(StandardCharsets.UTF_8)))
        {
            parser.parse(stream, handler, metadata);
        }

        return handler.toString();
    }

Upvotes: 1

Views: 138

Answers (0)

Related Questions