jim
jim

Reputation: 71

I get wrong characters when retrieving the message body of an email using TIdIMAP4.UIDRetrieveTextPeek2()

In my application (Windows 10, Delphi 10.4), I use TIdIMAP4 to retrieve an email's body from the server with this code:

var aBody : string := '';
UIDRetrieveTextPeek2(MsgID,aBody);

If the returned string starts with <!DOCTYPE html>'#$D#$A'<html lang="en" xmlns="http://www.w3.org/1999/xhtml" xmlns:o="urn:schemas-microsoft-com:...

or

<html><head><style type="text/css">'#$D#$A'@media screen and (max-width:480px) {'#$D#$A' .background_inner {'#$D#$A' padding: 0!important;'#$D#$A'....

then the HTML content is well-formed and I can use it as is. But when it starts with:

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.='#$D#$A'w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">=0A<html xmlns=3D"http://www.='#$D#$A'w3.org/1999/xhtml"><head><style type=3D"text/css" media=3D"all">=0A=09a:hov='#$D#$A'er {=09color: red;=09}=0A=09a {=0A=09=09text-decoration: underline;=0A=09='#$D#$A'=09color: #0088cc;=0A=09}....

The HTML is malformed (=0A=09a:hov='#$D#$A'er {=09color: red;=09}=0A=09a {=0A=09=09text-decoration: underline;=0A=09=)

What can I do to fix it?

UPDATE

Here is a reproducible example:

unit Unit11;

interface

uses
  Winapi.Windows, Winapi.Messages, System.SysUtils, System.Variants, System.Classes, Vcl.Graphics,
  Vcl.Controls, Vcl.Forms, Vcl.Dialogs, Vcl.StdCtrls,
  IdIMAP4,IdMessage,IdSSLOpenSSL,IdExplicitTLSClientServerBase, IdBaseComponent, IdIntercept, IdLogBase, IdLogFile;

type
  { Form of the reproduction project. It declares a single button and the
    Button1Click handler that performs the IMAP fetch.
    NOTE(review): Button1Click is presumably wired to Button1.OnClick in the
    .dfm, which is not shown here - confirm. }
  TForm11 = class(TForm)
    Button1: TButton;
    procedure Button1Click(Sender: TObject);
  private
    { Private declarations }
  public
    { Public declarations }
  end;

var
  Form11: TForm11;

implementation

{$R *.dfm}

{ Reproduces the body-retrieval problem against Gmail.

  Connects to imap.gmail.com with implicit TLS, selects INBOX, searches for
  all messages and, for the message whose UID is '16805', fetches the body
  text with UIDRetrieveTextPeek2 while logging the IMAP traffic to log.txt.
  The retrieved text ends up in aBody (inspect it in the debugger).

  Every object created here is released on all paths, and the connection is
  closed even if an exception is raised part-way through. }
procedure TForm11.Button1Click(Sender: TObject);
var
    SearchInfo: array of TIdIMAP4SearchRec;
    IMAP: TIdIMAP4;
    SSLHandler: TIdSSLIOHandlerSocketOpenSSL;
    LogFile: TIdLogFile;
    i: Integer;
    MsgID, aBody: string;
begin
    IMAP := TIdIMAP4.Create(nil);
    try
        // Owned by IMAP, so it is destroyed together with the client and can
        // never be freed while the client still references it.
        SSLHandler := TIdSSLIOHandlerSocketOpenSSL.Create(IMAP);
        SSLHandler.SSLOptions.Method := sslvSSLv23;

        // The IOHandler must be assigned before UseTLS is changed.
        IMAP.IOHandler := SSLHandler;
        IMAP.AuthType := iatUserPass;
        IMAP.Host := 'imap.gmail.com';
        IMAP.Username := 'xxxxxxxxxx';
        IMAP.Password := 'yyyyyyyyyy';
        IMAP.UseTLS := utUseImplicitTLS;

        if IMAP.Connect(True) then
        try
            IMAP.SelectMailBox('INBOX');

            SetLength(SearchInfo, 1);
            SearchInfo[0].SearchKey := skAll;

            if IMAP.SearchMailBox(SearchInfo)
            and (Length(IMAP.MailBox.SearchResult) > 0) then
            begin
                for i := 0 to High(IMAP.MailBox.SearchResult) do
                begin
                    MsgID := '';
                    IMAP.GetUID(IMAP.MailBox.SearchResult[i], MsgID);

                    // Some bodies are unreadable, some are OK, and some (like
                    // this one) come back as undecoded quoted-printable.
                    if MsgID = '16805' then
                    begin
                        LogFile := TIdLogFile.Create(nil);
                        try
                            LogFile.Filename := 'log.txt';
                            IMAP.Intercept := LogFile;
                            LogFile.Active := True;
                            IMAP.UIDRetrieveTextPeek2(MsgID, aBody);
                        finally
                            // Detach before freeing so the connection does not
                            // keep a reference to a destroyed intercept.
                            LogFile.Active := False;
                            IMAP.Intercept := nil;
                            LogFile.Free;
                        end;
                    end;
                end;
            end;
        finally
            IMAP.Disconnect;
        end;
    finally
        IMAP.Free;
    end;
end;

end.

And its captured log:

Sent 26/7/2021 10:09:45 ??: C55 UID FETCH 16805 (BODYSTRUCTURE)<EOL>
Recv 26/7/2021 10:09:45 ??: * 51 FETCH (UID 16805 BODYSTRUCTURE (("TEXT" "PLAIN" ("CHARSET" "utf-8") NIL NIL "8BIT" 760 16 NIL NIL NIL)("TEXT" "HTML" ("CHARSET" "utf-8") NIL NIL "QUOTED-PRINTABLE" 3962 80 NIL NIL NIL) "ALTERNATIVE" ("BOUNDARY" "----=_NextPart_000_0008_01D583AF.54009F20") NIL NIL))<EOL>
Recv 26/7/2021 10:09:45 ??: C55 OK Success<EOL>
Sent 26/7/2021 10:09:45 ??: C56 UID FETCH 16805 (BODY.PEEK[2])<EOL>
Recv 26/7/2021 10:09:46 ??: * 51 FETCH (UID 16805 BODY[2] 
Recv 26/7/2021 10:09:46 ??: {3962}<EOL>
Recv 26/7/2021 10:09:46 ??: <html><EOL>    <head><EOL>  <style type=3D"text/css"><EOL>            body, td, span, p, th { font-size: 11px; }<EOL>       table.html-email {margin:10px auto;background:#fff;border:solid =<EOL>#dad8d8 1px;}<EOL>        .html-email tr{border-bottom : 1px solid #eee;}<EOL>        span.grey {color:#666;}<EOL>        span.date {color:#666;font-size: 10px;}<EOL>        a.default:link, a.default:hover, a.default:visited =<EOL>{color:#666;line-height:25px;background: #f2f2f2;margin: 10px ;padding: =<EOL>3px 8px 1px 8px;border: solid #CAC9C9 1px;border-radius: =<EOL>4px;-webkit-border-radius: 4px;-moz-border-radius: 4px;text-shadow: 1px =<EOL>1px 1px #f2f2f2;font-size: 12px;background-position: 0px 0px;display: =<EOL>inline-block;text-decoration: none;}<EOL>       a.default:hover {color:#888;background: #f8f8f8;}<EOL>      .cart-summary{ }<EOL>       .html-email th { background: #ccc;margin: 0px;padding: 10px;}<EOL>      .sectiontableentry2, .html-email th, .cart-summary th{ background: =<EOL>#ccc;margin: 0px;padding: 10px;}<EOL>      .sectiontableentry1, .html-email td, .cart-summary td {background: =<EOL>#fff;margin: 0px;padding: 10px;}<EOL>  </style><EOL><EOL>    </head><EOL><EOL>    <body style=3D"background: #F2F2F2;word-wrap: break-word;"><EOL> <div style=3D"background-color: #e6e6e6;" width=3D"100%"><EOL>      <table style=3D"margin: auto;" cellpadding=3D"0" cellspacing=3D"0"  =<EOL>><EOL>        <tr><EOL>           <td><EOL>           <table width=3D"100%" border=3D"0" cellpadding=3D"0" =<EOL>cellspacing=3D"0" class=3D"html-email"><EOL>             <tr><EOL>               <td ><EOL><EOL>                 =CE=9A=CE=B1=CE=BB=CF=8E=CF=82 =<EOL>=CE=AE=CF=81=CE=B8=CE=B1=CF=84=CE=B5 =CF=83=CF=84=CE=BF =<EOL>=CE=91=CE=A1=CE=93=CE=A5=CE=A1=CE=A9 =<EOL>=CE=A0=CE=91=CE=A0=CE=91=CE=93=CE=95=CE=A9=CE=A1=CE=93=CE=99=CE=9F=CE=A5  =<EOL>              <br /><EOL>                                 </td><EOL>              </tr><EOL>          
</table><EOL><EOL>          <table class=3D"html-email" cellspacing=3D"0" cellpadding=3D"0" =<EOL>border=3D"0" width=3D"100%"><EOL>             <tr><EOL>               <th width=3D"100%"><EOL>                    =CE=A4=CE=B1 =CF=83=CF=84=CE=BF=CE=B9=CF=87=CE=B5=CE=AF=CE=B1 =<EOL>=CF=84=CE=B7=CF=82 =CE=B5=CE=B3=CE=B3=CF=81=CE=B1=CF=86=CE=AE=CF=82 =<EOL>=CF=83=CE=B1=CF=82                </th><EOL><EOL>             </tr><EOL>              <tr><EOL>               <td valign=3D"top" width=3D"100%"><EOL>                 =CE=8C=CE=BD=CE=BF=CE=BC=CE=B1 =<EOL>=CF=83=CF=8D=CE=BD=CE=B4=CE=B5=CF=83=CE=B7=CF=82dpap<br />=CE=A4=CE=BF =<EOL>=CF=8C=CE=BD=CE=BF=CE=BC=CE=B1 =CF=80=CE=BF=CF=85 =<EOL>=CE=B5=CE=BC=CF=86=CE=B1=CE=BD=CE=AF=CE=B6=CE=B5=CF=84=CE=B1=CE=B9: =<EOL>=CE=91=CE=A1=CE=93=CE=A5=CE=A1=CE=A9 =<EOL>=CE=A0=CE=91=CE=A0=CE=91=CE=93=CE=95=CE=A9=CE=A1=CE=93=CE=99=CE=9F=CE=A5<=<EOL>br />O =CE=BA=CF=89=CE=B4=CE=B9=CE=BA=CF=8C=CF=82 =<EOL>=CF=83=CE=B1=CF=82staran<br /><br />=CE=97 =<EOL>=CE=B4=CE=B9=CE=B5=CF=8D=CE=B8=CF=85=CE=BD=CF=83=CE=B7 =<EOL>=CF=83=CE=B1=CF=82: <br />E-Mail: [email protected]<br =<EOL>/>=CE=A0=CF=81=CE=BF=CE=B2=CE=B1=CE=BB=CE=BB=CF=8C=CE=BC=CE=B5=CE=BD=CE=BF=<EOL> =CF=8C=CE=BD=CE=BF=CE=BC=CE=B1: =CE=91=CE=A1=CE=93=CE=A5=CE=A1=CE=A9 =<EOL>=CE=A0=CE=91=CE=A0=CE=91=CE=93=CE=95=CE=A9=CE=A1=CE=93=CE=99=CE=9F=CE=A5<=<EOL>br />=CE=9F=CE=BD=CE=BF=CE=BC=CE=B1 =<EOL>=CE=B5=CF=84=CE=B1=CE=B9=CF=81=CE=AF=CE=B1=CF=82: =<EOL>=CE=91.=CE=94.=CE=A0=CE=91=CE=A0=CE=91=CE=93=CE=95=CE=A9=CE=A1=CE=93=CE=99=<EOL>=CE=9F=CE=A5<br />=CE=9F=CE=BD=CE=BF=CE=BC=CE=B1: =<EOL>=CE=91=CE=A1=CE=93=CE=A5=CE=A1=CE=A9<br =<EOL>/>=CE=95=CF=80=CE=AF=CE=B8=CE=B5=CF=84=CE=BF: =<EOL>=CE=A0=CE=91=CE=A0=CE=91=CE=93=CE=95=CE=A9=CE=A1=CE=93=CE=99=CE=9F=CE=A5<=<EOL>br />=CE=94=CE=B9=CE=B5=CF=8D=CE=B8=CF=85=CE=BD=CF=83=CE=B7 1: =<EOL>=CE=A6=CE=95=CE=99=CE=94=CE=99=CE=A0=CE=A0=CE=99=CE=94=CE=9F=CE=A5 2<br =<EOL>/>=CE=A4=CE=B1=CF=87. 
=CE=BA=CF=89=CE=B4=CE=B9=CE=BA=CF=8C=CF=82: =<EOL>32131<br />=CE=A0=CF=8C=CE=BB=CE=B7: =<EOL>=CE=9B=CE=99=CE=92=CE=91=CE=94=CE=95=CE=99=CE=91<br =<EOL>/>=CE=A7=CF=8E=CF=81=CE=B1: Greece<br />=CE=9D=CE=BF=CE=BC=CF=8C=CF=82 / =<EOL>=CE=A0=CE=B5=CF=81=CE=B9=CE=BF=CF=87=CE=AE: =<EOL>=CE=92=CE=9F=CE=99=CE=A9=CE=A4=CE=99=CE=91=CE=A3<br =<EOL>/>=CE=A4=CE=B7=CE=BB.: 2261089120<br />=CE=BA=CE=B9=CE=BD.: =<EOL>6974398860<br />               </td><EOL>              </tr><EOL>          </table><EOL>           </td><EOL>      </tr><EOL>      </table><EOL>   </div><EOL>    </body><EOL></html><EOL><EOL>
Recv 26/7/2021 10:09:46 ??: )<EOL>
Recv 26/7/2021 10:09:46 ??: C56 OK Success<EOL>

Upvotes: 0

Views: 566

Answers (1)

Remy Lebeau
Remy Lebeau

Reputation: 595827

TIdIMAP4.UIDRetrieveTextPeek2() first retrieves the email's body structure, then scans it looking for the first text part that has a non-zero size reported. If none are found, the last text part is used. It then uses the chosen part's specified byte encoding and charset to decode the part's text for output.

At least, that is the theory, anyway.

In your log, the email in question has 2 text parts being reported by the IMAP server:

  • text/plain, size 760, encoding "8BIT", charset "utf-8"
  • text/html, size 3962, encoding "QUOTED-PRINTABLE", charset "utf-8"

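However, in comments, you say that UIDRetrieveStructure() (which UIDRetrieveTextPeek2() uses internally) is actually reporting 3 parts instead (the multipart/alternative container is listed as a part ahead of the two text parts, which shifts their positions by one):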

  • multipart/alternative
  • text/plain, size 760, encoding "8BIT", charset "utf-8"
  • text/html, size 3962, encoding "QUOTED-PRINTABLE", charset "utf-8"

Your log shows UIDRetrieveTextPeek2() is retrieving BODY.PEEK[2], so it thinks it is requesting the content of the text/plain part (which makes sense, since that is the first non-empty text part), but is actually requesting the content of the text/html part instead. That will have to be fixed. I have opened a ticket for that:

#368 TIdIMAP4.InternalRetrieveText() does not retreive text correctly

Since the byte encoding of the text/plain part is 8bit, UIDRetrieveTextPeek2() is not trying to decode the QP-encoded characters in the HTML, which explains why you are seeing them (=0A, =3D, =09, etc) in the output string.

Upvotes: 1

Related Questions