Reputation: 71
In my application (Windows 10, Delphi 10.4), I use TIdIMAP4
to retrieve an email's body from the server with this code:
var aBody : string := '';
UIDRetrieveTextPeek2(MsgID,aBody);
If the returned string starts with <!DOCTYPE html>'#$D#$A'<html lang="en" xmlns="http://www.w3.org/1999/xhtml" xmlns:o="urn:schemas-microsoft-com:...
or
<html><head><style type="text/css">'#$D#$A'@media screen and (max-width:480px) {'#$D#$A' .background_inner {'#$D#$A' padding: 0!important;'#$D#$A'....
the HTML content is well-formed, but when it starts with:
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.='#$D#$A'w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">=0A<html xmlns=3D"http://www.='#$D#$A'w3.org/1999/xhtml"><head><style type=3D"text/css" media=3D"all">=0A=09a:hov='#$D#$A'er {=09color: red;=09}=0A=09a {=0A=09=09text-decoration: underline;=0A=09='#$D#$A'=09color: #0088cc;=0A=09}....
The HTML is malformed (=0A=09a:hov='#$D#$A'er {=09color: red;=09}=0A=09a {=0A=09=09text-decoration: underline;=0A=09=
)
What can I do to fix it?
UPDATE
Here is a reproducible example:
unit Unit11;
interface
uses
Winapi.Windows, Winapi.Messages, System.SysUtils, System.Variants, System.Classes, Vcl.Graphics,
Vcl.Controls, Vcl.Forms, Vcl.Dialogs, Vcl.StdCtrls,
IdIMAP4,IdMessage,IdSSLOpenSSL,IdExplicitTLSClientServerBase, IdBaseComponent, IdIntercept, IdLogBase, IdLogFile;
type
{ Main form of the reproduction project. It declares a single button and the
  click handler that runs the IMAP retrieval test. }
TForm11 = class(TForm)
{ Button component; presumably wired to Button1Click in the .dfm (not visible here - confirm). }
Button1: TButton;
{ Click handler that connects to the IMAP server and fetches one message body. }
procedure Button1Click(Sender: TObject);
private
{ Private declarations }
public
{ Public declarations }
end;
var
Form11: TForm11;
implementation
{$R *.dfm}
{ Reproduction case: connects to Gmail over implicit-TLS IMAP, searches INBOX
  for all messages and, for the message whose UID is '16805', fetches its text
  body with UIDRetrieveTextPeek2 while logging the wire traffic to 'log.txt'.

  Sender is the control that raised the click; it is not used.

  The client, the SSL handler and the log interceptor are all released on every
  exit path, and the session is disconnected once it has been opened. }
procedure TForm11.Button1Click(Sender: TObject);
var
  SearchInfo: array of TIdIMAP4SearchRec;
  IMAP: TIdIMAP4;
  SSLHandler: TIdSSLIOHandlerSocketOpenSSL;
  LogFile: TIdLogFile;
  i: Integer;
  MsgID, aBody: string;
begin
  IMAP := TIdIMAP4.Create(nil);
  try
    // Owned by IMAP, so the handler is destroyed together with the client and
    // is never freed while uninitialised.
    SSLHandler := TIdSSLIOHandlerSocketOpenSSL.Create(IMAP);
    SSLHandler.SSLOptions.Method := sslvSSLv23;
    IMAP.IOHandler := SSLHandler;
    IMAP.AuthType := iatUserPass;
    IMAP.Host := 'imap.gmail.com';
    IMAP.Username := 'xxxxxxxxxx';
    IMAP.Password := 'yyyyyyyyyy';
    // Must be set after the SSL IOHandler has been assigned.
    IMAP.UseTLS := utUseImplicitTLS;

    if not IMAP.Connect(True) then
      Exit;
    try
      if not IMAP.SelectMailBox('INBOX') then
        Exit;

      SetLength(SearchInfo, 1);
      SearchInfo[0].SearchKey := skAll;
      if not IMAP.SearchMailBox(SearchInfo) then
        Exit;

      // High() of an empty result is -1, so the loop is skipped when nothing matched.
      for i := 0 to High(IMAP.MailBox.SearchResult) do
      begin
        MsgID := '';
        if not IMAP.GetUID(IMAP.MailBox.SearchResult[i], MsgID) then
          Continue;

        // Some bodies are unreadable, some are fine, and some (like this one)
        // come back with their quoted-printable encoding left undecoded.
        if MsgID <> '16805' then
          Continue;

        LogFile := TIdLogFile.Create(nil);
        try
          LogFile.Filename := 'log.txt';
          IMAP.Intercept := LogFile;
          LogFile.Active := True;
          aBody := '';
          // aBody receives the text under investigation; discard any partial
          // content if the retrieval reports failure.
          if not IMAP.UIDRetrieveTextPeek2(MsgID, aBody) then
            aBody := '';
        finally
          LogFile.Active := False;
          // Detach before freeing so the client never holds a stale interceptor.
          IMAP.Intercept := nil;
          LogFile.Free;
        end;

        // UIDs are unique within a mailbox; nothing more to look for.
        Break;
      end;
    finally
      IMAP.Disconnect;
    end;
  finally
    IMAP.Free;
  end;
end;
end.
And its captured log:
Sent 26/7/2021 10:09:45 ??: C55 UID FETCH 16805 (BODYSTRUCTURE)<EOL>
Recv 26/7/2021 10:09:45 ??: * 51 FETCH (UID 16805 BODYSTRUCTURE (("TEXT" "PLAIN" ("CHARSET" "utf-8") NIL NIL "8BIT" 760 16 NIL NIL NIL)("TEXT" "HTML" ("CHARSET" "utf-8") NIL NIL "QUOTED-PRINTABLE" 3962 80 NIL NIL NIL) "ALTERNATIVE" ("BOUNDARY" "----=_NextPart_000_0008_01D583AF.54009F20") NIL NIL))<EOL>
Recv 26/7/2021 10:09:45 ??: C55 OK Success<EOL>
Sent 26/7/2021 10:09:45 ??: C56 UID FETCH 16805 (BODY.PEEK[2])<EOL>
Recv 26/7/2021 10:09:46 ??: * 51 FETCH (UID 16805 BODY[2]
Recv 26/7/2021 10:09:46 ??: {3962}<EOL>
Recv 26/7/2021 10:09:46 ??: <html><EOL> <head><EOL> <style type=3D"text/css"><EOL> body, td, span, p, th { font-size: 11px; }<EOL> table.html-email {margin:10px auto;background:#fff;border:solid =<EOL>#dad8d8 1px;}<EOL> .html-email tr{border-bottom : 1px solid #eee;}<EOL> span.grey {color:#666;}<EOL> span.date {color:#666;font-size: 10px;}<EOL> a.default:link, a.default:hover, a.default:visited =<EOL>{color:#666;line-height:25px;background: #f2f2f2;margin: 10px ;padding: =<EOL>3px 8px 1px 8px;border: solid #CAC9C9 1px;border-radius: =<EOL>4px;-webkit-border-radius: 4px;-moz-border-radius: 4px;text-shadow: 1px =<EOL>1px 1px #f2f2f2;font-size: 12px;background-position: 0px 0px;display: =<EOL>inline-block;text-decoration: none;}<EOL> a.default:hover {color:#888;background: #f8f8f8;}<EOL> .cart-summary{ }<EOL> .html-email th { background: #ccc;margin: 0px;padding: 10px;}<EOL> .sectiontableentry2, .html-email th, .cart-summary th{ background: =<EOL>#ccc;margin: 0px;padding: 10px;}<EOL> .sectiontableentry1, .html-email td, .cart-summary td {background: =<EOL>#fff;margin: 0px;padding: 10px;}<EOL> </style><EOL><EOL> </head><EOL><EOL> <body style=3D"background: #F2F2F2;word-wrap: break-word;"><EOL> <div style=3D"background-color: #e6e6e6;" width=3D"100%"><EOL> <table style=3D"margin: auto;" cellpadding=3D"0" cellspacing=3D"0" =<EOL>><EOL> <tr><EOL> <td><EOL> <table width=3D"100%" border=3D"0" cellpadding=3D"0" =<EOL>cellspacing=3D"0" class=3D"html-email"><EOL> <tr><EOL> <td ><EOL><EOL> =CE=9A=CE=B1=CE=BB=CF=8E=CF=82 =<EOL>=CE=AE=CF=81=CE=B8=CE=B1=CF=84=CE=B5 =CF=83=CF=84=CE=BF =<EOL>=CE=91=CE=A1=CE=93=CE=A5=CE=A1=CE=A9 =<EOL>=CE=A0=CE=91=CE=A0=CE=91=CE=93=CE=95=CE=A9=CE=A1=CE=93=CE=99=CE=9F=CE=A5 =<EOL> <br /><EOL> </td><EOL> </tr><EOL> </table><EOL><EOL> <table class=3D"html-email" cellspacing=3D"0" cellpadding=3D"0" =<EOL>border=3D"0" width=3D"100%"><EOL> <tr><EOL> <th width=3D"100%"><EOL> =CE=A4=CE=B1 =CF=83=CF=84=CE=BF=CE=B9=CF=87=CE=B5=CE=AF=CE=B1 
=<EOL>=CF=84=CE=B7=CF=82 =CE=B5=CE=B3=CE=B3=CF=81=CE=B1=CF=86=CE=AE=CF=82 =<EOL>=CF=83=CE=B1=CF=82 </th><EOL><EOL> </tr><EOL> <tr><EOL> <td valign=3D"top" width=3D"100%"><EOL> =CE=8C=CE=BD=CE=BF=CE=BC=CE=B1 =<EOL>=CF=83=CF=8D=CE=BD=CE=B4=CE=B5=CF=83=CE=B7=CF=82dpap<br />=CE=A4=CE=BF =<EOL>=CF=8C=CE=BD=CE=BF=CE=BC=CE=B1 =CF=80=CE=BF=CF=85 =<EOL>=CE=B5=CE=BC=CF=86=CE=B1=CE=BD=CE=AF=CE=B6=CE=B5=CF=84=CE=B1=CE=B9: =<EOL>=CE=91=CE=A1=CE=93=CE=A5=CE=A1=CE=A9 =<EOL>=CE=A0=CE=91=CE=A0=CE=91=CE=93=CE=95=CE=A9=CE=A1=CE=93=CE=99=CE=9F=CE=A5<=<EOL>br />O =CE=BA=CF=89=CE=B4=CE=B9=CE=BA=CF=8C=CF=82 =<EOL>=CF=83=CE=B1=CF=82staran<br /><br />=CE=97 =<EOL>=CE=B4=CE=B9=CE=B5=CF=8D=CE=B8=CF=85=CE=BD=CF=83=CE=B7 =<EOL>=CF=83=CE=B1=CF=82: <br />E-Mail: [email protected]<br =<EOL>/>=CE=A0=CF=81=CE=BF=CE=B2=CE=B1=CE=BB=CE=BB=CF=8C=CE=BC=CE=B5=CE=BD=CE=BF=<EOL> =CF=8C=CE=BD=CE=BF=CE=BC=CE=B1: =CE=91=CE=A1=CE=93=CE=A5=CE=A1=CE=A9 =<EOL>=CE=A0=CE=91=CE=A0=CE=91=CE=93=CE=95=CE=A9=CE=A1=CE=93=CE=99=CE=9F=CE=A5<=<EOL>br />=CE=9F=CE=BD=CE=BF=CE=BC=CE=B1 =<EOL>=CE=B5=CF=84=CE=B1=CE=B9=CF=81=CE=AF=CE=B1=CF=82: =<EOL>=CE=91.=CE=94.=CE=A0=CE=91=CE=A0=CE=91=CE=93=CE=95=CE=A9=CE=A1=CE=93=CE=99=<EOL>=CE=9F=CE=A5<br />=CE=9F=CE=BD=CE=BF=CE=BC=CE=B1: =<EOL>=CE=91=CE=A1=CE=93=CE=A5=CE=A1=CE=A9<br =<EOL>/>=CE=95=CF=80=CE=AF=CE=B8=CE=B5=CF=84=CE=BF: =<EOL>=CE=A0=CE=91=CE=A0=CE=91=CE=93=CE=95=CE=A9=CE=A1=CE=93=CE=99=CE=9F=CE=A5<=<EOL>br />=CE=94=CE=B9=CE=B5=CF=8D=CE=B8=CF=85=CE=BD=CF=83=CE=B7 1: =<EOL>=CE=A6=CE=95=CE=99=CE=94=CE=99=CE=A0=CE=A0=CE=99=CE=94=CE=9F=CE=A5 2<br =<EOL>/>=CE=A4=CE=B1=CF=87. 
=CE=BA=CF=89=CE=B4=CE=B9=CE=BA=CF=8C=CF=82: =<EOL>32131<br />=CE=A0=CF=8C=CE=BB=CE=B7: =<EOL>=CE=9B=CE=99=CE=92=CE=91=CE=94=CE=95=CE=99=CE=91<br =<EOL>/>=CE=A7=CF=8E=CF=81=CE=B1: Greece<br />=CE=9D=CE=BF=CE=BC=CF=8C=CF=82 / =<EOL>=CE=A0=CE=B5=CF=81=CE=B9=CE=BF=CF=87=CE=AE: =<EOL>=CE=92=CE=9F=CE=99=CE=A9=CE=A4=CE=99=CE=91=CE=A3<br =<EOL>/>=CE=A4=CE=B7=CE=BB.: 2261089120<br />=CE=BA=CE=B9=CE=BD.: =<EOL>6974398860<br /> </td><EOL> </tr><EOL> </table><EOL> </td><EOL> </tr><EOL> </table><EOL> </div><EOL> </body><EOL></html><EOL><EOL>
Recv 26/7/2021 10:09:46 ??: )<EOL>
Recv 26/7/2021 10:09:46 ??: C56 OK Success<EOL>
Upvotes: 0
Views: 566
Reputation: 595827
TIdIMAP4.UIDRetrieveTextPeek2()
first retrieves the email's body structure, then scans it looking for the first text part that has a non-zero size reported. If none are found, the last text part is used. It then uses the chosen part's specified byte encoding and charset to decode the part's text for output.
At least, that is the theory, anyway.
In your log, the email in question has 2 text parts being reported by the IMAP server:
text/plain
, size 760, encoding "8BIT"
, charset "utf-8"
text/html
, size 3962, encoding "QUOTED-PRINTABLE"
, charset "utf-8"
However, in comments, you say that UIDRetrieveStructure()
(which UIDRetrieveTextPeek2()
uses internally) is actually reporting 3 parts instead (the multipart container plus the two text parts):
multipart/alternative
text/plain
, size 760, encoding "8BIT"
, charset "utf-8"
text/html
, size 3962, encoding "QUOTED-PRINTABLE"
, charset "utf-8"
Your log shows UIDRetrieveTextPeek2()
is retrieving BODY.PEEK[2]
, so it thinks it is requesting the content of the text/plain
part (which makes sense, since that is the first non-empty text part), but is actually requesting the content of the text/html
part instead. That will have to be fixed. I have opened a ticket for that:
#368 TIdIMAP4.InternalRetrieveText() does not retreive text correctly
Since the byte encoding of the text/plain
part is 8bit
, UIDRetrieveTextPeek2()
is not trying to decode the QP-encoded characters in the HTML, which explains why you are seeing them (=0A
, =3D
, =09
, etc) in the output string
.
Upvotes: 1