Reputation: 283
I have a sqlite database (UTF-8 encoding). And inside the database i have such record (hex bytes):
D0 9E D0 BA 29 F0 9F 92 8B F0 9F 92 8B
Thus the bytes are not aligned, that is:
$D09E = O
$D0BA = к
$29 = )
$F09F928B = U+1F48B (KISS MARK)
$F09F928B = U+1F48B (KISS MARK)
This is how it looks like in the SQLite manager:
But whatever i do (UTF8Decode and some custom transformations) i cannot make it to be displayed in my Unicode enabled TNTStringGrid.
Yes, i can get the Ok)?? displayed but then goes either Ок)рџ’‹рџ’‹ or this is how it looks like real (UTF8Encoded): РћРє)рџ’‹рџ’‹
I know it's a hard question, but there must be a solution because the SQLite manager displays it absolutely correct.
So how to display those U+ characters along with normal text?
Please help! 3 days and nights fighting with this task.
Upvotes: 0
Views: 2118
Reputation: 598279
Ok)??
means the UTF-8 data was correctly decoded to UTF-16, but then the UTF-16 data was converted to an Ansi codepage that does not support those Unicode characters.
Ок)рџ’‹рџ’‹
means you have raw 8-bit UTF-8 octets being stored as-is, extended into 16-bit values, not being decoded from UTF-8 to UTF-16 at all.
Upvotes: 0
Reputation: 9106
You are dealing with UTF "Surrogate pairs":
Using UTF-16, the value ranges $D800-$DBFF and $DC00-$DFFF are used to specify so-called surrogate pairs.
Using these surrogate pairs, we can map Unicode code points of $10000 and higher (in the range $10000 to $10FFFF).
This is done by subtracting $10000 from the value, leaving a value in the range 0 to $FFFFF, which can be represented in 20 bits.
These 20 bits are split into two groups of 10 bits each; the upper 10 bits are added to $D800 and the lower 10 bits are added to $DC00.
So for the Unicode code point $1D11E the UTF-16 surrogate pair is calculated as follows: first subtract $10000, which leaves $D11E,
which is 00001101000100011110 in 20 bits, split in $34 and $11E. $34 is added to $D800, and $11E is added to $DC00
resulting in $D834 for the most significant surrogate, and $DD1E for the least significant surrogate.
[Note that the Unicode code points $D800 to $DFFF will not be assigned a valid character by the Unicode standard (to avoid problems with UTF-16), so the individual surrogate characters are never mapped to actual characters themselves (but should always be used as a pair).]
See also http://en.wikipedia.org/wiki/UTF-16
To properly display surrogate pair characters you need a font that contains them. E.g. the musical symbols in the range U+1D100 – U+1D1FF (119040–119295) are supported by the Windows fonts Code2001, Euterpe, Free Serif, Musica, Quivira, Symbola (http://www.alanwood.net/unicode/fontsbyrange.html#u1d100)
You need to download and install the Musica font (formerly called Musical Symbols) on your system in order for this example to work. Download location e.g. http://users.teilar.gr/~g1951d/
[Install under Win7: Right-click on the ttf file and choose 'install']
[Test page: http://www.alanwood.net/unicode/musical_symbols.html]
Here is my sample Delphi XE2 test code that uses the above (You have D2007, but this may get you on your way).
unit uSurrogatePairs;
interface
uses
Winapi.Windows, Winapi.Messages, System.SysUtils, System.Variants, System.Classes, Vcl.Graphics,
Vcl.Controls, Vcl.Forms, Vcl.Dialogs, Vcl.StdCtrls;
type
// Demo form that shows how a single Unicode code point above U+FFFF is
// represented in UTF-16, UTF-8 and a code page 1252 AnsiString.
TFrmSurrogatePairs = class(TForm)
// Left memo: receives the test character itself (as UTF-16 text, then again
// after a round trip through UTF-8).
MmoCharacter: TMemo;
// Right memo: receives the explanatory log lines written by Log.
Mmo: TMemo;
// Runs the whole demonstration; see the implementation for the steps.
procedure FormShow(Sender: TObject);
private
// Appends one line of text to the Mmo log memo.
procedure Log(S: String);
public
{ Public declarations }
end;
var
FrmSurrogatePairs: TFrmSurrogatePairs;
implementation
{$R *.dfm}
type
// Distinct AnsiString type bound to code page 1252 (Windows Western European).
// FormShow uses it to show what happens when text is converted to a code page
// that cannot represent the test character.
TDanishString = type ansistring(1252);
procedure TFrmSurrogatePairs.FormShow(Sender: TObject);
// Walks the code point U+1D160 through three string types and checks each
// representation with Assert:
//   1. UTF-16 (string): two code units, the surrogate pair $D834 $DD60.
//   2. UTF-8 (utf8string): four bytes, $F0 $9D $85 $A0.
//   3. Code page 1252 (TDanishString): the two-character string '??'.
// The character is shown in MmoCharacter; the explanation is written to Mmo via Log.
// Code adapted from http://compaspascal.blogspot.nl/2008/10/delphi-2009-strings-explained-by.html
var
UTF16Str : string;
UTF8Str : utf8string;
DanishStr: TDanishString;
L : Integer;
begin
{ TODO -oJan -cShouldHave : Test if Musica font is installed }
// --- Step 1: UTF-16 ---------------------------------------------------------
// A character literal above $FFFF; the asserts below verify that it is stored
// as two UTF-16 code units.
UTF16Str:=#$1D160;
MmoCharacter.Text := UTF16Str;
L := length(UTF16Str);
Assert (L=2);
Log('Assigned: UTF16Str := #$1D160');
Log(' This is a musical note (000011101000101100000),');
log(' see http://unicode.org/charts/PDF/U1D100.pdf');
Log('Length(UTF16Str)=2');
Log(' This character occupies 2 positions in UTF-16');
// Bit patterns: 6-bit surrogate marker followed by 10 payload bits.
Assert (UTF16Str[1]=#$D834); // 110110 0000110100 First half of the symbol
Assert (UTF16Str[2]=#$DD60); // 110111 0101100000 Second half of the symbol
Log('UTF16Str[1]=#$D834');
Log('UTF16Str[2]=#$DD60');
// --- Step 2: UTF-8 ----------------------------------------------------------
// The explicit cast converts the UTF-16 text to UTF-8 (verified byte by byte below).
UTF8Str := utf8string(UTF16Str);
// Convert back to string and show it as a second line in the left memo.
MmoCharacter.Lines.Add(String(UTF8Str));
Log('');
Log('Assigned: UTF8Str := UTF16Str');
Log(' This is the second line (char) in the left memo');
L := Length(UTF8Str);
Assert (L=4);
Log('Length(UTF8Str)=4');
Log(' This character occupies 4 positions in UTF-8, each 1 byte');
// Bit patterns: UTF-8 lead/continuation marker followed by the payload bits.
Assert (UTF8Str[1]=#$F0); // 11110 000
Assert (UTF8Str[2]=#$9D); // 10 011101
Assert (UTF8Str[3]=#$85); // 10 000101
Assert (UTF8Str[4]=#$A0); // 10 100000
// --- Step 3: code page 1252 -------------------------------------------------
// The character has no code page 1252 equivalent, so the conversion is lossy.
DanishStr:=UTF16Str;
Assert (DanishStr='??'); // One character comes out as TWO '?' (presumably one per UTF-16 code unit)
Assert (length(DanishStr)=2);
// Same lossy result when the source is the UTF-8 string.
DanishStr:=UTF8Str;
Assert (DanishStr='??'); // Again two '?' for a single character
Assert (length(DanishStr)=2);
end;
procedure TFrmSurrogatePairs.Log(S: String);
// Writes S as a new line at the end of the Mmo log memo.
begin
  Mmo.Lines.Append(S);
end;
end.
and the DFM:
object FrmSurrogatePairs: TFrmSurrogatePairs
Left = 0
Top = 0
Caption = 'Surrogate pairs'
ClientHeight = 273
ClientWidth = 600
Color = clBtnFace
Font.Charset = DEFAULT_CHARSET
Font.Color = clWindowText
Font.Height = -11
Font.Name = 'Tahoma'
Font.Style = []
OldCreateOrder = False
OnShow = FormShow
PixelsPerInch = 96
TextHeight = 13
object MmoCharacter: TMemo
AlignWithMargins = True
Left = 3
Top = 3
Width = 134
Height = 267
Align = alLeft
Font.Charset = DEFAULT_CHARSET
Font.Color = clWindowText
Font.Height = -107
Font.Name = 'Musica'
Font.Style = []
ParentFont = False
ReadOnly = True
TabOrder = 0
end
object Mmo: TMemo
AlignWithMargins = True
Left = 143
Top = 3
Width = 454
Height = 267
Align = alClient
Lines.Strings = (
'')
ReadOnly = True
TabOrder = 1
end
end
Upvotes: 2