Reputation:
I can now get a character's Unicode value, but I need to determine its charset from that value before calling
// Win32 GDI CreateFont prototype, as quoted from MSDN. The fdwCharSet
// argument is the value that has to be derived from the Unicode character.
HFONT CreateFont(
int nHeight, // height of font
int nWidth, // average character width
int nEscapement, // angle of escapement
int nOrientation, // base-line orientation angle
int fnWeight, // font weight
DWORD fdwItalic, // italic attribute option
DWORD fdwUnderline, // underline attribute option
DWORD fdwStrikeOut, // strikeout attribute option
DWORD fdwCharSet, // character set identifier (one of the predefined *_CHARSET values)
DWORD fdwOutputPrecision, // output precision
DWORD fdwClipPrecision, // clipping precision
DWORD fdwQuality, // output quality
DWORD fdwPitchAndFamily, // pitch and family
LPCTSTR lpszFace // typeface name
);
and I got the following information from MSDN:
fdwCharSet
[in] Specifies the character set. The following values are predefined:
ANSI_CHARSET
BALTIC_CHARSET
CHINESEBIG5_CHARSET
DEFAULT_CHARSET
EASTEUROPE_CHARSET
GB2312_CHARSET
GREEK_CHARSET
HANGUL_CHARSET
MAC_CHARSET
OEM_CHARSET
RUSSIAN_CHARSET
SHIFTJIS_CHARSET
SYMBOL_CHARSET
TURKISH_CHARSET
VIETNAMESE_CHARSET
Korean language edition of Windows:
JOHAB_CHARSET
Middle East language edition of Windows:
ARABIC_CHARSET
HEBREW_CHARSET
Thai language edition of Windows:
THAI_CHARSET
The OEM_CHARSET value specifies a character set that is operating-system dependent.
Windows 95/98/Me: You can use the DEFAULT_CHARSET value to allow the name and size of a font to fully describe the logical font. If the specified font name does not exist, a font from any character set can be substituted for the specified font, so you should use DEFAULT_CHARSET sparingly to avoid unexpected results.
Here is what I have now:
/*
 * Guess the GDI character-set identifier (the fdwCharSet argument of
 * CreateFont) that best covers a single Unicode code unit.
 *
 * word    The code unit to classify.
 * returns One of the *_CHARSET constants.
 *
 * Resolution order:
 *   1. Basic Latin / Latin-1 map to ANSI_CHARSET, so plain Western text is
 *      never reported as a CJK charset.
 *   2. On a CJK system (ANSI code page 932, 936, 949 or 950), Han ideographs
 *      and the CJK symbols shared by all four locales map to the charset of
 *      that code page.
 *   3. Otherwise the character is classified by script-specific ranges.
 *   4. Anything still unmatched falls back to GB2312_CHARSET.
 */
FX_INT32 CharSetFromUnicode(FX_WORD word)
{
    /* -1 means "the system code page is not a CJK one"; every *_CHARSET
     * constant is a non-negative value, so -1 cannot collide with them. */
    int acpCharset = -1;

    /* Basic Latin and Latin-1 Supplement: previously these matched no range
     * and fell through to the GB2312 fallback at the end of the function. */
    if (word <= 0x00FF) {
        return ANSI_CHARSET;
    }

    switch (GetACP()) {
    case 932: /* Japanese (Shift-JIS) */
        acpCharset = SHIFTJIS_CHARSET;
        break;
    case 936: /* Simplified Chinese (GBK) */
        acpCharset = GB2312_CHARSET;
        break;
    case 950: /* Traditional Chinese (Big5); was wrongly GB2312_CHARSET */
        acpCharset = CHINESEBIG5_CHARSET;
        break;
    case 949: /* Korean */
        acpCharset = HANGUL_CHARSET;
        break;
    default:
        break;
    }

    /* Han ideographs and CJK symbols are shared between the CJK locales, so
     * the system code page decides which charset they are rendered with.
     * NOTE(review): the two ranges above 0xFFFF can never match if FX_WORD
     * is a 16-bit type -- confirm against the FX_WORD typedef. */
    if (acpCharset != -1 &&
        ((word >= 0x2E80 && word <= 0x2EFF) ||   /* CJK Radicals Supplement */
         (word >= 0x3000 && word <= 0x303F) ||   /* CJK Symbols and Punctuation */
         (word >= 0x3200 && word <= 0x32FF) ||   /* Enclosed CJK Letters and Months */
         (word >= 0x3300 && word <= 0x33FF) ||   /* CJK Compatibility */
         (word >= 0x3400 && word <= 0x4DB5) ||   /* CJK Unified Ideographs Ext. A */
         (word >= 0x4E00 && word <= 0x9FFF) ||   /* CJK Unified Ideographs */
         (word >= 0xF900 && word <= 0xFAFF) ||   /* CJK Compatibility Ideographs */
         (word >= 0xFE30 && word <= 0xFE4F) ||   /* CJK Compatibility Forms */
         (word >= 0x20000 && word <= 0x2A6D6) || /* CJK Unified Ideographs Ext. B */
         (word >= 0x2F800 && word <= 0x2FA1F))) { /* CJK Compat. Ideographs Suppl. */
        return acpCharset;
    }

    /* Script-based classification, independent of the system code page. */
    if ((word >= 0x4E00 && word <= 0x9FA5) ||   /* CJK Unified Ideographs */
        (word >= 0xE7C7 && word <= 0xE7F3) ||   /* private-use code points */
        (word >= 0x3000 && word <= 0x303F) ||   /* CJK punctuation such as 《 》 。 、 */
        (word >= 0x2000 && word <= 0x206F)) {   /* General Punctuation */
        return GB2312_CHARSET;
    }

    if ((word >= 0x3040 && word <= 0x309F) ||   /* Hiragana */
        (word >= 0x30A0 && word <= 0x30FF) ||   /* Katakana */
        (word >= 0x31F0 && word <= 0x31FF) ||   /* Katakana Phonetic Extensions */
        (word >= 0xFF00 && word <= 0xFFEF)) {   /* Halfwidth and Fullwidth Forms */
        return SHIFTJIS_CHARSET;
    }

    if ((word >= 0xAC00 && word <= 0xD7AF) ||   /* Hangul Syllables */
        (word >= 0x1100 && word <= 0x11FF) ||   /* Hangul Jamo */
        (word >= 0x3130 && word <= 0x318F)) {   /* Hangul Compatibility Jamo */
        return HANGUL_CHARSET;
    }

    if (word >= 0x0E00 && word <= 0x0E7F) {
        return THAI_CHARSET;
    }

    if ((word >= 0x0370 && word <= 0x03FF) ||   /* Greek and Coptic */
        (word >= 0x1F00 && word <= 0x1FFF)) {   /* Greek Extended */
        return GREEK_CHARSET;
    }

    /* Arabic Presentation Forms-A and -B are two separate blocks; the single
     * range 0xFB50-0xFEFC used before also covered U+FE00-U+FE6F (variation
     * selectors, CJK Compatibility Forms, Small Form Variants). */
    if ((word >= 0x0600 && word <= 0x06FF) ||   /* Arabic */
        (word >= 0xFB50 && word <= 0xFDFF) ||   /* Arabic Presentation Forms-A */
        (word >= 0xFE70 && word <= 0xFEFC)) {   /* Arabic Presentation Forms-B */
        return ARABIC_CHARSET;
    }

    if (word >= 0x0590 && word <= 0x05FF) {
        return HEBREW_CHARSET;
    }

    if (word >= 0x0400 && word <= 0x04FF) {     /* Cyrillic */
        return RUSSIAN_CHARSET;
    }

    /* Turkish-specific letters must be tested before the Latin Extended
     * range below, which contains them. */
    if (word == 0x11E || word == 0x11F || word == 0x130 ||
        word == 0x131 || word == 0x15E || word == 0x15F) {
        return TURKISH_CHARSET;
    }

    if (word >= 0x0100 && word <= 0x024F) {     /* Latin Extended-A / -B */
        return EASTEUROPE_CHARSET;
    }

    if (word >= 0x1E00 && word <= 0x1EFF) {     /* Latin Extended Additional */
        return VIETNAMESE_CHARSET;
    }

    /* NOTE(review): fallback kept from the original for unmatched scripts;
     * DEFAULT_CHARSET may be a better choice -- confirm with callers. */
    return GB2312_CHARSET;
}
But the function doesn't work properly. Can anyone help me fix it?
Upvotes: 1
Views: 843
Reputation: 116377
In general, there is no guaranteed way to guess the encoding.
However, in practice one can guess. For example, there is a very good universal character set detection library created by Mozilla: uchardet.
It is used in Firefox to automatically guess charset of random pages you visit (which do not always provide proper encoding, if any), and it seems to work very well in practice.
Upvotes: 2