Reputation: 1595
I am trying to read some text fields from an IFC format file, and all of them seem to be hexadecimal (the decoded result may be some Russian text).
I found several posts describing how to convert hexadecimal to ASCII, and also an answer on how to convert from hexadecimal to a Unicode string. Here is the code that was given:
/// <summary>
/// Turns a string of hexadecimal digit pairs into bytes and decodes those bytes
/// with <see cref="Encoding.Unicode"/> (UTF-16, little-endian).
/// </summary>
/// <param name="hexString">Hex digits, two per byte. A trailing unpaired digit is ignored.</param>
/// <returns>The text obtained by reading the bytes as little-endian UTF-16.</returns>
public static string FromHexString(string hexString)
{
    // Integer division: an odd final digit does not get a byte of its own.
    int byteCount = hexString.Length / 2;
    byte[] decoded = new byte[byteCount];

    for (int index = 0, offset = 0; index < byteCount; index++, offset += 2)
    {
        string pair = hexString.Substring(offset, 2);
        decoded[index] = Convert.ToByte(pair, 16);
    }

    // Low byte first: "4800" yields 'H' (U+0048), whereas "0048" yields U+4800.
    return Encoding.Unicode.GetString(decoded);
}
But it is not working for me. Using this function, here is the result I got:
The original string looks like this: \X2\0420043504310440043E\X0\
. I don't know what the \X2\
and \X0\
markers mean, but I guess (wrongly?) that they are specific to the IFC format and define the encoding.
Upvotes: 1
Views: 549
Reputation: 111940
The hex digits are big-endian UTF-16 code units, so decode the bytes with Big Endian Unicode:
return Encoding.BigEndianUnicode.GetString(bytes);
Tested with your string and with a test case from http://www.steptools.com/stds/step/IS_final_p21e3.html
Note that the whole IFC format is quite complex. It would take me at least 2-4 hours to write a full decoder supporting the various \S\(something)
, \P(something)\
, \X2\(hex)
, \X4\(hex)
, \X\(hex)
(plus the ending \X0\
). There is even a problem in the documentation about the \X4\
examples (that are given with 7 hex digits instead of 8 hex digits), and it seems that the whole file should be UTF-8 encoded outside the escape sequences.
Aaaaand done:
Some tests:
// .NET Core / .NET 5.0: add the NuGet package
// https://www.nuget.org/packages/System.Text.Encoding.CodePages/
// and enable this line:
//Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
// .NET Framework: nothing extra is needed.

// The string from the question.
string strExampleUnquoted = ItfStringDecoder.DecodeItfString(@"\X2\0420043504310440043E\X0\");

// Sample payloads, written without the enclosing apostrophes.
string[] samples =
{
    @"CAT",                              // plain text
    @"Don''t",                           // doubled apostrophe inside text
    @"''",                               // only a doubled apostrophe
    @"",                                 // empty payload
    @"\S\Drger",                         // \S\ at the start
    @"h\S\ttel",                         // \S\ in the middle
    @"\PE\\S\*\S\U\S\b",                 // \PE\ alphabet switch followed by three \S\
    @"\X2\03C0\X0\",                     // one 4-digit group
    @"\X2\03B103B203B3\X0\",             // three 4-digit groups
    @"\X4\0001F600\X0\",                 // one 8-digit group
    @"\X4\0001F6000001F638\X0\",         // two 8-digit groups
    @"see \X\A7 4.1",                    // \X\ with two hex digits
    @"line one\X\0Aline two",            // \X\ producing a control character
};

// Pass 1: decode every payload as-is (quoted = false, the default).
string[] unquotedResults = new string[samples.Length];
for (int n = 0; n < samples.Length; n++)
{
    unquotedResults[n] = ItfStringDecoder.DecodeItfString(samples[n]);
}

// Pass 2: the same payloads wrapped in apostrophes, decoded with quoted = true.
string[] quotedResults = new string[samples.Length];
for (int n = 0; n < samples.Length; n++)
{
    quotedResults[n] = ItfStringDecoder.DecodeItfString("'" + samples[n] + "'", true);
}
And the decoder:
/// <summary>
/// Decodes the escape directives found in IFC string values: doubled apostrophes
/// (<c>''</c>), doubled backslashes (<c>\\</c>), <c>\S\</c>, <c>\P?\</c>, <c>\X\</c>,
/// <c>\X2\...\X0\</c> and <c>\X4\...\X0\</c>.
/// </summary>
/// <remarks>
/// A <c>\P?\</c> directive other than <c>\PA\</c> looks up code pages 28592..28599 through
/// <see cref="Encoding.GetEncoding(int)"/>. On .NET Core / .NET 5+ that requires
/// <c>Encoding.RegisterProvider(CodePagesEncodingProvider.Instance)</c> to have been called.
/// </remarks>
public class ItfStringDecoder
{
    /// <summary>Code page of ISO 8859-1; ISO 8859-n is this value plus (n - 1).</summary>
    private const int Iso88591CodePage = 28591;

    /// <summary>
    /// Interprets <paramref name="bytes"/> as UTF-8 text and decodes it with
    /// <see cref="DecodeItfString(string, bool)"/>.
    /// </summary>
    /// <param name="bytes">UTF-8 encoded string value.</param>
    /// <param name="quoted">true = 'XYZ', false = XYZ</param>
    /// <returns>The decoded text.</returns>
    /// <exception cref="FormatException">The text contains a malformed escape directive.</exception>
    public static string DecodeItfString(byte[] bytes, bool quoted = false)
    {
        return DecodeItfString(Encoding.UTF8.GetString(bytes), quoted);
    }

    /// <summary>
    /// Decodes every escape directive in <paramref name="str"/> and returns the plain text.
    /// </summary>
    /// <param name="str">The encoded string value.</param>
    /// <param name="quoted">true = 'XYZ', false = XYZ</param>
    /// <returns>The decoded text.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
    /// <exception cref="FormatException">
    /// The enclosing apostrophes are missing (when <paramref name="quoted"/> is true), an
    /// apostrophe is not doubled, or an escape directive is malformed.
    /// </exception>
    public static string DecodeItfString(string str, bool quoted = false)
    {
        if (str == null)
        {
            throw new ArgumentNullException(nameof(str));
        }

        // null stands for iso-8859-1, which is in effect until a \P?\ directive changes it.
        Encoding encoding = null;

        int start = 0;
        int end = str.Length - 1;

        if (quoted)
        {
            if (str.Length == 0 || str[0] != '\'')
            {
                throw new FormatException("Malformed string, non starting with \"'\"");
            }

            // At least two characters are required: a lone "'" must not count as both
            // the opening and the closing delimiter.
            if (str.Length < 2 || str[str.Length - 1] != '\'')
            {
                throw new FormatException("Malformed string, non ending with \"'\"");
            }

            start = 1;
            end = str.Length - 2;
        }

        var sb = new StringBuilder();

        for (int i = start; i <= end; i++)
        {
            char ch0 = str[i];

            if (ch0 == '\'')
            {
                // Inside the payload an apostrophe is only legal when doubled.
                if (i + 1 > end || str[i + 1] != '\'')
                {
                    throw new FormatException($"Malformed string, \"'\" not followed by \"'\" at position {i}");
                }

                sb.Append('\'');
                i++;
            }
            else if (ch0 == '\\')
            {
                if (i + 1 > end)
                {
                    throw new FormatException($"Malformed string, \"\\\" not followed by legal character at position {i}");
                }

                // Each helper returns how many characters after str[i] it consumed;
                // the loop's own i++ then steps past the last one.
                switch (str[i + 1])
                {
                    case '\\':
                        sb.Append('\\');
                        i++;
                        break;
                    case 'S':
                        i += DecodeItfStringPage(str, i, end, sb, encoding);
                        break;
                    case 'P':
                        i += DecodeItfStringAlphabet(str, i, end, out encoding);
                        break;
                    case 'X':
                        i += DecodeItfStringExtendedOrArbitary(str, i, end, sb);
                        break;
                    default:
                        throw new FormatException($"Malformed string, \"\\\" followed by illegal character at position {i}");
                }
            }
            else
            {
                sb.Append(ch0);
            }
        }

        return sb.ToString();
    }

    /// <summary>
    /// Handles <c>\S\c</c>: appends the character whose byte value is <c>c + 128</c> in the
    /// current alphabet (<paramref name="encoding"/>, or iso-8859-1 when it is null).
    /// </summary>
    /// <returns>The number of characters consumed after <c>str[i]</c> (always 3).</returns>
    private static int DecodeItfStringPage(string str, int i, int end, StringBuilder sb, Encoding encoding)
    {
        if (i + 3 > end || str[i + 2] != '\\')
        {
            throw new FormatException($"Malformed string, \"\\S\" not followed by legal character at position {i}");
        }

        char ch3 = str[i + 3];

        // Legal operands are exactly the printable ASCII characters (space through '~').
        if (ch3 < ' ' || ch3 > '~')
        {
            throw new FormatException($"Malformed string, \"\\S\" not followed by legal character at position {i}");
        }

        byte upperHalf = (byte)(ch3 + 128);

        if (encoding == null)
        {
            // The iso-8859-1 encoding maps 1:1 with the first 256 unicode codepoints
            sb.Append((char)upperHalf);
        }
        else
        {
            // Stack-allocated single-byte buffer, no array allocation.
            ReadOnlySpan<byte> bytes = stackalloc byte[] { upperHalf };
            sb.Append(encoding.GetString(bytes));
        }

        return 3;
    }

    /// <summary>
    /// Handles <c>\P?\</c> where <c>?</c> is 'A'..'I': selects iso-8859-1 .. iso-8859-9 as the
    /// alphabet used by later <c>\S\</c> directives.
    /// </summary>
    /// <param name="encoding">Receives the selected encoding; null means iso-8859-1.</param>
    /// <returns>The number of characters consumed after <c>str[i]</c> (always 3).</returns>
    private static int DecodeItfStringAlphabet(string str, int i, int end, out Encoding encoding)
    {
        if (i + 3 > end || str[i + 3] != '\\')
        {
            throw new FormatException($"Malformed string, \"\\P\" not followed by legal character at position {i}");
        }

        char ch2 = str[i + 2];

        if (ch2 < 'A' || ch2 > 'I')
        {
            throw new FormatException($"Malformed string, \"\\P\" not followed by legal character at position {i}");
        }

        int ix = ch2 - 'A';

        // We don't need an encoder for iso-8859-1
        encoding = ix == 0 ? null : Encoding.GetEncoding(Iso88591CodePage + ix);

        return 3;
    }

    /// <summary>
    /// Handles the three <c>\X</c> forms: <c>\X\hh</c> (one byte, appended as the character with
    /// that code), <c>\X2\...\X0\</c> (groups of four hex digits) and <c>\X4\...\X0\</c>
    /// (groups of eight hex digits).
    /// </summary>
    /// <returns>The number of characters consumed after <c>str[i]</c>.</returns>
    private static int DecodeItfStringExtendedOrArbitary(string str, int i, int end, StringBuilder sb)
    {
        // Even the shortest form, \X\hh, needs four characters after the leading backslash.
        if (i + 4 > end)
        {
            throw new FormatException($"Malformed string, \"\\X\" not followed by legal character at position {i}");
        }

        switch (str[i + 2])
        {
            case '\\':
                byte hi, lo;
                if (!TryFromHex(str[i + 3], out hi) || !TryFromHex(str[i + 4], out lo))
                {
                    throw new FormatException($"Malformed string, \"\\X\\\" not followed by legal character at position {i}");
                }

                sb.Append((char)(byte)(hi * 16 + lo));
                return 4;

            case '2':
                return DecodeHexRun(str, i, end, sb, false);

            case '4':
                return DecodeHexRun(str, i, end, sb, true);

            default:
                throw new FormatException($"Malformed string, \"\\X\" not followed by legal character at position {i}");
        }
    }

    /// <summary>
    /// Decodes the body of an <c>\X2\</c> or <c>\X4\</c> directive up to and including its
    /// <c>\X0\</c> terminator. The caller has verified that <c>str[i + 4]</c> exists.
    /// </summary>
    /// <param name="wide">false: 4 hex digits per UTF-16 code unit; true: 8 hex digits per code point.</param>
    /// <returns>The number of characters consumed after <c>str[i]</c>.</returns>
    private static int DecodeHexRun(string str, int i, int end, StringBuilder sb, bool wide)
    {
        string name = wide ? "X4" : "X2";

        if (str[i + 3] != '\\')
        {
            throw new FormatException($"Malformed string, \"\\{name}\" not followed by legal character at position {i}");
        }

        int groupLength = wide ? 8 : 4;
        int j = i + 4;

        while (true)
        {
            if (j + groupLength - 1 > end)
            {
                throw new FormatException($"Malformed string, \"\\{name}\" not followed by legal sequence of characters at position {j}");
            }

            int groupStart = j;
            int high = 0;

            if (wide)
            {
                if (!TryReadHex16(str, j, out high))
                {
                    throw new FormatException($"Malformed string, \"\\{name}\\\" not followed by legal character at position {j}");
                }

                j += 4;
            }

            int low;
            if (!TryReadHex16(str, j, out low))
            {
                throw new FormatException($"Malformed string, \"\\{name}\\\" not followed by legal character at position {j}");
            }

            j += 4;

            if (wide)
            {
                // char.ConvertFromUtf32 rejects values above U+10FFFF and surrogates with an
                // ArgumentOutOfRangeException; report them as malformed input instead.
                bool outOfRange = high > 0x10;
                bool surrogate = high == 0 && low >= 0xD800 && low <= 0xDFFF;
                if (outOfRange || surrogate)
                {
                    throw new FormatException($"Malformed string, \"\\{name}\\\" followed by invalid Unicode code point at position {groupStart}");
                }

                sb.Append(char.ConvertFromUtf32((high << 16) | low));
            }
            else
            {
                sb.Append((char)low);
            }

            // After every group there must be room for either the 4-character
            // terminator or at least the start of another group.
            if (j + 3 > end)
            {
                throw new FormatException($"Malformed string, \"\\{name}\" not followed by legal sequence of characters at position {j}");
            }

            if (str[j] == '\\')
            {
                if (str[j + 1] == 'X' && str[j + 2] == '0' && str[j + 3] == '\\')
                {
                    // j + 3 is the terminator's closing backslash.
                    return j + 3 - i;
                }

                throw new FormatException($"Malformed string, \"\\{name}\" not followed by legal sequence of characters at position {j}");
            }
        }
    }

    /// <summary>
    /// Reads four hex digits starting at <paramref name="index"/> as one 16-bit value.
    /// The caller guarantees that <c>index + 3</c> is inside the string.
    /// </summary>
    /// <returns>false when any of the four characters is not a hex digit.</returns>
    private static bool TryReadHex16(string str, int index, out int value)
    {
        value = 0;

        for (int k = 0; k < 4; k++)
        {
            byte digit;
            if (!TryFromHex(str[index + k], out digit))
            {
                value = 0;
                return false;
            }

            value = (value << 4) | digit;
        }

        return true;
    }

    /// <summary>
    /// Converts one hex digit (0-9, A-F or a-f) to its numeric value.
    /// </summary>
    /// <returns>false, with <paramref name="value"/> set to 0, when <paramref name="ch"/> is not a hex digit.</returns>
    private static bool TryFromHex(char ch, out byte value)
    {
        if (ch >= '0' && ch <= '9')
        {
            value = (byte)(ch - '0');
            return true;
        }

        if (ch >= 'A' && ch <= 'F')
        {
            value = (byte)(10 + ch - 'A');
            return true;
        }

        if (ch >= 'a' && ch <= 'f')
        {
            value = (byte)(10 + ch - 'a');
            return true;
        }

        value = 0;
        return false;
    }
}
Upvotes: 3