Reputation: 934
I'm having to recreate a vendor's XML file. I don't have access to their code, schema, or anything, so I'm doing this using the XmlSerializer
and attributes. I'm doing it this way because the system is using a generic XmlWriter
I've built to write other system XML files, so I'm killing two birds with one stone. Everything has been working out great, with the exception of one property value. The vendor XML looks like this:
<TextOutlTxt>
<p style="text-align:left;margin-top:0pt;margin-bottom:0pt;">
<span>SUBSTA SF6 CIRCUIT BKR CONC FDN "C"</span>
</p>
</TextOutlTxt>
Here's my property configuration:
private string _value;

/// <summary>
/// Builds the node serialized as <c>TextOutlTxt</c>: a <c>p</c> element whose content is
/// the stored text with special characters rewritten as hexadecimal character references.
/// </summary>
/// <remarks>
/// Assigning <c>InnerText</c> replaces every child of the <c>p</c> element, so the
/// <c>span</c> from the template does not survive, and the writer escapes the <c>&amp;</c>
/// of each reference again on output. The setter ignores its argument; presumably it only
/// exists so that XmlSerializer treats the property as read/write.
/// </remarks>
[XmlElement("TextOutlTxt")]
public XmlNode Value
{
    get
    {
        // Rewrite control characters, quotes, backslashes and markup characters as &#xNN;.
        string escaped = Regex.Replace(
            _value,
            @"[\a\b\f\n\r\t\v\\""'&<>]",
            match => string.Join(string.Empty, match.Value.Select(ch => string.Format("&#x{0:X};", Convert.ToInt32(ch))).ToArray()));

        const string template = "\n<p style=\"text-align:left;margin-top:0pt;margin-bottom:0pt;\">\n<span>ReplaceMe</span>\n</p>\n";

        // Parse the template under a throw-away root so the <p> element can be picked out.
        XmlDocument holder = new XmlDocument();
        holder.InnerXml = "<root>" + template + "</root>";

        XmlNode paragraph = holder.DocumentElement.FirstChild;
        paragraph.InnerText = escaped;
        return paragraph;
    }
    set
    {
        // Intentionally empty: the incoming value is discarded.
    }
}
And this gives me:
<TextOutlTxt>
<p style="text-align:left;margin-top:0pt;margin-bottom:0pt;" xmlns="">SUBSTA SF6 CIRCUIT BKR CONC FDN &#x22;C&#x22;</p>
</TextOutlTxt>
So I'm close, but no cigar. There is an unwanted xmlns="..."
attribute; it must not be present. In my XmlWriter
, I have done the following to remove the namespace unless found atop the object it is serializing:
/// <summary>
/// Serializes <paramref name="sourceData"/> as XML onto <paramref name="outputStream"/>.
/// </summary>
/// <remarks>
/// When the runtime type of <paramref name="sourceData"/> carries an <see cref="XmlRootAttribute"/>,
/// its namespace (or the empty string when none is set) is used both as the serializer's default
/// namespace and as the only declared prefix-less namespace. Otherwise a plain serializer is used.
/// Disposing the <see cref="StreamWriter"/> also closes <paramref name="outputStream"/>.
/// </remarks>
/// <typeparam name="T">Static type handed to the <see cref="XmlSerializer"/>.</typeparam>
/// <param name="sourceData">Object to serialize.</param>
/// <param name="outputStream">Destination stream.</param>
protected override void OnWrite<T>(T sourceData, Stream outputStream)
{
    IKnownTypesLocator locator = KnownTypesLocator.Instance;

    // The root attribute, when present, tells us which default namespace to emit.
    XmlRootAttribute rootAttribute = sourceData.GetType().GetCustomAttributes<XmlRootAttribute>().FirstOrDefault();

    XmlSerializer xmlSerializer;
    XmlSerializerNamespaces declaredNamespaces = null;
    if (rootAttribute == null)
    {
        xmlSerializer = new XmlSerializer(typeof(T), locator.XmlItems.ToArray());
    }
    else
    {
        string defaultNamespace = rootAttribute.Namespace ?? string.Empty;
        declaredNamespaces = new XmlSerializerNamespaces();
        declaredNamespaces.Add(string.Empty, defaultNamespace);
        xmlSerializer = new XmlSerializer(typeof(T), new XmlAttributeOverrides(), locator.XmlItems.ToArray(), rootAttribute, defaultNamespace);
    }

    // Passing null namespaces is what the two-argument Serialize overload does internally.
    using (StreamWriter writer = new StreamWriter(outputStream))
    {
        xmlSerializer.Serialize(writer, sourceData, declaredNamespaces);
    }
}
I'm sure I'm overlooking something. Any help would be greatly appreciated!
UPDATE 9/26/2017 So... I've been asked to provide more detail, specifically an explanation of the purpose of my code, and a reproducible example. So here's both:
Fully functional example code.... I've tried generalizing the code in a reproducible form.
/// <summary>
/// Reproduction type: serializes as <c>OutlTxt</c> in the
/// <c>http://www.mynamespace/09262017</c> namespace with a single <c>TextOutlTxt</c> child.
/// </summary>
[XmlRoot("OutlTxt", Namespace = "http://www.mynamespace/09262017")]
public class OutlineText
{
    private string _value;

    /// <summary>
    /// Returns a <c>p</c> element (parsed without any namespace) whose content is the stored
    /// text with special characters rewritten as hexadecimal character references.
    /// </summary>
    /// <remarks>
    /// Setting <c>InnerText</c> replaces all children of the <c>p</c> element, so the template's
    /// <c>span</c> is dropped. The setter discards whatever it is given.
    /// </remarks>
    [XmlElement("TextOutlTxt")]
    public XmlNode Value
    {
        get
        {
            string encoded = ToHexReferences(_value);

            XmlDocument scratch = new XmlDocument();
            scratch.InnerXml = "<root>" + "\n<p style=\"text-align:left;margin-top:0pt;margin-bottom:0pt;\">\n<span>ReplaceMe</span>\n</p>\n" + "</root>";

            XmlNode paragraph = scratch.DocumentElement.FirstChild;
            paragraph.InnerText = encoded;
            return paragraph;
        }
        set
        {
            // Intentionally empty.
        }
    }

    /// <summary>Parameterless constructor; leaves the text unset.</summary>
    private OutlineText()
    {
    }

    /// <summary>Creates an instance wrapping <paramref name="text"/>.</summary>
    /// <param name="text">Raw, unescaped text.</param>
    public OutlineText(string text)
    {
        _value = text;
    }

    /// <summary>
    /// Rewrites control characters, quotes, backslashes and markup characters as
    /// <c>&amp;#xNN;</c> references.
    /// </summary>
    private static string ToHexReferences(string raw)
    {
        return Regex.Replace(
            raw,
            @"[\a\b\f\n\r\t\v\\""'&<>]",
            hit => string.Join(string.Empty, hit.Value.Select(ch => string.Format("&#x{0:X};", Convert.ToInt32(ch))).ToArray()));
    }
}
/// <summary>
/// Writes objects to disk as XML using <see cref="XmlSerializer"/>.
/// </summary>
public class XmlFileWriter
{
    /// <summary>
    /// Serializes <paramref name="sourceData"/> to <paramref name="targetFile"/>, replacing any
    /// existing file content.
    /// </summary>
    /// <remarks>
    /// When the runtime type of <paramref name="sourceData"/> carries an
    /// <see cref="XmlRootAttribute"/>, its namespace (or the empty string when none is set) is used
    /// as the default namespace and as the only declared prefix-less namespace.
    /// </remarks>
    /// <typeparam name="T">Static type handed to the <see cref="XmlSerializer"/>.</typeparam>
    /// <param name="sourceData">Object to serialize.</param>
    /// <param name="targetFile">File to create or overwrite.</param>
    /// <exception cref="ArgumentNullException">Either argument is <c>null</c>.</exception>
    public void Write<T>(T sourceData, FileInfo targetFile) where T : class
    {
        if (sourceData == null)
        {
            throw new ArgumentNullException("sourceData");
        }

        if (targetFile == null)
        {
            throw new ArgumentNullException("targetFile");
        }

        //This is actually retrieved through a locator object, but surely no one will mind an empty
        //collection for the sake of an example
        Type[] knownTypes = new Type[] { };

        //Let's see if we can get the default namespace
        XmlRootAttribute xmlRootAttribute = sourceData.GetType().GetCustomAttributes<XmlRootAttribute>().FirstOrDefault();

        // Build the serializer before touching the file so a failure here leaves the file intact.
        XmlSerializer serializer;
        XmlSerializerNamespaces nameSpaces = null;
        if (xmlRootAttribute != null)
        {
            string nameSpace = xmlRootAttribute.Namespace ?? string.Empty;
            nameSpaces = new XmlSerializerNamespaces();
            nameSpaces.Add(string.Empty, nameSpace);
            serializer = new XmlSerializer(typeof(T), new XmlAttributeOverrides(), knownTypes, xmlRootAttribute, nameSpace);
        }
        else
        {
            serializer = new XmlSerializer(typeof(T), knownTypes);
        }

        // FileMode.Create truncates an existing file. FileInfo.OpenWrite() does not, so a
        // shorter document written over a longer one would keep the old trailing bytes.
        using (FileStream targetStream = targetFile.Open(FileMode.Create, FileAccess.Write))
        using (StreamWriter writer = new StreamWriter(targetStream))
        {
            //Now we can serialize (null namespaces behaves like the two-argument overload)
            serializer.Serialize(writer, sourceData, nameSpaces);
        }
    }
}
/// <summary>
/// Demo entry point: writes a sample <c>OutlineText</c> to a fixed path, then waits for Enter.
/// </summary>
public static void Main()
{
    var writer = new XmlFileWriter();
    var destination = new FileInfo(@"C:\MyDirectory\MyXml.xml");
    var sample = new OutlineText(@"SUBSTA SF6 CIRCUIT BKR CONC FDN ""C""");

    // The type argument is inferred as OutlineText.
    writer.Write(sample, destination);

    // Keep the console window open until the user presses Enter.
    Console.ReadLine();
}
The result produced:
<?xml version="1.0" encoding="utf-8"?>
<OutlTxt xmlns="http://www.mynamespace/09262017">
<TextOutlTxt>
<p style="text-align:left;margin-top:0pt;margin-bottom:0pt;" xmlns="">SUBSTA SF6 CIRCUIT BKR CONC FDN &#x22;C&#x22;</p>
</TextOutlTxt>
</OutlTxt>
Edit 9/27/2017 Per the request in the solution below, a secondary issue I've run into is keeping the hexadecimal codes. To illustrate this issue based on the above example, let's say the value between the <span> tags is
SUBSTA SF6 CIRCUIT BKR CONC FDN "C"
The vendor file is expecting the literals to be in their hex code format like so
SUBSTA SF6 CIRCUIT BKR CONC FDN &#x22;C&#x22;
I've rearranged the sample code Value property to be like so:
private string _value;

/// <summary>
/// Gets or sets the <c>TextOutlTxt</c> element that wraps the stored text in
/// <c>&lt;p&gt;&lt;span&gt;…&lt;/span&gt;&lt;/p&gt;</c>, all in the
/// <c>http://www.mynamespace/09262017</c> namespace.
/// </summary>
/// <remarks>
/// The getter splices the stored text into the markup as-is, so text containing <c>&lt;</c>
/// or <c>&amp;</c> must already be escaped or parsing will fail. The setter keeps only the
/// text content of the supplied element.
/// </remarks>
[XmlAnyElement("TextOutlTxt", Namespace = "http://www.mynamespace/09262017")]
public XElement Value
{
    get
    {
        string value = string.Format("<p xmlns=\"{0}\" style=\"text-align:left;margin-top:0pt;margin-bottom:0pt;\"><span>{1}</span></p>", "http://www.mynamespace/09262017", _value);
        string innerXml = string.Format("<TextOutlTxt xmlns=\"{0}\">{1}</TextOutlTxt>", "http://www.mynamespace/09262017", value);
        XElement element = XElement.Parse(innerXml);
        //Remove redundant xmlns attributes
        foreach (XElement descendant in element.DescendantsAndSelf())
        {
            descendant.Attributes().Where(att => att.IsNamespaceDeclaration && att.Value == "http://www.mynamespace/09262017").Remove();
        }
        return element;
    }
    set
    {
        // Keep only the text content. value.ToString() would store the whole serialized
        // element, which the getter would then embed inside <span> on the next read.
        _value = value == null ? null : value.Value;
    }
}
if I use the code
// Rewrite every control character, quote, backslash and markup character found in the
// element's text as a hexadecimal character reference of the form &#xNN;.
string text = Regex.Replace(
    element.Value,
    @"[\a\b\f\n\r\t\v\\""'&<>]",
    hit => string.Concat(hit.Value.Select(ch => string.Format("&#x{0:X};", Convert.ToInt32(ch)))));
to create the hex code values ahead of the XElement.Parse(), the XElement converts them back to their literal values. If I try to set the XElement.Value directly after the XElement.Parse() (or through SetValue()), it changes the & to &amp;. Not only that, but it seems to mess with the element output and adds additional elements, throwing it all out of whack.
Edit 9/27/2017 #2 To clarify, the original implementation had a related problem, namely that the escaped text was re-escaped. I.e. I was getting
SUBSTA SF6 CIRCUIT BKR CONC FDN &amp;#x22;C&amp;#x22;
But wanted
SUBSTA SF6 CIRCUIT BKR CONC FDN &#x22;C&#x22;
Upvotes: 0
Views: 2385
Reputation: 116991
Your question now has two requirements:
Suppress certain xmlns="..."
attributes on an embedded XElement
or XmlNode
while serializing, AND
Force certain characters inside element text to be escaped (e.g. "
=> &#x22;
). Even though this is not required by the XML standard, your legacy receiving system apparently needs this.
Issue #1 can be addressed as in this answer
For issue #2, however, there is no way to force certain characters to be unnecessarily escaped using XmlNode
or XElement
because escaping is handled at the level of XmlWriter
during output. And Microsoft's built-in implementations of XmlWriter
seem not to have any settings that can force certain characters that do not need to be escaped to nevertheless be escaped. You would need to try to subclass XmlWriter
or XmlTextWriter
(as described e.g. here and here) then intercept string values as they are written and escape quote characters as desired.
Thus, as an alternate approach that solves both #1 and #2, you could implement IXmlSerializable
and write your desired XML directly with XmlWriter.WriteRaw()
:
/// <summary>
/// Outline text that serializes itself as raw markup so that selected characters can be
/// emitted as hexadecimal character references (for example <c>"</c> as <c>&amp;#x22;</c>),
/// which the built-in writers would otherwise leave unescaped.
/// </summary>
[XmlRoot("OutlTxt", Namespace = OutlineText.Namespace)]
public class OutlineText : IXmlSerializable
{
    public const string Namespace = "http://www.mynamespace/09262017";

    // Control characters, backslash, quotes and XML markup characters that must be escaped.
    private static readonly Regex s_charactersToEscape = new Regex(@"[\a\b\f\n\r\t\v\\""'&<>]");

    private string _value;

    // For debugging purposes.
    internal string InnerValue { get { return _value; } }

    /// <summary>
    /// Replaces each escapable character in <paramref name="text"/> with its hexadecimal
    /// character reference.
    /// </summary>
    /// <param name="text">Raw text; <c>null</c> is treated as empty.</param>
    /// <returns>The escaped text, never <c>null</c>.</returns>
    static string EscapeTextValue(string text)
    {
        // The private constructor and new OutlineText(null) both leave the text null;
        // Regex.Replace would throw ArgumentNullException on it.
        if (string.IsNullOrEmpty(text))
        {
            return string.Empty;
        }

        // The character class matches exactly one character per match.
        return s_charactersToEscape.Replace(text, m => string.Format("&#x{0:X};", (int)m.Value[0]));
    }

    /// <summary>Parameterless constructor; leaves the text unset.</summary>
    private OutlineText()
    { }

    /// <summary>Creates an instance wrapping <paramref name="text"/>.</summary>
    /// <param name="text">Raw, unescaped text.</param>
    public OutlineText(string text)
    {
        _value = text;
    }

    #region IXmlSerializable Members

    /// <summary>Always returns <c>null</c>; no schema is provided.</summary>
    XmlSchema IXmlSerializable.GetSchema()
    {
        return null;
    }

    /// <summary>
    /// Reads the current element and stores its concatenated text content.
    /// </summary>
    void IXmlSerializable.ReadXml(XmlReader reader)
    {
        _value = ((XElement)XNode.ReadFrom(reader)).Value;
    }

    /// <summary>
    /// Writes the escaped text wrapped in <c>p</c>/<c>span</c> markup using
    /// <see cref="XmlWriter.WriteRaw(string)"/>, which bypasses the writer's own escaping.
    /// </summary>
    void IXmlSerializable.WriteXml(XmlWriter writer)
    {
        var escapedValue = EscapeTextValue(_value);
        var nestedXml = string.Format("<p style=\"text-align:left;margin-top:0pt;margin-bottom:0pt;\"><span>{0}</span></p>", escapedValue);
        writer.WriteRaw(nestedXml);
    }

    #endregion
}
And the output will be
<OutlTxt xmlns="http://www.mynamespace/09262017"><p style="text-align:left;margin-top:0pt;margin-bottom:0pt;"><span>SUBSTA SF6 CIRCUIT BKR CONC FDN &#x22;C&#x22;</span></p></OutlTxt>
Note that, if you use WriteRaw()
, you can easily generate invalid XML simply by writing markup characters embedded in text values. You should be sure to add unit tests that verify that does not occur, e.g. that new OutlineText(@"<")
does not cause problems. (A quick check seems to show your Regex
is escaping <
and >
appropriately.)
New sample .Net fiddle.
Upvotes: 1
Reputation: 116991
The reason you are getting xmlns=""
added to your embedded XML is that your container element(s) <OutlTxt>
and <TextOutlTxt>
are declared to be in the "http://www.mynamespace/09262017"
namespace by use of the [XmlRootAttribute.Namespace]
attribute, whereas the embedded literal XML elements are in the empty namespace. To fix this, your embedded XML literal must be in the same namespace as its parent elements.
Here is the XML literal. Notice there is no xmlns="..."
declaration anywhere in the XML:
<p style="text-align:left;margin-top:0pt;margin-bottom:0pt;">
<span>ReplaceMe</span>
</p>
Lacking such a declaration, the <p>
element is in the empty namespace. Conversely, your OutlineText
type is decorated with an [XmlRoot]
attribute:
[XmlRoot("OutlTxt", Namespace = "http://www.mynamespace/09262017")]
public class OutlineText
{
    // Members omitted; only the [XmlRoot] namespace matters for this illustration.
}
Thus the corresponding OutlTxt
root element will be in the http://www.mynamespace/09262017
namespace. All its child elements will default to this namespace as well unless overridden. Placing the embedded XmlNode
in the empty namespace counts as overriding the parent namespace, and so an xmlns=""
attribute is required.
The simplest way to avoid this problem is for your embedded XML string literal to place itself in the correct namespace as follows:
<p xmlns="http://www.mynamespace/09262017" style="text-align:left;margin-top:0pt;margin-bottom:0pt;">
<span>ReplaceMe</span>
</p>
Then, in your Value
property getter, strip redundant namespace declarations. This is somewhat easier to do with the LINQ to XML API:
/// <summary>
/// Outline text serialized as <c>OutlTxt</c> with a <c>TextOutlTxt</c> child whose embedded
/// markup lives in the same namespace as its parents, so no <c>xmlns=""</c> is emitted.
/// </summary>
[XmlRoot("OutlTxt", Namespace = OutlineText.Namespace)]
public class OutlineText
{
    public const string Namespace = "http://www.mynamespace/09262017";

    private string _value;

    /// <summary>
    /// Gets or sets the <c>TextOutlTxt</c> element wrapping the stored text in
    /// <c>&lt;p&gt;&lt;span&gt;…&lt;/span&gt;&lt;/p&gt;</c>.
    /// </summary>
    /// <remarks>
    /// Escaping here only keeps the intermediate markup well-formed:
    /// <see cref="XElement.Parse(string)"/> resolves the character references back to literal
    /// characters. The setter keeps only the text content of the supplied element.
    /// </remarks>
    [XmlAnyElement("TextOutlTxt", Namespace = OutlineText.Namespace)]
    public XElement Value
    {
        get
        {
            var escapedValue = EscapeTextValue(_value);
            var nestedXml = string.Format("<p xmlns=\"{0}\" style=\"text-align:left;margin-top:0pt;margin-bottom:0pt;\"><span>{1}</span></p>", Namespace, escapedValue);
            var outerXml = string.Format("<TextOutlTxt xmlns=\"{0}\">{1}</TextOutlTxt>", Namespace, nestedXml);
            var element = XElement.Parse(outerXml);
            //Remove redundant xmlns attributes
            element.DescendantsAndSelf().SelectMany(e => e.Attributes()).Where(a => a.IsNamespaceDeclaration && a.Value == Namespace).Remove();
            return element;
        }
        set
        {
            _value = value == null ? null : value.Value;
        }
    }

    /// <summary>
    /// Replaces control characters, quotes, backslashes and markup characters in
    /// <paramref name="text"/> with hexadecimal character references.
    /// </summary>
    /// <param name="text">Raw text; <c>null</c> is treated as empty.</param>
    /// <returns>The escaped text, never <c>null</c>.</returns>
    static string EscapeTextValue(string text)
    {
        // The setter and the private constructor can both leave the text null;
        // Regex.Replace would throw ArgumentNullException on it.
        if (string.IsNullOrEmpty(text))
        {
            return string.Empty;
        }

        return Regex.Replace(text, @"[\a\b\f\n\r\t\v\\""'&<>]", m => string.Join(string.Empty, m.Value.Select(c => string.Format("&#x{0:X};", Convert.ToInt32(c))).ToArray()));
    }

    /// <summary>Parameterless constructor; leaves the text unset.</summary>
    private OutlineText()
    { }

    /// <summary>Creates an instance wrapping <paramref name="text"/>.</summary>
    /// <param name="text">Raw, unescaped text.</param>
    public OutlineText(string text)
    {
        _value = text;
    }
}
And the resulting XML will look like:
<OutlTxt xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.mynamespace/09262017">
<TextOutlTxt>
<p style="text-align:left;margin-top:0pt;margin-bottom:0pt;">
<span>SUBSTA SF6 CIRCUIT BKR CONC FDN "C"</span>
</p>
</TextOutlTxt>
</OutlTxt>
Note that I have changed the attribute on Value
from [XmlElement]
to [XmlAnyElement]
. I did this because it appears your value
XML might contain multiple mixed content nodes at the root level, e.g.:
Start Text <p>Middle Text</p> End Text
Using [XmlAnyElement]
enables this by allowing a container node to be returned without causing an extra level of XML element nesting.
Sample working .Net fiddle.
Upvotes: 1