Reputation: 3642
I have a database with tables that have schemaless XML columns which contain arbitrary non-XML data (plain-text). Here a sample script to gerenate and fill such a table:
CREATE TABLE TestTable (
ID INT NOT NULL IDENTITY (1, 1),
XmlColumn XML NOT NULL,
CONSTRAINT [PK_TestTable] PRIMARY KEY CLUSTERED (ID ASC) ON [PRIMARY]
) ON [PRIMARY]
GO
INSERT INTO TestTable (XmlColumn) VALUES ('<root><child /></root>');
INSERT INTO TestTable (XmlColumn) VALUES ('Foo, this is not XML');
INSERT INTO TestTable (XmlColumn) VALUES ('<root><parent><child /></parent></root>');
GO
How can I (preferred) enforce that only well-formed XML can be added?
Or else, how can I determine which entries are not well-formed and NULL
them out?
I have read several posts that suggest a CAST
/ CONVERT
in conjunction with a TRY CATCH
, (e.g. stackoverflow.com/questions/14753119), but I never get an exception, the CAST
/ CONVERT
always succeeds:
DECLARE @xml AS XML;
DECLARE @isValid AS BIT = 1;
BEGIN TRY
SET @xml = CONVERT(xml, 'Foo')
END TRY
BEGIN CATCH
SET @isValid = 0;
END CATCH;
SELECT @isValid; -- returns 1
Any ideas?
Upvotes: 0
Views: 1725
Reputation: 22321
You can try the following three approaches.
Method #: 1
We will find out if there is at least one root level node. By the way SQL Server allows not well-formed XML, i.e. just XML fragments without a root element. That's why I added that use case to the sample data population.
Also, I added an XML comment for completeness.
The outcome's meaning:
SQL
-- DDL and sample data population, start
DECLARE @TestTable TABLE (ID INT IDENTITY (1, 1) PRIMARY KEY, XmlColumn XML NOT NULL);
INSERT INTO @TestTable (XmlColumn) VALUES
(N'<root><child /></root>'),
(N'<city>Miami</city><city>Orlando</city>'),
(N'Foo, this is not XML'),
(N'<root><child /></root>Foo'),
(N'<!-- -->'),
(N'<root><parent><child /></parent></root>');
-- DDL and sample data population, end
SELECT *
, XmlColumn.value('count(/*)', 'INT') AS Result
FROM @TestTable;
Output
+----+-----------------------------------------+-----------+
| ID | XmlColumn | Result |
+----+-----------------------------------------+-----------+
| 1 | <root><child /></root> | 1 |
| 2 | <city>Miami</city><city>Orlando</city> | 2 |
| 3 | Foo, this is not XML | 0 |
| 4 | <!-- --> | 0 |
| 5 | <root><parent><child /></parent></root> | 1 |
+----+-----------------------------------------+-----------+
Method #: 2
By using an instance of element()
XQuery construct
SELECT *
, XmlColumn.query('<root>{
for $x in /*
return
if ($x instance of element()) then <r/> else ()
}</root>').value('count(/root/r)', 'INT') AS Result
, XmlColumn.query('for $x in .
return if ($x eq /*[1]) then "well-formed" else "not well- formed"').value('.','VARCHAR(20)') AS [well-formed]
FROM @TestTable;
Output
+----+-----------------------------------------+--------+-----------------+
| ID | XmlColumn | Result | well-formed |
+----+-----------------------------------------+--------+-----------------+
| 1 | <root><child /></root> | 1 | well-formed |
| 2 | <city>Miami</city><city>Orlando</city> | 2 | not well-formed |
| 3 | Foo, this is not XML | 0 | not well-formed |
| 4 | <root><child /></root>Foo | 1 | not well-formed |
| 5 | <!-- --> | 0 | not well-formed |
| 6 | <root><parent><child /></parent></root> | 1 | well-formed |
+----+-----------------------------------------+--------+-----------------+
Method #: 3
A complete solution. The algorithm is based on comparison of counts: count of any type of nodes vs. count of elements only. Additionally, it gives a breakdown of node types in the NodeList column for easy understanding of what is going on.
;WITH rs AS
(
SELECT *
, XmlColumn.value('count(/node())', 'INT') AS NodeCount -- any type of nodes
, XmlColumn.value('count(/*)', 'INT') AS ElementCount -- elements only
, XmlColumn.query('
for $x in /node()
return if ($x instance of element()) then text {"element()"}
else if ($x instance of text()) then text {"text()"}
else if ($x instance of comment()) then text {"comment()"}
else if ($x instance of processing-instruction()) then text {"processing-instruction()"}
else ()
') AS NodeList
FROM @TestTable
)
SELECT *
, CASE
WHEN NodeCount = 1 AND ElementCount = 1 THEN 'well-formed'
WHEN NodeCount = 2 AND ElementCount = 1
AND LEFT(NodeList, 24) = 'processing-instruction()' THEN 'well-formed'
WHEN NodeCount > 1 AND (ElementCount = NodeCount) THEN 'XML fragment'
WHEN NodeCount > ElementCount THEN 'not well-formed'
ELSE '???'
END AS Result
FROM rs;
Output
+----+-----------------------------------------+-----------+--------------+--------------------+-----------------+
| ID | XmlColumn | NodeCount | ElementCount | NodeList | Result |
+----+-----------------------------------------+-----------+--------------+--------------------+-----------------+
| 1 | <root><child /></root> | 1 | 1 | element() | well-formed |
| 2 | <city>Miami</city><city>Orlando</city> | 2 | 2 | element()element() | XML fragment |
| 3 | Foo, this is not XML | 1 | 0 | text() | not well-formed |
| 4 | <root><child /></root>Foo | 2 | 1 | element()text() | not well-formed |
| 5 | <!-- --> | 1 | 0 | comment() | not well-formed |
| 6 | <root><parent><child /></parent></root> | 1 | 1 | element() | well-formed |
+----+-----------------------------------------+-----------+--------------+--------------------+-----------------+
Upvotes: 2
Reputation: 6808
use tempdb
go
drop table if exists TestTable;
drop table if exists TestTablewithcheck;
drop function if exists dbo.mywellformedxml
go
CREATE TABLE TestTable (
ID INT NOT NULL IDENTITY (1, 1),
XmlColumn XML NOT NULL,
CONSTRAINT [PK_TestTable] PRIMARY KEY CLUSTERED (ID ASC) ON [PRIMARY]
) ON [PRIMARY]
GO
INSERT INTO TestTable (XmlColumn) VALUES ('<root><child /></root>');
INSERT INTO TestTable (XmlColumn) VALUES ('Foo, this is not XML');
INSERT INTO TestTable (XmlColumn) VALUES ('<root><parent><child /></parent></root>');
INSERT INTO TestTable (XmlColumn) VALUES ('<root><child /></root>Foo, this is not XML');
INSERT INTO TestTable (XmlColumn) VALUES ('<root xmlns="test"><child /></root>');
INSERT INTO TestTable (XmlColumn) VALUES ('<!-- comment -->');
INSERT INTO TestTable (XmlColumn) VALUES ('<!-- comment --><root><child /></root>');
INSERT INTO TestTable (XmlColumn) VALUES ('<noroot><child /></noroot><noroot><child /></noroot>');
INSERT INTO TestTable (XmlColumn) VALUES ('<?pi my processing instruction?>');
GO
create or alter function dbo.mywellformedxml(@xml xml)
returns bit
with schemabinding
as
begin
return
(
isnull(
(
select 1
where @xml.exist('/*[1]') = 1 --root..
and @xml.exist('/*[2]') = 0 --..only..
and @xml.exist('text()') = 0 --..without text..
), 0)
)
end
go
CREATE TABLE TestTablewithcheck (
ID INT NOT NULL IDENTITY (1, 1),
XmlColumn XML NOT NULL,
CONSTRAINT [PK_TestTablewithcheck] PRIMARY KEY CLUSTERED (ID ASC) ON [PRIMARY],
constraint chkwfxml check(dbo.mywellformedxml(XmlColumn) = 1)
) ON [PRIMARY]
GO
declare @i int = 1
while @i <= 9
begin
insert into TestTablewithcheck(XmlColumn)
select XmlColumn
from TestTable
where id = @i;
select @i = @i + 1;
end
go
select *
from TestTablewithcheck;
go
select *, dbo.mywellformedxml(XmlColumn) as wfxml
from TestTable
go
drop table if exists TestTable;
drop table if exists TestTablewithcheck;
drop function if exists dbo.mywellformedxml
go
Upvotes: 2
Reputation: 96013
Oddly, SQL Server can convert a value like 'foo'
to XML, so just trying the conversion won't actually work. What you could do, however, is check to ensure that the value starts and ends with '<'
and '>'
(which valid XML should have) and perform a TRY_CONVERT
:
SELECT CASE WHEN TRY_CONVERT(xml,XMLColumn) IS NOT NULL AND XMLColumn LIKE '<%>' THEN 1 ELSE 0 END AS IsValid
FROM (VALUES ('<root><child /></root>'),
('Foo, this is not XML'),
('<root><parent><child /></parent></root>'))V(XMLColumn);
Upvotes: 1