Reputation: 874
I'm trying to do some Schematron validation with lxml. For the specific application I'm working on, it's important that any tests that failed the validation are reported back. The lxml documentation mentions the presence of the validation_report
property object. I think this should contain the info I'm looking for, but I just can't figure out how to work with it. Here's some example code that demonstrates my problem (adapted from http://lxml.de/validation.html#id2; tested with Python 2.7.4):
import StringIO
from lxml import isoschematron
from lxml import etree
def main():
    """Validate a sample document against a Schematron schema and print the result.

    The schema asserts that the ``Percent`` values sum to 100; the sample
    document sums to 110, so validation is expected to fail.  The function
    then tries to inspect ``validation_report`` to find out why -- this
    last step is the part that does not yet work as hoped.
    """
    # Schema
    f = StringIO.StringIO('''\
<schema xmlns="http://purl.oclc.org/dsdl/schematron" >
<pattern id="sum_equals_100_percent">
<title>Sum equals 100%.</title>
<rule context="Total">
<assert test="sum(//Percent)=100">Sum is not 100%.</assert>
</rule>
</pattern>
</schema>
''')

    # Parse schema
    sct_doc = etree.parse(f)
    schematron = isoschematron.Schematron(sct_doc, store_report=True)

    # XML to validate - validation will fail because sum of numbers
    # not equal to 100
    notValid = StringIO.StringIO('''\
<Total>
<Percent>30</Percent>
<Percent>30</Percent>
<Percent>50</Percent>
</Total>
''')

    # Parse xml
    doc = etree.parse(notValid)

    # Validate against schema
    validationResult = schematron.validate(doc)

    # Validation report (assuming here this is where reason
    # for validation failure is stored, but perhaps I'm wrong?)
    report = isoschematron.Schematron.validation_report

    print("is valid: " + str(validationResult))
    print(dir(report.__doc__))
main()
Now, from the value of validationResult
I can see that the validation failed (as expected), so next I would like to know why. The result of the second print statement gives me:
['__add__', '__class__', '__contains__', '__delattr__', '__doc__', '__eq__', '__
format__', '__ge__', '__getattribute__', '__getitem__', '__getnewargs__', '__get
slice__', '__gt__', '__hash__', '__init__', '__le__', '__len__', '__lt__', '__mo
d__', '__mul__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__',
'__rmod__', '__rmul__', '__setattr__', '__sizeof__', '__str__', '__subclasshook
__', '_formatter_field_name_split', '_formatter_parser', 'capitalize', 'center',
'count', 'decode', 'encode', 'endswith', 'expandtabs', 'find', 'format', 'index
', 'isalnum', 'isalpha', 'isdigit', 'islower', 'isspace', 'istitle', 'isupper',
'join', 'ljust', 'lower', 'lstrip', 'partition', 'replace', 'rfind', 'rindex', '
rjust', 'rpartition', 'rsplit', 'rstrip', 'split', 'splitlines', 'startswith', '
strip', 'swapcase', 'title', 'translate', 'upper', 'zfill']
Which is about as far as I'm getting, based on the documentation and this related question. Could well be something really obvious I'm overlooking?
Upvotes: 8
Views: 3400
Reputation: 7548
After running into this problem, here's my general approach (for posterity):
My schematron file looks like this:
<schema xmlns="http://purl.oclc.org/dsdl/schematron" >
<pattern>
<rule context="mastery">
<assert test="count(*) > 0">
Ability Check Mastery must have > 0 values set.
</assert>
</rule>
<rule context="fate">
<assert test="count(*) > 0">
Ability Check Fate must have > 0 values set.
</assert>
</rule>
</pattern>
</schema>
Validating my xml using the schematron above produces the following report xml tree:
<svrl:schematron-output xmlns:svrl="http://purl.oclc.org/dsdl/svrl" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:schold="http://www.ascc.net/xml/schematron" xmlns:sch="http://www.ascc.net/xml/schematron" xmlns:iso="http://purl.oclc.org/dsdl/schematron" title="" schemaVersion="">
<!--
-->
<svrl:active-pattern/>
<svrl:fired-rule context="mastery"/>
<svrl:fired-rule context="fate"/>
<svrl:fired-rule context="mastery"/>
<svrl:fired-rule context="fate"/>
<svrl:fired-rule context="mastery"/>
<svrl:fired-rule context="fate"/>
<svrl:fired-rule context="mastery"/>
<svrl:fired-rule context="fate"/>
<svrl:failed-assert test="count(*) > 0" location="/abilitygroup/ability[5]/abilitycheck/fate">
<svrl:text>
Ability Check Fate must have > 0 values set.
</svrl:text>
</svrl:failed-assert>
<svrl:fired-rule context="mastery"/>
<svrl:fired-rule context="mastery"/>
</svrl:schematron-output>
The code that runs the validation and parses the report follows:
import lxml
from lxml.isoschematron import Schematron
#** 1. Validate the source xml tree against the schemetron validator.
schematron_doc = etree.parse("foo.schematron"))
schematron_validator = Schematron(schematron_doc, store_report=True)
# Parse the xml
tree = etree.parse(fname)
# Schematron validation
result = schematron_validator.validate(tree)
if not result: # Failed validation?
#** 2. That generates a report which is also an xml tree.
# The report is just an xml tree.
report = schematron_validator.validation_report
# DEBUGGING: You can look at the report xml
report_xml_str = etree.tostring(report, pretty_print=True).decode()
print(f"Report:\n{xml_report_str}\n\n")
#** 3. Extract the failure messages from the report xml
# You can iterate over the elements of the report..
for child in report.getiterator():
match (child.__class__, tag):
case (lxml.etree._Comment, _):
pass
case (_, "schematron-output"):
pass
case (_, "active-pattern"):
pass
case (_, "fired-rule"):
pass
case (_, "text"):
pass
case (_, "failed-assert"):
#** 4. Present failures in a human readable format
# These are the only entries I care about (so far)
log_schematron_assert_error(child, fname, tree)
# found something that may or may not be interesting.
# needs further investigation so we can decide to log or ignore it.
case _:
raise Exception(f"UNKNOWN CHILD {child.__class__} {tag}")
raise Exception(f"Schematron error {report}")
def log_schematron_assert_error(child, fname, source_tree):
    """
    Go crazy with the error logging...

    Prints the failed Schematron test, the tag and source line of the
    offending element, and the neighbouring lines of the source file.

    child       -- the ``failed-assert`` element from the validation report
    fname       -- path of the XML file that was validated
    source_tree -- the parsed tree of that file, used to resolve the
                   report's ``location`` xpath back to an element

    Raises Exception if the location does not resolve to exactly one
    element in ``source_tree``.
    """
    # You can look at the items for the child.
    # for k, v in child.items():
    #     print(f"\t{k} {v}")

    # Get the test that failed
    test = child.get("test")

    # Get the xpath to the element in the source that failed
    source_xpath = child.get("location")

    # Get the element from the source tree
    elements = source_tree.xpath(source_xpath)

    # xpath() returns a list of matches; exactly one is expected here.
    if len(elements) != 1:
        raise Exception("Unknown or missing elements")
    element = elements[0]

    # print some more detailed debug information
    print(f"Schematron error '{test}' at "
          f"{element.tag} in {fname}:{element.sourceline}")

    # Go the extra mile with debug logs (optional)
    print(get_error_context(fname, element.sourceline))
def get_error_context(fname, error_line_number):
    """
    Returns the neighbouring lines around an xml error for
    debug context.

    fname             -- path of the file to read
    error_line_number -- 1-based line number of the error (e.g. an lxml
                         element's ``sourceline``); may be None when the
                         line is unknown

    The result contains up to 6 lines before and 7 lines after the error
    line.  Every line is prefixed with its 1-based line number, and the
    error line itself is marked with "=>".  An empty string is returned
    when the line number is None or lies outside the file.
    """
    # No known line number means there is nothing to point at.
    if error_line_number is None:
        return ""

    with open(fname, "r") as f:
        lines = f.readlines()

    # Window bounds are 0-based indices into `lines`, clamped to the file.
    from_line = max(error_line_number - 7, 0)
    to_line = min(error_line_number + 7, len(lines))

    rows = []
    for index in range(from_line, to_line):
        # Label with the 1-based number so it agrees with error_line_number.
        line_number = index + 1
        ptr = "=>" if line_number == error_line_number else " "
        rows.append("%5s %2s %s" % (line_number, ptr, lines[index]))
    return "".join(rows)
The code above produces the following output, which is basically the Schematron assert that failed plus the file, line and context of the assertion error in the source XML file. N.B. YMMV, as complicated Schematron assertions may have context all over the place:
Schematron error 'count(*) > 0' at fate in foobar.xml:222
215 <keywords></keywords>
216 <dc><opponents-defend/></dc>
217 <mastery>
218 <critsuccess></critsuccess>
219 <critfail></critfail>
220 </mastery>
221 => <fate>
222 <!-- <boon></boon> -->
223 <!-- <indifferent></indifferent> -->
224 <!-- <bane></bane> -->
225 </fate>
226 </abilitycheck>
227 <spline><p x="90" y="90"/></spline>
228 <abilitydescription>
Problem parsing file: foobar.xml
Traceback (most recent call last):
... STACK TRACE HERE ...
Upvotes: 0
Reputation: 874
OK, so someone on Twitter gave me a suggestion which made me realise that I mistakenly got the reference to the schematron class all wrong. Since there don't seem to be any clear examples, I'll share my working solution below:
import StringIO
from lxml import isoschematron
from lxml import etree
def main():
    """Validate a sample document against a Schematron schema and print the report.

    The schema asserts that the ``Percent`` values sum to 100; the sample
    document sums to 110, so validation fails.  The validation report is
    read from the validator *instance* (created with ``store_report=True``)
    and printed together with its type.
    """
    # Example adapted from http://lxml.de/validation.html#id2

    # Schema
    f = StringIO.StringIO('''\
<schema xmlns="http://purl.oclc.org/dsdl/schematron" >
<pattern id="sum_equals_100_percent">
<title>Sum equals 100%.</title>
<rule context="Total">
<assert test="sum(//Percent)=100">Sum is not 100%.</assert>
</rule>
</pattern>
</schema>
''')

    # Parse schema
    sct_doc = etree.parse(f)
    schematron = isoschematron.Schematron(sct_doc, store_report=True)

    # XML to validate - validation will fail because sum of numbers
    # not equal to 100
    notValid = StringIO.StringIO('''\
<Total>
<Percent>30</Percent>
<Percent>30</Percent>
<Percent>50</Percent>
</Total>
''')

    # Parse xml
    doc = etree.parse(notValid)

    # Validate against schema
    validationResult = schematron.validate(doc)

    # Validation report: read it from the instance, not from the class
    report = schematron.validation_report

    print("is valid: " + str(validationResult))
    print(type(report))
    print(report)
main()
The print statement on the report now results in the following output:
<?xml version="1.0" standalone="yes"?>
<svrl:schematron-output xmlns:svrl="http://purl.oclc.org/dsdl/svrl" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:schold="http://www.ascc.net/xml/schematron" xmlns:sch="http://www.ascc.net/xml/schematron" xmlns:iso="http://purl.oclc.org/dsdl/schematron" title="" schemaVersion="">
<!--
-->
<svrl:active-pattern id="sum_equals_100_percent" name="Sum equals 100%."/>
<svrl:fired-rule context="Total"/>
<svrl:failed-assert test="sum(//Percent)=100" location="/Total">
<svrl:text>Sum is not 100%.</svrl:text>
</svrl:failed-assert>
</svrl:schematron-output>
Which is exactly what I was looking for!
Upvotes: 10