Reputation: 491
I have some HTML stripped out of an online dictionary that I want to convert to XML for eventual conversion into a word list for a BK-tree. The online dictionary records variant spellings, but sometimes does so by putting a vowel or ending that may or may not appear in parentheses, like so:
<td>
<span class="FORM">
<span class="HDORTH">a</span>
<span class="POS"> indef. art. </span> Also
<span class="ORTH">an</span>. Early forms: as subj.,
<span class="ORTH">ane</span>,
<span class="ORTH">on</span>,
<span class="ORTH">o</span>; as obj.,
<span class="ORTH">ane</span>,
<span class="ORTH">on(e</span>,
<span class="ORTH">o</span>, &amp; (chiefly masc.)
<span class="ORTH">an(n)e</span>,
<span class="ORTH">æn(n)e</span>,
<span class="ORTH">en(n)e</span>,
<span class="ORTH">en</span>; after prep.,chiefly
<span class="ORTH">ane</span>,
<span class="ORTH">on(e</span>, masc. also
<span class="ORTH">anne</span>,
<span class="ORTH">æn(n)e</span>, fem. also
<span class="ORTH">anre</span>,
<span class="ORTH">are</span>,
<span class="ORTH">hare</span>,
<span class="ORTH">ore</span>; gen.
<span class="ORTH">anes</span>,
<span class="ORTH">æn(n)es</span>,
<span class="ORTH">en(n)es</span>.</span>
</td>
I've written the following XQuery to convert the HTML to XML, stripping anything that's not in tags and selecting elements based on the class of the particular span:
(:~
 : Converts dictionary <span> nodes into semantic XML elements.
 :
 : The span's class attribute selects the output element:
 :   HDORTH -> <headword>, POS -> <part_of_speech>, ORTH -> <variant>.
 : Each output element receives the span's text-node children as content.
 : A node whose class matches none of these is returned unchanged.
 :
 : @param $nodes the nodes to convert
 : @return one item per input node, in input order
 :)
declare function local:node-change($nodes as node()*) as node()* {
  for $node in $nodes
  let $class := $node/@class
  (: Decide the output element name first; the empty sequence means "pass through". :)
  let $target :=
    if ($class = "HDORTH") then "headword"
    else if ($class = "POS") then "part_of_speech"
    else if ($class = "ORTH") then "variant"
    else ()
  return
    if (empty($target)) then $node
    else element { $target } { $node/text() }
};
<list>{
  (: Visit every *.xml document in the collection named by $collection
     (a variable that must be supplied from outside this snippet). :)
  for $doc in collection(concat($collection, '?select=*.xml'))
  (: The entry reference is the number that follows the first three
     characters of the file's base name (the part before the first "."). :)
  let $file-name := tokenize(document-uri($doc), "/")[last()]
  let $ref := number(substring(substring-before($file-name, "."), 4))
  (: A paren-expansion pass over the converted nodes (local:stripleftparen)
     is not applied yet. :)
  order by $ref
  return
    <entry ref="{$ref}">{local:node-change($doc/td/span/*)}</entry>
}</list>
This returns the following XML:
<entry ref="3">
<headword>a</headword>
<part_of_speech> indef. art. </part_of_speech>
<variant>an</variant>
<variant>ane</variant>
<variant>on</variant>
<variant>o</variant>
<variant>ane</variant>
<variant>on(e</variant>
<variant>o</variant>
<variant>an(n)e</variant>
<variant>æn(n)e</variant>
<variant>en(n)e</variant>
<variant>en</variant>
<variant>ane</variant>
<variant>on(e</variant>
<variant>anne</variant>
<variant>æn(n)e</variant>
<variant>anre</variant>
<variant>are</variant>
<variant>hare</variant>
<variant>ore</variant>
<variant>anes</variant>
<variant>æn(n)es</variant>
<variant>en(n)es</variant>
</entry>
What I need to do now is clone the nodes that have parens, so that I can modify the clone and have the following result, but I'm not sure how to do so.
<entry ref="3">
<headword>a</headword>
<part_of_speech> indef. art. </part_of_speech>
<variant>an</variant>
<variant>ane</variant>
<variant>on</variant>
<variant>o</variant>
<variant>ane</variant>
<variant>on</variant>
<variant>one</variant>
<variant>o</variant>
<variant>ane</variant>
<variant>anne</variant>
<variant>æne</variant>
<variant>ænne</variant>
<variant>ene</variant>
<variant>enne</variant>
<variant>en</variant>
<variant>ane</variant>
<variant>on</variant>
<variant>one</variant>
<variant>anne</variant>
<variant>æne</variant>
<variant>ænne</variant>
<variant>anre</variant>
<variant>are</variant>
<variant>hare</variant>
<variant>ore</variant>
<variant>anes</variant>
<variant>ænes</variant>
<variant>ænnes</variant>
<variant>enes</variant>
<variant>ennes</variant>
</entry>
I know I need to use substring, substring-before, or substring-after to actually modify the node, but where I'm having problems is in the actual cloning process. `copy` doesn't work within the for/return loop, and everything I've found online either suggests `copy` for copying nodes or talks about de-duplicating data (which I'll need to do, but I want to get it exactly as I want it before I do so). How can I copy a node, modify the copy, and display the results so that I can get what I'm looking for?
Upvotes: 0
Views: 273
Reputation: 8877
I am not clear what the rules are. But it seems to me that you should be able to do it all at once in the node-change function.
I would think you could do something along these lines:
(:~
 : Skeleton showing how one input span can yield more than one output
 : element: each alternative spelling is computed in its own `let`, and the
 : alternatives are returned together as a sequence.
 :
 : HDORTH -> <headword>, POS -> <part_of_speech>, ORTH -> <variant>.
 : For ORTH the first variant is the spelling with the optional letters kept,
 : i.e. the text with "(" and ")" removed. Text without parentheses is
 : unaffected by translate(), so plain forms also become <variant>.
 : Nodes with any other class are returned unchanged.
 :
 : @param $nodes the nodes to convert
 : @return the converted nodes, in input order
 :)
declare function local:node-change($nodes as node()*) as node()* {
  for $span in $nodes
  let $variant1 :=
    if ($span/@class = "HDORTH") then <headword>{$span/text()}</headword>
    else if ($span/@class = "POS") then <part_of_speech>{$span/text()}</part_of_speech>
    else if ($span/@class = "ORTH") then <variant>{translate($span/text(), '()', '')}</variant>
    else $span
  (: Implement an `if` here to produce the other variant when one is needed.
     The empty sequence contributes nothing to the result, so no placeholder
     element has to be stripped afterwards. :)
  let $variant2 := ()
  return
    ($variant1, $variant2)
};
If there are three variants (not sure), just follow the same pattern. Of course, the else for any one of the additional variants could be an empty element that you strip at the end (i.e. else <empty/>), and then just strip any <empty/> in the result.
Something like this, maybe (guessing at the rules):
xquery version "3.0";
(:~
 : Converts dictionary <span> nodes into semantic XML and expands spellings
 : that mark optional letters with parentheses.
 :
 : HDORTH -> <headword>, POS -> <part_of_speech>, ORTH -> <variant>.
 : For an ORTH span up to two <variant> elements are produced, in this order:
 :   1. the short form, with every parenthesised segment removed
 :      ("on(e" -> "on", "an(n)e" -> "ane");
 :   2. the long form, with only the parenthesis characters removed
 :      ("on(e" -> "one", "an(n)e" -> "anne").
 : The long form is emitted only when it differs from the short form, so a
 : spelling without parentheses yields exactly one <variant>.
 : Nodes with any other class are returned unchanged.
 :
 : No <empty/> placeholder elements are returned; a caller that still filters
 : them out keeps working, the filter simply finds nothing to remove.
 :
 : @param $nodes the nodes to convert
 : @return the converted nodes, in input order
 :)
declare function local:node-change($nodes as node()*) as node()* {
  for $span in $nodes
  let $class := string($span/@class)
  (: Join the text children into one string so that a span holding several
     text nodes cannot make the string functions fail on a multi-item
     argument. :)
  let $text := string-join($span/text(), '')
  return
    if ($class = "HDORTH") then <headword>{$span/text()}</headword>
    else if ($class = "POS") then <part_of_speech>{$span/text()}</part_of_speech>
    else if ($class = "ORTH") then
      (: The pattern removes "(" plus what follows it up to an optional ")",
         which covers both "an(n)e" and the unclosed "on(e". The outer
         translate() drops a stray ")" that has no opening partner. :)
      let $short := translate(replace($text, '\([^\(\)]*\)?', ''), ')', '')
      let $long := translate($text, '()', '')
      return (
        <variant>{$short}</variant>,
        if ($long ne $short) then <variant>{$long}</variant> else ()
      )
    else $span
};
(: Inline sample of one dictionary cell, used to demonstrate
   local:node-change. Everything between <td> and </td> is literal element
   content, so an ampersand has to be written as the entity &amp; there; a
   bare ampersand is a syntax error in a direct element constructor. :)
let $cell := <td>
<span class="FORM">
<span class="HDORTH">a</span>
<span class="POS"> indef. art. </span> Also
<span class="ORTH">an</span>. Early forms: as subj.,
<span class="ORTH">ane</span>,
<span class="ORTH">on</span>,
<span class="ORTH">o</span>; as obj.,
<span class="ORTH">ane</span>,
<span class="ORTH">on(e</span>,
<span class="ORTH">o</span>, &amp; (chiefly masc.)
<span class="ORTH">an(n)e</span>,
<span class="ORTH">æn(n)e</span>,
<span class="ORTH">en(n)e</span>,
<span class="ORTH">en</span>; after prep.,chiefly
<span class="ORTH">ane</span>,
<span class="ORTH">on(e</span>, masc. also
<span class="ORTH">anne</span>,
<span class="ORTH">æn(n)e</span>, fem. also
<span class="ORTH">anre</span>,
<span class="ORTH">are</span>,
<span class="ORTH">hare</span>,
<span class="ORTH">ore</span>; gen.
<span class="ORTH">anes</span>,
<span class="ORTH">æn(n)es</span>,
<span class="ORTH">en(n)es</span>.</span>
</td>
(: Only the element children of the outer FORM span are selected; the loose
   text between them is not passed on. :)
let $s := $cell/span/*
let $c := local:node-change($s)
return
(: Remove any <empty/> placeholder elements present in the converted
   sequence. :)
$c[not(local-name()='empty')]
returns this:
<headword>a</headword>
<part_of_speech> indef. art. </part_of_speech>
<variant>an</variant>
<variant>ane</variant>
<variant>on</variant>
<variant>o</variant>
<variant>ane</variant>
<variant>on</variant>
<variant>one</variant>
<variant>o</variant>
<variant>ane</variant>
<variant>anne</variant>
<variant>æne</variant>
<variant>ænne</variant>
<variant>ene</variant>
<variant>enne</variant>
<variant>en</variant>
<variant>ane</variant>
<variant>on</variant>
<variant>one</variant>
<variant>anne</variant>
<variant>æne</variant>
<variant>ænne</variant>
<variant>anre</variant>
<variant>are</variant>
<variant>hare</variant>
<variant>ore</variant>
<variant>anes</variant>
<variant>ænes</variant>
<variant>ænnes</variant>
<variant>enes</variant>
<variant>ennes</variant>
Upvotes: 1