Reputation: 13
I'm trying to write an XQuery function to tokenize a string on a delimiter whilst ignoring delimiters inside nested bracketed expressions e.g.
tokenizeOutsideBrackets("1,(2,3)" , ",") => ( "1" , "(2,3)" )
tokenizeOutsideBrackets("1,(2,(3,4))" , ",") => ( "1" , "(2,(3,4))" )
tokenizeOutsideBrackets("1,(2,(3,(4,5)))" , ",") => ( "1" , "(2,(3,(4,5)))" )
tokenizeOutsideBrackets("1,(2,(3,4),5),6" , ",") => ( "1" , "(2,(3,4),5)" , "6" )
If I had recursive regexes or an imperative language this would be fairly trivial but I'm struggling to find a simple, easy way to do this in XQuery.
Thanks!
Upvotes: 1
Views: 1566
Reputation: 13
Been playing about and the function below seems to work, although I can't help thinking there's an easier way.
This code uses the functx:index-of-string function to find the indexes of all the delimiters. It then tries each to find the first delimiter where everything to the left has an equal number of opening and closing brackets. After this is found, this is repeated with everything to the right of this delimiter.
(:~
 : Splits $arg on every occurrence of $delimiter that lies outside
 : parentheses, e.g. ("1,(2,(3,4),5),6", ",") gives "1", "(2,(3,4),5)", "6".
 :
 : A delimiter is taken as a split point when the text to its left contains
 : as many "(" as ")" characters (as judged by local:hasMatchedBrackets).
 :
 : @param $arg        the string to split; empty sequence or "" yields ()
 : @param $delimiter  literal delimiter text (not a regular expression)
 : @return the tokens in document order. If the brackets never balance, the
 :         remaining text is returned as a single token instead of being
 :         truncated. A trailing delimiter does not produce a trailing
 :         empty token.
 :)
declare function local:tokenizeOutsideBrackets(
  $arg as xs:string?,
  $delimiter as xs:string) as xs:string*
{
  if ($delimiter = '' or not(contains($arg, $delimiter)))
  then
    (: Nothing to split on. An empty delimiter is treated the same way,
       because searching for it would otherwise recurse without end. :)
    if (empty($arg) or $arg = '') then () else $arg
  else
    (: First delimiter whose left-hand side has matching bracket counts. :)
    let $splitPos :=
      (for $pos in functx:index-of-string($arg, $delimiter)
       where local:hasMatchedBrackets(substring($arg, 1, $pos - 1))
       return $pos)[1]
    return
      if (empty($splitPos))
      then
        (: Every delimiter sits inside brackets, or the brackets are
           unbalanced: keep the text intact as one token. :)
        $arg
      else
        (
          substring($arg, 1, $splitPos - 1),
          (: Two-argument substring takes everything after the delimiter. :)
          local:tokenizeOutsideBrackets(
            substring($arg, $splitPos + string-length($delimiter)),
            $delimiter
          )
        )
};
(:~
 : Reports whether $arg contains the same number of "(" and ")" characters.
 : Only the counts are compared; the order of the brackets is not checked.
 :
 : @param $arg  the text to inspect
 : @return true() when the two counts are equal (including when both are 0)
 :)
declare function local:hasMatchedBrackets($arg as xs:string) as xs:boolean
{
  (: Removing one kind of bracket shortens the string by its count, so equal
     remaining lengths mean equal counts. :)
  let $withoutOpening := translate($arg, '(', '')
  let $withoutClosing := translate($arg, ')', '')
  return string-length($withoutOpening) = string-length($withoutClosing)
};
Upvotes: 0
Reputation:
This XQuery expression:
(: Rewrites each top-level term (a run of digits, or a parenthesised group)
   plus its optional trailing comma as "term;", then splits on ";".
   Caveats: "\(.*\)" is greedy, so it spans from the first "(" to the last
   ")" and merges sibling groups such as '1,(2,3),(4,5),6'; the input must
   not itself contain ";"; and because the last term also gets a ";"
   appended, tokenize returns a trailing zero-length string. :)
tokenize(replace('1,(2,(3,4),5),6','([0123456789]+|\(.*\))(,)?','$1;'),';')
Output:
1 (2,(3,4),5) 6
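Update: If there are going to be strings like '1,(2,3),(4,5),6'
(that is, more than one bracketed group at the same level), then the regular expression above is not enough and you will need a parser for this grammar: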
exp ::= term ( ',' term ) *
term ::= num | '(' exp ')'
num ::= ( '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' ) +
Upvotes: 1
Reputation: 5256
One way to do this is to split first, then join tokens with unbalanced parentheses to their right hand side neighbors.
The code below will get you the desired results. It uses fn:tokenize to split, then (tail-)recursively processes the resulting tokens, concatenating a token onto its predecessor when the predecessor has non-matching counts of "(" and ")". This approach has some deficiencies: it only compares bracket counts rather than checking that left and right brackets are properly paired, and it treats $delimiter both as a regular-expression pattern (when splitting) and as a literal (when re-joining). More code is needed to handle these cases properly, but this should give you the idea.
(:~
 : Splits $string on $delimiter, keeping delimiters that occur inside
 : parentheses: the string is split on every delimiter first, and
 : local:joinBrackets then glues back together the pieces that were cut
 : inside a bracketed group.
 :
 : @param $string     the text to split
 : @param $delimiter  literal delimiter text; must not be the empty string,
 :                    since fn:tokenize rejects patterns matching ""
 : @return the sequence of tokens
 :)
declare function local:tokenizeOutsideBrackets($string, $delimiter)
{
  (: fn:tokenize interprets its second argument as a regular expression,
     whereas joinBrackets re-inserts $delimiter literally. Escaping the
     regex metacharacters makes both steps agree on a literal delimiter,
     so that e.g. "|" or "." split only where that character appears. :)
  let $literalPattern :=
    replace($delimiter,
            '(\.|\[|\]|\\|\||\-|\^|\$|\?|\*|\+|\{|\}|\(|\))',
            '\\$1')
  return
    local:joinBrackets(tokenize($string, $literalPattern), $delimiter, ())
};
(:~
 : Recursively folds $tokens into $result, re-joining pieces that were split
 : inside a bracketed group.
 :
 : At each step the most recently emitted token is inspected: if it holds
 : unequal numbers of "(" and ")" it is still open, so the next token is
 : appended to it with $delimiter in between; otherwise the next token
 : starts a new entry.
 :
 : @param $tokens     pieces still to be processed, in order
 : @param $delimiter  text re-inserted between pieces that are joined
 : @param $result     tokens accumulated so far; pass () on the first call
 : @return the accumulated tokens once $tokens is exhausted
 :)
declare function local:joinBrackets($tokens, $delimiter, $result)
{
  if (exists($tokens)) then
    let $next := $tokens[1]
    (: Empty sequence on the first call; translate then yields "". :)
    let $pending := $result[last()]
    let $withoutOpening := translate($pending, "(", "")
    let $withoutClosing := translate($pending, ")", "")
    let $accumulated :=
      if (string-length($withoutOpening) != string-length($withoutClosing)) then
        (: Previous token is still inside brackets: extend it. :)
        (subsequence($result, 1, count($result) - 1),
         concat($pending, $delimiter, $next))
      else
        ($result, $next)
    return
      local:joinBrackets(subsequence($tokens, 2), $delimiter, $accumulated)
  else
    $result
};
Upvotes: 0