I don't think fn:tokenize can do this directly.

Here's an example I put together that seems to do the trick:

query

(: Sample input. The literal is kept on one line: a line break inside the
   quotes would become part of the last word, because the phrase is only
   split on spaces and double quotes. :)
declare variable $phrase := 'one two "three four" five six "seven eight" nine 10';


(: Merge the tokens that sit between a pair of '' markers into one
   space-joined phrase.

   $before is the resultant sequence built so far,
   $after  is the sequence left to be combined; a zero-length string in it
           marks the position of a quote.

   Returns $before followed by the processed contents of $after, with every
   marker removed. A lone marker (a quote without a partner) is simply
   dropped and the tokens around it are kept as individual words.
 :)
declare function local:combine( $before as xs:string* , $after as xs:string* ) 
as xs:string*
{
    (: positions of the quote markers still to be processed;
       an empty $after yields no markers and falls into the first branch :)
    let $blanks := fn:index-of( $after , '' )
    return
        if( empty($blanks) ) then
            ( $before , $after )
        else if( count($blanks) eq 1 ) then
            (: unmatched quote: there is no $blanks[2] to pair with, so drop
               the marker instead of doing arithmetic on an empty sequence :)
            ( $before ,
              fn:subsequence( $after , 1 , $blanks[1] - 1 ) ,
              fn:subsequence( $after , $blanks[1] + 1 ) )
        else
            local:combine(
                ( $before ,
                  (: plain words in front of the opening marker :)
                  fn:subsequence( $after , 1 , $blanks[1] - 1 ) ,
                  (: words strictly between the two markers; the "- 1" keeps
                     the closing marker out so no trailing space is added :)
                  fn:string-join(
                      fn:subsequence( $after , $blanks[1] + 1 ,
                                      $blanks[2] - $blanks[1] - 1 ) ,
                      " " )
                ) ,
                (: everything after the closing marker is still to be done :)
                fn:subsequence( $after , $blanks[2] + 1 ) )
};




(: Split a phrase into words, keeping each double-quoted run together as a
   single entry.

   The phrase is first cut at every double quote. Counting from 1, the
   odd-numbered pieces lie outside quotes and are broken into words on runs
   of spaces; the even-numbered pieces lie inside quotes and are returned as
   one entry each, with runs of spaces collapsed to a single space.

   Because the quotes themselves are the delimiters, no blanks are needed
   before or after them, and leading or trailing spaces in the phrase do not
   produce empty words. Empty quoted phrases are dropped. Text following an
   unclosed quote is returned as one phrase.

:)

declare function local:split( $p as xs:string ) as xs:string*
{
    for $piece at $pos in fn:tokenize( $p , '"' )
    (: tokenize yields zero-length strings for leading/trailing separators;
       filter them out so they never show up as words :)
    let $words := fn:tokenize( $piece , ' +' )[. ne '']
    return
        if( $pos mod 2 eq 0 ) then
            (: inside quotes: one entry, unless the quotes were empty :)
            fn:string-join( $words , ' ' )[. ne '']
        else
            (: outside quotes: every word is its own entry :)
            $words
};

(: Query body: wrap every entry returned by local:split for $phrase in its
   own word element, all collected under a single words element. :)
element words {
    for $entry in local:split( $phrase )
    return element word { $entry }
}

 

 

---------------------------   Result

<words>

<word>one</word>

<word>two</word>

<word>three four </word>

<word>five</word>

<word>six</word>

<word>seven eight </word>

<word>nine</word>

<word>10</word>

</words> 

 

 

 

From: [email protected] 
[mailto:[email protected]] On Behalf Of Mariano Grau Calín
Sent: Sunday, December 13, 2009 8:55 AM
To: [email protected]
Subject: [MarkLogic Dev General] Expression regular for tokenizing wordsquotes

 

Hi all,

 

I want to tokenize a text as:

 

'one two "three four" five six" 

 

in

 

"one", "two", "three four", "five", "six"

 

I tried

 

cts:word-query(fn:tokenize('one two "three four" five six"', 
' |"'))

 

result:

<results warning="non-element node">cts:word-query(("one", "two", "", "three", 
"four", "", "five", "six", ""), ("lang=es"), 1)</results>

and

 

cts:word-query(fn:tokenize('one two "three four" five six"', 
'["\.*"]'))

 

result:

 <results warning="non-element node">cts:word-query(("one two ", "three four", 
" five six", ""), ("lang=es"), 1)</results> 

 

Thanks,

 

Mariano Grau

Dpto. Sistemas

Grupo Joly

 

_______________________________________________
General mailing list
[email protected]
http://xqzone.com/mailman/listinfo/general

Reply via email to