Curious what you had to modify?

I actually ran this inside Stylus Studio using DataDirect XQuery ... I didn't 
test directly in ML, but I'd be surprised if it needed changes to work in ML 
(besides the obvious one of creating the search instead of printing the words). 
It's pretty generic XQuery.

 

 

 

From: [email protected] 
[mailto:[email protected]] On Behalf Of Mariano Grau Calín
Sent: Sunday, December 13, 2009 2:04 PM
To: General Mark Logic Developer Discussion
Subject: RE: [MarkLogic Dev General] Expression regular for 
tokenizingwordsquotes

 

Thanks a lot.

 

I had to modify the code for ML 3.2, but it works fine.

 

 

Mariano Grau

Dpto. Sistemas

Grupo Joly

 

 

________________________________

De: [email protected] en nombre de Lee, David
Enviado el: dom 13/12/2009 15:36
Para: General Mark Logic Developer Discussion
Asunto: RE: [MarkLogic Dev General] Expression regular for tokenizingwordsquotes

I don't think fn:tokenize can do this directly.

Here's an example I put together that seems to do the trick:

query

(: Sample input for the query below.
   The literal is kept on one line on purpose: a line break inside the quotes
   becomes part of the string value, and the ' +' separator used when
   tokenizing does not match it, so the last word would come out as a
   newline followed by "10" instead of "10".
 :)
declare variable $phrase := 'one two "three four" five six "seven eight" nine 10';


(: Joins the tokens found between each pair of empty-string markers into one
   space-separated phrase and leaves all other tokens untouched.

   $before  tokens already processed (the result built so far)
   $after   tokens still to be processed

   An empty string in $after marks a quote boundary: the first marker opens a
   phrase and the next one closes it. Tokens ahead of the opening marker are
   copied through as individual words.

   Returns the processed sequence: $before followed by the words and phrases
   produced from $after. Neither marker appears in the output.

   If only one marker is left (an unmatched quote), the marker is dropped and
   the remaining tokens are returned as individual words.
 :)
declare function local:combine( $before as xs:string* , $after as xs:string* ) 
as xs:string*
{
    if( empty($after) ) then $before
    else
    let $blanks := fn:index-of( $after , '' )
    return
        if( empty($blanks) ) then 
            ( $before , $after )
        else if( empty($blanks[2]) ) then
            (: Unmatched quote: there is no closing marker, so $blanks[2] is
               the empty sequence and cannot be used in arithmetic or as a
               subsequence length. Drop the stray marker, keep the words. :)
            ( $before ,
              fn:subsequence( $after , 1 , $blanks[1] - 1 ) ,
              fn:subsequence( $after , $blanks[1] + 1 ) )
        else
            local:combine( ($before , 
                (: plain words ahead of the opening marker :)
                fn:subsequence( $after , 1 , $blanks[1] - 1 ) , 
                (: words strictly between the two markers; the "- 1" keeps the
                   closing marker out of the join, which would otherwise add a
                   trailing space to the phrase :)
                fn:string-join(
                    fn:subsequence( $after , $blanks[1] + 1 , $blanks[2] - $blanks[1] - 1 ) ,
                    " " )
                ),
                (: continue after the closing marker :)
                fn:subsequence( $after , $blanks[2] + 1 ) )
};



(: Splits a phrase into words, keeping each double-quoted run of words
   together as a single item.

   $p  the phrase to split

   fn:tokenize produces a zero-length token wherever two separators are
   adjacent (a space next to a quote) and wherever a separator sits at the
   start of the input. local:combine treats those zero-length tokens as
   quote markers.

   Leading and trailing spaces are removed first, because they would also
   produce zero-length tokens and be mistaken for quote markers.

   This still requires a space before each opening quote and after each
   closing quote, unless the quote is at the very start or end of the phrase.
 :)
declare function local:split( $p as xs:string ) as xs:string*
{
    local:combine( () , fn:tokenize( fn:replace( $p , '^ +| +$' , '' ) , ' +|"' ) )
};

(: Query body: wrap every item returned by local:split for $phrase in its own
   word element, all inside a single words root element. :)
element words {
    for $token in local:split( $phrase )
    return element word { $token }
}

 

 

---------------------------   Result

<words>

<word>one</word>

<word>two</word>

<word>three four </word>

<word>five</word>

<word>six</word>

<word>seven eight </word>

<word>nine</word>

<word>10</word

></words> 

 

 

 

From: [email protected] 
[mailto:[email protected]] On Behalf Of Mariano Grau Calín
Sent: Sunday, December 13, 2009 8:55 AM
To: [email protected]
Subject: [MarkLogic Dev General] Expression regular for tokenizing wordsquotes

 

Hi all,

 

I want to tokenize a text as:

 

'one two "three four" five six"'

 

in

 

"one", "two", "three four", "five", "six"

 

I tried

 

cts:word-query(fn:tokenize('one two "three four" five six"', 
' |"'))

 

result:

<results warning="non-element node">cts:word-query(("one", "two", "", "three", 
"four", "", "five", "six", ""), ("lang=es"), 1)</results>

and

 

cts:word-query(fn:tokenize('one two "three four" five six"', 
'["\.*"]'))

 

result:

 <results warning="non-element node">cts:word-query(("one two ", "three four", 
" five six", ""), ("lang=es"), 1)</results> 

 

Thanks,

 

Mariano Grau

Dpto. Sistemas

Grupo Joly

 

_______________________________________________
General mailing list
[email protected]
http://xqzone.com/mailman/listinfo/general

Reply via email to