Dear Omar,

Some spontaneous ideas:

• You could try to evaluate redundant expressions once and bind them to a
variable instead (see the attached code).
• You could save each document to a separate database via db:create
(depending on your data, this may be faster than replacements in a single
database), or save all new elements in a single document.
• Instead of creating full index structures with each update operation, you
may save a lot of time if you only update parts of the data that have
actually changed.
• If that’s close to impossible (because the types of updates are too
manifold), you could work with daily databases that only contain
incremental changes, and merge them with the main database every night.

2.4 million tags are a lot, though, and the string length of the created
attribute values seems to exceed 100,000 characters, which is a lot, too.
What will you do with the resulting documents?

Best,
Christian


(:
 : Rebuilds the four cached id lists (ascending/descending, by @vutlsk and by
 : @vutlsk-archiv) in the '_qdb-TEI-02__cache' database.
 : The unsorted _:d entries are read once and each sort key is sorted once;
 : the descending lists are obtained by reversing the ascending ones.
 :)
declare namespace _ = "https://www.oeaw.ac.at/acdh/tools/vle/util";

(:
 : Returns the first 10000 @ID/@xml:id values of $nodes as one
 : space-separated string, in the order in which $nodes is supplied.
 :)
let $id-string := function($nodes) {
  (: Use the simple map operator "!" here, not a path step "/": a path
     expression returns its result in document order, which would silently
     discard the order established by sort() and reverse(). :)
  ($nodes ! (@ID, @xml:id))
  => subsequence(1, 10000)
  => string-join(' ')
}

let $db := '_qdb-TEI-02__cache'
(: All unsorted index entries; evaluated once and shared by both sorts. :)
let $nodes := db:open($db)/_:dryed[@order = 'none']/_:d

(: Sort once per key, using the default collation. :)
let $vutlsk := sort($nodes, (), function($n) { $n/@vutlsk })
let $archiv := sort($nodes, (), function($n) { $n/@vutlsk-archiv })

(: NOTE(review): reverse() also reverses the relative order of entries with
   equal keys, so ties may come out in a different order than with
   "order by ... descending" — confirm that this is acceptable. :)
return (
  db:replace($db, 'ascending_cache.xml',
    <_:dryed order="ascending" ids="{ $id-string($vutlsk) }"/>),
  db:replace($db, 'descending_cache.xml',
    <_:dryed order="descending" ids="{ $id-string(reverse($vutlsk)) }"/>),
  db:replace($db, 'ascending-archiv_cache.xml',
    <_:dryed order="ascending" ids="{ $id-string($archiv) }"
      label="archiv"/>),
  db:replace($db, 'descending-archiv_cache.xml',
    <_:dryed order="descending" ids="{ $id-string(reverse($archiv)) }"
      label="archiv"/>)
)
____________________________

On Tue, Nov 12, 2019 at 6:00 PM Omar Siam <omar.s...@oeaw.ac.at> wrote:

> Hi,
>
> I have a custom index that looks like this (one db, different files):
>
> <_:dryed xmlns:_="https://www.oeaw.ac.at/acdh/tools/vle/util"
> db_name="z881_qdb-TEI-02n" order="none">
>    <_:d pre="15627" db_name="z881_qdb-TEI-02n" xml:id="z881_qdbn-d16e2"
> vutlsk="tsįttr Ziter [Subst]" vutlsk-archiv="HK 881, z8810118.sch#1"/>
>    <_:d pre="15673" db_name="z881_qdb-TEI-02n" xml:id="z881_qdbn-d16e21"
> vutlsk="tsįttr Ziter [Subst]" vutlsk-archiv="HK 881, z8810118.sch#1"/>
> ...
> </_:dryed>
> <_:dryed xmlns:_="https://www.oeaw.ac.at/acdh/tools/vle/util"
> db_name="f227_qdb-TEI-02n" order="none">
>    <_:d pre="467" db_name="f227_qdb-TEI-02n" xml:id="f237_qdb-d1e29398"
> vutlsk="(aus)faren [Verb]" vutlsk-archiv="HK 327, f227#944.1 =
> fare0126.eck#1.1"/>
>    <_:d pre="591" db_name="f227_qdb-TEI-02n" xml:id="f237_qdb-d1e29438"
> vutlsk="(aus)faren [Verb]" vutlsk-archiv="HK 327, f227#945.1 =
> fare0126.eck#2.1"/>
> ...
> </_:dryed>
>
> There are about 2.4 Mio _:d tags in this db.
>
> I need to sort them by the @vutlsk* attributes alphabetically in
> ascending and descending order.
>
> With the code I have now:
>
> declare namespace _ = "https://www.oeaw.ac.at/acdh/tools/vle/util";
>
> let $sorted-ascending := subsequence(for $d in
> collection('_qdb-TEI-02__cache')//*[@order="none"]/_:d
>    order by $d/@vutlsk ascending
>    return $d/(@ID, @xml:id)/data(), 1, 10000)
> let $sorted-descending := subsequence(for $d in
> collection('_qdb-TEI-02__cache')//*[@order="none"]/_:d
>    order by $d/@vutlsk descending
>    return $d/(@ID, @xml:id)/data(), 1, 10000)
> let $sorted-ascending-archiv := subsequence(for $d in
> collection('_qdb-TEI-02__cache')//*[@order="none"]/_:d
>    order by $d/@vutlsk-archiv ascending
>    return $d/(@ID, @xml:id)/data(), 1, 10000)
> let $sorted-descending-archiv := subsequence(for $d in
> collection('_qdb-TEI-02__cache')//*[@order="none"]/_:d
>    order by $d/@vutlsk-archiv descending
>    return $d/(@ID, @xml:id)/data(), 1, 10000)
> return (db:replace("_qdb-TEI-02__cache", 'ascending_cache.xml', <_:dryed
> order="ascending" ids="{string-join($sorted-ascending, ' ')}"/>),
> db:replace("_qdb-TEI-02__cache", 'descending_cache.xml', <_:dryed
> order="descending" ids="{string-join($sorted-descending, ' ')}"/>),
> db:replace("_qdb-TEI-02__cache", 'ascending-archiv_cache.xml', <_:dryed
> order="ascending" label="archiv"
> ids="{string-join($sorted-ascending-archiv, ' ')}"/>),
> db:replace("_qdb-TEI-02__cache", 'descending-archiv_cache.xml', <_:dryed
> order="descending" label="archiv"
> ids="{string-join($sorted-descending-archiv, ' ')}"/>))
>
> This takes 30 s to about a minute depending on the subsequence I choose.
>
> I did experiments with doing multithreading and not. Multiple jobs or
> fork-join make it worse.
>
> Worst case I need to do it every time I save a change to the original
> DBs for which I maintain that index.
>
> Any ideas how to speed this up?
>
> Best regards
>
> Omar Siam
>
>

Reply via email to