Hello everyone,
Let me go straight to the point. I have the upcoming version of kdepim
installed and found myself with a huge amount of duplicate information in
nepomuk. This slows down searches and makes things like kpeopletag nearly
unusable. I started playing around with a script I found in playground
(playground/nepomuk-kde/toolbox/mergeresources.py). The attached version
exposes this problem.
After a month of using the upcoming kdepim, this script took nearly 6 hours to
complete for the tasks alone. Since more and more nepomuk-feeders are being
deployed and used, the duplication should be fixed in the feeders themselves.
I am contributing my changes to the mentioned script so that people involved
in the kdepim development can solve this issue.
Thanks a lot for all of your work. It is greatly appreciated.
--
Luis A. C. Silva
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from PyKDE4.nepomuk import Nepomuk
from PyKDE4.soprano import Soprano
from PyQt4.QtCore import QUrl, QDateTime
import sys
def mergeResources(res, resToMerge):
    """Redirect every reference to the resources in resToMerge onto res.

    For each duplicate URI, all statements that have it as subject are
    removed, and every statement that has it as object is re-added with res
    as the object instead. The duplicate's own properties are discarded;
    they are not copied onto res.

    res -- URI of the resource to keep.
    resToMerge -- iterable of URIs of the duplicates to eliminate.
    """
    model = Nepomuk.ResourceManager.instance().mainModel()
    target = Soprano.Node(res)
    # Default-constructed node; used in every "match anything" position of
    # the statement patterns below.
    anything = Soprano.Node()
    for uri in resToMerge:
        if uri == res:
            # Merging a resource into itself would remove all of its own
            # statements, destroying the one copy we mean to keep.
            continue
        duplicate = Soprano.Node(uri)
        print(duplicate.toString())
        # Drop the duplicate's outgoing statements.
        model.removeAllStatements(duplicate, anything, anything)
        # Snapshot the incoming statements before removing them, so they can
        # be re-added pointing at the kept resource.
        incoming = model.listStatements(anything, anything, duplicate).allStatements()
        model.removeAllStatements(anything, anything, duplicate)
        for statement in incoming:
            statement.setObject(target)
            model.addStatement(statement)
def purgeResources(resToMerge):
    """Remove every statement that has one of the given resources as its
    subject or as its object.

    resToMerge -- iterable of resource URIs to wipe from the main model.
    Each URI is printed before its statements are removed.
    """
    for duplicate in resToMerge:
        node = Soprano.Node(duplicate)
        print(node.toString())
        model = Nepomuk.ResourceManager.instance().mainModel()
        # Outgoing statements first, then incoming ones.
        model.removeAllStatements(node, Soprano.Node(), Soprano.Node())
        model.removeAllStatements(Soprano.Node(), Soprano.Node(), node)
def findMusicAlbumDuplicates():
    """Group nmm:MusicAlbum resources that share the same nie:title.

    Prints every title together with the number of resources carrying it.
    Returns a list of lists of QUrls, one inner list per title that the
    count query reports for two or more resources. Titles used only once
    have nothing to merge and are skipped without a second query.
    """
    model = Nepomuk.ResourceManager.instance().mainModel()
    sparql = Soprano.Query.QueryLanguageSparql
    result = []
    query = "select distinct ?title count(?r) where { ?r a nmm:MusicAlbum . ?r nie:title ?title . }"
    for row in model.executeQuery(query, sparql).allBindings():
        count = row.value(1).literal().toInt()
        print("Music Album: %s (%d)" % (row.value(0).toString(), count))
        if count < 2:
            continue
        # Escape everything a double-quoted SPARQL literal cannot contain
        # raw. Backslash goes first so later escapes are not doubled.
        escaped = (row.value(0).toString()
                   .replace('\\', '\\\\')
                   .replace('"', '\\"')
                   .replace('\n', '\\n')
                   .replace('\r', '\\r'))
        query = "select ?r where { ?r a nmm:MusicAlbum . ?r nie:title \"%s\"^^xsd:string . }" % escaped
        nodes = model.executeQuery(query, sparql).iterateBindings(0).allNodes()
        result.append([n.uri() for n in nodes])
    return result
def findPeopleDuplicates():
    """Group nco:PersonContact resources that share the same nao:prefLabel.

    Prints every label together with the number of resources carrying it.
    Returns a list of lists of QUrls, one inner list per label that the
    count query reports for two or more resources. Labels used only once
    have nothing to merge and are skipped without a second query.
    """
    model = Nepomuk.ResourceManager.instance().mainModel()
    sparql = Soprano.Query.QueryLanguageSparql
    result = []
    query = "select distinct ?people count(?r) where { ?r a nco:PersonContact . ?r nao:prefLabel ?people . }"
    for row in model.executeQuery(query, sparql).allBindings():
        count = row.value(1).literal().toInt()
        print("Contact: %s (%d)" % (row.value(0).toString(), count))
        if count < 2:
            continue
        # Escape everything a double-quoted SPARQL literal cannot contain
        # raw. Backslash goes first so later escapes are not doubled.
        escaped = (row.value(0).toString()
                   .replace('\\', '\\\\')
                   .replace('"', '\\"')
                   .replace('\n', '\\n')
                   .replace('\r', '\\r'))
        query = "select ?r where { ?r a nco:PersonContact . ?r nao:prefLabel \"%s\"^^xsd:string . }" % escaped
        nodes = model.executeQuery(query, sparql).iterateBindings(0).allNodes()
        result.append([n.uri() for n in nodes])
    return result
def findTagDuplicates():
    """Group nao:Tag resources that share the same nao:prefLabel.

    Prints every label together with the number of resources carrying it.
    Returns a list of lists of QUrls, one inner list per label that the
    count query reports for two or more resources. Labels used only once
    have nothing to merge and are skipped without a second query.
    """
    model = Nepomuk.ResourceManager.instance().mainModel()
    sparql = Soprano.Query.QueryLanguageSparql
    result = []
    query = "select distinct ?label count(?r) where { ?r a nao:Tag . ?r nao:prefLabel ?label . }"
    for row in model.executeQuery(query, sparql).allBindings():
        count = row.value(1).literal().toInt()
        print("Tag: %s (%d)" % (row.value(0).toString(), count))
        if count < 2:
            continue
        # Escape everything a double-quoted SPARQL literal cannot contain
        # raw. Backslash goes first so later escapes are not doubled.
        escaped = (row.value(0).toString()
                   .replace('\\', '\\\\')
                   .replace('"', '\\"')
                   .replace('\n', '\\n')
                   .replace('\r', '\\r'))
        query = "select ?r where { ?r a nao:Tag . ?r nao:prefLabel \"%s\"^^xsd:string . }" % escaped
        nodes = model.executeQuery(query, sparql).iterateBindings(0).allNodes()
        result.append([n.uri() for n in nodes])
    return result
def findTaskDuplicates():
    """Group task resources that share the same nao:prefLabel.

    A resource is selected when its nao:hasSymbol is the string
    'view-pim-task'. Prints every label together with the number of
    resources carrying it. Returns a list of lists of QUrls, one inner list
    per label that the count query reports for two or more resources.
    Labels used only once have nothing to merge and are skipped without a
    second query.
    """
    model = Nepomuk.ResourceManager.instance().mainModel()
    sparql = Soprano.Query.QueryLanguageSparql
    result = []
    query = "select distinct ?task count(?r) where { ?r nao:hasSymbol 'view-pim-task'^^<http://www.w3.org/2001/XMLSchema#string> . ?r nao:prefLabel ?task . }"
    for row in model.executeQuery(query, sparql).allBindings():
        count = row.value(1).literal().toInt()
        print("Task: %s (%d)" % (row.value(0).toString(), count))
        if count < 2:
            continue
        # Escape everything a double-quoted SPARQL literal cannot contain
        # raw. Backslash goes first so later escapes are not doubled.
        escaped = (row.value(0).toString()
                   .replace('\\', '\\\\')
                   .replace('"', '\\"')
                   .replace('\n', '\\n')
                   .replace('\r', '\\r'))
        query = "select ?r where { ?r nao:hasSymbol 'view-pim-task'^^<http://www.w3.org/2001/XMLSchema#string> . ?r nao:prefLabel \"%s\"^^xsd:string . }" % escaped
        nodes = model.executeQuery(query, sparql).iterateBindings(0).allNodes()
        result.append([n.uri() for n in nodes])
    return result
# Driver: for each kind of resource, find the groups that share a label and
# merge every member of a group into the group's first entry.
#
# To delete the duplicates outright instead of redirecting references to the
# kept resource, call purgeResources(duplicates[1:]) in place of
# mergeResources(...).
for caption, finder in (
    ("Music Album", findMusicAlbumDuplicates),
    ("Tag", findTagDuplicates),
    ("People", findPeopleDuplicates),
    ("Task", findTaskDuplicates),
):
    print("Merging %s Duplicates..." % caption)
    for duplicates in finder():
        if len(duplicates) < 2:
            # An empty group would make duplicates[0] raise IndexError, and a
            # single-entry group has nothing to merge.
            continue
        mergeResources(duplicates[0], duplicates[1:])
_______________________________________________
Nepomuk mailing list
[email protected]
https://mail.kde.org/mailman/listinfo/nepomuk