Hello everyone,

Let me go straight to the point. I have the upcoming version of kdepim 
installed and found myself with a huge amount of duplicate information in 
nepomuk. This slows down searches and makes things like kpeopletag nearly 
unusable. I started playing around with a script I found in playground 
(playground/nepomuk-kde/toolbox/mergeresources.py). The attached version 
exposes this problem. 

After a month of using the upcoming kdepim, this script took nearly 6 hours to 
complete for the tasks alone. Since more and more Nepomuk feeders are being 
deployed and used, the duplication should be fixed in the feeders themselves. 
I am contributing my changes to the mentioned script so that people involved 
in kdepim development can solve this issue.

Thanks a lot for all of your work. It is greatly appreciated.
-- 
Luis A. C. Silva
#!/usr/bin/env python
# -*- coding: utf-8 -*-

 
from PyKDE4.nepomuk import Nepomuk
from PyKDE4.soprano import Soprano
from PyQt4.QtCore import QUrl, QDateTime

import sys

def mergeResources(res, resToMerge):
  """Redirect references from each resource in resToMerge to res.

  res        -- URI of the resource that is kept.
  resToMerge -- iterable of URIs of the resources to fold into res.

  For every URI in resToMerge the URI is printed, then:
    * all statements that have the URI as subject are removed (they are
      dropped, not copied over to res);
    * all statements that have the URI as object are removed and added
      again with res as their object.
  """
  for duplicate in resToMerge:
    duplicateNode = Soprano.Node(duplicate)
    print(duplicateNode.toString())

    model = Nepomuk.ResourceManager.instance().mainModel()
    # A default-constructed node is passed wherever any value should match.
    wildcard = Soprano.Node()

    model.removeAllStatements(duplicateNode, wildcard, wildcard)

    # Collect the incoming references before deleting them, so that they
    # can be re-created pointing at the kept resource.
    incoming = model.listStatements(wildcard, wildcard, duplicateNode).allStatements()
    model.removeAllStatements(wildcard, wildcard, duplicateNode)
    for statement in incoming:
      statement.setObject(Soprano.Node(res))
      model.addStatement(statement)

def purgeResources(resToMerge):
  """Remove every statement that mentions one of the given resources.

  resToMerge -- iterable of URIs of the resources to purge.

  For every URI the URI is printed, then all statements that have it as
  subject and all statements that have it as object are removed from the
  main model.
  """
  for doomed in resToMerge:
    doomedNode = Soprano.Node(doomed)
    print(doomedNode.toString())

    model = Nepomuk.ResourceManager.instance().mainModel()
    # A default-constructed node is passed wherever any value should match.
    wildcard = Soprano.Node()

    model.removeAllStatements(doomedNode, wildcard, wildcard)
    model.removeAllStatements(wildcard, wildcard, doomedNode)


def _escapeSparqlLiteral(text):
  """Escape text for use inside a double-quoted SPARQL string literal.

  text -- the label to embed; only its replace(before, after) method is used.

  Returns the text with backslash, double quote, line feed and carriage
  return replaced by their backslash escapes. Those are the characters that
  may not appear unescaped inside a "..." literal.
  """
  # The backslash has to be handled first, otherwise the backslashes
  # introduced by the other replacements would be escaped a second time.
  for raw, escaped in (('\\', '\\\\'), ('"', '\\"'), ('\n', '\\n'), ('\r', '\\r')):
    text = text.replace(raw, escaped)
  return text


def _findDuplicatesByLabel(caption, typePattern, labelProperty):
  """Group the resources matched by typePattern by the value of labelProperty.

  caption       -- prefix used when printing each label and its count.
  typePattern   -- SPARQL triple pattern (ending in " .") selecting ?r.
  labelProperty -- property whose value is used as the grouping label.

  Prints one line per distinct label and returns a list of lists of QUrls,
  one inner list per label. An inner list may hold a single URI (no
  duplicates for that label) or be empty (the exact-match lookup found
  nothing), so callers must not assume at least two entries.
  """
  model = Nepomuk.ResourceManager.instance().mainModel()
  sparql = Soprano.Query.QueryLanguageSparql

  countQuery = "select distinct ?label count(?r) where { %s ?r %s ?label . }" % (typePattern, labelProperty)
  groups = []
  for row in model.executeQuery(countQuery, sparql).allBindings():
    print("%s: %s (%d)" % (caption, row.value(0).toString(), row.value(1).literal().toInt()))
    # toString() is called again so the escaping works on its own copy of
    # the label and cannot affect the text printed above.
    literal = _escapeSparqlLiteral(row.value(0).toString())
    lookupQuery = "select ?r where { %s ?r %s \"%s\"^^xsd:string . }" % (typePattern, labelProperty, literal)
    matches = model.executeQuery(lookupQuery, sparql).iterateBindings(0).allNodes()
    groups.append([node.uri() for node in matches])
  return groups


def findMusicAlbumDuplicates():
  """Returns a list of lists of QUrls: nmm:MusicAlbum resources grouped by nie:title."""
  return _findDuplicatesByLabel("Music Album", "?r a nmm:MusicAlbum .", "nie:title")


def findPeopleDuplicates():
  """Returns a list of lists of QUrls: nco:PersonContact resources grouped by nao:prefLabel."""
  return _findDuplicatesByLabel("Contact", "?r a nco:PersonContact .", "nao:prefLabel")


def findTagDuplicates():
  """Returns a list of lists of QUrls: nao:Tag resources grouped by nao:prefLabel."""
  return _findDuplicatesByLabel("Tag", "?r a nao:Tag .", "nao:prefLabel")


def findTaskDuplicates():
  """Returns a list of lists of QUrls: resources carrying the 'view-pim-task' symbol, grouped by nao:prefLabel."""
  return _findDuplicatesByLabel(
    "Task",
    "?r nao:hasSymbol 'view-pim-task'^^<http://www.w3.org/2001/XMLSchema#string> .",
    "nao:prefLabel")

def _mergeDuplicateGroups(groups):
  """Merge each group of duplicates into the first resource of the group.

  groups -- list of lists of QUrls as returned by the find*Duplicates
            functions.
  """
  for duplicates in groups:
    # A group with fewer than two entries has nothing to merge, and an
    # empty group would make duplicates[0] raise IndexError.
    if len(duplicates) < 2:
      continue
    mergeResources(duplicates[0], duplicates[1:])


print("Merging Music Album Duplicates...")
albums = findMusicAlbumDuplicates()
_mergeDuplicateGroups(albums)

print("Merging Tag Duplicates...")
tags = findTagDuplicates()
_mergeDuplicateGroups(tags)

print("Merging People Duplicates...")
people = findPeopleDuplicates()
_mergeDuplicateGroups(people)

print("Merging Task Duplicates...")
tasks = findTaskDuplicates()
_mergeDuplicateGroups(tasks)
# Alternative for tasks, left disabled: discard the duplicates instead of
# merging them by calling purgeResources(duplicates[1:]) for every group
# returned by findTaskDuplicates().

_______________________________________________
Nepomuk mailing list
[email protected]
https://mail.kde.org/mailman/listinfo/nepomuk

Reply via email to