Yes, I finally implemented it. :-D

Please note that this is just the initial design. If you don't like the API
design, or anything in particular, please tell me!

I've debugged it, and it seems to be running okay, but I'll test it more
thoroughly, and benchmark it later. For what it's worth, it seems to be
somewhat faster.

There is one obvious bug in the implementation which I've highlighted. There
are ways to fix it, but that would make the code messier than it already is,
and AFAIK it currently isn't a problem, but it could be in the future.

- Vishesh Handa
Index: strigifeeder.h
===================================================================
--- strigifeeder.h	(revision 0)
+++ strigifeeder.h	(revision 0)
@@ -0,0 +1,76 @@
+/*
+  Copyright (C) 2010 Vishesh Handa <[email protected]>
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Library General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this library; see the file COPYING.  If not, write to
+  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+  Boston, MA 02110-1301, USA.
+*/
+
+
+#ifndef STRIGIFEEDER_H
+#define STRIGIFEEDER_H
+
+#include <QtCore/QThread>
+#include <QtCore/QMutex>
+#include <QtCore/QWaitCondition>
+#include <QtCore/QUrl>
+#include <QtCore/QQueue>
+#include <QtCore/QStack>
+#include <QtCore/QSet>
+
+namespace Soprano {
+    class Model;
+    class Statement;
+    class Node;
+}
+
+namespace Nepomuk {
+    /// Thread that buffers statements per begin()/end() request and stores them in a Soprano model.
+    class StrigiFeeder : public QThread
+    {
+        Q_OBJECT
+    public:
+        StrigiFeeder( Soprano::Model* model, QObject* parent = 0);
+        virtual ~StrigiFeeder();
+
+        void stop();   ///< sets the stop flag and wakes the thread
+        void run();    ///< thread body: stores the queued requests in the model
+
+    public Q_SLOTS:
+        void begin( const QUrl & url );                      ///< opens a new request for \p url
+        void addStatement( const Soprano::Statement & st );  ///< adds \p st to the request opened last
+        void addStatement( const Soprano::Node & subject,
+                           const Soprano::Node & predicate,
+                           const Soprano::Node & object );
+        void end();                                          ///< queues the request opened last for run()
+
+    private:
+        struct Request {
+            QUrl url;                             ///< URL given to begin()
+            QSet<Soprano::Statement> statements;  ///< statements added while the request was open
+        };
+        QQueue<Request> m_queue;  ///< ended requests waiting for run(); used under m_queueMutex
+        QStack<Request> m_stack;  ///< requests opened by begin() and not ended yet
+
+        Soprano::Model* m_model;  ///< model the statements are added to; not deleted by this class
+
+        QMutex m_queueMutex;           ///< held while m_queue is used and while m_stopped is set
+        QWaitCondition m_queueWaiter;  ///< woken by end() and stop()
+        bool m_stopped;                ///< set by stop(); checked by the loop in run()
+        /// Creates a new graph plus its metadata graph for \p resourceUri; returns the new graph URI.
+        QUrl generateGraph( const QUrl& resourceUri );
+    };
+}
+
+#endif // STRIGIFEEDER_H
Index: nepomukindexwriter.cpp
===================================================================
--- nepomukindexwriter.cpp	(revision 1148863)
+++ nepomukindexwriter.cpp	(working copy)
@@ -22,6 +22,7 @@
 #include "nfo.h"
 #include "nie.h"
 #include "nrl.h"
+#include "strigifeeder.h"
 
 #include <Soprano/Soprano>
 #include <Soprano/Vocabulary/RDF>
@@ -118,6 +119,25 @@ namespace {
         return uri;
     }
 
+    /// Property URI and literal type kept for a registered field (read back via field->writerData()).
+    class RegisteredFieldData
+    {
+    public:
+        RegisteredFieldData( const QUrl& prop, QVariant::Type t )
+        : property( prop ),
+        dataType( t ),
+        isRdfType( prop == Vocabulary::RDF::type() ) {
+        }
+        /// The actual property URI
+        QUrl property;
+
+        /// the literal range of the property (if applicable)
+        QVariant::Type dataType;
+
+        /// true when property is rdf:type; computed once here so the QUrl comparison is not repeated
+        bool isRdfType;
+    };
+
     /**
      * Data objects that are used to store information relative to one
      * indexing run.
@@ -128,7 +148,7 @@ namespace {
         FileMetaData( const Strigi::AnalysisResult* idx );
 
         /// stores basic data including the nie:url and the nrl:GraphMetadata in \p model
-        void storeBasicData( Soprano::Model* model );
+        void storeBasicData( Nepomuk::StrigiFeeder* feeder );
 
         /// map a blank node to a resource
         QUrl mapNode( const std::string& s );
@@ -142,8 +162,8 @@ namespace {
         /// The file info - saved to prevent multiple stats
         QFileInfo fileInfo;
 
-        /// The URI of the graph that contains all indexed statements
-        QUrl context;
+        ///// The URI of the graph that contains all indexed statements
+        //QUrl context;
 
         /// a buffer for all plain-text content generated by strigi
         std::string content;
@@ -156,25 +176,6 @@ namespace {
         QMap<std::string, QUrl> m_blankNodeMap;
     };
 
-    class RegisteredFieldData
-    {
-    public:
-        RegisteredFieldData( const QUrl& prop, QVariant::Type t )
-            : property( prop ),
-              dataType( t ),
-              isRdfType( prop == Vocabulary::RDF::type() ) {
-        }
-
-        /// The actual property URI
-        QUrl property;
-
-        /// the literal range of the property (if applicable)
-        QVariant::Type dataType;
-
-        /// caching QUrl comparison
-        bool isRdfType;
-    };
-
     FileMetaData::FileMetaData( const Strigi::AnalysisResult* idx )
         : m_analysisResult( idx )
     {
@@ -187,7 +188,7 @@ namespace {
         resourceUri = Nepomuk::Resource( fileUrl ).resourceUri();
 
         // use a new random context URI
-        context = Nepomuk::ResourceManager::instance()->generateUniqueUri( "ctx" );
+        //context = Nepomuk::ResourceManager::instance()->generateUniqueUri( "ctx" );
     }
 
     QUrl FileMetaData::mapNode( const std::string& s )
@@ -211,47 +212,20 @@ namespace {
         }
     }
 
-    void FileMetaData::storeBasicData( Soprano::Model* model )
+    void FileMetaData::storeBasicData( Nepomuk::StrigiFeeder * feeder )
     {
-        model->addStatement( resourceUri, Nepomuk::Vocabulary::NIE::url(), fileUrl, context );
+        feeder->addStatement( resourceUri, Nepomuk::Vocabulary::NIE::url(), fileUrl );
 
         // Strigi only indexes files and extractors mostly (if at all) store the nie:DataObject type (i.e. the contents)
         // Thus, here we go the easy way and mark each indexed file as a nfo:FileDataObject.
-        model->addStatement( resourceUri,
+        feeder->addStatement( resourceUri,
                              Vocabulary::RDF::type(),
-                             Nepomuk::Vocabulary::NFO::FileDataObject(),
-                             context );
+                              Nepomuk::Vocabulary::NFO::FileDataObject() );
         if ( fileInfo.isDir() ) {
-            model->addStatement( resourceUri,
+            feeder->addStatement( resourceUri,
                                  Vocabulary::RDF::type(),
-                                 Nepomuk::Vocabulary::NFO::Folder(),
-                                 context );
+                                  Nepomuk::Vocabulary::NFO::Folder() );
         }
-
-
-        // create the provedance data for the data graph
-        // TODO: add more data at some point when it becomes of interest
-        QUrl metaDataContext = Nepomuk::ResourceManager::instance()->generateUniqueUri( "ctx" );
-        model->addStatement( context,
-                             Vocabulary::RDF::type(),
-                             Nepomuk::Vocabulary::NRL::DiscardableInstanceBase(),
-                             metaDataContext );
-        model->addStatement( context,
-                             Vocabulary::NAO::created(),
-                             LiteralValue( QDateTime::currentDateTime() ),
-                             metaDataContext );
-        model->addStatement( context,
-                             Strigi::Ontology::indexGraphFor(),
-                             resourceUri,
-                             metaDataContext );
-        model->addStatement( metaDataContext,
-                             Vocabulary::RDF::type(),
-                             Nepomuk::Vocabulary::NRL::GraphMetadata(),
-                             metaDataContext );
-        model->addStatement( metaDataContext,
-                             Nepomuk::Vocabulary::NRL::coreGraphMetadataFor(),
-                             context,
-                             metaDataContext );
     }
 
     FileMetaData* fileDataForResult( const Strigi::AnalysisResult* idx )
@@ -264,13 +238,24 @@ namespace {
 class Strigi::NepomukIndexWriter::Private
 {
 public:
-    Private()
+    Private( Soprano::Model * model )
+        : repository( model )
     {
         literalTypes[FieldRegister::stringType] = QVariant::String;
         literalTypes[FieldRegister::floatType] = QVariant::Double;
         literalTypes[FieldRegister::integerType] = QVariant::Int;
         literalTypes[FieldRegister::binaryType] = QVariant::ByteArray;
         literalTypes[FieldRegister::datetimeType] = QVariant::DateTime; // Strigi encodes datetime as unsigned integer, i.e. addValue( ..., uint )
+
+        feeder = new Nepomuk::StrigiFeeder( model );
+        feeder->start();
+    }
+
+    ~Private()
+    {
+        feeder->stop();
+        feeder->wait();
+        delete feeder;
     }
 
     QVariant::Type literalType( const Strigi::FieldProperties& strigiType ) {
@@ -310,6 +295,8 @@ public:
 
     QStack<const Strigi::AnalysisResult*> currentResultStack;
 
+    Nepomuk::StrigiFeeder* feeder;
+
 private:
     QHash<std::string, QVariant::Type> literalTypes;
 };
@@ -318,8 +305,8 @@ private:
 Strigi::NepomukIndexWriter::NepomukIndexWriter( Soprano::Model* model )
     : Strigi::IndexWriter()
 {
-    d = new Private;
-    d->repository = model;
+    kDebug() << "Creating Private";
+    d = new Private( model );
     Util::storeStrigiMiniOntology( d->repository );
 }
 
@@ -358,6 +345,7 @@ void Strigi::NepomukIndexWriter::deleteA
 // called for each indexed file
 void Strigi::NepomukIndexWriter::startAnalysis( const AnalysisResult* idx )
 {
+    kDebug() << "Starting Analysis!";
     // we need to remember the AnalysisResult since addTriplet does not provide it
     d->currentResultStack.push(idx);
 
@@ -387,8 +375,12 @@ void Strigi::NepomukIndexWriter::startAn
     if ( data->resourceUri.isEmpty() )
         data->resourceUri = Nepomuk::ResourceManager::instance()->generateUniqueUri( QString() );
 
+    // Start the feeder
+    kDebug() << "Starting the feeder";
+    d->feeder->begin( data->resourceUri );
+
     // store initial data to make sure newly created URIs are reused directly by libnepomuk
-    data->storeBasicData( d->repository );
+    data->storeBasicData( d->feeder );
 
     // remember the file data
     idx->setWriterData( data );
@@ -419,7 +411,7 @@ void Strigi::NepomukIndexWriter::addValu
         RegisteredFieldData* rfd = reinterpret_cast<RegisteredFieldData*>( field->writerData() );
 
         // the statement we will create, we will determine the object below
-        Soprano::Statement statement( md->resourceUri, rfd->property, Soprano::Node(), md->context );
+        Soprano::Statement statement( md->resourceUri, rfd->property, Soprano::Node() );
 
         //
         // Strigi uses rdf:type improperly since it stores the value as a string. We have to
@@ -461,12 +453,12 @@ void Strigi::NepomukIndexWriter::addValu
             if ( value[0] == ':' ) {
                 Nepomuk::Types::Property property( rfd->property );
                 if ( property.range().isValid() ) {
-                    statement.setObject( md->mapNode( value ) );
+                    statement.setObject( Soprano::Node( QUrl( QString::fromUtf8( value.c_str() ) ) ) );
                 }
             }
         }
 
-        d->repository->addStatement( statement );
+        d->feeder->addStatement( statement );
     }
 }
 
@@ -504,10 +496,7 @@ void Strigi::NepomukIndexWriter::addValu
         val = QDateTime::fromTime_t( value );
     }
 
-    d->repository->addStatement( Statement( md->resourceUri,
-                                            rfd->property,
-                                            val,
-                                            md->context) );
+    d->feeder->addStatement( md->resourceUri, rfd->property, val);
 }
 
 
@@ -522,10 +511,7 @@ void Strigi::NepomukIndexWriter::addValu
     FileMetaData* md = fileDataForResult( idx );
     RegisteredFieldData* rfd = reinterpret_cast<RegisteredFieldData*>( field->writerData() );
 
-    d->repository->addStatement( Statement( md->resourceUri,
-                                            rfd->property,
-                                            LiteralValue( value ),
-                                            md->context) );
+    d->feeder->addStatement( md->resourceUri, rfd->property, LiteralValue( value ) );
 }
 
 
@@ -540,10 +526,7 @@ void Strigi::NepomukIndexWriter::addValu
     FileMetaData* md = fileDataForResult( idx );
     RegisteredFieldData* rfd = reinterpret_cast<RegisteredFieldData*>( field->writerData() );
 
-    d->repository->addStatement( Statement( md->resourceUri,
-                                            rfd->property,
-                                            LiteralValue( value ),
-                                            md->context) );
+    d->feeder->addStatement( md->resourceUri, rfd->property, LiteralValue( value ) );
 }
 
 
@@ -557,15 +540,15 @@ void Strigi::NepomukIndexWriter::addTrip
 
     FileMetaData* md = fileDataForResult( d->currentResultStack.top() );
 
-    QUrl subject = md->mapNode( s );
+    Soprano::Node subject( QUrl( QString::fromUtf8( s.c_str() ) ) );
     Nepomuk::Types::Property property( md->mapNode( p ) );
     Soprano::Node object;
     if ( property.range().isValid() )
-        object = md->mapNode( o );
+        object = Soprano::Node( QUrl( QString::fromUtf8( o.c_str() ) ) );
     else
         object = Soprano::LiteralValue::fromString( QString::fromUtf8( o.c_str() ), property.literalRangeType().dataTypeUri() );
 
-    d->repository->addStatement( subject, property.uri(), object, md->context );
+    d->feeder->addStatement( subject, property.uri(), object );
 }
 
 
@@ -582,17 +565,17 @@ void Strigi::NepomukIndexWriter::finishA
 
     // store the full text of the file
     if ( md->content.length() > 0 ) {
-        d->repository->addStatement( Statement( md->resourceUri,
+        d->feeder->addStatement( md->resourceUri,
                                                 Nepomuk::Vocabulary::NIE::plainTextContent(),
-                                                LiteralValue( QString::fromUtf8( md->content.c_str() ) ),
-                                                md->context ) );
-        if ( d->repository->lastError() )
-            kDebug() << "Failed to add" << md->resourceUri << "as text" << QString::fromUtf8( md->content.c_str() );
+                                 LiteralValue( QString::fromUtf8( md->content.c_str() ) ) );
     }
 
     // cleanup
     delete md;
     idx->setWriterData( 0 );
+
+    // Handle the feeder
+    d->feeder->end();
 }
 
 
Index: CMakeLists.txt
===================================================================
--- CMakeLists.txt	(revision 1148863)
+++ CMakeLists.txt	(working copy)
@@ -11,6 +11,7 @@ set( strigi_nepomuk_indexer_SRCS
   nepomukindexmanager.cpp
   nepomukindexreader.cpp
   nepomukindexwriter.cpp
+  strigifeeder.cpp
   util.cpp
 )
 
Index: strigifeeder.cpp
===================================================================
--- strigifeeder.cpp	(revision 0)
+++ strigifeeder.cpp	(revision 0)
@@ -0,0 +1,269 @@
+/*
+  Copyright (C) 2010 Vishesh Handa <[email protected]>
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Library General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this library; see the file COPYING.  If not, write to
+  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+  Boston, MA 02110-1301, USA.
+*/
+
+
+#include "strigifeeder.h"
+#include "nrl.h"
+#include "util.h"
+
+#include <QtCore/QDateTime>
+
+#include <Soprano/Model>
+#include <Soprano/Statement>
+#include <Soprano/QueryResultIterator>
+#include <Soprano/Vocabulary/RDF>
+#include <Soprano/Vocabulary/NAO>
+
+#include <Nepomuk/ResourceManager>
+#include <Nepomuk/Resource>
+
+#include <KDebug>
+
+
+/// Stores \p model for later use by run(); the thread itself is not started here.
+Nepomuk::StrigiFeeder::StrigiFeeder(Soprano::Model* model, QObject* parent)
+    : QThread( parent ),
+      m_model( model ),
+      m_stopped( false )
+{}
+
+
+Nepomuk::StrigiFeeder::~StrigiFeeder()
+{   // empty: the thread is neither stopped nor waited for here, and m_model is not deleted
+}
+
+
+/// Opens a new request for \p url; following addStatement() calls fill it until end().
+void Nepomuk::StrigiFeeder::begin( const QUrl & url )
+{
+    kDebug() << "BEGINING";
+    Request opened;     // starts with an empty statement set
+    opened.url = url;
+    m_stack.push( opened );
+}
+
+
+/// Adds \p st to the request opened by the most recent begin(); begin() must have been called.
+void Nepomuk::StrigiFeeder::addStatement(const Soprano::Statement& st)
+{
+    Q_ASSERT( !m_stack.isEmpty() );
+
+    // statements is a QSet, so inserting an equal statement again changes nothing
+    m_stack.top().statements.insert( st );
+}
+
+
+void Nepomuk::StrigiFeeder::addStatement(const Soprano::Node& subject, const Soprano::Node& predicate, const Soprano::Node& object)
+{   // forwards to the Statement overload, leaving the statement's context node empty
+    addStatement( Soprano::Statement( subject, predicate, object, Soprano::Node() ) );
+}
+
+
+/// Closes the request opened by the latest begin() and queues it for run().
+void Nepomuk::StrigiFeeder::end()
+{
+    if( m_stack.isEmpty() )
+        return;
+    kDebug() << "ENDING";
+
+    const Request finished = m_stack.pop();
+    {
+        QMutexLocker guard( &m_queueMutex );    // m_queue is shared with run()
+        m_queue.enqueue( finished );
+    }
+    m_queueWaiter.wakeAll();
+}
+
+
+void Nepomuk::StrigiFeeder::stop()
+{
+    QMutexLocker lock( &m_queueMutex );   // the mutex run() passes to m_queueWaiter.wait()
+    m_stopped = true;                     // run() checks this flag to leave its loop
+    m_queueWaiter.wakeAll();              // wake the thread in case it is waiting for input
+}
+
+namespace {
+
+    struct ResourceStruct {   // one subject together with everything stated about it
+        QUrl uri;                                   // URI of the resource (initially the subject URI)
+        QMultiHash<QUrl, Soprano::Node> propHash;   // predicate URI -> object node(s)
+    };
+    // Resources built from one statement set, keyed by the subject URI used in those statements
+    typedef QHash<QUrl, ResourceStruct> ResourceHash;
+
+    /// Groups the statements of \p set by subject URI, collecting predicate/object pairs per subject.
+    ResourceHash convertToResourceHash(const QSet<Soprano::Statement> & set ) {
+        ResourceHash grouped;
+
+        foreach( const Soprano::Statement & statement, set ) {
+            const QUrl subject = statement.subject().uri();
+
+            // first statement seen for this subject: create its entry before filling it
+            ResourceHash::iterator entry = grouped.find( subject );
+            if( entry == grouped.end() ) {
+                ResourceStruct fresh;
+                fresh.uri = subject;
+                entry = grouped.insert( subject, fresh );
+            }
+
+            entry->propHash.insert( statement.predicate().uri(), statement.object() );
+        }
+        return grouped;
+    }
+
+    /// Builds a SPARQL query selecting at most one ?r that has every predicate/object pair of \p rs.
+    QString toSparql( const ResourceStruct & rs ) {
+        QString sparql = QString::fromLatin1("select distinct ?r where { ");
+
+        // one triple pattern per (predicate, object) pair, all bound to the same ?r
+        foreach( const QUrl & predicate, rs.propHash.uniqueKeys() ) {
+            const QString predicateN3 = Soprano::Node::resourceToN3( predicate );
+            foreach( const Soprano::Node & object, rs.propHash.values( predicate ) ) {
+                sparql += " ?r " + predicateN3 + " " + object.toN3() + " . ";
+            }
+        }
+        sparql += " } LIMIT 1";
+        return sparql;
+    }
+
+    /// Adds every predicate/object pair of \p rs to \p model as a statement in graph \p context.
+    void add( Soprano::Model * model, const ResourceStruct &rs, const QUrl & context ) {
+        typedef QMultiHash<QUrl, Soprano::Node>::const_iterator PropIterator;
+        const PropIterator last = rs.propHash.constEnd();
+        for( PropIterator prop = rs.propHash.constBegin(); prop != last; ++prop ) {
+            const Soprano::Statement st( rs.uri, prop.key(), prop.value(), context );
+            kDebug() << "ADDING : " << st;
+            model->addStatement( st );
+        }
+    }
+}
+
+
+/// Thread body: stores every queued Request in m_model and returns once stop() has been called.
+void Nepomuk::StrigiFeeder::run()
+{
+    // m_stopped is not reset here, so a stop() issued before the thread gets going is honoured.
+    m_queueMutex.lock();
+    for( ;; ) {
+        // work the queue
+        while( !m_queue.isEmpty() ) {
+            Request request = m_queue.dequeue();
+
+            // the model is updated without holding the lock, so end() is never blocked for long
+            m_queueMutex.unlock();
+
+            kDebug() << " Converting to ResourceHash ..";
+            ResourceHash hash = convertToResourceHash( request.statements );
+
+            // Search for the resources or create them
+            kDebug() << " Searching for duplicates or creating them ... ";
+            QMutableHashIterator<QUrl, ResourceStruct> it( hash );
+            while( it.hasNext() ) {
+                it.next();
+                if( it.key() == request.url )
+                    continue;
+
+                // Reuse a stored resource that already carries all of these values, if there is one
+                ResourceStruct & rs = it.value();
+                QString query = toSparql( rs );
+                kDebug() << query;
+                Soprano::QueryResultIterator match = m_model->executeQuery( query, Soprano::Query::QueryLanguageSparql );
+
+                if( match.next() ) {
+                    kDebug() << "Found exact match " << rs.uri << " " << match[0].uri();
+                    rs.uri = match[0].uri();
+                }
+                else {
+                    kDebug() << "Creating ..";
+                    rs.uri = ResourceManager::instance()->generateUniqueUri( QString() );
+                }
+
+                // Add to the repository
+                QUrl context = generateGraph( rs.uri );
+                add( m_model, rs, context );
+            }
+
+            // Fix links for main: an object URI without a scheme is replaced by the URI chosen above
+            ResourceStruct & rs = hash[ request.url ];
+            QMutableHashIterator<QUrl, Soprano::Node> iter( rs.propHash );
+            while( iter.hasNext() ) {
+                iter.next();
+                QUrl uri = iter.value().uri();
+
+                if( uri.isEmpty() )
+                    continue;
+
+                if( uri.scheme().isEmpty() ) {
+                    QUrl newUri = hash.value( uri ).uri;
+                    kDebug() << uri << " ---> " << newUri;
+                    iter.value() = Soprano::Node( newUri );
+                }
+            }
+
+            // Add main file to the repository
+            QUrl context = generateGraph( rs.uri );
+            add( m_model, rs, context );
+
+            // lock for next iteration
+            m_queueMutex.lock();
+        }
+
+        // Test the flag while holding the mutex: if stop() ran while a request was being
+        // stored, its wakeAll() has already passed and wait() below would block forever.
+        if( m_stopped )
+            break;
+
+        kDebug() << "Waiting...";
+        m_queueWaiter.wait( &m_queueMutex );
+        kDebug() << "Woke up.";
+    }
+    m_queueMutex.unlock();
+}
+
+
+/// Creates a new graph for \p resourceUri together with the metadata graph describing it.
+/// Returns the URI of the new graph (not of the metadata graph).
+QUrl Nepomuk::StrigiFeeder::generateGraph( const QUrl & resourceUri )
+{
+    Nepomuk::ResourceManager* manager = Nepomuk::ResourceManager::instance();
+    const QUrl dataGraph = manager->generateUniqueUri( "ctx" );
+    const QUrl metaGraph = manager->generateUniqueUri( "ctx" );
+
+    // create the provenance data for the data graph
+    // TODO: add more data at some point when it becomes of interest
+
+    // the data graph: its type, creation time and the resource it was created for
+    m_model->addStatement( dataGraph, Soprano::Vocabulary::RDF::type(),
+                           Nepomuk::Vocabulary::NRL::DiscardableInstanceBase(), metaGraph );
+
+    m_model->addStatement( dataGraph, Soprano::Vocabulary::NAO::created(),
+                           Soprano::LiteralValue( QDateTime::currentDateTime() ), metaGraph );
+
+    m_model->addStatement( dataGraph, Strigi::Ontology::indexGraphFor(),
+                           resourceUri, metaGraph );
+
+    // the metadata graph: its type and the graph it describes (stored in itself)
+    m_model->addStatement( metaGraph, Soprano::Vocabulary::RDF::type(),
+                           Nepomuk::Vocabulary::NRL::GraphMetadata(), metaGraph );
+
+    m_model->addStatement( metaGraph, Nepomuk::Vocabulary::NRL::coreGraphMetadataFor(),
+                           dataGraph, metaGraph );
+
+    return dataGraph;
+}
_______________________________________________
Nepomuk mailing list
[email protected]
https://mail.kde.org/mailman/listinfo/nepomuk

Reply via email to