Author: dbera
Date: Sun Feb 17 00:34:20 2008
New Revision: 4493
URL: http://svn.gnome.org/viewvc/beagle?rev=4493&view=rev

Log:
Remove the cumbersome GetDocsWithProperty method. Instead, store a field with
the names of all the other properties, whitespace-separated, and use that to
query. Interestingly, this increased the query time; however, the earlier method
only searched the PrimaryIndex, while this one searches both indexes
(and is, of course, much cleaner and a lot less code).
Use a FieldSelector in LuceneBitArray ... again, no improvement in query time.
Still, it is the right thing to do.
Several other minor fixes.


Modified:
   branches/beagle-rdf/beagled/DumpIndex.cs
   branches/beagle-rdf/beagled/LuceneBitArray.cs
   branches/beagle-rdf/beagled/LuceneCommon.cs
   branches/beagle-rdf/beagled/LuceneQueryingDriver.cs
   branches/beagle-rdf/beagled/NoiseFilter.cs

Modified: branches/beagle-rdf/beagled/DumpIndex.cs
==============================================================================
--- branches/beagle-rdf/beagled/DumpIndex.cs    (original)
+++ branches/beagle-rdf/beagled/DumpIndex.cs    Sun Feb 17 00:34:20 2008
@@ -205,7 +205,7 @@
                                int freq;
                                freq = term_enum.DocFreq ();
 
-                               Console.WriteLine ("{0} {1} {2}", index_name, 
term_enum.Term ().Text (), freq);
+                               Console.WriteLine ("{0} '{1}' {2}", index_name, 
term_enum.Term ().Text (), freq);
 
                                // FIXME: spew these as a count
                                ++distinct_term_count;

Modified: branches/beagle-rdf/beagled/LuceneBitArray.cs
==============================================================================
--- branches/beagle-rdf/beagled/LuceneBitArray.cs       (original)
+++ branches/beagle-rdf/beagled/LuceneBitArray.cs       Sun Feb 17 00:34:20 2008
@@ -198,6 +198,8 @@
 
                ////////////////////////////////////////////////////////////
 
+               static string[] fields_uri = { "Timestamp", "Uri" };
+
                public void ProjectOnto (LuceneBitArray other)
                {
                        int j = 0;
@@ -209,7 +211,7 @@
                                j = i+1;
 
                                Document doc;
-                               doc = searcher.Doc (i);
+                               doc = searcher.Doc (i, fields_uri);
 
                                other.AddUri (doc.Get ("Uri"));
                        }

Modified: branches/beagle-rdf/beagled/LuceneCommon.cs
==============================================================================
--- branches/beagle-rdf/beagled/LuceneCommon.cs (original)
+++ branches/beagle-rdf/beagled/LuceneCommon.cs Sun Feb 17 00:34:20 2008
@@ -83,7 +83,8 @@
                // 18: add IsPersistent to properties, and adjust coded values
                //     in AddPropertyToDocument() and GetPropertyFromDocument();
                //     changed subdate field format rules for better readability
-               private const int MAJOR_VERSION = 18;
+               // 19: store a list of current properties in a field
+               private const int MAJOR_VERSION = 19;
                private int minor_version = 0;
 
                private string index_name;
@@ -524,6 +525,9 @@
                                        }
                                } else if (fieldName == "PropertyKeyword")
                                        return new LowerCaseFilter (new 
SingletonTokenStream (reader.ReadToEnd ()));
+                               else if (fieldName == "Properties")
+                                       return new WhitespaceTokenizer (new 
StringReader (reader.ReadToEnd ()));
+
 
                                TokenStream outstream;
                                outstream = base.TokenStream (fieldName, 
reader);
@@ -856,6 +860,11 @@
                                        
                                AddPropertyToDocument (prop, target_doc);
                        }
+
+                       // Now add a field containing a whitespace separated 
list of other fields in the document
+                       AddFieldProperies (primary_doc);
+                       if (secondary_doc != null)
+                               AddFieldProperies (secondary_doc);
                }
 
                static private Document CreateSecondaryDocument (Uri uri, Uri 
parent_uri)
@@ -928,6 +937,7 @@
                                }
                        }
 
+                       AddFieldProperies (new_doc);
                        return new_doc;
                }
 
@@ -949,9 +959,38 @@
                                }
                        }
 
+                       AddFieldProperies (doc);
                        return doc;
                }
 
+               // Add a new field with whitespace separated names of the 
existing fields
+               static protected void AddFieldProperies (Document doc)
+               {
+                       const string Separator = " ";
+
+                       StringBuilder sb = new StringBuilder ();
+                       bool seen_properties = false;
+
+                       foreach (Field f in doc.Fields ()) {
+                               if (f.Name () == "Properties") {
+                                       seen_properties = true;
+                                       continue;
+                               }
+
+                               sb.Append (f.Name ());
+                               sb.Append (Separator);
+                       }
+
+                       if (sb.Length > 0)
+                               sb.Length -= Separator.Length;
+
+                       if (seen_properties)
+                               doc.RemoveFields ("Properties");
+
+                       Field field = new Field ("Properties", sb.ToString (), 
Field.Store.YES, Field.Index.TOKENIZED); // FIXME: Field.Store.No
+                       doc.Add (field);
+               }
+
                static protected Uri GetUriFromDocument (Document doc)
                {
                        string uri;
@@ -1633,11 +1672,13 @@
                                else
                                        field_name = PropertyToFieldName 
(part.Type, part.Key);
 
+                               // Details of the conversion here depends on 
BeagleAnalyzer::TokenStream
                                if (part.Type == PropertyType.Text)
                                        primary_query = StringToQuery 
(field_name, part.Value, term_list);
                                else {
                                        Term term;
-                                       if (field_name.StartsWith ("prop:k:" + 
Property.PrivateNamespace))
+                                       // FIXME: Handle date queries for other 
date fields
+                                       if (part.Type == PropertyType.Internal 
|| field_name.StartsWith ("prop:k:" + Property.PrivateNamespace))
                                                term = new Term (field_name, 
part.Value);
                                        else
                                                term = new Term (field_name, 
part.Value.ToLower ());

Modified: branches/beagle-rdf/beagled/LuceneQueryingDriver.cs
==============================================================================
--- branches/beagle-rdf/beagled/LuceneQueryingDriver.cs (original)
+++ branches/beagle-rdf/beagled/LuceneQueryingDriver.cs Sun Feb 17 00:34:20 2008
@@ -355,7 +355,15 @@
 
                        // Return uris for all documents with this property
                        if (subject == String.Empty && predicate != 
String.Empty && _object == String.Empty) {
-                               return GetDocsWithProperty (predicate, 
pred_type);
+                               string field_name = PropertyToFieldName 
(pred_type, predicate);
+
+                               QueryPart_Property part = new 
QueryPart_Property ();
+                               part.Type = PropertyType.Internal;
+                               part.Key = "Properties";
+                               part.Value = field_name;
+                               query.AddPart (part);
+
+                               return DoLowLevelRDFQuery (query, field_name, 
null);
                        }
 
                        // Property query
@@ -425,120 +433,6 @@
                        throw new Exception ("Never reaches");
                }
 
-               // FIXME FIXME FIXME: Rewrite this horrible method by keeping a 
field containing
-               // the names of all properties in that document ?
-               // What about SecondaryDocument ? Which index to store this 
field in ?
-               private ICollection GetDocsWithProperty (string propname, 
PropertyType prop_type)
-               {
-                       // This is the hardest!
-                       // Most of the times either all docs will have the 
property or
-                       // neither will, but we also have to cover the rare 
cases.
-                       // Possible approach: Do a term_enum with this property 
name.
-                       // Keep a Set of all Docs (rather Uris) which contain 
that term
-                       // (pretty expensive - since most probably all 
documents will contain that
-                       // property).
-                       //
-                       // Another approach: Get all hits from the driver, scan 
them one by one
-                       // and return URIs for the hits which contain the 
property *shudder*
-                       //
-
-                       // FIXME: Uses PrimaryIndex only!
-                       // Create a bitarray and mark all docs with that 
property by using a termenum
-
-                       IndexReader primary_reader;
-                       primary_reader = LuceneCommon.GetReader (PrimaryStore);
-
-                       BetterBitArray all_docs = new BetterBitArray 
(primary_reader.MaxDoc ());
-
-                       TermDocs docs = primary_reader.TermDocs ();
-                       string field_name = PropertyToFieldName (prop_type, 
propname);
-                       Console.WriteLine (field_name);
-                       TermEnum enumerator = primary_reader.Terms (new Term 
(field_name, String.Empty));
-                       Term term;
-                       bool field_present = false;
-
-                       do {
-                               // Find all terms with given field
-                               term = enumerator.Term ();
-                       
-                               if (term.Field () != field_name)
-                                       break;
-
-                               field_present = true;
-
-                               docs.Seek (enumerator);
-
-                               // Find all docs with that term
-                               while (docs.Next ())
-                                       all_docs [docs.Doc ()] = true;
-                       } while (enumerator.Next ());
-                       Console.WriteLine (field_present);
-
-                       enumerator.Close ();
-
-                       // Maxdoc could be millions!
-                       ArrayList hits = new ArrayList (primary_reader.MaxDoc 
());
-
-                       // If field_present is false, preempt
-                       if (! field_present) {
-                               docs.Close ();
-                               LuceneCommon.ReleaseReader (primary_reader);
-
-                               return hits;
-                       }
-
-                       IndexReader secondary_reader = null;
-                       LNS.IndexSearcher secondary_searcher = null;
-
-                       if (SecondaryStore != null) {
-                               secondary_reader = LuceneCommon.GetReader 
(SecondaryStore);
-                               if (secondary_reader.NumDocs () == 0) {
-                                       ReleaseReader (secondary_reader);
-                                       secondary_reader = null;
-                               }
-                       }
-
-                       if (secondary_reader != null)
-                               secondary_searcher = new LNS.IndexSearcher 
(secondary_reader);
-
-                       TermDocs secondary_term_docs = null;
-                       if (secondary_searcher != null)
-                               secondary_term_docs = 
secondary_searcher.Reader.TermDocs ();
-
-                       string[] fields = { "Uri", "Timestamp", field_name };
-
-                       // Go through all Uris now
-                       enumerator = primary_reader.Terms (new Term ("Uri", 
String.Empty));
-                       Document doc;
-
-                       do {
-                               // Find all terms with 
-                               term = enumerator.Term ();
-                       
-                               if (term.Field () != "Uri")
-                                       break;
-
-                               docs.Seek (enumerator);
-                               // Assume only one doc with an uri.
-                               // Go to the doc with this uri
-                               // If this doc's id is present in bit_array, 
return the uri
-                               if (docs.Next () && all_docs [docs.Doc ()]) {
-                                       doc = primary_reader.Document (docs.Doc 
(), fields);
-                                       Hit hit = CreateHit (doc, 
secondary_searcher, secondary_term_docs, fields);
-                                       hits.Add (hit); 
-                               }
-
-                       } while (enumerator.Next ());
-
-                       // Traverse all docs in all_docs
-
-                       enumerator.Close ();
-                       docs.Close ();
-                       LuceneCommon.ReleaseReader (primary_reader);
-
-                       return hits;
-               }
-
                private ICollection DoLowLevelRDFQuery (Query query,
                                                        string field_name,
                                                        string field_value)

Modified: branches/beagle-rdf/beagled/NoiseFilter.cs
==============================================================================
--- branches/beagle-rdf/beagled/NoiseFilter.cs  (original)
+++ branches/beagle-rdf/beagled/NoiseFilter.cs  Sun Feb 17 00:34:20 2008
@@ -248,7 +248,13 @@
        }
 
 #if false
+       // To build: gmcs NoiseFilter.cs LuceneCommon.cs -r:../Util/Util.dll 
-r:../BeagleClient/Beagle.dll -r:BeagleDaemonLib.dll
        public class AnalyzerTest {
+               public static void Main ()
+               {
+                       Analyze (Console.In);
+               }
+
                public static void Analyze (TextReader reader)
                {
                        Lucene.Net.Analysis.Token lastToken = null;
_______________________________________________
SVN-commits-list mailing list (read only)
http://mail.gnome.org/mailman/listinfo/svn-commits-list

Want to limit the commits to a few modules? Go to the above URL, log in to edit
your options, and select the modules ('topics') you want.
Module maintainer? It is possible to set the reply-to to your development
mailing list. Email [EMAIL PROTECTED] if interested.

Reply via email to