rclabo commented on issue #403:
URL: https://github.com/apache/lucenenet/issues/403#issuecomment-764964147


   The first thing to know is that the "Lucene Index" has many parts.  When 
not using compound files, these parts show up as the various files that are 
created.  Looking at just two of them, there is the inverted index (called 
postings) and there are the stored documents (stored fields).  Of these two, 
as best I can tell, there aren't any readily available tunable settings for 
the compression of the inverted index.  
   
   The HIGH_COMPRESSION mode applies only to stored fields.  If you are not 
storing fields and are only using Lucene.Net to create an inverted index, 
then turning on high compression for stored fields won't reduce the size of 
the "Lucene Index".
   
   That said, if you _are_ storing fields and want to use high compression on 
that stored fields data, then you will need to create your own codec that has 
high compression turned on for stored fields.  And to do that, you will first 
need a stored fields format class that has high compression turned on.  Below 
are those two classes, which I have written for you, followed by a unit test 
that uses the new codec.  I haven't tried this code on a large amount of data 
to see the effect; I leave that for you as an exercise.  But this should point 
the way to getting your stored fields compressed with High Compression.
   
   
   ```
   /*
        * Licensed to the Apache Software Foundation (ASF) under one or more
        * contributor license agreements.  See the NOTICE file distributed with
        * this work for additional information regarding copyright ownership.
        * The ASF licenses this file to You under the Apache License, Version 
2.0
        * (the "License"); you may not use this file except in compliance with
        * the License.  You may obtain a copy of the License at
        *
        *     http://www.apache.org/licenses/LICENSE-2.0
        *
        * Unless required by applicable law or agreed to in writing, software
        * distributed under the License is distributed on an "AS IS" BASIS,
        * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 
implied.
        * See the License for the specific language governing permissions and
        * limitations under the License.
        */
   
   /// <summary>
   /// Stored fields format that passes <see cref="CompressionMode.HIGH_COMPRESSION"/>
   /// to <see cref="CompressingStoredFieldsFormat"/>, using the format name
   /// "Lucene41StoredFieldsHighCompression" and a chunk size of 16384 (1 &lt;&lt; 14).
   /// </summary>
   public sealed class Lucene41StoredFieldsHighCompressionFormat : CompressingStoredFieldsFormat {
       // Format name handed to the base class constructor.
       private const string FormatName = "Lucene41StoredFieldsHighCompression";

       // Chunk size handed to the base class constructor (1 << 14 == 16384).
       private const int ChunkSize = 1 << 14;

       /// <summary>
       /// Sole constructor. </summary>
       public Lucene41StoredFieldsHighCompressionFormat()
           : base(FormatName, CompressionMode.HIGH_COMPRESSION, ChunkSize) {
       }
   }
   ```
   Here is a custom codec to use this High Compression format:
   
   ```
   /*
        * Licensed to the Apache Software Foundation (ASF) under one or more
        * contributor license agreements.  See the NOTICE file distributed with
        * this work for additional information regarding copyright ownership.
        * The ASF licenses this file to You under the Apache License, Version 
2.0
        * (the "License"); you may not use this file except in compliance with
        * the License.  You may obtain a copy of the License at
        *
        *     http://www.apache.org/licenses/LICENSE-2.0
        *
        * Unless required by applicable law or agreed to in writing, software
        * distributed under the License is distributed on an "AS IS" BASIS,
        * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 
implied.
        * See the License for the specific language governing permissions and
        * limitations under the License.
        */
   
       using Lucene40LiveDocsFormat = 
Lucene.Net.Codecs.Lucene40.Lucene40LiveDocsFormat;
       using Lucene41StoredFieldsFormat = 
Lucene.Net.Codecs.Lucene41.Lucene41StoredFieldsFormat;
       using Lucene42NormsFormat = 
Lucene.Net.Codecs.Lucene42.Lucene42NormsFormat;
       using Lucene42TermVectorsFormat = 
Lucene.Net.Codecs.Lucene42.Lucene42TermVectorsFormat;
       using PerFieldDocValuesFormat = 
Lucene.Net.Codecs.PerField.PerFieldDocValuesFormat;
       using PerFieldPostingsFormat = 
Lucene.Net.Codecs.PerField.PerFieldPostingsFormat;
   
       /// <summary>
       /// A copy of the Lucene 4.6 codec whose stored fields are written with
       /// <see cref="Lucene41StoredFieldsHighCompressionFormat"/> (high compression).
       /// According to the author's inline note, the stored fields format is the only
       /// difference from the stock <c>Lucene46Codec</c>.
       /// <para/>
       /// Like the stock codec, it has configurable per-field postings and docvalues formats.
       /// <para/>
       /// If you want to reuse functionality of this codec in another codec, extend
       /// <see cref="FilterCodec"/>.
       /// <para/>
       /// See <see cref="Lucene.Net.Codecs.Lucene46"/> package documentation for file format details.
       /// <para/>
       /// @lucene.experimental
       /// </summary>
       // NOTE: if we make largish changes in a minor release, easier to just make Lucene46Codec or whatever
       // if they are backwards compatible or smallish we can probably do the backwards in the postingsreader
       // (it writes a minor version, etc).
       [CodecName("Lucene46HighCompression")]
       public class Lucene46HighCompressionCodec : Codec {
           // <-- This is the only line different than the stock Lucene46Codec.
           private readonly StoredFieldsFormat fieldsFormat = new Lucene41StoredFieldsHighCompressionFormat();
           private readonly TermVectorsFormat vectorsFormat = new Lucene42TermVectorsFormat();
           private readonly FieldInfosFormat fieldInfosFormat = new Lucene46FieldInfosFormat();
           private readonly SegmentInfoFormat segmentInfosFormat = new Lucene46SegmentInfoFormat();
           private readonly LiveDocsFormat liveDocsFormat = new Lucene40LiveDocsFormat();

           // Per-field dispatchers; both are created in the constructor because they capture 'this'.
           private readonly PostingsFormat postingsFormat;
           private readonly DocValuesFormat docValuesFormat;

           // LUCENENET specific - lazy initialize the codecs to ensure we get the correct type if overridden.
           private PostingsFormat defaultFormat;
           private DocValuesFormat defaultDVFormat;

           private readonly NormsFormat normsFormat = new Lucene42NormsFormat();

           /// <summary>
           /// Per-field postings format that forwards every lookup to
           /// <see cref="Lucene46HighCompressionCodec.GetPostingsFormatForField(string)"/>
           /// on the owning codec.
           /// </summary>
           private class PerFieldPostingsFormatAnonymousInnerClassHelper : PerFieldPostingsFormat {
               private readonly Lucene46HighCompressionCodec outerInstance;

               public PerFieldPostingsFormatAnonymousInnerClassHelper(Lucene46HighCompressionCodec outerInstance) {
                   this.outerInstance = outerInstance;
               }

               [MethodImpl(MethodImplOptions.AggressiveInlining)]
               public override PostingsFormat GetPostingsFormatForField(string field) {
                   return outerInstance.GetPostingsFormatForField(field);
               }
           }

           /// <summary>
           /// Per-field docvalues format that forwards every lookup to
           /// <see cref="Lucene46HighCompressionCodec.GetDocValuesFormatForField(string)"/>
           /// on the owning codec.
           /// </summary>
           private class PerFieldDocValuesFormatAnonymousInnerClassHelper : PerFieldDocValuesFormat {
               private readonly Lucene46HighCompressionCodec outerInstance;

               public PerFieldDocValuesFormatAnonymousInnerClassHelper(Lucene46HighCompressionCodec outerInstance) {
                   this.outerInstance = outerInstance;
               }

               [MethodImpl(MethodImplOptions.AggressiveInlining)]
               public override DocValuesFormat GetDocValuesFormatForField(string field) {
                   return outerInstance.GetDocValuesFormatForField(field);
               }
           }

           /// <summary>
           /// Sole constructor. </summary>
           public Lucene46HighCompressionCodec()
               : base() {
               postingsFormat = new PerFieldPostingsFormatAnonymousInnerClassHelper(this);
               docValuesFormat = new PerFieldDocValuesFormatAnonymousInnerClassHelper(this);
           }

           public sealed override StoredFieldsFormat StoredFieldsFormat => fieldsFormat;

           public sealed override TermVectorsFormat TermVectorsFormat => vectorsFormat;

           public sealed override PostingsFormat PostingsFormat => postingsFormat;

           public sealed override FieldInfosFormat FieldInfosFormat => fieldInfosFormat;

           public sealed override SegmentInfoFormat SegmentInfoFormat => segmentInfosFormat;

           public sealed override LiveDocsFormat LiveDocsFormat => liveDocsFormat;

           public sealed override DocValuesFormat DocValuesFormat => docValuesFormat;

           public sealed override NormsFormat NormsFormat => normsFormat;

           /// <summary>
           /// Returns the postings format that should be used for writing
           /// new segments of <paramref name="field"/>.
           /// <para/>
           /// The default implementation always returns "Lucene41"
           /// </summary>
           /// <param name="field">Name of the field; ignored by the default implementation.</param>
           /// <returns>The postings format looked up by name on first use and cached afterwards.</returns>
           [MethodImpl(MethodImplOptions.AggressiveInlining)]
           public virtual PostingsFormat GetPostingsFormatForField(string field) {
               // LUCENENET specific - lazy initialize the codec to ensure we get the correct type if overridden.
               if (defaultFormat == null) {
                   defaultFormat = Lucene.Net.Codecs.PostingsFormat.ForName("Lucene41");
               }
               return defaultFormat;
           }

           /// <summary>
           /// Returns the docvalues format that should be used for writing
           /// new segments of <paramref name="field"/>.
           /// <para/>
           /// The default implementation always returns "Lucene45"
           /// </summary>
           /// <param name="field">Name of the field; ignored by the default implementation.</param>
           /// <returns>The docvalues format looked up by name on first use and cached afterwards.</returns>
           [MethodImpl(MethodImplOptions.AggressiveInlining)]
           public virtual DocValuesFormat GetDocValuesFormatForField(string field) {
               // LUCENENET specific - lazy initialize the codec to ensure we get the correct type if overridden.
               if (defaultDVFormat == null) {
                   defaultDVFormat = Lucene.Net.Codecs.DocValuesFormat.ForName("Lucene45");
               }
               return defaultDVFormat;
           }
       }
   ```
   
   Here is a unit test to demonstrate use of the High Compression Codec:
   
   ```
   /// <summary>
   /// Demonstrates indexing and searching with <c>Lucene46HighCompressionCodec</c>
   /// installed on the <c>IndexWriterConfig</c>.
   /// </summary>
   public class TestCompression {

       /// <summary>
       /// Indexes three small documents into a <c>RAMDirectory</c> using the high
       /// compression codec, searches for "everyone", and checks that exactly one
       /// document matches and that its stored primary key reads back as "002".
       /// </summary>
       [Fact]
       public void HighCompression() {
           // NOTE(review): FxTest is not defined in this snippet; presumably it performs one-time
           // test setup (e.g. making the custom codec discoverable by name) - confirm.
           FxTest.Setup();

           // Directory, analyzer, writer and searcher manager are all disposable; the using
           // statements release them in reverse order even when an assertion fails.
           using (Directory indexDir = new RAMDirectory())
           using (Analyzer standardAnalyzer = new StandardAnalyzer(LuceneVersion.LUCENE_48)) {
               IndexWriterConfig indexConfig = new IndexWriterConfig(LuceneVersion.LUCENE_48, standardAnalyzer);
               indexConfig.Codec = new Lucene46HighCompressionCodec();   // <-------- Install the High Compression codec.
               indexConfig.UseCompoundFile = true;

               // source: https://github.com/apache/lucenenet/blob/Lucene.Net_4_8_0_beta00006/src/Lucene.Net/Search/SearcherFactory.cs
               using (IndexWriter writer = new IndexWriter(indexDir, indexConfig))
               using (SearcherManager searcherManager = new SearcherManager(writer, applyAllDeletes: true, new SearchWarmer())) {
                   AddExampleDocument(writer, "001", "Unique gifts are great gifts.");
                   AddExampleDocument(writer, "002", "Everyone is gifted.");
                   AddExampleDocument(writer, "003", "Gifts are meant to be shared.");

                   writer.Commit();

                   // Refresh so the searcher acquired below sees the documents just committed.
                   searcherManager.MaybeRefreshBlocking();
                   IndexSearcher indexSearcher = searcherManager.Acquire();
                   try {
                       QueryParser parser = new QueryParser(LuceneVersion.LUCENE_48, "exampleField", standardAnalyzer);
                       Query query = parser.Parse("everyone");

                       TopDocs topDocs = indexSearcher.Search(query, int.MaxValue);

                       int numMatchingDocs = topDocs.ScoreDocs.Length;
                       Assert.Equal(1, numMatchingDocs);

                       // Reading the stored field back exercises the stored fields format of the codec.
                       Document docRead = indexSearcher.Doc(topDocs.ScoreDocs[0].Doc);
                       string primaryKey = docRead.Get("examplePrimaryKey");
                       Assert.Equal("002", primaryKey);
                   } finally {
                       // Every Acquire() must be paired with a Release().
                       searcherManager.Release(indexSearcher);
                   }
               }
           }
       }

       /// <summary>
       /// Adds one document with a stored "examplePrimaryKey" string field and a
       /// stored "exampleField" text field to <paramref name="writer"/>.
       /// </summary>
       /// <param name="writer">Writer that receives the document.</param>
       /// <param name="primaryKey">Value of the "examplePrimaryKey" field.</param>
       /// <param name="text">Value of the "exampleField" field.</param>
       private static void AddExampleDocument(IndexWriter writer, string primaryKey, string text) {
           Document doc = new Document();
           doc.Add(new StringField("examplePrimaryKey", primaryKey, Field.Store.YES));
           doc.Add(new TextField("exampleField", text, Field.Store.YES));
           writer.AddDocument(doc);
       }
   }
   ```


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to