The problem appears to be caused by using a WhitespaceAnalyzer inside of a 
PerFieldAnalyzerWrapper. Try running this program and changing the 
analyzer sub-type by toggling the #defines:


#include <cstdio>
#include <CLucene.h>

// Directory path handed to the IndexWriter in main().
#define INDEX_PATH "index"
// When defined, main() indexes through a PerFieldAnalyzerWrapper; comment it
// out to index with a plain WhitespaceAnalyzer instead.
#define USE_PER_FIELD_ANALYZER
// Analyzer type registered for each named field of the wrapper. Exactly one
// of the two definitions below should be active.
#define SUB_ANALYZER_TYPE lucene::analysis::WhitespaceAnalyzer
//#define SUB_ANALYZER_TYPE lucene::analysis::standard::StandardAnalyzer

/**
 * Reproduction program: indexes the same five-field document one million
 * times into INDEX_PATH so that a per-document leak in the analyzer chain
 * becomes visible under a memory checker.
 *
 * The analyzer is chosen at compile time: with USE_PER_FIELD_ANALYZER
 * defined, a PerFieldAnalyzerWrapper (default StandardAnalyzer, plus one
 * SUB_ANALYZER_TYPE per registered field) is used; otherwise a plain
 * WhitespaceAnalyzer.
 *
 * @return 0 on success, 1 if a CLuceneError was raised while indexing.
 */
int main(int argc, char *argv[]) {
   (void)argc;  // command-line arguments are not used
   (void)argv;
   try {
#ifdef USE_PER_FIELD_ANALYZER
     lucene::analysis::PerFieldAnalyzerWrapper analyzer(
       _CLNEW lucene::analysis::standard::StandardAnalyzer());
     // Only "First", "Second" and "Fifth" are actually present in the
     // documents below; "Third" and "Fourth" are registered but never used.
     analyzer.addAnalyzer(_T("First"), _CLNEW SUB_ANALYZER_TYPE());
     analyzer.addAnalyzer(_T("Second"), _CLNEW SUB_ANALYZER_TYPE());
     analyzer.addAnalyzer(_T("Third"), _CLNEW SUB_ANALYZER_TYPE());
     analyzer.addAnalyzer(_T("Fourth"), _CLNEW SUB_ANALYZER_TYPE());
     analyzer.addAnalyzer(_T("Fifth"), _CLNEW SUB_ANALYZER_TYPE());
#else
     lucene::analysis::WhitespaceAnalyzer analyzer;
#endif
     // Third argument 'true' is the create flag passed to IndexWriter.
     lucene::index::IndexWriter writer(INDEX_PATH, &analyzer, true);
     lucene::document::Document doc;
     const int flags = lucene::document::Field::STORE_YES
                       | lucene::document::Field::INDEX_TOKENIZED;
     for (int i = 0; i < 1000000; i++) {
       // Reuse one Document object; clear() drops the previous fields.
       doc.clear();
       doc.add(*(_CLNEW lucene::document::Field(
         _T("First"), _T("Blah blah blah"), flags)));
       doc.add(*(_CLNEW lucene::document::Field(
         _T("Second"), _T("Blah blah-- blah"), flags)));
       doc.add(*(_CLNEW lucene::document::Field(
         _T("Fifth"), _T("Blah blah__ blah"), flags)));
       // NOTE(review): "Eigth" and "Ninth" have no registered analyzer, so
       // they presumably fall back to the wrapper's default analyzer.
       doc.add(*(_CLNEW lucene::document::Field(
         _T("Eigth"), _T("Blah blah blah++"), flags)));
       doc.add(*(_CLNEW lucene::document::Field(
         _T("Ninth"), _T("Blah123 blah blah"), flags)));
       writer.addDocument(&doc);
     }
     writer.close();
   } catch (CLuceneError &err) {
     // Catch by reference to avoid copying the exception object. The
     // reference is non-const because what() is called on it.
     fprintf(stderr, "CLuceneError: %s\n", err.what());
     return 1;  // report the failure to the caller instead of exiting 0
   }
   return 0;
}


Running valgrind gives this:
> ==5003== Memcheck, a memory error detector
> ==5003== Copyright (C) 2002-2009, and GNU GPL'd, by Julian Seward et al.
> ==5003== Using Valgrind-3.5.0-Debian and LibVEX; rerun with -h for copyright 
> info
> ==5003== Command: ./testcl
> ==5003== Parent PID: 25703
> ==5003== 
> ==5003== 
> ==5003== HEAP SUMMARY:
> ==5003==     in use at exit: 31,840,378 bytes in 50,010 blocks
> ==5003==   total heap usage: 231,219 allocs, 181,209 frees, 39,843,697 bytes 
> allocated
> ==5003== 
> ==5003== 254 (32 direct, 222 indirect) bytes in 1 blocks are definitely lost 
> in loss record 10 of 13
> ==5003==    at 0x4025390: operator new(unsigned int) (vg_replace_malloc.c:214)
> ==5003==    by 0x41D8C6D: lucene::store::FSDirectory::getDirectory(char 
> const*, bool, lucene::store::LockFactory*) (FSDirectory.cpp:485)
> ==5003==    by 0x42375F8: lucene::index::IndexWriter::IndexWriter(char 
> const*, lucene::analysis::Analyzer*, bool) (IndexWriter.cpp:152)
> ==5003==    by 0x80490D9: main (testcl.cc:23)
> ==5003== 
> ==5003== 14,672 bytes in 14 blocks are possibly lost in loss record 11 of 13
> ==5003==    at 0x4025390: operator new(unsigned int) (vg_replace_malloc.c:214)
> ==5003==    by 0x41CCDC2: 
> lucene::analysis::WhitespaceAnalyzer::tokenStream(wchar_t const*, 
> lucene::util::Reader*) (Analyzers.cpp:113)
> ==5003==    by 0x41CC309: 
> lucene::analysis::PerFieldAnalyzerWrapper::tokenStream(wchar_t const*, 
> lucene::util::Reader*) (Analyzers.cpp:298)
> ==5003==    by 0x41CFFCE: 
> lucene::analysis::Analyzer::reusableTokenStream(wchar_t const*, 
> lucene::util::Reader*) (AnalysisHeader.cpp:36)
> ==5003==    by 0x4206228: 
> lucene::index::DocumentsWriter::ThreadState::FieldData::invertField(lucene::document::Field*,
>  lucene::analysis::Analyzer*, int) (DocumentsWriterThreadState.cpp:889)
> ==5003==    by 0x42082A0: 
> lucene::index::DocumentsWriter::ThreadState::FieldData::processField(lucene::analysis::Analyzer*)
>  (DocumentsWriterThreadState.cpp:795)
> ==5003==    by 0x42086B6: 
> lucene::index::DocumentsWriter::ThreadState::processDocument(lucene::analysis::Analyzer*)
>  (DocumentsWriterThreadState.cpp:554)
> ==5003==    by 0x41FE293: 
> lucene::index::DocumentsWriter::updateDocument(lucene::document::Document*, 
> lucene::analysis::Analyzer*, lucene::index::Term*) (DocumentsWriter.cpp:934)
> ==5003==    by 0x41FE406: 
> lucene::index::DocumentsWriter::addDocument(lucene::document::Document*, 
> lucene::analysis::Analyzer*) (DocumentsWriter.cpp:918)
> ==5003==    by 0x423BE41: 
> lucene::index::IndexWriter::addDocument(lucene::document::Document*, 
> lucene::analysis::Analyzer*) (IndexWriter.cpp:668)
> ==5003==    by 0x8049331: main (testcl.cc:39)
> ==5003== 
> ==5003== 400,000 bytes in 20,000 blocks are definitely lost in loss record 12 
> of 13
> ==5003==    at 0x4025390: operator new(unsigned int) (vg_replace_malloc.c:214)
> ==5003==    by 0x41C9AA0: 
> lucene::analysis::standard::StandardAnalyzer::tokenStream(wchar_t const*, 
> lucene::util::Reader*) (StandardAnalyzer.cpp:64)
> ==5003==    by 0x41CC309: 
> lucene::analysis::PerFieldAnalyzerWrapper::tokenStream(wchar_t const*, 
> lucene::util::Reader*) (Analyzers.cpp:298)
> ==5003==    by 0x41CFFCE: 
> lucene::analysis::Analyzer::reusableTokenStream(wchar_t const*, 
> lucene::util::Reader*) (AnalysisHeader.cpp:36)
> ==5003==    by 0x4206228: 
> lucene::index::DocumentsWriter::ThreadState::FieldData::invertField(lucene::document::Field*,
>  lucene::analysis::Analyzer*, int) (DocumentsWriterThreadState.cpp:889)
> ==5003==    by 0x42082A0: 
> lucene::index::DocumentsWriter::ThreadState::FieldData::processField(lucene::analysis::Analyzer*)
>  (DocumentsWriterThreadState.cpp:795)
> ==5003==    by 0x42086B6: 
> lucene::index::DocumentsWriter::ThreadState::processDocument(lucene::analysis::Analyzer*)
>  (DocumentsWriterThreadState.cpp:554)
> ==5003==    by 0x41FE293: 
> lucene::index::DocumentsWriter::updateDocument(lucene::document::Document*, 
> lucene::analysis::Analyzer*, lucene::index::Term*) (DocumentsWriter.cpp:934)
> ==5003==    by 0x41FE406: 
> lucene::index::DocumentsWriter::addDocument(lucene::document::Document*, 
> lucene::analysis::Analyzer*) (DocumentsWriter.cpp:918)
> ==5003==    by 0x423BE41: 
> lucene::index::IndexWriter::addDocument(lucene::document::Document*, 
> lucene::analysis::Analyzer*) (IndexWriter.cpp:668)
> ==5003==    by 0x8049331: main (testcl.cc:39)
> ==5003== 
> ==5003== 31,425,328 bytes in 29,986 blocks are definitely lost in loss record 
> 13 of 13
> ==5003==    at 0x4025390: operator new(unsigned int) (vg_replace_malloc.c:214)
> ==5003==    by 0x41CCDC2: 
> lucene::analysis::WhitespaceAnalyzer::tokenStream(wchar_t const*, 
> lucene::util::Reader*) (Analyzers.cpp:113)
> ==5003==    by 0x41CC309: 
> lucene::analysis::PerFieldAnalyzerWrapper::tokenStream(wchar_t const*, 
> lucene::util::Reader*) (Analyzers.cpp:298)
> ==5003==    by 0x41CFFCE: 
> lucene::analysis::Analyzer::reusableTokenStream(wchar_t const*, 
> lucene::util::Reader*) (AnalysisHeader.cpp:36)
> ==5003==    by 0x4206228: 
> lucene::index::DocumentsWriter::ThreadState::FieldData::invertField(lucene::document::Field*,
>  lucene::analysis::Analyzer*, int) (DocumentsWriterThreadState.cpp:889)
> ==5003==    by 0x42082A0: 
> lucene::index::DocumentsWriter::ThreadState::FieldData::processField(lucene::analysis::Analyzer*)
>  (DocumentsWriterThreadState.cpp:795)
> ==5003==    by 0x42086B6: 
> lucene::index::DocumentsWriter::ThreadState::processDocument(lucene::analysis::Analyzer*)
>  (DocumentsWriterThreadState.cpp:554)
> ==5003==    by 0x41FE293: 
> lucene::index::DocumentsWriter::updateDocument(lucene::document::Document*, 
> lucene::analysis::Analyzer*, lucene::index::Term*) (DocumentsWriter.cpp:934)
> ==5003==    by 0x41FE406: 
> lucene::index::DocumentsWriter::addDocument(lucene::document::Document*, 
> lucene::analysis::Analyzer*) (DocumentsWriter.cpp:918)
> ==5003==    by 0x423BE41: 
> lucene::index::IndexWriter::addDocument(lucene::document::Document*, 
> lucene::analysis::Analyzer*) (IndexWriter.cpp:668)
> ==5003==    by 0x8049331: main (testcl.cc:39)
> ==5003== 
> ==5003== LEAK SUMMARY:
> ==5003==    definitely lost: 31,825,360 bytes in 49,987 blocks
> ==5003==    indirectly lost: 222 bytes in 5 blocks
> ==5003==      possibly lost: 14,672 bytes in 14 blocks
> ==5003==    still reachable: 124 bytes in 4 blocks
> ==5003==         suppressed: 0 bytes in 0 blocks
> ==5003== Reachable blocks (those to which a pointer was found) are not shown.
> ==5003== To see them, rerun with: --leak-check=full --show-reachable=yes
> ==5003== 
> ==5003== For counts of detected and suppressed errors, rerun with: -v
> ==5003== ERROR SUMMARY: 4 errors from 4 contexts (suppressed: 27 from 8)

Thanks for looking into this!


Itamar Syn-Hershko wrote:
> Hi,
> 
> I ran TestAnalyzers.cpp (specifically testPerFieldAnalzyerWrapper() ) from
> our test suite, and detected no leaks. I also tried replacing
> 
>       analyzer.addAnalyzer(_T("special"), _CLNEW SimpleAnalyzer());
> 
> With
> 
>       analyzer.addAnalyzer(_T("special"), _CLNEW StandardAnalyzer());
> 
> And still found nothing.
> 
> I used our 2_3_2 master branch from the git repository (see
> http://clucene.sourceforge.net/download.shtml).
> 
> If you're using this branch, please let me know the details of the leaks
> you're detecting.
> 
> Itamar. 
> 
> -----Original Message-----
> From: Michael Levin [mailto:mele...@stanford.edu] 
> Sent: Monday, November 02, 2009 8:47 PM
> To: clucene-developers@lists.sourceforge.net
> Subject: [CLucene-dev] PerFieldAnalyzerWrapper memory leak
> 
> Hi,
> 
> I am working on a program to index about 25gb of data and when I run CLucene
> with a PerFieldAnalyzerWrapper it leaks memory and inevitably crashes
> because it runs out of memory.
> 
> Here is my code:
> 
> lucene::analysis::PerFieldAnalyzerWrapper
>    analyzer(new lucene::analysis::standard::StandardAnalyzer());
> analyzer.addAnalyzer(_T("Authors"),
>    new lucene::analysis::WhitespaceAnalyzer());
> analyzer.addAnalyzer(_T("ReprintAuthor"),
>    new lucene::analysis::WhitespaceAnalyzer());
> analyzer.addAnalyzer(_T("Name"),
>    new lucene::analysis::WhitespaceAnalyzer());
> analyzer.addAnalyzer(_T("Email"),
>    new lucene::analysis::WhitespaceAnalyzer());
> 
> If I replace that snippet with a plain WhitespaceAnalyzer there is no memory
> leak:
> 
> lucene::analysis::WhitespaceAnalyzer analyzer;
> 
> Am I using the PerFieldAnalyzerWrapper class wrong or is this a bug in
> CLucene?
> 
> Thanks!
> 
> --
> Michael Levin <mele...@stanford.edu>
> 
> ----------------------------------------------------------------------------
> --
> Come build with us! The BlackBerry(R) Developer Conference in SF, CA is the
> only developer event you need to attend this year. Jumpstart your developing
> skills, take BlackBerry mobile applications to market and stay ahead of the
> curve. Join us from November 9 - 12, 2009. Register now!
> http://p.sf.net/sfu/devconference
> _______________________________________________
> CLucene-developers mailing list
> CLucene-developers@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/clucene-developers
> 
> 
> 
> ------------------------------------------------------------------------------
> Come build with us! The BlackBerry(R) Developer Conference in SF, CA
> is the only developer event you need to attend this year. Jumpstart your
> developing skills, take BlackBerry mobile applications to market and stay 
> ahead of the curve. Join us from November 9 - 12, 2009. Register now!
> http://p.sf.net/sfu/devconference
> _______________________________________________
> CLucene-developers mailing list
> CLucene-developers@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/clucene-developers

-- 
Michael Levin <mele...@stanford.edu>

------------------------------------------------------------------------------
Let Crystal Reports handle the reporting - Free Crystal Reports 2008 30-Day 
trial. Simplify your report design, integration and deployment - and focus on 
what you do best, core application coding. Discover what's new with
Crystal Reports now.  http://p.sf.net/sfu/bobj-july
_______________________________________________
CLucene-developers mailing list
CLucene-developers@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/clucene-developers

Reply via email to