Each shard has around 4.2 million documents, which take up around 40GB on disk. Two nodes have 3 shard replicas each and the third has 2 shard replicas.
The text of the exception is: java.lang.OutOfMemoryError: Java heap space. The heap dump is a full 24GB, indicating the full heap space was being used. Here is the solrconfig as output by the config request handler: { "responseHeader":{ "status":0, "QTime":0}, "config":{ "znodeVersion":0, "luceneMatchVersion":"org.apache.lucene.util.Version:6.5.1", "updateHandler":{ "indexWriter":{"closeWaitsForMerges":true}, "commitWithin":{"softCommit":true}, "autoCommit":{ "maxDocs":50000, "maxTime":300000, "openSearcher":false}, "autoSoftCommit":{ "maxDocs":-1, "maxTime":30000}}, "query":{ "useFilterForSortedQuery":false, "queryResultWindowSize":1, "queryResultMaxDocsCached":2147483647, "enableLazyFieldLoading":false, "maxBooleanClauses":1024, "":{ "size":"10000", "showItems":"-1", "initialSize":"10", "name":"fieldValueCache"}}, "jmx":{ "agentId":null, "serviceUrl":null, "rootName":null}, "requestHandler":{ "/select":{ "name":"/select", "defaults":{ "rows":10, "echoParams":"explicit"}, "class":"solr.SearchHandler"}, "/update":{ "useParams":"_UPDATE", "class":"solr.UpdateRequestHandler", "name":"/update"}, "/update/json":{ "useParams":"_UPDATE_JSON", "class":"solr.UpdateRequestHandler", "invariants":{"update.contentType":"application/json"}, "name":"/update/json"}, "/update/csv":{ "useParams":"_UPDATE_CSV", "class":"solr.UpdateRequestHandler", "invariants":{"update.contentType":"application/csv"}, "name":"/update/csv"}, "/update/json/docs":{ "useParams":"_UPDATE_JSON_DOCS", "class":"solr.UpdateRequestHandler", "invariants":{ "update.contentType":"application/json", "json.command":"false"}, "name":"/update/json/docs"}, "update":{ "class":"solr.UpdateRequestHandlerApi", "useParams":"_UPDATE_JSON_DOCS", "name":"update"}, "/config":{ "useParams":"_CONFIG", "class":"solr.SolrConfigHandler", "name":"/config"}, "/schema":{ "class":"solr.SchemaHandler", "useParams":"_SCHEMA", "name":"/schema"}, "/replication":{ "class":"solr.ReplicationHandler", "useParams":"_REPLICATION", 
"name":"/replication"}, "/get":{ "class":"solr.RealTimeGetHandler", "useParams":"_GET", "defaults":{ "omitHeader":true, "wt":"json", "indent":true}, "name":"/get"}, "/admin/ping":{ "class":"solr.PingRequestHandler", "useParams":"_ADMIN_PING", "invariants":{ "echoParams":"all", "q":"{!lucene}*:*"}, "name":"/admin/ping"}, "/admin/segments":{ "class":"solr.SegmentsInfoRequestHandler", "useParams":"_ADMIN_SEGMENTS", "name":"/admin/segments"}, "/admin/luke":{ "class":"solr.LukeRequestHandler", "useParams":"_ADMIN_LUKE", "name":"/admin/luke"}, "/admin/system":{ "class":"solr.SystemInfoHandler", "useParams":"_ADMIN_SYSTEM", "name":"/admin/system"}, "/admin/mbeans":{ "class":"solr.SolrInfoMBeanHandler", "useParams":"_ADMIN_MBEANS", "name":"/admin/mbeans"}, "/admin/plugins":{ "class":"solr.PluginInfoHandler", "name":"/admin/plugins"}, "/admin/threads":{ "class":"solr.ThreadDumpHandler", "useParams":"_ADMIN_THREADS", "name":"/admin/threads"}, "/admin/properties":{ "class":"solr.PropertiesRequestHandler", "useParams":"_ADMIN_PROPERTIES", "name":"/admin/properties"}, "/admin/logging":{ "class":"solr.LoggingHandler", "useParams":"_ADMIN_LOGGING", "name":"/admin/logging"}, "/admin/file":{ "class":"solr.ShowFileRequestHandler", "useParams":"_ADMIN_FILE", "name":"/admin/file"}, "/export":{ "class":"solr.ExportHandler", "useParams":"_EXPORT", "components":["query"], "defaults":{"wt":"json"}, "invariants":{ "rq":"{!xport}", "distrib":false}, "name":"/export"}, "/graph":{ "class":"solr.GraphHandler", "useParams":"_ADMIN_GRAPH", "invariants":{ "wt":"graphml", "distrib":false}, "name":"/graph"}, "/stream":{ "class":"solr.StreamHandler", "useParams":"_STREAM", "defaults":{"wt":"json"}, "invariants":{"distrib":false}, "name":"/stream"}, "/sql":{ "class":"solr.SQLHandler", "useParams":"_SQL", "defaults":{"wt":"json"}, "invariants":{"distrib":false}, "name":"/sql"}, "/terms":{ "class":"solr.SearchHandler", "useParams":"_TERMS", "components":["terms"], "name":"/terms"}, 
"/analysis/document":{ "class":"solr.DocumentAnalysisRequestHandler", "startup":"lazy", "useParams":"_ANALYSIS_DOCUMENT", "name":"/analysis/document"}, "/analysis/field":{ "class":"solr.FieldAnalysisRequestHandler", "startup":"lazy", "useParams":"_ANALYSIS_FIELD", "name":"/analysis/field"}, "/debug/dump":{ "class":"solr.DumpRequestHandler", "useParams":"_DEBUG_DUMP", "defaults":{ "echoParams":"explicit", "echoHandler":true}, "name":"/debug/dump"}}, "updateRequestProcessorChain":[{ "default":"true", "name":"customupdatechain", "":[{"class":"org.apache.solr.update.processor.CustomDedupProcessorFactory"}, {"class":"solr.LogUpdateProcessorFactory"}, {"class":"solr.RunUpdateProcessorFactory"}]}], "updateHandlerupdateLog":{ "dir":"", "numVersionBuckets":65536}, "requestDispatcher":{ "handleSelect":true, "httpCaching":{ "never304":false, "etagSeed":"Solr", "lastModFrom":"opentime", "cacheControl":null}, "requestParsers":{ "multipartUploadLimitKB":2048, "formUploadLimitKB":2048, "addHttpRequestToContext":false}}, "indexConfig":{ "useCompoundFile":false, "maxBufferedDocs":-1, "maxMergeDocs":-1, "mergeFactor":-1, "ramBufferSizeMB":100.0, "writeLockTimeout":-1, "lockType":"native", "infoStreamEnabled":false, "metrics":{}}, "peerSync":{"useRangeVersions":true}}} On Mon, Oct 16, 2017 at 3:38 PM Shawn Heisey <apa...@elyograg.org> wrote: > On 10/16/2017 3:19 PM, Randy Fradin wrote: > > We are seeing a lot of full GC events and eventual OOM errors in Solr > > during indexing. This is Solr 6.5.1 running in cloud mode with a 24G > heap. > > At these times indexing is the only activity taking place. The collection > > has 4 shards and 2 replicas across 3 nodes. Each document is ~10KB (a few > > hundred fields each), and indexing is using the normal update handler, 1 > > document per request, up to 240 request at a time. > > > > The heap dump taken automatically on OOM shows 18.3GB of heap taken by 3 > > instances of DocumentsWriter. 
Within those instances, all of the heap is > > retained by the blockedFlushes LinkedList inside the flushControl object. > > Each node in the LinkedList appears to be retaining around 55MB. > > > > Clearly something to do with flushing is at play here but I'm at a loss > > what tuning parameters I should be looking at. I would expect things to > > start blocking if I fall too far behind on flushing but apparently that's > > not happening. The ramBufferSizeMB is set to the default 100. My heap size > > is already absurdly more than I thought we would need for this volume. > > One of the first things we need to find out is about your index size. > > In each of your shards, how many documents are there? How much disk > space does one shard replica take up? How many shard replica cores does > each node have on it in total? > > I would also like to get a look at your full solrconfig.xml file. The > schema may be helpful at a later date, along with an example of a > document that you're indexing. With ramBufferSizeMB at the default, > having a ton of memory used up by a class used for indexing seems very odd. > > Do you have the text of the OOM exception? Is it saying out of heap > space, or some other problem? > > Thanks, > Shawn > >