Modified: stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/solrconfig.xml URL: http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/solrconfig.xml?rev=1469266&r1=1469265&r2=1469266&view=diff ============================================================================== --- stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/solrconfig.xml (original) +++ stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/solrconfig.xml Thu Apr 18 10:17:06 2013 @@ -29,27 +29,15 @@ have your own custom plugins. --> - <!-- Set this to 'false' if you want solr to continue working after - it has encountered an severe configuration error. In a - production environment, you may want solr to keep working even - if one handler is mis-configured. - - You may also set this to false using by setting the system - property: - - -Dsolr.abortOnConfigurationError=false - --> - <abortOnConfigurationError>${solr.abortOnConfigurationError:true}</abortOnConfigurationError> - <!-- Controls what version of Lucene various components of Solr adhere to. Generally, you want to use the latest version to get all bug fixes and improvements. It is highly recommended that you fully re-index after changing this setting as it can affect both how text is indexed and queried. - --> - <luceneMatchVersion>LUCENE_32</luceneMatchVersion> + --> + <luceneMatchVersion>LUCENE_42</luceneMatchVersion> - <!-- lib directives can be used to instruct Solr to load an Jars + <!-- <lib/> directives can be used to instruct Solr to load an Jars identified and use them to resolve any "plugins" specified in your solrconfig.xml or schema.xml (ie: Analyzers, Request Handlers, etc...). @@ -57,36 +45,18 @@ All directories and paths are resolved relative to the instanceDir. 
+ Please note that <lib/> directives are processed in the order + that they appear in your solrconfig.xml file, and are "stacked" + on top of each other when building a ClassLoader - so if you have + plugin jars with dependencies on other jars, the "lower level" + dependency jars should be loaded first. + If a "./lib" directory exists in your instanceDir, all files found in it are included as if you had used the following syntax... <lib dir="./lib" /> --> - <!-- A dir option by itself adds any files found in the directory to - the classpath, this is useful for including all jars in a - directory. - --> - <lib dir="../../contrib/extraction/lib" /> - <!-- When a regex is specified in addition to a directory, only the - files in that directory which completely match the regex - (anchored on both ends) will be included. - --> - <lib dir="../../dist/" regex="apache-solr-cell-\d.*\.jar" /> - <lib dir="../../dist/" regex="apache-solr-clustering-\d.*\.jar" /> - <lib dir="../../dist/" regex="apache-solr-dataimporthandler-\d.*\.jar" /> - - <!-- If a dir option (with or without a regex) is used and nothing - is found that matches, it will be ignored - --> - <lib dir="../../contrib/clustering/lib/" /> - <lib dir="/total/crap/dir/ignored" /> - <!-- an exact path can be used to specify a specific file. This - will cause a serious error to be logged if it can't be loaded. - --> - <!-- - <lib path="../a-jar-that-does-not-exist.jar" /> - --> <!-- Data Directory @@ -100,59 +70,83 @@ <!-- The DirectoryFactory to use for indexes. - solr.StandardDirectoryFactory, the default, is filesystem - based. solr.RAMDirectoryFactory is memory based, not - persistent, and doesn't work with replication. - --> - <directoryFactory name="DirectoryFactory" - class="${solr.directoryFactory:solr.StandardDirectoryFactory}"/> - - - <!-- Index Defaults + solr.StandardDirectoryFactory is filesystem + based and tries to pick the best implementation for the current + JVM and platform. 
solr.NRTCachingDirectoryFactory, the default, + wraps solr.StandardDirectoryFactory and caches small files in memory + for better NRT performance. - Values here affect all index writers and act as a default - unless overridden. + One can force a particular implementation via solr.MMapDirectoryFactory, + solr.NIOFSDirectoryFactory, or solr.SimpleFSDirectoryFactory. - WARNING: See also the <mainIndex> section below for parameters - that overfor Solr's main Lucene index. + solr.RAMDirectoryFactory is memory based, not + persistent, and doesn't work with replication. --> - <indexDefaults> - - <useCompoundFile>false</useCompoundFile> + <directoryFactory name="DirectoryFactory" + class="${solr.directoryFactory:solr.NRTCachingDirectoryFactory}"/> - <mergeFactor>10</mergeFactor> - <!-- Sets the amount of RAM that may be used by Lucene indexing - for buffering added documents and deletions before they are - flushed to the Directory. --> - <ramBufferSizeMB>32</ramBufferSizeMB> - <!-- If both ramBufferSizeMB and maxBufferedDocs is set, then - Lucene will flush based on whichever limit is hit first. - --> + <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Index Config - These settings control low-level behavior of indexing + Most example settings here show the default value, but are commented + out, to more easily see where customizations have been made. + + Note: This replaces <indexDefaults> and <mainIndex> from older versions + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ --> + <indexConfig> + <!-- maxFieldLength was removed in 4.0. To get similar behavior, include a + LimitTokenCountFilterFactory in your fieldType definition. E.g. + <filter class="solr.LimitTokenCountFilterFactory" maxTokenCount="10000"/> + --> + <!-- Maximum time to wait for a write lock (ms) for an IndexWriter. 
Default: 1000 --> + <writeLockTimeout>5000</writeLockTimeout> + + <!-- The maximum number of simultaneous threads that may be + indexing documents at once in IndexWriter; if more than this + many threads arrive they will wait for others to finish. + Default in Solr/Lucene is 8. --> + <!-- <maxIndexingThreads>8</maxIndexingThreads> --> + + <!-- Expert: Enabling compound file will use less files for the index, + using fewer file descriptors on the expense of performance decrease. + Default in Lucene is "true". Default in Solr is "false" (since 3.6) --> + <!-- <useCompoundFile>false</useCompoundFile> --> + + <!-- ramBufferSizeMB sets the amount of RAM that may be used by Lucene + indexing for buffering added documents and deletions before they are + flushed to the Directory. + maxBufferedDocs sets a limit on the number of documents buffered + before flushing. + If both ramBufferSizeMB and maxBufferedDocs is set, then + Lucene will flush based on whichever limit is hit first. --> + <!-- <ramBufferSizeMB>100</ramBufferSizeMB> --> <!-- <maxBufferedDocs>1000</maxBufferedDocs> --> - <maxFieldLength>10000</maxFieldLength> - <writeLockTimeout>1000</writeLockTimeout> - <commitLockTimeout>10000</commitLockTimeout> - <!-- Expert: Merge Policy - - The Merge Policy in Lucene controls how merging is handled by - Lucene. The default in 2.3 is the LogByteSizeMergePolicy, - previous versions used LogDocMergePolicy. - - LogByteSizeMergePolicy chooses segments to merge based on - their size. The Lucene 2.2 default, LogDocMergePolicy chose - when to merge based on number of documents - - Other implementations of MergePolicy must have a no-argument - constructor + The Merge Policy in Lucene controls how merging of segments is done. + The default since Solr/Lucene 3.3 is TieredMergePolicy. + The default since Lucene 2.3 was the LogByteSizeMergePolicy, + Even older versions of Lucene used LogDocMergePolicy. 
--> <!-- - <mergePolicy class="org.apache.lucene.index.LogByteSizeMergePolicy"/> - --> + <mergePolicy class="org.apache.lucene.index.TieredMergePolicy"> + <int name="maxMergeAtOnce">10</int> + <int name="segmentsPerTier">10</int> + </mergePolicy> + --> + + <!-- Merge Factor + The merge factor controls how many segments will get merged at a time. + For TieredMergePolicy, mergeFactor is a convenience parameter which + will set both MaxMergeAtOnce and SegmentsPerTier at once. + For LogByteSizeMergePolicy, mergeFactor decides how many new segments + will be allowed before they are merged into one. + Default is 10 for both merge policies. + --> + <!-- + <mergeFactor>10</mergeFactor> + --> <!-- Expert: Merge Scheduler - The Merge Scheduler in Lucene controls how merges are performed. The ConcurrentMergeScheduler (Lucene 2.3 default) can perform merges in the background using separate threads. @@ -161,7 +155,7 @@ <!-- <mergeScheduler class="org.apache.lucene.index.ConcurrentMergeScheduler"/> --> - + <!-- LockFactory This option specifies which Lucene LockFactory implementation @@ -175,66 +169,60 @@ JVM are attempting to share a single index. simple = SimpleFSLockFactory - uses a plain file for locking - (For backwards compatibility with Solr 1.2, 'simple' is the - default if not specified.) + Defaults: 'native' is default for Solr3.6 and later, otherwise + 'simple' is the default More details on the nuances of each LockFactory... http://wiki.apache.org/lucene-java/AvailableLockFactories --> - <lockType>native</lockType> - - <!-- Expert: Controls how often Lucene loads terms into memory - Default is 128 and is likely good for most everyone. - --> - <!-- <termIndexInterval>256</termIndexInterval> --> - </indexDefaults> - - <!-- Main Index - - Values here override the values in the <indexDefaults> section - for the main on disk index. 
- --> - <mainIndex> - - <useCompoundFile>false</useCompoundFile> - <ramBufferSizeMB>32</ramBufferSizeMB> - <mergeFactor>10</mergeFactor> + <!-- <lockType>native</lockType> --> <!-- Unlock On Startup If true, unlock any held write or commit locks on startup. This defeats the locking mechanism that allows multiple processes to safely access a lucene index, and should be used - with care. + with care. Default is "false". This is not needed if lock type is 'none' or 'single' --> + <!-- <unlockOnStartup>false</unlockOnStartup> + --> + <!-- Expert: Controls how often Lucene loads terms into memory + Default is 128 and is likely good for most everyone. + --> + <!-- <termIndexInterval>128</termIndexInterval> --> + <!-- If true, IndexReaders will be reopened (often more efficient) - instead of closed and then opened. + instead of closed and then opened. Default: true --> + <!-- <reopenReaders>true</reopenReaders> + --> <!-- Commit Deletion Policy - Custom deletion policies can specified here. The class must + Custom deletion policies can be specified here. The class must implement org.apache.lucene.index.IndexDeletionPolicy. - http://lucene.apache.org/java/2_9_1/api/all/org/apache/lucene/index/IndexDeletionPolicy.html + http://lucene.apache.org/java/3_5_0/api/core/org/apache/lucene/index/IndexDeletionPolicy.html - The standard Solr IndexDeletionPolicy implementation supports + The default Solr IndexDeletionPolicy implementation supports deleting index commit points on number of commits, age of commit point and optimized status. The latest commit point should always be preserved regardless of the criteria. 
--> + <!-- <deletionPolicy class="solr.SolrDeletionPolicy"> + --> <!-- The number of commit points to be kept --> - <str name="maxCommitsToKeep">1</str> + <!-- <str name="maxCommitsToKeep">1</str> --> <!-- The number of optimized commit points to be kept --> - <str name="maxOptimizedCommitsToKeep">0</str> + <!-- <str name="maxOptimizedCommitsToKeep">0</str> --> <!-- Delete all commit points once they have reached the given age. Supports DateMathParser syntax e.g. @@ -243,7 +231,9 @@ <str name="maxCommitAge">30MINUTES</str> <str name="maxCommitAge">1DAY</str> --> + <!-- </deletionPolicy> + --> <!-- Lucene Infostream @@ -251,11 +241,11 @@ of detailed information when indexing. Setting The value to true will instruct the underlying Lucene - IndexWriter to write it's debugging info the specified file + IndexWriter to write its debugging info to the specified file --> - <infoStream file="INFOSTREAM.txt">false</infoStream> + <!-- <infoStream file="INFOSTREAM.txt">false</infoStream> --> + </indexConfig> - </mainIndex> <!-- JMX @@ -278,31 +268,57 @@ <!-- The default high-performance update handler --> <updateHandler class="solr.DirectUpdateHandler2"> + <!-- Enables a transaction log, used for real-time get, durability, and + solr cloud replica recovery. The log can grow as big as + uncommitted changes to the index, so use of a hard autoCommit + is recommended (see below). + "dir" - the target directory for transaction logs, defaults to the + solr data directory. --> + <!-- updateLog> + <str name="dir">${solr.ulog.dir:}</str> + </updateLog --> + <!-- AutoCommit - Perform a <commit/> automatically under certain conditions. + Perform a hard commit automatically under certain conditions. Instead of enabling autoCommit, consider using "commitWithin" when adding documents. http://wiki.apache.org/solr/UpdateXmlMessages maxDocs - Maximum number of documents to add since the last - commit before automaticly triggering a new commit. 
+ commit before automatically triggering a new commit. - maxTime - Maximum amount of time that is allowed to pass + maxTime - Maximum amount of time in ms that is allowed to pass since a document was added before automaticly triggering a new commit. + openSearcher - if false, the commit causes recent index changes + to be flushed to stable storage, but does not cause a new + searcher to be opened to make those changes visible. + + If the updateLog is enabled, then it's highly recommended to + have some sort of hard autoCommit to limit the log size. + --> + <!-- The Stanbol Entityhub SolrYard uses commitWithin + <autoCommit> + <maxTime>15000</maxTime> + <openSearcher>false</openSearcher> + </autoCommit> + --> + <!-- softAutoCommit is like autoCommit except it causes a + 'soft' commit which only ensures that changes are visible + but does not ensure that data is synced to disk. This is + faster and more near-realtime friendly than a hard commit. --> - <!-- - <autoCommit> - <maxDocs>10000</maxDocs> + <!-- + <autoSoftCommit> <maxTime>1000</maxTime> - </autoCommit> + </autoSoftCommit> --> <!-- Update Related Event Listeners - Various IndexWriter realted events can trigger Listeners to + Various IndexWriter related events can trigger Listeners to take actions. postCommit - fired after every commit or optimize command @@ -331,6 +347,7 @@ <arr name="env"> <str>MYVAR=val1</str> </arr> </listener> --> + </updateHandler> <!-- IndexReaderFactory @@ -370,7 +387,9 @@ </indexReaderFactory > --> - + <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Query section - these settings control query time things like caches + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ --> <query> <!-- Max Boolean Clauses @@ -419,14 +438,20 @@ autowarmCount - the number of entries to prepopulate from and old cache. 
--> - <filterCache class="solr.FastLRUCache" size="512" initialSize="512" autowarmCount="0"/> + <filterCache class="solr.FastLRUCache" + size="2048" + initialSize="1024" + autowarmCount="512"/> <!-- Query Result Cache Caches results of searches - ordered lists of document ids (DocList) based on a query, a sort, and the range of documents requested. --> - <queryResultCache class="solr.LRUCache" size="512" initialSize="512" autowarmCount="0"/> + <queryResultCache class="solr.LRUCache" + size="2048" + initialSize="1024" + autowarmCount="512"/> <!-- Document Cache @@ -434,7 +459,10 @@ document). Since Lucene internal document ids are transient, this cache will not be autowarmed. --> - <documentCache class="solr.LRUCache" size="512" initialSize="512" autowarmCount="0"/> + <documentCache class="solr.LRUCache" + size="4096" + initialSize="1024" + autowarmCount="0"/> <!-- Field Value Cache @@ -489,7 +517,7 @@ that. For most situations, this will not be useful unless you - frequently get the same search repeatedly with differnet sort + frequently get the same search repeatedly with different sort options, and none of them ever use "score" --> <!-- @@ -575,17 +603,22 @@ This section contains instructions for how the SolrDispatchFilter should behave when processing requests for this SolrCore. - handleSelect affects the behavior of requests such as /select?qt=XXX + handleSelect is a legacy option that affects the behavior of requests + such as /select?qt=XXX handleSelect="true" will cause the SolrDispatchFilter to process - the request and will result in consistent error handling and - formating for all types of requests. + the request and dispatch the query to a handler specified by the + "qt" param, assuming "/select" isn't already registered. 
handleSelect="false" will cause the SolrDispatchFilter to - ignore "/select" requests and fallback to using the legacy - SolrServlet and it's Solr 1.1 style error formatting + ignore "/select" requests, resulting in a 404 unless a handler + is explicitly registered with the name "/select" + + handleSelect="true" is not recommended for new users, but is the default + for backwards compatibility --> - <requestDispatcher handleSelect="true" > + <!-- TODO maybe we need to set this to true --> + <requestDispatcher handleSelect="false" > <!-- Request Parsing These settings indicate how Solr Requests may be parsed, and @@ -593,10 +626,15 @@ those requests enableRemoteStreaming - enables use of the stream.file - and stream.url paramaters for specifying remote streams. + and stream.url parameters for specifying remote streams. - multipartUploadLimitInKB - specifies the max size of - Multipart File Uploads that Solr will alow in a Request. + multipartUploadLimitInKB - specifies the max size (in KiB) of + Multipart File Uploads that Solr will allow in a Request. + + formdataUploadLimitInKB - specifies the max size (in KiB) of + form data (application/x-www-form-urlencoded) sent via + POST. You can use POST to pass request parameters not + fitting into the URL. 
*** WARNING *** The settings below authorize Solr to fetch remote files, You @@ -605,7 +643,8 @@ --> <requestParsers enableRemoteStreaming="true" - multipartUploadLimitInKB="2048000" /> + multipartUploadLimitInKB="2048000" + formdataUploadLimitInKB="2048"/> <!-- HTTP Caching @@ -629,7 +668,7 @@ <cacheControl>max-age=30, public</cacheControl> </httpCaching> --> - <!-- To enable Solr to responde with automaticly generated HTTP + <!-- To enable Solr to respond with automatically generated HTTP Caching headers, and to response to Cache Validation requests correctly, set the value of never304="false" @@ -643,12 +682,12 @@ Last-Modified value (and validation against If-Modified-Since requests) will all be relative to when the current Searcher was opened. You can change it to lastModFrom="dirLastMod" if - you want the value to exactly corrispond to when the physical + you want the value to exactly correspond to when the physical index was last modified. etagSeed="..." is an option you can change to force the ETag header (and validation against If-None-Match requests) to be - differnet even if the index has not changed (ie: when making + different even if the index has not changed (ie: when making significant changes to your config file) (lastModifiedFrom and etagSeed are both ignored if you use @@ -666,17 +705,17 @@ http://wiki.apache.org/solr/SolrRequestHandler - incoming queries will be dispatched to the correct handler - based on the path or the qt (query type) param. + Incoming queries will be dispatched to a specific handler by name + based on the path specified in the request. + + Legacy behavior: If the request path uses "/select" but no Request + Handler has that name, and if handleSelect="true" has been specified in + the requestDispatcher, then the Request Handler is dispatched based on + the qt parameter. 
Handlers without a leading '/' are accessed this way + like so: http://host/app/[core/]select?qt=name If no qt is + given, then the requestHandler that declares default="true" will be + used or the one named "standard". - Names starting with a '/' are accessed with the a path equal to - the registered name. Names without a leading '/' are accessed - with: http://host/app/[core/]select?qt=name - - If a /select request is processed with out a qt param - specified, the requestHandler that declares default="true" will - be used. - If a Request Handler is declared with startup="lazy", then it will not be initialized until the first request that uses it. @@ -690,14 +729,15 @@ of SearchComponents (see below) and supports distributed queries across multiple shards --> - <requestHandler name="search" class="solr.SearchHandler" default="true"> + <requestHandler name="/select" class="solr.SearchHandler"> <!-- default values for query parameters can be specified, these will be overridden by parameters in the request --> <lst name="defaults"> <str name="echoParams">explicit</str> <int name="rows">10</int> - </lst> + <str name="df">_text</str> <!-- _text is the full text field used by the entityhub --> + </lst> <!-- In addition to defaults, "appends" params can be specified to identify values which should be appended to the list of multi-val params from the query (or the existing "defaults"). 
@@ -751,16 +791,39 @@ </arr> --> </requestHandler> - - <!-- Request Handler for similarity queries and topic classification --> - <requestHandler name="/mlt" class="solr.MoreLikeThisHandler" startup="lazy" /> - <!-- A Robust Example + <!-- Request Handler for similarity queries and topic classification --> + <requestHandler name="/mlt" class="solr.MoreLikeThisHandler" startup="lazy" /> + + <!-- A request handler that returns indented JSON by default --> + <requestHandler name="/query" class="solr.SearchHandler"> + <lst name="defaults"> + <str name="echoParams">explicit</str> + <str name="wt">json</str> + <str name="indent">true</str> + <str name="df">text</str> + </lst> + </requestHandler> + + + <!-- realtime get handler, guaranteed to return the latest stored fields of + any document, without the need to commit or open a new searcher. The + current implementation relies on the updateLog feature being enabled. --> + <requestHandler name="/get" class="solr.RealTimeGetHandler"> + <lst name="defaults"> + <str name="omitHeader">true</str> + <str name="wt">json</str> + <str name="indent">true</str> + </lst> + </requestHandler> + + <!-- A Robust Example + This example SearchHandler declaration shows off usage of the SearchHandler with many defaults declared - Note that multiple instances of hte same Request Handler + Note that multiple instances of the same Request Handler (SearchHandler) can be registered multiple times with different names (and different init parameters) --> @@ -770,37 +833,48 @@ <!-- VelocityResponseWriter settings --> <str name="wt">velocity</str> - <str name="v.template">browse</str> <str name="v.layout">layout</str> <str name="title">Solritas</str> + <!-- Query settings --> <str name="defType">edismax</str> + <str name="qf"> + text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4 + title^10.0 description^5.0 keywords^5.0 author^2.0 resourcename^1.0 + </str> + <str name="df">text</str> + <str name="mm">100%</str> <str 
name="q.alt">*:*</str> <str name="rows">10</str> <str name="fl">*,score</str> + <str name="mlt.qf"> text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4 + title^10.0 description^5.0 keywords^5.0 author^2.0 resourcename^1.0 </str> - <str name="mlt.fl">text,features,name,sku,id,manu,cat</str> + <str name="mlt.fl">text,features,name,sku,id,manu,cat,title,description,keywords,author,resourcename</str> <int name="mlt.count">3</int> - <str name="qf"> - text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4 - </str> - + <!-- Faceting defaults --> <str name="facet">on</str> <str name="facet.field">cat</str> <str name="facet.field">manu_exact</str> + <str name="facet.field">content_type</str> + <str name="facet.field">author_s</str> <str name="facet.query">ipod</str> <str name="facet.query">GB</str> <str name="facet.mincount">1</str> <str name="facet.pivot">cat,inStock</str> + <str name="facet.range.other">after</str> <str name="facet.range">price</str> <int name="f.price.facet.range.start">0</int> <int name="f.price.facet.range.end">600</int> <int name="f.price.facet.range.gap">50</int> - <str name="f.price.facet.range.other">after</str> + <str name="facet.range">popularity</str> + <int name="f.popularity.facet.range.start">0</int> + <int name="f.popularity.facet.range.end">10</int> + <int name="f.popularity.facet.range.gap">3</int> <str name="facet.range">manufacturedate_dt</str> <str name="f.manufacturedate_dt.facet.range.start">NOW/YEAR-10YEARS</str> <str name="f.manufacturedate_dt.facet.range.end">NOW</str> @@ -808,34 +882,59 @@ <str name="f.manufacturedate_dt.facet.range.other">before</str> <str name="f.manufacturedate_dt.facet.range.other">after</str> - <!-- Highlighting defaults --> <str name="hl">on</str> - <str name="hl.fl">text features name</str> + <str name="hl.fl">content features title name</str> + <str name="hl.encoder">html</str> + <str name="hl.simple.pre"><b></str> + <str name="hl.simple.post"></b></str> + <str 
name="f.title.hl.fragsize">0</str> + <str name="f.title.hl.alternateField">title</str> <str name="f.name.hl.fragsize">0</str> <str name="f.name.hl.alternateField">name</str> + <str name="f.content.hl.snippets">3</str> + <str name="f.content.hl.fragsize">200</str> + <str name="f.content.hl.alternateField">content</str> + <str name="f.content.hl.maxAlternateFieldLength">750</str> + + <!-- Spell checking defaults --> + <str name="spellcheck">on</str> + <str name="spellcheck.extendedResults">false</str> + <str name="spellcheck.count">5</str> + <str name="spellcheck.alternativeTermCount">2</str> + <str name="spellcheck.maxResultsForSuggest">5</str> + <str name="spellcheck.collate">true</str> + <str name="spellcheck.collateExtendedResults">true</str> + <str name="spellcheck.maxCollationTries">5</str> + <str name="spellcheck.maxCollations">3</str> </lst> + + <!-- append spellchecking to our list of components --> <arr name="last-components"> <str>spellcheck</str> </arr> - <!-- - <str name="url-scheme">httpx</str> - --> </requestHandler> - <!-- XML Update Request Handler. + + <!-- Update Request Handler. http://wiki.apache.org/solr/UpdateXmlMessages The canonical Request Handler for Modifying the Index through - commands specified using XML. + commands specified using XML, JSON, CSV, or JAVABIN Note: Since solr1.1 requestHandlers requires a valid content type header if posted in the body. 
For example, curl now requires: -H 'Content-type:text/xml; charset=utf-8' + + To override the request content type and force a specific + Content-type, use the request parameter: + ?update.contentType=text/csv + + This handler will pick a response format to match the input + if the 'wt' parameter is not explicit --> - <requestHandler name="/update" - class="solr.XmlUpdateRequestHandler"> + <requestHandler name="/update" class="solr.UpdateRequestHandler"> <!-- See below for information on defining updateRequestProcessorChains that can be used by name on each Update Request @@ -845,26 +944,19 @@ <str name="update.chain">dedupe</str> </lst> --> - </requestHandler> - <!-- Binary Update Request Handler - http://wiki.apache.org/solr/javabin - --> - <requestHandler name="/update/javabin" - class="solr.BinaryUpdateRequestHandler" /> - - <!-- CSV Update Request Handler - http://wiki.apache.org/solr/UpdateCSV - --> - <requestHandler name="/update/csv" - class="solr.CSVRequestHandler" - startup="lazy" /> + </requestHandler> - <!-- JSON Update Request Handler - http://wiki.apache.org/solr/UpdateJSON - --> - <requestHandler name="/update/json" - class="solr.JsonUpdateRequestHandler" - startup="lazy" /> + <!-- for back compat with clients using /update/json and /update/csv --> + <requestHandler name="/update/json" class="solr.JsonUpdateRequestHandler"> + <lst name="defaults"> + <str name="stream.contentType">application/json</str> + </lst> + </requestHandler> + <requestHandler name="/update/csv" class="solr.CSVRequestHandler"> + <lst name="defaults"> + <str name="stream.contentType">application/csv</str> + </lst> + </requestHandler> <!-- Solr Cell Update Request Handler @@ -875,9 +967,6 @@ startup="lazy" class="solr.extraction.ExtractingRequestHandler" > <lst name="defaults"> - <!-- All the main content goes into "text"... if you need to return - the extracted text or do highlighting, use a stored field. 
--> - <str name="fmap.content">text</str> <str name="lowernames">true</str> <str name="uprefix">ignored_</str> @@ -888,6 +977,7 @@ </lst> </requestHandler> + <!-- Field Analysis Request Handler RequestHandler that provides much the same functionality as @@ -916,7 +1006,7 @@ http://wiki.apache.org/solr/AnalysisRequestHandler An analysis handler that provides a breakdown of the analysis - process of provided docuemnts. This handler expects a (single) + process of provided documents. This handler expects a (single) content stream with the following format: <docs> @@ -931,12 +1021,12 @@ </docs> Note: Each document must contain a field which serves as the - unique key. This key is used in the returned response to assoicate - ananalysis breakdown to the analyzed document. + unique key. This key is used in the returned response to associate + an analysis breakdown to the analyzed document. Like the FieldAnalysisRequestHandler, this handler also supports query analysis by sending either an "analysis.query" or "q" - request paraemter that holds the query text to be analyized. It + request parameter that holds the query text to be analyzed. It also supports the "analysis.showmatch" parameter which when set to true, all field tokens that match the query tokens will be marked as a "match". @@ -952,7 +1042,7 @@ --> <requestHandler name="/admin/" class="solr.admin.AdminHandlers" /> - <!-- This single handler is equivilent to the following... --> + <!-- This single handler is equivalent to the following... 
--> <!-- <requestHandler name="/admin/luke" class="solr.admin.LukeRequestHandler" /> <requestHandler name="/admin/system" class="solr.admin.SystemInfoHandler" /> @@ -976,11 +1066,18 @@ <!-- ping/healthcheck --> <requestHandler name="/admin/ping" class="solr.PingRequestHandler"> - <lst name="defaults"> - <str name="qt">search</str> + <lst name="invariants"> <str name="q">solrpingquery</str> + </lst> + <lst name="defaults"> <str name="echoParams">all</str> </lst> + <!-- An optional feature of the PingRequestHandler is to configure the + handler with a "healthcheckFile" which can be used to enable/disable + the PingRequestHandler. + relative paths are resolved against the data dir + --> + <!-- <str name="healthcheckFile">server-enabled.txt</str> --> </requestHandler> <!-- Echo the request contents back to the client --> @@ -994,34 +1091,44 @@ <!-- Solr Replication The SolrReplicationHandler supports replicating indexes from a - "master" used for indexing and "salves" used for queries. + "master" used for indexing and "slaves" used for queries. http://wiki.apache.org/solr/SolrReplication - In the example below, remove the <lst name="master"> section if - this is just a slave and remove the <lst name="slave"> section - if this is just a master. + It is also necessary for SolrCloud to function (in Cloud mode, the + replication handler is used to bulk transfer segments when nodes + are added or need to recover). + + https://wiki.apache.org/solr/SolrCloud/ --> - <!-- - <requestHandler name="/replication" class="solr.ReplicationHandler" > + <requestHandler name="/replication" class="solr.ReplicationHandler" > + <!-- + To enable simple master/slave replication, uncomment one of the + sections below, depending on whether this solr instance should be + the "master" or a "slave". If this instance is a "slave" you will + also need to fill in the masterUrl to point to a real machine. 
+ --> + <!-- <lst name="master"> <str name="replicateAfter">commit</str> <str name="replicateAfter">startup</str> <str name="confFiles">schema.xml,stopwords.txt</str> </lst> + --> + <!-- <lst name="slave"> - <str name="masterUrl">http://localhost:8983/solr/replication</str> + <str name="masterUrl">http://your-master-hostname:8983/solr</str> <str name="pollInterval">00:00:60</str> </lst> - </requestHandler> --> + </requestHandler> <!-- Search Components Search components are registered to SolrCore and used by instances of SearchHandler (which can access them by name) - By default, the following components are avaliable: + By default, the following components are available: <searchComponent name="query" class="solr.QueryComponent" /> <searchComponent name="facet" class="solr.FacetComponent" /> @@ -1058,7 +1165,7 @@ always be executed after the "last-components" --> - + <!-- Spell Check The spell check component can return a list of alternative spelling @@ -1074,13 +1181,38 @@ component --> - <!-- a spellchecker built from a field of hte main index, and - written to disk - --> + <!-- a spellchecker built from a field of the main index --> <lst name="spellchecker"> <str name="name">default</str> <str name="field">name</str> - <str name="spellcheckIndexDir">spellchecker</str> + <str name="classname">solr.DirectSolrSpellChecker</str> + <!-- the spellcheck distance measure used, the default is the internal levenshtein --> + <str name="distanceMeasure">internal</str> + <!-- minimum accuracy needed to be considered a valid spellcheck suggestion --> + <float name="accuracy">0.5</float> + <!-- the maximum #edits we consider when enumerating terms: can be 1 or 2 --> + <int name="maxEdits">2</int> + <!-- the minimum shared prefix when enumerating terms --> + <int name="minPrefix">1</int> + <!-- maximum number of inspections per result. 
--> + <int name="maxInspections">5</int> + <!-- minimum length of a query term to be considered for correction --> + <int name="minQueryLength">4</int> + <!-- maximum threshold of documents a query term can appear to be considered for correction --> + <float name="maxQueryFrequency">0.01</float> + <!-- uncomment this to require suggestions to occur in 1% of the documents + <float name="thresholdTokenFrequency">.01</float> + --> + </lst> + + <!-- a spellchecker that can break or combine words. See "/spell" handler below for usage --> + <lst name="spellchecker"> + <str name="name">wordbreak</str> + <str name="classname">solr.WordBreakSolrSpellChecker</str> + <str name="field">name</str> + <str name="combineWords">true</str> + <str name="breakWords">true</str> + <int name="maxChanges">10</int> </lst> <!-- a spellchecker that uses a different distance measure --> @@ -1088,10 +1220,10 @@ <lst name="spellchecker"> <str name="name">jarowinkler</str> <str name="field">spell</str> + <str name="classname">solr.DirectSolrSpellChecker</str> <str name="distanceMeasure"> org.apache.lucene.search.spell.JaroWinklerDistance </str> - <str name="spellcheckIndexDir">spellcheckerJaro</str> </lst> --> @@ -1106,9 +1238,8 @@ <lst name="spellchecker"> <str name="name">freq</str> <str name="field">lowerfilt</str> - <str name="spellcheckIndexDir">spellcheckerFreq</str> + <str name="classname">solr.DirectSolrSpellChecker</str> <str name="comparatorClass">freq</str> - <str name="buildOnCommit">true</str> --> <!-- A spellchecker that reads the list of words from a file --> @@ -1138,9 +1269,22 @@ --> <requestHandler name="/spell" class="solr.SearchHandler" startup="lazy"> <lst name="defaults"> - <str name="spellcheck.onlyMorePopular">false</str> - <str name="spellcheck.extendedResults">false</str> - <str name="spellcheck.count">1</str> + <str name="df">text</str> + <!-- Solr will use suggestions from both the 'default' spellchecker + and from the 'wordbreak' spellchecker and combine them. 
+ collations (re-written queries) can include a combination of + corrections from both spellcheckers --> + <str name="spellcheck.dictionary">default</str> + <str name="spellcheck.dictionary">wordbreak</str> + <str name="spellcheck">on</str> + <str name="spellcheck.extendedResults">true</str> + <str name="spellcheck.count">10</str> + <str name="spellcheck.alternativeTermCount">5</str> + <str name="spellcheck.maxResultsForSuggest">5</str> + <str name="spellcheck.collate">true</str> + <str name="spellcheck.collateExtendedResults">true</str> + <str name="spellcheck.maxCollationTries">10</str> + <str name="spellcheck.maxCollations">5</str> </lst> <arr name="last-components"> <str>spellcheck</str> @@ -1160,8 +1304,9 @@ In reality you will likely want to add the component to your already specified request handlers. --> - <requestHandler name="tvrh" class="solr.SearchHandler" startup="lazy"> + <requestHandler name="/tvrh" class="solr.SearchHandler" startup="lazy"> <lst name="defaults"> + <str name="df">text</str> <bool name="tv">true</bool> </lst> <arr name="last-components"> @@ -1173,14 +1318,13 @@ http://wiki.apache.org/solr/ClusteringComponent - This relies on third party jars which are notincluded in the - release. To use this component (and the "/clustering" handler) - Those jars will need to be downloaded, and you'll need to set - the solr.cluster.enabled system property when running solr... + You'll need to set the solr.clustering.enabled system property + when running solr to run with clustering enabled: + + java -Dsolr.clustering.enabled=true -jar start.jar - java -Dsolr.clustering.enabled=true -jar start.jar --> - <searchComponent name="clustering" + <searchComponent name="clustering" enable="${solr.clustering.enabled:false}" class="solr.clustering.ClusteringComponent" > <!-- Declare an engine --> @@ -1188,8 +1332,8 @@ <!-- The name, only one can be named "default" --> <str name="name">default</str> - <!-- Class name of Carrot2 clustering algorithm. 
- + <!-- Class name of Carrot2 clustering algorithm. + Currently available algorithms are: * org.carrot2.clustering.lingo.LingoClusteringAlgorithm @@ -1211,7 +1355,7 @@ name and attribute value as parameter value. --> <str name="LingoClusteringAlgorithm.desiredClusterCountBase">20</str> - + <!-- Location of Carrot2 lexical resources. A directory from which to load Carrot2-specific stop words @@ -1226,7 +1370,7 @@ <str name="carrot.lexicalResourcesDir">clustering/carrot2</str> <!-- The language to assume for the documents. - + For a list of allowed values, see: http://download.carrot2.org/stable/manual/#section.attribute.lingo.MultilingualClustering.defaultLanguage --> @@ -1267,7 +1411,7 @@ <str name="defType">edismax</str> <str name="qf"> - text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4 + text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4 </str> <str name="q.alt">*:*</str> <str name="rows">10</str> @@ -1291,6 +1435,7 @@ <requestHandler name="/terms" class="solr.SearchHandler" startup="lazy"> <lst name="defaults"> <bool name="terms">true</bool> + <bool name="distrib">false</bool> </lst> <arr name="components"> <str>terms</str> @@ -1316,6 +1461,7 @@ <requestHandler name="/elevate" class="solr.SearchHandler" startup="lazy"> <lst name="defaults"> <str name="echoParams">explicit</str> + <str name="df">text</str> </lst> <arr name="last-components"> <str>elevator</str> @@ -1369,13 +1515,17 @@ <!-- Configure the standard fragListBuilder --> <fragListBuilder name="simple" - default="true" class="solr.highlight.SimpleFragListBuilder"/> - + <!-- Configure the single fragListBuilder --> <fragListBuilder name="single" class="solr.highlight.SingleFragListBuilder"/> - + + <!-- Configure the weighted fragListBuilder --> + <fragListBuilder name="weighted" + default="true" + class="solr.highlight.WeightedFragListBuilder"/> + <!-- default tag FragmentsBuilder --> <fragmentsBuilder name="default" default="true" @@ -1400,6 +1550,27 @@ <str 
name="hl.tag.post"><![CDATA[</b>]]></str> </lst> </fragmentsBuilder> + + <boundaryScanner name="default" + default="true" + class="solr.highlight.SimpleBoundaryScanner"> + <lst name="defaults"> + <str name="hl.bs.maxScan">10</str> + <str name="hl.bs.chars">.,!? 	 </str> + </lst> + </boundaryScanner> + + <boundaryScanner name="breakIterator" + class="solr.highlight.BreakIteratorBoundaryScanner"> + <lst name="defaults"> + <!-- type should be one of CHARACTER, WORD(default), LINE and SENTENCE --> + <str name="hl.bs.type">WORD</str> + <!-- language and country are used when constructing Locale object. --> + <!-- And the Locale object will be used when getting instance of BreakIterator --> + <str name="hl.bs.language">en</str> + <str name="hl.bs.country">US</str> + </lst> + </boundaryScanner> </highlighting> </searchComponent> @@ -1434,7 +1605,47 @@ <processor class="solr.RunUpdateProcessorFactory" /> </updateRequestProcessorChain> --> + + <!-- Language identification + + This example update chain identifies the language of the incoming + documents using the langid contrib. The detected language is + written to field language_s. No field name mapping is done. + The fields used for detection are text, title, subject and description, + making this example suitable for detecting languages from full-text + rich documents injected via ExtractingRequestHandler. 
+ See more about langId at http://wiki.apache.org/solr/LanguageDetection + --> + <!-- + <updateRequestProcessorChain name="langid"> + <processor class="org.apache.solr.update.processor.TikaLanguageIdentifierUpdateProcessorFactory"> + <str name="langid.fl">text,title,subject,description</str> + <str name="langid.langField">language_s</str> + <str name="langid.fallback">en</str> + </processor> + <processor class="solr.LogUpdateProcessorFactory" /> + <processor class="solr.RunUpdateProcessorFactory" /> + </updateRequestProcessorChain> + --> + <!-- Script update processor + + This example hooks in an update processor implemented using JavaScript. + + See more about the script update processor at http://wiki.apache.org/solr/ScriptUpdateProcessor + --> + <!-- + <updateRequestProcessorChain name="script"> + <processor class="solr.StatelessScriptUpdateProcessorFactory"> + <str name="script">update-script.js</str> + <lst name="params"> + <str name="config_param">example config parameter</str> + </lst> + </processor> + <processor class="solr.RunUpdateProcessorFactory" /> + </updateRequestProcessorChain> + --> + <!-- Response Writers http://wiki.apache.org/solr/QueryResponseWriter @@ -1458,15 +1669,22 @@ <queryResponseWriter name="ruby" class="solr.RubyResponseWriter"/> <queryResponseWriter name="php" class="solr.PHPResponseWriter"/> <queryResponseWriter name="phps" class="solr.PHPSerializedResponseWriter"/> - <queryResponseWriter name="velocity" class="solr.VelocityResponseWriter"/> <queryResponseWriter name="csv" class="solr.CSVResponseWriter"/> --> + + <queryResponseWriter name="json" class="solr.JSONResponseWriter"> + <!-- For the purposes of the tutorial, JSON responses are written as + plain text so that they are easy to read in *any* browser. + If you expect a MIME type of "application/json" just remove this override. + --> + <str name="content-type">text/plain; charset=UTF-8</str> + </queryResponseWriter> + <!-- Custom response writers can be declared as needed... 
--> - <!-- - <queryResponseWriter name="custom" class="com.example.MyResponseWriter"/> - --> + <queryResponseWriter name="velocity" class="solr.VelocityResponseWriter" startup="lazy"/> + <!-- XSLT response writer transforms the XML output by any xslt file found in Solr's conf/xslt directory. Changes to xslt files are checked for @@ -1501,17 +1719,36 @@ <valueSourceParser name="myfunc" class="com.mycompany.MyValueSourceParser" /> --> + + + <!-- Document Transformers + http://wiki.apache.org/solr/DocTransformers + --> + <!-- + Could be something like: + <transformer name="db" class="com.mycompany.LoadFromDatabaseTransformer" > + <int name="connection">jdbc://....</int> + </transformer> + + To add a constant value to all docs, use: + <transformer name="mytrans2" class="org.apache.solr.response.transform.ValueAugmenterFactory" > + <int name="value">5</int> + </transformer> + + If you want the user to still be able to change it with _value:something_ use this: + <transformer name="mytrans3" class="org.apache.solr.response.transform.ValueAugmenterFactory" > + <double name="defaultValue">5</double> + </transformer> + + If you are using the QueryElevationComponent, you may wish to mark documents that get boosted. The + EditorialMarkerFactory will do exactly that: + <transformer name="qecBooster" class="org.apache.solr.response.transform.EditorialMarkerFactory" /> + --> + <!-- Legacy config for the admin interface --> <admin> <defaultQuery>*:*</defaultQuery> - - <!-- configure a healthcheck file for servers behind a - loadbalancer - --> - <!-- - <healthcheck type="file">server-enabled</healthcheck> - --> </admin> </config>
Modified: stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/spellings.txt URL: http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/spellings.txt?rev=1469266&r1=1469265&r2=1469266&view=diff ============================================================================== --- stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/spellings.txt (original) +++ stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/spellings.txt Thu Apr 18 10:17:06 2013 @@ -0,0 +1,2 @@ +pizza +history Modified: stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/stopwords.txt URL: http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/stopwords.txt?rev=1469266&r1=1469265&r2=1469266&view=diff ============================================================================== --- stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/stopwords.txt (original) +++ stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/stopwords.txt Thu Apr 18 10:17:06 2013 @@ -12,43 +12,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- -#----------------------------------------------------------------------- - -#Standard english stop words taken from Lucene's StopAnalyzer -a -an -and -are -as -at -be -but -by -for -if -in -into -is -it -no -not -of -on -or -s -such -t -that -the -their -then -there -these -they -this -to -was -will -with - Modified: stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/synonyms.txt URL: http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/synonyms.txt?rev=1469266&r1=1469265&r2=1469266&view=diff ============================================================================== --- stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/synonyms.txt (original) +++ stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/synonyms.txt Thu Apr 18 10:17:06 2013 @@ -11,6 +11,12 @@ # limitations under the License. #----------------------------------------------------------------------- +#some test synonym mappings unlikely to appear in real input text +aaafoo => aaabar +bbbfoo => bbbfoo bbbbar +cccfoo => cccbar cccbaz +fooaaa,baraaa,bazaaa + # Some synonym groups specific to this example GB,gib,gigabyte,gigabytes MB,mib,megabyte,megabytes @@ -19,5 +25,5 @@ Television, Televisions, TV, TVs #after us won't split it into two words. 
# Synonym mappings can be used for spelling correction too -# pixima => pixma +pixima => pixma Added: stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/update-script.js URL: http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/update-script.js?rev=1469266&view=auto ============================================================================== --- stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/update-script.js (added) +++ stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/update-script.js Thu Apr 18 10:17:06 2013 @@ -0,0 +1,53 @@ +/* + This is a basic skeleton JavaScript update processor. + + In order for this to be executed, it must be properly wired into solrconfig.xml; by default it is commented out in + the example solrconfig.xml and must be uncommented to be enabled. + + See http://wiki.apache.org/solr/ScriptUpdateProcessor for more details. 
+*/ + +function processAdd(cmd) { + + doc = cmd.solrDoc; // org.apache.solr.common.SolrInputDocument + id = doc.getFieldValue("id"); + logger.info("update-script#processAdd: id=" + id); + +// Set a field value: +// doc.setField("foo_s", "whatever"); + +// Get a configuration parameter: +// config_param = params.get('config_param'); // "params" only exists if processor configured with <lst name="params"> + +// Get a request parameter: +// some_param = req.getParams().get("some_param") + +// Add a field of field names that match a pattern: +// - Potentially useful to determine the fields/attributes represented in a result set, via faceting on field_name_ss +// field_names = doc.getFieldNames().toArray(); +// for(i=0; i < field_names.length; i++) { +// field_name = field_names[i]; +// if (/attr_.*/.test(field_name)) { doc.addField("attribute_ss", field_names[i]); } +// } + +} + +function processDelete(cmd) { + // no-op +} + +function processMergeIndexes(cmd) { + // no-op +} + +function processCommit(cmd) { + // no-op +} + +function processRollback(cmd) { + // no-op +} + +function finish() { + // no-op +}
