Hey all, I have a fairly odd case of duplicate documents in our Solr index
(see attached XML sample). The index holds roughly 35k documents. The only
way I've found to fix the problem is to run a delete statement by id, which
deletes both; I can then re-index that one document. This happened
previously, but it ended up being an issue with case-sensitivity; this
time the ids appear identical!

Any assistance in tracking this down would be appreciated! I can provide any
other logs if necessary.

Thanks,

Dan

Sample Select Query:
  <?xml version="1.0" encoding="UTF-8" ?> 
- <response>
- <lst name="responseHeader">
  <int name="status">0</int> 
  <int name="QTime">0</int> 
  </lst>
- <result name="response" numFound="2" start="0">
- <doc>
- <arr name="categoryId">
  <int>151</int> 
  <int>962</int> 
  <int>1493</int> 
  <int>1830</int> 
  </arr>
- <arr name="finish">
  <str>N/A</str> 
  </arr>
  <bool name="hasDigiCast">false</bool> 
  <bool name="hasDigiVista">false</bool> 
  <str name="id">hr-802waclighting</str> 
- <arr name="inStock">
  <bool>false</bool> 
  </arr>
  <bool name="isNew">false</bool> 
  <bool name="isTopSeller">true</bool> 
  <str name="manufacturer">wac lighting</str> 
- <arr name="masterFinish">
  <str>not applicable</str> 
  </arr>
  <date name="modifiedDate">2007-10-15T23:10:01.510Z</date> 
  <bool name="onSale">false</bool> 
  <int name="popularity">1683</int> 
- <arr name="price">
  <float>53.91</float> 
  </arr>
  <date name="productAddDate">2007-07-05T00:00:00Z</date> 
  <str name="productID">HR-802</str> 
  <str name="productTitle">Low Voltage Miniature Housing for Recessed
Lighting Fixture</str> 
  <str name="series">low voltage miniature housings</str> 
- <arr name="sku">
  <str /> 
  </arr>
  <str name="theme" /> 
- <arr name="upc">
  <str /> 
  </arr>
  </doc>
- <doc>
- <arr name="categoryId">
  <int>151</int> 
  <int>962</int> 
  <int>1493</int> 
  <int>1830</int> 
  </arr>
- <arr name="finish">
  <str>N/A</str> 
  </arr>
  <bool name="hasDigiCast">false</bool> 
  <bool name="hasDigiVista">false</bool> 
  <str name="id">hr-802waclighting</str> 
- <arr name="inStock">
  <bool>false</bool> 
  </arr>
  <bool name="isNew">false</bool> 
  <bool name="isTopSeller">true</bool> 
  <str name="manufacturer">wac lighting</str> 
- <arr name="masterFinish">
  <str>not applicable</str> 
  </arr>
  <date name="modifiedDate">2007-11-02T15:33:21.154Z</date> 
  <bool name="onSale">false</bool> 
  <int name="popularity">1683</int> 
- <arr name="price">
  <float>53.91</float> 
  </arr>
  <date name="productAddDate">2007-07-05T00:00:00Z</date> 
  <str name="productID">HR-802</str> 
  <str name="productTitle">Low Voltage Miniature Housing for Recessed
Lighting Fixture</str> 
  <str name="series">low voltage miniature housings</str> 
- <arr name="sku">
  <str /> 
  </arr>
  <str name="theme" /> 
- <arr name="upc">
  <str /> 
  </arr>
  </doc>
  </result>
  </response>

Schema.xml
 <field name="id" type="string" indexed="true" stored="true"/>
   <field name="sku" type="textTight" indexed="true" stored="true"
multiValued="true"/>
   <field name="upc" type="textTight" indexed="true" stored="true"
multiValued="true"/>
.....
<!-- field to use to determine and enforce document uniqueness. -->
 <uniqueKey>id</uniqueKey>

 <!-- field for the QueryParser to use when an explicit fieldname is absent
-->
 <defaultSearchField>text</defaultSearchField>

 <!-- SolrQueryParser configuration: defaultOperator="AND|OR" -->
 <solrQueryParser defaultOperator="OR"/>

-- 
View this message in context: 
http://www.nabble.com/SOLR-1.2---Duplicate-Documents---tf4762687.html#a13621332
Sent from the Solr - User mailing list archive at Nabble.com.

Reply via email to