Author: ab
Date: Wed Nov 25 20:58:10 2009
New Revision: 884269
URL: http://svn.apache.org/viewvc?rev=884269&view=rev
Log: NUTCH-760 Allow field mapping from nutch to solr index.
Added: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrMappingReader.java (with props) Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/SolrSearchBean.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=884269&r1=884268&r2=884269&view=diff ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Wed Nov 25 20:58:10 2009 @@ -2,6 +2,8 @@ Unreleased Changes +* NUTCH-760 Allow field mapping from Nutch to Solr index (David Stuart, ab) + * NUTCH-761 Avoid cloning CrawlDatum in CrawlDbReducer (Julien Nioche, ab) * NUTCH-753 Prevent new Fetcher from retrieving the robots twice (Julien Nioche via ab) Added: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrMappingReader.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrMappingReader.java?rev=884269&view=auto ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrMappingReader.java (added) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrMappingReader.java Wed Nov 25 20:58:10 2009 @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.indexer.solr; + +import java.io.IOException; +import java.io.InputStream; +import java.net.MalformedURLException; +import java.util.HashMap; +import java.util.Map; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.ObjectCache; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.NodeList; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; + +public class SolrMappingReader { + public static Log LOG = LogFactory.getLog(SolrMappingReader.class); + + /** The property name of the parse solr index mapping location */ + private static final String SS_FILE_MAPPING = "solrindex.mapping.file"; + + private Configuration conf; + + private Map<String, String> keyMap = new HashMap<String, String>(); + private Map<String, String> copyMap = new HashMap<String, String>(); + private String uniqueKey = "id"; + + public static synchronized SolrMappingReader getInstance(Configuration conf) { + ObjectCache cache = ObjectCache.get(conf); + SolrMappingReader instance = (SolrMappingReader)cache.getObject(SolrMappingReader.class.getName()); + if (instance == null) { + instance = new SolrMappingReader(conf); + cache.setObject(SolrMappingReader.class.getName(), instance); + } + return instance; + } + + protected SolrMappingReader(Configuration 
conf) { + this.conf = conf; + parseMapping(); + } + + private void parseMapping() { + InputStream ssInputStream = null; + ssInputStream = conf.getConfResourceAsInputStream(conf.get(SS_FILE_MAPPING, "solrindex-mapping.xml")); + InputSource inputSource = new InputSource(ssInputStream); + try { + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + DocumentBuilder builder = factory.newDocumentBuilder(); + Document document = builder.parse(inputSource); + Element rootElement = document.getDocumentElement(); + NodeList fieldList = rootElement.getElementsByTagName("field"); + if (fieldList.getLength() > 0) { + for (int i = 0; i < fieldList.getLength(); i++) { + Element element = (Element) fieldList.item(i); + LOG.info("source: " + element.getAttribute("source") + " dest: " + element.getAttribute("dest")); + keyMap.put(element.getAttribute("source"), element.getAttribute("dest")); + } + } + NodeList copyFieldList = rootElement.getElementsByTagName("copyField"); + if (copyFieldList.getLength() > 0) { + for (int i = 0; i < copyFieldList.getLength(); i++) { + Element element = (Element) copyFieldList.item(i); + LOG.info("source: " + element.getAttribute("source") + " dest: " + element.getAttribute("dest")); + copyMap.put(element.getAttribute("source"), element.getAttribute("dest")); + } + } + NodeList uniqueKeyItem = rootElement.getElementsByTagName("uniqueKey"); + if (uniqueKeyItem.getLength() > 1) { + LOG.warn("More than one unique key definitions found in solr index mapping, using default 'id'"); + uniqueKey = "id"; + } + else if (uniqueKeyItem.getLength() == 0) { + LOG.warn("No unique key definition found in solr index mapping using, default 'id'"); + } + else{ + uniqueKey = uniqueKeyItem.item(0).getFirstChild().getNodeValue(); + } + } catch (MalformedURLException e) { + LOG.warn(e.toString()); + } catch (SAXException e) { + LOG.warn(e.toString()); + } catch (IOException e) { + LOG.warn(e.toString()); + } catch (ParserConfigurationException e) { + 
LOG.warn(e.toString()); + } + } + + public Map<String, String> getKeyMap() { + return keyMap; + } + + public Map<String, String> getCopyMap() { + return copyMap; + } + + public String getUniqueKey() { + return uniqueKey; + } + + public String hasCopy(String key) { + if (copyMap.containsKey(key)) { + key = (String) copyMap.get(key); + } + return key; + } + + public String mapKey(String key) throws IOException { + if(keyMap.containsKey(key)) { + key = (String) keyMap.get(key); + } + return key; + } + + public String mapCopyKey(String key) throws IOException { + if(copyMap.containsKey(key)) { + key = (String) copyMap.get(key); + } + return key; + } +} Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrMappingReader.java ------------------------------------------------------------------------------ svn:eol-style = native Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java?rev=884269&r1=884268&r2=884269&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java Wed Nov 25 20:58:10 2009 @@ -32,23 +32,28 @@ public class SolrWriter implements NutchIndexWriter { private SolrServer solr; + private SolrMappingReader solrMapping; private final List<SolrInputDocument> inputDocs = new ArrayList<SolrInputDocument>(); private int commitSize; - public void open(JobConf job, String name) - throws IOException { + public void open(JobConf job, String name) throws IOException { solr = new CommonsHttpSolrServer(job.get(SolrConstants.SERVER_URL)); commitSize = job.getInt(SolrConstants.COMMIT_SIZE, 1000); + solrMapping = SolrMappingReader.getInstance(job); } public void write(NutchDocument doc) throws IOException { final 
SolrInputDocument inputDoc = new SolrInputDocument(); for(final Entry<String, List<String>> e : doc) { for (final String val : e.getValue()) { - inputDoc.addField(e.getKey(), val); + inputDoc.addField(solrMapping.mapKey(e.getKey()), val); + String sCopy = solrMapping.mapCopyKey(e.getKey()); + if (sCopy != e.getKey()) { + inputDoc.addField(sCopy, val); + } } } inputDoc.setDocumentBoost(doc.getScore()); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/SolrSearchBean.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/SolrSearchBean.java?rev=884269&r1=884268&r2=884269&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/SolrSearchBean.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/SolrSearchBean.java Wed Nov 25 20:58:10 2009 @@ -35,6 +35,7 @@ import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.TermQuery; import org.apache.lucene.util.ToStringUtils; +import org.apache.nutch.indexer.solr.SolrMappingReader; import org.apache.nutch.indexer.solr.SolrWriter; import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrServer; @@ -52,11 +53,15 @@ private final SolrServer solr; private final QueryFilters filters; + + private String searchUID; public SolrSearchBean(Configuration conf, String solrServer) throws IOException { solr = new CommonsHttpSolrServer(solrServer); filters = new QueryFilters(conf); + SolrMappingReader mapping = SolrMappingReader.getInstance(conf); + searchUID = mapping.getUniqueKey(); } public String getExplanation(Query query, Hit hit) throws IOException { @@ -76,10 +81,10 @@ solrQuery.setRows(numHits); if (sortField == null) { - solrQuery.setFields(dedupField, "score", "id"); + solrQuery.setFields(dedupField, "score", searchUID); sortField = "score"; } else { - solrQuery.setFields(dedupField, sortField, "id"); + 
solrQuery.setFields(dedupField, sortField, searchUID); solrQuery.setSortField(sortField, reverse ? ORDER.asc : ORDER.desc); } @@ -113,7 +118,7 @@ final String dedupValue = (String) solrDoc.getFirstValue(dedupField); - final String uniqueKey = (String )solrDoc.getFirstValue("id"); + final String uniqueKey = (String )solrDoc.getFirstValue(searchUID); hitArr[i] = new Hit(uniqueKey, sortValue, dedupValue); } @@ -124,7 +129,7 @@ public HitDetails getDetails(Hit hit) throws IOException { QueryResponse response; try { - response = solr.query(new SolrQuery("id:\"" + hit.getUniqueKey() + "\"")); + response = solr.query(new SolrQuery(searchUID + ":\"" + hit.getUniqueKey() + "\"")); } catch (final SolrServerException e) { throw SolrWriter.makeIOException(e); } @@ -141,7 +146,7 @@ final StringBuilder buf = new StringBuilder(); buf.append("("); for (final Hit hit : hits) { - buf.append(" id:\""); + buf.append(" " + searchUID + ":\""); buf.append(hit.getUniqueKey()); buf.append("\""); } @@ -169,7 +174,7 @@ new HashMap<String, HitDetails>(hits.length); for (final SolrDocument solrDoc : docList) { final HitDetails details = buildDetails(solrDoc); - detailsMap.put(details.getValue("id"), details); + detailsMap.put(details.getValue(searchUID), details); } final HitDetails[] detailsArr = new HitDetails[hits.length];