Added: nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrMappingReader.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrMappingReader.java?rev=1558349&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrMappingReader.java (added)
+++ nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrMappingReader.java Wed Jan 15 12:01:45 2014
@@ -0,0 +1,241 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexwriter.solr;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.MalformedURLException;
+import java.util.HashMap;
+import java.util.Map;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.ObjectCache;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+/**
+ * Reads the Solr index mapping file (<code>solrindex-mapping.xml</code> unless
+ * {@link SolrConstants#MAPPING_FILE} names another resource) and exposes the
+ * field mappings, copy-field mappings and unique key declared in it.
+ *
+ * <p>
+ * {@link #getInstance(Configuration)} caches one reader per configuration in
+ * the {@link ObjectCache}. A missing or unparsable mapping file is logged as a
+ * warning and never raised: the reader then keeps whatever it managed to load,
+ * with <code>id</code> as the unique key unless one was read.
+ * </p>
+ */
+public class SolrMappingReader {
+  public static Logger LOG = LoggerFactory.getLogger(SolrMappingReader.class);
+
+  /** Unique key used unless the mapping declares exactly one non-empty key. */
+  private static final String DEFAULT_UNIQUE_KEY = "id";
+
+  private final Configuration conf;
+
+  /** Source to destination field names, one entry per &lt;field&gt; element. */
+  private final Map<String, String> keyMap = new HashMap<String, String>();
+  /** Source to destination field names, one entry per &lt;copyField&gt; element. */
+  private final Map<String, String> copyMap = new HashMap<String, String>();
+  private String uniqueKey = DEFAULT_UNIQUE_KEY;
+
+  /**
+   * Returns the reader cached for the given configuration, creating and caching
+   * it on first use.
+   *
+   * @param conf
+   *          configuration that locates the mapping file and owns the cache
+   * @return the reader shared by all callers using <code>conf</code>
+   */
+  public static synchronized SolrMappingReader getInstance(Configuration conf) {
+    ObjectCache cache = ObjectCache.get(conf);
+    String cacheKey = SolrMappingReader.class.getName();
+    SolrMappingReader instance = (SolrMappingReader) cache.getObject(cacheKey);
+    if (instance == null) {
+      instance = new SolrMappingReader(conf);
+      cache.setObject(cacheKey, instance);
+    }
+    return instance;
+  }
+
+  /**
+   * Creates a reader and immediately parses the mapping file.
+   *
+   * @param conf
+   *          configuration that locates the mapping file
+   */
+  protected SolrMappingReader(Configuration conf) {
+    this.conf = conf;
+    parseMapping();
+  }
+
+  /**
+   * Loads the mapping resource and fills the key map, the copy map and the
+   * unique key. Failures are logged and not propagated.
+   */
+  private void parseMapping() {
+    String mappingFile = conf.get(SolrConstants.MAPPING_FILE,
+        "solrindex-mapping.xml");
+    InputStream ssInputStream = conf.getConfResourceAsInputStream(mappingFile);
+    if (ssInputStream == null) {
+      // The resource could not be opened: keep the empty maps and default key
+      // instead of handing a null stream to the parser.
+      LOG.warn("Solr index mapping '" + mappingFile
+          + "' not found, using defaults");
+      return;
+    }
+
+    try {
+      DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+      DocumentBuilder builder = factory.newDocumentBuilder();
+      Document document = builder.parse(new InputSource(ssInputStream));
+      Element rootElement = document.getDocumentElement();
+
+      readMappings(rootElement.getElementsByTagName("field"), keyMap);
+      readMappings(rootElement.getElementsByTagName("copyField"), copyMap);
+      readUniqueKey(rootElement.getElementsByTagName("uniqueKey"));
+    } catch (SAXException e) {
+      logParseFailure(mappingFile, e);
+    } catch (IOException e) {
+      // Also covers MalformedURLException, which is an IOException.
+      logParseFailure(mappingFile, e);
+    } catch (ParserConfigurationException e) {
+      logParseFailure(mappingFile, e);
+    } finally {
+      try {
+        ssInputStream.close();
+      } catch (IOException e) {
+        LOG.warn("Could not close solr index mapping '" + mappingFile + "'", e);
+      }
+    }
+  }
+
+  /** Logs a parse failure together with its stack trace. */
+  private static void logParseFailure(String mappingFile, Exception e) {
+    LOG.warn("Could not parse solr index mapping '" + mappingFile + "': " + e,
+        e);
+  }
+
+  /**
+   * Copies the <code>source</code> and <code>dest</code> attributes of every
+   * element in the list into the target map.
+   */
+  private static void readMappings(NodeList elements,
+      Map<String, String> target) {
+    for (int i = 0; i < elements.getLength(); i++) {
+      Element element = (Element) elements.item(i);
+      String source = element.getAttribute("source");
+      String dest = element.getAttribute("dest");
+      LOG.info("source: " + source + " dest: " + dest);
+      target.put(source, dest);
+    }
+  }
+
+  /**
+   * Takes the unique key from the single &lt;uniqueKey&gt; element; keeps the
+   * default when the element is absent, repeated or has no text.
+   */
+  private void readUniqueKey(NodeList uniqueKeyItem) {
+    int definitions = uniqueKeyItem.getLength();
+    if (definitions > 1) {
+      LOG.warn("More than one unique key definitions found in solr index mapping, using default 'id'");
+      uniqueKey = DEFAULT_UNIQUE_KEY;
+    } else if (definitions == 0) {
+      LOG.warn("No unique key definition found in solr index mapping using, default 'id'");
+    } else {
+      // An empty element has no first child, and a non-text child has a null
+      // node value; neither may end up as the unique key.
+      Node firstChild = uniqueKeyItem.item(0).getFirstChild();
+      String value = firstChild == null ? null : firstChild.getNodeValue();
+      if (value == null) {
+        LOG.warn("Empty unique key definition found in solr index mapping, using default 'id'");
+      } else {
+        uniqueKey = value;
+      }
+    }
+  }
+
+  /** @return the live map of source field names to destination field names */
+  public Map<String, String> getKeyMap() {
+    return keyMap;
+  }
+
+  /** @return the live map of source field names to copy destinations */
+  public Map<String, String> getCopyMap() {
+    return copyMap;
+  }
+
+  /** @return the unique key field name, <code>id</code> by default */
+  public String getUniqueKey() {
+    return uniqueKey;
+  }
+
+  /**
+   * Returns the copy destination for a field.
+   *
+   * @param key
+   *          source field name
+   * @return the mapped copy destination, or <code>key</code> itself when no
+   *         copy mapping exists
+   */
+  public String hasCopy(String key) {
+    return lookup(copyMap, key);
+  }
+
+  /**
+   * Returns the destination field for a source field.
+   *
+   * @param key
+   *          source field name
+   * @return the mapped destination, or <code>key</code> itself when unmapped
+   * @throws IOException
+   *           never thrown by this implementation; kept for compatibility
+   */
+  public String mapKey(String key) throws IOException {
+    return lookup(keyMap, key);
+  }
+
+  /**
+   * Returns the copy destination for a field; same result as
+   * {@link #hasCopy(String)}.
+   *
+   * @param key
+   *          source field name
+   * @return the mapped copy destination, or <code>key</code> itself when no
+   *         copy mapping exists
+   * @throws IOException
+   *           never thrown by this implementation; kept for compatibility
+   */
+  public String mapCopyKey(String key) throws IOException {
+    return lookup(copyMap, key);
+  }
+
+  /** Returns the value mapped to the key, or the key when it is unmapped. */
+  private static String lookup(Map<String, String> mapping, String key) {
+    return mapping.containsKey(key) ? mapping.get(key) : key;
+  }
+}
Added: nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java?rev=1558349&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java (added)
+++ nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java Wed Jan 15 12:01:45 2014
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexwriter.solr;
+
+import org.apache.http.impl.client.DefaultHttpClient;
+import org.apache.http.auth.AuthScope;
+import org.apache.http.auth.UsernamePasswordCredentials;
+import org.apache.http.client.params.HttpClientParams;
+import org.apache.http.params.HttpParams;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.solr.client.solrj.impl.HttpSolrServer;
+
+import java.net.MalformedURLException;
+
+/**
+ * Helpers shared by the Solr index writer: creating the SolrJ client and
+ * cleaning text before it is sent to Solr.
+ */
+public class SolrUtils {
+
+  public static Logger LOG = LoggerFactory.getLogger(SolrUtils.class);
+
+  /**
+   * Creates a SolrJ client for the server named by
+   * {@link SolrConstants#SERVER_URL}. When {@link SolrConstants#USE_AUTH} is
+   * true, the configured user name and password are registered for any host,
+   * port, realm and scheme.
+   *
+   * @param job
+   *          configuration holding the Solr connection settings
+   * @return a client for the configured server URL
+   * @throws MalformedURLException
+   *           not raised by any statement in this method; kept in the
+   *           signature for existing callers
+   */
+  public static HttpSolrServer getHttpSolrServer(Configuration job)
+      throws MalformedURLException {
+    DefaultHttpClient client = new DefaultHttpClient();
+
+    // Check for username/password
+    if (job.getBoolean(SolrConstants.USE_AUTH, false)) {
+      String username = job.get(SolrConstants.USERNAME);
+
+      LOG.info("Authenticating as: " + username);
+
+      AuthScope scope = new AuthScope(AuthScope.ANY_HOST, AuthScope.ANY_PORT,
+          AuthScope.ANY_REALM, AuthScope.ANY_SCHEME);
+
+      client.getCredentialsProvider().setCredentials(scope,
+          new UsernamePasswordCredentials(username,
+              job.get(SolrConstants.PASSWORD)));
+
+      HttpParams params = client.getParams();
+      HttpClientParams.setAuthenticating(params, true);
+
+      client.setParams(params);
+    }
+
+    return new HttpSolrServer(job.get(SolrConstants.SERVER_URL), client);
+  }
+
+  /**
+   * Removes Unicode noncharacters and non-printable C0 control characters from
+   * a string.
+   *
+   * <p>
+   * Dropped are U+FDD0..U+FDEF, the last two code points of every plane
+   * (U+FFFE, U+FFFF, U+1FFFE, ... U+10FFFF) and everything up to U+001F except
+   * tabulator, new line and carriage return. The input is walked by code point
+   * so that noncharacters above U+FFFF, which a String stores as surrogate
+   * pairs, are recognised and removed as a unit.
+   * </p>
+   *
+   * @param input
+   *          text to clean; must not be null
+   * @return <code>input</code> without the code points listed above
+   */
+  public static String stripNonCharCodepoints(String input) {
+    StringBuilder retval = new StringBuilder(input.length());
+
+    for (int i = 0; i < input.length();) {
+      int codePoint = input.codePointAt(i);
+      i += Character.charCount(codePoint);
+
+      if (!isNonCharacter(codePoint) && !isStrippedControl(codePoint)) {
+        retval.appendCodePoint(codePoint);
+      }
+    }
+
+    return retval.toString();
+  }
+
+  /**
+   * Tells whether the code point is a Unicode noncharacter, see
+   * http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:]
+   */
+  private static boolean isNonCharacter(int codePoint) {
+    return (codePoint & 0xfffe) == 0xfffe // U+nFFFE and U+nFFFF of each plane
+        || (codePoint >= 0xfdd0 && codePoint <= 0xfdef); // bounds included
+  }
+
+  /** Tells whether the code point is a C0 control other than tab, LF or CR. */
+  private static boolean isStrippedControl(int codePoint) {
+    return codePoint <= 0x1f && codePoint != 0x9 && codePoint != 0xa
+        && codePoint != 0xd;
+  }
+}
\ No newline at end of file

Modified: nutch/branches/2.x/src/plugin/nutch-extensionpoints/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/nutch-extensionpoints/plugin.xml?rev=1558349&r1=1558348&r2=1558349&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/nutch-extensionpoints/plugin.xml (original)
+++ nutch/branches/2.x/src/plugin/nutch-extensionpoints/plugin.xml Wed Jan 15 12:01:45 2014
@@ -56,4 +56,7 @@
       id="org.apache.nutch.scoring.ScoringFilter"
       name="Nutch Scoring"/>
 
+<extension-point
+      id="org.apache.nutch.indexer.IndexWriter"
+      name="Nutch Index Writer"/>
 </plugin>
