This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
commit 2c3d864222ef79ed19f33399b5abcd392f27c82a Author: Jakob Berlin <derhe...@yahoo.de> AuthorDate: Mon Aug 3 15:56:44 2020 +0200 NUTCH-1190 MoreIndexingFilter: move data formats used to parse "lastModified" to a config file --- conf/date-styles.txt.template | 52 +++++++++ .../nutch/indexer/more/MoreIndexingFilter.java | 123 ++++++++++++++------- 2 files changed, 132 insertions(+), 43 deletions(-) diff --git a/conf/date-styles.txt.template b/conf/date-styles.txt.template new file mode 100644 index 0000000..61ee0ac --- /dev/null +++ b/conf/date-styles.txt.template @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Set of date time formats +# used by the plugin index-more when filling the index field `lastModified'. +# +# Format (line separated date time patterns following definition in +# https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html, +# comment lines start with `#'): +# +# <date format> <CR> +# <date format2> [<CR> <date format3> ...] +# +# Examples and currently used formats: +# + +EEE MMM dd HH:mm:ss yyyy +EEE MMM dd HH:mm:ss yyyy zzz +EEE MMM dd HH:mm:ss zzz yyyy +EEE, MMM dd HH:mm:ss yyyy zzz +EEE, dd MMM yyyy HH:mm:ss zzz +EEE,dd MMM yyyy HH:mm:ss zzz +EEE, dd MMM yyyy HH:mm:sszzz +EEE, dd MMM yyyy HH:mm:ss +EEE, dd-MMM-yy HH:mm:ss zzz +yyyy/MM/dd HH:mm:ss.SSS zzz +yyyy/MM/dd HH:mm:ss.SSS +yyyy/MM/dd HH:mm:ss zzz +yyyy/MM/dd +yyyy.MM.dd HH:mm:ss +yyyy-MM-dd HH:mm +MMM dd yyyy HH:mm:ss. zzz +MMM dd yyyy HH:mm:ss zzz +dd.MM.yyyy HH:mm:ss zzz +dd MM yyyy HH:mm:ss zzz +dd.MM.yyyy; HH:mm:ss +dd.MM.yyyy HH:mm:ss +dd.MM.yyyy zzz +yyyy-MM-dd'T'HH:mm:ssXXX +yyyy-MM-dd'T'HH:mm:ss diff --git a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java index 45b79b7..2a475c5 100644 --- a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java +++ b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java @@ -54,19 +54,26 @@ import java.util.regex.PatternSyntaxException; import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.time.DateUtils; +import java.io.File; +import java.net.URL; +import java.util.List; +import java.util.ArrayList; +import org.apache.commons.io.FileUtils; +import java.nio.charset.StandardCharsets; + /** * Add (or reset) a few metaData properties as respective fields (if they are * available), so that they can be accurately used within the search index. - * + * * 'lastModifed' is indexed to support query by date, 'contentLength' obtains * content length from the HTTP header, 'type' field is indexed to support query * by type and finally the 'title' field is an attempt to reset the title if a * content-disposition hint exists. The logic is that such a presence is * indicative that the content provider wants the filename therein to be used as * the title. - * + * * Still need to make content-length searchable! - * + * * @author John Xing */ @@ -83,15 +90,30 @@ public class MoreIndexingFilter implements IndexingFilter { private boolean mapMimes = false; private String mapFieldName; + /** Date-styles used to parse date. */ + private String[] defaultDateStyles = new String[] { + "EEE MMM dd HH:mm:ss yyyy", "EEE MMM dd HH:mm:ss yyyy zzz", + "EEE MMM dd HH:mm:ss zzz yyyy", "EEE, MMM dd HH:mm:ss yyyy zzz", + "EEE, dd MMM yyyy HH:mm:ss zzz", "EEE,dd MMM yyyy HH:mm:ss zzz", + "EEE, dd MMM yyyy HH:mm:sszzz", "EEE, dd MMM yyyy HH:mm:ss", + "EEE, dd-MMM-yy HH:mm:ss zzz", "yyyy/MM/dd HH:mm:ss.SSS zzz", + "yyyy/MM/dd HH:mm:ss.SSS", "yyyy/MM/dd HH:mm:ss zzz", "yyyy/MM/dd", + "yyyy.MM.dd HH:mm:ss", "yyyy-MM-dd HH:mm", + "MMM dd yyyy HH:mm:ss. zzz", "MMM dd yyyy HH:mm:ss zzz", + "dd.MM.yyyy HH:mm:ss zzz", "dd MM yyyy HH:mm:ss zzz", + "dd.MM.yyyy; HH:mm:ss", "dd.MM.yyyy HH:mm:ss", "dd.MM.yyyy zzz", + "yyyy-MM-dd'T'HH:mm:ssXXX" }; + private String[] dateStyles = null; + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { String url_s = url.toString(); addTime(doc, parse.getData(), url_s, datum); - addLength(doc, parse.getData(), url_s); + addLength(doc, parse.getData()); addType(doc, parse.getData(), url_s, datum); - resetTitle(doc, parse.getData(), url_s); + resetTitle(doc, parse.getData()); return doc; } @@ -126,43 +148,32 @@ public class MoreIndexingFilter implements IndexingFilter { private long getTime(String date, String url) { long time = -1; + try { time = HttpDateFormat.toLong(date); } catch (ParseException e) { // try to parse it as date in alternative format try { - Date parsedDate = DateUtils.parseDate(date, new String[] { - "EEE MMM dd HH:mm:ss yyyy", "EEE MMM dd HH:mm:ss yyyy zzz", - "EEE MMM dd HH:mm:ss zzz yyyy", "EEE, MMM dd HH:mm:ss yyyy zzz", - "EEE, dd MMM yyyy HH:mm:ss zzz", "EEE,dd MMM yyyy HH:mm:ss zzz", - "EEE, dd MMM yyyy HH:mm:sszzz", "EEE, dd MMM yyyy HH:mm:ss", - "EEE, dd-MMM-yy HH:mm:ss zzz", "yyyy/MM/dd HH:mm:ss.SSS zzz", - "yyyy/MM/dd HH:mm:ss.SSS", "yyyy/MM/dd HH:mm:ss zzz", "yyyy/MM/dd", - "yyyy.MM.dd HH:mm:ss", "yyyy-MM-dd HH:mm", - "MMM dd yyyy HH:mm:ss. zzz", "MMM dd yyyy HH:mm:ss zzz", - "dd.MM.yyyy HH:mm:ss zzz", "dd MM yyyy HH:mm:ss zzz", - "dd.MM.yyyy; HH:mm:ss", "dd.MM.yyyy HH:mm:ss", "dd.MM.yyyy zzz", - "yyyy-MM-dd'T'HH:mm:ssXXX" }); + Date parsedDate = DateUtils.parseDate(date, dateStyles); time = parsedDate.getTime(); - // if (LOG.isWarnEnabled()) { - // LOG.warn(url + ": parsed date: " + date +" to:"+time); - // } + LOG.info(url + ": parsed date: " + date +" to: " + time); } catch (Exception e2) { if (LOG.isWarnEnabled()) { LOG.warn(url + ": can't parse erroneous date: " + date); } } } + return time; } // Add Content-Length - private NutchDocument addLength(NutchDocument doc, ParseData data, String url) { + private NutchDocument addLength(NutchDocument doc, ParseData data) { String contentLength = data.getMeta(Response.CONTENT_LENGTH); if (contentLength != null) { // NUTCH-1010 ContentLength not trimmed - String trimmed = contentLength.toString().trim(); + String trimmed = contentLength.trim(); if (!trimmed.isEmpty()) doc.add("contentLength", trimmed); } @@ -183,7 +194,7 @@ public class MoreIndexingFilter implements IndexingFilter { * all case insensitive. The query filter is implemented in * {@link TypeQueryFilter}. * </p> - * + * * @param doc * @param data * @param url @@ -196,10 +207,13 @@ public class MoreIndexingFilter implements IndexingFilter { Writable tcontentType = datum.getMetaData().get( new Text(Response.CONTENT_TYPE)); + if (tcontentType != null) { contentType = tcontentType.toString(); - } else + } else { contentType = data.getMeta(Response.CONTENT_TYPE); + } + if (contentType == null) { // Note by Jerome Charron on 20050415: // Content Type not solved by a previous plugin @@ -224,14 +238,11 @@ public class MoreIndexingFilter implements IndexingFilter { } // Check if we have to map mime types - if (mapMimes) { - // Check if the current mime is mapped - if (mimeMap.containsKey(mimeType)) { - if (mapFieldName != null) { - doc.add(mapFieldName, mimeMap.get(mimeType)); - } else { - mimeType = mimeMap.get(mimeType); - } + if (mapMimes && mimeMap.containsKey(mimeType)) { + if (mapFieldName != null) { + doc.add(mapFieldName, mimeMap.get(mimeType)); + } else { + mimeType = mimeMap.get(mimeType); } } @@ -255,7 +266,7 @@ public class MoreIndexingFilter implements IndexingFilter { /** * Utility method for splitting mime type into type and subtype. - * + * * @param mimeType * @return */ @@ -272,7 +283,7 @@ public class MoreIndexingFilter implements IndexingFilter { // Content-Disposition: inline; filename="foo.ppt" private Configuration conf; - static Pattern patterns[] = { null, null }; + static Pattern[] patterns = { null, null }; static { try { @@ -284,7 +295,7 @@ public class MoreIndexingFilter implements IndexingFilter { } } - private NutchDocument resetTitle(NutchDocument doc, ParseData data, String url) { + private NutchDocument resetTitle(NutchDocument doc, ParseData data) { String contentDisposition = data.getMeta(Metadata.CONTENT_DISPOSITION); if (contentDisposition == null || doc.getFieldValue("title") != null) return doc; @@ -316,6 +327,29 @@ public class MoreIndexingFilter implements IndexingFilter { LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); } } + + URL dateStylesResource = conf.getResource("date-styles.txt"); + if (dateStylesResource == null) { + dateStyles = defaultDateStyles; + LOG.warn("Can't find resource: date-styles.txt - Defaults will be used."); + } else { + try { + List<String> usedLines = new ArrayList<String>(); + for (String dateStyle: FileUtils.readLines(new File(dateStylesResource.getFile()), + StandardCharsets.US_ASCII)) { + if (StringUtils.isBlank(dateStyle) || dateStyle.startsWith("#")) { + continue; + } + + usedLines.add(StringUtils.trim(dateStyle)); + } + + dateStyles = new String[usedLines.size()]; + usedLines.toArray(dateStyles); + } catch (IOException e) { + LOG.error("Failed to load resource: date-styles.txt"); + } + } } public Configuration getConf() { @@ -324,16 +358,19 @@ public class MoreIndexingFilter implements IndexingFilter { private void readConfiguration() throws IOException { LOG.info("Reading content type mappings from file contenttype-mapping.txt"); - BufferedReader reader = new BufferedReader( - conf.getConfResourceAsReader("contenttype-mapping.txt")); - String line; - String parts[]; - boolean formatWarningShown = false; + try (BufferedReader reader = new BufferedReader( + conf.getConfResourceAsReader("contenttype-mapping.txt"))) { + String line; + String[] parts; + boolean formatWarningShown = false; - mimeMap = new HashMap<String, String>(); + mimeMap = new HashMap<String, String>(); + + while ((line = reader.readLine()) != null) { + if (StringUtils.isBlank(line) || line.startsWith("#")) { + continue; + } - while ((line = reader.readLine()) != null) { - if (StringUtils.isNotBlank(line) && !line.startsWith("#")) { line = line.trim(); parts = line.split("\t");