Author: kwright
Date: Tue Jan 8 22:01:06 2013
New Revision: 1430565
URL: http://svn.apache.org/viewvc?rev=1430565&view=rev
Log:
Fix for CONNECTORS-600.
Modified:
manifoldcf/trunk/CHANGES.txt
manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
Modified: manifoldcf/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1430565&r1=1430564&r2=1430565&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Tue Jan 8 22:01:06 2013
@@ -3,6 +3,10 @@ $Id$
======================= 1.1-dev =====================
+CONNECTORS-600: Add a field to the RSS connector that contains
+document origination date in ISO 8601 format.
+(David Morana, Karl Wright)
+
CONNECTORS-598: Add an RSS connector mode that allows just
metadata to be consumed, in conjunction with content from description
or content fields.
Modified:
manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java?rev=1430565&r1=1430564&r2=1430565&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java (original)
+++ manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java Tue Jan 8 22:01:06 2013
@@ -41,6 +41,7 @@ import org.apache.http.HttpException;
import java.io.*;
import java.util.*;
import java.net.*;
+import java.text.*;
import java.util.regex.*;
/** This is the RSS implementation of the IRepositoryConnector interface.
@@ -1449,24 +1450,33 @@ public class RSSConnector extends org.ap
// The pubdates are a ms since epoch value; we want the minimum one for the origination time.
Long minimumOrigTime = null;
String[] pubDateValues = new String[pubDates.size()];
+ String[] pubDateValuesISO = new String[pubDates.size()];
+ TimeZone tz = TimeZone.getTimeZone("UTC");
+ DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm'Z'");
+ df.setTimeZone(tz);
k = 0;
while (k < pubDates.size())
{
String pubDate = (String)pubDates.get(k);
- pubDateValues[k++] = pubDate;
+ pubDateValues[k] = pubDate;
try
{
Long pubDateLong = new Long(pubDate);
if (minimumOrigTime == null || pubDateLong.longValue() <
minimumOrigTime.longValue())
minimumOrigTime = pubDateLong;
+ pubDateValuesISO[k] = df.format(new
Date(pubDateLong.longValue()));
}
catch (NumberFormatException e)
{
// Do nothing; the version string seems to not mean anything
}
+ k++;
}
if (k > 0)
+ {
rd.addField("pubdate",pubDateValues);
+ rd.addField("pubdateiso",pubDateValuesISO);
+ }
if (minimumOrigTime != null)
activities.setDocumentOriginationTime(urlValue,minimumOrigTime);