Author: kwright
Date: Sat Jan 5 10:46:02 2013
New Revision: 1429250
URL: http://svn.apache.org/viewvc?rev=1429250&view=rev
Log:
Fix for CONNECTORS-598, minus Japanese translations required in the RSS
connector.
Added:
manifoldcf/trunk/framework/core/src/main/java/org/apache/manifoldcf/core/interfaces/NullCharacterInput.java
(with props)
Modified:
manifoldcf/trunk/CHANGES.txt
manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
manifoldcf/trunk/connectors/rss/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/rss/common_en_US.properties
manifoldcf/trunk/connectors/rss/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/rss/common_ja_JP.properties
manifoldcf/trunk/framework/core/src/main/java/org/apache/manifoldcf/core/interfaces/TempFileCharacterInput.java
Modified: manifoldcf/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1429250&r1=1429249&r2=1429250&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Sat Jan 5 10:46:02 2013
@@ -3,6 +3,11 @@ $Id$
======================= 1.1-dev =====================
+CONNECTORS-598: Add an RSS connector mode that allows just
+metadata to be consumed, in conjunction with content from description
+or content fields.
+(David Morana, Karl Wright)
+
CONNECTORS-596: RSS and Web connectors need to peel off any
namespace qualifies from tag names when processing XML feeds.
(David Morana, Karl Wright)
Modified:
manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java?rev=1429250&r1=1429249&r2=1429250&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
(original)
+++
manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
Sat Jan 5 10:46:02 2013
@@ -92,11 +92,13 @@ public class RSSConnector extends org.ap
/** Dechromed content mode - content field */
public static final int DECHROMED_CONTENT = 2;
- /** Chromed suppression mode - use chromed content */
+ /** Chromed suppression mode - use chromed content if dechromed content not available */
public static final int CHROMED_USE = 0;
- /** Chromed suppression mode - skip all chromed content */
+ /** Chromed suppression mode - skip documents if dechromed content not available */
public static final int CHROMED_SKIP = 1;
-
+ /** Chromed suppression mode - index metadata only if dechromed content not available */
+ public static final int CHROMED_METADATA_ONLY = 2;
+
/** Robots usage flag */
protected int robotsUsage = ROBOTS_ALL;
@@ -2406,6 +2408,9 @@ public class RSSConnector extends org.ap
" <tr>\n"+
" <td class=\"value\"><nobr><input type=\"radio\" name=\"chromedmode\"
value=\"skip\"
"+(chromedMode.equals("skip")?"checked=\"true\"":"")+"/>"+Messages.getBodyString(locale,"RSSConnector.NeverUseChromedContent")+"</nobr></td>\n"+
" </tr>\n"+
+" <tr>\n"+
+" <td class=\"value\"><nobr><input type=\"radio\" name=\"chromedmode\"
value=\"metadata\"
"+(chromedMode.equals("metadata")?"checked=\"true\"":"")+"/>"+Messages.getBodyString(locale,"RSSConnector.NoContentMetadataOnly")+"</nobr></td>\n"+
+" </tr>\n"+
"</table>\n"
);
}
@@ -3939,7 +3944,7 @@ public class RSSConnector extends org.ap
((origDate==null)?"null":origDate.toString()));
if (filter.isLegalURL(newIdentifier))
{
- if (contentsFile == null)
+ if (contentsFile == null && filter.getChromedContentMode() != CHROMED_METADATA_ONLY)
{
// It's a reference! Add it.
String[] dataNames = new
String[]{"pubdate","title","source","category","description"};
@@ -3988,19 +3993,37 @@ public class RSSConnector extends org.ap
if (descriptionField != null)
dataValues[5] = new String[]{descriptionField};
-
- CharacterInput ci = new TempFileCharacterInput(contentsFile);
- try
+
+ if (contentsFile == null)
{
- contentsFile = null;
- dataValues[4] = new Object[]{ci};
+ CharacterInput ci = new NullCharacterInput();
+ try
+ {
+ dataValues[4] = new Object[]{ci};
- // Add document reference, including the data to pass down, and the dechromed content too
- activities.addDocumentReference(newIdentifier,documentIdentifier,null,dataNames,dataValues,origDate);
+ // Add document reference, including the data to pass down, and the dechromed content too
+ activities.addDocumentReference(newIdentifier,documentIdentifier,null,dataNames,dataValues,origDate);
+ }
+ finally
+ {
+ ci.discard();
+ }
}
- finally
+ else
{
- ci.discard();
+ CharacterInput ci = new TempFileCharacterInput(contentsFile);
+ try
+ {
+ contentsFile = null;
+ dataValues[4] = new Object[]{ci};
+
+ // Add document reference, including the data to pass down, and the dechromed content too
+ activities.addDocumentReference(newIdentifier,documentIdentifier,null,dataNames,dataValues,origDate);
+ }
+ finally
+ {
+ ci.discard();
+ }
}
}
}
@@ -4323,7 +4346,7 @@ public class RSSConnector extends org.ap
((origDate==null)?"null":origDate.toString()));
if (filter.isLegalURL(newIdentifier))
{
- if (contentsFile == null)
+ if (contentsFile == null && filter.getChromedContentMode() != CHROMED_METADATA_ONLY)
{
// It's a reference! Add it.
String[] dataNames = new
String[]{"pubdate","title","source","description"};
@@ -4356,18 +4379,37 @@ public class RSSConnector extends org.ap
dataValues[2] = new String[]{documentIdentifier};
if (descriptionField != null)
dataValues[4] = new String[]{descriptionField};
- CharacterInput ci = new TempFileCharacterInput(contentsFile);
- try
+
+ if (contentsFile == null)
{
- contentsFile = null;
- dataValues[3] = new Object[]{ci};
+ CharacterInput ci = new NullCharacterInput();
+ try
+ {
+ dataValues[3] = new Object[]{ci};
- // Add document reference, including the data to pass down, and the dechromed content too
- activities.addDocumentReference(newIdentifier,documentIdentifier,null,dataNames,dataValues,origDate);
+ // Add document reference, including the data to pass down, and the dechromed content too
+ activities.addDocumentReference(newIdentifier,documentIdentifier,null,dataNames,dataValues,origDate);
+ }
+ finally
+ {
+ ci.discard();
+ }
}
- finally
+ else
{
- ci.discard();
+ CharacterInput ci = new TempFileCharacterInput(contentsFile);
+ try
+ {
+ contentsFile = null;
+ dataValues[3] = new Object[]{ci};
+
+ // Add document reference, including the data to pass down, and the dechromed content too
+ activities.addDocumentReference(newIdentifier,documentIdentifier,null,dataNames,dataValues,origDate);
+ }
+ finally
+ {
+ ci.discard();
+ }
}
}
}
@@ -4699,7 +4741,7 @@ public class RSSConnector extends org.ap
((origDate==null)?"null":origDate.toString()));
if (filter.isLegalURL(newIdentifier))
{
- if (contentsFile == null)
+ if (contentsFile == null && filter.getChromedContentMode() != CHROMED_METADATA_ONLY)
{
// It's a reference! Add it.
String[] dataNames = new
String[]{"pubdate","title","source","category","description"};
@@ -4746,20 +4788,38 @@ public class RSSConnector extends org.ap
}
if (descriptionField != null)
dataValues[5] = new String[]{descriptionField};
-
- CharacterInput ci = new TempFileCharacterInput(contentsFile);
- try
+
+ if (contentsFile == null)
{
- contentsFile = null;
-
- dataValues[4] = new Object[]{ci};
+ CharacterInput ci = new NullCharacterInput();
+ try
+ {
+ dataValues[4] = new Object[]{ci};
- // Add document reference, including the data to pass down, and the dechromed content too
- activities.addDocumentReference(newIdentifier,documentIdentifier,null,dataNames,dataValues,origDate);
+ // Add document reference, including the data to pass down, and the dechromed content too
+ activities.addDocumentReference(newIdentifier,documentIdentifier,null,dataNames,dataValues,origDate);
+ }
+ finally
+ {
+ ci.discard();
+ }
}
- finally
+ else
{
- ci.discard();
+ CharacterInput ci = new TempFileCharacterInput(contentsFile);
+ try
+ {
+ contentsFile = null;
+
+ dataValues[4] = new Object[]{ci};
+
+ // Add document reference, including the data to pass down, and the dechromed content too
+ activities.addDocumentReference(newIdentifier,documentIdentifier,null,dataNames,dataValues,origDate);
+ }
+ finally
+ {
+ ci.discard();
+ }
}
}
}
@@ -6124,6 +6184,8 @@ public class RSSConnector extends org.ap
chromedContentMode = CHROMED_USE;
else if (mode.equals("skip"))
chromedContentMode = CHROMED_SKIP;
+ else if (mode.equals("metadata"))
+ chromedContentMode = CHROMED_METADATA_ONLY;
}
}
}
Modified:
manifoldcf/trunk/connectors/rss/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/rss/common_en_US.properties
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/rss/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/rss/common_en_US.properties?rev=1429250&r1=1429249&r2=1429250&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/rss/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/rss/common_en_US.properties
(original)
+++
manifoldcf/trunk/connectors/rss/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/rss/common_en_US.properties
Sat Jan 5 10:46:02 2013
@@ -76,7 +76,8 @@ RSSConnector.NoDechromedContent=No dechr
RSSConnector.DechromedContentIfPresentInDescriptionField=Dechromed content, if
present, in 'description' field
RSSConnector.DechromedContentIfPresentInContentField=Dechromed content, if
present, in 'content' field
RSSConnector.UseChromedContentIfNoDechromedContentFound=Use chromed content if
no dechromed content found
-RSSConnector.NeverUseChromedContent=Never use chromed content
+RSSConnector.NeverUseChromedContent=Skip documents if dechromed content unavailable
+RSSConnector.NoContentMetadataOnly=Include only metadata if dechromed content unavailable
RSSConnector.DeleteToken=Delete token #
RSSConnector.AddAccessToken=Add access token
RSSConnector.DeleteMetadata=Delete metadata #
Modified:
manifoldcf/trunk/connectors/rss/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/rss/common_ja_JP.properties
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/rss/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/rss/common_ja_JP.properties?rev=1429250&r1=1429249&r2=1429250&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/rss/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/rss/common_ja_JP.properties
(original)
+++
manifoldcf/trunk/connectors/rss/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/rss/common_ja_JP.properties
Sat Jan 5 10:46:02 2013
@@ -76,7 +76,9 @@ RSSConnector.NoDechromedContent=ãã
RSSConnector.DechromedContentIfPresentInDescriptionField=ãã¯ãã
ã³ã³ãã³ããé
ç®ãåè¦ããã«ããå ´å
RSSConnector.DechromedContentIfPresentInContentField=ãã¯ãã
ã³ã³ãã³ããããé
ç®ãã³ã³ãã³ããã«ããå ´å
RSSConnector.UseChromedContentIfNoDechromedContentFound=ã¯ãã
ã³ã³ãã³ããè¦ã¤ãããªãå ´åã¯ã¯ãã ã³ã³ãã³ãã使ã
-RSSConnector.NeverUseChromedContent=ã¯ãã ã³ã³ãã³ãã¯ä½¿ããªã
+#RSSConnector.NeverUseChromedContent=ã¯ãã ã³ã³ãã³ãã¯ä½¿ããªã
+RSSConnector.NeverUseChromedContent=Skip documents if dechromed content unavailable
+RSSConnector.NoContentMetadataOnly=Include only metadata if dechromed content unavailable
RSSConnector.DeleteToken=ãã¼ã¯ã³ãåé¤ï¼ #
RSSConnector.AddAccessToken=ã¢ã¯ã»ã¹ãã¼ã¯ã³ã追å
RSSConnector.DeleteMetadata=ã¡ã¿ãã¼ã¿ãåé¤ï¼ #
Added:
manifoldcf/trunk/framework/core/src/main/java/org/apache/manifoldcf/core/interfaces/NullCharacterInput.java
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/framework/core/src/main/java/org/apache/manifoldcf/core/interfaces/NullCharacterInput.java?rev=1429250&view=auto
==============================================================================
---
manifoldcf/trunk/framework/core/src/main/java/org/apache/manifoldcf/core/interfaces/NullCharacterInput.java
(added)
+++
manifoldcf/trunk/framework/core/src/main/java/org/apache/manifoldcf/core/interfaces/NullCharacterInput.java
Sat Jan 5 10:46:02 2013
@@ -0,0 +1,117 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.core.interfaces;
+
+import java.io.*;
+
+import org.apache.manifoldcf.core.system.ManifoldCF;
+
+/** This class represents a null character stream, which has no characters.
+*/
+public class NullCharacterInput extends CharacterInput
+{
+ public static final String _rcsid = "@(#)$Id$";
+
+ /** Construct from nothing.
+ */
+ public NullCharacterInput()
+ {
+ super();
+ }
+
+ @Override
+ public Reader getStream()
+ throws ManifoldCFException
+ {
+ return new StringReader("");
+ }
+
+ @Override
+ public void doneWithStream()
+ throws ManifoldCFException
+ {
+ }
+
+ @Override
+ public long getCharacterLength()
+ throws ManifoldCFException
+ {
+ return 0L;
+ }
+
+ @Override
+ public String getHashValue()
+ throws ManifoldCFException
+ {
+ return ManifoldCF.getHashValue(ManifoldCF.startHash());
+ }
+
+ /** Open a Utf8 stream directly */
+ @Override
+ public InputStream getUtf8Stream()
+ throws ManifoldCFException
+ {
+ return new ByteArrayInputStream(new byte[]{});
+ }
+
+ /** Transfer to a new object; this causes the current object to become "already discarded" */
+ @Override
+ public CharacterInput transfer()
+ {
+ return new NullCharacterInput();
+ }
+
+ /** Discard this object permanently */
+ @Override
+ public void discard()
+ throws ManifoldCFException
+ {
+ }
+
+ // Protected methods
+
+ /** Open a reader, for use by a caller, until closeStream is called */
+ @Override
+ protected void openStream()
+ throws ManifoldCFException
+ {
+ }
+
+ /** Close any open reader */
+ @Override
+ protected void closeStream()
+ throws ManifoldCFException
+ {
+ }
+
+ /** Calculate the datum's length in characters */
+ @Override
+ protected void calculateLength()
+ throws ManifoldCFException
+ {
+ }
+
+ /** Calculate the datum's hash value */
+ @Override
+ protected void calculateHashValue()
+ throws ManifoldCFException
+ {
+ }
+
+}
Propchange:
manifoldcf/trunk/framework/core/src/main/java/org/apache/manifoldcf/core/interfaces/NullCharacterInput.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange:
manifoldcf/trunk/framework/core/src/main/java/org/apache/manifoldcf/core/interfaces/NullCharacterInput.java
------------------------------------------------------------------------------
svn:keywords = Id
Modified:
manifoldcf/trunk/framework/core/src/main/java/org/apache/manifoldcf/core/interfaces/TempFileCharacterInput.java
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/framework/core/src/main/java/org/apache/manifoldcf/core/interfaces/TempFileCharacterInput.java?rev=1429250&r1=1429249&r2=1429250&view=diff
==============================================================================
---
manifoldcf/trunk/framework/core/src/main/java/org/apache/manifoldcf/core/interfaces/TempFileCharacterInput.java
(original)
+++
manifoldcf/trunk/framework/core/src/main/java/org/apache/manifoldcf/core/interfaces/TempFileCharacterInput.java
Sat Jan 5 10:46:02 2013
@@ -134,6 +134,7 @@ public class TempFileCharacterInput exte
}
/** Open a Utf8 stream directly from the backing file */
+ @Override
public InputStream getUtf8Stream()
throws ManifoldCFException
{
@@ -151,6 +152,7 @@ public class TempFileCharacterInput exte
return null;
}
+ @Override
protected void openStream()
throws ManifoldCFException
{
@@ -171,6 +173,7 @@ public class TempFileCharacterInput exte
}
/** Transfer to a new object; this causes the current object to become
"already discarded" */
+ @Override
public CharacterInput transfer()
{
// Create a new TempFileCharacterInput object, and fill it with our
current stuff
@@ -186,6 +189,7 @@ public class TempFileCharacterInput exte
return rval;
}
+ @Override
public void discard()
throws ManifoldCFException
{
@@ -199,6 +203,7 @@ public class TempFileCharacterInput exte
}
/** Calculate the datum's length in characters */
+ @Override
protected void calculateLength()
throws ManifoldCFException
{
@@ -206,6 +211,7 @@ public class TempFileCharacterInput exte
}
/** Calculate the datum's hash value */
+ @Override
protected void calculateHashValue()
throws ManifoldCFException
{