Author: kwright
Date: Sun Dec 6 17:56:29 2015
New Revision: 1718194
URL: http://svn.apache.org/viewvc?rev=1718194&view=rev
Log:
Fix for CONNECTORS-1264.
Modified:
manifoldcf/trunk/CHANGES.txt
manifoldcf/trunk/framework/connector-common/src/main/java/org/apache/manifoldcf/connectorcommon/fuzzyml/TagParseState.java
Modified: manifoldcf/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1718194&r1=1718193&r2=1718194&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Sun Dec 6 17:56:29 2015
@@ -3,6 +3,10 @@ $Id$
======================= 2.3-dev =====================
+CONNECTORS-1264: Fix handling of slashes in unquoted attribute values
+in html parsing.
+(Issei Nishigata, Karl Wright)
+
CONNECTORS-1249: Independent priority setting for different
connectors, and bring individual connectors up to speed with proper
document bin names.
Modified:
manifoldcf/trunk/framework/connector-common/src/main/java/org/apache/manifoldcf/connectorcommon/fuzzyml/TagParseState.java
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/framework/connector-common/src/main/java/org/apache/manifoldcf/connectorcommon/fuzzyml/TagParseState.java?rev=1718194&r1=1718193&r2=1718194&view=diff
==============================================================================
--- manifoldcf/trunk/framework/connector-common/src/main/java/org/apache/manifoldcf/connectorcommon/fuzzyml/TagParseState.java (original)
+++ manifoldcf/trunk/framework/connector-common/src/main/java/org/apache/manifoldcf/connectorcommon/fuzzyml/TagParseState.java Sun Dec 6 17:56:29 2015
@@ -75,7 +75,8 @@ public class TagParseState extends Singl
protected static final int TAGPARSESTATE_IN_CDATA_BODY = 27;
protected static final int TAGPARSESTATE_SAWRIGHTBRACKET = 28;
protected static final int TAGPARSESTATE_SAWSECONDRIGHTBRACKET = 29;
-
+ protected static final int TAGPARSESTATE_IN_UNQUOTED_ATTR_VALUE_SAW_SLASH = 30;
+
protected int currentState = TAGPARSESTATE_NORMAL;
/** The btag depth, which indicates btag behavior when > 0. */
@@ -724,6 +725,8 @@ public class TagParseState extends Singl
currentState = TAGPARSESTATE_IN_SINGLE_QUOTES_ATTR_VALUE;
else if (thisChar == '"')
currentState = TAGPARSESTATE_IN_DOUBLE_QUOTES_ATTR_VALUE;
+ else if (thisChar == '/')
+ currentState = TAGPARSESTATE_IN_UNQUOTED_ATTR_VALUE_SAW_SLASH;
else if (!isWhitespace(thisChar))
{
currentState = TAGPARSESTATE_IN_UNQUOTED_ATTR_VALUE;
@@ -864,9 +867,10 @@ public class TagParseState extends Singl
currentValueBuffer.append(thisChar);
break;
- case TAGPARSESTATE_IN_UNQUOTED_ATTR_VALUE:
+ case TAGPARSESTATE_IN_UNQUOTED_ATTR_VALUE_SAW_SLASH:
if (isWhitespace(thisChar))
{
+ currentValueBuffer.append('/');
currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString())));
currentAttrName = null;
currentValueBuffer = null;
@@ -875,10 +879,38 @@ public class TagParseState extends Singl
}
else if (thisChar == '/')
{
+ currentValueBuffer.append('/');
+ }
+ else if (thisChar == '>')
+ {
currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString())));
+ currentAttrName = null;
+ currentValueBuffer = null;
+ currentState = TAGPARSESTATE_NORMAL;
if (noteTag(currentTagName,currentAttrList))
return true;
- currentState = TAGPARSESTATE_IN_TAG_SAW_SLASH;
+ currentTagName = null;
+ currentAttrList = null;
+ }
+ else
+ {
+ currentValueBuffer.append('/');
+ currentValueBuffer.append(thisChar);
+ }
+ break;
+
+ case TAGPARSESTATE_IN_UNQUOTED_ATTR_VALUE:
+ if (isWhitespace(thisChar))
+ {
+ currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString())));
+ currentAttrName = null;
+ currentValueBuffer = null;
+ currentState = TAGPARSESTATE_IN_ATTR_NAME;
+ currentAttrNameBuffer = newBuffer();
+ }
+ else if (thisChar == '/')
+ {
+ currentState = TAGPARSESTATE_IN_UNQUOTED_ATTR_VALUE_SAW_SLASH;
}
else if (thisChar == '>')
{