Author: pkluegl
Date: Mon Apr 28 14:38:40 2014
New Revision: 1590658

URL: http://svn.apache.org/r1590658
Log:
UIMA-3780
- configuration parameter for skipWhitespaces

Modified:
    
uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java
    
uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverterVisitor.java
    
uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/resource/TreeWordList.java
    
uima/ruta/trunk/ruta-core/src/main/resources/org/apache/uima/ruta/engine/HtmlConverter.xml

Modified: 
uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java
URL: 
http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java?rev=1590658&r1=1590657&r2=1590658&view=diff
==============================================================================
--- 
uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java
 (original)
+++ 
uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java
 Mon Apr 28 14:38:40 2014
@@ -82,6 +82,9 @@ public class HtmlConverter extends JCasA
   public static final String CONVERSION_PATTERNS = "conversionPatterns";
 
   public static final String CONVERSION_REPLACEMENTS = "conversionReplacements";
+  
+  public static final String SKIP_WHITESPACES = "skipWhitespaces";
+  
 
   // default values:
   private static final String DEFAULT_MODIFIED_VIEW = "plaintext";
@@ -100,6 +103,8 @@ public class HtmlConverter extends JCasA
   private Boolean replaceLinebreaks;
   
   private String linebreakReplacement;
+  
+  private Boolean skipWhitespaces;
 
   enum StringConversionPolicy {
     HEURISTIC, EXPLICIT, NONE
@@ -119,6 +124,8 @@ public class HtmlConverter extends JCasA
             : modifiedViewName;
     replaceLinebreaks = (Boolean) aContext.getConfigParameterValue(REPLACE_LINEBREAKS);
     replaceLinebreaks = replaceLinebreaks == null ? true : replaceLinebreaks;
+    skipWhitespaces = (Boolean) aContext.getConfigParameterValue(SKIP_WHITESPACES);
+    skipWhitespaces = skipWhitespaces == null ? true : skipWhitespaces;
     linebreakReplacement = (String) aContext.getConfigParameterValue(LINEBREAK_REPLACEMENT);
     linebreakReplacement = linebreakReplacement == null ? "" : linebreakReplacement;
     String conversionPolicyString = (String) aContext.getConfigParameterValue(CONVERSION_POLICY);
@@ -220,7 +227,7 @@ public class HtmlConverter extends JCasA
     try {
       Parser parser = new Parser(documentText);
       NodeList list = parser.parse(null);
-      HtmlConverterVisitor visitor = new HtmlConverterVisitor(newlineInducingTags);
+      HtmlConverterVisitor visitor = new HtmlConverterVisitor(newlineInducingTags, skipWhitespaces);
       list.visitAllNodesWith(visitor);
       visibleSpansSoFar = visitor.getTextSpans();
       linebreaksFromHtmlTags = visitor.getLinebreaksFromHtmlTags();

Modified: 
uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverterVisitor.java
URL: 
http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverterVisitor.java?rev=1590658&r1=1590657&r2=1590658&view=diff
==============================================================================
--- 
uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverterVisitor.java
 (original)
+++ 
uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverterVisitor.java
 Mon Apr 28 14:38:40 2014
@@ -48,8 +48,9 @@ public class HtmlConverterVisitor extend
 
   private Set<String> newlineInducingTags;
 
-  public HtmlConverterVisitor(Set<String> newlineInducingTags) {
+  public HtmlConverterVisitor(Set<String> newlineInducingTags, boolean skipWhitespace) {
     this.newlineInducingTags = newlineInducingTags;
+    this.skipWhitespace = skipWhitespace;
   }
 
   @Override

Modified: 
uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/resource/TreeWordList.java
URL: 
http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/resource/TreeWordList.java?rev=1590658&r1=1590657&r2=1590658&view=diff
==============================================================================
--- 
uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/resource/TreeWordList.java
 (original)
+++ 
uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/resource/TreeWordList.java
 Mon Apr 28 14:38:40 2014
@@ -143,7 +143,7 @@ public class TreeWordList implements Rut
     this.root = new TextNode();
     while (scan.hasNextLine()) {
       String s = scan.nextLine().trim();
-
+      // HOTFIX for old formats
       if (s.endsWith("=")) {
         s = s.substring(0, s.length() - 1);
         s = s.trim();

Modified: 
uima/ruta/trunk/ruta-core/src/main/resources/org/apache/uima/ruta/engine/HtmlConverter.xml
URL: 
http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/main/resources/org/apache/uima/ruta/engine/HtmlConverter.xml?rev=1590658&r1=1590657&r2=1590658&view=diff
==============================================================================
--- 
uima/ruta/trunk/ruta-core/src/main/resources/org/apache/uima/ruta/engine/HtmlConverter.xml
 (original)
+++ 
uima/ruta/trunk/ruta-core/src/main/resources/org/apache/uima/ruta/engine/HtmlConverter.xml
 Mon Apr 28 14:38:40 2014
@@ -1,4 +1,5 @@
 <?xml version="1.0" encoding="UTF-8"?>
+
 <!--
   Licensed to the Apache Software Foundation (ASF) under one
   or more contributor license agreements.  See the NOTICE file
@@ -77,8 +78,21 @@ Defaults to heuristic.</description>
         <multiValued>true</multiValued>
         <mandatory>false</mandatory>
       </configurationParameter>
+    <configurationParameter>
+        <name>skipWhitespaces</name>
+        <type>Boolean</type>
+        <multiValued>false</multiValued>
+        <mandatory>false</mandatory>
+      </configurationParameter>
     </configurationParameters>
-    <configurationParameterSettings/>
+    <configurationParameterSettings>
+      <nameValuePair>
+        <name>skipWhitespaces</name>
+        <value>
+          <boolean>true</boolean>
+        </value>
+      </nameValuePair>
+    </configurationParameterSettings>
     <typeSystemDescription/>
     <typePriorities/>
     <fsIndexCollection/>
@@ -96,4 +110,4 @@ Defaults to heuristic.</description>
     </operationalProperties>
   </analysisEngineMetaData>
   <resourceManagerConfiguration/>
-</analysisEngineDescription>
\ No newline at end of file
+</analysisEngineDescription>


Reply via email to