Author: pkluegl Date: Mon Apr 28 14:38:40 2014 New Revision: 1590658 URL: http://svn.apache.org/r1590658 Log: UIMA-3780 - configuration parameter for skipWhitespaces
Modified: uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverterVisitor.java uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/resource/TreeWordList.java uima/ruta/trunk/ruta-core/src/main/resources/org/apache/uima/ruta/engine/HtmlConverter.xml Modified: uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java URL: http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java?rev=1590658&r1=1590657&r2=1590658&view=diff ============================================================================== --- uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java (original) +++ uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java Mon Apr 28 14:38:40 2014 @@ -82,6 +82,9 @@ public class HtmlConverter extends JCasA public static final String CONVERSION_PATTERNS = "conversionPatterns"; public static final String CONVERSION_REPLACEMENTS = "conversionReplacements"; + + public static final String SKIP_WHITESPACES = "skipWhitespaces"; + // default values: private static final String DEFAULT_MODIFIED_VIEW = "plaintext"; @@ -100,6 +103,8 @@ public class HtmlConverter extends JCasA private Boolean replaceLinebreaks; private String linebreakReplacement; + + private Boolean skipWhitespaces; enum StringConversionPolicy { HEURISTIC, EXPLICIT, NONE @@ -119,6 +124,8 @@ public class HtmlConverter extends JCasA : modifiedViewName; replaceLinebreaks = (Boolean) aContext.getConfigParameterValue(REPLACE_LINEBREAKS); replaceLinebreaks = replaceLinebreaks == null ? true : replaceLinebreaks; + skipWhitespaces = (Boolean) aContext.getConfigParameterValue(SKIP_WHITESPACES); + skipWhitespaces = skipWhitespaces == null ? true : skipWhitespaces; linebreakReplacement = (String) aContext.getConfigParameterValue(LINEBREAK_REPLACEMENT); linebreakReplacement = linebreakReplacement == null ? "" : linebreakReplacement; String conversionPolicyString = (String) aContext.getConfigParameterValue(CONVERSION_POLICY); @@ -220,7 +227,7 @@ public class HtmlConverter extends JCasA try { Parser parser = new Parser(documentText); NodeList list = parser.parse(null); - HtmlConverterVisitor visitor = new HtmlConverterVisitor(newlineInducingTags); + HtmlConverterVisitor visitor = new HtmlConverterVisitor(newlineInducingTags, skipWhitespaces); list.visitAllNodesWith(visitor); visibleSpansSoFar = visitor.getTextSpans(); linebreaksFromHtmlTags = visitor.getLinebreaksFromHtmlTags(); Modified: uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverterVisitor.java URL: http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverterVisitor.java?rev=1590658&r1=1590657&r2=1590658&view=diff ============================================================================== --- uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverterVisitor.java (original) +++ uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverterVisitor.java Mon Apr 28 14:38:40 2014 @@ -48,8 +48,9 @@ public class HtmlConverterVisitor extend private Set<String> newlineInducingTags; - public HtmlConverterVisitor(Set<String> newlineInducingTags) { + public HtmlConverterVisitor(Set<String> newlineInducingTags, boolean skipWhitespace) { this.newlineInducingTags = newlineInducingTags; + this.skipWhitespace = skipWhitespace; } @Override Modified: uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/resource/TreeWordList.java URL: http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/resource/TreeWordList.java?rev=1590658&r1=1590657&r2=1590658&view=diff ============================================================================== --- uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/resource/TreeWordList.java (original) +++ uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/resource/TreeWordList.java Mon Apr 28 14:38:40 2014 @@ -143,7 +143,7 @@ public class TreeWordList implements Rut this.root = new TextNode(); while (scan.hasNextLine()) { String s = scan.nextLine().trim(); - + // HOTFIX for old formats if (s.endsWith("=")) { s = s.substring(0, s.length() - 1); s = s.trim(); Modified: uima/ruta/trunk/ruta-core/src/main/resources/org/apache/uima/ruta/engine/HtmlConverter.xml URL: http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/main/resources/org/apache/uima/ruta/engine/HtmlConverter.xml?rev=1590658&r1=1590657&r2=1590658&view=diff ============================================================================== --- uima/ruta/trunk/ruta-core/src/main/resources/org/apache/uima/ruta/engine/HtmlConverter.xml (original) +++ uima/ruta/trunk/ruta-core/src/main/resources/org/apache/uima/ruta/engine/HtmlConverter.xml Mon Apr 28 14:38:40 2014 @@ -1,4 +1,5 @@ <?xml version="1.0" encoding="UTF-8"?> + <!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file @@ -77,8 +78,21 @@ Defaults to heuristic.</description> <multiValued>true</multiValued> <mandatory>false</mandatory> </configurationParameter> + <configurationParameter> + <name>skipWhitespaces</name> + <type>Boolean</type> + <multiValued>false</multiValued> + <mandatory>false</mandatory> + </configurationParameter> </configurationParameters> - <configurationParameterSettings/> + <configurationParameterSettings> + <nameValuePair> + <name>skipWhitespaces</name> + <value> + <boolean>true</boolean> + </value> + </nameValuePair> + </configurationParameterSettings> <typeSystemDescription/> <typePriorities/> <fsIndexCollection/> @@ -96,4 +110,4 @@ Defaults to heuristic.</description> </operationalProperties> </analysisEngineMetaData> <resourceManagerConfiguration/> -</analysisEngineDescription> \ No newline at end of file +</analysisEngineDescription>