Hello all,

I have the following DIH data-config.xml file. Adding 
HTMLStripTransformer and the associated stripHTML attribute on the 
para field seems to have broken things. I am using a nightly 
build from 12-Jan-2009.

The /record/sect1/para contains HTML sub tags which need
to be discarded. Is my use of stripHTML correct?

<!--
  DataImportHandler configuration.
  The outer entity lists XML files under baseDir; the inner entity parses
  each listed file and emits one row per /record element.
-->
<dataConfig>
  <dataSource name="myfilereader" type="FileDataSource"/>
  <document>
    <!--
      File lister. rootEntity="false" is set here so that the nested
      entity "x", not this one, supplies the indexed documents.
      dataSource="null" because listing files does not read through a
      data source.
    -->
    <entity name="jcurrent"
            processor="FileListEntityProcessor"
            fileName=".*xml"
            newerThan="'NOW-1000DAYS'"
            recursive="true"
            rootEntity="false"
            dataSource="null"
            baseDir="/Volumes/spare/ts/jxml/data/news/groups">

      <!--
        Per-file parser. Transformers run in the order listed, so
        TemplateTransformer populates fileAbsPath before RegexTransformer
        reads it to derive fileWebPath.
      -->
      <entity name="x"
              dataSource="myfilereader"
              processor="XPathEntityProcessor"
              url="${jcurrent.fileAbsolutePath}"
              stream="false"
              forEach="/record"
              transformer="DateFormatTransformer,TemplateTransformer,RegexTransformer,HTMLStripTransformer">

        <field column="fileAbsPath"
               template="${jcurrent.fileAbsolutePath}"/>

        <!-- sourceColName must match the column defined above exactly. -->
        <field column="fileWebPath"
               regex="/Volumes/spare/ts/(.*)"
               replaceWith="$1"
               sourceColName="fileAbsPath"/>

        <field column="title"
               xpath="/record/title"/>

        <!--
          stripHTML="true" asks HTMLStripTransformer to remove markup from
          the extracted value.
          NOTE(review): if text inside nested tags of para is missing from
          the result, check whether this build's XPathEntityProcessor
          supports flatten="true" on the field. TODO confirm against the
          DIH documentation for the build in use.
        -->
        <field column="para"
               xpath="/record/sect1/para"
               stripHTML="true"/>

        <field column="subject"
               xpath="/record/metadata/subject[@qualifier='fullTitle']"/>

        <field column="pubname"
               xpath="/record/metadata/subject[@qualifier='publication']"/>

        <!-- dateTimeFormat is the pattern DateFormatTransformer parses with. -->
        <field column="pubdate"
               xpath="/record/metadata/date[@qualifier='pubDate']"
               dateTimeFormat="yyyyMMdd"/>
      </entity>
    </entity>
  </document>
</dataConfig>

-- 

===============================================================
Fergus McMenemie               Email:fer...@twig.me.uk
Techmore Ltd                   Phone:(UK) 07721 376021

Unix/Mac/Intranets             Analyst Programmer
===============================================================

Reply via email to