Update of /cvsroot/nutch/nutch/src/plugin/parse-rtf/src/java/net/nutch/parse/rtf
In directory 
sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv10432/src/plugin/parse-rtf/src/java/net/nutch/parse/rtf

Added Files:
        RTFParseFactory.java RTFParserDelegateImpl.java package.html 
Log Message:
Added plugin parse-rtf, contributed by Andy Hedges.


--- NEW FILE: package.html ---
<html>
<body>
<p>An RTF parsing plugin.</p><p></p>
</body>
</html>

--- NEW FILE: RTFParserDelegateImpl.java ---
/* Copyright (c) 2004 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.parse.rtf;

import com.etranslate.tm.processing.rtf.RTFParserDelegate;

import java.util.Arrays;
import java.util.List;
import java.util.Properties;

/**
 * A parser delegate for handling rtf events.
 *
 * <p>The delegate accumulates two things while the RTF parser runs:
 * the plain text found in the document body (see {@link #getText()}) and the
 * textual entries of the info group such as title or author
 * (see {@link #getMetaData()}).</p>
 *
 * <p>Groups that start with the <code>\*</code> control symbol are skipped
 * entirely, including any groups nested inside them.</p>
 *
 * <p>An instance holds per-document state and is meant to be used for a single
 * parse.</p>
 *
 * @author Andy Hedges
 */
public class RTFParserDelegateImpl implements RTFParserDelegate {

  /** Info-group control words whose following text is stored as metadata. */
  private static final String[] META_NAMES_TEXT = {
    "title", "subject", "author", "manager",
    "company", "operator", "category", "keywords",
    "comment", "doccomm", "hlinkbase"};

  /** Info-group control words that introduce a date (not collected yet). */
  private static final String[] META_NAMES_DATE = {
    "creatim", "revtim", "printim", "buptim"};

  private static final List metaNamesText = Arrays.asList(META_NAMES_TEXT);
  private static final List metaNamesDate = Arrays.asList(META_NAMES_DATE);

  /** Metadata gathered from the info group, keyed by control word name. */
  private final Properties metadata = new Properties();

  /**
   * Document text gathered so far. A buffer is used instead of repeated
   * String concatenation, which is quadratic for large documents;
   * StringBuffer is used because this file uses no Java 5 constructs.
   */
  private final StringBuffer content = new StringBuffer();

  /** Name of the metadata entry the next info text belongs to. */
  private String metaName = "";

  /** True when the next text event in the info group is a metadata value. */
  private boolean isMetaTextValue = false;

  /** True only between an openGroup event and the next event of any kind. */
  private boolean justOpenedGroup = false;

  /** True while inside a group that started with <code>\*</code>. */
  private boolean ignoreMode = false;

  /** Number of groups currently open, counted by this delegate itself. */
  private int groupDepth = 0;

  /** Value of {@link #groupDepth} for the group that switched ignoreMode on. */
  private int ignoreDepth = -1;

  /**
   * Receives a run of text. In the info group the text becomes the value of
   * the pending metadata entry; in the document body it is appended to the
   * extracted content unless an ignored group is open.
   *
   * @param text the text run
   * @param style the style name reported by the parser (unused)
   * @param context the parser context, compared against the IN_INFO and
   *        IN_DOCUMENT constants inherited from RTFParserDelegate
   */
  public void text(String text, String style, int context) {
    justOpenedGroup = false;
    if (isMetaTextValue && context == IN_INFO) {
      metadata.setProperty(metaName, text);
      isMetaTextValue = false;
    } else if (context == IN_DOCUMENT && !ignoreMode) {
      content.append(text);
    }
  }

  /**
   * Receives a control symbol. A <code>\*</code> symbol directly after a group
   * opening marks that group as one to be ignored.
   *
   * @param controlSymbol the control symbol, including its backslash
   * @param context the parser context (unused)
   */
  public void controlSymbol(String controlSymbol, int context) {
    if ("\\*".equals(controlSymbol) && justOpenedGroup && !ignoreMode) {
      ignoreMode = true;
      // Remember which group to wait for so nested groups do not end the skip.
      ignoreDepth = groupDepth;
    }
    justOpenedGroup = false;
  }

  /**
   * Receives a control word. Inside the info group, a recognised textual
   * metadata keyword arms the delegate to store the next text run under that
   * keyword.
   *
   * @param controlWord the control word; its first character is dropped before
   *        matching (presumably the leading backslash)
   * @param value the numeric parameter of the control word (unused)
   * @param context the parser context
   */
  public void controlWord(String controlWord, int value, int context) {
    justOpenedGroup = false;
    controlWord = controlWord.substring(1);
    switch (context) {
      case IN_INFO:
        if (metaNamesText.contains(controlWord)) {
          isMetaTextValue = true;
          metaName = controlWord;
        } else if (metaNamesDate.contains(controlWord)) {
          //TODO: collect up the dates
        }
        break;
      case IN_DOCUMENT:
        // Formatting control words carry no text to extract.
        break;
    }
  }

  /**
   * Records that a group was opened.
   *
   * @param depth the depth reported by the parser (unused; the delegate keeps
   *        its own counter)
   */
  public void openGroup(int depth) {
    groupDepth++;
    justOpenedGroup = true;
  }

  /**
   * Records that a group was closed. Ignore mode ends only when the group
   * that started it closes, so text following a nested group inside an
   * ignored destination stays ignored.
   *
   * @param depth the depth reported by the parser (unused)
   */
  public void closeGroup(int depth) {
    justOpenedGroup = false;
    // A metadata keyword with no text (e.g. an empty title group) must not
    // capture text belonging to a later group.
    isMetaTextValue = false;
    if (ignoreMode && groupDepth <= ignoreDepth) {
      ignoreMode = false;
      ignoreDepth = -1;
    }
    if (groupDepth > 0) {
      groupDepth--;
    }
  }

  /**
   * Receives the style list; styles are not used by this delegate.
   *
   * @param styles the styles reported by the parser
   */
  public void styleList(List styles) {
  }

  /** Called at the start of the document; nothing to do. */
  public void startDocument() {
  }

  /** Called at the end of the document; nothing to do. */
  public void endDocument() {
  }

  /**
   * Returns the document text collected so far.
   *
   * @return the concatenated text runs of the document body
   */
  public String getText() {
    return content.toString();
  }

  /**
   * Returns the metadata collected from the info group.
   *
   * @return the live Properties object holding the metadata
   */
  public Properties getMetaData() {
    return metadata;
  }
}

--- NEW FILE: RTFParseFactory.java ---
/* Copyright (c) 2004 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.parse.rtf;

import net.nutch.parse.*;
import net.nutch.parse.ParseException;
import net.nutch.protocol.Content;

import java.io.ByteArrayInputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.Properties;

import com.etranslate.tm.processing.rtf.RTFParser;

/**
 * A parser for RTF documents.
 *
 * <p>Runs the RTF parser over the raw content with an
 * {@link RTFParserDelegateImpl} and packages the extracted text, title and
 * metadata as a {@link Parse}.</p>
 *
 * @author Andy Hedges
 */
public class RTFParseFactory implements Parser {

  /**
   * Parses the given content as RTF.
   *
   * @param content the fetched content whose bytes are the RTF document
   * @return a parse holding the extracted text, the title (empty string when
   *         the document has none), no outlinks, and the merged metadata of
   *         the content and the document's info group
   * @throws ParseException if the underlying RTF parser rejects the document
   */
  public Parse getParse(Content content) throws ParseException {
    byte[] raw = content.getContent();
    // NOTE(review): the bytes are decoded with the platform default charset,
    // so results may differ between machines for non-ASCII input - confirm
    // which charset the RTF parser expects and pass it explicitly.
    Reader reader = new InputStreamReader(new ByteArrayInputStream(raw));
    RTFParserDelegateImpl delegate = new RTFParserDelegateImpl();
    RTFParser rtfParser = RTFParser.createParser(reader);
    rtfParser.setNewLine("\n");
    rtfParser.setDelegate(delegate);

    try {
      rtfParser.parse();
    } catch (com.etranslate.tm.processing.rtf.ParseException e) {
      throw new ParseException("Exception parsing RTF document", e);
    }

    // Document metadata is applied second so it overrides content metadata
    // that uses the same key.
    Properties metadata = new Properties();
    metadata.putAll(content.getMetadata());
    metadata.putAll(delegate.getMetaData());
    String title = metadata.getProperty("title");

    if (title != null) {
      // The title is passed to ParseData separately, so drop its metadata
      // entry. The key is "title"; removing by the title's value would leave
      // the entry in place.
      metadata.remove("title");
    } else {
      title = "";
    }

    ParseData parseData = new ParseData(title, new Outlink[0], metadata);

    return new ParseImpl(delegate.getText(), parseData);
  }


}



-------------------------------------------------------
This SF.net email is sponsored by: IT Product Guide on ITManagersJournal
Use IT products in your business? Tell us what you think of them. Give us
Your Opinions, Get Free ThinkGeek Gift Certificates! Click to find out more
http://productguide.itmanagersjournal.com/guidepromo.tmpl
_______________________________________________
Nutch-cvs mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

Reply via email to