Update of /cvsroot/nutch/nutch/src/plugin/languageidentifier/src/java/net/nutch/analysis/lang
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv20161

Modified Files:
        LanguageIdentifier.java 
Log Message:
Correct whitespace to be consistent with other files.

Compactify startup log messages.



Index: LanguageIdentifier.java
===================================================================
RCS file: /cvsroot/nutch/nutch/src/plugin/languageidentifier/src/java/net/nutch/analysis/lang/LanguageIdentifier.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** LanguageIdentifier.java     25 Sep 2004 16:13:29 -0000      1.3
--- LanguageIdentifier.java     22 Nov 2004 16:38:43 -0000      1.4
***************
*** 32,88 ****
  import java.util.Properties;
  import java.util.Enumeration;
  /**
   * 
   * @author Sami Siren
!  *
   */
  public class LanguageIdentifier implements IndexingFilter {
!   public static final Logger LOG =
!     LogFormatter.getLogger("net.nutch.analysis.lang.LanguageIdentifier");
  
!   private Vector languages=new Vector();
!   private Vector supportedLanguages=new Vector();
  
    private static LanguageIdentifier identifier = new LanguageIdentifier(true);
!   private static float SCORE_THRESOLD=0.00F;
  
    //public constructor needed for extension mechanism
!   public LanguageIdentifier(){
!   }
!       
!   private LanguageIdentifier(boolean fake){
!       Properties p=new Properties();
!       try{
!       p.load(this.getClass().getResourceAsStream("langmappings.properties"));
  
!     Enumeration alllanguages=p.keys();
  
!     while(alllanguages.hasMoreElements()){
!       String lang=(String)(alllanguages.nextElement());
  
!       InputStream is=this.getClass().getClassLoader().getResourceAsStream("net/nutch/analysis/lang/" + lang + "." + NGramProfile.NGRAM_FILE_EXTENSION);
!                       
!       if(is!=null){
!         NGramProfile profile=new NGramProfile(lang);
!         try {
!           profile.load(is);
!           languages.add(profile);
!           supportedLanguages.add(lang);
!           LOG.info(lang + " was added as supported language");
!           is.close();
!         } catch (IOException e1) {
!           LOG.severe(e1.toString());
          }
        }
      }
-       } catch (Exception e){
-               LOG.severe(e.toString());
-       }
    }
  
    /**
!    *  return handle to singleton instance
     */
!   public static LanguageIdentifier getInstance(){
      return identifier;
    }
--- 32,92 ----
  import java.util.Properties;
  import java.util.Enumeration;
+ 
  /**
   * 
   * @author Sami Siren
!  *  
   */
  public class LanguageIdentifier implements IndexingFilter {
!   public static final Logger LOG = LogFormatter.getLogger("net.nutch.analysis.lang.LanguageIdentifier");
  
!   private Vector languages = new Vector();
! 
!   private Vector supportedLanguages = new Vector();
  
    private static LanguageIdentifier identifier = new LanguageIdentifier(true);
! 
!   private static float SCORE_THRESOLD = 0.00F;
  
    //public constructor needed for extension mechanism
!   public LanguageIdentifier() {}
  
!   private LanguageIdentifier(boolean fake) {
!     Properties p = new Properties();
!     try {
!       p.load(this.getClass().getResourceAsStream("langmappings.properties"));
  
!       Enumeration alllanguages = p.keys();
  
!       StringBuffer list = new StringBuffer("Language identifier plugin supports:");
!       while (alllanguages.hasMoreElements()) {
!         String lang = (String) (alllanguages.nextElement());
! 
!         InputStream is = this.getClass().getClassLoader().getResourceAsStream(
!                 "net/nutch/analysis/lang/" + lang + "." + NGramProfile.NGRAM_FILE_EXTENSION);
! 
!         if (is != null) {
!           NGramProfile profile = new NGramProfile(lang);
!           try {
!             profile.load(is);
!             languages.add(profile);
!             supportedLanguages.add(lang);
!             list.append(" " + lang);
!             is.close();
!           } catch (IOException e1) {
!             LOG.severe(e1.toString());
!           }
          }
        }
+       LOG.info(list.toString());
+     } catch (Exception e) {
+       LOG.severe(e.toString());
      }
    }
  
    /**
!    * return handle to singleton instance
     */
!   public static LanguageIdentifier getInstance() {
      return identifier;
    }
***************
*** 90,217 ****
    /**
     * main method used for testing
     * @param args
     */
!   public static void main(String args[]){
  
      String usage = "Usage: LanguageIdentifier [-identifyrows filename maxlines] [-identifyfile filename] [-identifyfileset files] [-identifytext text] [-identifyurl url]";
!     int command=0;
!         
!     final int IDFILE=1;
!     final int IDTEXT=2;
!     final int IDURL=3;
!     final int IDFILESET=4;
!     final int IDROWS=5;
  
-     Vector fileset=new Vector();
-     String filename="";
-     String url="";
-     String text="";
-     int max=0;
-         
      if (args.length == 0) {
        System.err.println(usage);
        System.exit(-1);
      }
!       
!     for (int i = 0; i < args.length; i++) {       // parse command line
        if (args[i].equals("-identifyfile")) {
!         command=IDFILE;
          filename = args[++i];
!       } 
  
        if (args[i].equals("-identifyurl")) {
!         command=IDURL;
          filename = args[++i];
!       } 
  
-             
        if (args[i].equals("-identifyrows")) {
!         command=IDROWS;
          filename = args[++i];
!         max=Integer.parseInt(args[++i]);
!       } 
!             
        if (args[i].equals("-identifytext")) {
!         command=IDTEXT;
!         for(i++;i<args.length-1;i++)
!           text+=args[i] + " ";
        }
!         
        if (args[i].equals("-identifyfileset")) {
!         command=IDFILESET;
!         for(i++;i<args.length;i++){
            fileset.add(args[i]);
            System.out.println(args[i]);
          }
!       } 
!         
      }
!         
!     String lang=null;
!     LanguageIdentifier idfr=LanguageIdentifier.getInstance();
      File f;
      FileInputStream fis;
!     try{
!       switch(command){
!         
!       case IDTEXT:
!         lang=idfr.identify(text);
!         break;
!                       
!       case IDFILE:
!         f=new File(filename);
!         fis=new FileInputStream(f);
!         lang=idfr.identify(fis);
!         fis.close();
!         break;
  
!       case IDURL:
!         text=getUrlContent(filename);
!         lang=idfr.identify(text);
!         break;
  
!                               
!       case IDROWS:
!         f=new File(filename);
!         BufferedReader br=new BufferedReader(new InputStreamReader(new FileInputStream(f)));
!         String line;
!         while (max>0 && (line=br.readLine())!=null){
!           line=line.trim();
!           if(line.length()>2){
!             max--;
!             lang=idfr.identify(line);
!             System.out.println("R=" + lang + ":" + line);
!           }
!         }
!                               
!         br.close();
!         System.exit(0);
!         break;
  
!       case IDFILESET:
!         System.out.println("FILESET");
!         Iterator i=fileset.iterator();
!                               
!         while(i.hasNext()){
!           try{
!             filename=(String)i.next();
!             f=new File(filename);
!             fis=new FileInputStream(f);
!             lang=idfr.identify(fis);
!             fis.close();
!           } catch (Exception e){
!             System.out.println(e);
            }
  
!           System.out.println(filename + " was identified as " + lang);
!         }
!         System.exit(0);
!         break;
  
-                               
-               
        }
!     } catch (Exception e){
!       System.out.println(e);  
      }
      System.out.println("text was identified as " + lang);
--- 94,218 ----
    /**
     * main method used for testing
+    * 
     * @param args
     */
!   public static void main(String args[]) {
  
      String usage = "Usage: LanguageIdentifier [-identifyrows filename maxlines] [-identifyfile filename] [-identifyfileset files] [-identifytext text] [-identifyurl url]";
!     int command = 0;
! 
!     final int IDFILE = 1;
!     final int IDTEXT = 2;
!     final int IDURL = 3;
!     final int IDFILESET = 4;
!     final int IDROWS = 5;
! 
!     Vector fileset = new Vector();
!     String filename = "";
!     String url = "";
!     String text = "";
!     int max = 0;
  
      if (args.length == 0) {
        System.err.println(usage);
        System.exit(-1);
      }
! 
!     for (int i = 0; i < args.length; i++) { // parse command line
        if (args[i].equals("-identifyfile")) {
!         command = IDFILE;
          filename = args[++i];
!       }
  
        if (args[i].equals("-identifyurl")) {
!         command = IDURL;
          filename = args[++i];
!       }
  
        if (args[i].equals("-identifyrows")) {
!         command = IDROWS;
          filename = args[++i];
!         max = Integer.parseInt(args[++i]);
!       }
! 
        if (args[i].equals("-identifytext")) {
!         command = IDTEXT;
!         for (i++; i < args.length - 1; i++)
!           text += args[i] + " ";
        }
! 
        if (args[i].equals("-identifyfileset")) {
!         command = IDFILESET;
!         for (i++; i < args.length; i++) {
            fileset.add(args[i]);
            System.out.println(args[i]);
          }
!       }
! 
      }
! 
!     String lang = null;
!     LanguageIdentifier idfr = LanguageIdentifier.getInstance();
      File f;
      FileInputStream fis;
!     try {
!       switch (command) {
  
!         case IDTEXT:
!           lang = idfr.identify(text);
!           break;
  
!         case IDFILE:
!           f = new File(filename);
!           fis = new FileInputStream(f);
!           lang = idfr.identify(fis);
!           fis.close();
!           break;
  
!         case IDURL:
!           text = getUrlContent(filename);
!           lang = idfr.identify(text);
!           break;
! 
!         case IDROWS:
!           f = new File(filename);
!           BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f)));
!           String line;
!           while (max > 0 && (line = br.readLine()) != null) {
!             line = line.trim();
!             if (line.length() > 2) {
!               max--;
!               lang = idfr.identify(line);
!               System.out.println("R=" + lang + ":" + line);
!             }
            }
  
!           br.close();
!           System.exit(0);
!           break;
! 
!         case IDFILESET:
!           System.out.println("FILESET");
!           Iterator i = fileset.iterator();
! 
!           while (i.hasNext()) {
!             try {
!               filename = (String) i.next();
!               f = new File(filename);
!               fis = new FileInputStream(f);
!               lang = idfr.identify(fis);
!               fis.close();
!             } catch (Exception e) {
!               System.out.println(e);
!             }
! 
!             System.out.println(filename + " was identified as " + lang);
!           }
!           System.exit(0);
!           break;
  
        }
!     } catch (Exception e) {
!       System.out.println(e);
      }
      System.out.println("text was identified as " + lang);
***************
*** 232,250 ****
        System.out.println("text:" + parse.getText());
        return parse.getText();
!                       
      } catch (ProtocolNotFound e) {
-       // TODO Auto-generated catch block
        e.printStackTrace();
      } catch (ProtocolException e) {
-       // TODO Auto-generated catch block
        e.printStackTrace();
      } catch (ParserNotFound e) {
-       // TODO Auto-generated catch block
        e.printStackTrace();
      } catch (ParseException e) {
-       // TODO Auto-generated catch block
        e.printStackTrace();
      }
-     // TODO Auto-generated method stub
      return null;
    }
--- 233,246 ----
        System.out.println("text:" + parse.getText());
        return parse.getText();
! 
      } catch (ProtocolNotFound e) {
        e.printStackTrace();
      } catch (ProtocolException e) {
        e.printStackTrace();
      } catch (ParserNotFound e) {
        e.printStackTrace();
      } catch (ParseException e) {
        e.printStackTrace();
      }
      return null;
    }
***************
*** 252,348 ****
    /**
     * Identify language based on submitted content
!    * @param text      text of doc
!    * @return 2 letter ISO639 code of language (en, fi, sv...) , or null if unknown
     */
    public String identify(String text) {
!               
      return identify(new StringBuffer(text));
    }
  
    public String identify(StringBuffer text) {
!               
!     NGramProfile p=new NGramProfile("suspect");
      p.analyze(text);
-               
-     float topscore=Float.MAX_VALUE;
-     String lang="";
  
!     Iterator i=languages.iterator();
!     while(i.hasNext()){
  
!       NGramProfile profile=(NGramProfile)i.next();
!       float score=profile.getSimilarity(p);
  
        //LOG.fine(profile.getName() + ":" + score);
!                       
!       if(score < topscore) {
          topscore = score;
!         lang=profile.getName();
        }
      }
!               
      p.ngrams.clear();
!     p=null;
!               
      LOG.finest("TOPSCORE: " + lang + " with " + topscore);
!               
!     if(topscore > SCORE_THRESOLD)
        return lang;
!               
      else return null;
    }
  
!       
!   /** 
     * Identify language from inputstream
     * 
     * @param is
!    * @return
     * @throws IOException
     */
!   public String identify(InputStream is) throws IOException{
!               
!     StringBuffer text=new StringBuffer();
!     byte buffer[]=new byte[2000];
!     int len=0;
!               
!     while((len=is.read(buffer))!=-1){
!       text.append(new String(buffer,0,len));
      }
!               
      return identify(text.toString());
    }
!       
!   public Document filter(Document doc, Parse parse, FetcherOutput fo) 
!     throws IndexingException{
  
      //check if X-meta-lang found, possibly put there by HTMLLanguageParser
      String lang = parse.getData().get(HTMLLanguageParser.META_LANG_NAME);
!               
      //check if HTTP-header tels us the language
!     if(lang==null)
!       lang = parse.getData().get("Content-Language");
!               
!     if(lang==null){
!       StringBuffer text=new StringBuffer();
!       /*                      
!                       String[] anchors = fo.getAnchors();
!                         for (int i = 0; i < anchors.length; i++) {
!                         text+=anchors[i] + " ";
!                         }
!       */
        text.append(parse.getData().getTitle()).append(" ");
        text.append(parse.getText());
!       lang=LanguageIdentifier.getInstance().identify(text);
      }
!               
!     if(lang==null){
!       lang="unknown"; 
      }
!               
      doc.add(Field.Keyword("lang", lang));
!           
!     return doc;       
    }
  
! }
--- 248,341 ----
    /**
     * Identify language based on submitted content
!    * 
!    * @param text text of doc
!    * @return 2 letter ISO639 code of language (en, fi, sv...) , or null if
!    *         unknown
     */
    public String identify(String text) {
! 
      return identify(new StringBuffer(text));
    }
  
    public String identify(StringBuffer text) {
! 
!     NGramProfile p = new NGramProfile("suspect");
      p.analyze(text);
  
!     float topscore = Float.MAX_VALUE;
!     String lang = "";
  
!     Iterator i = languages.iterator();
!     while (i.hasNext()) {
! 
!       NGramProfile profile = (NGramProfile) i.next();
!       float score = profile.getSimilarity(p);
  
        //LOG.fine(profile.getName() + ":" + score);
! 
!       if (score < topscore) {
          topscore = score;
!         lang = profile.getName();
        }
      }
! 
      p.ngrams.clear();
!     p = null;
! 
      LOG.finest("TOPSCORE: " + lang + " with " + topscore);
! 
!     if (topscore > SCORE_THRESOLD)
        return lang;
! 
      else return null;
    }
  
!   /**
     * Identify language from inputstream
     * 
     * @param is
!    * @return language code
     * @throws IOException
     */
!   public String identify(InputStream is) throws IOException {
! 
!     StringBuffer text = new StringBuffer();
!     byte buffer[] = new byte[2000];
!     int len = 0;
! 
!     while ((len = is.read(buffer)) != -1) {
!       text.append(new String(buffer, 0, len));
      }
! 
      return identify(text.toString());
    }
! 
!   public Document filter(Document doc, Parse parse, FetcherOutput fo) throws IndexingException {
  
      //check if X-meta-lang found, possibly put there by HTMLLanguageParser
      String lang = parse.getData().get(HTMLLanguageParser.META_LANG_NAME);
! 
      //check if HTTP-header tels us the language
!     if (lang == null) lang = parse.getData().get("Content-Language");
! 
!     if (lang == null) {
!       StringBuffer text = new StringBuffer();
!       /*
!        * String[] anchors = fo.getAnchors(); for (int i = 0; i < anchors.length;
!        * i++) { text+=anchors[i] + " "; }
!        */
        text.append(parse.getData().getTitle()).append(" ");
        text.append(parse.getText());
!       lang = LanguageIdentifier.getInstance().identify(text);
      }
! 
!     if (lang == null) {
!       lang = "unknown";
      }
! 
      doc.add(Field.Keyword("lang", lang));
! 
!     return doc;
    }
  
! }
\ No newline at end of file



-------------------------------------------------------
SF email is sponsored by - The IT Product Guide
Read honest & candid reviews on hundreds of IT Products from real users.
Discover which products truly live up to the hype. Start reading now. 
http://productguide.itmanagersjournal.com/
_______________________________________________
Nutch-cvs mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

Reply via email to