Update of /cvsroot/nutch/nutch/src/plugin/languageidentifier/src/java/net/nutch/analysis/lang
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv20161
Modified Files: LanguageIdentifier.java Log Message: Correct whitespace to be consistent with other files. Compactify startup log messages. Index: LanguageIdentifier.java =================================================================== RCS file: /cvsroot/nutch/nutch/src/plugin/languageidentifier/src/java/net/nutch/analysis/lang/LanguageIdentifier.java,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** LanguageIdentifier.java 25 Sep 2004 16:13:29 -0000 1.3 --- LanguageIdentifier.java 22 Nov 2004 16:38:43 -0000 1.4 *************** *** 32,88 **** import java.util.Properties; import java.util.Enumeration; /** * * @author Sami Siren ! * */ public class LanguageIdentifier implements IndexingFilter { ! public static final Logger LOG = ! LogFormatter.getLogger("net.nutch.analysis.lang.LanguageIdentifier"); ! private Vector languages=new Vector(); ! private Vector supportedLanguages=new Vector(); private static LanguageIdentifier identifier = new LanguageIdentifier(true); ! private static float SCORE_THRESOLD=0.00F; //public constructor needed for extension mechanism ! public LanguageIdentifier(){ ! } ! ! private LanguageIdentifier(boolean fake){ ! Properties p=new Properties(); ! try{ ! p.load(this.getClass().getResourceAsStream("langmappings.properties")); ! Enumeration alllanguages=p.keys(); ! while(alllanguages.hasMoreElements()){ ! String lang=(String)(alllanguages.nextElement()); ! InputStream is=this.getClass().getClassLoader().getResourceAsStream("net/nutch/analysis/lang/" + lang + "." + NGramProfile.NGRAM_FILE_EXTENSION); ! ! if(is!=null){ ! NGramProfile profile=new NGramProfile(lang); ! try { ! profile.load(is); ! languages.add(profile); ! supportedLanguages.add(lang); ! LOG.info(lang + " was added as supported language"); ! is.close(); ! } catch (IOException e1) { ! LOG.severe(e1.toString()); } } } - } catch (Exception e){ - LOG.severe(e.toString()); - } } /** ! * return handle to singleton instance */ ! 
public static LanguageIdentifier getInstance(){ return identifier; } --- 32,92 ---- import java.util.Properties; import java.util.Enumeration; + /** * * @author Sami Siren ! * */ public class LanguageIdentifier implements IndexingFilter { ! public static final Logger LOG = LogFormatter.getLogger("net.nutch.analysis.lang.LanguageIdentifier"); ! private Vector languages = new Vector(); ! ! private Vector supportedLanguages = new Vector(); private static LanguageIdentifier identifier = new LanguageIdentifier(true); ! ! private static float SCORE_THRESOLD = 0.00F; //public constructor needed for extension mechanism ! public LanguageIdentifier() {} ! private LanguageIdentifier(boolean fake) { ! Properties p = new Properties(); ! try { ! p.load(this.getClass().getResourceAsStream("langmappings.properties")); ! Enumeration alllanguages = p.keys(); ! StringBuffer list = new StringBuffer("Language identifier plugin supports:"); ! while (alllanguages.hasMoreElements()) { ! String lang = (String) (alllanguages.nextElement()); ! ! InputStream is = this.getClass().getClassLoader().getResourceAsStream( ! "net/nutch/analysis/lang/" + lang + "." + NGramProfile.NGRAM_FILE_EXTENSION); ! ! if (is != null) { ! NGramProfile profile = new NGramProfile(lang); ! try { ! profile.load(is); ! languages.add(profile); ! supportedLanguages.add(lang); ! list.append(" " + lang); ! is.close(); ! } catch (IOException e1) { ! LOG.severe(e1.toString()); ! } } } + LOG.info(list.toString()); + } catch (Exception e) { + LOG.severe(e.toString()); } } /** ! * return handle to singleton instance */ ! public static LanguageIdentifier getInstance() { return identifier; } *************** *** 90,217 **** /** * main method used for testing * @param args */ ! public static void main(String args[]){ String usage = "Usage: LanguageIdentifier [-identifyrows filename maxlines] [-identifyfile filename] [-identifyfileset files] [-identifytext text] [-identifyurl url]"; ! int command=0; ! ! final int IDFILE=1; ! 
final int IDTEXT=2; ! final int IDURL=3; ! final int IDFILESET=4; ! final int IDROWS=5; - Vector fileset=new Vector(); - String filename=""; - String url=""; - String text=""; - int max=0; - if (args.length == 0) { System.err.println(usage); System.exit(-1); } ! ! for (int i = 0; i < args.length; i++) { // parse command line if (args[i].equals("-identifyfile")) { ! command=IDFILE; filename = args[++i]; ! } if (args[i].equals("-identifyurl")) { ! command=IDURL; filename = args[++i]; ! } - if (args[i].equals("-identifyrows")) { ! command=IDROWS; filename = args[++i]; ! max=Integer.parseInt(args[++i]); ! } ! if (args[i].equals("-identifytext")) { ! command=IDTEXT; ! for(i++;i<args.length-1;i++) ! text+=args[i] + " "; } ! if (args[i].equals("-identifyfileset")) { ! command=IDFILESET; ! for(i++;i<args.length;i++){ fileset.add(args[i]); System.out.println(args[i]); } ! } ! } ! ! String lang=null; ! LanguageIdentifier idfr=LanguageIdentifier.getInstance(); File f; FileInputStream fis; ! try{ ! switch(command){ ! ! case IDTEXT: ! lang=idfr.identify(text); ! break; ! ! case IDFILE: ! f=new File(filename); ! fis=new FileInputStream(f); ! lang=idfr.identify(fis); ! fis.close(); ! break; ! case IDURL: ! text=getUrlContent(filename); ! lang=idfr.identify(text); ! break; ! ! case IDROWS: ! f=new File(filename); ! BufferedReader br=new BufferedReader(new InputStreamReader(new FileInputStream(f))); ! String line; ! while (max>0 && (line=br.readLine())!=null){ ! line=line.trim(); ! if(line.length()>2){ ! max--; ! lang=idfr.identify(line); ! System.out.println("R=" + lang + ":" + line); ! } ! } ! ! br.close(); ! System.exit(0); ! break; ! case IDFILESET: ! System.out.println("FILESET"); ! Iterator i=fileset.iterator(); ! ! while(i.hasNext()){ ! try{ ! filename=(String)i.next(); ! f=new File(filename); ! fis=new FileInputStream(f); ! lang=idfr.identify(fis); ! fis.close(); ! } catch (Exception e){ ! System.out.println(e); } ! 
System.out.println(filename + " was identified as " + lang); ! } ! System.exit(0); ! break; - - } ! } catch (Exception e){ ! System.out.println(e); } System.out.println("text was identified as " + lang); --- 94,218 ---- /** * main method used for testing + * * @param args */ ! public static void main(String args[]) { String usage = "Usage: LanguageIdentifier [-identifyrows filename maxlines] [-identifyfile filename] [-identifyfileset files] [-identifytext text] [-identifyurl url]"; ! int command = 0; ! ! final int IDFILE = 1; ! final int IDTEXT = 2; ! final int IDURL = 3; ! final int IDFILESET = 4; ! final int IDROWS = 5; ! ! Vector fileset = new Vector(); ! String filename = ""; ! String url = ""; ! String text = ""; ! int max = 0; if (args.length == 0) { System.err.println(usage); System.exit(-1); } ! ! for (int i = 0; i < args.length; i++) { // parse command line if (args[i].equals("-identifyfile")) { ! command = IDFILE; filename = args[++i]; ! } if (args[i].equals("-identifyurl")) { ! command = IDURL; filename = args[++i]; ! } if (args[i].equals("-identifyrows")) { ! command = IDROWS; filename = args[++i]; ! max = Integer.parseInt(args[++i]); ! } ! if (args[i].equals("-identifytext")) { ! command = IDTEXT; ! for (i++; i < args.length - 1; i++) ! text += args[i] + " "; } ! if (args[i].equals("-identifyfileset")) { ! command = IDFILESET; ! for (i++; i < args.length; i++) { fileset.add(args[i]); System.out.println(args[i]); } ! } ! } ! ! String lang = null; ! LanguageIdentifier idfr = LanguageIdentifier.getInstance(); File f; FileInputStream fis; ! try { ! switch (command) { ! case IDTEXT: ! lang = idfr.identify(text); ! break; ! case IDFILE: ! f = new File(filename); ! fis = new FileInputStream(f); ! lang = idfr.identify(fis); ! fis.close(); ! break; ! case IDURL: ! text = getUrlContent(filename); ! lang = idfr.identify(text); ! break; ! ! case IDROWS: ! f = new File(filename); ! 
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f))); ! String line; ! while (max > 0 && (line = br.readLine()) != null) { ! line = line.trim(); ! if (line.length() > 2) { ! max--; ! lang = idfr.identify(line); ! System.out.println("R=" + lang + ":" + line); ! } } ! br.close(); ! System.exit(0); ! break; ! ! case IDFILESET: ! System.out.println("FILESET"); ! Iterator i = fileset.iterator(); ! ! while (i.hasNext()) { ! try { ! filename = (String) i.next(); ! f = new File(filename); ! fis = new FileInputStream(f); ! lang = idfr.identify(fis); ! fis.close(); ! } catch (Exception e) { ! System.out.println(e); ! } ! ! System.out.println(filename + " was identified as " + lang); ! } ! System.exit(0); ! break; } ! } catch (Exception e) { ! System.out.println(e); } System.out.println("text was identified as " + lang); *************** *** 232,250 **** System.out.println("text:" + parse.getText()); return parse.getText(); ! } catch (ProtocolNotFound e) { - // TODO Auto-generated catch block e.printStackTrace(); } catch (ProtocolException e) { - // TODO Auto-generated catch block e.printStackTrace(); } catch (ParserNotFound e) { - // TODO Auto-generated catch block e.printStackTrace(); } catch (ParseException e) { - // TODO Auto-generated catch block e.printStackTrace(); } - // TODO Auto-generated method stub return null; } --- 233,246 ---- System.out.println("text:" + parse.getText()); return parse.getText(); ! } catch (ProtocolNotFound e) { e.printStackTrace(); } catch (ProtocolException e) { e.printStackTrace(); } catch (ParserNotFound e) { e.printStackTrace(); } catch (ParseException e) { e.printStackTrace(); } return null; } *************** *** 252,348 **** /** * Identify language based on submitted content ! * @param text text of doc ! * @return 2 letter ISO639 code of language (en, fi, sv...) , or null if unknown */ public String identify(String text) { ! 
return identify(new StringBuffer(text)); } public String identify(StringBuffer text) { ! ! NGramProfile p=new NGramProfile("suspect"); p.analyze(text); - - float topscore=Float.MAX_VALUE; - String lang=""; ! Iterator i=languages.iterator(); ! while(i.hasNext()){ ! NGramProfile profile=(NGramProfile)i.next(); ! float score=profile.getSimilarity(p); //LOG.fine(profile.getName() + ":" + score); ! ! if(score < topscore) { topscore = score; ! lang=profile.getName(); } } ! p.ngrams.clear(); ! p=null; ! LOG.finest("TOPSCORE: " + lang + " with " + topscore); ! ! if(topscore > SCORE_THRESOLD) return lang; ! else return null; } ! ! /** * Identify language from inputstream * * @param is ! * @return * @throws IOException */ ! public String identify(InputStream is) throws IOException{ ! ! StringBuffer text=new StringBuffer(); ! byte buffer[]=new byte[2000]; ! int len=0; ! ! while((len=is.read(buffer))!=-1){ ! text.append(new String(buffer,0,len)); } ! return identify(text.toString()); } ! ! public Document filter(Document doc, Parse parse, FetcherOutput fo) ! throws IndexingException{ //check if X-meta-lang found, possibly put there by HTMLLanguageParser String lang = parse.getData().get(HTMLLanguageParser.META_LANG_NAME); ! //check if HTTP-header tels us the language ! if(lang==null) ! lang = parse.getData().get("Content-Language"); ! ! if(lang==null){ ! StringBuffer text=new StringBuffer(); ! /* ! String[] anchors = fo.getAnchors(); ! for (int i = 0; i < anchors.length; i++) { ! text+=anchors[i] + " "; ! } ! */ text.append(parse.getData().getTitle()).append(" "); text.append(parse.getText()); ! lang=LanguageIdentifier.getInstance().identify(text); } ! ! if(lang==null){ ! lang="unknown"; } ! doc.add(Field.Keyword("lang", lang)); ! ! return doc; } ! } --- 248,341 ---- /** * Identify language based on submitted content ! * ! * @param text text of doc ! * @return 2 letter ISO639 code of language (en, fi, sv...) , or null if ! * unknown */ public String identify(String text) { ! 
return identify(new StringBuffer(text)); } public String identify(StringBuffer text) { ! ! NGramProfile p = new NGramProfile("suspect"); p.analyze(text); ! float topscore = Float.MAX_VALUE; ! String lang = ""; ! Iterator i = languages.iterator(); ! while (i.hasNext()) { ! ! NGramProfile profile = (NGramProfile) i.next(); ! float score = profile.getSimilarity(p); //LOG.fine(profile.getName() + ":" + score); ! ! if (score < topscore) { topscore = score; ! lang = profile.getName(); } } ! p.ngrams.clear(); ! p = null; ! LOG.finest("TOPSCORE: " + lang + " with " + topscore); ! ! if (topscore > SCORE_THRESOLD) return lang; ! else return null; } ! /** * Identify language from inputstream * * @param is ! * @return language code * @throws IOException */ ! public String identify(InputStream is) throws IOException { ! ! StringBuffer text = new StringBuffer(); ! byte buffer[] = new byte[2000]; ! int len = 0; ! ! while ((len = is.read(buffer)) != -1) { ! text.append(new String(buffer, 0, len)); } ! return identify(text.toString()); } ! ! public Document filter(Document doc, Parse parse, FetcherOutput fo) throws IndexingException { //check if X-meta-lang found, possibly put there by HTMLLanguageParser String lang = parse.getData().get(HTMLLanguageParser.META_LANG_NAME); ! //check if HTTP-header tels us the language ! if (lang == null) lang = parse.getData().get("Content-Language"); ! ! if (lang == null) { ! StringBuffer text = new StringBuffer(); ! /* ! * String[] anchors = fo.getAnchors(); for (int i = 0; i < anchors.length; ! * i++) { text+=anchors[i] + " "; } ! */ text.append(parse.getData().getTitle()).append(" "); text.append(parse.getText()); ! lang = LanguageIdentifier.getInstance().identify(text); } ! ! if (lang == null) { ! lang = "unknown"; } ! doc.add(Field.Keyword("lang", lang)); ! ! return doc; } ! 
}
\ No newline at end of file

-------------------------------------------------------
SF email is sponsored by - The IT Product Guide
Read honest & candid reviews on hundreds of IT Products from real users.
Discover which products truly live up to the hype. Start reading now.
http://productguide.itmanagersjournal.com/
_______________________________________________
Nutch-cvs mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/nutch-cvs