Update of /cvsroot/nutch/nutch/src/plugin/languageidentifier/src/test/net/nutch/analysis/lang In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv25929/src/plugin/languageidentifier/src/test/net/nutch/analysis/lang
Added Files: TestHTMLLanguageParser.java TestNGramProfile.java Log Message: added 3 more lang profiles (ru,hu,pl), converted all profiles to utf8, changed the similarity calculation --- NEW FILE: TestNGramProfile.java --- /* Copyright (c) 2004 The Nutch Organization. All rights reserved. */ /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ package net.nutch.analysis.lang; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.InputStream; import java.util.Iterator; import java.util.Vector; import junit.framework.TestCase; import org.apache.lucene.analysis.Token; public class TestNGramProfile extends TestCase { String tokencontent1 = "testaddtoken"; String tokencontent2 = "anotherteststring"; int[] counts1 = { 3, 2, 2, 2, 1, 1, 1, 1, 1 }; String[] chars1 = { "t", "_", "d", "e", "a", "k", "n", "o", "s" }; /** * Test addFromToken method * */ public void testAddToken() { NGramProfile p = new NGramProfile("test", 1, 1); Token t = new Token(tokencontent1, 0, tokencontent1.length()); p.addFromToken(t); p.normalize(); testCounts(p.getSorted(), counts1); testContents(p.getSorted(), chars1); } /** * Test analyze method */ public void testAnalyze() { String tokencontent = "testmeagain"; NGramProfile p = new NGramProfile("test", 1, 1); p.analyze(new StringBuffer(tokencontent)); //test that profile size is ok, eg 9 different NGramEntries "_tesmagin" assertEquals(9, p.getSorted().size()); } /** * Test addNGrams method with StringBuffer argument * */ public void testAddNGramsStringBuffer() { String tokencontent = "testmeagain"; NGramProfile p = new NGramProfile("test", 1, 1); p.addNGrams(new StringBuffer(tokencontent)); //test that profile size is ok, eg 8 different NGramEntries "tesmagin" assertEquals(8, p.getSorted().size()); } /** * test getSorted method */ public void testGetSorted() { int[] count = { 4, 3, 2, 1 }; String[] ngram = { "a", "b", "" + NGramProfile.SEPARATOR, "c" }; String teststring = "AAaaBbbC"; 
NGramProfile p = new NGramProfile("test", 1, 1); p.analyze(new StringBuffer(teststring)); //test size of profile assertEquals(4, p.getSorted().size()); testCounts(p.getSorted(), count); testContents(p.getSorted(), ngram); } public void testGetSimilarity() { NGramProfile a = new NGramProfile("a", 1, 1); NGramProfile b = new NGramProfile("b", 1, 1); a.analyze(new StringBuffer(tokencontent1)); b.analyze(new StringBuffer(tokencontent2)); //because of rounding errors might slightly return different results assertEquals(a.getSimilarity(b), b.getSimilarity(a), 0.0000001); } public void testExactMatch() { NGramProfile a = new NGramProfile("a", 1, 1); a.analyze(new StringBuffer(tokencontent1)); assertEquals(a.getSimilarity(a), 0, 0); } public void testIO() { //Create profile and set some contents NGramProfile a = new NGramProfile("a", 1, 1); a.analyze(new StringBuffer(this.tokencontent1)); NGramProfile b = new NGramProfile("a_from_inputstream", 1, 1); //save profile ByteArrayOutputStream os = new ByteArrayOutputStream(); try { a.save(os); os.close(); } catch (Exception e) { fail(); } //load profile InputStream is = new ByteArrayInputStream(os.toByteArray()); try { b.load(is); is.close(); } catch (Exception e) { fail(); } //check it testCounts(b.getSorted(), counts1); testContents(b.getSorted(), chars1); } private void testContents(Vector entries, String contents[]) { int c = 0; Iterator i = entries.iterator(); while (i.hasNext()) { NGramProfile.NGramEntry nge = (NGramProfile.NGramEntry) i.next(); assertEquals(contents[c], nge.getSeq().toString()); c++; } } private void testCounts(Vector entries, int counts[]) { int c = 0; Iterator i = entries.iterator(); while (i.hasNext()) { NGramProfile.NGramEntry nge = (NGramProfile.NGramEntry) i.next(); assertEquals(counts[c], nge.getCount()); c++; } } } --- NEW FILE: TestHTMLLanguageParser.java --- /* Copyright (c) 2004 The Nutch Organization. All rights reserved. */ /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. 
*/

package net.nutch.analysis.lang;

import java.util.Properties;

import junit.framework.TestCase;

import net.nutch.parse.Parse;
import net.nutch.parse.Parser;
import net.nutch.parse.ParserFactory;
import net.nutch.protocol.Content;

/**
 * JUnit test that feeds small HTML documents through the parser obtained
 * for "text/html" and checks the language value stored in the parse data
 * under HTMLLanguageParser.META_LANG_NAME.
 */
public class TestHTMLLanguageParser extends TestCase {

  private static String URL = "http://foo.bar/";

  private static String BASE = "http://foo.bar/";

  /**
   * Test documents. Each declares its language differently: an html "lang"
   * attribute, a "content-language" http-equiv meta tag, and a
   * "dc.language" meta tag. Documents 2 and 3 never close their title
   * element.
   */
  String[] docs = {
    "<html lang=\"fi\"><head>document 1 title</head><body>jotain suomeksi</body></html>",
    "<html><head><meta http-equiv=\"content-language\" content=\"en\"><title>document 2 title</head><body>this is english</body></html>",
    "<html><head><meta name=\"dc.language\" content=\"en\"><title>document 3 title</head><body>this is english</body></html>"
  };

  /** Language expected for each entry of docs, index-aligned. */
  String[] metalanguages = { "fi", "en", "en" };

  /**
   * Test parsing of language identifiers from html
   **/
  public void testMetaHTMLParsing() {
    try {
      /* walk over the test documents and validate each result */
      int index = 0;
      while (index < docs.length) {
        Content page = getContent(docs[index]);
        Parser htmlParser = ParserFactory.getParser("text/html", URL);
        Parse parsed = htmlParser.getParse(page);

        String detected =
            (String) parsed.getData().get(HTMLLanguageParser.META_LANG_NAME);
        assertEquals(metalanguages[index], detected);

        index++;
      }
    } catch (Exception e) {
      // any exception is printed and then reported as a test failure
      e.printStackTrace(System.out);
      fail(e.toString());
    }
  }

  /**
   * Wraps the given markup in a Content object for URL / BASE, with
   * content type "text/html" and a matching "Content-Type" property.
   *
   * @param text the HTML source of the document
   * @return the Content built from the bytes of text
   */
  private Content getContent(String text) {
    Properties headers = new Properties();
    headers.put("Content-Type", "text/html");

    return new Content(URL, BASE, text.getBytes(), "text/html", headers);
  }
}

-------------------------------------------------------
This SF.Net email is sponsored by: YOU BE THE JUDGE. Be one of 170 Project Admins to receive an Apple iPod Mini FREE for your judgement on who ports your project to Linux PPC the best. Sponsored by IBM. Deadline: Sept. 24. Go here: http://sf.net/ppc_contest.php
_______________________________________________
Nutch-cvs mailing list [EMAIL PROTECTED] https://lists.sourceforge.net/lists/listinfo/nutch-cvs