Author: jerome Date: Fri Feb 17 16:23:35 2006 New Revision: 378667 URL: http://svn.apache.org/viewcvs?rev=378667&view=rev Log: Adapts parse-mp3 to the Nutch API changes (metadata, parse, protocol, ...)
Modified: lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MetadataCollector.java lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java Modified: lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java?rev=378667&r1=378666&r2=378667&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java Fri Feb 17 16:23:35 2006 @@ -16,10 +16,14 @@ package org.apache.nutch.parse.mp3; +// JDK imports +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.net.MalformedURLException; +import java.util.Iterator; -import org.apache.nutch.parse.*; -import org.apache.nutch.protocol.Content; -import org.apache.hadoop.conf.Configuration; +// Java ID3 Tag imports import org.farng.mp3.MP3File; import org.farng.mp3.TagException; import org.farng.mp3.id3.AbstractID3v2; @@ -27,29 +31,35 @@ import org.farng.mp3.id3.ID3v1; import org.farng.mp3.object.AbstractMP3Object; -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.net.MalformedURLException; -import java.util.Iterator; +// Hadoop imports +import org.apache.hadoop.conf.Configuration; + +// Nutch imports +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseException; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.parse.Parser; +import 
org.apache.nutch.protocol.Content; + /** * A parser for MP3 audio files * @author Andy Hedges */ - public class MP3Parser implements Parser { private MetadataCollector metadataCollector; private Configuration conf; - public Parse getParse(Content content) throws ParseException { - Parse parse = null; - metadataCollector.putAll(content.getMetadata()); + public Parse getParse(Content content) { + Parse parse = null; byte[] raw = content.getContent(); - File tmp = null; + try { tmp = File.createTempFile("nutch", ".mp3"); FileOutputStream fos = new FileOutputStream(tmp); @@ -58,25 +68,31 @@ MP3File mp3 = new MP3File(tmp); if (mp3.hasID3v2Tag()) { - parse = getID3v2Parse(mp3); + parse = getID3v2Parse(mp3, content.getMetadata()); } else if (mp3.hasID3v1Tag()) { - parse = getID3v1Parse(mp3); + parse = getID3v1Parse(mp3, content.getMetadata()); } else { - throw new ParseException("No textual content available"); + return new ParseStatus(ParseStatus.FAILED, + ParseStatus.FAILED_MISSING_CONTENT, + "No textual content available").getEmptyParse(conf); } - - } catch (IOException e) { - throw new ParseException("Couldn't create temporary file", e); + return new ParseStatus(ParseStatus.FAILED, + ParseStatus.FAILED_EXCEPTION, + "Couldn't create temporary file:" + e).getEmptyParse(conf); } catch (TagException e) { - throw new ParseException("ID3 Tags could not be parsed", e); + return new ParseStatus(ParseStatus.FAILED, + ParseStatus.FAILED_EXCEPTION, + "ID3 Tags could not be parsed:" + e).getEmptyParse(conf); } finally{ tmp.delete(); } return parse; } - private Parse getID3v1Parse(MP3File mp3) throws MalformedURLException { + private Parse getID3v1Parse(MP3File mp3, Metadata contentMeta) + throws MalformedURLException { + ID3v1 tag = mp3.getID3v1Tag(); metadataCollector.notifyProperty("TALB-Text", tag.getAlbum()); metadataCollector.notifyProperty("TPE1-Text", tag.getArtist()); @@ -84,13 +100,17 @@ metadataCollector.notifyProperty("TCON-Text", "(" + tag.getGenre() + ")"); 
metadataCollector.notifyProperty("TIT2-Text", tag.getTitle()); metadataCollector.notifyProperty("TYER-Text", tag.getYear()); - ParseData parseData = new ParseData(metadataCollector.getTitle(), - metadataCollector.getOutlinks(), - metadataCollector.getData(), getConf()); + ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, + metadataCollector.getTitle(), + metadataCollector.getOutlinks(), + contentMeta, + metadataCollector.getData()); return new ParseImpl(metadataCollector.getText(), parseData); } - public Parse getID3v2Parse(MP3File mp3) throws IOException { + public Parse getID3v2Parse(MP3File mp3, Metadata contentMeta) + throws IOException { + AbstractID3v2 tag = mp3.getID3v2Tag(); Iterator it = tag.iterator(); while (it.hasNext()) { @@ -108,9 +128,11 @@ } } } - ParseData parseData = new ParseData(metadataCollector.getTitle(), - metadataCollector.getOutlinks(), - metadataCollector.getData()); + ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, + metadataCollector.getTitle(), + metadataCollector.getOutlinks(), + contentMeta, + metadataCollector.getData()); return new ParseImpl(metadataCollector.getText(), parseData); } Modified: lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MetadataCollector.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MetadataCollector.java?rev=378667&r1=378666&r2=378667&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MetadataCollector.java (original) +++ lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MetadataCollector.java Fri Feb 17 16:23:35 2006 @@ -21,7 +21,7 @@ import java.net.MalformedURLException; import java.util.ArrayList; -import java.util.Properties; +import org.apache.nutch.metadata.Metadata; /** * This class allows meta data to be collected and manipulated @@ 
-29,7 +29,7 @@ */ public class MetadataCollector { - private Properties metadata = new Properties(); + private Metadata metadata = new Metadata(); private String title = null; private String artist = null; private String album = null; @@ -55,14 +55,10 @@ text += value + "\n"; } - metadata.setProperty(name, value); + metadata.set(name, value); } - public void putAll(Properties properties) { - metadata.putAll(properties); - } - - public Properties getData() { + public Metadata getData() { return metadata; } Modified: lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java?rev=378667&r1=378666&r2=378667&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java Fri Feb 17 16:23:35 2006 @@ -27,6 +27,12 @@ import org.apache.nutch.protocol.ProtocolFactory; import java.util.Properties; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.UTF8; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.Parser; +import org.apache.nutch.util.NutchConfiguration; /** * Unit tests for TestMP3Parser. (Adapted from John Xing msword unit tests). 
@@ -62,22 +68,23 @@ Content content; Parse parse; + Configuration conf = NutchConfiguration.create(); urlString = "file:" + sampleDir + fileSeparator + id3v2; - protocol = ProtocolFactory.getProtocol(urlString); - content = protocol.getContent(urlString); - - parse = ParseUtil.parseByParserId("parse-mp3",content); - Properties metadata = parse.getData().getMetadata(); - assertEquals("postgresql comment id3v2", metadata.getProperty("COMM-Text")); - assertEquals("postgresql composer id3v2", metadata.getProperty("TCOM-Text")); - assertEquals("02", metadata.getProperty("TRCK-Text")); - assertEquals("http://localhost/", metadata.getProperty("WCOP-URL Link")); - assertEquals("postgresql artist id3v2", metadata.getProperty("TPE1-Text")); - assertEquals("(28)", metadata.getProperty("TCON-Text")); - assertEquals("2004", metadata.getProperty("TYER-Text")); - assertEquals("postgresql title id3v2", metadata.getProperty("TIT2-Text")); - assertEquals("postgresql album id3v2", metadata.getProperty("TALB-Text")); - assertEquals("postgresql encoded by id3v2", metadata.getProperty("TENC-Text")); + protocol = new ProtocolFactory(conf).getProtocol(urlString); + content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()) + .getContent(); + parse = new ParseUtil(conf).parseByParserId("parse-mp3", content); + Metadata metadata = parse.getData().getParseMeta(); + assertEquals("postgresql comment id3v2", metadata.get("COMM-Text")); + assertEquals("postgresql composer id3v2", metadata.get("TCOM-Text")); + assertEquals("02", metadata.get("TRCK-Text")); + assertEquals("http://localhost/", metadata.get("WCOP-URL Link")); + assertEquals("postgresql artist id3v2", metadata.get("TPE1-Text")); + assertEquals("(28)", metadata.get("TCON-Text")); + assertEquals("2004", metadata.get("TYER-Text")); + assertEquals("postgresql title id3v2", metadata.get("TIT2-Text")); + assertEquals("postgresql album id3v2", metadata.get("TALB-Text")); + assertEquals("postgresql encoded by id3v2", 
metadata.get("TENC-Text")); assertEquals("postgresql title id3v2 - " + "postgresql album id3v2 - " @@ -91,22 +98,22 @@ String urlString; Protocol protocol; Content content; - Parser parser; Parse parse; + Configuration conf = NutchConfiguration.create(); urlString = "file:" + sampleDir + fileSeparator + id3v1; - protocol = ProtocolFactory.getProtocol(urlString); - content = protocol.getContent(urlString); - parser = ParserFactory.getParser(content.getContentType(), urlString); - parse = parser.getParse(content); - - Properties metadata = parse.getData().getMetadata(); - assertEquals("postgresql comment id3v1", metadata.getProperty("COMM-Text")); - assertEquals("postgresql artist id3v1", metadata.getProperty("TPE1-Text")); - assertEquals("(28)", metadata.getProperty("TCON-Text")); - assertEquals("2004", metadata.getProperty("TYER-Text")); - assertEquals("postgresql title id3v1", metadata.getProperty("TIT2-Text")); - assertEquals("postgresql album id3v1", metadata.getProperty("TALB-Text")); + protocol = new ProtocolFactory(conf).getProtocol(urlString); + content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()) + .getContent(); + parse = new ParseUtil(conf).parseByParserId("parse-mp3", content); + + Metadata metadata = parse.getData().getParseMeta(); + assertEquals("postgresql comment id3v1", metadata.get("COMM-Text")); + assertEquals("postgresql artist id3v1", metadata.get("TPE1-Text")); + assertEquals("(28)", metadata.get("TCON-Text")); + assertEquals("2004", metadata.get("TYER-Text")); + assertEquals("postgresql title id3v1", metadata.get("TIT2-Text")); + assertEquals("postgresql album id3v1", metadata.get("TALB-Text")); assertEquals("postgresql title id3v1 - " + "postgresql album id3v1 - " @@ -118,21 +125,18 @@ String urlString; Protocol protocol; Content content; - Parser parser; Parse parse; + Configuration conf = NutchConfiguration.create(); urlString = "file:" + sampleDir + fileSeparator + none; - protocol = 
ProtocolFactory.getProtocol(urlString); - content = protocol.getContent(urlString); - parser = ParserFactory.getParser(content.getContentType(), urlString); - try { - parse = parser.getParse(content); - Properties metadata = parse.getData().getMetadata(); - } catch (ParseException e) { - return; + protocol = new ProtocolFactory(conf).getProtocol(urlString); + content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()) + .getContent(); + parse = new ParseUtil(conf).parseByParserId("parse-mp3", content); + Metadata metadata = parse.getData().getParseMeta(); + if (parse.getData().getStatus().isSuccess()) { + fail("Expected ParseException"); } - fail("Expected ParseException"); - } }