Author: jukka
Date: Fri Oct 2 11:23:16 2009
New Revision: 820967
URL: http://svn.apache.org/viewvc?rev=820967&view=rev
Log:
TIKA-295: Rough cut of mbox parser
Patch by Ken Krugler
Added:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mbox/
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/complex.mbox
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/headers.mbox
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/multiline.mbox
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/quoted.mbox
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/simple.mbox
Modified:
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/tika-config.xml
lucene/tika/trunk/tika-parsers/pom.xml
Modified:
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=820967&r1=820966&r2=820967&view=diff
==============================================================================
---
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
(original)
+++
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
Fri Oct 2 11:23:16 2009
@@ -179,6 +179,7 @@
<mime-type type="application/mbms-register-response+xml"/>
<mime-type type="application/mbms-user-service-description+xml"/>
<mime-type type="application/mbox">
+ <sub-class-of type="text/plain"/>
<glob pattern="*.mbox"/>
</mime-type>
<mime-type type="application/media_control+xml"/>
Modified:
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/tika-config.xml
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/tika-config.xml?rev=820967&r1=820966&r2=820967&view=diff
==============================================================================
---
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/tika-config.xml
(original)
+++
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/tika-config.xml
Fri Oct 2 11:23:16 2009
@@ -160,6 +160,10 @@
<mime>audio/x-aiff</mime>
</parser>
+ <parser name="parse-mbox"
class="org.apache.tika.parser.mbox.MboxParser">
+ <mime>application/mbox</mime>
+ </parser>
+
</parsers>
</properties>
\ No newline at end of file
Modified: lucene/tika/trunk/tika-parsers/pom.xml
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/pom.xml?rev=820967&r1=820966&r2=820967&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/pom.xml (original)
+++ lucene/tika/trunk/tika-parsers/pom.xml Fri Oct 2 11:23:16 2009
@@ -111,6 +111,12 @@
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
+ <dependency>
+ <groupId>org.mockito</groupId>
+ <artifactId>mockito-core</artifactId>
+ <version>1.7</version>
+ <scope>test</scope>
+ </dependency>
</dependencies>
<build>
Added:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java?rev=820967&view=auto
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
(added)
+++
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
Fri Oct 2 11:23:16 2009
@@ -0,0 +1,215 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mbox;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.UnsupportedEncodingException;
+import java.util.Collections;
+import java.util.Locale;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.log4j.Logger;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Mbox (mailbox) parser. This version returns the headers for the first email
+ * via metadata, which means headers from subsequent emails will be lost.
+ */
+public class MboxParser implements Parser {
+ private static final Logger LOGGER = Logger.getLogger(MboxParser.class);
+
+ public static final String MBOX_MIME_TYPE = "application/mbox";
+ public static final String MBOX_RECORD_DIVIDER = "From ";
+ private static final Pattern EMAIL_HEADER_PATTERN = Pattern.compile("([^
]+):[ \t]*(.*)");
+
+ private static final String EMAIL_HEADER_METADATA_PREFIX =
MboxParser.class.getSimpleName() + "-";
+ private static final String EMAIL_FROMLINE_METADATA =
EMAIL_HEADER_METADATA_PREFIX + "from";
+
+ private enum ParseStates {
+ START, IN_HEADER, IN_CONTENT
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, Map<String, Object> context)
+ throws IOException, TikaException, SAXException {
+
+ InputStreamReader isr;
+ try {
+ // Headers are going to be 7-bit ascii
+ isr = new InputStreamReader(stream, "us-ascii");
+ } catch (UnsupportedEncodingException e) {
+ LOGGER.error("Unexpected exception setting up MboxParser", e);
+ isr = new InputStreamReader(stream);
+ }
+
+ BufferedReader reader = new BufferedReader(isr);
+
+ metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE);
+ metadata.set(Metadata.CONTENT_ENCODING, "us-ascii");
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+
+ ParseStates parseState = ParseStates.START;
+ String multiLine = null;
+ boolean inQuote = false;
+ int numEmails = 0;
+
+ // We're going to scan, line-by-line, for a line that starts with
+ // "From "
+ for (String curLine = reader.readLine(); curLine != null; curLine =
reader.readLine()) {
+ boolean newMessage = curLine.startsWith(MBOX_RECORD_DIVIDER);
+ if (newMessage) {
+ numEmails += 1;
+ }
+
+ switch (parseState) {
+ case START:
+ if (newMessage) {
+ parseState = ParseStates.IN_HEADER;
+ newMessage = false;
+ // Fall through to IN_HEADER
+ } else {
+ break;
+ }
+
+ case IN_HEADER:
+ if (newMessage) {
+ saveHeaderInMetadata(numEmails, metadata, multiLine);
+ multiLine = curLine;
+ } else if (curLine.length() == 0) {
+ // Blank line is signal that we're transitioning to the
content.
+ saveHeaderInMetadata(numEmails, metadata, multiLine);
+ parseState = ParseStates.IN_CONTENT;
+
+ // Mimic what PackageParser does between entries.
+ xhtml.startElement("div", "class", "email-entry");
+ xhtml.startElement("p");
+ inQuote = false;
+ } else if (curLine.startsWith(" ") ||
curLine.startsWith("\t")) {
+ multiLine += " " + curLine.trim();
+ } else {
+ saveHeaderInMetadata(numEmails, metadata, multiLine);
+ multiLine = curLine;
+ }
+
+ break;
+
+ // TODO - use real email parsing support so we can correctly
handle
+ // things like multipart messages and quoted-printable
encoding.
+ // We'd also want this for charset handling, where content
isn't 7-bit
+ // ascii.
+ case IN_CONTENT:
+ if (newMessage) {
+ endMessage(xhtml, inQuote);
+ parseState = ParseStates.IN_HEADER;
+ multiLine = curLine;
+ } else {
+ boolean quoted = curLine.startsWith(">");
+ if (inQuote) {
+ if (!quoted) {
+ xhtml.endElement("q");
+ inQuote = false;
+ }
+ } else if (quoted) {
+ xhtml.startElement("q");
+ inQuote = true;
+ }
+
+ xhtml.characters(curLine);
+
+ // For plain text email, each line is a real break
position.
+ xhtml.element("br", "");
+ }
+ }
+ }
+
+ if (parseState == ParseStates.IN_HEADER) {
+ saveHeaderInMetadata(numEmails, metadata, multiLine);
+ } else if (parseState == ParseStates.IN_CONTENT) {
+ endMessage(xhtml, inQuote);
+ }
+
+ xhtml.endDocument();
+ }
+
+ private void endMessage(XHTMLContentHandler xhtml, boolean inQuote) throws
SAXException {
+ if (inQuote) {
+ xhtml.endElement("q");
+ }
+
+ xhtml.endElement("p");
+ xhtml.endElement("div");
+ }
+
+ private void saveHeaderInMetadata(int numEmails, Metadata metadata, String
curLine) {
+ if ((curLine == null) || (numEmails > 1)) {
+ return;
+ } else if (curLine.startsWith(MBOX_RECORD_DIVIDER)) {
+ metadata.add(EMAIL_FROMLINE_METADATA,
curLine.substring(MBOX_RECORD_DIVIDER.length()));
+ return;
+ }
+
+ Matcher headerMatcher = EMAIL_HEADER_PATTERN.matcher(curLine);
+ if (!headerMatcher.matches()) {
+ LOGGER.warn("Malformed email header in mbox file: " + curLine);
+ return;
+ }
+
+ String headerTag = headerMatcher.group(1).toLowerCase();
+ String headerContent = headerMatcher.group(2);
+
+ if (headerTag.equalsIgnoreCase("From")) {
+ metadata.add(Metadata.AUTHOR, headerContent);
+ metadata.add(Metadata.CREATOR, headerContent);
+ } else if (headerTag.equalsIgnoreCase("Subject")) {
+ metadata.add(Metadata.SUBJECT, headerContent);
+ metadata.add(Metadata.TITLE, headerContent);
+ } else if (headerTag.equalsIgnoreCase("Date")) {
+ // TODO - parse and convert to ISO format YYYY-MM-DD
+ metadata.add(Metadata.DATE, headerContent);
+ } else if (headerTag.equalsIgnoreCase("Message-Id")) {
+ metadata.add(Metadata.IDENTIFIER, headerContent);
+ } else if (headerTag.equalsIgnoreCase("In-Reply-To")) {
+ metadata.add(Metadata.RELATION, headerContent);
+ } else if (headerTag.equalsIgnoreCase("Content-Type")) {
+ // TODO - key off content-type in headers to
+ // set mapping to use for content and convert if necessary.
+
+ metadata.add(Metadata.CONTENT_TYPE, headerContent);
+ metadata.add(Metadata.FORMAT, headerContent);
+ } else {
+ metadata.add(EMAIL_HEADER_METADATA_PREFIX + headerTag,
headerContent);
+ }
+ }
+
+ @SuppressWarnings("unchecked")
+ public void parse(InputStream stream, ContentHandler handler, Metadata
metadata) throws IOException, SAXException, TikaException {
+ parse(stream, handler, metadata, Collections.EMPTY_MAP);
+ }
+
+}
Added:
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java?rev=820967&view=auto
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
(added)
+++
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
Fri Oct 2 11:23:16 2009
@@ -0,0 +1,158 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mbox;
+
+import static org.mockito.Matchers.any;
+import static org.mockito.Matchers.eq;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.times;
+
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Tests for {@link MboxParser}. Each test parses one of the mbox files under
+ * <code>test-documents/</code> into a mocked SAX handler and verifies the
+ * events (and, where relevant, the metadata) produced by the parser.
+ */
+public class MboxParserTest extends TestCase {
+
+    /** Two messages without headers: one paragraph per message. */
+    public void testSimple() throws Exception {
+        ContentHandler handler = mock(DefaultHandler.class);
+        parseResource("test-documents/simple.mbox", handler, new Metadata());
+
+        verify(handler).startDocument();
+        verify(handler, times(2)).startElement(
+                eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"),
+                any(Attributes.class));
+        verify(handler, times(2)).endElement(
+                XHTMLContentHandler.XHTML, "p", "p");
+        verify(handler).characters("Test content 1".toCharArray(), 0, 14);
+        verify(handler).characters("Test content 2".toCharArray(), 0, 14);
+        verify(handler).endDocument();
+    }
+
+    /** Headers of a single message are mapped to metadata entries. */
+    public void testHeaders() throws Exception {
+        ContentHandler handler = mock(DefaultHandler.class);
+        Metadata metadata = new Metadata();
+        parseResource("test-documents/headers.mbox", handler, metadata);
+
+        verify(handler).startDocument();
+        verify(handler).startElement(
+                eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"),
+                any(Attributes.class));
+        verify(handler).characters("Test content".toCharArray(), 0, 12);
+        verify(handler).endDocument();
+
+        assertEquals("subject", metadata.get(Metadata.TITLE));
+        assertEquals("subject", metadata.get(Metadata.SUBJECT));
+        assertEquals("<[email protected]>", metadata.get(Metadata.AUTHOR));
+        assertEquals("<[email protected]>", metadata.get(Metadata.CREATOR));
+        assertEquals("<[email protected]>",
+                metadata.get("MboxParser-return-path"));
+        assertEquals("Tue, 9 Jun 2009 23:58:45 -0400",
+                metadata.get(Metadata.DATE));
+    }
+
+    /** A folded header is joined into a single metadata value. */
+    public void testMultilineHeader() throws Exception {
+        ContentHandler handler = mock(DefaultHandler.class);
+        Metadata metadata = new Metadata();
+        parseResource("test-documents/multiline.mbox", handler, metadata);
+
+        verify(handler).startDocument();
+        verify(handler).startElement(
+                eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"),
+                any(Attributes.class));
+        verify(handler).characters("Test content".toCharArray(), 0, 12);
+        verify(handler).endDocument();
+
+        assertEquals("from xxx by xxx with xxx; date",
+                metadata.get("MboxParser-received"));
+    }
+
+    /** Lines starting with "&gt;" are wrapped in a q element. */
+    public void testQuoted() throws Exception {
+        ContentHandler handler = mock(DefaultHandler.class);
+        parseResource("test-documents/quoted.mbox", handler, new Metadata());
+
+        verify(handler).startDocument();
+        verify(handler).startElement(
+                eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"),
+                any(Attributes.class));
+        verify(handler).startElement(
+                eq(XHTMLContentHandler.XHTML), eq("q"), eq("q"),
+                any(Attributes.class));
+        verify(handler).endElement(
+                eq(XHTMLContentHandler.XHTML), eq("q"), eq("q"));
+        verify(handler).endElement(
+                eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"));
+        verify(handler).characters("Test content".toCharArray(), 0, 12);
+        verify(handler).characters("> quoted stuff".toCharArray(), 0, 14);
+        verify(handler).endDocument();
+    }
+
+    /** Three real-world messages, each containing one quoted section. */
+    public void testComplex() throws Exception {
+        ContentHandler handler = mock(DefaultHandler.class);
+        parseResource("test-documents/complex.mbox", handler, new Metadata());
+
+        verify(handler).startDocument();
+        verify(handler, times(3)).startElement(
+                eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"),
+                any(Attributes.class));
+        verify(handler, times(3)).endElement(
+                eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"));
+        verify(handler, times(3)).startElement(
+                eq(XHTMLContentHandler.XHTML), eq("q"), eq("q"),
+                any(Attributes.class));
+        verify(handler, times(3)).endElement(
+                eq(XHTMLContentHandler.XHTML), eq("q"), eq("q"));
+        verify(handler).endDocument();
+    }
+
+    /**
+     * Parses the named classpath resource with a fresh {@link MboxParser}.
+     * Exceptions are propagated so that a failing test reports the full
+     * stack trace; the stream is always closed.
+     */
+    private static void parseResource(
+            String name, ContentHandler handler, Metadata metadata)
+            throws Exception {
+        Parser parser = new MboxParser();
+        Map<String, Object> context = new HashMap<String, Object>();
+        InputStream stream = getStream(name);
+        assertNotNull("Test resource not found: " + name, stream);
+        try {
+            parser.parse(stream, handler, metadata, context);
+        } finally {
+            stream.close();
+        }
+    }
+
+    private static InputStream getStream(String name) {
+        return Thread.currentThread().getContextClassLoader()
+                .getResourceAsStream(name);
+    }
+
+}
Added:
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/complex.mbox
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/complex.mbox?rev=820967&view=auto
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/complex.mbox
(added)
+++
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/complex.mbox
Fri Oct 2 11:23:16 2009
@@ -0,0 +1,291 @@
+From
core-user-return-14700-apmail-hadoop-core-user-archive=hadoop.apache....@hadoop.apache.org
Mon Jun 01 04:28:28 2009
+Return-Path:
<core-user-return-14700-apmail-hadoop-core-user-archive=hadoop.apache....@hadoop.apache.org>
+Delivered-To: [email protected]
+Received: (qmail 19921 invoked from network); 1 Jun 2009 04:28:28 -0000
+Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.3)
+ by minotaur.apache.org with SMTP; 1 Jun 2009 04:28:28 -0000
+Received: (qmail 84995 invoked by uid 500); 1 Jun 2009 04:28:38 -0000
+Delivered-To: [email protected]
+Received: (qmail 84895 invoked by uid 500); 1 Jun 2009 04:28:38 -0000
+Mailing-List: contact [email protected]; run by ezmlm
+Precedence: bulk
+List-Help: <mailto:[email protected]>
+List-Unsubscribe: <mailto:[email protected]>
+List-Post: <mailto:[email protected]>
+List-Id: <core-user.hadoop.apache.org>
+Reply-To: [email protected]
+Delivered-To: mailing list [email protected]
+Received: (qmail 84885 invoked by uid 99); 1 Jun 2009 04:28:38 -0000
+Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136)
+ by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 01 Jun 2009 04:28:38 +0000
+X-ASF-Spam-Status: No, hits=1.2 required=10.0
+ tests=SPF_NEUTRAL
+X-Spam-Check-By: apache.org
+Received-SPF: neutral (athena.apache.org: local policy)
+Received: from [69.147.107.21] (HELO mrout2-b.corp.re1.yahoo.com)
(69.147.107.21)
+ by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 01 Jun 2009 04:28:26 +0000
+Received: from SNV-EXPF01.ds.corp.yahoo.com (snv-expf01.ds.corp.yahoo.com
[207.126.227.250])
+ by mrout2-b.corp.re1.yahoo.com (8.13.8/8.13.8/y.out) with ESMTP id
n514QYA6099963
+ for <[email protected]>; Sun, 31 May 2009 21:26:35 -0700 (PDT)
+DomainKey-Signature: a=rsa-sha1; s=serpent; d=yahoo-inc.com; c=nofws; q=dns;
+ h=received:user-agent:date:subject:from:to:message-id:
+ thread-topic:thread-index:in-reply-to:mime-version:content-type:
+ content-transfer-encoding:x-originalarrivaltime;
+ b=YVtSNdgjeeSBS1yY3XDolul49i+HrgNG7QszMo9LzGnrwejjgsl5+iUM6EiQgEpV
+Received: from SNV-EXVS08.ds.corp.yahoo.com ([207.126.227.9]) by
SNV-EXPF01.ds.corp.yahoo.com with Microsoft SMTPSVC(6.0.3790.3959);
+ Sun, 31 May 2009 21:26:34 -0700
+Received: from 10.66.92.213 ([10.66.92.213]) by SNV-EXVS08.ds.corp.yahoo.com
([207.126.227.58]) with Microsoft Exchange Server HTTP-DAV ;
+ Mon, 1 Jun 2009 04:26:33 +0000
+User-Agent: Microsoft-Entourage/12.17.0.090302
+Date: Mon, 01 Jun 2009 09:56:31 +0530
+Subject: Re: question about when shuffle/sort start working
+From: Jothi Padmanabhan <[email protected]>
+To: <[email protected]>
+Message-ID: <c649564f.1435f%[email protected]>
+Thread-Topic: question about when shuffle/sort start working
+Thread-Index: AcnicSNoBw19cMU8UEaXwAdZ1YYhuw==
+In-Reply-To: <[email protected]>
+Mime-version: 1.0
+Content-type: text/plain;
+ charset="US-ASCII"
+Content-transfer-encoding: 7bit
+X-OriginalArrivalTime: 01 Jun 2009 04:26:34.0501 (UTC)
FILETIME=[257EAB50:01C9E271]
+X-Virus-Checked: Checked by ClamAV on apache.org
+
+When a Mapper completes, MapCompletionEvents are generated. Reducers try to
+fetch map outputs for a given map only on the receipt of such events.
+
+Jothi
+
+
+On 5/30/09 10:00 AM, "Jianmin Woo" <[email protected]> wrote:
+
+> Hi,
+> I am being confused by the protocol between mapper and reducer. When mapper
+> emitting the (key,value) pair done, is there any signal the mapper send out
to
+> hadoop framework in protocol to indicate that map is done and the
shuffle/sort
+> can begin for reducer? If there is no this signal in protocol, when the
+> framework begin the shuffle/sort?
+>
+> Thanks,
+> Jianmin
+>
+>
+>
+>
+
+
+From
core-user-return-14701-apmail-hadoop-core-user-archive=hadoop.apache....@hadoop.apache.org
Mon Jun 01 05:31:14 2009
+Return-Path:
<core-user-return-14701-apmail-hadoop-core-user-archive=hadoop.apache....@hadoop.apache.org>
+Delivered-To: [email protected]
+Received: (qmail 38243 invoked from network); 1 Jun 2009 05:31:14 -0000
+Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.3)
+ by minotaur.apache.org with SMTP; 1 Jun 2009 05:31:14 -0000
+Received: (qmail 15621 invoked by uid 500); 1 Jun 2009 05:31:24 -0000
+Delivered-To: [email protected]
+Received: (qmail 15557 invoked by uid 500); 1 Jun 2009 05:31:24 -0000
+Mailing-List: contact [email protected]; run by ezmlm
+Precedence: bulk
+List-Help: <mailto:[email protected]>
+List-Unsubscribe: <mailto:[email protected]>
+List-Post: <mailto:[email protected]>
+List-Id: <core-user.hadoop.apache.org>
+Reply-To: [email protected]
+Delivered-To: mailing list [email protected]
+Received: (qmail 15547 invoked by uid 99); 1 Jun 2009 05:31:24 -0000
+Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230)
+ by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 01 Jun 2009 05:31:24 +0000
+X-ASF-Spam-Status: No, hits=2.2 required=10.0
+ tests=HTML_MESSAGE,SPF_PASS
+X-Spam-Check-By: apache.org
+Received-SPF: pass (nike.apache.org: local policy)
+Received: from [68.142.237.94] (HELO n9.bullet.re3.yahoo.com) (68.142.237.94)
+ by apache.org (qpsmtpd/0.29) with SMTP; Mon, 01 Jun 2009 05:31:11 +0000
+Received: from [68.142.237.88] by n9.bullet.re3.yahoo.com with NNFMP; 01 Jun
2009 05:30:50 -0000
+Received: from [67.195.9.82] by t4.bullet.re3.yahoo.com with NNFMP; 01 Jun
2009 05:30:49 -0000
+Received: from [67.195.9.99] by t2.bullet.mail.gq1.yahoo.com with NNFMP; 01
Jun 2009 05:30:49 -0000
+Received: from [127.0.0.1] by omp103.mail.gq1.yahoo.com with NNFMP; 01 Jun
2009 05:28:01 -0000
+X-Yahoo-Newman-Property: ymail-3
+X-Yahoo-Newman-Id: [email protected]
+Received: (qmail 35264 invoked by uid 60001); 1 Jun 2009 05:30:49 -0000
+DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=yahoo.com; s=s1024;
t=1243834249; bh=R8qzdi/IbLyO8UwpnaujDpT9E+6bJ7nkmZN2803EmRk=;
h=Message-ID:X-YMail-OSG:Received:X-Mailer:References:Date:From:Subject:To:In-Reply-To:MIME-Version:Content-Type;
b=vq4c6RIDbkuLPYd8mirusIXf6DqTb/IeT55In7W00Y5Sxx1ZiXBb78yE9+TDfXJ0elsEZvqv4ocyvolGE0eGtyYeJA0mZikpRNu6pidxPNpCplOcLHBRz7YQ7iERwv3TagRlWy2Xd3oD9ZeV0A05P7WUOiNNX1PUUJD1IVdrEZo=
+DomainKey-Signature:a=rsa-sha1; q=dns; c=nofws;
+ s=s1024; d=yahoo.com;
+
h=Message-ID:X-YMail-OSG:Received:X-Mailer:References:Date:From:Subject:To:In-Reply-To:MIME-Version:Content-Type;
+
b=6HXZV98ON5vBwmE/xS8stVD0D2F4dkMY7a0suX5KVTb736JdR8G59mqBq/dWcpbFTLiCLtxi18LMb/dU1RKRGOEdn3l3j/jKXhBrhIgfg3qtNskPedXDKBvn7JGXiSkqpA/tUtPjvc0Uuk8/LaA01SQTz40Engg7nD8/EJdIAhA=;
+Message-ID: <[email protected]>
+X-YMail-OSG:
KzhhrJYVM1m.MCS6vRpRP2ZZO2PrfnbngosELDCIa91ZqvhJph4RdmzfUW0jw9W04RCSch1K730bPohwNpNBIk2QR_zt4_mfbhfq7YEPkSoz9LSXG90P9vIo5Fc8qyZN0U6vA9gtdyGQTpN5ahvillUH9nAF0TMWv2SvZJLjPlQ0Z0p8oK8ltBwGTgLrM8Jtdn9D29yoRyi3_EpVOfdD9OP.EK50Vr1XwSUYMbnpZ0WGHMwd.Yig7A6Elwadm3YVbfOdx2mfrG.jQsUAxQjRBNvbrOM57.FaE11kHTe9aoBWSeihNg--
+Received: from [216.145.54.7] by web111010.mail.gq1.yahoo.com via HTTP; Sun,
31 May 2009 22:30:49 PDT
+X-Mailer: YahooMailRC/1277.43 YahooMailWebService/0.7.289.10
+References: <c649564f.1435f%[email protected]>
+Date: Sun, 31 May 2009 22:30:49 -0700 (PDT)
+From: Jianmin Woo <[email protected]>
+Subject: Re: question about when shuffle/sort start working
+To: [email protected]
+In-Reply-To: <c649564f.1435f%[email protected]>
+MIME-Version: 1.0
+Content-Type: multipart/alternative; boundary="0-1193839393-1243834249=:35091"
+X-Virus-Checked: Checked by ClamAV on apache.org
+
+--0-1193839393-1243834249=:35091
+Content-Type: text/plain; charset=us-ascii
+
+Thanks a lot for your explanation, Jothi.
+
+So is this event generated by hadoop framework? Is there any API in mapper to
fire this event? Actually, I am thinking to implement a mapper that will emit
some <key, value> pairs, then fire this event to let the reducer works, the
same mapper task then emit some other <key, value> pairs and repeat. Do you
think is this logic feasible by current API?
+
+Thanks,
+Jianmin
+
+
+
+
+
+________________________________
+From: Jothi Padmanabhan <[email protected]>
+To: [email protected]
+Sent: Monday, June 1, 2009 12:26:31 PM
+Subject: Re: question about when shuffle/sort start working
+
+When a Mapper completes, MapCompletionEvents are generated. Reducers try to
+fetch map outputs for a given map only on the receipt of such events.
+
+Jothi
+
+
+On 5/30/09 10:00 AM, "Jianmin Woo" <[email protected]> wrote:
+
+> Hi,
+> I am being confused by the protocol between mapper and reducer. When mapper
+> emitting the (key,value) pair done, is there any signal the mapper send out
to
+> hadoop framework in protocol to indicate that map is done and the
shuffle/sort
+> can begin for reducer? If there is no this signal in protocol, when the
+> framework begin the shuffle/sort?
+>
+> Thanks,
+> Jianmin
+>
+>
+>
+>
+
+
+
+--0-1193839393-1243834249=:35091--
+
+
+From
core-user-return-14702-apmail-hadoop-core-user-archive=hadoop.apache....@hadoop.apache.org
Mon Jun 01 06:04:30 2009
+Return-Path:
<core-user-return-14702-apmail-hadoop-core-user-archive=hadoop.apache....@hadoop.apache.org>
+Delivered-To: [email protected]
+Received: (qmail 53387 invoked from network); 1 Jun 2009 06:04:29 -0000
+Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.3)
+ by minotaur.apache.org with SMTP; 1 Jun 2009 06:04:29 -0000
+Received: (qmail 39066 invoked by uid 500); 1 Jun 2009 06:04:39 -0000
+Delivered-To: [email protected]
+Received: (qmail 38970 invoked by uid 500); 1 Jun 2009 06:04:39 -0000
+Mailing-List: contact [email protected]; run by ezmlm
+Precedence: bulk
+List-Help: <mailto:[email protected]>
+List-Unsubscribe: <mailto:[email protected]>
+List-Post: <mailto:[email protected]>
+List-Id: <core-user.hadoop.apache.org>
+Reply-To: [email protected]
+Delivered-To: mailing list [email protected]
+Received: (qmail 38955 invoked by uid 99); 1 Jun 2009 06:04:39 -0000
+Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136)
+ by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 01 Jun 2009 06:04:39 +0000
+X-ASF-Spam-Status: No, hits=1.2 required=10.0
+ tests=SPF_NEUTRAL
+X-Spam-Check-By: apache.org
+Received-SPF: neutral (athena.apache.org: local policy)
+Received: from [216.145.54.172] (HELO mrout2.yahoo.com) (216.145.54.172)
+ by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 01 Jun 2009 06:04:28 +0000
+Received: from SNV-EXBH01.ds.corp.yahoo.com (snv-exbh01.ds.corp.yahoo.com
[207.126.227.249])
+ by mrout2.yahoo.com (8.13.6/8.13.6/y.out) with ESMTP id n5163FGq038852
+ for <[email protected]>; Sun, 31 May 2009 23:03:15 -0700 (PDT)
+DomainKey-Signature: a=rsa-sha1; s=serpent; d=yahoo-inc.com; c=nofws; q=dns;
+ h=received:user-agent:date:subject:from:to:message-id:
+ thread-topic:thread-index:in-reply-to:mime-version:content-type:
+ content-transfer-encoding:x-originalarrivaltime;
+ b=rChE4SCnwtWaZpjhovkiXDKfDiVNdRRvsadSGG9S9bgvOexn/9/5JjEQx1pOR7Nb
+Received: from SNV-EXVS08.ds.corp.yahoo.com ([207.126.227.9]) by
SNV-EXBH01.ds.corp.yahoo.com with Microsoft SMTPSVC(6.0.3790.3959);
+ Sun, 31 May 2009 23:03:15 -0700
+Received: from 10.66.92.213 ([10.66.92.213]) by SNV-EXVS08.ds.corp.yahoo.com
([207.126.227.58]) with Microsoft Exchange Server HTTP-DAV ;
+ Mon, 1 Jun 2009 06:03:15 +0000
+User-Agent: Microsoft-Entourage/12.17.0.090302
+Date: Mon, 01 Jun 2009 11:33:13 +0530
+Subject: Re: question about when shuffle/sort start working
+From: Jothi Padmanabhan <[email protected]>
+To: <[email protected]>
+Message-ID: <c6496cf9.1437c%[email protected]>
+Thread-Topic: question about when shuffle/sort start working
+Thread-Index: AcnifqWrLG6N7GAk7kqy9QalVWfegQ==
+In-Reply-To: <[email protected]>
+Mime-version: 1.0
+Content-type: text/plain;
+ charset="US-ASCII"
+Content-transfer-encoding: 7bit
+X-OriginalArrivalTime: 01 Jun 2009 06:03:15.0462 (UTC)
FILETIME=[A7231260:01C9E27E]
+X-Virus-Checked: Checked by ClamAV on apache.org
+
+
+No you cannot raise this event yourself, this event is generated internally
+by the framework.
+
+I am guessing that what you probably want is to have a chain of MapReduce
+Jobs where the output of one is automatically fed as input to another. You
+can look at these classes: JobControl and ChainMapper/ChainReducer.
+
+Jothi
+
+On 6/1/09 11:00 AM, "Jianmin Woo" <[email protected]> wrote:
+
+> Thanks a lot for your explanation, Jothi.
+>
+> So is this event generated by hadoop framework? Is there any API in mapper to
+> fire this event? Actually, I am thinking to implement a mapper that will emit
+> some <key, value> pairs, then fire this event to let the reducer works, the
+> same mapper task then emit some other <key, value> pairs and repeat. Do you
+> think is this logic feasible by current API?
+>
+> Thanks,
+> Jianmin
+>
+>
+>
+>
+>
+> ________________________________
+> From: Jothi Padmanabhan <[email protected]>
+> To: [email protected]
+> Sent: Monday, June 1, 2009 12:26:31 PM
+> Subject: Re: question about when shuffle/sort start working
+>
+> When a Mapper completes, MapCompletionEvents are generated. Reducers try to
+> fetch map outputs for a given map only on the receipt of such events.
+>
+> Jothi
+>
+>
+> On 5/30/09 10:00 AM, "Jianmin Woo" <[email protected]> wrote:
+>
+>> Hi,
+>> I am being confused by the protocol between mapper and reducer. When mapper
+>> emitting the (key,value) pair done, is there any signal the mapper send out
+>> to
+>> hadoop framework in protocol to indicate that map is done and the
+>> shuffle/sort
+>> can begin for reducer? If there is no this signal in protocol, when the
+>> framework begin the shuffle/sort?
+>>
+>> Thanks,
+>> Jianmin
+>>
+>>
+>>
+>>
+>
+>
+>
+
+
Added:
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/headers.mbox
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/headers.mbox?rev=820967&view=auto
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/headers.mbox
(added)
+++
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/headers.mbox
Fri Oct 2 11:23:16 2009
@@ -0,0 +1,7 @@
+From envelope-sender-mailbox-name Mon Jun 01 10:00:00 2009
+Return-Path: <[email protected]>
+Subject: subject
+From: <[email protected]>
+Date: Tue, 9 Jun 2009 23:58:45 -0400
+
+Test content
Added:
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/multiline.mbox
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/multiline.mbox?rev=820967&view=auto
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/multiline.mbox
(added)
+++
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/multiline.mbox
Fri Oct 2 11:23:16 2009
@@ -0,0 +1,5 @@
+From envelope-sender-mailbox-name Mon Jun 01 10:00:00 2009
+Received: from xxx
+ by xxx with xxx; date
+
+Test content
Added:
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/quoted.mbox
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/quoted.mbox?rev=820967&view=auto
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/quoted.mbox
(added)
+++
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/quoted.mbox
Fri Oct 2 11:23:16 2009
@@ -0,0 +1,4 @@
+From envelope-sender-mailbox-name Mon Jun 01 10:00:00 2009
+
+Test content
+> quoted stuff
\ No newline at end of file
Added:
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/simple.mbox
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/simple.mbox?rev=820967&view=auto
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/simple.mbox
(added)
+++
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/simple.mbox
Fri Oct 2 11:23:16 2009
@@ -0,0 +1,7 @@
+From envelope-sender-mailbox-name Mon Jun 01 10:00:00 2009
+
+Test content 1
+
+From envelope-sender-mailbox-name Mon Jun 01 11:00:00 2009
+
+Test content 2
\ No newline at end of file