Author: jukka
Date: Fri Oct  2 11:23:16 2009
New Revision: 820967

URL: http://svn.apache.org/viewvc?rev=820967&view=rev
Log:
TIKA-295: Rough cut of mbox parser

Patch by Ken Krugler

Added:
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/
    
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
    lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mbox/
    
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
    
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/complex.mbox
    
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/headers.mbox
    
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/multiline.mbox
    lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/quoted.mbox
    lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/simple.mbox
Modified:
    
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
    
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/tika-config.xml
    lucene/tika/trunk/tika-parsers/pom.xml

Modified: 
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=820967&r1=820966&r2=820967&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
 (original)
+++ 
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
 Fri Oct  2 11:23:16 2009
@@ -179,6 +179,7 @@
   <mime-type type="application/mbms-register-response+xml"/>
   <mime-type type="application/mbms-user-service-description+xml"/>
   <mime-type type="application/mbox">
+    <sub-class-of type="text/plain"/>
     <glob pattern="*.mbox"/>
   </mime-type>
   <mime-type type="application/media_control+xml"/>

Modified: 
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/tika-config.xml
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/tika-config.xml?rev=820967&r1=820966&r2=820967&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/tika-config.xml 
(original)
+++ 
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/tika-config.xml 
Fri Oct  2 11:23:16 2009
@@ -160,6 +160,10 @@
                 <mime>audio/x-aiff</mime>
         </parser>
 
+        <parser name="parse-mbox" class="org.apache.tika.parser.mbox.MboxParser">
+                <mime>application/mbox</mime>
+        </parser>
+
     </parsers>
 
 </properties>
\ No newline at end of file

Modified: lucene/tika/trunk/tika-parsers/pom.xml
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/pom.xml?rev=820967&r1=820966&r2=820967&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/pom.xml (original)
+++ lucene/tika/trunk/tika-parsers/pom.xml Fri Oct  2 11:23:16 2009
@@ -111,6 +111,12 @@
       <artifactId>junit</artifactId>
       <scope>test</scope>
     </dependency>
+    <dependency>
+       <groupId>org.mockito</groupId>
+       <artifactId>mockito-core</artifactId>
+       <version>1.7</version>
+       <scope>test</scope>
+    </dependency>
   </dependencies>
 
   <build>

Added: 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java?rev=820967&view=auto
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
 (added)
+++ 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
 Fri Oct  2 11:23:16 2009
@@ -0,0 +1,215 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mbox;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.UnsupportedEncodingException;
+import java.util.Collections;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.log4j.Logger;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Mbox (mailbox) parser. This version returns the headers for the first email
+ * via metadata, which means headers from subsequent emails will be lost.
+ */
+public class MboxParser implements Parser {
+    private static final Logger LOGGER = Logger.getLogger(MboxParser.class);
+
+    public static final String MBOX_MIME_TYPE = "application/mbox";
+    public static final String MBOX_RECORD_DIVIDER = "From ";
+    private static final Pattern EMAIL_HEADER_PATTERN = Pattern.compile("([^ ]+):[ \t]*(.*)");
+
+    private static final String EMAIL_HEADER_METADATA_PREFIX = MboxParser.class.getSimpleName() + "-";
+    private static final String EMAIL_FROMLINE_METADATA = EMAIL_HEADER_METADATA_PREFIX + "from";
+
+    private enum ParseStates {
+        START, IN_HEADER, IN_CONTENT
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, Map<String, Object> context)
+            throws IOException, TikaException, SAXException {
+
+        InputStreamReader isr;
+        try {
+            // Headers are going to be 7-bit ascii
+            isr = new InputStreamReader(stream, "us-ascii");
+        } catch (UnsupportedEncodingException e) {
+            LOGGER.error("Unexpected exception setting up MboxParser", e);
+            isr = new InputStreamReader(stream);
+        }
+
+        BufferedReader reader = new BufferedReader(isr);
+
+        metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE);
+        metadata.set(Metadata.CONTENT_ENCODING, "us-ascii");
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        ParseStates parseState = ParseStates.START;
+        String multiLine = null;
+        boolean inQuote = false;
+        int numEmails = 0;
+
+        // We're going to scan, line-by-line, for a line that starts with
+        // "From "
+        for (String curLine = reader.readLine(); curLine != null; curLine = reader.readLine()) {
+            boolean newMessage = curLine.startsWith(MBOX_RECORD_DIVIDER);
+            if (newMessage) {
+                numEmails += 1;
+            }
+
+            switch (parseState) {
+            case START:
+                if (newMessage) {
+                    parseState = ParseStates.IN_HEADER;
+                    newMessage = false;
+                    // Fall through to IN_HEADER
+                } else {
+                    break;
+                }
+
+            case IN_HEADER:
+                if (newMessage) {
+                    saveHeaderInMetadata(numEmails, metadata, multiLine);
+                    multiLine = curLine;
+                } else if (curLine.length() == 0) {
+                    // Blank line is signal that we're transitioning to the content.
+                    saveHeaderInMetadata(numEmails, metadata, multiLine);
+                    parseState = ParseStates.IN_CONTENT;
+
+                    // Mimic what PackageParser does between entries.
+                    xhtml.startElement("div", "class", "email-entry");
+                    xhtml.startElement("p");
+                    inQuote = false;
+                } else if (curLine.startsWith(" ") || curLine.startsWith("\t")) {
+                    multiLine += " " + curLine.trim();
+                } else {
+                    saveHeaderInMetadata(numEmails, metadata, multiLine);
+                    multiLine = curLine;
+                }
+
+                break;
+
+                // TODO - use real email parsing support so we can correctly handle
+                // things like multipart messages and quoted-printable encoding.
+                // We'd also want this for charset handling, where content isn't 7-bit
+                // ascii.
+            case IN_CONTENT:
+                if (newMessage) {
+                    endMessage(xhtml, inQuote);
+                    parseState = ParseStates.IN_HEADER;
+                    multiLine = curLine;
+                } else {
+                    boolean quoted = curLine.startsWith(">");
+                    if (inQuote) {
+                        if (!quoted) {
+                            xhtml.endElement("q");
+                            inQuote = false;
+                        }
+                    } else if (quoted) {
+                        xhtml.startElement("q");
+                        inQuote = true;
+                    }
+
+                    xhtml.characters(curLine);
+
+                    // For plain text email, each line is a real break position.
+                    xhtml.element("br", "");
+                }
+            }
+        }
+
+        if (parseState == ParseStates.IN_HEADER) {
+            saveHeaderInMetadata(numEmails, metadata, multiLine);
+        } else if (parseState == ParseStates.IN_CONTENT) {
+            endMessage(xhtml, inQuote);
+        }
+
+        xhtml.endDocument();
+    }
+
+    private void endMessage(XHTMLContentHandler xhtml, boolean inQuote) throws SAXException {
+        if (inQuote) {
+            xhtml.endElement("q");
+        }
+
+        xhtml.endElement("p");
+        xhtml.endElement("div");
+    }
+
+    private void saveHeaderInMetadata(int numEmails, Metadata metadata, String curLine) {
+        if ((curLine == null) || (numEmails > 1)) {
+            return;
+        } else if (curLine.startsWith(MBOX_RECORD_DIVIDER)) {
+            metadata.add(EMAIL_FROMLINE_METADATA, curLine.substring(MBOX_RECORD_DIVIDER.length()));
+            return;
+        }
+
+        Matcher headerMatcher = EMAIL_HEADER_PATTERN.matcher(curLine);
+        if (!headerMatcher.matches()) {
+            LOGGER.warn("Malformed email header in mbox file: " + curLine);
+            return;
+        }
+
+        String headerTag = headerMatcher.group(1).toLowerCase();
+        String headerContent = headerMatcher.group(2);
+
+        if (headerTag.equalsIgnoreCase("From")) {
+            metadata.add(Metadata.AUTHOR, headerContent);
+            metadata.add(Metadata.CREATOR, headerContent);
+        } else if (headerTag.equalsIgnoreCase("Subject")) {
+            metadata.add(Metadata.SUBJECT, headerContent);
+            metadata.add(Metadata.TITLE, headerContent);
+        } else if (headerTag.equalsIgnoreCase("Date")) {
+            // TODO - parse and convert to ISO format YYYY-MM-DD
+            metadata.add(Metadata.DATE, headerContent);
+        } else if (headerTag.equalsIgnoreCase("Message-Id")) {
+            metadata.add(Metadata.IDENTIFIER, headerContent);
+        } else if (headerTag.equalsIgnoreCase("In-Reply-To")) {
+            metadata.add(Metadata.RELATION, headerContent);
+        } else if (headerTag.equalsIgnoreCase("Content-Type")) {
+            // TODO - key off content-type in headers to
+            // set mapping to use for content and convert if necessary.
+
+            metadata.add(Metadata.CONTENT_TYPE, headerContent);
+            metadata.add(Metadata.FORMAT, headerContent);
+        } else {
+            metadata.add(EMAIL_HEADER_METADATA_PREFIX + headerTag, headerContent);
+        }
+    }
+
+    @SuppressWarnings("unchecked")
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata) throws IOException, SAXException, TikaException {
+        parse(stream, handler, metadata, Collections.EMPTY_MAP);
+    }
+
+}

Added: 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java?rev=820967&view=auto
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
 (added)
+++ 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
 Fri Oct  2 11:23:16 2009
@@ -0,0 +1,158 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mbox;
+
+import static org.mockito.Matchers.any;
+import static org.mockito.Matchers.eq;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.times;
+
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class MboxParserTest extends TestCase {
+
+    public void testSimple() {
+        Parser parser = new MboxParser();
+        Metadata metadata = new Metadata();
+        InputStream stream = getStream("test-documents/simple.mbox");
+        ContentHandler handler = mock(DefaultHandler.class);
+        Map<String, Object> context = new HashMap<String, Object>();
+
+        try {
+            parser.parse(stream, handler, metadata, context);
+            verify(handler).startDocument();
+            verify(handler, 
times(2)).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), 
any(Attributes.class));
+            verify(handler, times(2)).endElement(XHTMLContentHandler.XHTML, 
"p", "p");
+            verify(handler).characters(new String("Test content 1").toCharArray(), 0, 14);
+            verify(handler).characters(new String("Test content 2").toCharArray(), 0, 14);
+            verify(handler).endDocument();
+        } catch (Exception e) {
+            fail("Exception thrown: " + e.getMessage());
+        }
+    }
+
+    public void testHeaders() {
+        Parser parser = new MboxParser();
+        Metadata metadata = new Metadata();
+        InputStream stream = getStream("test-documents/headers.mbox");
+        ContentHandler handler = mock(DefaultHandler.class);
+        Map<String, Object> context = new HashMap<String, Object>();
+
+        try {
+            parser.parse(stream, handler, metadata, context);
+
+            verify(handler).startDocument();
+            verify(handler).startElement(eq(XHTMLContentHandler.XHTML), 
eq("p"), eq("p"), any(Attributes.class));
+            verify(handler).characters(new String("Test content").toCharArray(), 0, 12);
+            verify(handler).endDocument();
+
+            assertEquals("subject", metadata.get(Metadata.TITLE));
+            assertEquals("subject", metadata.get(Metadata.SUBJECT));
+            assertEquals("<[email protected]>", metadata.get(Metadata.AUTHOR));
+            assertEquals("<[email protected]>", 
metadata.get(Metadata.CREATOR));
+            assertEquals("<[email protected]>", 
metadata.get("MboxParser-return-path"));
+            assertEquals("Tue, 9 Jun 2009 23:58:45 -0400", 
metadata.get(Metadata.DATE));
+        } catch (Exception e) {
+            fail("Exception thrown: " + e.getMessage());
+        }
+    }
+
+    public void testMultilineHeader() {
+        Parser parser = new MboxParser();
+        Metadata metadata = new Metadata();
+        InputStream stream = getStream("test-documents/multiline.mbox");
+        ContentHandler handler = mock(DefaultHandler.class);
+        Map<String, Object> context = new HashMap<String, Object>();
+
+        try {
+            parser.parse(stream, handler, metadata, context);
+
+            verify(handler).startDocument();
+            verify(handler).startElement(eq(XHTMLContentHandler.XHTML), 
eq("p"), eq("p"), any(Attributes.class));
+            verify(handler).characters(new String("Test content").toCharArray(), 0, 12);
+            verify(handler).endDocument();
+
+            assertEquals("from xxx by xxx with xxx; date", 
metadata.get("MboxParser-received"));
+        } catch (Exception e) {
+            fail("Exception thrown: " + e.getMessage());
+        }
+    }
+
+    public void testQuoted() {
+        Parser parser = new MboxParser();
+        Metadata metadata = new Metadata();
+        InputStream stream = getStream("test-documents/quoted.mbox");
+        ContentHandler handler = mock(DefaultHandler.class);
+        Map<String, Object> context = new HashMap<String, Object>();
+
+        try {
+            parser.parse(stream, handler, metadata, context);
+
+            verify(handler).startDocument();
+            verify(handler).startElement(eq(XHTMLContentHandler.XHTML), 
eq("p"), eq("p"), any(Attributes.class));
+            verify(handler).startElement(eq(XHTMLContentHandler.XHTML), 
eq("q"), eq("q"), any(Attributes.class));
+            verify(handler).endElement(eq(XHTMLContentHandler.XHTML), eq("q"), 
eq("q"));
+            verify(handler).endElement(eq(XHTMLContentHandler.XHTML), eq("p"), 
eq("p"));
+            verify(handler).characters(new String("Test content").toCharArray(), 0, 12);
+            verify(handler).characters(new String("> quoted stuff").toCharArray(), 0, 14);
+            verify(handler).endDocument();
+        } catch (Exception e) {
+            fail("Exception thrown: " + e.getMessage());
+        }
+    }
+
+    public void testComplex() {
+        Parser parser = new MboxParser();
+        Metadata metadata = new Metadata();
+        InputStream stream = getStream("test-documents/complex.mbox");
+        ContentHandler handler = mock(DefaultHandler.class);
+        Map<String, Object> context = new HashMap<String, Object>();
+
+        try {
+            parser.parse(stream, handler, metadata, context);
+
+            verify(handler).startDocument();
+            verify(handler, 
times(3)).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), 
any(Attributes.class));
+            verify(handler, 
times(3)).endElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"));
+            verify(handler, 
times(3)).startElement(eq(XHTMLContentHandler.XHTML), eq("q"), eq("q"), 
any(Attributes.class));
+            verify(handler, 
times(3)).endElement(eq(XHTMLContentHandler.XHTML), eq("q"), eq("q"));
+            verify(handler).endDocument();
+        } catch (Exception e) {
+            fail("Exception thrown: " + e.getMessage());
+        }
+    }
+
+    private static InputStream getStream(String name) {
+        return Thread.currentThread().getContextClassLoader()
+        .getResourceAsStream(name);
+    }
+
+
+}

Added: 
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/complex.mbox
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/complex.mbox?rev=820967&view=auto
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/complex.mbox 
(added)
+++ 
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/complex.mbox 
Fri Oct  2 11:23:16 2009
@@ -0,0 +1,291 @@
+From 
core-user-return-14700-apmail-hadoop-core-user-archive=hadoop.apache....@hadoop.apache.org
 Mon Jun 01 04:28:28 2009
+Return-Path: 
<core-user-return-14700-apmail-hadoop-core-user-archive=hadoop.apache....@hadoop.apache.org>
+Delivered-To: [email protected]
+Received: (qmail 19921 invoked from network); 1 Jun 2009 04:28:28 -0000
+Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.3)
+  by minotaur.apache.org with SMTP; 1 Jun 2009 04:28:28 -0000
+Received: (qmail 84995 invoked by uid 500); 1 Jun 2009 04:28:38 -0000
+Delivered-To: [email protected]
+Received: (qmail 84895 invoked by uid 500); 1 Jun 2009 04:28:38 -0000
+Mailing-List: contact [email protected]; run by ezmlm
+Precedence: bulk
+List-Help: <mailto:[email protected]>
+List-Unsubscribe: <mailto:[email protected]>
+List-Post: <mailto:[email protected]>
+List-Id: <core-user.hadoop.apache.org>
+Reply-To: [email protected]
+Delivered-To: mailing list [email protected]
+Received: (qmail 84885 invoked by uid 99); 1 Jun 2009 04:28:38 -0000
+Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136)
+    by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 01 Jun 2009 04:28:38 +0000
+X-ASF-Spam-Status: No, hits=1.2 required=10.0
+       tests=SPF_NEUTRAL
+X-Spam-Check-By: apache.org
+Received-SPF: neutral (athena.apache.org: local policy)
+Received: from [69.147.107.21] (HELO mrout2-b.corp.re1.yahoo.com) 
(69.147.107.21)
+    by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 01 Jun 2009 04:28:26 +0000
+Received: from SNV-EXPF01.ds.corp.yahoo.com (snv-expf01.ds.corp.yahoo.com 
[207.126.227.250])
+       by mrout2-b.corp.re1.yahoo.com (8.13.8/8.13.8/y.out) with ESMTP id 
n514QYA6099963
+       for <[email protected]>; Sun, 31 May 2009 21:26:35 -0700 (PDT)
+DomainKey-Signature: a=rsa-sha1; s=serpent; d=yahoo-inc.com; c=nofws; q=dns;
+       h=received:user-agent:date:subject:from:to:message-id:
+       thread-topic:thread-index:in-reply-to:mime-version:content-type:
+       content-transfer-encoding:x-originalarrivaltime;
+       b=YVtSNdgjeeSBS1yY3XDolul49i+HrgNG7QszMo9LzGnrwejjgsl5+iUM6EiQgEpV
+Received: from SNV-EXVS08.ds.corp.yahoo.com ([207.126.227.9]) by 
SNV-EXPF01.ds.corp.yahoo.com with Microsoft SMTPSVC(6.0.3790.3959);
+        Sun, 31 May 2009 21:26:34 -0700
+Received: from 10.66.92.213 ([10.66.92.213]) by SNV-EXVS08.ds.corp.yahoo.com 
([207.126.227.58]) with Microsoft Exchange Server HTTP-DAV ;
+ Mon,  1 Jun 2009 04:26:33 +0000
+User-Agent: Microsoft-Entourage/12.17.0.090302
+Date: Mon, 01 Jun 2009 09:56:31 +0530
+Subject: Re: question about when shuffle/sort start working
+From: Jothi Padmanabhan <[email protected]>
+To: <[email protected]>
+Message-ID: <c649564f.1435f%[email protected]>
+Thread-Topic: question about when shuffle/sort start working
+Thread-Index: AcnicSNoBw19cMU8UEaXwAdZ1YYhuw==
+In-Reply-To: <[email protected]>
+Mime-version: 1.0
+Content-type: text/plain;
+       charset="US-ASCII"
+Content-transfer-encoding: 7bit
+X-OriginalArrivalTime: 01 Jun 2009 04:26:34.0501 (UTC) 
FILETIME=[257EAB50:01C9E271]
+X-Virus-Checked: Checked by ClamAV on apache.org
+
+When a Mapper completes, MapCompletionEvents are generated. Reducers try to
+fetch map outputs for a given map only on the receipt of such events.
+
+Jothi
+
+
+On 5/30/09 10:00 AM, "Jianmin Woo" <[email protected]> wrote:
+
+> Hi, 
+> I am being confused by the protocol between mapper and reducer. When mapper
+> emitting the (key,value) pair done, is there any signal the mapper send out 
to
+> hadoop framework in protocol to indicate that map is done and the 
shuffle/sort
+> can begin for reducer? If there is no this signal in protocol, when the
+> framework begin the shuffle/sort?
+> 
+> Thanks,
+> Jianmin
+> 
+> 
+> 
+>       
+
+
+From 
core-user-return-14701-apmail-hadoop-core-user-archive=hadoop.apache....@hadoop.apache.org
 Mon Jun 01 05:31:14 2009
+Return-Path: 
<core-user-return-14701-apmail-hadoop-core-user-archive=hadoop.apache....@hadoop.apache.org>
+Delivered-To: [email protected]
+Received: (qmail 38243 invoked from network); 1 Jun 2009 05:31:14 -0000
+Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.3)
+  by minotaur.apache.org with SMTP; 1 Jun 2009 05:31:14 -0000
+Received: (qmail 15621 invoked by uid 500); 1 Jun 2009 05:31:24 -0000
+Delivered-To: [email protected]
+Received: (qmail 15557 invoked by uid 500); 1 Jun 2009 05:31:24 -0000
+Mailing-List: contact [email protected]; run by ezmlm
+Precedence: bulk
+List-Help: <mailto:[email protected]>
+List-Unsubscribe: <mailto:[email protected]>
+List-Post: <mailto:[email protected]>
+List-Id: <core-user.hadoop.apache.org>
+Reply-To: [email protected]
+Delivered-To: mailing list [email protected]
+Received: (qmail 15547 invoked by uid 99); 1 Jun 2009 05:31:24 -0000
+Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230)
+    by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 01 Jun 2009 05:31:24 +0000
+X-ASF-Spam-Status: No, hits=2.2 required=10.0
+       tests=HTML_MESSAGE,SPF_PASS
+X-Spam-Check-By: apache.org
+Received-SPF: pass (nike.apache.org: local policy)
+Received: from [68.142.237.94] (HELO n9.bullet.re3.yahoo.com) (68.142.237.94)
+    by apache.org (qpsmtpd/0.29) with SMTP; Mon, 01 Jun 2009 05:31:11 +0000
+Received: from [68.142.237.88] by n9.bullet.re3.yahoo.com with NNFMP; 01 Jun 
2009 05:30:50 -0000
+Received: from [67.195.9.82] by t4.bullet.re3.yahoo.com with NNFMP; 01 Jun 
2009 05:30:49 -0000
+Received: from [67.195.9.99] by t2.bullet.mail.gq1.yahoo.com with NNFMP; 01 
Jun 2009 05:30:49 -0000
+Received: from [127.0.0.1] by omp103.mail.gq1.yahoo.com with NNFMP; 01 Jun 
2009 05:28:01 -0000
+X-Yahoo-Newman-Property: ymail-3
+X-Yahoo-Newman-Id: [email protected]
+Received: (qmail 35264 invoked by uid 60001); 1 Jun 2009 05:30:49 -0000
+DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=yahoo.com; s=s1024; 
t=1243834249; bh=R8qzdi/IbLyO8UwpnaujDpT9E+6bJ7nkmZN2803EmRk=; 
h=Message-ID:X-YMail-OSG:Received:X-Mailer:References:Date:From:Subject:To:In-Reply-To:MIME-Version:Content-Type;
 
b=vq4c6RIDbkuLPYd8mirusIXf6DqTb/IeT55In7W00Y5Sxx1ZiXBb78yE9+TDfXJ0elsEZvqv4ocyvolGE0eGtyYeJA0mZikpRNu6pidxPNpCplOcLHBRz7YQ7iERwv3TagRlWy2Xd3oD9ZeV0A05P7WUOiNNX1PUUJD1IVdrEZo=
+DomainKey-Signature:a=rsa-sha1; q=dns; c=nofws;
+  s=s1024; d=yahoo.com;
+  
h=Message-ID:X-YMail-OSG:Received:X-Mailer:References:Date:From:Subject:To:In-Reply-To:MIME-Version:Content-Type;
+  
b=6HXZV98ON5vBwmE/xS8stVD0D2F4dkMY7a0suX5KVTb736JdR8G59mqBq/dWcpbFTLiCLtxi18LMb/dU1RKRGOEdn3l3j/jKXhBrhIgfg3qtNskPedXDKBvn7JGXiSkqpA/tUtPjvc0Uuk8/LaA01SQTz40Engg7nD8/EJdIAhA=;
+Message-ID: <[email protected]>
+X-YMail-OSG: 
KzhhrJYVM1m.MCS6vRpRP2ZZO2PrfnbngosELDCIa91ZqvhJph4RdmzfUW0jw9W04RCSch1K730bPohwNpNBIk2QR_zt4_mfbhfq7YEPkSoz9LSXG90P9vIo5Fc8qyZN0U6vA9gtdyGQTpN5ahvillUH9nAF0TMWv2SvZJLjPlQ0Z0p8oK8ltBwGTgLrM8Jtdn9D29yoRyi3_EpVOfdD9OP.EK50Vr1XwSUYMbnpZ0WGHMwd.Yig7A6Elwadm3YVbfOdx2mfrG.jQsUAxQjRBNvbrOM57.FaE11kHTe9aoBWSeihNg--
+Received: from [216.145.54.7] by web111010.mail.gq1.yahoo.com via HTTP; Sun, 
31 May 2009 22:30:49 PDT
+X-Mailer: YahooMailRC/1277.43 YahooMailWebService/0.7.289.10
+References: <c649564f.1435f%[email protected]>
+Date: Sun, 31 May 2009 22:30:49 -0700 (PDT)
+From: Jianmin Woo <[email protected]>
+Subject: Re: question about when shuffle/sort start working
+To: [email protected]
+In-Reply-To: <c649564f.1435f%[email protected]>
+MIME-Version: 1.0
+Content-Type: multipart/alternative; boundary="0-1193839393-1243834249=:35091"
+X-Virus-Checked: Checked by ClamAV on apache.org
+
+--0-1193839393-1243834249=:35091
+Content-Type: text/plain; charset=us-ascii
+
+Thanks a lot for your explanation, Jothi. 
+
+So is this event generated by hadoop framework? Is there any API in mapper to 
fire this event? Actually, I am thinking to implement a mapper that will emit 
some <key, value> pairs, then fire this event to let the reducer works, the 
same mapper task then emit some other <key, value> pairs and repeat. Do you 
think is this logic feasible by current API?
+
+Thanks,
+Jianmin
+
+
+
+
+
+________________________________
+From: Jothi Padmanabhan <[email protected]>
+To: [email protected]
+Sent: Monday, June 1, 2009 12:26:31 PM
+Subject: Re: question about when shuffle/sort start working
+
+When a Mapper completes, MapCompletionEvents are generated. Reducers try to
+fetch map outputs for a given map only on the receipt of such events.
+
+Jothi
+
+
+On 5/30/09 10:00 AM, "Jianmin Woo" <[email protected]> wrote:
+
+> Hi, 
+> I am being confused by the protocol between mapper and reducer. When mapper
+> emitting the (key,value) pair done, is there any signal the mapper send out 
to
+> hadoop framework in protocol to indicate that map is done and the 
shuffle/sort
+> can begin for reducer? If there is no this signal in protocol, when the
+> framework begin the shuffle/sort?
+> 
+> Thanks,
+> Jianmin
+> 
+> 
+> 
+>      
+
+
+      
+--0-1193839393-1243834249=:35091--
+
+
+From 
core-user-return-14702-apmail-hadoop-core-user-archive=hadoop.apache....@hadoop.apache.org
 Mon Jun 01 06:04:30 2009
+Return-Path: 
<core-user-return-14702-apmail-hadoop-core-user-archive=hadoop.apache....@hadoop.apache.org>
+Delivered-To: [email protected]
+Received: (qmail 53387 invoked from network); 1 Jun 2009 06:04:29 -0000
+Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.3)
+  by minotaur.apache.org with SMTP; 1 Jun 2009 06:04:29 -0000
+Received: (qmail 39066 invoked by uid 500); 1 Jun 2009 06:04:39 -0000
+Delivered-To: [email protected]
+Received: (qmail 38970 invoked by uid 500); 1 Jun 2009 06:04:39 -0000
+Mailing-List: contact [email protected]; run by ezmlm
+Precedence: bulk
+List-Help: <mailto:[email protected]>
+List-Unsubscribe: <mailto:[email protected]>
+List-Post: <mailto:[email protected]>
+List-Id: <core-user.hadoop.apache.org>
+Reply-To: [email protected]
+Delivered-To: mailing list [email protected]
+Received: (qmail 38955 invoked by uid 99); 1 Jun 2009 06:04:39 -0000
+Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136)
+    by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 01 Jun 2009 06:04:39 +0000
+X-ASF-Spam-Status: No, hits=1.2 required=10.0
+       tests=SPF_NEUTRAL
+X-Spam-Check-By: apache.org
+Received-SPF: neutral (athena.apache.org: local policy)
+Received: from [216.145.54.172] (HELO mrout2.yahoo.com) (216.145.54.172)
+    by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 01 Jun 2009 06:04:28 +0000
+Received: from SNV-EXBH01.ds.corp.yahoo.com (snv-exbh01.ds.corp.yahoo.com 
[207.126.227.249])
+       by mrout2.yahoo.com (8.13.6/8.13.6/y.out) with ESMTP id n5163FGq038852
+       for <[email protected]>; Sun, 31 May 2009 23:03:15 -0700 (PDT)
+DomainKey-Signature: a=rsa-sha1; s=serpent; d=yahoo-inc.com; c=nofws; q=dns;
+       h=received:user-agent:date:subject:from:to:message-id:
+       thread-topic:thread-index:in-reply-to:mime-version:content-type:
+       content-transfer-encoding:x-originalarrivaltime;
+       b=rChE4SCnwtWaZpjhovkiXDKfDiVNdRRvsadSGG9S9bgvOexn/9/5JjEQx1pOR7Nb
+Received: from SNV-EXVS08.ds.corp.yahoo.com ([207.126.227.9]) by 
SNV-EXBH01.ds.corp.yahoo.com with Microsoft SMTPSVC(6.0.3790.3959);
+        Sun, 31 May 2009 23:03:15 -0700
+Received: from 10.66.92.213 ([10.66.92.213]) by SNV-EXVS08.ds.corp.yahoo.com 
([207.126.227.58]) with Microsoft Exchange Server HTTP-DAV ;
+ Mon,  1 Jun 2009 06:03:15 +0000
+User-Agent: Microsoft-Entourage/12.17.0.090302
+Date: Mon, 01 Jun 2009 11:33:13 +0530
+Subject: Re: question about when shuffle/sort start working
+From: Jothi Padmanabhan <[email protected]>
+To: <[email protected]>
+Message-ID: <c6496cf9.1437c%[email protected]>
+Thread-Topic: question about when shuffle/sort start working
+Thread-Index: AcnifqWrLG6N7GAk7kqy9QalVWfegQ==
+In-Reply-To: <[email protected]>
+Mime-version: 1.0
+Content-type: text/plain;
+       charset="US-ASCII"
+Content-transfer-encoding: 7bit
+X-OriginalArrivalTime: 01 Jun 2009 06:03:15.0462 (UTC) 
FILETIME=[A7231260:01C9E27E]
+X-Virus-Checked: Checked by ClamAV on apache.org
+
+
+No you cannot raise this event yourself, this event is generated internally
+by the framework. 
+
+I am guessing that what you probably want is to have a chain of MapReduce
+Jobs where the output of one is automatically fed as input to another.  You
+can look at these classes: JobControl and ChainMapper/ChainReducer.
+
+Jothi
+
+On 6/1/09 11:00 AM, "Jianmin Woo" <[email protected]> wrote:
+
+> Thanks a lot for your explanation, Jothi.
+> 
+> So is this event generated by hadoop framework? Is there any API in mapper to
+> fire this event? Actually, I am thinking to implement a mapper that will emit
+> some <key, value> pairs, then fire this event to let the reducer works, the
+> same mapper task then emit some other <key, value> pairs and repeat. Do you
+> think is this logic feasible by current API?
+> 
+> Thanks,
+> Jianmin
+> 
+> 
+> 
+> 
+> 
+> ________________________________
+> From: Jothi Padmanabhan <[email protected]>
+> To: [email protected]
+> Sent: Monday, June 1, 2009 12:26:31 PM
+> Subject: Re: question about when shuffle/sort start working
+> 
+> When a Mapper completes, MapCompletionEvents are generated. Reducers try to
+> fetch map outputs for a given map only on the receipt of such events.
+> 
+> Jothi
+> 
+> 
+> On 5/30/09 10:00 AM, "Jianmin Woo" <[email protected]> wrote:
+> 
+>> Hi, 
+>> I am being confused by the protocol between mapper and reducer. When mapper
+>> emitting the (key,value) pair done, is there any signal the mapper send out
+>> to
+>> hadoop framework in protocol to indicate that map is done and the
+>> shuffle/sort
+>> can begin for reducer? If there is no this signal in protocol, when the
+>> framework begin the shuffle/sort?
+>> 
+>> Thanks,
+>> Jianmin
+>> 
+>> 
+>> 
+>>      
+> 
+> 
+>       
+
+

Added: 
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/headers.mbox
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/headers.mbox?rev=820967&view=auto
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/headers.mbox 
(added)
+++ 
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/headers.mbox 
Fri Oct  2 11:23:16 2009
@@ -0,0 +1,7 @@
+From envelope-sender-mailbox-name Mon Jun 01 10:00:00 2009
+Return-Path: <[email protected]>
+Subject: subject
+From: <[email protected]>
+Date: Tue, 9 Jun 2009 23:58:45 -0400
+
+Test content

Added: 
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/multiline.mbox
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/multiline.mbox?rev=820967&view=auto
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/multiline.mbox 
(added)
+++ 
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/multiline.mbox 
Fri Oct  2 11:23:16 2009
@@ -0,0 +1,5 @@
+From envelope-sender-mailbox-name Mon Jun 01 10:00:00 2009
+Received: from xxx
+    by xxx with xxx; date
+
+Test content

Added: 
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/quoted.mbox
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/quoted.mbox?rev=820967&view=auto
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/quoted.mbox 
(added)
+++ 
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/quoted.mbox 
Fri Oct  2 11:23:16 2009
@@ -0,0 +1,4 @@
+From envelope-sender-mailbox-name Mon Jun 01 10:00:00 2009
+
+Test content
+> quoted stuff
\ No newline at end of file

Added: 
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/simple.mbox
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/simple.mbox?rev=820967&view=auto
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/simple.mbox 
(added)
+++ 
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/simple.mbox 
Fri Oct  2 11:23:16 2009
@@ -0,0 +1,7 @@
+From envelope-sender-mailbox-name Mon Jun 01 10:00:00 2009
+
+Test content 1
+
+From envelope-sender-mailbox-name Mon Jun 01 11:00:00 2009
+
+Test content 2
\ No newline at end of file


Reply via email to