Index: ompi/datatype/datatype_unpack.c
===================================================================
--- ompi/datatype/datatype_unpack.c	(revision 11970)
+++ ompi/datatype/datatype_unpack.c	(working copy)
@@ -196,6 +196,8 @@
     dt_stack_t* stack = &(pConv->pStack[1]);
     long initial_displ = pConv->use_desc->desc[pConv->use_desc->used].end_loop.first_elem_disp;

+    opal_output( 0, "unpack_homogeneous_contig( pBaseBuf %p, iov_count %d )\n",
+                 pConv->pBaseBuf, *out_size );
     for( iov_count = 0; iov_count < (*out_size); iov_count++ ) {
         packed_buffer = (char*)iov[iov_count].iov_base;
         remaining = pConv->local_size - pConv->bConverted;
@@ -204,8 +206,8 @@
         bConverted = remaining; /* how much will get unpacked this time */
         user_memory = pConv->pBaseBuf + initial_displ;

-        /*opal_output( 0, "unpack_homogeneous_contig( user_memory %p, packed_buffer %p length %d\n",
-          user_memory, packed_buffer, remaining );*/
+        opal_output( 0, "unpack_homogeneous_contig( user_memory %p, packed_buffer %p length %d\n",
+                     user_memory, packed_buffer, remaining );

         if( (long)pData->size == extent ) {
             user_memory += pConv->bConverted;
@@ -213,30 +215,34 @@
             /* contiguous data or basic datatype with count */
             OMPI_DDT_SAFEGUARD_POINTER( user_memory, remaining,
                                         pConv->pBaseBuf, pData, pConv->count );
-            /*opal_output( 0, "1. unpack contig dest %p src %p length %d\n",
-              user_memory, packed_buffer, remaining );*/
+            opal_output( 0, "1. unpack contig dest %p src %p length %d\n",
+                         user_memory, packed_buffer, remaining );
             MEMCPY_CSUM( user_memory, packed_buffer, remaining, pConv );
         } else {
             user_memory += stack->disp;

             length = pConv->bConverted / pData->size;  /* already done */
-            length = pConv->bConverted - length * pData->size;  /* still left on the last element */
+            length = pConv->bConverted - length * pData->size;  /* bytes of the last (partial) element already unpacked */
+
             /* complete the last copy */
             if( length != 0 ) {
-                OMPI_DDT_SAFEGUARD_POINTER( user_memory, length, pConv->pBaseBuf,
-                                            pData, pConv->count );
-                /*opal_output( 0, "1. unpack dest %p src %p length %d\n",
-                  user_memory, packed_buffer, length );*/
-                MEMCPY_CSUM( user_memory, packed_buffer, length, pConv );
-                packed_buffer += length;
-                user_memory   += (extent - (pData->size - length));
-                remaining     -= length;
+                length = pData->size - length;
+                if( length <= remaining ) {
+                    OMPI_DDT_SAFEGUARD_POINTER( user_memory, length, pConv->pBaseBuf,
+                                                pData, pConv->count );
+                    opal_output( 0, "1. unpack dest %p src %p length %d\n",
+                                 user_memory, packed_buffer, length );
+                    MEMCPY_CSUM( user_memory, packed_buffer, length, pConv );
+                    packed_buffer += length;
+                    user_memory   += (extent - (pData->size - length));
+                    remaining     -= length;
+                }
             }
             for( i = 0; pData->size <= remaining; i++ ) {
                 OMPI_DDT_SAFEGUARD_POINTER( user_memory, pData->size, pConv->pBaseBuf,
                                             pData, pConv->count );
-                /*opal_output( 0, "2. unpack dest %p src %p length %d\n",
-                  user_memory, packed_buffer, pData->size );*/
+                opal_output( 0, "2. unpack dest %p src %p length %d\n",
+                             user_memory, packed_buffer, pData->size );
                 MEMCPY_CSUM( user_memory, packed_buffer, pData->size, pConv );
                 packed_buffer += pData->size;
                 user_memory   += extent;
@@ -246,8 +252,8 @@
             if( remaining != 0 ) {
                 OMPI_DDT_SAFEGUARD_POINTER( user_memory, remaining, pConv->pBaseBuf,
                                             pData, pConv->count );
-                /*opal_output( 0, "3. unpack dest %p src %p length %d\n",
-                  user_memory, packed_buffer, remaining );*/
+                opal_output( 0, "3. unpack dest %p src %p length %d\n",
+                             user_memory, packed_buffer, remaining );
                 MEMCPY_CSUM( user_memory, packed_buffer, remaining, pConv );
                 user_memory += remaining;
             }
Index: ompi/datatype/convertor.c
===================================================================
--- ompi/datatype/convertor.c	(revision 11970)
+++ ompi/datatype/convertor.c	(working copy)
@@ -234,34 +234,42 @@
          * use the bConverted to manage the conversion.
          */
         uint32_t i;
-        size_t next_length;
         char* base_pointer;

+        /*opal_output( 0, "ompi_convertor_unpack at %p max_data %ld bConverted %ld size %ld count %d\n",
+          pConv->pBaseBuf, (long)*max_data, (long)pConv->bConverted,
+          (long)pConv->local_size, pConv->count );
+          ompi_ddt_dump( pConv->pDesc );*/
         *max_data = pConv->bConverted;
+        base_pointer = pConv->pBaseBuf + pConv->bConverted + 
+            pConv->use_desc->desc[pConv->use_desc->used].end_loop.first_elem_disp;
         for( i = 0; i < *out_size; i++ ) {
-            base_pointer = pConv->pBaseBuf + pConv->bConverted + pConv->pDesc->true_lb;
-            next_length = pConv->bConverted + iov[i].iov_len;
-            if( next_length >= pConv->local_size ) {
-                pConv->bConverted = pConv->local_size;
-                iov[i].iov_len -= (next_length - pConv->local_size);
-                MEMCPY( base_pointer, iov[i].iov_base, iov[i].iov_len );
-                /*opal_output( 0, "copy at %p %d bytes [initial ptr %p] *last*\n", base_pointer,
-                  iov[i].iov_len, pConv->pBaseBuf );*/
+            if( (pConv->bConverted + iov[i].iov_len) >= pConv->local_size ) {
                 goto predefined_data_unpack;
             }
             MEMCPY( base_pointer, iov[i].iov_base, iov[i].iov_len );
             /*opal_output( 0, "copy at %p %d bytes [initial ptr %p]\n", base_pointer,
               iov[i].iov_len, pConv->pBaseBuf );*/
-            pConv->bConverted = next_length;
+            pConv->bConverted += iov[i].iov_len;
+            base_pointer += iov[i].iov_len;
         }
         *max_data = pConv->bConverted - (*max_data);
         return 0;
     predefined_data_unpack:
+        iov[i].iov_len = pConv->local_size - pConv->bConverted;
+        MEMCPY( base_pointer, iov[i].iov_base, iov[i].iov_len );
+        /*opal_output( 0, "copy at %p %d bytes [initial ptr %p] *last*\n", base_pointer,
+          iov[i].iov_len, pConv->pBaseBuf );*/
+        pConv->bConverted = pConv->local_size;
         *out_size = i + 1;
         *max_data = pConv->bConverted - (*max_data);
         pConv->flags |= CONVERTOR_COMPLETED;
         return 1;
     }
+    /*opal_output( 0, "ompi_convertor_unpack generic at %p max_data %ld bConverted %ld size %ld count %d\n",
+      pConv->pBaseBuf, (long)*max_data, (long)pConv->bConverted,
+      (long)pConv->local_size, pConv->count );
+      ompi_ddt_dump( pConv->pDesc );*/

     return pConv->fAdvance( pConv, iov, out_size, max_data, freeAfter );
 }
@@ -469,7 +477,6 @@
         } else
 #endif
         if( convertor->pDesc->flags & DT_FLAG_CONTIGUOUS ) {
-            assert( convertor->flags & DT_FLAG_CONTIGUOUS );
             convertor->fAdvance = ompi_unpack_homogeneous_contig_checksum;
         } else {
             convertor->fAdvance = ompi_generic_simple_unpack_checksum;
@@ -481,7 +488,6 @@
         } else
 #endif
         if( convertor->pDesc->flags & DT_FLAG_CONTIGUOUS ) {
-            assert( convertor->flags & DT_FLAG_CONTIGUOUS );
             convertor->fAdvance = ompi_unpack_homogeneous_contig;
         } else {
             convertor->fAdvance = ompi_generic_simple_unpack;
Index: ompi/datatype/dt_optimize.c
===================================================================
--- ompi/datatype/dt_optimize.c	(revision 11970)
+++ ompi/datatype/dt_optimize.c	(working copy)
@@ -190,9 +190,11 @@
                         changes++; optimized++;
                         goto complete_loop;
                     } else if( loop->loops < 3 ) {
+                        long elem_displ = elem->disp;
                         for( i = 0; i < loop->loops; i++ ) {
                             CREATE_ELEM( pElemDesc, elem->common.type, elem->common.flags,
-                                         elem->count, elem->disp, loop->extent );
+                                         elem->count, elem_displ, elem->extent );
+                            elem_displ += loop->extent;
                             pElemDesc++; nbElems++;
                         }
                         pos_desc += loop->items + 1;
