All,

I am writing to update you on the HDF5 1.8.6 release.

The release is on hold due to test failures in the parallel HDF5 library. We 
still do not know whether the problem is in HDF5, in the version of the MPI I/O 
library we are using, or in both.

We now have a standalone HDF5 program (attached) that consistently reproduces 
the problem on NCSA's Abe cluster with Lustre and MVAPICH2: the data in the 
created file is incorrect. There is no problem when OpenMPI is used.
 
While we work on the MPI counterpart of the program, we would greatly 
appreciate it if you could test the program with the parallel HDF5 libraries 
installed on your systems and report:
 - testing results
 - version of HDF5 (any of the 1.8.* series will be OK to use)
 - version of the MPI I/O library
 - version of the OS and file system
 - versions of the compilers used to build the MPI I/O library and HDF5

You should be able to compile the program with the h5pcc compiler script 
(usually found in the bin directory under the HDF5 installation directory). 
The program should be run on several nodes with the number of processes 
greater than the number of nodes, for example, 4 nodes and 6 processes.
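
For reference, one possible command sequence is shown below. The source file 
name (hsss.c) and the mpirun launcher are just examples; substitute the name 
you saved the attachment under and whatever MPI launcher or batch commands are 
used on your system:

    h5pcc -o hsss hsss.c
    mpirun -np 6 ./hsss

The program creates and reads back the file hsss.h5 in the current working 
directory, so run it from a directory on the parallel file system you want to 
exercise.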

Thanks a lot for your help!

Elena on behalf of The HDF Team

#include <hdf5.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

int nerrors = 0;

#define VRFY(val, mesg) do {                                            \
    if(!(val)) {                                                        \
        printf("Proc %d: ", mpi_rank);                                  \
        printf("*** PHDF5 ERROR ***\n");                                \
        printf("        Assertion (%s) failed at line %4d in %s\n",     \
               mesg, (int)__LINE__, __FILE__);                          \
        ++nerrors;                                                      \
        fflush(stdout);                                                 \
        printf("aborting MPI process\n");                               \
        MPI_Finalize();                                                 \
        exit(nerrors);                                                  \
    }                                                                   \
} while(0)

#define EDGE_SIZE 10
#define RANK 2
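
/* The dataset written by the test is a RANK-dimensional array of
 * (mpi_size + 1) x EDGE_SIZE 32-bit unsigned integers: each MPI rank writes
 * one row, and rank 0 also writes the extra last row.
 */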

/*-------------------------------------------------------------------------
 * Function:    hyperslab_dr_pio_setup__run_test()
 *
 * Purpose:     Tests the setup code for
 *              contig_hyperslab_dr_pio_test__run_test and
 *              checker_board_hyperslab_dr_pio_test__run_test.
 *
 * Return:      void
 *
 * Programmer:  NAF -- 12/2/09
 *
 * Modifications:
 *
 *-------------------------------------------------------------------------
 */

#define PAR_SS_DR_MAX_RANK      5

static void
hyperslab_dr_pio_setup__run_test(const hid_t dset_type)
{
    hbool_t     mis_match = 0;
    int         i;
    int         mrc;
    int         mpi_size = -1;
    int         mpi_rank = -1;
    const int   test_max_rank = 5;  /* must update code if this changes */
    uint32_t    expected_value;
    uint32_t  * large_ds_buf_0 = NULL;
    uint32_t  * large_ds_buf_1 = NULL;
    uint32_t  * ptr_0;
    uint32_t  * ptr_1;
    MPI_Comm    mpi_comm = MPI_COMM_NULL;
    MPI_Info    mpi_info = MPI_INFO_NULL;
    hid_t       fid;                    /* HDF5 file ID */
    hid_t       acc_tpl;                /* File access templates */
    hid_t       xfer_plist = H5P_DEFAULT;
    hid_t       full_mem_large_ds_sid;
    hid_t       full_file_large_ds_sid;
    hid_t       mem_large_ds_sid;
    hid_t       file_large_ds_sid;
    hid_t       large_ds_dcpl_id = H5P_DEFAULT;
    hid_t       large_dataset;     /* Dataset ID                   */
    size_t      large_ds_size = 1;
    size_t      buf_size = (size_t)EDGE_SIZE;
    hsize_t     dims[PAR_SS_DR_MAX_RANK];
    hsize_t     start[PAR_SS_DR_MAX_RANK];
    hsize_t     stride[PAR_SS_DR_MAX_RANK];
    hsize_t     count[PAR_SS_DR_MAX_RANK];
    hsize_t     block[PAR_SS_DR_MAX_RANK];
    herr_t      ret;            /* Generic return value */

    MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);

    assert( mpi_size >= 1 );

    mpi_comm = MPI_COMM_WORLD;
    mpi_info = MPI_INFO_NULL;

    for ( i = 0; i < RANK - 1; i++ ) {

        large_ds_size *= (size_t)EDGE_SIZE;
    }
    large_ds_size *= (size_t)(mpi_size + 1);

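    /* rank 0 also writes the extra last row of the dataset, so its write
     * buffer holds two rows; every other rank writes a single row
     */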
    if(mpi_rank == 0)
        buf_size *= 2;

    large_ds_buf_0 = (uint32_t *)malloc(sizeof(uint32_t) * buf_size);
    VRFY((large_ds_buf_0 != NULL), "malloc of large_ds_buf_0 succeeded");
    large_ds_buf_1 = (uint32_t *)malloc(sizeof(uint32_t) * large_ds_size);
    VRFY((large_ds_buf_1 != NULL), "malloc of large_ds_buf_1 succeeded");
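    /* fill the write buffer so that, after the collective write, element j of
     * row r of the file dataset should hold the value r * EDGE_SIZE + j
     */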
    ptr_0 = large_ds_buf_0;
    for(i = 0; i < EDGE_SIZE; i++)
        *ptr_0++ = (uint32_t)i + (EDGE_SIZE * mpi_rank);
    if(mpi_rank == 0)
        for(i = 0; i < EDGE_SIZE; i++)
            *ptr_0++ = (uint32_t)i + (EDGE_SIZE * mpi_size);
    memset(large_ds_buf_1, 0, sizeof(uint32_t) * large_ds_size);

    /* ----------------------------------------
     * CREATE AN HDF5 FILE WITH PARALLEL ACCESS
     * ---------------------------------------*/
    /* setup file access template */
    acc_tpl = H5Pcreate(H5P_FILE_ACCESS);
    ret = H5Pset_fapl_mpio(acc_tpl, mpi_comm, mpi_info);
    VRFY((ret >= 0), "H5Pset_fapl_mpio() succeeded");

    ret = H5Pset_fclose_degree(acc_tpl, H5F_CLOSE_SEMI);
    VRFY((ret >= 0), "H5Pset_fclose_degree() succeeded");

    /* create the file collectively */
    fid = H5Fcreate("hsss.h5", H5F_ACC_TRUNC, H5P_DEFAULT, acc_tpl);
    VRFY((fid >= 0), "H5Fcreate succeeded");


    /* setup dims: */
    dims[0] = (int)(mpi_size + 1);
    dims[1] = EDGE_SIZE;

    /* Create large ds dataspaces */
    full_mem_large_ds_sid = H5Screate_simple(RANK, dims, NULL);
    VRFY((full_mem_large_ds_sid >= 0),
         "H5Screate_simple() full_mem_large_ds_sid succeeded");

    full_file_large_ds_sid = H5Screate_simple(RANK, dims, NULL);
    VRFY((full_file_large_ds_sid >= 0),
         "H5Screate_simple() full_file_large_ds_sid succeeded");

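    /* memory dataspaces for the write: rank 0 uses a 2 x EDGE_SIZE dataspace
     * (its own row plus the extra last row), while all other ranks use a
     * 1-D dataspace of EDGE_SIZE elements
     */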
    if(mpi_rank == 0) {
        dims[0] = 2;
        mem_large_ds_sid = H5Screate_simple(2, dims, NULL);
        VRFY((mem_large_ds_sid >= 0),
            "H5Screate_simple() mem_large_ds_sid succeeded");
        dims[0] = (int)(mpi_size + 1);
    } else {
        mem_large_ds_sid = H5Screate_simple(1, &(dims[1]), NULL);
        VRFY((mem_large_ds_sid >= 0),
            "H5Screate_simple() mem_large_ds_sid succeeded");
    }

    file_large_ds_sid = H5Screate_simple(RANK, dims, NULL);
    VRFY((file_large_ds_sid >= 0),
         "H5Screate_simple() file_large_ds_sid succeeded");


    /* create the large dataset */
    large_dataset = H5Dcreate2(fid, "large_dataset", dset_type,
                               file_large_ds_sid, H5P_DEFAULT,
                               large_ds_dcpl_id, H5P_DEFAULT);
    VRFY((ret >= 0), "H5Dcreate2() large_dataset succeeded");



    /* setup xfer property list */
    xfer_plist = H5Pcreate(H5P_DATASET_XFER);
    VRFY((xfer_plist >= 0), "H5Pcreate(H5P_DATASET_XFER) succeeded");

    ret = H5Pset_dxpl_mpio(xfer_plist, H5FD_MPIO_COLLECTIVE);
    VRFY((ret >= 0), "H5Pset_dxpl_mpio succeeded");

    /* setup the hyperslab parameters (start/stride/count/block) used to write
     * this rank's row of initial data to the large data set
     */
    start[0] = mpi_rank;
    stride[0] = 2 * (mpi_size + 1);
    count[0] = 1;
    block[0] = 1;

    for ( i = 1; i < RANK; i++ ) {

        start[i] = 0;
        stride[i] = 2 * EDGE_SIZE;
        count[i] = 1;
        block[i] = EDGE_SIZE;
    }


    /* setup selections for writing initial data to the large data set */

    start[0] = mpi_rank;

    ret = H5Sselect_hyperslab(file_large_ds_sid,
                              H5S_SELECT_SET,
                              start,
                              stride,
                              count,
                              block);
    VRFY((ret >= 0), "H5Sselect_hyperslab(file_large_ds_sid, set) suceeded");

    if ( mpi_rank == 0 ) { /* add an additional slice to the selections */

        start[0] = mpi_size;

        ret = H5Sselect_hyperslab(file_large_ds_sid,
                                  H5S_SELECT_OR,
                                  start,
                                  stride,
                                  count,
                                  block);
        VRFY((ret >= 0), "H5Sselect_hyperslab(file_large_ds_sid, or) succeeded");
    }

    /* write the initial value of the large data set to file */
    ret = H5Dwrite(large_dataset, dset_type, mem_large_ds_sid,
                   file_large_ds_sid, xfer_plist, large_ds_buf_0);
    if ( ret < 0 ) H5Eprint2(H5E_DEFAULT, stderr);
    VRFY((ret >= 0), "H5Dwrite() large_dataset initial write succeeded");

    /* Close and reopen the file, to satisfy sequential consistency semantics */
    ret = H5Dclose(large_dataset);
    VRFY((ret >= 0), "H5Dclose(large_dataset) succeeded");
    ret = H5Fclose(fid);
    VRFY((ret >= 0), "file close succeeded");

    mrc = MPI_Barrier(MPI_COMM_WORLD);
    VRFY((mrc==MPI_SUCCESS), "Sync after large dataset write");

    fid = H5Fopen("hsss.h5", H5F_ACC_RDONLY, acc_tpl);
    if(fid < 0) H5Eprint2(H5E_DEFAULT, stderr);
    VRFY((fid >= 0), "H5Fopen succeeded");
    large_dataset = H5Dopen2(fid, "large_dataset", H5P_DEFAULT);
    VRFY((large_dataset >= 0), "H5Dopen2() large_dataset succeeded");

    /* read the large data set back to verify that it contains the
     * expected data.  Note that each process reads in the entire
     * data set.
     */
    ret = H5Dread(large_dataset,
                  H5T_NATIVE_UINT32,
                  full_mem_large_ds_sid,
                  full_file_large_ds_sid,
                  xfer_plist,
                  large_ds_buf_1);
    VRFY((ret >= 0), "H5Dread() large_dataset initial read succeeded");


    /* verify that the correct data was written to the large data set */
    expected_value = 0;
    mis_match = 0;
    ptr_1 = large_ds_buf_1;

    /* Check that the write buffer didn't get corrupted */
    ptr_0 = large_ds_buf_0;
    for(i = 0; i < EDGE_SIZE; i++)
        if(*ptr_0++ != (uint32_t)i + (EDGE_SIZE * mpi_rank))
            printf("Invalid wbuf rank %d!\n", mpi_rank);
    if(mpi_rank == 0)
        for(i = 0; i < EDGE_SIZE; i++)
            if(*ptr_0++ != (uint32_t)i + (EDGE_SIZE * mpi_size))
                printf("Invalid wbuf rank 0 (p2)!\n");

    for ( i = 0; i < (int)large_ds_size; i++ ) {

        if ( *ptr_1 != expected_value ) {

            mis_match = 1;
        }
        ptr_1++;
        expected_value++;
    }
    /*VRFY( (mis_match == 0), "large ds init data good.");*/


    /* Close dataspaces */
    ret = H5Sclose(full_mem_large_ds_sid);
    VRFY((ret >= 0), "H5Sclose(full_mem_large_ds_sid) succeeded");

    ret = H5Sclose(full_file_large_ds_sid);
    VRFY((ret >= 0), "H5Sclose(full_file_large_ds_sid) succeeded");

    ret = H5Sclose(mem_large_ds_sid);
    VRFY((ret >= 0), "H5Sclose(mem_large_ds_sid) succeeded");

    ret = H5Sclose(file_large_ds_sid);
    VRFY((ret >= 0), "H5Sclose(mem_large_ds_sid) succeeded");


    /* Close Datasets */
    ret = H5Dclose(large_dataset);
    VRFY((ret >= 0), "H5Dclose(large_dataset) succeeded");

    /* Release file-access template */
    ret = H5Pclose(acc_tpl);
    VRFY((ret >= 0), "H5Pclose(acc_tpl) succeeded");

    /* close the file collectively */
    ret = H5Fclose(fid);
    VRFY((ret >= 0), "file close succeeded");

    /* Free memory buffers */
    if ( large_ds_buf_0 != NULL ) free(large_ds_buf_0);
    if ( large_ds_buf_1 != NULL ) free(large_ds_buf_1);

    MPI_Barrier(MPI_COMM_WORLD);

    VRFY( (mis_match == 0), "large ds init data good.");

    return;

} /* hyperslab_dr_pio_setup__run_test() */


/*-------------------------------------------------------------------------
 * Function:    main()
 *
 * Purpose:     Tests the setup code for
 *              contig_hyperslab_dr_pio_test__run_test and
 *              checker_board_hyperslab_dr_pio_test__run_test.
 *
 * Return:      int
 *
 * Programmer:  NAF -- 12/2/10
 *
 * Modifications:
 *
 *-------------------------------------------------------------------------
 */

int
main(int argc, char **argv)
{
    int         i;
    hid_t       dset_type = H5T_NATIVE_UINT;

    MPI_Init(&argc, &argv);

    for (i=0; i<100; i++) {
            /* contiguous data set, collective I/O */
            hyperslab_dr_pio_setup__run_test(dset_type);
    }

    puts("Test passed");

    H5close();
    MPI_Finalize();

    return 0;

} /* main() */


On Oct 13, 2010, at 9:19 AM, Elena Pourmal wrote:

> Hi All,
> 
> Thanks to everyone who tested 1.8.6-pre1. We greatly appreciate your effort. 
> 
> The release date was pushed to November due to the issues discovered during 
> the testing. We have been working on the fixes and will make the next 
> pre-release candidate available for testing as soon as possible. Stay tuned.
> 
> Thank you!
> 
> The HDF Team
> 
> 
> On Sep 24, 2010, at 9:08 PM, [email protected] wrote:
> 
>> Hi all,
>> 
>> A pre-release candidate version of HDF5 1.8.6 is available for testing,
>> and can be downloaded at the following link:
>> 
>> http://www.hdfgroup.uiuc.edu/ftp/pub/outgoing/hdf5/hdf5-1.8.6-pre1/
>> 
>> If you have some time to test this within the next two weeks, we would
>> greatly appreciate it, as while we test on the platforms we have access
>> to, there are many more that we are unable to test on ourselves. Please
>> report any errors found by October 10th. If no critical errors are
>> reported beforehand, we're targeting mid-October for our release date.
>> 
>> Thank you!
>> 
>> The HDF Team
>> 
> 

_______________________________________________
Hdf-forum is for HDF software users discussion.
[email protected]
http://mail.hdfgroup.org/mailman/listinfo/hdf-forum_hdfgroup.org
