#ifndef PMPIO_H #define PMPIO_H /*----------------------------------------------------------------------------- * Audience: Public * Chapter: Introduction * * Description: * This header file defines the Poor Main's Parallel I/O support interface. * To use it, include pmpio.h /after/ mpi.h in your application. * * PMPIO makes it *very*simple* to take an existing application that writes * a file per processor and add the ability to vary the number of files * the application writes to from 1 (e.g. serial I/O) to one for each * processor and anything in between. * * I/O libraries like serial HDF5 and Silo can be used very effectively * in parallel without having to resort to writing a file per processor. * PMPIO impliments a simple approach to using serial I/O libraries * effectively in parallel by having the application decompose the set of * processors into /N/ groups and write a file per group. At any one moment, * only one processor from each group has a file open for writing. Hence, * the I/O is serial /within/ a group. However, because a processor in each * of the /N/ groups is writing to its own file, simultaneously, the I/O is * parallel /across/ groups. * * The number of files, /N/, can be chosen wholly * independently of the total number of processors permitting the application * to tune /N/ to the underlying filesystem. For example, a parallel * application running on 2,000 processors and writing to a filesystem that * supports 8 parallel I/O channels could select /N=8/ and achieve nearly * optimum I/O performance and create only 8 files. * * This is a simple and effective I/O strategy that has been used by codes * like Ale3d and SAMRAI for many years. PMPIO impliments some basic * functions to help take an existing application that generates a file per * processor and modify it to generate a file per group of processors. * * In order for the strategy implemented here to work, some functionality * must be available in the I/O library being used to read/write the data. * The I/O library needs to support the following * A) Random access to named data objects * B) Separate name spaces within a file (e.g. like unix directories) * C) Variable sized I/O requests for same data objects from different * processors * * A large number of I/O libraries and/or file formats support these * features including HDF5, Silo, PDB (Portable Database), * HDF4, Boxlib. There are probably numerous others. Some noteable I/O * libraries and/or file formats that do not support these features are * netCDF and VTK. * * The pmpio.h header file contains three key routines to facilitate * using a basic serial I/O library in the fashion of * 'Poor Man's Parallel I/O'. Pseudocode pattern of its use is below... * * PMPIO_baton_t *bat = PMPIO_Init(MPI_COMM_WORLD,...) * fileHandle = ( *) PMPIO_WaitForBaton(bat, a, b) * . * do this processor's work on the file * . * PMPIO_HandOffBaton(fileHandle,...) * * All processors call all of these methods. PMPIO_Init() returns immediately. * PMPIO_WaitForBaton() returns immediatly only on the /first/ processor in * each group sharing a file. All other processors wait to recieve a baton * from the previous processor. When a processor finishes its work on the file * and reaches the PMPIO_HandOffBaton(), that call returns immediately and it * also passes the baton to the next processor in the group. In turn, that * processor then returns from the PMPIO_WaitForBaton() call it is waiting. * This process continues with each processor in a group handing off a baton * to the next processor. *----------------------------------------------------------------------------- */ #include /*----------------------------------------------------------------------------- * Audience: Public * Chapter: Initialization * Purpose: I/O Modes * Description: * Used to indicate if PMPIO is being used to write data or * read it *----------------------------------------------------------------------------- */ typedef enum { PMPIO_READ=0x1, PMPIO_WRITE=0x3 } PMPIO_iomode_t; struct _PMPIO_baton_t; /*----------------------------------------------------------------------------- * Audience: Public * Chapter: Create Callback * Description: * Defines the create file callback interface. When used for * writing, the create file callback should 1) create a file, 2) create a * namespace in the file for the first processor of the group to write to * and 3) set the file to that namespace. The create file call back is * never called during reading. * * typedef void * (*PMPIO_CreateFileCallBack)( * const char *fname, name of the file to create * const char *nsname, name of the namespace in the file to create * void *udata optional, user data passed by PMPIO * ); *----------------------------------------------------------------------------- */ typedef void * (*PMPIO_CreateFileCallBack)(const char *fname, const char *nsname, void *udata); /*----------------------------------------------------------------------------- * Audience: Public * Chapter: Open Callback * Description: * Defines the open file callback interface. When used for * writing, the open file callback should 1) open a file, 2) create a * namespace in the file for the current processor of the group to write * to and 3) set the file to that namespace. When used for reading, * the open file callback should 1) open a file and 2) set the file * to the namespace the current processor should read from. * * typedef void * (*PMPIO_OpenFileCallBack)( * const char *fname, name of the file to open * const char *nsname, name of the namespace to create (write) * or set (read) * PMPIO_iomode_t iomode, indicates if read or write * void *udata optional user data passed by PMPIO * ); *----------------------------------------------------------------------------- */ typedef void * (*PMPIO_OpenFileCallBack)(const char *fname, const char *nsname, PMPIO_iomode_t iomode, void *udata); /*----------------------------------------------------------------------------- * Audience: Public * Chapter: Close Callback * Description: * Defines the close file callback interface. This should close the file. * * typedef void (*PMPIO_CloseFileCallBack)( * void *file, pointer to the file object to close * void *udata optional user data passed by PMPIO * ); *----------------------------------------------------------------------------- */ typedef void (*PMPIO_CloseFileCallBack)(void *file, void *udata); typedef struct _PMPIO_baton_t { PMPIO_iomode_t ioMode; int commSize; int rankInComm; int numGroups; int groupSize; int numGroupsWithExtraProc; int commSplit; int groupRank; int rankInGroup; int procBeforeMe; int procAfterMe; int mpiVal; int mpiTag; MPI_Comm mpiComm; PMPIO_CreateFileCallBack createCb; PMPIO_OpenFileCallBack openCb; PMPIO_CloseFileCallBack closeCb; void *userData; } PMPIO_baton_t; #define PMPIO_BATON_OK 0xBabeFace #define PMPIO_BATON_ERR 0xDeadBeef /*----------------------------------------------------------------------------- * Audience: Public * Chapter: Initialization * Purpose: Initialize a PMPIO baton * Description: * Initializes PMPIO to write a specified number of files. If the * specified number of files is one (1), the result will effectively be serial * I/O. If the number of files is equal to the number of processors, the result * will be file per processor. It is best to choose a number of files that does * not exceed the total number of available I/O channels. On large parallel * systems, this number can typically be between 8 and 64. * * Given the MPI communicator, and the desired number of files, this function * will divide all processors in the communicator into a number of groups. * If the number of groups does not equally divide the number of processors, * some groups will contain an extra processor. * Each group of processors will write to its own file. * * PMPIO_Init() makes no MPI calls other than obtaining the size of the MPI * communicator and the rank of the calling processor. * * A call to PMPIO_Init() should be followed by a call to PMPIO_WaitForBaton, * followed by a call to PMPIO_HandOffBaton and finally by a call to * PMPIO_Finish(). A call to PMPIO_Init() returns a /baton/, a pointer to * a PMPIO_baton_t, that is /active/ until PMPIO_Finish() is called. * * Multiple calls to PMPIO_Init() can be made within a single execution. In * fact, as long as the mpiTag used in baton-passing messages is unique * across all /active/batons/, calls can be interleaved between /active/ * batons. * * All processors should call this function with identical arguments otherwise * behavior is undefined. The call returns immediatly on all processors. *----------------------------------------------------------------------------- */ static PMPIO_baton_t * PMPIO_Init( int numFiles, /* The number of files to be created */ PMPIO_iomode_t ioMode, /* Indicates whether this is for writing a file or for reading a file. Specify either PMPIO_READ or PMPIO_WRITE. Note, read and write at the same time is not supported. */ MPI_Comm mpiComm, /* The MPI communicator PMPIO should use */ int mpiTag, /* The message tag PMPIO should use for its baton-pass messages. */ PMPIO_CreateFileCallBack createCb, /* The create file callback. In the case of PMPIO_WRITE, this callback should 1) create a file, 2) create a namespace in the file for the first processor of the group to write to and 3) set the file to that namespace. In the case of PMPIO_READ, this callback is never called. */ PMPIO_OpenFileCallBack openCb, /* The open file callback. In the case of PMPIO_WRITE, this callback should 1) open the file, 2) create a namespace in the file for the current processor to write to and 3) set the file to that namespace. In the case of PMPIO_READ, this callback should 1) open the file and 2) set the file to the namespace for the current processor to read from. */ PMPIO_CloseFileCallBack closeCb, /* The close file callback. This method should close the file. */ void *userData /* Optional, user-specified data that PMPIO passes into the callbacks. Pass 0 if you have no need for this. */ ) { int numGroups = numFiles; int commSize, rankInComm; int groupSize, numGroupsWithExtraProc, commSplit, groupRank, rankInGroup, procBeforeMe, procAfterMe; PMPIO_baton_t *ret = 0; procBeforeMe = -1; procAfterMe = -1; MPI_Comm_size(mpiComm, &commSize); MPI_Comm_rank(mpiComm, &rankInComm); groupSize = commSize / numGroups; numGroupsWithExtraProc = commSize % numGroups; commSplit = numGroupsWithExtraProc * (groupSize + 1); if (rankInComm < commSplit) { groupRank = rankInComm / (groupSize + 1); rankInGroup = rankInComm % (groupSize + 1); if (rankInGroup < groupSize) procAfterMe = rankInComm + 1; } else { groupRank = numGroupsWithExtraProc + (rankInComm - commSplit) / groupSize; rankInGroup = (rankInComm - commSplit) % groupSize; if (rankInGroup < groupSize - 1) procAfterMe = rankInComm + 1; } if (rankInGroup > 0) procBeforeMe = rankInComm - 1; if (createCb == 0 || openCb == 0 || closeCb == 0) return 0; ret = (PMPIO_baton_t *) malloc(sizeof(PMPIO_baton_t)); ret->ioMode = ioMode; ret->commSize = commSize; ret->rankInComm = rankInComm; ret->numGroups = numGroups; ret->groupSize = groupSize; ret->numGroupsWithExtraProc = numGroupsWithExtraProc; ret->commSplit = commSplit; ret->groupRank = groupRank; ret->rankInGroup = rankInGroup; ret->procBeforeMe = procBeforeMe; ret->procAfterMe = procAfterMe; ret->mpiVal = PMPIO_BATON_OK; ret->mpiTag = mpiTag; ret->mpiComm = mpiComm; ret->createCb = createCb; ret->openCb = openCb; ret->closeCb = closeCb; ret->userData = userData; return ret; } /*----------------------------------------------------------------------------- * Audience: Public * Chapter: Initialization * Purpose: Finish use of an active baton * Description: * Finishes the use of an active PMPIO baton and deallocates any storage. *----------------------------------------------------------------------------- */ static void PMPIO_Finish( PMPIO_baton_t *bat ) { free(bat); } /*----------------------------------------------------------------------------- * Audience: Public * Chapter: Baton Passing * Purpose: Wait for a PMPIO baton * Description: * Causes the calling processor to wait until it is handed a * baton. This call will return immediately *only* for the /first/ processor * in each group. All other processors in a group will block, waiting to get * the baton from their predecessor. To give up the baton, a processor must * call PMPIO_HandOffBaton(). *----------------------------------------------------------------------------- */ static void * PMPIO_WaitForBaton( PMPIO_baton_t *Bat, /* The PMPIO baton obtained from a PMPIO_Init() call. */ const char *fname, /* The name of the file this processor will open. */ const char *nsname /* The name of the namespace in the file this processor will work on. */ ) { if (Bat->procBeforeMe != -1) { MPI_Status mpi_stat; int baton; int mpi_err = MPI_Recv(&baton, 1, MPI_INT, Bat->procBeforeMe, Bat->mpiTag, Bat->mpiComm, &mpi_stat); if (mpi_err == MPI_SUCCESS && baton != PMPIO_BATON_ERR) { return Bat->openCb(fname, nsname, Bat->ioMode, Bat->userData); } else { Bat->mpiVal = PMPIO_BATON_ERR; return 0; } } else { if (Bat->ioMode == PMPIO_WRITE) return Bat->createCb(fname, nsname, Bat->userData); else return Bat->openCb(fname, nsname, Bat->ioMode, Bat->userData); } } /*----------------------------------------------------------------------------- * Audience: Public * Chapter: Baton Passing * Purpose: Wait for a PMPIO baton * Description: * Causes the calling processor to hand off its baton to the next processor. * This call returns immediately. *----------------------------------------------------------------------------- */ static void PMPIO_HandOffBaton( const PMPIO_baton_t *Bat, /* The PMPIO baton obtained from a PMPIO_Init() call. */ void *file /* A void pointer to the file object obtained from a PMPIO_WaitForBaton() call. */ ) { Bat->closeCb(file, Bat->userData); if (Bat->procAfterMe != -1) { int baton = Bat->mpiVal; MPI_Ssend(&baton, 1, MPI_INT, Bat->procAfterMe, Bat->mpiTag, Bat->mpiComm); } } /*----------------------------------------------------------------------------- * Audience: Public * Chapter: Ranking * Purpose: Return 'group rank' of a given processor in the communicator * Description: * Given a processor's rank in the communicator used to initialize PMPIO, * determine the 'group rank' of the processor (e.g. which group) indexed * from zero. *----------------------------------------------------------------------------- */ static int PMPIO_GroupRank(const PMPIO_baton_t *Bat, int rankInComm) { int retval; if (rankInComm < Bat->commSplit) { retval = rankInComm / (Bat->groupSize + 1); } else { retval = Bat->numGroupsWithExtraProc + (rankInComm - Bat->commSplit) / Bat->groupSize; } return retval; } /*----------------------------------------------------------------------------- * Audience: Public * Chapter: Ranking * Purpose: Return 'rank in group' of a given processor in the communicator * Description: * Given a processor's rank in the communicator used to initialize PMPIO, * determine the 'rank in grouop' of the processor (e.g. which processor in the * group) indexed from zero. *----------------------------------------------------------------------------- */ static int PMPIO_RankInGroup(const PMPIO_baton_t *Bat, int rankInComm) { int retval; if (rankInComm < Bat->commSplit) { retval = rankInComm % (Bat->groupSize + 1); } else { retval = (rankInComm - Bat->commSplit) % Bat->groupSize; } return retval; } /* Define this Default PMPIO functions only if we have silo.h. We use existence of 'DB_HDF5' as indication that silo.h is present. */ #ifdef DB_HDF5 /*----------------------------------------------------------------------------- * Audience: Public * Chapter: Callbacks * Purpose: Impliment the create callback * * Description: The caller can specify which Silo driver to use by passing * a void pointer to the driver specification as the userData argument. If * that pointer is NULL or the value to which it points is not a known * driver type (DB_HDF5 or DB_PDB), it is set to DB_PDB driver as that is * guarenteed to exist in any Silo implmentation. *----------------------------------------------------------------------------- */ static void * PMPIO_DefaultCreate(const char *fname, const char *nsname, void *userData) { DBfile *siloFile; int driver = userData ? *((int*) userData) : DB_PDB; if (driver != DB_PDB && driver != DB_HDF5) driver = DB_PDB; siloFile = DBCreate(fname, DB_CLOBBER, DB_LOCAL, "PMPIO_DefaultCreate", driver); if (siloFile) { DBMkDir(siloFile, nsname); DBSetDir(siloFile, nsname); } return (void *) siloFile; } /*----------------------------------------------------------------------------- * Audience: Public * Chapter: Callbacks * Purpose: Impliment the open callback *----------------------------------------------------------------------------- */ static void * PMPIO_DefaultOpen(const char *fname, const char *nsname, PMPIO_iomode_t ioMode, void *userData) { DBfile *siloFile = DBOpen(fname, DB_UNKNOWN, ioMode == PMPIO_WRITE ? DB_APPEND : DB_READ); if (siloFile) { if (ioMode == PMPIO_WRITE) DBMkDir(siloFile, nsname); DBSetDir(siloFile, nsname); } return (void *) siloFile; } /*----------------------------------------------------------------------------- * Audience: Public * Chapter: Callbacks * Purpose: Impliment the close callback *----------------------------------------------------------------------------- */ static void PMPIO_DefaultClose(void *file, void *userData) { DBfile *siloFile = (DBfile *) file; if (siloFile) DBClose(siloFile); } #endif #endif