/*
** (c) 1996-2000 The Regents of the University of California (through
** E.O. Lawrence Berkeley National Laboratory), subject to approval by
** the U.S. Department of Energy.  Your use of this software is under
** license -- the license agreement is attached and included in the
** directory as license.txt or you may contact Berkeley Lab's Technology
** Transfer Department at TTD@lbl.gov.  NOTICE OF U.S. GOVERNMENT RIGHTS.
** The Software was developed under funding from the U.S. Government
** which consequently retains certain rights as follows: the
** U.S. Government has been granted for itself and others acting on its
** behalf a paid-up, nonexclusive, irrevocable, worldwide license in the
** Software to reproduce, prepare derivative works, and perform publicly
** and display publicly.  Beginning five (5) years after the date
** permission to assert copyright is obtained from the U.S. Department of
** Energy, and subject to any subsequent five (5) year renewals, the
** U.S. Government is granted for itself and others acting on its behalf
** a paid-up, nonexclusive, irrevocable, worldwide license in the
** Software to reproduce, prepare derivative works, distribute copies to
** the public, perform publicly and display publicly, and to permit
** others to do so.
*/

#ifndef BL_PARALLELDESCRIPTOR_H
#define BL_PARALLELDESCRIPTOR_H
//
// $Id: ParallelDescriptor.H,v 1.76 2002/12/03 00:02:42 lijewski Exp $
//
#include <BLassert.H>
#include <REAL.H>
#include <Box.H>
#include <ccse-mpi.H>
//
// Used for collecting information used in communicating FABs.
//
struct FabComTag
{
    //
    // Twelve integer bookkeeping fields followed by a Box.
    // NOTE(review): the exact meaning of each field is established by the
    // code that fills in and consumes these tags, which is not visible in
    // this header -- confirm there before relying on the names alone.
    //
    int fromProc;
    int toProc;
    int fabIndex;
    int fineIndex;
    int srcComp;
    int destComp;
    int nComp;
    int face;
    int fabArrayId;
    int fillBoxId;
    int procThatNeedsData;
    int procThatHasData;
    Box box;

    //
    // Default constructor; only declared here, defined out of line.
    //
    FabComTag ();
};

//
// Data structure used by a few routines when MPI is enabled.
//
// Used to communicate up to seven integers and a box.
//
// We'll store all the info in a single array of integers.
//
struct CommData
{
    //
    // Seven leading `int's followed by the three IntVects of a `Box', all
    // flattened into a single `int[3*BL_SPACEDIM+7]'.
    //
    enum { DIM = 3*BL_SPACEDIM+7 };

    int m_data[DIM];

    CommData ();
    CommData (int        face,
              int        fabindex,
              int        fromproc,
              int        id,
              int        ncomp,
              int        srccomp,
              int        fabarrayid,
              const Box& box);

    CommData (const CommData& rhs);

    CommData& operator= (const CommData& rhs);
    //
    // Equality is declared here and defined out of line; inequality is
    // simply its negation.
    //
    bool operator== (const CommData& rhs) const;

    bool operator!= (const CommData& rhs) const
    {
        return !(*this == rhs);
    }
    //
    // How many integers the packed representation holds.
    //
    int length () const
    {
        return DIM;
    }
    //
    // Start of the packed integer storage.
    //
    int* dataPtr()
    {
        return m_data;
    }
    //
    // Slot 0: the face.
    //
    int face () const
    {
        return m_data[0];
    }
    //
    // Slot 1: the fabindex.
    //
    int fabindex () const
    {
        return m_data[1];
    }
    //
    // Slot 2: the processor sending this data.
    //
    int fromproc () const
    {
        return m_data[2];
    }
    //
    // Slot 3: the ID of this message.
    //
    // Meant to be used as the MPI tag in a send/receive of additional
    // data associated with this data.
    //
    int id () const
    {
        return m_data[3];
    }
    //
    // Slot 4: the number of components (getter and setter).
    //
    int nComp () const
    {
        return m_data[4];
    }

    void nComp (int ncomp)
    {
        m_data[4] = ncomp;
    }
    //
    // Slot 5: the src component (getter and setter).
    //
    int srcComp () const
    {
        return m_data[5];
    }

    void srcComp (int scomp)
    {
        m_data[5] = scomp;
    }
    //
    // Slot 6: the ID of the fab array.
    //
    int fabarrayid () const
    {
        return m_data[6];
    }
    //
    // Rebuilds the contained box from the three consecutive groups of
    // BL_SPACEDIM integers that start at slot 7.
    //
    Box box () const
    {
        const int* packed = m_data + 7;

        const IntVect first(packed);
        const IntVect second(packed + BL_SPACEDIM);
        const IntVect third(packed + 2*BL_SPACEDIM);

        return Box(first, second, third);
    }
};

//
// Yes you can output CommData.
//
//
// Stream insertion for a single CommData; declared here, defined out of line.
//
std::ostream& operator<< (std::ostream& os, const CommData& cd);

//
// Stream insertion for an Array of CommData; declared here, defined out of line.
//
std::ostream& operator<< (std::ostream& os, const Array<CommData>& cd);

//
//@Man:
//@Memo: Parallel functions.
/*@Doc:

  This namespace contains functions used for implementing parallelism.
*/

namespace ParallelDescriptor
{
    //
    // Handle for one point-to-point message.  It stores an MPI request
    // and/or an MPI status together with the MPI datatype involved, and is
    // the return type of the Asend/Arecv/Send/Recv templates declared
    // further down in this namespace.  All members are defined out of line.
    //
    class Message
    {
    public:

	//
	// Construct empty, from a pending request, or from a completed status.
	//
	Message ();
	Message (MPI_Request req_, MPI_Datatype type_);
	Message (MPI_Status stat_, MPI_Datatype type_);
	//
	// NOTE(review): wait()/test() presumably block on / poll the stored
	// request -- confirm against the definitions in the .cpp.
	//
	void wait ();
	bool test ();
	//
	// NOTE(review): count()/tag()/pid() presumably report values taken
	// from the stored status -- confirm against the definitions.
	//
	size_t count () const;
	int tag () const;
	int pid () const;
	//
	// Accessors named after the stored datatype and request.
	//
	MPI_Datatype type () const;
	MPI_Request  req () const;

    private:

	bool               m_finished;
	MPI_Datatype       m_type;
	MPI_Request        m_req;
	//
	// Mutable, so const member functions are able to modify it.
	//
	mutable MPI_Status m_stat;
    };

    /*@ManDoc: Perform any needed parallel initialization.  This MUST be the
               first routine in this class called from within a program.
    */
    void StartParallel (int*    argc = 0,
			char*** argv = 0);

    /*@ManDoc: Perform any needed parallel finalization.  This MUST be the
               last routine in this class called from within a program.
    */
    void EndParallel ();
    //
    //@ManDoc: Returns processor number of calling program.
    //
    extern int m_MyId;
    inline int MyProc ()
    {
        //
        // Debug check that the id is not the -1 sentinel before handing it out.
        //
        BL_ASSERT(m_MyId != -1);
        const int my_rank = m_MyId;
        return my_rank;
    }
    //
    //@ManDoc: Returns number of CPUs involved in the computation.
    //
    extern int m_nProcs;
    inline int NProcs ()
    {
        //
        // Debug check that the count is not the -1 sentinel before handing it out.
        //
        BL_ASSERT(m_nProcs != -1);
        const int cpu_count = m_nProcs;
        return cpu_count;
    }
    //
    //@ManDoc: The CPU number of the I/O Processor.
    //
    extern const int ioProcessor;
    //
    // Simply exposes the constant ioProcessor.
    //
    inline int IOProcessorNumber () { return ioProcessor; }
    //
    //@ManDoc: Is this CPU the I/O Processor?
    //
    inline bool IOProcessor ()
    {
        //
        // True exactly when this process's id equals the I/O processor number.
        //
        const int me     = MyProc();
        const int io_cpu = IOProcessorNumber();
        return me == io_cpu;
    }

    //
    //@ManDoc: Returns number of CPUs to use in CFD portion of computation.
    //
    int NProcsCFD ();
    //
    //@ManDoc: BoxLib's Parallel Communicator, probably MPI_COMM_WORLD
    //
    extern MPI_Comm m_comm;
    //
    // Hands back a copy of the communicator handle stored in m_comm.
    //
    inline MPI_Comm Communicator () { return m_comm; }
    //
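    //@ManDoc: Barrier; the second form takes an explicit communicator.
    //         NOTE(review): presumably wraps MPI_Barrier -- confirm in the .cpp.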
    //
    void Barrier ();

    void Barrier (MPI_Comm comm);
    //
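    //@ManDoc: Test a request, reporting through flag and status.
    //         NOTE(review): presumably wraps MPI_Test -- confirm in the .cpp.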
    //
    void Test (MPI_Request& request, int& flag, MPI_Status& status);
    //
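    //@ManDoc: Duplicate comm into newcomm.
    //         NOTE(review): presumably wraps MPI_Comm_dup -- confirm in the .cpp.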
    //
    void Comm_dup (MPI_Comm comm, MPI_Comm& newcomm);
    //
    //@ManDoc: Issue architecture specific Abort.
    //
    void Abort ();
    //
    //@ManDoc: Abort with specified error code.
    //
    void Abort (int errorcode);
    //
    //@ManDoc: Returns the string associated with the internal error condition errcode.
    //
    const char* ErrorString (int errcode);
    //
    //@ManDoc: Returns wall-clock seconds since start of execution.
    //
    double second ();
    //
    //@ManDoc: And-wise boolean reduction.
    //
    void ReduceBoolAnd (bool& rvar);
    //
    //@ManDoc: And-wise boolean reduction to specified cpu.
    //
    void ReduceBoolAnd (bool& rvar, int cpu);
    //
    //@ManDoc: Or-wise boolean reduction.
    //
    void ReduceBoolOr  (bool& rvar);
    //
    //@ManDoc: Or-wise boolean reduction to specified cpu.
    //
    void ReduceBoolOr  (bool& rvar, int cpu);
    //
    //@ManDoc: Real sum reduction.
    //
    void ReduceRealSum (Real& rvar);
    //
    //@ManDoc: Real sum reduction to specified cpu.
    //
    void ReduceRealSum (Real& rvar, int cpu);
    //
    //@ManDoc: Real max reduction.
    //
    void ReduceRealMax (Real& rvar);
    //
    //@ManDoc: Real max reduction to specified cpu.
    //
    void ReduceRealMax (Real& rvar, int cpu);
    //
    //@ManDoc: Real min reduction.
    //
    void ReduceRealMin (Real& rvar);
    //
    //@ManDoc: Real min reduction to specified cpu.
    //
    void ReduceRealMin (Real& rvar, int cpu);
    //
    //@ManDoc: Integer sum reduction.
    //
    void ReduceIntSum (int& rvar);
    //
    //@ManDoc: Integer sum reduction to specified cpu.
    //
    void ReduceIntSum (int& rvar, int cpu);
    //
    //@ManDoc: Integer max reduction.
    //
    void ReduceIntMax (int& rvar);
    //
    //@ManDoc: Integer max reduction to specified cpu.
    //
    void ReduceIntMax (int& rvar, int cpu);
    //
    //@ManDoc: Integer min reduction.
    //
    void ReduceIntMin (int& rvar);
    //
    //@ManDoc: Integer min reduction to specified cpu.
    //
    void ReduceIntMin (int& rvar, int cpu);
    //
    //@ManDoc: Long sum reduction.
    //
    void ReduceLongSum (long& rvar);
    //
    //@ManDoc: Long sum reduction to specified cpu.
    //
    void ReduceLongSum (long& rvar, int cpu);
    //
    //@ManDoc: Long max reduction.
    //
    void ReduceLongMax (long& rvar);
    //
    //@ManDoc: Long max reduction to specified cpu.
    //
    void ReduceLongMax (long& rvar, int cpu);
    //
    //@ManDoc: Long min reduction.
    //
    void ReduceLongMin (long& rvar);
    //
    //@ManDoc: Long min reduction to specified cpu.
    //
    void ReduceLongMin (long& rvar, int cpu);
    //
    //@ManDoc: Long and-wise reduction.
    //
    void ReduceLongAnd (long& rvar);
    //
    //@ManDoc: Long and-wise reduction to specified cpu.
    //
    void ReduceLongAnd (long& rvar, int cpu);
    //
    //@ManDoc: Parallel gather.
    //
    void Gather (Real* sendbuf,
                 int   sendcount,
                 Real* recvbuf,
                 int   root);
    //
    //@ManDoc: Returns sequential message sequence numbers in range 1000-9000.
    //
    int SeqNum ();

    //
    // Communication templates.  They are defined at the bottom of this
    // header: when BL_USE_MPI is set, as wrappers around the MPI call named
    // in each comment below; otherwise as stubs that do no communication.
    //
    // Asend: MPI_Isend of n items (or of a whole vector) to process pid.
    // The five-argument form uses comm instead of Communicator().
    //
    template <class T> Message Asend(const T*, size_t n, int pid, int tag);
    template <class T> Message Asend(const T*, size_t n, int pid, int tag, MPI_Comm comm);
    template <class T> Message Asend(const std::vector<T>& buf, int pid, int tag);

    //
    // Arecv: MPI_Irecv counterparts of the Asend overloads.
    //
    template <class T> Message Arecv(T*, size_t n, int pid, int tag);
    template <class T> Message Arecv(T*, size_t n, int pid, int tag, MPI_Comm comm);
    template <class T> Message Arecv(std::vector<T>& buf, int pid, int tag);

    //
    // Send: MPI_Send; returns a default-constructed Message.
    //
    template <class T> Message Send(const T* buf, size_t n, int dst_pid, int tag);
    template <class T> Message Send(const std::vector<T>& buf, int dst_pid, int tag);

    //
    // Recv: MPI_Recv; the returned Message is built from the receive status.
    //
    template <class T> Message Recv(T*, size_t n, int pid, int tag);
    template <class T> Message Recv(std::vector<T>& t, int pid, int tag);

    //
    // Bcast: MPI_Bcast of n items from process root (defaults to 0).
    //
    template <class T> void Bcast(T*, size_t n, int root = 0);

    //
    // Reduce: MPI_Allreduce of a single value using the MPI operation
    // Op::op(); without MPI it returns its argument.
    //
    template <class Op, class T> T Reduce(const T& t);

    //
    // Scatter: MPI_Scatter; the T1 buffer is the send side, the T buffer
    // the receive side.
    //
    template <class T, class T1> void Scatter(T*, size_t n, const T1*, size_t n1, int root);

    //
    // Gather: MPI_Gather.  The first form gathers from the T buffer into
    // the T1 buffer; the second gathers one value per process and returns
    // them as a vector.
    //
    template <class T, class T1> void Gather(const T*, size_t n, T1*, size_t n1, int root);
    template <class T> std::vector<T> Gather(const T&, int root);

    //
    // Declared here, defined out of line.
    // NOTE(review): presumably wraps MPI_Waitsome -- confirm in the .cpp.
    //
    void Waitsome (Array<MPI_Request>&, int&, Array<int>&, Array<MPI_Status>&);

    //
    // Error hook invoked by the BL_MPI_REQUIRE macro with the source file,
    // line, text of the failed call and the status it returned.
    //
    void MPI_Error(const char* file, int line, const char* msg, int rc);
}

//
// Evaluates the expression x once.  If the result is nonzero it is passed
// to ParallelDescriptor::MPI_Error together with the current file, line
// and the source text of x.  The do/while(false) wrapper makes the macro
// usable as a single statement.
//
#define BL_MPI_REQUIRE(x)						\
do									\
{									\
  if ( int l_status_ = (x) )						\
    {									\
      ParallelDescriptor::MPI_Error(__FILE__,__LINE__,#x, l_status_);   \
    }									\
}									\
while ( false )

#if BL_USE_MPI
//
// Starts an MPI_Isend of n items of type T, beginning at buf, to process
// dst_pid with the given tag over Communicator().  The returned Message
// holds the request and the datatype.
//
template <class T>
ParallelDescriptor::Message
ParallelDescriptor::Asend (const T* buf, size_t n, int dst_pid, int tag)
{
    MPI_Request req;

    BL_MPI_REQUIRE( MPI_Isend(const_cast<T*>(buf), n, Mpi_typemap<T>::type(),
                              dst_pid, tag, Communicator(), &req) );

    Message pending(req, Mpi_typemap<T>::type());
    return pending;
}

//
// Same as the four-argument Asend, except that the MPI_Isend is issued on
// the caller-supplied communicator comm.
//
template <class T>
ParallelDescriptor::Message
ParallelDescriptor::Asend (const T* buf, size_t n, int dst_pid, int tag, MPI_Comm comm)
{
    MPI_Request req;

    BL_MPI_REQUIRE( MPI_Isend(const_cast<T*>(buf), n, Mpi_typemap<T>::type(),
                              dst_pid, tag, comm, &req) );

    Message pending(req, Mpi_typemap<T>::type());
    return pending;
}

//
// Starts an MPI_Isend of the whole vector buf to process dst_pid with the
// given tag over Communicator().  The returned Message holds the request
// and the datatype.  An empty vector is sent as a zero-length message.
//
template <class T>
ParallelDescriptor::Message
ParallelDescriptor::Asend (const std::vector<T>& buf,
                           int                   dst_pid,
                           int                   tag)
{
    //
    // buf[0] is undefined on an empty vector, so only take its address
    // when there is at least one element; otherwise pass a null buffer.
    //
    T* sendbuf = buf.empty() ? 0 : const_cast<T*>(&buf[0]);

    MPI_Request req;
    BL_MPI_REQUIRE( MPI_Isend(sendbuf,
                              buf.size(),
                              Mpi_typemap<T>::type(),
                              dst_pid,
                              tag,
                              Communicator(),
                              &req) );
    return Message(req, Mpi_typemap<T>::type());
}

//
// MPI_Send of n items of type T, beginning at buf, to process dst_pid with
// the given tag over Communicator().  Returns a default-constructed Message.
//
template <class T>
ParallelDescriptor::Message
ParallelDescriptor::Send (const T* buf, size_t n, int dst_pid, int tag)
{
    BL_MPI_REQUIRE( MPI_Send(const_cast<T*>(buf), n, Mpi_typemap<T>::type(),
                             dst_pid, tag, Communicator()) );

    Message done;
    return done;
}

//
// MPI_Send of the whole vector buf to process dst_pid with the given tag
// over Communicator().  Returns a default-constructed Message.  An empty
// vector is sent as a zero-length message.
//
template <class T>
ParallelDescriptor::Message
ParallelDescriptor::Send (const std::vector<T>& buf,
                          int                   dst_pid,
                          int                   tag)
{
    //
    // buf[0] is undefined on an empty vector, so only take its address
    // when there is at least one element; otherwise pass a null buffer.
    //
    T* sendbuf = buf.empty() ? 0 : const_cast<T*>(&buf[0]);

    BL_MPI_REQUIRE( MPI_Send(sendbuf,
                             buf.size(),
                             Mpi_typemap<T>::type(),
                             dst_pid,
                             tag,
                             Communicator()) );
    return Message();
}

//
// Starts an MPI_Irecv of up to n items of type T into buf from process
// src_pid with the given tag over Communicator().  The returned Message
// holds the request and the datatype.
//
template <class T>
ParallelDescriptor::Message
ParallelDescriptor::Arecv (T* buf, size_t n, int src_pid, int tag)
{
    MPI_Request req;

    BL_MPI_REQUIRE( MPI_Irecv(buf, n, Mpi_typemap<T>::type(),
                              src_pid, tag, Communicator(), &req) );

    Message pending(req, Mpi_typemap<T>::type());
    return pending;
}

//
// Same as the four-argument Arecv, except that the MPI_Irecv is issued on
// the caller-supplied communicator comm.
//
template <class T>
ParallelDescriptor::Message
ParallelDescriptor::Arecv (T* buf, size_t n, int src_pid, int tag, MPI_Comm comm)
{
    MPI_Request req;

    BL_MPI_REQUIRE( MPI_Irecv(buf, n, Mpi_typemap<T>::type(),
                              src_pid, tag, comm, &req) );

    Message pending(req, Mpi_typemap<T>::type());
    return pending;
}

//
// Starts an MPI_Irecv that fills the vector buf (buf.size() items) from
// process src_pid with the given tag over Communicator().  The returned
// Message holds the request and the datatype.  An empty vector posts a
// zero-length receive.
//
template <class T>
ParallelDescriptor::Message
ParallelDescriptor::Arecv (std::vector<T>& buf,
                           int             src_pid,
                           int             tag)
{
    //
    // buf[0] is undefined on an empty vector, so only take its address
    // when there is at least one element; otherwise pass a null buffer.
    //
    T* recvbuf = buf.empty() ? 0 : &buf[0];

    MPI_Request req;
    BL_MPI_REQUIRE( MPI_Irecv(recvbuf,
                              buf.size(),
                              Mpi_typemap<T>::type(),
                              src_pid,
                              tag,
                              Communicator(),
                              &req) );
    return Message(req, Mpi_typemap<T>::type());
}

//
// MPI_Recv of up to n items of type T into buf from process src_pid with
// the given tag over Communicator().  The returned Message is built from
// the receive status and the datatype.
//
template <class T>
ParallelDescriptor::Message
ParallelDescriptor::Recv (T* buf, size_t n, int src_pid, int tag)
{
    MPI_Status stat;

    BL_MPI_REQUIRE( MPI_Recv(buf, n, Mpi_typemap<T>::type(),
                             src_pid, tag, Communicator(), &stat) );

    Message received(stat, Mpi_typemap<T>::type());
    return received;
}

//
// MPI_Recv that fills the vector buf (buf.size() items) from process
// src_pid with the given tag over Communicator().  The returned Message is
// built from the receive status and the datatype.  An empty vector posts a
// zero-length receive.
//
template <class T>
ParallelDescriptor::Message
ParallelDescriptor::Recv (std::vector<T>& buf,
                          int             src_pid,
                          int             tag)
{
    //
    // buf[0] is undefined on an empty vector, so only take its address
    // when there is at least one element; otherwise pass a null buffer.
    //
    T* recvbuf = buf.empty() ? 0 : &buf[0];

    MPI_Status stat;
    BL_MPI_REQUIRE( MPI_Recv(recvbuf,
                             buf.size(),
                             Mpi_typemap<T>::type(),
                             src_pid,
                             tag,
                             Communicator(),
                             &stat) );
    return Message(stat, Mpi_typemap<T>::type());
}

//
// MPI_Allreduce of the single value t over Communicator(), combining with
// the MPI operation supplied by Op::op().  Returns the reduced value.
//
template <class Op, class T>
T
ParallelDescriptor::Reduce (const T& t)
{
    //
    // Receives the result of the reduction.
    //
    T recv;

    BL_MPI_REQUIRE( MPI_Allreduce(const_cast<T*>(&t), &recv, 1,
                                  Mpi_typemap<T>::type(), Op::op(),
                                  Communicator()) );

    return recv;
}

//
// MPI_Bcast of n items of type T, beginning at t, from process root over
// Communicator().
//
template <class T>
void
ParallelDescriptor::Bcast (T* t, size_t n, int root)
{
    BL_MPI_REQUIRE( MPI_Bcast(t, n, Mpi_typemap<T>::type(),
                              root, Communicator()) );
}

//
// MPI_Gather over Communicator() onto process root: n items of type T are
// contributed from t, and t1 with n1 items of type T1 is passed as the
// receive side.
//
template <class T, class T1>
void
ParallelDescriptor::Gather (const T* t, size_t n, T1* t1, size_t n1, int root)
{
    BL_MPI_REQUIRE( MPI_Gather(const_cast<T*>(t), n, Mpi_typemap<T>::type(),
                               t1, n1, Mpi_typemap<T1>::type(),
                               root, Communicator()) );
}

//
// MPI_Gather of one value of type T from every process onto process root
// over Communicator().  On root the returned vector is sized to NProcs();
// on every other process it is returned empty.
//
template <class T>
std::vector<T>
ParallelDescriptor::Gather (const T& t, int root)
{
    std::vector<T> resl;
    if ( root == MyProc() ) resl.resize(NProcs());
    //
    // Only root owns a receive buffer.  On the other processes resl is
    // empty, so resl[0] would be undefined there; hand MPI a null pointer
    // instead.
    //
    T* recvbuf = resl.empty() ? 0 : &resl[0];

    BL_MPI_REQUIRE( MPI_Gather(const_cast<T*>(&t),
                               1,
                               Mpi_typemap<T>::type(),
                               recvbuf,
                               1,
                               Mpi_typemap<T>::type(),
                               root,
                               Communicator()) );
    return resl;
}

//
// MPI_Scatter over Communicator() from process root: t1 with n1 items of
// type T1 is the send side, and t with n items of type T is the receive
// side.
//
template <class T, class T1>
void
ParallelDescriptor::Scatter (T* t, size_t n, const T1* t1, size_t n1, int root)
{
    BL_MPI_REQUIRE( MPI_Scatter(const_cast<T1*>(t1), n1, Mpi_typemap<T1>::type(),
                                t, n, Mpi_typemap<T>::type(),
                                root, Communicator()) );
}

#else

namespace ParallelDescriptor
{
//
// Non-MPI stub: nothing is sent and an empty Message is returned.
//
template <class T>
Message Asend (const T* buf, size_t n, int dst_pid, int tag)
{
    Message nothing;
    return nothing;
}

//
// Non-MPI stub: the communicator is ignored, nothing is sent and an empty
// Message is returned.
//
template <class T>
Message Asend (const T* buf, size_t n, int dst_pid, int tag, MPI_Comm comm)
{
    Message nothing;
    return nothing;
}

//
// Non-MPI stub: the vector is not sent and an empty Message is returned.
//
template <class T>
Message Asend (const std::vector<T>& buf, int dst_pid, int tag)
{
    Message nothing;
    return nothing;
}

//
// Non-MPI stub: nothing is sent and an empty Message is returned.
//
template <class T>
Message Send (const T* buf, size_t n, int dst_pid, int tag)
{
    Message nothing;
    return nothing;
}

//
// Non-MPI stub: the vector is not sent and an empty Message is returned.
//
template <class T>
Message Send (const std::vector<T>& buf, int dst_pid, int tag)
{
    Message nothing;
    return nothing;
}

//
// Non-MPI stub: buf is left untouched and an empty Message is returned.
//
template <class T>
Message Arecv (T* buf, size_t n, int src_pid, int tag)
{
    Message nothing;
    return nothing;
}

//
// Non-MPI stub: the communicator is ignored, buf is left untouched and an
// empty Message is returned.
//
template <class T>
Message Arecv (T* buf, size_t n, int src_pid, int tag, MPI_Comm comm)
{
    Message nothing;
    return nothing;
}

//
// Non-MPI stub: the vector is left untouched and an empty Message is
// returned.
//
template <class T>
Message Arecv (std::vector<T>& buf, int src_pid, int tag)
{
    Message nothing;
    return nothing;
}

//
// Non-MPI stub: buf is left untouched and an empty Message is returned.
//
template <class T>
Message Recv (T* buf, size_t n, int src_pid, int tag)
{
    Message nothing;
    return nothing;
}

//
// Non-MPI stub: the vector is left untouched and an empty Message is
// returned.
//
template <class T>
Message Recv (std::vector<T>& buf, int src_pid, int tag)
{
    Message nothing;
    return nothing;
}

//
// Non-MPI stub: with no other process to combine with, the reduction of t
// is a copy of t.  Op is not used.
//
template <class Op, class T>
T Reduce (const T& t)
{
    return T(t);
}

//
// Non-MPI stub: does nothing; the buffer is left as it is.
//
template <class T>
void Bcast (T* t, size_t n, int root)
{
    (void) t;
    (void) n;
    (void) root;
}

//
// Non-MPI stub: does nothing; neither buffer is read or written.
//
template <class T, class T1>
void Gather (const T* t, size_t n, T1* t1, size_t n1, int root)
{
    (void) t;
    (void) n;
    (void) t1;
    (void) n1;
    (void) root;
}

//
// Non-MPI stub: the result is a one-element vector holding a copy of t.
// The root argument is not used.
//
template <class T>
std::vector<T> Gather (const T& t, int root)
{
    std::vector<T> gathered(1);
    gathered.front() = t;
    return gathered;
}

//
// Non-MPI stub: does nothing; neither buffer is read or written.
//
template <class T, class T1>
void Scatter (T* t, size_t n, const T1* t1, size_t n1, int root)
{
    (void) t;
    (void) n;
    (void) t1;
    (void) n1;
    (void) root;
}

}
#endif

#endif /*BL_PARALLELDESCRIPTOR_H*/
