fortran - 获取正在使用的 MPI Communicator 的数量

标签 fortran mpi intel-mpi

我有一个很大的代码,它因以下错误而崩溃:

Fatal error in PMPI_Comm_split: Other MPI error, error stack:
PMPI_Comm_split(532)................: MPI_Comm_split(comm=0xc4027cf0, color=0, key=0, new_comm=0x7ffdb50f2bd0) failed
PMPI_Comm_split(508)................: fail failed
MPIR_Comm_split_impl(260)...........: fail failed
MPIR_Get_contextid_sparse_group(676): Too many communicators (0/16384 free on this process; ignore_id=0)
Fatal error in PMPI_Comm_split: Other MPI error, error stack:
PMPI_Comm_split(532)................: MPI_Comm_split(comm=0xc401bcf1, color=1, key=0, new_comm=0x7ffed5aa4fd0) failed
PMPI_Comm_split(508)................: fail failed
MPIR_Comm_split_impl(260)...........: fail failed
MPIR_Get_contextid_sparse_group(676): Too many communicators (0/16384 free on this process; ignore_id=0)
Fatal error in PMPI_Comm_split: Other MPI error, error stack:
PMPI_Comm_split(532)................: MPI_Comm_split(comm=0xc4027ce9, color=0, key=0, new_comm=0x7ffe37e477d0) failed
PMPI_Comm_split(508)................: fail failed
MPIR_Comm_split_impl(260)...........: fail failed
MPIR_Get_contextid_sparse_group(676): Too many communicators (0/16384 free on this process; ignore_id=0)
Fatal error in PMPI_Comm_split: Other MPI error, error stack:
PMPI_Comm_split(532)................: MPI_Comm_split(comm=0xc401bcf1, color=1, key=0, new_comm=0x7ffd511ac4d0) failed
PMPI_Comm_split(508)................: fail failed
MPIR_Comm_split_impl(260)...........: fail failed
MPIR_Get_contextid_sparse_group(676): Too many communicators (0/16384 free on this process; ignore_id=0)

似乎存在某种 MPI 通信器泄漏。 MPI 似乎知道当前使用了多少个通信器:
Too many communicators (0/16384 free on this process; ignore_id=0)

有没有办法打印 MPI 使用的通信器的数量?这样我就可以缩小通信器泄漏的范围。

最佳答案

您可以覆盖 MPI_Comm_split 和 MPI_Comm_free 的实现,从而手动统计通信器的创建和销毁。

这是一个简单的例子

覆盖 MPI_Comm_split 和 MPI_Comm_free

#include "mpi.h"
#include "stdio.h"
/* Communicators created through MPI_Comm_split and not yet released through
 * MPI_Comm_free on this process. */
static int comm_counter=0;

/*
 * Profiling-interface replacement for MPI_Comm_split.
 *
 * Forwards the call to PMPI_Comm_split, then updates and prints the
 * per-process communicator count together with the caller's rank in
 * MPI_COMM_WORLD.
 *
 * Parameters and return value are those of PMPI_Comm_split; its return code
 * is passed through unchanged.
 */
int MPI_Comm_split(MPI_Comm comm, int color, int key, MPI_Comm *newcomm)
{
      int world_rank;
      int rc = PMPI_Comm_split(comm, color, key, newcomm);

      /* Use the PMPI entry point so the rank lookup does not go through
       * any other interposed MPI_Comm_rank wrapper. */
      PMPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

      /* Count only communicators that were really created: a failed call
       * (possible with MPI_ERRORS_RETURN) or color == MPI_UNDEFINED, which
       * yields MPI_COMM_NULL, consumes no communicator. */
      if (rc == MPI_SUCCESS && *newcomm != MPI_COMM_NULL) {
            comm_counter++;
      }
      printf("%s %i %s %i\n", "MPI_Comm_split ", comm_counter, " from ", world_rank);
      return rc;
}

/*
 * Profiling-interface replacement for MPI_Comm_free.
 *
 * Forwards the call to PMPI_Comm_free, then updates and prints the
 * per-process communicator count together with the caller's rank in
 * MPI_COMM_WORLD. The return code of PMPI_Comm_free is passed through
 * unchanged.
 *
 * NOTE(review): only MPI_Comm_split increments the counter in this file, so
 * freeing a communicator obtained some other way (e.g. a dup) would still
 * decrement it -- wrap those creation routines too if the program uses them.
 */
int MPI_Comm_free(MPI_Comm *comm)
{
      int world_rank;
      int rc;

      /* Use the PMPI entry point so the rank lookup does not go through
       * any other interposed MPI_Comm_rank wrapper. */
      PMPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

      rc = PMPI_Comm_free(comm);

      /* Only a successful free actually releases a communicator. */
      if (rc == MPI_SUCCESS) {
            comm_counter--;
      }
      printf("%s %i %s %i\n", "PMPI_Comm_free ", comm_counter, " from ", world_rank);
      return rc;
}

编译此代码以进行链接。
就我而言,我做了 mpicc -c comm_split.c -o comm_split.o
您的代码保持不变。您无需其他修改即可使用它。
下面是使用 MPI_Comm_split 和 MPI_Comm_free 的主程序的简单示例
C++案例
#include "mpi.h"
#include <stdio.h>

/*
 * Minimal driver that splits MPI_COMM_WORLD twice and frees only one of the
 * resulting communicators.
 *
 * row_comm2 is never freed, so when the counting MPI_Comm_split /
 * MPI_Comm_free wrappers are linked in, the last reported count on each
 * process is 1 (two splits, one free).
 */
int main()
{
      MPI_Init(NULL, NULL);
      // Get the rank and size in the original communicator
      int world_rank, world_size;
      MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
      MPI_Comm_size(MPI_COMM_WORLD, &world_size);

      int color = world_rank / 4; // Determine color based on row

      // Split the communicator based on the color and use the
      // original rank for ordering
      MPI_Comm row_comm, row_comm2;
      MPI_Comm_split(MPI_COMM_WORLD, color, world_rank, &row_comm);
      MPI_Comm_split(MPI_COMM_WORLD, color, world_rank, &row_comm2);

      int row_rank, row_size;
      MPI_Comm_rank(row_comm, &row_rank);
      MPI_Comm_size(row_comm, &row_size);

      printf("WORLD RANK/SIZE: %d/%d \t ROW RANK/SIZE: %d/%d\n",
          world_rank, world_size, row_rank, row_size);

      // Only row_comm is released; row_comm2 stays allocated until finalize.
      MPI_Comm_free(&row_comm);
      MPI_Finalize();
      return 0;
}

Fortran案例
      program test
C     Minimal driver: splits MPI_COMM_WORLD twice and frees only one of
C     the two new communicators. new_comm2 is never freed, so with the
C     counting wrappers linked in, the last reported count is 1.

      implicit none
      include "mpif.h"

      integer comm_world, new_comm, new_comm2, ierr
      integer world_rank
      integer color


      call MPI_INIT(ierr)

      comm_world = MPI_COMM_WORLD

      call MPI_Comm_rank(comm_world, world_rank, ierr)
C     Ranks 0-3 share color 0, ranks 4-7 share color 1, and so on.
      color = world_rank / 4
      call MPI_Comm_split(comm_world, color, world_rank,
     & new_comm, ierr)
      call MPI_Comm_split(comm_world, color, world_rank,
     & new_comm2, ierr)

C     Only new_comm is released.
      call MPI_Comm_free(new_comm, ierr)
      call MPI_Finalize(ierr)
      end program

编译并链接重新定义的 MPI_Comm_split 和 MPI_Comm_free
mpif77 test.f comm_split.o
mpiCC test.cpp comm_split.o

对于 Fortran案例你会得到类似的东西
MPI_Comm_split  1  from  3
MPI_Comm_split  1  from  0
MPI_Comm_split  1  from  1
MPI_Comm_split  1  from  2
MPI_Comm_split  2  from  0
PMPI_Comm_free  1  from  0
MPI_Comm_split  2  from  1
PMPI_Comm_free  1  from  1
MPI_Comm_split  2  from  2
PMPI_Comm_free  1  from  2
MPI_Comm_split  2  from  3
PMPI_Comm_free  1  from  3

这为您提供了每个进程中当前正在使用的通信器数量的信息。

关于fortran - 获取正在使用的 MPI Communicator 的数量,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/53043835/

相关文章:

opencv - craycc 和 OpenCV

c++ - MPI 时间测量受到其他用户的影响?

c++ - 为什么我的 fortran 例程将不正确的值传递给我的 C++ 函数?

algorithm - 在 Fortran 中重新启动循环

c++ - 从 C++ 调用的 Fortran 子例程的错误值

java - 英特尔 MPI mpirun 不会使用 java Process.destroy() 终止

pip - 如何将 mpi4py 链接到 intelmpi

io - Fortran 90 - I/O 将变量作为文件路径传递

python - F2py 转换字符(*) 段错误