c++ - MPI_Neighbor_alltoallw() 中发送和接收的正确顺序是什么？

我正在使用 MPI_Neighbor_alltoallw()从相邻进程发送和接收数据。在我的应用程序中，我有幻影单元格，应该通过从相邻单元格复制单元格数据来更新它们。这是此过程的示意图:

蓝色单元格包含数据，紫色单元格是幻影单元格，应使用相邻单元格数据的拷贝进行更新。

据我了解MPI standard ，我写了我能写的最简单的例子。我还编写了一个并行的 vtk writer，以便稍后可视化数据。在下面的代码中，我为发送和接收子数组定义了新的 MPI 数据类型:

#include <mpi.h>

//VTK Library
#include <vtkXMLPStructuredGridWriter.h>
#include <vtkXMLStructuredGridWriter.h>
#include <vtkStructuredGrid.h>
#include <vtkSmartPointer.h>
#include <vtkFloatArray.h>
#include <vtkCellData.h>
#include <vtkProgrammableFilter.h>
#include <vtkInformation.h>
#include <vtkMPIController.h>

// To change the number of processes in each direction change nx, ny
const int nx{2};
const int ny{2};
const int Lx{100/nx};   // grid size without ghost cells
const int Ly{100/ny};
const int lx{Lx+4};     // grid size plus ghost cells
const int ly{Ly+4};

struct Args {
  vtkProgrammableFilter* pf;
  int local_extent[6];
};

// function to operate on the point attribute data
void execute (void* arg) {
  Args* args = reinterpret_cast<Args*>(arg);
  auto info = args->pf->GetOutputInformation(0);
  auto output_tmp = args->pf->GetOutput();  //WARNING this is a vtkDataObject*
  auto input_tmp  = args->pf->GetInput();   //WARNING this is a vtkDataObject*
  vtkStructuredGrid* output = dynamic_cast<vtkStructuredGrid*>(output_tmp);
  vtkStructuredGrid* input  = dynamic_cast<vtkStructuredGrid*>(input_tmp);
  output->ShallowCopy(input);
  output->SetExtent(args->local_extent);
}

void parallel_vtk_writer (double* cells, const char* name, int* coords, int* dim, vtkMPIController* contr) {
  int dims[2] = {lx+1, ly+1};
  int global_extent[6] = {0, dim[1]*lx, 0, dim[0]*ly, 0, 0};
  int local_extent[6] = {coords[1]*lx, coords[1]*lx + lx,
                        coords[0]*ly, coords[0]*ly + ly, 0, 0};

  int nranks = contr->GetNumberOfProcesses();
  int rank   = contr->GetLocalProcessId();

  auto points = vtkSmartPointer<vtkPoints>::New();
  points->Allocate((lx+1)*(ly+1));
  for (int j=0; j<ly+1; ++j)
    for (int i=0; i<lx+1; ++i)
      points->InsertPoint(i + j*(lx+1), i+coords[1]*lx, j+coords[0]*ly, 0);

  auto cell_value = vtkSmartPointer<vtkFloatArray>::New();
  cell_value->SetNumberOfComponents(1);
  cell_value->SetNumberOfTuples(lx*ly);
  cell_value->SetName ("cell value");

  for (int j=0; j<ly; ++j)
    for (int i=0; i<lx; ++i)
      cell_value->SetValue(i + j*lx, cells[i + j*lx]);

  auto pf = vtkSmartPointer<vtkProgrammableFilter>::New();

  Args args;
  args.pf = pf;
  for(int i=0; i<6; ++i) args.local_extent[i] = local_extent[i];

  pf->SetExecuteMethod(execute, &args);

  auto structuredGrid = vtkSmartPointer<vtkStructuredGrid>::New();
  structuredGrid->SetExtent(global_extent);
  pf->SetInputData(structuredGrid);
  structuredGrid->SetPoints(points);
  structuredGrid->GetCellData()->AddArray(cell_value);

  auto parallel_writer = vtkSmartPointer<vtkXMLPStructuredGridWriter>::New();
  parallel_writer->SetInputConnection(pf->GetOutputPort());
  parallel_writer->SetController(contr);
  parallel_writer->SetFileName(name);
  parallel_writer->SetNumberOfPieces(nranks);
  parallel_writer->SetStartPiece(rank);
  parallel_writer->SetEndPiece(rank);
  parallel_writer->SetDataModeToBinary();
  parallel_writer->Update();
  parallel_writer->Write();
}

int main (int argc, char *argv[]) {
  MPI_Init (&argc, &argv);  

  //*** Create cartesian topology for grids ***//
  MPI_Comm comm_cart;
  int cartesian_rank;
  int dim[2] = {ny, nx};
  int coords[2];
  int periods[2] = {1, 1};
  MPI_Cart_create (MPI_COMM_WORLD, 2, dim, periods, 0, &comm_cart);
  MPI_Comm_rank (comm_cart, &cartesian_rank);
  MPI_Cart_coords (comm_cart, cartesian_rank, 2, coords);

  //******** Allocate memory and initialize cells ********//
  double* cells = new double[lx*ly];
  for (int j=0; j<ly; ++j)
    for (int i=0; i<lx; ++i)
      cells[i + j*lx] = 0;

  //********* Assign a value to cells *********//
  int cx = coords[1];
  int cy = coords[0];
  int l, m;
  // for loops starts with 2, because we don't initialize the ghost cells
  for (int j=2; j<ly-2; ++j)
    for (int i=2; i<lx-2; ++i) {
      l = i + cx*lx - 4*cx;
      m = j + cy*ly - 4*cy;
      if ((l-(nx*Lx+3)/2.)*(l-(nx*Lx+3)/2.) + (m-(ny*Ly+3)/2.)*(m-(ny*Ly+3)/2.) <= 400)
        cells[i + j*lx] = (i+j)*0.1;
      else
        cells[i + j*lx] = 0.1;
    }

  //********** Define new data types **********//
  const int dims {2};
  int arr_sizes[dims] = {ly, lx};
  int subar_sizes_x[dims] = {Ly, 2};
  int subar_sizes_y[dims] = {2, Lx};
  MPI_Datatype subar_right;
  MPI_Datatype subar_left;
  MPI_Datatype subar_top;
  MPI_Datatype subar_bottom;
  MPI_Datatype ghost_left;
  MPI_Datatype ghost_right;
  MPI_Datatype ghost_bottom;
  MPI_Datatype ghost_top;

  // send subarrays
  int subar_right_start[dims] = {2, Lx};
  MPI_Type_create_subarray (dims, arr_sizes, subar_sizes_x, subar_right_start, MPI_ORDER_C, MPI_DOUBLE, &subar_right);
  MPI_Type_commit (&subar_right);

  int subar_left_start[dims] = {2, 2};
  MPI_Type_create_subarray (dims, arr_sizes, subar_sizes_x, subar_left_start, MPI_ORDER_C, MPI_DOUBLE, &subar_left);
  MPI_Type_commit (&subar_left);

  int subar_top_start[dims] = {Ly, 2};
  MPI_Type_create_subarray (dims, arr_sizes, subar_sizes_y, subar_top_start, MPI_ORDER_C, MPI_DOUBLE, &subar_top);
  MPI_Type_commit (&subar_top);

  int subar_bottom_start[dims] = {2, 2};
  MPI_Type_create_subarray (dims, arr_sizes, subar_sizes_y, subar_bottom_start, MPI_ORDER_C, MPI_DOUBLE, &subar_bottom);
  MPI_Type_commit (&subar_bottom);

  // recv subarrays
  int ghost_left_start[dims] = {2, 0};
  MPI_Type_create_subarray (dims, arr_sizes, subar_sizes_x, ghost_left_start, MPI_ORDER_C, MPI_DOUBLE, &ghost_left);
  MPI_Type_commit (&ghost_left);

  int ghost_right_start[dims] = {2, Lx+2};
  MPI_Type_create_subarray (dims, arr_sizes, subar_sizes_x, ghost_right_start, MPI_ORDER_C, MPI_DOUBLE, &ghost_right);
  MPI_Type_commit (&ghost_right);

  int ghost_bottom_start[dims] = {0, 2};
  MPI_Type_create_subarray (dims, arr_sizes, subar_sizes_y, ghost_bottom_start, MPI_ORDER_C, MPI_DOUBLE, &ghost_bottom);
  MPI_Type_commit (&ghost_bottom);

  int ghost_top_start[dims] = {Ly+2, 2};
  MPI_Type_create_subarray (dims, arr_sizes, subar_sizes_y, ghost_top_start, MPI_ORDER_C, MPI_DOUBLE, &ghost_top);
  MPI_Type_commit (&ghost_top);

  //******** SENDING SUBARRAY ********//
  int sendcounts[4] = {1, 1, 1, 1};
  int recvcounts[4] = {1, 1, 1, 1};
  MPI_Aint sdispls[4] = {0, 0, 0, 0};
  MPI_Aint rdispls[4] = {0, 0, 0, 0};
  MPI_Datatype sendtypes[4] = {subar_bottom, subar_top, subar_left, subar_right};
  MPI_Datatype recvtypes[4] = {ghost_top, ghost_bottom, ghost_right, ghost_left};
  MPI_Neighbor_alltoallw (cells, sendcounts, sdispls, sendtypes, cells, recvcounts, rdispls, recvtypes, comm_cart);

  //******** Writing the cells using VTK ********//
  auto contr = vtkSmartPointer<vtkMPIController>::New();
  contr->Initialize(nullptr, nullptr, 1);
  parallel_vtk_writer (cells, "data/grid.pvts", coords, dim, contr);

  //******** Free data types ********//
  MPI_Type_free (&subar_right);
  MPI_Type_free (&subar_left);
  MPI_Type_free (&subar_top);
  MPI_Type_free (&subar_bottom);
  MPI_Type_free (&ghost_left);
  MPI_Type_free (&ghost_right);
  MPI_Type_free (&ghost_bottom);
  MPI_Type_free (&ghost_top);
  delete[] cells;

  MPI_Finalize ();
  return 0;
}

我正在使用以下 CmakeList 制作我的可执行文件:

cmake_minimum_required(VERSION 2.8)
PROJECT(TEST)
add_executable(TEST test_stackoverflow.cpp)
add_compile_options(-std=c++11)

find_package(VTK REQUIRED)
include(${VTK_USE_FILE})
target_link_libraries(TEST ${VTK_LIBRARIES})

find_package(MPI REQUIRED)
include_directories(${MPI_INCLUDE_PATH})
target_link_libraries(TEST ${MPI_LIBRARIES})

问题

当我使用 2 x 2 进程网格 ([nx,ny] = [2,2]) 时，发送和接收工作正常。但是，当我使用 4 x 4 进程网格时，我看到了错误的结果(错误的发送和接收)。 MPI_Neighbor_alltoallw() 中发送类型和接收类型的正确顺序是什么？

欢迎并赞赏任何改进代码的建议。

最佳答案

来自MPI 3.1标准第7.6章第314页

For a Cartesian topology, created with MPI_CART_CREATE, the sequence of neighbors in the send and receive buffers at each process is defined by order of the dimensions, first the neighbor in the negative direction and then in the positive direction with displacement 1.

我凭经验发现在二维笛卡尔通信器的情况下，序列是:

顶
底部
左
对

(我测试了 Open MPI 和 MPICH，虽然我很难理解这里的逻辑......)

关于c++ - MPI_Neighbor_alltoallw() 中发送和接收的正确顺序是什么？，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/50608184/

c++ - MPI_Neighbor_alltoallw() 中发送和接收的正确顺序是什么？

问题

上一篇：c++ - Qt Creator 包括保护模式

下一篇：c++ - 为什么我可以在 C++ 的可变 lambda 中捕获未声明的变量