I have successfully written a matrix-matrix multiplication routine for a single node, and my goal now is to link it into the program so that it executes in parallel across cluster nodes. The main work was modifying the Netlib ScaLAPACK source code so that it goes through my routine (mydgemm). The original code here is a C program, but all the routines in it call Fortran routines (for example dgemm_, which is Fortran), while my routine (mydgemm) is written in C. After my modification, it runs successfully on a single node for matrices of any size, but when I run it on 4 nodes with a matrix size greater than 200, the data communicated between nodes via MPI goes wrong.
This is the error:
BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
PID 69754 RUNNING AT localhost.localdomain
EXIT CODE: 11
CLEANING UP REMAINING PROCESSES
YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
In the main function I just use MPI to create random matrices on each node (attached below), and the routine is called new_pdgemm(...). (I modified the code inside new_pdgemm.) In mydgemm.c I use OpenMP for the parallelism, and that code runs on the cores of a single node.
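For illustration only, a minimal sketch of an OpenMP kernel of this kind (dgemm_-style, column-major arguments; a hypothetical stand-in, not the actual mydgemm) might look like:

#include <stddef.h>

/* Naive triple loop over column-major blocks; the OpenMP pragma
   parallelises over the columns of C, so no two threads ever write
   the same element of C. */
void mydgemm_sketch(int m, int n, int k, double alpha,
                    const double *A, int lda,
                    const double *B, int ldb,
                    double beta, double *C, int ldc)
{
    #pragma omp parallel for
    for (int j = 0; j < n; j++) {
        for (int i = 0; i < m; i++) {
            double acc = 0.0;
            for (int l = 0; l < k; l++)
                acc += A[i + (size_t)l * lda] * B[l + (size_t)j * ldb];
            C[i + (size_t)j * ldc] = alpha * acc + beta * C[i + (size_t)j * ldc];
        }
    }
}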
Could you give me some guidance or ideas to solve this problem? Do you think the issue is that Fortran is column-major while C is row-major? Or do I need to rewrite mydgemm.c as mydgemm.f (that would be really hard, and I might not manage it)?
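For context, the layout concern is the usual one when mixing the two languages: a row-major m-by-n C array is exactly what Fortran sees as the n-by-m transpose. The standard workaround when calling the Fortran dgemm_ directly on row-major data (a sketch of the well-known argument-swap trick, not my actual code) is:

/* Computes C = alpha*A*B + beta*C on row-major data by evaluating
   C^T = alpha*B^T*A^T + beta*C^T, i.e. by swapping the A and B
   arguments, since Fortran interprets each row-major array as its
   transpose. Standard Fortran BLAS interface assumed. */
extern void dgemm_(const char *transa, const char *transb,
                   const int *m, const int *n, const int *k,
                   const double *alpha, const double *a, const int *lda,
                   const double *b, const int *ldb,
                   const double *beta, double *c, const int *ldc);

void row_major_dgemm(int m, int n, int k, double alpha,
                     const double *A, const double *B,
                     double beta, double *C)
{
    /* B comes first, A second; the m and n extents swap with them. */
    dgemm_("N", "N", &n, &m, &k, &alpha, B, &n, A, &k, &beta, C, &n);
}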
My code:
int main(int argc, char **argv) {
int i, j, k;
/************ MPI ***************************/
int myrank_mpi, nprocs_mpi;
MPI_Init( &argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &myrank_mpi);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs_mpi);
/************ BLACS ***************************/
int ictxt, nprow, npcol, myrow, mycol,nb;
int info,itemp;
int _ZERO=0,_ONE=1;
int M=20000;
int K=20000;
int N=20000;
nprow = 2; npcol = 2;
nb=1200;
Cblacs_pinfo( &myrank_mpi, &nprocs_mpi ) ;
Cblacs_get( -1, 0, &ictxt );
Cblacs_gridinit( &ictxt, "Row", nprow, npcol );
Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol );
//printf("myrank = %d\n",myrank_mpi);
int rA = numroc_( &M, &nb, &myrow, &_ZERO, &nprow );
int cA = numroc_( &K, &nb, &mycol, &_ZERO, &npcol );
int rB = numroc_( &K, &nb, &myrow, &_ZERO, &nprow );
int cB = numroc_( &N, &nb, &mycol, &_ZERO, &npcol );
int rC = numroc_( &M, &nb, &myrow, &_ZERO, &nprow );
int cC = numroc_( &N, &nb, &mycol, &_ZERO, &npcol );
double *A = (double*) malloc(rA*cA*sizeof(double));
double *B = (double*) malloc(rB*cB*sizeof(double));
double *C = (double*) malloc(rC*cC*sizeof(double));
int descA[9],descB[9],descC[9];
descinit_(descA, &M, &K, &nb, &nb, &_ZERO, &_ZERO, &ictxt, &rA, &info);
descinit_(descB, &K, &N, &nb, &nb, &_ZERO, &_ZERO, &ictxt, &rB, &info);
descinit_(descC, &M, &N, &nb, &nb, &_ZERO, &_ZERO, &ictxt, &rC, &info);
double alpha = 1.0; double beta = 1.0;
double start, end, flops;
srand(time(NULL)*myrow+mycol);
#pragma simd
for (j=0; j<rA*cA; j++)
{
A[j]=((double)rand()-(double)(RAND_MAX)*0.5)/(double)(RAND_MAX);
// printf("A in myrank: %d\n",myrank_mpi);
}
// printf("A: %d\n",myrank_mpi);
#pragma simd
for (j=0; j<rB*cB; j++)
{
B[j]=((double)rand()-(double)(RAND_MAX)*0.5)/(double)(RAND_MAX);
}
#pragma simd
for (j=0; j<rC*cC; j++)
{
C[j]=((double)rand()-(double)(RAND_MAX)*0.5)/(double)(RAND_MAX);
}
MPI_Barrier(MPI_COMM_WORLD);
start=MPI_Wtime();
new_pdgemm ("N", "N", &M , &N , &K , &alpha, A , &_ONE, &_ONE , descA , B , &_ONE, &_ONE , descB , &beta , C , &_ONE, &_ONE , descC );
MPI_Barrier(MPI_COMM_WORLD);
end=MPI_Wtime();
if (myrow==0 && mycol==0)
{
flops = 2 * (double) M * (double) N * (double) K / (end-start) / 1e9;
/* printf("This is value: %d\t%d\t%d\t%d\t%d\t%d\t\n",rA,cA,rB,cB,rC,cC);
printf("%f\t%f\t%f\n", A[4], B[6], C[3]);*/
printf("%f Gflops\n", flops);
}
Cblacs_gridexit( 0 );
MPI_Finalize();
free(A);
free(B);
free(C);
return 0;
}
Best Answer
OK, this isn't really an answer, but it is too long for a comment, and I wanted the formatting that an answer gives you.
So, I fixed the bug I mentioned in the comments, namely making the argument of blacs_gridexit be ictxt, as the routine's documentation requires, and then replaced your routine with the standard pdgemm. Once I had made those changes and cut the matrix size down to 2,000 × 2,000 to fit on my laptop, the code ran successfully, at least in the sense that it reported no error and produced a plausible GFlop rate. So to me this says one of two things:
- there is a bug in the code that you haven't shown us, or
- there is a problem with your MPI, BLACS, PBLAS and/or ScaLAPACK installation.
I would therefore reinstall the libraries you are using, make sure they are consistent with the compiler you are using, run the test programs provided with the libraries, and also include the header files you omitted from your code (DON'T leave those out, they are important!). If all of that works, I would suggest the failure is due to a bug in your code. Is there a reason you can't show it to us?
Below is the code that ran successfully for me. If I were doing this properly in my own code I would also fix all those compiler warnings, by making sure an appropriate prototype is in scope whenever a function is called.
ian-admin@agon ~/work/stack/mpi $ cat stack.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "mpi.h"
int main(void) {
int i, j, k;
/************ MPI ***************************/
int myrank_mpi, nprocs_mpi;
MPI_Init( NULL, NULL);
MPI_Comm_rank(MPI_COMM_WORLD, &myrank_mpi);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs_mpi);
/************ BLACS ***************************/
int ictxt, nprow, npcol, myrow, mycol,nb;
int info,itemp;
int _ZERO=0,_ONE=1;
int M=2000;
int K=2000;
int N=2000;
nprow = 2; npcol = 2;
nb=1200;
Cblacs_pinfo( &myrank_mpi, &nprocs_mpi ) ;
Cblacs_get( -1, 0, &ictxt );
Cblacs_gridinit( &ictxt, "Row", nprow, npcol );
Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol );
//printf("myrank = %d\n",myrank_mpi);
int rA = numroc_( &M, &nb, &myrow, &_ZERO, &nprow );
int cA = numroc_( &K, &nb, &mycol, &_ZERO, &npcol );
int rB = numroc_( &K, &nb, &myrow, &_ZERO, &nprow );
int cB = numroc_( &N, &nb, &mycol, &_ZERO, &npcol );
int rC = numroc_( &M, &nb, &myrow, &_ZERO, &nprow );
int cC = numroc_( &N, &nb, &mycol, &_ZERO, &npcol );
double *A = (double*) malloc(rA*cA*sizeof(double));
double *B = (double*) malloc(rB*cB*sizeof(double));
double *C = (double*) malloc(rC*cC*sizeof(double));
int descA[9],descB[9],descC[9];
descinit_(descA, &M, &K, &nb, &nb, &_ZERO, &_ZERO, &ictxt, &rA, &info);
descinit_(descB, &K, &N, &nb, &nb, &_ZERO, &_ZERO, &ictxt, &rB, &info);
descinit_(descC, &M, &N, &nb, &nb, &_ZERO, &_ZERO, &ictxt, &rC, &info);
double alpha = 1.0; double beta = 1.0;
double start, end, flops;
srand(time(NULL)*myrow+mycol);
#pragma simd
for (j=0; j<rA*cA; j++)
{
A[j]=((double)rand()-(double)(RAND_MAX)*0.5)/(double)(RAND_MAX);
// printf("A in myrank: %d\n",myrank_mpi);
}
// printf("A: %d\n",myrank_mpi);
#pragma simd
for (j=0; j<rB*cB; j++)
{
B[j]=((double)rand()-(double)(RAND_MAX)*0.5)/(double)(RAND_MAX);
}
#pragma simd
for (j=0; j<rC*cC; j++)
{
C[j]=((double)rand()-(double)(RAND_MAX)*0.5)/(double)(RAND_MAX);
}
MPI_Barrier(MPI_COMM_WORLD);
start=MPI_Wtime();
pdgemm_ ("N", "N", &M , &N , &K , &alpha, A , &_ONE, &_ONE , descA , B , &_ONE, &_ONE , descB , &beta , C , &_ONE, &_ONE , descC );
MPI_Barrier(MPI_COMM_WORLD);
end=MPI_Wtime();
if (myrow==0 && mycol==0)
{
flops = 2 * (double) M * (double) N * (double) K / (end-start) / 1e9;
/* printf("This is value: %d\t%d\t%d\t%d\t%d\t%d\t\n",rA,cA,rB,cB,rC,cC);
printf("%f\t%f\t%f\n", A[4], B[6], C[3]);*/
printf("%f Gflops\n", flops);
}
Cblacs_gridexit( ictxt );
MPI_Finalize();
free(A);
free(B);
free(C);
return 0;
}
ian-admin@agon ~/work/stack/mpi $ mpicc -g stack.c /home/ian-admin/Downloads/scalapack-2.0.2/libscalapack.a -llapack -lblas -lgfortran
stack.c: In function ‘main’:
stack.c:24:4: warning: implicit declaration of function ‘Cblacs_pinfo’ [-Wimplicit-function-declaration]
Cblacs_pinfo( &myrank_mpi, &nprocs_mpi ) ;
^~~~~~~~~~~~
stack.c:25:4: warning: implicit declaration of function ‘Cblacs_get’ [-Wimplicit-function-declaration]
Cblacs_get( -1, 0, &ictxt );
^~~~~~~~~~
stack.c:26:4: warning: implicit declaration of function ‘Cblacs_gridinit’ [-Wimplicit-function-declaration]
Cblacs_gridinit( &ictxt, "Row", nprow, npcol );
^~~~~~~~~~~~~~~
stack.c:27:4: warning: implicit declaration of function ‘Cblacs_gridinfo’ [-Wimplicit-function-declaration]
Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol );
^~~~~~~~~~~~~~~
stack.c:31:13: warning: implicit declaration of function ‘numroc_’ [-Wimplicit-function-declaration]
int rA = numroc_( &M, &nb, &myrow, &_ZERO, &nprow );
^~~~~~~
stack.c:44:6: warning: implicit declaration of function ‘descinit_’ [-Wimplicit-function-declaration]
descinit_(descA, &M, &K, &nb, &nb, &_ZERO, &_ZERO, &ictxt, &rA, &info);
^~~~~~~~~
stack.c:72:5: warning: implicit declaration of function ‘pdgemm_’ [-Wimplicit-function-declaration]
pdgemm_ ("N", "N", &M , &N , &K , &alpha, A , &_ONE, &_ONE , descA , B , &_ONE, &_ONE , descB , &beta , C , &_ONE, &_ONE , descC );
^~~~~~~
stack.c:83:4: warning: implicit declaration of function ‘Cblacs_gridexit’ [-Wimplicit-function-declaration]
Cblacs_gridexit( ictxt );
^~~~~~~~~~~~~~~
/usr/bin/ld: warning: libgfortran.so.3, needed by //usr/lib/liblapack.so, may conflict with libgfortran.so.5
ian-admin@agon ~/work/stack/mpi $ mpirun -np 4 --oversubscribe ./a.out
9.424291 Gflops
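For completeness, prototypes along these lines would silence those implicit-declaration warnings (a sketch only; the C interface to BLACS is not fully standardised, so check the names and signatures against the headers shipped with your own BLACS/ScaLAPACK installation):

/* C BLACS routines: grid context passed by value except where written. */
extern void Cblacs_pinfo(int *mypnum, int *nprocs);
extern void Cblacs_get(int icontxt, int what, int *val);
extern void Cblacs_gridinit(int *icontxt, const char *layout, int nprow, int npcol);
extern void Cblacs_gridinfo(int icontxt, int *nprow, int *npcol, int *myprow, int *mypcol);
extern void Cblacs_gridexit(int icontxt);

/* Fortran routines: every argument is passed by reference. */
extern int  numroc_(const int *n, const int *nb, const int *iproc,
                    const int *isrcproc, const int *nprocs);
extern void descinit_(int *desc, const int *m, const int *n,
                      const int *mb, const int *nb,
                      const int *irsrc, const int *icsrc,
                      const int *ictxt, const int *lld, int *info);
extern void pdgemm_(const char *transa, const char *transb,
                    const int *m, const int *n, const int *k,
                    const double *alpha,
                    const double *a, const int *ia, const int *ja, const int *desca,
                    const double *b, const int *ib, const int *jb, const int *descb,
                    const double *beta,
                    double *c, const int *ic, const int *jc, const int *descc);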