c++ - 信号 : Segmentation fault (11) when using Openmpi

标签 c++ c linux segmentation-fault openmpi

我正在使用一个基于 Open MPI 的 CFD 代码。当我只用一个核心运行它时,没有任何问题;但是当我用多个核心运行它时,就会得到如下错误:

[DESKTOP-7D2F3AN:03839] Process received signal

[DESKTOP-7D2F3AN:03839] Signal: Segmentation fault (11)

[DESKTOP-7D2F3AN:03839] Signal code: (128)

[DESKTOP-7D2F3AN:03839] Failing at address: (nil)

[DESKTOP-7D2F3AN:03839] [ 0] /lib/x86_64-linux-gnu/libpthread.so.0(+0x12890) [0x7f405efe2890]

[DESKTOP-7D2F3AN:03839] [ 1] /lib/x86_64-linux-gnu/libc.so.6(+0x18ec3c) [0x7f405ed5ec3c]

[DESKTOP-7D2F3AN:03839] [ 2] /usr/local/lib/libmpi.so.0(ompi_convertor_pack+0x196) [0x7f405f462176]

[DESKTOP-7D2F3AN:03839] [ 3] /usr/local/lib/openmpi/mca_pml_ob1.so(+0x10851) [0x7f405b780851]

[DESKTOP-7D2F3AN:03839] [ 4] /usr/local/lib/openmpi/mca_pml_ob1.so(+0x53ee) [0x7f405b7753ee]

[DESKTOP-7D2F3AN:03839] [ 5] /usr/local/lib/openmpi/mca_coll_tuned.so(+0xb2ff) [0x7f4059e2b2ff]

[DESKTOP-7D2F3AN:03839] [ 6] /usr/local/lib/openmpi/mca_coll_tuned.so(+0xbaf1) [0x7f4059e2baf1]

[DESKTOP-7D2F3AN:03839] [ 7] /usr/local/lib/openmpi/mca_coll_tuned.so(+0x20ab) [0x7f4059e220ab]

[DESKTOP-7D2F3AN:03839] [ 8] /usr/local/lib/openmpi/mca_coll_sync.so(+0x1377) [0x7f405a041377]

[DESKTOP-7D2F3AN:03839] [ 9] /usr/local/lib/libmpi.so.0(MPI_Bcast+0x11d) [0x7f405f46974d]

[DESKTOP-7D2F3AN:03839] [10] ./cgles(com_dist_param+0x4b) [0x7f4060236972]

[DESKTOP-7D2F3AN:03839] [11] ./cgles(main+0x377) [0x7f4060233377]

[DESKTOP-7D2F3AN:03839] [12] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xe7) [0x7f405ebf1b97]

[DESKTOP-7D2F3AN:03839] [13] ./cgles(_start+0x2a) [0x7f406022254a]

[DESKTOP-7D2F3AN:03839] End of error message

mpirun noticed that process rank 0 with PID 3839 on node DESKTOP-7D2F3AN exited on signal 11 (Segmentation fault).

我不知道为什么。代码太长。主要部分是这样的:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <unistd.h>
#include <string.h>
#include <signal.h>

#include "param.h"
#include "block.h"
#include "q.h"
#include "map.h"
#include "comms.h"
#include "util.h"
#include "dat.h"
#include "timing.h"
#include "sgm.h"
#include "error.h"
#include "init.h"
#include "mom.h"
#include "inst.h"
#include "pcg.h"
#include "bicgstab.h"
#include "ibm.h"
#include "hist.h"
#include "pcg_ext.h"
#include "dibm.h"
/* using BLAS library for better performance */
#ifdef BLAS 
#include <essl.h>
#endif

/* local prototypes */
void cmdopts(int *, char ***);
void mybasename(char *, char *);
void glob_output(void);
void glob_tstep(void);
void getstep(char *fname, double *Prtime, int *Pitime);
void putstep(char *fname, double *Prtime, int *Pitime);


/* external prototypes */
void slice_probe(void);
void print_solution(char* , int);
void print_vert(char*, int);
void print_wave(char*, int);
void print_moments(char*, int);

extern void InitializeIBPoint();
/*void glob_snapshot(char*);*/
extern void dibm_interploate_shear_velocity(void);


/* global, used by mgp.c, pcg.c and map.c */
/* global, used by mgp.c, pcg.c and map.c */
int mypid = -1;     /* my process id number */
int npid = 1;       /* total process number */
int halt_flg = 0;       /* halt t-steps, 1=stop, 0=cont */
double wck_t_start, wck_t_end;  /* wall clock timing */
double cpu_t_start, cpu_t_end;  /* cpu clock timing */
int time_limit = 999999999; /* wall clock limit (seconds) */
double Y_pro_mu=0; //Mu in Y-code;
double Y_pro_Laimuda=0;//Laimuda in Y-code
//char Y_y3dfile[256];  //*.y3d filename in Y-code
double Y_adheforcefactor=1.0;//force factor applied to ib points on Y solid, used in Yw.c-F2y()
double Y_shear_force_factor =1.0;
double Y_shear_vel = 0.0;
int Y_init_entity_num; //cell number at initilization of Y
//#Additional Misc by XuDong
int stat_y_save_dn; //  1000         #y solid saving interval
int stat_plane_save_dn;//   1000    #plane saving interval
int stat_glob_fluid_save_dn;//  10000000  #global fluid saving interval
int stat_restart_save_dn; //  1000   #Restart file saving interval
int stat_gtk_draw_dn; //  1          #gtkdraw interval when GTK defined
int stat_gtk_save_dn; //  100        #gtkdraw picture saving interval when GTK defined
int stat_y_typic_save_dn;
double case_parameter_1;//undefined reserved case parameter
double case_parameter_2;
double case_parameter_3;
double case_parameter_4;

/* main: driver routine */
int main(int argc, char **argv)
{
  int bid, ibid;
  /* switches */
  int add_seed_flg = 0, probe_flg = 0, putxd_2x_flg = 0, reset_stats_flg = 0;   

  /* Start up any other nodes. The cmd line options relating to the
   * parallel interface are parsed and pruned here. */
  mypid = com_start(&argc, &argv, 1);
  npid  = com_nnodes();
  wck_t_start = WCKseconds();
  cpu_t_start = CPUseconds();

  /* Parse the remaining options [first node only] */
  if (mypid == 0)
    cmdopts(&argc, &argv);

  /* Buffering mode for stdout; needed on T3D */
  fflush(stdout);
  setvbuf(stdout, (char *) 0, _IOLBF, 0); /* line buffering */

  /* Read configuration data on ONE node */
  if (mypid == 0) 
  {
    /* get the map data and mesh points */
    map_getfn(fnames.map, fnames.msh);
    map_print(BID_NULL);
    if (map_checkt() || map_checku())
      exit(2);

    /* debugging */
    /* map_putfn("temp.map"); */
    /* msh_putfn("temp.msh"); */

    /* read the local data file and build lookup table */
    dat_read(fnames.dat);
    dat_print();

    /* Modify parameters using info from *.dat file */
    dat_getdouble(&viscm, "viscm");
    dat_getdouble(&fbody_x, "fbody_x");
    dat_getdouble(&fbody_y, "fbody_y");
    dat_getdouble(&fbody_z, "fbody_z");
    dat_getint(&time_limit, "time_limit");

    //#Additional Misc by XuDong
    dat_getdouble(&crk_beta,"crk_beta");// #=0--explicit, ==1 fully implicit ==1/2 crank-nicolson
    dat_getdouble(&ibm_relax,"ibm_relax");// #=0.5---normal for direct forcing

    dat_getint(&stat_y_save_dn,"stat_y_save_dn");//  1000         #y solid saving interval
    dat_getint(&stat_plane_save_dn,"stat_plane_save_dn");//   1000    #plane saving interval
    dat_getint(&stat_glob_fluid_save_dn,"stat_glob_fluid_save_dn");//  10000000  #global fluid saving interval
    dat_getint(&stat_restart_save_dn,"stat_restart_save_dn");//  1000   #Restart file saving interval
    dat_getint(&stat_gtk_draw_dn,"stat_gtk_draw_dn");//  1          #gtkdraw interval when GTK defined
    dat_getint(&stat_gtk_save_dn,"stat_gtk_save_dn");//  100        #gtkdraw picture saving interval when GTK defined
    dat_getint(&stat_y_typic_save_dn,"stat_y_typic_save_dn");//  100        #gtkdraw picture saving interval when GTK defined


    dat_getdouble(&Y_adheforcefactor,"Y_adheforcefactor");//  1          #gtkdraw interval when GTK defined
    dat_getdouble(&Y_shear_force_factor,"Y_shear_force_factor");//  100        #gtkdraw picture saving interval when GTK defined
    dat_getdouble(&Y_shear_vel,"Y_shear_vel");//  100        #gtkdraw picture saving interval when GTK defined
    dat_getint(&Y_init_entity_num,"Y_init_entity_num");//  //cell number at initilization of Y

    dat_getdouble(&case_parameter_1,"case_parameter_1"); //undefined reserved case parameter
    dat_getdouble(&case_parameter_2,"case_parameter_2"); //undefined reserved case parameter
    dat_getdouble(&case_parameter_3,"case_parameter_3"); //undefined reserved case parameter
    dat_getdouble(&case_parameter_4,"case_parameter_4"); //undefined reserved case parameter 
    /* set current time step parameters */
    dat_getint(&ntime, "ntime");
    dat_getdouble(&dt, "dt");
    if (!flg_init)
      getstep(fnames.xdi, &rtime_current, &itime);
    itime_first = itime;
    itime_last = itime_first + ntime;
    rtime_first = rtime_current;
    rtime_last = rtime_current + dt * ntime;

    /* print fluid parameters */
    prparam();
  }


    /* Distibute to other nodes */
    com_dist_param();   /* distribute file names */
    com_dist_map();   /* distribute map */
    com_dist_data();    /* distribute table */

我找到了错误发生的地方,它在 com_dist_param() 中。

        /*
         * com_dist_param: broadcast the run parameters from rank 0 to all
         * processes in MPI_COMM_WORLD.
         *
         * Two broadcasts are issued, each sending one element of a derived
         * datatype: `fnames` described by fnames_type, then the parameter
         * block that begins at `fbody_x` described by params_type. Both
         * datatypes are defined outside this function and must already be
         * committed when this is called (NOTE(review): presumably
         * fnames_type comes from make_fnames_type() -- confirm).
         *
         * The return codes of MPI_Bcast are not checked here.
         */
        void com_dist_param(void)
        {
            /* This is all there is to it, courtesy of derived datatypes */
            MPI_Bcast(&fnames, 1, fnames_type, 0, MPI_COMM_WORLD);
            /* params starts with address of fbody_x */
            MPI_Bcast(&fbody_x, 1, params_type, 0, MPI_COMM_WORLD);
        }

错误发生在进入 MPI_Bcast 时。fnames 的类型定义如下:

/*
 * Fnames: the set of file names used by a run, one fixed-size buffer of
 * MAXSTR chars per file.
 */
typedef struct
  {
    char xdi[MAXSTR];       /* xd in */
    char xdo[MAXSTR];       /* xd out */
    char dat[MAXSTR];       /* data */
    char map[MAXSTR];       /* domain map */
    char msh[MAXSTR];       /* mesh points */
    char y3d[MAXSTR];       /* Y3D input */
    char log[MAXSTR];       /* logging */
    char mom[MAXSTR];       /* stats moments */
  }
Fnames;

并将 fnames_type 定义为 MPI_Datatype:

/*
 * make_fnames_type: build and commit an MPI derived datatype describing the
 * eight MAXSTR-sized char buffers of the global `fnames` structure.
 *
 * Each field's displacement is computed as its address minus the address of
 * the first field (fnames.xdi), so the resulting type is meant to be used
 * with &fnames as the buffer argument.
 *
 * Returns the committed datatype handle.
 *
 * NOTE: MPI_Address and MPI_Type_struct are kept to match the rest of the
 * code base; they were deprecated in MPI-2 and removed in MPI-3, where
 * MPI_Get_address and MPI_Type_create_struct replace them.
 */
MPI_Datatype
make_fnames_type(void)
{
    enum { NFIELDS = 8 };
    MPI_Datatype fnames_type;
    MPI_Datatype type[NFIELDS];
    int blocklen[NFIELDS];
    MPI_Aint disp[NFIELDS];
    /* Must be MPI_Aint, not int: it holds a full address.  Storing it in a
     * 32-bit int truncates the value on 64-bit systems, which leaves
     * garbage displacements and makes MPI_Bcast read from wild addresses. */
    MPI_Aint base;
    int i;

    /* absolute addresses of every field */
    MPI_Address(&fnames.xdi, disp);
    MPI_Address(&fnames.xdo, disp + 1);
    MPI_Address(&fnames.dat, disp + 2);
    MPI_Address(&fnames.map, disp + 3);
    MPI_Address(&fnames.msh, disp + 4);
    MPI_Address(&fnames.y3d, disp + 5);
    MPI_Address(&fnames.log, disp + 6);
    MPI_Address(&fnames.mom, disp + 7);

    /* convert to displacements relative to the first field */
    base = disp[0];
    for (i = 0; i < NFIELDS; i++) {
        type[i] = MPI_CHAR;
        blocklen[i] = MAXSTR;
        disp[i] -= base;
    }

    /* create it */
    MPI_Type_struct(NFIELDS, blocklen, disp, type, &fnames_type);
    MPI_Type_commit(&fnames_type);
    return (fnames_type);
}

谁能帮帮我?非常感谢!

最佳答案

假设,像大多数 MPI 用户一样,您正在 64 位机器上运行您的代码:

MPI_Aint 必须能够容纳一个指针,因此大小为 64 位。在函数 make_fnames_type() 中,为了从结构体各字段的地址中减去结构体的基地址,您把基地址存入了一个 int 变量(名字起得很贴切,叫 base)。但 int 变量只有 32 位,因此指针值很可能被截断而损坏。

您的结构体的基地址是一个 64 位的量,只应把它存入另一种 64 位类型,例如 unsigned long int 或 uint64_t。语句 base = disp[0]; 很可能是一次窄化(narrowing)转换,而且无论如何都不可移植。您应该检查相关类型的 sizeof 值。

旁注:在 C++ 中,与 C 不同,您不需要事先声明函数中的所有变量,可以在需要时再声明。因此,没有理由在给 int 变量 base 赋值之前 10 行源代码就先声明它。您可以把 int base = disp[0]; 写在同一处,这样更容易发现窄化问题。更好的做法是直接写 auto base = disp[0];,这样就不会发生窄化。

一种可能性是像这样改变循环:

    /* base must be wide enough to hold an address, which int is not on a
     * 64-bit system.  NOTE(review): long int is 64 bits on LP64 Linux but
     * not on every platform; MPI_Aint is the type disp[] itself uses. */
    long int base = disp[0];
    for (i = 0; i < 8; i++) {
        type[i] = MPI_CHAR;
        blocklen[i] = MAXSTR;
        disp[i] -= base;
        /* changed %d into %ld below */
        printf("disp[i] = %ld  blocklen[i] = %d\n",
            (long int)disp[i], blocklen[i]);
    }

您还可以通过 MPI_Type_size() 函数检查 MPI 自定义类型的内存大小是否符合您的预期。

一般说明:所有 C/C++ 中的 MPI 函数都返回一个整数错误代码。如果您跳过检查这些错误代码,后果自负。很可能某些 MPI 函数在调用 MPI_Bcast() 之前就返回了错误代码。

关于c++ - 信号 : Segmentation fault (11) when using Openmpi,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/58296806/

相关文章:

c++ - OpenCV、OpenFramework 和处理框架之间有什么区别?

linux - 从 ARGV 访问变量

python - 如何组织使用 Curl 返回的数组?

linux - Docker 不会将文件从容器复制到主机的/tmp 文件夹

c - 为什么这个反向函数在for循环中不起作用?

c++ - 如何从C中的数据中删除Ì

c++ - 谁能告诉我为什么显示 "runtime error"?

c++ - 在 C/C++ 中,我想多次写入同一个管道

c++ - UML 图上数组的表示数组

c - 递归 C 宏未展开