c++ - 与 zheevr 的 OpenMP C++ 数据竞赛

标签 c++ openmp lapack

我正在尝试使用 openMP 在 C++ 中并行化 for 循环。我有许多矩阵(矩阵类)需要用 zheevr 取幂。该实现给出了数据竞赛。

并行化的for循环是

/* in main */
#pragma omp parallel for shared(Aexp_omp, A, Z, W) private(j) firstprivate(idt)
for (j=0; j< nMat; ++j) {
    Aexp_omp[j] = EigenMethod(A[j],-idt,&(Z[j]),W[j]);
}

这里,Aexp_ompAZ都是Matrix类的数组。确切的行似乎是 EigenMethod 中对 zheevr 的第二次调用。当 #pragma omp critical 没有被注释掉时,代码可以正确执行(但是在这里放置 pragma omp critical 似乎违背了将这段代码并行化的目的)。

我不明白数据竞争是从哪里来的,因为如果我没记错的话,循环中清除的所有变量都是线程私有(private)的?

关于为什么会发生这种情况的任何见解?我是否错误地清除了代码的 pragma omp parallel for 部分?

非常感谢您的帮助。

编辑:完整代码

为了提供所有信息,这里是完整的代码(有点长,我很抱歉)。它使用 makefile 在我的机器上编译

omp_matrix: omp_matrix.cpp
        g++ -fopenmp -lgsl -lgslcblas -llapack -lblas -lm -o omp_matrix omp_matrix.cpp

由于我怀疑数据竞争,它在运行时会出现随机行为。代码必须包含文件,包含 main(和一些额外的命名空间)和 Matrix 类的文件。

/* omp_matrix.cpp */
#include <iostream>
#include <omp.h>
#include <cstdio>
#include <ctime>
#include <string>
#include <cmath>
#include <vector>
#include <complex>
#include <limits>
#include <cstdlib>

const char Trans[]  = {'N','T','C'};
const char UpLo[]   = {'U','L'};
const char Jobz[]   = {'V','N'};
const char Range[]  = {'A','V','I'};
#ifdef __cplusplus
extern "C" {
#endif

void zgemm_(const char* TransA, const char* TransB, const size_t* M, const size_t* N, const size_t* K, const std::complex<double>* alpha, const std::complex<double>* A, const size_t* lda, const std::complex<double>* B, const size_t* ldb, const std::complex<double>* beta, std::complex<double>* C, size_t* ldc);
int zheevr_(const char* jobz, const char* range, const char* uplo, const size_t* n, std::complex<double>* a, size_t* lda, double* vl, double* vu, size_t* il, size_t* iu, double* abstol, size_t* m, double* w, std::complex<double>* z, size_t* ldz, int* isuppz, std::complex<double>* work, int* lwork, double* rwork, int* lrwork, int* iwork, int* liwork, int* info);

#ifdef __cplusplus
}
#endif

    #include "Matrix.hpp"

namespace MOs {
    //Identity Matrix
    template <class T> 
    inline void Identity(Matrix<T>& I){
    size_t dim = I.GetRows();
    if (dim != I.GetColumns())
            exit(1);
    for(size_t i=0; i<dim; i++)
        for(size_t j=0; j<dim; j++)
                I(i,j) = i == j ? T(1) : T(0);
    return; 
    }

    // The H.O. Annilation operator
    template <class T> 
    inline void Destroy(Matrix<T>& a){
            size_t dim = a.GetRows();
            if (dim != a.GetColumns())
                    exit(1);
            for(size_t i=0; i<dim; i++)
                    for(size_t j=0; j<dim; j++)
                            a(i,j) = j == i+1 ? std::sqrt(T(j)) : T(0); 
            return; 
    }

    //Take the Hermitian conjugate of a complex Matrix
    inline Matrix<std::complex<double> > Dagger(const Matrix<std::complex<double> >& A){
            size_t cols = A.GetColumns(), rows= A.GetRows();
            Matrix<std::complex<double> > temp(cols,rows);
            for (size_t i=0; i < rows; i++)
                    for(size_t j=0; j < cols; j++)
                            temp(j,i) = conj(A(i,j));
            return temp;
    }

    template <class T>
    inline Matrix<T> TensorProduct(const Matrix<T>& A, const Matrix<T>& B){
            size_t rows1 = A.GetRows(), rows2 = B.GetRows(), cols1 = A.GetColumns(), cols2 = B.GetColumns();
            size_t rows_new = rows1*rows2, cols_new = cols1*cols2, n, m;
            Matrix<T> temp(rows_new,cols_new);
            for(size_t i=0; i<rows1; i++)
                    for(size_t j=0; j<cols1; j++)
                            for(size_t p=0; p<rows2; p++)
                                    for(size_t q=0; q<cols2; q++){
                                            n = i*rows2+p;    
                                            m = j*cols2+q; //  0 (0 + 1)  + 1*dimb=2 + (0 + 1 )  (j*dimb+q)         
                                            temp(n,m) = A(i,j)*B(p,q);
                            }
                    return temp;
            }
}

Matrix<std::complex<double> > EigenMethod(const Matrix<std::complex<double> >& Ain,  std::complex<double> x, Matrix<std::complex<double> >* Z=NULL, double *W=NULL );

using namespace std;

int main(int argc, char const *argv[]) {
    // dim is the dimension of the system
    size_t  dimQ    = 2;
    size_t  dim = dimQ*dimQ*dimQ;

    // Define the matrices used to build the matrix to be exponentiated
    Matrix<complex<double> > o(dimQ,dimQ), od(dimQ,dimQ), nq(dimQ,dimQ);
    MOs::Destroy(o);
    od = MOs::Dagger(o); nq=od*o;
    Matrix<complex<double> > IdentQ(dimQ, dimQ), Hd(dim, dim);
    MOs::Identity(IdentQ);
    Hd = 0.170*MOs::TensorProduct(MOs::TensorProduct(od,IdentQ),o) + 0.124* MOs::TensorProduct(MOs::TensorProduct(IdentQ,od),o);

    complex<double> idt = complex<double>(0.0,0.2);                 // time unit multiplied by i

    size_t  nMat    = 2;
    double* c   = new double[nMat];
    c[0] = 0.134;
    c[1] = -0.326;

    // Decalre matrices and allocate memory for them
    Matrix<complex<double> >*   A    = new Matrix<complex<double> >[nMat];
    Matrix<complex<double> >*   Aexp     = new Matrix<complex<double> >[nMat];
    Matrix<complex<double> >*   Aexp_omp = new Matrix<complex<double> >[nMat];
    for (size_t k = 0; k < nMat; ++k)
            A[k] = c[k]*Hd;

    // METHOD 1: Serial. Exponentiate one matrix after another
    double start_s = omp_get_wtime();
    for (size_t k = 0; k < nMat; ++k)
            Aexp[k] = EigenMethod(A[k],-idt);
    double end_s = omp_get_wtime();

    cout << "Aexp[0] = \n" << Aexp[0] << endl;
    cout << "Aexp[1] = \n" << Aexp[1] << endl;

    // METHOD 2: Parallel. Exponentiate all matrices at the same time
    // THIS DOES NOT WORK. Data race condition in EigenMetod?
    Matrix<complex<double> >* Z = new Matrix<complex<double> >[nMat];
    double** W = new double*[nMat];
    for (size_t k = 0; k < nMat; ++k) {
            Z[k].initialize(dim,dim);
            W[k] = new double[dim];
    }

    size_t j;
    double start_p1 = omp_get_wtime( );
    #pragma omp parallel for shared(Aexp_omp,A,Z,W) private(j) firstprivate(idt)
    for (j=0; j< nMat; ++j) {
            Aexp_omp[j] = EigenMethod(A[j],-idt,&(Z[j]),W[j]);
    }
    double end_p1 = omp_get_wtime( );

    cout << "Done" << endl;

    cout << "Aexp_omp[0] = \n" << Aexp_omp[0] << endl;
    cout << "Aexp_omp[1] = \n" << Aexp_omp[1] << endl;

    cout << "Serial time: " << end_s - start_s << ", parallel time method 2: " << end_p1-start_p1 << endl;

    delete [] c;
    delete [] A;
    delete [] Aexp;
    delete [] Aexp_omp;

} /* end of int main */

Matrix<complex<double> > EigenMethod(const Matrix<complex<double> >& A, complex<double> x, Matrix<complex<double> >* Z, double *W){ 

    bool returnZ(Z!=NULL), returnW(W!=NULL);                            // flags to see if the function received valid Z and W adresses

    //if (MOs::IsHermitian(A)==0) UFs::MyError("Routine ExpM::EigenMethod: The input Matrix is not Hermitian.");
    size_t N = A.GetRows(), LDA=A.GetLD(), IL, IU, M, LDZ=N;
    double VL, VU, ABSTOL=numeric_limits<double>::min(); 
    complex<double>*    WORK;
    double*         RWORK;
    int*            IWORK;

    //finding the optimal work sizes
    int LWORK=-1, LRWORK=-1, LIWORK=-1, ok=0;
    WORK    = new std::complex<double>[1];
    RWORK   = new double[1];
    IWORK   = new int[1];

    // ZHEEV computes all eigenvalues and, optionally, eigenvectors of a complex Hermitian Matrix A.
    // Jobz[0] = V i.e. compute eigenvalues and eigen-vectors, Range[0] = A i.e. all eigen values will be found, UpLo[1] = L i.e. lower triangle of matrix is stored
    zheevr_ (&Jobz[0],&Range[0],&UpLo[1],&N,NULL,&LDA,&VL,&VU,&IL,&IU,&ABSTOL,&M,NULL,NULL,&LDZ,NULL,WORK,&LWORK,RWORK,&LRWORK,IWORK,&LIWORK,&ok);

    //Setting the optimal workspace
    LWORK   = (int)real(WORK[0]);
    LRWORK  = (int)RWORK[0];
    LIWORK  = IWORK[0];

    delete [] WORK; 
    delete [] RWORK;
    delete [] IWORK;

    //Running the algorithim 
    if (!returnZ) Z = new Matrix<std::complex<double> >(LDZ,LDZ);
    if (!returnW) W = new double[LDA];

    Matrix<std::complex<double> > temp(A);

    int *ISUPPZ;
    ISUPPZ  = new int[2*N]; 
    WORK    = new std::complex<double>[LWORK];
    RWORK   = new double[LRWORK];
    IWORK   = new int[LIWORK];
    //#pragma omp critical
    //{
    // THIS LINE IS DOING SOMETHING WRONG
    zheevr_ (&Jobz[0],&Range[0],&UpLo[1],&N,temp.GetMat(),&LDA,&VL,&VU,&IL,&IU,&ABSTOL,&M,W,Z->GetMat(),&LDZ,ISUPPZ,WORK,&LWORK,RWORK,&LRWORK,IWORK,&LIWORK,&ok);   
    //}

    if (ok!=0) {
        cerr << "Routine EigenMethod: An error occured in geting the diagonalization of A" << endl;
        exit(1);
    }

    //Getting the Diag*Dagger(Z), this is where the exponentiation is done using the eigenvalues
    for (size_t i=0; i < N; i++)
        for(size_t j=0; j < N; j++)
            temp(i,j) = exp(x*W[i])*conj((*Z)(j,i));

    temp = (*Z)*temp;   

    if(!returnW) delete [] W; 
    if(!returnZ) delete Z; 

    delete [] ISUPPZ;
    delete [] WORK;
    delete [] RWORK;
    delete [] IWORK;

    return temp;
}

接下来是Matrix类

#ifndef Matrix_h
#define Matrix_h

enum OutputStyle {Column, List, Array};

template <class T>
class Matrix {  
//friend functions get to use the private varibles of the class as well as have different classes as inputs
template <class S>
friend std::ostream& operator << (std::ostream& output, const Matrix<S>& A){
    for (size_t i=0; i<A.rows_; i++){
        for (size_t j=0; j<A.cols_; j++){
            output << A.mat_[j*A.rows_ +i] << "\t";
        }
        output << std::endl;
    }
    return output;
}

// overloads beta*A
template <class S1, class S2>
friend Matrix<S1> operator*(const S2& beta, const Matrix<S1>& A){
    size_t rows= A.rows_, cols = A.cols_;
    Matrix<S1> temp(rows,cols);
    for (size_t i=0; i < rows; i++)
        for(size_t j=0; j < cols; j++)
            temp(i,j) = beta*A(i,j);
    return temp;
}


friend Matrix<std::complex<double> > operator*(const Matrix<std::complex<double> >& A, const Matrix<std::complex<double> >& B) {
    static Matrix<std::complex<double> > C;
    C.initialize(A.rows_,B.cols_);
    std::complex<double> alpha =1.0, beta =0.0;
    zgemm_(&Trans[0], &Trans[0], &A.rows_, &B.cols_, &A.cols_, &alpha, A.mat_, &A.LD_, B.mat_, &B.LD_, &beta, C.mat_, &C.LD_);
    return C;
}


public:
//Construtors
Matrix() : rows_(0), cols_(0), size_(0), LD_(0), outputstyle_(Array){ mat_=0;}
Matrix(size_t rows, size_t cols) : rows_(rows), cols_(cols), size_(rows*cols), LD_(rows), outputstyle_(Array), mat_(new T [size_]){}

Matrix(const Matrix<T>& rhs) : rows_(rhs.rows_), cols_(rhs.cols_), size_(rhs.size_), LD_(rows_), outputstyle_(rhs.outputstyle_), mat_(new T [size_]) {
for(size_t p = 0; p < size_; p++)
    mat_[p] = rhs.mat_[p];
}

//Initialize an empty Matrix() to Matrix(size_t  rows, size_t cols)
inline void initialize(size_t  rows, size_t cols) {
    if (rows_ != rows || cols_ != cols ) {
        if (mat_ != 0) delete [] (mat_);
        rows_   = rows;
        cols_   = cols;
        size_   = rows_*cols_;
        LD_ = rows;
        mat_    = new T[size_];
    }
}

//Destructor
virtual ~Matrix()   {if (mat_ != 0) delete[] (mat_);}

//Assignment operator
inline Matrix<T>& operator=(const Matrix<T>& rhs){
    if (rows_ != rhs.rows_ || cols_ != rhs.cols_ ) {
        if (mat_ != 0) delete [] (mat_);
        rows_=rhs.rows_;
        cols_=rhs.cols_;
        size_=rows_*cols_;
        LD_=rhs.LD_;
        mat_= new T[size_];
    }
    for (size_t p=0; p < size_; p++)
        mat_[p] = T(rhs.mat_[p]);
    return *this;
}

inline T& operator()(size_t i, size_t j){
    if (i >= rows_ || j >= cols_) exit(1);
    return mat_[j*rows_+i];
}

inline T operator()(size_t i, size_t j) const{
    if (i >= rows_ || j >= cols_) exit(1);
    return mat_[j*rows_+i];
}


//overloading functions. 
inline Matrix<T> operator+(const Matrix<T>& A){
    if(rows_ != A.rows_ || cols_ != A.cols_)
        exit(1);
    Matrix<T> temp(rows_,cols_);
    for (unsigned int p=0; p < size_; p++)      
        temp.mat_[p] = mat_[p] + A.mat_[p];
    return temp;
}

//Member Functions
inline size_t GetColumns() const            { return cols_; }
inline size_t GetRows() const               { return rows_; }
inline size_t GetLD() const             { return LD_;   }
inline size_t size() const              { return size_; }
T* GetMat() const                   { return mat_;  }

protected:
size_t rows_, cols_, size_, LD_;
//rows_ and cols_ are the rows and columns of the Matrix
//size_ = rows*colums dimensions of the vector representation   
//LD is the leading dimeonsion and for Column major order is in general eqaul to rows
enum OutputStyle outputstyle_;

T * mat_;

};
#endif /* Matrix_h */

最佳答案

您并没有真正提供足够的代码(也没有实际症状)来为您调试它。如果没有它,我只能猜测 zheerv_() 的实现使用了全局变量(直接或间接)。

除了我已经在评论中提到的代码味道外,变量 idt 还存在一个问题:函数 EigenMethod() 通过引用获取它,但是你传递了一个临时的 (-idt) -- 我很惊讶编译。我认为您并没有真正在 EigenMethod() 中更改其值 (x),因此您应该采用常量引用或(最好)仅采用一个值。

顺便说一句,不需要在并行循环中声明共享变量,直接说

 #pragma omp parallel for
 for(int j=0; j<nMat ;++j) { /* ... */ }

默认情况下,在循环外声明的所有变量都是共享的,循环变量永远不应该被声明为任何东西(无论如何,private 似乎不是正确的选择)。

关于c++ - 与 zheevr 的 OpenMP C++ 数据竞赛,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/12410472/

相关文章:

c++ - 将字符串转换为int的简单方法? C++

c++ - "function"类型的优点是什么(不是 "pointer to function")

c++ - Boost asio ConstBufferSequence - C++ 模板

c++ - 无法获得加速 OpenMP

c++ - dbgheap.c 抛出访问冲突异常

C++ 可变参数和 vsprintf

c++ - OpenMP:并行 for(i;...) 和 i 值

c - 为什么openmp 32线程比1线程慢得多?

cmake - 使用 CMake 检测 BLAS/LAPACK 供应商

c - 在调用 lapack 子例程之前我们需要复制矩阵吗