c - 使用 OpenMP 并行化 C 代码后的性能损失

我需要使用 OpenMP 并行化 C 代码。我已经像下面的代码那样做到了。该代码的性能已从 0.001XXX 下降到 1.XXX ，因此存在竞争条件(瓶颈)。在代码中，我对它们在哪里进行了注释(在 MAYUS 中进行了注释，并且没有制成表格，以便更好地可视化 //BOTTLENECK)。我不知道为什么会有这样的照顾者条件。

代码是

#define sqr(x) ((x)*(x))
#define MAX_ITER_NO_IMPR 10



void fail(const char * str) {
    fprintf(stderr,"%s", str);
    exit(-1);
}

/**
* calc_distance calculates the distance between a given point and a cluster
* @param int -dim: number of columns (variables) in the data set to be classified
* @param float * -: first arrray to calculate de distance
* @param float * -: Second array to calculate de distance 
* @return float: Euclidean distance of two vectors
*/
float calc_distance(int dim, float *restrict p1, float *restrict p2) {
    float distance_sq_sum = 0;

    #pragma omp parallel for reduction(+:distance_sq_sum)     
    for (int i = 0; i < dim; ++i)
        distance_sq_sum += sqr(p1[i] - p2[i]);

    return distance_sq_sum;  
}

/**
* calc_all_distances computes the euclidean distances between centros ids and dataset points. 
* @param int -dim: number of columns (variables) in the data set to be classified
* @param int -n: number of rows (points) in the data set to be classified   
* @param int -k: number of clusters to be calculated
* @param float * -X: dataset to be classified
* @param float * -centroid: prototypes of each cluster. 
* @param float * -distance_output[n][k] contains the distance between all elements * in the dataset and all clusters
* return void  
*/
void calc_all_distances(int dim, int n, int k, float *restrict X, float *restrict centroid, float *distance_output) {
    #pragma omp parallel for simd
    for (int i = 0; i < n; ++i) // for each point
        for (int j = 0; j < k; ++j) // for each cluster 
            // calculate distance between point and cluster centroid
            distance_output[i*k+j] = calc_distance(dim, &X[i*dim], &centroid[j*dim]);
}


/**
* calc_total_distance calculates the clustering overall distance.  
* @param int -dim: number of columns (variables) in the data set to be classified
* @param int -n: number of rows (points) in the data set to be classified   
* @param int -k: number of clusters to be calculated
* @param float * -X: dataset to be classified
* @param float * -centroid: prototypes of each cluster. 
* @param int * - cluster_assignment_index: current cluster assignment to each point
* @return float overall distance. This is what the algorithm tried to minimize  
*/
float calc_total_distance(int dim, int n, int k, float *restrict X, float *restrict centroids, int *restrict cluster_assignment_index) {
    // NOTE: a point with cluster assignment -1 is ignored
    float tot_D = 0;

    // for every point
    #pragma omp parallel for simd reduction(+:tot_D)
//BOTTLENECK
    for (int i = 0; i < n; ++i) {
        // which cluster is it in?
        int active_cluster = cluster_assignment_index[i];

        // sum distance
        if (active_cluster != -1)
            tot_D += calc_distance(dim, &X[i*dim], &centroids[active_cluster*dim]);
    }

    return tot_D;
}


/**
* choose_all_clusters_from_distances obtains the closest cluster for each point.  
* @param int -dim: number of columns (variables) in the data set to be classified
* @param int -n: number of rows (points) in the data set to be classified   
* @param int -k: number of clusters to be calculated
* @param float * -distance_array[n][k] contains the distance between all elements * in the dataset and all clusters
* @param int* - cluster_assignment_index contains the assigned cluster to each point 
* @return void
*/
void choose_all_clusters_from_distances(int dim, int n, int k, float *restrict distance_array, int *cluster_assignment_index) {
    // for each point
    #pragma omp parallel for simd
    for (int i = 0; i < n; ++i) {
        int best_index = -1;
        float closest_distance = INFINITY;

        // for each cluster
//  #pragma omp privete(best_index, closest_distance)
        for (int j = 0; j < k; ++j) {
            // distance between point and cluster centroid
            float cur_distance = distance_array[i*k+j];
            if (cur_distance < closest_distance) {
                best_index = j;
                closest_distance = cur_distance;
            }
        }

        // record in array
        cluster_assignment_index[i] = best_index;
    }
}

/**
* calc_cluster_centroids calculates the new prototypes of all clusters 
* @param int -dim: number of columns (variables) in the data set to be classified
* @param int -n: number of rows (points) in the data set to be classified   
* @param int -k: number of clusters to be calculated
* @param float * -X: dataset to be classified
* @param int * - cluster_assigment_index:  
* @param float * -new_cluster_centroid: it is the output with the new cluster prototypes
*/

void calc_cluster_centroids(int dim, int n, int k, float *restrict X, int *restrict cluster_assignment_index, float *new_cluster_centroid) {
    int * cluster_member_count = (int *) calloc (k,sizeof(float));

    // sum all points
    // for every point
    #pragma omp parallel for simd
//BOTTLENECK
    for (int i = 0; i < n; ++i) {
        // which cluster is it in?
        int active_cluster = cluster_assignment_index[i];

        // update count of members in that cluster
        ++cluster_member_count[active_cluster];

        // sum point coordinates for finding centroid
        for (int j = 0; j < dim; ++j)
            new_cluster_centroid[active_cluster*dim + j] += X[i*dim + j];
    }

// USAR CONCELLATION POINT FOR O LA OTRA FORMA COMENTADA?      
    // now divide each coordinate sum by number of members to find mean/centroid
    // for each cluster
    #pragma omp for
    for (int i = 0; i < k; ++i) {
        if (cluster_member_count[i] == 0) {
            //printf("WARNING: Empty cluster %d! \n", i);
            //break;
            #pragma omp cancel for
        }
        #pragma omp cancellation point for

        // for each dimension
    #pragma omp simd 
        for (int j = 0; j < dim; ++j)
            new_cluster_centroid[i*dim + j] /= cluster_member_count[i];  /// XXXX will divide by zero here for any empty clusters!
    }
}

/**
* get_cluster_member_count the member of each cluster
* @param int -n: number of rows (points) in the data set to be classified   
* @param int -k: number of clusters to be calculated
* @param int* - cluster_assignment_index contains the assigned cluster to each point 
* @param int * -cluster_member_count: count members of each cluster 
*/
void get_cluster_member_count(int n, int k, int *restrict cluster_assignment_index, int *cluster_member_count) {
    // count members of each cluster    
    #pragma omp parallel for
    for (int i = 0; i < n; ++i)
    #pragma omp atomic update
        ++cluster_member_count[cluster_assignment_index[i]];
}


/**
* Visualize the number of members for all clusters
*/
void cluster_diag(int dim, int n, int k, float *restrict X, int *restrict cluster_assignment_index, float *restrict cluster_centroid) {
    int * cluster_member_count = (int *) calloc (k, sizeof(int));

    get_cluster_member_count(n, k, cluster_assignment_index, cluster_member_count);

    printf("  Final clusters\n");
    #pragma omp parallel for ordered
    for (int i = 0; i < k; ++i) { 
    #pragma omp ordered
        printf("\tcluster %d:  members: %8d, for the centroid (", i, cluster_member_count[i]);
        for (int j = 0; j < dim; ++j)  
            #pragma omp ordered 
            printf ("%f, ", cluster_centroid[i*dim + j]);
    #pragma omp ordered
        printf (")\n");
    }
}

void copy_assignment_array(int n, int *restrict src, int *tgt) {
    #pragma omp  parallel for simd
    for (int i = 0; i < n; ++i)
        tgt[i] = src[i];
}  


int assignment_change_count(int n, int a[], int b[]) {
    int change_count = 0;

    #pragma omp parallel for reduction(+:change_count)
    for (int i = 0; i < n; ++i)
        if (a[i] != b[i])
            ++change_count;

    return change_count;
}


/*
* This is C source code for a simple implementation of the popular k-means clustering algorithm. 
* It is based on the implementation in Matlab, which was in turn based on GAF Seber, 
* Multivariate Observations, 1964, and H Spath, Cluster Dissection and Analysis: Theory, FORTRAN Programs, Examples.
* @param int -dim: number of columns (variables) in the data set to be classified (dimension of data)
* @param float * -X: dataset to be classified (pointer to data)
* @param int -n: number of rows (points) in the data set to be classified (number of elements)
* @param int -k: number of clusters to be calculated
* @param float * -cluster_centroid: Initial clusters prototypes or centros (initial cluster centroids)
* @param int iterations -: number of iterations to be performed
* @param int * cluster_assignment_final -: Output classitfication  
*/
void kmeans(int dim, float *X, int n, int k, float *cluster_centroid, int iterations, int *cluster_assignment_final) {
    int floatPointerSize = n * k * sizeof(float);
    int intPointerSize = n * sizeof(int);
    float *dist = (float *) malloc( floatPointerSize );
    int *cluster_assignment_cur = (int *) malloc( intPointerSize );
    int  *cluster_assignment_prev = (int *) malloc( intPointerSize );
    float *point_move_score = (float *) malloc( floatPointerSize );

    if (!dist || !cluster_assignment_cur || !cluster_assignment_prev || !point_move_score)
        fail("Error allocating dist arrays\n");

    // Initial setup. Assignment Step  
    calc_all_distances(dim, n, k, X, cluster_centroid, dist);
    choose_all_clusters_from_distances(dim, n, k, dist, cluster_assignment_cur);
    copy_assignment_array(n, cluster_assignment_cur, cluster_assignment_prev);

    //The initial quality is the one obtained from the random election
    float prev_totD = calc_total_distance(dim, n, k, X, cluster_centroid, cluster_assignment_cur);

    int numVariations = 0;
    // UPDATE STEP
    // for (int batch=0; (batch < iterations) && (numVariations <MAX_ITER_NO_IMPR); ++batch) {

   for (int batch = 0; batch < iterations; ++batch) {
        //printf("Batch step: %d \n", batch);
        //cluster_diag(dim, n, k, X, cluster_assignment_cur, cluster_centroid);

        // update cluster centroids. Update Step
        calc_cluster_centroids(dim, n, k, X, cluster_assignment_cur, cluster_centroid);

        float totD = calc_total_distance(dim, n, k, X, cluster_centroid, cluster_assignment_cur);

        // see if we've failed to improve
        if (totD >= prev_totD){
            // failed to improve - currently solution worse than previous
            // restore old assignments
            copy_assignment_array(n, cluster_assignment_prev, cluster_assignment_cur);

            // recalc centroids
            // calc_cluster_centroids(dim, n, k, X, cluster_assignment_cur, cluster_centroid);    
            //printf("\tNegative progress made on this step - iteration completed (%.2f) \n", prev_totD-totD);
            ++numVariations; //To implement no convergence criteria               
        }
        else { // We have made some improvements        
            // save previous step
            copy_assignment_array(n, cluster_assignment_cur, cluster_assignment_prev);
            // move all points to nearest cluster
            calc_all_distances(dim, n, k, X, cluster_centroid, dist);
            choose_all_clusters_from_distances(dim, n, k, dist, cluster_assignment_cur);
            //check how many assignments are different  
            //int change_count = assignment_change_count(n, cluster_assignment_cur, cluster_assignment_prev);
            //printf("\tIn the batch: %d, has changed: %d element to a different cluster with an improvement of %f \n", batch, change_count, prev_totD-totD);
            //fflush(stdout);
            prev_totD = totD;
        } 
    }

// 
    cluster_diag(dim, n, k, X, cluster_assignment_cur, cluster_centroid);

    // write to output array
    copy_assignment_array(n, cluster_assignment_cur, cluster_assignment_final);    

    //Free memory
    free(dist);
    free(cluster_assignment_cur);
    free(cluster_assignment_prev);
    free(point_move_score);
}           



/**
* random_init_centroid chooses random prototypes that belong to the dataset. They are points of the dataset.   
*@param float * -: cluster_centro_if: clustes id choosen
*@param float * -: dataSetMatrix 
*@param int clusters: Number of cluster to be don. 
*@param int rows in number of rows in the dataset; i.e. points
*@param int columns: number of columns. Point's dimension. 
*@return void
*/
void random_init_centroid (float * cluster_centro_id, float * dataSetMatrix, int clusters, int rows, int columns) {
   srand(time(NULL));

   for (int i=0; i<clusters; ++i) {
        int r = rand()%rows; 
        for (int j=0; j<columns;++j) {
            cluster_centro_id[i*columns+j]=dataSetMatrix[r*columns+j];
            //printf ("Los indices son  %d\n", r*columns+j);        
        }       
    }
}   



int main( int argc, char *argv[] ) {
/**/ 
    if( !omp_get_cancellation() )
    {
        //printf("Cancellations were not enabled, enabling cancellation and rerunning program\n");
        putenv("OMP_CANCELLATION=true");
        execv(argv[0], argv);
    }
    int numHilos = 0;
    #pragma omp parallel
    {
    #pragma omp master
    numHilos = omp_get_num_threads();
    }
    if (numHilos == 1) {
    //printf("Program is executing sequentially, setting 2 threads and rerunning program\n");
        putenv("OMP_NUM_THREADS=2");
        execv(argv[0], argv);
    }
/**/

    float *cluster_centroid;   // initial cluster centroids. The size is Clusters x rows
    int *clustering_output;  // output
    int rows=0, columns=0, clusters=1;
    int iterations = 1000;
    float * dataSetMatrix=NULL;
    char c, *fileName=NULL;

    //int err=system("clear");

    while ((c = getopt (argc, argv, "v:c:f:i:h")) != -1) {
        switch (c) {
        case 'v':
            printf("K means algorithm v.1.0\n\n");
        return 0;
        case 'c':
            clusters = atoi(optarg);
            if (clusters < 1) { 
                    printf ("the minimum number of clusters is 1\n");
                    return 0;
                }
                break;
        case 'f':
                fileName = (char *) malloc (strlen(optarg)+1);  
            strcpy(fileName,optarg);
            break;
        case 'i':
                iterations = atoi (optarg);  
                break;
        case 'h':
        case '?':
            printf("Usage:\trun -c number of clusters -f fichero.txt -i number of iterations [-h | -? HELP] \n");
        printf("\t<Params>\n");
        printf("\t\t-v\t\tOutput version information and exit\n");
            return 0;
        }
    }

    //printf ("..............Loading data set...............\n "); 
    // Get file size dataset
    getSizeFile( fileName, &rows, &columns );

    clustering_output = (int *) malloc (rows*sizeof(int));
    // Reserve dynamic memory for dataset matrix
    reserveDynamicMemoryForMatrix( &dataSetMatrix, rows, columns );

    // Set data in the dataset matrix
    setDataInMatrix( dataSetMatrix, fileName, rows, columns );

    //printf ("-------DataSet: \n");
    //printMatrix(dataSetMatrix, rows, columns);

    // printf ("..............Done..............\n "); 
    cluster_centroid = (float *) malloc (clusters*columns*sizeof(float));
    random_init_centroid (cluster_centroid, dataSetMatrix, clusters, rows, columns);   

    //printf (".........Initial Prototypes: ................ \n");
    //printMatrix(cluster_centroid, clusters, columns);

// COMENTAR ESTAS LÍNEA PARA NO MOSTRAR RESULTADOS
    printf ("The number of instance: %d Variables: %d Clusters: %d and Iterations: %d\n", rows, columns,clusters, iterations);
//  printf ("File: %d; \tClusters: %d; \tIterations: %d\n", filename, clusters, iterations);
//    
    double ini = omp_get_wtime();
    kmeans (columns, dataSetMatrix, rows, clusters, cluster_centroid, iterations, clustering_output);  
    double fin = omp_get_wtime();
    printf ("The execution time is %lf seconds\n", fin-ini);

    // Free memory
    free (dataSetMatrix); 
    free (cluster_centroid);
    free (clustering_output); 
}

有人知道为什么会出现瓶颈吗？我该如何解决这些问题？

谢谢。

<小时/> 编辑1:
@Brice 更改后，我收到下一个错误:

最佳答案

1°/我没有看到太多矢量化的机会，因此 simd 构造很可能不会提高第一个循环的性能。可以在第二个上。

2°/循环包含对共享变量 cluster_member_count 和 new_cluster_centroid 的写入操作，并且应使用 ompatomic 保护这些变量免受竞争条件的影响或omp critical 编译指示。这将导致大部分循环的序列化。您需要的是确保两个线程不能处理属于同一簇的点。

有多种选项可以解决该问题。一是先获取簇号，然后决定是否处理。

// make sure the code will compile even if openMP is disabled
#ifdef _OPENMP
   #include <omp.h>
#else
   #define omp_get_num_threads() 1
   #define omp_get_thread_num() 0
#endif

#pragma omp parallel
{
// declare local variables
int active_cluster;
int myThread = omp_get_thread_num();
int nbOfThreads = omp_get_num_threads();
for (int i = 0; i < n; ++i) {
     active_cluster = cluster_assignment_index[i];
     if (active_cluster%nbOfThreads == myThread){

     // update count of members in that cluster
     ++cluster_member_count[active_cluster];

     // sum point coordinates for finding centroid
     #pragma omp simd
     for (int j = 0; j < dim; ++j)
         new_cluster_centroid[active_cluster*dim + j] += X[i*dim + j];
     }// end if
} //end parallel

有两个问题: 如果某些集群明显大于其他集群，这可能会导致线程不平衡。这可以通过在第一遍中计算集群成员来解决，然后决定进行一些负载平衡。此外，即使每个线程现在仅在其他线程不会写入的位置写入数据，该数据仍可能属于其他线程使用的缓存行，从而导致 false sharing 。例如，为了增加cluster_member_count[2]，线程需要从内存中获取它，因为 cluster_member_count 1刚刚被另一个线程修改。这很慢。

更好的方法是安排每个线程只将数据写入连续的内存空间。

#pragma omp parallel
{
// declare local variables
int active_cluster;
int myThread = omp_get_thread_num();
int nbOfThreads = omp_get_num_threads();
int process_from=myThread*k/nbOfThreads;
int process_to = (myThread+1==nbOfThreads) ? k : (myThread+1)*k/nbOfThreads;
for (int i = 0; i < n; ++i) {
     active_cluster = cluster_assignment_index[i];
     if (active_cluster>=process_from && active_cluster<process_to ){
     //SAME//
     }
}

最后，“map-reduce”方法虽然使用稍微多一点的内存，但在那里可能会很方便。它消耗更多的内存 d/t 本地缓冲区，但显示了如何调整算法才能并行化:

// add to header
#ifdef _OPENMP
   #include <omp.h>
#else
   #define omp_get_num_threads() 1
   #define omp_get_thread_num() 0
#endif


void calc_cluster_centroids(int dim, int n, int k, float *const X, int *const cluster_assignment_index, float *restrict new_cluster_centroid) {
    int ** cluster_member_count;
    float ** local_cluster_centroid;
    float normFactor;
    #pragma omp parallel
    {
    //declare variables
    int active_cluster;
    int myThread = omp_get_thread_num();
    int nbOfThreads = omp_get_num_threads();
    int number_count;
    float normFactor;

    // allocate memory
    // this is done now as we need to be in the parallel region to know how many threads there are
    // one thread will allocate the arrays of local buffers, the each thread allocates its own local buffers
    #pragma omp single
    {
        cluster_member_count=malloc(nbOfThreads*sizeof(*cluster_member_count));
        local_cluster_centroid=malloc(nbOfThreads*sizeof(*local_cluster_centroid));
    }
    // allocate local buffer for each thread
    // for thread 0, local_cluster_centroid[0] contains new_cluster_centroid
    cluster_member_count[myThread] = calloc(k,sizeof(int));
    if (myThread) local_cluster_centroid[myThread] = calloc(k*dim,sizeof(float));
    else local_cluster_centroid[myThread] = new_cluster_centroid;


    //MAP : loop over points, increment count and accumumlate position in a local buffer
    int *my_member_count=cluster_member_count[myThread];
    float *my_cluster_centroid=local_cluster_centroid[myThread];

    #pragma omp for
    for (int i = 0; i < n; ++i) {
        // each thread writes in its own buffer so there is no race condition or cache false sharing

        active_cluster = cluster_assignment_index[i];
        // update count of members in that cluster
        ++my_member_count[active_cluster];

        // sum point coordinates for finding centroid
        for (int j = 0; j < dim; ++j)
            my_cluster_centroid[active_cluster*dim + j] += X[i*dim + j];
    }


    // REDUCE : loop over clusters and summ all local buffers
    #pragma omp for schedule(static,64)
    for (int i = 0; i < k; ++i) {
        number_count=cluster_member_count[0][i];
        for (int buff=1 ; buff<nbOfThreads ; ++buff){
            number_count+=cluster_member_count[buff][i];

            // sum point coordinates for finding centroid
            for (int j = 0; j < dim; ++j)
                local_cluster_centroid[0][i*dim + j]+=local_cluster_centroid[buff][i*dim + j];
        }
        normFactor=(number_count) ? 1/(float) number_count : 1.0f ;
        #pragma omp simd
        for (int j = i*dim; j<(i+1)*dim ; ++j)
            cluster_member_count[0][j]*=normFactor;
    }

    // free memory
    free(cluster_member_count[myThread]);
    if (myThread) free(local_cluster_centroid[myThread]);

    }//end parallel
    //once each thread has freeed its own buffer, let the master free the array of buffers
    free(cluster_member_count);
    free(local_cluster_centroid);

}//end function

关于c - 使用 OpenMP 并行化 C 代码后的性能损失，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/53207436/

c - 使用 OpenMP 并行化 C 代码后的性能损失

上一篇：c - 如何创建具有数组输入和输出的 C 函数

下一篇：python - 如何返回 Flex 中 token 的最短匹配？