multithreading - 测量多线程应用程序的带宽

要测量我的应用程序(多线程，使用 OpenMP 编写)实际使用的内存带宽，最简单、最有效的方法是什么？我已经运行了 STREAM 来获得最大可持续带宽，现在想知道我的应用程序是否已经使全部可用带宽饱和。

我发现了几个相关的问题(例如 Main memory bandwidth measurement )，但我找不到这个问题的答案；

遗憾的是，我不能使用 VTune，但我可以使用 PAPI 计数器；

我的主要目标是找出我的应用程序的可扩展性差是否与内存带宽的饱和有关。

谢谢

最佳答案

有多种方法可以(从命令行)获取整个应用程序的带宽，但听起来您想单独考察若干个计算内核(kernel，即特定的代码段)。在这种情况下，用 PAPI 调用把相关代码段包起来是一种非常合理的做法。

您可以使用系统上可用的 PAPI 事件计数器(可通过 papi_avail 查看)来统计加载/存储指令的总数；如果您知道每次加载/存储的数据大小，就可以算出内存带宽。或者，您可以统计缓存中的命中次数，再乘以缓存行大小，从而推断系统中实际传输的数据量。PAPI wiki 上多处都有相关文档，例如关于高级接口(high-level interface)的说明，以及一些常用导出量的计算公式。

下面是一个简单的代码示例，分别以合理的方式和对缓存不友好的转置方式执行矩阵向量乘法。请注意，调用 PAPI_read_counters 会重置计数器，这正是我们在这里想要的。

#include <stdio.h>
#include <stdlib.h>
typedef char * caddr_t;
#include <papi.h>
#include <sys/time.h>

/* Allocates the size x size matrix *a and the vectors *x and *y and fills
 * them with initial values; returns 0 on success, negative on failure. */
int init(float ***a, float **x, float **y, int size);
/* Prints timing and counter-derived statistics for the test named tname. */
void report_results(char *tname, long_long *values, const int n, double wtime);
/* Accumulates a*x into y, walking the matrix row by row. */
void sensible_matvec(float **a, float *x, float *y, int size);
/* Accumulates a*x into y, walking the matrix column by column. */
void wrong_order_matvec(float **a, float *x, float *y, int size);
/* Stores the current time of day in *t. */
void tick(struct timeval *t);
/* Returns the number of seconds elapsed since the time stored in *t. */
double tock(struct timeval *t);

/* Number of PAPI events counted; length of the events/values arrays in main. */
#define NUM_EVENTS 3
/* Releases the matrix and both vectors; defined later in this file. */
void freeall(float ***a, float **x, float **y, int size);

/*
 * Benchmark driver.
 *
 * Builds a 4096 x 4096 matrix and two vectors, then runs the row-major and
 * the column-major matrix-vector product while counting L1 data-cache
 * misses, load/store instructions and floating-point instructions with the
 * PAPI counter calls. Each run is timed with tick()/tock() and summarised by
 * report_results().
 *
 * Returns 0 on success; exits with status 1 if allocation or any PAPI call
 * fails.
 */
int main(int argc, char **argv) {
    const int matsize = 4096;

    float **a, *x, *y;
    /* Without this check a failed allocation leaves a/x/y uninitialised. */
    if (init(&a, &x, &y, matsize) != 0) {
        fprintf(stderr, "Error allocating matrix and vectors; aborting\n");
        exit(1);
    }

    /* report_results() relies on this exact event order. */
    int events[NUM_EVENTS] = {PAPI_L1_DCM, PAPI_LST_INS, PAPI_FP_INS};
    long_long values[NUM_EVENTS];

    double walltime;
    struct timeval t;

    if (PAPI_start_counters(events, NUM_EVENTS) != PAPI_OK) {
        fprintf(stderr, "Error starting PAPI counters; aborting\n");
        freeall(&a, &x, &y, matsize);
        exit(1);
    }

    tick(&t);
    sensible_matvec(a, x, y, matsize);
    /* Reading the counters here separates the first run's counts from the
     * second run's. */
    if (PAPI_read_counters(values, NUM_EVENTS) != PAPI_OK) {
        fprintf(stderr, "Error reading PAPI counters; aborting\n");
        freeall(&a, &x, &y, matsize);
        exit(1);
    }
    walltime = tock(&t);

    report_results("Sensible", values, NUM_EVENTS, walltime);

    tick(&t);
    wrong_order_matvec(a, x, y, matsize);
    if (PAPI_stop_counters(values, NUM_EVENTS) != PAPI_OK) {
        fprintf(stderr, "Error stopping PAPI counters; aborting\n");
        freeall(&a, &x, &y, matsize);
        exit(1);
    }
    walltime = tock(&t);

    report_results("Wrong order", values, NUM_EVENTS, walltime);

    freeall(&a, &x, &y, matsize);
    return 0;
}

/*
 * Prints a summary of one benchmark run to stdout.
 *
 * tname  - label printed for the run.
 * values - counter readings; index 0 is used as the L1 miss count, index 1
 *          as the load/store instruction count and index 2 as the
 *          floating-point instruction count.
 * n      - number of entries in values; at least 3 are required.
 * wtime  - elapsed wall-clock time in seconds.
 *
 * If values is NULL or holds fewer than three entries, a message is written
 * to stderr and nothing is reported, instead of reading past the array.
 */
void report_results(char *tname, long_long *values, const int n, double wtime) {
    enum { IDX_L1_MISSES = 0, IDX_LOAD_STORE = 1, IDX_FP_INS = 2, NEEDED_VALUES = 3 };

    if (values == NULL || n < NEEDED_VALUES) {
        fprintf(stderr, "Test %s: need %d counter values, got %d\n",
                tname, (int)NEEDED_VALUES, n);
        return;
    }

    long_long total_mem = values[IDX_LOAD_STORE];
    long_long total_flops = values[IDX_FP_INS];
    long_long l1misses = values[IDX_L1_MISSES];

    printf("Test %s: time elapsed = %f, memory accesses = %lld, flop = %lld\n",
            tname, wtime, total_mem, total_flops);
    /* Treats every counted load/store as moving sizeof(float) bytes. */
    printf("\tMemory bandwidth (MB/sec) = %f\n", 1.0*total_mem*sizeof(float)/(wtime*1024*1024));
    printf("\tL1 cache miss rate = %f\n", 1.0*l1misses/total_mem);
    printf("\tMFLOPS = %lf\n\n", 1.0*total_flops/(wtime*1024*1024));
}

/* Allocates an n x n matrix of floats into *a; returns 0 on success, -1 on failure. */
int alloc2d(float ***a, int n);
/* Frees a matrix obtained from alloc2d; always returns 0. */
int free2d(float ***a, int n);
/* Allocates n floats into *x; returns 0 on success, -1 on failure. */
int alloc1d(float **x, int n);
/* Frees a vector obtained from alloc1d; always returns 0. */
int free1d(float **x, int n);

/*
 * Allocates and fills the benchmark operands.
 *
 * a    - receives a size x size matrix; every element of row i is set to i.
 * x    - receives a vector of size floats with x[i] = i.
 * y    - receives a vector of size floats, all zero.
 * size - dimension of the matrix and length of both vectors.
 *
 * Returns 0 on success. On failure everything allocated so far is released,
 * the corresponding output pointers are set to NULL, and a distinct negative
 * code is returned: -1 if the matrix, -2 if x, -3 if y could not be
 * allocated.
 */
int init(float ***a, float **x, float **y, int size) {
    if (alloc2d(a, size))
        return -1;

    if (alloc1d(x, size)) {
        free2d(a, size);
        *a = NULL;
        return -2;
    }

    if (alloc1d(y, size)) {
        free2d(a, size);
        *a = NULL;
        free1d(x, size);
        *x = NULL;
        return -3;
    }

    for (int i = 0; i < size; i++) {
        (*x)[i] = (float)i;
        (*y)[i] = 0.0f;
    }

    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++) {
            (*a)[i][j] = (float)i;
        }
    }

    return 0;
}
/*
 * Accumulates the matrix-vector product into y: y[r] += sum over c of
 * a[r][c] * x[c].
 *
 * The matrix is traversed one row at a time, so consecutive inner-loop
 * iterations read consecutive elements of the same row.
 */
void sensible_matvec(float **a, float *x, float *y, int size) {
    int row = 0;
    while (row < size) {
        float *cur_row = a[row];
        for (int col = 0; col < size; ++col) {
            y[row] += cur_row[col] * x[col];
        }
        ++row;
    }
}

/*
 * Accumulates the same matrix-vector product as the row-wise version, but
 * with the loops swapped: the outer loop runs over columns and the inner
 * loop over rows, so each inner-loop step reads from a different row of a.
 * The traversal order is deliberate and must not be "fixed".
 */
void wrong_order_matvec(float **a, float *x, float *y, int size) {
    int col = 0;
    while (col < size) {
        for (int row = 0; row < size; ++row) {
            float *dst = &y[row];
            *dst += a[row][col] * x[col];
        }
        ++col;
    }
}

/*
 * Starts a timing interval by storing the current time of day in *t.
 * The result of gettimeofday is not inspected.
 */
void tick(struct timeval *t) {
    (void)gettimeofday(t, NULL);
}


/*
 * Ends a timing interval.
 *
 * Reads the current time of day and returns the difference from the
 * timestamp in *t, in seconds. The seconds and microseconds fields are
 * differenced separately; a negative microsecond difference simply reduces
 * the whole-second part.
 */
double tock(struct timeval *t) {
    struct timeval end;
    gettimeofday(&end, NULL);

    double whole_secs = (double)(end.tv_sec - t->tv_sec);
    double frac_secs = (double)(end.tv_usec - t->tv_usec) / 1000000.;
    return whole_secs + frac_secs;
}


/*
 * Releases the matrix *a and the vectors *x and *y by delegating to free2d
 * and free1d; size is forwarded to each of them unchanged.
 */
void freeall(float ***a, float **x, float **y, int size) {
    (void)free2d(a, size);
    (void)free1d(x, size);
    (void)free1d(y, size);
}

/*
 * Allocates an n x n matrix of floats addressable as (*a)[i][j].
 *
 * The elements live in one contiguous buffer of n*n floats; a second array
 * of n row pointers is set up so that row i starts at element i*n of that
 * buffer. The element values are not initialised.
 *
 * a - receives the row-pointer array; set to NULL on any failure.
 * n - matrix dimension; must be positive.
 *
 * Returns 0 on success, -1 if a is NULL, n is not positive, the byte count
 * would overflow size_t, or an allocation fails.
 */
int alloc2d(float ***a, int n) {
    if (a == NULL)
        return -1;
    *a = NULL;

    if (n <= 0)
        return -1;

    /* Size arithmetic is done in size_t: n*n as int overflows for large n. */
    const size_t dim = (size_t)n;
    if (dim > ((size_t)-1 / sizeof(float)) / dim)
        return -1;

    float *data = malloc(dim * dim * sizeof *data);
    if (data == NULL)
        return -1;

    float **rows = malloc(dim * sizeof *rows);
    if (rows == NULL) {
        free(data);
        return -1;
    }

    for (size_t i = 0; i < dim; i++)
        rows[i] = data + i * dim;

    *a = rows;
    return 0;
}
/*
 * Releases a matrix laid out as one contiguous element buffer plus an array
 * of row pointers, where row 0 points at the start of the element buffer.
 *
 * a - address of the row-pointer array; *a is set to NULL afterwards so a
 *     repeated call is harmless. A NULL a or NULL *a is accepted and
 *     ignored.
 * n - unused; kept so the signature mirrors the allocator.
 *
 * Always returns 0.
 */
int free2d(float ***a, int n) {
    (void)n;

    if (a == NULL || *a == NULL)
        return 0;

    free((*a)[0]);   /* element buffer: row 0 is its first element */
    free(*a);        /* row-pointer array */
    *a = NULL;

    return 0;
}


/*
 * Allocates an uninitialised vector of n floats.
 *
 * a - receives the new buffer (NULL if the allocation fails).
 * n - number of elements requested.
 *
 * Returns 0 on success and -1 when malloc returns NULL.
 */
int alloc1d(float **a, int n) {
    float *buf = malloc(n * sizeof(float));

    *a = buf;
    return (buf != NULL) ? 0 : -1;
}

/*
 * Releases a vector and clears the caller's pointer.
 *
 * a - address of the vector pointer; *a is set to NULL after the free so a
 *     repeated call cannot double-free. A NULL a is accepted and ignored.
 * n - unused; kept so the signature mirrors the allocator.
 *
 * Always returns 0.
 */
int free1d(float **a, int n) {
    (void)n;

    if (a == NULL)
        return 0;

    free(*a);
    *a = NULL;

    return 0;
}

运行给出:

$ gcc -o papi-test papi-test.c -I${PAPI_INC_DIR} -L${PAPI_LIB_DIR} -lpapi -Wall -std=c99
$ ./papi-test
Test Sensible: time elapsed = 0.121877, memory accesses = 302020775, flop = 33580481
    Memory bandwidth (MB/sec) = 9453.119330
    L1 cache miss rate = 0.003921
    MFLOPS = 262.763624

Test Wrong order: time elapsed = 0.537639, memory accesses = 302026751, flop = 39629352
    Memory bandwidth (MB/sec) = 2142.963254
    L1 cache miss rate = 0.094045
    MFLOPS = 70.295301

关于multithreading - 测量多线程应用程序的带宽，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/25495592/

multithreading - 测量多线程应用程序的带宽

上一篇：code-contracts - 代码契约(Contract)试图获取构建错误而不是警告

下一篇：multithreading - 基于 GPU 的视频卡可加速您的程序计算，如何？