performance - 为什么 L2 硬件预取器在只有 1 KiB 或 2 KiB 访问大小时性能较差?

标签 performance x86 intel cpu-architecture prefetch

我有一个简单的多线程程序,其中各线程对给定文件(已在内存中)执行随机读取,该文件在线程之间均匀分配。每个线程从文件中读取一个缓冲区并设置一个值。这实际上是一个旨在测试内存带宽的程序。程序如下:

#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <getopt.h>
#include <errno.h>
#include <stdbool.h>
#include <ctype.h>
#include <inttypes.h>
#include <pthread.h>
#include <assert.h>
#include <time.h>

#define NS_IN_SECOND 1000000000
/*
 * Return the current wall-clock time in nanoseconds since the epoch.
 *
 * Uses the C11 timespec_get(TIME_UTC) interface, which reads the same
 * realtime clock as clock_gettime(CLOCK_REALTIME) but compiles under
 * strict ISO C.  The original fell off the end of the function when the
 * clock call failed (undefined behavior for a non-void function whose
 * value is used); we now return 0 on failure.
 */
uint64_t nano_time(void) {
    struct timespec ts;
    if (timespec_get(&ts, TIME_UTC) != TIME_UTC)
        return 0;   /* clock unavailable; 0 is an obviously-bogus timestamp */
    return (uint64_t)ts.tv_sec * NS_IN_SECOND + (uint64_t)ts.tv_nsec;
}

// avx512 test
#include <stdint.h>
void *__memmove_chk_avx512_no_vzeroupper(void *dest, void *src, size_t s);

/**
* To create 4 GB file: This will allocate space on disk
* $ dd < /dev/zero bs=1048576 count=4096 > testfile
*
* 100 GiB
* dd if=/dev/zero of=bigmmaptest bs=1M count=102400
* To clear cache:
* $ sync; echo 1 > /proc/sys/vm/drop_caches
*/
//#define SAMPLE_LATENCY 1  // uncomment to record per-copy latencies in mmap_test()
#define BYTES_IN_GB (1024*1024*1024)
// Block size is used for read/pwrite calls, and as the stride when
// iterating over the file through the mmap'ed buffer.
#define DEFAULT_BLOCK_SIZE 4096 //8192
#define NANOSECONDS_IN_SECOND 1000000000

// Default test file; override with -f/--file.
const char DEFAULT_NAME[] = "/mnt/tmp/mmaptest";

/* Print a formatted message and terminate the whole process immediately.
 * NOTE(review): _exit() bypasses stdio flushing, so if stdout is fully
 * buffered (e.g. redirected to a file) the message may be lost -- confirm
 * whether fprintf(stderr, ...) or exit() was intended. */
#define EXIT_MSG(...)           \
    do {                        \
        printf(__VA_ARGS__);   \
        _exit(-1);              \
    } while (0)


uint64_t read_mmap_test(int fd, int tid, size_t block_size, size_t filesize, char* buf,
                    off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t write_mmap_test(int fd, int tid, size_t block_size, size_t filesize, char* buf,
                    off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t mmap_test(int fd, int tid, size_t block_size, size_t filesize, char *buf,
    char optype, off_t *offsets, uint64_t *begin, uint64_t *end);

uint64_t read_syscall_test(int fd, int tid, size_t block_size, size_t filesize, 
                    off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t write_syscall_test(int fd, int tid, size_t block_size, size_t filesize,
                     off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t syscall_test(int fd, int tid, size_t block_size, size_t filesize, 
        char optype, off_t *offsets, uint64_t *begin, uint64_t *end);
        
size_t get_filesize(const char* filename);
void print_help_message(const char *progname);
char*    map_buffer(int fd, size_t size);
void    *run_tests(void *);

static int silent = 0;  /* set by --silent: suppress per-thread progress output */

// Per-thread work descriptor: filled in by main(), consumed by run_tests().
typedef struct {
    int tid;                // worker index, 0 .. numthreads-1
    int fd;                 // test-file descriptor (shared by all threads)
    char *mapped_buffer;    // base of the whole-file mapping; NULL for syscall tests
    int read_mmap;          // non-zero => run the mmap read test
    int read_syscall;       // non-zero => run the pread test
    int write_mmap;         // non-zero => run the mmap write test
    int write_syscall;      // non-zero => run the pwrite test
    off_t *offsets;         // this thread's slice of the offset schedule
    size_t block_size;      // bytes copied per operation
    size_t chunk_size;      // total bytes this thread processes (filesize/numthreads)
    int retval;             // appears unused in this file -- candidate for removal
    uint64_t start_time;    // written by the test: timestamp (ns) before the loop
    uint64_t end_time;      // written by the test: timestamp (ns) after the loop
} threadargs_t;


size_t filesize;
int main(int argc, char **argv) {
    char *fname = (char*) DEFAULT_NAME;
    char *mapped_buffer = NULL;
    int c, fd, i, flags = O_RDWR, numthreads = 1, ret, option_index;
    static int randomaccess = 0,
        read_mmap = 0, read_syscall = 0,
        write_mmap = 0, write_syscall = 0,
        mixed_mmap = 0, write_tr = 0;
    off_t *offsets = 0;
    size_t block_size = DEFAULT_BLOCK_SIZE, numblocks,
     new_file_size = 0;
     uint64_t min_start_time, max_end_time = 0, retval;
     // permissions
    uint64_t mode = S_IRWXU | S_IRWXG;

    pthread_t *threads;
    threadargs_t *threadargs;

    static struct option long_options[] = 
    {
        // Options set a flag
        {"randomaccess", no_argument, &randomaccess, 1},
        {"readmmap", no_argument, &read_mmap, 1},
        {"readsyscall", no_argument, &read_syscall, 1},
        {"silent", no_argument, &silent, 1},
        {"writemmap", no_argument, &write_mmap, 1},
        {"writesyscall", no_argument, &write_syscall, 1},
        {"mixedmmap", no_argument, &mixed_mmap, 1},
        // Options take an argument
        {"block", required_argument, 0, 'b'},
        {"file", required_argument, 0, 'f'},
        {"help", no_argument, 0, 'h'},
        {"size", no_argument, 0, 's'},
        {"threads", required_argument, 0, 't'},
        {"writethreads", no_argument, 0, 'w'},
        {0, 0, 0, 0}
    };

    //read operations
    while(1) {
        c = getopt_long(argc, argv, "b:f:h:s:t:w:",
                long_options, &option_index);

        // is end of the option
        if (c == -1)
            break;
        
        switch(c)
        {
            case 0:
                break;
            case 'b':
                block_size = atoi(optarg);
                break;
            case 'f':
                fname = optarg;
                break;
            case 'h':
                print_help_message(argv[0]);
                _exit(0);
            case 's':
                new_file_size = (size_t)(atoi(optarg)) * BYTES_IN_GB;
                break;
            case 't':
                numthreads = (int) (atoi(optarg));
                break;
            case 'w':
                write_tr = atoi(optarg);
                break;
            default:
                break;
        }
    }

    if(!silent){
        printf("PID: %d\n", getpid());
        printf("Using file %s \n", fname);
    }
    if ((filesize = get_filesize(fname)) == -1) {
        if (read_mmap || read_syscall) {
            printf("Cannot obtain file size for %s: %s"
                   "File must exist prior to running read tests.\n",
                   fname, strerror(errno));
            _exit(-1);
        }
        else
            filesize = new_file_size;
    }

    fd = open((const char*)fname, flags, mode);
    if(fd <0) {
        printf("Clould not open/create file %s: %s\n",
            fname, strerror(errno));
            _exit(-1);
    }

    if(block_size < 0 || block_size > filesize){
        printf("Invalid block size: %zu for file of size "
        "%zu. Block size must be greater than 0 and no"
        "greater than the file size.\n",
        block_size, filesize);
        _exit(-1);
    }

    /* 
    * Generate random block number for random file access.
    * Sequential for sequential access
    */
   numblocks = filesize/block_size;
   if(filesize % block_size > 0)
        numblocks++;

    offsets = (off_t *) malloc(numblocks * sizeof(off_t));
    if(offsets == 0){
        printf("Failed to allocate memory: %s\n", strerror(errno));
        _exit(-1);
    }
    for (uint64_t i = 0; i < numblocks; i++)
        if(randomaccess)
            offsets[i] = ((int)random() % numblocks) * block_size;
        else
            offsets[i] = i*block_size;
    if (numblocks % numthreads != 0)
        EXIT_MSG("We have %" PRIu64 " blocks and %d threads. "
                "Threads must evenly divide blocks. "
                "Please fix the args.\n",
                (uint_least64_t)numblocks, numthreads);

    if( read_mmap || write_mmap || mixed_mmap)
        assert((mapped_buffer = map_buffer(fd, filesize)) != NULL);

    threads = (pthread_t*)malloc(numthreads * sizeof(pthread_t));
    threadargs = 
            (threadargs_t*)malloc(numthreads * sizeof(threadargs_t));
    
    if (threads == NULL || threadargs == NULL)
        EXIT_MSG("Could not allocate thread array for %d threads.\n", numthreads);
    
    for (i = 0; i < numthreads; i++) {
        if(mixed_mmap){
            if (i < write_tr) {
                write_mmap = 1;
            } else {
                read_mmap = 1;
            }
        }
        threadargs[i].fd = fd;
        threadargs[i].tid = i;
        threadargs[i].block_size = block_size;
        threadargs[i].chunk_size = filesize/numthreads;
        threadargs[i].mapped_buffer = mapped_buffer;
        threadargs[i].offsets = &offsets[numblocks/numthreads * i];
        threadargs[i].read_mmap = read_mmap;
        threadargs[i].read_syscall = read_syscall;
        threadargs[i].write_mmap = write_mmap;
        threadargs[i].write_syscall = write_syscall;
        int ret = pthread_create(&threads[i], NULL, run_tests, &threadargs[i]);
        if (ret!=0)
            EXIT_MSG("pthread_create for %dth thread failed: %s\n",
                i, strerror(errno));
    }

    for (i = 0; i< numthreads; i++){
        ret = pthread_join(threads[i], NULL);
        if (ret !=0)
            EXIT_MSG("Thread %d failed in join: %s\n", 
            i, strerror(errno));
    }


    // for mixed mode determine read and write aggregate b/w.
    if(mixed_mmap) {
        // Write b/w
        min_start_time = threadargs[0].start_time;
        max_end_time = 0;
        // Since tid 0 to write_tr-1 did writes, find it's min and max.
        for(i=0; i < write_tr; i++){
            min_start_time = (threadargs[i].start_time < min_start_time)?
                threadargs[i].start_time:min_start_time;
            max_end_time = (threadargs[i].end_time > max_end_time)?
                threadargs[i].end_time:max_end_time;
        }
        printf("Write: %.2f\n", 
            (double)write_tr*(filesize/numthreads)/(double)(max_end_time-min_start_time)
            * NANOSECONDS_IN_SECOND / BYTES_IN_GB);
        
        // Read b/w
        min_start_time = threadargs[write_tr].start_time;
        max_end_time = 0;
        for(i=write_tr; i < numthreads; i++){
            min_start_time = (threadargs[i].start_time < min_start_time)?
                threadargs[i].start_time:min_start_time;
            max_end_time = (threadargs[i].end_time > max_end_time)?
                threadargs[i].end_time:max_end_time;
       } 
        printf("Read: %.2f\n", 
            (double)(numthreads-write_tr)*(filesize/numthreads)/(double)(max_end_time-min_start_time)
            * NANOSECONDS_IN_SECOND / BYTES_IN_GB);
    }

    /**
     * For total run time. Find the smallest start time
     * and largest end time across all threads.
     */
    min_start_time = threadargs[0].start_time;
    max_end_time = 0;
    for (i=0; i< numthreads; i++){
        min_start_time = (threadargs[i].start_time < min_start_time)?
            threadargs[i].start_time:min_start_time;
        max_end_time = (threadargs[i].end_time > max_end_time)?
            threadargs[i].end_time:max_end_time;
    }

    printf("%.2f\n", 
            (double)filesize/(double)(max_end_time-min_start_time)
            * NANOSECONDS_IN_SECOND / BYTES_IN_GB);
    

    munmap(mapped_buffer, filesize);
    close(fd);

}

/**
 * Thread entry point.  Dispatches to exactly one test based on the flags
 * in the threadargs_t descriptor (priority order: read-mmap, read-syscall,
 * write-mmap, write-syscall).  start_time/end_time are written back
 * through the caller's descriptor.
 *
 * Fix: `retval` was collected but discarded (and left uninitialized when
 * no flag was set); it is now initialized and returned so the anti-dead-
 * code token computed by the tests is actually consumed.  main() passes
 * NULL to pthread_join(), so the interface is unaffected.
 */
void * run_tests(void *args) {
    uint64_t retval = 0;
    threadargs_t t = *(threadargs_t*)args;   // local copy; results go back via pointers

    if (t.read_mmap) {
        if (!silent)
            printf("Running read mmap test:\n");
        retval = read_mmap_test(t.fd, t.tid, t.block_size, t.chunk_size,
                    t.mapped_buffer, t.offsets,
                    &((threadargs_t*)args)->start_time,
                    &((threadargs_t*)args)->end_time);
    }
    else if (t.read_syscall) {
        if (!silent)
            printf("Running read syscall test:\n");
        retval = read_syscall_test(t.fd, t.tid, t.block_size, t.chunk_size,
                    t.offsets,
                    &((threadargs_t*)args)->start_time,
                    &((threadargs_t*)args)->end_time);
    }
    else if (t.write_mmap) {
        if (!silent)
            printf("Running write mmap test:\n");
        retval = write_mmap_test(t.fd, t.tid, t.block_size, t.chunk_size,
                    t.mapped_buffer, t.offsets,
                    &((threadargs_t*)args)->start_time,
                    &((threadargs_t*)args)->end_time);
    }
    else if (t.write_syscall) {
        if (!silent)
            printf("Running write syscall test:\n");
        retval = write_syscall_test(t.fd, t.tid, t.block_size, t.chunk_size,
                    t.offsets,
                    &((threadargs_t*)args)->start_time,
                    &((threadargs_t*)args)->end_time);
    }
    return (void*)(uintptr_t)retval;
}

#define READ 1
#define WRITE 2

/**
 ********* SYSCALL section
 */
/* Thin wrapper: run the syscall-based benchmark in READ (pread) mode. */
uint64_t read_syscall_test(int fd, int tid, size_t block_size, size_t filesize,
                           off_t *offsets, uint64_t *begin, uint64_t *end) {
    return syscall_test(fd, tid, block_size, filesize, READ,
                        offsets, begin, end);
}

/* Thin wrapper: run the syscall-based benchmark in WRITE (pwrite) mode. */
uint64_t write_syscall_test(int fd, int tid, size_t block_size, size_t filesize,
                            off_t *offsets, uint64_t *begin, uint64_t *end) {
    return syscall_test(fd, tid, block_size, filesize, WRITE,
                        offsets, begin, end);
}

/**
 * Benchmark loop using pread/pwrite: transfer `filesize` bytes in
 * block_size chunks at the offsets given in `offsets`, timing the whole
 * loop with nano_time().  Returns a token derived from the data so the
 * reads cannot be optimized away; *begin/*end receive the timestamps.
 *
 * Fixes: the bounce buffer was leaked on both the error and success
 * paths; bytes_transferred is now ssize_t (pread/pwrite's actual return
 * type) instead of size_t, so the == -1 check no longer relies on
 * implicit conversion; the block index is size_t to avoid int overflow
 * on very large files.
 */
uint64_t syscall_test(int fd, int tid, size_t block_size, size_t filesize,
        char optype, off_t *offsets, uint64_t *begin, uint64_t *end) {

    bool done = false;
    char *buffer = NULL;
    size_t i = 0;
    size_t total_bytes_transferred = 0;
    uint64_t begin_time, end_time, ret_token = 0;

    buffer = (char*)malloc(block_size);
    if (buffer == NULL) {
        printf("Failed to allocate memory: %s\n", strerror(errno));
        return -1;
    }

    memset((void*)buffer, 0, block_size);

    begin_time = nano_time();

    while (!done) {
        ssize_t bytes_transferred = 0;

        if (optype == READ)
            bytes_transferred = pread(fd, buffer, block_size, offsets[i++]);
        else if (optype == WRITE)
            bytes_transferred = pwrite(fd, buffer, block_size, offsets[i++]);
        if (bytes_transferred == 0)       // EOF
            done = true;
        else if (bytes_transferred == -1) {
            printf("Failed to IO: %s\n", strerror(errno));
            free(buffer);                 // was leaked on this path
            return -1;
        }
        else {
            total_bytes_transferred += (size_t)bytes_transferred;

            if (optype == WRITE && total_bytes_transferred == filesize)
                done = true;

            // Touch the data so the transfer cannot be optimized away.
            ret_token += buffer[0];
        }
        if (i * block_size >= filesize)
            done = true;
    }

    end_time = nano_time();
    free(buffer);                         // was leaked on the success path

    if (!silent) {
        printf("%s: %" PRIu64 " bytes transferred in %" PRIu64 ""
        " ns.\n", (optype == READ)?"read-syscall":"write-syscall",
        (uint_least64_t)total_bytes_transferred, (end_time-begin_time));
        // Throughput in GB/s
        printf("(tid %d) %.2f\n", tid,
            (double)filesize/(double)(end_time-begin_time)
            * NANOSECONDS_IN_SECOND / BYTES_IN_GB);
    }

    *begin = begin_time;
    *end = end_time;
    return ret_token;
}

/**
 * MMAP tests
 */

/* Thin wrapper: run the mmap-based benchmark in READ (copy-out) mode. */
uint64_t read_mmap_test(int fd, int tid, size_t block_size, size_t filesize,
                        char *buf, off_t *offsets, uint64_t *begin, uint64_t *end) {
    return mmap_test(fd, tid, block_size, filesize, buf, READ,
                     offsets, begin, end);
}

/* Thin wrapper: run the mmap-based benchmark in WRITE (copy-in) mode. */
uint64_t write_mmap_test(int fd, int tid, size_t block_size, size_t filesize,
                         char *buf, off_t *offsets, uint64_t *begin, uint64_t *end) {
    return mmap_test(fd, tid, block_size, filesize, buf, WRITE,
                     offsets, begin, end);
}
/*
 * Optional per-operation latency sampling, compiled in when SAMPLE_LATENCY
 * is defined (see the commented-out #define near the top of the file).
 * BEGIN_LAT_SAMPLE / END_LAT_SAMPLE bracket one copy inside mmap_test();
 * up to MAX_LAT_SAMPLES durations are recorded, one every
 * LAT_SAMPL_INTERVAL loop iterations.  When disabled, both macros expand
 * to (almost) nothing.  They rely on locals (i, num_samples,
 * lat_begin_time, ...) declared in mmap_test().
 */
#if SAMPLE_LATENCY
#define BEGIN_LAT_SAMPLE                                 \
    if (num_samples < MAX_LAT_SAMPLES && i%LAT_SAMPL_INTERVAL == 0)   \
        lat_begin_time = nano_time();

#define END_LAT_SAMPLE                                                  \
    if (num_samples < MAX_LAT_SAMPLES && i%LAT_SAMPL_INTERVAL == 0) {                \
    lat_end_time = nano_time();                                         \
    latency_samples[i/LAT_SAMPL_INTERVAL % MAX_LAT_SAMPLES] =           \
        lat_end_time - lat_begin_time;                                  \
    num_samples++;                                                      \
    }

#define MAX_LAT_SAMPLES 50
//#define LAT_SAMPL_INTERVAL (1000*1048576)
#define LAT_SAMPL_INTERVAL block_size

#else

#define BEGIN_LAT_SAMPLE ;
#define END_LAT_SAMPLE

#endif

/**
 * Benchmark loop over the mmap'ed file: memcpy block_size bytes per
 * iteration between a private bounce buffer and the mapping, at the
 * offsets given in `offsets`.  `filesize` here is this thread's chunk
 * size.  Returns a token derived from the data so the copies cannot be
 * optimized away; *begin/*end receive the loop timestamps.
 *
 * Fixes: the bounce buffer was leaked; unused locals (done, j,
 * numblocks, ret) removed.
 */
uint64_t mmap_test(int fd, int tid, size_t block_size, size_t filesize, char *mapped_buffer,
                char optype, off_t *offsets, uint64_t *begin, uint64_t *end) {

    char *buffer = NULL;
    uint64_t i;
    uint64_t begin_time, end_time, ret_token = 0;

    (void)fd;   // unused here; kept for interface symmetry with syscall_test

#if SAMPLE_LATENCY
    uint64_t lat_begin_time, lat_end_time;
    size_t latency_samples[MAX_LAT_SAMPLES];
    int num_samples = 0;

    memset((void*)latency_samples, 0, sizeof(latency_samples));
#endif

    buffer = (char*)malloc(block_size);
    if (buffer == NULL) {
        printf("Failed to allocate memory: %s\n", strerror(errno));
        return -1;
    }
    memset((void*)buffer, 1, block_size);

    begin_time = nano_time();
    for (i = 0; i < filesize; i += block_size) {
        off_t offset = offsets[i / block_size];
        BEGIN_LAT_SAMPLE;
        if (optype == READ) {
            //__memmove_chk_avx512_no_vzeroupper(buffer, &mapped_buffer[offset], block_size);
            memcpy(buffer, &mapped_buffer[offset], block_size);
            ret_token += buffer[0];   // consume the data so the copy is not dead
        }
        else if (optype == WRITE) {
            //__memmove_chk_avx512_no_vzeroupper(&mapped_buffer[offset], buffer, block_size);
            memcpy(&mapped_buffer[offset], buffer, block_size);
            // NOTE(review): this reads index i (chunk-relative), not `offset`;
            // presumably just an anti-optimization token -- confirm intent.
            ret_token += mapped_buffer[i];
        }
        END_LAT_SAMPLE;
    }

    end_time = nano_time();
    free(buffer);   // was leaked in the original

    if (!silent) {
        printf("%s: %" PRIu64 " bytes read in %" PRIu64 " ns.\n",
        (optype==READ)?"readmap":"writemap",
        (uint_least64_t)filesize, (end_time-begin_time));

        // print GB/s
        printf("(tid %d) %.2f\n", tid,
            (double)filesize/(double)(end_time-begin_time)
            * NANOSECONDS_IN_SECOND / BYTES_IN_GB);
    }

    *begin = begin_time;
    *end = end_time;

#if SAMPLE_LATENCY
    printf("\nSample latency for %ld byte block:\n", block_size);
    for (i = 0; i < MAX_LAT_SAMPLES; i++)
        printf("\t%ld: %ld\n", i, latency_samples[i]);

#endif
    return ret_token;
}

/*
 * Map `size` bytes of the file behind `fd` read/write and return the base
 * address; terminates the process on failure.
 *
 * MAP_PRIVATE gives a copy-on-write view; MAP_POPULATE pre-faults the
 * pages so page-fault cost stays out of the timed region.
 */
char* map_buffer(int fd, size_t size) {
    char *base = (char*)mmap(NULL, size, PROT_READ | PROT_WRITE,
                             MAP_PRIVATE | MAP_POPULATE, fd, 0);

    if (base == MAP_FAILED)
        EXIT_MSG("Failed to mmap file of size %zu: %s\n",
            size, strerror(errno));

    /* Alternative mappings kept from the original for experimentation:
     *   shared:    mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0)
     *   anonymous: mmap(NULL, size, PROT_READ|PROT_WRITE,
     *                   MAP_PRIVATE|MAP_ANONYMOUS, -1, 0)
     * Huge pages could be requested with
     *   madvise(base, filesize, MADV_HUGEPAGE)
     * (may also need page-aligned allocation via posix_memalign). */
    return base;
}

/*
 * Return the size in bytes of `filename`, or (size_t)-1 when stat() fails.
 * The sentinel is (size_t)-1 because the return type is unsigned; callers
 * compare against -1, which converts to the same value.
 */
size_t get_filesize(const char* filename){
    struct stat st;

    if (stat(filename, &st) != 0)
        return (size_t)-1;
    return (size_t)st.st_size;
}

/*
 * Print usage information to stdout.  Only the final path component of
 * the program name is shown in the usage line.
 */
void print_help_message(const char *progname) {
    const char *slash = strrchr(progname, '/');
    const char *base = (slash != NULL) ? slash + 1 : progname;

    printf("usage: %s [OPTION]\n", base);
    printf("  -h, --help\n"
           "     Print this help and exit.\n");
    printf("  -b, --block[=BLOCKSIZE]\n"
           "     Block size used for read system calls.\n"
           "     For mmap tests, the size of the stride when iterating\n"
           "     over the file.\n"
           "     Defaults to %d.\n", DEFAULT_BLOCK_SIZE);
    printf("  -f, --file[=FILENAME]\n"
           "     Perform all tests on this file (defaults to %s).\n",
           DEFAULT_NAME);
    printf("  --readsyscall\n"
           "     Perform a read test using system calls.\n");
    printf("  --readmmap\n"
           "     Perform a read test using mmap.\n");
    printf("  --writesyscall\n"
           "     Perform a write test using system calls.\n");
    printf("  --writemmap\n"
           "     Perform a write test using mmap.\n");
    printf(" --randomaccess\n"
           "    Perform random access.\n");
    printf(" --threads\n"
           "    Number of threads to use. Defaults to one.\n");
    printf(" --mixedmmap\n"
           "    Perfom read and write concurrently at different offsets\n");
    printf(" -w, -writethreads[=0]\n"
           "    Number of threads that should perform write\n");
}

编译:

$ gcc testm.c -o testm -lpthread -static -O2 -fno-builtin-memcpy

运行程序的命令:

$ dd if=/dev/zero of=bigmmaptest bs=1M count=25600 # 25 GiB file
$ ./testm -b 1024 -f bigmmaptest --threads 16 --randomaccess --readmmap

我使用的是 32 核 Xeon 5218 第二代 L1d KiB/L2 MiB/L3 MiB -- 512/16/22

当 memcpy 大小为 1 KiB 时,我得到 21.7 GB/s,但当大小为 256B 时,我得到 26.68 GB/s,当大小为 4 KiB 时,我得到 34.8 GB/s。为什么中间有一个落差? 我观察到,与 256B 和 4 KiB 相比,2 KiB 的性能也较差。

更有趣的是,当我禁用 L2 硬件预取器并且没有任何其他更改时,我的带宽会自动增加 1 KiB 和 2 KiB。如果没有预取,2 KiB memcpy 的速度为 34.8 GB/s。所有这些都是聚合带宽。

使用 perf,我确实测量了 L2 加载存储未命中情况,但结果表明它们并没有发生太大变化。对于 8 线程及以下线程也看不到这种效果。

我使用的是linux 5.0.4。我正在使用 glibC memcpy (gcc 7.5.0),即使使用 -O2 我也观察到上述怪癖。如果使用 L2 预取,1 KiB 访问大小为 18.76 GiB/s,没有 L2 预取则为 30.32 GiB/s。作为比较,256 B 访问大小提供 24.7 GiB/s(带预取)和 24.8 GiB/s(不带预取)。显然,性能下降是由于预取器造成的 L2 缓存污染,因为在线程数较少的情况下不会观察到这种情况。我正在考虑 SMT 是否可能是污染增加的原因,但我在 16 个物理核心上的 16 个线程上观察到了明显的影响。

浏览 glibc memcpy 代码,我可以看到任何小于 4 KiB 大小的访问都使用 AVX 256 指令,因此没有任何变化。

最佳答案

较小的 256B 大小没有因 L2 流式预取器(L2 streamer)而出现性能下降,可能是因为缓存未命中的序列太短,不足以激活流式预取器,因而不会把带宽(以及 LFB 和 L2 <-> L3 超级队列中的表项)浪费在无用的预取请求上。

对于对齐的 4k,同一页面内没有您未提取的字节,因此 L2 预取器非常有用,或者至少没有害处。 (运行 memcpy 时,后面的线路的需求负载很快就会出现,所以我猜测启用/不启用硬件预取的速度大致相同,除非硬件预取有助于开始新的 4k block ,同时仍在等待前一个 block 的结束.)

L2 只看到物理地址,据我所知,它不会尝试跨 4k 边界预取。 (即使它在同一个 2M 大页面内,因为它也不知道这一点。)Intel 提到的 Ivy Bridge 中的新“下一页预取器”据我所知只是 TLB 预取,而不是数据。

因此,使用对齐的 4k memcpy 时,硬件预取会在您实际要读取的数据末尾自动停止,不会浪费任何带宽。mmap 为您提供页面对齐的内存,因此这些 4k 复制都来自单个源页面。

(目的地是无关紧要的,因为它可能在 L1d 缓存中保持热状态,可能偶尔会被驱逐到 L2,并且在 memcpy 之后从它重新加载可以来自存储转发,甚至不必等待 memcpy 的存储提交到L1d。)

预测:如果您的较小 memcpy 源在 4k 页面的中途开始,但仍然在 4k 页面的末尾结束,您可能会看到与禁用预取类似的行为。例如生成一个随机页码,并从 3072 字节开始,进行 1 KiB 的复制。因此,您所有 1 KiB 的副本都来自页面的末尾,而不是中间。

(每个字节 memcpyed 仍然会有更多 dTLB 未命中,因为每个 TLB 条目仅覆盖您实际读取的 1 K 数据。您是否使用了 MAP_POPULATE,因此您不应该在定时区域,假设您有足够的 RAM。)

L1d KiB /L2 MiB /L3 MiB -- 512 / 16 / 22

这些是总计,但 L1d 和 L2 是每个核心私有(private)的!每个核心有 32kiB L1d 和 1MiB L2,因为这是 Cascade Lake,与 Skylake-X 的布局相同。


顺便说一句,我会考虑在定时循环内使用快速 PRNG,例如 xorshift+ 或 xorshift*;这很容易随机,足以击败预取;即使是一个简单的 LFSR 甚至是具有 2 次幂模的 LCG 也可以做到这一点(而且非常便宜,只需一个 imul 和加法)。如果您确实想仅隔离 memcpy 内存访问,它可以避免从另一个数组读取偏移量。不过可能没有什么区别。一个非常简单的 PRNG 的一个优点是,其周期等于您试图覆盖的空间(如 LCG),您不会两次生成相同的地址,从而为您提供 block 的随机排列。但是,如果内存块足够大,即使没有这种难以实现的特性,即使是来自 L3 的随机缓存命中也不太可能。

您当前的偏移量数组没问题。 (我没有仔细查看代码,所以我只是假设没有错误。)

关于performance - 为什么 L2 硬件预取器在只有 1 KiB 或 2 KiB 访问大小时性能较差?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/69342503/

相关文章:

mysql - 最大记录表可以处理在 0 秒内获得结果,同时通过主键搜索

php - MySQL 查询比首次运行时花费的时间长(但仅在选择特定字节阈值时)

linux - 动态确定流氓 AVX-512 指令的执行位置

x86 - 如何解释 perf iTLB-loads,iTLB-load-misses

caching - 如何查看末级缓存中有多少个切片?

.net - Windows Xperf diskio 操作不显示程序在性能跟踪 session 期间读取的文件

ruby - 为什么 .index 比 .all 快?

linux - 用汇编代码将全局数据转储到磁盘

visual-c++ - 使用 AVX 指令会禁用 exp() 优化吗?

operating-system - 即时禁用和启用超线程