c - 测量缓存延迟

所以我正在尝试使用 C 来测量 L1、L2、L3 缓存的延迟。我知道它们的大小，并且我觉得我在概念上理解如何做到这一点，但我的实现遇到了问题。我想知道其他一些复杂的硬件(例如预取)是否会导致问题。

#include <time.h>
#include <stdio.h>
#include <string.h>

int main(){
    srand(time(NULL));  // Seed ONCE
    const int L1_CACHE_SIZE =  32768/sizeof(int);
    const int L2_CACHE_SIZE =  262144/sizeof(int);
    const int L3_CACHE_SIZE =  6587392/sizeof(int);
    const int NUM_ACCESSES = 1000000;
    const int SECONDS_PER_NS = 1000000000;
    int arrayAccess[L1_CACHE_SIZE];
    int arrayInvalidateL1[L1_CACHE_SIZE];
    int arrayInvalidateL2[L2_CACHE_SIZE];
    int arrayInvalidateL3[L3_CACHE_SIZE];
    int count=0;
    int index=0;
    int i=0;
    struct timespec startAccess, endAccess;
    double mainMemAccess, L1Access, L2Access, L3Access;
    int readValue=0;

    memset(arrayAccess, 0, L1_CACHE_SIZE*sizeof(int));
    memset(arrayInvalidateL1, 0, L1_CACHE_SIZE*sizeof(int));
    memset(arrayInvalidateL2, 0, L2_CACHE_SIZE*sizeof(int));
    memset(arrayInvalidateL3, 0, L3_CACHE_SIZE*sizeof(int));

    index = 0;
    clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
    while (index < L1_CACHE_SIZE) {
        int tmp = arrayAccess[index];               //Access Value from L2
        index = (index + tmp + ((index & 4) ? 28 : 36));   // on average this should give 32 element skips, with changing strides
        count++;                                           //divide overall time by this 
    }
    clock_gettime(CLOCK_REALTIME, &endAccess); //end clock
    mainMemAccess = ((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec);
    mainMemAccess /= count;

    printf("Main Memory Access %lf\n", mainMemAccess);

    index = 0;
    count=0;
    clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
    while (index < L1_CACHE_SIZE) {
        int tmp = arrayAccess[index];               //Access Value from L2
        index = (index + tmp + ((index & 4) ? 28 : 36));   // on average this should give 32 element skips, with changing strides
        count++;                                           //divide overall time by this 
    }
    clock_gettime(CLOCK_REALTIME, &endAccess); //end clock              
    L1Access = ((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec);
    L1Access /= count;

    printf("L1 Cache Access %lf\n", L1Access);

    //invalidate L1 by accessing all elements of array which is larger than cache
    for(count=0; count < L1_CACHE_SIZE; count++){
        int read = arrayInvalidateL1[count]; 
        read++;
        readValue+=read;               
    }

    index = 0;
    count = 0;
    clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
    while (index < L1_CACHE_SIZE) {
        int tmp = arrayAccess[index];               //Access Value from L2
        index = (index + tmp + ((index & 4) ? 28 : 36));   // on average this should give 32 element skips, with changing strides
        count++;                                           //divide overall time by this 
    }
    clock_gettime(CLOCK_REALTIME, &endAccess); //end clock
    L2Access = ((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec);
    L2Access /= count;

    printf("L2 Cache Acces %lf\n", L2Access);

    //invalidate L2 by accessing all elements of array which is larger than cache
    for(count=0; count < L2_CACHE_SIZE; count++){
        int read = arrayInvalidateL2[count];  
        read++;
        readValue+=read;                        
    }

    index = 0;
    count=0;
    clock_gettime(CLOCK_REALTIME, &startAccess); //sreadValue+=read;tart clock
    while (index < L1_CACHE_SIZE) {
        int tmp = arrayAccess[index];               //Access Value from L2
        index = (index + tmp + ((index & 4) ? 28 : 36));   // on average this should give 32 element skips, with changing strides
        count++;                                           //divide overall time by this 
    }
    clock_gettime(CLOCK_REALTIME, &endAccess); //end clock
    L3Access = ((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec);
    L3Access /= count;

    printf("L3 Cache Access %lf\n", L3Access);

    printf("Read Value: %d", readValue);

}

我首先访问要从中获取数据的数组中的一个值。这显然应该来自主内存，因为它是第一次访问。该数组很小(小于页面大小)，因此应将其复制到 L1、L2、L3。我从现在应该是 L1 的同一个数组中访问值。然后，我从与 L1 缓存大小相同的数组中访问所有值，以使我想要访问的数据无效(所以现在它应该只是在 L2/3 中)。然后我对 L2 和 L3 重复这个过程。不过访问时间明显不对，这意味着我做错了什么......

我认为计时所需的时间可能存在问题(启动和停止在 ns 中会花费一些时间，并且在缓存/未缓存时会发生变化)

谁能给我一些关于我可能做错了什么的指点？

UPDATE1:所以我通过进行大量访问来分摊计时器的成本，我固定了我的缓存大小，并且我还接受了建议以制定更复杂的索引方案以避免固定步幅。不幸的是，时代还没有结束。他们似乎都为L1而来。我认为问题可能在于无效而不是访问。随机 vs LRU 方案会影响失效的数据吗？

UPDATE2:修复了 memset(添加了 L3 memset 以使 L3 中的数据也无效，因此第一次访问从主内存开始)和索引方案，仍然没有运气。

更新 3:我无法让这种方法发挥作用，但有一些很好的建议答案，我发布了一些我自己的解决方案。

我还运行 Cachegrind 来查看命中/未命中

 ==6710== I   refs:      1,735,104
==6710== I1  misses:        1,092
==6710== LLi misses:        1,084
==6710== I1  miss rate:      0.06%
==6710== LLi miss rate:      0.06%
==6710== 
==6710== D   refs:      1,250,696  (721,162 rd   + 529,534 wr)
==6710== D1  misses:      116,492  (  7,627 rd   + 108,865 wr)
==6710== LLd misses:      115,102  (  6,414 rd   + 108,688 wr)
==6710== D1  miss rate:       9.3% (    1.0%     +    20.5%  )
==6710== LLd miss rate:       9.2% (    0.8%     +    20.5%  )
==6710== 
==6710== LL refs:         117,584  (  8,719 rd   + 108,865 wr)
==6710== LL misses:       116,186  (  7,498 rd   + 108,688 wr)
==6710== LL miss rate:        3.8% (    0.3%     +    20.5%  )


        Ir I1mr ILmr      Dr  D1mr  DLmr     Dw D1mw DLmw 

      .    .    .       .     .     .      .    .    .  #include <time.h>
      .    .    .       .     .     .      .    .    .  #include <stdio.h>
      .    .    .       .     .     .      .    .    .  #include <string.h>
      .    .    .       .     .     .      .    .    .  
      6    0    0       0     0     0      2    0    0  int main(){
      5    1    1       0     0     0      2    0    0      srand(time(NULL));  // Seed ONCE
      1    0    0       0     0     0      1    0    0      const int L1_CACHE_SIZE =  32768/sizeof(int);
      1    0    0       0     0     0      1    0    0      const int L2_CACHE_SIZE =  262144/sizeof(int);
      1    0    0       0     0     0      1    0    0      const int L3_CACHE_SIZE =  6587392/sizeof(int);
      1    0    0       0     0     0      1    0    0      const int NUM_ACCESSES = 1000000;
      1    0    0       0     0     0      1    0    0      const int SECONDS_PER_NS = 1000000000;
     21    2    2       3     0     0      3    0    0      int arrayAccess[L1_CACHE_SIZE];
     21    1    1       3     0     0      3    0    0      int arrayInvalidateL1[L1_CACHE_SIZE];
     21    2    2       3     0     0      3    0    0      int arrayInvalidateL2[L2_CACHE_SIZE];
     21    1    1       3     0     0      3    0    0      int arrayInvalidateL3[L3_CACHE_SIZE];
      1    0    0       0     0     0      1    0    0      int count=0;
      1    1    1       0     0     0      1    0    0      int index=0;
      1    0    0       0     0     0      1    0    0      int i=0;
      .    .    .       .     .     .      .    .    .      struct timespec startAccess, endAccess;
      .    .    .       .     .     .      .    .    .      double mainMemAccess, L1Access, L2Access, L3Access;
      1    0    0       0     0     0      1    0    0      int readValue=0;
      .    .    .       .     .     .      .    .    .  
      7    0    0       2     0     0      1    1    1      memset(arrayAccess, 0, L1_CACHE_SIZE*sizeof(int));
      7    1    1       2     2     0      1    0    0      memset(arrayInvalidateL1, 0, L1_CACHE_SIZE*sizeof(int));
      7    0    0       2     2     0      1    0    0      memset(arrayInvalidateL2, 0, L2_CACHE_SIZE*sizeof(int));
      7    1    1       2     2     0      1    0    0      memset(arrayInvalidateL3, 0, L3_CACHE_SIZE*sizeof(int));
      .    .    .       .     .     .      .    .    .  
      1    0    0       0     0     0      1    1    1      index = 0;
      4    0    0       0     0     0      1    0    0      clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
    772    1    1     514     0     0      0    0    0      while (index < L1_CACHE_SIZE) {
  1,280    1    1     768   257   257    256    0    0          int tmp = arrayAccess[index];               //Access Value from L2
  2,688    0    0     768     0     0    256    0    0          index = (index + tmp + ((index & 4) ? 28 : 36));   // on average this should give 32 element skips, with changing strides
    256    0    0     256     0     0      0    0    0          count++;                                           //divide overall time by this 
      .    .    .       .     .     .      .    .    .      }
      4    0    0       0     0     0      1    0    0      clock_gettime(CLOCK_REALTIME, &endAccess); //end clock
     14    1    1       5     1     1      1    1    1      mainMemAccess = ((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec);
      6    0    0       2     0     0      1    0    0      mainMemAccess /= count;
      .    .    .       .     .     .      .    .    .  
      6    1    1       2     0     0      2    0    0      printf("Main Memory Access %lf\n", mainMemAccess);
      .    .    .       .     .     .      .    .    .  
      1    0    0       0     0     0      1    0    0      index = 0;
      1    0    0       0     0     0      1    0    0      count=0;
      4    1    1       0     0     0      1    0    0      clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
    772    1    1     514     0     0      0    0    0      while (index < L1_CACHE_SIZE) {
  1,280    0    0     768   240     0    256    0    0          int tmp = arrayAccess[index];               //Access Value from L2
  2,688    0    0     768     0     0    256    0    0          index = (index + tmp + ((index & 4) ? 28 : 36));   // on average this should give 32 element skips, with changing strides
    256    0    0     256     0     0      0    0    0          count++;                                           //divide overall time by this 
      .    .    .       .     .     .      .    .    .      }
      4    0    0       0     0     0      1    0    0      clock_gettime(CLOCK_REALTIME, &endAccess); //end clock              
     14    1    1       5     0     0      1    1    0      L1Access = ((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec);
      6    1    1       2     0     0      1    0    0      L1Access /= count;
      .    .    .       .     .     .      .    .    .  
      6    0    0       2     0     0      2    0    0      printf("L1 Cache Access %lf\n", L1Access);
      .    .    .       .     .     .      .    .    .  
      .    .    .       .     .     .      .    .    .      //invalidate L1 by accessing all elements of array which is larger than cache
 32,773    1    1  24,578     0     0      1    0    0      for(count=0; count < L1_CACHE_SIZE; count++){
 40,960    0    0  24,576   513   513  8,192    0    0          int read = arrayInvalidateL1[count]; 
  8,192    0    0   8,192     0     0      0    0    0          read++;
 16,384    0    0  16,384     0     0      0    0    0          readValue+=read;               
      .    .    .       .     .     .      .    .    .      }
      .    .    .       .     .     .      .    .    .  
      1    0    0       0     0     0      1    0    0      index = 0;
      1    1    1       0     0     0      1    0    0      count = 0;
      4    0    0       0     0     0      1    1    0      clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
    772    1    1     514     0     0      0    0    0      while (index < L1_CACHE_SIZE) {
  1,280    0    0     768   256     0    256    0    0          int tmp = arrayAccess[index];               //Access Value from L2
  2,688    0    0     768     0     0    256    0    0          index = (index + tmp + ((index & 4) ? 28 : 36));   // on average this should give 32 element skips, with changing strides
    256    0    0     256     0     0      0    0    0          count++;                                           //divide overall time by this 
      .    .    .       .     .     .      .    .    .      }
      4    1    1       0     0     0      1    0    0      clock_gettime(CLOCK_REALTIME, &endAccess); //end clock
     14    0    0       5     1     0      1    1    0      L2Access = ((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec);
      6    1    1       2     0     0      1    0    0      L2Access /= count;
      .    .    .       .     .     .      .    .    .  
      6    0    0       2     0     0      2    0    0      printf("L2 Cache Acces %lf\n", L2Access);
      .    .    .       .     .     .      .    .    .  
      .    .    .       .     .     .      .    .    .      //invalidate L2 by accessing all elements of array which is larger than cache
262,149    2    2 196,610     0     0      1    0    0      for(count=0; count < L2_CACHE_SIZE; count++){
327,680    0    0 196,608 4,097 4,095 65,536    0    0          int read = arrayInvalidateL2[count];  
 65,536    0    0  65,536     0     0      0    0    0          read++;
131,072    0    0 131,072     0     0      0    0    0          readValue+=read;                        
      .    .    .       .     .     .      .    .    .      }
      .    .    .       .     .     .      .    .    .  
      1    0    0       0     0     0      1    0    0      index = 0;
      1    0    0       0     0     0      1    0    0      count=0;
      4    0    0       0     0     0      1    1    0      clock_gettime(CLOCK_REALTIME, &startAccess); //sreadValue+=read;tart clock
    772    1    1     514     0     0      0    0    0      while (index < L1_CACHE_SIZE) {
  1,280    0    0     768   256     0    256    0    0          int tmp = arrayAccess[index];               //Access Value from L2
  2,688    0    0     768     0     0    256    0    0          index = (index + tmp + ((index & 4) ? 28 : 36));   // on average this should give 32 element skips, with changing strides
    256    0    0     256     0     0      0    0    0          count++;                                           //divide overall time by this 
      .    .    .       .     .     .      .    .    .      }
      4    0    0       0     0     0      1    0    0      clock_gettime(CLOCK_REALTIME, &endAccess); //end clock
     14    1    1       5     1     0      1    1    0      L3Access = ((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec);
      6    0    0       2     0     0      1    0    0      L3Access /= count;
      .    .    .       .     .     .      .    .    .  
      6    1    1       2     0     0      2    0    0      printf("L3 Cache Access %lf\n", L3Access);
      .    .    .       .     .     .      .    .    .  
      6    0    0       1     0     0      1    0    0      printf("Read Value: %d", readValue);
      .    .    .       .     .     .      .    .    .  
      3    0    0       3     0     0      0    0    0  }

最佳答案

我宁愿尝试使用硬件时钟作为衡量标准。 rdtsc 指令将告诉您自 CPU 上电以来的当前循环计数。此外，最好使用 asm 以确保在测量和试运行中始终使用相同的指令。使用这个和一些聪明的统计数据，我很久以前就做了这个:

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <sys/mman.h>


int i386_cpuid_caches (size_t * data_caches) {
    int i;
    int num_data_caches = 0;
    for (i = 0; i < 32; i++) {

        // Variables to hold the contents of the 4 i386 legacy registers
        uint32_t eax, ebx, ecx, edx; 

        eax = 4; // get cache info
        ecx = i; // cache id

        asm (
            "cpuid" // call i386 cpuid instruction
            : "+a" (eax) // contains the cpuid command code, 4 for cache query
            , "=b" (ebx)
            , "+c" (ecx) // contains the cache id
            , "=d" (edx)
        ); // generates output in 4 registers eax, ebx, ecx and edx 

        // taken from http://download.intel.com/products/processor/manual/325462.pdf Vol. 2A 3-149
        int cache_type = eax & 0x1F; 

        if (cache_type == 0) // end of valid cache identifiers
            break;

        char * cache_type_string;
        switch (cache_type) {
            case 1: cache_type_string = "Data Cache"; break;
            case 2: cache_type_string = "Instruction Cache"; break;
            case 3: cache_type_string = "Unified Cache"; break;
            default: cache_type_string = "Unknown Type Cache"; break;
        }

        int cache_level = (eax >>= 5) & 0x7;

        int cache_is_self_initializing = (eax >>= 3) & 0x1; // does not need SW initialization
        int cache_is_fully_associative = (eax >>= 1) & 0x1;


        // taken from http://download.intel.com/products/processor/manual/325462.pdf 3-166 Vol. 2A
        // ebx contains 3 integers of 10, 10 and 12 bits respectively
        unsigned int cache_sets = ecx + 1;
        unsigned int cache_coherency_line_size = (ebx & 0xFFF) + 1;
        unsigned int cache_physical_line_partitions = ((ebx >>= 12) & 0x3FF) + 1;
        unsigned int cache_ways_of_associativity = ((ebx >>= 10) & 0x3FF) + 1;

        // Total cache size is the product
        size_t cache_total_size = cache_ways_of_associativity * cache_physical_line_partitions * cache_coherency_line_size * cache_sets;

        if (cache_type == 1 || cache_type == 3) {
            data_caches[num_data_caches++] = cache_total_size;
        }

        printf(
            "Cache ID %d:\n"
            "- Level: %d\n"
            "- Type: %s\n"
            "- Sets: %d\n"
            "- System Coherency Line Size: %d bytes\n"
            "- Physical Line partitions: %d\n"
            "- Ways of associativity: %d\n"
            "- Total Size: %zu bytes (%zu kb)\n"
            "- Is fully associative: %s\n"
            "- Is Self Initializing: %s\n"
            "\n"
            , i
            , cache_level
            , cache_type_string
            , cache_sets
            , cache_coherency_line_size
            , cache_physical_line_partitions
            , cache_ways_of_associativity
            , cache_total_size, cache_total_size >> 10
            , cache_is_fully_associative ? "true" : "false"
            , cache_is_self_initializing ? "true" : "false"
        );
    }

    return num_data_caches;
}

int test_cache(size_t attempts, size_t lower_cache_size, int * latencies, size_t max_latency) {
    int fd = open("/dev/urandom", O_RDONLY);
    if (fd < 0) {
        perror("open");
        abort();
    }
    char * random_data = mmap(
          NULL
        , lower_cache_size
        , PROT_READ | PROT_WRITE
        , MAP_PRIVATE | MAP_ANON // | MAP_POPULATE
        , -1
        , 0
        ); // get some random data
    if (random_data == MAP_FAILED) {
        perror("mmap");
        abort();
    }

    size_t i;
    for (i = 0; i < lower_cache_size; i += sysconf(_SC_PAGESIZE)) {
        random_data[i] = 1;
    }


    int64_t random_offset = 0;
    while (attempts--) {
        // use processor clock timer for exact measurement
        random_offset += rand();
        random_offset %= lower_cache_size;
        int32_t cycles_used, edx, temp1, temp2;
        asm (
            "mfence\n\t"        // memory fence
            "rdtsc\n\t"         // get cpu cycle count
            "mov %%edx, %2\n\t"
            "mov %%eax, %3\n\t"
            "mfence\n\t"        // memory fence
            "mov %4, %%al\n\t"  // load data
            "mfence\n\t"
            "rdtsc\n\t"
            "sub %2, %%edx\n\t" // substract cycle count
            "sbb %3, %%eax"     // substract cycle count
            : "=a" (cycles_used)
            , "=d" (edx)
            , "=r" (temp1)
            , "=r" (temp2)
            : "m" (random_data[random_offset])
            );
        // printf("%d\n", cycles_used);
        if (cycles_used < max_latency)
            latencies[cycles_used]++;
        else 
            latencies[max_latency - 1]++;
    }

    munmap(random_data, lower_cache_size);

    return 0;
} 

int main() {
    size_t cache_sizes[32];
    int num_data_caches = i386_cpuid_caches(cache_sizes);

    int latencies[0x400];
    memset(latencies, 0, sizeof(latencies));

    int empty_cycles = 0;

    int i;
    int attempts = 1000000;
    for (i = 0; i < attempts; i++) { // measure how much overhead we have for counting cyscles
        int32_t cycles_used, edx, temp1, temp2;
        asm (
            "mfence\n\t"        // memory fence
            "rdtsc\n\t"         // get cpu cycle count
            "mov %%edx, %2\n\t"
            "mov %%eax, %3\n\t"
            "mfence\n\t"        // memory fence
            "mfence\n\t"
            "rdtsc\n\t"
            "sub %2, %%edx\n\t" // substract cycle count
            "sbb %3, %%eax"     // substract cycle count
            : "=a" (cycles_used)
            , "=d" (edx)
            , "=r" (temp1)
            , "=r" (temp2)
            :
            );
        if (cycles_used < sizeof(latencies) / sizeof(*latencies))
            latencies[cycles_used]++;
        else 
            latencies[sizeof(latencies) / sizeof(*latencies) - 1]++;

    }

    {
        int j;
        size_t sum = 0;
        for (j = 0; j < sizeof(latencies) / sizeof(*latencies); j++) {
            sum += latencies[j];
        }
        size_t sum2 = 0;
        for (j = 0; j < sizeof(latencies) / sizeof(*latencies); j++) {
            sum2 += latencies[j];
            if (sum2 >= sum * .75) {
                empty_cycles = j;
                fprintf(stderr, "Empty counting takes %d cycles\n", empty_cycles);
                break;
            }
        }
    }

    for (i = 0; i < num_data_caches; i++) {
        test_cache(attempts, cache_sizes[i] * 4, latencies, sizeof(latencies) / sizeof(*latencies));

        int j;
        size_t sum = 0;
        for (j = 0; j < sizeof(latencies) / sizeof(*latencies); j++) {
            sum += latencies[j];
        }
        size_t sum2 = 0;
        for (j = 0; j < sizeof(latencies) / sizeof(*latencies); j++) {
            sum2 += latencies[j];
            if (sum2 >= sum * .75) {
                fprintf(stderr, "Cache ID %i has latency %d cycles\n", i, j - empty_cycles);
                break;
            }
        }

    }

    return 0;

}

Core2Duo 上的输出:

Cache ID 0:
- Level: 1
- Type: Data Cache
- Total Size: 32768 bytes (32 kb)

Cache ID 1:
- Level: 1
- Type: Instruction Cache
- Total Size: 32768 bytes (32 kb)

Cache ID 2:
- Level: 2
- Type: Unified Cache
- Total Size: 262144 bytes (256 kb)

Cache ID 3:
- Level: 3
- Type: Unified Cache
- Total Size: 3145728 bytes (3072 kb)

Empty counting takes 90 cycles
Cache ID 0 has latency 6 cycles
Cache ID 2 has latency 21 cycles
Cache ID 3 has latency 168 cycles

关于c - 测量缓存延迟，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/21369381/

c - 测量缓存延迟

上一篇：caching - 为什么在大多数处理器中，L1 缓存的大小都小于 L2 缓存的大小？

下一篇：memory - 不可变可以是内存 pig 吗？