intel - Haswell 微架构在性能中没有停滞周期后端

标签 intel performancecounter perf msr

我在 Haswell CPU(Intel Core i7-4790)上安装了 perf。但“性能列表”不包括“stalled-cycles-frontend”或“stalled-cycles-backend”。我检查了http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html从表19-7(第四代智能英特尔酷睿处理器的处理器核心中的非架构性能事件)中没有找到与stalled-cycles-backend相关的性能事件。

所以我的问题是:如何使用 Haswell CPU 内核中的 perf 或其他工具来测量停滞周期后端。内核是3.19,perf版本也是3.19。

谢谢

最佳答案

是的,对于 Ivy Bridge 或 Haswell 等较新的处理器,内核中的 perf_events 子系统中没有“stalled-cycles-frontend”和“stalled-cycles-backend”合成事件的映射。并且在较旧的 Core 2 上没有映射。也许,这个名称/概念/想法不适合现代乱序 CPU 的变化和复杂的微体系结构,而无需对全局“失速”进行简单的标量测量。

代码is in arch/x86/events/intel/core.c ,合成事件名称为 PERF_COUNT_HW_STALLED_CYCLES_FRONTENDPERF_COUNT_HW_STALLED_CYCLES_BACKEND:

__init int intel_pmu_init(void)
{...

两者都是从 Nehalem 开始为 Westmere、Sandy Bridge 定义的:

    case INTEL_FAM6_NEHALEM:
    case INTEL_FAM6_NEHALEM_EP:
    case INTEL_FAM6_NEHALEM_EX:

        /* UOPS_ISSUED.STALLED_CYCLES */
        intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
            X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
        /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
        intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
            X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);

    case INTEL_FAM6_WESTMERE:
    case INTEL_FAM6_WESTMERE_EP:
    case INTEL_FAM6_WESTMERE_EX:

        /* UOPS_ISSUED.STALLED_CYCLES */
        intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
            X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
        /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
        intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
            X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);


    case INTEL_FAM6_SANDYBRIDGE:
    case INTEL_FAM6_SANDYBRIDGE_X:


        /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
        intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
            X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
        /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
        intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
            X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1);

仅为 Ivy Bridge 定义了前端停顿

    case INTEL_FAM6_IVYBRIDGE:
    case INTEL_FAM6_IVYBRIDGE_X:

        /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
        intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
            X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);

对于最新的 CPU 桌面(Haswell、Broadwell、Skylake、Kaby Lake)和 Phi(KNL、KNM)没有前端和后端停顿的映射:

    case INTEL_FAM6_HASWELL_CORE:
    case INTEL_FAM6_HASWELL_X:
    case INTEL_FAM6_HASWELL_ULT:
    case INTEL_FAM6_HASWELL_GT3E:

    case INTEL_FAM6_BROADWELL_CORE:
    case INTEL_FAM6_BROADWELL_XEON_D:
    case INTEL_FAM6_BROADWELL_GT3E:
    case INTEL_FAM6_BROADWELL_X:


    case INTEL_FAM6_XEON_PHI_KNL:
    case INTEL_FAM6_XEON_PHI_KNM:


    case INTEL_FAM6_SKYLAKE_MOBILE:
    case INTEL_FAM6_SKYLAKE_DESKTOP:
    case INTEL_FAM6_SKYLAKE_X:
    case INTEL_FAM6_KABYLAKE_MOBILE:
    case INTEL_FAM6_KABYLAKE_DESKTOP:

也没有为旧 Core2 定义(没有检查 Atoms):

http://elixir.free-electrons.com/linux/v4.11/source/arch/x86/events/intel/core.c#L27

static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
{
    [PERF_COUNT_HW_CPU_CYCLES]      = 0x003c,
    [PERF_COUNT_HW_INSTRUCTIONS]        = 0x00c0,
    [PERF_COUNT_HW_CACHE_REFERENCES]    = 0x4f2e,
    [PERF_COUNT_HW_CACHE_MISSES]        = 0x412e,
    [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
    [PERF_COUNT_HW_BRANCH_MISSES]       = 0x00c5,
    [PERF_COUNT_HW_BUS_CYCLES]      = 0x013c,
    [PERF_COUNT_HW_REF_CPU_CYCLES]      = 0x0300, /* pseudo-encoding */
};

关于intel - Haswell 微架构在性能中没有停滞周期后端,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/36348940/

相关文章:

C#:如何在激活 TurboBoost 时获取 Intel i 系列 CPU 的当前时钟速度

optimization - 分支目标缓冲区检测到什么分支预测错误?

c# - 如何重置自定义性能计数器

kernel - 如何使用 perf 测量程序执行期间的缺页时间?

c - perf record如何获取目标进程指令的虚拟内存地址以及perf使用什么数据结构来存储它

assembly - 64 位模式下的地址覆盖前缀

c - MPI 共享内存和四精度的段错误

windows - 系统性能计数器的实例名称是否已本地化?

c# - 是否有用于使用自定义 Windows 性能计数器的良好 .Net 库?

performance - 使用 perf 监控每个 CPU 的内存访问