diff options
Diffstat (limited to 'tools/perf/pmu-events/arch/x86')
96 files changed, 15790 insertions, 2349 deletions
diff --git a/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json b/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json index bbfa3883e5..b72c0e2cb9 100644 --- a/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json +++ b/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json @@ -811,13 +811,13 @@ "MetricExpr": "(cpu_core@UOPS_DISPATCHED.PORT_0@ + cpu_core@UOPS_DISPATCHED.PORT_1@ + cpu_core@UOPS_DISPATCHED.PORT_5_11@ + cpu_core@UOPS_DISPATCHED.PORT_6@) / (5 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", - "MetricThreshold": "tma_alu_op_utilization > 0.6", + "MetricThreshold": "tma_alu_op_utilization > 0.4", "ScaleUnit": "100%", "Unit": "cpu_core" }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", - "MetricExpr": "100 * cpu_core@ASSISTS.ANY\\,umask\\=0x1B@ / tma_info_thread_slots", + "MetricExpr": "78 * cpu_core@ASSISTS.ANY@ / tma_info_thread_slots", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_assists", "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", @@ -880,6 +880,24 @@ "Unit": "cpu_core" }, { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due staying in C0.1 power-performance optimized state (Faster wakeup time; Smaller power savings).", + "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.C01@ / tma_info_thread_clks", + "MetricGroup": "C0Wait;TopdownL4;tma_L4_group;tma_serializing_operation_group", + "MetricName": "tma_c01_wait", + "MetricThreshold": "tma_c01_wait > 0.05 & (tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due staying in C0.2 power-performance optimized state (Slower wakeup time; Larger power savings).", + "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.C02@ / tma_info_thread_clks", + "MetricGroup": "C0Wait;TopdownL4;tma_L4_group;tma_serializing_operation_group", + "MetricName": "tma_c02_wait", + "MetricThreshold": "tma_c02_wait > 0.05 & (tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction", "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", @@ -901,7 +919,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", - "MetricExpr": "(25 * tma_info_system_average_frequency * (cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD@ * (cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ / (cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ + cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD@))) + 24 * tma_info_system_average_frequency * cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS@) * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks", + "MetricExpr": "(25 * tma_info_system_core_frequency * (cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD@ * (cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ / (cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ + cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD@))) + 24 * tma_info_system_core_frequency * cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS@) * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_contested_accesses", "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -922,7 +940,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", - "MetricExpr": "24 * tma_info_system_average_frequency * (cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD@ + cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD@ * (1 - cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ / (cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ + cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD@))) * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks", + "MetricExpr": "24 * tma_info_system_core_frequency * (cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD@ + cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD@ * (1 - cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ / (cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ + cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD@))) * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks", "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_data_sharing", "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -935,7 +953,7 @@ "MetricExpr": "(cpu_core@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu_core@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group", "MetricName": "tma_decoder0_alone", - "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35))", + "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & tma_fetch_bandwidth > 0.2)", "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. Related metrics: tma_few_uops_instructions", "ScaleUnit": "100%", "Unit": "cpu_core" @@ -965,7 +983,7 @@ "MetricExpr": "(cpu_core@IDQ.DSB_CYCLES_ANY@ - cpu_core@IDQ.DSB_CYCLES_OK@) / tma_info_core_core_clks / 2", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35)", + "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", "ScaleUnit": "100%", "Unit": "cpu_core" @@ -986,7 +1004,7 @@ "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group", "MetricName": "tma_dtlb_load", "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization", "ScaleUnit": "100%", "Unit": "cpu_core" }, @@ -996,13 +1014,13 @@ "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group", "MetricName": "tma_dtlb_store", "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization", "ScaleUnit": "100%", "Unit": "cpu_core" }, { "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", - "MetricExpr": "28 * tma_info_system_average_frequency * cpu_core@OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM@ / tma_info_thread_clks", + "MetricExpr": "28 * tma_info_system_core_frequency * cpu_core@OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM@ / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group", "MetricName": "tma_false_sharing", "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1016,7 +1034,7 @@ "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%", "Unit": "cpu_core" }, @@ -1025,7 +1043,7 @@ "MetricExpr": "max(0, tma_frontend_bound - tma_fetch_latency)", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.2", "MetricgroupNoGroup": "TopdownL2", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%", @@ -1127,10 +1145,10 @@ { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions", "MetricExpr": "tma_light_operations * cpu_core@INST_RETIRED.MACRO_FUSED@ / (tma_retiring * tma_info_thread_slots)", - "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", + "MetricGroup": "Branches;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_fused_instructions", "MetricThreshold": "tma_fused_instructions > 0.1 & tma_light_operations > 0.6", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions. The instruction pairs of CMP+JCC or DEC+JCC are commonly used examples.", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions. CMP+JCC or DEC+JCC are common examples of legacy fusions. {([MTL] Note new MOV+OP and Load+OP fusions appear under Other_Light_Ops in MTL!)}", "ScaleUnit": "100%", "Unit": "cpu_core" }, @@ -1141,14 +1159,14 @@ "MetricName": "tma_heavy_operations", "MetricThreshold": "tma_heavy_operations > 0.1", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. Sample with: UOPS_RETIRED.HEAVY", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .). Sample with: UOPS_RETIRED.HEAVY", "ScaleUnit": "100%", "Unit": "cpu_core" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", "MetricExpr": "cpu_core@ICACHE_DATA.STALLS@ / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricGroup": "BigFootprint;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_icache_misses", "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS", @@ -1157,7 +1175,7 @@ }, { "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", - "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricExpr": "tma_info_bottleneck_mispredictions * tma_info_thread_slots / cpu_core@BR_MISP_RETIRED.ALL_BRANCHES@ / 100", "MetricGroup": "Bad;BrMispredicts;tma_issueBM", "MetricName": "tma_info_bad_spec_branch_misprediction_cost", "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers", @@ -1181,7 +1199,7 @@ }, { "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", - "MetricExpr": "cpu_core@BR_MISP_RETIRED.INDIRECT_CALL\\,umask\\=0x80@ / BR_MISP_RETIRED.INDIRECT", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / BR_MISP_RETIRED.INDIRECT", "MetricGroup": "Bad;BrMispredicts", "MetricName": "tma_info_bad_spec_ipmisp_indirect", "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3", @@ -1204,6 +1222,13 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Speculative to Retired ratio of all clears (covering mispredicts and nukes)", + "MetricExpr": "cpu_core@INT_MISC.CLEARS_COUNT@ / (cpu_core@BR_MISP_RETIRED.ALL_BRANCHES@ + cpu_core@MACHINE_CLEARS.COUNT@)", + "MetricGroup": "BrMispredicts", + "MetricName": "tma_info_bad_spec_spec_clears_ratio", + "Unit": "cpu_core" + }, + { "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)", "MetricGroup": "Cor;SMT", @@ -1230,61 +1255,94 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Total pipeline cost of \"useful operations\" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead.", + "MetricExpr": "100 * (tma_retiring - (cpu_core@BR_INST_RETIRED.ALL_BRANCHES@ + cpu_core@BR_INST_RETIRED.NEAR_CALL@) / tma_info_thread_slots - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", + "MetricGroup": "Ret", + "MetricName": "tma_info_bottleneck_base_non_br", + "MetricThreshold": "tma_info_bottleneck_base_non_br > 20", + "Unit": "cpu_core" + }, + { "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", - "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC", + "MetricGroup": "BigFootprint;Fed;Frontend;IcMiss;MemoryTLB", "MetricName": "tma_info_bottleneck_big_code", "MetricThreshold": "tma_info_bottleneck_big_code > 20", - "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_bottleneck_branching_overhead", "Unit": "cpu_core" }, { "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", - "MetricExpr": "100 * ((cpu_core@BR_INST_RETIRED.COND@ + 3 * cpu_core@BR_INST_RETIRED.NEAR_CALL@ + (cpu_core@BR_INST_RETIRED.NEAR_TAKEN@ - cpu_core@BR_INST_RETIRED.COND_TAKEN@ - 2 * cpu_core@BR_INST_RETIRED.NEAR_CALL@)) / tma_info_thread_slots)", - "MetricGroup": "Ret;tma_issueBC", + "MetricExpr": "100 * ((cpu_core@BR_INST_RETIRED.ALL_BRANCHES@ + cpu_core@BR_INST_RETIRED.NEAR_CALL@) / tma_info_thread_slots)", + "MetricGroup": "Ret", "MetricName": "tma_info_bottleneck_branching_overhead", - "MetricThreshold": "tma_info_bottleneck_branching_overhead > 10", - "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_bottleneck_big_code", + "MetricThreshold": "tma_info_bottleneck_branching_overhead > 5", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))", + "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", + "MetricName": "tma_info_bottleneck_cache_memory_bandwidth", + "MetricThreshold": "tma_info_bottleneck_cache_memory_bandwidth > 20", + "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", + "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", + "MetricName": "tma_info_bottleneck_cache_memory_latency", + "MetricThreshold": "tma_info_bottleneck_cache_memory_latency > 20", + "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", + "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", + "MetricGroup": "Cor;tma_issueComp", + "MetricName": "tma_info_bottleneck_compute_bound_est", + "MetricThreshold": "tma_info_bottleneck_compute_bound_est > 20", + "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: ", "Unit": "cpu_core" }, { "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", - "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code", + "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - (1 - cpu_core@INST_RETIRED.REP_ITERATION@ / cpu_core@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))) - tma_info_bottleneck_big_code", "MetricGroup": "Fed;FetchBW;Frontend", "MetricName": "tma_info_bottleneck_instruction_fetch_bw", "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20", "Unit": "cpu_core" }, { - "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", - "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))", - "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", - "MetricName": "tma_info_bottleneck_memory_bandwidth", - "MetricThreshold": "tma_info_bottleneck_memory_bandwidth > 20", - "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full", + "BriefDescription": "Total pipeline cost of irregular execution (e.g", + "MetricExpr": "100 * ((1 - cpu_core@INST_RETIRED.REP_ITERATION@ / cpu_core@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + cpu_core@RS.EMPTY\\,umask\\=1@ / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", + "MetricGroup": "Bad;Cor;Ret;tma_issueMS", + "MetricName": "tma_info_bottleneck_irregular_overhead", + "MetricThreshold": "tma_info_bottleneck_irregular_overhead > 10", + "PublicDescription": "Total pipeline cost of irregular execution (e.g. FP-assists in HPC, Wait time with work imbalance multithreaded workloads, overhead in system services or virtualized environments). Related metrics: tma_microcode_sequencer, tma_ms_switches", "Unit": "cpu_core" }, { "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", - "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", + "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", "MetricName": "tma_info_bottleneck_memory_data_tlbs", "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20", - "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store", + "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_synchronization", "Unit": "cpu_core" }, { - "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", - "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound))", - "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", - "MetricName": "tma_info_bottleneck_memory_latency", - "MetricThreshold": "tma_info_bottleneck_memory_latency > 20", - "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency", + "BriefDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)", + "MetricExpr": "100 * (tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))", + "MetricGroup": "Mem;Offcore;tma_issueTLB", + "MetricName": "tma_info_bottleneck_memory_synchronization", + "MetricThreshold": "tma_info_bottleneck_memory_synchronization > 10", + "PublicDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs", "Unit": "cpu_core" }, { "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", - "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", + "MetricExpr": "100 * (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", "MetricName": "tma_info_bottleneck_mispredictions", "MetricThreshold": "tma_info_bottleneck_mispredictions > 20", @@ -1292,6 +1350,15 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class)", + "MetricExpr": "100 - (tma_info_bottleneck_big_code + tma_info_bottleneck_instruction_fetch_bw + tma_info_bottleneck_mispredictions + tma_info_bottleneck_cache_memory_bandwidth + tma_info_bottleneck_cache_memory_latency + tma_info_bottleneck_memory_data_tlbs + tma_info_bottleneck_memory_synchronization + tma_info_bottleneck_compute_bound_est + tma_info_bottleneck_irregular_overhead + tma_info_bottleneck_branching_overhead + tma_info_bottleneck_base_non_br)", + "MetricGroup": "Cor;Offcore", + "MetricName": "tma_info_bottleneck_other_bottlenecks", + "MetricThreshold": "tma_info_bottleneck_other_bottlenecks > 20", + "PublicDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls.", + "Unit": "cpu_core" + }, + { "BriefDescription": "Fraction of branches that are CALL or RET", "MetricExpr": "(cpu_core@BR_INST_RETIRED.NEAR_CALL@ + cpu_core@BR_INST_RETIRED.NEAR_RETURN@) / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches", @@ -1328,7 +1395,7 @@ }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", - "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.DISTRIBUTED@", + "MetricExpr": "(cpu_core@CPU_CLK_UNHALTED.DISTRIBUTED@ if #SMT_on else tma_info_thread_clks)", "MetricGroup": "SMT", "MetricName": "tma_info_core_core_clks", "Unit": "cpu_core" @@ -1341,8 +1408,15 @@ "Unit": "cpu_core" }, { + "BriefDescription": "uops Executed per Cycle", + "MetricExpr": "cpu_core@UOPS_EXECUTED.THREAD@ / tma_info_thread_clks", + "MetricGroup": "Power", + "MetricName": "tma_info_core_epc", + "Unit": "cpu_core" + }, + { "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.SCALAR_SINGLE@ + cpu_core@FP_ARITH_INST_RETIRED.SCALAR_DOUBLE@ + 2 * cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * (cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE@ + cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE@) + 8 * cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@) / tma_info_core_core_clks", + "MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.SCALAR@ + 2 * cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * cpu_core@FP_ARITH_INST_RETIRED.4_FLOPS@ + 8 * cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@) / tma_info_core_core_clks", "MetricGroup": "Flops;Ret", "MetricName": "tma_info_core_flopc", "Unit": "cpu_core" @@ -1356,8 +1430,8 @@ "Unit": "cpu_core" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "cpu_core@UOPS_EXECUTED.THREAD@ / (cpu_core@UOPS_EXECUTED.CORE_CYCLES_GE_1@ / 2 if #SMT_on else cpu_core@UOPS_EXECUTED.CORE_CYCLES_GE_1@)", + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)", + "MetricExpr": "cpu_core@UOPS_EXECUTED.THREAD@ / cpu_core@UOPS_EXECUTED.THREAD\\,cmask\\=1@", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "tma_info_core_ilp", "Unit": "cpu_core" @@ -1429,6 +1503,14 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Average number of cycles the front-end was delayed due to an Unknown Branch detection", + "MetricExpr": "cpu_core@INT_MISC.UNKNOWN_BRANCH_CYCLES@ / cpu_core@INT_MISC.UNKNOWN_BRANCH_CYCLES\\,cmask\\=1\\,edge@", + "MetricGroup": "Fed", + "MetricName": "tma_info_frontend_unknown_branch_cost", + "PublicDescription": "Average number of cycles the front-end was delayed due to an Unknown Branch detection. See Unknown_Branches node.", + "Unit": "cpu_core" + }, + { "BriefDescription": "Branch instructions per taken branch.", "MetricExpr": "cpu_core@BR_INST_RETIRED.ALL_BRANCHES@ / BR_INST_RETIRED.NEAR_TAKEN", "MetricGroup": "Branches;Fed;PGO", @@ -1449,7 +1531,7 @@ "MetricGroup": "Flops;InsType", "MetricName": "tma_info_inst_mix_iparith", "MetricThreshold": "tma_info_inst_mix_iparith < 10", - "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW.", + "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW.", "Unit": "cpu_core" }, { @@ -1458,7 +1540,7 @@ "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx128", "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10", - "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting.", + "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting.", "Unit": "cpu_core" }, { @@ -1467,7 +1549,7 @@ "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx256", "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10", - "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting.", + "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting.", "Unit": "cpu_core" }, { @@ -1476,7 +1558,7 @@ "MetricGroup": "Flops;FpScalar;InsType", "MetricName": "tma_info_inst_mix_iparith_scalar_dp", "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10", - "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting.", + "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting.", "Unit": "cpu_core" }, { @@ -1485,7 +1567,7 @@ "MetricGroup": "Flops;FpScalar;InsType", "MetricName": "tma_info_inst_mix_iparith_scalar_sp", "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10", - "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting.", + "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting.", "Unit": "cpu_core" }, { @@ -1506,7 +1588,7 @@ }, { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", - "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / (cpu_core@FP_ARITH_INST_RETIRED.SCALAR_SINGLE@ + cpu_core@FP_ARITH_INST_RETIRED.SCALAR_DOUBLE@ + 2 * cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * (cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE@ + cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE@) + 8 * cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / (cpu_core@FP_ARITH_INST_RETIRED.SCALAR@ + 2 * cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * cpu_core@FP_ARITH_INST_RETIRED.4_FLOPS@ + 8 * cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@)", "MetricGroup": "Flops;InsType", "MetricName": "tma_info_inst_mix_ipflop", "MetricThreshold": "tma_info_inst_mix_ipflop < 10", @@ -1521,6 +1603,13 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Instructions per PAUSE (lower number means higher occurrence rate)", + "MetricExpr": "tma_info_inst_mix_instructions / CPU_CLK_UNHALTED.PAUSE_INST", + "MetricGroup": "Flops;FpVector;InsType", + "MetricName": "tma_info_inst_mix_ippause", + "Unit": "cpu_core" + }, + { "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / MEM_INST_RETIRED.ALL_STORES", "MetricGroup": "InsType", @@ -1547,164 +1636,178 @@ }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "64 * cpu_core@L1D.REPLACEMENT@ / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l1d_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l1d_cache_fill_bw", + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t", "Unit": "cpu_core" }, { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "64 * cpu_core@L2_LINES_IN.ALL@ / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l2_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l2_cache_fill_bw", + "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t", "Unit": "cpu_core" }, { "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * cpu_core@OFFCORE_REQUESTS.ALL_REQUESTS@ / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l3_cache_access_bw", "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_memory_core_l3_cache_access_bw", + "MetricName": "tma_info_memory_core_l3_cache_access_bw_2t", "Unit": "cpu_core" }, { "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * cpu_core@LONGEST_LAT_CACHE.MISS@ / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l3_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l3_cache_fill_bw", + "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t", "Unit": "cpu_core" }, { "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_fb_hpki", "Unit": "cpu_core" }, { + "BriefDescription": "", + "MetricExpr": "64 * cpu_core@L1D.REPLACEMENT@ / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l1d_cache_fill_bw", + "Unit": "cpu_core" + }, + { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l1mpki", "Unit": "cpu_core" }, { "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * cpu_core@L2_RQSTS.ALL_DEMAND_DATA_RD@ / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l1mpki_load", "Unit": "cpu_core" }, { + "BriefDescription": "", + "MetricExpr": "64 * cpu_core@L2_LINES_IN.ALL@ / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l2_cache_fill_bw", + "Unit": "cpu_core" + }, + { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * (cpu_core@L2_RQSTS.REFERENCES@ - cpu_core@L2_RQSTS.MISS@) / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l2hpki_all", "Unit": "cpu_core" }, { "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * cpu_core@L2_RQSTS.DEMAND_DATA_RD_HIT@ / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l2hpki_load", "Unit": "cpu_core" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.L2_MISS@ / INST_RETIRED.ANY", - "MetricGroup": "Backend;CacheMisses;Mem", + "MetricGroup": "Backend;CacheHits;Mem", "MetricName": "tma_info_memory_l2mpki", "Unit": "cpu_core" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * cpu_core@L2_RQSTS.MISS@ / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem;Offcore", + "MetricGroup": "CacheHits;Mem;Offcore", "MetricName": "tma_info_memory_l2mpki_all", "Unit": "cpu_core" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * cpu_core@L2_RQSTS.DEMAND_DATA_RD_MISS@ / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l2mpki_load", "Unit": "cpu_core" }, { - "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.L3_MISS@ / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_memory_l3mpki", + "BriefDescription": "", + "MetricExpr": "64 * cpu_core@OFFCORE_REQUESTS.ALL_REQUESTS@ / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_l3_cache_access_bw", "Unit": "cpu_core" }, { - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricExpr": "cpu_core@L1D_PEND_MISS.PENDING@ / MEM_LOAD_COMPLETED.L1_MISS_ANY", - "MetricGroup": "Mem;MemoryBound;MemoryLat", - "MetricName": "tma_info_memory_load_miss_real_latency", + "BriefDescription": "", + "MetricExpr": "64 * cpu_core@LONGEST_LAT_CACHE.MISS@ / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l3_cache_fill_bw", "Unit": "cpu_core" }, { - "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", - "MetricExpr": "cpu_core@L1D_PEND_MISS.PENDING@ / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBW;MemoryBound", - "MetricName": "tma_info_memory_mlp", - "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)", + "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.L3_MISS@ / INST_RETIRED.ANY", + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_l3mpki", "Unit": "cpu_core" }, { "BriefDescription": "Average Parallel L2 cache miss data reads", - "MetricExpr": "cpu_core@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD@ / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", + "MetricExpr": "cpu_core@OFFCORE_REQUESTS_OUTSTANDING.DATA_RD@ / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_memory_oro_data_l2_mlp", + "MetricName": "tma_info_memory_latency_data_l2_mlp", "Unit": "cpu_core" }, { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "cpu_core@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD@ / OFFCORE_REQUESTS.DEMAND_DATA_RD", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_memory_oro_load_l2_miss_latency", + "MetricName": "tma_info_memory_latency_load_l2_miss_latency", "Unit": "cpu_core" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "cpu_core@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD@ / cpu_core@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=1@", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_memory_oro_load_l2_mlp", + "MetricName": "tma_info_memory_latency_load_l2_mlp", "Unit": "cpu_core" }, { "BriefDescription": "Average Latency for L3 cache miss demand Loads", "MetricExpr": "cpu_core@OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD@ / OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_memory_oro_load_l3_miss_latency", + "MetricName": "tma_info_memory_latency_load_l3_miss_latency", "Unit": "cpu_core" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t", + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", + "MetricExpr": "cpu_core@L1D_PEND_MISS.PENDING@ / MEM_LOAD_COMPLETED.L1_MISS_ANY", + "MetricGroup": "Mem;MemoryBound;MemoryLat", + "MetricName": "tma_info_memory_load_miss_real_latency", "Unit": "cpu_core" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t", + "BriefDescription": "\"Bus lock\" per kilo instruction", + "MetricExpr": "1e3 * cpu_core@SQ_MISC.BUS_LOCK@ / INST_RETIRED.ANY", + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_mix_bus_lock_pki", "Unit": "cpu_core" }, { - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l3_cache_access_bw", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t", + "BriefDescription": "Un-cacheable retired load per kilo instruction", + "MetricExpr": "1e3 * cpu_core@MEM_LOAD_MISC_RETIRED.UC@ / INST_RETIRED.ANY", + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_mix_uc_load_pki", "Unit": "cpu_core" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t", + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", + "MetricExpr": "cpu_core@L1D_PEND_MISS.PENDING@ / L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Mem;MemoryBW;MemoryBound", + "MetricName": "tma_info_memory_mlp", + "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)", "Unit": "cpu_core" }, { @@ -1737,16 +1840,16 @@ "Unit": "cpu_core" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", - "MetricExpr": "cpu_core@UOPS_EXECUTED.THREAD@ / cpu_core@UOPS_EXECUTED.THREAD\\,cmask\\=1@", + "BriefDescription": "", + "MetricExpr": "cpu_core@UOPS_EXECUTED.THREAD@ / (cpu_core@UOPS_EXECUTED.CORE_CYCLES_GE_1@ / 2 if #SMT_on else cpu_core@UOPS_EXECUTED.THREAD\\,cmask\\=1@)", "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", "MetricName": "tma_info_pipeline_execute", "Unit": "cpu_core" }, { "BriefDescription": "Instructions per a microcode Assist invocation", - "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@ASSISTS.ANY\\,umask\\=0x1B@", - "MetricGroup": "Pipeline;Ret;Retire", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / ASSISTS.ANY", + "MetricGroup": "MicroSeq;Pipeline;Ret;Retire", "MetricName": "tma_info_pipeline_ipassist", "MetricThreshold": "tma_info_pipeline_ipassist < 100e3", "PublicDescription": "Instructions per a microcode Assist invocation. See Assists tree node for details (lower number means higher occurrence rate)", @@ -1762,39 +1865,54 @@ { "BriefDescription": "Estimated fraction of retirement-cycles dealing with repeat instructions", "MetricExpr": "cpu_core@INST_RETIRED.REP_ITERATION@ / cpu_core@UOPS_RETIRED.SLOTS\\,cmask\\=1@", - "MetricGroup": "Pipeline;Ret", + "MetricGroup": "MicroSeq;Pipeline;Ret", "MetricName": "tma_info_pipeline_strings_cycles", "MetricThreshold": "tma_info_pipeline_strings_cycles > 0.1", "Unit": "cpu_core" }, { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "BriefDescription": "Fraction of cycles the processor is waiting yet unhalted; covering legacy PAUSE instruction, as well as C0.1 / C0.2 power-performance optimized states", + "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.C0_WAIT@ / tma_info_thread_clks", + "MetricGroup": "C0Wait", + "MetricName": "tma_info_system_c0_wait", + "MetricThreshold": "tma_info_system_c0_wait > 0.05", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", "MetricGroup": "Power;Summary", - "MetricName": "tma_info_system_average_frequency", + "MetricName": "tma_info_system_core_frequency", "Unit": "cpu_core" }, { - "BriefDescription": "Average CPU Utilization", + "BriefDescription": "Average CPU Utilization (percentage)", "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.REF_TSC@ / TSC", "MetricGroup": "HPC;Summary", "MetricName": "tma_info_system_cpu_utilization", "Unit": "cpu_core" }, { + "BriefDescription": "Average number of utilized CPUs", + "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization", + "MetricGroup": "Summary", + "MetricName": "tma_info_system_cpus_utilized", + "Unit": "cpu_core" + }, + { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW", "MetricName": "tma_info_system_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_mem_bandwidth, tma_sq_full", + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_mem_bandwidth, tma_sq_full", "Unit": "cpu_core" }, { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.SCALAR_SINGLE@ + cpu_core@FP_ARITH_INST_RETIRED.SCALAR_DOUBLE@ + 2 * cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * (cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE@ + cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE@) + 8 * cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@) / 1e9 / duration_time", + "MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.SCALAR@ + 2 * cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * cpu_core@FP_ARITH_INST_RETIRED.4_FLOPS@ + 8 * cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@) / 1e9 / duration_time", "MetricGroup": "Cor;Flops;HPC", "MetricName": "tma_info_system_gflops", - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine.", + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width", "Unit": "cpu_core" }, { @@ -1838,14 +1956,6 @@ "Unit": "cpu_core" }, { - "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(UNC_ARB_TRK_OCCUPANCY.ALL + UNC_ARB_DAT_OCCUPANCY.RD) / UNC_ARB_TRK_REQUESTS.ALL", - "MetricGroup": "Mem;SoC", - "MetricName": "tma_info_system_mem_request_latency", - "Unit": "cpu_core" - }, - { "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", "MetricExpr": "(1 - cpu_core@CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE@ / cpu_core@CPU_CLK_UNHALTED.REF_DISTRIBUTED@ if #SMT_on else 0)", "MetricGroup": "SMT", @@ -1927,7 +2037,7 @@ }, { "BriefDescription": "This metric represents overall Integer (Int) select operations fraction the CPU has executed (retired)", - "MetricExpr": "tma_int_vector_128b + tma_int_vector_256b + tma_shuffles", + "MetricExpr": "tma_int_vector_128b + tma_int_vector_256b", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_int_operations", "MetricThreshold": "tma_int_operations > 0.1 & tma_light_operations > 0.6", @@ -1946,19 +2056,19 @@ "Unit": "cpu_core" }, { - "BriefDescription": "This metric represents 256-bit vector Integer ADD/SUB/SAD or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired", + "BriefDescription": "This metric represents 256-bit vector Integer ADD/SUB/SAD/MUL or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired", "MetricExpr": "(cpu_core@INT_VEC_RETIRED.ADD_256@ + cpu_core@INT_VEC_RETIRED.MUL_256@ + cpu_core@INT_VEC_RETIRED.VNNI_256@) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;IntVector;Pipeline;TopdownL4;tma_L4_group;tma_int_operations_group;tma_issue2P", "MetricName": "tma_int_vector_256b", "MetricThreshold": "tma_int_vector_256b > 0.1 & (tma_int_operations > 0.1 & tma_light_operations > 0.6)", - "PublicDescription": "This metric represents 256-bit vector Integer ADD/SUB/SAD or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_int_vector_128b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", + "PublicDescription": "This metric represents 256-bit vector Integer ADD/SUB/SAD/MUL or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_int_vector_128b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", "ScaleUnit": "100%", "Unit": "cpu_core" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", "MetricExpr": "cpu_core@ICACHE_TAG.STALLS@ / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS", @@ -1968,7 +2078,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", "MetricExpr": "max((cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@ - cpu_core@MEMORY_ACTIVITY.STALLS_L1D_MISS@) / tma_info_thread_clks, 0)", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1", @@ -1978,7 +2088,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", "MetricExpr": "(cpu_core@MEMORY_ACTIVITY.STALLS_L1D_MISS@ - cpu_core@MEMORY_ACTIVITY.STALLS_L2_MISS@) / tma_info_thread_clks", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L2_HIT_PS", @@ -1988,7 +2098,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", "MetricExpr": "(cpu_core@MEMORY_ACTIVITY.STALLS_L2_MISS@ - cpu_core@MEMORY_ACTIVITY.STALLS_L3_MISS@) / tma_info_thread_clks", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS", @@ -1996,12 +2106,12 @@ "Unit": "cpu_core" }, { - "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", - "MetricExpr": "9 * tma_info_system_average_frequency * cpu_core@MEM_LOAD_RETIRED.L3_HIT@ * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks", + "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", + "MetricExpr": "9 * tma_info_system_core_frequency * (cpu_core@MEM_LOAD_RETIRED.L3_HIT@ * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2)) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_memory_latency, tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_cache_memory_latency, tma_mem_latency", "ScaleUnit": "100%", "Unit": "cpu_core" }, @@ -2022,7 +2132,7 @@ "MetricName": "tma_light_operations", "MetricThreshold": "tma_light_operations > 0.6", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST", "ScaleUnit": "100%", "Unit": "cpu_core" }, @@ -2069,7 +2179,7 @@ "MetricExpr": "(cpu_core@LSD.CYCLES_ACTIVE@ - cpu_core@LSD.CYCLES_OK@) / tma_info_core_core_clks / 2", "MetricGroup": "FetchBW;LSD;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_lsd", - "MetricThreshold": "tma_lsd > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35)", + "MetricThreshold": "tma_lsd > 0.15 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit. LSD typically does well sustaining Uop supply. However; in some rare cases; optimal uop-delivery could not be reached for small loops whose size (in terms of number of uops) does not suit well the LSD structure.", "ScaleUnit": "100%", "Unit": "cpu_core" @@ -2086,22 +2196,22 @@ "Unit": "cpu_core" }, { - "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(cpu_core@CPU_CLK_UNHALTED.THREAD@, cpu_core@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%", "Unit": "cpu_core" }, { - "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(cpu_core@CPU_CLK_UNHALTED.THREAD@, cpu_core@OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD@) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_memory_latency, tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_cache_memory_latency, tma_l3_hit_latency", "ScaleUnit": "100%", "Unit": "cpu_core" }, @@ -2120,9 +2230,9 @@ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to LFENCE Instructions.", "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "13 * cpu_core@MISC2_RETIRED.LFENCE@ / tma_info_thread_clks", - "MetricGroup": "TopdownL6;tma_L6_group;tma_serializing_operation_group", + "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group", "MetricName": "tma_memory_fence", - "MetricThreshold": "tma_memory_fence > 0.05 & (tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))))", + "MetricThreshold": "tma_memory_fence > 0.05 & (tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", "ScaleUnit": "100%", "Unit": "cpu_core" }, @@ -2141,7 +2251,7 @@ "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", "MetricName": "tma_microcode_sequencer", "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1", - "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: UOPS_RETIRED.MS. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_ms_switches", + "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: UOPS_RETIRED.MS. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_ms_switches", "ScaleUnit": "100%", "Unit": "cpu_core" }, @@ -2160,35 +2270,35 @@ "MetricExpr": "(cpu_core@IDQ.MITE_CYCLES_ANY@ - cpu_core@IDQ.MITE_CYCLES_OK@) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35)", + "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", "ScaleUnit": "100%", "Unit": "cpu_core" }, { - "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued", + "BriefDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles)", "MetricExpr": "160 * cpu_core@ASSISTS.SSE_AVX_MIX@ / tma_info_thread_clks", "MetricGroup": "TopdownL5;tma_L5_group;tma_issueMV;tma_ports_utilized_0_group", "MetricName": "tma_mixing_vectors", "MetricThreshold": "tma_mixing_vectors > 0.05", - "PublicDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued. Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches", + "PublicDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles). Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches", "ScaleUnit": "100%", "Unit": "cpu_core" }, { "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", - "MetricExpr": "3 * cpu_core@UOPS_RETIRED.MS\\,cmask\\=1\\,edge@ / (tma_retiring * tma_info_thread_slots / cpu_core@UOPS_ISSUED.ANY@) / tma_info_thread_clks", + "MetricExpr": "3 * cpu_core@UOPS_RETIRED.MS\\,cmask\\=1\\,edge@ / (cpu_core@UOPS_RETIRED.SLOTS@ / cpu_core@UOPS_ISSUED.ANY@) / tma_info_thread_clks", "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO", "MetricName": "tma_ms_switches", "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: FRONTEND_RETIRED.MS_FLOWS. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation", + "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: FRONTEND_RETIRED.MS_FLOWS. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation", "ScaleUnit": "100%", "Unit": "cpu_core" }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused", "MetricExpr": "tma_light_operations * (cpu_core@BR_INST_RETIRED.ALL_BRANCHES@ - cpu_core@INST_RETIRED.MACRO_FUSED@) / (tma_retiring * tma_info_thread_slots)", - "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", + "MetricGroup": "Branches;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_non_fused_branches", "MetricThreshold": "tma_non_fused_branches > 0.1 & tma_light_operations > 0.6", "PublicDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused. Non-conditional branches like direct JMP or CALL would count here. Can be used to examine fusible conditional jumps that were not fused.", @@ -2198,16 +2308,16 @@ { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions", "MetricExpr": "tma_light_operations * cpu_core@INST_RETIRED.NOP@ / (tma_retiring * tma_info_thread_slots)", - "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", + "MetricGroup": "Pipeline;TopdownL4;tma_L4_group;tma_other_light_ops_group", "MetricName": "tma_nop_instructions", - "MetricThreshold": "tma_nop_instructions > 0.1 & tma_light_operations > 0.6", + "MetricThreshold": "tma_nop_instructions > 0.1 & (tma_other_light_ops > 0.3 & tma_light_operations > 0.6)", "PublicDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. start address of a function or loop body. Sample with: INST_RETIRED.NOP", "ScaleUnit": "100%", "Unit": "cpu_core" }, { "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes", - "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_int_operations + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches + tma_nop_instructions))", + "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_int_operations + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches))", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_other_light_ops", "MetricThreshold": "tma_other_light_ops > 0.3 & tma_light_operations > 0.6", @@ -2216,6 +2326,24 @@ "Unit": "cpu_core" }, { + "BriefDescription": "This metric estimates fraction of slots the CPU was stalled due to other cases of misprediction (non-retired x86 branches or other types).", + "MetricExpr": "max(tma_branch_mispredicts * (1 - cpu_core@BR_MISP_RETIRED.ALL_BRANCHES@ / (cpu_core@INT_MISC.CLEARS_COUNT@ - cpu_core@MACHINE_CLEARS.COUNT@)), 0.0001)", + "MetricGroup": "BrMispredicts;TopdownL3;tma_L3_group;tma_branch_mispredicts_group", + "MetricName": "tma_other_mispredicts", + "MetricThreshold": "tma_other_mispredicts > 0.05 & (tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15)", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Nukes (Machine Clears) not related to memory ordering.", + "MetricExpr": "max(tma_machine_clears * (1 - cpu_core@MACHINE_CLEARS.MEMORY_ORDERING@ / cpu_core@MACHINE_CLEARS.COUNT@), 0.0001)", + "MetricGroup": "Machine_Clears;TopdownL3;tma_L3_group;tma_machine_clears_group", + "MetricName": "tma_other_nukes", + "MetricThreshold": "tma_other_nukes > 0.05 & (tma_machine_clears > 0.1 & tma_bad_speculation > 0.15)", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Page Faults", "MetricExpr": "99 * cpu_core@ASSISTS.PAGE_FAULT@ / tma_info_thread_slots", "MetricGroup": "TopdownL5;tma_L5_group;tma_assists_group", @@ -2246,18 +2374,18 @@ "Unit": "cpu_core" }, { - "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)", + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU)", "MetricExpr": "cpu_core@UOPS_DISPATCHED.PORT_6@ / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_6", "MetricThreshold": "tma_port_6 > 0.6", - "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_int_vector_128b, tma_int_vector_256b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_int_vector_128b, tma_int_vector_256b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2", "ScaleUnit": "100%", "Unit": "cpu_core" }, { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", - "MetricExpr": "((cpu_core@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_serializing_operation * (cpu_core@CYCLE_ACTIVITY.STALLS_TOTAL@ - cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@) + (cpu_core@EXE_ACTIVITY.1_PORTS_UTIL@ + tma_retiring * cpu_core@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@)) / tma_info_thread_clks if cpu_core@ARITH.DIV_ACTIVE@ < cpu_core@CYCLE_ACTIVITY.STALLS_TOTAL@ - cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@ else (cpu_core@EXE_ACTIVITY.1_PORTS_UTIL@ + tma_retiring * cpu_core@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@) / tma_info_thread_clks)", + "MetricExpr": "((tma_ports_utilized_0 * tma_info_thread_clks + (cpu_core@EXE_ACTIVITY.1_PORTS_UTIL@ + tma_retiring * cpu_core@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@)) / tma_info_thread_clks if cpu_core@ARITH.DIV_ACTIVE@ < cpu_core@CYCLE_ACTIVITY.STALLS_TOTAL@ - cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@ else (cpu_core@EXE_ACTIVITY.1_PORTS_UTIL@ + tma_retiring * cpu_core@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@) / tma_info_thread_clks)", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -2267,7 +2395,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "cpu_core@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / tma_info_thread_clks + tma_serializing_operation * (cpu_core@CYCLE_ACTIVITY.STALLS_TOTAL@ - cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@) / tma_info_thread_clks", + "MetricExpr": "(cpu_core@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + cpu_core@RS.EMPTY\\,umask\\=1@) / tma_info_thread_clks * (cpu_core@CYCLE_ACTIVITY.STALLS_TOTAL@ - cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@) / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -2302,7 +2430,7 @@ "MetricExpr": "cpu_core@UOPS_EXECUTED.CYCLES_GE_3@ / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", - "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", + "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", "PublicDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Sample with: UOPS_EXECUTED.CYCLES_GE_3", "ScaleUnit": "100%", "Unit": "cpu_core" @@ -2321,20 +2449,21 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations", - "MetricExpr": "cpu_core@RESOURCE_STALLS.SCOREBOARD@ / tma_info_thread_clks", - "MetricGroup": "PortsUtil;TopdownL5;tma_L5_group;tma_issueSO;tma_ports_utilized_0_group", + "MetricExpr": "cpu_core@RESOURCE_STALLS.SCOREBOARD@ / tma_info_thread_clks + tma_c02_wait", + "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group;tma_issueSO", "MetricName": "tma_serializing_operation", - "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))", + "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. Sample with: RESOURCE_STALLS.SCOREBOARD. Related metrics: tma_ms_switches", "ScaleUnit": "100%", "Unit": "cpu_core" }, { - "BriefDescription": "This metric represents Shuffle (cross \"vector lane\" data transfers) uops fraction the CPU has retired.", - "MetricExpr": "cpu_core@INT_VEC_RETIRED.SHUFFLES@ / (tma_retiring * tma_info_thread_slots)", - "MetricGroup": "HPC;Pipeline;TopdownL4;tma_L4_group;tma_int_operations_group", - "MetricName": "tma_shuffles", - "MetricThreshold": "tma_shuffles > 0.1 & (tma_int_operations > 0.1 & tma_light_operations > 0.6)", + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring Shuffle operations of 256-bit vector size (FP or Integer)", + "MetricExpr": "tma_light_operations * cpu_core@INT_VEC_RETIRED.SHUFFLES@ / (tma_retiring * tma_info_thread_slots)", + "MetricGroup": "HPC;Pipeline;TopdownL4;tma_L4_group;tma_other_light_ops_group", + "MetricName": "tma_shuffles_256b", + "MetricThreshold": "tma_shuffles_256b > 0.1 & (tma_other_light_ops > 0.3 & tma_light_operations > 0.6)", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring Shuffle operations of 256-bit vector size (FP or Integer). Shuffles may incur slow cross \"vector lane\" data transfers.", "ScaleUnit": "100%", "Unit": "cpu_core" }, @@ -2342,9 +2471,9 @@ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions", "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.PAUSE@ / tma_info_thread_clks", - "MetricGroup": "TopdownL6;tma_L6_group;tma_serializing_operation_group", + "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group", "MetricName": "tma_slow_pause", - "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))))", + "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions. Sample with: CPU_CLK_UNHALTED.PAUSE_INST", "ScaleUnit": "100%", "Unit": "cpu_core" @@ -2375,7 +2504,7 @@ "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%", "Unit": "cpu_core" }, @@ -2450,10 +2579,10 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", "MetricExpr": "cpu_core@INT_MISC.UNKNOWN_BRANCH_CYCLES@ / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", + "MetricGroup": "BigFootprint;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", "MetricName": "tma_unknown_branches", "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit). Sample with: FRONTEND_RETIRED.UNKNOWN_BRANCH", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit) hence called Unknown Branches. Sample with: FRONTEND_RETIRED.UNKNOWN_BRANCH", "ScaleUnit": "100%", "Unit": "cpu_core" }, diff --git a/tools/perf/pmu-events/arch/x86/alderlake/floating-point.json b/tools/perf/pmu-events/arch/x86/alderlake/floating-point.json index c8ba96c4a7..cd291943dc 100644 --- a/tools/perf/pmu-events/arch/x86/alderlake/floating-point.json +++ b/tools/perf/pmu-events/arch/x86/alderlake/floating-point.json @@ -26,7 +26,7 @@ "Unit": "cpu_core" }, { - "BriefDescription": "FP_ARITH_DISPATCHED.PORT_0", + "BriefDescription": "FP_ARITH_DISPATCHED.PORT_0 [This event is alias to FP_ARITH_DISPATCHED.V0]", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.PORT_0", "SampleAfterValue": "2000003", @@ -34,7 +34,7 @@ "Unit": "cpu_core" }, { - "BriefDescription": "FP_ARITH_DISPATCHED.PORT_1", + "BriefDescription": "FP_ARITH_DISPATCHED.PORT_1 [This event is alias to FP_ARITH_DISPATCHED.V1]", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.PORT_1", "SampleAfterValue": "2000003", @@ -42,7 +42,7 @@ "Unit": "cpu_core" }, { - "BriefDescription": "FP_ARITH_DISPATCHED.PORT_5", + "BriefDescription": "FP_ARITH_DISPATCHED.PORT_5 [This event is alias to FP_ARITH_DISPATCHED.V2]", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.PORT_5", "SampleAfterValue": "2000003", @@ -50,6 +50,30 @@ "Unit": "cpu_core" }, { + "BriefDescription": "FP_ARITH_DISPATCHED.V0 [This event is alias to FP_ARITH_DISPATCHED.PORT_0]", + "EventCode": "0xb3", + "EventName": "FP_ARITH_DISPATCHED.V0", + "SampleAfterValue": "2000003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "FP_ARITH_DISPATCHED.V1 [This event is alias to FP_ARITH_DISPATCHED.PORT_1]", + "EventCode": "0xb3", + "EventName": "FP_ARITH_DISPATCHED.V1", + "SampleAfterValue": "2000003", + "UMask": "0x2", + "Unit": "cpu_core" + }, + { + "BriefDescription": "FP_ARITH_DISPATCHED.V2 [This event is alias to FP_ARITH_DISPATCHED.PORT_5]", + "EventCode": "0xb3", + "EventName": "FP_ARITH_DISPATCHED.V2", + "SampleAfterValue": "2000003", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { "BriefDescription": "Counts number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", diff --git a/tools/perf/pmu-events/arch/x86/alderlake/metricgroups.json b/tools/perf/pmu-events/arch/x86/alderlake/metricgroups.json index 516eb0f93f..7a03835f26 100644 --- a/tools/perf/pmu-events/arch/x86/alderlake/metricgroups.json +++ b/tools/perf/pmu-events/arch/x86/alderlake/metricgroups.json @@ -2,10 +2,11 @@ "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "C0Wait": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "CodeGen": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -26,7 +27,9 @@ "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -69,6 +72,7 @@ "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category", "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category", "tma_base_group": "Metrics contributing to tma_base category", + "tma_branch_mispredicts_group": "Metrics contributing to tma_branch_mispredicts category", "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category", "tma_core_bound_group": "Metrics contributing to tma_core_bound category", "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category", @@ -82,9 +86,9 @@ "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category", "tma_int_operations_group": "Metrics contributing to tma_int_operations category", "tma_issue2P": "Metrics related by the issue $issue2P", - "tma_issueBC": "Metrics related by the issue $issueBC", "tma_issueBM": "Metrics related by the issue $issueBM", "tma_issueBW": "Metrics related by the issue $issueBW", + "tma_issueComp": "Metrics related by the issue $issueComp", "tma_issueD0": "Metrics related by the issue $issueD0", "tma_issueFB": "Metrics related by the issue $issueFB", "tma_issueFL": "Metrics related by the issue $issueFL", @@ -111,6 +115,7 @@ "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", "tma_mite_group": "Metrics contributing to tma_mite category", "tma_nuke_group": "Metrics contributing to tma_nuke category", + "tma_other_light_ops_group": "Metrics contributing to tma_other_light_ops category", "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category", "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category", "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category", diff --git a/tools/perf/pmu-events/arch/x86/alderlake/other.json b/tools/perf/pmu-events/arch/x86/alderlake/other.json index 1db73e0202..5250a17d9c 100644 --- a/tools/perf/pmu-events/arch/x86/alderlake/other.json +++ b/tools/perf/pmu-events/arch/x86/alderlake/other.json @@ -40,6 +40,16 @@ "Unit": "cpu_core" }, { + "BriefDescription": "This event is deprecated. [This event is alias to MISC_RETIRED.LBR_INSERTS]", + "Deprecated": "1", + "EventCode": "0xe4", + "EventName": "LBR_INSERTS.ANY", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, + { "BriefDescription": "Counts modified writebacks from L1 cache and L2 cache that have any type of response.", "EventCode": "0xB7", "EventName": "OCR.COREWB_M.ANY_RESPONSE", diff --git a/tools/perf/pmu-events/arch/x86/alderlake/pipeline.json b/tools/perf/pmu-events/arch/x86/alderlake/pipeline.json index f9876bef16..df6032e816 100644 --- a/tools/perf/pmu-events/arch/x86/alderlake/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/alderlake/pipeline.json @@ -799,6 +799,7 @@ "BriefDescription": "INST_RETIRED.MACRO_FUSED", "EventCode": "0xc0", "EventName": "INST_RETIRED.MACRO_FUSED", + "PEBS": "1", "SampleAfterValue": "2000003", "UMask": "0x10", "Unit": "cpu_core" @@ -807,6 +808,7 @@ "BriefDescription": "Retired NOP instructions.", "EventCode": "0xc0", "EventName": "INST_RETIRED.NOP", + "PEBS": "1", "PublicDescription": "Counts all retired NOP or ENDBR32/64 instructions", "SampleAfterValue": "2000003", "UMask": "0x2", @@ -825,6 +827,7 @@ "BriefDescription": "Iterations of Repeat string retired instructions.", "EventCode": "0xc0", "EventName": "INST_RETIRED.REP_ITERATION", + "PEBS": "1", "PublicDescription": "Number of iterations of Repeat (REP) string retired instructions such as MOVS, CMPS, and SCAS. Each has a byte, word, and doubleword version and string instructions can be repeated using a repetition prefix, REP, that allows their architectural execution to be repeated a number of times as specified by the RCX register. Note the number of iterations is implementation-dependent.", "SampleAfterValue": "2000003", "UMask": "0x8", @@ -1107,6 +1110,16 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Counts the number of LBR entries recorded. Requires LBRs to be enabled in IA32_LBR_CTL. [This event is alias to LBR_INSERTS.ANY]", + "EventCode": "0xe4", + "EventName": "MISC_RETIRED.LBR_INSERTS", + "PEBS": "1", + "PublicDescription": "Counts the number of LBR entries recorded. Requires LBRs to be enabled in IA32_LBR_CTL. This event is PDIR on GP0 and NPEBS on all other GPs [This event is alias to LBR_INSERTS.ANY]", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, + { "BriefDescription": "Increments whenever there is an update to the LBR array.", "EventCode": "0xcc", "EventName": "MISC_RETIRED.LBR_INSERTS", diff --git a/tools/perf/pmu-events/arch/x86/alderlaken/other.json b/tools/perf/pmu-events/arch/x86/alderlaken/other.json index 6336de61f6..ccc892149d 100644 --- a/tools/perf/pmu-events/arch/x86/alderlaken/other.json +++ b/tools/perf/pmu-events/arch/x86/alderlaken/other.json @@ -1,5 +1,14 @@ [ { + "BriefDescription": "This event is deprecated. [This event is alias to MISC_RETIRED.LBR_INSERTS]", + "Deprecated": "1", + "EventCode": "0xe4", + "EventName": "LBR_INSERTS.ANY", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x1" + }, + { "BriefDescription": "Counts modified writebacks from L1 cache and L2 cache that have any type of response.", "EventCode": "0xB7", "EventName": "OCR.COREWB_M.ANY_RESPONSE", diff --git a/tools/perf/pmu-events/arch/x86/alderlaken/pipeline.json b/tools/perf/pmu-events/arch/x86/alderlaken/pipeline.json index 3153bab527..846bcdafca 100644 --- a/tools/perf/pmu-events/arch/x86/alderlaken/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/alderlaken/pipeline.json @@ -345,6 +345,15 @@ "UMask": "0x1" }, { + "BriefDescription": "Counts the number of LBR entries recorded. Requires LBRs to be enabled in IA32_LBR_CTL. [This event is alias to LBR_INSERTS.ANY]", + "EventCode": "0xe4", + "EventName": "MISC_RETIRED.LBR_INSERTS", + "PEBS": "1", + "PublicDescription": "Counts the number of LBR entries recorded. Requires LBRs to be enabled in IA32_LBR_CTL. This event is PDIR on GP0 and NPEBS on all other GPs [This event is alias to LBR_INSERTS.ANY]", + "SampleAfterValue": "1000003", + "UMask": "0x1" + }, + { "BriefDescription": "Counts the number of issue slots not consumed by the backend due to a micro-sequencer (MS) scoreboard, which stalls the front-end from issuing from the UROM until a specified older uop retires.", "EventCode": "0x75", "EventName": "SERIALIZATION.NON_C01_MS_SCB", diff --git a/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json b/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json index 55a10b0bf3..c20833fb1f 100644 --- a/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json +++ b/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json @@ -84,12 +84,12 @@ "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", - "MetricThreshold": "tma_alu_op_utilization > 0.6", + "MetricThreshold": "tma_alu_op_utilization > 0.4", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", - "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots", + "MetricExpr": "66 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_assists", "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", @@ -211,7 +211,7 @@ "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", + "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", "ScaleUnit": "100%" }, @@ -266,7 +266,7 @@ "MetricExpr": "tma_frontend_bound - tma_fetch_latency", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.2", "MetricgroupNoGroup": "TopdownL2", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" @@ -343,27 +343,20 @@ "MetricName": "tma_heavy_operations", "MetricThreshold": "tma_heavy_operations > 0.1", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .)", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.", "MetricExpr": "ICACHE.IFDATA_STALL / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricGroup": "BigFootprint;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_icache_misses", "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "ScaleUnit": "100%" }, { - "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", - "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BrMispredicts;tma_issueBM", - "MetricName": "tma_info_bad_spec_branch_misprediction_cost", - "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_mispredicts_resteers" - }, - { "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", - "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)", + "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * BR_MISP_EXEC.INDIRECT)", "MetricGroup": "Bad;BrMispredicts", "MetricName": "tma_info_bad_spec_ipmisp_indirect", "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3" @@ -389,7 +382,7 @@ }, { "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / tma_info_core_core_clks", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / tma_info_core_core_clks", "MetricGroup": "Flops;Ret", "MetricName": "tma_info_core_flopc" }, @@ -401,8 +394,8 @@ "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)", + "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "tma_info_core_ilp" }, @@ -439,7 +432,7 @@ "MetricGroup": "Flops;InsType", "MetricName": "tma_info_inst_mix_iparith", "MetricThreshold": "tma_info_inst_mix_iparith < 10", - "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." + "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW." }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", @@ -447,7 +440,7 @@ "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx128", "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10", - "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", @@ -455,7 +448,7 @@ "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx256", "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10", - "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", @@ -463,7 +456,7 @@ "MetricGroup": "Flops;FpScalar;InsType", "MetricName": "tma_info_inst_mix_iparith_scalar_dp", "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10", - "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", @@ -471,7 +464,7 @@ "MetricGroup": "Flops;FpScalar;InsType", "MetricName": "tma_info_inst_mix_iparith_scalar_sp", "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10", - "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", @@ -489,7 +482,7 @@ }, { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", "MetricGroup": "Flops;InsType", "MetricName": "tma_info_inst_mix_ipflop", "MetricThreshold": "tma_info_inst_mix_ipflop < 10" @@ -518,120 +511,114 @@ }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l1d_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l1d_cache_fill_bw" + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t" }, { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l2_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l2_cache_fill_bw" + "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t" }, { "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l3_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l3_cache_fill_bw" + "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t" + }, + { + "BriefDescription": "", + "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l1d_cache_fill_bw" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l1mpki" }, { + "BriefDescription": "", + "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l2_cache_fill_bw" + }, + { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l2hpki_all" }, { "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l2hpki_load" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "Backend;CacheMisses;Mem", + "MetricGroup": "Backend;CacheHits;Mem", "MetricName": "tma_info_memory_l2mpki" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem;Offcore", + "MetricGroup": "CacheHits;Mem;Offcore", "MetricName": "tma_info_memory_l2mpki_all" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l2mpki_load" }, { + "BriefDescription": "", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l3_cache_fill_bw" + }, + { "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "Mem", "MetricName": "tma_info_memory_l3mpki" }, { - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)", - "MetricGroup": "Mem;MemoryBound;MemoryLat", - "MetricName": "tma_info_memory_load_miss_real_latency" - }, - { - "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBW;MemoryBound", - "MetricName": "tma_info_memory_mlp", - "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" - }, - { "BriefDescription": "Average Parallel L2 cache miss data reads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_memory_oro_data_l2_mlp" + "MetricName": "tma_info_memory_latency_data_l2_mlp" }, { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_memory_oro_load_l2_miss_latency" + "MetricName": "tma_info_memory_latency_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_memory_oro_load_l2_mlp" + "MetricName": "tma_info_memory_latency_load_l2_mlp" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t" - }, - { - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t" - }, - { - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "0", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t" + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)", + "MetricGroup": "Mem;MemoryBound;MemoryLat", + "MetricName": "tma_info_memory_load_miss_real_latency" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t" + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Mem;MemoryBW;MemoryBound", + "MetricName": "tma_info_memory_mlp", + "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", @@ -641,8 +628,8 @@ "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", - "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", + "BriefDescription": "", + "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", "MetricName": "tma_info_pipeline_execute" }, @@ -653,30 +640,36 @@ "MetricName": "tma_info_pipeline_retire" }, { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", "MetricGroup": "Power;Summary", - "MetricName": "tma_info_system_average_frequency" + "MetricName": "tma_info_system_core_frequency" }, { - "BriefDescription": "Average CPU Utilization", + "BriefDescription": "Average CPU Utilization (percentage)", "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", "MetricGroup": "HPC;Summary", "MetricName": "tma_info_system_cpu_utilization" }, { + "BriefDescription": "Average number of utilized CPUs", + "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization", + "MetricGroup": "Summary", + "MetricName": "tma_info_system_cpus_utilized" + }, + { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW", "MetricName": "tma_info_system_dram_bw_use", "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_mem_bandwidth, tma_sq_full" }, { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1e9 / duration_time", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1e9 / duration_time", "MetricGroup": "Cor;Flops;HPC", "MetricName": "tma_info_system_gflops", - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width" }, { "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", @@ -699,19 +692,6 @@ "MetricThreshold": "tma_info_system_kernel_utilization > 0.05" }, { - "BriefDescription": "Average number of parallel requests to external memory", - "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_OCCUPANCY.CYCLES_WITH_ANY_REQUEST", - "MetricGroup": "Mem;SoC", - "MetricName": "tma_info_system_mem_parallel_requests", - "PublicDescription": "Average number of parallel requests to external memory. Accounts for all requests" - }, - { - "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)", - "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_REQUESTS.ALL", - "MetricGroup": "Mem;SoC", - "MetricName": "tma_info_system_mem_request_latency" - }, - { "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0)", "MetricGroup": "SMT", @@ -777,7 +757,7 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", "MetricExpr": "(14 * ITLB_MISSES.STLB_HIT + cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * ITLB_MISSES.WALK_COMPLETED) / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: ITLB_MISSES.WALK_COMPLETED", @@ -786,7 +766,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_UOPS_RETIRED.L1_HIT_PS;MEM_LOAD_UOPS_RETIRED.HIT_LFB_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1", @@ -795,7 +775,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L2_HIT_PS", @@ -805,20 +785,20 @@ "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", "MetricConstraint": "NO_GROUP_EVENTS_SMT", "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS) * CYCLE_ACTIVITY.STALLS_L2_MISS / tma_info_thread_clks", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", + "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "29 * (MEM_LOAD_UOPS_RETIRED.L3_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS))) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS. Related metrics: tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS. Related metrics: tma_mem_latency", "ScaleUnit": "100%" }, { @@ -837,7 +817,7 @@ "MetricName": "tma_light_operations", "MetricThreshold": "tma_light_operations > 0.6", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST", "ScaleUnit": "100%" }, { @@ -872,21 +852,21 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_l3_hit_latency", "ScaleUnit": "100%" }, { @@ -923,7 +903,7 @@ "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", + "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.", "ScaleUnit": "100%" }, @@ -991,12 +971,12 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)", + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU)", "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_6", "MetricThreshold": "tma_port_6 > 0.6", - "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2", "ScaleUnit": "100%" }, { @@ -1050,7 +1030,7 @@ "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", - "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", + "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", "ScaleUnit": "100%" }, { @@ -1130,10 +1110,10 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", "MetricExpr": "tma_branch_resteers - tma_mispredicts_resteers - tma_clears_resteers", - "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", + "MetricGroup": "BigFootprint;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", "MetricName": "tma_unknown_branches", "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit). Sample with: BACLEARS.ANY", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit) hence called Unknown Branches. Sample with: BACLEARS.ANY", "ScaleUnit": "100%" }, { diff --git a/tools/perf/pmu-events/arch/x86/broadwell/memory.json b/tools/perf/pmu-events/arch/x86/broadwell/memory.json index ac7cdb8319..b01ed47072 100644 --- a/tools/perf/pmu-events/arch/x86/broadwell/memory.json +++ b/tools/perf/pmu-events/arch/x86/broadwell/memory.json @@ -2005,7 +2005,7 @@ "BriefDescription": "Number of times RTM abort was triggered", "EventCode": "0xc9", "EventName": "RTM_RETIRED.ABORTED", - "PEBS": "1", + "PEBS": "2", "PublicDescription": "Number of times RTM abort was triggered .", "SampleAfterValue": "2000003", "UMask": "0x4" diff --git a/tools/perf/pmu-events/arch/x86/broadwell/metricgroups.json b/tools/perf/pmu-events/arch/x86/broadwell/metricgroups.json index f6a0258e32..8c808347f6 100644 --- a/tools/perf/pmu-events/arch/x86/broadwell/metricgroups.json +++ b/tools/perf/pmu-events/arch/x86/broadwell/metricgroups.json @@ -2,10 +2,10 @@ "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -24,7 +24,9 @@ "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -94,6 +96,7 @@ "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category", "tma_light_operations_group": "Metrics contributing to tma_light_operations category", "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category", + "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category", "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", diff --git a/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json b/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json index e1f55fcfa0..8263577872 100644 --- a/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json +++ b/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json @@ -84,12 +84,12 @@ "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", - "MetricThreshold": "tma_alu_op_utilization > 0.6", + "MetricThreshold": "tma_alu_op_utilization > 0.4", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", - "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots", + "MetricExpr": "66 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_assists", "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", @@ -211,7 +211,7 @@ "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", + "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", "ScaleUnit": "100%" }, @@ -257,7 +257,7 @@ "MetricExpr": "tma_frontend_bound - tma_fetch_latency", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.2", "MetricgroupNoGroup": "TopdownL2", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" @@ -334,28 +334,21 @@ "MetricName": "tma_heavy_operations", "MetricThreshold": "tma_heavy_operations > 0.1", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .)", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", "MetricExpr": "ICACHE.IFDATA_STALL / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricGroup": "BigFootprint;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_icache_misses", "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS", "ScaleUnit": "100%" }, { - "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", - "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BrMispredicts;tma_issueBM", - "MetricName": "tma_info_bad_spec_branch_misprediction_cost", - "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_mispredicts_resteers" - }, - { "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", - "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)", + "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * BR_MISP_EXEC.INDIRECT)", "MetricGroup": "Bad;BrMispredicts", "MetricName": "tma_info_bad_spec_ipmisp_indirect", "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3" @@ -381,7 +374,7 @@ }, { "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / tma_info_core_core_clks", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / tma_info_core_core_clks", "MetricGroup": "Flops;Ret", "MetricName": "tma_info_core_flopc" }, @@ -393,8 +386,8 @@ "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)", + "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "tma_info_core_ilp" }, @@ -431,7 +424,7 @@ "MetricGroup": "Flops;InsType", "MetricName": "tma_info_inst_mix_iparith", "MetricThreshold": "tma_info_inst_mix_iparith < 10", - "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." + "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW." }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", @@ -439,7 +432,7 @@ "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx128", "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10", - "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", @@ -447,7 +440,7 @@ "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx256", "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10", - "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", @@ -455,7 +448,7 @@ "MetricGroup": "Flops;FpScalar;InsType", "MetricName": "tma_info_inst_mix_iparith_scalar_dp", "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10", - "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", @@ -463,7 +456,7 @@ "MetricGroup": "Flops;FpScalar;InsType", "MetricName": "tma_info_inst_mix_iparith_scalar_sp", "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10", - "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", @@ -481,7 +474,7 @@ }, { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", "MetricGroup": "Flops;InsType", "MetricName": "tma_info_inst_mix_ipflop", "MetricThreshold": "tma_info_inst_mix_ipflop < 10" @@ -510,120 +503,114 @@ }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l1d_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l1d_cache_fill_bw" + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t" }, { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l2_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l2_cache_fill_bw" + "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t" }, { "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l3_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l3_cache_fill_bw" + "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t" + }, + { + "BriefDescription": "", + "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l1d_cache_fill_bw" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l1mpki" }, { + "BriefDescription": "", + "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l2_cache_fill_bw" + }, + { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l2hpki_all" }, { "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l2hpki_load" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "Backend;CacheMisses;Mem", + "MetricGroup": "Backend;CacheHits;Mem", "MetricName": "tma_info_memory_l2mpki" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem;Offcore", + "MetricGroup": "CacheHits;Mem;Offcore", "MetricName": "tma_info_memory_l2mpki_all" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l2mpki_load" }, { + "BriefDescription": "", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l3_cache_fill_bw" + }, + { "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "Mem", "MetricName": "tma_info_memory_l3mpki" }, { - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)", - "MetricGroup": "Mem;MemoryBound;MemoryLat", - "MetricName": "tma_info_memory_load_miss_real_latency" - }, - { - "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBW;MemoryBound", - "MetricName": "tma_info_memory_mlp", - "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" - }, - { "BriefDescription": "Average Parallel L2 cache miss data reads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_memory_oro_data_l2_mlp" + "MetricName": "tma_info_memory_latency_data_l2_mlp" }, { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_memory_oro_load_l2_miss_latency" + "MetricName": "tma_info_memory_latency_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_memory_oro_load_l2_mlp" - }, - { - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t" + "MetricName": "tma_info_memory_latency_load_l2_mlp" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t" - }, - { - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "0", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t" + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)", + "MetricGroup": "Mem;MemoryBound;MemoryLat", + "MetricName": "tma_info_memory_load_miss_real_latency" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t" + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Mem;MemoryBW;MemoryBound", + "MetricName": "tma_info_memory_mlp", + "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", @@ -633,8 +620,8 @@ "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", - "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", + "BriefDescription": "", + "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", "MetricName": "tma_info_pipeline_execute" }, @@ -645,30 +632,36 @@ "MetricName": "tma_info_pipeline_retire" }, { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", "MetricGroup": "Power;Summary", - "MetricName": "tma_info_system_average_frequency" + "MetricName": "tma_info_system_core_frequency" }, { - "BriefDescription": "Average CPU Utilization", + "BriefDescription": "Average CPU Utilization (percentage)", "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", "MetricGroup": "HPC;Summary", "MetricName": "tma_info_system_cpu_utilization" }, { + "BriefDescription": "Average number of utilized CPUs", + "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization", + "MetricGroup": "Summary", + "MetricName": "tma_info_system_cpus_utilized" + }, + { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW", "MetricName": "tma_info_system_dram_bw_use", "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_mem_bandwidth, tma_sq_full" }, { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1e9 / duration_time", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1e9 / duration_time", "MetricGroup": "Cor;Flops;HPC", "MetricName": "tma_info_system_gflops", - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width" }, { "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", @@ -756,7 +749,7 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", "MetricExpr": "(14 * ITLB_MISSES.STLB_HIT + cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * ITLB_MISSES.WALK_COMPLETED) / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS", @@ -765,7 +758,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1", @@ -774,7 +767,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L2_HIT_PS", @@ -784,20 +777,20 @@ "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", "MetricConstraint": "NO_GROUP_EVENTS_SMT", "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS) * CYCLE_ACTIVITY.STALLS_L2_MISS / tma_info_thread_clks", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", + "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "29 * (MEM_LOAD_UOPS_RETIRED.L3_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS))) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_mem_latency", "ScaleUnit": "100%" }, { @@ -816,7 +809,7 @@ "MetricName": "tma_light_operations", "MetricThreshold": "tma_light_operations > 0.6", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST", "ScaleUnit": "100%" }, { @@ -851,21 +844,21 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_l3_hit_latency", "ScaleUnit": "100%" }, { @@ -902,7 +895,7 @@ "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", + "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", "ScaleUnit": "100%" }, @@ -968,12 +961,12 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)", + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU)", "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_6", "MetricThreshold": "tma_port_6 > 0.6", - "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2", "ScaleUnit": "100%" }, { @@ -1026,7 +1019,7 @@ "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", - "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", + "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", "PublicDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Sample with: UOPS_EXECUTED.CYCLES_GE_3", "ScaleUnit": "100%" }, @@ -1108,10 +1101,10 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", "MetricExpr": "tma_branch_resteers - tma_mispredicts_resteers - tma_clears_resteers", - "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", + "MetricGroup": "BigFootprint;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", "MetricName": "tma_unknown_branches", "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit). Sample with: FRONTEND_RETIRED.UNKNOWN_BRANCH", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit) hence called Unknown Branches. Sample with: FRONTEND_RETIRED.UNKNOWN_BRANCH", "ScaleUnit": "100%" }, { diff --git a/tools/perf/pmu-events/arch/x86/broadwellde/metricgroups.json b/tools/perf/pmu-events/arch/x86/broadwellde/metricgroups.json index f6a0258e32..8c808347f6 100644 --- a/tools/perf/pmu-events/arch/x86/broadwellde/metricgroups.json +++ b/tools/perf/pmu-events/arch/x86/broadwellde/metricgroups.json @@ -2,10 +2,10 @@ "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -24,7 +24,9 @@ "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -94,6 +96,7 @@ "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category", "tma_light_operations_group": "Metrics contributing to tma_light_operations category", "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category", + "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category", "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", diff --git a/tools/perf/pmu-events/arch/x86/broadwellde/uncore-power.json b/tools/perf/pmu-events/arch/x86/broadwellde/uncore-power.json index 83d20130c2..320aaab53a 100644 --- a/tools/perf/pmu-events/arch/x86/broadwellde/uncore-power.json +++ b/tools/perf/pmu-events/arch/x86/broadwellde/uncore-power.json @@ -394,6 +394,7 @@ "BriefDescription": "Number of cores in C-State; C0 and C1", "EventCode": "0x80", "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C0", + "Filter": "occ_sel=1", "PerPkg": "1", "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", "Unit": "PCU" @@ -402,6 +403,7 @@ "BriefDescription": "Number of cores in C-State; C3", "EventCode": "0x80", "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C3", + "Filter": "occ_sel=2", "PerPkg": "1", "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", "Unit": "PCU" @@ -410,6 +412,7 @@ "BriefDescription": "Number of cores in C-State; C6 and C7", "EventCode": "0x80", "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C6", + "Filter": "occ_sel=3", "PerPkg": "1", "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", "Unit": "PCU" diff --git a/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json b/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json index b319e4edc2..f2d378c9d6 100644 --- a/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json @@ -286,12 +286,12 @@ "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", - "MetricThreshold": "tma_alu_op_utilization > 0.6", + "MetricThreshold": "tma_alu_op_utilization > 0.4", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", - "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots", + "MetricExpr": "66 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_assists", "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", @@ -413,7 +413,7 @@ "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", + "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", "ScaleUnit": "100%" }, @@ -468,7 +468,7 @@ "MetricExpr": "tma_frontend_bound - tma_fetch_latency", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.2", "MetricgroupNoGroup": "TopdownL2", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" @@ -545,27 +545,20 @@ "MetricName": "tma_heavy_operations", "MetricThreshold": "tma_heavy_operations > 0.1", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .)", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.", "MetricExpr": "ICACHE.IFDATA_STALL / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricGroup": "BigFootprint;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_icache_misses", "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "ScaleUnit": "100%" }, { - "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", - "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BrMispredicts;tma_issueBM", - "MetricName": "tma_info_bad_spec_branch_misprediction_cost", - "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_mispredicts_resteers" - }, - { "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", - "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)", + "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * BR_MISP_EXEC.INDIRECT)", "MetricGroup": "Bad;BrMispredicts", "MetricName": "tma_info_bad_spec_ipmisp_indirect", "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3" @@ -591,7 +584,7 @@ }, { "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / tma_info_core_core_clks", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / tma_info_core_core_clks", "MetricGroup": "Flops;Ret", "MetricName": "tma_info_core_flopc" }, @@ -603,8 +596,8 @@ "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)", + "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "tma_info_core_ilp" }, @@ -641,7 +634,7 @@ "MetricGroup": "Flops;InsType", "MetricName": "tma_info_inst_mix_iparith", "MetricThreshold": "tma_info_inst_mix_iparith < 10", - "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." + "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW." }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", @@ -649,7 +642,7 @@ "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx128", "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10", - "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", @@ -657,7 +650,7 @@ "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx256", "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10", - "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", @@ -665,7 +658,7 @@ "MetricGroup": "Flops;FpScalar;InsType", "MetricName": "tma_info_inst_mix_iparith_scalar_dp", "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10", - "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", @@ -673,7 +666,7 @@ "MetricGroup": "Flops;FpScalar;InsType", "MetricName": "tma_info_inst_mix_iparith_scalar_sp", "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10", - "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", @@ -691,7 +684,7 @@ }, { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", "MetricGroup": "Flops;InsType", "MetricName": "tma_info_inst_mix_ipflop", "MetricThreshold": "tma_info_inst_mix_ipflop < 10" @@ -720,120 +713,163 @@ }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l1d_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l1d_cache_fill_bw" + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t" }, { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l2_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l2_cache_fill_bw" + "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t" }, { "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l3_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l3_cache_fill_bw" + "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t" + }, + { + "BriefDescription": "Average Parallel L2 cache miss data reads", + "MetricExpr": "tma_info_memory_latency_data_l2_mlp", + "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_data_l2_mlp", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "", + "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l1d_cache_fill_bw" + }, + { + "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / (duration_time * 1e3 / 1e3)", + "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t", + "MetricgroupNoGroup": "TopdownL1" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l1mpki" }, { + "BriefDescription": "", + "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l2_cache_fill_bw" + }, + { + "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / (duration_time * 1e3 / 1e3)", + "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_l2_cache_fill_bw_2t", + "MetricgroupNoGroup": "TopdownL1" + }, + { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l2hpki_all" }, { "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l2hpki_load" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "Backend;CacheMisses;Mem", + "MetricGroup": "Backend;CacheHits;Mem", "MetricName": "tma_info_memory_l2mpki" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem;Offcore", + "MetricGroup": "CacheHits;Mem;Offcore", "MetricName": "tma_info_memory_l2mpki_all" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l2mpki_load" }, { - "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_memory_l3mpki" + "BriefDescription": "", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l3_cache_fill_bw" }, { - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)", - "MetricGroup": "Mem;MemoryBound;MemoryLat", - "MetricName": "tma_info_memory_load_miss_real_latency" + "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / (duration_time * 1e3 / 1e3)", + "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_l3_cache_fill_bw_2t", + "MetricgroupNoGroup": "TopdownL1" }, { - "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBW;MemoryBound", - "MetricName": "tma_info_memory_mlp", - "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" + "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY", + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_l3mpki" }, { "BriefDescription": "Average Parallel L2 cache miss data reads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_memory_oro_data_l2_mlp" + "MetricName": "tma_info_memory_latency_data_l2_mlp" }, { "BriefDescription": "Average Latency for L2 cache miss demand Loads", - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", + "MetricExpr": "tma_info_memory_load_l2_miss_latency", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_memory_oro_load_l2_miss_latency" + "MetricName": "tma_info_memory_latency_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", + "MetricExpr": "tma_info_memory_load_l2_mlp", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_memory_oro_load_l2_mlp" + "MetricName": "tma_info_memory_latency_load_l2_mlp" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t" + "BriefDescription": "Average Latency for L2 cache miss demand Loads", + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", + "MetricGroup": "Memory_Lat;Offcore;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_load_l2_miss_latency", + "MetricgroupNoGroup": "TopdownL1" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t" + "BriefDescription": "Average Parallel L2 cache miss demand Loads", + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", + "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_load_l2_mlp", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)", + "MetricGroup": "Mem;MemoryBound;MemoryLat", + "MetricName": "tma_info_memory_load_miss_real_latency" }, { - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "0", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t" + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Mem;MemoryBW;MemoryBound", + "MetricName": "tma_info_memory_mlp", + "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t" + "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", + "MetricExpr": "tma_info_memory_tlb_page_walks_utilization", + "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_page_walks_utilization", + "MetricgroupNoGroup": "TopdownL1" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", @@ -843,8 +879,8 @@ "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", - "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", + "BriefDescription": "", + "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", "MetricName": "tma_info_pipeline_execute" }, @@ -855,30 +891,36 @@ "MetricName": "tma_info_pipeline_retire" }, { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", "MetricGroup": "Power;Summary", - "MetricName": "tma_info_system_average_frequency" + "MetricName": "tma_info_system_core_frequency" }, { - "BriefDescription": "Average CPU Utilization", + "BriefDescription": "Average CPU Utilization (percentage)", "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", "MetricGroup": "HPC;Summary", "MetricName": "tma_info_system_cpu_utilization" }, { + "BriefDescription": "Average number of utilized CPUs", + "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization", + "MetricGroup": "Summary", + "MetricName": "tma_info_system_cpus_utilized" + }, + { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW", "MetricName": "tma_info_system_dram_bw_use", "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_mem_bandwidth, tma_sq_full" }, { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1e9 / duration_time", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1e9 / duration_time", "MetricGroup": "Cor;Flops;HPC", "MetricName": "tma_info_system_gflops", - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width" }, { "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", @@ -933,6 +975,12 @@ "MetricName": "tma_info_system_turbo_utilization" }, { + "BriefDescription": "Measured Average Uncore Frequency for the SoC [GHz]", + "MetricExpr": "tma_info_system_socket_clks / 1e9 / duration_time", + "MetricGroup": "SoC", + "MetricName": "tma_info_system_uncore_frequency" + }, + { "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", "MetricExpr": "CPU_CLK_UNHALTED.THREAD", "MetricGroup": "Pipeline", @@ -980,7 +1028,7 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", "MetricExpr": "(14 * ITLB_MISSES.STLB_HIT + cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * ITLB_MISSES.WALK_COMPLETED) / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: ITLB_MISSES.WALK_COMPLETED", @@ -989,7 +1037,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_UOPS_RETIRED.L1_HIT_PS;MEM_LOAD_UOPS_RETIRED.HIT_LFB_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1", @@ -998,7 +1046,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L2_HIT_PS", @@ -1008,20 +1056,20 @@ "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", "MetricConstraint": "NO_GROUP_EVENTS_SMT", "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS) * CYCLE_ACTIVITY.STALLS_L2_MISS / tma_info_thread_clks", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", + "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "41 * (MEM_LOAD_UOPS_RETIRED.L3_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS. Related metrics: tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS. Related metrics: tma_mem_latency", "ScaleUnit": "100%" }, { @@ -1040,7 +1088,7 @@ "MetricName": "tma_light_operations", "MetricThreshold": "tma_light_operations > 0.6", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST", "ScaleUnit": "100%" }, { @@ -1055,11 +1103,10 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "200 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks", "MetricGroup": "Server;TopdownL5;tma_L5_group;tma_mem_latency_group", - "MetricName": "tma_local_dram", - "MetricThreshold": "tma_local_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", + "MetricName": "tma_local_mem", + "MetricThreshold": "tma_local_mem > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory. Caching will improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM_PS", "ScaleUnit": "100%" }, @@ -1085,21 +1132,21 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_l3_hit_latency", "ScaleUnit": "100%" }, { @@ -1136,7 +1183,7 @@ "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", + "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.", "ScaleUnit": "100%" }, @@ -1204,12 +1251,12 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)", + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU)", "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_6", "MetricThreshold": "tma_port_6 > 0.6", - "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2", "ScaleUnit": "100%" }, { @@ -1263,7 +1310,7 @@ "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", - "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", + "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", "ScaleUnit": "100%" }, { @@ -1278,11 +1325,10 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "310 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks", "MetricGroup": "Server;Snoop;TopdownL5;tma_L5_group;tma_mem_latency_group", - "MetricName": "tma_remote_dram", - "MetricThreshold": "tma_remote_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", + "MetricName": "tma_remote_mem", + "MetricThreshold": "tma_remote_mem > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory. This is caused often due to non-optimal NUMA allocations. #link to NUMA article. Sample with: MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM_PS", "ScaleUnit": "100%" }, @@ -1363,10 +1409,10 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", "MetricExpr": "tma_branch_resteers - tma_mispredicts_resteers - tma_clears_resteers", - "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", + "MetricGroup": "BigFootprint;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", "MetricName": "tma_unknown_branches", "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit). Sample with: BACLEARS.ANY", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit) hence called Unknown Branches. Sample with: BACLEARS.ANY", "ScaleUnit": "100%" }, { diff --git a/tools/perf/pmu-events/arch/x86/broadwellx/metricgroups.json b/tools/perf/pmu-events/arch/x86/broadwellx/metricgroups.json index f6a0258e32..8c808347f6 100644 --- a/tools/perf/pmu-events/arch/x86/broadwellx/metricgroups.json +++ b/tools/perf/pmu-events/arch/x86/broadwellx/metricgroups.json @@ -2,10 +2,10 @@ "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -24,7 +24,9 @@ "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -94,6 +96,7 @@ "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category", "tma_light_operations_group": "Metrics contributing to tma_light_operations category", "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category", + "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category", "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", diff --git a/tools/perf/pmu-events/arch/x86/broadwellx/uncore-power.json b/tools/perf/pmu-events/arch/x86/broadwellx/uncore-power.json index 83d20130c2..320aaab53a 100644 --- a/tools/perf/pmu-events/arch/x86/broadwellx/uncore-power.json +++ b/tools/perf/pmu-events/arch/x86/broadwellx/uncore-power.json @@ -394,6 +394,7 @@ "BriefDescription": "Number of cores in C-State; C0 and C1", "EventCode": "0x80", "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C0", + "Filter": "occ_sel=1", "PerPkg": "1", "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", "Unit": "PCU" @@ -402,6 +403,7 @@ "BriefDescription": "Number of cores in C-State; C3", "EventCode": "0x80", "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C3", + "Filter": "occ_sel=2", "PerPkg": "1", "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", "Unit": "PCU" @@ -410,6 +412,7 @@ "BriefDescription": "Number of cores in C-State; C6 and C7", "EventCode": "0x80", "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C6", + "Filter": "occ_sel=3", "PerPkg": "1", "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", "Unit": "PCU" diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json b/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json index 8bc6c07078..7f88b156f7 100644 --- a/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json @@ -210,6 +210,12 @@ "ScaleUnit": "1MB/s" }, { + "BriefDescription": "Bandwidth (MB/sec) of write requests that miss the last level cache (LLC) and go to remote memory.", + "MetricExpr": "UNC_CHA_REQUESTS.WRITES_REMOTE * 64 / 1e6 / duration_time", + "MetricName": "llc_miss_remote_memory_bandwidth_write", + "ScaleUnit": "1MB/s" + }, + { "BriefDescription": "The ratio of number of completed memory load instructions to the total number completed instructions", "MetricExpr": "MEM_INST_RETIRED.ALL_LOADS / INST_RETIRED.ANY", "MetricName": "loads_per_instr", @@ -316,12 +322,12 @@ "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", - "MetricThreshold": "tma_alu_op_utilization > 0.6", + "MetricThreshold": "tma_alu_op_utilization > 0.4", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", - "MetricExpr": "100 * (FP_ASSIST.ANY + OTHER_ASSISTS.ANY) / tma_info_thread_slots", + "MetricExpr": "34 * (FP_ASSIST.ANY + OTHER_ASSISTS.ANY) / tma_info_thread_slots", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_assists", "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", @@ -389,7 +395,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(44 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OCR.DEMAND_DATA_RD.L3_HIT.HIT_OTHER_CORE_FWD))) + 44 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "MetricExpr": "(44 * tma_info_system_core_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OCR.DEMAND_DATA_RD.L3_HIT.HIT_OTHER_CORE_FWD))) + 44 * tma_info_system_core_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_contested_accesses", "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -410,7 +416,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "44 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (1 - OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OCR.DEMAND_DATA_RD.L3_HIT.HIT_OTHER_CORE_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "MetricExpr": "44 * tma_info_system_core_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (1 - OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OCR.DEMAND_DATA_RD.L3_HIT.HIT_OTHER_CORE_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_data_sharing", "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -422,7 +428,7 @@ "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group", "MetricName": "tma_decoder0_alone", - "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35))", + "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & tma_fetch_bandwidth > 0.2)", "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. Related metrics: tma_few_uops_instructions", "ScaleUnit": "100%" }, @@ -447,10 +453,10 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", - "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", + "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / tma_info_core_core_clks / 2", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", + "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", "ScaleUnit": "100%" }, @@ -470,7 +476,7 @@ "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group", "MetricName": "tma_dtlb_load", "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization", "ScaleUnit": "100%" }, { @@ -479,13 +485,13 @@ "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group", "MetricName": "tma_dtlb_store", "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(110 * tma_info_system_average_frequency * (OCR.DEMAND_RFO.L3_MISS.REMOTE_HITM + OCR.PF_L2_RFO.L3_MISS.REMOTE_HITM) + 47.5 * tma_info_system_average_frequency * (OCR.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE + OCR.PF_L2_RFO.L3_HIT.HITM_OTHER_CORE)) / tma_info_thread_clks", + "MetricExpr": "(110 * tma_info_system_core_frequency * (OCR.DEMAND_RFO.L3_MISS.REMOTE_HITM + OCR.PF_L2_RFO.L3_MISS.REMOTE_HITM) + 47.5 * tma_info_system_core_frequency * (OCR.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE + OCR.PF_L2_RFO.L3_HIT.HITM_OTHER_CORE)) / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group", "MetricName": "tma_false_sharing", "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -499,7 +505,7 @@ "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%" }, { @@ -507,7 +513,7 @@ "MetricExpr": "tma_frontend_bound - tma_fetch_latency", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.2", "MetricgroupNoGroup": "TopdownL2", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" @@ -542,6 +548,15 @@ "ScaleUnit": "100%" }, { + "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists", + "MetricExpr": "34 * FP_ASSIST.ANY / tma_info_thread_slots", + "MetricGroup": "HPC;TopdownL5;tma_L5_group;tma_assists_group", + "MetricName": "tma_fp_assists", + "MetricThreshold": "tma_fp_assists > 0.1", + "PublicDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists. FP Assist may apply when working with very small floating point values (so-called Denormals).", + "ScaleUnit": "100%" + }, + { "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired", "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ / UOPS_RETIRED.RETIRE_SLOTS", "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", @@ -600,10 +615,10 @@ { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions", "MetricExpr": "tma_light_operations * UOPS_RETIRED.MACRO_FUSED / UOPS_RETIRED.RETIRE_SLOTS", - "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", + "MetricGroup": "Branches;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_fused_instructions", "MetricThreshold": "tma_fused_instructions > 0.1 & tma_light_operations > 0.6", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions. The instruction pairs of CMP+JCC or DEC+JCC are commonly used examples.", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions. CMP+JCC or DEC+JCC are common examples of legacy fusions. {([MTL] Note new MOV+OP and Load+OP fusions appear under Other_Light_Ops in MTL!)}", "ScaleUnit": "100%" }, { @@ -613,13 +628,13 @@ "MetricName": "tma_heavy_operations", "MetricThreshold": "tma_heavy_operations > 0.1", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .)", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", "MetricExpr": "(ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@) / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricGroup": "BigFootprint;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_icache_misses", "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS", @@ -627,26 +642,53 @@ }, { "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", - "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricExpr": "tma_info_bottleneck_mispredictions * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES / 100", "MetricGroup": "Bad;BrMispredicts;tma_issueBM", "MetricName": "tma_info_bad_spec_branch_misprediction_cost", "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers" }, { "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", - "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)", + "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * BR_MISP_EXEC.INDIRECT)", "MetricGroup": "Bad;BrMispredicts", "MetricName": "tma_info_bad_spec_ipmisp_indirect", "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3" }, { "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", - "MetricExpr": "tma_info_core_ipmispredict", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;BadSpec;BrMispredicts", "MetricName": "tma_info_bad_spec_ipmispredict", "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200" }, { + "BriefDescription": "Speculative to Retired ratio of all clears (covering mispredicts and nukes)", + "MetricExpr": "INT_MISC.CLEARS_COUNT / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)", + "MetricGroup": "BrMispredicts", + "MetricName": "tma_info_bad_spec_spec_clears_ratio" + }, + { + "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", + "MetricExpr": "(100 * (1 - tma_core_bound / (((EXE_ACTIVITY.EXE_BOUND_0_PORTS + tma_core_bound * RS_EVENTS.EMPTY_CYCLES) / CPU_CLK_UNHALTED.THREAD * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CPU_CLK_UNHALTED.THREAD * CPU_CLK_UNHALTED.THREAD + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) if tma_core_bound < (((EXE_ACTIVITY.EXE_BOUND_0_PORTS + tma_core_bound * RS_EVENTS.EMPTY_CYCLES) / CPU_CLK_UNHALTED.THREAD * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CPU_CLK_UNHALTED.THREAD * CPU_CLK_UNHALTED.THREAD + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)", + "MetricGroup": "Cor;SMT;TopdownL1;tma_L1_group", + "MetricName": "tma_info_botlnk_core_bound_likely", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck.", + "MetricExpr": "100 * (100 * (tma_fetch_latency * (DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) / ((ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=0x1\\,edge\\=0x1@) / CPU_CLK_UNHALTED.THREAD + ICACHE_TAG.STALLS / CPU_CLK_UNHALTED.THREAD + (INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD + 9 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) + min(2 * IDQ.MS_SWITCHES / CPU_CLK_UNHALTED.THREAD, 1) + DECODE.LCP / CPU_CLK_UNHALTED.THREAD + DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) + tma_fetch_bandwidth * tma_mite / (tma_mite + tma_dsb)))", + "MetricGroup": "DSBmiss;Fed;TopdownL1;tma_L1_group", + "MetricName": "tma_info_botlnk_dsb_misses", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck.", + "MetricExpr": "100 * (100 * (tma_fetch_latency * ((ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=0x1\\,edge\\=0x1@) / CPU_CLK_UNHALTED.THREAD) / ((ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=0x1\\,edge\\=0x1@) / CPU_CLK_UNHALTED.THREAD + ICACHE_TAG.STALLS / CPU_CLK_UNHALTED.THREAD + (INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD + 9 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) + min(2 * IDQ.MS_SWITCHES / CPU_CLK_UNHALTED.THREAD, 1) + DECODE.LCP / CPU_CLK_UNHALTED.THREAD + DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD)))", + "MetricGroup": "Fed;FetchLat;IcMiss;TopdownL1;tma_L1_group", + "MetricName": "tma_info_botlnk_ic_misses", + "MetricgroupNoGroup": "TopdownL1" + }, + { "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)", @@ -672,67 +714,102 @@ "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: " }, { + "BriefDescription": "Total pipeline cost of \"useful operations\" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead.", + "MetricExpr": "100 * (tma_retiring - (BR_INST_RETIRED.ALL_BRANCHES + BR_INST_RETIRED.NEAR_CALL) / tma_info_thread_slots - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", + "MetricGroup": "Ret", + "MetricName": "tma_info_bottleneck_base_non_br", + "MetricThreshold": "tma_info_bottleneck_base_non_br > 20" + }, + { "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", - "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC", + "MetricGroup": "BigFootprint;Fed;Frontend;IcMiss;MemoryTLB", "MetricName": "tma_info_bottleneck_big_code", - "MetricThreshold": "tma_info_bottleneck_big_code > 20", - "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_bottleneck_branching_overhead" + "MetricThreshold": "tma_info_bottleneck_big_code > 20" }, { "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", - "MetricExpr": "100 * ((BR_INST_RETIRED.CONDITIONAL + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_thread_slots)", - "MetricGroup": "Ret;tma_issueBC", + "MetricExpr": "100 * ((BR_INST_RETIRED.ALL_BRANCHES + BR_INST_RETIRED.NEAR_CALL) / tma_info_thread_slots)", + "MetricGroup": "Ret", "MetricName": "tma_info_bottleneck_branching_overhead", - "MetricThreshold": "tma_info_bottleneck_branching_overhead > 10", - "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_bottleneck_big_code" + "MetricThreshold": "tma_info_bottleneck_branching_overhead > 5" + }, + { + "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))", + "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", + "MetricName": "tma_info_bottleneck_cache_memory_bandwidth", + "MetricThreshold": "tma_info_bottleneck_cache_memory_bandwidth > 20", + "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" + }, + { + "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))", + "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", + "MetricName": "tma_info_bottleneck_cache_memory_latency", + "MetricThreshold": "tma_info_bottleneck_cache_memory_latency > 20", + "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency" + }, + { + "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", + "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", + "MetricGroup": "Cor;tma_issueComp", + "MetricName": "tma_info_bottleneck_compute_bound_est", + "MetricThreshold": "tma_info_bottleneck_compute_bound_est > 20", + "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: " }, { "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code", + "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code", "MetricGroup": "Fed;FetchBW;Frontend", "MetricName": "tma_info_bottleneck_instruction_fetch_bw", "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20" }, { - "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))", - "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", - "MetricName": "tma_info_bottleneck_memory_bandwidth", - "MetricThreshold": "tma_info_bottleneck_memory_bandwidth > 20", - "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" + "BriefDescription": "Total pipeline cost of irregular execution (e.g", + "MetricExpr": "100 * (tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + tma_core_bound * RS_EVENTS.EMPTY_CYCLES / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", + "MetricGroup": "Bad;Cor;Ret;tma_issueMS", + "MetricName": "tma_info_bottleneck_irregular_overhead", + "MetricThreshold": "tma_info_bottleneck_irregular_overhead > 10", + "PublicDescription": "Total pipeline cost of irregular execution (e.g. FP-assists in HPC, Wait time with work imbalance multithreaded workloads, overhead in system services or virtualized environments). Related metrics: tma_microcode_sequencer, tma_ms_switches" }, { "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))", + "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))", "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", "MetricName": "tma_info_bottleneck_memory_data_tlbs", "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20", - "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store" + "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_synchronization" }, { - "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound))", - "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", - "MetricName": "tma_info_bottleneck_memory_latency", - "MetricThreshold": "tma_info_bottleneck_memory_latency > 20", - "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency" + "BriefDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) * tma_remote_cache / (tma_local_mem + tma_remote_cache + tma_remote_mem) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))", + "MetricGroup": "Mem;Offcore;tma_issueTLB", + "MetricName": "tma_info_bottleneck_memory_synchronization", + "MetricThreshold": "tma_info_bottleneck_memory_synchronization > 10", + "PublicDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs" }, { "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", + "MetricExpr": "100 * (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", "MetricName": "tma_info_bottleneck_mispredictions", "MetricThreshold": "tma_info_bottleneck_mispredictions > 20", "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers" }, { + "BriefDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class)", + "MetricExpr": "100 - (tma_info_bottleneck_big_code + tma_info_bottleneck_instruction_fetch_bw + tma_info_bottleneck_mispredictions + tma_info_bottleneck_cache_memory_bandwidth + tma_info_bottleneck_cache_memory_latency + tma_info_bottleneck_memory_data_tlbs + tma_info_bottleneck_memory_synchronization + tma_info_bottleneck_compute_bound_est + tma_info_bottleneck_irregular_overhead + tma_info_bottleneck_branching_overhead + tma_info_bottleneck_base_non_br)", + "MetricGroup": "Cor;Offcore", + "MetricName": "tma_info_bottleneck_other_bottlenecks", + "MetricThreshold": "tma_info_bottleneck_other_bottlenecks > 20", + "PublicDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls." + }, + { "BriefDescription": "Fraction of branches that are CALL or RET", "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches", @@ -753,7 +830,7 @@ { "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.COND - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches", "MetricName": "tma_info_branches_jump" }, @@ -770,9 +847,15 @@ "MetricName": "tma_info_core_coreipc" }, { + "BriefDescription": "uops Executed per Cycle", + "MetricExpr": "UOPS_EXECUTED.THREAD / tma_info_thread_clks", + "MetricGroup": "Power", + "MetricName": "tma_info_core_epc" + }, + { "BriefDescription": "Floating Point Operations Per Cycle", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks", "MetricGroup": "Flops;Ret", "MetricName": "tma_info_core_flopc" }, @@ -784,19 +867,12 @@ "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)", + "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "tma_info_core_ilp" }, { - "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear)", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BadSpec;BrMispredicts;TopdownL1;tma_L1_group", - "MetricName": "tma_info_core_ipmispredict", - "MetricgroupNoGroup": "TopdownL1" - }, - { "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", "MetricExpr": "IDQ.DSB_UOPS / (IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS)", "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", @@ -867,7 +943,7 @@ "MetricGroup": "Flops;InsType", "MetricName": "tma_info_inst_mix_iparith", "MetricThreshold": "tma_info_inst_mix_iparith < 10", - "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." + "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW." }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", @@ -875,7 +951,7 @@ "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx128", "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10", - "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", @@ -883,7 +959,7 @@ "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx256", "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10", - "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)", @@ -891,7 +967,7 @@ "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx512", "MetricThreshold": "tma_info_inst_mix_iparith_avx512 < 10", - "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", @@ -899,7 +975,7 @@ "MetricGroup": "Flops;FpScalar;InsType", "MetricName": "tma_info_inst_mix_iparith_scalar_dp", "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10", - "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", @@ -907,7 +983,7 @@ "MetricGroup": "Flops;FpScalar;InsType", "MetricName": "tma_info_inst_mix_iparith_scalar_sp", "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10", - "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", @@ -926,7 +1002,7 @@ { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", "MetricGroup": "Flops;InsType", "MetricName": "tma_info_inst_mix_ipflop", "MetricThreshold": "tma_info_inst_mix_ipflop < 10" @@ -939,6 +1015,12 @@ "MetricThreshold": "tma_info_inst_mix_ipload < 3" }, { + "BriefDescription": "Instructions per PAUSE (lower number means higher occurrence rate)", + "MetricExpr": "tma_info_inst_mix_instructions / ROB_MISC_EVENTS.PAUSE_INST", + "MetricGroup": "Flops;FpVector;InsType", + "MetricName": "tma_info_inst_mix_ippause" + }, + { "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES", "MetricGroup": "InsType", @@ -961,16 +1043,23 @@ "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_lcp" }, { + "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "tma_info_memory_tlb_code_stlb_mpki", + "MetricGroup": "Fed;MemoryTLB;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_code_stlb_mpki", + "MetricgroupNoGroup": "TopdownL1" + }, + { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l1d_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l1d_cache_fill_bw" + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t" }, { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l2_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l2_cache_fill_bw" + "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t" }, { "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction", @@ -986,124 +1075,214 @@ }, { "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l3_cache_access_bw", "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_memory_core_l3_cache_access_bw" + "MetricName": "tma_info_memory_core_l3_cache_access_bw_2t" }, { "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l3_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l3_cache_fill_bw" + "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t" + }, + { + "BriefDescription": "Average Parallel L2 cache miss data reads", + "MetricExpr": "tma_info_memory_latency_data_l2_mlp", + "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_data_l2_mlp", + "MetricgroupNoGroup": "TopdownL1" }, { "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_fb_hpki" }, { + "BriefDescription": "", + "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l1d_cache_fill_bw" + }, + { + "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / (duration_time * 1e3 / 1e3)", + "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t", + "MetricgroupNoGroup": "TopdownL1" + }, + { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l1mpki" }, { "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l1mpki_load" }, { + "BriefDescription": "", + "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l2_cache_fill_bw" + }, + { + "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / (duration_time * 1e3 / 1e3)", + "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_l2_cache_fill_bw_2t", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction", + "MetricExpr": "1e3 * L2_LINES_OUT.NON_SILENT / INST_RETIRED.ANY", + "MetricGroup": "L2Evicts;Mem;Server;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_l2_evictions_nonsilent_pki", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo instruction where the evicted lines are dropped (no writeback to L3 or memory)", + "MetricExpr": "1e3 * L2_LINES_OUT.SILENT / INST_RETIRED.ANY", + "MetricGroup": "L2Evicts;Mem;Server;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_l2_evictions_silent_pki", + "MetricgroupNoGroup": "TopdownL1" + }, + { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l2hpki_all" }, { "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l2hpki_load" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "Backend;CacheMisses;Mem", + "MetricGroup": "Backend;CacheHits;Mem", "MetricName": "tma_info_memory_l2mpki" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem;Offcore", + "MetricGroup": "CacheHits;Mem;Offcore", "MetricName": "tma_info_memory_l2mpki_all" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l2mpki_load" }, { + "BriefDescription": "", + "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_l3_cache_access_bw" + }, + { + "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / (duration_time * 1e3 / 1e3)", + "MetricGroup": "Mem;MemoryBW;Offcore;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_l3_cache_access_bw_2t", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l3_cache_fill_bw" + }, + { + "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / (duration_time * 1e3 / 1e3)", + "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_l3_cache_fill_bw_2t", + "MetricgroupNoGroup": "TopdownL1" + }, + { "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "Mem", "MetricName": "tma_info_memory_l3mpki" }, { - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", - "MetricGroup": "Mem;MemoryBound;MemoryLat", - "MetricName": "tma_info_memory_load_miss_real_latency" + "BriefDescription": "Average Parallel L2 cache miss data reads", + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_latency_data_l2_mlp" }, { - "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", - "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBW;MemoryBound", - "MetricName": "tma_info_memory_mlp", - "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" + "BriefDescription": "Average Latency for L2 cache miss demand Loads", + "MetricExpr": "tma_info_memory_load_l2_miss_latency", + "MetricGroup": "Memory_Lat;Offcore", + "MetricName": "tma_info_memory_latency_load_l2_miss_latency" }, { - "BriefDescription": "Average Parallel L2 cache miss data reads", - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", + "BriefDescription": "Average Parallel L2 cache miss demand Loads", + "MetricExpr": "tma_info_memory_load_l2_mlp", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_memory_oro_data_l2_mlp" + "MetricName": "tma_info_memory_latency_load_l2_mlp" }, { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", - "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_memory_oro_load_l2_miss_latency" + "MetricGroup": "Memory_Lat;Offcore;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_load_l2_miss_latency", + "MetricgroupNoGroup": "TopdownL1" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", - "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_memory_oro_load_l2_mlp" + "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_load_l2_mlp", + "MetricgroupNoGroup": "TopdownL1" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t" + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", + "MetricGroup": "Mem;MemoryBound;MemoryLat", + "MetricName": "tma_info_memory_load_miss_real_latency" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t" + "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "tma_info_memory_tlb_load_stlb_mpki", + "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_load_stlb_mpki", + "MetricgroupNoGroup": "TopdownL1" }, { - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l3_cache_access_bw", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t" + "BriefDescription": "Un-cacheable retired load per kilo instruction", + "MetricExpr": "tma_info_memory_uc_load_pki", + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_mix_uc_load_pki" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t" + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", + "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Mem;MemoryBW;MemoryBound", + "MetricName": "tma_info_memory_mlp", + "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" + }, + { + "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", + "MetricExpr": "tma_info_memory_tlb_page_walks_utilization", + "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_page_walks_utilization", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "tma_info_memory_tlb_store_stlb_mpki", + "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_store_stlb_mpki", + "MetricgroupNoGroup": "TopdownL1" }, { "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", @@ -1132,55 +1311,78 @@ "MetricName": "tma_info_memory_tlb_store_stlb_mpki" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", - "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", + "BriefDescription": "Un-cacheable retired load per kilo instruction", + "MetricExpr": "1e3 * MEM_LOAD_MISC_RETIRED.UC / INST_RETIRED.ANY", + "MetricGroup": "Mem;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_uc_load_pki", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "", + "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@)", "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", "MetricName": "tma_info_pipeline_execute" }, { + "BriefDescription": "Instructions per a microcode Assist invocation", + "MetricExpr": "INST_RETIRED.ANY / (FP_ASSIST.ANY + OTHER_ASSISTS.ANY)", + "MetricGroup": "MicroSeq;Pipeline;Ret;Retire", + "MetricName": "tma_info_pipeline_ipassist", + "MetricThreshold": "tma_info_pipeline_ipassist < 100e3", + "PublicDescription": "Instructions per a microcode Assist invocation. See Assists tree node for details (lower number means higher occurrence rate)" + }, + { "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@UOPS_RETIRED.RETIRE_SLOTS\\,cmask\\=1@", "MetricGroup": "Pipeline;Ret", "MetricName": "tma_info_pipeline_retire" }, { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", "MetricGroup": "Power;Summary", - "MetricName": "tma_info_system_average_frequency" + "MetricName": "tma_info_system_core_frequency" }, { - "BriefDescription": "Average CPU Utilization", + "BriefDescription": "Average CPU Utilization (percentage)", "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", "MetricGroup": "HPC;Summary", "MetricName": "tma_info_system_cpu_utilization" }, { + "BriefDescription": "Average number of utilized CPUs", + "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization", + "MetricGroup": "Summary", + "MetricName": "tma_info_system_cpus_utilized" + }, + { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW", "MetricName": "tma_info_system_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" }, { "BriefDescription": "Giga Floating Point Operations Per Second", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time", "MetricGroup": "Cor;Flops;HPC", "MetricName": "tma_info_system_gflops", - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width" }, { "BriefDescription": "Average IO (network or disk) Bandwidth Use for Reads [GB / sec]", "MetricExpr": "(UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART3) * 4 / 1e9 / duration_time", - "MetricGroup": "IoBW;Mem;Server;SoC", - "MetricName": "tma_info_system_io_read_bw" + "MetricGroup": "IoBW;MemOffcore;Server;SoC", + "MetricName": "tma_info_system_io_read_bw", + "PublicDescription": "Average IO (network or disk) Bandwidth Use for Reads [GB / sec]. Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU" }, { "BriefDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]", "MetricExpr": "(UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART3) * 4 / 1e9 / duration_time", - "MetricGroup": "IoBW;Mem;Server;SoC", - "MetricName": "tma_info_system_io_write_bw" + "MetricGroup": "IoBW;MemOffcore;Server;SoC", + "MetricName": "tma_info_system_io_write_bw", + "PublicDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]. Bandwidth of IO writes that are initiated by end device controllers that are writing memory to the CPU" }, { "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", @@ -1205,7 +1407,7 @@ { "BriefDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]", "MetricExpr": "1e9 * (UNC_M_RPQ_OCCUPANCY / UNC_M_RPQ_INSERTS) / imc_0@event\\=0x0@", - "MetricGroup": "Mem;MemoryLat;Server;SoC", + "MetricGroup": "MemOffcore;MemoryLat;Server;SoC", "MetricName": "tma_info_system_mem_dram_read_latency", "PublicDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches" }, @@ -1219,7 +1421,7 @@ { "BriefDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]", "MetricExpr": "(1e9 * (UNC_M_PMM_RPQ_OCCUPANCY.ALL / UNC_M_PMM_RPQ_INSERTS) / imc_0@event\\=0x0@ if #has_pmem > 0 else 0)", - "MetricGroup": "Mem;MemoryLat;Server;SoC", + "MetricGroup": "MemOffcore;MemoryLat;Server;SoC", "MetricName": "tma_info_system_mem_pmm_read_latency", "PublicDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches" }, @@ -1233,13 +1435,13 @@ { "BriefDescription": "Average 3DXP Memory Bandwidth Use for reads [GB / sec]", "MetricExpr": "(64 * UNC_M_PMM_RPQ_INSERTS / 1e9 / duration_time if #has_pmem > 0 else 0)", - "MetricGroup": "Mem;MemoryBW;Server;SoC", + "MetricGroup": "MemOffcore;MemoryBW;Server;SoC", "MetricName": "tma_info_system_pmm_read_bw" }, { "BriefDescription": "Average 3DXP Memory Bandwidth Use for Writes [GB / sec]", "MetricExpr": "(64 * UNC_M_PMM_WPQ_INSERTS / 1e9 / duration_time if #has_pmem > 0 else 0)", - "MetricGroup": "Mem;MemoryBW;Server;SoC", + "MetricGroup": "MemOffcore;MemoryBW;Server;SoC", "MetricName": "tma_info_system_pmm_write_bw" }, { @@ -1284,6 +1486,12 @@ "MetricName": "tma_info_system_turbo_utilization" }, { + "BriefDescription": "Measured Average Uncore Frequency for the SoC [GHz]", + "MetricExpr": "tma_info_system_socket_clks / 1e9 / duration_time", + "MetricGroup": "SoC", + "MetricName": "tma_info_system_uncore_frequency" + }, + { "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", "MetricExpr": "CPU_CLK_UNHALTED.THREAD", "MetricGroup": "Pipeline", @@ -1330,8 +1538,8 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", - "MetricExpr": "ICACHE_64B.IFTAG_STALL / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricExpr": "ICACHE_TAG.STALLS / tma_info_thread_clks", + "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS", @@ -1340,7 +1548,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1", @@ -1350,7 +1558,7 @@ "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks)", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L2_HIT_PS", @@ -1359,24 +1567,24 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", - "MetricExpr": "17 * tma_info_system_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", + "MetricExpr": "17 * tma_info_system_core_frequency * (MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2)) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_memory_latency, tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_cache_memory_latency, tma_mem_latency", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", - "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks", + "MetricExpr": "DECODE.LCP / tma_info_thread_clks", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_lcp", "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -1390,7 +1598,7 @@ "MetricName": "tma_light_operations", "MetricThreshold": "tma_light_operations > 0.6", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST", "ScaleUnit": "100%" }, { @@ -1421,10 +1629,10 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory", - "MetricExpr": "59.5 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "MetricExpr": "59.5 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Server;TopdownL5;tma_L5_group;tma_mem_latency_group", - "MetricName": "tma_local_dram", - "MetricThreshold": "tma_local_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", + "MetricName": "tma_local_mem", + "MetricThreshold": "tma_local_mem > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory. Caching will improve the latency and increase performance. Sample with: MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM_PS", "ScaleUnit": "100%" }, @@ -1449,21 +1657,21 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_memory_latency, tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_cache_memory_latency, tma_l3_hit_latency", "ScaleUnit": "100%" }, { @@ -1491,7 +1699,7 @@ "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", "MetricName": "tma_microcode_sequencer", "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1", - "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_ms_switches", + "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_ms_switches", "ScaleUnit": "100%" }, { @@ -1508,17 +1716,17 @@ "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", + "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", "ScaleUnit": "100%" }, { - "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued", + "BriefDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles)", "MetricExpr": "UOPS_ISSUED.VECTOR_WIDTH_MISMATCH / UOPS_ISSUED.ANY", "MetricGroup": "TopdownL5;tma_L5_group;tma_issueMV;tma_ports_utilized_0_group", "MetricName": "tma_mixing_vectors", "MetricThreshold": "tma_mixing_vectors > 0.05", - "PublicDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued. Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches", + "PublicDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles). Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches", "ScaleUnit": "100%" }, { @@ -1527,13 +1735,13 @@ "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO", "MetricName": "tma_ms_switches", "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation", + "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused", "MetricExpr": "tma_light_operations * (BR_INST_RETIRED.ALL_BRANCHES - UOPS_RETIRED.MACRO_FUSED) / UOPS_RETIRED.RETIRE_SLOTS", - "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", + "MetricGroup": "Branches;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_non_fused_branches", "MetricThreshold": "tma_non_fused_branches > 0.1 & tma_light_operations > 0.6", "PublicDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused. Non-conditional branches like direct JMP or CALL would count here. Can be used to examine fusible conditional jumps that were not fused.", @@ -1542,15 +1750,15 @@ { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions", "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / UOPS_RETIRED.RETIRE_SLOTS", - "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", + "MetricGroup": "Pipeline;TopdownL4;tma_L4_group;tma_other_light_ops_group", "MetricName": "tma_nop_instructions", - "MetricThreshold": "tma_nop_instructions > 0.1 & tma_light_operations > 0.6", + "MetricThreshold": "tma_nop_instructions > 0.1 & (tma_other_light_ops > 0.3 & tma_light_operations > 0.6)", "PublicDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. start address of a function or loop body. Sample with: INST_RETIRED.NOP", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes", - "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches + tma_nop_instructions))", + "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches))", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_other_light_ops", "MetricThreshold": "tma_other_light_ops > 0.3 & tma_light_operations > 0.6", @@ -1558,9 +1766,25 @@ "ScaleUnit": "100%" }, { + "BriefDescription": "This metric estimates fraction of slots the CPU was stalled due to other cases of misprediction (non-retired x86 branches or other types).", + "MetricExpr": "max(tma_branch_mispredicts * (1 - BR_MISP_RETIRED.ALL_BRANCHES / (INT_MISC.CLEARS_COUNT - MACHINE_CLEARS.COUNT)), 0.0001)", + "MetricGroup": "BrMispredicts;TopdownL3;tma_L3_group;tma_branch_mispredicts_group", + "MetricName": "tma_other_mispredicts", + "MetricThreshold": "tma_other_mispredicts > 0.05 & (tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15)", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Nukes (Machine Clears) not related to memory ordering.", + "MetricExpr": "max(tma_machine_clears * (1 - MACHINE_CLEARS.MEMORY_ORDERING / MACHINE_CLEARS.COUNT), 0.0001)", + "MetricGroup": "Machine_Clears;TopdownL3;tma_L3_group;tma_machine_clears_group", + "MetricName": "tma_other_nukes", + "MetricThreshold": "tma_other_nukes > 0.05 & (tma_machine_clears > 0.1 & tma_bad_speculation > 0.15)", + "ScaleUnit": "100%" + }, + { "BriefDescription": "This metric roughly estimates (based on idle latencies) how often the CPU was stalled on accesses to external 3D-Xpoint (Crystal Ridge, a.k.a", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(((1 - ((19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) / (19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + (25 * (MEM_LOAD_RETIRED.LOCAL_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0) + 33 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0))) if #has_pmem > 0 else 0)) * (CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks - tma_l2_bound) if 1e6 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM) > MEM_LOAD_RETIRED.L1_MISS else 0) if #has_pmem > 0 else 0)", + "MetricExpr": "(((1 - (19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) / (19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + (25 * (MEM_LOAD_RETIRED.LOCAL_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 33 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))))) * (CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks - tma_l2_bound) if 1e6 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM) > MEM_LOAD_RETIRED.L1_MISS else 0) if #has_pmem > 0 else 0)", "MetricGroup": "MemoryBound;Server;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_pmm_bound", "MetricThreshold": "tma_pmm_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1622,12 +1846,12 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)", + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU)", "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_6", "MetricThreshold": "tma_port_6 > 0.6", - "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2", "ScaleUnit": "100%" }, { @@ -1641,7 +1865,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", - "MetricExpr": "((EXE_ACTIVITY.EXE_BOUND_0_PORTS + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)", + "MetricExpr": "((tma_ports_utilized_0 * tma_info_thread_clks + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -1650,7 +1874,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_NONE / 2 if #SMT_on else CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_core_core_clks", + "MetricExpr": "(EXE_ACTIVITY.EXE_BOUND_0_PORTS + tma_core_bound * RS_EVENTS.EMPTY_CYCLES) / tma_info_thread_clks * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1680,13 +1904,13 @@ "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_GE_3 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_3) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", - "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", + "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues", "MetricConstraint": "NO_GROUP_EVENTS_NMI", - "MetricExpr": "(89.5 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + 89.5 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "MetricExpr": "(89.5 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + 89.5 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Offcore;Server;Snoop;TopdownL5;tma_L5_group;tma_issueSyncxn;tma_mem_latency_group", "MetricName": "tma_remote_cache", "MetricThreshold": "tma_remote_cache > 0.05 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1695,10 +1919,10 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory", - "MetricExpr": "127 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "MetricExpr": "127 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Server;Snoop;TopdownL5;tma_L5_group;tma_mem_latency_group", - "MetricName": "tma_remote_dram", - "MetricThreshold": "tma_remote_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", + "MetricName": "tma_remote_mem", + "MetricThreshold": "tma_remote_mem > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory. This is caused often due to non-optimal NUMA allocations. #link to NUMA article. Sample with: MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM_PS", "ScaleUnit": "100%" }, @@ -1715,18 +1939,18 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations", "MetricExpr": "PARTIAL_RAT_STALLS.SCOREBOARD / tma_info_thread_clks", - "MetricGroup": "PortsUtil;TopdownL5;tma_L5_group;tma_issueSO;tma_ports_utilized_0_group", + "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group;tma_issueSO", "MetricName": "tma_serializing_operation", - "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))", + "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. Sample with: PARTIAL_RAT_STALLS.SCOREBOARD. Related metrics: tma_ms_switches", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions", "MetricExpr": "40 * ROB_MISC_EVENTS.PAUSE_INST / tma_info_thread_clks", - "MetricGroup": "TopdownL6;tma_L6_group;tma_serializing_operation_group", + "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group", "MetricName": "tma_slow_pause", - "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))))", + "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions. Sample with: MISC_RETIRED.PAUSE_INST", "ScaleUnit": "100%" }, @@ -1755,7 +1979,7 @@ "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%" }, { @@ -1813,10 +2037,10 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", "MetricExpr": "9 * BACLEARS.ANY / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", + "MetricGroup": "BigFootprint;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", "MetricName": "tma_unknown_branches", "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit). Sample with: BACLEARS.ANY", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit) hence called Unknown Branches. Sample with: BACLEARS.ANY", "ScaleUnit": "100%" }, { diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/metricgroups.json b/tools/perf/pmu-events/arch/x86/cascadelakex/metricgroups.json index bc6a9a4d27..904d299c95 100644 --- a/tools/perf/pmu-events/arch/x86/cascadelakex/metricgroups.json +++ b/tools/perf/pmu-events/arch/x86/cascadelakex/metricgroups.json @@ -2,10 +2,10 @@ "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "CodeGen": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -26,7 +26,9 @@ "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -64,8 +66,10 @@ "tma_L5_group": "Metrics for top-down breakdown at level 5", "tma_L6_group": "Metrics for top-down breakdown at level 6", "tma_alu_op_utilization_group": "Metrics contributing to tma_alu_op_utilization category", + "tma_assists_group": "Metrics contributing to tma_assists category", "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category", "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category", + "tma_branch_mispredicts_group": "Metrics contributing to tma_branch_mispredicts category", "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category", "tma_core_bound_group": "Metrics contributing to tma_core_bound category", "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category", @@ -78,9 +82,9 @@ "tma_frontend_bound_group": "Metrics contributing to tma_frontend_bound category", "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category", "tma_issue2P": "Metrics related by the issue $issue2P", - "tma_issueBC": "Metrics related by the issue $issueBC", "tma_issueBM": "Metrics related by the issue $issueBM", "tma_issueBW": "Metrics related by the issue $issueBW", + "tma_issueComp": "Metrics related by the issue $issueComp", "tma_issueD0": "Metrics related by the issue $issueD0", "tma_issueFB": "Metrics related by the issue $issueFB", "tma_issueFL": "Metrics related by the issue $issueFL", @@ -100,10 +104,12 @@ "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category", "tma_light_operations_group": "Metrics contributing to tma_light_operations category", "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category", + "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category", "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", "tma_mite_group": "Metrics contributing to tma_mite category", + "tma_other_light_ops_group": "Metrics contributing to tma_other_light_ops category", "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category", "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category", "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category", diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/uncore-power.json b/tools/perf/pmu-events/arch/x86/cascadelakex/uncore-power.json index c6254af7a4..ceef460464 100644 --- a/tools/perf/pmu-events/arch/x86/cascadelakex/uncore-power.json +++ b/tools/perf/pmu-events/arch/x86/cascadelakex/uncore-power.json @@ -144,6 +144,7 @@ "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C0", "PerPkg": "1", "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", + "UMask": "0x40", "Unit": "PCU" }, { @@ -152,6 +153,7 @@ "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C3", "PerPkg": "1", "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", + "UMask": "0x80", "Unit": "PCU" }, { @@ -160,6 +162,7 @@ "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C6", "PerPkg": "1", "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", + "UMask": "0xc0", "Unit": "PCU" }, { diff --git a/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-cache.json b/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-cache.json index bf5a511b99..86a8f3b7fe 100644 --- a/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-cache.json +++ b/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-cache.json @@ -3565,6 +3565,15 @@ "Unit": "CHA" }, { + "BriefDescription": "UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_CXL_EXP_LOCAL", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_CXL_EXP_LOCAL", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x20c8968201", + "Unit": "CHA" + }, + { "BriefDescription": "TOR Inserts : DRd_Prefs issued by iA Cores targeting DDR Mem that Missed the LLC", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_DDR", @@ -3716,6 +3725,15 @@ "Unit": "CHA" }, { + "BriefDescription": "UNC_CHA_TOR_INSERTS.IA_MISS_LLCPREFDATA_CXL_EXP_LOCAL", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LLCPREFDATA_CXL_EXP_LOCAL", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x20ccd68201", + "Unit": "CHA" + }, + { "BriefDescription": "TOR Inserts; LLCPrefRFO misses from local IA", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LLCPREFRFO", @@ -3742,6 +3760,15 @@ "Unit": "CHA" }, { + "BriefDescription": "UNC_CHA_TOR_INSERTS.IA_MISS_LLCPREFRFO_CXL_EXP_LOCAL", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LLCPREFRFO_CXL_EXP_LOCAL", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x20c8868201", + "Unit": "CHA" + }, + { "BriefDescription": "TOR Inserts : WCiLFs issued by iA Cores targeting DDR that missed the LLC - HOMed locally", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LOCAL_WCILF_DDR", @@ -3892,6 +3919,15 @@ "Unit": "CHA" }, { + "BriefDescription": "UNC_CHA_TOR_INSERTS.IA_MISS_RFO_PREF_CXL_EXP_LOCAL", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_RFO_PREF_CXL_EXP_LOCAL", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x20ccc68201", + "Unit": "CHA" + }, + { "BriefDescription": "TOR Inserts; RFO prefetch misses from local IA", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_RFO_PREF_LOCAL", @@ -4429,6 +4465,46 @@ "Unit": "CHA" }, { + "BriefDescription": "TOR Inserts for INVXTOM opcodes received from a remote socket which miss the L3 and target memory in a CXL type 3 memory expander local to this socket.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.RRQ_MISS_INVXTOM_CXL_EXP_LOCAL", + "PerPkg": "1", + "UMask": "0x20e87e8240", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Inserts for RDCODE opcodes received from a remote socket which miss the L3 and target memory in a CXL type 3 memory expander local to this socket.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.RRQ_MISS_RDCODE_CXL_EXP_LOCAL", + "PerPkg": "1", + "UMask": "0x20e80e8240", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Inserts for RDCUR opcodes received from a remote socket which miss the L3 and target memory in a CXL type 3 memory expander local to this socket.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.RRQ_MISS_RDCUR_CXL_EXP_LOCAL", + "PerPkg": "1", + "UMask": "0x20e8068240", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Inserts for RDDATA opcodes received from a remote socket which miss the L3 and target memory in a CXL type 3 memory expander local to this socket.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.RRQ_MISS_RDDATA_CXL_EXP_LOCAL", + "PerPkg": "1", + "UMask": "0x20e8168240", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Inserts for RDINVOWN_OPT opcodes received from a remote socket which miss the L3 and target memory in a CXL type 3 memory expander local to this socket.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.RRQ_MISS_RDINVOWN_OPT_CXL_EXP_LOCAL", + "PerPkg": "1", + "UMask": "0x20e8268240", + "Unit": "CHA" + }, + { "BriefDescription": "TOR Inserts; All Snoops from Remote", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.SNPS_FROM_REM", @@ -5012,6 +5088,15 @@ "Unit": "CHA" }, { + "BriefDescription": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PREF_CXL_EXP_LOCAL", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PREF_CXL_EXP_LOCAL", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x20c8968201", + "Unit": "CHA" + }, + { "BriefDescription": "TOR Occupancy : DRd_Prefs issued by iA Cores targeting DDR Mem that Missed the LLC", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PREF_DDR", @@ -5163,6 +5248,15 @@ "Unit": "CHA" }, { + "BriefDescription": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LLCPREFDATA_CXL_EXP_LOCAL", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LLCPREFDATA_CXL_EXP_LOCAL", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x20ccd68201", + "Unit": "CHA" + }, + { "BriefDescription": "TOR Occupancy; LLCPrefRFO misses from local IA", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LLCPREFRFO", @@ -5189,6 +5283,15 @@ "Unit": "CHA" }, { + "BriefDescription": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LLCPREFRFO_CXL_EXP_LOCAL", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LLCPREFRFO_CXL_EXP_LOCAL", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x20c8868201", + "Unit": "CHA" + }, + { "BriefDescription": "TOR Occupancy : WCiLFs issued by iA Cores targeting DDR that missed the LLC - HOMed locally", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LOCAL_WCILF_DDR", @@ -5339,6 +5442,15 @@ "Unit": "CHA" }, { + "BriefDescription": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_RFO_PREF_CXL_EXP_LOCAL", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_RFO_PREF_CXL_EXP_LOCAL", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x20ccc68201", + "Unit": "CHA" + }, + { "BriefDescription": "TOR Occupancy; RFO prefetch misses from local IA", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_RFO_PREF_LOCAL", @@ -5842,6 +5954,46 @@ "Unit": "CHA" }, { + "BriefDescription": "TOR Occupancy for INVXTOM opcodes received from a remote socket which miss the L3 and target memory in a CXL type 3 memory expander local to this socket.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.RRQ_MISS_INVXTOM_CXL_EXP_LOCAL", + "PerPkg": "1", + "UMask": "0x20e87e8240", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for RDCODE opcodes received from a remote socket which miss the L3 and target memory in a CXL type 3 memory expander local to this socket.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.RRQ_MISS_RDCODE_CXL_EXP_LOCAL", + "PerPkg": "1", + "UMask": "0x20e80e8240", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for RDCUR opcodes received from a remote socket which miss the L3 and target memory in a CXL type 3 memory expander local to this socket.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.RRQ_MISS_RDCUR_CXL_EXP_LOCAL", + "PerPkg": "1", + "UMask": "0x20e8068240", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for RDDATA opcodes received from a remote socket which miss the L3 and target memory in a CXL type 3 memory expander local to this socket.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.RRQ_MISS_RDDATA_CXL_EXP_LOCAL", + "PerPkg": "1", + "UMask": "0x20e8168240", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for RDINVOWN_OPT opcodes received from a remote socket which miss the L3 and target memory in a CXL type 3 memory expander local to this socket.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.RRQ_MISS_RDINVOWN_OPT_CXL_EXP_LOCAL", + "PerPkg": "1", + "UMask": "0x20e8268240", + "Unit": "CHA" + }, + { "BriefDescription": "TOR Occupancy; All Snoops from Remote", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.SNPS_FROM_REM", diff --git a/tools/perf/pmu-events/arch/x86/grandridge/cache.json b/tools/perf/pmu-events/arch/x86/grandridge/cache.json index 7f0dc65a55..f937ba0e50 100644 --- a/tools/perf/pmu-events/arch/x86/grandridge/cache.json +++ b/tools/perf/pmu-events/arch/x86/grandridge/cache.json @@ -16,6 +16,148 @@ "UMask": "0x4f" }, { + "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to an instruction cache or TLB miss.", + "EventCode": "0x35", + "EventName": "MEM_BOUND_STALLS_IFETCH.ALL", + "SampleAfterValue": "1000003", + "UMask": "0x7f" + }, + { + "BriefDescription": "Counts the number of cycles the core is stalled due to an instruction cache or TLB miss which hit in the L2 cache.", + "EventCode": "0x35", + "EventName": "MEM_BOUND_STALLS_IFETCH.L2_HIT", + "PublicDescription": "Counts the number of cycles the core is stalled due to an instruction cache or Translation Lookaside Buffer (TLB) miss which hit in the L2 cache.", + "SampleAfterValue": "1000003", + "UMask": "0x1" + }, + { + "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to an icache or itlb miss which hit in the LLC.", + "EventCode": "0x35", + "EventName": "MEM_BOUND_STALLS_IFETCH.LLC_HIT", + "SampleAfterValue": "1000003", + "UMask": "0x6" + }, + { + "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to an icache or itlb miss which missed all the caches.", + "EventCode": "0x35", + "EventName": "MEM_BOUND_STALLS_IFETCH.LLC_MISS", + "SampleAfterValue": "1000003", + "UMask": "0x78" + }, + { + "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to an L1 demand load miss.", + "EventCode": "0x34", + "EventName": "MEM_BOUND_STALLS_LOAD.ALL", + "SampleAfterValue": "1000003", + "UMask": "0x7f" + }, + { + "BriefDescription": "Counts the number of cycles the core is stalled due to a demand load which hit in the L2 cache.", + "EventCode": "0x34", + "EventName": "MEM_BOUND_STALLS_LOAD.L2_HIT", + "PublicDescription": "Counts the number of cycles a core is stalled due to a demand load which hit in the L2 cache.", + "SampleAfterValue": "1000003", + "UMask": "0x1" + }, + { + "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to a demand load miss which hit in the LLC.", + "EventCode": "0x34", + "EventName": "MEM_BOUND_STALLS_LOAD.LLC_HIT", + "SampleAfterValue": "1000003", + "UMask": "0x6" + }, + { + "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to a demand load miss which missed all the local caches.", + "EventCode": "0x34", + "EventName": "MEM_BOUND_STALLS_LOAD.LLC_MISS", + "SampleAfterValue": "1000003", + "UMask": "0x78" + }, + { + "BriefDescription": "Counts the number of load ops retired that miss the L3 cache and hit in DRAM", + "EventCode": "0xd3", + "EventName": "MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x1" + }, + { + "BriefDescription": "Counts the number of load ops retired that hit the L1 data cache.", + "EventCode": "0xd1", + "EventName": "MEM_LOAD_UOPS_RETIRED.L1_HIT", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0x1" + }, + { + "BriefDescription": "Counts the number of load ops retired that miss in the L1 data cache.", + "EventCode": "0xd1", + "EventName": "MEM_LOAD_UOPS_RETIRED.L1_MISS", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0x40" + }, + { + "BriefDescription": "Counts the number of load ops retired that hit in the L2 cache.", + "EventCode": "0xd1", + "EventName": "MEM_LOAD_UOPS_RETIRED.L2_HIT", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts the number of load ops retired that miss in the L2 cache.", + "EventCode": "0xd1", + "EventName": "MEM_LOAD_UOPS_RETIRED.L2_MISS", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0x80" + }, + { + "BriefDescription": "Counts the number of load ops retired that hit in the L3 cache.", + "EventCode": "0xd1", + "EventName": "MEM_LOAD_UOPS_RETIRED.L3_HIT", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0x1c" + }, + { + "BriefDescription": "Counts the number of loads that hit in a write combining buffer (WCB), excluding the first load that caused the WCB to allocate.", + "EventCode": "0xd1", + "EventName": "MEM_LOAD_UOPS_RETIRED.WCB_HIT", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0x20" + }, + { + "BriefDescription": "Counts the number of cycles that uops are blocked for any of the following reasons: load buffer, store buffer or RSV full.", + "EventCode": "0x04", + "EventName": "MEM_SCHEDULER_BLOCK.ALL", + "SampleAfterValue": "20003", + "UMask": "0x7" + }, + { + "BriefDescription": "Counts the number of cycles that uops are blocked due to a load buffer full condition.", + "EventCode": "0x04", + "EventName": "MEM_SCHEDULER_BLOCK.LD_BUF", + "SampleAfterValue": "20003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts the number of cycles that uops are blocked due to an RSV full condition.", + "EventCode": "0x04", + "EventName": "MEM_SCHEDULER_BLOCK.RSV", + "SampleAfterValue": "20003", + "UMask": "0x4" + }, + { + "BriefDescription": "Counts the number of cycles that uops are blocked due to a store buffer full condition.", + "EventCode": "0x04", + "EventName": "MEM_SCHEDULER_BLOCK.ST_BUF", + "SampleAfterValue": "20003", + "UMask": "0x1" + }, + { "BriefDescription": "Counts the number of load ops retired.", "Data_LA": "1", "EventCode": "0xd0", @@ -144,6 +286,42 @@ "UMask": "0x5" }, { + "BriefDescription": "Counts the number of load uops retired that performed one or more locks", + "Data_LA": "1", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.LOCK_LOADS", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0x21" + }, + { + "BriefDescription": "Counts the number of memory uops retired that were splits.", + "Data_LA": "1", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.SPLIT", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0x43" + }, + { + "BriefDescription": "Counts the number of retired split load uops.", + "Data_LA": "1", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.SPLIT_LOADS", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0x41" + }, + { + "BriefDescription": "Counts the number of retired split store uops.", + "Data_LA": "1", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.SPLIT_STORES", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0x42" + }, + { "BriefDescription": "Counts the number of stores uops retired same as MEM_UOPS_RETIRED.ALL_STORES", "Data_LA": "1", "EventCode": "0xd0", @@ -151,5 +329,12 @@ "PEBS": "2", "SampleAfterValue": "1000003", "UMask": "0x6" + }, + { + "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to an icache miss", + "EventCode": "0x71", + "EventName": "TOPDOWN_FE_BOUND.ICACHE", + "SampleAfterValue": "1000003", + "UMask": "0x20" } ] diff --git a/tools/perf/pmu-events/arch/x86/grandridge/floating-point.json b/tools/perf/pmu-events/arch/x86/grandridge/floating-point.json new file mode 100644 index 0000000000..00c9a8ae0f --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/grandridge/floating-point.json @@ -0,0 +1,68 @@ +[ + { + "BriefDescription": "Counts the number of cycles when any of the floating point dividers are active.", + "CounterMask": "1", + "EventCode": "0xcd", + "EventName": "ARITH.FPDIV_ACTIVE", + "SampleAfterValue": "1000003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts the number of all types of floating point operations per uop with all default weighting", + "EventCode": "0xc8", + "EventName": "FP_FLOPS_RETIRED.ALL", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x3" + }, + { + "BriefDescription": "This event is deprecated. [This event is alias to FP_FLOPS_RETIRED.FP64]", + "Deprecated": "1", + "EventCode": "0xc8", + "EventName": "FP_FLOPS_RETIRED.DP", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x1" + }, + { + "BriefDescription": "Counts the number of floating point operations that produce 32 bit single precision results [This event is alias to FP_FLOPS_RETIRED.SP]", + "EventCode": "0xc8", + "EventName": "FP_FLOPS_RETIRED.FP32", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts the number of floating point operations that produce 64 bit double precision results [This event is alias to FP_FLOPS_RETIRED.DP]", + "EventCode": "0xc8", + "EventName": "FP_FLOPS_RETIRED.FP64", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x1" + }, + { + "BriefDescription": "This event is deprecated. [This event is alias to FP_FLOPS_RETIRED.FP32]", + "Deprecated": "1", + "EventCode": "0xc8", + "EventName": "FP_FLOPS_RETIRED.SP", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts the number of floating point operations retired that required microcode assist.", + "EventCode": "0xc3", + "EventName": "MACHINE_CLEARS.FP_ASSIST", + "PublicDescription": "Counts the number of floating point operations retired that required microcode assist, which is not a reflection of the number of FP operations, instructions or uops.", + "SampleAfterValue": "20003", + "UMask": "0x4" + }, + { + "BriefDescription": "Counts the number of floating point divide uops retired (x87 and sse, including x87 sqrt).", + "EventCode": "0xc2", + "EventName": "UOPS_RETIRED.FPDIV", + "PEBS": "1", + "SampleAfterValue": "2000003", + "UMask": "0x8" + } +] diff --git a/tools/perf/pmu-events/arch/x86/grandridge/frontend.json b/tools/perf/pmu-events/arch/x86/grandridge/frontend.json index be8f1c7e19..356d36aecc 100644 --- a/tools/perf/pmu-events/arch/x86/grandridge/frontend.json +++ b/tools/perf/pmu-events/arch/x86/grandridge/frontend.json @@ -1,5 +1,21 @@ [ { + "BriefDescription": "Counts the total number of BACLEARS due to all branch types including conditional and unconditional jumps, returns, and indirect branches.", + "EventCode": "0xe6", + "EventName": "BACLEARS.ANY", + "PublicDescription": "Counts the total number of BACLEARS, which occur when the Branch Target Buffer (BTB) prediction or lack thereof, was corrected by a later branch predictor in the frontend. Includes BACLEARS due to all branch types including conditional and unconditional jumps, returns, and indirect branches.", + "SampleAfterValue": "200003", + "UMask": "0x1" + }, + { + "BriefDescription": "Counts the number of instructions retired that were tagged because empty issue slots were seen before the uop due to ITLB miss", + "EventCode": "0xc6", + "EventName": "FRONTEND_RETIRED.ITLB_MISS", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x10" + }, + { "BriefDescription": "Counts every time the code stream enters into a new cache line by walking sequential from the previous line or being redirected by a jump.", "EventCode": "0x80", "EventName": "ICACHE.ACCESSES", diff --git a/tools/perf/pmu-events/arch/x86/grandridge/memory.json b/tools/perf/pmu-events/arch/x86/grandridge/memory.json index 79d8af4510..e0ce2decc8 100644 --- a/tools/perf/pmu-events/arch/x86/grandridge/memory.json +++ b/tools/perf/pmu-events/arch/x86/grandridge/memory.json @@ -1,5 +1,71 @@ [ { + "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer is stalled due to any number of reasons, including an L1 miss, WCB full, pagewalk, store address block or store data block, on a load that retires.", + "EventCode": "0x05", + "EventName": "LD_HEAD.ANY_AT_RET", + "SampleAfterValue": "1000003", + "UMask": "0xff" + }, + { + "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer is stalled due to a core bound stall including a store address match, a DTLB miss or a page walk that detains the load from retiring.", + "EventCode": "0x05", + "EventName": "LD_HEAD.L1_BOUND_AT_RET", + "SampleAfterValue": "1000003", + "UMask": "0xf4" + }, + { + "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to a DL1 miss.", + "EventCode": "0x05", + "EventName": "LD_HEAD.L1_MISS_AT_RET", + "SampleAfterValue": "1000003", + "UMask": "0x81" + }, + { + "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to other block cases.", + "EventCode": "0x05", + "EventName": "LD_HEAD.OTHER_AT_RET", + "PublicDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to other block cases such as pipeline conflicts, fences, etc.", + "SampleAfterValue": "1000003", + "UMask": "0xc0" + }, + { + "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to a pagewalk.", + "EventCode": "0x05", + "EventName": "LD_HEAD.PGWALK_AT_RET", + "SampleAfterValue": "1000003", + "UMask": "0xa0" + }, + { + "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to a store address match.", + "EventCode": "0x05", + "EventName": "LD_HEAD.ST_ADDR_AT_RET", + "SampleAfterValue": "1000003", + "UMask": "0x84" + }, + { + "BriefDescription": "Counts the number of machine clears due to memory ordering caused by a snoop from an external agent. Does not count internally generated machine clears such as those due to memory disambiguation.", + "EventCode": "0xc3", + "EventName": "MACHINE_CLEARS.MEMORY_ORDERING", + "SampleAfterValue": "20003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts misaligned loads that are 4K page splits.", + "EventCode": "0x13", + "EventName": "MISALIGN_MEM_REF.LOAD_PAGE_SPLIT", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts misaligned stores that are 4K page splits.", + "EventCode": "0x13", + "EventName": "MISALIGN_MEM_REF.STORE_PAGE_SPLIT", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0x4" + }, + { "BriefDescription": "Counts demand data reads that were not supplied by the L3 cache.", "EventCode": "0xB7", "EventName": "OCR.DEMAND_DATA_RD.L3_MISS", diff --git a/tools/perf/pmu-events/arch/x86/grandridge/other.json b/tools/perf/pmu-events/arch/x86/grandridge/other.json index 2414f6ff53..70a9da7e97 100644 --- a/tools/perf/pmu-events/arch/x86/grandridge/other.json +++ b/tools/perf/pmu-events/arch/x86/grandridge/other.json @@ -1,5 +1,14 @@ [ { + "BriefDescription": "This event is deprecated. [This event is alias to MISC_RETIRED.LBR_INSERTS]", + "Deprecated": "1", + "EventCode": "0xe4", + "EventName": "LBR_INSERTS.ANY", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x1" + }, + { "BriefDescription": "Counts demand data reads that have any type of response.", "EventCode": "0xB7", "EventName": "OCR.DEMAND_DATA_RD.ANY_RESPONSE", @@ -16,5 +25,12 @@ "MSRValue": "0x10002", "SampleAfterValue": "100003", "UMask": "0x1" + }, + { + "BriefDescription": "Counts the number of issue slots in a UMWAIT or TPAUSE instruction where no uop issues due to the instruction putting the CPU into the C0.1 activity state.", + "EventCode": "0x75", + "EventName": "SERIALIZATION.C01_MS_SCB", + "SampleAfterValue": "200003", + "UMask": "0x4" } ] diff --git a/tools/perf/pmu-events/arch/x86/grandridge/pipeline.json b/tools/perf/pmu-events/arch/x86/grandridge/pipeline.json index 41212957ef..daa0639bb1 100644 --- a/tools/perf/pmu-events/arch/x86/grandridge/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/grandridge/pipeline.json @@ -1,5 +1,13 @@ [ { + "BriefDescription": "Counts the number of cycles when any of the dividers are active.", + "CounterMask": "1", + "EventCode": "0xcd", + "EventName": "ARITH.DIV_ACTIVE", + "SampleAfterValue": "1000003", + "UMask": "0x3" + }, + { "BriefDescription": "Counts the total number of branch instructions retired for all branch types.", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.ALL_BRANCHES", @@ -8,6 +16,71 @@ "SampleAfterValue": "200003" }, { + "BriefDescription": "Counts the number of retired JCC (Jump on Conditional Code) branch instructions retired, includes both taken and not taken branches.", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.COND", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0x7e" + }, + { + "BriefDescription": "Counts the number of taken JCC (Jump on Conditional Code) branch instructions retired.", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.COND_TAKEN", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0xfe" + }, + { + "BriefDescription": "Counts the number of far branch instructions retired, includes far jump, far call and return, and interrupt call and return.", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.FAR_BRANCH", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0xbf" + }, + { + "BriefDescription": "Counts the number of near indirect JMP and near indirect CALL branch instructions retired.", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.INDIRECT", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0xeb" + }, + { + "BriefDescription": "Counts the number of near indirect CALL branch instructions retired.", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.INDIRECT_CALL", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0xfb" + }, + { + "BriefDescription": "This event is deprecated. Refer to new event BR_INST_RETIRED.INDIRECT_CALL", + "Deprecated": "1", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.IND_CALL", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0xfb" + }, + { + "BriefDescription": "Counts the number of near CALL branch instructions retired.", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.NEAR_CALL", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0xf9" + }, + { + "BriefDescription": "Counts the number of near RET branch instructions retired.", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.NEAR_RETURN", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0xf7" + }, + { "BriefDescription": "Counts the total number of mispredicted branch instructions retired for all branch types.", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.ALL_BRANCHES", @@ -16,6 +89,54 @@ "SampleAfterValue": "200003" }, { + "BriefDescription": "Counts the number of mispredicted JCC (Jump on Conditional Code) branch instructions retired.", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.COND", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0x7e" + }, + { + "BriefDescription": "Counts the number of mispredicted taken JCC (Jump on Conditional Code) branch instructions retired.", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.COND_TAKEN", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0xfe" + }, + { + "BriefDescription": "Counts the number of mispredicted near indirect JMP and near indirect CALL branch instructions retired.", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.INDIRECT", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0xeb" + }, + { + "BriefDescription": "Counts the number of mispredicted near indirect CALL branch instructions retired.", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.INDIRECT_CALL", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0xfb" + }, + { + "BriefDescription": "Counts the number of mispredicted near taken branch instructions retired.", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.NEAR_TAKEN", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0x80" + }, + { + "BriefDescription": "Counts the number of mispredicted near RET branch instructions retired.", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.RETURN", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0xf7" + }, + { "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles", "EventName": "CPU_CLK_UNHALTED.CORE", "SampleAfterValue": "2000003", @@ -68,6 +189,66 @@ "SampleAfterValue": "2000003" }, { + "BriefDescription": "Counts the number of retired loads that are blocked because it initially appears to be store forward blocked, but subsequently is shown not to be blocked based on 4K alias check.", + "EventCode": "0x03", + "EventName": "LD_BLOCKS.ADDRESS_ALIAS", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x4" + }, + { + "BriefDescription": "Counts the number of retired loads that are blocked because its address exactly matches an older store whose data is not ready.", + "EventCode": "0x03", + "EventName": "LD_BLOCKS.DATA_UNKNOWN", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x1" + }, + { + "BriefDescription": "Counts the number of retired loads that are blocked because its address partially overlapped with an older store.", + "EventCode": "0x03", + "EventName": "LD_BLOCKS.STORE_FORWARD", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts the number of machine clears due to memory ordering in which an internal load passes an older store within the same CPU.", + "EventCode": "0xc3", + "EventName": "MACHINE_CLEARS.DISAMBIGUATION", + "SampleAfterValue": "20003", + "UMask": "0x8" + }, + { + "BriefDescription": "Counts the number of machine clears due to a page fault. Counts both I-Side and D-Side (Loads/Stores) page faults. A page fault occurs when either the page is not present, or an access violation occurs.", + "EventCode": "0xc3", + "EventName": "MACHINE_CLEARS.PAGE_FAULT", + "SampleAfterValue": "20003", + "UMask": "0x20" + }, + { + "BriefDescription": "Counts the number of machine clears that flush the pipeline and restart the machine with the use of microcode due to SMC, MEMORY_ORDERING, FP_ASSISTS, PAGE_FAULT, DISAMBIGUATION, and FPC_VIRTUAL_TRAP.", + "EventCode": "0xc3", + "EventName": "MACHINE_CLEARS.SLOW", + "SampleAfterValue": "20003", + "UMask": "0x6f" + }, + { + "BriefDescription": "Counts the number of machine clears due to program modifying data (self modifying code) within 1K of a recently fetched code page.", + "EventCode": "0xc3", + "EventName": "MACHINE_CLEARS.SMC", + "SampleAfterValue": "20003", + "UMask": "0x1" + }, + { + "BriefDescription": "Counts the number of Last Branch Record (LBR) entries. Requires LBRs to be enabled and configured in IA32_LBR_CTL. [This event is alias to LBR_INSERTS.ANY]", + "EventCode": "0xe4", + "EventName": "MISC_RETIRED.LBR_INSERTS", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x1" + }, + { "BriefDescription": "Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear.", "EventCode": "0x73", "EventName": "TOPDOWN_BAD_SPECULATION.ALL", @@ -75,22 +256,194 @@ "SampleAfterValue": "1000003" }, { + "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to Fast Nukes such as Memory Ordering Machine clears and MRN nukes", + "EventCode": "0x73", + "EventName": "TOPDOWN_BAD_SPECULATION.FASTNUKE", + "SampleAfterValue": "1000003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a machine clear (nuke) of any kind including memory ordering and memory disambiguation.", + "EventCode": "0x73", + "EventName": "TOPDOWN_BAD_SPECULATION.MACHINE_CLEARS", + "SampleAfterValue": "1000003", + "UMask": "0x3" + }, + { + "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to Branch Mispredict", + "EventCode": "0x73", + "EventName": "TOPDOWN_BAD_SPECULATION.MISPREDICT", + "SampleAfterValue": "1000003", + "UMask": "0x4" + }, + { + "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to a machine clear (nuke).", + "EventCode": "0x73", + "EventName": "TOPDOWN_BAD_SPECULATION.NUKE", + "SampleAfterValue": "1000003", + "UMask": "0x1" + }, + { "BriefDescription": "Counts the number of retirement slots not consumed due to backend stalls", "EventCode": "0x74", "EventName": "TOPDOWN_BE_BOUND.ALL", "SampleAfterValue": "1000003" }, { + "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to due to certain allocation restrictions", + "EventCode": "0x74", + "EventName": "TOPDOWN_BE_BOUND.ALLOC_RESTRICTIONS", + "SampleAfterValue": "1000003", + "UMask": "0x1" + }, + { + "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to memory reservation stall (scheduler not being able to accept another uop). This could be caused by RSV full or load/store buffer block.", + "EventCode": "0x74", + "EventName": "TOPDOWN_BE_BOUND.MEM_SCHEDULER", + "SampleAfterValue": "1000003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to IEC and FPC RAT stalls - which can be due to the FIQ and IEC reservation station stall (integer, FP and SIMD scheduler not being able to accept another uop. )", + "EventCode": "0x74", + "EventName": "TOPDOWN_BE_BOUND.NON_MEM_SCHEDULER", + "SampleAfterValue": "1000003", + "UMask": "0x8" + }, + { + "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to mrbl stall. A 'marble' refers to a physical register file entry, also known as the physical destination (PDST).", + "EventCode": "0x74", + "EventName": "TOPDOWN_BE_BOUND.REGISTER", + "SampleAfterValue": "1000003", + "UMask": "0x20" + }, + { + "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to iq/jeu scoreboards or ms scb", + "EventCode": "0x74", + "EventName": "TOPDOWN_BE_BOUND.SERIALIZATION", + "SampleAfterValue": "1000003", + "UMask": "0x10" + }, + { "BriefDescription": "Counts the number of retirement slots not consumed due to front end stalls", "EventCode": "0x71", "EventName": "TOPDOWN_FE_BOUND.ALL", "SampleAfterValue": "1000003" }, { + "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to BAClear", + "EventCode": "0x71", + "EventName": "TOPDOWN_FE_BOUND.BRANCH_DETECT", + "SampleAfterValue": "1000003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to BTClear", + "EventCode": "0x71", + "EventName": "TOPDOWN_FE_BOUND.BRANCH_RESTEER", + "SampleAfterValue": "1000003", + "UMask": "0x40" + }, + { + "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to ms", + "EventCode": "0x71", + "EventName": "TOPDOWN_FE_BOUND.CISC", + "SampleAfterValue": "1000003", + "UMask": "0x1" + }, + { + "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to decode stall", + "EventCode": "0x71", + "EventName": "TOPDOWN_FE_BOUND.DECODE", + "SampleAfterValue": "1000003", + "UMask": "0x8" + }, + { + "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to frontend bandwidth restrictions due to decode, predecode, cisc, and other limitations.", + "EventCode": "0x71", + "EventName": "TOPDOWN_FE_BOUND.FRONTEND_BANDWIDTH", + "SampleAfterValue": "1000003", + "UMask": "0x8d" + }, + { + "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to latency related stalls including BACLEARs, BTCLEARs, ITLB misses, and ICache misses.", + "EventCode": "0x71", + "EventName": "TOPDOWN_FE_BOUND.FRONTEND_LATENCY", + "SampleAfterValue": "1000003", + "UMask": "0x72" + }, + { + "BriefDescription": "This event is deprecated. [This event is alias to TOPDOWN_FE_BOUND.ITLB_MISS]", + "Deprecated": "1", + "EventCode": "0x71", + "EventName": "TOPDOWN_FE_BOUND.ITLB", + "SampleAfterValue": "1000003", + "UMask": "0x10" + }, + { + "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to itlb miss [This event is alias to TOPDOWN_FE_BOUND.ITLB]", + "EventCode": "0x71", + "EventName": "TOPDOWN_FE_BOUND.ITLB_MISS", + "SampleAfterValue": "1000003", + "UMask": "0x10" + }, + { + "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend that do not categorize into any other common frontend stall", + "EventCode": "0x71", + "EventName": "TOPDOWN_FE_BOUND.OTHER", + "SampleAfterValue": "1000003", + "UMask": "0x80" + }, + { + "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to predecode wrong", + "EventCode": "0x71", + "EventName": "TOPDOWN_FE_BOUND.PREDECODE", + "SampleAfterValue": "1000003", + "UMask": "0x4" + }, + { "BriefDescription": "Counts the number of consumed retirement slots. Similar to UOPS_RETIRED.ALL", "EventCode": "0x72", "EventName": "TOPDOWN_RETIRING.ALL", "PEBS": "1", "SampleAfterValue": "1000003" + }, + { + "BriefDescription": "Counts the number of uops issued by the front end every cycle.", + "EventCode": "0x0e", + "EventName": "UOPS_ISSUED.ANY", + "PublicDescription": "Counts the number of uops issued by the front end every cycle. When 4-uops are requested and only 2-uops are delivered, the event counts 2. Uops_issued correlates to the number of ROB entries. If uop takes 2 ROB slots it counts as 2 uops_issued.", + "SampleAfterValue": "1000003" + }, + { + "BriefDescription": "Counts the total number of uops retired.", + "EventCode": "0xc2", + "EventName": "UOPS_RETIRED.ALL", + "PEBS": "1", + "SampleAfterValue": "2000003" + }, + { + "BriefDescription": "Counts the number of integer divide uops retired.", + "EventCode": "0xc2", + "EventName": "UOPS_RETIRED.IDIV", + "PEBS": "1", + "SampleAfterValue": "2000003", + "UMask": "0x10" + }, + { + "BriefDescription": "Counts the number of uops that are from the complex flows issued by the micro-sequencer (MS). This includes uops from flows due to complex instructions, faults, assists, and inserted flows.", + "EventCode": "0xc2", + "EventName": "UOPS_RETIRED.MS", + "PEBS": "1", + "SampleAfterValue": "2000003", + "UMask": "0x1" + }, + { + "BriefDescription": "Counts the number of x87 uops retired, includes those in ms flows", + "EventCode": "0xc2", + "EventName": "UOPS_RETIRED.X87", + "PEBS": "1", + "SampleAfterValue": "2000003", + "UMask": "0x2" } ] diff --git a/tools/perf/pmu-events/arch/x86/grandridge/uncore-cache.json b/tools/perf/pmu-events/arch/x86/grandridge/uncore-cache.json new file mode 100644 index 0000000000..74dfd9272c --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/grandridge/uncore-cache.json @@ -0,0 +1,1795 @@ +[ + { + "BriefDescription": "Clockticks for CMS units attached to CHA", + "EventCode": "0x01", + "EventName": "UNC_CHACMS_CLOCKTICKS", + "PerPkg": "1", + "PortMask": "0x000", + "PublicDescription": "UNC_CHACMS_CLOCKTICKS", + "Unit": "CHACMS" + }, + { + "BriefDescription": "Number of CHA clock cycles while the event is enabled", + "EventCode": "0x01", + "EventName": "UNC_CHA_CLOCKTICKS", + "PerPkg": "1", + "PublicDescription": "Clockticks of the uncore caching and home agent (CHA)", + "Unit": "CHA" + }, + { + "BriefDescription": "Distress signal assertion for dynamic prefetch throttle (DPT). Threshold for distress signal assertion reached in TOR or IRQ (immediate cause for triggering).", + "EventCode": "0x59", + "EventName": "UNC_CHA_DISTRESS_ASSERTED.DPT_ANY", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x3", + "Unit": "CHA" + }, + { + "BriefDescription": "Distress signal assertion for dynamic prefetch throttle (DPT). Threshold for distress signal assertion reached in IRQ (immediate cause for triggering).", + "EventCode": "0x59", + "EventName": "UNC_CHA_DISTRESS_ASSERTED.DPT_IRQ", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "CHA" + }, + { + "BriefDescription": "Distress signal assertion for dynamic prefetch throttle (DPT). Threshold for distress signal assertion reached in TOR (immediate cause for triggering).", + "EventCode": "0x59", + "EventName": "UNC_CHA_DISTRESS_ASSERTED.DPT_TOR", + "PerPkg": "1", + "UMask": "0x2", + "Unit": "CHA" + }, + { + "BriefDescription": "Counts when a normal (Non-Isochronous) full line write is issued from the CHA to the any of the memory controller channels.", + "EventCode": "0x5b", + "EventName": "UNC_CHA_IMC_WRITES_COUNT.FULL", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "CHA" + }, + { + "BriefDescription": "CHA to iMC Full Line Writes Issued : ISOCH Full Line : Counts the total number of full line writes issued from the HA into the memory controller.", + "EventCode": "0x5b", + "EventName": "UNC_CHA_IMC_WRITES_COUNT.FULL_PRIORITY", + "PerPkg": "1", + "UMask": "0x4", + "Unit": "CHA" + }, + { + "BriefDescription": "CHA to iMC Full Line Writes Issued : Partial Non-ISOCH : Counts the total number of full line writes issued from the HA into the memory controller.", + "EventCode": "0x5b", + "EventName": "UNC_CHA_IMC_WRITES_COUNT.PARTIAL", + "PerPkg": "1", + "UMask": "0x2", + "Unit": "CHA" + }, + { + "BriefDescription": "CHA to iMC Full Line Writes Issued : ISOCH Partial : Counts the total number of full line writes issued from the HA into the memory controller.", + "EventCode": "0x5b", + "EventName": "UNC_CHA_IMC_WRITES_COUNT.PARTIAL_PRIORITY", + "PerPkg": "1", + "UMask": "0x8", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: CRd Requests", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.CODE", + "PerPkg": "1", + "PublicDescription": "Cache Lookups : CRd Requests", + "UMask": "0x1bd0ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: Read Requests and Read Prefetches", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.DATA_RD", + "PerPkg": "1", + "PublicDescription": "Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2. This has numerous filters available. Note the non-standard filtering equation. This event will count requests that lookup the cache multiple times with multiple increments. One must ALWAYS set umask bit 0 and select a state or states to match. Otherwise, the event will count nothing. CHAFilter0[24:21,17] bits correspond to [FMESI] state. Read transactions", + "UMask": "0x1bc1ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: Read Requests, Read Prefetches, and Snoops", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.DATA_READ_ALL", + "PerPkg": "1", + "PublicDescription": "Cache Lookups : Data Reads", + "UMask": "0x1fc1ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: Read Requests to Locally Homed Memory", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.DATA_READ_LOCAL", + "PerPkg": "1", + "PublicDescription": "Cache Lookups : Demand Data Reads, Core and LLC prefetches", + "UMask": "0x841ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: Read Requests, Read Prefetches, and Snoops which miss the Cache", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.DATA_READ_MISS", + "PerPkg": "1", + "PublicDescription": "Cache Lookups : Data Read Misses", + "UMask": "0x1fc101", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: All Requests to Locally Homed Memory", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.LOCALLY_HOMED_ADDRESS", + "PerPkg": "1", + "PublicDescription": "Cache Lookups : Transactions homed locally", + "UMask": "0xbdfff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: Code Read Requests and Code Read Prefetches to Locally Homed Memory", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_CODE", + "PerPkg": "1", + "PublicDescription": "Cache Lookups : CRd Requests", + "UMask": "0x19d0ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: Read Requests and Read Prefetches to Locally Homed Memory", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_DATA_RD", + "PerPkg": "1", + "PublicDescription": "Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2. This has numerous filters available. Note the non-standard filtering equation. This event will count requests that lookup the cache multiple times with multiple increments. One must ALWAYS set umask bit 0 and select a state or states to match. Otherwise, the event will count nothing. CHAFilter0[24:21,17] bits correspond to [FMESI] state. Read transactions", + "UMask": "0x19c1ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: Code Read Requests to Locally Homed Memory", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_DMND_CODE", + "PerPkg": "1", + "PublicDescription": "Cache Lookups : CRd Requests", + "UMask": "0x1850ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: Read Requests to Locally Homed Memory", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_DMND_DATA_RD", + "PerPkg": "1", + "PublicDescription": "Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2. This has numerous filters available. Note the non-standard filtering equation. This event will count requests that lookup the cache multiple times with multiple increments. One must ALWAYS set umask bit 0 and select a state or states to match. Otherwise, the event will count nothing. CHAFilter0[24:21,17] bits correspond to [FMESI] state. Read transactions", + "UMask": "0x1841ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: RFO Requests to Locally Homed Memory", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_DMND_RFO", + "PerPkg": "1", + "PublicDescription": "Cache Lookups : RFO Requests", + "UMask": "0x1848ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: LLC Prefetch Requests to Locally Homed Memory", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_LLC_PF", + "PerPkg": "1", + "PublicDescription": "Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2. This has numerous filters available. Note the non-standard filtering equation. This event will count requests that lookup the cache multiple times with multiple increments. One must ALWAYS set umask bit 0 and select a state or states to match. Otherwise, the event will count nothing. CHAFilter0[24:21,17] bits correspond to [FMESI] state. Read transactions", + "UMask": "0x189dff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: All Prefetches to Locally Homed Memory", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_PF", + "PerPkg": "1", + "PublicDescription": "Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2. This has numerous filters available. Note the non-standard filtering equation. This event will count requests that lookup the cache multiple times with multiple increments. One must ALWAYS set umask bit 0 and select a state or states to match. Otherwise, the event will count nothing. CHAFilter0[24:21,17] bits correspond to [FMESI] state. Read transactions", + "UMask": "0x199dff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: Code Prefetches to Locally Homed Memory", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_PF_CODE", + "PerPkg": "1", + "PublicDescription": "Cache Lookups : CRd Requests", + "UMask": "0x1910ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: Read Prefetches to Locally Homed Memory", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_PF_DATA_RD", + "PerPkg": "1", + "PublicDescription": "Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2. This has numerous filters available. Note the non-standard filtering equation. This event will count requests that lookup the cache multiple times with multiple increments. One must ALWAYS set umask bit 0 and select a state or states to match. Otherwise, the event will count nothing. CHAFilter0[24:21,17] bits correspond to [FMESI] state. Read transactions", + "UMask": "0x1981ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: RFO Prefetches to Locally Homed Memory", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_PF_RFO", + "PerPkg": "1", + "PublicDescription": "Cache Lookups : RFO Requests", + "UMask": "0x1908ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: RFO Requests and RFO Prefetches to Locally Homed Memory", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_RFO", + "PerPkg": "1", + "PublicDescription": "Cache Lookups : RFO Requests", + "UMask": "0x19c8ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: All RFO and RFO Prefetches", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.RFO", + "PerPkg": "1", + "PublicDescription": "Cache Lookups : All RFOs - Demand and Prefetches", + "UMask": "0x1bc8ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: RFO Requests and RFO Prefetches to Locally Homed Memory", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.RFO_LOCAL", + "PerPkg": "1", + "PublicDescription": "Cache Lookups : Locally HOMed RFOs - Demand and Prefetches", + "UMask": "0x9c8ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: Writes to Locally Homed Memory (includes writebacks from L1/L2)", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.WRITE_LOCAL", + "PerPkg": "1", + "PublicDescription": "Cache Lookups : Writes", + "UMask": "0x842ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Counts the number of lines that were victimized on a fill. This can be filtered by the state that the line was in.", + "EventCode": "0x37", + "EventName": "UNC_CHA_LLC_VICTIMS.ALL", + "PerPkg": "1", + "PublicDescription": "Lines Victimized : All Lines Victimized", + "UMask": "0xf", + "Unit": "CHA" + }, + { + "BriefDescription": "Lines Victimized : IA traffic : Counts the number of lines that were victimized on a fill. This can be filtered by the state that the line was in.", + "EventCode": "0x37", + "EventName": "UNC_CHA_LLC_VICTIMS.IA", + "PerPkg": "1", + "UMask": "0x20", + "Unit": "CHA" + }, + { + "BriefDescription": "Lines Victimized : IO traffic : Counts the number of lines that were victimized on a fill. This can be filtered by the state that the line was in.", + "EventCode": "0x37", + "EventName": "UNC_CHA_LLC_VICTIMS.IO", + "PerPkg": "1", + "UMask": "0x10", + "Unit": "CHA" + }, + { + "BriefDescription": "Counts the number of lines that were victimized on a fill. This can be filtered by the state that the line was in.", + "EventCode": "0x37", + "EventName": "UNC_CHA_LLC_VICTIMS.LOCAL_ALL", + "PerPkg": "1", + "PublicDescription": "Lines Victimized : Local - All Lines", + "UMask": "0x200f", + "Unit": "CHA" + }, + { + "BriefDescription": "Lines Victimized : Counts the number of lines that were victimized on a fill. This can be filtered by the state that the line was in.", + "EventCode": "0x37", + "EventName": "UNC_CHA_LLC_VICTIMS.LOCAL_E", + "PerPkg": "1", + "PublicDescription": "Lines Victimized : Local - Lines in E State", + "UMask": "0x2002", + "Unit": "CHA" + }, + { + "BriefDescription": "Lines Victimized : Counts the number of lines that were victimized on a fill. This can be filtered by the state that the line was in.", + "EventCode": "0x37", + "EventName": "UNC_CHA_LLC_VICTIMS.LOCAL_F", + "PerPkg": "1", + "PublicDescription": "Lines Victimized : Local - Lines in F State", + "UMask": "0x2008", + "Unit": "CHA" + }, + { + "BriefDescription": "Lines Victimized : Counts the number of lines that were victimized on a fill. This can be filtered by the state that the line was in.", + "EventCode": "0x37", + "EventName": "UNC_CHA_LLC_VICTIMS.LOCAL_M", + "PerPkg": "1", + "PublicDescription": "Lines Victimized : Local - Lines in M State", + "UMask": "0x2001", + "Unit": "CHA" + }, + { + "BriefDescription": "Lines Victimized : Counts the number of lines that were victimized on a fill. This can be filtered by the state that the line was in.", + "EventCode": "0x37", + "EventName": "UNC_CHA_LLC_VICTIMS.LOCAL_S", + "PerPkg": "1", + "PublicDescription": "Lines Victimized : Local - Lines in S State", + "UMask": "0x2004", + "Unit": "CHA" + }, + { + "BriefDescription": "Counts the number of lines that were victimized on a fill. This can be filtered by the state that the line was in.", + "EventCode": "0x37", + "EventName": "UNC_CHA_LLC_VICTIMS.TOTAL_E", + "PerPkg": "1", + "PublicDescription": "Lines Victimized : Lines in E state", + "UMask": "0x2", + "Unit": "CHA" + }, + { + "BriefDescription": "Counts the number of lines that were victimized on a fill. This can be filtered by the state that the line was in.", + "EventCode": "0x37", + "EventName": "UNC_CHA_LLC_VICTIMS.TOTAL_M", + "PerPkg": "1", + "PublicDescription": "Lines Victimized : Lines in M state", + "UMask": "0x1", + "Unit": "CHA" + }, + { + "BriefDescription": "Counts the number of lines that were victimized on a fill. This can be filtered by the state that the line was in.", + "EventCode": "0x37", + "EventName": "UNC_CHA_LLC_VICTIMS.TOTAL_S", + "PerPkg": "1", + "PublicDescription": "Lines Victimized : Lines in S State", + "UMask": "0x4", + "Unit": "CHA" + }, + { + "BriefDescription": "Counts when a RFO (the Read for Ownership issued before a write) request hit a cacheline in the S (Shared) state.", + "EventCode": "0x39", + "EventName": "UNC_CHA_MISC.RFO_HIT_S", + "PerPkg": "1", + "PublicDescription": "Cbo Misc : RFO HitS", + "UMask": "0x8", + "Unit": "CHA" + }, + { + "BriefDescription": "OSB Snoop Broadcast : Local InvItoE : Count of OSB snoop broadcasts. Counts by 1 per request causing OSB snoops to be broadcast. Does not count all the snoops generated by OSB.", + "EventCode": "0x55", + "EventName": "UNC_CHA_OSB.LOCAL_INVITOE", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "CHA" + }, + { + "BriefDescription": "OSB Snoop Broadcast : Local Rd : Count of OSB snoop broadcasts. Counts by 1 per request causing OSB snoops to be broadcast. Does not count all the snoops generated by OSB.", + "EventCode": "0x55", + "EventName": "UNC_CHA_OSB.LOCAL_READ", + "PerPkg": "1", + "UMask": "0x2", + "Unit": "CHA" + }, + { + "BriefDescription": "OSB Snoop Broadcast : Off : Count of OSB snoop broadcasts. Counts by 1 per request causing OSB snoops to be broadcast. Does not count all the snoops generated by OSB.", + "EventCode": "0x55", + "EventName": "UNC_CHA_OSB.OFF_PWRHEURISTIC", + "PerPkg": "1", + "UMask": "0x20", + "Unit": "CHA" + }, + { + "BriefDescription": "OSB Snoop Broadcast : RFO HitS Snoop Broadcast : Count of OSB snoop broadcasts. Counts by 1 per request causing OSB snoops to be broadcast. Does not count all the snoops generated by OSB.", + "EventCode": "0x55", + "EventName": "UNC_CHA_OSB.RFO_HITS_SNP_BCAST", + "PerPkg": "1", + "UMask": "0x10", + "Unit": "CHA" + }, + { + "BriefDescription": "Counts the total number of requests coming from a unit on this socket for exclusive ownership of a cache line without receiving data (INVITOE) to the CHA.", + "EventCode": "0x50", + "EventName": "UNC_CHA_REQUESTS.INVITOE", + "PerPkg": "1", + "PublicDescription": "HA Read and Write Requests : InvalItoE", + "UMask": "0x30", + "Unit": "CHA" + }, + { + "BriefDescription": "Counts the total number of requests coming from a unit on this socket for exclusive ownership of a cache line without receiving data (INVITOE) to the CHA.", + "EventCode": "0x50", + "EventName": "UNC_CHA_REQUESTS.INVITOE_LOCAL", + "PerPkg": "1", + "UMask": "0x10", + "Unit": "CHA" + }, + { + "BriefDescription": "Counts read requests made into this CHA. Reads include all read opcodes (including RFO: the Read for Ownership issued before a write) .", + "EventCode": "0x50", + "EventName": "UNC_CHA_REQUESTS.READS", + "PerPkg": "1", + "PublicDescription": "HA Read and Write Requests : Reads", + "UMask": "0x3", + "Unit": "CHA" + }, + { + "BriefDescription": "Counts read requests coming from a unit on this socket made into this CHA. Reads include all read opcodes (including RFO: the Read for Ownership issued before a write).", + "EventCode": "0x50", + "EventName": "UNC_CHA_REQUESTS.READS_LOCAL", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "CHA" + }, + { + "BriefDescription": "Counts write requests made into the CHA, including streaming, evictions, HitM (Reads from another core to a Modified cacheline), etc.", + "EventCode": "0x50", + "EventName": "UNC_CHA_REQUESTS.WRITES", + "PerPkg": "1", + "PublicDescription": "HA Read and Write Requests : Writes", + "UMask": "0xc", + "Unit": "CHA" + }, + { + "BriefDescription": "Counts write requests coming from a unit on this socket made into this CHA, including streaming, evictions, HitM (Reads from another core to a Modified cacheline), etc.", + "EventCode": "0x50", + "EventName": "UNC_CHA_REQUESTS.WRITES_LOCAL", + "PerPkg": "1", + "UMask": "0x4", + "Unit": "CHA" + }, + { + "BriefDescription": "All TOR Inserts", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.ALL", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : All", + "UMask": "0xc001ffff", + "Unit": "CHA" + }, + { + "BriefDescription": "All locally initiated requests from IA Cores", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : All requests from iA Cores", + "UMask": "0xc001ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "CLFlush events that are initiated from the Core", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_CLFLUSH", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : CLFlushes issued by iA Cores", + "UMask": "0xc8c7ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "CLFlushOpt events that are initiated from the Core", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_CLFLUSHOPT", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : CLFlushOpts issued by iA Cores", + "UMask": "0xc8d7ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "Code read from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_CRD", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : CRDs issued by iA Cores", + "UMask": "0xc80fff01", + "Unit": "CHA" + }, + { + "BriefDescription": "Code read prefetch from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_CRD_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Inserts; Code read prefetch from local IA that misses in the snoop filter", + "UMask": "0xc88fff01", + "Unit": "CHA" + }, + { + "BriefDescription": "Data read opt from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_DRD_OPT", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : DRd_Opts issued by iA Cores", + "UMask": "0xc827ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "Data read opt prefetch from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_DRD_OPT_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : DRd_Opt_Prefs issued by iA Cores", + "UMask": "0xc8a7ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "All locally initiated requests from IA Cores which hit the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : All requests from iA Cores that Hit the LLC", + "UMask": "0xc001fd01", + "Unit": "CHA" + }, + { + "BriefDescription": "Code read from local IA that hit the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_CRD", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : CRds issued by iA Cores that Hit the LLC", + "UMask": "0xc80ffd01", + "Unit": "CHA" + }, + { + "BriefDescription": "Code read prefetch from local IA that hit the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_CRD_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : CRd_Prefs issued by iA Cores that hit the LLC", + "UMask": "0xc88ffd01", + "Unit": "CHA" + }, + { + "BriefDescription": "Data read opt from local IA that hit the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_DRD_OPT", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : DRd_Opts issued by iA Cores that hit the LLC", + "UMask": "0xc827fd01", + "Unit": "CHA" + }, + { + "BriefDescription": "Data read opt prefetch from local IA that hit the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_DRD_OPT_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : DRd_Opt_Prefs issued by iA Cores that hit the LLC", + "UMask": "0xc8a7fd01", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoM requests from local IA cores that hit the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_ITOM", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMs issued by iA Cores that Hit LLC", + "UMask": "0xcc47fd01", + "Unit": "CHA" + }, + { + "BriefDescription": "Last level cache prefetch code read from local IA that hit the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_LLCPREFCODE", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : LLCPrefCode issued by iA Cores that hit the LLC", + "UMask": "0xcccffd01", + "Unit": "CHA" + }, + { + "BriefDescription": "Last level cache prefetch data read from local IA that hit the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_LLCPREFDATA", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : LLCPrefData issued by iA Cores that hit the LLC", + "UMask": "0xccd7fd01", + "Unit": "CHA" + }, + { + "BriefDescription": "Last level cache prefetch read for ownership from local IA that hit the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_LLCPREFRFO", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : LLCPrefRFO issued by iA Cores that hit the LLC", + "UMask": "0xccc7fd01", + "Unit": "CHA" + }, + { + "BriefDescription": "Read for ownership from local IA that hit the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_RFO", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : RFOs issued by iA Cores that Hit the LLC", + "UMask": "0xc807fd01", + "Unit": "CHA" + }, + { + "BriefDescription": "Read for ownership prefetch from local IA that hit the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_RFO_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : RFO_Prefs issued by iA Cores that Hit the LLC", + "UMask": "0xc887fd01", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoM events that are initiated from the Core", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_ITOM", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMs issued by iA Cores", + "UMask": "0xcc47ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoMCacheNear requests from local IA cores", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_ITOMCACHENEAR", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMCacheNears issued by iA Cores", + "UMask": "0xcd47ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "Last level cache prefetch code read from local IA.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_LLCPREFCODE", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : LLCPrefCode issued by iA Cores", + "UMask": "0xcccfff01", + "Unit": "CHA" + }, + { + "BriefDescription": "Last level cache prefetch data read from local IA.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_LLCPREFDATA", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : LLCPrefData issued by iA Cores", + "UMask": "0xccd7ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "Last level cache prefetch read for ownership from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_LLCPREFRFO", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : LLCPrefRFO issued by iA Cores", + "UMask": "0xccc7ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "All locally initiated requests from IA Cores which miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : All requests from iA Cores that Missed the LLC", + "UMask": "0xc001fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "Code read from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_CRD", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : CRds issued by iA Cores that Missed the LLC", + "UMask": "0xc80ffe01", + "Unit": "CHA" + }, + { + "BriefDescription": "CRDs from local IA cores to locally homed memory", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_CRD_LOCAL", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : CRd issued by iA Cores that Missed the LLC - HOMed locally", + "UMask": "0xc80efe01", + "Unit": "CHA" + }, + { + "BriefDescription": "Code read prefetch from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_CRD_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : CRd_Prefs issued by iA Cores that Missed the LLC", + "UMask": "0xc88ffe01", + "Unit": "CHA" + }, + { + "BriefDescription": "CRD Prefetches from local IA cores to locally homed memory", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_CRD_PREF_LOCAL", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : CRd_Prefs issued by iA Cores that Missed the LLC - HOMed locally", + "UMask": "0xc88efe01", + "Unit": "CHA" + }, + { + "BriefDescription": "Data read opt from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_DRD_OPT", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : DRd_Opt issued by iA Cores that missed the LLC", + "UMask": "0xc827fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "Inserts into the TOR from local IA cores which miss the LLC and snoop filter with the opcode DRd_Opt, and which target local memory", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_DRD_OPT_LOCAL", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : DRd_Opt issued by iA Cores that Missed the LLC - HOMed locally", + "UMask": "0xc826fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "Data read opt prefetch from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_DRD_OPT_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : DRd_Opt_Prefs issued by iA Cores that missed the LLC", + "UMask": "0xc8a7fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "Inserts into the TOR from local IA cores which miss the LLC and snoop filter with the opcode DRD_PREF_OPT, and target local memory", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_DRD_OPT_PREF_LOCAL", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : DRd_Opt_Prefs issued by iA Cores that missed the LLC", + "UMask": "0xc8a6fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoM requests from local IA cores that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_ITOM", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMs issued by iA Cores that Missed LLC", + "UMask": "0xcc47fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "Last level cache prefetch code read from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LLCPREFCODE", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : LLCPrefCode issued by iA Cores that missed the LLC", + "UMask": "0xcccffe01", + "Unit": "CHA" + }, + { + "BriefDescription": "Last level cache prefetch data read from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LLCPREFDATA", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : LLCPrefData issued by iA Cores that missed the LLC", + "UMask": "0xccd7fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "Last level cache prefetch read for ownership from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LLCPREFRFO", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : LLCPrefRFO issued by iA Cores that missed the LLC", + "UMask": "0xccc7fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "WCILF requests from local IA cores to locally homed DDR addresses that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LOCAL_WCILF_DDR", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : WCiLFs issued by iA Cores targeting DDR that missed the LLC - HOMed locally", + "UMask": "0xc8668601", + "Unit": "CHA" + }, + { + "BriefDescription": "WCIL requests from local IA cores to locally homed DDR addresses that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LOCAL_WCIL_DDR", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : WCiLs issued by iA Cores targeting DDR that missed the LLC - HOMed locally", + "UMask": "0xc86e8601", + "Unit": "CHA" + }, + { + "BriefDescription": "Read for ownership from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_RFO", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : RFOs issued by iA Cores that Missed the LLC", + "UMask": "0xc807fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "Read for ownership from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_RFO_LOCAL", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : RFOs issued by iA Cores that Missed the LLC - HOMed locally", + "UMask": "0xc806fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "Read for ownership prefetch from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_RFO_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : RFO_Prefs issued by iA Cores that Missed the LLC", + "UMask": "0xc887fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "Read for ownership prefetch from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_RFO_PREF_LOCAL", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : RFO_Prefs issued by iA Cores that Missed the LLC - HOMed locally", + "UMask": "0xc886fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "UCRDF requests from local IA cores that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_UCRDF", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : UCRdFs issued by iA Cores that Missed LLC", + "UMask": "0xc877de01", + "Unit": "CHA" + }, + { + "BriefDescription": "WCIL requests from a local IA core that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_WCIL", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : WCiLs issued by iA Cores that Missed the LLC", + "UMask": "0xc86ffe01", + "Unit": "CHA" + }, + { + "BriefDescription": "WCILF requests from local IA core that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_WCILF", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : WCiLF issued by iA Cores that Missed the LLC", + "UMask": "0xc867fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "WCILF requests from local IA cores to DDR homed addresses which miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_WCILF_DDR", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : WCiLFs issued by iA Cores targeting DDR that missed the LLC", + "UMask": "0xc8678601", + "Unit": "CHA" + }, + { + "BriefDescription": "WCIL requests from local IA cores to DDR homed addresses which miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_WCIL_DDR", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : WCiLs issued by iA Cores targeting DDR that missed the LLC", + "UMask": "0xc86f8601", + "Unit": "CHA" + }, + { + "BriefDescription": "WIL requests from local IA cores that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_WIL", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : WiLs issued by iA Cores that Missed LLC", + "UMask": "0xc87fde01", + "Unit": "CHA" + }, + { + "BriefDescription": "Read for ownership from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_RFO", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : RFOs issued by iA Cores", + "UMask": "0xc807ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "Read for ownership prefetch from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_RFO_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : RFO_Prefs issued by iA Cores", + "UMask": "0xc887ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "SpecItoM events that are initiated from the Core", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_SPECITOM", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : SpecItoMs issued by iA Cores", + "UMask": "0xcc57ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "WbEFtoEs issued by iA Cores. (Non Modified Write Backs)", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_WBEFTOE", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMs issued by IO Devices that Hit the LLC", + "UMask": "0xcc3fff01", + "Unit": "CHA" + }, + { + "BriefDescription": "WbEFtoIs issued by iA Cores . (Non Modified Write Backs)", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_WBEFTOI", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMs issued by IO Devices that Hit the LLC", + "UMask": "0xcc37ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "WbMtoEs issued by iA Cores . (Modified Write Backs)", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_WBMTOE", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMs issued by IO Devices that Hit the LLC", + "UMask": "0xcc2fff01", + "Unit": "CHA" + }, + { + "BriefDescription": "WbMtoI requests from local IA cores", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_WBMTOI", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : WbMtoIs issued by iA Cores", + "UMask": "0xcc27ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "WbStoIs issued by iA Cores . (Non Modified Write Backs)", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_WBSTOI", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMs issued by IO Devices that Hit the LLC", + "UMask": "0xcc67ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "WCIL requests from a local IA core", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_WCIL", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : WCiLs issued by iA Cores", + "UMask": "0xc86fff01", + "Unit": "CHA" + }, + { + "BriefDescription": "WCILF requests from local IA core", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_WCILF", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : WCiLF issued by iA Cores", + "UMask": "0xc867ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "All TOR inserts from local IO devices", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : All requests from IO Devices", + "UMask": "0xc001ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "CLFlush requests from IO devices", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_CLFLUSH", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : CLFlushes issued by IO Devices", + "UMask": "0xc8c3ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "All TOR inserts from local IO devices which hit the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_HIT", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : All requests from IO Devices that hit the LLC", + "UMask": "0xc001fd04", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoMs from local IO devices which hit the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_HIT_ITOM", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMs issued by IO Devices that Hit the LLC", + "UMask": "0xcc43fd04", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoMCacheNears, indicating a partial write request, from IO Devices that hit the LLC", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_HIT_ITOMCACHENEAR", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMCacheNears, indicating a partial write request, from IO Devices that hit the LLC", + "UMask": "0xcd43fd04", + "Unit": "CHA" + }, + { + "BriefDescription": "PCIRDCURs issued by IO devices which hit the LLC", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_HIT_PCIRDCUR", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : PCIRdCurs issued by IO Devices that hit the LLC", + "UMask": "0xc8f3fd04", + "Unit": "CHA" + }, + { + "BriefDescription": "RFOs from local IO devices which hit the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_HIT_RFO", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : RFOs issued by IO Devices that hit the LLC", + "UMask": "0xc803fd04", + "Unit": "CHA" + }, + { + "BriefDescription": "All TOR ItoM inserts from local IO devices", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_ITOM", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMs issued by IO Devices", + "UMask": "0xcc43ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoMCacheNears, indicating a partial write request, from IO Devices", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_ITOMCACHENEAR", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMCacheNears, indicating a partial write request, from IO Devices", + "UMask": "0xcd43ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "All TOR inserts from local IO devices which miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : All requests from IO Devices that missed the LLC", + "UMask": "0xc001fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "All TOR ItoM inserts from local IO devices which miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_ITOM", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMs issued by IO Devices that missed the LLC", + "UMask": "0xcc43fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_ITOMCACHENEAR", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC", + "UMask": "0xcd43fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "PCIRDCURs issued by IO devices which miss the LLC", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_PCIRDCUR", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : PCIRdCurs issued by IO Devices that missed the LLC", + "UMask": "0xc8f3fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "All TOR RFO inserts from local IO devices which miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_RFO", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : RFOs issued by IO Devices that missed the LLC", + "UMask": "0xc803fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "PCIRDCURs issued by IO devices", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : PCIRdCurs issued by IO Devices", + "UMask": "0xc8f3ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "RFOs from local IO devices", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_RFO", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : RFOs issued by IO Devices", + "UMask": "0xc803ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "WBMtoI requests from IO devices", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_WBMTOI", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : WbMtoIs issued by IO Devices", + "UMask": "0xcc23ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Inserts for SF or LLC Evictions", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.LLC_OR_SF_EVICTIONS", + "PerPkg": "1", + "PublicDescription": "TOR allocation occurred as a result of SF/LLC evictions (came from the ISMQ)", + "UMask": "0xc001ff02", + "Unit": "CHA" + }, + { + "BriefDescription": "All locally initiated requests", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.LOC_ALL", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : All from Local iA and IO", + "UMask": "0xc000ff05", + "Unit": "CHA" + }, + { + "BriefDescription": "All from Local iA", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.LOC_IA", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : All from Local iA", + "UMask": "0xc000ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "All from Local IO", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.LOC_IO", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : All from Local IO", + "UMask": "0xc000ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "Occupancy for all TOR entries", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.ALL", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : All", + "UMask": "0xc001ffff", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for All locally initiated requests from IA Cores", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : All requests from iA Cores", + "UMask": "0xc001ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for CLFlush events that are initiated from the Core", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_CLFLUSH", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : CLFlushes issued by iA Cores", + "UMask": "0xc8c7ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for CLFlushOpt events that are initiated from the Core", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_CLFLUSHOPT", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : CLFlushOpts issued by iA Cores", + "UMask": "0xc8d7ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Code read from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_CRD", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : CRDs issued by iA Cores", + "UMask": "0xc80fff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Code read prefetch from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_CRD_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy; Code read prefetch from local IA that misses in the snoop filter", + "UMask": "0xc88fff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Data read opt prefetch from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_DRD_OPT_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : DRd_Opt_Prefs issued by iA Cores", + "UMask": "0xc8a7ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for All locally initiated requests from IA Cores which hit the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : All requests from iA Cores that Hit the LLC", + "UMask": "0xc001fd01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Code read from local IA that hit the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_CRD", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : CRds issued by iA Cores that Hit the LLC", + "UMask": "0xc80ffd01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Code read prefetch from local IA that hit the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_CRD_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : CRd_Prefs issued by iA Cores that hit the LLC", + "UMask": "0xc88ffd01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Data read opt prefetch from local IA that hit the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_DRD_OPT_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : DRd_Opt_Prefs issued by iA Cores that hit the LLC", + "UMask": "0xc8a7fd01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for ItoM requests from local IA cores that hit the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_ITOM", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : ItoMs issued by iA Cores that Hit LLC", + "UMask": "0xcc47fd01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Last level cache prefetch code read from local IA that hit the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_LLCPREFCODE", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : LLCPrefCode issued by iA Cores that hit the LLC", + "UMask": "0xcccffd01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Last level cache prefetch data read from local IA that hit the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_LLCPREFDATA", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : LLCPrefData issued by iA Cores that hit the LLC", + "UMask": "0xccd7fd01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Last level cache prefetch read for ownership from local IA that hit the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_LLCPREFRFO", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : LLCPrefRFO issued by iA Cores that hit the LLC", + "UMask": "0xccc7fd01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Read for ownership from local IA that hit the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_RFO", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : RFOs issued by iA Cores that Hit the LLC", + "UMask": "0xc807fd01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Read for ownership prefetch from local IA that hit the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_RFO_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : RFO_Prefs issued by iA Cores that Hit the LLC", + "UMask": "0xc887fd01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for ItoM events that are initiated from the Core", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_ITOM", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : ItoMs issued by iA Cores", + "UMask": "0xcc47ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for ItoMCacheNear requests from local IA cores", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_ITOMCACHENEAR", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : ItoMCacheNears issued by iA Cores", + "UMask": "0xcd47ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Last level cache prefetch code read from local IA.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_LLCPREFCODE", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : LLCPrefCode issued by iA Cores", + "UMask": "0xcccfff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Last level cache prefetch data read from local IA.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_LLCPREFDATA", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : LLCPrefData issued by iA Cores", + "UMask": "0xccd7ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Last level cache prefetch read for ownership from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_LLCPREFRFO", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : LLCPrefRFO issued by iA Cores", + "UMask": "0xccc7ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for All locally initiated requests from IA Cores which miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : All requests from iA Cores that Missed the LLC", + "UMask": "0xc001fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Code read from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_CRD", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : CRds issued by iA Cores that Missed the LLC", + "UMask": "0xc80ffe01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for CRDs from local IA cores to locally homed memory", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_CRD_LOCAL", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : CRd issued by iA Cores that Missed the LLC - HOMed locally", + "UMask": "0xc80efe01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Code read prefetch from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_CRD_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : CRd_Prefs issued by iA Cores that Missed the LLC", + "UMask": "0xc88ffe01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for CRD Prefetches from local IA cores to locally homed memory", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_CRD_PREF_LOCAL", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : CRd_Prefs issued by iA Cores that Missed the LLC - HOMed locally", + "UMask": "0xc88efe01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Data read opt prefetch from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_OPT_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : DRd_Opt_Prefs issued by iA Cores that missed the LLC", + "UMask": "0xc8a7fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for ItoM requests from local IA cores that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_ITOM", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : ItoMs issued by iA Cores that Missed LLC", + "UMask": "0xcc47fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Last level cache prefetch code read from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LLCPREFCODE", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : LLCPrefCode issued by iA Cores that missed the LLC", + "UMask": "0xcccffe01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Last level cache prefetch data read from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LLCPREFDATA", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : LLCPrefData issued by iA Cores that missed the LLC", + "UMask": "0xccd7fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Last level cache prefetch read for ownership from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LLCPREFRFO", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : LLCPrefRFO issued by iA Cores that missed the LLC", + "UMask": "0xccc7fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WCILF requests from local IA cores to locally homed DDR addresses that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LOCAL_WCILF_DDR", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : WCiLFs issued by iA Cores targeting DDR that missed the LLC - HOMed locally", + "UMask": "0xc8668601", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WCIL requests from local IA cores to locally homed DDR addresses that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LOCAL_WCIL_DDR", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : WCiLs issued by iA Cores targeting DDR that missed the LLC - HOMed locally", + "UMask": "0xc86e8601", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Read for ownership from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_RFO", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : RFOs issued by iA Cores that Missed the LLC", + "UMask": "0xc807fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Read for ownership from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_RFO_LOCAL", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : RFOs issued by iA Cores that Missed the LLC - HOMed locally", + "UMask": "0xc806fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Read for ownership prefetch from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_RFO_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : RFO_Prefs issued by iA Cores that Missed the LLC", + "UMask": "0xc887fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Read for ownership prefetch from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_RFO_PREF_LOCAL", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : RFO_Prefs issued by iA Cores that Missed the LLC - HOMed locally", + "UMask": "0xc886fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for UCRDF requests from local IA cores that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_UCRDF", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : UCRdFs issued by iA Cores that Missed LLC", + "UMask": "0xc877de01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WCIL requests from a local IA core that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_WCIL", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : WCiLs issued by iA Cores that Missed the LLC", + "UMask": "0xc86ffe01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WCILF requests from local IA core that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_WCILF", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : WCiLF issued by iA Cores that Missed the LLC", + "UMask": "0xc867fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WCILF requests from local IA cores to DDR homed addresses which miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_WCILF_DDR", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : WCiLFs issued by iA Cores targeting DDR that missed the LLC", + "UMask": "0xc8678601", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WCIL requests from local IA cores to DDR homed addresses which miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_WCIL_DDR", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : WCiLs issued by iA Cores targeting DDR that missed the LLC", + "UMask": "0xc86f8601", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WIL requests from local IA cores that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_WIL", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : WiLs issued by iA Cores that Missed LLC", + "UMask": "0xc87fde01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Read for ownership from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_RFO", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : RFOs issued by iA Cores", + "UMask": "0xc807ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Read for ownership prefetch from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_RFO_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : RFO_Prefs issued by iA Cores", + "UMask": "0xc887ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for SpecItoM events that are initiated from the Core", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_SPECITOM", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : SpecItoMs issued by iA Cores", + "UMask": "0xcc57ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WbMtoI requests from local IA cores", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_WBMTOI", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : WbMtoIs issued by iA Cores", + "UMask": "0xcc27ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WCIL requests from a local IA core", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_WCIL", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : WCiLs issued by iA Cores", + "UMask": "0xc86fff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WCILF requests from local IA core", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_WCILF", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : WCiLF issued by iA Cores", + "UMask": "0xc867ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for All TOR inserts from local IO devices", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : All requests from IO Devices", + "UMask": "0xc001ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for CLFlush requests from IO devices", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_CLFLUSH", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : CLFlushes issued by IO Devices", + "UMask": "0xc8c3ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for All TOR inserts from local IO devices which hit the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_HIT", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : All requests from IO Devices that hit the LLC", + "UMask": "0xc001fd04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for ItoMs from local IO devices which hit the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_HIT_ITOM", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : ItoMs issued by IO Devices that Hit the LLC", + "UMask": "0xcc43fd04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for ItoMCacheNears, indicating a partial write request, from IO Devices that hit the LLC", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_HIT_ITOMCACHENEAR", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : ItoMCacheNears, indicating a partial write request, from IO Devices that hit the LLC", + "UMask": "0xcd43fd04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for PCIRDCURs issued by IO devices which hit the LLC", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_HIT_PCIRDCUR", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : PCIRdCurs issued by IO Devices that hit the LLC", + "UMask": "0xc8f3fd04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for RFOs from local IO devices which hit the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_HIT_RFO", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : RFOs issued by IO Devices that hit the LLC", + "UMask": "0xc803fd04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for All TOR ItoM inserts from local IO devices", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_ITOM", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : ItoMs issued by IO Devices", + "UMask": "0xcc43ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for ItoMCacheNears, indicating a partial write request, from IO Devices", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_ITOMCACHENEAR", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : ItoMCacheNears, indicating a partial write request, from IO Devices", + "UMask": "0xcd43ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for All TOR inserts from local IO devices which miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : All requests from IO Devices that missed the LLC", + "UMask": "0xc001fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for All TOR ItoM inserts from local IO devices which miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOM", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : ItoMs issued by IO Devices that missed the LLC", + "UMask": "0xcc43fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOMCACHENEAR", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC", + "UMask": "0xcd43fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for PCIRDCURs issued by IO devices which miss the LLC", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_PCIRDCUR", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : PCIRdCurs issued by IO Devices that missed the LLC", + "UMask": "0xc8f3fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for All TOR RFO inserts from local IO devices which miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_RFO", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : RFOs issued by IO Devices that missed the LLC", + "UMask": "0xc803fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for PCIRDCURs issued by IO devices", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_PCIRDCUR", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : PCIRdCurs issued by IO Devices", + "UMask": "0xc8f3ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for RFOs from local IO devices", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_RFO", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : RFOs issued by IO Devices", + "UMask": "0xc803ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WBMtoI requests from IO devices", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_WBMTOI", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : WbMtoIs issued by IO Devices", + "UMask": "0xcc23ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for All locally initiated requests", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.LOC_ALL", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : All from Local iA and IO", + "UMask": "0xc000ff05", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for All from Local iA", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.LOC_IA", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : All from Local iA", + "UMask": "0xc000ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for All from Local IO", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.LOC_IO", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : All from Local IO", + "UMask": "0xc000ff04", + "Unit": "CHA" + } +] diff --git a/tools/perf/pmu-events/arch/x86/grandridge/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/grandridge/uncore-interconnect.json new file mode 100644 index 0000000000..9091f8fde5 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/grandridge/uncore-interconnect.json @@ -0,0 +1,175 @@ +[ + { + "BriefDescription": "Clockticks of the mesh to memory (B2CMI)", + "EventCode": "0x01", + "EventName": "UNC_B2CMI_CLOCKTICKS", + "PerPkg": "1", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Counts the number of times B2CMI egress did D2C (direct to core)", + "EventCode": "0x16", + "EventName": "UNC_B2CMI_DIRECT2CORE_TAKEN", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Counts the number of times D2C wasn't honoured even though the incoming request had d2c set for non cisgress txn", + "EventCode": "0x18", + "EventName": "UNC_B2CMI_DIRECT2CORE_TXN_OVERRIDE", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Counts any read", + "EventCode": "0x24", + "EventName": "UNC_B2CMI_IMC_READS.ALL", + "PerPkg": "1", + "UMask": "0x104", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Counts normal reads issue to CMI", + "EventCode": "0x24", + "EventName": "UNC_B2CMI_IMC_READS.NORMAL", + "PerPkg": "1", + "UMask": "0x101", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Counts reads to 1lm non persistent memory regions", + "EventCode": "0x24", + "EventName": "UNC_B2CMI_IMC_READS.TO_DDR_AS_MEM", + "PerPkg": "1", + "UMask": "0x108", + "Unit": "B2CMI" + }, + { + "BriefDescription": "All Writes - All Channels", + "EventCode": "0x25", + "EventName": "UNC_B2CMI_IMC_WRITES.ALL", + "PerPkg": "1", + "UMask": "0x110", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Full Non-ISOCH - All Channels", + "EventCode": "0x25", + "EventName": "UNC_B2CMI_IMC_WRITES.FULL", + "PerPkg": "1", + "UMask": "0x101", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Partial Non-ISOCH - All Channels", + "EventCode": "0x25", + "EventName": "UNC_B2CMI_IMC_WRITES.PARTIAL", + "PerPkg": "1", + "UMask": "0x102", + "Unit": "B2CMI" + }, + { + "BriefDescription": "DDR - All Channels", + "EventCode": "0x25", + "EventName": "UNC_B2CMI_IMC_WRITES.TO_DDR_AS_MEM", + "PerPkg": "1", + "UMask": "0x120", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Prefetch CAM Inserts : XPT - Ch 0", + "EventCode": "0x56", + "EventName": "UNC_B2CMI_PREFCAM_INSERTS.CH0_XPT", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Prefetch CAM Inserts : XPT -All Channels", + "EventCode": "0x56", + "EventName": "UNC_B2CMI_PREFCAM_INSERTS.XPT_ALLCH", + "PerPkg": "1", + "PublicDescription": "Prefetch CAM Inserts : XPT - All Channels", + "UMask": "0x1", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Prefetch CAM Occupancy : Channel 0", + "EventCode": "0x54", + "EventName": "UNC_B2CMI_PREFCAM_OCCUPANCY.CH0", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Tracker Inserts : Channel 0", + "EventCode": "0x32", + "EventName": "UNC_B2CMI_TRACKER_INSERTS.CH0", + "PerPkg": "1", + "UMask": "0x104", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Tracker Occupancy : Channel 0", + "EventCode": "0x33", + "EventName": "UNC_B2CMI_TRACKER_OCCUPANCY.CH0", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Write Tracker Inserts : Channel 0", + "EventCode": "0x40", + "EventName": "UNC_B2CMI_WR_TRACKER_INSERTS.CH0", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Total Write Cache Occupancy : Mem", + "EventCode": "0x0F", + "EventName": "UNC_I_CACHE_TOTAL_OCCUPANCY.MEM", + "PerPkg": "1", + "UMask": "0x4", + "Unit": "IRP" + }, + { + "BriefDescription": "IRP Clockticks", + "EventCode": "0x01", + "EventName": "UNC_I_CLOCKTICKS", + "PerPkg": "1", + "Unit": "IRP" + }, + { + "BriefDescription": "Inbound read requests received by the IRP and inserted into the FAF queue", + "EventCode": "0x18", + "EventName": "UNC_I_FAF_INSERTS", + "PerPkg": "1", + "Unit": "IRP" + }, + { + "BriefDescription": "FAF occupancy", + "EventCode": "0x19", + "EventName": "UNC_I_FAF_OCCUPANCY", + "PerPkg": "1", + "Unit": "IRP" + }, + { + "BriefDescription": "Misc Events - Set 1 : Lost Forward : Snoop pulled away ownership before a write was committed", + "EventCode": "0x1F", + "EventName": "UNC_I_MISC1.LOST_FWD", + "PerPkg": "1", + "UMask": "0x10", + "Unit": "IRP" + }, + { + "BriefDescription": "Inbound write (fast path) requests to coherent memory, received by the IRP resulting in write ownership requests issued by IRP to the mesh.", + "EventCode": "0x11", + "EventName": "UNC_I_TRANSACTIONS.WR_PREF", + "PerPkg": "1", + "UMask": "0x8", + "Unit": "IRP" + } +] diff --git a/tools/perf/pmu-events/arch/x86/grandridge/uncore-io.json b/tools/perf/pmu-events/arch/x86/grandridge/uncore-io.json new file mode 100644 index 0000000000..c301ef95ae --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/grandridge/uncore-io.json @@ -0,0 +1,1187 @@ +[ + { + "BriefDescription": "IIO Clockticks", + "EventCode": "0x01", + "EventName": "UNC_IIO_CLOCKTICKS", + "PerPkg": "1", + "PortMask": "0x000", + "Unit": "IIO" + }, + { + "BriefDescription": "PCIE Completion Buffer Inserts. Counts once per 64 byte read issued from this PCIE device.", + "EventCode": "0xC2", + "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.ALL_PARTS", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x70ff004", + "Unit": "IIO" + }, + { + "BriefDescription": "PCIE Completion Buffer Inserts. Counts once per 64 byte read issued from this PCIE device.", + "EventCode": "0xC2", + "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.PART0", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x001", + "UMask": "0x7001004", + "Unit": "IIO" + }, + { + "BriefDescription": "PCIE Completion Buffer Inserts. Counts once per 64 byte read issued from this PCIE device.", + "EventCode": "0xC2", + "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.PART1", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x002", + "UMask": "0x7002004", + "Unit": "IIO" + }, + { + "BriefDescription": "PCIE Completion Buffer Inserts. Counts once per 64 byte read issued from this PCIE device.", + "EventCode": "0xC2", + "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.PART2", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x004", + "UMask": "0x7004004", + "Unit": "IIO" + }, + { + "BriefDescription": "PCIE Completion Buffer Inserts. Counts once per 64 byte read issued from this PCIE device.", + "EventCode": "0xC2", + "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.PART3", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x008", + "UMask": "0x7008004", + "Unit": "IIO" + }, + { + "BriefDescription": "PCIE Completion Buffer Inserts. Counts once per 64 byte read issued from this PCIE device.", + "EventCode": "0xC2", + "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.PART4", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x010", + "UMask": "0x7010004", + "Unit": "IIO" + }, + { + "BriefDescription": "PCIE Completion Buffer Inserts. Counts once per 64 byte read issued from this PCIE device.", + "EventCode": "0xC2", + "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.PART5", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x020", + "UMask": "0x7020004", + "Unit": "IIO" + }, + { + "BriefDescription": "PCIE Completion Buffer Inserts. Counts once per 64 byte read issued from this PCIE device.", + "EventCode": "0xC2", + "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.PART6", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x040", + "UMask": "0x7040004", + "Unit": "IIO" + }, + { + "BriefDescription": "PCIE Completion Buffer Inserts. Counts once per 64 byte read issued from this PCIE device.", + "EventCode": "0xC2", + "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.PART7", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x080", + "UMask": "0x7080004", + "Unit": "IIO" + }, + { + "BriefDescription": "Count of allocations in the completion buffer", + "EventCode": "0xD5", + "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.ALL_PARTS", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x70ff0ff", + "Unit": "IIO" + }, + { + "BriefDescription": "Count of allocations in the completion buffer", + "EventCode": "0xD5", + "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.PART0", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x001", + "UMask": "0x7001001", + "Unit": "IIO" + }, + { + "BriefDescription": "Count of allocations in the completion buffer", + "EventCode": "0xD5", + "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.PART1", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x002", + "UMask": "0x7002002", + "Unit": "IIO" + }, + { + "BriefDescription": "Count of allocations in the completion buffer", + "EventCode": "0xD5", + "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.PART2", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x004", + "UMask": "0x7004004", + "Unit": "IIO" + }, + { + "BriefDescription": "Count of allocations in the completion buffer", + "EventCode": "0xD5", + "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.PART3", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x008", + "UMask": "0x7008008", + "Unit": "IIO" + }, + { + "BriefDescription": "Count of allocations in the completion buffer", + "EventCode": "0xD5", + "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.PART4", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x010", + "UMask": "0x7010010", + "Unit": "IIO" + }, + { + "BriefDescription": "Count of allocations in the completion buffer", + "EventCode": "0xD5", + "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.PART5", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x020", + "UMask": "0x7020020", + "Unit": "IIO" + }, + { + "BriefDescription": "Count of allocations in the completion buffer", + "EventCode": "0xD5", + "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.PART6", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x040", + "UMask": "0x7040040", + "Unit": "IIO" + }, + { + "BriefDescription": "Count of allocations in the completion buffer", + "EventCode": "0xD5", + "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.PART7", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x080", + "UMask": "0x7080080", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.ALL_PARTS", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x70ff004", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART0", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x001", + "UMask": "0x7001004", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART1", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x002", + "UMask": "0x7002004", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART2", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x004", + "UMask": "0x7004004", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART3", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x008", + "UMask": "0x7008004", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART4", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x010", + "UMask": "0x7010004", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART5", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x020", + "UMask": "0x7020004", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART6", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x040", + "UMask": "0x7040004", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART7", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x080", + "UMask": "0x7080004", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.ALL_PARTS", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x70ff001", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART0", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x001", + "UMask": "0x7001001", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART1", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x002", + "UMask": "0x7002001", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART2", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x004", + "UMask": "0x7004001", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART3", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x008", + "UMask": "0x7008001", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART4", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x010", + "UMask": "0x7010001", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART5", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x020", + "UMask": "0x7020001", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART6", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x040", + "UMask": "0x7040001", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART7", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x080", + "UMask": "0x7080001", + "Unit": "IIO" + }, + { + "BriefDescription": "Four byte data request of the CPU : Card reading from DRAM", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART0", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x001", + "UMask": "0x7001004", + "Unit": "IIO" + }, + { + "BriefDescription": "Four byte data request of the CPU : Card reading from DRAM", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART1", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x02", + "UMask": "0x7002004", + "Unit": "IIO" + }, + { + "BriefDescription": "Four byte data request of the CPU : Card reading from DRAM", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART2", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x04", + "UMask": "0x7004004", + "Unit": "IIO" + }, + { + "BriefDescription": "Four byte data request of the CPU : Card reading from DRAM", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART3", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x08", + "UMask": "0x7008004", + "Unit": "IIO" + }, + { + "BriefDescription": "Four byte data request of the CPU : Card reading from DRAM", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART4", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x10", + "UMask": "0x7010004", + "Unit": "IIO" + }, + { + "BriefDescription": "Four byte data request of the CPU : Card reading from DRAM", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART5", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x20", + "UMask": "0x7020004", + "Unit": "IIO" + }, + { + "BriefDescription": "Four byte data request of the CPU : Card reading from DRAM", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART6", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x40", + "UMask": "0x7040004", + "Unit": "IIO" + }, + { + "BriefDescription": "Four byte data request of the CPU : Card reading from DRAM", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART7", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x80", + "UMask": "0x7080004", + "Unit": "IIO" + }, + { + "BriefDescription": "Four byte data request of the CPU : Card writing to DRAM", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART0", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x001", + "UMask": "0x7001001", + "Unit": "IIO" + }, + { + "BriefDescription": "Four byte data request of the CPU : Card writing to DRAM", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART1", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x02", + "UMask": "0x7002001", + "Unit": "IIO" + }, + { + "BriefDescription": "Four byte data request of the CPU : Card writing to DRAM", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART2", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x04", + "UMask": "0x7004001", + "Unit": "IIO" + }, + { + "BriefDescription": "Four byte data request of the CPU : Card writing to DRAM", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART3", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x08", + "UMask": "0x7008001", + "Unit": "IIO" + }, + { + "BriefDescription": "Four byte data request of the CPU : Card writing to DRAM", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART4", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x10", + "UMask": "0x7010001", + "Unit": "IIO" + }, + { + "BriefDescription": "Four byte data request of the CPU : Card writing to DRAM", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART5", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x20", + "UMask": "0x7020001", + "Unit": "IIO" + }, + { + "BriefDescription": "Four byte data request of the CPU : Card writing to DRAM", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART6", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x40", + "UMask": "0x7040001", + "Unit": "IIO" + }, + { + "BriefDescription": "Four byte data request of the CPU : Card writing to DRAM", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART7", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x80", + "UMask": "0x7080001", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested of the CPU : Card writing to another Card (same or different stack)", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_WRITE.PART0", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x001", + "UMask": "0x7001002", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested of the CPU : Card writing to another Card (same or different stack)", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_WRITE.PART1", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x002", + "UMask": "0x7002002", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested of the CPU : Card writing to another Card (same or different stack)", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_WRITE.PART2", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x004", + "UMask": "0x7004002", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested of the CPU : Card writing to another Card (same or different stack)", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_WRITE.PART3", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x008", + "UMask": "0x7008002", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested of the CPU : Card writing to another Card (same or different stack)", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_WRITE.PART4", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x010", + "UMask": "0x7010002", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested of the CPU : Card writing to another Card (same or different stack)", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_WRITE.PART5", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x020", + "UMask": "0x7020002", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested of the CPU : Card writing to another Card (same or different stack)", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_WRITE.PART6", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x040", + "UMask": "0x7040002", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested of the CPU : Card writing to another Card (same or different stack)", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_WRITE.PART7", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x080", + "UMask": "0x7080002", + "Unit": "IIO" + }, + { + "BriefDescription": "IOTLB Hits to a 1G Page", + "EventCode": "0x40", + "EventName": "UNC_IIO_IOMMU0.1G_HITS", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x10", + "Unit": "IIO" + }, + { + "BriefDescription": "IOTLB Hits to a 2M Page", + "EventCode": "0x40", + "EventName": "UNC_IIO_IOMMU0.2M_HITS", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x8", + "Unit": "IIO" + }, + { + "BriefDescription": "IOTLB Hits to a 4K Page", + "EventCode": "0x40", + "EventName": "UNC_IIO_IOMMU0.4K_HITS", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x4", + "Unit": "IIO" + }, + { + "BriefDescription": "Context cache hits", + "EventCode": "0x40", + "EventName": "UNC_IIO_IOMMU0.CTXT_CACHE_HITS", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x80", + "Unit": "IIO" + }, + { + "BriefDescription": "Context cache lookups", + "EventCode": "0x40", + "EventName": "UNC_IIO_IOMMU0.CTXT_CACHE_LOOKUPS", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x40", + "Unit": "IIO" + }, + { + "BriefDescription": "IOTLB lookups first", + "EventCode": "0x40", + "EventName": "UNC_IIO_IOMMU0.FIRST_LOOKUPS", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x1", + "Unit": "IIO" + }, + { + "BriefDescription": "IOTLB Fills (same as IOTLB miss)", + "EventCode": "0x40", + "EventName": "UNC_IIO_IOMMU0.MISSES", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x20", + "Unit": "IIO" + }, + { + "BriefDescription": "IOMMU memory access (both low and high priority)", + "EventCode": "0x41", + "EventName": "UNC_IIO_IOMMU1.NUM_MEM_ACCESSES", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0xc0", + "Unit": "IIO" + }, + { + "BriefDescription": "Second Level Page Walk Cache Hit to a 1G page", + "EventCode": "0x41", + "EventName": "UNC_IIO_IOMMU1.SLPWC_1G_HITS", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x4", + "Unit": "IIO" + }, + { + "BriefDescription": "Second Level Page Walk Cache Hit to a 256T page", + "EventCode": "0x41", + "EventName": "UNC_IIO_IOMMU1.SLPWC_256T_HITS", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x10", + "Unit": "IIO" + }, + { + "BriefDescription": "Second Level Page Walk Cache Hit to a 512G page", + "EventCode": "0x41", + "EventName": "UNC_IIO_IOMMU1.SLPWC_512G_HITS", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x8", + "Unit": "IIO" + }, + { + "BriefDescription": "-", + "EventCode": "0x8e", + "EventName": "UNC_IIO_NUM_REQ_OF_CPU_BY_TGT.ABORT", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x70ff080", + "Unit": "IIO" + }, + { + "BriefDescription": "-", + "EventCode": "0x8e", + "EventName": "UNC_IIO_NUM_REQ_OF_CPU_BY_TGT.CONFINED_P2P", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x70ff040", + "Unit": "IIO" + }, + { + "BriefDescription": "-", + "EventCode": "0x8e", + "EventName": "UNC_IIO_NUM_REQ_OF_CPU_BY_TGT.LOC_P2P", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x70ff020", + "Unit": "IIO" + }, + { + "BriefDescription": "-", + "EventCode": "0x8e", + "EventName": "UNC_IIO_NUM_REQ_OF_CPU_BY_TGT.MCAST", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x70ff002", + "Unit": "IIO" + }, + { + "BriefDescription": "-", + "EventCode": "0x8e", + "EventName": "UNC_IIO_NUM_REQ_OF_CPU_BY_TGT.MEM", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x70ff008", + "Unit": "IIO" + }, + { + "BriefDescription": "-", + "EventCode": "0x8e", + "EventName": "UNC_IIO_NUM_REQ_OF_CPU_BY_TGT.MSGB", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x70ff001", + "Unit": "IIO" + }, + { + "BriefDescription": "-", + "EventCode": "0x8e", + "EventName": "UNC_IIO_NUM_REQ_OF_CPU_BY_TGT.UBOX", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x70ff004", + "Unit": "IIO" + }, + { + "BriefDescription": "All 9 bits of Page Walk Tracker Occupancy", + "EventCode": "0x42", + "EventName": "UNC_IIO_PWT_OCCUPANCY", + "PerPkg": "1", + "PortMask": "0x000", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Core reading from Cards MMIO space", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART0", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x001", + "UMask": "0x7001004", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Core reading from Cards MMIO space", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART1", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x002", + "UMask": "0x7002004", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Core reading from Cards MMIO space", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART2", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x004", + "UMask": "0x7004004", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Core reading from Cards MMIO space", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART3", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x008", + "UMask": "0x7008004", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Core reading from Cards MMIO space", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART4", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x010", + "UMask": "0x7010004", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Core reading from Cards MMIO space", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART5", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x020", + "UMask": "0x7020004", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Core reading from Cards MMIO space", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART6", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x040", + "UMask": "0x7040004", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Core reading from Cards MMIO space", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART7", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x080", + "UMask": "0x7080004", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART0", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x001", + "UMask": "0x7001001", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART1", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x002", + "UMask": "0x7002001", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART2", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x004", + "UMask": "0x7004001", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART3", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x008", + "UMask": "0x7008001", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART4", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x010", + "UMask": "0x7010001", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART5", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x020", + "UMask": "0x7020001", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART6", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x040", + "UMask": "0x7040001", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART7", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x080", + "UMask": "0x7080001", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card reading from DRAM", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART0", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x001", + "UMask": "0x7001004", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card reading from DRAM", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART1", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x002", + "UMask": "0x7002004", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card reading from DRAM", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART2", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x004", + "UMask": "0x7004004", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card reading from DRAM", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART3", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x008", + "UMask": "0x7008004", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card reading from DRAM", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART4", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x010", + "UMask": "0x7010004", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card reading from DRAM", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART5", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x020", + "UMask": "0x7020004", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card reading from DRAM", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART6", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x040", + "UMask": "0x7040004", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card reading from DRAM", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART7", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x080", + "UMask": "0x7080004", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card writing to DRAM", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART0", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x001", + "UMask": "0x7001001", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card writing to DRAM", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART1", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x002", + "UMask": "0x7002001", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card writing to DRAM", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART2", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x004", + "UMask": "0x7004001", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card writing to DRAM", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART3", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x008", + "UMask": "0x7008001", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card writing to DRAM", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART4", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x010", + "UMask": "0x7010001", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card writing to DRAM", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART5", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x020", + "UMask": "0x7020001", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card writing to DRAM", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART6", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x040", + "UMask": "0x7040001", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card writing to DRAM", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART7", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x080", + "UMask": "0x7080001", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card writing to another Card (same or different stack)", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_WRITE.PART0", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x001", + "UMask": "0x7001002", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card writing to another Card (same or different stack)", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_WRITE.PART1", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x002", + "UMask": "0x7002002", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card writing to another Card (same or different stack)", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_WRITE.PART2", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x004", + "UMask": "0x7004002", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card writing to another Card (same or different stack)", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_WRITE.PART3", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x008", + "UMask": "0x7008002", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card writing to another Card (same or different stack)", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_WRITE.PART4", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x010", + "UMask": "0x7010002", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card writing to another Card (same or different stack)", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_WRITE.PART5", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x020", + "UMask": "0x7020002", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card writing to another Card (same or different stack)", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_WRITE.PART6", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x040", + "UMask": "0x7040002", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card writing to another Card (same or different stack)", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_WRITE.PART7", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x080", + "UMask": "0x7080002", + "Unit": "IIO" + } +] diff --git a/tools/perf/pmu-events/arch/x86/grandridge/uncore-memory.json b/tools/perf/pmu-events/arch/x86/grandridge/uncore-memory.json new file mode 100644 index 0000000000..a2405ed640 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/grandridge/uncore-memory.json @@ -0,0 +1,385 @@ +[ + { + "BriefDescription": "DRAM Activate Count : Counts the number of DRAM Activate commands sent on this channel. Activate commands are issued to open up a page on the DRAM devices so that it can be read or written to with a CAS. One can calculate the number of Page Misses by subtracting the number of Page Miss precharges from the number of Activates.", + "EventCode": "0x02", + "EventName": "UNC_M_ACT_COUNT.ALL", + "PerPkg": "1", + "UMask": "0xf7", + "Unit": "IMC" + }, + { + "BriefDescription": "DRAM Activate Count : Read transaction on Page Empty or Page Miss : Counts the number of DRAM Activate commands sent on this channel. Activate commands are issued to open up a page on the DRAM devices so that it can be read or written to with a CAS. One can calculate the number of Page Misses by subtracting the number of Page Miss precharges from the number of Activates.", + "EventCode": "0x02", + "EventName": "UNC_M_ACT_COUNT.RD", + "PerPkg": "1", + "UMask": "0xf1", + "Unit": "IMC" + }, + { + "BriefDescription": "DRAM Activate Count : Underfill Read transaction on Page Empty or Page Miss : Counts the number of DRAM Activate commands sent on this channel. Activate commands are issued to open up a page on the DRAM devices so that it can be read or written to with a CAS. One can calculate the number of Page Misses by subtracting the number of Page Miss precharges from the number of Activates.", + "EventCode": "0x02", + "EventName": "UNC_M_ACT_COUNT.UFILL", + "PerPkg": "1", + "UMask": "0xf4", + "Unit": "IMC" + }, + { + "BriefDescription": "DRAM Activate Count : Write transaction on Page Empty or Page Miss : Counts the number of DRAM Activate commands sent on this channel. Activate commands are issued to open up a page on the DRAM devices so that it can be read or written to with a CAS. One can calculate the number of Page Misses by subtracting the number of Page Miss precharges from the number of Activates.", + "EventCode": "0x02", + "EventName": "UNC_M_ACT_COUNT.WR", + "PerPkg": "1", + "UMask": "0xf2", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 0, all CAS operations", + "EventCode": "0x05", + "EventName": "UNC_M_CAS_COUNT_SCH0.ALL", + "PerPkg": "1", + "UMask": "0xff", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 0, all reads", + "EventCode": "0x05", + "EventName": "UNC_M_CAS_COUNT_SCH0.RD", + "PerPkg": "1", + "UMask": "0xcf", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 0 regular reads", + "EventCode": "0x05", + "EventName": "UNC_M_CAS_COUNT_SCH0.RD_REG", + "PerPkg": "1", + "UMask": "0xc1", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 0 underfill reads", + "EventCode": "0x05", + "EventName": "UNC_M_CAS_COUNT_SCH0.RD_UNDERFILL", + "PerPkg": "1", + "UMask": "0xc4", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 0, all writes", + "EventCode": "0x05", + "EventName": "UNC_M_CAS_COUNT_SCH0.WR", + "PerPkg": "1", + "UMask": "0xf0", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 0 regular writes", + "EventCode": "0x05", + "EventName": "UNC_M_CAS_COUNT_SCH0.WR_NONPRE", + "PerPkg": "1", + "UMask": "0xd0", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 0 auto-precharge writes", + "EventCode": "0x05", + "EventName": "UNC_M_CAS_COUNT_SCH0.WR_PRE", + "PerPkg": "1", + "UMask": "0xe0", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 1, all CAS operations", + "EventCode": "0x06", + "EventName": "UNC_M_CAS_COUNT_SCH1.ALL", + "PerPkg": "1", + "UMask": "0xff", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 1, all reads", + "EventCode": "0x06", + "EventName": "UNC_M_CAS_COUNT_SCH1.RD", + "PerPkg": "1", + "UMask": "0xcf", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 1 regular reads", + "EventCode": "0x06", + "EventName": "UNC_M_CAS_COUNT_SCH1.RD_REG", + "PerPkg": "1", + "UMask": "0xc1", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 1 underfill reads", + "EventCode": "0x06", + "EventName": "UNC_M_CAS_COUNT_SCH1.RD_UNDERFILL", + "PerPkg": "1", + "UMask": "0xc4", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 1, all writes", + "EventCode": "0x06", + "EventName": "UNC_M_CAS_COUNT_SCH1.WR", + "PerPkg": "1", + "UMask": "0xf0", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 1 regular writes", + "EventCode": "0x06", + "EventName": "UNC_M_CAS_COUNT_SCH1.WR_NONPRE", + "PerPkg": "1", + "UMask": "0xd0", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 1 auto-precharge writes", + "EventCode": "0x06", + "EventName": "UNC_M_CAS_COUNT_SCH1.WR_PRE", + "PerPkg": "1", + "UMask": "0xe0", + "Unit": "IMC" + }, + { + "BriefDescription": "Number of DRAM DCLK clock cycles while the event is enabled", + "EventCode": "0x01", + "EventName": "UNC_M_CLOCKTICKS", + "PerPkg": "1", + "PublicDescription": "DRAM Clockticks", + "UMask": "0x1", + "Unit": "IMC" + }, + { + "BriefDescription": "Number of DRAM HCLK clock cycles while the event is enabled", + "EventCode": "0x01", + "EventName": "UNC_M_HCLOCKTICKS", + "PerPkg": "1", + "PublicDescription": "DRAM Clockticks", + "Unit": "IMC" + }, + { + "BriefDescription": "DRAM Precharge commands. : Counts the number of DRAM Precharge commands sent on this channel.", + "EventCode": "0x03", + "EventName": "UNC_M_PRE_COUNT.ALL", + "PerPkg": "1", + "UMask": "0xff", + "Unit": "IMC" + }, + { + "BriefDescription": "DRAM Precharge commands. : Precharge due to (?) : Counts the number of DRAM Precharge commands sent on this channel.", + "EventCode": "0x03", + "EventName": "UNC_M_PRE_COUNT.PGT", + "PerPkg": "1", + "UMask": "0xf8", + "Unit": "IMC" + }, + { + "BriefDescription": "DRAM Precharge commands. : Counts the number of DRAM Precharge commands sent on this channel.", + "EventCode": "0x03", + "EventName": "UNC_M_PRE_COUNT.RD", + "PerPkg": "1", + "UMask": "0xf1", + "Unit": "IMC" + }, + { + "BriefDescription": "DRAM Precharge commands. : Counts the number of DRAM Precharge commands sent on this channel.", + "EventCode": "0x03", + "EventName": "UNC_M_PRE_COUNT.UFILL", + "PerPkg": "1", + "UMask": "0xf4", + "Unit": "IMC" + }, + { + "BriefDescription": "DRAM Precharge commands. : Counts the number of DRAM Precharge commands sent on this channel.", + "EventCode": "0x03", + "EventName": "UNC_M_PRE_COUNT.WR", + "PerPkg": "1", + "UMask": "0xf2", + "Unit": "IMC" + }, + { + "BriefDescription": "Read buffer inserts on subchannel 0", + "EventCode": "0x17", + "EventName": "UNC_M_RDB_INSERTS.SCH0", + "PerPkg": "1", + "UMask": "0x40", + "Unit": "IMC" + }, + { + "BriefDescription": "Read buffer inserts on subchannel 1", + "EventCode": "0x17", + "EventName": "UNC_M_RDB_INSERTS.SCH1", + "PerPkg": "1", + "UMask": "0x80", + "Unit": "IMC" + }, + { + "BriefDescription": "Read buffer occupancy on subchannel 0", + "EventCode": "0x1a", + "EventName": "UNC_M_RDB_OCCUPANCY_SCH0", + "PerPkg": "1", + "Unit": "IMC" + }, + { + "BriefDescription": "Read buffer occupancy on subchannel 1", + "EventCode": "0x1b", + "EventName": "UNC_M_RDB_OCCUPANCY_SCH1", + "PerPkg": "1", + "Unit": "IMC" + }, + { + "BriefDescription": "Read Pending Queue Allocations : Counts the number of allocations into the Read Pending Queue. This queue is used to schedule reads out to the memory controller and to track the requests. Requests allocate into the RPQ soon after they enter the memory controller, and need credits for an entry in this buffer before being sent from the HA to the iMC. They deallocate after the CAS command has been issued to memory. This includes both ISOCH and non-ISOCH requests.", + "EventCode": "0x10", + "EventName": "UNC_M_RPQ_INSERTS.PCH0", + "PerPkg": "1", + "UMask": "0x50", + "Unit": "IMC" + }, + { + "BriefDescription": "Read Pending Queue Allocations : Counts the number of allocations into the Read Pending Queue. This queue is used to schedule reads out to the memory controller and to track the requests. Requests allocate into the RPQ soon after they enter the memory controller, and need credits for an entry in this buffer before being sent from the HA to the iMC. They deallocate after the CAS command has been issued to memory. This includes both ISOCH and non-ISOCH requests.", + "EventCode": "0x10", + "EventName": "UNC_M_RPQ_INSERTS.PCH1", + "PerPkg": "1", + "UMask": "0xa0", + "Unit": "IMC" + }, + { + "BriefDescription": "Read Pending Queue inserts for subchannel 0, pseudochannel 0", + "EventCode": "0x10", + "EventName": "UNC_M_RPQ_INSERTS.SCH0_PCH0", + "PerPkg": "1", + "UMask": "0x10", + "Unit": "IMC" + }, + { + "BriefDescription": "Read Pending Queue inserts for subchannel 0, pseudochannel 1", + "EventCode": "0x10", + "EventName": "UNC_M_RPQ_INSERTS.SCH0_PCH1", + "PerPkg": "1", + "UMask": "0x20", + "Unit": "IMC" + }, + { + "BriefDescription": "Read Pending Queue inserts for subchannel 1, pseudochannel 0", + "EventCode": "0x10", + "EventName": "UNC_M_RPQ_INSERTS.SCH1_PCH0", + "PerPkg": "1", + "UMask": "0x40", + "Unit": "IMC" + }, + { + "BriefDescription": "Read Pending Queue inserts for subchannel 1, pseudochannel 1", + "EventCode": "0x10", + "EventName": "UNC_M_RPQ_INSERTS.SCH1_PCH1", + "PerPkg": "1", + "UMask": "0x80", + "Unit": "IMC" + }, + { + "BriefDescription": "Read pending queue occupancy for subchannel 0, pseudochannel 0", + "EventCode": "0x80", + "EventName": "UNC_M_RPQ_OCCUPANCY_SCH0_PCH0", + "PerPkg": "1", + "Unit": "IMC" + }, + { + "BriefDescription": "Read pending queue occupancy for subchannel 0, pseudochannel 1", + "EventCode": "0x81", + "EventName": "UNC_M_RPQ_OCCUPANCY_SCH0_PCH1", + "PerPkg": "1", + "Unit": "IMC" + }, + { + "BriefDescription": "Read pending queue occupancy for subchannel 1, pseudochannel 0", + "EventCode": "0x82", + "EventName": "UNC_M_RPQ_OCCUPANCY_SCH1_PCH0", + "PerPkg": "1", + "Unit": "IMC" + }, + { + "BriefDescription": "Read pending queue occupancy for subchannel 1, pseudochannel 1", + "EventCode": "0x83", + "EventName": "UNC_M_RPQ_OCCUPANCY_SCH1_PCH1", + "PerPkg": "1", + "Unit": "IMC" + }, + { + "BriefDescription": "Write Pending Queue Allocations", + "EventCode": "0x22", + "EventName": "UNC_M_WPQ_INSERTS.PCH0", + "PerPkg": "1", + "UMask": "0x50", + "Unit": "IMC" + }, + { + "BriefDescription": "Write Pending Queue Allocations", + "EventCode": "0x22", + "EventName": "UNC_M_WPQ_INSERTS.PCH1", + "PerPkg": "1", + "UMask": "0xa0", + "Unit": "IMC" + }, + { + "BriefDescription": "Write Pending Queue inserts for subchannel 0, pseudochannel 0", + "EventCode": "0x22", + "EventName": "UNC_M_WPQ_INSERTS.SCH0_PCH0", + "PerPkg": "1", + "UMask": "0x10", + "Unit": "IMC" + }, + { + "BriefDescription": "Write Pending Queue inserts for subchannel 0, pseudochannel 1", + "EventCode": "0x22", + "EventName": "UNC_M_WPQ_INSERTS.SCH0_PCH1", + "PerPkg": "1", + "UMask": "0x20", + "Unit": "IMC" + }, + { + "BriefDescription": "Write Pending Queue inserts for subchannel 1, pseudochannel 0", + "EventCode": "0x22", + "EventName": "UNC_M_WPQ_INSERTS.SCH1_PCH0", + "PerPkg": "1", + "UMask": "0x40", + "Unit": "IMC" + }, + { + "BriefDescription": "Write Pending Queue inserts for subchannel 1, pseudochannel 1", + "EventCode": "0x22", + "EventName": "UNC_M_WPQ_INSERTS.SCH1_PCH1", + "PerPkg": "1", + "UMask": "0x80", + "Unit": "IMC" + }, + { + "BriefDescription": "Write pending queue occupancy for subchannel 0, pseudochannel 0", + "EventCode": "0x84", + "EventName": "UNC_M_WPQ_OCCUPANCY_SCH0_PCH0", + "PerPkg": "1", + "Unit": "IMC" + }, + { + "BriefDescription": "Write pending queue occupancy for subchannel 0, pseudochannel 1", + "EventCode": "0x85", + "EventName": "UNC_M_WPQ_OCCUPANCY_SCH0_PCH1", + "PerPkg": "1", + "Unit": "IMC" + }, + { + "BriefDescription": "Write pending queue occupancy for subchannel 1, pseudochannel 0", + "EventCode": "0x86", + "EventName": "UNC_M_WPQ_OCCUPANCY_SCH1_PCH0", + "PerPkg": "1", + "Unit": "IMC" + }, + { + "BriefDescription": "Write pending queue occupancy for subchannel 1, pseudochannel 1", + "EventCode": "0x87", + "EventName": "UNC_M_WPQ_OCCUPANCY_SCH1_PCH1", + "PerPkg": "1", + "Unit": "IMC" + } +] diff --git a/tools/perf/pmu-events/arch/x86/grandridge/uncore-power.json b/tools/perf/pmu-events/arch/x86/grandridge/uncore-power.json new file mode 100644 index 0000000000..e3a66166e2 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/grandridge/uncore-power.json @@ -0,0 +1,10 @@ +[ + { + "BriefDescription": "PCU Clockticks", + "EventCode": "0x01", + "EventName": "UNC_P_CLOCKTICKS", + "PerPkg": "1", + "PublicDescription": "PCU Clockticks: The PCU runs off a fixed 1 GHz clock. This event counts the number of pclk cycles measured while the counter was enabled. The pclk, like the Memory Controller's dclk, counts at a constant rate making it a good measure of actual wall time.", + "Unit": "PCU" + } +] diff --git a/tools/perf/pmu-events/arch/x86/grandridge/virtual-memory.json b/tools/perf/pmu-events/arch/x86/grandridge/virtual-memory.json index bd5f2b634c..371974c6d6 100644 --- a/tools/perf/pmu-events/arch/x86/grandridge/virtual-memory.json +++ b/tools/perf/pmu-events/arch/x86/grandridge/virtual-memory.json @@ -1,24 +1,131 @@ [ { - "BriefDescription": "Counts the number of page walks completed due to load DTLB misses to a 1G page.", + "BriefDescription": "Counts the number of first level TLB misses but second level hits due to a demand load that did not start a page walk. Accounts for all page sizes. Will result in a DTLB write from STLB.", + "EventCode": "0x08", + "EventName": "DTLB_LOAD_MISSES.STLB_HIT", + "SampleAfterValue": "200003", + "UMask": "0x20" + }, + { + "BriefDescription": "Counts the number of page walks completed due to load DTLB misses.", "EventCode": "0x08", "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED", - "SampleAfterValue": "1000003", + "SampleAfterValue": "200003", "UMask": "0xe" }, { + "BriefDescription": "Counts the number of page walks completed due to load DTLB misses to a 2M or 4M page.", + "EventCode": "0x08", + "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M", + "PublicDescription": "Counts the number of page walks completed due to loads (including SW prefetches) whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 2M or 4M pages. Includes page walks that page fault.", + "SampleAfterValue": "200003", + "UMask": "0x4" + }, + { + "BriefDescription": "Counts the number of page walks completed due to load DTLB misses to a 4K page.", + "EventCode": "0x08", + "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_4K", + "PublicDescription": "Counts the number of page walks completed due to loads (including SW prefetches) whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 4K pages. Includes page walks that page fault.", + "SampleAfterValue": "200003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts the number of page walks outstanding for Loads (demand or SW prefetch) in PMH every cycle.", + "EventCode": "0x08", + "EventName": "DTLB_LOAD_MISSES.WALK_PENDING", + "PublicDescription": "Counts the number of page walks outstanding for Loads (demand or SW prefetch) in PMH every cycle. A PMH page walk is outstanding from page walk start till PMH becomes idle again (ready to serve next walk). Includes EPT-walk intervals.", + "SampleAfterValue": "200003", + "UMask": "0x10" + }, + { + "BriefDescription": "Counts the number of first level TLB misses but second level hits due to stores that did not start a page walk. Accounts for all pages sizes. Will result in a DTLB write from STLB.", + "EventCode": "0x49", + "EventName": "DTLB_STORE_MISSES.STLB_HIT", + "SampleAfterValue": "2000003", + "UMask": "0x20" + }, + { "BriefDescription": "Counts the number of page walks completed due to store DTLB misses to a 1G page.", "EventCode": "0x49", "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED", - "SampleAfterValue": "1000003", + "SampleAfterValue": "2000003", "UMask": "0xe" }, { + "BriefDescription": "Counts the number of page walks completed due to store DTLB misses to a 2M or 4M page.", + "EventCode": "0x49", + "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M", + "PublicDescription": "Counts the number of page walks completed due to stores whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 2M or 4M pages. Includes page walks that page fault.", + "SampleAfterValue": "2000003", + "UMask": "0x4" + }, + { + "BriefDescription": "Counts the number of page walks completed due to store DTLB misses to a 4K page.", + "EventCode": "0x49", + "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_4K", + "PublicDescription": "Counts the number of page walks completed due to stores whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 4K pages. Includes page walks that page fault.", + "SampleAfterValue": "2000003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts the number of page walks outstanding in the page miss handler (PMH) for stores every cycle.", + "EventCode": "0x49", + "EventName": "DTLB_STORE_MISSES.WALK_PENDING", + "PublicDescription": "Counts the number of page walks outstanding in the page miss handler (PMH) for stores every cycle. A PMH page walk is outstanding from page walk start till PMH becomes idle again (ready to serve next walk). Includes EPT-walk intervals.", + "SampleAfterValue": "200003", + "UMask": "0x10" + }, + { + "BriefDescription": "Counts the number of page walks initiated by a instruction fetch that missed the first and second level TLBs.", + "EventCode": "0x85", + "EventName": "ITLB_MISSES.MISS_CAUSED_WALK", + "SampleAfterValue": "1000003", + "UMask": "0x1" + }, + { + "BriefDescription": "Counts the number of first level TLB misses but second level hits due to an instruction fetch that did not start a page walk. Account for all pages sizes. Will result in an ITLB write from STLB.", + "EventCode": "0x85", + "EventName": "ITLB_MISSES.STLB_HIT", + "SampleAfterValue": "2000003", + "UMask": "0x20" + }, + { "BriefDescription": "Counts the number of page walks completed due to instruction fetch misses to any page size.", "EventCode": "0x85", "EventName": "ITLB_MISSES.WALK_COMPLETED", "PublicDescription": "Counts the number of page walks completed due to instruction fetches whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to any page size. Includes page walks that page fault.", "SampleAfterValue": "200003", "UMask": "0xe" + }, + { + "BriefDescription": "Counts the number of page walks completed due to instruction fetch misses to a 2M or 4M page.", + "EventCode": "0x85", + "EventName": "ITLB_MISSES.WALK_COMPLETED_2M_4M", + "PublicDescription": "Counts the number of page walks completed due to instruction fetches whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 2M or 4M pages. Includes page walks that page fault.", + "SampleAfterValue": "2000003", + "UMask": "0x4" + }, + { + "BriefDescription": "Counts the number of page walks completed due to instruction fetch misses to a 4K page.", + "EventCode": "0x85", + "EventName": "ITLB_MISSES.WALK_COMPLETED_4K", + "PublicDescription": "Counts the number of page walks completed due to instruction fetches whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 4K pages. Includes page walks that page fault.", + "SampleAfterValue": "2000003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts the number of page walks outstanding for iside in PMH every cycle.", + "EventCode": "0x85", + "EventName": "ITLB_MISSES.WALK_PENDING", + "PublicDescription": "Counts the number of page walks outstanding for iside in PMH every cycle. A PMH page walk is outstanding from page walk start till PMH becomes idle again (ready to serve next walk). Includes EPT-walk intervals. Walks could be counted by edge detecting on this event, but would count restarted suspended walks.", + "SampleAfterValue": "200003", + "UMask": "0x10" + }, + { + "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to a DTLB miss.", + "EventCode": "0x05", + "EventName": "LD_HEAD.DTLB_MISS_AT_RET", + "SampleAfterValue": "1000003", + "UMask": "0x90" } ] diff --git a/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json b/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json index 79d89c2636..5631018ed3 100644 --- a/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json +++ b/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json @@ -84,12 +84,12 @@ "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", - "MetricThreshold": "tma_alu_op_utilization > 0.6", + "MetricThreshold": "tma_alu_op_utilization > 0.4", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", - "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots", + "MetricExpr": "66 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_assists", "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", @@ -202,7 +202,7 @@ "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", + "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", "ScaleUnit": "100%" }, @@ -257,7 +257,7 @@ "MetricExpr": "tma_frontend_bound - tma_fetch_latency", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.2", "MetricgroupNoGroup": "TopdownL2", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" @@ -289,20 +289,20 @@ "MetricName": "tma_heavy_operations", "MetricThreshold": "tma_heavy_operations > 0.1", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .)", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.", "MetricExpr": "ICACHE.IFDATA_STALL / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricGroup": "BigFootprint;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_icache_misses", "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "ScaleUnit": "100%" }, { "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", - "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)", + "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * BR_MISP_EXEC.INDIRECT)", "MetricGroup": "Bad;BrMispredicts", "MetricName": "tma_info_bad_spec_ipmisp_indirect", "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3" @@ -327,7 +327,7 @@ "MetricName": "tma_info_core_coreipc" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)", "MetricExpr": "(UOPS_EXECUTED.CORE / 2 / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@) if #SMT_on else UOPS_EXECUTED.CORE / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@))", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "tma_info_core_ilp" @@ -397,96 +397,90 @@ }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l1d_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l1d_cache_fill_bw" + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t" }, { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l2_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l2_cache_fill_bw" + "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t" }, { "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l3_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t" + }, + { + "BriefDescription": "", + "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l3_cache_fill_bw" + "MetricName": "tma_info_memory_l1d_cache_fill_bw" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l1mpki" }, { + "BriefDescription": "", + "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l2_cache_fill_bw" + }, + { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "Backend;CacheMisses;Mem", + "MetricGroup": "Backend;CacheHits;Mem", "MetricName": "tma_info_memory_l2mpki" }, { + "BriefDescription": "", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l3_cache_fill_bw" + }, + { "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "Mem", "MetricName": "tma_info_memory_l3mpki" }, { - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)", - "MetricGroup": "Mem;MemoryBound;MemoryLat", - "MetricName": "tma_info_memory_load_miss_real_latency" - }, - { - "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBW;MemoryBound", - "MetricName": "tma_info_memory_mlp", - "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" - }, - { "BriefDescription": "Average Parallel L2 cache miss data reads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_memory_oro_data_l2_mlp" + "MetricName": "tma_info_memory_latency_data_l2_mlp" }, { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_memory_oro_load_l2_miss_latency" + "MetricName": "tma_info_memory_latency_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_memory_oro_load_l2_mlp" - }, - { - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t" - }, - { - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t" + "MetricName": "tma_info_memory_latency_load_l2_mlp" }, { - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "0", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t" + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)", + "MetricGroup": "Mem;MemoryBound;MemoryLat", + "MetricName": "tma_info_memory_load_miss_real_latency" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t" + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Mem;MemoryBW;MemoryBound", + "MetricName": "tma_info_memory_mlp", + "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", @@ -502,21 +496,27 @@ "MetricName": "tma_info_pipeline_retire" }, { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", "MetricGroup": "Power;Summary", - "MetricName": "tma_info_system_average_frequency" + "MetricName": "tma_info_system_core_frequency" }, { - "BriefDescription": "Average CPU Utilization", + "BriefDescription": "Average CPU Utilization (percentage)", "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", "MetricGroup": "HPC;Summary", "MetricName": "tma_info_system_cpu_utilization" }, { + "BriefDescription": "Average number of utilized CPUs", + "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization", + "MetricGroup": "Summary", + "MetricName": "tma_info_system_cpus_utilized" + }, + { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW", "MetricName": "tma_info_system_dram_bw_use", "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_mem_bandwidth, tma_sq_full" }, @@ -541,19 +541,6 @@ "MetricThreshold": "tma_info_system_kernel_utilization > 0.05" }, { - "BriefDescription": "Average number of parallel requests to external memory", - "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_OCCUPANCY.CYCLES_WITH_ANY_REQUEST", - "MetricGroup": "Mem;SoC", - "MetricName": "tma_info_system_mem_parallel_requests", - "PublicDescription": "Average number of parallel requests to external memory. Accounts for all requests" - }, - { - "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)", - "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_REQUESTS.ALL", - "MetricGroup": "Mem;SoC", - "MetricName": "tma_info_system_mem_request_latency" - }, - { "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0)", "MetricGroup": "SMT", @@ -612,7 +599,7 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", "MetricExpr": "(14 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: ITLB_MISSES.WALK_COMPLETED", @@ -621,7 +608,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", "MetricExpr": "max((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) - CYCLE_ACTIVITY.STALLS_L1D_PENDING) / tma_info_thread_clks, 0)", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_UOPS_RETIRED.L1_HIT_PS;MEM_LOAD_UOPS_RETIRED.HIT_LFB_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1", @@ -630,7 +617,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_PENDING - CYCLE_ACTIVITY.STALLS_L2_PENDING) / tma_info_thread_clks", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L2_HIT_PS", @@ -640,20 +627,20 @@ "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", "MetricConstraint": "NO_GROUP_EVENTS_SMT", "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_thread_clks", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", + "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "29 * (MEM_LOAD_UOPS_RETIRED.L3_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS))) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS. Related metrics: tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS. Related metrics: tma_mem_latency", "ScaleUnit": "100%" }, { @@ -672,7 +659,7 @@ "MetricName": "tma_light_operations", "MetricThreshold": "tma_light_operations > 0.6", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST", "ScaleUnit": "100%" }, { @@ -707,21 +694,21 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_l3_hit_latency", "ScaleUnit": "100%" }, { @@ -749,7 +736,7 @@ "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", + "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.", "ScaleUnit": "100%" }, @@ -768,7 +755,7 @@ "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_0", "MetricThreshold": "tma_port_0 > 0.6", - "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch). Sample with: UOPS_DISPATCHED_PORT.PORT_0. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch). Sample with: UOPS_DISPATCHED_PORT.PORT_0. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", "ScaleUnit": "100%" }, { @@ -777,7 +764,7 @@ "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_1", "MetricThreshold": "tma_port_1 > 0.6", - "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_1. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_0, tma_port_5, tma_port_6, tma_ports_utilized_2", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_1. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_5, tma_port_6, tma_ports_utilized_2", "ScaleUnit": "100%" }, { @@ -813,16 +800,16 @@ "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_5", "MetricThreshold": "tma_port_5 > 0.6", - "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU). Sample with: UOPS_DISPATCHED.PORT_5. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_6, tma_ports_utilized_2", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU). Sample with: UOPS_DISPATCHED.PORT_5. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_6, tma_ports_utilized_2", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)", + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU)", "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_6", "MetricThreshold": "tma_port_6 > 0.6", - "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2", "ScaleUnit": "100%" }, { @@ -868,7 +855,7 @@ "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_2", "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6", + "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6", "ScaleUnit": "100%" }, { @@ -876,7 +863,7 @@ "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2 if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", - "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", + "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", "ScaleUnit": "100%" }, { @@ -952,14 +939,5 @@ "MetricName": "tma_store_op_utilization", "MetricThreshold": "tma_store_op_utilization > 0.6", "ScaleUnit": "100%" - }, - { - "BriefDescription": "This metric serves as an approximation of legacy x87 usage", - "MetricExpr": "INST_RETIRED.X87 * tma_info_thread_uoppi / UOPS_RETIRED.RETIRE_SLOTS", - "MetricGroup": "Compute;TopdownL4;tma_L4_group;tma_fp_arith_group", - "MetricName": "tma_x87_use", - "MetricThreshold": "tma_x87_use > 0.1", - "PublicDescription": "This metric serves as an approximation of legacy x87 usage. It accounts for instructions beyond X87 FP arithmetic operations; hence may be used as a thermometer to avoid X87 high usage and preferably upgrade to modern ISA. See Tip under Tuning Hint.", - "ScaleUnit": "100%" } ] diff --git a/tools/perf/pmu-events/arch/x86/haswell/memory.json b/tools/perf/pmu-events/arch/x86/haswell/memory.json index 2fc25e22a4..6ba0ea6e3f 100644 --- a/tools/perf/pmu-events/arch/x86/haswell/memory.json +++ b/tools/perf/pmu-events/arch/x86/haswell/memory.json @@ -371,7 +371,7 @@ "BriefDescription": "Number of times an RTM execution aborted due to any reasons (multiple categories may count as one).", "EventCode": "0xc9", "EventName": "RTM_RETIRED.ABORTED", - "PEBS": "1", + "PEBS": "2", "SampleAfterValue": "2000003", "UMask": "0x4" }, diff --git a/tools/perf/pmu-events/arch/x86/haswell/metricgroups.json b/tools/perf/pmu-events/arch/x86/haswell/metricgroups.json index f6a0258e32..8c808347f6 100644 --- a/tools/perf/pmu-events/arch/x86/haswell/metricgroups.json +++ b/tools/perf/pmu-events/arch/x86/haswell/metricgroups.json @@ -2,10 +2,10 @@ "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -24,7 +24,9 @@ "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -94,6 +96,7 @@ "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category", "tma_light_operations_group": "Metrics contributing to tma_light_operations category", "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category", + "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category", "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", diff --git a/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json b/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json index 5f451948c8..21e2cb5e31 100644 --- a/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json @@ -286,12 +286,12 @@ "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", - "MetricThreshold": "tma_alu_op_utilization > 0.6", + "MetricThreshold": "tma_alu_op_utilization > 0.4", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", - "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots", + "MetricExpr": "66 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_assists", "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", @@ -404,7 +404,7 @@ "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", + "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", "ScaleUnit": "100%" }, @@ -459,7 +459,7 @@ "MetricExpr": "tma_frontend_bound - tma_fetch_latency", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.2", "MetricgroupNoGroup": "TopdownL2", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" @@ -491,20 +491,20 @@ "MetricName": "tma_heavy_operations", "MetricThreshold": "tma_heavy_operations > 0.1", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .)", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.", "MetricExpr": "ICACHE.IFDATA_STALL / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricGroup": "BigFootprint;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_icache_misses", "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "ScaleUnit": "100%" }, { "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", - "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)", + "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * BR_MISP_EXEC.INDIRECT)", "MetricGroup": "Bad;BrMispredicts", "MetricName": "tma_info_bad_spec_ipmisp_indirect", "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3" @@ -529,7 +529,7 @@ "MetricName": "tma_info_core_coreipc" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)", "MetricExpr": "(UOPS_EXECUTED.CORE / 2 / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@) if #SMT_on else UOPS_EXECUTED.CORE / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@))", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "tma_info_core_ilp" @@ -599,96 +599,139 @@ }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l1d_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l1d_cache_fill_bw" + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t" }, { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l2_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l2_cache_fill_bw" + "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t" }, { "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l3_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t" + }, + { + "BriefDescription": "Average Parallel L2 cache miss data reads", + "MetricExpr": "tma_info_memory_latency_data_l2_mlp", + "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_data_l2_mlp", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "", + "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l3_cache_fill_bw" + "MetricName": "tma_info_memory_l1d_cache_fill_bw" + }, + { + "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / (duration_time * 1e3 / 1e3)", + "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t", + "MetricgroupNoGroup": "TopdownL1" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l1mpki" }, { + "BriefDescription": "", + "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l2_cache_fill_bw" + }, + { + "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / (duration_time * 1e3 / 1e3)", + "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_l2_cache_fill_bw_2t", + "MetricgroupNoGroup": "TopdownL1" + }, + { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "Backend;CacheMisses;Mem", + "MetricGroup": "Backend;CacheHits;Mem", "MetricName": "tma_info_memory_l2mpki" }, { - "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_memory_l3mpki" + "BriefDescription": "", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l3_cache_fill_bw" }, { - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)", - "MetricGroup": "Mem;MemoryBound;MemoryLat", - "MetricName": "tma_info_memory_load_miss_real_latency" + "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / (duration_time * 1e3 / 1e3)", + "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_l3_cache_fill_bw_2t", + "MetricgroupNoGroup": "TopdownL1" }, { - "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBW;MemoryBound", - "MetricName": "tma_info_memory_mlp", - "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" + "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY", + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_l3mpki" }, { "BriefDescription": "Average Parallel L2 cache miss data reads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_memory_oro_data_l2_mlp" + "MetricName": "tma_info_memory_latency_data_l2_mlp" }, { "BriefDescription": "Average Latency for L2 cache miss demand Loads", - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", + "MetricExpr": "tma_info_memory_load_l2_miss_latency", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_memory_oro_load_l2_miss_latency" + "MetricName": "tma_info_memory_latency_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", + "MetricExpr": "tma_info_memory_load_l2_mlp", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_memory_oro_load_l2_mlp" + "MetricName": "tma_info_memory_latency_load_l2_mlp" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t" + "BriefDescription": "Average Latency for L2 cache miss demand Loads", + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", + "MetricGroup": "Memory_Lat;Offcore;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_load_l2_miss_latency", + "MetricgroupNoGroup": "TopdownL1" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t" + "BriefDescription": "Average Parallel L2 cache miss demand Loads", + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", + "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_load_l2_mlp", + "MetricgroupNoGroup": "TopdownL1" }, { - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "0", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t" + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)", + "MetricGroup": "Mem;MemoryBound;MemoryLat", + "MetricName": "tma_info_memory_load_miss_real_latency" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t" + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Mem;MemoryBW;MemoryBound", + "MetricName": "tma_info_memory_mlp", + "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" + }, + { + "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", + "MetricExpr": "tma_info_memory_tlb_page_walks_utilization", + "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_page_walks_utilization", + "MetricgroupNoGroup": "TopdownL1" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", @@ -704,21 +747,27 @@ "MetricName": "tma_info_pipeline_retire" }, { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", "MetricGroup": "Power;Summary", - "MetricName": "tma_info_system_average_frequency" + "MetricName": "tma_info_system_core_frequency" }, { - "BriefDescription": "Average CPU Utilization", + "BriefDescription": "Average CPU Utilization (percentage)", "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", "MetricGroup": "HPC;Summary", "MetricName": "tma_info_system_cpu_utilization" }, { + "BriefDescription": "Average number of utilized CPUs", + "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization", + "MetricGroup": "Summary", + "MetricName": "tma_info_system_cpus_utilized" + }, + { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW", "MetricName": "tma_info_system_dram_bw_use", "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_mem_bandwidth, tma_sq_full" }, @@ -775,6 +824,12 @@ "MetricName": "tma_info_system_turbo_utilization" }, { + "BriefDescription": "Measured Average Uncore Frequency for the SoC [GHz]", + "MetricExpr": "tma_info_system_socket_clks / 1e9 / duration_time", + "MetricGroup": "SoC", + "MetricName": "tma_info_system_uncore_frequency" + }, + { "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", "MetricExpr": "CPU_CLK_UNHALTED.THREAD", "MetricGroup": "Pipeline", @@ -815,7 +870,7 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", "MetricExpr": "(14 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: ITLB_MISSES.WALK_COMPLETED", @@ -824,7 +879,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", "MetricExpr": "max((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) - CYCLE_ACTIVITY.STALLS_L1D_PENDING) / tma_info_thread_clks, 0)", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_UOPS_RETIRED.L1_HIT_PS;MEM_LOAD_UOPS_RETIRED.HIT_LFB_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1", @@ -833,7 +888,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_PENDING - CYCLE_ACTIVITY.STALLS_L2_PENDING) / tma_info_thread_clks", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L2_HIT_PS", @@ -843,20 +898,20 @@ "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", "MetricConstraint": "NO_GROUP_EVENTS_SMT", "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_thread_clks", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", + "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "41 * (MEM_LOAD_UOPS_RETIRED.L3_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS. Related metrics: tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS. Related metrics: tma_mem_latency", "ScaleUnit": "100%" }, { @@ -875,7 +930,7 @@ "MetricName": "tma_light_operations", "MetricThreshold": "tma_light_operations > 0.6", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST", "ScaleUnit": "100%" }, { @@ -890,11 +945,10 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "200 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks", "MetricGroup": "Server;TopdownL5;tma_L5_group;tma_mem_latency_group", - "MetricName": "tma_local_dram", - "MetricThreshold": "tma_local_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", + "MetricName": "tma_local_mem", + "MetricThreshold": "tma_local_mem > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory. Caching will improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM_PS", "ScaleUnit": "100%" }, @@ -920,21 +974,21 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_l3_hit_latency", "ScaleUnit": "100%" }, { @@ -962,7 +1016,7 @@ "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", + "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.", "ScaleUnit": "100%" }, @@ -981,7 +1035,7 @@ "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_0", "MetricThreshold": "tma_port_0 > 0.6", - "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch). Sample with: UOPS_DISPATCHED_PORT.PORT_0. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch). Sample with: UOPS_DISPATCHED_PORT.PORT_0. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", "ScaleUnit": "100%" }, { @@ -990,7 +1044,7 @@ "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_1", "MetricThreshold": "tma_port_1 > 0.6", - "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_1. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_0, tma_port_5, tma_port_6, tma_ports_utilized_2", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_1. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_5, tma_port_6, tma_ports_utilized_2", "ScaleUnit": "100%" }, { @@ -1026,16 +1080,16 @@ "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_5", "MetricThreshold": "tma_port_5 > 0.6", - "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU). Sample with: UOPS_DISPATCHED.PORT_5. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_6, tma_ports_utilized_2", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU). Sample with: UOPS_DISPATCHED.PORT_5. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_6, tma_ports_utilized_2", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)", + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU)", "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_6", "MetricThreshold": "tma_port_6 > 0.6", - "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2", "ScaleUnit": "100%" }, { @@ -1081,7 +1135,7 @@ "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_2", "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6", + "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6", "ScaleUnit": "100%" }, { @@ -1089,7 +1143,7 @@ "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2 if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", - "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", + "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", "ScaleUnit": "100%" }, { @@ -1104,11 +1158,10 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "310 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks", "MetricGroup": "Server;Snoop;TopdownL5;tma_L5_group;tma_mem_latency_group", - "MetricName": "tma_remote_dram", - "MetricThreshold": "tma_remote_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", + "MetricName": "tma_remote_mem", + "MetricThreshold": "tma_remote_mem > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory. This is caused often due to non-optimal NUMA allocations. #link to NUMA article. Sample with: MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM_PS", "ScaleUnit": "100%" }, @@ -1187,15 +1240,6 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric serves as an approximation of legacy x87 usage", - "MetricExpr": "INST_RETIRED.X87 * tma_info_thread_uoppi / UOPS_RETIRED.RETIRE_SLOTS", - "MetricGroup": "Compute;TopdownL4;tma_L4_group;tma_fp_arith_group", - "MetricName": "tma_x87_use", - "MetricThreshold": "tma_x87_use > 0.1", - "PublicDescription": "This metric serves as an approximation of legacy x87 usage. It accounts for instructions beyond X87 FP arithmetic operations; hence may be used as a thermometer to avoid X87 high usage and preferably upgrade to modern ISA. See Tip under Tuning Hint.", - "ScaleUnit": "100%" - }, - { "BriefDescription": "Uncore operating frequency in GHz", "MetricExpr": "UNC_C_CLOCKTICKS / (#num_cores / #num_packages * #num_packages) / 1e9 / duration_time", "MetricName": "uncore_frequency", diff --git a/tools/perf/pmu-events/arch/x86/haswellx/metricgroups.json b/tools/perf/pmu-events/arch/x86/haswellx/metricgroups.json index f6a0258e32..8c808347f6 100644 --- a/tools/perf/pmu-events/arch/x86/haswellx/metricgroups.json +++ b/tools/perf/pmu-events/arch/x86/haswellx/metricgroups.json @@ -2,10 +2,10 @@ "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -24,7 +24,9 @@ "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -94,6 +96,7 @@ "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category", "tma_light_operations_group": "Metrics contributing to tma_light_operations category", "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category", + "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category", "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", diff --git a/tools/perf/pmu-events/arch/x86/haswellx/uncore-power.json b/tools/perf/pmu-events/arch/x86/haswellx/uncore-power.json index daebf1050a..c391325ee3 100644 --- a/tools/perf/pmu-events/arch/x86/haswellx/uncore-power.json +++ b/tools/perf/pmu-events/arch/x86/haswellx/uncore-power.json @@ -426,6 +426,7 @@ "BriefDescription": "Number of cores in C-State; C0 and C1", "EventCode": "0x80", "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C0", + "Filter": "occ_sel=1", "PerPkg": "1", "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", "Unit": "PCU" @@ -434,6 +435,7 @@ "BriefDescription": "Number of cores in C-State; C3", "EventCode": "0x80", "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C3", + "Filter": "occ_sel=2", "PerPkg": "1", "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", "Unit": "PCU" @@ -442,6 +444,7 @@ "BriefDescription": "Number of cores in C-State; C6 and C7", "EventCode": "0x80", "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C6", + "Filter": "occ_sel=3", "PerPkg": "1", "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", "Unit": "PCU" diff --git a/tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json b/tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json index b43a6c6d8b..f67cc73779 100644 --- a/tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json +++ b/tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json @@ -98,12 +98,12 @@ "MetricExpr": "(UOPS_DISPATCHED.PORT_0 + UOPS_DISPATCHED.PORT_1 + UOPS_DISPATCHED.PORT_5 + UOPS_DISPATCHED.PORT_6) / (4 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", - "MetricThreshold": "tma_alu_op_utilization > 0.6", + "MetricThreshold": "tma_alu_op_utilization > 0.4", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", - "MetricExpr": "100 * ASSISTS.ANY / tma_info_thread_slots", + "MetricExpr": "34 * ASSISTS.ANY / tma_info_thread_slots", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_assists", "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", @@ -113,7 +113,7 @@ { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", "DefaultMetricgroupName": "TopdownL1", - "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * cpu@INT_MISC.RECOVERY_CYCLES\\,cmask\\=1\\,edge@ / tma_info_thread_slots", + "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / tma_info_thread_slots", "MetricGroup": "Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", "MetricThreshold": "tma_backend_bound > 0.2", @@ -135,7 +135,7 @@ { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions.", "MetricExpr": "tma_light_operations * BR_INST_RETIRED.ALL_BRANCHES / (tma_retiring * tma_info_thread_slots)", - "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", + "MetricGroup": "Branches;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_branch_instructions", "MetricThreshold": "tma_branch_instructions > 0.1 & tma_light_operations > 0.6", "ScaleUnit": "100%" @@ -180,7 +180,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(29 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM + 23.5 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "MetricExpr": "(29 * tma_info_system_core_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM + 23.5 * tma_info_system_core_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_contested_accesses", "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -200,7 +200,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "23.5 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "MetricExpr": "23.5 * tma_info_system_core_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_data_sharing", "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -212,7 +212,7 @@ "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group", "MetricName": "tma_decoder0_alone", - "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35))", + "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & tma_fetch_bandwidth > 0.2)", "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. Related metrics: tma_few_uops_instructions", "ScaleUnit": "100%" }, @@ -240,7 +240,7 @@ "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / tma_info_core_core_clks / 2", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)", + "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", "ScaleUnit": "100%" }, @@ -259,7 +259,7 @@ "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group", "MetricName": "tma_dtlb_load", "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization", "ScaleUnit": "100%" }, { @@ -268,12 +268,12 @@ "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group", "MetricName": "tma_dtlb_store", "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", - "MetricExpr": "32.5 * tma_info_system_average_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks", + "MetricExpr": "32.5 * tma_info_system_core_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group", "MetricName": "tma_false_sharing", "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -286,7 +286,7 @@ "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%" }, { @@ -294,7 +294,7 @@ "MetricExpr": "max(0, tma_frontend_bound - tma_fetch_latency)", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.2", "MetricgroupNoGroup": "TopdownL2", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" @@ -328,6 +328,15 @@ "ScaleUnit": "100%" }, { + "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists", + "MetricExpr": "34 * ASSISTS.FP / tma_info_thread_slots", + "MetricGroup": "HPC;TopdownL5;tma_L5_group;tma_assists_group", + "MetricName": "tma_fp_assists", + "MetricThreshold": "tma_fp_assists > 0.1", + "PublicDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists. FP Assist may apply when working with very small floating point values (so-called Denormals).", + "ScaleUnit": "100%" + }, + { "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired", "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", @@ -390,13 +399,13 @@ "MetricName": "tma_heavy_operations", "MetricThreshold": "tma_heavy_operations > 0.1", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .)", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", - "MetricExpr": "ICACHE_16B.IFDATA_STALL / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricExpr": "ICACHE_DATA.STALLS / tma_info_thread_clks", + "MetricGroup": "BigFootprint;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_icache_misses", "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS", @@ -405,7 +414,7 @@ { "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricExpr": "tma_info_bottleneck_mispredictions * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES / 100", "MetricGroup": "Bad;BrMispredicts;tma_issueBM", "MetricName": "tma_info_bad_spec_branch_misprediction_cost", "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers" @@ -446,6 +455,12 @@ "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200" }, { + "BriefDescription": "Speculative to Retired ratio of all clears (covering mispredicts and nukes)", + "MetricExpr": "INT_MISC.CLEARS_COUNT / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)", + "MetricGroup": "BrMispredicts", + "MetricName": "tma_info_bad_spec_spec_clears_ratio" + }, + { "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)", @@ -472,67 +487,102 @@ "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: " }, { + "BriefDescription": "Total pipeline cost of \"useful operations\" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead.", + "MetricExpr": "100 * (tma_retiring - (BR_INST_RETIRED.ALL_BRANCHES + BR_INST_RETIRED.NEAR_CALL) / tma_info_thread_slots - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", + "MetricGroup": "Ret", + "MetricName": "tma_info_bottleneck_base_non_br", + "MetricThreshold": "tma_info_bottleneck_base_non_br > 20" + }, + { "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", - "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC", + "MetricGroup": "BigFootprint;Fed;Frontend;IcMiss;MemoryTLB", "MetricName": "tma_info_bottleneck_big_code", - "MetricThreshold": "tma_info_bottleneck_big_code > 20", - "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_bottleneck_branching_overhead" + "MetricThreshold": "tma_info_bottleneck_big_code > 20" }, { "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", - "MetricExpr": "100 * ((BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_thread_slots)", - "MetricGroup": "Ret;tma_issueBC", + "MetricExpr": "100 * ((BR_INST_RETIRED.ALL_BRANCHES + BR_INST_RETIRED.NEAR_CALL) / tma_info_thread_slots)", + "MetricGroup": "Ret", "MetricName": "tma_info_bottleneck_branching_overhead", - "MetricThreshold": "tma_info_bottleneck_branching_overhead > 10", - "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_bottleneck_big_code" + "MetricThreshold": "tma_info_bottleneck_branching_overhead > 5" + }, + { + "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))", + "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", + "MetricName": "tma_info_bottleneck_cache_memory_bandwidth", + "MetricThreshold": "tma_info_bottleneck_cache_memory_bandwidth > 20", + "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" + }, + { + "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", + "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", + "MetricName": "tma_info_bottleneck_cache_memory_latency", + "MetricThreshold": "tma_info_bottleneck_cache_memory_latency > 20", + "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency" + }, + { + "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", + "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", + "MetricGroup": "Cor;tma_issueComp", + "MetricName": "tma_info_bottleneck_compute_bound_est", + "MetricThreshold": "tma_info_bottleneck_compute_bound_est > 20", + "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: " }, { "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code", + "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code", "MetricGroup": "Fed;FetchBW;Frontend", "MetricName": "tma_info_bottleneck_instruction_fetch_bw", "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20" }, { - "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))", - "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", - "MetricName": "tma_info_bottleneck_memory_bandwidth", - "MetricThreshold": "tma_info_bottleneck_memory_bandwidth > 20", - "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" + "BriefDescription": "Total pipeline cost of irregular execution (e.g", + "MetricExpr": "100 * (tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + tma_core_bound * RS_EVENTS.EMPTY_CYCLES / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", + "MetricGroup": "Bad;Cor;Ret;tma_issueMS", + "MetricName": "tma_info_bottleneck_irregular_overhead", + "MetricThreshold": "tma_info_bottleneck_irregular_overhead > 10", + "PublicDescription": "Total pipeline cost of irregular execution (e.g. FP-assists in HPC, Wait time with work imbalance multithreaded workloads, overhead in system services or virtualized environments). Related metrics: tma_microcode_sequencer, tma_ms_switches" }, { "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", + "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", "MetricName": "tma_info_bottleneck_memory_data_tlbs", "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20", - "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store" + "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_synchronization" }, { - "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound))", - "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", - "MetricName": "tma_info_bottleneck_memory_latency", - "MetricThreshold": "tma_info_bottleneck_memory_latency > 20", - "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency" + "BriefDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)", + "MetricExpr": "100 * (tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))", + "MetricGroup": "Mem;Offcore;tma_issueTLB", + "MetricName": "tma_info_bottleneck_memory_synchronization", + "MetricThreshold": "tma_info_bottleneck_memory_synchronization > 10", + "PublicDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs" }, { "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", + "MetricExpr": "100 * (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", "MetricName": "tma_info_bottleneck_mispredictions", "MetricThreshold": "tma_info_bottleneck_mispredictions > 20", "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers" }, { + "BriefDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class)", + "MetricExpr": "100 - (tma_info_bottleneck_big_code + tma_info_bottleneck_instruction_fetch_bw + tma_info_bottleneck_mispredictions + tma_info_bottleneck_cache_memory_bandwidth + tma_info_bottleneck_cache_memory_latency + tma_info_bottleneck_memory_data_tlbs + tma_info_bottleneck_memory_synchronization + tma_info_bottleneck_compute_bound_est + tma_info_bottleneck_irregular_overhead + tma_info_bottleneck_branching_overhead + tma_info_bottleneck_base_non_br)", + "MetricGroup": "Cor;Offcore", + "MetricName": "tma_info_bottleneck_other_bottlenecks", + "MetricThreshold": "tma_info_bottleneck_other_bottlenecks > 20", + "PublicDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls." + }, + { "BriefDescription": "Fraction of branches that are CALL or RET", "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches", @@ -564,7 +614,7 @@ }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", - "MetricExpr": "CPU_CLK_UNHALTED.DISTRIBUTED", + "MetricExpr": "(CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else tma_info_thread_clks)", "MetricGroup": "SMT", "MetricName": "tma_info_core_core_clks" }, @@ -575,8 +625,14 @@ "MetricName": "tma_info_core_coreipc" }, { + "BriefDescription": "uops Executed per Cycle", + "MetricExpr": "UOPS_EXECUTED.THREAD / tma_info_thread_clks", + "MetricGroup": "Power", + "MetricName": "tma_info_core_epc" + }, + { "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks", "MetricGroup": "Flops;Ret", "MetricName": "tma_info_core_flopc" }, @@ -588,8 +644,8 @@ "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)", + "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "tma_info_core_ilp" }, @@ -669,7 +725,7 @@ "MetricGroup": "Flops;InsType", "MetricName": "tma_info_inst_mix_iparith", "MetricThreshold": "tma_info_inst_mix_iparith < 10", - "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." + "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW." }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", @@ -677,7 +733,7 @@ "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx128", "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10", - "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", @@ -685,7 +741,7 @@ "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx256", "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10", - "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)", @@ -693,7 +749,7 @@ "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx512", "MetricThreshold": "tma_info_inst_mix_iparith_avx512 < 10", - "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", @@ -701,7 +757,7 @@ "MetricGroup": "Flops;FpScalar;InsType", "MetricName": "tma_info_inst_mix_iparith_scalar_dp", "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10", - "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", @@ -709,7 +765,7 @@ "MetricGroup": "Flops;FpScalar;InsType", "MetricName": "tma_info_inst_mix_iparith_scalar_sp", "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10", - "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", @@ -727,7 +783,7 @@ }, { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", "MetricGroup": "Flops;InsType", "MetricName": "tma_info_inst_mix_ipflop", "MetricThreshold": "tma_info_inst_mix_ipflop < 10" @@ -740,6 +796,12 @@ "MetricThreshold": "tma_info_inst_mix_ipload < 3" }, { + "BriefDescription": "Instructions per PAUSE (lower number means higher occurrence rate)", + "MetricExpr": "tma_info_inst_mix_instructions / MISC_RETIRED.PAUSE_INST", + "MetricGroup": "Flops;FpVector;InsType", + "MetricName": "tma_info_inst_mix_ippause" + }, + { "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES", "MetricGroup": "InsType", @@ -763,136 +825,148 @@ }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l1d_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l1d_cache_fill_bw" + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t" }, { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l2_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l2_cache_fill_bw" + "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t" }, { "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l3_cache_access_bw", "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_memory_core_l3_cache_access_bw" + "MetricName": "tma_info_memory_core_l3_cache_access_bw_2t" }, { "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l3_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l3_cache_fill_bw" + "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t" }, { "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_fb_hpki" }, { + "BriefDescription": "", + "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l1d_cache_fill_bw" + }, + { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l1mpki" }, { "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l1mpki_load" }, { + "BriefDescription": "", + "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l2_cache_fill_bw" + }, + { "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l2hpki_load" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "Backend;CacheMisses;Mem", + "MetricGroup": "Backend;CacheHits;Mem", "MetricName": "tma_info_memory_l2mpki" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * (OFFCORE_REQUESTS.ALL_DATA_RD - OFFCORE_REQUESTS.DEMAND_DATA_RD + L2_RQSTS.ALL_DEMAND_MISS + L2_RQSTS.SWPF_MISS) / tma_info_inst_mix_instructions", - "MetricGroup": "CacheMisses;Mem;Offcore", + "MetricGroup": "CacheHits;Mem;Offcore", "MetricName": "tma_info_memory_l2mpki_all" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l2mpki_load" }, { - "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_memory_l3mpki" + "BriefDescription": "", + "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_l3_cache_access_bw" }, { - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", - "MetricGroup": "Mem;MemoryBound;MemoryLat", - "MetricName": "tma_info_memory_load_miss_real_latency" + "BriefDescription": "", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l3_cache_fill_bw" }, { - "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", - "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBW;MemoryBound", - "MetricName": "tma_info_memory_mlp", - "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" + "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_l3mpki" }, { "BriefDescription": "Average Parallel L2 cache miss data reads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_memory_oro_data_l2_mlp" + "MetricName": "tma_info_memory_latency_data_l2_mlp" }, { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_memory_oro_load_l2_miss_latency" + "MetricName": "tma_info_memory_latency_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=1@", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_memory_oro_load_l2_mlp" + "MetricName": "tma_info_memory_latency_load_l2_mlp" }, { "BriefDescription": "Average Latency for L3 cache miss demand Loads", "MetricExpr": "cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,umask\\=0x10@ / OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_memory_oro_load_l3_miss_latency" + "MetricName": "tma_info_memory_latency_load_l3_miss_latency" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t" + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", + "MetricGroup": "Mem;MemoryBound;MemoryLat", + "MetricName": "tma_info_memory_load_miss_real_latency" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t" + "BriefDescription": "\"Bus lock\" per kilo instruction", + "MetricExpr": "1e3 * SQ_MISC.BUS_LOCK / INST_RETIRED.ANY", + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_mix_bus_lock_pki" }, { - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l3_cache_access_bw", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t" + "BriefDescription": "Un-cacheable retired load per kilo instruction", + "MetricExpr": "1e3 * MEM_LOAD_MISC_RETIRED.UC / INST_RETIRED.ANY", + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_mix_uc_load_pki" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t" + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", + "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Mem;MemoryBW;MemoryBound", + "MetricName": "tma_info_memory_mlp", + "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" }, { "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", @@ -920,42 +994,56 @@ "MetricName": "tma_info_memory_tlb_store_stlb_mpki" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", - "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", + "BriefDescription": "", + "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@)", "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", "MetricName": "tma_info_pipeline_execute" }, { + "BriefDescription": "Instructions per a microcode Assist invocation", + "MetricExpr": "INST_RETIRED.ANY / ASSISTS.ANY", + "MetricGroup": "MicroSeq;Pipeline;Ret;Retire", + "MetricName": "tma_info_pipeline_ipassist", + "MetricThreshold": "tma_info_pipeline_ipassist < 100e3", + "PublicDescription": "Instructions per a microcode Assist invocation. See Assists tree node for details (lower number means higher occurrence rate)" + }, + { "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", "MetricExpr": "tma_retiring * tma_info_thread_slots / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@", "MetricGroup": "Pipeline;Ret", "MetricName": "tma_info_pipeline_retire" }, { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", "MetricGroup": "Power;Summary", - "MetricName": "tma_info_system_average_frequency" + "MetricName": "tma_info_system_core_frequency" }, { - "BriefDescription": "Average CPU Utilization", + "BriefDescription": "Average CPU Utilization (percentage)", "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", "MetricGroup": "HPC;Summary", "MetricName": "tma_info_system_cpu_utilization" }, { + "BriefDescription": "Average number of utilized CPUs", + "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization", + "MetricGroup": "Summary", + "MetricName": "tma_info_system_cpus_utilized" + }, + { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW", "MetricName": "tma_info_system_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" }, { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time", "MetricGroup": "Cor;Flops;HPC", "MetricName": "tma_info_system_gflops", - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width" }, { "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", @@ -1071,8 +1159,8 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", - "MetricExpr": "ICACHE_64B.IFTAG_STALL / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricExpr": "ICACHE_TAG.STALLS / tma_info_thread_clks", + "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS", @@ -1081,7 +1169,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1", @@ -1091,7 +1179,7 @@ "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + L1D_PEND_MISS.FB_FULL_PERIODS) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks)", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L2_HIT_PS", @@ -1101,24 +1189,24 @@ "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", - "MetricExpr": "9 * tma_info_system_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", + "MetricExpr": "9 * tma_info_system_core_frequency * (MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2)) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_memory_latency, tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_cache_memory_latency, tma_mem_latency", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", - "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks", + "MetricExpr": "DECODE.LCP / tma_info_thread_clks", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_lcp", "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -1132,7 +1220,7 @@ "MetricName": "tma_light_operations", "MetricThreshold": "tma_light_operations > 0.6", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST", "ScaleUnit": "100%" }, { @@ -1175,7 +1263,7 @@ "MetricExpr": "(LSD.CYCLES_ACTIVE - LSD.CYCLES_OK) / tma_info_core_core_clks / 2", "MetricGroup": "FetchBW;LSD;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_lsd", - "MetricThreshold": "tma_lsd > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)", + "MetricThreshold": "tma_lsd > 0.15 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit. LSD typically does well sustaining Uop supply. However; in some rare cases; optimal uop-delivery could not be reached for small loops whose size (in terms of number of uops) does not suit well the LSD structure.", "ScaleUnit": "100%" }, @@ -1190,21 +1278,21 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_memory_latency, tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_cache_memory_latency, tma_l3_hit_latency", "ScaleUnit": "100%" }, { @@ -1228,11 +1316,11 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", - "MetricExpr": "tma_retiring * tma_info_thread_slots / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots", + "MetricExpr": "UOPS_RETIRED.SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", "MetricName": "tma_microcode_sequencer", "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1", - "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_ms_switches", + "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_ms_switches", "ScaleUnit": "100%" }, { @@ -1249,7 +1337,7 @@ "MetricExpr": "(IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)", + "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", "ScaleUnit": "100%" }, @@ -1258,16 +1346,16 @@ "MetricExpr": "(cpu@IDQ.MITE_UOPS\\,cmask\\=4@ - cpu@IDQ.MITE_UOPS\\,cmask\\=5@) / tma_info_thread_clks", "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_mite_group", "MetricName": "tma_mite_4wide", - "MetricThreshold": "tma_mite_4wide > 0.05 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35))", + "MetricThreshold": "tma_mite_4wide > 0.05 & (tma_mite > 0.1 & tma_fetch_bandwidth > 0.2)", "ScaleUnit": "100%" }, { - "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued", + "BriefDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles)", "MetricExpr": "UOPS_ISSUED.VECTOR_WIDTH_MISMATCH / UOPS_ISSUED.ANY", "MetricGroup": "TopdownL5;tma_L5_group;tma_issueMV;tma_ports_utilized_0_group", "MetricName": "tma_mixing_vectors", "MetricThreshold": "tma_mixing_vectors > 0.05", - "PublicDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued. Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches", + "PublicDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles). Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches", "ScaleUnit": "100%" }, { @@ -1276,22 +1364,22 @@ "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO", "MetricName": "tma_ms_switches", "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation", + "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions", "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / (tma_retiring * tma_info_thread_slots)", - "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", + "MetricGroup": "Pipeline;TopdownL4;tma_L4_group;tma_other_light_ops_group", "MetricName": "tma_nop_instructions", - "MetricThreshold": "tma_nop_instructions > 0.1 & tma_light_operations > 0.6", + "MetricThreshold": "tma_nop_instructions > 0.1 & (tma_other_light_ops > 0.3 & tma_light_operations > 0.6)", "PublicDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. start address of a function or loop body. Sample with: INST_RETIRED.NOP", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_branch_instructions + tma_nop_instructions))", + "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_branch_instructions))", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_other_light_ops", "MetricThreshold": "tma_other_light_ops > 0.3 & tma_light_operations > 0.6", @@ -1299,6 +1387,22 @@ "ScaleUnit": "100%" }, { + "BriefDescription": "This metric estimates fraction of slots the CPU was stalled due to other cases of misprediction (non-retired x86 branches or other types).", + "MetricExpr": "max(tma_branch_mispredicts * (1 - BR_MISP_RETIRED.ALL_BRANCHES / (INT_MISC.CLEARS_COUNT - MACHINE_CLEARS.COUNT)), 0.0001)", + "MetricGroup": "BrMispredicts;TopdownL3;tma_L3_group;tma_branch_mispredicts_group", + "MetricName": "tma_other_mispredicts", + "MetricThreshold": "tma_other_mispredicts > 0.05 & (tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15)", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Nukes (Machine Clears) not related to memory ordering.", + "MetricExpr": "max(tma_machine_clears * (1 - MACHINE_CLEARS.MEMORY_ORDERING / MACHINE_CLEARS.COUNT), 0.0001)", + "MetricGroup": "Machine_Clears;TopdownL3;tma_L3_group;tma_machine_clears_group", + "MetricName": "tma_other_nukes", + "MetricThreshold": "tma_other_nukes > 0.05 & (tma_machine_clears > 0.1 & tma_bad_speculation > 0.15)", + "ScaleUnit": "100%" + }, + { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)", "MetricExpr": "UOPS_DISPATCHED.PORT_0 / tma_info_core_core_clks", "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", @@ -1326,17 +1430,17 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)", + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU)", "MetricExpr": "UOPS_DISPATCHED.PORT_6 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_6", "MetricThreshold": "tma_port_6 > 0.6", - "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", - "MetricExpr": "((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)", + "MetricExpr": "((tma_ports_utilized_0 * tma_info_thread_clks + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -1345,7 +1449,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / tma_info_thread_clks + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_thread_clks", + "MetricExpr": "(cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_core_bound * RS_EVENTS.EMPTY_CYCLES) / tma_info_thread_clks * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1375,7 +1479,7 @@ "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", - "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", + "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", "PublicDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Sample with: UOPS_EXECUTED.CYCLES_GE_3", "ScaleUnit": "100%" }, @@ -1393,18 +1497,18 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations", "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / tma_info_thread_clks", - "MetricGroup": "PortsUtil;TopdownL5;tma_L5_group;tma_issueSO;tma_ports_utilized_0_group", + "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group;tma_issueSO", "MetricName": "tma_serializing_operation", - "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))", + "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. Sample with: RESOURCE_STALLS.SCOREBOARD. Related metrics: tma_ms_switches", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions", "MetricExpr": "140 * MISC_RETIRED.PAUSE_INST / tma_info_thread_clks", - "MetricGroup": "TopdownL6;tma_L6_group;tma_serializing_operation_group", + "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group", "MetricName": "tma_slow_pause", - "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))))", + "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions. Sample with: MISC_RETIRED.PAUSE_INST", "ScaleUnit": "100%" }, @@ -1433,7 +1537,7 @@ "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%" }, { @@ -1501,10 +1605,10 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", "MetricExpr": "10 * BACLEARS.ANY / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", + "MetricGroup": "BigFootprint;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", "MetricName": "tma_unknown_branches", "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit). Sample with: BACLEARS.ANY", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit) hence called Unknown Branches. Sample with: BACLEARS.ANY", "ScaleUnit": "100%" }, { diff --git a/tools/perf/pmu-events/arch/x86/icelake/memory.json b/tools/perf/pmu-events/arch/x86/icelake/memory.json index e8d2ec1c02..f847632205 100644 --- a/tools/perf/pmu-events/arch/x86/icelake/memory.json +++ b/tools/perf/pmu-events/arch/x86/icelake/memory.json @@ -259,6 +259,7 @@ "BriefDescription": "Number of times an RTM execution aborted.", "EventCode": "0xc9", "EventName": "RTM_RETIRED.ABORTED", + "PEBS": "1", "PublicDescription": "Counts the number of times RTM abort was triggered.", "SampleAfterValue": "100003", "UMask": "0x4" diff --git a/tools/perf/pmu-events/arch/x86/icelake/metricgroups.json b/tools/perf/pmu-events/arch/x86/icelake/metricgroups.json index a151ba9ccc..5452a1448d 100644 --- a/tools/perf/pmu-events/arch/x86/icelake/metricgroups.json +++ b/tools/perf/pmu-events/arch/x86/icelake/metricgroups.json @@ -2,10 +2,10 @@ "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "CodeGen": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -25,7 +25,9 @@ "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -63,8 +65,10 @@ "tma_L5_group": "Metrics for top-down breakdown at level 5", "tma_L6_group": "Metrics for top-down breakdown at level 6", "tma_alu_op_utilization_group": "Metrics contributing to tma_alu_op_utilization category", + "tma_assists_group": "Metrics contributing to tma_assists category", "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category", "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category", + "tma_branch_mispredicts_group": "Metrics contributing to tma_branch_mispredicts category", "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category", "tma_core_bound_group": "Metrics contributing to tma_core_bound category", "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category", @@ -77,9 +81,9 @@ "tma_frontend_bound_group": "Metrics contributing to tma_frontend_bound category", "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category", "tma_issue2P": "Metrics related by the issue $issue2P", - "tma_issueBC": "Metrics related by the issue $issueBC", "tma_issueBM": "Metrics related by the issue $issueBM", "tma_issueBW": "Metrics related by the issue $issueBW", + "tma_issueComp": "Metrics related by the issue $issueComp", "tma_issueD0": "Metrics related by the issue $issueD0", "tma_issueFB": "Metrics related by the issue $issueFB", "tma_issueFL": "Metrics related by the issue $issueFL", @@ -99,10 +103,12 @@ "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category", "tma_light_operations_group": "Metrics contributing to tma_light_operations category", "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category", + "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category", "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", "tma_mite_group": "Metrics contributing to tma_mite category", + "tma_other_light_ops_group": "Metrics contributing to tma_other_light_ops category", "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category", "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category", "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category", diff --git a/tools/perf/pmu-events/arch/x86/icelake/other.json b/tools/perf/pmu-events/arch/x86/icelake/other.json index cfb5906329..4fdc873395 100644 --- a/tools/perf/pmu-events/arch/x86/icelake/other.json +++ b/tools/perf/pmu-events/arch/x86/icelake/other.json @@ -19,7 +19,7 @@ "BriefDescription": "Core cycles where the core was running in a manner where Turbo may be clipped to the AVX512 turbo schedule.", "EventCode": "0x28", "EventName": "CORE_POWER.LVL2_TURBO_LICENSE", - "PublicDescription": "Core cycles where the core was running with power-delivery for license level 2 (introduced in Skylake Server microarchtecture). This includes high current AVX 512-bit instructions.", + "PublicDescription": "Core cycles where the core was running with power-delivery for license level 2 (introduced in Skylake Server microarchitecture). This includes high current AVX 512-bit instructions.", "SampleAfterValue": "200003", "UMask": "0x20" }, diff --git a/tools/perf/pmu-events/arch/x86/icelake/pipeline.json b/tools/perf/pmu-events/arch/x86/icelake/pipeline.json index 375b78044f..c7313fd4fd 100644 --- a/tools/perf/pmu-events/arch/x86/icelake/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/icelake/pipeline.json @@ -529,7 +529,7 @@ "BriefDescription": "Cycles when Reservation Station (RS) is empty for the thread", "EventCode": "0x5e", "EventName": "RS_EVENTS.EMPTY_CYCLES", - "PublicDescription": "Counts cycles during which the reservation station (RS) is empty for this logical processor. This is usually caused when the front-end pipeline runs into stravation periods (e.g. branch mispredictions or i-cache misses)", + "PublicDescription": "Counts cycles during which the reservation station (RS) is empty for this logical processor. This is usually caused when the front-end pipeline runs into starvation periods (e.g. branch mispredictions or i-cache misses)", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -553,14 +553,6 @@ "UMask": "0x2" }, { - "BriefDescription": "TMA slots wasted due to incorrect speculation by branch mispredictions", - "EventCode": "0xa4", - "EventName": "TOPDOWN.BR_MISPREDICT_SLOTS", - "PublicDescription": "Number of TMA slots that were wasted due to incorrect speculation by branch mispredictions. This event estimates number of operations that were issued but not retired from the speculative path as well as the out-of-order engine recovery past a branch misprediction.", - "SampleAfterValue": "10000003", - "UMask": "0x8" - }, - { "BriefDescription": "TMA slots available for an unhalted logical processor. Fixed counter - architectural event", "EventName": "TOPDOWN.SLOTS", "PublicDescription": "Number of available slots for an unhalted logical processor. The event increments by machine-width of the narrowest pipeline as employed by the Top-down Microarchitecture Analysis method (TMA). The count is distributed among unhalted logical processors (hyper-threads) who share the same physical core. Software can use this event as the denominator for the top-level metrics of the TMA method. This architectural event is counted on a designated fixed counter (Fixed Counter 3).", diff --git a/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json b/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json index 71d78a7841..c015b8277d 100644 --- a/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json @@ -302,12 +302,12 @@ "MetricExpr": "(UOPS_DISPATCHED.PORT_0 + UOPS_DISPATCHED.PORT_1 + UOPS_DISPATCHED.PORT_5 + UOPS_DISPATCHED.PORT_6) / (4 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", - "MetricThreshold": "tma_alu_op_utilization > 0.6", + "MetricThreshold": "tma_alu_op_utilization > 0.4", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", - "MetricExpr": "100 * ASSISTS.ANY / tma_info_thread_slots", + "MetricExpr": "34 * ASSISTS.ANY / tma_info_thread_slots", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_assists", "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", @@ -317,7 +317,7 @@ { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", "DefaultMetricgroupName": "TopdownL1", - "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * cpu@INT_MISC.RECOVERY_CYCLES\\,cmask\\=1\\,edge@ / tma_info_thread_slots", + "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / tma_info_thread_slots", "MetricGroup": "Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", "MetricThreshold": "tma_backend_bound > 0.2", @@ -339,7 +339,7 @@ { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions.", "MetricExpr": "tma_light_operations * BR_INST_RETIRED.ALL_BRANCHES / (tma_retiring * tma_info_thread_slots)", - "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", + "MetricGroup": "Branches;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_branch_instructions", "MetricThreshold": "tma_branch_instructions > 0.1 & tma_light_operations > 0.6", "ScaleUnit": "100%" @@ -384,7 +384,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(44 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 43.5 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "MetricExpr": "(44 * tma_info_system_core_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 43.5 * tma_info_system_core_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_contested_accesses", "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -404,7 +404,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "43.5 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (1 - OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "MetricExpr": "43.5 * tma_info_system_core_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (1 - OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_data_sharing", "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -416,7 +416,7 @@ "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group", "MetricName": "tma_decoder0_alone", - "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35))", + "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & tma_fetch_bandwidth > 0.2)", "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. Related metrics: tma_few_uops_instructions", "ScaleUnit": "100%" }, @@ -444,7 +444,7 @@ "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / tma_info_core_core_clks / 2", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)", + "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", "ScaleUnit": "100%" }, @@ -463,7 +463,7 @@ "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group", "MetricName": "tma_dtlb_load", "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization", "ScaleUnit": "100%" }, { @@ -472,12 +472,12 @@ "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group", "MetricName": "tma_dtlb_store", "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", - "MetricExpr": "48 * tma_info_system_average_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks", + "MetricExpr": "48 * tma_info_system_core_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group", "MetricName": "tma_false_sharing", "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -490,7 +490,7 @@ "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%" }, { @@ -498,7 +498,7 @@ "MetricExpr": "max(0, tma_frontend_bound - tma_fetch_latency)", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.2", "MetricgroupNoGroup": "TopdownL2", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" @@ -532,6 +532,15 @@ "ScaleUnit": "100%" }, { + "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists", + "MetricExpr": "34 * ASSISTS.FP / tma_info_thread_slots", + "MetricGroup": "HPC;TopdownL5;tma_L5_group;tma_assists_group", + "MetricName": "tma_fp_assists", + "MetricThreshold": "tma_fp_assists > 0.1", + "PublicDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists. FP Assist may apply when working with very small floating point values (so-called Denormals).", + "ScaleUnit": "100%" + }, + { "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired", "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", @@ -594,13 +603,13 @@ "MetricName": "tma_heavy_operations", "MetricThreshold": "tma_heavy_operations > 0.1", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .)", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", - "MetricExpr": "ICACHE_16B.IFDATA_STALL / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricExpr": "ICACHE_DATA.STALLS / tma_info_thread_clks", + "MetricGroup": "BigFootprint;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_icache_misses", "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS", @@ -609,7 +618,7 @@ { "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricExpr": "tma_info_bottleneck_mispredictions * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES / 100", "MetricGroup": "Bad;BrMispredicts;tma_issueBM", "MetricName": "tma_info_bad_spec_branch_misprediction_cost", "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers" @@ -644,12 +653,39 @@ }, { "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", - "MetricExpr": "tma_info_core_ipmispredict", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;BadSpec;BrMispredicts", "MetricName": "tma_info_bad_spec_ipmispredict", "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200" }, { + "BriefDescription": "Speculative to Retired ratio of all clears (covering mispredicts and nukes)", + "MetricExpr": "INT_MISC.CLEARS_COUNT / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)", + "MetricGroup": "BrMispredicts", + "MetricName": "tma_info_bad_spec_spec_clears_ratio" + }, + { + "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", + "MetricExpr": "(100 * (1 - max(0, topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / slots - (CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES) * (topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / slots)) / (((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + max(0, topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / slots - (CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES) * (topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / slots)) * RS_EVENTS.EMPTY_CYCLES) / CPU_CLK_UNHALTED.THREAD * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CPU_CLK_UNHALTED.THREAD * CPU_CLK_UNHALTED.THREAD + (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) if max(0, topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / slots - (CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES) * (topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / slots)) < (((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + max(0, topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / slots - (CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES) * (topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / slots)) * RS_EVENTS.EMPTY_CYCLES) / CPU_CLK_UNHALTED.THREAD * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CPU_CLK_UNHALTED.THREAD * CPU_CLK_UNHALTED.THREAD + (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)", + "MetricGroup": "Cor;SMT;TopdownL1;tma_L1_group", + "MetricName": "tma_info_botlnk_core_bound_likely", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck.", + "MetricExpr": "100 * (100 * ((5 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE - INT_MISC.UOP_DROPPING) / slots * (DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) / (ICACHE_DATA.STALLS / CPU_CLK_UNHALTED.THREAD + ICACHE_TAG.STALLS / CPU_CLK_UNHALTED.THREAD + (INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD + 10 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) + min(3 * IDQ.MS_SWITCHES / CPU_CLK_UNHALTED.THREAD, 1) + DECODE.LCP / CPU_CLK_UNHALTED.THREAD + DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) + max(0, topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / slots - (5 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE - INT_MISC.UOP_DROPPING) / slots) * ((IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / (CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else CPU_CLK_UNHALTED.THREAD) / 2) / ((IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / (CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else CPU_CLK_UNHALTED.THREAD) / 2 + (IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / (CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else CPU_CLK_UNHALTED.THREAD) / 2)))", + "MetricGroup": "DSBmiss;Fed;TopdownL1;tma_L1_group", + "MetricName": "tma_info_botlnk_dsb_misses", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck.", + "MetricExpr": "100 * (100 * ((5 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE - INT_MISC.UOP_DROPPING) / slots * (ICACHE_DATA.STALLS / CPU_CLK_UNHALTED.THREAD) / (ICACHE_DATA.STALLS / CPU_CLK_UNHALTED.THREAD + ICACHE_TAG.STALLS / CPU_CLK_UNHALTED.THREAD + (INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD + 10 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) + min(3 * IDQ.MS_SWITCHES / CPU_CLK_UNHALTED.THREAD, 1) + DECODE.LCP / CPU_CLK_UNHALTED.THREAD + DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD)))", + "MetricGroup": "Fed;FetchLat;IcMiss;TopdownL1;tma_L1_group", + "MetricName": "tma_info_botlnk_ic_misses", + "MetricgroupNoGroup": "TopdownL1" + }, + { "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)", @@ -676,67 +712,102 @@ "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: " }, { + "BriefDescription": "Total pipeline cost of \"useful operations\" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead.", + "MetricExpr": "100 * (tma_retiring - (BR_INST_RETIRED.ALL_BRANCHES + BR_INST_RETIRED.NEAR_CALL) / tma_info_thread_slots - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", + "MetricGroup": "Ret", + "MetricName": "tma_info_bottleneck_base_non_br", + "MetricThreshold": "tma_info_bottleneck_base_non_br > 20" + }, + { "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", - "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC", + "MetricGroup": "BigFootprint;Fed;Frontend;IcMiss;MemoryTLB", "MetricName": "tma_info_bottleneck_big_code", - "MetricThreshold": "tma_info_bottleneck_big_code > 20", - "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_bottleneck_branching_overhead" + "MetricThreshold": "tma_info_bottleneck_big_code > 20" }, { "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", - "MetricExpr": "100 * ((BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_thread_slots)", - "MetricGroup": "Ret;tma_issueBC", + "MetricExpr": "100 * ((BR_INST_RETIRED.ALL_BRANCHES + BR_INST_RETIRED.NEAR_CALL) / tma_info_thread_slots)", + "MetricGroup": "Ret", "MetricName": "tma_info_bottleneck_branching_overhead", - "MetricThreshold": "tma_info_bottleneck_branching_overhead > 10", - "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_bottleneck_big_code" + "MetricThreshold": "tma_info_bottleneck_branching_overhead > 5" + }, + { + "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))", + "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", + "MetricName": "tma_info_bottleneck_cache_memory_bandwidth", + "MetricThreshold": "tma_info_bottleneck_cache_memory_bandwidth > 20", + "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" + }, + { + "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", + "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", + "MetricName": "tma_info_bottleneck_cache_memory_latency", + "MetricThreshold": "tma_info_bottleneck_cache_memory_latency > 20", + "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency" + }, + { + "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", + "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", + "MetricGroup": "Cor;tma_issueComp", + "MetricName": "tma_info_bottleneck_compute_bound_est", + "MetricThreshold": "tma_info_bottleneck_compute_bound_est > 20", + "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: " }, { "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code", + "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code", "MetricGroup": "Fed;FetchBW;Frontend", "MetricName": "tma_info_bottleneck_instruction_fetch_bw", "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20" }, { - "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))", - "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", - "MetricName": "tma_info_bottleneck_memory_bandwidth", - "MetricThreshold": "tma_info_bottleneck_memory_bandwidth > 20", - "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" + "BriefDescription": "Total pipeline cost of irregular execution (e.g", + "MetricExpr": "100 * (tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + tma_core_bound * RS_EVENTS.EMPTY_CYCLES / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", + "MetricGroup": "Bad;Cor;Ret;tma_issueMS", + "MetricName": "tma_info_bottleneck_irregular_overhead", + "MetricThreshold": "tma_info_bottleneck_irregular_overhead > 10", + "PublicDescription": "Total pipeline cost of irregular execution (e.g. FP-assists in HPC, Wait time with work imbalance multithreaded workloads, overhead in system services or virtualized environments). Related metrics: tma_microcode_sequencer, tma_ms_switches" }, { "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", + "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", "MetricName": "tma_info_bottleneck_memory_data_tlbs", "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20", - "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store" + "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_synchronization" }, { - "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound))", - "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", - "MetricName": "tma_info_bottleneck_memory_latency", - "MetricThreshold": "tma_info_bottleneck_memory_latency > 20", - "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency" + "BriefDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) * tma_remote_cache / (tma_local_mem + tma_remote_cache + tma_remote_mem) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))", + "MetricGroup": "Mem;Offcore;tma_issueTLB", + "MetricName": "tma_info_bottleneck_memory_synchronization", + "MetricThreshold": "tma_info_bottleneck_memory_synchronization > 10", + "PublicDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs" }, { "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", + "MetricExpr": "100 * (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", "MetricName": "tma_info_bottleneck_mispredictions", "MetricThreshold": "tma_info_bottleneck_mispredictions > 20", "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers" }, { + "BriefDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class)", + "MetricExpr": "100 - (tma_info_bottleneck_big_code + tma_info_bottleneck_instruction_fetch_bw + tma_info_bottleneck_mispredictions + tma_info_bottleneck_cache_memory_bandwidth + tma_info_bottleneck_cache_memory_latency + tma_info_bottleneck_memory_data_tlbs + tma_info_bottleneck_memory_synchronization + tma_info_bottleneck_compute_bound_est + tma_info_bottleneck_irregular_overhead + tma_info_bottleneck_branching_overhead + tma_info_bottleneck_base_non_br)", + "MetricGroup": "Cor;Offcore", + "MetricName": "tma_info_bottleneck_other_bottlenecks", + "MetricThreshold": "tma_info_bottleneck_other_bottlenecks > 20", + "PublicDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls." + }, + { "BriefDescription": "Fraction of branches that are CALL or RET", "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches", @@ -768,7 +839,7 @@ }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", - "MetricExpr": "CPU_CLK_UNHALTED.DISTRIBUTED", + "MetricExpr": "(CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else tma_info_thread_clks)", "MetricGroup": "SMT", "MetricName": "tma_info_core_core_clks" }, @@ -779,8 +850,14 @@ "MetricName": "tma_info_core_coreipc" }, { + "BriefDescription": "uops Executed per Cycle", + "MetricExpr": "UOPS_EXECUTED.THREAD / tma_info_thread_clks", + "MetricGroup": "Power", + "MetricName": "tma_info_core_epc" + }, + { "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks", "MetricGroup": "Flops;Ret", "MetricName": "tma_info_core_flopc" }, @@ -792,19 +869,12 @@ "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)", + "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "tma_info_core_ilp" }, { - "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear)", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BadSpec;BrMispredicts;TopdownL1;tma_L1_group", - "MetricName": "tma_info_core_ipmispredict", - "MetricgroupNoGroup": "TopdownL1" - }, - { "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", "MetricExpr": "IDQ.DSB_UOPS / UOPS_ISSUED.ANY", "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", @@ -874,7 +944,7 @@ "MetricGroup": "Flops;InsType", "MetricName": "tma_info_inst_mix_iparith", "MetricThreshold": "tma_info_inst_mix_iparith < 10", - "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." + "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW." }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", @@ -882,7 +952,7 @@ "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx128", "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10", - "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", @@ -890,7 +960,7 @@ "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx256", "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10", - "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)", @@ -898,7 +968,7 @@ "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx512", "MetricThreshold": "tma_info_inst_mix_iparith_avx512 < 10", - "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", @@ -906,7 +976,7 @@ "MetricGroup": "Flops;FpScalar;InsType", "MetricName": "tma_info_inst_mix_iparith_scalar_dp", "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10", - "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", @@ -914,7 +984,7 @@ "MetricGroup": "Flops;FpScalar;InsType", "MetricName": "tma_info_inst_mix_iparith_scalar_sp", "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10", - "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", @@ -932,7 +1002,7 @@ }, { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", "MetricGroup": "Flops;InsType", "MetricName": "tma_info_inst_mix_ipflop", "MetricThreshold": "tma_info_inst_mix_ipflop < 10" @@ -945,6 +1015,12 @@ "MetricThreshold": "tma_info_inst_mix_ipload < 3" }, { + "BriefDescription": "Instructions per PAUSE (lower number means higher occurrence rate)", + "MetricExpr": "tma_info_inst_mix_instructions / MISC_RETIRED.PAUSE_INST", + "MetricGroup": "Flops;FpVector;InsType", + "MetricName": "tma_info_inst_mix_ippause" + }, + { "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES", "MetricGroup": "InsType", @@ -967,16 +1043,30 @@ "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_lcp" }, { + "BriefDescription": "\"Bus lock\" per kilo instruction", + "MetricExpr": "tma_info_memory_mix_bus_lock_pki", + "MetricGroup": "Mem;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_bus_lock_pki", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "tma_info_memory_tlb_code_stlb_mpki", + "MetricGroup": "Fed;MemoryTLB;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_code_stlb_mpki", + "MetricgroupNoGroup": "TopdownL1" + }, + { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l1d_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l1d_cache_fill_bw" + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t" }, { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l2_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l2_cache_fill_bw" + "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t" }, { "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction", @@ -992,124 +1082,227 @@ }, { "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l3_cache_access_bw", "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_memory_core_l3_cache_access_bw" + "MetricName": "tma_info_memory_core_l3_cache_access_bw_2t" }, { "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l3_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l3_cache_fill_bw" + "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t" + }, + { + "BriefDescription": "Average Parallel L2 cache miss data reads", + "MetricExpr": "tma_info_memory_latency_data_l2_mlp", + "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_data_l2_mlp", + "MetricgroupNoGroup": "TopdownL1" }, { "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_fb_hpki" }, { + "BriefDescription": "", + "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l1d_cache_fill_bw" + }, + { + "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / (duration_time * 1e3 / 1e3)", + "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t", + "MetricgroupNoGroup": "TopdownL1" + }, + { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l1mpki" }, { "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l1mpki_load" }, { + "BriefDescription": "", + "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l2_cache_fill_bw" + }, + { + "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / (duration_time * 1e3 / 1e3)", + "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_l2_cache_fill_bw_2t", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction", + "MetricExpr": "1e3 * L2_LINES_OUT.NON_SILENT / INST_RETIRED.ANY", + "MetricGroup": "L2Evicts;Mem;Server;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_l2_evictions_nonsilent_pki", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo instruction where the evicted lines are dropped (no writeback to L3 or memory)", + "MetricExpr": "1e3 * L2_LINES_OUT.SILENT / INST_RETIRED.ANY", + "MetricGroup": "L2Evicts;Mem;Server;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_l2_evictions_silent_pki", + "MetricgroupNoGroup": "TopdownL1" + }, + { "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l2hpki_load" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "Backend;CacheMisses;Mem", + "MetricGroup": "Backend;CacheHits;Mem", "MetricName": "tma_info_memory_l2mpki" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * (OFFCORE_REQUESTS.ALL_DATA_RD - OFFCORE_REQUESTS.DEMAND_DATA_RD + L2_RQSTS.ALL_DEMAND_MISS + L2_RQSTS.SWPF_MISS) / tma_info_inst_mix_instructions", - "MetricGroup": "CacheMisses;Mem;Offcore", + "MetricGroup": "CacheHits;Mem;Offcore", "MetricName": "tma_info_memory_l2mpki_all" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l2mpki_load" }, { - "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_memory_l3mpki" + "BriefDescription": "", + "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_l3_cache_access_bw" }, { - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", - "MetricGroup": "Mem;MemoryBound;MemoryLat", - "MetricName": "tma_info_memory_load_miss_real_latency" + "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / (duration_time * 1e3 / 1e3)", + "MetricGroup": "Mem;MemoryBW;Offcore;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_l3_cache_access_bw_2t", + "MetricgroupNoGroup": "TopdownL1" }, { - "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", - "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBW;MemoryBound", - "MetricName": "tma_info_memory_mlp", - "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" + "BriefDescription": "", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l3_cache_fill_bw" + }, + { + "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / (duration_time * 1e3 / 1e3)", + "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_l3_cache_fill_bw_2t", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_l3mpki" }, { "BriefDescription": "Average Parallel L2 cache miss data reads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_memory_oro_data_l2_mlp" + "MetricName": "tma_info_memory_latency_data_l2_mlp" }, { "BriefDescription": "Average Latency for L2 cache miss demand Loads", - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", + "MetricExpr": "tma_info_memory_load_l2_miss_latency", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_memory_oro_load_l2_miss_latency" + "MetricName": "tma_info_memory_latency_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=1@", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_memory_oro_load_l2_mlp" + "MetricName": "tma_info_memory_latency_load_l2_mlp" }, { "BriefDescription": "Average Latency for L3 cache miss demand Loads", - "MetricExpr": "cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,umask\\=0x10@ / OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD", + "MetricExpr": "tma_info_memory_load_l3_miss_latency", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_memory_oro_load_l3_miss_latency" + "MetricName": "tma_info_memory_latency_load_l3_miss_latency" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t" + "BriefDescription": "Average Latency for L2 cache miss demand Loads", + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", + "MetricGroup": "Memory_Lat;Offcore;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_load_l2_miss_latency", + "MetricgroupNoGroup": "TopdownL1" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t" + "BriefDescription": "Average Parallel L2 cache miss demand Loads", + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=0x1@", + "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_load_l2_mlp", + "MetricgroupNoGroup": "TopdownL1" }, { - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l3_cache_access_bw", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t" + "BriefDescription": "Average Latency for L3 cache miss demand Loads", + "MetricExpr": "cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,umask\\=0x10@ / OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD", + "MetricGroup": "Memory_Lat;Offcore;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_load_l3_miss_latency", + "MetricgroupNoGroup": "TopdownL1" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t" + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", + "MetricGroup": "Mem;MemoryBound;MemoryLat", + "MetricName": "tma_info_memory_load_miss_real_latency" + }, + { + "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "tma_info_memory_tlb_load_stlb_mpki", + "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_load_stlb_mpki", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "\"Bus lock\" per kilo instruction", + "MetricExpr": "1e3 * SQ_MISC.BUS_LOCK / INST_RETIRED.ANY", + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_mix_bus_lock_pki" + }, + { + "BriefDescription": "Un-cacheable retired load per kilo instruction", + "MetricExpr": "tma_info_memory_uc_load_pki", + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_mix_uc_load_pki" + }, + { + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", + "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Mem;MemoryBW;MemoryBound", + "MetricName": "tma_info_memory_mlp", + "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" + }, + { + "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", + "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING) / (2 * (CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else CPU_CLK_UNHALTED.THREAD))", + "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_page_walks_utilization", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "tma_info_memory_tlb_store_stlb_mpki", + "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_store_stlb_mpki", + "MetricgroupNoGroup": "TopdownL1" }, { "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", @@ -1137,54 +1330,77 @@ "MetricName": "tma_info_memory_tlb_store_stlb_mpki" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", - "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", + "BriefDescription": "Un-cacheable retired load per kilo instruction", + "MetricExpr": "1e3 * MEM_LOAD_MISC_RETIRED.UC / INST_RETIRED.ANY", + "MetricGroup": "Mem;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_uc_load_pki", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "", + "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@)", "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", "MetricName": "tma_info_pipeline_execute" }, { + "BriefDescription": "Instructions per a microcode Assist invocation", + "MetricExpr": "INST_RETIRED.ANY / ASSISTS.ANY", + "MetricGroup": "MicroSeq;Pipeline;Ret;Retire", + "MetricName": "tma_info_pipeline_ipassist", + "MetricThreshold": "tma_info_pipeline_ipassist < 100e3", + "PublicDescription": "Instructions per a microcode Assist invocation. See Assists tree node for details (lower number means higher occurrence rate)" + }, + { "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", "MetricExpr": "tma_retiring * tma_info_thread_slots / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@", "MetricGroup": "Pipeline;Ret", "MetricName": "tma_info_pipeline_retire" }, { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", "MetricGroup": "Power;Summary", - "MetricName": "tma_info_system_average_frequency" + "MetricName": "tma_info_system_core_frequency" }, { - "BriefDescription": "Average CPU Utilization", + "BriefDescription": "Average CPU Utilization (percentage)", "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", "MetricGroup": "HPC;Summary", "MetricName": "tma_info_system_cpu_utilization" }, { + "BriefDescription": "Average number of utilized CPUs", + "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization", + "MetricGroup": "Summary", + "MetricName": "tma_info_system_cpus_utilized" + }, + { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW", "MetricName": "tma_info_system_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" }, { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time", "MetricGroup": "Cor;Flops;HPC", "MetricName": "tma_info_system_gflops", - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width" }, { "BriefDescription": "Average IO (network or disk) Bandwidth Use for Reads [GB / sec]", - "MetricExpr": "(UNC_CHA_TOR_INSERTS.IO_HIT_ITOM + UNC_CHA_TOR_INSERTS.IO_MISS_ITOM + UNC_CHA_TOR_INSERTS.IO_HIT_ITOMCACHENEAR + UNC_CHA_TOR_INSERTS.IO_MISS_ITOMCACHENEAR) * 64 / 1e9 / duration_time", - "MetricGroup": "IoBW;Mem;Server;SoC", - "MetricName": "tma_info_system_io_read_bw" + "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR * 64 / 1e9 / duration_time", + "MetricGroup": "IoBW;MemOffcore;Server;SoC", + "MetricName": "tma_info_system_io_read_bw", + "PublicDescription": "Average IO (network or disk) Bandwidth Use for Reads [GB / sec]. Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU" }, { "BriefDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]", - "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR * 64 / 1e9 / duration_time", - "MetricGroup": "IoBW;Mem;Server;SoC", - "MetricName": "tma_info_system_io_write_bw" + "MetricExpr": "(UNC_CHA_TOR_INSERTS.IO_HIT_ITOM + UNC_CHA_TOR_INSERTS.IO_MISS_ITOM + UNC_CHA_TOR_INSERTS.IO_HIT_ITOMCACHENEAR + UNC_CHA_TOR_INSERTS.IO_MISS_ITOMCACHENEAR) * 64 / 1e9 / duration_time", + "MetricGroup": "IoBW;MemOffcore;Server;SoC", + "MetricName": "tma_info_system_io_write_bw", + "PublicDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]. Bandwidth of IO writes that are initiated by end device controllers that are writing memory to the CPU" }, { "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", @@ -1209,7 +1425,7 @@ { "BriefDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]", "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_DDR / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_DDR) / cha_0@event\\=0x0@", - "MetricGroup": "Mem;MemoryLat;Server;SoC", + "MetricGroup": "MemOffcore;MemoryLat;Server;SoC", "MetricName": "tma_info_system_mem_dram_read_latency", "PublicDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches" }, @@ -1223,7 +1439,7 @@ { "BriefDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]", "MetricExpr": "(1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM) / cha_0@event\\=0x0@ if #has_pmem > 0 else 0)", - "MetricGroup": "Mem;MemoryLat;Server;SoC", + "MetricGroup": "MemOffcore;MemoryLat;Server;SoC", "MetricName": "tma_info_system_mem_pmm_read_latency", "PublicDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches" }, @@ -1237,13 +1453,13 @@ { "BriefDescription": "Average 3DXP Memory Bandwidth Use for reads [GB / sec]", "MetricExpr": "(64 * UNC_M_PMM_RPQ_INSERTS / 1e9 / duration_time if #has_pmem > 0 else 0)", - "MetricGroup": "Mem;MemoryBW;Server;SoC", + "MetricGroup": "MemOffcore;MemoryBW;Server;SoC", "MetricName": "tma_info_system_pmm_read_bw" }, { "BriefDescription": "Average 3DXP Memory Bandwidth Use for Writes [GB / sec]", "MetricExpr": "(64 * UNC_M_PMM_WPQ_INSERTS / 1e9 / duration_time if #has_pmem > 0 else 0)", - "MetricGroup": "Mem;MemoryBW;Server;SoC", + "MetricGroup": "MemOffcore;MemoryBW;Server;SoC", "MetricName": "tma_info_system_pmm_write_bw" }, { @@ -1288,6 +1504,12 @@ "MetricName": "tma_info_system_turbo_utilization" }, { + "BriefDescription": "Measured Average Uncore Frequency for the SoC [GHz]", + "MetricExpr": "tma_info_system_socket_clks / 1e9 / duration_time", + "MetricGroup": "SoC", + "MetricName": "tma_info_system_uncore_frequency" + }, + { "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", "MetricExpr": "CPU_CLK_UNHALTED.THREAD", "MetricGroup": "Pipeline", @@ -1340,8 +1562,8 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", - "MetricExpr": "ICACHE_64B.IFTAG_STALL / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricExpr": "ICACHE_TAG.STALLS / tma_info_thread_clks", + "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS", @@ -1350,7 +1572,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1", @@ -1360,7 +1582,7 @@ "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + L1D_PEND_MISS.FB_FULL_PERIODS) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks)", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L2_HIT_PS", @@ -1370,24 +1592,24 @@ "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", - "MetricExpr": "19 * tma_info_system_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", + "MetricExpr": "19 * tma_info_system_core_frequency * (MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2)) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_memory_latency, tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_cache_memory_latency, tma_mem_latency", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", - "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks", + "MetricExpr": "DECODE.LCP / tma_info_thread_clks", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_lcp", "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -1401,7 +1623,7 @@ "MetricName": "tma_light_operations", "MetricThreshold": "tma_light_operations > 0.6", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST", "ScaleUnit": "100%" }, { @@ -1431,10 +1653,10 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory", - "MetricExpr": "43.5 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "MetricExpr": "43.5 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Server;TopdownL5;tma_L5_group;tma_mem_latency_group", - "MetricName": "tma_local_dram", - "MetricThreshold": "tma_local_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", + "MetricName": "tma_local_mem", + "MetricThreshold": "tma_local_mem > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory. Caching will improve the latency and increase performance. Sample with: MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM_PS", "ScaleUnit": "100%" }, @@ -1459,21 +1681,21 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_memory_latency, tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_cache_memory_latency, tma_l3_hit_latency", "ScaleUnit": "100%" }, { @@ -1497,11 +1719,11 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", - "MetricExpr": "tma_retiring * tma_info_thread_slots / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots", + "MetricExpr": "UOPS_RETIRED.SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", "MetricName": "tma_microcode_sequencer", "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1", - "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_ms_switches", + "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_ms_switches", "ScaleUnit": "100%" }, { @@ -1518,7 +1740,7 @@ "MetricExpr": "(IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)", + "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", "ScaleUnit": "100%" }, @@ -1527,16 +1749,16 @@ "MetricExpr": "(cpu@IDQ.MITE_UOPS\\,cmask\\=4@ - cpu@IDQ.MITE_UOPS\\,cmask\\=5@) / tma_info_thread_clks", "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_mite_group", "MetricName": "tma_mite_4wide", - "MetricThreshold": "tma_mite_4wide > 0.05 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35))", + "MetricThreshold": "tma_mite_4wide > 0.05 & (tma_mite > 0.1 & tma_fetch_bandwidth > 0.2)", "ScaleUnit": "100%" }, { - "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued", + "BriefDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles)", "MetricExpr": "UOPS_ISSUED.VECTOR_WIDTH_MISMATCH / UOPS_ISSUED.ANY", "MetricGroup": "TopdownL5;tma_L5_group;tma_issueMV;tma_ports_utilized_0_group", "MetricName": "tma_mixing_vectors", "MetricThreshold": "tma_mixing_vectors > 0.05", - "PublicDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued. Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches", + "PublicDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles). Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches", "ScaleUnit": "100%" }, { @@ -1545,22 +1767,22 @@ "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO", "MetricName": "tma_ms_switches", "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation", + "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions", "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / (tma_retiring * tma_info_thread_slots)", - "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", + "MetricGroup": "Pipeline;TopdownL4;tma_L4_group;tma_other_light_ops_group", "MetricName": "tma_nop_instructions", - "MetricThreshold": "tma_nop_instructions > 0.1 & tma_light_operations > 0.6", + "MetricThreshold": "tma_nop_instructions > 0.1 & (tma_other_light_ops > 0.3 & tma_light_operations > 0.6)", "PublicDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. start address of a function or loop body. Sample with: INST_RETIRED.NOP", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_branch_instructions + tma_nop_instructions))", + "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_branch_instructions))", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_other_light_ops", "MetricThreshold": "tma_other_light_ops > 0.3 & tma_light_operations > 0.6", @@ -1568,8 +1790,24 @@ "ScaleUnit": "100%" }, { + "BriefDescription": "This metric estimates fraction of slots the CPU was stalled due to other cases of misprediction (non-retired x86 branches or other types).", + "MetricExpr": "max(tma_branch_mispredicts * (1 - BR_MISP_RETIRED.ALL_BRANCHES / (INT_MISC.CLEARS_COUNT - MACHINE_CLEARS.COUNT)), 0.0001)", + "MetricGroup": "BrMispredicts;TopdownL3;tma_L3_group;tma_branch_mispredicts_group", + "MetricName": "tma_other_mispredicts", + "MetricThreshold": "tma_other_mispredicts > 0.05 & (tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15)", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Nukes (Machine Clears) not related to memory ordering.", + "MetricExpr": "max(tma_machine_clears * (1 - MACHINE_CLEARS.MEMORY_ORDERING / MACHINE_CLEARS.COUNT), 0.0001)", + "MetricGroup": "Machine_Clears;TopdownL3;tma_L3_group;tma_machine_clears_group", + "MetricName": "tma_other_nukes", + "MetricThreshold": "tma_other_nukes > 0.05 & (tma_machine_clears > 0.1 & tma_bad_speculation > 0.15)", + "ScaleUnit": "100%" + }, + { "BriefDescription": "This metric roughly estimates (based on idle latencies) how often the CPU was stalled on accesses to external 3D-Xpoint (Crystal Ridge, a.k.a", - "MetricExpr": "(((1 - ((19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) / (19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + (25 * (MEM_LOAD_RETIRED.LOCAL_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0) + 33 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0))) if #has_pmem > 0 else 0)) * (CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks - tma_l2_bound) if 1e6 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM) > MEM_LOAD_RETIRED.L1_MISS else 0) if #has_pmem > 0 else 0)", + "MetricExpr": "(((1 - (19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) / (19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + (25 * (MEM_LOAD_RETIRED.LOCAL_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 33 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))))) * (CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks - tma_l2_bound) if 1e6 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM) > MEM_LOAD_RETIRED.L1_MISS else 0) if #has_pmem > 0 else 0)", "MetricGroup": "MemoryBound;Server;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_pmm_bound", "MetricThreshold": "tma_pmm_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1604,17 +1842,17 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)", + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU)", "MetricExpr": "UOPS_DISPATCHED.PORT_6 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_6", "MetricThreshold": "tma_port_6 > 0.6", - "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", - "MetricExpr": "((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)", + "MetricExpr": "((tma_ports_utilized_0 * tma_info_thread_clks + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -1623,7 +1861,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / tma_info_thread_clks + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_thread_clks", + "MetricExpr": "(cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_core_bound * RS_EVENTS.EMPTY_CYCLES) / tma_info_thread_clks * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1653,13 +1891,13 @@ "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", - "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", + "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", "PublicDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Sample with: UOPS_EXECUTED.CYCLES_GE_3", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues", - "MetricExpr": "(97 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + 97 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "MetricExpr": "(97 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + 97 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Offcore;Server;Snoop;TopdownL5;tma_L5_group;tma_issueSyncxn;tma_mem_latency_group", "MetricName": "tma_remote_cache", "MetricThreshold": "tma_remote_cache > 0.05 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1668,10 +1906,10 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory", - "MetricExpr": "108 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "MetricExpr": "108 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Server;Snoop;TopdownL5;tma_L5_group;tma_mem_latency_group", - "MetricName": "tma_remote_dram", - "MetricThreshold": "tma_remote_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", + "MetricName": "tma_remote_mem", + "MetricThreshold": "tma_remote_mem > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory. This is caused often due to non-optimal NUMA allocations. #link to NUMA article. Sample with: MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM_PS", "ScaleUnit": "100%" }, @@ -1689,18 +1927,18 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations", "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / tma_info_thread_clks", - "MetricGroup": "PortsUtil;TopdownL5;tma_L5_group;tma_issueSO;tma_ports_utilized_0_group", + "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group;tma_issueSO", "MetricName": "tma_serializing_operation", - "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))", + "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. Sample with: RESOURCE_STALLS.SCOREBOARD. Related metrics: tma_ms_switches", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions", "MetricExpr": "37 * MISC_RETIRED.PAUSE_INST / tma_info_thread_clks", - "MetricGroup": "TopdownL6;tma_L6_group;tma_serializing_operation_group", + "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group", "MetricName": "tma_slow_pause", - "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))))", + "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions. Sample with: MISC_RETIRED.PAUSE_INST", "ScaleUnit": "100%" }, @@ -1729,7 +1967,7 @@ "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%" }, { @@ -1797,10 +2035,10 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", "MetricExpr": "10 * BACLEARS.ANY / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", + "MetricGroup": "BigFootprint;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", "MetricName": "tma_unknown_branches", "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit). Sample with: BACLEARS.ANY", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit) hence called Unknown Branches. Sample with: BACLEARS.ANY", "ScaleUnit": "100%" }, { diff --git a/tools/perf/pmu-events/arch/x86/icelakex/metricgroups.json b/tools/perf/pmu-events/arch/x86/icelakex/metricgroups.json index bc6a9a4d27..904d299c95 100644 --- a/tools/perf/pmu-events/arch/x86/icelakex/metricgroups.json +++ b/tools/perf/pmu-events/arch/x86/icelakex/metricgroups.json @@ -2,10 +2,10 @@ "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "CodeGen": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -26,7 +26,9 @@ "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -64,8 +66,10 @@ "tma_L5_group": "Metrics for top-down breakdown at level 5", "tma_L6_group": "Metrics for top-down breakdown at level 6", "tma_alu_op_utilization_group": "Metrics contributing to tma_alu_op_utilization category", + "tma_assists_group": "Metrics contributing to tma_assists category", "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category", "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category", + "tma_branch_mispredicts_group": "Metrics contributing to tma_branch_mispredicts category", "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category", "tma_core_bound_group": "Metrics contributing to tma_core_bound category", "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category", @@ -78,9 +82,9 @@ "tma_frontend_bound_group": "Metrics contributing to tma_frontend_bound category", "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category", "tma_issue2P": "Metrics related by the issue $issue2P", - "tma_issueBC": "Metrics related by the issue $issueBC", "tma_issueBM": "Metrics related by the issue $issueBM", "tma_issueBW": "Metrics related by the issue $issueBW", + "tma_issueComp": "Metrics related by the issue $issueComp", "tma_issueD0": "Metrics related by the issue $issueD0", "tma_issueFB": "Metrics related by the issue $issueFB", "tma_issueFL": "Metrics related by the issue $issueFL", @@ -100,10 +104,12 @@ "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category", "tma_light_operations_group": "Metrics contributing to tma_light_operations category", "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category", + "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category", "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", "tma_mite_group": "Metrics contributing to tma_mite category", + "tma_other_light_ops_group": "Metrics contributing to tma_other_light_ops category", "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category", "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category", "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category", diff --git a/tools/perf/pmu-events/arch/x86/icelakex/uncore-power.json b/tools/perf/pmu-events/arch/x86/icelakex/uncore-power.json index ee4dac6fc7..920cab6ffe 100644 --- a/tools/perf/pmu-events/arch/x86/icelakex/uncore-power.json +++ b/tools/perf/pmu-events/arch/x86/icelakex/uncore-power.json @@ -151,6 +151,7 @@ "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C0", "PerPkg": "1", "PublicDescription": "Number of cores in C-State : C0 and C1 : This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", + "UMask": "0x40", "Unit": "PCU" }, { @@ -159,6 +160,7 @@ "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C3", "PerPkg": "1", "PublicDescription": "Number of cores in C-State : C3 : This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", + "UMask": "0x80", "Unit": "PCU" }, { @@ -167,6 +169,7 @@ "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C6", "PerPkg": "1", "PublicDescription": "Number of cores in C-State : C6 and C7 : This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", + "UMask": "0xc0", "Unit": "PCU" }, { diff --git a/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json b/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json index 33fe555252..5f3f0b5aeb 100644 --- a/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json +++ b/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json @@ -84,12 +84,12 @@ "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5) / (3 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", - "MetricThreshold": "tma_alu_op_utilization > 0.6", + "MetricThreshold": "tma_alu_op_utilization > 0.4", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", - "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots", + "MetricExpr": "66 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_assists", "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", @@ -202,7 +202,7 @@ "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", + "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", "ScaleUnit": "100%" }, @@ -257,7 +257,7 @@ "MetricExpr": "tma_frontend_bound - tma_fetch_latency", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.2", "MetricgroupNoGroup": "TopdownL2", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" @@ -287,7 +287,7 @@ "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", "MetricName": "tma_fp_scalar", "MetricThreshold": "tma_fp_scalar > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", - "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting. Related metrics: tma_fp_vector, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting. Related metrics: tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", "ScaleUnit": "100%" }, { @@ -296,7 +296,25 @@ "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", "MetricName": "tma_fp_vector", "MetricThreshold": "tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", - "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors", + "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE) / UOPS_EXECUTED.THREAD", + "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", + "MetricName": "tma_fp_vector_128b", + "MetricThreshold": "tma_fp_vector_128b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors", + "MetricExpr": "(SIMD_FP_256.PACKED_DOUBLE + SIMD_FP_256.PACKED_SINGLE) / UOPS_EXECUTED.THREAD", + "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", + "MetricName": "tma_fp_vector_256b", + "MetricThreshold": "tma_fp_vector_256b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", "ScaleUnit": "100%" }, { @@ -316,20 +334,20 @@ "MetricName": "tma_heavy_operations", "MetricThreshold": "tma_heavy_operations > 0.1", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .)", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.", "MetricExpr": "ICACHE.IFETCH_STALL / tma_info_thread_clks - tma_itlb_misses", - "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricGroup": "BigFootprint;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_icache_misses", "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "ScaleUnit": "100%" }, { "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", - "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)", + "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * BR_MISP_EXEC.INDIRECT)", "MetricGroup": "Bad;BrMispredicts", "MetricName": "tma_info_bad_spec_ipmisp_indirect", "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3" @@ -360,8 +378,8 @@ "MetricName": "tma_info_core_flopc" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)", + "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "tma_info_core_ilp" }, @@ -398,7 +416,7 @@ "MetricGroup": "Flops;InsType", "MetricName": "tma_info_inst_mix_iparith", "MetricThreshold": "tma_info_inst_mix_iparith < 10", - "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." + "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW." }, { "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", @@ -438,96 +456,90 @@ }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l1d_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l1d_cache_fill_bw" + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t" }, { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l2_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l2_cache_fill_bw" + "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t" }, { "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l3_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t" + }, + { + "BriefDescription": "", + "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l3_cache_fill_bw" + "MetricName": "tma_info_memory_l1d_cache_fill_bw" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l1mpki" }, { + "BriefDescription": "", + "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l2_cache_fill_bw" + }, + { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "Backend;CacheMisses;Mem", + "MetricGroup": "Backend;CacheHits;Mem", "MetricName": "tma_info_memory_l2mpki" }, { + "BriefDescription": "", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l3_cache_fill_bw" + }, + { "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.LLC_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "Mem", "MetricName": "tma_info_memory_l3mpki" }, { - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)", - "MetricGroup": "Mem;MemoryBound;MemoryLat", - "MetricName": "tma_info_memory_load_miss_real_latency" - }, - { - "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBW;MemoryBound", - "MetricName": "tma_info_memory_mlp", - "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" - }, - { "BriefDescription": "Average Parallel L2 cache miss data reads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_memory_oro_data_l2_mlp" + "MetricName": "tma_info_memory_latency_data_l2_mlp" }, { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_memory_oro_load_l2_miss_latency" + "MetricName": "tma_info_memory_latency_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_memory_oro_load_l2_mlp" - }, - { - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t" - }, - { - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t" + "MetricName": "tma_info_memory_latency_load_l2_mlp" }, { - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "0", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t" + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)", + "MetricGroup": "Mem;MemoryBound;MemoryLat", + "MetricName": "tma_info_memory_load_miss_real_latency" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t" + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Mem;MemoryBW;MemoryBound", + "MetricName": "tma_info_memory_mlp", + "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", @@ -537,8 +549,8 @@ "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", - "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", + "BriefDescription": "", + "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", "MetricName": "tma_info_pipeline_execute" }, @@ -549,21 +561,27 @@ "MetricName": "tma_info_pipeline_retire" }, { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", "MetricGroup": "Power;Summary", - "MetricName": "tma_info_system_average_frequency" + "MetricName": "tma_info_system_core_frequency" }, { - "BriefDescription": "Average CPU Utilization", + "BriefDescription": "Average CPU Utilization (percentage)", "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", "MetricGroup": "HPC;Summary", "MetricName": "tma_info_system_cpu_utilization" }, { + "BriefDescription": "Average number of utilized CPUs", + "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization", + "MetricGroup": "Summary", + "MetricName": "tma_info_system_cpus_utilized" + }, + { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW", "MetricName": "tma_info_system_dram_bw_use", "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_mem_bandwidth, tma_sq_full" }, @@ -572,7 +590,7 @@ "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / 1e9 / duration_time", "MetricGroup": "Cor;Flops;HPC", "MetricName": "tma_info_system_gflops", - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width" }, { "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", @@ -595,19 +613,6 @@ "MetricThreshold": "tma_info_system_kernel_utilization > 0.05" }, { - "BriefDescription": "Average number of parallel requests to external memory", - "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_OCCUPANCY.CYCLES_WITH_ANY_REQUEST", - "MetricGroup": "Mem;SoC", - "MetricName": "tma_info_system_mem_parallel_requests", - "PublicDescription": "Average number of parallel requests to external memory. Accounts for all requests" - }, - { - "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)", - "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_REQUESTS.ALL", - "MetricGroup": "Mem;SoC", - "MetricName": "tma_info_system_mem_request_latency" - }, - { "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0)", "MetricGroup": "SMT", @@ -673,7 +678,7 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", "MetricExpr": "(12 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: ITLB_MISSES.WALK_COMPLETED", @@ -682,7 +687,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", "MetricExpr": "max((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) - CYCLE_ACTIVITY.STALLS_L1D_PENDING) / tma_info_thread_clks, 0)", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_UOPS_RETIRED.L1_HIT_PS;MEM_LOAD_UOPS_RETIRED.HIT_LFB_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1", @@ -691,7 +696,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_PENDING - CYCLE_ACTIVITY.STALLS_L2_PENDING) / tma_info_thread_clks", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L2_HIT_PS", @@ -701,20 +706,20 @@ "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", "MetricConstraint": "NO_GROUP_EVENTS_SMT", "MetricExpr": "MEM_LOAD_UOPS_RETIRED.LLC_HIT / (MEM_LOAD_UOPS_RETIRED.LLC_HIT + 7 * MEM_LOAD_UOPS_RETIRED.LLC_MISS) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_thread_clks", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", + "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "29 * (MEM_LOAD_UOPS_RETIRED.LLC_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.LLC_MISS))) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS. Related metrics: tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS. Related metrics: tma_mem_latency", "ScaleUnit": "100%" }, { @@ -733,7 +738,7 @@ "MetricName": "tma_light_operations", "MetricThreshold": "tma_light_operations > 0.6", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST", "ScaleUnit": "100%" }, { @@ -768,21 +773,21 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_l3_hit_latency", "ScaleUnit": "100%" }, { @@ -810,7 +815,7 @@ "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", + "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.", "ScaleUnit": "100%" }, @@ -829,7 +834,7 @@ "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_0", "MetricThreshold": "tma_port_0 > 0.6", - "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch). Sample with: UOPS_DISPATCHED_PORT.PORT_0. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch). Sample with: UOPS_DISPATCHED_PORT.PORT_0. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", "ScaleUnit": "100%" }, { @@ -838,7 +843,7 @@ "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_1", "MetricThreshold": "tma_port_1 > 0.6", - "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_1. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_0, tma_port_5, tma_port_6, tma_ports_utilized_2", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_1. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_5, tma_port_6, tma_ports_utilized_2", "ScaleUnit": "100%" }, { @@ -874,7 +879,7 @@ "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_5", "MetricThreshold": "tma_port_5 > 0.6", - "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU). Sample with: UOPS_DISPATCHED.PORT_5. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_6, tma_ports_utilized_2", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU). Sample with: UOPS_DISPATCHED.PORT_5. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_6, tma_ports_utilized_2", "ScaleUnit": "100%" }, { @@ -911,7 +916,7 @@ "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_2", "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6", + "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6", "ScaleUnit": "100%" }, { @@ -919,7 +924,7 @@ "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", - "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", + "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", "ScaleUnit": "100%" }, { diff --git a/tools/perf/pmu-events/arch/x86/ivybridge/metricgroups.json b/tools/perf/pmu-events/arch/x86/ivybridge/metricgroups.json index f6a0258e32..8c808347f6 100644 --- a/tools/perf/pmu-events/arch/x86/ivybridge/metricgroups.json +++ b/tools/perf/pmu-events/arch/x86/ivybridge/metricgroups.json @@ -2,10 +2,10 @@ "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -24,7 +24,9 @@ "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -94,6 +96,7 @@ "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category", "tma_light_operations_group": "Metrics contributing to tma_light_operations category", "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category", + "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category", "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", diff --git a/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json b/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json index f5e46a768f..e6f5b05a71 100644 --- a/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json +++ b/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json @@ -84,12 +84,12 @@ "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5) / (3 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", - "MetricThreshold": "tma_alu_op_utilization > 0.6", + "MetricThreshold": "tma_alu_op_utilization > 0.4", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", - "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots", + "MetricExpr": "66 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_assists", "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", @@ -202,7 +202,7 @@ "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", + "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", "ScaleUnit": "100%" }, @@ -257,7 +257,7 @@ "MetricExpr": "tma_frontend_bound - tma_fetch_latency", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.2", "MetricgroupNoGroup": "TopdownL2", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" @@ -287,7 +287,7 @@ "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", "MetricName": "tma_fp_scalar", "MetricThreshold": "tma_fp_scalar > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", - "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting. Related metrics: tma_fp_vector, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting. Related metrics: tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", "ScaleUnit": "100%" }, { @@ -296,7 +296,25 @@ "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", "MetricName": "tma_fp_vector", "MetricThreshold": "tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", - "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors", + "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE) / UOPS_EXECUTED.THREAD", + "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", + "MetricName": "tma_fp_vector_128b", + "MetricThreshold": "tma_fp_vector_128b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors", + "MetricExpr": "(SIMD_FP_256.PACKED_DOUBLE + SIMD_FP_256.PACKED_SINGLE) / UOPS_EXECUTED.THREAD", + "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", + "MetricName": "tma_fp_vector_256b", + "MetricThreshold": "tma_fp_vector_256b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", "ScaleUnit": "100%" }, { @@ -316,20 +334,20 @@ "MetricName": "tma_heavy_operations", "MetricThreshold": "tma_heavy_operations > 0.1", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .)", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.", "MetricExpr": "ICACHE.IFETCH_STALL / tma_info_thread_clks - tma_itlb_misses", - "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricGroup": "BigFootprint;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_icache_misses", "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "ScaleUnit": "100%" }, { "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", - "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)", + "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * BR_MISP_EXEC.INDIRECT)", "MetricGroup": "Bad;BrMispredicts", "MetricName": "tma_info_bad_spec_ipmisp_indirect", "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3" @@ -360,8 +378,8 @@ "MetricName": "tma_info_core_flopc" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)", + "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "tma_info_core_ilp" }, @@ -398,7 +416,7 @@ "MetricGroup": "Flops;InsType", "MetricName": "tma_info_inst_mix_iparith", "MetricThreshold": "tma_info_inst_mix_iparith < 10", - "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." + "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW." }, { "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", @@ -438,96 +456,90 @@ }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l1d_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l1d_cache_fill_bw" + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t" }, { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l2_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l2_cache_fill_bw" + "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t" }, { "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l3_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t" + }, + { + "BriefDescription": "", + "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l3_cache_fill_bw" + "MetricName": "tma_info_memory_l1d_cache_fill_bw" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l1mpki" }, { + "BriefDescription": "", + "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l2_cache_fill_bw" + }, + { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "Backend;CacheMisses;Mem", + "MetricGroup": "Backend;CacheHits;Mem", "MetricName": "tma_info_memory_l2mpki" }, { + "BriefDescription": "", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l3_cache_fill_bw" + }, + { "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.LLC_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "Mem", "MetricName": "tma_info_memory_l3mpki" }, { - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)", - "MetricGroup": "Mem;MemoryBound;MemoryLat", - "MetricName": "tma_info_memory_load_miss_real_latency" - }, - { - "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBW;MemoryBound", - "MetricName": "tma_info_memory_mlp", - "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" - }, - { "BriefDescription": "Average Parallel L2 cache miss data reads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_memory_oro_data_l2_mlp" + "MetricName": "tma_info_memory_latency_data_l2_mlp" }, { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_memory_oro_load_l2_miss_latency" + "MetricName": "tma_info_memory_latency_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_memory_oro_load_l2_mlp" - }, - { - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t" + "MetricName": "tma_info_memory_latency_load_l2_mlp" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t" - }, - { - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "0", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t" + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)", + "MetricGroup": "Mem;MemoryBound;MemoryLat", + "MetricName": "tma_info_memory_load_miss_real_latency" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t" + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Mem;MemoryBW;MemoryBound", + "MetricName": "tma_info_memory_mlp", + "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", @@ -537,8 +549,8 @@ "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", - "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", + "BriefDescription": "", + "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", "MetricName": "tma_info_pipeline_execute" }, @@ -549,21 +561,27 @@ "MetricName": "tma_info_pipeline_retire" }, { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", "MetricGroup": "Power;Summary", - "MetricName": "tma_info_system_average_frequency" + "MetricName": "tma_info_system_core_frequency" }, { - "BriefDescription": "Average CPU Utilization", + "BriefDescription": "Average CPU Utilization (percentage)", "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", "MetricGroup": "HPC;Summary", "MetricName": "tma_info_system_cpu_utilization" }, { + "BriefDescription": "Average number of utilized CPUs", + "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization", + "MetricGroup": "Summary", + "MetricName": "tma_info_system_cpus_utilized" + }, + { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW", "MetricName": "tma_info_system_dram_bw_use", "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_mem_bandwidth, tma_sq_full" }, @@ -572,7 +590,7 @@ "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / 1e9 / duration_time", "MetricGroup": "Cor;Flops;HPC", "MetricName": "tma_info_system_gflops", - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width" }, { "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", @@ -627,6 +645,12 @@ "MetricName": "tma_info_system_turbo_utilization" }, { + "BriefDescription": "Measured Average Uncore Frequency for the SoC [GHz]", + "MetricExpr": "tma_info_system_socket_clks / 1e9 / duration_time", + "MetricGroup": "SoC", + "MetricName": "tma_info_system_uncore_frequency" + }, + { "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", "MetricExpr": "CPU_CLK_UNHALTED.THREAD", "MetricGroup": "Pipeline", @@ -674,7 +698,7 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", "MetricExpr": "(12 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: ITLB_MISSES.WALK_COMPLETED", @@ -683,7 +707,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", "MetricExpr": "max((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) - CYCLE_ACTIVITY.STALLS_L1D_PENDING) / tma_info_thread_clks, 0)", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_UOPS_RETIRED.L1_HIT_PS;MEM_LOAD_UOPS_RETIRED.HIT_LFB_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1", @@ -692,7 +716,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_PENDING - CYCLE_ACTIVITY.STALLS_L2_PENDING) / tma_info_thread_clks", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L2_HIT_PS", @@ -702,20 +726,20 @@ "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", "MetricConstraint": "NO_GROUP_EVENTS_SMT", "MetricExpr": "MEM_LOAD_UOPS_RETIRED.LLC_HIT / (MEM_LOAD_UOPS_RETIRED.LLC_HIT + 7 * MEM_LOAD_UOPS_RETIRED.LLC_MISS) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_thread_clks", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", + "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "41 * (MEM_LOAD_UOPS_RETIRED.LLC_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS. Related metrics: tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS. Related metrics: tma_mem_latency", "ScaleUnit": "100%" }, { @@ -734,7 +758,7 @@ "MetricName": "tma_light_operations", "MetricThreshold": "tma_light_operations > 0.6", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST", "ScaleUnit": "100%" }, { @@ -749,11 +773,10 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "200 * (MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks", "MetricGroup": "Server;TopdownL5;tma_L5_group;tma_mem_latency_group", - "MetricName": "tma_local_dram", - "MetricThreshold": "tma_local_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", + "MetricName": "tma_local_mem", + "MetricThreshold": "tma_local_mem > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory. Caching will improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM_PS", "ScaleUnit": "100%" }, @@ -779,21 +802,21 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_l3_hit_latency", "ScaleUnit": "100%" }, { @@ -821,7 +844,7 @@ "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", + "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.", "ScaleUnit": "100%" }, @@ -840,7 +863,7 @@ "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_0", "MetricThreshold": "tma_port_0 > 0.6", - "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch). Sample with: UOPS_DISPATCHED_PORT.PORT_0. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch). Sample with: UOPS_DISPATCHED_PORT.PORT_0. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", "ScaleUnit": "100%" }, { @@ -849,7 +872,7 @@ "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_1", "MetricThreshold": "tma_port_1 > 0.6", - "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_1. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_0, tma_port_5, tma_port_6, tma_ports_utilized_2", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_1. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_5, tma_port_6, tma_ports_utilized_2", "ScaleUnit": "100%" }, { @@ -885,7 +908,7 @@ "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_5", "MetricThreshold": "tma_port_5 > 0.6", - "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU). Sample with: UOPS_DISPATCHED.PORT_5. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_6, tma_ports_utilized_2", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU). Sample with: UOPS_DISPATCHED.PORT_5. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_6, tma_ports_utilized_2", "ScaleUnit": "100%" }, { @@ -922,7 +945,7 @@ "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_2", "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6", + "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6", "ScaleUnit": "100%" }, { @@ -930,7 +953,7 @@ "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", - "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", + "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", "ScaleUnit": "100%" }, { @@ -945,11 +968,10 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "310 * (MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks", "MetricGroup": "Server;Snoop;TopdownL5;tma_L5_group;tma_mem_latency_group", - "MetricName": "tma_remote_dram", - "MetricThreshold": "tma_remote_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", + "MetricName": "tma_remote_mem", + "MetricThreshold": "tma_remote_mem > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory. This is caused often due to non-optimal NUMA allocations. #link to NUMA article. Sample with: MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM_PS", "ScaleUnit": "100%" }, diff --git a/tools/perf/pmu-events/arch/x86/ivytown/metricgroups.json b/tools/perf/pmu-events/arch/x86/ivytown/metricgroups.json index f6a0258e32..8c808347f6 100644 --- a/tools/perf/pmu-events/arch/x86/ivytown/metricgroups.json +++ b/tools/perf/pmu-events/arch/x86/ivytown/metricgroups.json @@ -2,10 +2,10 @@ "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -24,7 +24,9 @@ "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -94,6 +96,7 @@ "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category", "tma_light_operations_group": "Metrics contributing to tma_light_operations category", "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category", + "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category", "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", diff --git a/tools/perf/pmu-events/arch/x86/ivytown/uncore-power.json b/tools/perf/pmu-events/arch/x86/ivytown/uncore-power.json index 5df1ebfb89..ad6c531a9e 100644 --- a/tools/perf/pmu-events/arch/x86/ivytown/uncore-power.json +++ b/tools/perf/pmu-events/arch/x86/ivytown/uncore-power.json @@ -514,6 +514,7 @@ "BriefDescription": "Number of cores in C-State; C0 and C1", "EventCode": "0x80", "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C0", + "Filter": "occ_sel=1", "PerPkg": "1", "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", "Unit": "PCU" @@ -522,6 +523,7 @@ "BriefDescription": "Number of cores in C-State; C3", "EventCode": "0x80", "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C3", + "Filter": "occ_sel=2", "PerPkg": "1", "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", "Unit": "PCU" @@ -530,6 +532,7 @@ "BriefDescription": "Number of cores in C-State; C6 and C7", "EventCode": "0x80", "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C6", + "Filter": "occ_sel=3", "PerPkg": "1", "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", "Unit": "PCU" diff --git a/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json b/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json index 35b1a3aa72..fc8c3f785b 100644 --- a/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json +++ b/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json @@ -163,7 +163,7 @@ "MetricExpr": "tma_frontend_bound - tma_fetch_latency", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.2", "MetricgroupNoGroup": "TopdownL2", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_frontend_dsb_coverage, tma_lcp", "ScaleUnit": "100%" @@ -193,7 +193,7 @@ "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", "MetricName": "tma_fp_scalar", "MetricThreshold": "tma_fp_scalar > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", - "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting. Related metrics: tma_fp_vector, tma_fp_vector_512b, tma_port_6, tma_ports_utilized_2", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting. Related metrics: tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_6, tma_ports_utilized_2", "ScaleUnit": "100%" }, { @@ -202,7 +202,25 @@ "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", "MetricName": "tma_fp_vector", "MetricThreshold": "tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", - "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector_512b, tma_port_6, tma_ports_utilized_2", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_6, tma_ports_utilized_2", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors", + "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE) / UOPS_DISPATCHED.THREAD", + "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", + "MetricName": "tma_fp_vector_128b", + "MetricThreshold": "tma_fp_vector_128b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_6, tma_ports_utilized_2", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors", + "MetricExpr": "(SIMD_FP_256.PACKED_DOUBLE + SIMD_FP_256.PACKED_SINGLE) / UOPS_DISPATCHED.THREAD", + "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", + "MetricName": "tma_fp_vector_256b", + "MetricThreshold": "tma_fp_vector_256b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_512b, tma_port_6, tma_ports_utilized_2", "ScaleUnit": "100%" }, { @@ -222,7 +240,7 @@ "MetricName": "tma_heavy_operations", "MetricThreshold": "tma_heavy_operations > 0.1", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .)", "ScaleUnit": "100%" }, { @@ -244,7 +262,7 @@ "MetricName": "tma_info_core_flopc" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)", "MetricExpr": "UOPS_DISPATCHED.THREAD / (cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@ / 2 if #SMT_on else cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@)", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "tma_info_core_ilp" @@ -271,21 +289,27 @@ "MetricName": "tma_info_pipeline_retire" }, { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", "MetricGroup": "Power;Summary", - "MetricName": "tma_info_system_average_frequency" + "MetricName": "tma_info_system_core_frequency" }, { - "BriefDescription": "Average CPU Utilization", + "BriefDescription": "Average CPU Utilization (percentage)", "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", "MetricGroup": "HPC;Summary", "MetricName": "tma_info_system_cpu_utilization" }, { + "BriefDescription": "Average number of utilized CPUs", + "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization", + "MetricGroup": "Summary", + "MetricName": "tma_info_system_cpus_utilized" + }, + { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW", "MetricName": "tma_info_system_dram_bw_use", "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_mem_bandwidth" }, @@ -294,7 +318,7 @@ "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / 1e9 / duration_time", "MetricGroup": "Cor;Flops;HPC", "MetricName": "tma_info_system_gflops", - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width" }, { "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", @@ -349,6 +373,12 @@ "MetricName": "tma_info_system_turbo_utilization" }, { + "BriefDescription": "Measured Average Uncore Frequency for the SoC [GHz]", + "MetricExpr": "tma_info_system_socket_clks / 1e9 / duration_time", + "MetricGroup": "SoC", + "MetricName": "tma_info_system_uncore_frequency" + }, + { "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", "MetricExpr": "CPU_CLK_UNHALTED.THREAD", "MetricGroup": "Pipeline", @@ -389,7 +419,7 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", "MetricExpr": "(12 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: ITLB_MISSES.WALK_COMPLETED", @@ -399,7 +429,7 @@ "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", "MetricConstraint": "NO_GROUP_EVENTS_SMT", "MetricExpr": "MEM_LOAD_UOPS_RETIRED.LLC_HIT / (MEM_LOAD_UOPS_RETIRED.LLC_HIT + 7 * MEM_LOAD_UOPS_RETIRED.LLC_MISS) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_thread_clks", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS", @@ -421,7 +451,7 @@ "MetricName": "tma_light_operations", "MetricThreshold": "tma_light_operations > 0.6", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST", "ScaleUnit": "100%" }, { @@ -436,21 +466,21 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_info_system_dram_bw_use", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_info_system_dram_bw_use", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: ", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: ", "ScaleUnit": "100%" }, { diff --git a/tools/perf/pmu-events/arch/x86/jaketown/metricgroups.json b/tools/perf/pmu-events/arch/x86/jaketown/metricgroups.json index bebb85945d..a2c27794c0 100644 --- a/tools/perf/pmu-events/arch/x86/jaketown/metricgroups.json +++ b/tools/perf/pmu-events/arch/x86/jaketown/metricgroups.json @@ -2,10 +2,10 @@ "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -23,7 +23,9 @@ "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -88,6 +90,7 @@ "tma_issueTLB": "Metrics related by the issue $issueTLB", "tma_l1_bound_group": "Metrics contributing to tma_l1_bound category", "tma_light_operations_group": "Metrics contributing to tma_light_operations category", + "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category", "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", diff --git a/tools/perf/pmu-events/arch/x86/jaketown/uncore-power.json b/tools/perf/pmu-events/arch/x86/jaketown/uncore-power.json index b3ee5d7410..6f98fc1728 100644 --- a/tools/perf/pmu-events/arch/x86/jaketown/uncore-power.json +++ b/tools/perf/pmu-events/arch/x86/jaketown/uncore-power.json @@ -233,6 +233,7 @@ "BriefDescription": "Number of cores in C0", "EventCode": "0x80", "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C0", + "Filter": "occ_sel=1", "PerPkg": "1", "PublicDescription": "This is an occupancy event that tracks the number of cores that are in C0. It can be used by itself to get the average number of cores in C0, with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", "Unit": "PCU" @@ -241,6 +242,7 @@ "BriefDescription": "Number of cores in C0", "EventCode": "0x80", "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C3", + "Filter": "occ_sel=2", "PerPkg": "1", "PublicDescription": "This is an occupancy event that tracks the number of cores that are in C0. It can be used by itself to get the average number of cores in C0, with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", "Unit": "PCU" @@ -249,6 +251,7 @@ "BriefDescription": "Number of cores in C0", "EventCode": "0x80", "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C6", + "Filter": "occ_sel=3", "PerPkg": "1", "PublicDescription": "This is an occupancy event that tracks the number of cores that are in C0. It can be used by itself to get the average number of cores in C0, with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", "Unit": "PCU" diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index 4d1deed443..5297d25f4e 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -1,38 +1,38 @@ Family-model,Version,Filename,EventType -GenuineIntel-6-(97|9A|B7|BA|BF),v1.23,alderlake,core -GenuineIntel-6-BE,v1.23,alderlaken,core +GenuineIntel-6-(97|9A|B7|BA|BF),v1.24,alderlake,core +GenuineIntel-6-BE,v1.24,alderlaken,core GenuineIntel-6-(1C|26|27|35|36),v5,bonnell,core -GenuineIntel-6-(3D|47),v28,broadwell,core +GenuineIntel-6-(3D|47),v29,broadwell,core GenuineIntel-6-56,v11,broadwellde,core GenuineIntel-6-4F,v22,broadwellx,core GenuineIntel-6-55-[56789ABCDEF],v1.20,cascadelakex,core GenuineIntel-6-9[6C],v1.04,elkhartlake,core -GenuineIntel-6-CF,v1.02,emeraldrapids,core +GenuineIntel-6-CF,v1.03,emeraldrapids,core GenuineIntel-6-5[CF],v13,goldmont,core GenuineIntel-6-7A,v1.01,goldmontplus,core -GenuineIntel-6-B6,v1.00,grandridge,core +GenuineIntel-6-B6,v1.01,grandridge,core GenuineIntel-6-A[DE],v1.01,graniterapids,core -GenuineIntel-6-(3C|45|46),v33,haswell,core +GenuineIntel-6-(3C|45|46),v35,haswell,core GenuineIntel-6-3F,v28,haswellx,core -GenuineIntel-6-7[DE],v1.19,icelake,core +GenuineIntel-6-7[DE],v1.21,icelake,core GenuineIntel-6-6[AC],v1.23,icelakex,core GenuineIntel-6-3A,v24,ivybridge,core GenuineIntel-6-3E,v24,ivytown,core GenuineIntel-6-2D,v24,jaketown,core GenuineIntel-6-(57|85),v16,knightslanding,core GenuineIntel-6-BD,v1.00,lunarlake,core -GenuineIntel-6-A[AC],v1.06,meteorlake,core +GenuineIntel-6-A[AC],v1.07,meteorlake,core GenuineIntel-6-1[AEF],v4,nehalemep,core GenuineIntel-6-2E,v4,nehalemex,core -GenuineIntel-6-A7,v1.01,rocketlake,core +GenuineIntel-6-A7,v1.02,rocketlake,core GenuineIntel-6-2A,v19,sandybridge,core GenuineIntel-6-8F,v1.17,sapphirerapids,core -GenuineIntel-6-AF,v1.00,sierraforest,core +GenuineIntel-6-AF,v1.01,sierraforest,core GenuineIntel-6-(37|4A|4C|4D|5A),v15,silvermont,core -GenuineIntel-6-(4E|5E|8E|9E|A5|A6),v57,skylake,core +GenuineIntel-6-(4E|5E|8E|9E|A5|A6),v58,skylake,core GenuineIntel-6-55-[01234],v1.32,skylakex,core GenuineIntel-6-86,v1.21,snowridgex,core -GenuineIntel-6-8[CD],v1.13,tigerlake,core +GenuineIntel-6-8[CD],v1.15,tigerlake,core GenuineIntel-6-2C,v5,westmereep-dp,core GenuineIntel-6-25,v4,westmereep-sp,core GenuineIntel-6-2F,v4,westmereex,core diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/cache.json b/tools/perf/pmu-events/arch/x86/meteorlake/cache.json index 5fef87502d..47861a6dd8 100644 --- a/tools/perf/pmu-events/arch/x86/meteorlake/cache.json +++ b/tools/perf/pmu-events/arch/x86/meteorlake/cache.json @@ -319,7 +319,7 @@ "EventCode": "0x35", "EventName": "MEM_BOUND_STALLS_IFETCH.ALL", "SampleAfterValue": "1000003", - "UMask": "0x6f", + "UMask": "0x7f", "Unit": "cpu_atom" }, { @@ -344,7 +344,7 @@ "EventCode": "0x35", "EventName": "MEM_BOUND_STALLS_IFETCH.LLC_MISS", "SampleAfterValue": "1000003", - "UMask": "0x68", + "UMask": "0x78", "Unit": "cpu_atom" }, { @@ -352,7 +352,7 @@ "EventCode": "0x34", "EventName": "MEM_BOUND_STALLS_LOAD.ALL", "SampleAfterValue": "1000003", - "UMask": "0x6f", + "UMask": "0x7f", "Unit": "cpu_atom" }, { @@ -377,7 +377,7 @@ "EventCode": "0x34", "EventName": "MEM_BOUND_STALLS_LOAD.LLC_MISS", "SampleAfterValue": "1000003", - "UMask": "0x68", + "UMask": "0x78", "Unit": "cpu_atom" }, { diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/floating-point.json b/tools/perf/pmu-events/arch/x86/meteorlake/floating-point.json index f66506ee37..30e604d212 100644 --- a/tools/perf/pmu-events/arch/x86/meteorlake/floating-point.json +++ b/tools/perf/pmu-events/arch/x86/meteorlake/floating-point.json @@ -1,5 +1,14 @@ [ { + "BriefDescription": "Counts the number of cycles when any of the floating point dividers are active.", + "CounterMask": "1", + "EventCode": "0xcd", + "EventName": "ARITH.FPDIV_ACTIVE", + "SampleAfterValue": "1000003", + "UMask": "0x2", + "Unit": "cpu_atom" + }, + { "BriefDescription": "This event counts the cycles the floating point divider is busy.", "CounterMask": "1", "EventCode": "0xb0", @@ -26,7 +35,7 @@ "Unit": "cpu_core" }, { - "BriefDescription": "FP_ARITH_DISPATCHED.PORT_0", + "BriefDescription": "FP_ARITH_DISPATCHED.PORT_0 [This event is alias to FP_ARITH_DISPATCHED.V0]", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.PORT_0", "SampleAfterValue": "2000003", @@ -34,7 +43,7 @@ "Unit": "cpu_core" }, { - "BriefDescription": "FP_ARITH_DISPATCHED.PORT_1", + "BriefDescription": "FP_ARITH_DISPATCHED.PORT_1 [This event is alias to FP_ARITH_DISPATCHED.V1]", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.PORT_1", "SampleAfterValue": "2000003", @@ -42,7 +51,7 @@ "Unit": "cpu_core" }, { - "BriefDescription": "FP_ARITH_DISPATCHED.PORT_5", + "BriefDescription": "FP_ARITH_DISPATCHED.PORT_5 [This event is alias to FP_ARITH_DISPATCHED.V2]", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.PORT_5", "SampleAfterValue": "2000003", @@ -50,6 +59,30 @@ "Unit": "cpu_core" }, { + "BriefDescription": "FP_ARITH_DISPATCHED.V0 [This event is alias to FP_ARITH_DISPATCHED.PORT_0]", + "EventCode": "0xb3", + "EventName": "FP_ARITH_DISPATCHED.V0", + "SampleAfterValue": "2000003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "FP_ARITH_DISPATCHED.V1 [This event is alias to FP_ARITH_DISPATCHED.PORT_1]", + "EventCode": "0xb3", + "EventName": "FP_ARITH_DISPATCHED.V1", + "SampleAfterValue": "2000003", + "UMask": "0x2", + "Unit": "cpu_core" + }, + { + "BriefDescription": "FP_ARITH_DISPATCHED.V2 [This event is alias to FP_ARITH_DISPATCHED.PORT_5]", + "EventCode": "0xb3", + "EventName": "FP_ARITH_DISPATCHED.V2", + "SampleAfterValue": "2000003", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { "BriefDescription": "Counts number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", @@ -131,6 +164,53 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Counts the number of all types of floating point operations per uop with all default weighting", + "EventCode": "0xc8", + "EventName": "FP_FLOPS_RETIRED.ALL", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x3", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "This event is deprecated. [This event is alias to FP_FLOPS_RETIRED.FP64]", + "Deprecated": "1", + "EventCode": "0xc8", + "EventName": "FP_FLOPS_RETIRED.DP", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of floating point operations that produce 32 bit single precision results [This event is alias to FP_FLOPS_RETIRED.SP]", + "EventCode": "0xc8", + "EventName": "FP_FLOPS_RETIRED.FP32", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x2", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of floating point operations that produce 64 bit double precision results [This event is alias to FP_FLOPS_RETIRED.DP]", + "EventCode": "0xc8", + "EventName": "FP_FLOPS_RETIRED.FP64", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "This event is deprecated. [This event is alias to FP_FLOPS_RETIRED.FP32]", + "Deprecated": "1", + "EventCode": "0xc8", + "EventName": "FP_FLOPS_RETIRED.SP", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x2", + "Unit": "cpu_atom" + }, + { "BriefDescription": "Counts the number of floating point operations retired that required microcode assist.", "EventCode": "0xc3", "EventName": "MACHINE_CLEARS.FP_ASSIST", diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/other.json b/tools/perf/pmu-events/arch/x86/meteorlake/other.json index d55e792c0c..7effc1f271 100644 --- a/tools/perf/pmu-events/arch/x86/meteorlake/other.json +++ b/tools/perf/pmu-events/arch/x86/meteorlake/other.json @@ -8,6 +8,16 @@ "Unit": "cpu_core" }, { + "BriefDescription": "This event is deprecated. [This event is alias to MISC_RETIRED.LBR_INSERTS]", + "Deprecated": "1", + "EventCode": "0xe4", + "EventName": "LBR_INSERTS.ANY", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, + { "BriefDescription": "Counts demand data reads that have any type of response.", "EventCode": "0x2A,0x2B", "EventName": "OCR.DEMAND_DATA_RD.ANY_RESPONSE", diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/pipeline.json b/tools/perf/pmu-events/arch/x86/meteorlake/pipeline.json index deaa7aba93..24bbfcebd2 100644 --- a/tools/perf/pmu-events/arch/x86/meteorlake/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/meteorlake/pipeline.json @@ -1,5 +1,14 @@ [ { + "BriefDescription": "Counts the number of cycles when any of the dividers are active.", + "CounterMask": "1", + "EventCode": "0xcd", + "EventName": "ARITH.DIV_ACTIVE", + "SampleAfterValue": "1000003", + "UMask": "0x3", + "Unit": "cpu_atom" + }, + { "BriefDescription": "Cycles when divide unit is busy executing divide or square root operations.", "CounterMask": "1", "EventCode": "0xb0", @@ -46,6 +55,15 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Counts the number of retired JCC (Jump on Conditional Code) branch instructions retired, includes both taken and not taken branches.", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.COND", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0x7e", + "Unit": "cpu_atom" + }, + { "BriefDescription": "Conditional branch instructions retired.", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.COND", @@ -66,6 +84,15 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Counts the number of taken JCC (Jump on Conditional Code) branch instructions retired.", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.COND_TAKEN", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0xfe", + "Unit": "cpu_atom" + }, + { "BriefDescription": "Taken conditional branch instructions retired.", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.COND_TAKEN", @@ -95,6 +122,15 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Counts the number of near indirect JMP and near indirect CALL branch instructions retired.", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.INDIRECT", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0xeb", + "Unit": "cpu_atom" + }, + { "BriefDescription": "Indirect near branch instructions retired (excluding returns)", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.INDIRECT", @@ -105,6 +141,25 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Counts the number of near indirect CALL branch instructions retired.", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.INDIRECT_CALL", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0xfb", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "This event is deprecated. Refer to new event BR_INST_RETIRED.INDIRECT_CALL", + "Deprecated": "1", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.IND_CALL", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0xfb", + "Unit": "cpu_atom" + }, + { "BriefDescription": "Counts the number of near CALL branch instructions retired.", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.NEAR_CALL", @@ -124,6 +179,15 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Counts the number of near RET branch instructions retired.", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.NEAR_RETURN", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0xf7", + "Unit": "cpu_atom" + }, + { "BriefDescription": "Return instructions retired.", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.NEAR_RETURN", @@ -671,6 +735,7 @@ "BriefDescription": "INST_RETIRED.MACRO_FUSED", "EventCode": "0xc0", "EventName": "INST_RETIRED.MACRO_FUSED", + "PEBS": "1", "SampleAfterValue": "2000003", "UMask": "0x10", "Unit": "cpu_core" @@ -679,6 +744,7 @@ "BriefDescription": "Retired NOP instructions.", "EventCode": "0xc0", "EventName": "INST_RETIRED.NOP", + "PEBS": "1", "PublicDescription": "Counts all retired NOP or ENDBR32/64 or PREFETCHIT0/1 instructions", "SampleAfterValue": "2000003", "UMask": "0x2", @@ -697,6 +763,7 @@ "BriefDescription": "Iterations of Repeat string retired instructions.", "EventCode": "0xc0", "EventName": "INST_RETIRED.REP_ITERATION", + "PEBS": "1", "PublicDescription": "Number of iterations of Repeat (REP) string retired instructions such as MOVS, CMPS, and SCAS. Each has a byte, word, and doubleword version and string instructions can be repeated using a repetition prefix, REP, that allows their architectural execution to be repeated a number of times as specified by the RCX register. Note the number of iterations is implementation-dependent.", "SampleAfterValue": "2000003", "UMask": "0x8", @@ -980,6 +1047,15 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Counts the number of Last Branch Record (LBR) entries. Requires LBRs to be enabled and configured in IA32_LBR_CTL. [This event is alias to LBR_INSERTS.ANY]", + "EventCode": "0xe4", + "EventName": "MISC_RETIRED.LBR_INSERTS", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, + { "BriefDescription": "Counts cycles where the pipeline is stalled due to serializing operations.", "EventCode": "0xa2", "EventName": "RESOURCE_STALLS.SCOREBOARD", diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/virtual-memory.json b/tools/perf/pmu-events/arch/x86/meteorlake/virtual-memory.json index 056c2a885a..55798e64c5 100644 --- a/tools/perf/pmu-events/arch/x86/meteorlake/virtual-memory.json +++ b/tools/perf/pmu-events/arch/x86/meteorlake/virtual-memory.json @@ -71,6 +71,15 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Counts the number of page walks completed due to load DTLB misses to a 4K page.", + "EventCode": "0x08", + "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_4K", + "PublicDescription": "Counts the number of page walks completed due to loads (including SW prefetches) whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 4K pages. Includes page walks that page fault.", + "SampleAfterValue": "200003", + "UMask": "0x2", + "Unit": "cpu_atom" + }, + { "BriefDescription": "Page walks completed due to a demand data load to a 4K page.", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_4K", @@ -151,6 +160,15 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Counts the number of page walks completed due to store DTLB misses to a 2M or 4M page.", + "EventCode": "0x49", + "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M", + "PublicDescription": "Counts the number of page walks completed due to stores whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 2M or 4M pages. Includes page walks that page fault.", + "SampleAfterValue": "2000003", + "UMask": "0x4", + "Unit": "cpu_atom" + }, + { "BriefDescription": "Page walks completed due to a demand data store to a 2M/4M page.", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M", @@ -160,6 +178,15 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Counts the number of page walks completed due to store DTLB misses to a 4K page.", + "EventCode": "0x49", + "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_4K", + "PublicDescription": "Counts the number of page walks completed due to stores whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 4K pages. Includes page walks that page fault.", + "SampleAfterValue": "2000003", + "UMask": "0x2", + "Unit": "cpu_atom" + }, + { "BriefDescription": "Page walks completed due to a demand data store to a 4K page.", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_4K", @@ -258,6 +285,15 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Counts the number of page walks completed due to instruction fetch misses to a 4K page.", + "EventCode": "0x85", + "EventName": "ITLB_MISSES.WALK_COMPLETED_4K", + "PublicDescription": "Counts the number of page walks completed due to instruction fetches whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 4K pages. Includes page walks that page fault.", + "SampleAfterValue": "2000003", + "UMask": "0x2", + "Unit": "cpu_atom" + }, + { "BriefDescription": "Code miss in all TLB levels causes a page walk that completes. (4K)", "EventCode": "0x11", "EventName": "ITLB_MISSES.WALK_COMPLETED_4K", diff --git a/tools/perf/pmu-events/arch/x86/rocketlake/memory.json b/tools/perf/pmu-events/arch/x86/rocketlake/memory.json index e8d2ec1c02..f847632205 100644 --- a/tools/perf/pmu-events/arch/x86/rocketlake/memory.json +++ b/tools/perf/pmu-events/arch/x86/rocketlake/memory.json @@ -259,6 +259,7 @@ "BriefDescription": "Number of times an RTM execution aborted.", "EventCode": "0xc9", "EventName": "RTM_RETIRED.ABORTED", + "PEBS": "1", "PublicDescription": "Counts the number of times RTM abort was triggered.", "SampleAfterValue": "100003", "UMask": "0x4" diff --git a/tools/perf/pmu-events/arch/x86/rocketlake/metricgroups.json b/tools/perf/pmu-events/arch/x86/rocketlake/metricgroups.json index a151ba9ccc..5452a1448d 100644 --- a/tools/perf/pmu-events/arch/x86/rocketlake/metricgroups.json +++ b/tools/perf/pmu-events/arch/x86/rocketlake/metricgroups.json @@ -2,10 +2,10 @@ "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "CodeGen": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -25,7 +25,9 @@ "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -63,8 +65,10 @@ "tma_L5_group": "Metrics for top-down breakdown at level 5", "tma_L6_group": "Metrics for top-down breakdown at level 6", "tma_alu_op_utilization_group": "Metrics contributing to tma_alu_op_utilization category", + "tma_assists_group": "Metrics contributing to tma_assists category", "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category", "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category", + "tma_branch_mispredicts_group": "Metrics contributing to tma_branch_mispredicts category", "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category", "tma_core_bound_group": "Metrics contributing to tma_core_bound category", "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category", @@ -77,9 +81,9 @@ "tma_frontend_bound_group": "Metrics contributing to tma_frontend_bound category", "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category", "tma_issue2P": "Metrics related by the issue $issue2P", - "tma_issueBC": "Metrics related by the issue $issueBC", "tma_issueBM": "Metrics related by the issue $issueBM", "tma_issueBW": "Metrics related by the issue $issueBW", + "tma_issueComp": "Metrics related by the issue $issueComp", "tma_issueD0": "Metrics related by the issue $issueD0", "tma_issueFB": "Metrics related by the issue $issueFB", "tma_issueFL": "Metrics related by the issue $issueFL", @@ -99,10 +103,12 @@ "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category", "tma_light_operations_group": "Metrics contributing to tma_light_operations category", "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category", + "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category", "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", "tma_mite_group": "Metrics contributing to tma_mite category", + "tma_other_light_ops_group": "Metrics contributing to tma_other_light_ops category", "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category", "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category", "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category", diff --git a/tools/perf/pmu-events/arch/x86/rocketlake/other.json b/tools/perf/pmu-events/arch/x86/rocketlake/other.json index cfb5906329..4fdc873395 100644 --- a/tools/perf/pmu-events/arch/x86/rocketlake/other.json +++ b/tools/perf/pmu-events/arch/x86/rocketlake/other.json @@ -19,7 +19,7 @@ "BriefDescription": "Core cycles where the core was running in a manner where Turbo may be clipped to the AVX512 turbo schedule.", "EventCode": "0x28", "EventName": "CORE_POWER.LVL2_TURBO_LICENSE", - "PublicDescription": "Core cycles where the core was running with power-delivery for license level 2 (introduced in Skylake Server microarchtecture). This includes high current AVX 512-bit instructions.", + "PublicDescription": "Core cycles where the core was running with power-delivery for license level 2 (introduced in Skylake Server microarchitecture). This includes high current AVX 512-bit instructions.", "SampleAfterValue": "200003", "UMask": "0x20" }, diff --git a/tools/perf/pmu-events/arch/x86/rocketlake/pipeline.json b/tools/perf/pmu-events/arch/x86/rocketlake/pipeline.json index 375b78044f..c7313fd4fd 100644 --- a/tools/perf/pmu-events/arch/x86/rocketlake/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/rocketlake/pipeline.json @@ -529,7 +529,7 @@ "BriefDescription": "Cycles when Reservation Station (RS) is empty for the thread", "EventCode": "0x5e", "EventName": "RS_EVENTS.EMPTY_CYCLES", - "PublicDescription": "Counts cycles during which the reservation station (RS) is empty for this logical processor. This is usually caused when the front-end pipeline runs into stravation periods (e.g. branch mispredictions or i-cache misses)", + "PublicDescription": "Counts cycles during which the reservation station (RS) is empty for this logical processor. This is usually caused when the front-end pipeline runs into starvation periods (e.g. branch mispredictions or i-cache misses)", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -553,14 +553,6 @@ "UMask": "0x2" }, { - "BriefDescription": "TMA slots wasted due to incorrect speculation by branch mispredictions", - "EventCode": "0xa4", - "EventName": "TOPDOWN.BR_MISPREDICT_SLOTS", - "PublicDescription": "Number of TMA slots that were wasted due to incorrect speculation by branch mispredictions. This event estimates number of operations that were issued but not retired from the speculative path as well as the out-of-order engine recovery past a branch misprediction.", - "SampleAfterValue": "10000003", - "UMask": "0x8" - }, - { "BriefDescription": "TMA slots available for an unhalted logical processor. Fixed counter - architectural event", "EventName": "TOPDOWN.SLOTS", "PublicDescription": "Number of available slots for an unhalted logical processor. The event increments by machine-width of the narrowest pipeline as employed by the Top-down Microarchitecture Analysis method (TMA). The count is distributed among unhalted logical processors (hyper-threads) who share the same physical core. Software can use this event as the denominator for the top-level metrics of the TMA method. This architectural event is counted on a designated fixed counter (Fixed Counter 3).", diff --git a/tools/perf/pmu-events/arch/x86/rocketlake/rkl-metrics.json b/tools/perf/pmu-events/arch/x86/rocketlake/rkl-metrics.json index 27433fc15e..1dad462e58 100644 --- a/tools/perf/pmu-events/arch/x86/rocketlake/rkl-metrics.json +++ b/tools/perf/pmu-events/arch/x86/rocketlake/rkl-metrics.json @@ -98,12 +98,12 @@ "MetricExpr": "(UOPS_DISPATCHED.PORT_0 + UOPS_DISPATCHED.PORT_1 + UOPS_DISPATCHED.PORT_5 + UOPS_DISPATCHED.PORT_6) / (4 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", - "MetricThreshold": "tma_alu_op_utilization > 0.6", + "MetricThreshold": "tma_alu_op_utilization > 0.4", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", - "MetricExpr": "100 * ASSISTS.ANY / tma_info_thread_slots", + "MetricExpr": "34 * ASSISTS.ANY / tma_info_thread_slots", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_assists", "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", @@ -113,7 +113,7 @@ { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", "DefaultMetricgroupName": "TopdownL1", - "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * cpu@INT_MISC.RECOVERY_CYCLES\\,cmask\\=1\\,edge@ / tma_info_thread_slots", + "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / tma_info_thread_slots", "MetricGroup": "Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", "MetricThreshold": "tma_backend_bound > 0.2", @@ -135,7 +135,7 @@ { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions.", "MetricExpr": "tma_light_operations * BR_INST_RETIRED.ALL_BRANCHES / (tma_retiring * tma_info_thread_slots)", - "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", + "MetricGroup": "Branches;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_branch_instructions", "MetricThreshold": "tma_branch_instructions > 0.1 & tma_light_operations > 0.6", "ScaleUnit": "100%" @@ -180,7 +180,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(29 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM + 23.5 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "MetricExpr": "(29 * tma_info_system_core_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM + 23.5 * tma_info_system_core_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_contested_accesses", "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -200,7 +200,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "23.5 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "MetricExpr": "23.5 * tma_info_system_core_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_data_sharing", "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -212,7 +212,7 @@ "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group", "MetricName": "tma_decoder0_alone", - "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35))", + "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & tma_fetch_bandwidth > 0.2)", "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. Related metrics: tma_few_uops_instructions", "ScaleUnit": "100%" }, @@ -240,7 +240,7 @@ "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / tma_info_core_core_clks / 2", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)", + "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", "ScaleUnit": "100%" }, @@ -259,7 +259,7 @@ "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group", "MetricName": "tma_dtlb_load", "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization", "ScaleUnit": "100%" }, { @@ -268,12 +268,12 @@ "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group", "MetricName": "tma_dtlb_store", "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", - "MetricExpr": "32.5 * tma_info_system_average_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks", + "MetricExpr": "32.5 * tma_info_system_core_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group", "MetricName": "tma_false_sharing", "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -286,7 +286,7 @@ "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%" }, { @@ -294,7 +294,7 @@ "MetricExpr": "max(0, tma_frontend_bound - tma_fetch_latency)", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.2", "MetricgroupNoGroup": "TopdownL2", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" @@ -328,6 +328,15 @@ "ScaleUnit": "100%" }, { + "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists", + "MetricExpr": "34 * ASSISTS.FP / tma_info_thread_slots", + "MetricGroup": "HPC;TopdownL5;tma_L5_group;tma_assists_group", + "MetricName": "tma_fp_assists", + "MetricThreshold": "tma_fp_assists > 0.1", + "PublicDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists. FP Assist may apply when working with very small floating point values (so-called Denormals).", + "ScaleUnit": "100%" + }, + { "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired", "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", @@ -390,13 +399,13 @@ "MetricName": "tma_heavy_operations", "MetricThreshold": "tma_heavy_operations > 0.1", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .)", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", - "MetricExpr": "ICACHE_16B.IFDATA_STALL / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricExpr": "ICACHE_DATA.STALLS / tma_info_thread_clks", + "MetricGroup": "BigFootprint;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_icache_misses", "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS", @@ -405,7 +414,7 @@ { "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricExpr": "tma_info_bottleneck_mispredictions * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES / 100", "MetricGroup": "Bad;BrMispredicts;tma_issueBM", "MetricName": "tma_info_bad_spec_branch_misprediction_cost", "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers" @@ -446,6 +455,12 @@ "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200" }, { + "BriefDescription": "Speculative to Retired ratio of all clears (covering mispredicts and nukes)", + "MetricExpr": "INT_MISC.CLEARS_COUNT / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)", + "MetricGroup": "BrMispredicts", + "MetricName": "tma_info_bad_spec_spec_clears_ratio" + }, + { "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)", @@ -472,67 +487,102 @@ "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: " }, { + "BriefDescription": "Total pipeline cost of \"useful operations\" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead.", + "MetricExpr": "100 * (tma_retiring - (BR_INST_RETIRED.ALL_BRANCHES + BR_INST_RETIRED.NEAR_CALL) / tma_info_thread_slots - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", + "MetricGroup": "Ret", + "MetricName": "tma_info_bottleneck_base_non_br", + "MetricThreshold": "tma_info_bottleneck_base_non_br > 20" + }, + { "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", - "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC", + "MetricGroup": "BigFootprint;Fed;Frontend;IcMiss;MemoryTLB", "MetricName": "tma_info_bottleneck_big_code", - "MetricThreshold": "tma_info_bottleneck_big_code > 20", - "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_bottleneck_branching_overhead" + "MetricThreshold": "tma_info_bottleneck_big_code > 20" }, { "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", - "MetricExpr": "100 * ((BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_thread_slots)", - "MetricGroup": "Ret;tma_issueBC", + "MetricExpr": "100 * ((BR_INST_RETIRED.ALL_BRANCHES + BR_INST_RETIRED.NEAR_CALL) / tma_info_thread_slots)", + "MetricGroup": "Ret", "MetricName": "tma_info_bottleneck_branching_overhead", - "MetricThreshold": "tma_info_bottleneck_branching_overhead > 10", - "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_bottleneck_big_code" + "MetricThreshold": "tma_info_bottleneck_branching_overhead > 5" + }, + { + "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))", + "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", + "MetricName": "tma_info_bottleneck_cache_memory_bandwidth", + "MetricThreshold": "tma_info_bottleneck_cache_memory_bandwidth > 20", + "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" + }, + { + "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", + "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", + "MetricName": "tma_info_bottleneck_cache_memory_latency", + "MetricThreshold": "tma_info_bottleneck_cache_memory_latency > 20", + "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency" + }, + { + "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", + "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", + "MetricGroup": "Cor;tma_issueComp", + "MetricName": "tma_info_bottleneck_compute_bound_est", + "MetricThreshold": "tma_info_bottleneck_compute_bound_est > 20", + "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: " }, { "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code", + "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code", "MetricGroup": "Fed;FetchBW;Frontend", "MetricName": "tma_info_bottleneck_instruction_fetch_bw", "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20" }, { - "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))", - "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", - "MetricName": "tma_info_bottleneck_memory_bandwidth", - "MetricThreshold": "tma_info_bottleneck_memory_bandwidth > 20", - "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" + "BriefDescription": "Total pipeline cost of irregular execution (e.g", + "MetricExpr": "100 * (tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + tma_core_bound * RS_EVENTS.EMPTY_CYCLES / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", + "MetricGroup": "Bad;Cor;Ret;tma_issueMS", + "MetricName": "tma_info_bottleneck_irregular_overhead", + "MetricThreshold": "tma_info_bottleneck_irregular_overhead > 10", + "PublicDescription": "Total pipeline cost of irregular execution (e.g. FP-assists in HPC, Wait time with work imbalance multithreaded workloads, overhead in system services or virtualized environments). Related metrics: tma_microcode_sequencer, tma_ms_switches" }, { "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", + "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", "MetricName": "tma_info_bottleneck_memory_data_tlbs", "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20", - "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store" + "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_synchronization" }, { - "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound))", - "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", - "MetricName": "tma_info_bottleneck_memory_latency", - "MetricThreshold": "tma_info_bottleneck_memory_latency > 20", - "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency" + "BriefDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)", + "MetricExpr": "100 * (tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))", + "MetricGroup": "Mem;Offcore;tma_issueTLB", + "MetricName": "tma_info_bottleneck_memory_synchronization", + "MetricThreshold": "tma_info_bottleneck_memory_synchronization > 10", + "PublicDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs" }, { "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", + "MetricExpr": "100 * (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", "MetricName": "tma_info_bottleneck_mispredictions", "MetricThreshold": "tma_info_bottleneck_mispredictions > 20", "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers" }, { + "BriefDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class)", + "MetricExpr": "100 - (tma_info_bottleneck_big_code + tma_info_bottleneck_instruction_fetch_bw + tma_info_bottleneck_mispredictions + tma_info_bottleneck_cache_memory_bandwidth + tma_info_bottleneck_cache_memory_latency + tma_info_bottleneck_memory_data_tlbs + tma_info_bottleneck_memory_synchronization + tma_info_bottleneck_compute_bound_est + tma_info_bottleneck_irregular_overhead + tma_info_bottleneck_branching_overhead + tma_info_bottleneck_base_non_br)", + "MetricGroup": "Cor;Offcore", + "MetricName": "tma_info_bottleneck_other_bottlenecks", + "MetricThreshold": "tma_info_bottleneck_other_bottlenecks > 20", + "PublicDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls." + }, + { "BriefDescription": "Fraction of branches that are CALL or RET", "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches", @@ -564,7 +614,7 @@ }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", - "MetricExpr": "CPU_CLK_UNHALTED.DISTRIBUTED", + "MetricExpr": "(CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else tma_info_thread_clks)", "MetricGroup": "SMT", "MetricName": "tma_info_core_core_clks" }, @@ -575,8 +625,14 @@ "MetricName": "tma_info_core_coreipc" }, { + "BriefDescription": "uops Executed per Cycle", + "MetricExpr": "UOPS_EXECUTED.THREAD / tma_info_thread_clks", + "MetricGroup": "Power", + "MetricName": "tma_info_core_epc" + }, + { "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks", "MetricGroup": "Flops;Ret", "MetricName": "tma_info_core_flopc" }, @@ -588,8 +644,8 @@ "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)", + "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "tma_info_core_ilp" }, @@ -669,7 +725,7 @@ "MetricGroup": "Flops;InsType", "MetricName": "tma_info_inst_mix_iparith", "MetricThreshold": "tma_info_inst_mix_iparith < 10", - "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." + "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW." }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", @@ -677,7 +733,7 @@ "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx128", "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10", - "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", @@ -685,7 +741,7 @@ "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx256", "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10", - "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)", @@ -693,7 +749,7 @@ "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx512", "MetricThreshold": "tma_info_inst_mix_iparith_avx512 < 10", - "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", @@ -701,7 +757,7 @@ "MetricGroup": "Flops;FpScalar;InsType", "MetricName": "tma_info_inst_mix_iparith_scalar_dp", "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10", - "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", @@ -709,7 +765,7 @@ "MetricGroup": "Flops;FpScalar;InsType", "MetricName": "tma_info_inst_mix_iparith_scalar_sp", "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10", - "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", @@ -727,7 +783,7 @@ }, { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", "MetricGroup": "Flops;InsType", "MetricName": "tma_info_inst_mix_ipflop", "MetricThreshold": "tma_info_inst_mix_ipflop < 10" @@ -740,6 +796,12 @@ "MetricThreshold": "tma_info_inst_mix_ipload < 3" }, { + "BriefDescription": "Instructions per PAUSE (lower number means higher occurrence rate)", + "MetricExpr": "tma_info_inst_mix_instructions / MISC_RETIRED.PAUSE_INST", + "MetricGroup": "Flops;FpVector;InsType", + "MetricName": "tma_info_inst_mix_ippause" + }, + { "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES", "MetricGroup": "InsType", @@ -763,142 +825,154 @@ }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l1d_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l1d_cache_fill_bw" + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t" }, { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l2_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l2_cache_fill_bw" + "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t" }, { "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l3_cache_access_bw", "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_memory_core_l3_cache_access_bw" + "MetricName": "tma_info_memory_core_l3_cache_access_bw_2t" }, { "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l3_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l3_cache_fill_bw" + "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t" }, { "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_fb_hpki" }, { + "BriefDescription": "", + "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l1d_cache_fill_bw" + }, + { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l1mpki" }, { "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l1mpki_load" }, { + "BriefDescription": "", + "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l2_cache_fill_bw" + }, + { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l2hpki_all" }, { "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l2hpki_load" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "Backend;CacheMisses;Mem", + "MetricGroup": "Backend;CacheHits;Mem", "MetricName": "tma_info_memory_l2mpki" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem;Offcore", + "MetricGroup": "CacheHits;Mem;Offcore", "MetricName": "tma_info_memory_l2mpki_all" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l2mpki_load" }, { - "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_memory_l3mpki" + "BriefDescription": "", + "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_l3_cache_access_bw" }, { - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", - "MetricGroup": "Mem;MemoryBound;MemoryLat", - "MetricName": "tma_info_memory_load_miss_real_latency" + "BriefDescription": "", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l3_cache_fill_bw" }, { - "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", - "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBW;MemoryBound", - "MetricName": "tma_info_memory_mlp", - "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" + "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_l3mpki" }, { "BriefDescription": "Average Parallel L2 cache miss data reads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_memory_oro_data_l2_mlp" + "MetricName": "tma_info_memory_latency_data_l2_mlp" }, { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_memory_oro_load_l2_miss_latency" + "MetricName": "tma_info_memory_latency_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=1@", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_memory_oro_load_l2_mlp" + "MetricName": "tma_info_memory_latency_load_l2_mlp" }, { "BriefDescription": "Average Latency for L3 cache miss demand Loads", "MetricExpr": "cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,umask\\=0x10@ / OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_memory_oro_load_l3_miss_latency" + "MetricName": "tma_info_memory_latency_load_l3_miss_latency" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t" + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", + "MetricGroup": "Mem;MemoryBound;MemoryLat", + "MetricName": "tma_info_memory_load_miss_real_latency" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t" + "BriefDescription": "\"Bus lock\" per kilo instruction", + "MetricExpr": "1e3 * SQ_MISC.BUS_LOCK / INST_RETIRED.ANY", + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_mix_bus_lock_pki" }, { - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l3_cache_access_bw", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t" + "BriefDescription": "Un-cacheable retired load per kilo instruction", + "MetricExpr": "1e3 * MEM_LOAD_MISC_RETIRED.UC / INST_RETIRED.ANY", + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_mix_uc_load_pki" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t" + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", + "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Mem;MemoryBW;MemoryBound", + "MetricName": "tma_info_memory_mlp", + "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" }, { "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", @@ -926,42 +1000,56 @@ "MetricName": "tma_info_memory_tlb_store_stlb_mpki" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", - "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", + "BriefDescription": "", + "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@)", "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", "MetricName": "tma_info_pipeline_execute" }, { + "BriefDescription": "Instructions per a microcode Assist invocation", + "MetricExpr": "INST_RETIRED.ANY / ASSISTS.ANY", + "MetricGroup": "MicroSeq;Pipeline;Ret;Retire", + "MetricName": "tma_info_pipeline_ipassist", + "MetricThreshold": "tma_info_pipeline_ipassist < 100e3", + "PublicDescription": "Instructions per a microcode Assist invocation. See Assists tree node for details (lower number means higher occurrence rate)" + }, + { "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", "MetricExpr": "tma_retiring * tma_info_thread_slots / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@", "MetricGroup": "Pipeline;Ret", "MetricName": "tma_info_pipeline_retire" }, { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", "MetricGroup": "Power;Summary", - "MetricName": "tma_info_system_average_frequency" + "MetricName": "tma_info_system_core_frequency" }, { - "BriefDescription": "Average CPU Utilization", + "BriefDescription": "Average CPU Utilization (percentage)", "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", "MetricGroup": "HPC;Summary", "MetricName": "tma_info_system_cpu_utilization" }, { + "BriefDescription": "Average number of utilized CPUs", + "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization", + "MetricGroup": "Summary", + "MetricName": "tma_info_system_cpus_utilized" + }, + { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW", "MetricName": "tma_info_system_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" }, { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time", "MetricGroup": "Cor;Flops;HPC", "MetricName": "tma_info_system_gflops", - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width" }, { "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", @@ -998,12 +1086,6 @@ "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)" }, { - "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)", - "MetricExpr": "(UNC_ARB_TRK_OCCUPANCY.ALL + UNC_ARB_DAT_OCCUPANCY.RD) / UNC_ARB_TRK_REQUESTS.ALL", - "MetricGroup": "Mem;SoC", - "MetricName": "tma_info_system_mem_request_latency" - }, - { "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0", "MetricExpr": "CORE_POWER.LVL0_TURBO_LICENSE / tma_info_core_core_clks", "MetricGroup": "Power", @@ -1097,8 +1179,8 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", - "MetricExpr": "ICACHE_64B.IFTAG_STALL / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricExpr": "ICACHE_TAG.STALLS / tma_info_thread_clks", + "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS", @@ -1107,7 +1189,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1", @@ -1117,7 +1199,7 @@ "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + L1D_PEND_MISS.FB_FULL_PERIODS) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks)", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L2_HIT_PS", @@ -1127,24 +1209,24 @@ "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", - "MetricExpr": "9 * tma_info_system_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", + "MetricExpr": "9 * tma_info_system_core_frequency * (MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2)) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_memory_latency, tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_cache_memory_latency, tma_mem_latency", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", - "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks", + "MetricExpr": "DECODE.LCP / tma_info_thread_clks", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_lcp", "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -1158,7 +1240,7 @@ "MetricName": "tma_light_operations", "MetricThreshold": "tma_light_operations > 0.6", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST", "ScaleUnit": "100%" }, { @@ -1201,7 +1283,7 @@ "MetricExpr": "(LSD.CYCLES_ACTIVE - LSD.CYCLES_OK) / tma_info_core_core_clks / 2", "MetricGroup": "FetchBW;LSD;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_lsd", - "MetricThreshold": "tma_lsd > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)", + "MetricThreshold": "tma_lsd > 0.15 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit. LSD typically does well sustaining Uop supply. However; in some rare cases; optimal uop-delivery could not be reached for small loops whose size (in terms of number of uops) does not suit well the LSD structure.", "ScaleUnit": "100%" }, @@ -1216,21 +1298,21 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_memory_latency, tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_cache_memory_latency, tma_l3_hit_latency", "ScaleUnit": "100%" }, { @@ -1254,11 +1336,11 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", - "MetricExpr": "tma_retiring * tma_info_thread_slots / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots", + "MetricExpr": "UOPS_RETIRED.SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", "MetricName": "tma_microcode_sequencer", "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1", - "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_ms_switches", + "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_ms_switches", "ScaleUnit": "100%" }, { @@ -1275,7 +1357,7 @@ "MetricExpr": "(IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)", + "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", "ScaleUnit": "100%" }, @@ -1284,16 +1366,16 @@ "MetricExpr": "(cpu@IDQ.MITE_UOPS\\,cmask\\=4@ - cpu@IDQ.MITE_UOPS\\,cmask\\=5@) / tma_info_thread_clks", "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_mite_group", "MetricName": "tma_mite_4wide", - "MetricThreshold": "tma_mite_4wide > 0.05 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35))", + "MetricThreshold": "tma_mite_4wide > 0.05 & (tma_mite > 0.1 & tma_fetch_bandwidth > 0.2)", "ScaleUnit": "100%" }, { - "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued", + "BriefDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles)", "MetricExpr": "UOPS_ISSUED.VECTOR_WIDTH_MISMATCH / UOPS_ISSUED.ANY", "MetricGroup": "TopdownL5;tma_L5_group;tma_issueMV;tma_ports_utilized_0_group", "MetricName": "tma_mixing_vectors", "MetricThreshold": "tma_mixing_vectors > 0.05", - "PublicDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued. Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches", + "PublicDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles). Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches", "ScaleUnit": "100%" }, { @@ -1302,22 +1384,22 @@ "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO", "MetricName": "tma_ms_switches", "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation", + "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions", "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / (tma_retiring * tma_info_thread_slots)", - "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", + "MetricGroup": "Pipeline;TopdownL4;tma_L4_group;tma_other_light_ops_group", "MetricName": "tma_nop_instructions", - "MetricThreshold": "tma_nop_instructions > 0.1 & tma_light_operations > 0.6", + "MetricThreshold": "tma_nop_instructions > 0.1 & (tma_other_light_ops > 0.3 & tma_light_operations > 0.6)", "PublicDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. start address of a function or loop body. Sample with: INST_RETIRED.NOP", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_branch_instructions + tma_nop_instructions))", + "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_branch_instructions))", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_other_light_ops", "MetricThreshold": "tma_other_light_ops > 0.3 & tma_light_operations > 0.6", @@ -1325,6 +1407,22 @@ "ScaleUnit": "100%" }, { + "BriefDescription": "This metric estimates fraction of slots the CPU was stalled due to other cases of misprediction (non-retired x86 branches or other types).", + "MetricExpr": "max(tma_branch_mispredicts * (1 - BR_MISP_RETIRED.ALL_BRANCHES / (INT_MISC.CLEARS_COUNT - MACHINE_CLEARS.COUNT)), 0.0001)", + "MetricGroup": "BrMispredicts;TopdownL3;tma_L3_group;tma_branch_mispredicts_group", + "MetricName": "tma_other_mispredicts", + "MetricThreshold": "tma_other_mispredicts > 0.05 & (tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15)", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Nukes (Machine Clears) not related to memory ordering.", + "MetricExpr": "max(tma_machine_clears * (1 - MACHINE_CLEARS.MEMORY_ORDERING / MACHINE_CLEARS.COUNT), 0.0001)", + "MetricGroup": "Machine_Clears;TopdownL3;tma_L3_group;tma_machine_clears_group", + "MetricName": "tma_other_nukes", + "MetricThreshold": "tma_other_nukes > 0.05 & (tma_machine_clears > 0.1 & tma_bad_speculation > 0.15)", + "ScaleUnit": "100%" + }, + { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)", "MetricExpr": "UOPS_DISPATCHED.PORT_0 / tma_info_core_core_clks", "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", @@ -1352,17 +1450,17 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)", + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU)", "MetricExpr": "UOPS_DISPATCHED.PORT_6 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_6", "MetricThreshold": "tma_port_6 > 0.6", - "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", - "MetricExpr": "((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)", + "MetricExpr": "((tma_ports_utilized_0 * tma_info_thread_clks + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -1371,7 +1469,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / tma_info_thread_clks + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_thread_clks", + "MetricExpr": "(cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_core_bound * RS_EVENTS.EMPTY_CYCLES) / tma_info_thread_clks * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1401,7 +1499,7 @@ "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", - "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", + "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", "PublicDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Sample with: UOPS_EXECUTED.CYCLES_GE_3", "ScaleUnit": "100%" }, @@ -1419,18 +1517,18 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations", "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / tma_info_thread_clks", - "MetricGroup": "PortsUtil;TopdownL5;tma_L5_group;tma_issueSO;tma_ports_utilized_0_group", + "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group;tma_issueSO", "MetricName": "tma_serializing_operation", - "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))", + "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. Sample with: RESOURCE_STALLS.SCOREBOARD. Related metrics: tma_ms_switches", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions", "MetricExpr": "140 * MISC_RETIRED.PAUSE_INST / tma_info_thread_clks", - "MetricGroup": "TopdownL6;tma_L6_group;tma_serializing_operation_group", + "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group", "MetricName": "tma_slow_pause", - "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))))", + "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions. Sample with: MISC_RETIRED.PAUSE_INST", "ScaleUnit": "100%" }, @@ -1459,7 +1557,7 @@ "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%" }, { @@ -1527,10 +1625,10 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", "MetricExpr": "10 * BACLEARS.ANY / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", + "MetricGroup": "BigFootprint;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", "MetricName": "tma_unknown_branches", "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit). Sample with: BACLEARS.ANY", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit) hence called Unknown Branches. Sample with: BACLEARS.ANY", "ScaleUnit": "100%" }, { diff --git a/tools/perf/pmu-events/arch/x86/sandybridge/metricgroups.json b/tools/perf/pmu-events/arch/x86/sandybridge/metricgroups.json index bebb85945d..a2c27794c0 100644 --- a/tools/perf/pmu-events/arch/x86/sandybridge/metricgroups.json +++ b/tools/perf/pmu-events/arch/x86/sandybridge/metricgroups.json @@ -2,10 +2,10 @@ "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -23,7 +23,9 @@ "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -88,6 +90,7 @@ "tma_issueTLB": "Metrics related by the issue $issueTLB", "tma_l1_bound_group": "Metrics contributing to tma_l1_bound category", "tma_light_operations_group": "Metrics contributing to tma_light_operations category", + "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category", "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", diff --git a/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json b/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json index 8898b6fd0d..ce836ebda5 100644 --- a/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json +++ b/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json @@ -163,7 +163,7 @@ "MetricExpr": "tma_frontend_bound - tma_fetch_latency", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.2", "MetricgroupNoGroup": "TopdownL2", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_frontend_dsb_coverage, tma_lcp", "ScaleUnit": "100%" @@ -193,7 +193,7 @@ "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", "MetricName": "tma_fp_scalar", "MetricThreshold": "tma_fp_scalar > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", - "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting. Related metrics: tma_fp_vector, tma_fp_vector_512b, tma_port_6, tma_ports_utilized_2", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting. Related metrics: tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_6, tma_ports_utilized_2", "ScaleUnit": "100%" }, { @@ -202,7 +202,25 @@ "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", "MetricName": "tma_fp_vector", "MetricThreshold": "tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", - "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector_512b, tma_port_6, tma_ports_utilized_2", + "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_6, tma_ports_utilized_2", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors", + "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE) / UOPS_DISPATCHED.THREAD", + "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", + "MetricName": "tma_fp_vector_128b", + "MetricThreshold": "tma_fp_vector_128b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_6, tma_ports_utilized_2", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors", + "MetricExpr": "(SIMD_FP_256.PACKED_DOUBLE + SIMD_FP_256.PACKED_SINGLE) / UOPS_DISPATCHED.THREAD", + "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", + "MetricName": "tma_fp_vector_256b", + "MetricThreshold": "tma_fp_vector_256b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_512b, tma_port_6, tma_ports_utilized_2", "ScaleUnit": "100%" }, { @@ -222,7 +240,7 @@ "MetricName": "tma_heavy_operations", "MetricThreshold": "tma_heavy_operations > 0.1", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .)", "ScaleUnit": "100%" }, { @@ -244,7 +262,7 @@ "MetricName": "tma_info_core_flopc" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)", "MetricExpr": "UOPS_DISPATCHED.THREAD / (cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@ / 2 if #SMT_on else cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@)", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "tma_info_core_ilp" @@ -271,21 +289,27 @@ "MetricName": "tma_info_pipeline_retire" }, { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", "MetricGroup": "Power;Summary", - "MetricName": "tma_info_system_average_frequency" + "MetricName": "tma_info_system_core_frequency" }, { - "BriefDescription": "Average CPU Utilization", + "BriefDescription": "Average CPU Utilization (percentage)", "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", "MetricGroup": "HPC;Summary", "MetricName": "tma_info_system_cpu_utilization" }, { + "BriefDescription": "Average number of utilized CPUs", + "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization", + "MetricGroup": "Summary", + "MetricName": "tma_info_system_cpus_utilized" + }, + { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW", "MetricName": "tma_info_system_dram_bw_use", "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_mem_bandwidth" }, @@ -294,7 +318,7 @@ "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / 1e9 / duration_time", "MetricGroup": "Cor;Flops;HPC", "MetricName": "tma_info_system_gflops", - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width" }, { "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", @@ -317,19 +341,6 @@ "MetricThreshold": "tma_info_system_kernel_utilization > 0.05" }, { - "BriefDescription": "Average number of parallel requests to external memory", - "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_OCCUPANCY.CYCLES_WITH_ANY_REQUEST", - "MetricGroup": "Mem;SoC", - "MetricName": "tma_info_system_mem_parallel_requests", - "PublicDescription": "Average number of parallel requests to external memory. Accounts for all requests" - }, - { - "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)", - "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_REQUESTS.ALL", - "MetricGroup": "Mem;SoC", - "MetricName": "tma_info_system_mem_request_latency" - }, - { "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0)", "MetricGroup": "SMT", @@ -388,7 +399,7 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", "MetricExpr": "(12 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: ITLB_MISSES.WALK_COMPLETED", @@ -398,7 +409,7 @@ "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", "MetricConstraint": "NO_GROUP_EVENTS_SMT", "MetricExpr": "MEM_LOAD_UOPS_RETIRED.LLC_HIT / (MEM_LOAD_UOPS_RETIRED.LLC_HIT + 7 * MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_thread_clks", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS", @@ -420,7 +431,7 @@ "MetricName": "tma_light_operations", "MetricThreshold": "tma_light_operations > 0.6", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST", "ScaleUnit": "100%" }, { @@ -435,21 +446,21 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_info_system_dram_bw_use", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_info_system_dram_bw_use", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: ", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: ", "ScaleUnit": "100%" }, { diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/metricgroups.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/metricgroups.json index e6f7934320..81e5ca1c30 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/metricgroups.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/metricgroups.json @@ -2,10 +2,11 @@ "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "C0Wait": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "CodeGen": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -27,7 +28,9 @@ "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -68,6 +71,7 @@ "tma_assists_group": "Metrics contributing to tma_assists category", "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category", "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category", + "tma_branch_mispredicts_group": "Metrics contributing to tma_branch_mispredicts category", "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category", "tma_core_bound_group": "Metrics contributing to tma_core_bound category", "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category", @@ -81,9 +85,9 @@ "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category", "tma_int_operations_group": "Metrics contributing to tma_int_operations category", "tma_issue2P": "Metrics related by the issue $issue2P", - "tma_issueBC": "Metrics related by the issue $issueBC", "tma_issueBM": "Metrics related by the issue $issueBM", "tma_issueBW": "Metrics related by the issue $issueBW", + "tma_issueComp": "Metrics related by the issue $issueComp", "tma_issueD0": "Metrics related by the issue $issueD0", "tma_issueFB": "Metrics related by the issue $issueFB", "tma_issueFL": "Metrics related by the issue $issueFL", @@ -103,11 +107,13 @@ "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category", "tma_light_operations_group": "Metrics contributing to tma_light_operations category", "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category", + "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category", "tma_mem_bandwidth_group": "Metrics contributing to tma_mem_bandwidth category", "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", "tma_mite_group": "Metrics contributing to tma_mite category", + "tma_other_light_ops_group": "Metrics contributing to tma_other_light_ops category", "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category", "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category", "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category", diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json index 56e54babcc..6f0e6360e9 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json @@ -85,6 +85,24 @@ "ScaleUnit": "1MB/s" }, { + "BriefDescription": "Percentage of inbound full cacheline writes initiated by end device controllers that miss the L3 cache.", + "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_MISS_ITOM / UNC_CHA_TOR_INSERTS.IO_ITOM", + "MetricName": "io_percent_of_inbound_full_writes_that_miss_l3", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Percentage of inbound partial cacheline writes initiated by end device controllers that miss the L3 cache.", + "MetricExpr": "(UNC_CHA_TOR_INSERTS.IO_MISS_ITOMCACHENEAR + UNC_CHA_TOR_INSERTS.IO_MISS_RFO) / (UNC_CHA_TOR_INSERTS.IO_ITOMCACHENEAR + UNC_CHA_TOR_INSERTS.IO_RFO)", + "MetricName": "io_percent_of_inbound_partial_writes_that_miss_l3", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Percentage of inbound reads initiated by end device controllers that miss the L3 cache.", + "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_MISS_PCIRDCUR / UNC_CHA_TOR_INSERTS.IO_PCIRDCUR", + "MetricName": "io_percent_of_inbound_reads_that_miss_l3", + "ScaleUnit": "100%" + }, + { "BriefDescription": "Ratio of number of completed page walks (for 2 megabyte and 4 megabyte page sizes) caused by a code fetch to the total number of completed instructions", "MetricExpr": "ITLB_MISSES.WALK_COMPLETED_2M_4M / INST_RETIRED.ANY", "MetricName": "itlb_2nd_level_large_page_mpi", @@ -310,20 +328,20 @@ "MetricExpr": "(UOPS_DISPATCHED.PORT_0 + UOPS_DISPATCHED.PORT_1 + UOPS_DISPATCHED.PORT_5_11 + UOPS_DISPATCHED.PORT_6) / (5 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", - "MetricThreshold": "tma_alu_op_utilization > 0.6", + "MetricThreshold": "tma_alu_op_utilization > 0.4", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the Advanced Matrix Extensions (AMX) execution engine was busy with tile (arithmetic) operations", + "BriefDescription": "This metric estimates fraction of cycles where the Advanced Matrix eXtensions (AMX) execution engine was busy with tile (arithmetic) operations", "MetricExpr": "EXE.AMX_BUSY / tma_info_core_core_clks", - "MetricGroup": "Compute;HPC;Server;TopdownL5;tma_L5_group;tma_ports_utilized_0_group", + "MetricGroup": "Compute;HPC;Server;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_amx_busy", - "MetricThreshold": "tma_amx_busy > 0.5 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))", + "MetricThreshold": "tma_amx_busy > 0.5 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", - "MetricExpr": "100 * cpu@ASSISTS.ANY\\,umask\\=0x1B@ / tma_info_thread_slots", + "MetricExpr": "78 * ASSISTS.ANY / tma_info_thread_slots", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_assists", "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", @@ -381,6 +399,22 @@ "ScaleUnit": "100%" }, { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due staying in C0.1 power-performance optimized state (Faster wakeup time; Smaller power savings).", + "MetricExpr": "CPU_CLK_UNHALTED.C01 / tma_info_thread_clks", + "MetricGroup": "C0Wait;TopdownL4;tma_L4_group;tma_serializing_operation_group", + "MetricName": "tma_c01_wait", + "MetricThreshold": "tma_c01_wait > 0.05 & (tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due staying in C0.2 power-performance optimized state (Slower wakeup time; Larger power savings).", + "MetricExpr": "CPU_CLK_UNHALTED.C02 / tma_info_thread_clks", + "MetricGroup": "C0Wait;TopdownL4;tma_L4_group;tma_serializing_operation_group", + "MetricName": "tma_c02_wait", + "MetricThreshold": "tma_c02_wait > 0.05 & (tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", + "ScaleUnit": "100%" + }, + { "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction", "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", @@ -400,7 +434,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", - "MetricExpr": "(76 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 75.5 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "MetricExpr": "(76 * tma_info_system_core_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 75.5 * tma_info_system_core_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_contested_accesses", "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -420,7 +454,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", - "MetricExpr": "75.5 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD + MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (1 - OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "MetricExpr": "75.5 * tma_info_system_core_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD + MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (1 - OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_data_sharing", "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -432,7 +466,7 @@ "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group", "MetricName": "tma_decoder0_alone", - "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35))", + "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & tma_fetch_bandwidth > 0.2)", "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. Related metrics: tma_few_uops_instructions", "ScaleUnit": "100%" }, @@ -459,7 +493,7 @@ "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / tma_info_core_core_clks / 2", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35)", + "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", "ScaleUnit": "100%" }, @@ -478,7 +512,7 @@ "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group", "MetricName": "tma_dtlb_load", "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization", "ScaleUnit": "100%" }, { @@ -487,12 +521,12 @@ "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group", "MetricName": "tma_dtlb_store", "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", - "MetricExpr": "80 * tma_info_system_average_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks", + "MetricExpr": "80 * tma_info_system_core_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group", "MetricName": "tma_false_sharing", "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -505,7 +539,7 @@ "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%" }, { @@ -514,7 +548,7 @@ "MetricExpr": "max(0, tma_frontend_bound - tma_fetch_latency)", "MetricGroup": "Default;FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.2", "MetricgroupNoGroup": "TopdownL2;Default", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" @@ -540,17 +574,8 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric approximates arithmetic floating-point (FP) matrix uops fraction the CPU has retired (aggregated across all supported FP datatypes in AMX engine)", - "MetricExpr": "cpu@AMX_OPS_RETIRED.BF16\\,cmask\\=1@ / (tma_retiring * tma_info_thread_slots)", - "MetricGroup": "Compute;Flops;HPC;Pipeline;Server;TopdownL4;tma_L4_group;tma_fp_arith_group", - "MetricName": "tma_fp_amx", - "MetricThreshold": "tma_fp_amx > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", - "PublicDescription": "This metric approximates arithmetic floating-point (FP) matrix uops fraction the CPU has retired (aggregated across all supported FP datatypes in AMX engine). Refer to AMX_Busy and GFLOPs metrics for actual AMX utilization and FP performance, resp.", - "ScaleUnit": "100%" - }, - { "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)", - "MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector + tma_fp_amx", + "MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector", "MetricGroup": "HPC;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_fp_arith", "MetricThreshold": "tma_fp_arith > 0.2 & tma_light_operations > 0.6", @@ -568,7 +593,7 @@ }, { "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired", - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + FP_ARITH_INST_RETIRED2.SCALAR) / (tma_retiring * tma_info_thread_slots)", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + FP_ARITH_INST_RETIRED2.SCALAR) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", "MetricName": "tma_fp_scalar", "MetricThreshold": "tma_fp_scalar > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", @@ -577,7 +602,7 @@ }, { "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths", - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@ + FP_ARITH_INST_RETIRED2.VECTOR) / (tma_retiring * tma_info_thread_slots)", + "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@ + FP_ARITH_INST_RETIRED2.VECTOR) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", "MetricName": "tma_fp_vector", "MetricThreshold": "tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", @@ -625,10 +650,10 @@ { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions", "MetricExpr": "tma_light_operations * INST_RETIRED.MACRO_FUSED / (tma_retiring * tma_info_thread_slots)", - "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", + "MetricGroup": "Branches;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_fused_instructions", "MetricThreshold": "tma_fused_instructions > 0.1 & tma_light_operations > 0.6", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions. The instruction pairs of CMP+JCC or DEC+JCC are commonly used examples.", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions. CMP+JCC or DEC+JCC are common examples of legacy fusions. {([MTL] Note new MOV+OP and Load+OP fusions appear under Other_Light_Ops in MTL!)}", "ScaleUnit": "100%" }, { @@ -639,13 +664,13 @@ "MetricName": "tma_heavy_operations", "MetricThreshold": "tma_heavy_operations > 0.1", "MetricgroupNoGroup": "TopdownL2;Default", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. Sample with: UOPS_RETIRED.HEAVY", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .). Sample with: UOPS_RETIRED.HEAVY", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", "MetricExpr": "ICACHE_DATA.STALLS / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricGroup": "BigFootprint;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_icache_misses", "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS", @@ -653,7 +678,7 @@ }, { "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", - "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricExpr": "tma_info_bottleneck_mispredictions * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES / 100", "MetricGroup": "Bad;BrMispredicts;tma_issueBM", "MetricName": "tma_info_bad_spec_branch_misprediction_cost", "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers" @@ -694,6 +719,33 @@ "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200" }, { + "BriefDescription": "Speculative to Retired ratio of all clears (covering mispredicts and nukes)", + "MetricExpr": "INT_MISC.CLEARS_COUNT / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)", + "MetricGroup": "BrMispredicts", + "MetricName": "tma_info_bad_spec_spec_clears_ratio" + }, + { + "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", + "MetricExpr": "(100 * (1 - max(0, topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - topdown\\-mem\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound)) / (((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + cpu@RS.EMPTY\\,umask\\=0x1@) / CPU_CLK_UNHALTED.THREAD * (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS) / CPU_CLK_UNHALTED.THREAD * CPU_CLK_UNHALTED.THREAD + (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@)) / CPU_CLK_UNHALTED.THREAD if ARITH.DIV_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS else (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@) / CPU_CLK_UNHALTED.THREAD) if max(0, topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - topdown\\-mem\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound)) < (((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + cpu@RS.EMPTY\\,umask\\=0x1@) / CPU_CLK_UNHALTED.THREAD * (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS) / CPU_CLK_UNHALTED.THREAD * CPU_CLK_UNHALTED.THREAD + (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@)) / CPU_CLK_UNHALTED.THREAD if ARITH.DIV_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS else (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@) / CPU_CLK_UNHALTED.THREAD) else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0) + 0 * slots", + "MetricGroup": "Cor;SMT;TopdownL1;tma_L1_group", + "MetricName": "tma_info_botlnk_core_bound_likely", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck.", + "MetricExpr": "100 * (100 * ((topdown\\-fetch\\-lat / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / slots) * (DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) / (ICACHE_DATA.STALLS / CPU_CLK_UNHALTED.THREAD + ICACHE_TAG.STALLS / CPU_CLK_UNHALTED.THREAD + (INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD + INT_MISC.UNKNOWN_BRANCH_CYCLES / CPU_CLK_UNHALTED.THREAD) + min(3 * cpu@UOPS_RETIRED.MS\\,cmask\\=0x1\\,edge\\=0x1@ / (UOPS_RETIRED.SLOTS / UOPS_ISSUED.ANY) / CPU_CLK_UNHALTED.THREAD, 1) + DECODE.LCP / CPU_CLK_UNHALTED.THREAD + DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) + max(0, topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / slots - (topdown\\-fetch\\-lat / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / slots)) * ((IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / (CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else CPU_CLK_UNHALTED.THREAD) / 2) / ((IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / (CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else CPU_CLK_UNHALTED.THREAD) / 2 + (IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / (CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else CPU_CLK_UNHALTED.THREAD) / 2)))", + "MetricGroup": "DSBmiss;Fed;TopdownL1;tma_L1_group", + "MetricName": "tma_info_botlnk_dsb_misses", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck.", + "MetricExpr": "100 * (100 * ((topdown\\-fetch\\-lat / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / slots) * (ICACHE_DATA.STALLS / CPU_CLK_UNHALTED.THREAD) / (ICACHE_DATA.STALLS / CPU_CLK_UNHALTED.THREAD + ICACHE_TAG.STALLS / CPU_CLK_UNHALTED.THREAD + (INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD + INT_MISC.UNKNOWN_BRANCH_CYCLES / CPU_CLK_UNHALTED.THREAD) + min(3 * cpu@UOPS_RETIRED.MS\\,cmask\\=0x1\\,edge\\=0x1@ / (UOPS_RETIRED.SLOTS / UOPS_ISSUED.ANY) / CPU_CLK_UNHALTED.THREAD, 1) + DECODE.LCP / CPU_CLK_UNHALTED.THREAD + DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD)))", + "MetricGroup": "Fed;FetchLat;IcMiss;TopdownL1;tma_L1_group", + "MetricName": "tma_info_botlnk_ic_misses", + "MetricgroupNoGroup": "TopdownL1" + }, + { "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)", "MetricGroup": "Cor;SMT", @@ -717,61 +769,98 @@ "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: " }, { + "BriefDescription": "Total pipeline cost of \"useful operations\" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead.", + "MetricExpr": "100 * (tma_retiring - (BR_INST_RETIRED.ALL_BRANCHES + BR_INST_RETIRED.NEAR_CALL) / tma_info_thread_slots - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", + "MetricGroup": "Ret", + "MetricName": "tma_info_bottleneck_base_non_br", + "MetricThreshold": "tma_info_bottleneck_base_non_br > 20" + }, + { "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", - "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC", + "MetricGroup": "BigFootprint;Fed;Frontend;IcMiss;MemoryTLB", "MetricName": "tma_info_bottleneck_big_code", - "MetricThreshold": "tma_info_bottleneck_big_code > 20", - "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_bottleneck_branching_overhead" + "MetricThreshold": "tma_info_bottleneck_big_code > 20" }, { "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", - "MetricExpr": "100 * ((BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_thread_slots)", - "MetricGroup": "Ret;tma_issueBC", + "MetricExpr": "100 * ((BR_INST_RETIRED.ALL_BRANCHES + BR_INST_RETIRED.NEAR_CALL) / tma_info_thread_slots)", + "MetricGroup": "Ret", "MetricName": "tma_info_bottleneck_branching_overhead", - "MetricThreshold": "tma_info_bottleneck_branching_overhead > 10", - "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_bottleneck_big_code" + "MetricThreshold": "tma_info_bottleneck_branching_overhead > 5" + }, + { + "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))", + "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", + "MetricName": "tma_info_bottleneck_cache_memory_bandwidth", + "MetricThreshold": "tma_info_bottleneck_cache_memory_bandwidth > 20", + "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" + }, + { + "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", + "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", + "MetricName": "tma_info_bottleneck_cache_memory_latency", + "MetricThreshold": "tma_info_bottleneck_cache_memory_latency > 20", + "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency" + }, + { + "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", + "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_amx_busy + tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * tma_amx_busy / (tma_amx_busy + tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_amx_busy + tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", + "MetricGroup": "Cor;tma_issueComp", + "MetricName": "tma_info_bottleneck_compute_bound_est", + "MetricThreshold": "tma_info_bottleneck_compute_bound_est > 20", + "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: " }, { "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", - "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code", + "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - (1 - INST_RETIRED.REP_ITERATION / cpu@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))) - tma_info_bottleneck_big_code", "MetricGroup": "Fed;FetchBW;Frontend", "MetricName": "tma_info_bottleneck_instruction_fetch_bw", "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20" }, { - "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", - "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))", - "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", - "MetricName": "tma_info_bottleneck_memory_bandwidth", - "MetricThreshold": "tma_info_bottleneck_memory_bandwidth > 20", - "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" + "BriefDescription": "Total pipeline cost of irregular execution (e.g", + "MetricExpr": "100 * ((1 - INST_RETIRED.REP_ITERATION / cpu@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + cpu@RS.EMPTY\\,umask\\=1@ / tma_info_thread_clks * tma_ports_utilized_0) / (tma_amx_busy + tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", + "MetricGroup": "Bad;Cor;Ret;tma_issueMS", + "MetricName": "tma_info_bottleneck_irregular_overhead", + "MetricThreshold": "tma_info_bottleneck_irregular_overhead > 10", + "PublicDescription": "Total pipeline cost of irregular execution (e.g. FP-assists in HPC, Wait time with work imbalance multithreaded workloads, overhead in system services or virtualized environments). Related metrics: tma_microcode_sequencer, tma_ms_switches" }, { "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", - "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", + "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", "MetricName": "tma_info_bottleneck_memory_data_tlbs", "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20", - "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store" + "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_synchronization" }, { - "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", - "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound))", - "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", - "MetricName": "tma_info_bottleneck_memory_latency", - "MetricThreshold": "tma_info_bottleneck_memory_latency > 20", - "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency" + "BriefDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) * tma_remote_cache / (tma_local_mem + tma_remote_cache + tma_remote_mem) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))", + "MetricGroup": "Mem;Offcore;tma_issueTLB", + "MetricName": "tma_info_bottleneck_memory_synchronization", + "MetricThreshold": "tma_info_bottleneck_memory_synchronization > 10", + "PublicDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs" }, { "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", - "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", + "MetricExpr": "100 * (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", "MetricName": "tma_info_bottleneck_mispredictions", "MetricThreshold": "tma_info_bottleneck_mispredictions > 20", "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers" }, { + "BriefDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class)", + "MetricExpr": "100 - (tma_info_bottleneck_big_code + tma_info_bottleneck_instruction_fetch_bw + tma_info_bottleneck_mispredictions + tma_info_bottleneck_cache_memory_bandwidth + tma_info_bottleneck_cache_memory_latency + tma_info_bottleneck_memory_data_tlbs + tma_info_bottleneck_memory_synchronization + tma_info_bottleneck_compute_bound_est + tma_info_bottleneck_irregular_overhead + tma_info_bottleneck_branching_overhead + tma_info_bottleneck_base_non_br)", + "MetricGroup": "Cor;Offcore", + "MetricName": "tma_info_bottleneck_other_bottlenecks", + "MetricThreshold": "tma_info_bottleneck_other_bottlenecks > 20", + "PublicDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls." + }, + { "BriefDescription": "Fraction of branches that are CALL or RET", "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches", @@ -803,7 +892,7 @@ }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", - "MetricExpr": "CPU_CLK_UNHALTED.DISTRIBUTED", + "MetricExpr": "(CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else tma_info_thread_clks)", "MetricGroup": "SMT", "MetricName": "tma_info_core_core_clks" }, @@ -814,8 +903,14 @@ "MetricName": "tma_info_core_coreipc" }, { + "BriefDescription": "uops Executed per Cycle", + "MetricExpr": "UOPS_EXECUTED.THREAD / tma_info_thread_clks", + "MetricGroup": "Power", + "MetricName": "tma_info_core_epc" + }, + { "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + FP_ARITH_INST_RETIRED2.SCALAR_HALF + 2 * (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED2.COMPLEX_SCALAR_HALF) + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * (FP_ARITH_INST_RETIRED2.128B_PACKED_HALF + cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@) + 16 * (FP_ARITH_INST_RETIRED2.256B_PACKED_HALF + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) + 32 * FP_ARITH_INST_RETIRED2.512B_PACKED_HALF + 4 * AMX_OPS_RETIRED.BF16", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks", "MetricGroup": "Flops;Ret", "MetricName": "tma_info_core_flopc" }, @@ -827,8 +922,8 @@ "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)", + "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "tma_info_core_ilp" }, @@ -884,6 +979,13 @@ "MetricName": "tma_info_frontend_l2mpki_code_all" }, { + "BriefDescription": "Average number of cycles the front-end was delayed due to an Unknown Branch detection", + "MetricExpr": "INT_MISC.UNKNOWN_BRANCH_CYCLES / cpu@INT_MISC.UNKNOWN_BRANCH_CYCLES\\,cmask\\=1\\,edge@", + "MetricGroup": "Fed", + "MetricName": "tma_info_frontend_unknown_branch_cost", + "PublicDescription": "Average number of cycles the front-end was delayed due to an Unknown Branch detection. See Unknown_Branches node." + }, + { "BriefDescription": "Branch instructions per taken branch.", "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", "MetricGroup": "Branches;Fed;PGO", @@ -898,27 +1000,11 @@ }, { "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + FP_ARITH_INST_RETIRED2.SCALAR + (cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@ + FP_ARITH_INST_RETIRED2.VECTOR))", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR + FP_ARITH_INST_RETIRED2.SCALAR + (cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@ + FP_ARITH_INST_RETIRED2.VECTOR))", "MetricGroup": "Flops;InsType", "MetricName": "tma_info_inst_mix_iparith", "MetricThreshold": "tma_info_inst_mix_iparith < 10", - "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." - }, - { - "BriefDescription": "Instructions per FP Arithmetic AMX operation (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / AMX_OPS_RETIRED.BF16", - "MetricGroup": "Flops;FpVector;InsType;Server", - "MetricName": "tma_info_inst_mix_iparith_amx_f16", - "MetricThreshold": "tma_info_inst_mix_iparith_amx_f16 < 10", - "PublicDescription": "Instructions per FP Arithmetic AMX operation (lower number means higher occurrence rate). Operations factored per matrices' sizes of the AMX instructions." - }, - { - "BriefDescription": "Instructions per Integer Arithmetic AMX operation (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / AMX_OPS_RETIRED.INT8", - "MetricGroup": "InsType;IntVector;Server", - "MetricName": "tma_info_inst_mix_iparith_amx_int8", - "MetricThreshold": "tma_info_inst_mix_iparith_amx_int8 < 10", - "PublicDescription": "Instructions per Integer Arithmetic AMX operation (lower number means higher occurrence rate). Operations factored per matrices' sizes of the AMX instructions." + "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW." }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", @@ -926,7 +1012,7 @@ "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx128", "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10", - "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", @@ -934,7 +1020,7 @@ "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx256", "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10", - "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)", @@ -942,7 +1028,7 @@ "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx512", "MetricThreshold": "tma_info_inst_mix_iparith_avx512 < 10", - "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", @@ -950,7 +1036,15 @@ "MetricGroup": "Flops;FpScalar;InsType", "MetricName": "tma_info_inst_mix_iparith_scalar_dp", "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10", - "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." + }, + { + "BriefDescription": "Instructions per FP Arithmetic Scalar Half-Precision instruction (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED2.SCALAR", + "MetricGroup": "Flops;FpScalar;InsType;Server", + "MetricName": "tma_info_inst_mix_iparith_scalar_hp", + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_hp < 10", + "PublicDescription": "Instructions per FP Arithmetic Scalar Half-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", @@ -958,7 +1052,7 @@ "MetricGroup": "Flops;FpScalar;InsType", "MetricName": "tma_info_inst_mix_iparith_scalar_sp", "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10", - "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", @@ -976,7 +1070,7 @@ }, { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_core_flopc", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", "MetricGroup": "Flops;InsType", "MetricName": "tma_info_inst_mix_ipflop", "MetricThreshold": "tma_info_inst_mix_ipflop < 10" @@ -989,6 +1083,12 @@ "MetricThreshold": "tma_info_inst_mix_ipload < 3" }, { + "BriefDescription": "Instructions per PAUSE (lower number means higher occurrence rate)", + "MetricExpr": "tma_info_inst_mix_instructions / CPU_CLK_UNHALTED.PAUSE_INST", + "MetricGroup": "Flops;FpVector;InsType", + "MetricName": "tma_info_inst_mix_ippause" + }, + { "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES", "MetricGroup": "InsType", @@ -1011,16 +1111,30 @@ "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_lcp" }, { + "BriefDescription": "\"Bus lock\" per kilo instruction", + "MetricExpr": "tma_info_memory_mix_bus_lock_pki", + "MetricGroup": "Mem;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_bus_lock_pki", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "tma_info_memory_tlb_code_stlb_mpki", + "MetricGroup": "Fed;MemoryTLB;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_code_stlb_mpki", + "MetricgroupNoGroup": "TopdownL1" + }, + { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l1d_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l1d_cache_fill_bw" + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t" }, { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l2_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l2_cache_fill_bw" + "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t" }, { "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction", @@ -1036,77 +1150,232 @@ }, { "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l3_cache_access_bw", "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_memory_core_l3_cache_access_bw" + "MetricName": "tma_info_memory_core_l3_cache_access_bw_2t" }, { "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l3_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l3_cache_fill_bw" + "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t" + }, + { + "BriefDescription": "Average Parallel L2 cache miss data reads", + "MetricExpr": "tma_info_memory_latency_data_l2_mlp", + "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_data_l2_mlp", + "MetricgroupNoGroup": "TopdownL1" }, { "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_fb_hpki" }, { + "BriefDescription": "", + "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l1d_cache_fill_bw" + }, + { + "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / (duration_time * 1e3 / 1e3)", + "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t", + "MetricgroupNoGroup": "TopdownL1" + }, + { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l1mpki" }, { "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l1mpki_load" }, { + "BriefDescription": "", + "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l2_cache_fill_bw" + }, + { + "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / (duration_time * 1e3 / 1e3)", + "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_l2_cache_fill_bw_2t", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction", + "MetricExpr": "1e3 * L2_LINES_OUT.NON_SILENT / INST_RETIRED.ANY", + "MetricGroup": "L2Evicts;Mem;Server;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_l2_evictions_nonsilent_pki", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo instruction where the evicted lines are dropped (no writeback to L3 or memory)", + "MetricExpr": "1e3 * L2_LINES_OUT.SILENT / INST_RETIRED.ANY", + "MetricGroup": "L2Evicts;Mem;Server;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_l2_evictions_silent_pki", + "MetricgroupNoGroup": "TopdownL1" + }, + { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l2hpki_all" }, { "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l2hpki_load" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "Backend;CacheMisses;Mem", + "MetricGroup": "Backend;CacheHits;Mem", "MetricName": "tma_info_memory_l2mpki" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem;Offcore", + "MetricGroup": "CacheHits;Mem;Offcore", "MetricName": "tma_info_memory_l2mpki_all" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l2mpki_load" }, { + "BriefDescription": "", + "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_l3_cache_access_bw" + }, + { + "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / (duration_time * 1e3 / 1e3)", + "MetricGroup": "Mem;MemoryBW;Offcore;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_l3_cache_access_bw_2t", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l3_cache_fill_bw" + }, + { + "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / (duration_time * 1e3 / 1e3)", + "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_l3_cache_fill_bw_2t", + "MetricgroupNoGroup": "TopdownL1" + }, + { "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "Mem", "MetricName": "tma_info_memory_l3mpki" }, { + "BriefDescription": "Average Parallel L2 cache miss data reads", + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_latency_data_l2_mlp" + }, + { + "BriefDescription": "Average Latency for L2 cache miss demand Loads", + "MetricExpr": "tma_info_memory_load_l2_miss_latency", + "MetricGroup": "Memory_Lat;Offcore", + "MetricName": "tma_info_memory_latency_load_l2_miss_latency" + }, + { + "BriefDescription": "Average Parallel L2 cache miss demand Loads", + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=1@", + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_latency_load_l2_mlp" + }, + { + "BriefDescription": "Average Latency for L3 cache miss demand Loads", + "MetricExpr": "tma_info_memory_load_l3_miss_latency", + "MetricGroup": "Memory_Lat;Offcore", + "MetricName": "tma_info_memory_latency_load_l3_miss_latency" + }, + { + "BriefDescription": "Average Latency for L2 cache miss demand Loads", + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", + "MetricGroup": "Memory_Lat;Offcore;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_load_l2_miss_latency", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "Average Parallel L2 cache miss demand Loads", + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=0x1@", + "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_load_l2_mlp", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "Average Latency for L3 cache miss demand Loads", + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD / OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD", + "MetricGroup": "Memory_Lat;Offcore;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_load_l3_miss_latency", + "MetricgroupNoGroup": "TopdownL1" + }, + { "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", "MetricExpr": "L1D_PEND_MISS.PENDING / MEM_LOAD_COMPLETED.L1_MISS_ANY", "MetricGroup": "Mem;MemoryBound;MemoryLat", "MetricName": "tma_info_memory_load_miss_real_latency" }, { + "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "tma_info_memory_tlb_load_stlb_mpki", + "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_load_stlb_mpki", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "\"Bus lock\" per kilo instruction", + "MetricExpr": "1e3 * SQ_MISC.BUS_LOCK / INST_RETIRED.ANY", + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_mix_bus_lock_pki" + }, + { + "BriefDescription": "Off-core accesses per kilo instruction for modified write requests", + "MetricExpr": "1e3 * OCR.MODIFIED_WRITE.ANY_RESPONSE / tma_info_inst_mix_instructions", + "MetricGroup": "Offcore", + "MetricName": "tma_info_memory_mix_offcore_mwrite_any_pki" + }, + { + "BriefDescription": "Off-core accesses per kilo instruction for reads-to-core requests (speculative; including in-core HW prefetches)", + "MetricExpr": "1e3 * OCR.READS_TO_CORE.ANY_RESPONSE / tma_info_inst_mix_instructions", + "MetricGroup": "CacheHits;Offcore", + "MetricName": "tma_info_memory_mix_offcore_read_any_pki" + }, + { + "BriefDescription": "L3 cache misses per kilo instruction for reads-to-core requests (speculative; including in-core HW prefetches)", + "MetricExpr": "1e3 * OCR.READS_TO_CORE.L3_MISS / tma_info_inst_mix_instructions", + "MetricGroup": "Offcore", + "MetricName": "tma_info_memory_mix_offcore_read_l3m_pki" + }, + { + "BriefDescription": "Un-cacheable retired load per kilo instruction", + "MetricExpr": "tma_info_memory_uc_load_pki", + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_mix_uc_load_pki" + }, + { "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", "MetricGroup": "Mem;MemoryBW;MemoryBound", @@ -1114,52 +1383,84 @@ "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" }, { - "BriefDescription": "Average Parallel L2 cache miss data reads", - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", - "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_memory_oro_data_l2_mlp" + "BriefDescription": "Off-core accesses per kilo instruction for modified write requests", + "MetricExpr": "1e3 * OCR.MODIFIED_WRITE.ANY_RESPONSE / INST_RETIRED.ANY", + "MetricGroup": "Offcore;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_offcore_mwrite_any_pki", + "MetricgroupNoGroup": "TopdownL1" }, { - "BriefDescription": "Average Latency for L2 cache miss demand Loads", - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", - "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_memory_oro_load_l2_miss_latency" + "BriefDescription": "Off-core accesses per kilo instruction for reads-to-core requests (speculative; including in-core HW prefetches)", + "MetricExpr": "1e3 * OCR.READS_TO_CORE.ANY_RESPONSE / INST_RETIRED.ANY", + "MetricGroup": "CacheHits;Offcore;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_offcore_read_any_pki", + "MetricgroupNoGroup": "TopdownL1" }, { - "BriefDescription": "Average Parallel L2 cache miss demand Loads", - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=1@", - "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_memory_oro_load_l2_mlp" + "BriefDescription": "L3 cache misses per kilo instruction for reads-to-core requests (speculative; including in-core HW prefetches)", + "MetricExpr": "1e3 * OCR.READS_TO_CORE.L3_MISS / INST_RETIRED.ANY", + "MetricGroup": "Offcore;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_offcore_read_l3m_pki", + "MetricgroupNoGroup": "TopdownL1" }, { - "BriefDescription": "Average Latency for L3 cache miss demand Loads", - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD / OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD", - "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_memory_oro_load_l3_miss_latency" + "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", + "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING) / (4 * (CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else CPU_CLK_UNHALTED.THREAD))", + "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_page_walks_utilization", + "MetricgroupNoGroup": "TopdownL1" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t" + "BriefDescription": "Average DRAM BW for Reads-to-Core (R2C) covering for memory attached to local- and remote-socket", + "MetricExpr": "64 * OCR.READS_TO_CORE.DRAM / 1e9 / (duration_time * 1e3 / 1e3)", + "MetricGroup": "HPC;Mem;MemoryBW;SoC;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_r2c_dram_bw", + "MetricgroupNoGroup": "TopdownL1", + "PublicDescription": "Average DRAM BW for Reads-to-Core (R2C) covering for memory attached to local- and remote-socket. See R2C_Offcore_BW." }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t" + "BriefDescription": "Average L3-cache miss BW for Reads-to-Core (R2C)", + "MetricExpr": "64 * OCR.READS_TO_CORE.L3_MISS / 1e9 / (duration_time * 1e3 / 1e3)", + "MetricGroup": "HPC;Mem;MemoryBW;SoC;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_r2c_l3m_bw", + "MetricgroupNoGroup": "TopdownL1", + "PublicDescription": "Average L3-cache miss BW for Reads-to-Core (R2C). This covering going to DRAM or other memory off-chip memory tears. See R2C_Offcore_BW." }, { - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l3_cache_access_bw", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t" + "BriefDescription": "Average Off-core access BW for Reads-to-Core (R2C)", + "MetricExpr": "64 * OCR.READS_TO_CORE.ANY_RESPONSE / 1e9 / (duration_time * 1e3 / 1e3)", + "MetricGroup": "HPC;Mem;MemoryBW;SoC;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_r2c_offcore_bw", + "MetricgroupNoGroup": "TopdownL1", + "PublicDescription": "Average Off-core access BW for Reads-to-Core (R2C). R2C account for demand or prefetch load/RFO/code access that fill data into the Core caches." }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t" + "BriefDescription": "Average DRAM BW for Reads-to-Core (R2C) covering for memory attached to local- and remote-socket", + "MetricExpr": "64 * OCR.READS_TO_CORE.DRAM / 1e9 / duration_time", + "MetricGroup": "HPC;Mem;MemoryBW;SoC", + "MetricName": "tma_info_memory_soc_r2c_dram_bw", + "PublicDescription": "Average DRAM BW for Reads-to-Core (R2C) covering for memory attached to local- and remote-socket. See R2C_Offcore_BW." + }, + { + "BriefDescription": "Average L3-cache miss BW for Reads-to-Core (R2C)", + "MetricExpr": "64 * OCR.READS_TO_CORE.L3_MISS / 1e9 / duration_time", + "MetricGroup": "HPC;Mem;MemoryBW;SoC", + "MetricName": "tma_info_memory_soc_r2c_l3m_bw", + "PublicDescription": "Average L3-cache miss BW for Reads-to-Core (R2C). This covering going to DRAM or other memory off-chip memory tears. See R2C_Offcore_BW." + }, + { + "BriefDescription": "Average Off-core access BW for Reads-to-Core (R2C)", + "MetricExpr": "64 * OCR.READS_TO_CORE.ANY_RESPONSE / 1e9 / duration_time", + "MetricGroup": "HPC;Mem;MemoryBW;SoC", + "MetricName": "tma_info_memory_soc_r2c_offcore_bw", + "PublicDescription": "Average Off-core access BW for Reads-to-Core (R2C). R2C account for demand or prefetch load/RFO/code access that fill data into the Core caches." + }, + { + "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "tma_info_memory_tlb_store_stlb_mpki", + "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_store_stlb_mpki", + "MetricgroupNoGroup": "TopdownL1" }, { "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", @@ -1187,15 +1488,22 @@ "MetricName": "tma_info_memory_tlb_store_stlb_mpki" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", - "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", + "BriefDescription": "Un-cacheable retired load per kilo instruction", + "MetricExpr": "1e3 * MEM_LOAD_MISC_RETIRED.UC / INST_RETIRED.ANY", + "MetricGroup": "Mem;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_uc_load_pki", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "", + "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@)", "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", "MetricName": "tma_info_pipeline_execute" }, { "BriefDescription": "Instructions per a microcode Assist invocation", - "MetricExpr": "INST_RETIRED.ANY / cpu@ASSISTS.ANY\\,umask\\=0x1B@", - "MetricGroup": "Pipeline;Ret;Retire", + "MetricExpr": "INST_RETIRED.ANY / ASSISTS.ANY", + "MetricGroup": "MicroSeq;Pipeline;Ret;Retire", "MetricName": "tma_info_pipeline_ipassist", "MetricThreshold": "tma_info_pipeline_ipassist < 100e3", "PublicDescription": "Instructions per a microcode Assist invocation. See Assists tree node for details (lower number means higher occurrence rate)" @@ -1209,41 +1517,62 @@ { "BriefDescription": "Estimated fraction of retirement-cycles dealing with repeat instructions", "MetricExpr": "INST_RETIRED.REP_ITERATION / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@", - "MetricGroup": "Pipeline;Ret", + "MetricGroup": "MicroSeq;Pipeline;Ret", "MetricName": "tma_info_pipeline_strings_cycles", "MetricThreshold": "tma_info_pipeline_strings_cycles > 0.1" }, { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "BriefDescription": "Fraction of cycles the processor is waiting yet unhalted; covering legacy PAUSE instruction, as well as C0.1 / C0.2 power-performance optimized states", + "MetricExpr": "CPU_CLK_UNHALTED.C0_WAIT / tma_info_thread_clks", + "MetricGroup": "C0Wait", + "MetricName": "tma_info_system_c0_wait", + "MetricThreshold": "tma_info_system_c0_wait > 0.05" + }, + { + "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", "MetricGroup": "Power;Summary", - "MetricName": "tma_info_system_average_frequency" + "MetricName": "tma_info_system_core_frequency" }, { - "BriefDescription": "Average CPU Utilization", + "BriefDescription": "Average CPU Utilization (percentage)", "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", "MetricGroup": "HPC;Summary", "MetricName": "tma_info_system_cpu_utilization" }, { + "BriefDescription": "Average number of utilized CPUs", + "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization", + "MetricGroup": "Summary", + "MetricName": "tma_info_system_cpus_utilized" + }, + { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW", "MetricName": "tma_info_system_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" }, { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "tma_info_core_flopc / duration_time", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time", "MetricGroup": "Cor;Flops;HPC", "MetricName": "tma_info_system_gflops", - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width" }, { - "BriefDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]", + "BriefDescription": "Average IO (network or disk) Bandwidth Use for Reads [GB / sec]", "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR * 64 / 1e9 / duration_time", - "MetricGroup": "IoBW;Mem;Server;SoC", - "MetricName": "tma_info_system_io_write_bw" + "MetricGroup": "IoBW;MemOffcore;Server;SoC", + "MetricName": "tma_info_system_io_read_bw", + "PublicDescription": "Average IO (network or disk) Bandwidth Use for Reads [GB / sec]. Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU" + }, + { + "BriefDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]", + "MetricExpr": "(UNC_CHA_TOR_INSERTS.IO_ITOM + UNC_CHA_TOR_INSERTS.IO_ITOMCACHENEAR) * 64 / 1e9 / duration_time", + "MetricGroup": "IoBW;MemOffcore;Server;SoC", + "MetricName": "tma_info_system_io_write_bw", + "PublicDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]. Bandwidth of IO writes that are initiated by end device controllers that are writing memory to the CPU" }, { "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", @@ -1268,7 +1597,7 @@ { "BriefDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]", "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_DDR / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_DDR) / uncore_cha_0@event\\=0x1@", - "MetricGroup": "Mem;MemoryLat;Server;SoC", + "MetricGroup": "MemOffcore;MemoryLat;Server;SoC", "MetricName": "tma_info_system_mem_dram_read_latency", "PublicDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches" }, @@ -1282,7 +1611,7 @@ { "BriefDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]", "MetricExpr": "(1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM) / uncore_cha_0@event\\=0x1@ if #has_pmem > 0 else 0)", - "MetricGroup": "Mem;MemoryLat;Server;SoC", + "MetricGroup": "MemOffcore;MemoryLat;Server;SoC", "MetricName": "tma_info_system_mem_pmm_read_latency", "PublicDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches" }, @@ -1297,13 +1626,13 @@ { "BriefDescription": "Average 3DXP Memory Bandwidth Use for reads [GB / sec]", "MetricExpr": "(64 * UNC_M_PMM_RPQ_INSERTS / 1e9 / duration_time if #has_pmem > 0 else 0)", - "MetricGroup": "Mem;MemoryBW;Server;SoC", + "MetricGroup": "MemOffcore;MemoryBW;Server;SoC", "MetricName": "tma_info_system_pmm_read_bw" }, { "BriefDescription": "Average 3DXP Memory Bandwidth Use for Writes [GB / sec]", "MetricExpr": "(64 * UNC_M_PMM_WPQ_INSERTS / 1e9 / duration_time if #has_pmem > 0 else 0)", - "MetricGroup": "Mem;MemoryBW;Server;SoC", + "MetricGroup": "MemOffcore;MemoryBW;Server;SoC", "MetricName": "tma_info_system_pmm_write_bw" }, { @@ -1319,18 +1648,18 @@ "MetricName": "tma_info_system_socket_clks" }, { - "BriefDescription": "Tera Integer (matrix) Operations Per Second", - "MetricExpr": "8 * AMX_OPS_RETIRED.INT8 / 1e12 / duration_time", - "MetricGroup": "Cor;HPC;IntVector;Server", - "MetricName": "tma_info_system_tiops" - }, - { "BriefDescription": "Average Frequency Utilization relative nominal frequency", "MetricExpr": "tma_info_thread_clks / CPU_CLK_UNHALTED.REF_TSC", "MetricGroup": "Power", "MetricName": "tma_info_system_turbo_utilization" }, { + "BriefDescription": "Measured Average Uncore Frequency for the SoC [GHz]", + "MetricExpr": "tma_info_system_socket_clks / 1e9 / duration_time", + "MetricGroup": "SoC", + "MetricName": "tma_info_system_uncore_frequency" + }, + { "BriefDescription": "Cross-socket Ultra Path Interconnect (UPI) data transmit bandwidth for data only [MB / sec]", "MetricExpr": "UNC_UPI_TxL_FLITS.ALL_DATA * 64 / 9 / 1e6", "MetricGroup": "Server;SoC", @@ -1388,17 +1717,8 @@ "MetricThreshold": "tma_info_thread_uptb < 9" }, { - "BriefDescription": "This metric approximates arithmetic Integer (Int) matrix uops fraction the CPU has retired (aggregated across all supported Int datatypes in AMX engine)", - "MetricExpr": "cpu@AMX_OPS_RETIRED.INT8\\,cmask\\=1@ / (tma_retiring * tma_info_thread_slots)", - "MetricGroup": "Compute;HPC;IntVector;Pipeline;Server;TopdownL4;tma_L4_group;tma_int_operations_group", - "MetricName": "tma_int_amx", - "MetricThreshold": "tma_int_amx > 0.1 & (tma_int_operations > 0.1 & tma_light_operations > 0.6)", - "PublicDescription": "This metric approximates arithmetic Integer (Int) matrix uops fraction the CPU has retired (aggregated across all supported Int datatypes in AMX engine). Refer to AMX_Busy and TIOPs metrics for actual AMX utilization and Int performance, resp.", - "ScaleUnit": "100%" - }, - { "BriefDescription": "This metric represents overall Integer (Int) select operations fraction the CPU has executed (retired)", - "MetricExpr": "tma_int_vector_128b + tma_int_vector_256b + tma_shuffles + tma_int_amx", + "MetricExpr": "tma_int_vector_128b + tma_int_vector_256b", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_int_operations", "MetricThreshold": "tma_int_operations > 0.1 & tma_light_operations > 0.6", @@ -1415,18 +1735,18 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric represents 256-bit vector Integer ADD/SUB/SAD or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired", + "BriefDescription": "This metric represents 256-bit vector Integer ADD/SUB/SAD/MUL or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired", "MetricExpr": "(INT_VEC_RETIRED.ADD_256 + INT_VEC_RETIRED.MUL_256 + INT_VEC_RETIRED.VNNI_256) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;IntVector;Pipeline;TopdownL4;tma_L4_group;tma_int_operations_group;tma_issue2P", "MetricName": "tma_int_vector_256b", "MetricThreshold": "tma_int_vector_256b > 0.1 & (tma_int_operations > 0.1 & tma_light_operations > 0.6)", - "PublicDescription": "This metric represents 256-bit vector Integer ADD/SUB/SAD or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_int_vector_128b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", + "PublicDescription": "This metric represents 256-bit vector Integer ADD/SUB/SAD/MUL or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_int_vector_128b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", "MetricExpr": "ICACHE_TAG.STALLS / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS", @@ -1435,7 +1755,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", "MetricExpr": "max((EXE_ACTIVITY.BOUND_ON_LOADS - MEMORY_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1", @@ -1444,7 +1764,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", "MetricExpr": "(MEMORY_ACTIVITY.STALLS_L1D_MISS - MEMORY_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L2_HIT_PS", @@ -1453,19 +1773,19 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", "MetricExpr": "(MEMORY_ACTIVITY.STALLS_L2_MISS - MEMORY_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", - "MetricExpr": "33 * tma_info_system_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", + "MetricExpr": "33 * tma_info_system_core_frequency * (MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2)) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_memory_latency, tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_cache_memory_latency, tma_mem_latency", "ScaleUnit": "100%" }, { @@ -1485,7 +1805,7 @@ "MetricName": "tma_light_operations", "MetricThreshold": "tma_light_operations > 0.6", "MetricgroupNoGroup": "TopdownL2;Default", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST", "ScaleUnit": "100%" }, { @@ -1515,10 +1835,10 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory", - "MetricExpr": "71 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "MetricExpr": "71 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Server;TopdownL5;tma_L5_group;tma_mem_latency_group", - "MetricName": "tma_local_dram", - "MetricThreshold": "tma_local_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", + "MetricName": "tma_local_mem", + "MetricThreshold": "tma_local_mem > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory. Caching will improve the latency and increase performance. Sample with: MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM_PS", "ScaleUnit": "100%" }, @@ -1551,21 +1871,21 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_memory_latency, tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_cache_memory_latency, tma_l3_hit_latency", "ScaleUnit": "100%" }, { @@ -1583,9 +1903,9 @@ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to LFENCE Instructions.", "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "13 * MISC2_RETIRED.LFENCE / tma_info_thread_clks", - "MetricGroup": "TopdownL6;tma_L6_group;tma_serializing_operation_group", + "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group", "MetricName": "tma_memory_fence", - "MetricThreshold": "tma_memory_fence > 0.05 & (tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))))", + "MetricThreshold": "tma_memory_fence > 0.05 & (tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", "ScaleUnit": "100%" }, { @@ -1602,7 +1922,7 @@ "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", "MetricName": "tma_microcode_sequencer", "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1", - "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: UOPS_RETIRED.MS. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_ms_switches", + "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: UOPS_RETIRED.MS. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_ms_switches", "ScaleUnit": "100%" }, { @@ -1619,32 +1939,32 @@ "MetricExpr": "(IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35)", + "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", "ScaleUnit": "100%" }, { - "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued", + "BriefDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles)", "MetricExpr": "160 * ASSISTS.SSE_AVX_MIX / tma_info_thread_clks", "MetricGroup": "TopdownL5;tma_L5_group;tma_issueMV;tma_ports_utilized_0_group", "MetricName": "tma_mixing_vectors", "MetricThreshold": "tma_mixing_vectors > 0.05", - "PublicDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued. Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches", + "PublicDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles). Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", - "MetricExpr": "3 * cpu@UOPS_RETIRED.MS\\,cmask\\=1\\,edge@ / (tma_retiring * tma_info_thread_slots / UOPS_ISSUED.ANY) / tma_info_thread_clks", + "MetricExpr": "3 * cpu@UOPS_RETIRED.MS\\,cmask\\=1\\,edge@ / (UOPS_RETIRED.SLOTS / UOPS_ISSUED.ANY) / tma_info_thread_clks", "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO", "MetricName": "tma_ms_switches", "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: FRONTEND_RETIRED.MS_FLOWS. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation", + "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: FRONTEND_RETIRED.MS_FLOWS. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused", "MetricExpr": "tma_light_operations * (BR_INST_RETIRED.ALL_BRANCHES - INST_RETIRED.MACRO_FUSED) / (tma_retiring * tma_info_thread_slots)", - "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", + "MetricGroup": "Branches;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_non_fused_branches", "MetricThreshold": "tma_non_fused_branches > 0.1 & tma_light_operations > 0.6", "PublicDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused. Non-conditional branches like direct JMP or CALL would count here. Can be used to examine fusible conditional jumps that were not fused.", @@ -1653,15 +1973,15 @@ { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions", "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / (tma_retiring * tma_info_thread_slots)", - "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", + "MetricGroup": "Pipeline;TopdownL4;tma_L4_group;tma_other_light_ops_group", "MetricName": "tma_nop_instructions", - "MetricThreshold": "tma_nop_instructions > 0.1 & tma_light_operations > 0.6", + "MetricThreshold": "tma_nop_instructions > 0.1 & (tma_other_light_ops > 0.3 & tma_light_operations > 0.6)", "PublicDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. start address of a function or loop body. Sample with: INST_RETIRED.NOP", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes", - "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_int_operations + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches + tma_nop_instructions))", + "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_int_operations + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches))", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_other_light_ops", "MetricThreshold": "tma_other_light_ops > 0.3 & tma_light_operations > 0.6", @@ -1669,6 +1989,22 @@ "ScaleUnit": "100%" }, { + "BriefDescription": "This metric estimates fraction of slots the CPU was stalled due to other cases of misprediction (non-retired x86 branches or other types).", + "MetricExpr": "max(tma_branch_mispredicts * (1 - BR_MISP_RETIRED.ALL_BRANCHES / (INT_MISC.CLEARS_COUNT - MACHINE_CLEARS.COUNT)), 0.0001)", + "MetricGroup": "BrMispredicts;TopdownL3;tma_L3_group;tma_branch_mispredicts_group", + "MetricName": "tma_other_mispredicts", + "MetricThreshold": "tma_other_mispredicts > 0.05 & (tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15)", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Nukes (Machine Clears) not related to memory ordering.", + "MetricExpr": "max(tma_machine_clears * (1 - MACHINE_CLEARS.MEMORY_ORDERING / MACHINE_CLEARS.COUNT), 0.0001)", + "MetricGroup": "Machine_Clears;TopdownL3;tma_L3_group;tma_machine_clears_group", + "MetricName": "tma_other_nukes", + "MetricThreshold": "tma_other_nukes > 0.05 & (tma_machine_clears > 0.1 & tma_bad_speculation > 0.15)", + "ScaleUnit": "100%" + }, + { "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Page Faults", "MetricExpr": "99 * ASSISTS.PAGE_FAULT / tma_info_thread_slots", "MetricGroup": "TopdownL5;tma_L5_group;tma_assists_group", @@ -1679,7 +2015,7 @@ }, { "BriefDescription": "This metric roughly estimates (based on idle latencies) how often the CPU was stalled on accesses to external 3D-Xpoint (Crystal Ridge, a.k.a", - "MetricExpr": "(((1 - ((19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) / (19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + (25 * (MEM_LOAD_RETIRED.LOCAL_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0) + 33 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0))) if #has_pmem > 0 else 0)) * (MEMORY_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks) if 1e6 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM) > MEM_LOAD_RETIRED.L1_MISS else 0) if #has_pmem > 0 else 0)", + "MetricExpr": "(((1 - (19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) / (19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + (25 * (MEM_LOAD_RETIRED.LOCAL_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 33 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))))) * (MEMORY_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks) if 1e6 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM) > MEM_LOAD_RETIRED.L1_MISS else 0) if #has_pmem > 0 else 0)", "MetricGroup": "MemoryBound;Server;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_pmm_bound", "MetricThreshold": "tma_pmm_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1705,17 +2041,17 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)", + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU)", "MetricExpr": "UOPS_DISPATCHED.PORT_6 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_6", "MetricThreshold": "tma_port_6 > 0.6", - "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_int_vector_128b, tma_int_vector_256b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_int_vector_128b, tma_int_vector_256b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", - "MetricExpr": "((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS) + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@)) / tma_info_thread_clks if ARITH.DIV_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@) / tma_info_thread_clks)", + "MetricExpr": "((tma_ports_utilized_0 * tma_info_thread_clks + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@)) / tma_info_thread_clks if ARITH.DIV_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@) / tma_info_thread_clks)", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -1724,7 +2060,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / tma_info_thread_clks + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS) / tma_info_thread_clks", + "MetricExpr": "(cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + cpu@RS.EMPTY\\,umask\\=1@) / tma_info_thread_clks * (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS) / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1756,13 +2092,13 @@ "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", - "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", + "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", "PublicDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Sample with: UOPS_EXECUTED.CYCLES_GE_3", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues", - "MetricExpr": "(135.5 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + 135.5 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "MetricExpr": "(135.5 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + 135.5 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Offcore;Server;Snoop;TopdownL5;tma_L5_group;tma_issueSyncxn;tma_mem_latency_group", "MetricName": "tma_remote_cache", "MetricThreshold": "tma_remote_cache > 0.05 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1771,10 +2107,10 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory", - "MetricExpr": "149 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "MetricExpr": "149 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Server;Snoop;TopdownL5;tma_L5_group;tma_mem_latency_group", - "MetricName": "tma_remote_dram", - "MetricThreshold": "tma_remote_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", + "MetricName": "tma_remote_mem", + "MetricThreshold": "tma_remote_mem > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory. This is caused often due to non-optimal NUMA allocations. #link to NUMA article. Sample with: MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM_PS", "ScaleUnit": "100%" }, @@ -1791,28 +2127,29 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations", - "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / tma_info_thread_clks", - "MetricGroup": "PortsUtil;TopdownL5;tma_L5_group;tma_issueSO;tma_ports_utilized_0_group", + "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / tma_info_thread_clks + tma_c02_wait", + "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group;tma_issueSO", "MetricName": "tma_serializing_operation", - "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))", + "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. Sample with: RESOURCE_STALLS.SCOREBOARD. Related metrics: tma_ms_switches", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric represents Shuffle (cross \"vector lane\" data transfers) uops fraction the CPU has retired.", - "MetricExpr": "INT_VEC_RETIRED.SHUFFLES / (tma_retiring * tma_info_thread_slots)", - "MetricGroup": "HPC;Pipeline;TopdownL4;tma_L4_group;tma_int_operations_group", - "MetricName": "tma_shuffles", - "MetricThreshold": "tma_shuffles > 0.1 & (tma_int_operations > 0.1 & tma_light_operations > 0.6)", + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring Shuffle operations of 256-bit vector size (FP or Integer)", + "MetricExpr": "tma_light_operations * INT_VEC_RETIRED.SHUFFLES / (tma_retiring * tma_info_thread_slots)", + "MetricGroup": "HPC;Pipeline;TopdownL4;tma_L4_group;tma_other_light_ops_group", + "MetricName": "tma_shuffles_256b", + "MetricThreshold": "tma_shuffles_256b > 0.1 & (tma_other_light_ops > 0.3 & tma_light_operations > 0.6)", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring Shuffle operations of 256-bit vector size (FP or Integer). Shuffles may incur slow cross \"vector lane\" data transfers.", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions", "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "CPU_CLK_UNHALTED.PAUSE / tma_info_thread_clks", - "MetricGroup": "TopdownL6;tma_L6_group;tma_serializing_operation_group", + "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group", "MetricName": "tma_slow_pause", - "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))))", + "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions. Sample with: CPU_CLK_UNHALTED.PAUSE_INST", "ScaleUnit": "100%" }, @@ -1840,7 +2177,7 @@ "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%" }, { @@ -1907,10 +2244,10 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", "MetricExpr": "INT_MISC.UNKNOWN_BRANCH_CYCLES / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", + "MetricGroup": "BigFootprint;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", "MetricName": "tma_unknown_branches", "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit). Sample with: FRONTEND_RETIRED.UNKNOWN_BRANCH", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit) hence called Unknown Branches. Sample with: FRONTEND_RETIRED.UNKNOWN_BRANCH", "ScaleUnit": "100%" }, { diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/cache.json b/tools/perf/pmu-events/arch/x86/sierraforest/cache.json index 7f0dc65a55..f937ba0e50 100644 --- a/tools/perf/pmu-events/arch/x86/sierraforest/cache.json +++ b/tools/perf/pmu-events/arch/x86/sierraforest/cache.json @@ -16,6 +16,148 @@ "UMask": "0x4f" }, { + "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to an instruction cache or TLB miss.", + "EventCode": "0x35", + "EventName": "MEM_BOUND_STALLS_IFETCH.ALL", + "SampleAfterValue": "1000003", + "UMask": "0x7f" + }, + { + "BriefDescription": "Counts the number of cycles the core is stalled due to an instruction cache or TLB miss which hit in the L2 cache.", + "EventCode": "0x35", + "EventName": "MEM_BOUND_STALLS_IFETCH.L2_HIT", + "PublicDescription": "Counts the number of cycles the core is stalled due to an instruction cache or Translation Lookaside Buffer (TLB) miss which hit in the L2 cache.", + "SampleAfterValue": "1000003", + "UMask": "0x1" + }, + { + "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to an icache or itlb miss which hit in the LLC.", + "EventCode": "0x35", + "EventName": "MEM_BOUND_STALLS_IFETCH.LLC_HIT", + "SampleAfterValue": "1000003", + "UMask": "0x6" + }, + { + "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to an icache or itlb miss which missed all the caches.", + "EventCode": "0x35", + "EventName": "MEM_BOUND_STALLS_IFETCH.LLC_MISS", + "SampleAfterValue": "1000003", + "UMask": "0x78" + }, + { + "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to an L1 demand load miss.", + "EventCode": "0x34", + "EventName": "MEM_BOUND_STALLS_LOAD.ALL", + "SampleAfterValue": "1000003", + "UMask": "0x7f" + }, + { + "BriefDescription": "Counts the number of cycles the core is stalled due to a demand load which hit in the L2 cache.", + "EventCode": "0x34", + "EventName": "MEM_BOUND_STALLS_LOAD.L2_HIT", + "PublicDescription": "Counts the number of cycles a core is stalled due to a demand load which hit in the L2 cache.", + "SampleAfterValue": "1000003", + "UMask": "0x1" + }, + { + "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to a demand load miss which hit in the LLC.", + "EventCode": "0x34", + "EventName": "MEM_BOUND_STALLS_LOAD.LLC_HIT", + "SampleAfterValue": "1000003", + "UMask": "0x6" + }, + { + "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to a demand load miss which missed all the local caches.", + "EventCode": "0x34", + "EventName": "MEM_BOUND_STALLS_LOAD.LLC_MISS", + "SampleAfterValue": "1000003", + "UMask": "0x78" + }, + { + "BriefDescription": "Counts the number of load ops retired that miss the L3 cache and hit in DRAM", + "EventCode": "0xd3", + "EventName": "MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x1" + }, + { + "BriefDescription": "Counts the number of load ops retired that hit the L1 data cache.", + "EventCode": "0xd1", + "EventName": "MEM_LOAD_UOPS_RETIRED.L1_HIT", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0x1" + }, + { + "BriefDescription": "Counts the number of load ops retired that miss in the L1 data cache.", + "EventCode": "0xd1", + "EventName": "MEM_LOAD_UOPS_RETIRED.L1_MISS", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0x40" + }, + { + "BriefDescription": "Counts the number of load ops retired that hit in the L2 cache.", + "EventCode": "0xd1", + "EventName": "MEM_LOAD_UOPS_RETIRED.L2_HIT", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts the number of load ops retired that miss in the L2 cache.", + "EventCode": "0xd1", + "EventName": "MEM_LOAD_UOPS_RETIRED.L2_MISS", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0x80" + }, + { + "BriefDescription": "Counts the number of load ops retired that hit in the L3 cache.", + "EventCode": "0xd1", + "EventName": "MEM_LOAD_UOPS_RETIRED.L3_HIT", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0x1c" + }, + { + "BriefDescription": "Counts the number of loads that hit in a write combining buffer (WCB), excluding the first load that caused the WCB to allocate.", + "EventCode": "0xd1", + "EventName": "MEM_LOAD_UOPS_RETIRED.WCB_HIT", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0x20" + }, + { + "BriefDescription": "Counts the number of cycles that uops are blocked for any of the following reasons: load buffer, store buffer or RSV full.", + "EventCode": "0x04", + "EventName": "MEM_SCHEDULER_BLOCK.ALL", + "SampleAfterValue": "20003", + "UMask": "0x7" + }, + { + "BriefDescription": "Counts the number of cycles that uops are blocked due to a load buffer full condition.", + "EventCode": "0x04", + "EventName": "MEM_SCHEDULER_BLOCK.LD_BUF", + "SampleAfterValue": "20003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts the number of cycles that uops are blocked due to an RSV full condition.", + "EventCode": "0x04", + "EventName": "MEM_SCHEDULER_BLOCK.RSV", + "SampleAfterValue": "20003", + "UMask": "0x4" + }, + { + "BriefDescription": "Counts the number of cycles that uops are blocked due to a store buffer full condition.", + "EventCode": "0x04", + "EventName": "MEM_SCHEDULER_BLOCK.ST_BUF", + "SampleAfterValue": "20003", + "UMask": "0x1" + }, + { "BriefDescription": "Counts the number of load ops retired.", "Data_LA": "1", "EventCode": "0xd0", @@ -144,6 +286,42 @@ "UMask": "0x5" }, { + "BriefDescription": "Counts the number of load uops retired that performed one or more locks", + "Data_LA": "1", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.LOCK_LOADS", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0x21" + }, + { + "BriefDescription": "Counts the number of memory uops retired that were splits.", + "Data_LA": "1", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.SPLIT", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0x43" + }, + { + "BriefDescription": "Counts the number of retired split load uops.", + "Data_LA": "1", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.SPLIT_LOADS", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0x41" + }, + { + "BriefDescription": "Counts the number of retired split store uops.", + "Data_LA": "1", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.SPLIT_STORES", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0x42" + }, + { "BriefDescription": "Counts the number of stores uops retired same as MEM_UOPS_RETIRED.ALL_STORES", "Data_LA": "1", "EventCode": "0xd0", @@ -151,5 +329,12 @@ "PEBS": "2", "SampleAfterValue": "1000003", "UMask": "0x6" + }, + { + "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to an icache miss", + "EventCode": "0x71", + "EventName": "TOPDOWN_FE_BOUND.ICACHE", + "SampleAfterValue": "1000003", + "UMask": "0x20" } ] diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/floating-point.json b/tools/perf/pmu-events/arch/x86/sierraforest/floating-point.json new file mode 100644 index 0000000000..00c9a8ae0f --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/sierraforest/floating-point.json @@ -0,0 +1,68 @@ +[ + { + "BriefDescription": "Counts the number of cycles when any of the floating point dividers are active.", + "CounterMask": "1", + "EventCode": "0xcd", + "EventName": "ARITH.FPDIV_ACTIVE", + "SampleAfterValue": "1000003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts the number of all types of floating point operations per uop with all default weighting", + "EventCode": "0xc8", + "EventName": "FP_FLOPS_RETIRED.ALL", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x3" + }, + { + "BriefDescription": "This event is deprecated. [This event is alias to FP_FLOPS_RETIRED.FP64]", + "Deprecated": "1", + "EventCode": "0xc8", + "EventName": "FP_FLOPS_RETIRED.DP", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x1" + }, + { + "BriefDescription": "Counts the number of floating point operations that produce 32 bit single precision results [This event is alias to FP_FLOPS_RETIRED.SP]", + "EventCode": "0xc8", + "EventName": "FP_FLOPS_RETIRED.FP32", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts the number of floating point operations that produce 64 bit double precision results [This event is alias to FP_FLOPS_RETIRED.DP]", + "EventCode": "0xc8", + "EventName": "FP_FLOPS_RETIRED.FP64", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x1" + }, + { + "BriefDescription": "This event is deprecated. [This event is alias to FP_FLOPS_RETIRED.FP32]", + "Deprecated": "1", + "EventCode": "0xc8", + "EventName": "FP_FLOPS_RETIRED.SP", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts the number of floating point operations retired that required microcode assist.", + "EventCode": "0xc3", + "EventName": "MACHINE_CLEARS.FP_ASSIST", + "PublicDescription": "Counts the number of floating point operations retired that required microcode assist, which is not a reflection of the number of FP operations, instructions or uops.", + "SampleAfterValue": "20003", + "UMask": "0x4" + }, + { + "BriefDescription": "Counts the number of floating point divide uops retired (x87 and sse, including x87 sqrt).", + "EventCode": "0xc2", + "EventName": "UOPS_RETIRED.FPDIV", + "PEBS": "1", + "SampleAfterValue": "2000003", + "UMask": "0x8" + } +] diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/frontend.json b/tools/perf/pmu-events/arch/x86/sierraforest/frontend.json index be8f1c7e19..356d36aecc 100644 --- a/tools/perf/pmu-events/arch/x86/sierraforest/frontend.json +++ b/tools/perf/pmu-events/arch/x86/sierraforest/frontend.json @@ -1,5 +1,21 @@ [ { + "BriefDescription": "Counts the total number of BACLEARS due to all branch types including conditional and unconditional jumps, returns, and indirect branches.", + "EventCode": "0xe6", + "EventName": "BACLEARS.ANY", + "PublicDescription": "Counts the total number of BACLEARS, which occur when the Branch Target Buffer (BTB) prediction or lack thereof, was corrected by a later branch predictor in the frontend. Includes BACLEARS due to all branch types including conditional and unconditional jumps, returns, and indirect branches.", + "SampleAfterValue": "200003", + "UMask": "0x1" + }, + { + "BriefDescription": "Counts the number of instructions retired that were tagged because empty issue slots were seen before the uop due to ITLB miss", + "EventCode": "0xc6", + "EventName": "FRONTEND_RETIRED.ITLB_MISS", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x10" + }, + { "BriefDescription": "Counts every time the code stream enters into a new cache line by walking sequential from the previous line or being redirected by a jump.", "EventCode": "0x80", "EventName": "ICACHE.ACCESSES", diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/memory.json b/tools/perf/pmu-events/arch/x86/sierraforest/memory.json index 79d8af4510..e0ce2decc8 100644 --- a/tools/perf/pmu-events/arch/x86/sierraforest/memory.json +++ b/tools/perf/pmu-events/arch/x86/sierraforest/memory.json @@ -1,5 +1,71 @@ [ { + "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer is stalled due to any number of reasons, including an L1 miss, WCB full, pagewalk, store address block or store data block, on a load that retires.", + "EventCode": "0x05", + "EventName": "LD_HEAD.ANY_AT_RET", + "SampleAfterValue": "1000003", + "UMask": "0xff" + }, + { + "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer is stalled due to a core bound stall including a store address match, a DTLB miss or a page walk that detains the load from retiring.", + "EventCode": "0x05", + "EventName": "LD_HEAD.L1_BOUND_AT_RET", + "SampleAfterValue": "1000003", + "UMask": "0xf4" + }, + { + "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to a DL1 miss.", + "EventCode": "0x05", + "EventName": "LD_HEAD.L1_MISS_AT_RET", + "SampleAfterValue": "1000003", + "UMask": "0x81" + }, + { + "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to other block cases.", + "EventCode": "0x05", + "EventName": "LD_HEAD.OTHER_AT_RET", + "PublicDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to other block cases such as pipeline conflicts, fences, etc.", + "SampleAfterValue": "1000003", + "UMask": "0xc0" + }, + { + "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to a pagewalk.", + "EventCode": "0x05", + "EventName": "LD_HEAD.PGWALK_AT_RET", + "SampleAfterValue": "1000003", + "UMask": "0xa0" + }, + { + "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to a store address match.", + "EventCode": "0x05", + "EventName": "LD_HEAD.ST_ADDR_AT_RET", + "SampleAfterValue": "1000003", + "UMask": "0x84" + }, + { + "BriefDescription": "Counts the number of machine clears due to memory ordering caused by a snoop from an external agent. Does not count internally generated machine clears such as those due to memory disambiguation.", + "EventCode": "0xc3", + "EventName": "MACHINE_CLEARS.MEMORY_ORDERING", + "SampleAfterValue": "20003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts misaligned loads that are 4K page splits.", + "EventCode": "0x13", + "EventName": "MISALIGN_MEM_REF.LOAD_PAGE_SPLIT", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts misaligned stores that are 4K page splits.", + "EventCode": "0x13", + "EventName": "MISALIGN_MEM_REF.STORE_PAGE_SPLIT", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0x4" + }, + { "BriefDescription": "Counts demand data reads that were not supplied by the L3 cache.", "EventCode": "0xB7", "EventName": "OCR.DEMAND_DATA_RD.L3_MISS", diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/other.json b/tools/perf/pmu-events/arch/x86/sierraforest/other.json index 2414f6ff53..70a9da7e97 100644 --- a/tools/perf/pmu-events/arch/x86/sierraforest/other.json +++ b/tools/perf/pmu-events/arch/x86/sierraforest/other.json @@ -1,5 +1,14 @@ [ { + "BriefDescription": "This event is deprecated. [This event is alias to MISC_RETIRED.LBR_INSERTS]", + "Deprecated": "1", + "EventCode": "0xe4", + "EventName": "LBR_INSERTS.ANY", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x1" + }, + { "BriefDescription": "Counts demand data reads that have any type of response.", "EventCode": "0xB7", "EventName": "OCR.DEMAND_DATA_RD.ANY_RESPONSE", @@ -16,5 +25,12 @@ "MSRValue": "0x10002", "SampleAfterValue": "100003", "UMask": "0x1" + }, + { + "BriefDescription": "Counts the number of issue slots in a UMWAIT or TPAUSE instruction where no uop issues due to the instruction putting the CPU into the C0.1 activity state.", + "EventCode": "0x75", + "EventName": "SERIALIZATION.C01_MS_SCB", + "SampleAfterValue": "200003", + "UMask": "0x4" } ] diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/pipeline.json b/tools/perf/pmu-events/arch/x86/sierraforest/pipeline.json index 41212957ef..ba9843110f 100644 --- a/tools/perf/pmu-events/arch/x86/sierraforest/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/sierraforest/pipeline.json @@ -1,5 +1,13 @@ [ { + "BriefDescription": "Counts the number of cycles when any of the dividers are active.", + "CounterMask": "1", + "EventCode": "0xcd", + "EventName": "ARITH.DIV_ACTIVE", + "SampleAfterValue": "1000003", + "UMask": "0x3" + }, + { "BriefDescription": "Counts the total number of branch instructions retired for all branch types.", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.ALL_BRANCHES", @@ -8,6 +16,71 @@ "SampleAfterValue": "200003" }, { + "BriefDescription": "Counts the number of retired JCC (Jump on Conditional Code) branch instructions retired, includes both taken and not taken branches.", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.COND", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0x7e" + }, + { + "BriefDescription": "Counts the number of taken JCC (Jump on Conditional Code) branch instructions retired.", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.COND_TAKEN", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0xfe" + }, + { + "BriefDescription": "Counts the number of far branch instructions retired, includes far jump, far call and return, and interrupt call and return.", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.FAR_BRANCH", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0xbf" + }, + { + "BriefDescription": "Counts the number of near indirect JMP and near indirect CALL branch instructions retired.", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.INDIRECT", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0xeb" + }, + { + "BriefDescription": "Counts the number of near indirect CALL branch instructions retired.", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.INDIRECT_CALL", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0xfb" + }, + { + "BriefDescription": "This event is deprecated. Refer to new event BR_INST_RETIRED.INDIRECT_CALL", + "Deprecated": "1", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.IND_CALL", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0xfb" + }, + { + "BriefDescription": "Counts the number of near CALL branch instructions retired.", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.NEAR_CALL", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0xf9" + }, + { + "BriefDescription": "Counts the number of near RET branch instructions retired.", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.NEAR_RETURN", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0xf7" + }, + { "BriefDescription": "Counts the total number of mispredicted branch instructions retired for all branch types.", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.ALL_BRANCHES", @@ -16,6 +89,54 @@ "SampleAfterValue": "200003" }, { + "BriefDescription": "Counts the number of mispredicted JCC (Jump on Conditional Code) branch instructions retired.", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.COND", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0x7e" + }, + { + "BriefDescription": "Counts the number of mispredicted taken JCC (Jump on Conditional Code) branch instructions retired.", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.COND_TAKEN", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0xfe" + }, + { + "BriefDescription": "Counts the number of mispredicted near indirect JMP and near indirect CALL branch instructions retired.", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.INDIRECT", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0xeb" + }, + { + "BriefDescription": "Counts the number of mispredicted near indirect CALL branch instructions retired.", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.INDIRECT_CALL", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0xfb" + }, + { + "BriefDescription": "Counts the number of mispredicted near taken branch instructions retired.", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.NEAR_TAKEN", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0x80" + }, + { + "BriefDescription": "Counts the number of mispredicted near RET branch instructions retired.", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.RETURN", + "PEBS": "1", + "SampleAfterValue": "200003", + "UMask": "0xf7" + }, + { "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles", "EventName": "CPU_CLK_UNHALTED.CORE", "SampleAfterValue": "2000003", @@ -68,6 +189,66 @@ "SampleAfterValue": "2000003" }, { + "BriefDescription": "Counts the number of retired loads that are blocked because it initially appears to be store forward blocked, but subsequently is shown not to be blocked based on 4K alias check.", + "EventCode": "0x03", + "EventName": "LD_BLOCKS.ADDRESS_ALIAS", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x4" + }, + { + "BriefDescription": "Counts the number of retired loads that are blocked because its address exactly matches an older store whose data is not ready.", + "EventCode": "0x03", + "EventName": "LD_BLOCKS.DATA_UNKNOWN", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x1" + }, + { + "BriefDescription": "Counts the number of retired loads that are blocked because its address partially overlapped with an older store.", + "EventCode": "0x03", + "EventName": "LD_BLOCKS.STORE_FORWARD", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts the number of machine clears due to memory ordering in which an internal load passes an older store within the same CPU.", + "EventCode": "0xc3", + "EventName": "MACHINE_CLEARS.DISAMBIGUATION", + "SampleAfterValue": "20003", + "UMask": "0x8" + }, + { + "BriefDescription": "Counts the number of machine clears due to a page fault. Counts both I-Side and D-Side (Loads/Stores) page faults. A page fault occurs when either the page is not present, or an access violation occurs.", + "EventCode": "0xc3", + "EventName": "MACHINE_CLEARS.PAGE_FAULT", + "SampleAfterValue": "20003", + "UMask": "0x20" + }, + { + "BriefDescription": "Counts the number of machine clears that flush the pipeline and restart the machine with the use of microcode due to SMC, MEMORY_ORDERING, FP_ASSISTS, PAGE_FAULT, DISAMBIGUATION, and FPC_VIRTUAL_TRAP.", + "EventCode": "0xc3", + "EventName": "MACHINE_CLEARS.SLOW", + "SampleAfterValue": "20003", + "UMask": "0x6f" + }, + { + "BriefDescription": "Counts the number of machine clears due to program modifying data (self modifying code) within 1K of a recently fetched code page.", + "EventCode": "0xc3", + "EventName": "MACHINE_CLEARS.SMC", + "SampleAfterValue": "20003", + "UMask": "0x1" + }, + { + "BriefDescription": "Counts the number of Last Branch Record (LBR) entries. Requires LBRs to be enabled and configured in IA32_LBR_CTL. [This event is alias to LBR_INSERTS.ANY]", + "EventCode": "0xe4", + "EventName": "MISC_RETIRED.LBR_INSERTS", + "PEBS": "1", + "SampleAfterValue": "1000003", + "UMask": "0x1" + }, + { "BriefDescription": "Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear.", "EventCode": "0x73", "EventName": "TOPDOWN_BAD_SPECULATION.ALL", @@ -75,22 +256,201 @@ "SampleAfterValue": "1000003" }, { + "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to Fast Nukes such as Memory Ordering Machine clears and MRN nukes", + "EventCode": "0x73", + "EventName": "TOPDOWN_BAD_SPECULATION.FASTNUKE", + "SampleAfterValue": "1000003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a machine clear (nuke) of any kind including memory ordering and memory disambiguation.", + "EventCode": "0x73", + "EventName": "TOPDOWN_BAD_SPECULATION.MACHINE_CLEARS", + "SampleAfterValue": "1000003", + "UMask": "0x3" + }, + { + "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to Branch Mispredict", + "EventCode": "0x73", + "EventName": "TOPDOWN_BAD_SPECULATION.MISPREDICT", + "SampleAfterValue": "1000003", + "UMask": "0x4" + }, + { + "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to a machine clear (nuke).", + "EventCode": "0x73", + "EventName": "TOPDOWN_BAD_SPECULATION.NUKE", + "SampleAfterValue": "1000003", + "UMask": "0x1" + }, + { "BriefDescription": "Counts the number of retirement slots not consumed due to backend stalls", "EventCode": "0x74", "EventName": "TOPDOWN_BE_BOUND.ALL", "SampleAfterValue": "1000003" }, { + "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to due to certain allocation restrictions", + "EventCode": "0x74", + "EventName": "TOPDOWN_BE_BOUND.ALLOC_RESTRICTIONS", + "SampleAfterValue": "1000003", + "UMask": "0x1" + }, + { + "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to memory reservation stall (scheduler not being able to accept another uop). This could be caused by RSV full or load/store buffer block.", + "EventCode": "0x74", + "EventName": "TOPDOWN_BE_BOUND.MEM_SCHEDULER", + "SampleAfterValue": "1000003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to IEC and FPC RAT stalls - which can be due to the FIQ and IEC reservation station stall (integer, FP and SIMD scheduler not being able to accept another uop. )", + "EventCode": "0x74", + "EventName": "TOPDOWN_BE_BOUND.NON_MEM_SCHEDULER", + "SampleAfterValue": "1000003", + "UMask": "0x8" + }, + { + "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to mrbl stall. A 'marble' refers to a physical register file entry, also known as the physical destination (PDST).", + "EventCode": "0x74", + "EventName": "TOPDOWN_BE_BOUND.REGISTER", + "SampleAfterValue": "1000003", + "UMask": "0x20" + }, + { + "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to ROB full", + "EventCode": "0x74", + "EventName": "TOPDOWN_BE_BOUND.REORDER_BUFFER", + "SampleAfterValue": "1000003", + "UMask": "0x40" + }, + { + "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to iq/jeu scoreboards or ms scb", + "EventCode": "0x74", + "EventName": "TOPDOWN_BE_BOUND.SERIALIZATION", + "SampleAfterValue": "1000003", + "UMask": "0x10" + }, + { "BriefDescription": "Counts the number of retirement slots not consumed due to front end stalls", "EventCode": "0x71", "EventName": "TOPDOWN_FE_BOUND.ALL", "SampleAfterValue": "1000003" }, { + "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to BAClear", + "EventCode": "0x71", + "EventName": "TOPDOWN_FE_BOUND.BRANCH_DETECT", + "SampleAfterValue": "1000003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to BTClear", + "EventCode": "0x71", + "EventName": "TOPDOWN_FE_BOUND.BRANCH_RESTEER", + "SampleAfterValue": "1000003", + "UMask": "0x40" + }, + { + "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to ms", + "EventCode": "0x71", + "EventName": "TOPDOWN_FE_BOUND.CISC", + "SampleAfterValue": "1000003", + "UMask": "0x1" + }, + { + "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to decode stall", + "EventCode": "0x71", + "EventName": "TOPDOWN_FE_BOUND.DECODE", + "SampleAfterValue": "1000003", + "UMask": "0x8" + }, + { + "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to frontend bandwidth restrictions due to decode, predecode, cisc, and other limitations.", + "EventCode": "0x71", + "EventName": "TOPDOWN_FE_BOUND.FRONTEND_BANDWIDTH", + "SampleAfterValue": "1000003", + "UMask": "0x8d" + }, + { + "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to latency related stalls including BACLEARs, BTCLEARs, ITLB misses, and ICache misses.", + "EventCode": "0x71", + "EventName": "TOPDOWN_FE_BOUND.FRONTEND_LATENCY", + "SampleAfterValue": "1000003", + "UMask": "0x72" + }, + { + "BriefDescription": "This event is deprecated. [This event is alias to TOPDOWN_FE_BOUND.ITLB_MISS]", + "Deprecated": "1", + "EventCode": "0x71", + "EventName": "TOPDOWN_FE_BOUND.ITLB", + "SampleAfterValue": "1000003", + "UMask": "0x10" + }, + { + "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to itlb miss [This event is alias to TOPDOWN_FE_BOUND.ITLB]", + "EventCode": "0x71", + "EventName": "TOPDOWN_FE_BOUND.ITLB_MISS", + "SampleAfterValue": "1000003", + "UMask": "0x10" + }, + { + "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend that do not categorize into any other common frontend stall", + "EventCode": "0x71", + "EventName": "TOPDOWN_FE_BOUND.OTHER", + "SampleAfterValue": "1000003", + "UMask": "0x80" + }, + { + "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to predecode wrong", + "EventCode": "0x71", + "EventName": "TOPDOWN_FE_BOUND.PREDECODE", + "SampleAfterValue": "1000003", + "UMask": "0x4" + }, + { "BriefDescription": "Counts the number of consumed retirement slots. Similar to UOPS_RETIRED.ALL", "EventCode": "0x72", "EventName": "TOPDOWN_RETIRING.ALL", "PEBS": "1", "SampleAfterValue": "1000003" + }, + { + "BriefDescription": "Counts the number of uops issued by the front end every cycle.", + "EventCode": "0x0e", + "EventName": "UOPS_ISSUED.ANY", + "PublicDescription": "Counts the number of uops issued by the front end every cycle. When 4-uops are requested and only 2-uops are delivered, the event counts 2. Uops_issued correlates to the number of ROB entries. If uop takes 2 ROB slots it counts as 2 uops_issued.", + "SampleAfterValue": "1000003" + }, + { + "BriefDescription": "Counts the total number of uops retired.", + "EventCode": "0xc2", + "EventName": "UOPS_RETIRED.ALL", + "PEBS": "1", + "SampleAfterValue": "2000003" + }, + { + "BriefDescription": "Counts the number of integer divide uops retired.", + "EventCode": "0xc2", + "EventName": "UOPS_RETIRED.IDIV", + "PEBS": "1", + "SampleAfterValue": "2000003", + "UMask": "0x10" + }, + { + "BriefDescription": "Counts the number of uops that are from the complex flows issued by the micro-sequencer (MS). This includes uops from flows due to complex instructions, faults, assists, and inserted flows.", + "EventCode": "0xc2", + "EventName": "UOPS_RETIRED.MS", + "PEBS": "1", + "SampleAfterValue": "2000003", + "UMask": "0x1" + }, + { + "BriefDescription": "Counts the number of x87 uops retired, includes those in ms flows", + "EventCode": "0xc2", + "EventName": "UOPS_RETIRED.X87", + "PEBS": "1", + "SampleAfterValue": "2000003", + "UMask": "0x2" } ] diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/uncore-cache.json b/tools/perf/pmu-events/arch/x86/sierraforest/uncore-cache.json new file mode 100644 index 0000000000..a3aafbbc34 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/sierraforest/uncore-cache.json @@ -0,0 +1,2853 @@ +[ + { + "BriefDescription": "Clockticks for CMS units attached to CHA", + "EventCode": "0x01", + "EventName": "UNC_CHACMS_CLOCKTICKS", + "PerPkg": "1", + "PortMask": "0x000", + "PublicDescription": "UNC_CHACMS_CLOCKTICKS", + "Unit": "CHACMS" + }, + { + "BriefDescription": "Number of CHA clock cycles while the event is enabled", + "EventCode": "0x01", + "EventName": "UNC_CHA_CLOCKTICKS", + "PerPkg": "1", + "PublicDescription": "Clockticks of the uncore caching and home agent (CHA)", + "Unit": "CHA" + }, + { + "BriefDescription": "Counts transactions that looked into the multi-socket cacheline Directory state, and therefore did not send a snoop because the Directory indicated it was not needed.", + "EventCode": "0x53", + "EventName": "UNC_CHA_DIR_LOOKUP.NO_SNP", + "PerPkg": "1", + "UMask": "0x2", + "Unit": "CHA" + }, + { + "BriefDescription": "Counts transactions that looked into the multi-socket cacheline Directory state, and sent one or more snoops, because the Directory indicated it was needed.", + "EventCode": "0x53", + "EventName": "UNC_CHA_DIR_LOOKUP.SNP", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "CHA" + }, + { + "BriefDescription": "Counts only multi-socket cacheline Directory state updates memory writes issued from the HA pipe. This does not include memory write requests which are for I (Invalid) or E (Exclusive) cachelines.", + "EventCode": "0x54", + "EventName": "UNC_CHA_DIR_UPDATE.HA", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "CHA" + }, + { + "BriefDescription": "Counts only multi-socket cacheline Directory state updates due to memory writes issued from the TOR pipe which are the result of remote transaction hitting the SF/LLC and returning data Core2Core. This does not include memory write requests which are for I (Invalid) or E (Exclusive) cachelines.", + "EventCode": "0x54", + "EventName": "UNC_CHA_DIR_UPDATE.TOR", + "PerPkg": "1", + "UMask": "0x2", + "Unit": "CHA" + }, + { + "BriefDescription": "Distress signal assertion for dynamic prefetch throttle (DPT). Threshold for distress signal assertion reached in TOR or IRQ (immediate cause for triggering).", + "EventCode": "0x59", + "EventName": "UNC_CHA_DISTRESS_ASSERTED.DPT_ANY", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x3", + "Unit": "CHA" + }, + { + "BriefDescription": "Distress signal assertion for dynamic prefetch throttle (DPT). Threshold for distress signal assertion reached in IRQ (immediate cause for triggering).", + "EventCode": "0x59", + "EventName": "UNC_CHA_DISTRESS_ASSERTED.DPT_IRQ", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "CHA" + }, + { + "BriefDescription": "Distress signal assertion for dynamic prefetch throttle (DPT). Threshold for distress signal assertion reached in TOR (immediate cause for triggering).", + "EventCode": "0x59", + "EventName": "UNC_CHA_DISTRESS_ASSERTED.DPT_TOR", + "PerPkg": "1", + "UMask": "0x2", + "Unit": "CHA" + }, + { + "BriefDescription": "Counts when a normal (Non-Isochronous) full line write is issued from the CHA to the any of the memory controller channels.", + "EventCode": "0x5b", + "EventName": "UNC_CHA_IMC_WRITES_COUNT.FULL", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "CHA" + }, + { + "BriefDescription": "CHA to iMC Full Line Writes Issued : ISOCH Full Line : Counts the total number of full line writes issued from the HA into the memory controller.", + "EventCode": "0x5b", + "EventName": "UNC_CHA_IMC_WRITES_COUNT.FULL_PRIORITY", + "PerPkg": "1", + "UMask": "0x4", + "Unit": "CHA" + }, + { + "BriefDescription": "CHA to iMC Full Line Writes Issued : Partial Non-ISOCH : Counts the total number of full line writes issued from the HA into the memory controller.", + "EventCode": "0x5b", + "EventName": "UNC_CHA_IMC_WRITES_COUNT.PARTIAL", + "PerPkg": "1", + "UMask": "0x2", + "Unit": "CHA" + }, + { + "BriefDescription": "CHA to iMC Full Line Writes Issued : ISOCH Partial : Counts the total number of full line writes issued from the HA into the memory controller.", + "EventCode": "0x5b", + "EventName": "UNC_CHA_IMC_WRITES_COUNT.PARTIAL_PRIORITY", + "PerPkg": "1", + "UMask": "0x8", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: All Requests to Remotely Homed Memory", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.ALL_REMOTE", + "PerPkg": "1", + "PublicDescription": "Cache Lookups : All transactions from Remote Agents", + "UMask": "0x17e0ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: CRd Requests", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.CODE", + "PerPkg": "1", + "PublicDescription": "Cache Lookups : CRd Requests", + "UMask": "0x1bd0ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: Read Requests and Read Prefetches", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.DATA_RD", + "PerPkg": "1", + "PublicDescription": "Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2. This has numerous filters available. Note the non-standard filtering equation. This event will count requests that lookup the cache multiple times with multiple increments. One must ALWAYS set umask bit 0 and select a state or states to match. Otherwise, the event will count nothing. CHAFilter0[24:21,17] bits correspond to [FMESI] state. Read transactions", + "UMask": "0x1bc1ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: Read Requests, Read Prefetches, and Snoops", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.DATA_READ_ALL", + "PerPkg": "1", + "PublicDescription": "Cache Lookups : Data Reads", + "UMask": "0x1fc1ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: Read Requests to Locally Homed Memory", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.DATA_READ_LOCAL", + "PerPkg": "1", + "PublicDescription": "Cache Lookups : Demand Data Reads, Core and LLC prefetches", + "UMask": "0x841ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: Read Requests, Read Prefetches, and Snoops which miss the Cache", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.DATA_READ_MISS", + "PerPkg": "1", + "PublicDescription": "Cache Lookups : Data Read Misses", + "UMask": "0x1fc101", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: All Requests to Locally Homed Memory", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.LOCALLY_HOMED_ADDRESS", + "PerPkg": "1", + "PublicDescription": "Cache Lookups : Transactions homed locally", + "UMask": "0xbdfff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: Code Read Requests and Code Read Prefetches to Locally Homed Memory", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_CODE", + "PerPkg": "1", + "PublicDescription": "Cache Lookups : CRd Requests", + "UMask": "0x19d0ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: Read Requests and Read Prefetches to Locally Homed Memory", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_DATA_RD", + "PerPkg": "1", + "PublicDescription": "Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2. This has numerous filters available. Note the non-standard filtering equation. This event will count requests that lookup the cache multiple times with multiple increments. One must ALWAYS set umask bit 0 and select a state or states to match. Otherwise, the event will count nothing. CHAFilter0[24:21,17] bits correspond to [FMESI] state. Read transactions", + "UMask": "0x19c1ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: Code Read Requests to Locally Homed Memory", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_DMND_CODE", + "PerPkg": "1", + "PublicDescription": "Cache Lookups : CRd Requests", + "UMask": "0x1850ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: Read Requests to Locally Homed Memory", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_DMND_DATA_RD", + "PerPkg": "1", + "PublicDescription": "Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2. This has numerous filters available. Note the non-standard filtering equation. This event will count requests that lookup the cache multiple times with multiple increments. One must ALWAYS set umask bit 0 and select a state or states to match. Otherwise, the event will count nothing. CHAFilter0[24:21,17] bits correspond to [FMESI] state. Read transactions", + "UMask": "0x1841ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: RFO Requests to Locally Homed Memory", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_DMND_RFO", + "PerPkg": "1", + "PublicDescription": "Cache Lookups : RFO Requests", + "UMask": "0x1848ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: LLC Prefetch Requests to Locally Homed Memory", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_LLC_PF", + "PerPkg": "1", + "PublicDescription": "Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2. This has numerous filters available. Note the non-standard filtering equation. This event will count requests that lookup the cache multiple times with multiple increments. One must ALWAYS set umask bit 0 and select a state or states to match. Otherwise, the event will count nothing. CHAFilter0[24:21,17] bits correspond to [FMESI] state. Read transactions", + "UMask": "0x189dff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: All Prefetches to Locally Homed Memory", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_PF", + "PerPkg": "1", + "PublicDescription": "Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2. This has numerous filters available. Note the non-standard filtering equation. This event will count requests that lookup the cache multiple times with multiple increments. One must ALWAYS set umask bit 0 and select a state or states to match. Otherwise, the event will count nothing. CHAFilter0[24:21,17] bits correspond to [FMESI] state. Read transactions", + "UMask": "0x199dff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: Code Prefetches to Locally Homed Memory", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_PF_CODE", + "PerPkg": "1", + "PublicDescription": "Cache Lookups : CRd Requests", + "UMask": "0x1910ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: Read Prefetches to Locally Homed Memory", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_PF_DATA_RD", + "PerPkg": "1", + "PublicDescription": "Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2. This has numerous filters available. Note the non-standard filtering equation. This event will count requests that lookup the cache multiple times with multiple increments. One must ALWAYS set umask bit 0 and select a state or states to match. Otherwise, the event will count nothing. CHAFilter0[24:21,17] bits correspond to [FMESI] state. Read transactions", + "UMask": "0x1981ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: RFO Prefetches to Locally Homed Memory", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_PF_RFO", + "PerPkg": "1", + "PublicDescription": "Cache Lookups : RFO Requests", + "UMask": "0x1908ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: RFO Requests and RFO Prefetches to Locally Homed Memory", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_RFO", + "PerPkg": "1", + "PublicDescription": "Cache Lookups : RFO Requests", + "UMask": "0x19c8ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: All Requests to Remotely Homed Memory", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.REMOTELY_HOMED_ADDRESS", + "PerPkg": "1", + "PublicDescription": "Cache Lookups : Transactions homed remotely : Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2. This has numerous filters available. Note the non-standard filtering equation. This event will count requests that lookup the cache multiple times with multiple increments. One must ALWAYS set umask bit 0 and select a state or states to match. Otherwise, the event will count nothing. : Transaction whose address resides in a remote MC", + "UMask": "0x15dfff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: Code Read/Prefetch Requests from a Remote Socket", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.REMOTE_CODE", + "PerPkg": "1", + "PublicDescription": "Cache Lookups : CRd Requests", + "UMask": "0x1a10ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: Data Read/Prefetch Requests from a Remote Socket", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.REMOTE_DATA_RD", + "PerPkg": "1", + "PublicDescription": "Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2. This has numerous filters available. Note the non-standard filtering equation. This event will count requests that lookup the cache multiple times with multiple increments. One must ALWAYS set umask bit 0 and select a state or states to match. Otherwise, the event will count nothing. CHAFilter0[24:21,17] bits correspond to [FMESI] state. Read transactions", + "UMask": "0x1a01ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: RFO Requests/Prefetches from a Remote Socket", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.REMOTE_RFO", + "PerPkg": "1", + "PublicDescription": "Cache Lookups : RFO Requests", + "UMask": "0x1a08ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: Snoop Requests from a Remote Socket", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.REMOTE_SNP", + "PerPkg": "1", + "PublicDescription": "Counts the number of times the LLC was accessed", + "UMask": "0x1c19ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: All RFO and RFO Prefetches", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.RFO", + "PerPkg": "1", + "PublicDescription": "Cache Lookups : All RFOs - Demand and Prefetches", + "UMask": "0x1bc8ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: RFO Requests and RFO Prefetches to Locally Homed Memory", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.RFO_LOCAL", + "PerPkg": "1", + "PublicDescription": "Cache Lookups : Locally HOMed RFOs - Demand and Prefetches", + "UMask": "0x9c8ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: Writes to Locally Homed Memory (includes writebacks from L1/L2)", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.WRITE_LOCAL", + "PerPkg": "1", + "PublicDescription": "Cache Lookups : Writes", + "UMask": "0x842ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Cache Lookups: Writes to Remotely Homed Memory (includes writebacks from L1/L2)", + "EventCode": "0x34", + "EventName": "UNC_CHA_LLC_LOOKUP.WRITE_REMOTE", + "PerPkg": "1", + "PublicDescription": "Cache Lookups : Remote Writes", + "UMask": "0x17c2ff", + "Unit": "CHA" + }, + { + "BriefDescription": "Counts the number of lines that were victimized on a fill. This can be filtered by the state that the line was in.", + "EventCode": "0x37", + "EventName": "UNC_CHA_LLC_VICTIMS.ALL", + "PerPkg": "1", + "PublicDescription": "Lines Victimized : All Lines Victimized", + "UMask": "0xf", + "Unit": "CHA" + }, + { + "BriefDescription": "Lines Victimized : IA traffic : Counts the number of lines that were victimized on a fill. This can be filtered by the state that the line was in.", + "EventCode": "0x37", + "EventName": "UNC_CHA_LLC_VICTIMS.IA", + "PerPkg": "1", + "UMask": "0x20", + "Unit": "CHA" + }, + { + "BriefDescription": "Lines Victimized : IO traffic : Counts the number of lines that were victimized on a fill. This can be filtered by the state that the line was in.", + "EventCode": "0x37", + "EventName": "UNC_CHA_LLC_VICTIMS.IO", + "PerPkg": "1", + "UMask": "0x10", + "Unit": "CHA" + }, + { + "BriefDescription": "Counts the number of lines that were victimized on a fill. This can be filtered by the state that the line was in.", + "EventCode": "0x37", + "EventName": "UNC_CHA_LLC_VICTIMS.LOCAL_ALL", + "PerPkg": "1", + "PublicDescription": "Lines Victimized : Local - All Lines", + "UMask": "0x200f", + "Unit": "CHA" + }, + { + "BriefDescription": "Lines Victimized : Counts the number of lines that were victimized on a fill. This can be filtered by the state that the line was in.", + "EventCode": "0x37", + "EventName": "UNC_CHA_LLC_VICTIMS.LOCAL_E", + "PerPkg": "1", + "PublicDescription": "Lines Victimized : Local - Lines in E State", + "UMask": "0x2002", + "Unit": "CHA" + }, + { + "BriefDescription": "Lines Victimized : Counts the number of lines that were victimized on a fill. This can be filtered by the state that the line was in.", + "EventCode": "0x37", + "EventName": "UNC_CHA_LLC_VICTIMS.LOCAL_F", + "PerPkg": "1", + "PublicDescription": "Lines Victimized : Local - Lines in F State", + "UMask": "0x2008", + "Unit": "CHA" + }, + { + "BriefDescription": "Lines Victimized : Counts the number of lines that were victimized on a fill. This can be filtered by the state that the line was in.", + "EventCode": "0x37", + "EventName": "UNC_CHA_LLC_VICTIMS.LOCAL_M", + "PerPkg": "1", + "PublicDescription": "Lines Victimized : Local - Lines in M State", + "UMask": "0x2001", + "Unit": "CHA" + }, + { + "BriefDescription": "Lines Victimized : Counts the number of lines that were victimized on a fill. This can be filtered by the state that the line was in.", + "EventCode": "0x37", + "EventName": "UNC_CHA_LLC_VICTIMS.LOCAL_S", + "PerPkg": "1", + "PublicDescription": "Lines Victimized : Local - Lines in S State", + "UMask": "0x2004", + "Unit": "CHA" + }, + { + "BriefDescription": "Counts the number of lines that were victimized on a fill. This can be filtered by the state that the line was in.", + "EventCode": "0x37", + "EventName": "UNC_CHA_LLC_VICTIMS.REMOTE_ALL", + "PerPkg": "1", + "PublicDescription": "Lines Victimized : Remote - All Lines", + "UMask": "0x800f", + "Unit": "CHA" + }, + { + "BriefDescription": "Lines Victimized : Counts the number of lines that were victimized on a fill. This can be filtered by the state that the line was in.", + "EventCode": "0x37", + "EventName": "UNC_CHA_LLC_VICTIMS.REMOTE_E", + "PerPkg": "1", + "PublicDescription": "Lines Victimized : Remote - Lines in E State", + "UMask": "0x8002", + "Unit": "CHA" + }, + { + "BriefDescription": "Lines Victimized : Counts the number of lines that were victimized on a fill. This can be filtered by the state that the line was in.", + "EventCode": "0x37", + "EventName": "UNC_CHA_LLC_VICTIMS.REMOTE_M", + "PerPkg": "1", + "PublicDescription": "Lines Victimized : Remote - Lines in M State", + "UMask": "0x8001", + "Unit": "CHA" + }, + { + "BriefDescription": "Lines Victimized : Counts the number of lines that were victimized on a fill. This can be filtered by the state that the line was in.", + "EventCode": "0x37", + "EventName": "UNC_CHA_LLC_VICTIMS.REMOTE_S", + "PerPkg": "1", + "PublicDescription": "Lines Victimized : Remote - Lines in S State", + "UMask": "0x8004", + "Unit": "CHA" + }, + { + "BriefDescription": "Counts the number of lines that were victimized on a fill. This can be filtered by the state that the line was in.", + "EventCode": "0x37", + "EventName": "UNC_CHA_LLC_VICTIMS.TOTAL_E", + "PerPkg": "1", + "PublicDescription": "Lines Victimized : Lines in E state", + "UMask": "0x2", + "Unit": "CHA" + }, + { + "BriefDescription": "Counts the number of lines that were victimized on a fill. This can be filtered by the state that the line was in.", + "EventCode": "0x37", + "EventName": "UNC_CHA_LLC_VICTIMS.TOTAL_M", + "PerPkg": "1", + "PublicDescription": "Lines Victimized : Lines in M state", + "UMask": "0x1", + "Unit": "CHA" + }, + { + "BriefDescription": "Counts the number of lines that were victimized on a fill. This can be filtered by the state that the line was in.", + "EventCode": "0x37", + "EventName": "UNC_CHA_LLC_VICTIMS.TOTAL_S", + "PerPkg": "1", + "PublicDescription": "Lines Victimized : Lines in S State", + "UMask": "0x4", + "Unit": "CHA" + }, + { + "BriefDescription": "Counts when a RFO (the Read for Ownership issued before a write) request hit a cacheline in the S (Shared) state.", + "EventCode": "0x39", + "EventName": "UNC_CHA_MISC.RFO_HIT_S", + "PerPkg": "1", + "PublicDescription": "Cbo Misc : RFO HitS", + "UMask": "0x8", + "Unit": "CHA" + }, + { + "BriefDescription": "OSB Snoop Broadcast : Local InvItoE : Count of OSB snoop broadcasts. Counts by 1 per request causing OSB snoops to be broadcast. Does not count all the snoops generated by OSB.", + "EventCode": "0x55", + "EventName": "UNC_CHA_OSB.LOCAL_INVITOE", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "CHA" + }, + { + "BriefDescription": "OSB Snoop Broadcast : Local Rd : Count of OSB snoop broadcasts. Counts by 1 per request causing OSB snoops to be broadcast. Does not count all the snoops generated by OSB.", + "EventCode": "0x55", + "EventName": "UNC_CHA_OSB.LOCAL_READ", + "PerPkg": "1", + "UMask": "0x2", + "Unit": "CHA" + }, + { + "BriefDescription": "OSB Snoop Broadcast : Off : Count of OSB snoop broadcasts. Counts by 1 per request causing OSB snoops to be broadcast. Does not count all the snoops generated by OSB.", + "EventCode": "0x55", + "EventName": "UNC_CHA_OSB.OFF_PWRHEURISTIC", + "PerPkg": "1", + "UMask": "0x20", + "Unit": "CHA" + }, + { + "BriefDescription": "OSB Snoop Broadcast : Remote Rd : Count of OSB snoop broadcasts. Counts by 1 per request causing OSB snoops to be broadcast. Does not count all the snoops generated by OSB.", + "EventCode": "0x55", + "EventName": "UNC_CHA_OSB.REMOTE_READ", + "PerPkg": "1", + "UMask": "0x4", + "Unit": "CHA" + }, + { + "BriefDescription": "OSB Snoop Broadcast : RFO HitS Snoop Broadcast : Count of OSB snoop broadcasts. Counts by 1 per request causing OSB snoops to be broadcast. Does not count all the snoops generated by OSB.", + "EventCode": "0x55", + "EventName": "UNC_CHA_OSB.RFO_HITS_SNP_BCAST", + "PerPkg": "1", + "UMask": "0x10", + "Unit": "CHA" + }, + { + "BriefDescription": "UNC_CHA_REMOTE_SF.ALLOC_EXCLUSIVE", + "EventCode": "0x69", + "EventName": "UNC_CHA_REMOTE_SF.ALLOC_EXCLUSIVE", + "PerPkg": "1", + "UMask": "0x10", + "Unit": "CHA" + }, + { + "BriefDescription": "UNC_CHA_REMOTE_SF.ALLOC_SHARED", + "EventCode": "0x69", + "EventName": "UNC_CHA_REMOTE_SF.ALLOC_SHARED", + "PerPkg": "1", + "UMask": "0x8", + "Unit": "CHA" + }, + { + "BriefDescription": "UNC_CHA_REMOTE_SF.DEALLOC_EVCTCLN", + "EventCode": "0x69", + "EventName": "UNC_CHA_REMOTE_SF.DEALLOC_EVCTCLN", + "PerPkg": "1", + "UMask": "0x40", + "Unit": "CHA" + }, + { + "BriefDescription": "UNC_CHA_REMOTE_SF.DIRBACKED_ONLY", + "EventCode": "0x69", + "EventName": "UNC_CHA_REMOTE_SF.DIRBACKED_ONLY", + "PerPkg": "1", + "Unit": "CHA" + }, + { + "BriefDescription": "UNC_CHA_REMOTE_SF.HIT_EXCLUSIVE", + "EventCode": "0x69", + "EventName": "UNC_CHA_REMOTE_SF.HIT_EXCLUSIVE", + "PerPkg": "1", + "UMask": "0x2", + "Unit": "CHA" + }, + { + "BriefDescription": "UNC_CHA_REMOTE_SF.HIT_SHARED", + "EventCode": "0x69", + "EventName": "UNC_CHA_REMOTE_SF.HIT_SHARED", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "CHA" + }, + { + "BriefDescription": "UNC_CHA_REMOTE_SF.INCLUSIVE_ONLY", + "EventCode": "0x69", + "EventName": "UNC_CHA_REMOTE_SF.INCLUSIVE_ONLY", + "PerPkg": "1", + "Unit": "CHA" + }, + { + "BriefDescription": "UNC_CHA_REMOTE_SF.MISS", + "EventCode": "0x69", + "EventName": "UNC_CHA_REMOTE_SF.MISS", + "PerPkg": "1", + "UMask": "0x4", + "Unit": "CHA" + }, + { + "BriefDescription": "UNC_CHA_REMOTE_SF.UPDATE_EXCLUSIVE", + "EventCode": "0x69", + "EventName": "UNC_CHA_REMOTE_SF.UPDATE_EXCLUSIVE", + "PerPkg": "1", + "Unit": "CHA" + }, + { + "BriefDescription": "UNC_CHA_REMOTE_SF.UPDATE_SHARED", + "EventCode": "0x69", + "EventName": "UNC_CHA_REMOTE_SF.UPDATE_SHARED", + "PerPkg": "1", + "UMask": "0x80", + "Unit": "CHA" + }, + { + "BriefDescription": "UNC_CHA_REMOTE_SF.VICTIM_EXCLUSIVE", + "EventCode": "0x69", + "EventName": "UNC_CHA_REMOTE_SF.VICTIM_EXCLUSIVE", + "PerPkg": "1", + "Unit": "CHA" + }, + { + "BriefDescription": "UNC_CHA_REMOTE_SF.VICTIM_SHARED", + "EventCode": "0x69", + "EventName": "UNC_CHA_REMOTE_SF.VICTIM_SHARED", + "PerPkg": "1", + "Unit": "CHA" + }, + { + "BriefDescription": "Counts the total number of requests coming from a unit on this socket for exclusive ownership of a cache line without receiving data (INVITOE) to the CHA.", + "EventCode": "0x50", + "EventName": "UNC_CHA_REQUESTS.INVITOE", + "PerPkg": "1", + "PublicDescription": "HA Read and Write Requests : InvalItoE", + "UMask": "0x30", + "Unit": "CHA" + }, + { + "BriefDescription": "Counts the total number of requests coming from a unit on this socket for exclusive ownership of a cache line without receiving data (INVITOE) to the CHA.", + "EventCode": "0x50", + "EventName": "UNC_CHA_REQUESTS.INVITOE_LOCAL", + "PerPkg": "1", + "UMask": "0x10", + "Unit": "CHA" + }, + { + "BriefDescription": "Counts the total number of requests coming from a remote socket for exclusive ownership of a cache line without receiving data (INVITOE) to the CHA.", + "EventCode": "0x50", + "EventName": "UNC_CHA_REQUESTS.INVITOE_REMOTE", + "PerPkg": "1", + "UMask": "0x20", + "Unit": "CHA" + }, + { + "BriefDescription": "Counts read requests made into this CHA. Reads include all read opcodes (including RFO: the Read for Ownership issued before a write) .", + "EventCode": "0x50", + "EventName": "UNC_CHA_REQUESTS.READS", + "PerPkg": "1", + "PublicDescription": "HA Read and Write Requests : Reads", + "UMask": "0x3", + "Unit": "CHA" + }, + { + "BriefDescription": "Counts read requests coming from a unit on this socket made into this CHA. Reads include all read opcodes (including RFO: the Read for Ownership issued before a write).", + "EventCode": "0x50", + "EventName": "UNC_CHA_REQUESTS.READS_LOCAL", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "CHA" + }, + { + "BriefDescription": "Counts read requests coming from a remote socket made into the CHA. Reads include all read opcodes (including RFO: the Read for Ownership issued before a write).", + "EventCode": "0x50", + "EventName": "UNC_CHA_REQUESTS.READS_REMOTE", + "PerPkg": "1", + "UMask": "0x2", + "Unit": "CHA" + }, + { + "BriefDescription": "Counts write requests made into the CHA, including streaming, evictions, HitM (Reads from another core to a Modified cacheline), etc.", + "EventCode": "0x50", + "EventName": "UNC_CHA_REQUESTS.WRITES", + "PerPkg": "1", + "PublicDescription": "HA Read and Write Requests : Writes", + "UMask": "0xc", + "Unit": "CHA" + }, + { + "BriefDescription": "Counts write requests coming from a unit on this socket made into this CHA, including streaming, evictions, HitM (Reads from another core to a Modified cacheline), etc.", + "EventCode": "0x50", + "EventName": "UNC_CHA_REQUESTS.WRITES_LOCAL", + "PerPkg": "1", + "UMask": "0x4", + "Unit": "CHA" + }, + { + "BriefDescription": "Counts the total number of read requests made into the Home Agent. Reads include all read opcodes (including RFO). Writes include all writes (streaming, evictions, HitM, etc).", + "EventCode": "0x50", + "EventName": "UNC_CHA_REQUESTS.WRITES_REMOTE", + "PerPkg": "1", + "UMask": "0x8", + "Unit": "CHA" + }, + { + "BriefDescription": "All TOR Inserts", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.ALL", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : All", + "UMask": "0xc001ffff", + "Unit": "CHA" + }, + { + "BriefDescription": "CLFlush transactions from a CXL device which hit in the L3.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.CXL_HIT_CLFLUSH", + "PerPkg": "1", + "UMask": "0x78c8c7fd20", + "Unit": "CHA" + }, + { + "BriefDescription": "FsRdCur transactions from a CXL device which hit in the L3.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.CXL_HIT_FSRDCUR", + "PerPkg": "1", + "UMask": "0x78c8effd20", + "Unit": "CHA" + }, + { + "BriefDescription": "FsRdCurPtl transactions from a CXL device which hit in the L3.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.CXL_HIT_FSRDCURPTL", + "PerPkg": "1", + "UMask": "0x78c9effd20", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoM transactions from a CXL device which hit in the L3.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.CXL_HIT_ITOM", + "PerPkg": "1", + "UMask": "0x78cc47fd20", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoMWr transactions from a CXL device which hit in the L3.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.CXL_HIT_ITOMWR", + "PerPkg": "1", + "UMask": "0x78cc4ffd20", + "Unit": "CHA" + }, + { + "BriefDescription": "MemPushWr transactions from a CXL device which hit in the L3.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.CXL_HIT_MEMPUSHWR", + "PerPkg": "1", + "UMask": "0x78cc6ffd20", + "Unit": "CHA" + }, + { + "BriefDescription": "WCiL transactions from a CXL device which hit in the L3.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.CXL_HIT_WCIL", + "PerPkg": "1", + "UMask": "0x78c86ffd20", + "Unit": "CHA" + }, + { + "BriefDescription": "WcilF transactions from a CXL device which hit in the L3.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.CXL_HIT_WCILF", + "PerPkg": "1", + "UMask": "0x78c867fd20", + "Unit": "CHA" + }, + { + "BriefDescription": "WiL transactions from a CXL device which hit in the L3.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.CXL_HIT_WIL", + "PerPkg": "1", + "UMask": "0x78c87ffd20", + "Unit": "CHA" + }, + { + "BriefDescription": "CLFlush transactions from a CXL device which miss the L3.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.CXL_MISS_CLFLUSH", + "PerPkg": "1", + "UMask": "0x78c8c7fe20", + "Unit": "CHA" + }, + { + "BriefDescription": "FsRdCur transactions from a CXL device which miss the L3.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.CXL_MISS_FSRDCUR", + "PerPkg": "1", + "UMask": "0x78c8effe20", + "Unit": "CHA" + }, + { + "BriefDescription": "FsRdCurPtl transactions from a CXL device which miss the L3.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.CXL_MISS_FSRDCURPTL", + "PerPkg": "1", + "UMask": "0x78c9effe20", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoM transactions from a CXL device which miss the L3.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.CXL_MISS_ITOM", + "PerPkg": "1", + "UMask": "0x78cc47fe20", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoMWr transactions from a CXL device which miss the L3.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.CXL_MISS_ITOMWR", + "PerPkg": "1", + "UMask": "0x78cc4ffe20", + "Unit": "CHA" + }, + { + "BriefDescription": "MemPushWr transactions from a CXL device which miss the L3.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.CXL_MISS_MEMPUSHWR", + "PerPkg": "1", + "UMask": "0x78cc6ffe20", + "Unit": "CHA" + }, + { + "BriefDescription": "WCiL transactions from a CXL device which miss the L3.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.CXL_MISS_WCIL", + "PerPkg": "1", + "UMask": "0x78c86ffe20", + "Unit": "CHA" + }, + { + "BriefDescription": "WcilF transactions from a CXL device which miss the L3.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.CXL_MISS_WCILF", + "PerPkg": "1", + "UMask": "0x78c867fe20", + "Unit": "CHA" + }, + { + "BriefDescription": "WiL transactions from a CXL device which miss the L3.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.CXL_MISS_WIL", + "PerPkg": "1", + "UMask": "0x78c87ffe20", + "Unit": "CHA" + }, + { + "BriefDescription": "All locally initiated requests from IA Cores", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : All requests from iA Cores", + "UMask": "0xc001ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "CLFlush events that are initiated from the Core", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_CLFLUSH", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : CLFlushes issued by iA Cores", + "UMask": "0xc8c7ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "CLFlushOpt events that are initiated from the Core", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_CLFLUSHOPT", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : CLFlushOpts issued by iA Cores", + "UMask": "0xc8d7ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "Code read from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_CRD", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : CRDs issued by iA Cores", + "UMask": "0xc80fff01", + "Unit": "CHA" + }, + { + "BriefDescription": "Code read prefetch from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_CRD_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Inserts; Code read prefetch from local IA that misses in the snoop filter", + "UMask": "0xc88fff01", + "Unit": "CHA" + }, + { + "BriefDescription": "Data read opt from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_DRD_OPT", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : DRd_Opts issued by iA Cores", + "UMask": "0xc827ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "Data read opt prefetch from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_DRD_OPT_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : DRd_Opt_Prefs issued by iA Cores", + "UMask": "0xc8a7ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "All locally initiated requests from IA Cores which hit the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : All requests from iA Cores that Hit the LLC", + "UMask": "0xc001fd01", + "Unit": "CHA" + }, + { + "BriefDescription": "Code read from local IA that hit the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_CRD", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : CRds issued by iA Cores that Hit the LLC", + "UMask": "0xc80ffd01", + "Unit": "CHA" + }, + { + "BriefDescription": "Code read prefetch from local IA that hit the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_CRD_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : CRd_Prefs issued by iA Cores that hit the LLC", + "UMask": "0xc88ffd01", + "Unit": "CHA" + }, + { + "BriefDescription": "All requests issued from IA cores to CXL accelerator memory regions that hit the LLC.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_CXL_ACC", + "PerPkg": "1", + "UMask": "0x10c0018101", + "Unit": "CHA" + }, + { + "BriefDescription": "Data read opt from local IA that hit the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_DRD_OPT", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : DRd_Opts issued by iA Cores that hit the LLC", + "UMask": "0xc827fd01", + "Unit": "CHA" + }, + { + "BriefDescription": "Data read opt prefetch from local IA that hit the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_DRD_OPT_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : DRd_Opt_Prefs issued by iA Cores that hit the LLC", + "UMask": "0xc8a7fd01", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoM requests from local IA cores that hit the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_ITOM", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMs issued by iA Cores that Hit LLC", + "UMask": "0xcc47fd01", + "Unit": "CHA" + }, + { + "BriefDescription": "Last level cache prefetch code read from local IA that hit the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_LLCPREFCODE", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : LLCPrefCode issued by iA Cores that hit the LLC", + "UMask": "0xcccffd01", + "Unit": "CHA" + }, + { + "BriefDescription": "Last level cache prefetch data read from local IA that hit the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_LLCPREFDATA", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : LLCPrefData issued by iA Cores that hit the LLC", + "UMask": "0xccd7fd01", + "Unit": "CHA" + }, + { + "BriefDescription": "Last level cache prefetch read for ownership from local IA that hit the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_LLCPREFRFO", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : LLCPrefRFO issued by iA Cores that hit the LLC", + "UMask": "0xccc7fd01", + "Unit": "CHA" + }, + { + "BriefDescription": "Read for ownership from local IA that hit the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_RFO", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : RFOs issued by iA Cores that Hit the LLC", + "UMask": "0xc807fd01", + "Unit": "CHA" + }, + { + "BriefDescription": "Read for ownership prefetch from local IA that hit the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_RFO_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : RFO_Prefs issued by iA Cores that Hit the LLC", + "UMask": "0xc887fd01", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoM events that are initiated from the Core", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_ITOM", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMs issued by iA Cores", + "UMask": "0xcc47ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoMCacheNear requests from local IA cores", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_ITOMCACHENEAR", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMCacheNears issued by iA Cores", + "UMask": "0xcd47ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "Last level cache prefetch code read from local IA.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_LLCPREFCODE", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : LLCPrefCode issued by iA Cores", + "UMask": "0xcccfff01", + "Unit": "CHA" + }, + { + "BriefDescription": "Last level cache prefetch data read from local IA.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_LLCPREFDATA", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : LLCPrefData issued by iA Cores", + "UMask": "0xccd7ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "Last level cache prefetch read for ownership from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_LLCPREFRFO", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : LLCPrefRFO issued by iA Cores", + "UMask": "0xccc7ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "All locally initiated requests from IA Cores which miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : All requests from iA Cores that Missed the LLC", + "UMask": "0xc001fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "Code read from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_CRD", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : CRds issued by iA Cores that Missed the LLC", + "UMask": "0xc80ffe01", + "Unit": "CHA" + }, + { + "BriefDescription": "CRDs from local IA cores to locally homed memory", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_CRD_LOCAL", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : CRd issued by iA Cores that Missed the LLC - HOMed locally", + "UMask": "0xc80efe01", + "Unit": "CHA" + }, + { + "BriefDescription": "Code read prefetch from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_CRD_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : CRd_Prefs issued by iA Cores that Missed the LLC", + "UMask": "0xc88ffe01", + "Unit": "CHA" + }, + { + "BriefDescription": "CRD Prefetches from local IA cores to locally homed memory", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_CRD_PREF_LOCAL", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : CRd_Prefs issued by iA Cores that Missed the LLC - HOMed locally", + "UMask": "0xc88efe01", + "Unit": "CHA" + }, + { + "BriefDescription": "CRD Prefetches from local IA cores to remotely homed memory", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_CRD_PREF_REMOTE", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : CRd_Prefs issued by iA Cores that Missed the LLC - HOMed remotely", + "UMask": "0xc88f7e01", + "Unit": "CHA" + }, + { + "BriefDescription": "CRDs from local IA cores to remotely homed memory", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_CRD_REMOTE", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : CRd issued by iA Cores that Missed the LLC - HOMed remotely", + "UMask": "0xc80f7e01", + "Unit": "CHA" + }, + { + "BriefDescription": "All requests issued from IA cores to CXL accelerator memory regions that miss the LLC.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_CXL_ACC", + "PerPkg": "1", + "UMask": "0x10c0018201", + "Unit": "CHA" + }, + { + "BriefDescription": "DRds and equivalent opcodes issued from an IA core which miss the L3 and target memory in a CXL type 2 memory expander card.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_DRD_CXL_ACC", + "PerPkg": "1", + "PublicDescription": "DRds issued from an IA core which miss the L3 and target memory in a CXL type 2 memory expander card.", + "UMask": "0x10c8178201", + "Unit": "CHA" + }, + { + "BriefDescription": "Data read opt from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_DRD_OPT", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : DRd_Opt issued by iA Cores that missed the LLC", + "UMask": "0xc827fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "Inserts into the TOR from local IA cores which miss the LLC and snoop filter with the opcode DRd_Opt, and which target local memory", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_DRD_OPT_LOCAL", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : DRd_Opt issued by iA Cores that Missed the LLC - HOMed locally", + "UMask": "0xc826fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "Data read opt prefetch from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_DRD_OPT_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : DRd_Opt_Prefs issued by iA Cores that missed the LLC", + "UMask": "0xc8a7fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "Inserts into the TOR from local IA cores which miss the LLC and snoop filter with the opcode DRD_PREF_OPT, and target local memory", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_DRD_OPT_PREF_LOCAL", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : DRd_Opt_Prefs issued by iA Cores that missed the LLC", + "UMask": "0xc8a6fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "Inserts into the TOR from local IA cores which miss the LLC and snoop filter with the opcode DRD_PREF_OPT, and target remote memory", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_DRD_OPT_PREF_REMOTE", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : DRd_Opt_Prefs issued by iA Cores that missed the LLC", + "UMask": "0xc8a77e01", + "Unit": "CHA" + }, + { + "BriefDescription": "Inserts into the TOR from local IA cores which miss the LLC and snoop filter with the opcode DRd_Opt, and target remote memory", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_DRD_OPT_REMOTE", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : DRd_Opt issued by iA Cores that missed the LLC", + "UMask": "0xc8277e01", + "Unit": "CHA" + }, + { + "BriefDescription": "L2 data prefetches issued from an IA core which miss the L3 and target memory in a CXL type 2 accelerator.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_CXL_ACC", + "PerPkg": "1", + "UMask": "0x10c8978201", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoM requests from local IA cores that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_ITOM", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMs issued by iA Cores that Missed LLC", + "UMask": "0xcc47fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "Last level cache prefetch code read from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LLCPREFCODE", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : LLCPrefCode issued by iA Cores that missed the LLC", + "UMask": "0xcccffe01", + "Unit": "CHA" + }, + { + "BriefDescription": "Last level cache prefetch data read from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LLCPREFDATA", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : LLCPrefData issued by iA Cores that missed the LLC", + "UMask": "0xccd7fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "LLC data prefetches issued from an IA core which miss the L3 and target memory in a CXL type 2 accelerator.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LLCPREFDATA_CXL_ACC", + "PerPkg": "1", + "UMask": "0x10ccd78201", + "Unit": "CHA" + }, + { + "BriefDescription": "Last level cache prefetch read for ownership from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LLCPREFRFO", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : LLCPrefRFO issued by iA Cores that missed the LLC", + "UMask": "0xccc7fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "L2 RFO prefetches issued from an IA core which miss the L3 and target memory in a CXL type 2 accelerator.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LLCPREFRFO_CXL_ACC", + "PerPkg": "1", + "UMask": "0x10c8878201", + "Unit": "CHA" + }, + { + "BriefDescription": "WCILF requests from local IA cores to locally homed DDR addresses that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LOCAL_WCILF_DDR", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : WCiLFs issued by iA Cores targeting DDR that missed the LLC - HOMed locally", + "UMask": "0xc8668601", + "Unit": "CHA" + }, + { + "BriefDescription": "WCILF requests from local IA cores to locally homed PMM addresses which miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LOCAL_WCILF_PMM", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : WCiLFs issued by iA Cores targeting PMM that missed the LLC - HOMed locally", + "UMask": "0xc8668a01", + "Unit": "CHA" + }, + { + "BriefDescription": "WCIL requests from local IA cores to locally homed DDR addresses that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LOCAL_WCIL_DDR", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : WCiLs issued by iA Cores targeting DDR that missed the LLC - HOMed locally", + "UMask": "0xc86e8601", + "Unit": "CHA" + }, + { + "BriefDescription": "WCIL requests from local IA cores to locally homed PMM addresses which miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LOCAL_WCIL_PMM", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : WCiLs issued by iA Cores targeting PMM that missed the LLC - HOMed locally", + "UMask": "0xc86e8a01", + "Unit": "CHA" + }, + { + "BriefDescription": "WCILF requests from local IA cores to remotely homed DDR addresses that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_REMOTE_WCILF_DDR", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : WCiLFs issued by iA Cores targeting DDR that missed the LLC - HOMed remotely", + "UMask": "0xc8670601", + "Unit": "CHA" + }, + { + "BriefDescription": "WCILF requests from local IA cores to remotely homed PMM addresses which miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_REMOTE_WCILF_PMM", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : WCiLFs issued by iA Cores targeting PMM that missed the LLC - HOMed remotely", + "UMask": "0xc8670a01", + "Unit": "CHA" + }, + { + "BriefDescription": "WCIL requests from local IA cores to remotely homed DDR addresses that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_REMOTE_WCIL_DDR", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : WCiLs issued by iA Cores targeting DDR that missed the LLC - HOMed remotely", + "UMask": "0xc86f0601", + "Unit": "CHA" + }, + { + "BriefDescription": "WCIL requests from local IA cores to remotely homed PMM addresses which miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_REMOTE_WCIL_PMM", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : WCiLs issued by iA Cores targeting PMM that missed the LLC - HOMed remotely", + "UMask": "0xc86f0a01", + "Unit": "CHA" + }, + { + "BriefDescription": "Read for ownership from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_RFO", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : RFOs issued by iA Cores that Missed the LLC", + "UMask": "0xc807fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "RFOs issued from an IA core which miss the L3 and target memory in a CXL type 2 accelerator.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_RFO_CXL_ACC", + "PerPkg": "1", + "UMask": "0x10c8078201", + "Unit": "CHA" + }, + { + "BriefDescription": "Read for ownership from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_RFO_LOCAL", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : RFOs issued by iA Cores that Missed the LLC - HOMed locally", + "UMask": "0xc806fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "Read for ownership prefetch from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_RFO_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : RFO_Prefs issued by iA Cores that Missed the LLC", + "UMask": "0xc887fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "LLC RFO prefetches issued from an IA core which miss the L3 and target memory in a CXL type 2 accelerator.", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_RFO_PREF_CXL_ACC", + "PerPkg": "1", + "UMask": "0x10ccc78201", + "Unit": "CHA" + }, + { + "BriefDescription": "Read for ownership prefetch from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_RFO_PREF_LOCAL", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : RFO_Prefs issued by iA Cores that Missed the LLC - HOMed locally", + "UMask": "0xc886fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "Read for ownership prefetch from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_RFO_PREF_REMOTE", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : RFO_Prefs issued by iA Cores that Missed the LLC - HOMed remotely", + "UMask": "0xc8877e01", + "Unit": "CHA" + }, + { + "BriefDescription": "Read for ownership from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_RFO_REMOTE", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : RFOs issued by iA Cores that Missed the LLC - HOMed remotely", + "UMask": "0xc8077e01", + "Unit": "CHA" + }, + { + "BriefDescription": "UCRDF requests from local IA cores that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_UCRDF", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : UCRdFs issued by iA Cores that Missed LLC", + "UMask": "0xc877de01", + "Unit": "CHA" + }, + { + "BriefDescription": "WCIL requests from a local IA core that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_WCIL", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : WCiLs issued by iA Cores that Missed the LLC", + "UMask": "0xc86ffe01", + "Unit": "CHA" + }, + { + "BriefDescription": "WCILF requests from local IA core that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_WCILF", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : WCiLF issued by iA Cores that Missed the LLC", + "UMask": "0xc867fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "WCILF requests from local IA cores to DDR homed addresses which miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_WCILF_DDR", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : WCiLFs issued by iA Cores targeting DDR that missed the LLC", + "UMask": "0xc8678601", + "Unit": "CHA" + }, + { + "BriefDescription": "WCILF requests from local IA cores to PMM homed addresses which miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_WCILF_PMM", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : WCiLFs issued by iA Cores targeting PMM that missed the LLC", + "UMask": "0xc8678a01", + "Unit": "CHA" + }, + { + "BriefDescription": "WCIL requests from local IA cores to DDR homed addresses which miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_WCIL_DDR", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : WCiLs issued by iA Cores targeting DDR that missed the LLC", + "UMask": "0xc86f8601", + "Unit": "CHA" + }, + { + "BriefDescription": "WCIL requests from a local IA core to PMM homed addresses that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_WCIL_PMM", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : WCiLs issued by iA Cores targeting PMM that missed the LLC", + "UMask": "0xc86f8a01", + "Unit": "CHA" + }, + { + "BriefDescription": "WIL requests from local IA cores that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_WIL", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : WiLs issued by iA Cores that Missed LLC", + "UMask": "0xc87fde01", + "Unit": "CHA" + }, + { + "BriefDescription": "Read for ownership from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_RFO", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : RFOs issued by iA Cores", + "UMask": "0xc807ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "Read for ownership prefetch from local IA that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_RFO_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : RFO_Prefs issued by iA Cores", + "UMask": "0xc887ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "SpecItoM events that are initiated from the Core", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_SPECITOM", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : SpecItoMs issued by iA Cores", + "UMask": "0xcc57ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "WbEFtoEs issued by iA Cores. (Non Modified Write Backs)", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_WBEFTOE", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMs issued by IO Devices that Hit the LLC", + "UMask": "0xcc3fff01", + "Unit": "CHA" + }, + { + "BriefDescription": "WbEFtoIs issued by iA Cores . (Non Modified Write Backs)", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_WBEFTOI", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMs issued by IO Devices that Hit the LLC", + "UMask": "0xcc37ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "WbMtoEs issued by iA Cores . (Modified Write Backs)", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_WBMTOE", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMs issued by IO Devices that Hit the LLC", + "UMask": "0xcc2fff01", + "Unit": "CHA" + }, + { + "BriefDescription": "WbMtoI requests from local IA cores", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_WBMTOI", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : WbMtoIs issued by iA Cores", + "UMask": "0xcc27ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "WbStoIs issued by iA Cores . (Non Modified Write Backs)", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_WBSTOI", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMs issued by IO Devices that Hit the LLC", + "UMask": "0xcc67ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "WCIL requests from a local IA core", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_WCIL", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : WCiLs issued by iA Cores", + "UMask": "0xc86fff01", + "Unit": "CHA" + }, + { + "BriefDescription": "WCILF requests from local IA core", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IA_WCILF", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : WCiLF issued by iA Cores", + "UMask": "0xc867ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "All TOR inserts from local IO devices", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : All requests from IO Devices", + "UMask": "0xc001ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "CLFlush requests from IO devices", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_CLFLUSH", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : CLFlushes issued by IO Devices", + "UMask": "0xc8c3ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "All TOR inserts from local IO devices which hit the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_HIT", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : All requests from IO Devices that hit the LLC", + "UMask": "0xc001fd04", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoMs from local IO devices which hit the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_HIT_ITOM", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMs issued by IO Devices that Hit the LLC", + "UMask": "0xcc43fd04", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoMCacheNears, indicating a partial write request, from IO Devices that hit the LLC", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_HIT_ITOMCACHENEAR", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMCacheNears, indicating a partial write request, from IO Devices that hit the LLC", + "UMask": "0xcd43fd04", + "Unit": "CHA" + }, + { + "BriefDescription": "PCIRDCURs issued by IO devices which hit the LLC", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_HIT_PCIRDCUR", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : PCIRdCurs issued by IO Devices that hit the LLC", + "UMask": "0xc8f3fd04", + "Unit": "CHA" + }, + { + "BriefDescription": "RFOs from local IO devices which hit the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_HIT_RFO", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : RFOs issued by IO Devices that hit the LLC", + "UMask": "0xc803fd04", + "Unit": "CHA" + }, + { + "BriefDescription": "All TOR ItoM inserts from local IO devices", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_ITOM", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMs issued by IO Devices", + "UMask": "0xcc43ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoMCacheNears, indicating a partial write request, from IO Devices", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_ITOMCACHENEAR", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMCacheNears, indicating a partial write request, from IO Devices", + "UMask": "0xcd43ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "All TOR inserts from local IO devices which miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : All requests from IO Devices that missed the LLC", + "UMask": "0xc001fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "All TOR ItoM inserts from local IO devices which miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_ITOM", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMs issued by IO Devices that missed the LLC", + "UMask": "0xcc43fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_ITOMCACHENEAR", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC", + "UMask": "0xcd43fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoMCacheNear transactions from an IO device on the local socket that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_ITOMCACHENEAR_LOCAL", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC", + "UMask": "0xcd42fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoMCacheNear transactions from an IO device on a remote socket that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_ITOMCACHENEAR_REMOTE", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC", + "UMask": "0xcd437e04", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoM transactions from an IO device on the local socket that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_ITOM_LOCAL", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMs issued by IO Devices that missed the LLC", + "UMask": "0xcc42fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "ItoM transactions from an IO device on a remote socket that miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_ITOM_REMOTE", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : ItoMs issued by IO Devices that missed the LLC", + "UMask": "0xcc437e04", + "Unit": "CHA" + }, + { + "BriefDescription": "PCIRDCURs issued by IO devices which miss the LLC", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_PCIRDCUR", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : PCIRdCurs issued by IO Devices that missed the LLC", + "UMask": "0xc8f3fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "All TOR RFO inserts from local IO devices which miss the cache", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_RFO", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : RFOs issued by IO Devices that missed the LLC", + "UMask": "0xc803fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "PCIRDCURs issued by IO devices", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : PCIRdCurs issued by IO Devices", + "UMask": "0xc8f3ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "RFOs from local IO devices", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_RFO", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : RFOs issued by IO Devices", + "UMask": "0xc803ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "WBMtoI requests from IO devices", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.IO_WBMTOI", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : WbMtoIs issued by IO Devices", + "UMask": "0xcc23ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Inserts for SF or LLC Evictions", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.LLC_OR_SF_EVICTIONS", + "PerPkg": "1", + "PublicDescription": "TOR allocation occurred as a result of SF/LLC evictions (came from the ISMQ)", + "UMask": "0xc001ff02", + "Unit": "CHA" + }, + { + "BriefDescription": "All locally initiated requests", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.LOC_ALL", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : All from Local iA and IO", + "UMask": "0xc000ff05", + "Unit": "CHA" + }, + { + "BriefDescription": "All from Local iA", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.LOC_IA", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : All from Local iA", + "UMask": "0xc000ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "All from Local IO", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.LOC_IO", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : All from Local IO", + "UMask": "0xc000ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "All remote requests (e.g. snoops, writebacks) that came from remote sockets", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.REM_ALL", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : All Remote Requests", + "UMask": "0xc001ffc8", + "Unit": "CHA" + }, + { + "BriefDescription": "All snoops to this LLC that came from remote sockets", + "EventCode": "0x35", + "EventName": "UNC_CHA_TOR_INSERTS.REM_SNPS", + "PerPkg": "1", + "PublicDescription": "TOR Inserts : All Snoops from Remote", + "UMask": "0xc001ff08", + "Unit": "CHA" + }, + { + "BriefDescription": "Occupancy for all TOR entries", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.ALL", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : All", + "UMask": "0xc001ffff", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for CLFlush transactions from a CXL device which hit in the L3.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_HIT_CLFLUSH", + "PerPkg": "1", + "UMask": "0x78c8c7fd20", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for FsRdCur transactions from a CXL device which hit in the L3.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_HIT_FSRDCUR", + "PerPkg": "1", + "UMask": "0x78c8effd20", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for FsRdCurPtl transactions from a CXL device which hit in the L3.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_HIT_FSRDCURPTL", + "PerPkg": "1", + "UMask": "0x78c9effd20", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for ItoM transactions from a CXL device which hit in the L3.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_HIT_ITOM", + "PerPkg": "1", + "UMask": "0x78cc47fd20", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for ItoMWr transactions from a CXL device which hit in the L3.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_HIT_ITOMWR", + "PerPkg": "1", + "UMask": "0x78cc4ffd20", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for MemPushWr transactions from a CXL device which hit in the L3.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_HIT_MEMPUSHWR", + "PerPkg": "1", + "UMask": "0x78cc6ffd20", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WCiL transactions from a CXL device which hit in the L3.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_HIT_WCIL", + "PerPkg": "1", + "UMask": "0x78c86ffd20", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WcilF transactions from a CXL device which hit in the L3.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_HIT_WCILF", + "PerPkg": "1", + "UMask": "0x78c867fd20", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WiL transactions from a CXL device which hit in the L3.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_HIT_WIL", + "PerPkg": "1", + "UMask": "0x78c87ffd20", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for CLFlush transactions from a CXL device which miss the L3.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_MISS_CLFLUSH", + "PerPkg": "1", + "UMask": "0x78c8c7fe20", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for FsRdCur transactions from a CXL device which miss the L3.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_MISS_FSRDCUR", + "PerPkg": "1", + "UMask": "0x78c8effe20", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for FsRdCurPtl transactions from a CXL device which miss the L3.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_MISS_FSRDCURPTL", + "PerPkg": "1", + "UMask": "0x78c9effe20", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for ItoM transactions from a CXL device which miss the L3.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_MISS_ITOM", + "PerPkg": "1", + "UMask": "0x78cc47fe20", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for ItoMWr transactions from a CXL device which miss the L3.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_MISS_ITOMWR", + "PerPkg": "1", + "UMask": "0x78cc4ffe20", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for MemPushWr transactions from a CXL device which miss the L3.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_MISS_MEMPUSHWR", + "PerPkg": "1", + "UMask": "0x78cc6ffe20", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WCiL transactions from a CXL device which miss the L3.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_MISS_WCIL", + "PerPkg": "1", + "UMask": "0x78c86ffe20", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WcilF transactions from a CXL device which miss the L3.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_MISS_WCILF", + "PerPkg": "1", + "UMask": "0x78c867fe20", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WiL transactions from a CXL device which miss the L3.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_MISS_WIL", + "PerPkg": "1", + "UMask": "0x78c87ffe20", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for All locally initiated requests from IA Cores", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : All requests from iA Cores", + "UMask": "0xc001ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for CLFlush events that are initiated from the Core", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_CLFLUSH", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : CLFlushes issued by iA Cores", + "UMask": "0xc8c7ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for CLFlushOpt events that are initiated from the Core", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_CLFLUSHOPT", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : CLFlushOpts issued by iA Cores", + "UMask": "0xc8d7ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Code read from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_CRD", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : CRDs issued by iA Cores", + "UMask": "0xc80fff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Code read prefetch from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_CRD_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy; Code read prefetch from local IA that misses in the snoop filter", + "UMask": "0xc88fff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Data read opt from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_DRD_OPT", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : DRd_Opts issued by iA Cores", + "UMask": "0xc827ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Data read opt prefetch from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_DRD_OPT_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : DRd_Opt_Prefs issued by iA Cores", + "UMask": "0xc8a7ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for All locally initiated requests from IA Cores which hit the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : All requests from iA Cores that Hit the LLC", + "UMask": "0xc001fd01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Code read from local IA that hit the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_CRD", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : CRds issued by iA Cores that Hit the LLC", + "UMask": "0xc80ffd01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Code read prefetch from local IA that hit the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_CRD_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : CRd_Prefs issued by iA Cores that hit the LLC", + "UMask": "0xc88ffd01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for All requests issued from IA cores to CXL accelerator memory regions that hit the LLC.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_CXL_ACC", + "PerPkg": "1", + "UMask": "0x10c0018101", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Data read opt from local IA that hit the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_DRD_OPT", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : DRd_Opts issued by iA Cores that hit the LLC", + "UMask": "0xc827fd01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Data read opt prefetch from local IA that hit the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_DRD_OPT_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : DRd_Opt_Prefs issued by iA Cores that hit the LLC", + "UMask": "0xc8a7fd01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for ItoM requests from local IA cores that hit the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_ITOM", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : ItoMs issued by iA Cores that Hit LLC", + "UMask": "0xcc47fd01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Last level cache prefetch code read from local IA that hit the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_LLCPREFCODE", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : LLCPrefCode issued by iA Cores that hit the LLC", + "UMask": "0xcccffd01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Last level cache prefetch data read from local IA that hit the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_LLCPREFDATA", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : LLCPrefData issued by iA Cores that hit the LLC", + "UMask": "0xccd7fd01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Last level cache prefetch read for ownership from local IA that hit the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_LLCPREFRFO", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : LLCPrefRFO issued by iA Cores that hit the LLC", + "UMask": "0xccc7fd01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Read for ownership from local IA that hit the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_RFO", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : RFOs issued by iA Cores that Hit the LLC", + "UMask": "0xc807fd01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Read for ownership prefetch from local IA that hit the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_RFO_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : RFO_Prefs issued by iA Cores that Hit the LLC", + "UMask": "0xc887fd01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for ItoM events that are initiated from the Core", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_ITOM", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : ItoMs issued by iA Cores", + "UMask": "0xcc47ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for ItoMCacheNear requests from local IA cores", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_ITOMCACHENEAR", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : ItoMCacheNears issued by iA Cores", + "UMask": "0xcd47ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Last level cache prefetch code read from local IA.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_LLCPREFCODE", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : LLCPrefCode issued by iA Cores", + "UMask": "0xcccfff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Last level cache prefetch data read from local IA.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_LLCPREFDATA", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : LLCPrefData issued by iA Cores", + "UMask": "0xccd7ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Last level cache prefetch read for ownership from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_LLCPREFRFO", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : LLCPrefRFO issued by iA Cores", + "UMask": "0xccc7ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for All locally initiated requests from IA Cores which miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : All requests from iA Cores that Missed the LLC", + "UMask": "0xc001fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Code read from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_CRD", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : CRds issued by iA Cores that Missed the LLC", + "UMask": "0xc80ffe01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for CRDs from local IA cores to locally homed memory", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_CRD_LOCAL", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : CRd issued by iA Cores that Missed the LLC - HOMed locally", + "UMask": "0xc80efe01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Code read prefetch from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_CRD_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : CRd_Prefs issued by iA Cores that Missed the LLC", + "UMask": "0xc88ffe01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for CRD Prefetches from local IA cores to locally homed memory", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_CRD_PREF_LOCAL", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : CRd_Prefs issued by iA Cores that Missed the LLC - HOMed locally", + "UMask": "0xc88efe01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for CRD Prefetches from local IA cores to remotely homed memory", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_CRD_PREF_REMOTE", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : CRd_Prefs issued by iA Cores that Missed the LLC - HOMed remotely", + "UMask": "0xc88f7e01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for CRDs from local IA cores to remotely homed memory", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_CRD_REMOTE", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : CRd issued by iA Cores that Missed the LLC - HOMed remotely", + "UMask": "0xc80f7e01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for All requests issued from IA cores to CXL accelerator memory regions that miss the LLC.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_CXL_ACC", + "PerPkg": "1", + "UMask": "0x10c0018201", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for DRds and equivalent opcodes issued from an IA core which miss the L3 and target memory in a CXL type 2 memory expander card.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_CXL_ACC", + "PerPkg": "1", + "UMask": "0x10c8178201", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Data read opt from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_OPT", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : DRd_Opt issued by iA Cores that missed the LLC", + "UMask": "0xc827fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Data read opt prefetch from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_OPT_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : DRd_Opt_Prefs issued by iA Cores that missed the LLC", + "UMask": "0xc8a7fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for L2 data prefetches issued from an IA core which miss the L3 and target memory in a CXL type 2 accelerator.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PREF_CXL_ACC", + "PerPkg": "1", + "UMask": "0x10c8978201", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for ItoM requests from local IA cores that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_ITOM", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : ItoMs issued by iA Cores that Missed LLC", + "UMask": "0xcc47fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Last level cache prefetch code read from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LLCPREFCODE", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : LLCPrefCode issued by iA Cores that missed the LLC", + "UMask": "0xcccffe01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Last level cache prefetch data read from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LLCPREFDATA", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : LLCPrefData issued by iA Cores that missed the LLC", + "UMask": "0xccd7fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for LLC data prefetches issued from an IA core which miss the L3 and target memory in a CXL type 2 accelerator.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LLCPREFDATA_CXL_ACC", + "PerPkg": "1", + "UMask": "0x10ccd78201", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Last level cache prefetch read for ownership from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LLCPREFRFO", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : LLCPrefRFO issued by iA Cores that missed the LLC", + "UMask": "0xccc7fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for L2 RFO prefetches issued from an IA core which miss the L3 and target memory in a CXL type 2 accelerator.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LLCPREFRFO_CXL_ACC", + "PerPkg": "1", + "UMask": "0x10c8878201", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WCILF requests from local IA cores to locally homed DDR addresses that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LOCAL_WCILF_DDR", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : WCiLFs issued by iA Cores targeting DDR that missed the LLC - HOMed locally", + "UMask": "0xc8668601", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WCILF requests from local IA cores to locally homed PMM addresses which miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LOCAL_WCILF_PMM", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : WCiLFs issued by iA Cores targeting PMM that missed the LLC - HOMed locally", + "UMask": "0xc8668a01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WCIL requests from local IA cores to locally homed DDR addresses that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LOCAL_WCIL_DDR", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : WCiLs issued by iA Cores targeting DDR that missed the LLC - HOMed locally", + "UMask": "0xc86e8601", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WCIL requests from local IA cores to locally homed PMM addresses which miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LOCAL_WCIL_PMM", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : WCiLs issued by iA Cores targeting PMM that missed the LLC - HOMed locally", + "UMask": "0xc86e8a01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WCILF requests from local IA cores to remotely homed DDR addresses that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_REMOTE_WCILF_DDR", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : WCiLFs issued by iA Cores targeting DDR that missed the LLC - HOMed remotely", + "UMask": "0xc8670601", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WCILF requests from local IA cores to remotely homed PMM addresses which miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_REMOTE_WCILF_PMM", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : WCiLFs issued by iA Cores targeting PMM that missed the LLC - HOMed remotely", + "UMask": "0xc8670a01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WCIL requests from local IA cores to remotely homed DDR addresses that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_REMOTE_WCIL_DDR", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : WCiLs issued by iA Cores targeting DDR that missed the LLC - HOMed remotely", + "UMask": "0xc86f0601", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WCIL requests from local IA cores to remotely homed PMM addresses which miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_REMOTE_WCIL_PMM", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : WCiLs issued by iA Cores targeting PMM that missed the LLC - HOMed remotely", + "UMask": "0xc86f0a01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Read for ownership from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_RFO", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : RFOs issued by iA Cores that Missed the LLC", + "UMask": "0xc807fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for RFOs issued from an IA core which miss the L3 and target memory in a CXL type 2 accelerator.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_RFO_CXL_ACC", + "PerPkg": "1", + "UMask": "0x10c8078201", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Read for ownership from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_RFO_LOCAL", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : RFOs issued by iA Cores that Missed the LLC - HOMed locally", + "UMask": "0xc806fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Read for ownership prefetch from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_RFO_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : RFO_Prefs issued by iA Cores that Missed the LLC", + "UMask": "0xc887fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for LLC RFO prefetches issued from an IA core which miss the L3 and target memory in a CXL type 2 accelerator.", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_RFO_PREF_CXL_ACC", + "PerPkg": "1", + "UMask": "0x10ccc78201", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Read for ownership prefetch from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_RFO_PREF_LOCAL", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : RFO_Prefs issued by iA Cores that Missed the LLC - HOMed locally", + "UMask": "0xc886fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Read for ownership prefetch from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_RFO_PREF_REMOTE", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : RFO_Prefs issued by iA Cores that Missed the LLC - HOMed remotely", + "UMask": "0xc8877e01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Read for ownership from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_RFO_REMOTE", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : RFOs issued by iA Cores that Missed the LLC - HOMed remotely", + "UMask": "0xc8077e01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for UCRDF requests from local IA cores that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_UCRDF", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : UCRdFs issued by iA Cores that Missed LLC", + "UMask": "0xc877de01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WCIL requests from a local IA core that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_WCIL", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : WCiLs issued by iA Cores that Missed the LLC", + "UMask": "0xc86ffe01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WCILF requests from local IA core that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_WCILF", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : WCiLF issued by iA Cores that Missed the LLC", + "UMask": "0xc867fe01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WCILF requests from local IA cores to DDR homed addresses which miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_WCILF_DDR", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : WCiLFs issued by iA Cores targeting DDR that missed the LLC", + "UMask": "0xc8678601", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WCILF requests from local IA cores to PMM homed addresses which miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_WCILF_PMM", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : WCiLFs issued by iA Cores targeting PMM that missed the LLC", + "UMask": "0xc8678a01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WCIL requests from local IA cores to DDR homed addresses which miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_WCIL_DDR", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : WCiLs issued by iA Cores targeting DDR that missed the LLC", + "UMask": "0xc86f8601", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WCIL requests from a local IA core to PMM homed addresses that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_WCIL_PMM", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : WCiLs issued by iA Cores targeting PMM that missed the LLC", + "UMask": "0xc86f8a01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WIL requests from local IA cores that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_WIL", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : WiLs issued by iA Cores that Missed LLC", + "UMask": "0xc87fde01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Read for ownership from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_RFO", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : RFOs issued by iA Cores", + "UMask": "0xc807ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for Read for ownership prefetch from local IA that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_RFO_PREF", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : RFO_Prefs issued by iA Cores", + "UMask": "0xc887ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for SpecItoM events that are initiated from the Core", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_SPECITOM", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : SpecItoMs issued by iA Cores", + "UMask": "0xcc57ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WbMtoI requests from local IA cores", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_WBMTOI", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : WbMtoIs issued by iA Cores", + "UMask": "0xcc27ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WCIL requests from a local IA core", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_WCIL", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : WCiLs issued by iA Cores", + "UMask": "0xc86fff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WCILF requests from local IA core", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_WCILF", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : WCiLF issued by iA Cores", + "UMask": "0xc867ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for All TOR inserts from local IO devices", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : All requests from IO Devices", + "UMask": "0xc001ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for CLFlush requests from IO devices", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_CLFLUSH", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : CLFlushes issued by IO Devices", + "UMask": "0xc8c3ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for All TOR inserts from local IO devices which hit the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_HIT", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : All requests from IO Devices that hit the LLC", + "UMask": "0xc001fd04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for ItoMs from local IO devices which hit the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_HIT_ITOM", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : ItoMs issued by IO Devices that Hit the LLC", + "UMask": "0xcc43fd04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for ItoMCacheNears, indicating a partial write request, from IO Devices that hit the LLC", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_HIT_ITOMCACHENEAR", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : ItoMCacheNears, indicating a partial write request, from IO Devices that hit the LLC", + "UMask": "0xcd43fd04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for PCIRDCURs issued by IO devices which hit the LLC", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_HIT_PCIRDCUR", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : PCIRdCurs issued by IO Devices that hit the LLC", + "UMask": "0xc8f3fd04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for RFOs from local IO devices which hit the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_HIT_RFO", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : RFOs issued by IO Devices that hit the LLC", + "UMask": "0xc803fd04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for All TOR ItoM inserts from local IO devices", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_ITOM", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : ItoMs issued by IO Devices", + "UMask": "0xcc43ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for ItoMCacheNears, indicating a partial write request, from IO Devices", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_ITOMCACHENEAR", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : ItoMCacheNears, indicating a partial write request, from IO Devices", + "UMask": "0xcd43ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for All TOR inserts from local IO devices which miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : All requests from IO Devices that missed the LLC", + "UMask": "0xc001fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for All TOR ItoM inserts from local IO devices which miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOM", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : ItoMs issued by IO Devices that missed the LLC", + "UMask": "0xcc43fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOMCACHENEAR", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC", + "UMask": "0xcd43fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for ItoMCacheNear transactions from an IO device on the local socket that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOMCACHENEAR_LOCAL", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC", + "UMask": "0xcd42fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for ItoMCacheNear transactions from an IO device on a remote socket that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOMCACHENEAR_REMOTE", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC", + "UMask": "0xcd437e04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for ItoM transactions from an IO device on the local socket that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOM_LOCAL", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : ItoMs issued by IO Devices that missed the LLC", + "UMask": "0xcc42fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for ItoM transactions from an IO device on a remote socket that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOM_REMOTE", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : ItoMs issued by IO Devices that missed the LLC", + "UMask": "0xcc437e04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for PCIRDCURs issued by IO devices which miss the LLC", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_PCIRDCUR", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : PCIRdCurs issued by IO Devices that missed the LLC", + "UMask": "0xc8f3fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for PCIRDCUR transactions from an IO device on the local socket that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_PCIRDCUR_LOCAL", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : PCIRdCurs issued by IO Devices that missed the LLC", + "UMask": "0xc8f2fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for PCIRDCUR transactions from an IO device on a remote socket that miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_PCIRDCUR_REMOTE", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : PCIRdCurs issued by IO Devices that missed the LLC", + "UMask": "0xc8f37e04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for All TOR RFO inserts from local IO devices which miss the cache", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_RFO", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : RFOs issued by IO Devices that missed the LLC", + "UMask": "0xc803fe04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for PCIRDCURs issued by IO devices", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_PCIRDCUR", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : PCIRdCurs issued by IO Devices", + "UMask": "0xc8f3ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for RFOs from local IO devices", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_RFO", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : RFOs issued by IO Devices", + "UMask": "0xc803ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for WBMtoI requests from IO devices", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_WBMTOI", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : WbMtoIs issued by IO Devices", + "UMask": "0xcc23ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for All locally initiated requests", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.LOC_ALL", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : All from Local iA and IO", + "UMask": "0xc000ff05", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for All from Local iA", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.LOC_IA", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : All from Local iA", + "UMask": "0xc000ff01", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for All from Local IO", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.LOC_IO", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : All from Local IO", + "UMask": "0xc000ff04", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for All remote requests (e.g. snoops, writebacks) that came from remote sockets", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.REM_ALL", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : All Remote Requests", + "UMask": "0xc001ffc8", + "Unit": "CHA" + }, + { + "BriefDescription": "TOR Occupancy for All snoops to this LLC that came from remote sockets", + "EventCode": "0x36", + "EventName": "UNC_CHA_TOR_OCCUPANCY.REM_SNPS", + "PerPkg": "1", + "PublicDescription": "TOR Occupancy : All Snoops from Remote", + "UMask": "0xc001ff08", + "Unit": "CHA" + } +] diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/uncore-cxl.json b/tools/perf/pmu-events/arch/x86/sierraforest/uncore-cxl.json new file mode 100644 index 0000000000..dc676c7aa3 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/sierraforest/uncore-cxl.json @@ -0,0 +1,10 @@ +[ + { + "BriefDescription": "B2CXL Clockticks", + "EventCode": "0x01", + "EventName": "UNC_B2CXL_CLOCKTICKS", + "PerPkg": "1", + "PortMask": "0x000", + "Unit": "B2CXL" + } +] diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/sierraforest/uncore-interconnect.json new file mode 100644 index 0000000000..6932b2fea3 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/sierraforest/uncore-interconnect.json @@ -0,0 +1,1228 @@ +[ + { + "BriefDescription": "Clockticks of the mesh to memory (B2CMI)", + "EventCode": "0x01", + "EventName": "UNC_B2CMI_CLOCKTICKS", + "PerPkg": "1", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Counts the number of time D2C was not honoured by egress due to directory state constraints", + "EventCode": "0x17", + "EventName": "UNC_B2CMI_DIRECT2CORE_NOT_TAKEN_DIRSTATE", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Counts the number of times B2CMI egress did D2C (direct to core)", + "EventCode": "0x16", + "EventName": "UNC_B2CMI_DIRECT2CORE_TAKEN", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Counts the number of times D2C wasn't honoured even though the incoming request had d2c set for non cisgress txn", + "EventCode": "0x18", + "EventName": "UNC_B2CMI_DIRECT2CORE_TXN_OVERRIDE", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Counts the number of d2k wasn't done due to credit constraints", + "EventCode": "0x1B", + "EventName": "UNC_B2CMI_DIRECT2UPI_NOT_TAKEN_CREDITS", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Direct to UPI Transactions - Ignored due to lack of credits : All : Counts the number of d2k wasn't done due to credit constraints", + "EventCode": "0x1B", + "EventName": "UNC_B2CMI_DIRECT2UPI_NOT_TAKEN_CREDITS.EGRESS", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Counts the number of time D2K was not honoured by egress due to directory state constraints", + "EventCode": "0x1A", + "EventName": "UNC_B2CMI_DIRECT2UPI_NOT_TAKEN_DIRSTATE", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Cycles when Direct2UPI was Disabled : Egress Ignored D2U : Counts the number of time D2K was not honoured by egress due to directory state constraints", + "EventCode": "0x1A", + "EventName": "UNC_B2CMI_DIRECT2UPI_NOT_TAKEN_DIRSTATE.EGRESS", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Counts the number of times egress did D2K (Direct to KTI)", + "EventCode": "0x19", + "EventName": "UNC_B2CMI_DIRECT2UPI_TAKEN", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Counts the number of times D2K wasn't honoured even though the incoming request had d2k set for non cisgress txn", + "EventCode": "0x1C", + "EventName": "UNC_B2CMI_DIRECT2UPI_TXN_OVERRIDE", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Directory Hit Clean", + "EventCode": "0x1D", + "EventName": "UNC_B2CMI_DIRECTORY_HIT.CLEAN", + "PerPkg": "1", + "UMask": "0x38", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Directory Hit : On NonDirty Line in A State", + "EventCode": "0x1D", + "EventName": "UNC_B2CMI_DIRECTORY_HIT.CLEAN_A", + "PerPkg": "1", + "UMask": "0x20", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Directory Hit : On NonDirty Line in I State", + "EventCode": "0x1D", + "EventName": "UNC_B2CMI_DIRECTORY_HIT.CLEAN_I", + "PerPkg": "1", + "UMask": "0x8", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Directory Hit : On NonDirty Line in S State", + "EventCode": "0x1D", + "EventName": "UNC_B2CMI_DIRECTORY_HIT.CLEAN_S", + "PerPkg": "1", + "UMask": "0x10", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Directory Hit Dirty (modified)", + "EventCode": "0x1D", + "EventName": "UNC_B2CMI_DIRECTORY_HIT.DIRTY", + "PerPkg": "1", + "UMask": "0x7", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Directory Hit : On Dirty Line in A State", + "EventCode": "0x1D", + "EventName": "UNC_B2CMI_DIRECTORY_HIT.DIRTY_A", + "PerPkg": "1", + "UMask": "0x4", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Directory Hit : On Dirty Line in I State", + "EventCode": "0x1D", + "EventName": "UNC_B2CMI_DIRECTORY_HIT.DIRTY_I", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Directory Hit : On Dirty Line in S State", + "EventCode": "0x1D", + "EventName": "UNC_B2CMI_DIRECTORY_HIT.DIRTY_S", + "PerPkg": "1", + "UMask": "0x2", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Counts the number of 1lm or 2lm hit read data returns to egress with any directory to non persistent memory", + "EventCode": "0x20", + "EventName": "UNC_B2CMI_DIRECTORY_LOOKUP.ANY", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Counts the number of 1lm or 2lm hit read data returns to egress with directory A to non persistent memory", + "EventCode": "0x20", + "EventName": "UNC_B2CMI_DIRECTORY_LOOKUP.STATE_A", + "PerPkg": "1", + "UMask": "0x8", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Counts the number of 1lm or 2lm hit read data returns to egress with directory I to non persistent memory", + "EventCode": "0x20", + "EventName": "UNC_B2CMI_DIRECTORY_LOOKUP.STATE_I", + "PerPkg": "1", + "UMask": "0x2", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Counts the number of 1lm or 2lm hit read data returns to egress with directory S to non persistent memory", + "EventCode": "0x20", + "EventName": "UNC_B2CMI_DIRECTORY_LOOKUP.STATE_S", + "PerPkg": "1", + "PublicDescription": "Counts the number of 1lm or 2lm hit read data returns to egress with directory S to non persistent memory", + "UMask": "0x4", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Directory Miss Clean", + "EventCode": "0x1E", + "EventName": "UNC_B2CMI_DIRECTORY_MISS.CLEAN", + "PerPkg": "1", + "UMask": "0x38", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Directory Miss : On NonDirty Line in A State", + "EventCode": "0x1E", + "EventName": "UNC_B2CMI_DIRECTORY_MISS.CLEAN_A", + "PerPkg": "1", + "UMask": "0x20", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Directory Miss : On NonDirty Line in I State", + "EventCode": "0x1E", + "EventName": "UNC_B2CMI_DIRECTORY_MISS.CLEAN_I", + "PerPkg": "1", + "UMask": "0x8", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Directory Miss : On NonDirty Line in S State", + "EventCode": "0x1E", + "EventName": "UNC_B2CMI_DIRECTORY_MISS.CLEAN_S", + "PerPkg": "1", + "UMask": "0x10", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Directory Miss Dirty (modified)", + "EventCode": "0x1E", + "EventName": "UNC_B2CMI_DIRECTORY_MISS.DIRTY", + "PerPkg": "1", + "UMask": "0x7", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Directory Miss : On Dirty Line in A State", + "EventCode": "0x1E", + "EventName": "UNC_B2CMI_DIRECTORY_MISS.DIRTY_A", + "PerPkg": "1", + "UMask": "0x4", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Directory Miss : On Dirty Line in I State", + "EventCode": "0x1E", + "EventName": "UNC_B2CMI_DIRECTORY_MISS.DIRTY_I", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Directory Miss : On Dirty Line in S State", + "EventCode": "0x1E", + "EventName": "UNC_B2CMI_DIRECTORY_MISS.DIRTY_S", + "PerPkg": "1", + "UMask": "0x2", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Any A2I Transition", + "EventCode": "0x21", + "EventName": "UNC_B2CMI_DIRECTORY_UPDATE.A2I", + "PerPkg": "1", + "UMask": "0x320", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Any A2S Transition", + "EventCode": "0x21", + "EventName": "UNC_B2CMI_DIRECTORY_UPDATE.A2S", + "PerPkg": "1", + "UMask": "0x340", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Counts cisgress directory updates", + "EventCode": "0x21", + "EventName": "UNC_B2CMI_DIRECTORY_UPDATE.ANY", + "PerPkg": "1", + "UMask": "0x301", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Counts any 1lm or 2lm hit data return that would result in directory update to non persistent memory (DRAM)", + "EventCode": "0x21", + "EventName": "UNC_B2CMI_DIRECTORY_UPDATE.HIT_ANY", + "PerPkg": "1", + "UMask": "0x101", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Directory update in near memory to the A state", + "EventCode": "0x21", + "EventName": "UNC_B2CMI_DIRECTORY_UPDATE.HIT_X2A", + "PerPkg": "1", + "UMask": "0x114", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Directory update in near memory to the I state", + "EventCode": "0x21", + "EventName": "UNC_B2CMI_DIRECTORY_UPDATE.HIT_X2I", + "PerPkg": "1", + "UMask": "0x128", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Directory update in near memory to the S state", + "EventCode": "0x21", + "EventName": "UNC_B2CMI_DIRECTORY_UPDATE.HIT_X2S", + "PerPkg": "1", + "UMask": "0x142", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Any I2A Transition", + "EventCode": "0x21", + "EventName": "UNC_B2CMI_DIRECTORY_UPDATE.I2A", + "PerPkg": "1", + "UMask": "0x304", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Any I2S Transition", + "EventCode": "0x21", + "EventName": "UNC_B2CMI_DIRECTORY_UPDATE.I2S", + "PerPkg": "1", + "UMask": "0x302", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Directory update in far memory to the A state", + "EventCode": "0x21", + "EventName": "UNC_B2CMI_DIRECTORY_UPDATE.MISS_X2A", + "PerPkg": "1", + "UMask": "0x214", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Directory update in far memory to the I state", + "EventCode": "0x21", + "EventName": "UNC_B2CMI_DIRECTORY_UPDATE.MISS_X2I", + "PerPkg": "1", + "UMask": "0x228", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Directory update in far memory to the S state", + "EventCode": "0x21", + "EventName": "UNC_B2CMI_DIRECTORY_UPDATE.MISS_X2S", + "PerPkg": "1", + "UMask": "0x242", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Any S2A Transition", + "EventCode": "0x21", + "EventName": "UNC_B2CMI_DIRECTORY_UPDATE.S2A", + "PerPkg": "1", + "UMask": "0x310", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Any S2I Transition", + "EventCode": "0x21", + "EventName": "UNC_B2CMI_DIRECTORY_UPDATE.S2I", + "PerPkg": "1", + "UMask": "0x308", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Directory update to the A state", + "EventCode": "0x21", + "EventName": "UNC_B2CMI_DIRECTORY_UPDATE.X2A", + "PerPkg": "1", + "UMask": "0x314", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Directory update to the I state", + "EventCode": "0x21", + "EventName": "UNC_B2CMI_DIRECTORY_UPDATE.X2I", + "PerPkg": "1", + "UMask": "0x328", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Directory update to the S state", + "EventCode": "0x21", + "EventName": "UNC_B2CMI_DIRECTORY_UPDATE.X2S", + "PerPkg": "1", + "UMask": "0x342", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Counts any read", + "EventCode": "0x24", + "EventName": "UNC_B2CMI_IMC_READS.ALL", + "PerPkg": "1", + "UMask": "0x104", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Counts normal reads issue to CMI", + "EventCode": "0x24", + "EventName": "UNC_B2CMI_IMC_READS.NORMAL", + "PerPkg": "1", + "UMask": "0x101", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Count reads to NM region", + "EventCode": "0x24", + "EventName": "UNC_B2CMI_IMC_READS.TO_DDR_AS_CACHE", + "PerPkg": "1", + "UMask": "0x110", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Counts reads to 1lm non persistent memory regions", + "EventCode": "0x24", + "EventName": "UNC_B2CMI_IMC_READS.TO_DDR_AS_MEM", + "PerPkg": "1", + "UMask": "0x108", + "Unit": "B2CMI" + }, + { + "BriefDescription": "All Writes - All Channels", + "EventCode": "0x25", + "EventName": "UNC_B2CMI_IMC_WRITES.ALL", + "PerPkg": "1", + "UMask": "0x110", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Full Non-ISOCH - All Channels", + "EventCode": "0x25", + "EventName": "UNC_B2CMI_IMC_WRITES.FULL", + "PerPkg": "1", + "UMask": "0x101", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Non-Inclusive - All Channels", + "EventCode": "0x25", + "EventName": "UNC_B2CMI_IMC_WRITES.NI", + "PerPkg": "1", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Non-Inclusive Miss - All Channels", + "EventCode": "0x25", + "EventName": "UNC_B2CMI_IMC_WRITES.NI_MISS", + "PerPkg": "1", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Partial Non-ISOCH - All Channels", + "EventCode": "0x25", + "EventName": "UNC_B2CMI_IMC_WRITES.PARTIAL", + "PerPkg": "1", + "UMask": "0x102", + "Unit": "B2CMI" + }, + { + "BriefDescription": "DDR, acting as Cache - All Channels", + "EventCode": "0x25", + "EventName": "UNC_B2CMI_IMC_WRITES.TO_DDR_AS_CACHE", + "PerPkg": "1", + "UMask": "0x140", + "Unit": "B2CMI" + }, + { + "BriefDescription": "DDR - All Channels", + "EventCode": "0x25", + "EventName": "UNC_B2CMI_IMC_WRITES.TO_DDR_AS_MEM", + "PerPkg": "1", + "UMask": "0x120", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Prefetch CAM Inserts : UPI - Ch 0", + "EventCode": "0x56", + "EventName": "UNC_B2CMI_PREFCAM_INSERTS.CH0_UPI", + "PerPkg": "1", + "UMask": "0x2", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Prefetch CAM Inserts : XPT - Ch 0", + "EventCode": "0x56", + "EventName": "UNC_B2CMI_PREFCAM_INSERTS.CH0_XPT", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Prefetch CAM Inserts : UPI - All Channels", + "EventCode": "0x56", + "EventName": "UNC_B2CMI_PREFCAM_INSERTS.UPI_ALLCH", + "PerPkg": "1", + "UMask": "0x2", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Prefetch CAM Inserts : XPT -All Channels", + "EventCode": "0x56", + "EventName": "UNC_B2CMI_PREFCAM_INSERTS.XPT_ALLCH", + "PerPkg": "1", + "PublicDescription": "Prefetch CAM Inserts : XPT - All Channels", + "UMask": "0x1", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Prefetch CAM Occupancy : Channel 0", + "EventCode": "0x54", + "EventName": "UNC_B2CMI_PREFCAM_OCCUPANCY.CH0", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Counts the 2lm reads and WRNI which were a hit", + "EventCode": "0x1F", + "EventName": "UNC_B2CMI_TAG_HIT.ALL", + "PerPkg": "1", + "UMask": "0xf", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Counts the 2lm reads which were a hit clean", + "EventCode": "0x1F", + "EventName": "UNC_B2CMI_TAG_HIT.RD_CLEAN", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Counts the 2lm reads which were a hit dirty", + "EventCode": "0x1F", + "EventName": "UNC_B2CMI_TAG_HIT.RD_DIRTY", + "PerPkg": "1", + "UMask": "0x2", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Counts the 2lm WRNI which were a hit clean", + "EventCode": "0x1F", + "EventName": "UNC_B2CMI_TAG_HIT.WR_CLEAN", + "PerPkg": "1", + "UMask": "0x4", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Counts the 2lm WRNI which were a hit dirty", + "EventCode": "0x1F", + "EventName": "UNC_B2CMI_TAG_HIT.WR_DIRTY", + "PerPkg": "1", + "UMask": "0x8", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Counts the 2lm second way read miss for a WrNI", + "EventCode": "0x4B", + "EventName": "UNC_B2CMI_TAG_MISS.CLEAN", + "PerPkg": "1", + "UMask": "0x5", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Counts the 2lm second way read miss for a WrNI", + "EventCode": "0x4B", + "EventName": "UNC_B2CMI_TAG_MISS.DIRTY", + "PerPkg": "1", + "UMask": "0xa", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Counts the 2lm second way read miss for a Rd", + "EventCode": "0x4B", + "EventName": "UNC_B2CMI_TAG_MISS.RD_2WAY", + "PerPkg": "1", + "UMask": "0x10", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Counts the 2lm reads which were a miss and the cache line is unmodified", + "EventCode": "0x4B", + "EventName": "UNC_B2CMI_TAG_MISS.RD_CLEAN", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Counts the 2lm reads which were a miss and the cache line is modified", + "EventCode": "0x4B", + "EventName": "UNC_B2CMI_TAG_MISS.RD_DIRTY", + "PerPkg": "1", + "UMask": "0x2", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Counts the 2lm second way read miss for a WrNI", + "EventCode": "0x4B", + "EventName": "UNC_B2CMI_TAG_MISS.WR_2WAY", + "PerPkg": "1", + "UMask": "0x20", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Counts the 2lm WRNI which were a miss and the cache line is unmodified", + "EventCode": "0x4B", + "EventName": "UNC_B2CMI_TAG_MISS.WR_CLEAN", + "PerPkg": "1", + "UMask": "0x4", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Counts the 2lm WRNI which were a miss and the cache line is modified", + "EventCode": "0x4B", + "EventName": "UNC_B2CMI_TAG_MISS.WR_DIRTY", + "PerPkg": "1", + "UMask": "0x8", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Tracker Inserts : Channel 0", + "EventCode": "0x32", + "EventName": "UNC_B2CMI_TRACKER_INSERTS.CH0", + "PerPkg": "1", + "UMask": "0x104", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Tracker Occupancy : Channel 0", + "EventCode": "0x33", + "EventName": "UNC_B2CMI_TRACKER_OCCUPANCY.CH0", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "B2CMI" + }, + { + "BriefDescription": "Write Tracker Inserts : Channel 0", + "EventCode": "0x40", + "EventName": "UNC_B2CMI_WR_TRACKER_INSERTS.CH0", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "B2CMI" + }, + { + "BriefDescription": "UNC_B2HOT_CLOCKTICKS", + "EventCode": "0x01", + "EventName": "UNC_B2HOT_CLOCKTICKS", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "B2HOT" + }, + { + "BriefDescription": "Number of uclks in domain", + "EventCode": "0x01", + "EventName": "UNC_B2UPI_CLOCKTICKS", + "PerPkg": "1", + "Unit": "B2UPI" + }, + { + "BriefDescription": "Total Write Cache Occupancy : Mem", + "EventCode": "0x0F", + "EventName": "UNC_I_CACHE_TOTAL_OCCUPANCY.MEM", + "PerPkg": "1", + "UMask": "0x4", + "Unit": "IRP" + }, + { + "BriefDescription": "IRP Clockticks", + "EventCode": "0x01", + "EventName": "UNC_I_CLOCKTICKS", + "PerPkg": "1", + "Unit": "IRP" + }, + { + "BriefDescription": "Inbound read requests received by the IRP and inserted into the FAF queue", + "EventCode": "0x18", + "EventName": "UNC_I_FAF_INSERTS", + "PerPkg": "1", + "Unit": "IRP" + }, + { + "BriefDescription": "FAF occupancy", + "EventCode": "0x19", + "EventName": "UNC_I_FAF_OCCUPANCY", + "PerPkg": "1", + "Unit": "IRP" + }, + { + "BriefDescription": "Misc Events - Set 1 : Lost Forward : Snoop pulled away ownership before a write was committed", + "EventCode": "0x1F", + "EventName": "UNC_I_MISC1.LOST_FWD", + "PerPkg": "1", + "UMask": "0x10", + "Unit": "IRP" + }, + { + "BriefDescription": "Inbound write (fast path) requests to coherent memory, received by the IRP resulting in write ownership requests issued by IRP to the mesh.", + "EventCode": "0x11", + "EventName": "UNC_I_TRANSACTIONS.WR_PREF", + "PerPkg": "1", + "UMask": "0x8", + "Unit": "IRP" + }, + { + "BriefDescription": "MDF Clockticks", + "EventCode": "0x01", + "EventName": "UNC_MDF_CLOCKTICKS", + "PerPkg": "1", + "Unit": "MDF" + }, + { + "BriefDescription": "Number of UPI LL clock cycles while the event is enabled", + "EventCode": "0x01", + "EventName": "UNC_UPI_CLOCKTICKS", + "PerPkg": "1", + "PublicDescription": "Number of kfclks", + "Unit": "UPI" + }, + { + "BriefDescription": "Cycles in L1 : Number of UPI qfclk cycles spent in L1 power mode. L1 is a mode that totally shuts down a UPI link. Use edge detect to count the number of instances when the UPI link entered L1. Link power states are per link and per direction, so for example the Tx direction could be in one state while Rx was in another. Because L1 totally shuts down the link, it takes a good amount of time to exit this mode.", + "EventCode": "0x21", + "EventName": "UNC_UPI_L1_POWER_CYCLES", + "PerPkg": "1", + "Unit": "UPI" + }, + { + "BriefDescription": "Matches on Receive path of a UPI Port : Non-Coherent Bypass", + "EventCode": "0x05", + "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.NCB", + "PerPkg": "1", + "UMask": "0xe", + "Unit": "UPI" + }, + { + "BriefDescription": "Matches on Receive path of a UPI Port : Non-Coherent Bypass, Match Opcode", + "EventCode": "0x05", + "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.NCB_OPC", + "PerPkg": "1", + "UMask": "0x10e", + "Unit": "UPI" + }, + { + "BriefDescription": "Matches on Receive path of a UPI Port : Non-Coherent Standard", + "EventCode": "0x05", + "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.NCS", + "PerPkg": "1", + "UMask": "0xf", + "Unit": "UPI" + }, + { + "BriefDescription": "Matches on Receive path of a UPI Port : Non-Coherent Standard, Match Opcode", + "EventCode": "0x05", + "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.NCS_OPC", + "PerPkg": "1", + "UMask": "0x10f", + "Unit": "UPI" + }, + { + "BriefDescription": "Matches on Receive path of a UPI Port : Request", + "EventCode": "0x05", + "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.REQ", + "PerPkg": "1", + "UMask": "0x8", + "Unit": "UPI" + }, + { + "BriefDescription": "Matches on Receive path of a UPI Port : Request, Match Opcode", + "EventCode": "0x05", + "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.REQ_OPC", + "PerPkg": "1", + "UMask": "0x108", + "Unit": "UPI" + }, + { + "BriefDescription": "Matches on Receive path of a UPI Port : Response - Conflict", + "EventCode": "0x05", + "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.RSPCNFLT", + "PerPkg": "1", + "UMask": "0x1aa", + "Unit": "UPI" + }, + { + "BriefDescription": "Matches on Receive path of a UPI Port : Response - Invalid", + "EventCode": "0x05", + "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.RSPI", + "PerPkg": "1", + "UMask": "0x12a", + "Unit": "UPI" + }, + { + "BriefDescription": "Matches on Receive path of a UPI Port : Response - Data", + "EventCode": "0x05", + "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.RSP_DATA", + "PerPkg": "1", + "UMask": "0xc", + "Unit": "UPI" + }, + { + "BriefDescription": "Matches on Receive path of a UPI Port : Response - Data, Match Opcode", + "EventCode": "0x05", + "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.RSP_DATA_OPC", + "PerPkg": "1", + "UMask": "0x10c", + "Unit": "UPI" + }, + { + "BriefDescription": "Matches on Receive path of a UPI Port : Response - No Data", + "EventCode": "0x05", + "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.RSP_NODATA", + "PerPkg": "1", + "UMask": "0xa", + "Unit": "UPI" + }, + { + "BriefDescription": "Matches on Receive path of a UPI Port : Response - No Data, Match Opcode", + "EventCode": "0x05", + "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.RSP_NODATA_OPC", + "PerPkg": "1", + "UMask": "0x10a", + "Unit": "UPI" + }, + { + "BriefDescription": "Matches on Receive path of a UPI Port : Snoop", + "EventCode": "0x05", + "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.SNP", + "PerPkg": "1", + "UMask": "0x9", + "Unit": "UPI" + }, + { + "BriefDescription": "Matches on Receive path of a UPI Port : Snoop, Match Opcode", + "EventCode": "0x05", + "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.SNP_OPC", + "PerPkg": "1", + "UMask": "0x109", + "Unit": "UPI" + }, + { + "BriefDescription": "Matches on Receive path of a UPI Port : Writeback", + "EventCode": "0x05", + "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.WB", + "PerPkg": "1", + "UMask": "0xd", + "Unit": "UPI" + }, + { + "BriefDescription": "Matches on Receive path of a UPI Port : Writeback, Match Opcode", + "EventCode": "0x05", + "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.WB_OPC", + "PerPkg": "1", + "UMask": "0x10d", + "Unit": "UPI" + }, + { + "BriefDescription": "Valid Flits Received : All Data : Shows legal flit time (hides impact of L0p and L0c).", + "EventCode": "0x03", + "EventName": "UNC_UPI_RxL_FLITS.ALL_DATA", + "PerPkg": "1", + "UMask": "0xf", + "Unit": "UPI" + }, + { + "BriefDescription": "Null FLITs received from any slot", + "EventCode": "0x03", + "EventName": "UNC_UPI_RxL_FLITS.ALL_NULL", + "PerPkg": "1", + "PublicDescription": "Valid Flits Received : Null FLITs received from any slot", + "UMask": "0x27", + "Unit": "UPI" + }, + { + "BriefDescription": "Valid Flits Received : Data : Shows legal flit time (hides impact of L0p and L0c). : Count Data Flits (which consume all slots), but how much to count is based on Slot0-2 mask, so count can be 0-3 depending on which slots are enabled for counting..", + "EventCode": "0x03", + "EventName": "UNC_UPI_RxL_FLITS.DATA", + "PerPkg": "1", + "UMask": "0x8", + "Unit": "UPI" + }, + { + "BriefDescription": "Valid Flits Received : Idle : Shows legal flit time (hides impact of L0p and L0c).", + "EventCode": "0x03", + "EventName": "UNC_UPI_RxL_FLITS.IDLE", + "PerPkg": "1", + "UMask": "0x47", + "Unit": "UPI" + }, + { + "BriefDescription": "Valid Flits Received : LLCRD Not Empty : Shows legal flit time (hides impact of L0p and L0c). : Enables counting of LLCRD (with non-zero payload). This only applies to slot 2 since LLCRD is only allowed in slot 2", + "EventCode": "0x03", + "EventName": "UNC_UPI_RxL_FLITS.LLCRD", + "PerPkg": "1", + "UMask": "0x10", + "Unit": "UPI" + }, + { + "BriefDescription": "Valid Flits Received : LLCTRL : Shows legal flit time (hides impact of L0p and L0c). : Equivalent to an idle packet. Enables counting of slot 0 LLCTRL messages.", + "EventCode": "0x03", + "EventName": "UNC_UPI_RxL_FLITS.LLCTRL", + "PerPkg": "1", + "UMask": "0x40", + "Unit": "UPI" + }, + { + "BriefDescription": "Valid Flits Received : All Non Data : Shows legal flit time (hides impact of L0p and L0c).", + "EventCode": "0x03", + "EventName": "UNC_UPI_RxL_FLITS.NON_DATA", + "PerPkg": "1", + "UMask": "0x97", + "Unit": "UPI" + }, + { + "BriefDescription": "Valid Flits Received : Slot NULL or LLCRD Empty : Shows legal flit time (hides impact of L0p and L0c). : LLCRD with all zeros is treated as NULL. Slot 1 is not treated as NULL if slot 0 is a dual slot. This can apply to slot 0,1, or 2.", + "EventCode": "0x03", + "EventName": "UNC_UPI_RxL_FLITS.NULL", + "PerPkg": "1", + "UMask": "0x20", + "Unit": "UPI" + }, + { + "BriefDescription": "Valid Flits Received : Protocol Header : Shows legal flit time (hides impact of L0p and L0c). : Enables count of protocol headers in slot 0,1,2 (depending on slot uMask bits)", + "EventCode": "0x03", + "EventName": "UNC_UPI_RxL_FLITS.PROTHDR", + "PerPkg": "1", + "UMask": "0x80", + "Unit": "UPI" + }, + { + "BriefDescription": "Valid Flits Received : Slot 0 : Shows legal flit time (hides impact of L0p and L0c). : Count Slot 0 - Other mask bits determine types of headers to count.", + "EventCode": "0x03", + "EventName": "UNC_UPI_RxL_FLITS.SLOT0", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "UPI" + }, + { + "BriefDescription": "Valid Flits Received : Slot 1 : Shows legal flit time (hides impact of L0p and L0c). : Count Slot 1 - Other mask bits determine types of headers to count.", + "EventCode": "0x03", + "EventName": "UNC_UPI_RxL_FLITS.SLOT1", + "PerPkg": "1", + "UMask": "0x2", + "Unit": "UPI" + }, + { + "BriefDescription": "Valid Flits Received : Slot 2 : Shows legal flit time (hides impact of L0p and L0c). : Count Slot 2 - Other mask bits determine types of headers to count.", + "EventCode": "0x03", + "EventName": "UNC_UPI_RxL_FLITS.SLOT2", + "PerPkg": "1", + "UMask": "0x4", + "Unit": "UPI" + }, + { + "BriefDescription": "RxQ Flit Buffer Allocations : Slot 0 : Number of allocations into the UPI Rx Flit Buffer. Generally, when data is transmitted across UPI, it will bypass the RxQ and pass directly to the ring interface. If things back up getting transmitted onto the ring, however, it may need to allocate into this buffer, thus increasing the latency. This event can be used in conjunction with the Flit Buffer Occupancy event in order to calculate the average flit buffer lifetime.", + "EventCode": "0x30", + "EventName": "UNC_UPI_RxL_INSERTS.SLOT0", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "UPI" + }, + { + "BriefDescription": "RxQ Flit Buffer Allocations : Slot 1 : Number of allocations into the UPI Rx Flit Buffer. Generally, when data is transmitted across UPI, it will bypass the RxQ and pass directly to the ring interface. If things back up getting transmitted onto the ring, however, it may need to allocate into this buffer, thus increasing the latency. This event can be used in conjunction with the Flit Buffer Occupancy event in order to calculate the average flit buffer lifetime.", + "EventCode": "0x30", + "EventName": "UNC_UPI_RxL_INSERTS.SLOT1", + "PerPkg": "1", + "UMask": "0x2", + "Unit": "UPI" + }, + { + "BriefDescription": "RxQ Flit Buffer Allocations : Slot 2 : Number of allocations into the UPI Rx Flit Buffer. Generally, when data is transmitted across UPI, it will bypass the RxQ and pass directly to the ring interface. If things back up getting transmitted onto the ring, however, it may need to allocate into this buffer, thus increasing the latency. This event can be used in conjunction with the Flit Buffer Occupancy event in order to calculate the average flit buffer lifetime.", + "EventCode": "0x30", + "EventName": "UNC_UPI_RxL_INSERTS.SLOT2", + "PerPkg": "1", + "UMask": "0x4", + "Unit": "UPI" + }, + { + "BriefDescription": "RxQ Occupancy - All Packets : Slot 0", + "EventCode": "0x32", + "EventName": "UNC_UPI_RxL_OCCUPANCY.SLOT0", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "UPI" + }, + { + "BriefDescription": "RxQ Occupancy - All Packets : Slot 1", + "EventCode": "0x32", + "EventName": "UNC_UPI_RxL_OCCUPANCY.SLOT1", + "PerPkg": "1", + "UMask": "0x2", + "Unit": "UPI" + }, + { + "BriefDescription": "RxQ Occupancy - All Packets : Slot 2", + "EventCode": "0x32", + "EventName": "UNC_UPI_RxL_OCCUPANCY.SLOT2", + "PerPkg": "1", + "UMask": "0x4", + "Unit": "UPI" + }, + { + "BriefDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Bypass", + "EventCode": "0x04", + "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.NCB", + "PerPkg": "1", + "UMask": "0xe", + "Unit": "UPI" + }, + { + "BriefDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Bypass, Match Opcode", + "EventCode": "0x04", + "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.NCB_OPC", + "PerPkg": "1", + "UMask": "0x10e", + "Unit": "UPI" + }, + { + "BriefDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Standard", + "EventCode": "0x04", + "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.NCS", + "PerPkg": "1", + "UMask": "0xf", + "Unit": "UPI" + }, + { + "BriefDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Standard, Match Opcode", + "EventCode": "0x04", + "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.NCS_OPC", + "PerPkg": "1", + "UMask": "0x10f", + "Unit": "UPI" + }, + { + "BriefDescription": "Matches on Transmit path of a UPI Port : Request", + "EventCode": "0x04", + "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.REQ", + "PerPkg": "1", + "UMask": "0x8", + "Unit": "UPI" + }, + { + "BriefDescription": "Matches on Transmit path of a UPI Port : Request, Match Opcode", + "EventCode": "0x04", + "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.REQ_OPC", + "PerPkg": "1", + "UMask": "0x108", + "Unit": "UPI" + }, + { + "BriefDescription": "Matches on Transmit path of a UPI Port : Response - Conflict", + "EventCode": "0x04", + "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.RSPCNFLT", + "PerPkg": "1", + "UMask": "0x1aa", + "Unit": "UPI" + }, + { + "BriefDescription": "Matches on Transmit path of a UPI Port : Response - Invalid", + "EventCode": "0x04", + "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.RSPI", + "PerPkg": "1", + "UMask": "0x12a", + "Unit": "UPI" + }, + { + "BriefDescription": "Matches on Transmit path of a UPI Port : Response - Data", + "EventCode": "0x04", + "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.RSP_DATA", + "PerPkg": "1", + "UMask": "0xc", + "Unit": "UPI" + }, + { + "BriefDescription": "Matches on Transmit path of a UPI Port : Response - Data, Match Opcode", + "EventCode": "0x04", + "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.RSP_DATA_OPC", + "PerPkg": "1", + "UMask": "0x10c", + "Unit": "UPI" + }, + { + "BriefDescription": "Matches on Transmit path of a UPI Port : Response - No Data", + "EventCode": "0x04", + "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.RSP_NODATA", + "PerPkg": "1", + "UMask": "0xa", + "Unit": "UPI" + }, + { + "BriefDescription": "Matches on Transmit path of a UPI Port : Response - No Data, Match Opcode", + "EventCode": "0x04", + "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.RSP_NODATA_OPC", + "PerPkg": "1", + "UMask": "0x10a", + "Unit": "UPI" + }, + { + "BriefDescription": "Matches on Transmit path of a UPI Port : Snoop", + "EventCode": "0x04", + "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.SNP", + "PerPkg": "1", + "UMask": "0x9", + "Unit": "UPI" + }, + { + "BriefDescription": "Matches on Transmit path of a UPI Port : Snoop, Match Opcode", + "EventCode": "0x04", + "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.SNP_OPC", + "PerPkg": "1", + "UMask": "0x109", + "Unit": "UPI" + }, + { + "BriefDescription": "Matches on Transmit path of a UPI Port : Writeback", + "EventCode": "0x04", + "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.WB", + "PerPkg": "1", + "UMask": "0xd", + "Unit": "UPI" + }, + { + "BriefDescription": "Matches on Transmit path of a UPI Port : Writeback, Match Opcode", + "EventCode": "0x04", + "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.WB_OPC", + "PerPkg": "1", + "UMask": "0x10d", + "Unit": "UPI" + }, + { + "BriefDescription": "Valid Flits Sent : All Data : Counts number of data flits across this UPI link.", + "EventCode": "0x02", + "EventName": "UNC_UPI_TxL_FLITS.ALL_DATA", + "PerPkg": "1", + "UMask": "0xf", + "Unit": "UPI" + }, + { + "BriefDescription": "All Null Flits", + "EventCode": "0x02", + "EventName": "UNC_UPI_TxL_FLITS.ALL_NULL", + "PerPkg": "1", + "PublicDescription": "Valid Flits Sent : Idle", + "UMask": "0x27", + "Unit": "UPI" + }, + { + "BriefDescription": "Valid Flits Sent : Data : Shows legal flit time (hides impact of L0p and L0c). : Count Data Flits (which consume all slots), but how much to count is based on Slot0-2 mask, so count can be 0-3 depending on which slots are enabled for counting..", + "EventCode": "0x02", + "EventName": "UNC_UPI_TxL_FLITS.DATA", + "PerPkg": "1", + "UMask": "0x8", + "Unit": "UPI" + }, + { + "BriefDescription": "Valid Flits Sent : Idle : Shows legal flit time (hides impact of L0p and L0c).", + "EventCode": "0x02", + "EventName": "UNC_UPI_TxL_FLITS.IDLE", + "PerPkg": "1", + "UMask": "0x47", + "Unit": "UPI" + }, + { + "BriefDescription": "Valid Flits Sent : LLCRD Not Empty : Shows legal flit time (hides impact of L0p and L0c). : Enables counting of LLCRD (with non-zero payload). This only applies to slot 2 since LLCRD is only allowed in slot 2", + "EventCode": "0x02", + "EventName": "UNC_UPI_TxL_FLITS.LLCRD", + "PerPkg": "1", + "UMask": "0x10", + "Unit": "UPI" + }, + { + "BriefDescription": "Valid Flits Sent : LLCTRL : Shows legal flit time (hides impact of L0p and L0c). : Equivalent to an idle packet. Enables counting of slot 0 LLCTRL messages.", + "EventCode": "0x02", + "EventName": "UNC_UPI_TxL_FLITS.LLCTRL", + "PerPkg": "1", + "UMask": "0x40", + "Unit": "UPI" + }, + { + "BriefDescription": "Valid Flits Sent : All Non Data : Shows legal flit time (hides impact of L0p and L0c).", + "EventCode": "0x02", + "EventName": "UNC_UPI_TxL_FLITS.NON_DATA", + "PerPkg": "1", + "PublicDescription": "Valid Flits Sent : Null FLITs transmitted to any slot", + "UMask": "0x97", + "Unit": "UPI" + }, + { + "BriefDescription": "Valid Flits Sent : Slot NULL or LLCRD Empty : Shows legal flit time (hides impact of L0p and L0c). : LLCRD with all zeros is treated as NULL. Slot 1 is not treated as NULL if slot 0 is a dual slot. This can apply to slot 0,1, or 2.", + "EventCode": "0x02", + "EventName": "UNC_UPI_TxL_FLITS.NULL", + "PerPkg": "1", + "UMask": "0x20", + "Unit": "UPI" + }, + { + "BriefDescription": "Valid Flits Sent : Protocol Header : Shows legal flit time (hides impact of L0p and L0c). : Enables count of protocol headers in slot 0,1,2 (depending on slot uMask bits)", + "EventCode": "0x02", + "EventName": "UNC_UPI_TxL_FLITS.PROTHDR", + "PerPkg": "1", + "UMask": "0x80", + "Unit": "UPI" + }, + { + "BriefDescription": "Valid Flits Sent : Slot 0 : Shows legal flit time (hides impact of L0p and L0c). : Count Slot 0 - Other mask bits determine types of headers to count.", + "EventCode": "0x02", + "EventName": "UNC_UPI_TxL_FLITS.SLOT0", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "UPI" + }, + { + "BriefDescription": "Valid Flits Sent : Slot 1 : Shows legal flit time (hides impact of L0p and L0c). : Count Slot 1 - Other mask bits determine types of headers to count.", + "EventCode": "0x02", + "EventName": "UNC_UPI_TxL_FLITS.SLOT1", + "PerPkg": "1", + "UMask": "0x2", + "Unit": "UPI" + }, + { + "BriefDescription": "Valid Flits Sent : Slot 2 : Shows legal flit time (hides impact of L0p and L0c). : Count Slot 2 - Other mask bits determine types of headers to count.", + "EventCode": "0x02", + "EventName": "UNC_UPI_TxL_FLITS.SLOT2", + "PerPkg": "1", + "UMask": "0x4", + "Unit": "UPI" + }, + { + "BriefDescription": "Tx Flit Buffer Allocations : Number of allocations into the UPI Tx Flit Buffer. Generally, when data is transmitted across UPI, it will bypass the TxQ and pass directly to the link. However, the TxQ will be used with L0p and when LLR occurs, increasing latency to transfer out to the link. This event can be used in conjunction with the Flit Buffer Occupancy event in order to calculate the average flit buffer lifetime.", + "EventCode": "0x40", + "EventName": "UNC_UPI_TxL_INSERTS", + "PerPkg": "1", + "Unit": "UPI" + }, + { + "BriefDescription": "Tx Flit Buffer Occupancy : Accumulates the number of flits in the TxQ. Generally, when data is transmitted across UPI, it will bypass the TxQ and pass directly to the link. However, the TxQ will be used with L0p and when LLR occurs, increasing latency to transfer out to the link. This can be used with the cycles not empty event to track average occupancy, or the allocations event to track average lifetime in the TxQ.", + "EventCode": "0x42", + "EventName": "UNC_UPI_TxL_OCCUPANCY", + "PerPkg": "1", + "Unit": "UPI" + } +] diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/uncore-io.json b/tools/perf/pmu-events/arch/x86/sierraforest/uncore-io.json new file mode 100644 index 0000000000..9495cb0f68 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/sierraforest/uncore-io.json @@ -0,0 +1,1634 @@ +[ + { + "BriefDescription": "IIO Clockticks", + "EventCode": "0x01", + "EventName": "UNC_IIO_CLOCKTICKS", + "PerPkg": "1", + "PortMask": "0x000", + "Unit": "IIO" + }, + { + "BriefDescription": "PCIE Completion Buffer Inserts. Counts once per 64 byte read issued from this PCIE device.", + "EventCode": "0xC2", + "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.ALL_PARTS", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x70ff004", + "Unit": "IIO" + }, + { + "BriefDescription": "PCIE Completion Buffer Inserts. Counts once per 64 byte read issued from this PCIE device.", + "EventCode": "0xC2", + "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.PART0", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x001", + "UMask": "0x7001004", + "Unit": "IIO" + }, + { + "BriefDescription": "PCIE Completion Buffer Inserts. Counts once per 64 byte read issued from this PCIE device.", + "EventCode": "0xC2", + "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.PART1", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x002", + "UMask": "0x7002004", + "Unit": "IIO" + }, + { + "BriefDescription": "PCIE Completion Buffer Inserts. Counts once per 64 byte read issued from this PCIE device.", + "EventCode": "0xC2", + "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.PART2", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x004", + "UMask": "0x7004004", + "Unit": "IIO" + }, + { + "BriefDescription": "PCIE Completion Buffer Inserts. Counts once per 64 byte read issued from this PCIE device.", + "EventCode": "0xC2", + "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.PART3", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x008", + "UMask": "0x7008004", + "Unit": "IIO" + }, + { + "BriefDescription": "PCIE Completion Buffer Inserts. Counts once per 64 byte read issued from this PCIE device.", + "EventCode": "0xC2", + "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.PART4", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x010", + "UMask": "0x7010004", + "Unit": "IIO" + }, + { + "BriefDescription": "PCIE Completion Buffer Inserts. Counts once per 64 byte read issued from this PCIE device.", + "EventCode": "0xC2", + "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.PART5", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x020", + "UMask": "0x7020004", + "Unit": "IIO" + }, + { + "BriefDescription": "PCIE Completion Buffer Inserts. Counts once per 64 byte read issued from this PCIE device.", + "EventCode": "0xC2", + "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.PART6", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x040", + "UMask": "0x7040004", + "Unit": "IIO" + }, + { + "BriefDescription": "PCIE Completion Buffer Inserts. Counts once per 64 byte read issued from this PCIE device.", + "EventCode": "0xC2", + "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.PART7", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x080", + "UMask": "0x7080004", + "Unit": "IIO" + }, + { + "BriefDescription": "Count of allocations in the completion buffer", + "EventCode": "0xD5", + "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.ALL_PARTS", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x70ff0ff", + "Unit": "IIO" + }, + { + "BriefDescription": "Count of allocations in the completion buffer", + "EventCode": "0xD5", + "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.PART0", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x001", + "UMask": "0x7001001", + "Unit": "IIO" + }, + { + "BriefDescription": "Count of allocations in the completion buffer", + "EventCode": "0xD5", + "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.PART1", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x002", + "UMask": "0x7002002", + "Unit": "IIO" + }, + { + "BriefDescription": "Count of allocations in the completion buffer", + "EventCode": "0xD5", + "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.PART2", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x004", + "UMask": "0x7004004", + "Unit": "IIO" + }, + { + "BriefDescription": "Count of allocations in the completion buffer", + "EventCode": "0xD5", + "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.PART3", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x008", + "UMask": "0x7008008", + "Unit": "IIO" + }, + { + "BriefDescription": "Count of allocations in the completion buffer", + "EventCode": "0xD5", + "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.PART4", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x010", + "UMask": "0x7010010", + "Unit": "IIO" + }, + { + "BriefDescription": "Count of allocations in the completion buffer", + "EventCode": "0xD5", + "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.PART5", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x020", + "UMask": "0x7020020", + "Unit": "IIO" + }, + { + "BriefDescription": "Count of allocations in the completion buffer", + "EventCode": "0xD5", + "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.PART6", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x040", + "UMask": "0x7040040", + "Unit": "IIO" + }, + { + "BriefDescription": "Count of allocations in the completion buffer", + "EventCode": "0xD5", + "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.PART7", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x080", + "UMask": "0x7080080", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.ALL_PARTS", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x70ff004", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART0", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x001", + "UMask": "0x7001004", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART1", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x002", + "UMask": "0x7002004", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART2", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x004", + "UMask": "0x7004004", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART3", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x008", + "UMask": "0x7008004", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART4", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x010", + "UMask": "0x7010004", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART5", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x020", + "UMask": "0x7020004", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART6", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x040", + "UMask": "0x7040004", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART7", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x080", + "UMask": "0x7080004", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.ALL_PARTS", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x70ff001", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART0", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x001", + "UMask": "0x7001001", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART1", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x002", + "UMask": "0x7002001", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART2", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x004", + "UMask": "0x7004001", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART3", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x008", + "UMask": "0x7008001", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART4", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x010", + "UMask": "0x7010001", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART5", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x020", + "UMask": "0x7020001", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART6", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x040", + "UMask": "0x7040001", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART7", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x080", + "UMask": "0x7080001", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Another card (different IIO stack) reading from this card.", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.PEER_READ.ALL_PARTS", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x70ff008", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested by the CPU : Another card (different IIO stack) writing to this card.", + "EventCode": "0xC0", + "EventName": "UNC_IIO_DATA_REQ_BY_CPU.PEER_WRITE.ALL_PARTS", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x70ff002", + "Unit": "IIO" + }, + { + "BriefDescription": "Counts once for every 4 bytes read from this card to memory. This event does include reads to IO.", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.ALL_PARTS", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x70ff004", + "Unit": "IIO" + }, + { + "BriefDescription": "Four byte data request of the CPU : Card reading from DRAM", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART0", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x001", + "UMask": "0x7001004", + "Unit": "IIO" + }, + { + "BriefDescription": "Four byte data request of the CPU : Card reading from DRAM", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART1", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x02", + "UMask": "0x7002004", + "Unit": "IIO" + }, + { + "BriefDescription": "Four byte data request of the CPU : Card reading from DRAM", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART2", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x04", + "UMask": "0x7004004", + "Unit": "IIO" + }, + { + "BriefDescription": "Four byte data request of the CPU : Card reading from DRAM", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART3", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x08", + "UMask": "0x7008004", + "Unit": "IIO" + }, + { + "BriefDescription": "Four byte data request of the CPU : Card reading from DRAM", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART4", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x10", + "UMask": "0x7010004", + "Unit": "IIO" + }, + { + "BriefDescription": "Four byte data request of the CPU : Card reading from DRAM", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART5", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x20", + "UMask": "0x7020004", + "Unit": "IIO" + }, + { + "BriefDescription": "Four byte data request of the CPU : Card reading from DRAM", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART6", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x40", + "UMask": "0x7040004", + "Unit": "IIO" + }, + { + "BriefDescription": "Four byte data request of the CPU : Card reading from DRAM", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART7", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x80", + "UMask": "0x7080004", + "Unit": "IIO" + }, + { + "BriefDescription": "Counts once for every 4 bytes written from this card to memory. This event does include writes to IO.", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.ALL_PARTS", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x70ff001", + "Unit": "IIO" + }, + { + "BriefDescription": "Four byte data request of the CPU : Card writing to DRAM", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART0", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x001", + "UMask": "0x7001001", + "Unit": "IIO" + }, + { + "BriefDescription": "Four byte data request of the CPU : Card writing to DRAM", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART1", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x02", + "UMask": "0x7002001", + "Unit": "IIO" + }, + { + "BriefDescription": "Four byte data request of the CPU : Card writing to DRAM", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART2", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x04", + "UMask": "0x7004001", + "Unit": "IIO" + }, + { + "BriefDescription": "Four byte data request of the CPU : Card writing to DRAM", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART3", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x08", + "UMask": "0x7008001", + "Unit": "IIO" + }, + { + "BriefDescription": "Four byte data request of the CPU : Card writing to DRAM", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART4", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x10", + "UMask": "0x7010001", + "Unit": "IIO" + }, + { + "BriefDescription": "Four byte data request of the CPU : Card writing to DRAM", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART5", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x20", + "UMask": "0x7020001", + "Unit": "IIO" + }, + { + "BriefDescription": "Four byte data request of the CPU : Card writing to DRAM", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART6", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x40", + "UMask": "0x7040001", + "Unit": "IIO" + }, + { + "BriefDescription": "Four byte data request of the CPU : Card writing to DRAM", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART7", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x80", + "UMask": "0x7080001", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested of the CPU : Card reading from another Card (same or different stack)", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_READ.PART0", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x001", + "UMask": "0x7001008", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested of the CPU : Card reading from another Card (same or different stack)", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_READ.PART1", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x002", + "UMask": "0x7002008", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested of the CPU : Card reading from another Card (same or different stack)", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_READ.PART2", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x004", + "UMask": "0x7004008", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested of the CPU : Card reading from another Card (same or different stack)", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_READ.PART3", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x008", + "UMask": "0x7008008", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested of the CPU : Card reading from another Card (same or different stack)", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_READ.PART4", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x010", + "UMask": "0x7010008", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested of the CPU : Card reading from another Card (same or different stack)", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_READ.PART5", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x020", + "UMask": "0x7020008", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested of the CPU : Card reading from another Card (same or different stack)", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_READ.PART6", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x040", + "UMask": "0x7040008", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested of the CPU : Card reading from another Card (same or different stack)", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_READ.PART7", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x080", + "UMask": "0x7080008", + "Unit": "IIO" + }, + { + "BriefDescription": "Counts once for every 4 bytes written from this card to a peer device's IO space.", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_WRITE.ALL_PARTS", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x70ff002", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested of the CPU : Card writing to another Card (same or different stack)", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_WRITE.PART0", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x001", + "UMask": "0x7001002", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested of the CPU : Card writing to another Card (same or different stack)", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_WRITE.PART1", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x002", + "UMask": "0x7002002", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested of the CPU : Card writing to another Card (same or different stack)", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_WRITE.PART2", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x004", + "UMask": "0x7004002", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested of the CPU : Card writing to another Card (same or different stack)", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_WRITE.PART3", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x008", + "UMask": "0x7008002", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested of the CPU : Card writing to another Card (same or different stack)", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_WRITE.PART4", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x010", + "UMask": "0x7010002", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested of the CPU : Card writing to another Card (same or different stack)", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_WRITE.PART5", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x020", + "UMask": "0x7020002", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested of the CPU : Card writing to another Card (same or different stack)", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_WRITE.PART6", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x040", + "UMask": "0x7040002", + "Unit": "IIO" + }, + { + "BriefDescription": "Data requested of the CPU : Card writing to another Card (same or different stack)", + "EventCode": "0x83", + "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_WRITE.PART7", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x080", + "UMask": "0x7080002", + "Unit": "IIO" + }, + { + "BriefDescription": "IOTLB Hits to a 1G Page", + "EventCode": "0x40", + "EventName": "UNC_IIO_IOMMU0.1G_HITS", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x10", + "Unit": "IIO" + }, + { + "BriefDescription": "IOTLB Hits to a 2M Page", + "EventCode": "0x40", + "EventName": "UNC_IIO_IOMMU0.2M_HITS", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x8", + "Unit": "IIO" + }, + { + "BriefDescription": "IOTLB Hits to a 4K Page", + "EventCode": "0x40", + "EventName": "UNC_IIO_IOMMU0.4K_HITS", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x4", + "Unit": "IIO" + }, + { + "BriefDescription": "IOTLB lookups all", + "EventCode": "0x40", + "EventName": "UNC_IIO_IOMMU0.ALL_LOOKUPS", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x2", + "Unit": "IIO" + }, + { + "BriefDescription": "Context cache hits", + "EventCode": "0x40", + "EventName": "UNC_IIO_IOMMU0.CTXT_CACHE_HITS", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x80", + "Unit": "IIO" + }, + { + "BriefDescription": "Context cache lookups", + "EventCode": "0x40", + "EventName": "UNC_IIO_IOMMU0.CTXT_CACHE_LOOKUPS", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x40", + "Unit": "IIO" + }, + { + "BriefDescription": "IOTLB lookups first", + "EventCode": "0x40", + "EventName": "UNC_IIO_IOMMU0.FIRST_LOOKUPS", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x1", + "Unit": "IIO" + }, + { + "BriefDescription": "IOTLB Fills (same as IOTLB miss)", + "EventCode": "0x40", + "EventName": "UNC_IIO_IOMMU0.MISSES", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x20", + "Unit": "IIO" + }, + { + "BriefDescription": "IOMMU memory access (both low and high priority)", + "EventCode": "0x41", + "EventName": "UNC_IIO_IOMMU1.NUM_MEM_ACCESSES", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0xc0", + "Unit": "IIO" + }, + { + "BriefDescription": "IOMMU high priority memory access", + "EventCode": "0x41", + "EventName": "UNC_IIO_IOMMU1.NUM_MEM_ACCESSES_HIGH", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x80", + "Unit": "IIO" + }, + { + "BriefDescription": "IOMMU low priority memory access", + "EventCode": "0x41", + "EventName": "UNC_IIO_IOMMU1.NUM_MEM_ACCESSES_LOW", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x40", + "Unit": "IIO" + }, + { + "BriefDescription": "Second Level Page Walk Cache Hit to a 1G page", + "EventCode": "0x41", + "EventName": "UNC_IIO_IOMMU1.SLPWC_1G_HITS", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x4", + "Unit": "IIO" + }, + { + "BriefDescription": "Second Level Page Walk Cache Hit to a 256T page", + "EventCode": "0x41", + "EventName": "UNC_IIO_IOMMU1.SLPWC_256T_HITS", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x10", + "Unit": "IIO" + }, + { + "BriefDescription": "Second Level Page Walk Cache Hit to a 2M page", + "EventCode": "0x41", + "EventName": "UNC_IIO_IOMMU1.SLPWC_2M_HITS", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x2", + "Unit": "IIO" + }, + { + "BriefDescription": "Second Level Page Walk Cache Hit to a 512G page", + "EventCode": "0x41", + "EventName": "UNC_IIO_IOMMU1.SLPWC_512G_HITS", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x8", + "Unit": "IIO" + }, + { + "BriefDescription": "Second Level Page Walk Cache fill", + "EventCode": "0x41", + "EventName": "UNC_IIO_IOMMU1.SLPWC_CACHE_FILLS", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x20", + "Unit": "IIO" + }, + { + "BriefDescription": "Second Level Page Walk Cache lookup", + "EventCode": "0x41", + "EventName": "UNC_IIO_IOMMU1.SLPWC_CACHE_LOOKUPS", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x1", + "Unit": "IIO" + }, + { + "BriefDescription": "Cycles PWT full", + "EventCode": "0x43", + "EventName": "UNC_IIO_IOMMU3.CYC_PWT_FULL", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x2", + "Unit": "IIO" + }, + { + "BriefDescription": "Interrupt Entry cache hit", + "EventCode": "0x43", + "EventName": "UNC_IIO_IOMMU3.INT_CACHE_HITS", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x80", + "Unit": "IIO" + }, + { + "BriefDescription": "Interrupt Entry cache lookup", + "EventCode": "0x43", + "EventName": "UNC_IIO_IOMMU3.INT_CACHE_LOOKUPS", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x40", + "Unit": "IIO" + }, + { + "BriefDescription": "Context Cache invalidation events", + "EventCode": "0x43", + "EventName": "UNC_IIO_IOMMU3.NUM_INVAL_CTXT_CACHE", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x8", + "Unit": "IIO" + }, + { + "BriefDescription": "Interrupt Entry Cache invalidation events", + "EventCode": "0x43", + "EventName": "UNC_IIO_IOMMU3.NUM_INVAL_INT_CACHE", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x20", + "Unit": "IIO" + }, + { + "BriefDescription": "IOTLB invalidation events", + "EventCode": "0x43", + "EventName": "UNC_IIO_IOMMU3.NUM_INVAL_IOTLB", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x4", + "Unit": "IIO" + }, + { + "BriefDescription": "PASID Cache invalidation events", + "EventCode": "0x43", + "EventName": "UNC_IIO_IOMMU3.NUM_INVAL_PASID_CACHE", + "PerPkg": "1", + "PortMask": "0x000", + "UMask": "0x10", + "Unit": "IIO" + }, + { + "BriefDescription": "Occupancy of outbound request queue : To device : Counts number of outbound requests/completions IIO is currently processing", + "EventCode": "0xc5", + "EventName": "UNC_IIO_NUM_OUSTANDING_REQ_FROM_CPU.TO_IO", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x70ff008", + "Unit": "IIO" + }, + { + "BriefDescription": "Passing data to be written", + "EventCode": "0x88", + "EventName": "UNC_IIO_NUM_OUTSTANDING_REQ_OF_CPU.DATA", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x700f020", + "Unit": "IIO" + }, + { + "BriefDescription": "Issuing final read or write of line", + "EventCode": "0x88", + "EventName": "UNC_IIO_NUM_OUTSTANDING_REQ_OF_CPU.FINAL_RD_WR", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x700f008", + "Unit": "IIO" + }, + { + "BriefDescription": "Processing response from IOMMU", + "EventCode": "0x88", + "EventName": "UNC_IIO_NUM_OUTSTANDING_REQ_OF_CPU.IOMMU_HIT", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x700f002", + "Unit": "IIO" + }, + { + "BriefDescription": "Issuing to IOMMU", + "EventCode": "0x88", + "EventName": "UNC_IIO_NUM_OUTSTANDING_REQ_OF_CPU.IOMMU_REQ", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x700f001", + "Unit": "IIO" + }, + { + "BriefDescription": "Request Ownership", + "EventCode": "0x88", + "EventName": "UNC_IIO_NUM_OUTSTANDING_REQ_OF_CPU.REQ_OWN", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x700f004", + "Unit": "IIO" + }, + { + "BriefDescription": "Writing line", + "EventCode": "0x88", + "EventName": "UNC_IIO_NUM_OUTSTANDING_REQ_OF_CPU.WR", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x700f010", + "Unit": "IIO" + }, + { + "BriefDescription": "-", + "EventCode": "0x8e", + "EventName": "UNC_IIO_NUM_REQ_OF_CPU_BY_TGT.ABORT", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x70ff080", + "Unit": "IIO" + }, + { + "BriefDescription": "-", + "EventCode": "0x8e", + "EventName": "UNC_IIO_NUM_REQ_OF_CPU_BY_TGT.CONFINED_P2P", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x70ff040", + "Unit": "IIO" + }, + { + "BriefDescription": "-", + "EventCode": "0x8e", + "EventName": "UNC_IIO_NUM_REQ_OF_CPU_BY_TGT.LOC_P2P", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x70ff020", + "Unit": "IIO" + }, + { + "BriefDescription": "-", + "EventCode": "0x8e", + "EventName": "UNC_IIO_NUM_REQ_OF_CPU_BY_TGT.MCAST", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x70ff002", + "Unit": "IIO" + }, + { + "BriefDescription": "-", + "EventCode": "0x8e", + "EventName": "UNC_IIO_NUM_REQ_OF_CPU_BY_TGT.MEM", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x70ff008", + "Unit": "IIO" + }, + { + "BriefDescription": "-", + "EventCode": "0x8e", + "EventName": "UNC_IIO_NUM_REQ_OF_CPU_BY_TGT.MSGB", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x70ff001", + "Unit": "IIO" + }, + { + "BriefDescription": "-", + "EventCode": "0x8e", + "EventName": "UNC_IIO_NUM_REQ_OF_CPU_BY_TGT.REM_P2P", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x70ff010", + "Unit": "IIO" + }, + { + "BriefDescription": "-", + "EventCode": "0x8e", + "EventName": "UNC_IIO_NUM_REQ_OF_CPU_BY_TGT.UBOX", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x70ff004", + "Unit": "IIO" + }, + { + "BriefDescription": "All 9 bits of Page Walk Tracker Occupancy", + "EventCode": "0x42", + "EventName": "UNC_IIO_PWT_OCCUPANCY", + "PerPkg": "1", + "PortMask": "0x000", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Core reading from Cards MMIO space", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.ALL_PARTS", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x70ff004", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Core reading from Cards MMIO space", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART0", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x001", + "UMask": "0x7001004", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Core reading from Cards MMIO space", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART1", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x002", + "UMask": "0x7002004", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Core reading from Cards MMIO space", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART2", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x004", + "UMask": "0x7004004", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Core reading from Cards MMIO space", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART3", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x008", + "UMask": "0x7008004", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Core reading from Cards MMIO space", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART4", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x010", + "UMask": "0x7010004", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Core reading from Cards MMIO space", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART5", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x020", + "UMask": "0x7020004", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Core reading from Cards MMIO space", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART6", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x040", + "UMask": "0x7040004", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Core reading from Cards MMIO space", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART7", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x080", + "UMask": "0x7080004", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.ALL_PARTS", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x70ff001", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART0", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x001", + "UMask": "0x7001001", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART1", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x002", + "UMask": "0x7002001", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART2", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x004", + "UMask": "0x7004001", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART3", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x008", + "UMask": "0x7008001", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART4", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x010", + "UMask": "0x7010001", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART5", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x020", + "UMask": "0x7020001", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART6", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x040", + "UMask": "0x7040001", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Core writing to Cards MMIO space", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART7", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x080", + "UMask": "0x7080001", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Another card (different IIO stack) reading from this card.", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.PEER_READ.ALL_PARTS", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x70ff008", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested by the CPU : Another card (different IIO stack) writing to this card.", + "EventCode": "0xC1", + "EventName": "UNC_IIO_TXN_REQ_BY_CPU.PEER_WRITE.ALL_PARTS", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x0FF", + "UMask": "0x70ff002", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card reading from DRAM", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART0", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x001", + "UMask": "0x7001004", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card reading from DRAM", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART1", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x002", + "UMask": "0x7002004", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card reading from DRAM", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART2", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x004", + "UMask": "0x7004004", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card reading from DRAM", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART3", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x008", + "UMask": "0x7008004", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card reading from DRAM", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART4", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x010", + "UMask": "0x7010004", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card reading from DRAM", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART5", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x020", + "UMask": "0x7020004", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card reading from DRAM", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART6", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x040", + "UMask": "0x7040004", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card reading from DRAM", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART7", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x080", + "UMask": "0x7080004", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card writing to DRAM", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART0", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x001", + "UMask": "0x7001001", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card writing to DRAM", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART1", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x002", + "UMask": "0x7002001", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card writing to DRAM", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART2", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x004", + "UMask": "0x7004001", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card writing to DRAM", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART3", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x008", + "UMask": "0x7008001", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card writing to DRAM", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART4", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x010", + "UMask": "0x7010001", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card writing to DRAM", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART5", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x020", + "UMask": "0x7020001", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card writing to DRAM", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART6", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x040", + "UMask": "0x7040001", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card writing to DRAM", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART7", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x080", + "UMask": "0x7080001", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card reading from another Card (same or different stack)", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_READ.PART0", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x001", + "UMask": "0x7001008", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card reading from another Card (same or different stack)", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_READ.PART1", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x002", + "UMask": "0x7002008", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card reading from another Card (same or different stack)", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_READ.PART2", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x004", + "UMask": "0x7004008", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card reading from another Card (same or different stack)", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_READ.PART3", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x008", + "UMask": "0x7008008", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card reading from another Card (same or different stack)", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_READ.PART4", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x010", + "UMask": "0x7010008", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card reading from another Card (same or different stack)", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_READ.PART5", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x020", + "UMask": "0x7020008", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card reading from another Card (same or different stack)", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_READ.PART6", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x040", + "UMask": "0x7040008", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card reading from another Card (same or different stack)", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_READ.PART7", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x080", + "UMask": "0x7080008", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card writing to another Card (same or different stack)", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_WRITE.PART0", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x001", + "UMask": "0x7001002", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card writing to another Card (same or different stack)", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_WRITE.PART1", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x002", + "UMask": "0x7002002", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card writing to another Card (same or different stack)", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_WRITE.PART2", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x004", + "UMask": "0x7004002", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card writing to another Card (same or different stack)", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_WRITE.PART3", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x008", + "UMask": "0x7008002", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card writing to another Card (same or different stack)", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_WRITE.PART4", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x010", + "UMask": "0x7010002", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card writing to another Card (same or different stack)", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_WRITE.PART5", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x020", + "UMask": "0x7020002", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card writing to another Card (same or different stack)", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_WRITE.PART6", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x040", + "UMask": "0x7040002", + "Unit": "IIO" + }, + { + "BriefDescription": "Number Transactions requested of the CPU : Card writing to another Card (same or different stack)", + "EventCode": "0x84", + "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_WRITE.PART7", + "FCMask": "0x07", + "PerPkg": "1", + "PortMask": "0x080", + "UMask": "0x7080002", + "Unit": "IIO" + } +] diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/uncore-memory.json b/tools/perf/pmu-events/arch/x86/sierraforest/uncore-memory.json new file mode 100644 index 0000000000..a2405ed640 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/sierraforest/uncore-memory.json @@ -0,0 +1,385 @@ +[ + { + "BriefDescription": "DRAM Activate Count : Counts the number of DRAM Activate commands sent on this channel. Activate commands are issued to open up a page on the DRAM devices so that it can be read or written to with a CAS. One can calculate the number of Page Misses by subtracting the number of Page Miss precharges from the number of Activates.", + "EventCode": "0x02", + "EventName": "UNC_M_ACT_COUNT.ALL", + "PerPkg": "1", + "UMask": "0xf7", + "Unit": "IMC" + }, + { + "BriefDescription": "DRAM Activate Count : Read transaction on Page Empty or Page Miss : Counts the number of DRAM Activate commands sent on this channel. Activate commands are issued to open up a page on the DRAM devices so that it can be read or written to with a CAS. One can calculate the number of Page Misses by subtracting the number of Page Miss precharges from the number of Activates.", + "EventCode": "0x02", + "EventName": "UNC_M_ACT_COUNT.RD", + "PerPkg": "1", + "UMask": "0xf1", + "Unit": "IMC" + }, + { + "BriefDescription": "DRAM Activate Count : Underfill Read transaction on Page Empty or Page Miss : Counts the number of DRAM Activate commands sent on this channel. Activate commands are issued to open up a page on the DRAM devices so that it can be read or written to with a CAS. One can calculate the number of Page Misses by subtracting the number of Page Miss precharges from the number of Activates.", + "EventCode": "0x02", + "EventName": "UNC_M_ACT_COUNT.UFILL", + "PerPkg": "1", + "UMask": "0xf4", + "Unit": "IMC" + }, + { + "BriefDescription": "DRAM Activate Count : Write transaction on Page Empty or Page Miss : Counts the number of DRAM Activate commands sent on this channel. Activate commands are issued to open up a page on the DRAM devices so that it can be read or written to with a CAS. One can calculate the number of Page Misses by subtracting the number of Page Miss precharges from the number of Activates.", + "EventCode": "0x02", + "EventName": "UNC_M_ACT_COUNT.WR", + "PerPkg": "1", + "UMask": "0xf2", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 0, all CAS operations", + "EventCode": "0x05", + "EventName": "UNC_M_CAS_COUNT_SCH0.ALL", + "PerPkg": "1", + "UMask": "0xff", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 0, all reads", + "EventCode": "0x05", + "EventName": "UNC_M_CAS_COUNT_SCH0.RD", + "PerPkg": "1", + "UMask": "0xcf", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 0 regular reads", + "EventCode": "0x05", + "EventName": "UNC_M_CAS_COUNT_SCH0.RD_REG", + "PerPkg": "1", + "UMask": "0xc1", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 0 underfill reads", + "EventCode": "0x05", + "EventName": "UNC_M_CAS_COUNT_SCH0.RD_UNDERFILL", + "PerPkg": "1", + "UMask": "0xc4", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 0, all writes", + "EventCode": "0x05", + "EventName": "UNC_M_CAS_COUNT_SCH0.WR", + "PerPkg": "1", + "UMask": "0xf0", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 0 regular writes", + "EventCode": "0x05", + "EventName": "UNC_M_CAS_COUNT_SCH0.WR_NONPRE", + "PerPkg": "1", + "UMask": "0xd0", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 0 auto-precharge writes", + "EventCode": "0x05", + "EventName": "UNC_M_CAS_COUNT_SCH0.WR_PRE", + "PerPkg": "1", + "UMask": "0xe0", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 1, all CAS operations", + "EventCode": "0x06", + "EventName": "UNC_M_CAS_COUNT_SCH1.ALL", + "PerPkg": "1", + "UMask": "0xff", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 1, all reads", + "EventCode": "0x06", + "EventName": "UNC_M_CAS_COUNT_SCH1.RD", + "PerPkg": "1", + "UMask": "0xcf", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 1 regular reads", + "EventCode": "0x06", + "EventName": "UNC_M_CAS_COUNT_SCH1.RD_REG", + "PerPkg": "1", + "UMask": "0xc1", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 1 underfill reads", + "EventCode": "0x06", + "EventName": "UNC_M_CAS_COUNT_SCH1.RD_UNDERFILL", + "PerPkg": "1", + "UMask": "0xc4", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 1, all writes", + "EventCode": "0x06", + "EventName": "UNC_M_CAS_COUNT_SCH1.WR", + "PerPkg": "1", + "UMask": "0xf0", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 1 regular writes", + "EventCode": "0x06", + "EventName": "UNC_M_CAS_COUNT_SCH1.WR_NONPRE", + "PerPkg": "1", + "UMask": "0xd0", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 1 auto-precharge writes", + "EventCode": "0x06", + "EventName": "UNC_M_CAS_COUNT_SCH1.WR_PRE", + "PerPkg": "1", + "UMask": "0xe0", + "Unit": "IMC" + }, + { + "BriefDescription": "Number of DRAM DCLK clock cycles while the event is enabled", + "EventCode": "0x01", + "EventName": "UNC_M_CLOCKTICKS", + "PerPkg": "1", + "PublicDescription": "DRAM Clockticks", + "UMask": "0x1", + "Unit": "IMC" + }, + { + "BriefDescription": "Number of DRAM HCLK clock cycles while the event is enabled", + "EventCode": "0x01", + "EventName": "UNC_M_HCLOCKTICKS", + "PerPkg": "1", + "PublicDescription": "DRAM Clockticks", + "Unit": "IMC" + }, + { + "BriefDescription": "DRAM Precharge commands. : Counts the number of DRAM Precharge commands sent on this channel.", + "EventCode": "0x03", + "EventName": "UNC_M_PRE_COUNT.ALL", + "PerPkg": "1", + "UMask": "0xff", + "Unit": "IMC" + }, + { + "BriefDescription": "DRAM Precharge commands. : Precharge due to (?) : Counts the number of DRAM Precharge commands sent on this channel.", + "EventCode": "0x03", + "EventName": "UNC_M_PRE_COUNT.PGT", + "PerPkg": "1", + "UMask": "0xf8", + "Unit": "IMC" + }, + { + "BriefDescription": "DRAM Precharge commands. : Counts the number of DRAM Precharge commands sent on this channel.", + "EventCode": "0x03", + "EventName": "UNC_M_PRE_COUNT.RD", + "PerPkg": "1", + "UMask": "0xf1", + "Unit": "IMC" + }, + { + "BriefDescription": "DRAM Precharge commands. : Counts the number of DRAM Precharge commands sent on this channel.", + "EventCode": "0x03", + "EventName": "UNC_M_PRE_COUNT.UFILL", + "PerPkg": "1", + "UMask": "0xf4", + "Unit": "IMC" + }, + { + "BriefDescription": "DRAM Precharge commands. : Counts the number of DRAM Precharge commands sent on this channel.", + "EventCode": "0x03", + "EventName": "UNC_M_PRE_COUNT.WR", + "PerPkg": "1", + "UMask": "0xf2", + "Unit": "IMC" + }, + { + "BriefDescription": "Read buffer inserts on subchannel 0", + "EventCode": "0x17", + "EventName": "UNC_M_RDB_INSERTS.SCH0", + "PerPkg": "1", + "UMask": "0x40", + "Unit": "IMC" + }, + { + "BriefDescription": "Read buffer inserts on subchannel 1", + "EventCode": "0x17", + "EventName": "UNC_M_RDB_INSERTS.SCH1", + "PerPkg": "1", + "UMask": "0x80", + "Unit": "IMC" + }, + { + "BriefDescription": "Read buffer occupancy on subchannel 0", + "EventCode": "0x1a", + "EventName": "UNC_M_RDB_OCCUPANCY_SCH0", + "PerPkg": "1", + "Unit": "IMC" + }, + { + "BriefDescription": "Read buffer occupancy on subchannel 1", + "EventCode": "0x1b", + "EventName": "UNC_M_RDB_OCCUPANCY_SCH1", + "PerPkg": "1", + "Unit": "IMC" + }, + { + "BriefDescription": "Read Pending Queue Allocations : Counts the number of allocations into the Read Pending Queue. This queue is used to schedule reads out to the memory controller and to track the requests. Requests allocate into the RPQ soon after they enter the memory controller, and need credits for an entry in this buffer before being sent from the HA to the iMC. They deallocate after the CAS command has been issued to memory. This includes both ISOCH and non-ISOCH requests.", + "EventCode": "0x10", + "EventName": "UNC_M_RPQ_INSERTS.PCH0", + "PerPkg": "1", + "UMask": "0x50", + "Unit": "IMC" + }, + { + "BriefDescription": "Read Pending Queue Allocations : Counts the number of allocations into the Read Pending Queue. This queue is used to schedule reads out to the memory controller and to track the requests. Requests allocate into the RPQ soon after they enter the memory controller, and need credits for an entry in this buffer before being sent from the HA to the iMC. They deallocate after the CAS command has been issued to memory. This includes both ISOCH and non-ISOCH requests.", + "EventCode": "0x10", + "EventName": "UNC_M_RPQ_INSERTS.PCH1", + "PerPkg": "1", + "UMask": "0xa0", + "Unit": "IMC" + }, + { + "BriefDescription": "Read Pending Queue inserts for subchannel 0, pseudochannel 0", + "EventCode": "0x10", + "EventName": "UNC_M_RPQ_INSERTS.SCH0_PCH0", + "PerPkg": "1", + "UMask": "0x10", + "Unit": "IMC" + }, + { + "BriefDescription": "Read Pending Queue inserts for subchannel 0, pseudochannel 1", + "EventCode": "0x10", + "EventName": "UNC_M_RPQ_INSERTS.SCH0_PCH1", + "PerPkg": "1", + "UMask": "0x20", + "Unit": "IMC" + }, + { + "BriefDescription": "Read Pending Queue inserts for subchannel 1, pseudochannel 0", + "EventCode": "0x10", + "EventName": "UNC_M_RPQ_INSERTS.SCH1_PCH0", + "PerPkg": "1", + "UMask": "0x40", + "Unit": "IMC" + }, + { + "BriefDescription": "Read Pending Queue inserts for subchannel 1, pseudochannel 1", + "EventCode": "0x10", + "EventName": "UNC_M_RPQ_INSERTS.SCH1_PCH1", + "PerPkg": "1", + "UMask": "0x80", + "Unit": "IMC" + }, + { + "BriefDescription": "Read pending queue occupancy for subchannel 0, pseudochannel 0", + "EventCode": "0x80", + "EventName": "UNC_M_RPQ_OCCUPANCY_SCH0_PCH0", + "PerPkg": "1", + "Unit": "IMC" + }, + { + "BriefDescription": "Read pending queue occupancy for subchannel 0, pseudochannel 1", + "EventCode": "0x81", + "EventName": "UNC_M_RPQ_OCCUPANCY_SCH0_PCH1", + "PerPkg": "1", + "Unit": "IMC" + }, + { + "BriefDescription": "Read pending queue occupancy for subchannel 1, pseudochannel 0", + "EventCode": "0x82", + "EventName": "UNC_M_RPQ_OCCUPANCY_SCH1_PCH0", + "PerPkg": "1", + "Unit": "IMC" + }, + { + "BriefDescription": "Read pending queue occupancy for subchannel 1, pseudochannel 1", + "EventCode": "0x83", + "EventName": "UNC_M_RPQ_OCCUPANCY_SCH1_PCH1", + "PerPkg": "1", + "Unit": "IMC" + }, + { + "BriefDescription": "Write Pending Queue Allocations", + "EventCode": "0x22", + "EventName": "UNC_M_WPQ_INSERTS.PCH0", + "PerPkg": "1", + "UMask": "0x50", + "Unit": "IMC" + }, + { + "BriefDescription": "Write Pending Queue Allocations", + "EventCode": "0x22", + "EventName": "UNC_M_WPQ_INSERTS.PCH1", + "PerPkg": "1", + "UMask": "0xa0", + "Unit": "IMC" + }, + { + "BriefDescription": "Write Pending Queue inserts for subchannel 0, pseudochannel 0", + "EventCode": "0x22", + "EventName": "UNC_M_WPQ_INSERTS.SCH0_PCH0", + "PerPkg": "1", + "UMask": "0x10", + "Unit": "IMC" + }, + { + "BriefDescription": "Write Pending Queue inserts for subchannel 0, pseudochannel 1", + "EventCode": "0x22", + "EventName": "UNC_M_WPQ_INSERTS.SCH0_PCH1", + "PerPkg": "1", + "UMask": "0x20", + "Unit": "IMC" + }, + { + "BriefDescription": "Write Pending Queue inserts for subchannel 1, pseudochannel 0", + "EventCode": "0x22", + "EventName": "UNC_M_WPQ_INSERTS.SCH1_PCH0", + "PerPkg": "1", + "UMask": "0x40", + "Unit": "IMC" + }, + { + "BriefDescription": "Write Pending Queue inserts for subchannel 1, pseudochannel 1", + "EventCode": "0x22", + "EventName": "UNC_M_WPQ_INSERTS.SCH1_PCH1", + "PerPkg": "1", + "UMask": "0x80", + "Unit": "IMC" + }, + { + "BriefDescription": "Write pending queue occupancy for subchannel 0, pseudochannel 0", + "EventCode": "0x84", + "EventName": "UNC_M_WPQ_OCCUPANCY_SCH0_PCH0", + "PerPkg": "1", + "Unit": "IMC" + }, + { + "BriefDescription": "Write pending queue occupancy for subchannel 0, pseudochannel 1", + "EventCode": "0x85", + "EventName": "UNC_M_WPQ_OCCUPANCY_SCH0_PCH1", + "PerPkg": "1", + "Unit": "IMC" + }, + { + "BriefDescription": "Write pending queue occupancy for subchannel 1, pseudochannel 0", + "EventCode": "0x86", + "EventName": "UNC_M_WPQ_OCCUPANCY_SCH1_PCH0", + "PerPkg": "1", + "Unit": "IMC" + }, + { + "BriefDescription": "Write pending queue occupancy for subchannel 1, pseudochannel 1", + "EventCode": "0x87", + "EventName": "UNC_M_WPQ_OCCUPANCY_SCH1_PCH1", + "PerPkg": "1", + "Unit": "IMC" + } +] diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/uncore-power.json b/tools/perf/pmu-events/arch/x86/sierraforest/uncore-power.json new file mode 100644 index 0000000000..e3a66166e2 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/sierraforest/uncore-power.json @@ -0,0 +1,10 @@ +[ + { + "BriefDescription": "PCU Clockticks", + "EventCode": "0x01", + "EventName": "UNC_P_CLOCKTICKS", + "PerPkg": "1", + "PublicDescription": "PCU Clockticks: The PCU runs off a fixed 1 GHz clock. This event counts the number of pclk cycles measured while the counter was enabled. The pclk, like the Memory Controller's dclk, counts at a constant rate making it a good measure of actual wall time.", + "Unit": "PCU" + } +] diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/virtual-memory.json b/tools/perf/pmu-events/arch/x86/sierraforest/virtual-memory.json index bd5f2b634c..371974c6d6 100644 --- a/tools/perf/pmu-events/arch/x86/sierraforest/virtual-memory.json +++ b/tools/perf/pmu-events/arch/x86/sierraforest/virtual-memory.json @@ -1,24 +1,131 @@ [ { - "BriefDescription": "Counts the number of page walks completed due to load DTLB misses to a 1G page.", + "BriefDescription": "Counts the number of first level TLB misses but second level hits due to a demand load that did not start a page walk. Accounts for all page sizes. Will result in a DTLB write from STLB.", + "EventCode": "0x08", + "EventName": "DTLB_LOAD_MISSES.STLB_HIT", + "SampleAfterValue": "200003", + "UMask": "0x20" + }, + { + "BriefDescription": "Counts the number of page walks completed due to load DTLB misses.", "EventCode": "0x08", "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED", - "SampleAfterValue": "1000003", + "SampleAfterValue": "200003", "UMask": "0xe" }, { + "BriefDescription": "Counts the number of page walks completed due to load DTLB misses to a 2M or 4M page.", + "EventCode": "0x08", + "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M", + "PublicDescription": "Counts the number of page walks completed due to loads (including SW prefetches) whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 2M or 4M pages. Includes page walks that page fault.", + "SampleAfterValue": "200003", + "UMask": "0x4" + }, + { + "BriefDescription": "Counts the number of page walks completed due to load DTLB misses to a 4K page.", + "EventCode": "0x08", + "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_4K", + "PublicDescription": "Counts the number of page walks completed due to loads (including SW prefetches) whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 4K pages. Includes page walks that page fault.", + "SampleAfterValue": "200003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts the number of page walks outstanding for Loads (demand or SW prefetch) in PMH every cycle.", + "EventCode": "0x08", + "EventName": "DTLB_LOAD_MISSES.WALK_PENDING", + "PublicDescription": "Counts the number of page walks outstanding for Loads (demand or SW prefetch) in PMH every cycle. A PMH page walk is outstanding from page walk start till PMH becomes idle again (ready to serve next walk). Includes EPT-walk intervals.", + "SampleAfterValue": "200003", + "UMask": "0x10" + }, + { + "BriefDescription": "Counts the number of first level TLB misses but second level hits due to stores that did not start a page walk. Accounts for all pages sizes. Will result in a DTLB write from STLB.", + "EventCode": "0x49", + "EventName": "DTLB_STORE_MISSES.STLB_HIT", + "SampleAfterValue": "2000003", + "UMask": "0x20" + }, + { "BriefDescription": "Counts the number of page walks completed due to store DTLB misses to a 1G page.", "EventCode": "0x49", "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED", - "SampleAfterValue": "1000003", + "SampleAfterValue": "2000003", "UMask": "0xe" }, { + "BriefDescription": "Counts the number of page walks completed due to store DTLB misses to a 2M or 4M page.", + "EventCode": "0x49", + "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M", + "PublicDescription": "Counts the number of page walks completed due to stores whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 2M or 4M pages. Includes page walks that page fault.", + "SampleAfterValue": "2000003", + "UMask": "0x4" + }, + { + "BriefDescription": "Counts the number of page walks completed due to store DTLB misses to a 4K page.", + "EventCode": "0x49", + "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_4K", + "PublicDescription": "Counts the number of page walks completed due to stores whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 4K pages. Includes page walks that page fault.", + "SampleAfterValue": "2000003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts the number of page walks outstanding in the page miss handler (PMH) for stores every cycle.", + "EventCode": "0x49", + "EventName": "DTLB_STORE_MISSES.WALK_PENDING", + "PublicDescription": "Counts the number of page walks outstanding in the page miss handler (PMH) for stores every cycle. A PMH page walk is outstanding from page walk start till PMH becomes idle again (ready to serve next walk). Includes EPT-walk intervals.", + "SampleAfterValue": "200003", + "UMask": "0x10" + }, + { + "BriefDescription": "Counts the number of page walks initiated by a instruction fetch that missed the first and second level TLBs.", + "EventCode": "0x85", + "EventName": "ITLB_MISSES.MISS_CAUSED_WALK", + "SampleAfterValue": "1000003", + "UMask": "0x1" + }, + { + "BriefDescription": "Counts the number of first level TLB misses but second level hits due to an instruction fetch that did not start a page walk. Account for all pages sizes. Will result in an ITLB write from STLB.", + "EventCode": "0x85", + "EventName": "ITLB_MISSES.STLB_HIT", + "SampleAfterValue": "2000003", + "UMask": "0x20" + }, + { "BriefDescription": "Counts the number of page walks completed due to instruction fetch misses to any page size.", "EventCode": "0x85", "EventName": "ITLB_MISSES.WALK_COMPLETED", "PublicDescription": "Counts the number of page walks completed due to instruction fetches whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to any page size. Includes page walks that page fault.", "SampleAfterValue": "200003", "UMask": "0xe" + }, + { + "BriefDescription": "Counts the number of page walks completed due to instruction fetch misses to a 2M or 4M page.", + "EventCode": "0x85", + "EventName": "ITLB_MISSES.WALK_COMPLETED_2M_4M", + "PublicDescription": "Counts the number of page walks completed due to instruction fetches whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 2M or 4M pages. Includes page walks that page fault.", + "SampleAfterValue": "2000003", + "UMask": "0x4" + }, + { + "BriefDescription": "Counts the number of page walks completed due to instruction fetch misses to a 4K page.", + "EventCode": "0x85", + "EventName": "ITLB_MISSES.WALK_COMPLETED_4K", + "PublicDescription": "Counts the number of page walks completed due to instruction fetches whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 4K pages. Includes page walks that page fault.", + "SampleAfterValue": "2000003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts the number of page walks outstanding for iside in PMH every cycle.", + "EventCode": "0x85", + "EventName": "ITLB_MISSES.WALK_PENDING", + "PublicDescription": "Counts the number of page walks outstanding for iside in PMH every cycle. A PMH page walk is outstanding from page walk start till PMH becomes idle again (ready to serve next walk). Includes EPT-walk intervals. Walks could be counted by edge detecting on this event, but would count restarted suspended walks.", + "SampleAfterValue": "200003", + "UMask": "0x10" + }, + { + "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to a DTLB miss.", + "EventCode": "0x05", + "EventName": "LD_HEAD.DTLB_MISS_AT_RET", + "SampleAfterValue": "1000003", + "UMask": "0x90" } ] diff --git a/tools/perf/pmu-events/arch/x86/skylake/memory.json b/tools/perf/pmu-events/arch/x86/skylake/memory.json index 588ad6059a..f047862f97 100644 --- a/tools/perf/pmu-events/arch/x86/skylake/memory.json +++ b/tools/perf/pmu-events/arch/x86/skylake/memory.json @@ -1008,7 +1008,7 @@ "BriefDescription": "Number of times an RTM execution aborted due to any reasons (multiple categories may count as one).", "EventCode": "0xC9", "EventName": "RTM_RETIRED.ABORTED", - "PEBS": "1", + "PEBS": "2", "PublicDescription": "Number of times RTM abort was triggered.", "SampleAfterValue": "2000003", "UMask": "0x4" diff --git a/tools/perf/pmu-events/arch/x86/skylake/metricgroups.json b/tools/perf/pmu-events/arch/x86/skylake/metricgroups.json index a151ba9ccc..5452a1448d 100644 --- a/tools/perf/pmu-events/arch/x86/skylake/metricgroups.json +++ b/tools/perf/pmu-events/arch/x86/skylake/metricgroups.json @@ -2,10 +2,10 @@ "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "CodeGen": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -25,7 +25,9 @@ "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -63,8 +65,10 @@ "tma_L5_group": "Metrics for top-down breakdown at level 5", "tma_L6_group": "Metrics for top-down breakdown at level 6", "tma_alu_op_utilization_group": "Metrics contributing to tma_alu_op_utilization category", + "tma_assists_group": "Metrics contributing to tma_assists category", "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category", "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category", + "tma_branch_mispredicts_group": "Metrics contributing to tma_branch_mispredicts category", "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category", "tma_core_bound_group": "Metrics contributing to tma_core_bound category", "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category", @@ -77,9 +81,9 @@ "tma_frontend_bound_group": "Metrics contributing to tma_frontend_bound category", "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category", "tma_issue2P": "Metrics related by the issue $issue2P", - "tma_issueBC": "Metrics related by the issue $issueBC", "tma_issueBM": "Metrics related by the issue $issueBM", "tma_issueBW": "Metrics related by the issue $issueBW", + "tma_issueComp": "Metrics related by the issue $issueComp", "tma_issueD0": "Metrics related by the issue $issueD0", "tma_issueFB": "Metrics related by the issue $issueFB", "tma_issueFL": "Metrics related by the issue $issueFL", @@ -99,10 +103,12 @@ "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category", "tma_light_operations_group": "Metrics contributing to tma_light_operations category", "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category", + "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category", "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", "tma_mite_group": "Metrics contributing to tma_mite category", + "tma_other_light_ops_group": "Metrics contributing to tma_other_light_ops category", "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category", "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category", "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category", diff --git a/tools/perf/pmu-events/arch/x86/skylake/pipeline.json b/tools/perf/pmu-events/arch/x86/skylake/pipeline.json index cd3e737bf4..fe202d1e36 100644 --- a/tools/perf/pmu-events/arch/x86/skylake/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/skylake/pipeline.json @@ -387,7 +387,7 @@ "Errata": "SKL091, SKL044", "EventCode": "0xC0", "EventName": "INST_RETIRED.NOP", - "PEBS": "1", + "PEBS": "2", "SampleAfterValue": "2000003", "UMask": "0x2" }, diff --git a/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json b/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json index faa615c578..3af71b84bb 100644 --- a/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json +++ b/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json @@ -83,12 +83,12 @@ "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", - "MetricThreshold": "tma_alu_op_utilization > 0.6", + "MetricThreshold": "tma_alu_op_utilization > 0.4", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", - "MetricExpr": "100 * (FP_ASSIST.ANY + OTHER_ASSISTS.ANY) / tma_info_thread_slots", + "MetricExpr": "34 * (FP_ASSIST.ANY + OTHER_ASSISTS.ANY) / tma_info_thread_slots", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_assists", "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", @@ -156,7 +156,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(18.5 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM + 16.5 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "MetricExpr": "(18.5 * tma_info_system_core_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM + 16.5 * tma_info_system_core_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_contested_accesses", "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -177,7 +177,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "16.5 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "MetricExpr": "16.5 * tma_info_system_core_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_data_sharing", "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -189,7 +189,7 @@ "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group", "MetricName": "tma_decoder0_alone", - "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35))", + "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & tma_fetch_bandwidth > 0.2)", "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. Related metrics: tma_few_uops_instructions", "ScaleUnit": "100%" }, @@ -214,10 +214,10 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", - "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", + "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / tma_info_core_core_clks / 2", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", + "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", "ScaleUnit": "100%" }, @@ -237,7 +237,7 @@ "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group", "MetricName": "tma_dtlb_load", "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization", "ScaleUnit": "100%" }, { @@ -246,13 +246,13 @@ "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group", "MetricName": "tma_dtlb_store", "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "22 * tma_info_system_average_frequency * OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks", + "MetricExpr": "22 * tma_info_system_core_frequency * OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group", "MetricName": "tma_false_sharing", "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -266,7 +266,7 @@ "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%" }, { @@ -274,7 +274,7 @@ "MetricExpr": "tma_frontend_bound - tma_fetch_latency", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.2", "MetricgroupNoGroup": "TopdownL2", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" @@ -309,6 +309,15 @@ "ScaleUnit": "100%" }, { + "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists", + "MetricExpr": "34 * FP_ASSIST.ANY / tma_info_thread_slots", + "MetricGroup": "HPC;TopdownL5;tma_L5_group;tma_assists_group", + "MetricName": "tma_fp_assists", + "MetricThreshold": "tma_fp_assists > 0.1", + "PublicDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists. FP Assist may apply when working with very small floating point values (so-called Denormals).", + "ScaleUnit": "100%" + }, + { "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired", "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ / UOPS_RETIRED.RETIRE_SLOTS", "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", @@ -358,10 +367,10 @@ { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions", "MetricExpr": "tma_light_operations * UOPS_RETIRED.MACRO_FUSED / UOPS_RETIRED.RETIRE_SLOTS", - "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", + "MetricGroup": "Branches;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_fused_instructions", "MetricThreshold": "tma_fused_instructions > 0.1 & tma_light_operations > 0.6", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions. The instruction pairs of CMP+JCC or DEC+JCC are commonly used examples.", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions. CMP+JCC or DEC+JCC are common examples of legacy fusions. {([MTL] Note new MOV+OP and Load+OP fusions appear under Other_Light_Ops in MTL!)}", "ScaleUnit": "100%" }, { @@ -371,13 +380,13 @@ "MetricName": "tma_heavy_operations", "MetricThreshold": "tma_heavy_operations > 0.1", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .)", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", "MetricExpr": "(ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@) / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricGroup": "BigFootprint;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_icache_misses", "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS", @@ -385,14 +394,14 @@ }, { "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", - "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricExpr": "tma_info_bottleneck_mispredictions * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES / 100", "MetricGroup": "Bad;BrMispredicts;tma_issueBM", "MetricName": "tma_info_bad_spec_branch_misprediction_cost", "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers" }, { "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", - "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)", + "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * BR_MISP_EXEC.INDIRECT)", "MetricGroup": "Bad;BrMispredicts", "MetricName": "tma_info_bad_spec_ipmisp_indirect", "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3" @@ -405,6 +414,12 @@ "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200" }, { + "BriefDescription": "Speculative to Retired ratio of all clears (covering mispredicts and nukes)", + "MetricExpr": "INT_MISC.CLEARS_COUNT / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)", + "MetricGroup": "BrMispredicts", + "MetricName": "tma_info_bad_spec_spec_clears_ratio" + }, + { "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)", @@ -430,67 +445,102 @@ "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: " }, { + "BriefDescription": "Total pipeline cost of \"useful operations\" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead.", + "MetricExpr": "100 * (tma_retiring - (BR_INST_RETIRED.ALL_BRANCHES + BR_INST_RETIRED.NEAR_CALL) / tma_info_thread_slots - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", + "MetricGroup": "Ret", + "MetricName": "tma_info_bottleneck_base_non_br", + "MetricThreshold": "tma_info_bottleneck_base_non_br > 20" + }, + { "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", - "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC", + "MetricGroup": "BigFootprint;Fed;Frontend;IcMiss;MemoryTLB", "MetricName": "tma_info_bottleneck_big_code", - "MetricThreshold": "tma_info_bottleneck_big_code > 20", - "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_bottleneck_branching_overhead" + "MetricThreshold": "tma_info_bottleneck_big_code > 20" }, { "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", - "MetricExpr": "100 * ((BR_INST_RETIRED.CONDITIONAL + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_thread_slots)", - "MetricGroup": "Ret;tma_issueBC", + "MetricExpr": "100 * ((BR_INST_RETIRED.ALL_BRANCHES + BR_INST_RETIRED.NEAR_CALL) / tma_info_thread_slots)", + "MetricGroup": "Ret", "MetricName": "tma_info_bottleneck_branching_overhead", - "MetricThreshold": "tma_info_bottleneck_branching_overhead > 10", - "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_bottleneck_big_code" + "MetricThreshold": "tma_info_bottleneck_branching_overhead > 5" + }, + { + "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))", + "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", + "MetricName": "tma_info_bottleneck_cache_memory_bandwidth", + "MetricThreshold": "tma_info_bottleneck_cache_memory_bandwidth > 20", + "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" + }, + { + "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))", + "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", + "MetricName": "tma_info_bottleneck_cache_memory_latency", + "MetricThreshold": "tma_info_bottleneck_cache_memory_latency > 20", + "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency" + }, + { + "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", + "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", + "MetricGroup": "Cor;tma_issueComp", + "MetricName": "tma_info_bottleneck_compute_bound_est", + "MetricThreshold": "tma_info_bottleneck_compute_bound_est > 20", + "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: " }, { "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code", + "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code", "MetricGroup": "Fed;FetchBW;Frontend", "MetricName": "tma_info_bottleneck_instruction_fetch_bw", "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20" }, { - "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))", - "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", - "MetricName": "tma_info_bottleneck_memory_bandwidth", - "MetricThreshold": "tma_info_bottleneck_memory_bandwidth > 20", - "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" + "BriefDescription": "Total pipeline cost of irregular execution (e.g", + "MetricExpr": "100 * (tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + tma_core_bound * RS_EVENTS.EMPTY_CYCLES / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", + "MetricGroup": "Bad;Cor;Ret;tma_issueMS", + "MetricName": "tma_info_bottleneck_irregular_overhead", + "MetricThreshold": "tma_info_bottleneck_irregular_overhead > 10", + "PublicDescription": "Total pipeline cost of irregular execution (e.g. FP-assists in HPC, Wait time with work imbalance multithreaded workloads, overhead in system services or virtualized environments). Related metrics: tma_microcode_sequencer, tma_ms_switches" }, { "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))", + "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))", "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", "MetricName": "tma_info_bottleneck_memory_data_tlbs", "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20", - "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store" + "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_synchronization" }, { - "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound))", - "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", - "MetricName": "tma_info_bottleneck_memory_latency", - "MetricThreshold": "tma_info_bottleneck_memory_latency > 20", - "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency" + "BriefDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)", + "MetricExpr": "100 * (tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))", + "MetricGroup": "Mem;Offcore;tma_issueTLB", + "MetricName": "tma_info_bottleneck_memory_synchronization", + "MetricThreshold": "tma_info_bottleneck_memory_synchronization > 10", + "PublicDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs" }, { "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", + "MetricExpr": "100 * (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", "MetricName": "tma_info_bottleneck_mispredictions", "MetricThreshold": "tma_info_bottleneck_mispredictions > 20", "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers" }, { + "BriefDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class)", + "MetricExpr": "100 - (tma_info_bottleneck_big_code + tma_info_bottleneck_instruction_fetch_bw + tma_info_bottleneck_mispredictions + tma_info_bottleneck_cache_memory_bandwidth + tma_info_bottleneck_cache_memory_latency + tma_info_bottleneck_memory_data_tlbs + tma_info_bottleneck_memory_synchronization + tma_info_bottleneck_compute_bound_est + tma_info_bottleneck_irregular_overhead + tma_info_bottleneck_branching_overhead + tma_info_bottleneck_base_non_br)", + "MetricGroup": "Cor;Offcore", + "MetricName": "tma_info_bottleneck_other_bottlenecks", + "MetricThreshold": "tma_info_bottleneck_other_bottlenecks > 20", + "PublicDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls." + }, + { "BriefDescription": "Fraction of branches that are CALL or RET", "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches", @@ -511,7 +561,7 @@ { "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.COND - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches", "MetricName": "tma_info_branches_jump" }, @@ -528,9 +578,15 @@ "MetricName": "tma_info_core_coreipc" }, { + "BriefDescription": "uops Executed per Cycle", + "MetricExpr": "UOPS_EXECUTED.THREAD / tma_info_thread_clks", + "MetricGroup": "Power", + "MetricName": "tma_info_core_epc" + }, + { "BriefDescription": "Floating Point Operations Per Cycle", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / tma_info_core_core_clks", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / tma_info_core_core_clks", "MetricGroup": "Flops;Ret", "MetricName": "tma_info_core_flopc" }, @@ -542,8 +598,8 @@ "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)", + "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "tma_info_core_ilp" }, @@ -618,7 +674,7 @@ "MetricGroup": "Flops;InsType", "MetricName": "tma_info_inst_mix_iparith", "MetricThreshold": "tma_info_inst_mix_iparith < 10", - "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." + "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW." }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", @@ -626,7 +682,7 @@ "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx128", "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10", - "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", @@ -634,7 +690,7 @@ "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx256", "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10", - "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", @@ -642,7 +698,7 @@ "MetricGroup": "Flops;FpScalar;InsType", "MetricName": "tma_info_inst_mix_iparith_scalar_dp", "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10", - "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", @@ -650,7 +706,7 @@ "MetricGroup": "Flops;FpScalar;InsType", "MetricName": "tma_info_inst_mix_iparith_scalar_sp", "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10", - "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", @@ -669,7 +725,7 @@ { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", "MetricGroup": "Flops;InsType", "MetricName": "tma_info_inst_mix_ipflop", "MetricThreshold": "tma_info_inst_mix_ipflop < 10" @@ -705,136 +761,142 @@ }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l1d_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l1d_cache_fill_bw" + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t" }, { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l2_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l2_cache_fill_bw" + "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t" }, { "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l3_cache_access_bw", "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_memory_core_l3_cache_access_bw" + "MetricName": "tma_info_memory_core_l3_cache_access_bw_2t" }, { "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l3_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l3_cache_fill_bw" + "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t" }, { "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_fb_hpki" }, { + "BriefDescription": "", + "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l1d_cache_fill_bw" + }, + { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l1mpki" }, { "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l1mpki_load" }, { + "BriefDescription": "", + "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l2_cache_fill_bw" + }, + { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l2hpki_all" }, { "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l2hpki_load" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "Backend;CacheMisses;Mem", + "MetricGroup": "Backend;CacheHits;Mem", "MetricName": "tma_info_memory_l2mpki" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem;Offcore", + "MetricGroup": "CacheHits;Mem;Offcore", "MetricName": "tma_info_memory_l2mpki_all" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l2mpki_load" }, { - "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_memory_l3mpki" + "BriefDescription": "", + "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_l3_cache_access_bw" }, { - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", - "MetricGroup": "Mem;MemoryBound;MemoryLat", - "MetricName": "tma_info_memory_load_miss_real_latency" + "BriefDescription": "", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l3_cache_fill_bw" }, { - "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", - "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBW;MemoryBound", - "MetricName": "tma_info_memory_mlp", - "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" + "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_l3mpki" }, { "BriefDescription": "Average Parallel L2 cache miss data reads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_memory_oro_data_l2_mlp" + "MetricName": "tma_info_memory_latency_data_l2_mlp" }, { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_memory_oro_load_l2_miss_latency" + "MetricName": "tma_info_memory_latency_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_memory_oro_load_l2_mlp" - }, - { - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t" + "MetricName": "tma_info_memory_latency_load_l2_mlp" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t" + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", + "MetricGroup": "Mem;MemoryBound;MemoryLat", + "MetricName": "tma_info_memory_load_miss_real_latency" }, { - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l3_cache_access_bw", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t" + "BriefDescription": "Un-cacheable retired load per kilo instruction", + "MetricExpr": "1e3 * MEM_LOAD_MISC_RETIRED.UC / INST_RETIRED.ANY", + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_mix_uc_load_pki" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t" + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", + "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Mem;MemoryBW;MemoryBound", + "MetricName": "tma_info_memory_mlp", + "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" }, { "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", @@ -863,43 +925,57 @@ "MetricName": "tma_info_memory_tlb_store_stlb_mpki" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", - "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", + "BriefDescription": "", + "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@)", "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", "MetricName": "tma_info_pipeline_execute" }, { + "BriefDescription": "Instructions per a microcode Assist invocation", + "MetricExpr": "INST_RETIRED.ANY / (FP_ASSIST.ANY + OTHER_ASSISTS.ANY)", + "MetricGroup": "MicroSeq;Pipeline;Ret;Retire", + "MetricName": "tma_info_pipeline_ipassist", + "MetricThreshold": "tma_info_pipeline_ipassist < 100e3", + "PublicDescription": "Instructions per a microcode Assist invocation. See Assists tree node for details (lower number means higher occurrence rate)" + }, + { "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@UOPS_RETIRED.RETIRE_SLOTS\\,cmask\\=1@", "MetricGroup": "Pipeline;Ret", "MetricName": "tma_info_pipeline_retire" }, { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", "MetricGroup": "Power;Summary", - "MetricName": "tma_info_system_average_frequency" + "MetricName": "tma_info_system_core_frequency" }, { - "BriefDescription": "Average CPU Utilization", + "BriefDescription": "Average CPU Utilization (percentage)", "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", "MetricGroup": "HPC;Summary", "MetricName": "tma_info_system_cpu_utilization" }, { + "BriefDescription": "Average number of utilized CPUs", + "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization", + "MetricGroup": "Summary", + "MetricName": "tma_info_system_cpus_utilized" + }, + { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW", "MetricName": "tma_info_system_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" }, { "BriefDescription": "Giga Floating Point Operations Per Second", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1e9 / duration_time", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1e9 / duration_time", "MetricGroup": "Cor;Flops;HPC", "MetricName": "tma_info_system_gflops", - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width" }, { "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", @@ -929,13 +1005,6 @@ "PublicDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches" }, { - "BriefDescription": "Average number of parallel requests to external memory", - "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_OCCUPANCY.CYCLES_WITH_ANY_REQUEST", - "MetricGroup": "Mem;SoC", - "MetricName": "tma_info_system_mem_parallel_requests", - "PublicDescription": "Average number of parallel requests to external memory. Accounts for all requests" - }, - { "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)", "MetricExpr": "1e9 * (UNC_ARB_TRK_OCCUPANCY.DATA_READ / UNC_ARB_TRK_REQUESTS.DATA_READ) / (tma_info_system_socket_clks / duration_time)", "MetricGroup": "Mem;MemoryLat;SoC", @@ -943,12 +1012,6 @@ "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)" }, { - "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)", - "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_REQUESTS.ALL", - "MetricGroup": "Mem;SoC", - "MetricName": "tma_info_system_mem_request_latency" - }, - { "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0)", "MetricGroup": "SMT", @@ -1013,8 +1076,8 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", - "MetricExpr": "ICACHE_64B.IFTAG_STALL / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricExpr": "ICACHE_TAG.STALLS / tma_info_thread_clks", + "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS", @@ -1023,7 +1086,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1", @@ -1033,7 +1096,7 @@ "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks)", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L2_HIT_PS", @@ -1042,24 +1105,24 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", - "MetricExpr": "6.5 * tma_info_system_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", + "MetricExpr": "6.5 * tma_info_system_core_frequency * (MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2)) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_memory_latency, tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_cache_memory_latency, tma_mem_latency", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", - "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks", + "MetricExpr": "DECODE.LCP / tma_info_thread_clks", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_lcp", "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -1073,7 +1136,7 @@ "MetricName": "tma_light_operations", "MetricThreshold": "tma_light_operations > 0.6", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST", "ScaleUnit": "100%" }, { @@ -1123,21 +1186,21 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_memory_latency, tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_cache_memory_latency, tma_l3_hit_latency", "ScaleUnit": "100%" }, { @@ -1165,7 +1228,7 @@ "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", "MetricName": "tma_microcode_sequencer", "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1", - "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_ms_switches", + "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_ms_switches", "ScaleUnit": "100%" }, { @@ -1182,17 +1245,17 @@ "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", + "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", "ScaleUnit": "100%" }, { - "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued", + "BriefDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles)", "MetricExpr": "UOPS_ISSUED.VECTOR_WIDTH_MISMATCH / UOPS_ISSUED.ANY", "MetricGroup": "TopdownL5;tma_L5_group;tma_issueMV;tma_ports_utilized_0_group", "MetricName": "tma_mixing_vectors", "MetricThreshold": "tma_mixing_vectors > 0.05", - "PublicDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued. Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches", + "PublicDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles). Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches", "ScaleUnit": "100%" }, { @@ -1201,13 +1264,13 @@ "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO", "MetricName": "tma_ms_switches", "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation", + "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused", "MetricExpr": "tma_light_operations * (BR_INST_RETIRED.ALL_BRANCHES - UOPS_RETIRED.MACRO_FUSED) / UOPS_RETIRED.RETIRE_SLOTS", - "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", + "MetricGroup": "Branches;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_non_fused_branches", "MetricThreshold": "tma_non_fused_branches > 0.1 & tma_light_operations > 0.6", "PublicDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused. Non-conditional branches like direct JMP or CALL would count here. Can be used to examine fusible conditional jumps that were not fused.", @@ -1216,15 +1279,15 @@ { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions", "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / UOPS_RETIRED.RETIRE_SLOTS", - "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", + "MetricGroup": "Pipeline;TopdownL4;tma_L4_group;tma_other_light_ops_group", "MetricName": "tma_nop_instructions", - "MetricThreshold": "tma_nop_instructions > 0.1 & tma_light_operations > 0.6", + "MetricThreshold": "tma_nop_instructions > 0.1 & (tma_other_light_ops > 0.3 & tma_light_operations > 0.6)", "PublicDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. start address of a function or loop body. Sample with: INST_RETIRED.NOP", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes", - "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches + tma_nop_instructions))", + "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches))", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_other_light_ops", "MetricThreshold": "tma_other_light_ops > 0.3 & tma_light_operations > 0.6", @@ -1232,6 +1295,22 @@ "ScaleUnit": "100%" }, { + "BriefDescription": "This metric estimates fraction of slots the CPU was stalled due to other cases of misprediction (non-retired x86 branches or other types).", + "MetricExpr": "max(tma_branch_mispredicts * (1 - BR_MISP_RETIRED.ALL_BRANCHES / (INT_MISC.CLEARS_COUNT - MACHINE_CLEARS.COUNT)), 0.0001)", + "MetricGroup": "BrMispredicts;TopdownL3;tma_L3_group;tma_branch_mispredicts_group", + "MetricName": "tma_other_mispredicts", + "MetricThreshold": "tma_other_mispredicts > 0.05 & (tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15)", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Nukes (Machine Clears) not related to memory ordering.", + "MetricExpr": "max(tma_machine_clears * (1 - MACHINE_CLEARS.MEMORY_ORDERING / MACHINE_CLEARS.COUNT), 0.0001)", + "MetricGroup": "Machine_Clears;TopdownL3;tma_L3_group;tma_machine_clears_group", + "MetricName": "tma_other_nukes", + "MetricThreshold": "tma_other_nukes > 0.05 & (tma_machine_clears > 0.1 & tma_bad_speculation > 0.15)", + "ScaleUnit": "100%" + }, + { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)", "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / tma_info_core_core_clks", "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", @@ -1286,12 +1365,12 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)", + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU)", "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_6", "MetricThreshold": "tma_port_6 > 0.6", - "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2", "ScaleUnit": "100%" }, { @@ -1305,7 +1384,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", - "MetricExpr": "((EXE_ACTIVITY.EXE_BOUND_0_PORTS + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)", + "MetricExpr": "((tma_ports_utilized_0 * tma_info_thread_clks + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -1314,7 +1393,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_NONE / 2 if #SMT_on else CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_core_core_clks", + "MetricExpr": "(EXE_ACTIVITY.EXE_BOUND_0_PORTS + tma_core_bound * RS_EVENTS.EMPTY_CYCLES) / tma_info_thread_clks * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1344,7 +1423,7 @@ "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_GE_3 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_3) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", - "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", + "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", "ScaleUnit": "100%" }, { @@ -1360,9 +1439,9 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations", "MetricExpr": "PARTIAL_RAT_STALLS.SCOREBOARD / tma_info_thread_clks", - "MetricGroup": "PortsUtil;TopdownL5;tma_L5_group;tma_issueSO;tma_ports_utilized_0_group", + "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group;tma_issueSO", "MetricName": "tma_serializing_operation", - "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))", + "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. Sample with: PARTIAL_RAT_STALLS.SCOREBOARD. Related metrics: tma_ms_switches", "ScaleUnit": "100%" }, @@ -1391,7 +1470,7 @@ "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%" }, { @@ -1449,10 +1528,10 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", "MetricExpr": "9 * BACLEARS.ANY / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", + "MetricGroup": "BigFootprint;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", "MetricName": "tma_unknown_branches", "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit). Sample with: BACLEARS.ANY", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit) hence called Unknown Branches. Sample with: BACLEARS.ANY", "ScaleUnit": "100%" }, { diff --git a/tools/perf/pmu-events/arch/x86/skylake/virtual-memory.json b/tools/perf/pmu-events/arch/x86/skylake/virtual-memory.json index f59405877a..73feadaf76 100644 --- a/tools/perf/pmu-events/arch/x86/skylake/virtual-memory.json +++ b/tools/perf/pmu-events/arch/x86/skylake/virtual-memory.json @@ -205,7 +205,7 @@ "BriefDescription": "Counts 1 per cycle for each PMH that is busy with a page walk for an instruction fetch request. EPT page walk duration are excluded in Skylake.", "EventCode": "0x85", "EventName": "ITLB_MISSES.WALK_PENDING", - "PublicDescription": "Counts 1 per cycle for each PMH (Page Miss Handler) that is busy with a page walk for an instruction fetch request. EPT page walk duration are excluded in Skylake michroarchitecture.", + "PublicDescription": "Counts 1 per cycle for each PMH (Page Miss Handler) that is busy with a page walk for an instruction fetch request. EPT page walk duration are excluded in Skylake microarchitecture.", "SampleAfterValue": "100003", "UMask": "0x10" }, diff --git a/tools/perf/pmu-events/arch/x86/skylakex/metricgroups.json b/tools/perf/pmu-events/arch/x86/skylakex/metricgroups.json index bc6a9a4d27..904d299c95 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/metricgroups.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/metricgroups.json @@ -2,10 +2,10 @@ "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "CodeGen": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -26,7 +26,9 @@ "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -64,8 +66,10 @@ "tma_L5_group": "Metrics for top-down breakdown at level 5", "tma_L6_group": "Metrics for top-down breakdown at level 6", "tma_alu_op_utilization_group": "Metrics contributing to tma_alu_op_utilization category", + "tma_assists_group": "Metrics contributing to tma_assists category", "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category", "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category", + "tma_branch_mispredicts_group": "Metrics contributing to tma_branch_mispredicts category", "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category", "tma_core_bound_group": "Metrics contributing to tma_core_bound category", "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category", @@ -78,9 +82,9 @@ "tma_frontend_bound_group": "Metrics contributing to tma_frontend_bound category", "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category", "tma_issue2P": "Metrics related by the issue $issue2P", - "tma_issueBC": "Metrics related by the issue $issueBC", "tma_issueBM": "Metrics related by the issue $issueBM", "tma_issueBW": "Metrics related by the issue $issueBW", + "tma_issueComp": "Metrics related by the issue $issueComp", "tma_issueD0": "Metrics related by the issue $issueD0", "tma_issueFB": "Metrics related by the issue $issueFB", "tma_issueFL": "Metrics related by the issue $issueFL", @@ -100,10 +104,12 @@ "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category", "tma_light_operations_group": "Metrics contributing to tma_light_operations category", "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category", + "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category", "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", "tma_mite_group": "Metrics contributing to tma_mite category", + "tma_other_light_ops_group": "Metrics contributing to tma_other_light_ops category", "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category", "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category", "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category", diff --git a/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json index ec3aa5ef00..025e836a1c 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json @@ -210,6 +210,12 @@ "ScaleUnit": "1MB/s" }, { + "BriefDescription": "Bandwidth (MB/sec) of write requests that miss the last level cache (LLC) and go to remote memory.", + "MetricExpr": "UNC_CHA_REQUESTS.WRITES_REMOTE * 64 / 1e6 / duration_time", + "MetricName": "llc_miss_remote_memory_bandwidth_write", + "ScaleUnit": "1MB/s" + }, + { "BriefDescription": "The ratio of number of completed memory load instructions to the total number completed instructions", "MetricExpr": "MEM_INST_RETIRED.ALL_LOADS / INST_RETIRED.ANY", "MetricName": "loads_per_instr", @@ -298,12 +304,12 @@ "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", - "MetricThreshold": "tma_alu_op_utilization > 0.6", + "MetricThreshold": "tma_alu_op_utilization > 0.4", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", - "MetricExpr": "100 * (FP_ASSIST.ANY + OTHER_ASSISTS.ANY) / tma_info_thread_slots", + "MetricExpr": "34 * (FP_ASSIST.ANY + OTHER_ASSISTS.ANY) / tma_info_thread_slots", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_assists", "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", @@ -371,7 +377,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(44 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 44 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "MetricExpr": "(44 * tma_info_system_core_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 44 * tma_info_system_core_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_contested_accesses", "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -392,7 +398,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "44 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (1 - OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "MetricExpr": "44 * tma_info_system_core_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (1 - OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_data_sharing", "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -404,7 +410,7 @@ "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group", "MetricName": "tma_decoder0_alone", - "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35))", + "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & tma_fetch_bandwidth > 0.2)", "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. Related metrics: tma_few_uops_instructions", "ScaleUnit": "100%" }, @@ -429,10 +435,10 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", - "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", + "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / tma_info_core_core_clks / 2", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", + "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", "ScaleUnit": "100%" }, @@ -452,7 +458,7 @@ "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group", "MetricName": "tma_dtlb_load", "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization", "ScaleUnit": "100%" }, { @@ -461,13 +467,13 @@ "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group", "MetricName": "tma_dtlb_store", "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(110 * tma_info_system_average_frequency * (OFFCORE_RESPONSE.DEMAND_RFO.L3_MISS.REMOTE_HITM + OFFCORE_RESPONSE.PF_L2_RFO.L3_MISS.REMOTE_HITM) + 47.5 * tma_info_system_average_frequency * (OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE + OFFCORE_RESPONSE.PF_L2_RFO.L3_HIT.HITM_OTHER_CORE)) / tma_info_thread_clks", + "MetricExpr": "(110 * tma_info_system_core_frequency * (OFFCORE_RESPONSE.DEMAND_RFO.L3_MISS.REMOTE_HITM + OFFCORE_RESPONSE.PF_L2_RFO.L3_MISS.REMOTE_HITM) + 47.5 * tma_info_system_core_frequency * (OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE + OFFCORE_RESPONSE.PF_L2_RFO.L3_HIT.HITM_OTHER_CORE)) / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group", "MetricName": "tma_false_sharing", "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -481,7 +487,7 @@ "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%" }, { @@ -489,7 +495,7 @@ "MetricExpr": "tma_frontend_bound - tma_fetch_latency", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.2", "MetricgroupNoGroup": "TopdownL2", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" @@ -524,6 +530,15 @@ "ScaleUnit": "100%" }, { + "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists", + "MetricExpr": "34 * FP_ASSIST.ANY / tma_info_thread_slots", + "MetricGroup": "HPC;TopdownL5;tma_L5_group;tma_assists_group", + "MetricName": "tma_fp_assists", + "MetricThreshold": "tma_fp_assists > 0.1", + "PublicDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists. FP Assist may apply when working with very small floating point values (so-called Denormals).", + "ScaleUnit": "100%" + }, + { "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired", "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ / UOPS_RETIRED.RETIRE_SLOTS", "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", @@ -582,10 +597,10 @@ { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions", "MetricExpr": "tma_light_operations * UOPS_RETIRED.MACRO_FUSED / UOPS_RETIRED.RETIRE_SLOTS", - "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", + "MetricGroup": "Branches;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_fused_instructions", "MetricThreshold": "tma_fused_instructions > 0.1 & tma_light_operations > 0.6", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions. The instruction pairs of CMP+JCC or DEC+JCC are commonly used examples.", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions. CMP+JCC or DEC+JCC are common examples of legacy fusions. {([MTL] Note new MOV+OP and Load+OP fusions appear under Other_Light_Ops in MTL!)}", "ScaleUnit": "100%" }, { @@ -595,13 +610,13 @@ "MetricName": "tma_heavy_operations", "MetricThreshold": "tma_heavy_operations > 0.1", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .)", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", "MetricExpr": "(ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@) / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricGroup": "BigFootprint;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_icache_misses", "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS", @@ -609,26 +624,53 @@ }, { "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", - "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricExpr": "tma_info_bottleneck_mispredictions * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES / 100", "MetricGroup": "Bad;BrMispredicts;tma_issueBM", "MetricName": "tma_info_bad_spec_branch_misprediction_cost", "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers" }, { "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", - "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)", + "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * BR_MISP_EXEC.INDIRECT)", "MetricGroup": "Bad;BrMispredicts", "MetricName": "tma_info_bad_spec_ipmisp_indirect", "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3" }, { "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", - "MetricExpr": "tma_info_core_ipmispredict", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;BadSpec;BrMispredicts", "MetricName": "tma_info_bad_spec_ipmispredict", "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200" }, { + "BriefDescription": "Speculative to Retired ratio of all clears (covering mispredicts and nukes)", + "MetricExpr": "INT_MISC.CLEARS_COUNT / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)", + "MetricGroup": "BrMispredicts", + "MetricName": "tma_info_bad_spec_spec_clears_ratio" + }, + { + "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", + "MetricExpr": "(100 * (1 - tma_core_bound / (((EXE_ACTIVITY.EXE_BOUND_0_PORTS + tma_core_bound * RS_EVENTS.EMPTY_CYCLES) / CPU_CLK_UNHALTED.THREAD * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CPU_CLK_UNHALTED.THREAD * CPU_CLK_UNHALTED.THREAD + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) if tma_core_bound < (((EXE_ACTIVITY.EXE_BOUND_0_PORTS + tma_core_bound * RS_EVENTS.EMPTY_CYCLES) / CPU_CLK_UNHALTED.THREAD * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CPU_CLK_UNHALTED.THREAD * CPU_CLK_UNHALTED.THREAD + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)", + "MetricGroup": "Cor;SMT;TopdownL1;tma_L1_group", + "MetricName": "tma_info_botlnk_core_bound_likely", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck.", + "MetricExpr": "100 * (100 * (tma_fetch_latency * (DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) / ((ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=0x1\\,edge\\=0x1@) / CPU_CLK_UNHALTED.THREAD + ICACHE_TAG.STALLS / CPU_CLK_UNHALTED.THREAD + (INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD + 9 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) + min(2 * IDQ.MS_SWITCHES / CPU_CLK_UNHALTED.THREAD, 1) + DECODE.LCP / CPU_CLK_UNHALTED.THREAD + DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) + tma_fetch_bandwidth * tma_mite / (tma_mite + tma_dsb)))", + "MetricGroup": "DSBmiss;Fed;TopdownL1;tma_L1_group", + "MetricName": "tma_info_botlnk_dsb_misses", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck.", + "MetricExpr": "100 * (100 * (tma_fetch_latency * ((ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=0x1\\,edge\\=0x1@) / CPU_CLK_UNHALTED.THREAD) / ((ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=0x1\\,edge\\=0x1@) / CPU_CLK_UNHALTED.THREAD + ICACHE_TAG.STALLS / CPU_CLK_UNHALTED.THREAD + (INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD + 9 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) + min(2 * IDQ.MS_SWITCHES / CPU_CLK_UNHALTED.THREAD, 1) + DECODE.LCP / CPU_CLK_UNHALTED.THREAD + DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD)))", + "MetricGroup": "Fed;FetchLat;IcMiss;TopdownL1;tma_L1_group", + "MetricName": "tma_info_botlnk_ic_misses", + "MetricgroupNoGroup": "TopdownL1" + }, + { "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)", @@ -654,67 +696,102 @@ "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: " }, { + "BriefDescription": "Total pipeline cost of \"useful operations\" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead.", + "MetricExpr": "100 * (tma_retiring - (BR_INST_RETIRED.ALL_BRANCHES + BR_INST_RETIRED.NEAR_CALL) / tma_info_thread_slots - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", + "MetricGroup": "Ret", + "MetricName": "tma_info_bottleneck_base_non_br", + "MetricThreshold": "tma_info_bottleneck_base_non_br > 20" + }, + { "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", - "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC", + "MetricGroup": "BigFootprint;Fed;Frontend;IcMiss;MemoryTLB", "MetricName": "tma_info_bottleneck_big_code", - "MetricThreshold": "tma_info_bottleneck_big_code > 20", - "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_bottleneck_branching_overhead" + "MetricThreshold": "tma_info_bottleneck_big_code > 20" }, { "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", - "MetricExpr": "100 * ((BR_INST_RETIRED.CONDITIONAL + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_thread_slots)", - "MetricGroup": "Ret;tma_issueBC", + "MetricExpr": "100 * ((BR_INST_RETIRED.ALL_BRANCHES + BR_INST_RETIRED.NEAR_CALL) / tma_info_thread_slots)", + "MetricGroup": "Ret", "MetricName": "tma_info_bottleneck_branching_overhead", - "MetricThreshold": "tma_info_bottleneck_branching_overhead > 10", - "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_bottleneck_big_code" + "MetricThreshold": "tma_info_bottleneck_branching_overhead > 5" + }, + { + "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))", + "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", + "MetricName": "tma_info_bottleneck_cache_memory_bandwidth", + "MetricThreshold": "tma_info_bottleneck_cache_memory_bandwidth > 20", + "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" + }, + { + "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))", + "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", + "MetricName": "tma_info_bottleneck_cache_memory_latency", + "MetricThreshold": "tma_info_bottleneck_cache_memory_latency > 20", + "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency" + }, + { + "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", + "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", + "MetricGroup": "Cor;tma_issueComp", + "MetricName": "tma_info_bottleneck_compute_bound_est", + "MetricThreshold": "tma_info_bottleneck_compute_bound_est > 20", + "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: " }, { "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code", + "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code", "MetricGroup": "Fed;FetchBW;Frontend", "MetricName": "tma_info_bottleneck_instruction_fetch_bw", "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20" }, { - "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))", - "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", - "MetricName": "tma_info_bottleneck_memory_bandwidth", - "MetricThreshold": "tma_info_bottleneck_memory_bandwidth > 20", - "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" + "BriefDescription": "Total pipeline cost of irregular execution (e.g", + "MetricExpr": "100 * (tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + tma_core_bound * RS_EVENTS.EMPTY_CYCLES / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", + "MetricGroup": "Bad;Cor;Ret;tma_issueMS", + "MetricName": "tma_info_bottleneck_irregular_overhead", + "MetricThreshold": "tma_info_bottleneck_irregular_overhead > 10", + "PublicDescription": "Total pipeline cost of irregular execution (e.g. FP-assists in HPC, Wait time with work imbalance multithreaded workloads, overhead in system services or virtualized environments). Related metrics: tma_microcode_sequencer, tma_ms_switches" }, { "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))", + "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))", "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", "MetricName": "tma_info_bottleneck_memory_data_tlbs", "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20", - "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store" + "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_synchronization" }, { - "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound))", - "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", - "MetricName": "tma_info_bottleneck_memory_latency", - "MetricThreshold": "tma_info_bottleneck_memory_latency > 20", - "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency" + "BriefDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) * tma_remote_cache / (tma_local_mem + tma_remote_cache + tma_remote_mem) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))", + "MetricGroup": "Mem;Offcore;tma_issueTLB", + "MetricName": "tma_info_bottleneck_memory_synchronization", + "MetricThreshold": "tma_info_bottleneck_memory_synchronization > 10", + "PublicDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs" }, { "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", + "MetricExpr": "100 * (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", "MetricName": "tma_info_bottleneck_mispredictions", "MetricThreshold": "tma_info_bottleneck_mispredictions > 20", "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers" }, { + "BriefDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class)", + "MetricExpr": "100 - (tma_info_bottleneck_big_code + tma_info_bottleneck_instruction_fetch_bw + tma_info_bottleneck_mispredictions + tma_info_bottleneck_cache_memory_bandwidth + tma_info_bottleneck_cache_memory_latency + tma_info_bottleneck_memory_data_tlbs + tma_info_bottleneck_memory_synchronization + tma_info_bottleneck_compute_bound_est + tma_info_bottleneck_irregular_overhead + tma_info_bottleneck_branching_overhead + tma_info_bottleneck_base_non_br)", + "MetricGroup": "Cor;Offcore", + "MetricName": "tma_info_bottleneck_other_bottlenecks", + "MetricThreshold": "tma_info_bottleneck_other_bottlenecks > 20", + "PublicDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls." + }, + { "BriefDescription": "Fraction of branches that are CALL or RET", "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches", @@ -735,7 +812,7 @@ { "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.COND - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches", "MetricName": "tma_info_branches_jump" }, @@ -752,9 +829,15 @@ "MetricName": "tma_info_core_coreipc" }, { + "BriefDescription": "uops Executed per Cycle", + "MetricExpr": "UOPS_EXECUTED.THREAD / tma_info_thread_clks", + "MetricGroup": "Power", + "MetricName": "tma_info_core_epc" + }, + { "BriefDescription": "Floating Point Operations Per Cycle", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks", "MetricGroup": "Flops;Ret", "MetricName": "tma_info_core_flopc" }, @@ -766,19 +849,12 @@ "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)", + "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "tma_info_core_ilp" }, { - "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear)", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BadSpec;BrMispredicts;TopdownL1;tma_L1_group", - "MetricName": "tma_info_core_ipmispredict", - "MetricgroupNoGroup": "TopdownL1" - }, - { "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", "MetricExpr": "IDQ.DSB_UOPS / (IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS)", "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", @@ -849,7 +925,7 @@ "MetricGroup": "Flops;InsType", "MetricName": "tma_info_inst_mix_iparith", "MetricThreshold": "tma_info_inst_mix_iparith < 10", - "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." + "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW." }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", @@ -857,7 +933,7 @@ "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx128", "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10", - "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", @@ -865,7 +941,7 @@ "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx256", "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10", - "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)", @@ -873,7 +949,7 @@ "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx512", "MetricThreshold": "tma_info_inst_mix_iparith_avx512 < 10", - "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", @@ -881,7 +957,7 @@ "MetricGroup": "Flops;FpScalar;InsType", "MetricName": "tma_info_inst_mix_iparith_scalar_dp", "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10", - "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", @@ -889,7 +965,7 @@ "MetricGroup": "Flops;FpScalar;InsType", "MetricName": "tma_info_inst_mix_iparith_scalar_sp", "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10", - "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", @@ -908,7 +984,7 @@ { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", "MetricGroup": "Flops;InsType", "MetricName": "tma_info_inst_mix_ipflop", "MetricThreshold": "tma_info_inst_mix_ipflop < 10" @@ -943,16 +1019,23 @@ "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_lcp" }, { + "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "tma_info_memory_tlb_code_stlb_mpki", + "MetricGroup": "Fed;MemoryTLB;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_code_stlb_mpki", + "MetricgroupNoGroup": "TopdownL1" + }, + { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l1d_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l1d_cache_fill_bw" + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t" }, { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l2_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l2_cache_fill_bw" + "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t" }, { "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction", @@ -968,124 +1051,214 @@ }, { "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l3_cache_access_bw", "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_memory_core_l3_cache_access_bw" + "MetricName": "tma_info_memory_core_l3_cache_access_bw_2t" }, { "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l3_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l3_cache_fill_bw" + "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t" + }, + { + "BriefDescription": "Average Parallel L2 cache miss data reads", + "MetricExpr": "tma_info_memory_latency_data_l2_mlp", + "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_data_l2_mlp", + "MetricgroupNoGroup": "TopdownL1" }, { "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_fb_hpki" }, { + "BriefDescription": "", + "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l1d_cache_fill_bw" + }, + { + "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / (duration_time * 1e3 / 1e3)", + "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t", + "MetricgroupNoGroup": "TopdownL1" + }, + { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l1mpki" }, { "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l1mpki_load" }, { + "BriefDescription": "", + "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l2_cache_fill_bw" + }, + { + "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / (duration_time * 1e3 / 1e3)", + "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_l2_cache_fill_bw_2t", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction", + "MetricExpr": "1e3 * L2_LINES_OUT.NON_SILENT / INST_RETIRED.ANY", + "MetricGroup": "L2Evicts;Mem;Server;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_l2_evictions_nonsilent_pki", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo instruction where the evicted lines are dropped (no writeback to L3 or memory)", + "MetricExpr": "1e3 * L2_LINES_OUT.SILENT / INST_RETIRED.ANY", + "MetricGroup": "L2Evicts;Mem;Server;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_l2_evictions_silent_pki", + "MetricgroupNoGroup": "TopdownL1" + }, + { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l2hpki_all" }, { "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l2hpki_load" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "Backend;CacheMisses;Mem", + "MetricGroup": "Backend;CacheHits;Mem", "MetricName": "tma_info_memory_l2mpki" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem;Offcore", + "MetricGroup": "CacheHits;Mem;Offcore", "MetricName": "tma_info_memory_l2mpki_all" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l2mpki_load" }, { + "BriefDescription": "", + "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_l3_cache_access_bw" + }, + { + "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / (duration_time * 1e3 / 1e3)", + "MetricGroup": "Mem;MemoryBW;Offcore;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_l3_cache_access_bw_2t", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l3_cache_fill_bw" + }, + { + "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / (duration_time * 1e3 / 1e3)", + "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_l3_cache_fill_bw_2t", + "MetricgroupNoGroup": "TopdownL1" + }, + { "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "Mem", "MetricName": "tma_info_memory_l3mpki" }, { - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", - "MetricGroup": "Mem;MemoryBound;MemoryLat", - "MetricName": "tma_info_memory_load_miss_real_latency" + "BriefDescription": "Average Parallel L2 cache miss data reads", + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_latency_data_l2_mlp" }, { - "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", - "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBW;MemoryBound", - "MetricName": "tma_info_memory_mlp", - "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" + "BriefDescription": "Average Latency for L2 cache miss demand Loads", + "MetricExpr": "tma_info_memory_load_l2_miss_latency", + "MetricGroup": "Memory_Lat;Offcore", + "MetricName": "tma_info_memory_latency_load_l2_miss_latency" }, { - "BriefDescription": "Average Parallel L2 cache miss data reads", - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", + "BriefDescription": "Average Parallel L2 cache miss demand Loads", + "MetricExpr": "tma_info_memory_load_l2_mlp", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_memory_oro_data_l2_mlp" + "MetricName": "tma_info_memory_latency_load_l2_mlp" }, { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", - "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_memory_oro_load_l2_miss_latency" + "MetricGroup": "Memory_Lat;Offcore;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_load_l2_miss_latency", + "MetricgroupNoGroup": "TopdownL1" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", - "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_memory_oro_load_l2_mlp" + "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_load_l2_mlp", + "MetricgroupNoGroup": "TopdownL1" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t" + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", + "MetricGroup": "Mem;MemoryBound;MemoryLat", + "MetricName": "tma_info_memory_load_miss_real_latency" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t" + "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "tma_info_memory_tlb_load_stlb_mpki", + "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_load_stlb_mpki", + "MetricgroupNoGroup": "TopdownL1" }, { - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l3_cache_access_bw", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t" + "BriefDescription": "Un-cacheable retired load per kilo instruction", + "MetricExpr": "tma_info_memory_uc_load_pki", + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_mix_uc_load_pki" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t" + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", + "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Mem;MemoryBW;MemoryBound", + "MetricName": "tma_info_memory_mlp", + "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" + }, + { + "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", + "MetricExpr": "tma_info_memory_tlb_page_walks_utilization", + "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_page_walks_utilization", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "tma_info_memory_tlb_store_stlb_mpki", + "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_store_stlb_mpki", + "MetricgroupNoGroup": "TopdownL1" }, { "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", @@ -1114,55 +1287,78 @@ "MetricName": "tma_info_memory_tlb_store_stlb_mpki" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", - "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", + "BriefDescription": "Un-cacheable retired load per kilo instruction", + "MetricExpr": "1e3 * MEM_LOAD_MISC_RETIRED.UC / INST_RETIRED.ANY", + "MetricGroup": "Mem;TopdownL1;tma_L1_group", + "MetricName": "tma_info_memory_uc_load_pki", + "MetricgroupNoGroup": "TopdownL1" + }, + { + "BriefDescription": "", + "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@)", "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", "MetricName": "tma_info_pipeline_execute" }, { + "BriefDescription": "Instructions per a microcode Assist invocation", + "MetricExpr": "INST_RETIRED.ANY / (FP_ASSIST.ANY + OTHER_ASSISTS.ANY)", + "MetricGroup": "MicroSeq;Pipeline;Ret;Retire", + "MetricName": "tma_info_pipeline_ipassist", + "MetricThreshold": "tma_info_pipeline_ipassist < 100e3", + "PublicDescription": "Instructions per a microcode Assist invocation. See Assists tree node for details (lower number means higher occurrence rate)" + }, + { "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@UOPS_RETIRED.RETIRE_SLOTS\\,cmask\\=1@", "MetricGroup": "Pipeline;Ret", "MetricName": "tma_info_pipeline_retire" }, { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", "MetricGroup": "Power;Summary", - "MetricName": "tma_info_system_average_frequency" + "MetricName": "tma_info_system_core_frequency" }, { - "BriefDescription": "Average CPU Utilization", + "BriefDescription": "Average CPU Utilization (percentage)", "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", "MetricGroup": "HPC;Summary", "MetricName": "tma_info_system_cpu_utilization" }, { + "BriefDescription": "Average number of utilized CPUs", + "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization", + "MetricGroup": "Summary", + "MetricName": "tma_info_system_cpus_utilized" + }, + { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW", "MetricName": "tma_info_system_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" }, { "BriefDescription": "Giga Floating Point Operations Per Second", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time", "MetricGroup": "Cor;Flops;HPC", "MetricName": "tma_info_system_gflops", - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width" }, { "BriefDescription": "Average IO (network or disk) Bandwidth Use for Reads [GB / sec]", "MetricExpr": "(UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART3) * 4 / 1e9 / duration_time", - "MetricGroup": "IoBW;Mem;Server;SoC", - "MetricName": "tma_info_system_io_read_bw" + "MetricGroup": "IoBW;MemOffcore;Server;SoC", + "MetricName": "tma_info_system_io_read_bw", + "PublicDescription": "Average IO (network or disk) Bandwidth Use for Reads [GB / sec]. Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU" }, { "BriefDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]", "MetricExpr": "(UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART3) * 4 / 1e9 / duration_time", - "MetricGroup": "IoBW;Mem;Server;SoC", - "MetricName": "tma_info_system_io_write_bw" + "MetricGroup": "IoBW;MemOffcore;Server;SoC", + "MetricName": "tma_info_system_io_write_bw", + "PublicDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]. Bandwidth of IO writes that are initiated by end device controllers that are writing memory to the CPU" }, { "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", @@ -1187,7 +1383,7 @@ { "BriefDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]", "MetricExpr": "1e9 * (UNC_M_RPQ_OCCUPANCY / UNC_M_RPQ_INSERTS) / imc_0@event\\=0x0@", - "MetricGroup": "Mem;MemoryLat;Server;SoC", + "MetricGroup": "MemOffcore;MemoryLat;Server;SoC", "MetricName": "tma_info_system_mem_dram_read_latency", "PublicDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches" }, @@ -1247,6 +1443,12 @@ "MetricName": "tma_info_system_turbo_utilization" }, { + "BriefDescription": "Measured Average Uncore Frequency for the SoC [GHz]", + "MetricExpr": "tma_info_system_socket_clks / 1e9 / duration_time", + "MetricGroup": "SoC", + "MetricName": "tma_info_system_uncore_frequency" + }, + { "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", "MetricExpr": "CPU_CLK_UNHALTED.THREAD", "MetricGroup": "Pipeline", @@ -1293,8 +1495,8 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", - "MetricExpr": "ICACHE_64B.IFTAG_STALL / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricExpr": "ICACHE_TAG.STALLS / tma_info_thread_clks", + "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS", @@ -1303,7 +1505,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1", @@ -1313,7 +1515,7 @@ "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks)", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L2_HIT_PS", @@ -1322,24 +1524,24 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", - "MetricExpr": "17 * tma_info_system_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", + "MetricExpr": "17 * tma_info_system_core_frequency * (MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2)) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_memory_latency, tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_cache_memory_latency, tma_mem_latency", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", - "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks", + "MetricExpr": "DECODE.LCP / tma_info_thread_clks", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_lcp", "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -1353,7 +1555,7 @@ "MetricName": "tma_light_operations", "MetricThreshold": "tma_light_operations > 0.6", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST", "ScaleUnit": "100%" }, { @@ -1384,10 +1586,10 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory", - "MetricExpr": "59.5 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "MetricExpr": "59.5 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Server;TopdownL5;tma_L5_group;tma_mem_latency_group", - "MetricName": "tma_local_dram", - "MetricThreshold": "tma_local_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", + "MetricName": "tma_local_mem", + "MetricThreshold": "tma_local_mem > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory. Caching will improve the latency and increase performance. Sample with: MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM_PS", "ScaleUnit": "100%" }, @@ -1412,21 +1614,21 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_memory_latency, tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_cache_memory_latency, tma_l3_hit_latency", "ScaleUnit": "100%" }, { @@ -1454,7 +1656,7 @@ "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", "MetricName": "tma_microcode_sequencer", "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1", - "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_ms_switches", + "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_ms_switches", "ScaleUnit": "100%" }, { @@ -1471,17 +1673,17 @@ "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", + "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", "ScaleUnit": "100%" }, { - "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued", + "BriefDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles)", "MetricExpr": "UOPS_ISSUED.VECTOR_WIDTH_MISMATCH / UOPS_ISSUED.ANY", "MetricGroup": "TopdownL5;tma_L5_group;tma_issueMV;tma_ports_utilized_0_group", "MetricName": "tma_mixing_vectors", "MetricThreshold": "tma_mixing_vectors > 0.05", - "PublicDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued. Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches", + "PublicDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles). Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches", "ScaleUnit": "100%" }, { @@ -1490,13 +1692,13 @@ "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO", "MetricName": "tma_ms_switches", "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation", + "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused", "MetricExpr": "tma_light_operations * (BR_INST_RETIRED.ALL_BRANCHES - UOPS_RETIRED.MACRO_FUSED) / UOPS_RETIRED.RETIRE_SLOTS", - "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", + "MetricGroup": "Branches;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_non_fused_branches", "MetricThreshold": "tma_non_fused_branches > 0.1 & tma_light_operations > 0.6", "PublicDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused. Non-conditional branches like direct JMP or CALL would count here. Can be used to examine fusible conditional jumps that were not fused.", @@ -1505,15 +1707,15 @@ { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions", "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / UOPS_RETIRED.RETIRE_SLOTS", - "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", + "MetricGroup": "Pipeline;TopdownL4;tma_L4_group;tma_other_light_ops_group", "MetricName": "tma_nop_instructions", - "MetricThreshold": "tma_nop_instructions > 0.1 & tma_light_operations > 0.6", + "MetricThreshold": "tma_nop_instructions > 0.1 & (tma_other_light_ops > 0.3 & tma_light_operations > 0.6)", "PublicDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. start address of a function or loop body. Sample with: INST_RETIRED.NOP", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes", - "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches + tma_nop_instructions))", + "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches))", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_other_light_ops", "MetricThreshold": "tma_other_light_ops > 0.3 & tma_light_operations > 0.6", @@ -1521,6 +1723,22 @@ "ScaleUnit": "100%" }, { + "BriefDescription": "This metric estimates fraction of slots the CPU was stalled due to other cases of misprediction (non-retired x86 branches or other types).", + "MetricExpr": "max(tma_branch_mispredicts * (1 - BR_MISP_RETIRED.ALL_BRANCHES / (INT_MISC.CLEARS_COUNT - MACHINE_CLEARS.COUNT)), 0.0001)", + "MetricGroup": "BrMispredicts;TopdownL3;tma_L3_group;tma_branch_mispredicts_group", + "MetricName": "tma_other_mispredicts", + "MetricThreshold": "tma_other_mispredicts > 0.05 & (tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15)", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Nukes (Machine Clears) not related to memory ordering.", + "MetricExpr": "max(tma_machine_clears * (1 - MACHINE_CLEARS.MEMORY_ORDERING / MACHINE_CLEARS.COUNT), 0.0001)", + "MetricGroup": "Machine_Clears;TopdownL3;tma_L3_group;tma_machine_clears_group", + "MetricName": "tma_other_nukes", + "MetricThreshold": "tma_other_nukes > 0.05 & (tma_machine_clears > 0.1 & tma_bad_speculation > 0.15)", + "ScaleUnit": "100%" + }, + { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)", "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / tma_info_core_core_clks", "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", @@ -1575,12 +1793,12 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)", + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU)", "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_6", "MetricThreshold": "tma_port_6 > 0.6", - "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2", "ScaleUnit": "100%" }, { @@ -1594,7 +1812,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", - "MetricExpr": "((EXE_ACTIVITY.EXE_BOUND_0_PORTS + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)", + "MetricExpr": "((tma_ports_utilized_0 * tma_info_thread_clks + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -1603,7 +1821,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_NONE / 2 if #SMT_on else CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_core_core_clks", + "MetricExpr": "(EXE_ACTIVITY.EXE_BOUND_0_PORTS + tma_core_bound * RS_EVENTS.EMPTY_CYCLES) / tma_info_thread_clks * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1633,13 +1851,13 @@ "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_GE_3 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_3) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", - "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", + "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues", "MetricConstraint": "NO_GROUP_EVENTS_NMI", - "MetricExpr": "(89.5 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + 89.5 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "MetricExpr": "(89.5 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + 89.5 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Offcore;Server;Snoop;TopdownL5;tma_L5_group;tma_issueSyncxn;tma_mem_latency_group", "MetricName": "tma_remote_cache", "MetricThreshold": "tma_remote_cache > 0.05 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1648,10 +1866,10 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory", - "MetricExpr": "127 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "MetricExpr": "127 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Server;Snoop;TopdownL5;tma_L5_group;tma_mem_latency_group", - "MetricName": "tma_remote_dram", - "MetricThreshold": "tma_remote_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", + "MetricName": "tma_remote_mem", + "MetricThreshold": "tma_remote_mem > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory. This is caused often due to non-optimal NUMA allocations. #link to NUMA article. Sample with: MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM_PS", "ScaleUnit": "100%" }, @@ -1668,9 +1886,9 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations", "MetricExpr": "PARTIAL_RAT_STALLS.SCOREBOARD / tma_info_thread_clks", - "MetricGroup": "PortsUtil;TopdownL5;tma_L5_group;tma_issueSO;tma_ports_utilized_0_group", + "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group;tma_issueSO", "MetricName": "tma_serializing_operation", - "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))", + "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. Sample with: PARTIAL_RAT_STALLS.SCOREBOARD. Related metrics: tma_ms_switches", "ScaleUnit": "100%" }, @@ -1699,7 +1917,7 @@ "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%" }, { @@ -1757,10 +1975,10 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", "MetricExpr": "9 * BACLEARS.ANY / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", + "MetricGroup": "BigFootprint;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", "MetricName": "tma_unknown_branches", "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit). Sample with: BACLEARS.ANY", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit) hence called Unknown Branches. Sample with: BACLEARS.ANY", "ScaleUnit": "100%" }, { diff --git a/tools/perf/pmu-events/arch/x86/skylakex/uncore-power.json b/tools/perf/pmu-events/arch/x86/skylakex/uncore-power.json index c6254af7a4..ceef460464 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/uncore-power.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/uncore-power.json @@ -144,6 +144,7 @@ "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C0", "PerPkg": "1", "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", + "UMask": "0x40", "Unit": "PCU" }, { @@ -152,6 +153,7 @@ "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C3", "PerPkg": "1", "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", + "UMask": "0x80", "Unit": "PCU" }, { @@ -160,6 +162,7 @@ "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C6", "PerPkg": "1", "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", + "UMask": "0xc0", "Unit": "PCU" }, { diff --git a/tools/perf/pmu-events/arch/x86/snowridgex/uncore-power.json b/tools/perf/pmu-events/arch/x86/snowridgex/uncore-power.json index a61ffca2df..dcf268467d 100644 --- a/tools/perf/pmu-events/arch/x86/snowridgex/uncore-power.json +++ b/tools/perf/pmu-events/arch/x86/snowridgex/uncore-power.json @@ -150,6 +150,7 @@ "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C0", "PerPkg": "1", "PublicDescription": "Number of cores in C-State : C0 and C1 : This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", + "UMask": "0x40", "Unit": "PCU" }, { @@ -158,6 +159,7 @@ "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C3", "PerPkg": "1", "PublicDescription": "Number of cores in C-State : C3 : This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", + "UMask": "0x80", "Unit": "PCU" }, { @@ -166,6 +168,7 @@ "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C6", "PerPkg": "1", "PublicDescription": "Number of cores in C-State : C6 and C7 : This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", + "UMask": "0xc0", "Unit": "PCU" }, { diff --git a/tools/perf/pmu-events/arch/x86/tigerlake/metricgroups.json b/tools/perf/pmu-events/arch/x86/tigerlake/metricgroups.json index a151ba9ccc..5452a1448d 100644 --- a/tools/perf/pmu-events/arch/x86/tigerlake/metricgroups.json +++ b/tools/perf/pmu-events/arch/x86/tigerlake/metricgroups.json @@ -2,10 +2,10 @@ "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", - "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "CodeGen": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -25,7 +25,9 @@ "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", @@ -63,8 +65,10 @@ "tma_L5_group": "Metrics for top-down breakdown at level 5", "tma_L6_group": "Metrics for top-down breakdown at level 6", "tma_alu_op_utilization_group": "Metrics contributing to tma_alu_op_utilization category", + "tma_assists_group": "Metrics contributing to tma_assists category", "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category", "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category", + "tma_branch_mispredicts_group": "Metrics contributing to tma_branch_mispredicts category", "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category", "tma_core_bound_group": "Metrics contributing to tma_core_bound category", "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category", @@ -77,9 +81,9 @@ "tma_frontend_bound_group": "Metrics contributing to tma_frontend_bound category", "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category", "tma_issue2P": "Metrics related by the issue $issue2P", - "tma_issueBC": "Metrics related by the issue $issueBC", "tma_issueBM": "Metrics related by the issue $issueBM", "tma_issueBW": "Metrics related by the issue $issueBW", + "tma_issueComp": "Metrics related by the issue $issueComp", "tma_issueD0": "Metrics related by the issue $issueD0", "tma_issueFB": "Metrics related by the issue $issueFB", "tma_issueFL": "Metrics related by the issue $issueFL", @@ -99,10 +103,12 @@ "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category", "tma_light_operations_group": "Metrics contributing to tma_light_operations category", "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category", + "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category", "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", "tma_mite_group": "Metrics contributing to tma_mite category", + "tma_other_light_ops_group": "Metrics contributing to tma_other_light_ops category", "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category", "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category", "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category", diff --git a/tools/perf/pmu-events/arch/x86/tigerlake/other.json b/tools/perf/pmu-events/arch/x86/tigerlake/other.json index 55f3048bcf..117b18abca 100644 --- a/tools/perf/pmu-events/arch/x86/tigerlake/other.json +++ b/tools/perf/pmu-events/arch/x86/tigerlake/other.json @@ -19,7 +19,7 @@ "BriefDescription": "Core cycles where the core was running in a manner where Turbo may be clipped to the AVX512 turbo schedule.", "EventCode": "0x28", "EventName": "CORE_POWER.LVL2_TURBO_LICENSE", - "PublicDescription": "Core cycles where the core was running with power-delivery for license level 2 (introduced in Skylake Server microarchtecture). This includes high current AVX 512-bit instructions.", + "PublicDescription": "Core cycles where the core was running with power-delivery for license level 2 (introduced in Skylake Server microarchitecture). This includes high current AVX 512-bit instructions.", "SampleAfterValue": "200003", "UMask": "0x20" }, diff --git a/tools/perf/pmu-events/arch/x86/tigerlake/pipeline.json b/tools/perf/pmu-events/arch/x86/tigerlake/pipeline.json index 541bf1dd16..4f85d53ede 100644 --- a/tools/perf/pmu-events/arch/x86/tigerlake/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/tigerlake/pipeline.json @@ -537,7 +537,7 @@ "BriefDescription": "Cycles when Reservation Station (RS) is empty for the thread", "EventCode": "0x5e", "EventName": "RS_EVENTS.EMPTY_CYCLES", - "PublicDescription": "Counts cycles during which the reservation station (RS) is empty for this logical processor. This is usually caused when the front-end pipeline runs into stravation periods (e.g. branch mispredictions or i-cache misses)", + "PublicDescription": "Counts cycles during which the reservation station (RS) is empty for this logical processor. This is usually caused when the front-end pipeline runs into starvation periods (e.g. branch mispredictions or i-cache misses)", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -561,14 +561,6 @@ "UMask": "0x2" }, { - "BriefDescription": "TMA slots wasted due to incorrect speculation by branch mispredictions", - "EventCode": "0xa4", - "EventName": "TOPDOWN.BR_MISPREDICT_SLOTS", - "PublicDescription": "Number of TMA slots that were wasted due to incorrect speculation by branch mispredictions. This event estimates number of operations that were issued but not retired from the speculative path as well as the out-of-order engine recovery past a branch misprediction.", - "SampleAfterValue": "10000003", - "UMask": "0x8" - }, - { "BriefDescription": "TMA slots available for an unhalted logical processor. Fixed counter - architectural event", "EventName": "TOPDOWN.SLOTS", "PublicDescription": "Number of available slots for an unhalted logical processor. The event increments by machine-width of the narrowest pipeline as employed by the Top-down Microarchitecture Analysis method (TMA). The count is distributed among unhalted logical processors (hyper-threads) who share the same physical core. Software can use this event as the denominator for the top-level metrics of the TMA method. This architectural event is counted on a designated fixed counter (Fixed Counter 3).", diff --git a/tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json b/tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json index f11860f39c..8ae4f2474b 100644 --- a/tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json +++ b/tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json @@ -98,12 +98,12 @@ "MetricExpr": "(UOPS_DISPATCHED.PORT_0 + UOPS_DISPATCHED.PORT_1 + UOPS_DISPATCHED.PORT_5 + UOPS_DISPATCHED.PORT_6) / (4 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", - "MetricThreshold": "tma_alu_op_utilization > 0.6", + "MetricThreshold": "tma_alu_op_utilization > 0.4", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", - "MetricExpr": "100 * ASSISTS.ANY / tma_info_thread_slots", + "MetricExpr": "34 * ASSISTS.ANY / tma_info_thread_slots", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_assists", "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", @@ -113,7 +113,7 @@ { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", "DefaultMetricgroupName": "TopdownL1", - "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * cpu@INT_MISC.RECOVERY_CYCLES\\,cmask\\=1\\,edge@ / tma_info_thread_slots", + "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / tma_info_thread_slots", "MetricGroup": "Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", "MetricThreshold": "tma_backend_bound > 0.2", @@ -135,7 +135,7 @@ { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions.", "MetricExpr": "tma_light_operations * BR_INST_RETIRED.ALL_BRANCHES / (tma_retiring * tma_info_thread_slots)", - "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", + "MetricGroup": "Branches;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_branch_instructions", "MetricThreshold": "tma_branch_instructions > 0.1 & tma_light_operations > 0.6", "ScaleUnit": "100%" @@ -180,7 +180,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(49 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 48 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "MetricExpr": "(49 * tma_info_system_core_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 48 * tma_info_system_core_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_contested_accesses", "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -200,7 +200,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "48 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD + MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (1 - OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "MetricExpr": "48 * tma_info_system_core_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD + MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (1 - OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_data_sharing", "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -212,7 +212,7 @@ "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group", "MetricName": "tma_decoder0_alone", - "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35))", + "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & tma_fetch_bandwidth > 0.2)", "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. Related metrics: tma_few_uops_instructions", "ScaleUnit": "100%" }, @@ -240,7 +240,7 @@ "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / tma_info_core_core_clks / 2", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)", + "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", "ScaleUnit": "100%" }, @@ -259,7 +259,7 @@ "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group", "MetricName": "tma_dtlb_load", "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization", "ScaleUnit": "100%" }, { @@ -268,12 +268,12 @@ "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group", "MetricName": "tma_dtlb_store", "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", - "MetricExpr": "54 * tma_info_system_average_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks", + "MetricExpr": "54 * tma_info_system_core_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group", "MetricName": "tma_false_sharing", "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -286,7 +286,7 @@ "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%" }, { @@ -294,7 +294,7 @@ "MetricExpr": "max(0, tma_frontend_bound - tma_fetch_latency)", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.2", "MetricgroupNoGroup": "TopdownL2", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" @@ -328,6 +328,15 @@ "ScaleUnit": "100%" }, { + "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists", + "MetricExpr": "34 * ASSISTS.FP / tma_info_thread_slots", + "MetricGroup": "HPC;TopdownL5;tma_L5_group;tma_assists_group", + "MetricName": "tma_fp_assists", + "MetricThreshold": "tma_fp_assists > 0.1", + "PublicDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists. FP Assist may apply when working with very small floating point values (so-called Denormals).", + "ScaleUnit": "100%" + }, + { "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired", "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", @@ -390,13 +399,13 @@ "MetricName": "tma_heavy_operations", "MetricThreshold": "tma_heavy_operations > 0.1", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .)", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", - "MetricExpr": "ICACHE_16B.IFDATA_STALL / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricExpr": "ICACHE_DATA.STALLS / tma_info_thread_clks", + "MetricGroup": "BigFootprint;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_icache_misses", "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS", @@ -405,7 +414,7 @@ { "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricExpr": "tma_info_bottleneck_mispredictions * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES / 100", "MetricGroup": "Bad;BrMispredicts;tma_issueBM", "MetricName": "tma_info_bad_spec_branch_misprediction_cost", "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers" @@ -446,6 +455,12 @@ "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200" }, { + "BriefDescription": "Speculative to Retired ratio of all clears (covering mispredicts and nukes)", + "MetricExpr": "INT_MISC.CLEARS_COUNT / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)", + "MetricGroup": "BrMispredicts", + "MetricName": "tma_info_bad_spec_spec_clears_ratio" + }, + { "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)", @@ -472,67 +487,102 @@ "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: " }, { + "BriefDescription": "Total pipeline cost of \"useful operations\" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead.", + "MetricExpr": "100 * (tma_retiring - (BR_INST_RETIRED.ALL_BRANCHES + BR_INST_RETIRED.NEAR_CALL) / tma_info_thread_slots - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", + "MetricGroup": "Ret", + "MetricName": "tma_info_bottleneck_base_non_br", + "MetricThreshold": "tma_info_bottleneck_base_non_br > 20" + }, + { "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", - "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC", + "MetricGroup": "BigFootprint;Fed;Frontend;IcMiss;MemoryTLB", "MetricName": "tma_info_bottleneck_big_code", - "MetricThreshold": "tma_info_bottleneck_big_code > 20", - "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_bottleneck_branching_overhead" + "MetricThreshold": "tma_info_bottleneck_big_code > 20" }, { "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", - "MetricExpr": "100 * ((BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_thread_slots)", - "MetricGroup": "Ret;tma_issueBC", + "MetricExpr": "100 * ((BR_INST_RETIRED.ALL_BRANCHES + BR_INST_RETIRED.NEAR_CALL) / tma_info_thread_slots)", + "MetricGroup": "Ret", "MetricName": "tma_info_bottleneck_branching_overhead", - "MetricThreshold": "tma_info_bottleneck_branching_overhead > 10", - "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_bottleneck_big_code" + "MetricThreshold": "tma_info_bottleneck_branching_overhead > 5" + }, + { + "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))", + "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", + "MetricName": "tma_info_bottleneck_cache_memory_bandwidth", + "MetricThreshold": "tma_info_bottleneck_cache_memory_bandwidth > 20", + "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" + }, + { + "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", + "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", + "MetricName": "tma_info_bottleneck_cache_memory_latency", + "MetricThreshold": "tma_info_bottleneck_cache_memory_latency > 20", + "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency" + }, + { + "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", + "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", + "MetricGroup": "Cor;tma_issueComp", + "MetricName": "tma_info_bottleneck_compute_bound_est", + "MetricThreshold": "tma_info_bottleneck_compute_bound_est > 20", + "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: " }, { "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code", + "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code", "MetricGroup": "Fed;FetchBW;Frontend", "MetricName": "tma_info_bottleneck_instruction_fetch_bw", "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20" }, { - "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))", - "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", - "MetricName": "tma_info_bottleneck_memory_bandwidth", - "MetricThreshold": "tma_info_bottleneck_memory_bandwidth > 20", - "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" + "BriefDescription": "Total pipeline cost of irregular execution (e.g", + "MetricExpr": "100 * (tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + tma_core_bound * RS_EVENTS.EMPTY_CYCLES / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", + "MetricGroup": "Bad;Cor;Ret;tma_issueMS", + "MetricName": "tma_info_bottleneck_irregular_overhead", + "MetricThreshold": "tma_info_bottleneck_irregular_overhead > 10", + "PublicDescription": "Total pipeline cost of irregular execution (e.g. FP-assists in HPC, Wait time with work imbalance multithreaded workloads, overhead in system services or virtualized environments). Related metrics: tma_microcode_sequencer, tma_ms_switches" }, { "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", + "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", "MetricName": "tma_info_bottleneck_memory_data_tlbs", "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20", - "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store" + "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_synchronization" }, { - "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound))", - "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", - "MetricName": "tma_info_bottleneck_memory_latency", - "MetricThreshold": "tma_info_bottleneck_memory_latency > 20", - "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency" + "BriefDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)", + "MetricExpr": "100 * (tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))", + "MetricGroup": "Mem;Offcore;tma_issueTLB", + "MetricName": "tma_info_bottleneck_memory_synchronization", + "MetricThreshold": "tma_info_bottleneck_memory_synchronization > 10", + "PublicDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs" }, { "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", + "MetricExpr": "100 * (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", "MetricName": "tma_info_bottleneck_mispredictions", "MetricThreshold": "tma_info_bottleneck_mispredictions > 20", "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers" }, { + "BriefDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class)", + "MetricExpr": "100 - (tma_info_bottleneck_big_code + tma_info_bottleneck_instruction_fetch_bw + tma_info_bottleneck_mispredictions + tma_info_bottleneck_cache_memory_bandwidth + tma_info_bottleneck_cache_memory_latency + tma_info_bottleneck_memory_data_tlbs + tma_info_bottleneck_memory_synchronization + tma_info_bottleneck_compute_bound_est + tma_info_bottleneck_irregular_overhead + tma_info_bottleneck_branching_overhead + tma_info_bottleneck_base_non_br)", + "MetricGroup": "Cor;Offcore", + "MetricName": "tma_info_bottleneck_other_bottlenecks", + "MetricThreshold": "tma_info_bottleneck_other_bottlenecks > 20", + "PublicDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls." + }, + { "BriefDescription": "Fraction of branches that are CALL or RET", "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches", @@ -564,7 +614,7 @@ }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", - "MetricExpr": "CPU_CLK_UNHALTED.DISTRIBUTED", + "MetricExpr": "(CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else tma_info_thread_clks)", "MetricGroup": "SMT", "MetricName": "tma_info_core_core_clks" }, @@ -575,8 +625,14 @@ "MetricName": "tma_info_core_coreipc" }, { + "BriefDescription": "uops Executed per Cycle", + "MetricExpr": "UOPS_EXECUTED.THREAD / tma_info_thread_clks", + "MetricGroup": "Power", + "MetricName": "tma_info_core_epc" + }, + { "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks", "MetricGroup": "Flops;Ret", "MetricName": "tma_info_core_flopc" }, @@ -588,8 +644,8 @@ "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)", + "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "tma_info_core_ilp" }, @@ -669,7 +725,7 @@ "MetricGroup": "Flops;InsType", "MetricName": "tma_info_inst_mix_iparith", "MetricThreshold": "tma_info_inst_mix_iparith < 10", - "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." + "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW." }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", @@ -677,7 +733,7 @@ "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx128", "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10", - "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", @@ -685,7 +741,7 @@ "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx256", "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10", - "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)", @@ -693,7 +749,7 @@ "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx512", "MetricThreshold": "tma_info_inst_mix_iparith_avx512 < 10", - "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", @@ -701,7 +757,7 @@ "MetricGroup": "Flops;FpScalar;InsType", "MetricName": "tma_info_inst_mix_iparith_scalar_dp", "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10", - "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", @@ -709,7 +765,7 @@ "MetricGroup": "Flops;FpScalar;InsType", "MetricName": "tma_info_inst_mix_iparith_scalar_sp", "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10", - "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting." }, { "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", @@ -727,7 +783,7 @@ }, { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", "MetricGroup": "Flops;InsType", "MetricName": "tma_info_inst_mix_ipflop", "MetricThreshold": "tma_info_inst_mix_ipflop < 10" @@ -740,6 +796,12 @@ "MetricThreshold": "tma_info_inst_mix_ipload < 3" }, { + "BriefDescription": "Instructions per PAUSE (lower number means higher occurrence rate)", + "MetricExpr": "tma_info_inst_mix_instructions / MISC_RETIRED.PAUSE_INST", + "MetricGroup": "Flops;FpVector;InsType", + "MetricName": "tma_info_inst_mix_ippause" + }, + { "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES", "MetricGroup": "InsType", @@ -763,142 +825,154 @@ }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l1d_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l1d_cache_fill_bw" + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t" }, { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l2_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l2_cache_fill_bw" + "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t" }, { "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l3_cache_access_bw", "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_memory_core_l3_cache_access_bw" + "MetricName": "tma_info_memory_core_l3_cache_access_bw_2t" }, { "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", + "MetricExpr": "tma_info_memory_l3_cache_fill_bw", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_core_l3_cache_fill_bw" + "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t" }, { "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_fb_hpki" }, { + "BriefDescription": "", + "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l1d_cache_fill_bw" + }, + { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l1mpki" }, { "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l1mpki_load" }, { + "BriefDescription": "", + "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l2_cache_fill_bw" + }, + { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l2hpki_all" }, { "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l2hpki_load" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "Backend;CacheMisses;Mem", + "MetricGroup": "Backend;CacheHits;Mem", "MetricName": "tma_info_memory_l2mpki" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem;Offcore", + "MetricGroup": "CacheHits;Mem;Offcore", "MetricName": "tma_info_memory_l2mpki_all" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", + "MetricGroup": "CacheHits;Mem", "MetricName": "tma_info_memory_l2mpki_load" }, { - "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_memory_l3mpki" + "BriefDescription": "", + "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_l3_cache_access_bw" }, { - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", - "MetricGroup": "Mem;MemoryBound;MemoryLat", - "MetricName": "tma_info_memory_load_miss_real_latency" + "BriefDescription": "", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l3_cache_fill_bw" }, { - "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", - "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBW;MemoryBound", - "MetricName": "tma_info_memory_mlp", - "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" + "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_l3mpki" }, { "BriefDescription": "Average Parallel L2 cache miss data reads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_memory_oro_data_l2_mlp" + "MetricName": "tma_info_memory_latency_data_l2_mlp" }, { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_memory_oro_load_l2_miss_latency" + "MetricName": "tma_info_memory_latency_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=1@", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_memory_oro_load_l2_mlp" + "MetricName": "tma_info_memory_latency_load_l2_mlp" }, { "BriefDescription": "Average Latency for L3 cache miss demand Loads", "MetricExpr": "cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,umask\\=0x10@ / OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_memory_oro_load_l3_miss_latency" + "MetricName": "tma_info_memory_latency_load_l3_miss_latency" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t" + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", + "MetricGroup": "Mem;MemoryBound;MemoryLat", + "MetricName": "tma_info_memory_load_miss_real_latency" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t" + "BriefDescription": "\"Bus lock\" per kilo instruction", + "MetricExpr": "1e3 * SQ_MISC.BUS_LOCK / INST_RETIRED.ANY", + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_mix_bus_lock_pki" }, { - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l3_cache_access_bw", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t" + "BriefDescription": "Un-cacheable retired load per kilo instruction", + "MetricExpr": "1e3 * MEM_LOAD_MISC_RETIRED.UC / INST_RETIRED.ANY", + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_mix_uc_load_pki" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t" + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", + "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Mem;MemoryBW;MemoryBound", + "MetricName": "tma_info_memory_mlp", + "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" }, { "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", @@ -926,42 +1000,56 @@ "MetricName": "tma_info_memory_tlb_store_stlb_mpki" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", - "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", + "BriefDescription": "", + "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@)", "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", "MetricName": "tma_info_pipeline_execute" }, { + "BriefDescription": "Instructions per a microcode Assist invocation", + "MetricExpr": "INST_RETIRED.ANY / ASSISTS.ANY", + "MetricGroup": "MicroSeq;Pipeline;Ret;Retire", + "MetricName": "tma_info_pipeline_ipassist", + "MetricThreshold": "tma_info_pipeline_ipassist < 100e3", + "PublicDescription": "Instructions per a microcode Assist invocation. See Assists tree node for details (lower number means higher occurrence rate)" + }, + { "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", "MetricExpr": "tma_retiring * tma_info_thread_slots / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@", "MetricGroup": "Pipeline;Ret", "MetricName": "tma_info_pipeline_retire" }, { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", "MetricGroup": "Power;Summary", - "MetricName": "tma_info_system_average_frequency" + "MetricName": "tma_info_system_core_frequency" }, { - "BriefDescription": "Average CPU Utilization", + "BriefDescription": "Average CPU Utilization (percentage)", "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", "MetricGroup": "HPC;Summary", "MetricName": "tma_info_system_cpu_utilization" }, { + "BriefDescription": "Average number of utilized CPUs", + "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization", + "MetricGroup": "Summary", + "MetricName": "tma_info_system_cpus_utilized" + }, + { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", "MetricExpr": "64 * (arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@) / 1e6 / duration_time / 1e3", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW", "MetricName": "tma_info_system_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" }, { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time", "MetricGroup": "Cor;Flops;HPC", "MetricName": "tma_info_system_gflops", - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width" }, { "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", @@ -998,12 +1086,6 @@ "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)" }, { - "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)", - "MetricExpr": "(UNC_ARB_TRK_OCCUPANCY.ALL + UNC_ARB_DAT_OCCUPANCY.RD) / arb@event\\=0x81\\,umask\\=0x1@", - "MetricGroup": "Mem;SoC", - "MetricName": "tma_info_system_mem_request_latency" - }, - { "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0", "MetricExpr": "CORE_POWER.LVL0_TURBO_LICENSE / tma_info_core_core_clks", "MetricGroup": "Power", @@ -1097,8 +1179,8 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", - "MetricExpr": "ICACHE_64B.IFTAG_STALL / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricExpr": "ICACHE_TAG.STALLS / tma_info_thread_clks", + "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS", @@ -1107,7 +1189,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1", @@ -1117,7 +1199,7 @@ "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + L1D_PEND_MISS.FB_FULL_PERIODS) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks)", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L2_HIT_PS", @@ -1127,24 +1209,24 @@ "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks", - "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", - "MetricExpr": "17.5 * tma_info_system_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", + "MetricExpr": "17.5 * tma_info_system_core_frequency * (MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2)) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_memory_latency, tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_cache_memory_latency, tma_mem_latency", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", - "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks", + "MetricExpr": "DECODE.LCP / tma_info_thread_clks", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_lcp", "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -1158,7 +1240,7 @@ "MetricName": "tma_light_operations", "MetricThreshold": "tma_light_operations > 0.6", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST", "ScaleUnit": "100%" }, { @@ -1201,7 +1283,7 @@ "MetricExpr": "(LSD.CYCLES_ACTIVE - LSD.CYCLES_OK) / tma_info_core_core_clks / 2", "MetricGroup": "FetchBW;LSD;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_lsd", - "MetricThreshold": "tma_lsd > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)", + "MetricThreshold": "tma_lsd > 0.15 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit. LSD typically does well sustaining Uop supply. However; in some rare cases; optimal uop-delivery could not be reached for small loops whose size (in terms of number of uops) does not suit well the LSD structure.", "ScaleUnit": "100%" }, @@ -1216,21 +1298,21 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { - "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)", "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_memory_latency, tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_cache_memory_latency, tma_l3_hit_latency", "ScaleUnit": "100%" }, { @@ -1254,11 +1336,11 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", - "MetricExpr": "tma_retiring * tma_info_thread_slots / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots", + "MetricExpr": "UOPS_RETIRED.SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", "MetricName": "tma_microcode_sequencer", "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1", - "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_ms_switches", + "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_ms_switches", "ScaleUnit": "100%" }, { @@ -1275,7 +1357,7 @@ "MetricExpr": "(IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)", + "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", "ScaleUnit": "100%" }, @@ -1284,16 +1366,16 @@ "MetricExpr": "(cpu@IDQ.MITE_UOPS\\,cmask\\=4@ - cpu@IDQ.MITE_UOPS\\,cmask\\=5@) / tma_info_thread_clks", "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_mite_group", "MetricName": "tma_mite_4wide", - "MetricThreshold": "tma_mite_4wide > 0.05 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35))", + "MetricThreshold": "tma_mite_4wide > 0.05 & (tma_mite > 0.1 & tma_fetch_bandwidth > 0.2)", "ScaleUnit": "100%" }, { - "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued", + "BriefDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles)", "MetricExpr": "UOPS_ISSUED.VECTOR_WIDTH_MISMATCH / UOPS_ISSUED.ANY", "MetricGroup": "TopdownL5;tma_L5_group;tma_issueMV;tma_ports_utilized_0_group", "MetricName": "tma_mixing_vectors", "MetricThreshold": "tma_mixing_vectors > 0.05", - "PublicDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued. Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches", + "PublicDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles). Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches", "ScaleUnit": "100%" }, { @@ -1302,22 +1384,22 @@ "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO", "MetricName": "tma_ms_switches", "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation", + "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions", "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / (tma_retiring * tma_info_thread_slots)", - "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", + "MetricGroup": "Pipeline;TopdownL4;tma_L4_group;tma_other_light_ops_group", "MetricName": "tma_nop_instructions", - "MetricThreshold": "tma_nop_instructions > 0.1 & tma_light_operations > 0.6", + "MetricThreshold": "tma_nop_instructions > 0.1 & (tma_other_light_ops > 0.3 & tma_light_operations > 0.6)", "PublicDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. start address of a function or loop body. Sample with: INST_RETIRED.NOP", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_branch_instructions + tma_nop_instructions))", + "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_branch_instructions))", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_other_light_ops", "MetricThreshold": "tma_other_light_ops > 0.3 & tma_light_operations > 0.6", @@ -1325,6 +1407,22 @@ "ScaleUnit": "100%" }, { + "BriefDescription": "This metric estimates fraction of slots the CPU was stalled due to other cases of misprediction (non-retired x86 branches or other types).", + "MetricExpr": "max(tma_branch_mispredicts * (1 - BR_MISP_RETIRED.ALL_BRANCHES / (INT_MISC.CLEARS_COUNT - MACHINE_CLEARS.COUNT)), 0.0001)", + "MetricGroup": "BrMispredicts;TopdownL3;tma_L3_group;tma_branch_mispredicts_group", + "MetricName": "tma_other_mispredicts", + "MetricThreshold": "tma_other_mispredicts > 0.05 & (tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15)", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Nukes (Machine Clears) not related to memory ordering.", + "MetricExpr": "max(tma_machine_clears * (1 - MACHINE_CLEARS.MEMORY_ORDERING / MACHINE_CLEARS.COUNT), 0.0001)", + "MetricGroup": "Machine_Clears;TopdownL3;tma_L3_group;tma_machine_clears_group", + "MetricName": "tma_other_nukes", + "MetricThreshold": "tma_other_nukes > 0.05 & (tma_machine_clears > 0.1 & tma_bad_speculation > 0.15)", + "ScaleUnit": "100%" + }, + { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)", "MetricExpr": "UOPS_DISPATCHED.PORT_0 / tma_info_core_core_clks", "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", @@ -1352,17 +1450,17 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)", + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU)", "MetricExpr": "UOPS_DISPATCHED.PORT_6 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_6", "MetricThreshold": "tma_port_6 > 0.6", - "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", - "MetricExpr": "((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)", + "MetricExpr": "((tma_ports_utilized_0 * tma_info_thread_clks + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -1371,7 +1469,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / tma_info_thread_clks + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_thread_clks", + "MetricExpr": "(cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_core_bound * RS_EVENTS.EMPTY_CYCLES) / tma_info_thread_clks * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1401,7 +1499,7 @@ "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", - "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", + "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", "PublicDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Sample with: UOPS_EXECUTED.CYCLES_GE_3", "ScaleUnit": "100%" }, @@ -1419,18 +1517,18 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations", "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / tma_info_thread_clks", - "MetricGroup": "PortsUtil;TopdownL5;tma_L5_group;tma_issueSO;tma_ports_utilized_0_group", + "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group;tma_issueSO", "MetricName": "tma_serializing_operation", - "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))", + "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. Sample with: RESOURCE_STALLS.SCOREBOARD. Related metrics: tma_ms_switches", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions", "MetricExpr": "140 * MISC_RETIRED.PAUSE_INST / tma_info_thread_clks", - "MetricGroup": "TopdownL6;tma_L6_group;tma_serializing_operation_group", + "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group", "MetricName": "tma_slow_pause", - "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))))", + "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions. Sample with: MISC_RETIRED.PAUSE_INST", "ScaleUnit": "100%" }, @@ -1459,7 +1557,7 @@ "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%" }, { @@ -1527,10 +1625,10 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", "MetricExpr": "10 * BACLEARS.ANY / tma_info_thread_clks", - "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", + "MetricGroup": "BigFootprint;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", "MetricName": "tma_unknown_branches", "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit). Sample with: BACLEARS.ANY", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit) hence called Unknown Branches. Sample with: BACLEARS.ANY", "ScaleUnit": "100%" }, { diff --git a/tools/perf/pmu-events/arch/x86/tigerlake/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/tigerlake/uncore-interconnect.json index eed1b90a27..48f23acc76 100644 --- a/tools/perf/pmu-events/arch/x86/tigerlake/uncore-interconnect.json +++ b/tools/perf/pmu-events/arch/x86/tigerlake/uncore-interconnect.json @@ -25,6 +25,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event UNC_ARB_REQ_TRK_REQUEST.DRD", + "Deprecated": "1", "EventCode": "0x81", "EventName": "UNC_ARB_DAT_REQUESTS.RD", "PerPkg": "1", @@ -33,6 +34,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event UNC_ARB_DAT_OCCUPANCY.ALL", + "Deprecated": "1", "EventCode": "0x85", "EventName": "UNC_ARB_IFA_OCCUPANCY.ALL", "PerPkg": "1", |