diff options
Diffstat (limited to 'drivers/gpu/drm/i915/gt')
161 files changed, 60433 insertions, 0 deletions
diff --git a/drivers/gpu/drm/i915/gt/debugfs_engines.c b/drivers/gpu/drm/i915/gt/debugfs_engines.c new file mode 100644 index 000000000..5e3725e62 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/debugfs_engines.c @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: MIT + +/* + * Copyright © 2019 Intel Corporation + */ + +#include <drm/drm_print.h> + +#include "debugfs_engines.h" +#include "debugfs_gt.h" +#include "i915_drv.h" /* for_each_engine! */ +#include "intel_engine.h" + +static int engines_show(struct seq_file *m, void *data) +{ + struct intel_gt *gt = m->private; + struct intel_engine_cs *engine; + enum intel_engine_id id; + struct drm_printer p; + + p = drm_seq_file_printer(m); + for_each_engine(engine, gt, id) + intel_engine_dump(engine, &p, "%s\n", engine->name); + + return 0; +} +DEFINE_GT_DEBUGFS_ATTRIBUTE(engines); + +void debugfs_engines_register(struct intel_gt *gt, struct dentry *root) +{ + static const struct debugfs_gt_file files[] = { + { "engines", &engines_fops }, + }; + + intel_gt_debugfs_register_files(root, files, ARRAY_SIZE(files), gt); +} diff --git a/drivers/gpu/drm/i915/gt/debugfs_engines.h b/drivers/gpu/drm/i915/gt/debugfs_engines.h new file mode 100644 index 000000000..f69257eaa --- /dev/null +++ b/drivers/gpu/drm/i915/gt/debugfs_engines.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2019 Intel Corporation + */ + +#ifndef DEBUGFS_ENGINES_H +#define DEBUGFS_ENGINES_H + +struct intel_gt; +struct dentry; + +void debugfs_engines_register(struct intel_gt *gt, struct dentry *root); + +#endif /* DEBUGFS_ENGINES_H */ diff --git a/drivers/gpu/drm/i915/gt/debugfs_gt.c b/drivers/gpu/drm/i915/gt/debugfs_gt.c new file mode 100644 index 000000000..3a21cf63b --- /dev/null +++ b/drivers/gpu/drm/i915/gt/debugfs_gt.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: MIT + +/* + * Copyright © 2019 Intel Corporation + */ + +#include <linux/debugfs.h> + +#include "debugfs_engines.h" +#include "debugfs_gt.h" +#include "debugfs_gt_pm.h" +#include "intel_sseu_debugfs.h" +#include "uc/intel_uc_debugfs.h" +#include "i915_drv.h" + +void debugfs_gt_register(struct intel_gt *gt) +{ + struct dentry *root; + + if (!gt->i915->drm.primary->debugfs_root) + return; + + root = debugfs_create_dir("gt", gt->i915->drm.primary->debugfs_root); + if (IS_ERR(root)) + return; + + debugfs_engines_register(gt, root); + debugfs_gt_pm_register(gt, root); + intel_sseu_debugfs_register(gt, root); + + intel_uc_debugfs_register(>->uc, root); +} + +void intel_gt_debugfs_register_files(struct dentry *root, + const struct debugfs_gt_file *files, + unsigned long count, void *data) +{ + while (count--) { + umode_t mode = files->fops->write ? 0644 : 0444; + if (!files->eval || files->eval(data)) + debugfs_create_file(files->name, + mode, root, data, + files->fops); + + files++; + } +} diff --git a/drivers/gpu/drm/i915/gt/debugfs_gt.h b/drivers/gpu/drm/i915/gt/debugfs_gt.h new file mode 100644 index 000000000..f77540f72 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/debugfs_gt.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2019 Intel Corporation + */ + +#ifndef DEBUGFS_GT_H +#define DEBUGFS_GT_H + +#include <linux/file.h> + +struct intel_gt; + +#define DEFINE_GT_DEBUGFS_ATTRIBUTE(__name) \ + static int __name ## _open(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, __name ## _show, inode->i_private); \ +} \ +static const struct file_operations __name ## _fops = { \ + .owner = THIS_MODULE, \ + .open = __name ## _open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ +} + +void debugfs_gt_register(struct intel_gt *gt); + +struct debugfs_gt_file { + const char *name; + const struct file_operations *fops; + bool (*eval)(void *data); +}; + +void intel_gt_debugfs_register_files(struct dentry *root, + const struct debugfs_gt_file *files, + unsigned long count, void *data); + +#endif /* DEBUGFS_GT_H */ diff --git a/drivers/gpu/drm/i915/gt/debugfs_gt_pm.c b/drivers/gpu/drm/i915/gt/debugfs_gt_pm.c new file mode 100644 index 000000000..174a24553 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/debugfs_gt_pm.c @@ -0,0 +1,627 @@ +// SPDX-License-Identifier: MIT + +/* + * Copyright © 2019 Intel Corporation + */ + +#include <linux/seq_file.h> + +#include "debugfs_gt.h" +#include "debugfs_gt_pm.h" +#include "i915_drv.h" +#include "intel_gt.h" +#include "intel_gt_clock_utils.h" +#include "intel_llc.h" +#include "intel_rc6.h" +#include "intel_rps.h" +#include "intel_runtime_pm.h" +#include "intel_sideband.h" +#include "intel_uncore.h" + +static int fw_domains_show(struct seq_file *m, void *data) +{ + struct intel_gt *gt = m->private; + struct intel_uncore *uncore = gt->uncore; + struct intel_uncore_forcewake_domain *fw_domain; + unsigned int tmp; + + seq_printf(m, "user.bypass_count = %u\n", + uncore->user_forcewake_count); + + for_each_fw_domain(fw_domain, uncore, tmp) + seq_printf(m, "%s.wake_count = %u\n", + intel_uncore_forcewake_domain_to_str(fw_domain->id), + READ_ONCE(fw_domain->wake_count)); + + return 0; +} +DEFINE_GT_DEBUGFS_ATTRIBUTE(fw_domains); + +static void print_rc6_res(struct seq_file *m, + const char *title, + const i915_reg_t reg) +{ + struct intel_gt *gt = m->private; + intel_wakeref_t wakeref; + + with_intel_runtime_pm(gt->uncore->rpm, wakeref) + seq_printf(m, "%s %u (%llu us)\n", title, + intel_uncore_read(gt->uncore, reg), + intel_rc6_residency_us(>->rc6, reg)); +} + +static int vlv_drpc(struct seq_file *m) +{ + struct intel_gt *gt = m->private; + struct intel_uncore *uncore = gt->uncore; + u32 rcctl1, pw_status; + + pw_status = intel_uncore_read(uncore, VLV_GTLC_PW_STATUS); + rcctl1 = intel_uncore_read(uncore, GEN6_RC_CONTROL); + + seq_printf(m, "RC6 Enabled: %s\n", + yesno(rcctl1 & (GEN7_RC_CTL_TO_MODE | + GEN6_RC_CTL_EI_MODE(1)))); + seq_printf(m, "Render Power Well: %s\n", + (pw_status & VLV_GTLC_PW_RENDER_STATUS_MASK) ? "Up" : "Down"); + seq_printf(m, "Media Power Well: %s\n", + (pw_status & VLV_GTLC_PW_MEDIA_STATUS_MASK) ? "Up" : "Down"); + + print_rc6_res(m, "Render RC6 residency since boot:", VLV_GT_RENDER_RC6); + print_rc6_res(m, "Media RC6 residency since boot:", VLV_GT_MEDIA_RC6); + + return fw_domains_show(m, NULL); +} + +static int gen6_drpc(struct seq_file *m) +{ + struct intel_gt *gt = m->private; + struct drm_i915_private *i915 = gt->i915; + struct intel_uncore *uncore = gt->uncore; + u32 gt_core_status, rcctl1, rc6vids = 0; + u32 gen9_powergate_enable = 0, gen9_powergate_status = 0; + + gt_core_status = intel_uncore_read_fw(uncore, GEN6_GT_CORE_STATUS); + + rcctl1 = intel_uncore_read(uncore, GEN6_RC_CONTROL); + if (INTEL_GEN(i915) >= 9) { + gen9_powergate_enable = + intel_uncore_read(uncore, GEN9_PG_ENABLE); + gen9_powergate_status = + intel_uncore_read(uncore, GEN9_PWRGT_DOMAIN_STATUS); + } + + if (INTEL_GEN(i915) <= 7) + sandybridge_pcode_read(i915, GEN6_PCODE_READ_RC6VIDS, + &rc6vids, NULL); + + seq_printf(m, "RC1e Enabled: %s\n", + yesno(rcctl1 & GEN6_RC_CTL_RC1e_ENABLE)); + seq_printf(m, "RC6 Enabled: %s\n", + yesno(rcctl1 & GEN6_RC_CTL_RC6_ENABLE)); + if (INTEL_GEN(i915) >= 9) { + seq_printf(m, "Render Well Gating Enabled: %s\n", + yesno(gen9_powergate_enable & GEN9_RENDER_PG_ENABLE)); + seq_printf(m, "Media Well Gating Enabled: %s\n", + yesno(gen9_powergate_enable & GEN9_MEDIA_PG_ENABLE)); + } + seq_printf(m, "Deep RC6 Enabled: %s\n", + yesno(rcctl1 & GEN6_RC_CTL_RC6p_ENABLE)); + seq_printf(m, "Deepest RC6 Enabled: %s\n", + yesno(rcctl1 & GEN6_RC_CTL_RC6pp_ENABLE)); + seq_puts(m, "Current RC state: "); + switch (gt_core_status & GEN6_RCn_MASK) { + case GEN6_RC0: + if (gt_core_status & GEN6_CORE_CPD_STATE_MASK) + seq_puts(m, "Core Power Down\n"); + else + seq_puts(m, "on\n"); + break; + case GEN6_RC3: + seq_puts(m, "RC3\n"); + break; + case GEN6_RC6: + seq_puts(m, "RC6\n"); + break; + case GEN6_RC7: + seq_puts(m, "RC7\n"); + break; + default: + seq_puts(m, "Unknown\n"); + break; + } + + seq_printf(m, "Core Power Down: %s\n", + yesno(gt_core_status & GEN6_CORE_CPD_STATE_MASK)); + if (INTEL_GEN(i915) >= 9) { + seq_printf(m, "Render Power Well: %s\n", + (gen9_powergate_status & + GEN9_PWRGT_RENDER_STATUS_MASK) ? "Up" : "Down"); + seq_printf(m, "Media Power Well: %s\n", + (gen9_powergate_status & + GEN9_PWRGT_MEDIA_STATUS_MASK) ? "Up" : "Down"); + } + + /* Not exactly sure what this is */ + print_rc6_res(m, "RC6 \"Locked to RPn\" residency since boot:", + GEN6_GT_GFX_RC6_LOCKED); + print_rc6_res(m, "RC6 residency since boot:", GEN6_GT_GFX_RC6); + print_rc6_res(m, "RC6+ residency since boot:", GEN6_GT_GFX_RC6p); + print_rc6_res(m, "RC6++ residency since boot:", GEN6_GT_GFX_RC6pp); + + if (INTEL_GEN(i915) <= 7) { + seq_printf(m, "RC6 voltage: %dmV\n", + GEN6_DECODE_RC6_VID(((rc6vids >> 0) & 0xff))); + seq_printf(m, "RC6+ voltage: %dmV\n", + GEN6_DECODE_RC6_VID(((rc6vids >> 8) & 0xff))); + seq_printf(m, "RC6++ voltage: %dmV\n", + GEN6_DECODE_RC6_VID(((rc6vids >> 16) & 0xff))); + } + + return fw_domains_show(m, NULL); +} + +static int ilk_drpc(struct seq_file *m) +{ + struct intel_gt *gt = m->private; + struct intel_uncore *uncore = gt->uncore; + u32 rgvmodectl, rstdbyctl; + u16 crstandvid; + + rgvmodectl = intel_uncore_read(uncore, MEMMODECTL); + rstdbyctl = intel_uncore_read(uncore, RSTDBYCTL); + crstandvid = intel_uncore_read16(uncore, CRSTANDVID); + + seq_printf(m, "HD boost: %s\n", yesno(rgvmodectl & MEMMODE_BOOST_EN)); + seq_printf(m, "Boost freq: %d\n", + (rgvmodectl & MEMMODE_BOOST_FREQ_MASK) >> + MEMMODE_BOOST_FREQ_SHIFT); + seq_printf(m, "HW control enabled: %s\n", + yesno(rgvmodectl & MEMMODE_HWIDLE_EN)); + seq_printf(m, "SW control enabled: %s\n", + yesno(rgvmodectl & MEMMODE_SWMODE_EN)); + seq_printf(m, "Gated voltage change: %s\n", + yesno(rgvmodectl & MEMMODE_RCLK_GATE)); + seq_printf(m, "Starting frequency: P%d\n", + (rgvmodectl & MEMMODE_FSTART_MASK) >> MEMMODE_FSTART_SHIFT); + seq_printf(m, "Max P-state: P%d\n", + (rgvmodectl & MEMMODE_FMAX_MASK) >> MEMMODE_FMAX_SHIFT); + seq_printf(m, "Min P-state: P%d\n", (rgvmodectl & MEMMODE_FMIN_MASK)); + seq_printf(m, "RS1 VID: %d\n", (crstandvid & 0x3f)); + seq_printf(m, "RS2 VID: %d\n", ((crstandvid >> 8) & 0x3f)); + seq_printf(m, "Render standby enabled: %s\n", + yesno(!(rstdbyctl & RCX_SW_EXIT))); + seq_puts(m, "Current RS state: "); + switch (rstdbyctl & RSX_STATUS_MASK) { + case RSX_STATUS_ON: + seq_puts(m, "on\n"); + break; + case RSX_STATUS_RC1: + seq_puts(m, "RC1\n"); + break; + case RSX_STATUS_RC1E: + seq_puts(m, "RC1E\n"); + break; + case RSX_STATUS_RS1: + seq_puts(m, "RS1\n"); + break; + case RSX_STATUS_RS2: + seq_puts(m, "RS2 (RC6)\n"); + break; + case RSX_STATUS_RS3: + seq_puts(m, "RC3 (RC6+)\n"); + break; + default: + seq_puts(m, "unknown\n"); + break; + } + + return 0; +} + +static int drpc_show(struct seq_file *m, void *unused) +{ + struct intel_gt *gt = m->private; + struct drm_i915_private *i915 = gt->i915; + intel_wakeref_t wakeref; + int err = -ENODEV; + + with_intel_runtime_pm(gt->uncore->rpm, wakeref) { + if (IS_VALLEYVIEW(i915) || IS_CHERRYVIEW(i915)) + err = vlv_drpc(m); + else if (INTEL_GEN(i915) >= 6) + err = gen6_drpc(m); + else + err = ilk_drpc(m); + } + + return err; +} +DEFINE_GT_DEBUGFS_ATTRIBUTE(drpc); + +static int frequency_show(struct seq_file *m, void *unused) +{ + struct intel_gt *gt = m->private; + struct drm_i915_private *i915 = gt->i915; + struct intel_uncore *uncore = gt->uncore; + struct intel_rps *rps = >->rps; + intel_wakeref_t wakeref; + + wakeref = intel_runtime_pm_get(uncore->rpm); + + if (IS_GEN(i915, 5)) { + u16 rgvswctl = intel_uncore_read16(uncore, MEMSWCTL); + u16 rgvstat = intel_uncore_read16(uncore, MEMSTAT_ILK); + + seq_printf(m, "Requested P-state: %d\n", (rgvswctl >> 8) & 0xf); + seq_printf(m, "Requested VID: %d\n", rgvswctl & 0x3f); + seq_printf(m, "Current VID: %d\n", (rgvstat & MEMSTAT_VID_MASK) >> + MEMSTAT_VID_SHIFT); + seq_printf(m, "Current P-state: %d\n", + (rgvstat & MEMSTAT_PSTATE_MASK) >> MEMSTAT_PSTATE_SHIFT); + } else if (IS_VALLEYVIEW(i915) || IS_CHERRYVIEW(i915)) { + u32 rpmodectl, freq_sts; + + rpmodectl = intel_uncore_read(uncore, GEN6_RP_CONTROL); + seq_printf(m, "Video Turbo Mode: %s\n", + yesno(rpmodectl & GEN6_RP_MEDIA_TURBO)); + seq_printf(m, "HW control enabled: %s\n", + yesno(rpmodectl & GEN6_RP_ENABLE)); + seq_printf(m, "SW control enabled: %s\n", + yesno((rpmodectl & GEN6_RP_MEDIA_MODE_MASK) == + GEN6_RP_MEDIA_SW_MODE)); + + vlv_punit_get(i915); + freq_sts = vlv_punit_read(i915, PUNIT_REG_GPU_FREQ_STS); + vlv_punit_put(i915); + + seq_printf(m, "PUNIT_REG_GPU_FREQ_STS: 0x%08x\n", freq_sts); + seq_printf(m, "DDR freq: %d MHz\n", i915->mem_freq); + + seq_printf(m, "actual GPU freq: %d MHz\n", + intel_gpu_freq(rps, (freq_sts >> 8) & 0xff)); + + seq_printf(m, "current GPU freq: %d MHz\n", + intel_gpu_freq(rps, rps->cur_freq)); + + seq_printf(m, "max GPU freq: %d MHz\n", + intel_gpu_freq(rps, rps->max_freq)); + + seq_printf(m, "min GPU freq: %d MHz\n", + intel_gpu_freq(rps, rps->min_freq)); + + seq_printf(m, "idle GPU freq: %d MHz\n", + intel_gpu_freq(rps, rps->idle_freq)); + + seq_printf(m, "efficient (RPe) frequency: %d MHz\n", + intel_gpu_freq(rps, rps->efficient_freq)); + } else if (INTEL_GEN(i915) >= 6) { + u32 rp_state_limits; + u32 gt_perf_status; + u32 rp_state_cap; + u32 rpmodectl, rpinclimit, rpdeclimit; + u32 rpstat, cagf, reqf; + u32 rpcurupei, rpcurup, rpprevup; + u32 rpcurdownei, rpcurdown, rpprevdown; + u32 rpupei, rpupt, rpdownei, rpdownt; + u32 pm_ier, pm_imr, pm_isr, pm_iir, pm_mask; + int max_freq; + + rp_state_limits = intel_uncore_read(uncore, GEN6_RP_STATE_LIMITS); + if (IS_GEN9_LP(i915)) { + rp_state_cap = intel_uncore_read(uncore, BXT_RP_STATE_CAP); + gt_perf_status = intel_uncore_read(uncore, BXT_GT_PERF_STATUS); + } else { + rp_state_cap = intel_uncore_read(uncore, GEN6_RP_STATE_CAP); + gt_perf_status = intel_uncore_read(uncore, GEN6_GT_PERF_STATUS); + } + + /* RPSTAT1 is in the GT power well */ + intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL); + + reqf = intel_uncore_read(uncore, GEN6_RPNSWREQ); + if (INTEL_GEN(i915) >= 9) { + reqf >>= 23; + } else { + reqf &= ~GEN6_TURBO_DISABLE; + if (IS_HASWELL(i915) || IS_BROADWELL(i915)) + reqf >>= 24; + else + reqf >>= 25; + } + reqf = intel_gpu_freq(rps, reqf); + + rpmodectl = intel_uncore_read(uncore, GEN6_RP_CONTROL); + rpinclimit = intel_uncore_read(uncore, GEN6_RP_UP_THRESHOLD); + rpdeclimit = intel_uncore_read(uncore, GEN6_RP_DOWN_THRESHOLD); + + rpstat = intel_uncore_read(uncore, GEN6_RPSTAT1); + rpcurupei = intel_uncore_read(uncore, GEN6_RP_CUR_UP_EI) & GEN6_CURICONT_MASK; + rpcurup = intel_uncore_read(uncore, GEN6_RP_CUR_UP) & GEN6_CURBSYTAVG_MASK; + rpprevup = intel_uncore_read(uncore, GEN6_RP_PREV_UP) & GEN6_CURBSYTAVG_MASK; + rpcurdownei = intel_uncore_read(uncore, GEN6_RP_CUR_DOWN_EI) & GEN6_CURIAVG_MASK; + rpcurdown = intel_uncore_read(uncore, GEN6_RP_CUR_DOWN) & GEN6_CURBSYTAVG_MASK; + rpprevdown = intel_uncore_read(uncore, GEN6_RP_PREV_DOWN) & GEN6_CURBSYTAVG_MASK; + + rpupei = intel_uncore_read(uncore, GEN6_RP_UP_EI); + rpupt = intel_uncore_read(uncore, GEN6_RP_UP_THRESHOLD); + + rpdownei = intel_uncore_read(uncore, GEN6_RP_DOWN_EI); + rpdownt = intel_uncore_read(uncore, GEN6_RP_DOWN_THRESHOLD); + + cagf = intel_rps_read_actual_frequency(rps); + + intel_uncore_forcewake_put(uncore, FORCEWAKE_ALL); + + if (INTEL_GEN(i915) >= 11) { + pm_ier = intel_uncore_read(uncore, GEN11_GPM_WGBOXPERF_INTR_ENABLE); + pm_imr = intel_uncore_read(uncore, GEN11_GPM_WGBOXPERF_INTR_MASK); + /* + * The equivalent to the PM ISR & IIR cannot be read + * without affecting the current state of the system + */ + pm_isr = 0; + pm_iir = 0; + } else if (INTEL_GEN(i915) >= 8) { + pm_ier = intel_uncore_read(uncore, GEN8_GT_IER(2)); + pm_imr = intel_uncore_read(uncore, GEN8_GT_IMR(2)); + pm_isr = intel_uncore_read(uncore, GEN8_GT_ISR(2)); + pm_iir = intel_uncore_read(uncore, GEN8_GT_IIR(2)); + } else { + pm_ier = intel_uncore_read(uncore, GEN6_PMIER); + pm_imr = intel_uncore_read(uncore, GEN6_PMIMR); + pm_isr = intel_uncore_read(uncore, GEN6_PMISR); + pm_iir = intel_uncore_read(uncore, GEN6_PMIIR); + } + pm_mask = intel_uncore_read(uncore, GEN6_PMINTRMSK); + + seq_printf(m, "Video Turbo Mode: %s\n", + yesno(rpmodectl & GEN6_RP_MEDIA_TURBO)); + seq_printf(m, "HW control enabled: %s\n", + yesno(rpmodectl & GEN6_RP_ENABLE)); + seq_printf(m, "SW control enabled: %s\n", + yesno((rpmodectl & GEN6_RP_MEDIA_MODE_MASK) == + GEN6_RP_MEDIA_SW_MODE)); + + seq_printf(m, "PM IER=0x%08x IMR=0x%08x, MASK=0x%08x\n", + pm_ier, pm_imr, pm_mask); + if (INTEL_GEN(i915) <= 10) + seq_printf(m, "PM ISR=0x%08x IIR=0x%08x\n", + pm_isr, pm_iir); + seq_printf(m, "pm_intrmsk_mbz: 0x%08x\n", + rps->pm_intrmsk_mbz); + seq_printf(m, "GT_PERF_STATUS: 0x%08x\n", gt_perf_status); + seq_printf(m, "Render p-state ratio: %d\n", + (gt_perf_status & (INTEL_GEN(i915) >= 9 ? 0x1ff00 : 0xff00)) >> 8); + seq_printf(m, "Render p-state VID: %d\n", + gt_perf_status & 0xff); + seq_printf(m, "Render p-state limit: %d\n", + rp_state_limits & 0xff); + seq_printf(m, "RPSTAT1: 0x%08x\n", rpstat); + seq_printf(m, "RPMODECTL: 0x%08x\n", rpmodectl); + seq_printf(m, "RPINCLIMIT: 0x%08x\n", rpinclimit); + seq_printf(m, "RPDECLIMIT: 0x%08x\n", rpdeclimit); + seq_printf(m, "RPNSWREQ: %dMHz\n", reqf); + seq_printf(m, "CAGF: %dMHz\n", cagf); + seq_printf(m, "RP CUR UP EI: %d (%dns)\n", + rpcurupei, + intel_gt_pm_interval_to_ns(gt, rpcurupei)); + seq_printf(m, "RP CUR UP: %d (%dns)\n", + rpcurup, intel_gt_pm_interval_to_ns(gt, rpcurup)); + seq_printf(m, "RP PREV UP: %d (%dns)\n", + rpprevup, intel_gt_pm_interval_to_ns(gt, rpprevup)); + seq_printf(m, "Up threshold: %d%%\n", + rps->power.up_threshold); + seq_printf(m, "RP UP EI: %d (%dns)\n", + rpupei, intel_gt_pm_interval_to_ns(gt, rpupei)); + seq_printf(m, "RP UP THRESHOLD: %d (%dns)\n", + rpupt, intel_gt_pm_interval_to_ns(gt, rpupt)); + + seq_printf(m, "RP CUR DOWN EI: %d (%dns)\n", + rpcurdownei, + intel_gt_pm_interval_to_ns(gt, rpcurdownei)); + seq_printf(m, "RP CUR DOWN: %d (%dns)\n", + rpcurdown, + intel_gt_pm_interval_to_ns(gt, rpcurdown)); + seq_printf(m, "RP PREV DOWN: %d (%dns)\n", + rpprevdown, + intel_gt_pm_interval_to_ns(gt, rpprevdown)); + seq_printf(m, "Down threshold: %d%%\n", + rps->power.down_threshold); + seq_printf(m, "RP DOWN EI: %d (%dns)\n", + rpdownei, intel_gt_pm_interval_to_ns(gt, rpdownei)); + seq_printf(m, "RP DOWN THRESHOLD: %d (%dns)\n", + rpdownt, intel_gt_pm_interval_to_ns(gt, rpdownt)); + + max_freq = (IS_GEN9_LP(i915) ? rp_state_cap >> 0 : + rp_state_cap >> 16) & 0xff; + max_freq *= (IS_GEN9_BC(i915) || + INTEL_GEN(i915) >= 10 ? GEN9_FREQ_SCALER : 1); + seq_printf(m, "Lowest (RPN) frequency: %dMHz\n", + intel_gpu_freq(rps, max_freq)); + + max_freq = (rp_state_cap & 0xff00) >> 8; + max_freq *= (IS_GEN9_BC(i915) || + INTEL_GEN(i915) >= 10 ? GEN9_FREQ_SCALER : 1); + seq_printf(m, "Nominal (RP1) frequency: %dMHz\n", + intel_gpu_freq(rps, max_freq)); + + max_freq = (IS_GEN9_LP(i915) ? rp_state_cap >> 16 : + rp_state_cap >> 0) & 0xff; + max_freq *= (IS_GEN9_BC(i915) || + INTEL_GEN(i915) >= 10 ? GEN9_FREQ_SCALER : 1); + seq_printf(m, "Max non-overclocked (RP0) frequency: %dMHz\n", + intel_gpu_freq(rps, max_freq)); + seq_printf(m, "Max overclocked frequency: %dMHz\n", + intel_gpu_freq(rps, rps->max_freq)); + + seq_printf(m, "Current freq: %d MHz\n", + intel_gpu_freq(rps, rps->cur_freq)); + seq_printf(m, "Actual freq: %d MHz\n", cagf); + seq_printf(m, "Idle freq: %d MHz\n", + intel_gpu_freq(rps, rps->idle_freq)); + seq_printf(m, "Min freq: %d MHz\n", + intel_gpu_freq(rps, rps->min_freq)); + seq_printf(m, "Boost freq: %d MHz\n", + intel_gpu_freq(rps, rps->boost_freq)); + seq_printf(m, "Max freq: %d MHz\n", + intel_gpu_freq(rps, rps->max_freq)); + seq_printf(m, + "efficient (RPe) frequency: %d MHz\n", + intel_gpu_freq(rps, rps->efficient_freq)); + } else { + seq_puts(m, "no P-state info available\n"); + } + + seq_printf(m, "Current CD clock frequency: %d kHz\n", i915->cdclk.hw.cdclk); + seq_printf(m, "Max CD clock frequency: %d kHz\n", i915->max_cdclk_freq); + seq_printf(m, "Max pixel clock frequency: %d kHz\n", i915->max_dotclk_freq); + + intel_runtime_pm_put(uncore->rpm, wakeref); + + return 0; +} +DEFINE_GT_DEBUGFS_ATTRIBUTE(frequency); + +static int llc_show(struct seq_file *m, void *data) +{ + struct intel_gt *gt = m->private; + struct drm_i915_private *i915 = gt->i915; + const bool edram = INTEL_GEN(i915) > 8; + struct intel_rps *rps = >->rps; + unsigned int max_gpu_freq, min_gpu_freq; + intel_wakeref_t wakeref; + int gpu_freq, ia_freq; + + seq_printf(m, "LLC: %s\n", yesno(HAS_LLC(i915))); + seq_printf(m, "%s: %uMB\n", edram ? "eDRAM" : "eLLC", + i915->edram_size_mb); + + min_gpu_freq = rps->min_freq; + max_gpu_freq = rps->max_freq; + if (IS_GEN9_BC(i915) || INTEL_GEN(i915) >= 10) { + /* Convert GT frequency to 50 HZ units */ + min_gpu_freq /= GEN9_FREQ_SCALER; + max_gpu_freq /= GEN9_FREQ_SCALER; + } + + seq_puts(m, "GPU freq (MHz)\tEffective CPU freq (MHz)\tEffective Ring freq (MHz)\n"); + + wakeref = intel_runtime_pm_get(gt->uncore->rpm); + for (gpu_freq = min_gpu_freq; gpu_freq <= max_gpu_freq; gpu_freq++) { + ia_freq = gpu_freq; + sandybridge_pcode_read(i915, + GEN6_PCODE_READ_MIN_FREQ_TABLE, + &ia_freq, NULL); + seq_printf(m, "%d\t\t%d\t\t\t\t%d\n", + intel_gpu_freq(rps, + (gpu_freq * + (IS_GEN9_BC(i915) || + INTEL_GEN(i915) >= 10 ? + GEN9_FREQ_SCALER : 1))), + ((ia_freq >> 0) & 0xff) * 100, + ((ia_freq >> 8) & 0xff) * 100); + } + intel_runtime_pm_put(gt->uncore->rpm, wakeref); + + return 0; +} + +static bool llc_eval(void *data) +{ + struct intel_gt *gt = data; + + return HAS_LLC(gt->i915); +} + +DEFINE_GT_DEBUGFS_ATTRIBUTE(llc); + +static const char *rps_power_to_str(unsigned int power) +{ + static const char * const strings[] = { + [LOW_POWER] = "low power", + [BETWEEN] = "mixed", + [HIGH_POWER] = "high power", + }; + + if (power >= ARRAY_SIZE(strings) || !strings[power]) + return "unknown"; + + return strings[power]; +} + +static int rps_boost_show(struct seq_file *m, void *data) +{ + struct intel_gt *gt = m->private; + struct drm_i915_private *i915 = gt->i915; + struct intel_rps *rps = >->rps; + + seq_printf(m, "RPS enabled? %s\n", yesno(intel_rps_is_enabled(rps))); + seq_printf(m, "RPS active? %s\n", yesno(intel_rps_is_active(rps))); + seq_printf(m, "GPU busy? %s\n", yesno(gt->awake)); + seq_printf(m, "Boosts outstanding? %d\n", + atomic_read(&rps->num_waiters)); + seq_printf(m, "Interactive? %d\n", READ_ONCE(rps->power.interactive)); + seq_printf(m, "Frequency requested %d, actual %d\n", + intel_gpu_freq(rps, rps->cur_freq), + intel_rps_read_actual_frequency(rps)); + seq_printf(m, " min hard:%d, soft:%d; max soft:%d, hard:%d\n", + intel_gpu_freq(rps, rps->min_freq), + intel_gpu_freq(rps, rps->min_freq_softlimit), + intel_gpu_freq(rps, rps->max_freq_softlimit), + intel_gpu_freq(rps, rps->max_freq)); + seq_printf(m, " idle:%d, efficient:%d, boost:%d\n", + intel_gpu_freq(rps, rps->idle_freq), + intel_gpu_freq(rps, rps->efficient_freq), + intel_gpu_freq(rps, rps->boost_freq)); + + seq_printf(m, "Wait boosts: %d\n", atomic_read(&rps->boosts)); + + if (INTEL_GEN(i915) >= 6 && intel_rps_is_active(rps)) { + struct intel_uncore *uncore = gt->uncore; + u32 rpup, rpupei; + u32 rpdown, rpdownei; + + intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL); + rpup = intel_uncore_read_fw(uncore, GEN6_RP_CUR_UP) & GEN6_RP_EI_MASK; + rpupei = intel_uncore_read_fw(uncore, GEN6_RP_CUR_UP_EI) & GEN6_RP_EI_MASK; + rpdown = intel_uncore_read_fw(uncore, GEN6_RP_CUR_DOWN) & GEN6_RP_EI_MASK; + rpdownei = intel_uncore_read_fw(uncore, GEN6_RP_CUR_DOWN_EI) & GEN6_RP_EI_MASK; + intel_uncore_forcewake_put(uncore, FORCEWAKE_ALL); + + seq_printf(m, "\nRPS Autotuning (current \"%s\" window):\n", + rps_power_to_str(rps->power.mode)); + seq_printf(m, " Avg. up: %d%% [above threshold? %d%%]\n", + rpup && rpupei ? 100 * rpup / rpupei : 0, + rps->power.up_threshold); + seq_printf(m, " Avg. down: %d%% [below threshold? %d%%]\n", + rpdown && rpdownei ? 100 * rpdown / rpdownei : 0, + rps->power.down_threshold); + } else { + seq_puts(m, "\nRPS Autotuning inactive\n"); + } + + return 0; +} + +static bool rps_eval(void *data) +{ + struct intel_gt *gt = data; + + return HAS_RPS(gt->i915); +} + +DEFINE_GT_DEBUGFS_ATTRIBUTE(rps_boost); + +void debugfs_gt_pm_register(struct intel_gt *gt, struct dentry *root) +{ + static const struct debugfs_gt_file files[] = { + { "drpc", &drpc_fops, NULL }, + { "frequency", &frequency_fops, NULL }, + { "forcewake", &fw_domains_fops, NULL }, + { "llc", &llc_fops, llc_eval }, + { "rps_boost", &rps_boost_fops, rps_eval }, + }; + + intel_gt_debugfs_register_files(root, files, ARRAY_SIZE(files), gt); +} diff --git a/drivers/gpu/drm/i915/gt/debugfs_gt_pm.h b/drivers/gpu/drm/i915/gt/debugfs_gt_pm.h new file mode 100644 index 000000000..4cf5f5c9d --- /dev/null +++ b/drivers/gpu/drm/i915/gt/debugfs_gt_pm.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2019 Intel Corporation + */ + +#ifndef DEBUGFS_GT_PM_H +#define DEBUGFS_GT_PM_H + +struct intel_gt; +struct dentry; + +void debugfs_gt_pm_register(struct intel_gt *gt, struct dentry *root); + +#endif /* DEBUGFS_GT_PM_H */ diff --git a/drivers/gpu/drm/i915/gt/gen2_engine_cs.c b/drivers/gpu/drm/i915/gt/gen2_engine_cs.c new file mode 100644 index 000000000..b491a6491 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/gen2_engine_cs.c @@ -0,0 +1,329 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2020 Intel Corporation + */ + +#include "gen2_engine_cs.h" +#include "i915_drv.h" +#include "intel_engine.h" +#include "intel_gpu_commands.h" +#include "intel_gt.h" +#include "intel_gt_irq.h" +#include "intel_ring.h" + +int gen2_emit_flush(struct i915_request *rq, u32 mode) +{ + unsigned int num_store_dw = 12; + u32 cmd, *cs; + + cmd = MI_FLUSH; + if (mode & EMIT_INVALIDATE) + cmd |= MI_READ_FLUSH; + + cs = intel_ring_begin(rq, 2 + 4 * num_store_dw); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = cmd; + while (num_store_dw--) { + *cs++ = MI_STORE_DWORD_INDEX; + *cs++ = I915_GEM_HWS_SCRATCH * sizeof(u32); + *cs++ = 0; + *cs++ = MI_FLUSH | MI_NO_WRITE_FLUSH; + } + *cs++ = cmd; + + intel_ring_advance(rq, cs); + + return 0; +} + +int gen4_emit_flush_rcs(struct i915_request *rq, u32 mode) +{ + u32 cmd, *cs; + int i; + + /* + * read/write caches: + * + * I915_GEM_DOMAIN_RENDER is always invalidated, but is + * only flushed if MI_NO_WRITE_FLUSH is unset. On 965, it is + * also flushed at 2d versus 3d pipeline switches. + * + * read-only caches: + * + * I915_GEM_DOMAIN_SAMPLER is flushed on pre-965 if + * MI_READ_FLUSH is set, and is always flushed on 965. + * + * I915_GEM_DOMAIN_COMMAND may not exist? + * + * I915_GEM_DOMAIN_INSTRUCTION, which exists on 965, is + * invalidated when MI_EXE_FLUSH is set. + * + * I915_GEM_DOMAIN_VERTEX, which exists on 965, is + * invalidated with every MI_FLUSH. + * + * TLBs: + * + * On 965, TLBs associated with I915_GEM_DOMAIN_COMMAND + * and I915_GEM_DOMAIN_CPU in are invalidated at PTE write and + * I915_GEM_DOMAIN_RENDER and I915_GEM_DOMAIN_SAMPLER + * are flushed at any MI_FLUSH. + */ + + cmd = MI_FLUSH; + if (mode & EMIT_INVALIDATE) { + cmd |= MI_EXE_FLUSH; + if (IS_G4X(rq->engine->i915) || IS_GEN(rq->engine->i915, 5)) + cmd |= MI_INVALIDATE_ISP; + } + + i = 2; + if (mode & EMIT_INVALIDATE) + i += 20; + + cs = intel_ring_begin(rq, i); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = cmd; + + /* + * A random delay to let the CS invalidate take effect? Without this + * delay, the GPU relocation path fails as the CS does not see + * the updated contents. Just as important, if we apply the flushes + * to the EMIT_FLUSH branch (i.e. immediately after the relocation + * write and before the invalidate on the next batch), the relocations + * still fail. This implies that is a delay following invalidation + * that is required to reset the caches as opposed to a delay to + * ensure the memory is written. + */ + if (mode & EMIT_INVALIDATE) { + *cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE; + *cs++ = intel_gt_scratch_offset(rq->engine->gt, + INTEL_GT_SCRATCH_FIELD_DEFAULT) | + PIPE_CONTROL_GLOBAL_GTT; + *cs++ = 0; + *cs++ = 0; + + for (i = 0; i < 12; i++) + *cs++ = MI_FLUSH; + + *cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE; + *cs++ = intel_gt_scratch_offset(rq->engine->gt, + INTEL_GT_SCRATCH_FIELD_DEFAULT) | + PIPE_CONTROL_GLOBAL_GTT; + *cs++ = 0; + *cs++ = 0; + } + + *cs++ = cmd; + + intel_ring_advance(rq, cs); + + return 0; +} + +int gen4_emit_flush_vcs(struct i915_request *rq, u32 mode) +{ + u32 *cs; + + cs = intel_ring_begin(rq, 2); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = MI_FLUSH; + *cs++ = MI_NOOP; + intel_ring_advance(rq, cs); + + return 0; +} + +static u32 *__gen2_emit_breadcrumb(struct i915_request *rq, u32 *cs, + int flush, int post) +{ + GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma); + GEM_BUG_ON(offset_in_page(i915_request_active_timeline(rq)->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR); + + *cs++ = MI_FLUSH; + + while (flush--) { + *cs++ = MI_STORE_DWORD_INDEX; + *cs++ = I915_GEM_HWS_SCRATCH * sizeof(u32); + *cs++ = rq->fence.seqno; + } + + while (post--) { + *cs++ = MI_STORE_DWORD_INDEX; + *cs++ = I915_GEM_HWS_SEQNO_ADDR; + *cs++ = rq->fence.seqno; + } + + *cs++ = MI_USER_INTERRUPT; + + rq->tail = intel_ring_offset(rq, cs); + assert_ring_tail_valid(rq->ring, rq->tail); + + return cs; +} + +u32 *gen3_emit_breadcrumb(struct i915_request *rq, u32 *cs) +{ + return __gen2_emit_breadcrumb(rq, cs, 16, 8); +} + +u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs) +{ + return __gen2_emit_breadcrumb(rq, cs, 8, 8); +} + +/* Just userspace ABI convention to limit the wa batch bo to a resonable size */ +#define I830_BATCH_LIMIT SZ_256K +#define I830_TLB_ENTRIES (2) +#define I830_WA_SIZE max(I830_TLB_ENTRIES * SZ_4K, I830_BATCH_LIMIT) +int i830_emit_bb_start(struct i915_request *rq, + u64 offset, u32 len, + unsigned int dispatch_flags) +{ + u32 *cs, cs_offset = + intel_gt_scratch_offset(rq->engine->gt, + INTEL_GT_SCRATCH_FIELD_DEFAULT); + + GEM_BUG_ON(rq->engine->gt->scratch->size < I830_WA_SIZE); + + cs = intel_ring_begin(rq, 6); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + /* Evict the invalid PTE TLBs */ + *cs++ = COLOR_BLT_CMD | BLT_WRITE_RGBA; + *cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096; + *cs++ = I830_TLB_ENTRIES << 16 | 4; /* load each page */ + *cs++ = cs_offset; + *cs++ = 0xdeadbeef; + *cs++ = MI_NOOP; + intel_ring_advance(rq, cs); + + if ((dispatch_flags & I915_DISPATCH_PINNED) == 0) { + if (len > I830_BATCH_LIMIT) + return -ENOSPC; + + cs = intel_ring_begin(rq, 6 + 2); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + /* + * Blit the batch (which has now all relocs applied) to the + * stable batch scratch bo area (so that the CS never + * stumbles over its tlb invalidation bug) ... + */ + *cs++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (6 - 2); + *cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096; + *cs++ = DIV_ROUND_UP(len, 4096) << 16 | 4096; + *cs++ = cs_offset; + *cs++ = 4096; + *cs++ = offset; + + *cs++ = MI_FLUSH; + *cs++ = MI_NOOP; + intel_ring_advance(rq, cs); + + /* ... and execute it. */ + offset = cs_offset; + } + + if (!(dispatch_flags & I915_DISPATCH_SECURE)) + offset |= MI_BATCH_NON_SECURE; + + cs = intel_ring_begin(rq, 2); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT; + *cs++ = offset; + intel_ring_advance(rq, cs); + + return 0; +} + +int gen3_emit_bb_start(struct i915_request *rq, + u64 offset, u32 len, + unsigned int dispatch_flags) +{ + u32 *cs; + + if (!(dispatch_flags & I915_DISPATCH_SECURE)) + offset |= MI_BATCH_NON_SECURE; + + cs = intel_ring_begin(rq, 2); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT; + *cs++ = offset; + intel_ring_advance(rq, cs); + + return 0; +} + +int gen4_emit_bb_start(struct i915_request *rq, + u64 offset, u32 length, + unsigned int dispatch_flags) +{ + u32 security; + u32 *cs; + + security = MI_BATCH_NON_SECURE_I965; + if (dispatch_flags & I915_DISPATCH_SECURE) + security = 0; + + cs = intel_ring_begin(rq, 2); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT | security; + *cs++ = offset; + intel_ring_advance(rq, cs); + + return 0; +} + +void gen2_irq_enable(struct intel_engine_cs *engine) +{ + struct drm_i915_private *i915 = engine->i915; + + i915->irq_mask &= ~engine->irq_enable_mask; + intel_uncore_write16(&i915->uncore, GEN2_IMR, i915->irq_mask); + ENGINE_POSTING_READ16(engine, RING_IMR); +} + +void gen2_irq_disable(struct intel_engine_cs *engine) +{ + struct drm_i915_private *i915 = engine->i915; + + i915->irq_mask |= engine->irq_enable_mask; + intel_uncore_write16(&i915->uncore, GEN2_IMR, i915->irq_mask); +} + +void gen3_irq_enable(struct intel_engine_cs *engine) +{ + engine->i915->irq_mask &= ~engine->irq_enable_mask; + intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask); + intel_uncore_posting_read_fw(engine->uncore, GEN2_IMR); +} + +void gen3_irq_disable(struct intel_engine_cs *engine) +{ + engine->i915->irq_mask |= engine->irq_enable_mask; + intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask); +} + +void gen5_irq_enable(struct intel_engine_cs *engine) +{ + gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask); +} + +void gen5_irq_disable(struct intel_engine_cs *engine) +{ + gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask); +} diff --git a/drivers/gpu/drm/i915/gt/gen2_engine_cs.h b/drivers/gpu/drm/i915/gt/gen2_engine_cs.h new file mode 100644 index 000000000..a5cd64a65 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/gen2_engine_cs.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2020 Intel Corporation + */ + +#ifndef __GEN2_ENGINE_CS_H__ +#define __GEN2_ENGINE_CS_H__ + +#include <linux/types.h> + +struct i915_request; +struct intel_engine_cs; + +int gen2_emit_flush(struct i915_request *rq, u32 mode); +int gen4_emit_flush_rcs(struct i915_request *rq, u32 mode); +int gen4_emit_flush_vcs(struct i915_request *rq, u32 mode); + +u32 *gen3_emit_breadcrumb(struct i915_request *rq, u32 *cs); +u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs); + +int i830_emit_bb_start(struct i915_request *rq, + u64 offset, u32 len, + unsigned int dispatch_flags); +int gen3_emit_bb_start(struct i915_request *rq, + u64 offset, u32 len, + unsigned int dispatch_flags); +int gen4_emit_bb_start(struct i915_request *rq, + u64 offset, u32 length, + unsigned int dispatch_flags); + +void gen2_irq_enable(struct intel_engine_cs *engine); +void gen2_irq_disable(struct intel_engine_cs *engine); +void gen3_irq_enable(struct intel_engine_cs *engine); +void gen3_irq_disable(struct intel_engine_cs *engine); +void gen5_irq_enable(struct intel_engine_cs *engine); +void gen5_irq_disable(struct intel_engine_cs *engine); + +#endif /* __GEN2_ENGINE_CS_H__ */ diff --git a/drivers/gpu/drm/i915/gt/gen6_engine_cs.c b/drivers/gpu/drm/i915/gt/gen6_engine_cs.c new file mode 100644 index 000000000..ce38d1bca --- /dev/null +++ b/drivers/gpu/drm/i915/gt/gen6_engine_cs.c @@ -0,0 +1,455 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2020 Intel Corporation + */ + +#include "gen6_engine_cs.h" +#include "intel_engine.h" +#include "intel_gpu_commands.h" +#include "intel_gt.h" +#include "intel_gt_irq.h" +#include "intel_gt_pm_irq.h" +#include "intel_ring.h" + +#define HWS_SCRATCH_ADDR (I915_GEM_HWS_SCRATCH * sizeof(u32)) + +/* + * Emits a PIPE_CONTROL with a non-zero post-sync operation, for + * implementing two workarounds on gen6. From section 1.4.7.1 + * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1: + * + * [DevSNB-C+{W/A}] Before any depth stall flush (including those + * produced by non-pipelined state commands), software needs to first + * send a PIPE_CONTROL with no bits set except Post-Sync Operation != + * 0. + * + * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable + * =1, a PIPE_CONTROL with any non-zero post-sync-op is required. + * + * And the workaround for these two requires this workaround first: + * + * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent + * BEFORE the pipe-control with a post-sync op and no write-cache + * flushes. + * + * And this last workaround is tricky because of the requirements on + * that bit. From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM + * volume 2 part 1: + * + * "1 of the following must also be set: + * - Render Target Cache Flush Enable ([12] of DW1) + * - Depth Cache Flush Enable ([0] of DW1) + * - Stall at Pixel Scoreboard ([1] of DW1) + * - Depth Stall ([13] of DW1) + * - Post-Sync Operation ([13] of DW1) + * - Notify Enable ([8] of DW1)" + * + * The cache flushes require the workaround flush that triggered this + * one, so we can't use it. Depth stall would trigger the same. + * Post-sync nonzero is what triggered this second workaround, so we + * can't use that one either. Notify enable is IRQs, which aren't + * really our business. That leaves only stall at scoreboard. + */ +static int +gen6_emit_post_sync_nonzero_flush(struct i915_request *rq) +{ + u32 scratch_addr = + intel_gt_scratch_offset(rq->engine->gt, + INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH); + u32 *cs; + + cs = intel_ring_begin(rq, 6); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = GFX_OP_PIPE_CONTROL(5); + *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD; + *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT; + *cs++ = 0; /* low dword */ + *cs++ = 0; /* high dword */ + *cs++ = MI_NOOP; + intel_ring_advance(rq, cs); + + cs = intel_ring_begin(rq, 6); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = GFX_OP_PIPE_CONTROL(5); + *cs++ = PIPE_CONTROL_QW_WRITE; + *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT; + *cs++ = 0; + *cs++ = 0; + *cs++ = MI_NOOP; + intel_ring_advance(rq, cs); + + return 0; +} + +int gen6_emit_flush_rcs(struct i915_request *rq, u32 mode) +{ + u32 scratch_addr = + intel_gt_scratch_offset(rq->engine->gt, + INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH); + u32 *cs, flags = 0; + int ret; + + /* Force SNB workarounds for PIPE_CONTROL flushes */ + ret = gen6_emit_post_sync_nonzero_flush(rq); + if (ret) + return ret; + + /* + * Just flush everything. Experiments have shown that reducing the + * number of bits based on the write domains has little performance + * impact. And when rearranging requests, the order of flushes is + * unknown. + */ + if (mode & EMIT_FLUSH) { + flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; + flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; + /* + * Ensure that any following seqno writes only happen + * when the render cache is indeed flushed. + */ + flags |= PIPE_CONTROL_CS_STALL; + } + if (mode & EMIT_INVALIDATE) { + flags |= PIPE_CONTROL_TLB_INVALIDATE; + flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; + /* + * TLB invalidate requires a post-sync write. + */ + flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL; + } + + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = GFX_OP_PIPE_CONTROL(4); + *cs++ = flags; + *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT; + *cs++ = 0; + intel_ring_advance(rq, cs); + + return 0; +} + +u32 *gen6_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs) +{ + /* First we do the gen6_emit_post_sync_nonzero_flush w/a */ + *cs++ = GFX_OP_PIPE_CONTROL(4); + *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD; + *cs++ = 0; + *cs++ = 0; + + *cs++ = GFX_OP_PIPE_CONTROL(4); + *cs++ = PIPE_CONTROL_QW_WRITE; + *cs++ = intel_gt_scratch_offset(rq->engine->gt, + INTEL_GT_SCRATCH_FIELD_DEFAULT) | + PIPE_CONTROL_GLOBAL_GTT; + *cs++ = 0; + + /* Finally we can flush and with it emit the breadcrumb */ + *cs++ = GFX_OP_PIPE_CONTROL(4); + *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_DC_FLUSH_ENABLE | + PIPE_CONTROL_QW_WRITE | + PIPE_CONTROL_CS_STALL); + *cs++ = i915_request_active_timeline(rq)->hwsp_offset | + PIPE_CONTROL_GLOBAL_GTT; + *cs++ = rq->fence.seqno; + + *cs++ = MI_USER_INTERRUPT; + *cs++ = MI_NOOP; + + rq->tail = intel_ring_offset(rq, cs); + assert_ring_tail_valid(rq->ring, rq->tail); + + return cs; +} + +static int mi_flush_dw(struct i915_request *rq, u32 flags) +{ + u32 cmd, *cs; + + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + cmd = MI_FLUSH_DW; + + /* + * We always require a command barrier so that subsequent + * commands, such as breadcrumb interrupts, are strictly ordered + * wrt the contents of the write cache being flushed to memory + * (and thus being coherent from the CPU). + */ + cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; + + /* + * Bspec vol 1c.3 - blitter engine command streamer: + * "If ENABLED, all TLBs will be invalidated once the flush + * operation is complete. This bit is only valid when the + * Post-Sync Operation field is a value of 1h or 3h." + */ + cmd |= flags; + + *cs++ = cmd; + *cs++ = HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT; + *cs++ = 0; + *cs++ = MI_NOOP; + + intel_ring_advance(rq, cs); + + return 0; +} + +static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags) +{ + return mi_flush_dw(rq, mode & EMIT_INVALIDATE ? invflags : 0); +} + +int gen6_emit_flush_xcs(struct i915_request *rq, u32 mode) +{ + return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB); +} + +int gen6_emit_flush_vcs(struct i915_request *rq, u32 mode) +{ + return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB | MI_INVALIDATE_BSD); +} + +int gen6_emit_bb_start(struct i915_request *rq, + u64 offset, u32 len, + unsigned int dispatch_flags) +{ + u32 security; + u32 *cs; + + security = MI_BATCH_NON_SECURE_I965; + if (dispatch_flags & I915_DISPATCH_SECURE) + security = 0; + + cs = intel_ring_begin(rq, 2); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + cs = __gen6_emit_bb_start(cs, offset, security); + intel_ring_advance(rq, cs); + + return 0; +} + +int +hsw_emit_bb_start(struct i915_request *rq, + u64 offset, u32 len, + unsigned int dispatch_flags) +{ + u32 security; + u32 *cs; + + security = MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW; + if (dispatch_flags & I915_DISPATCH_SECURE) + security = 0; + + cs = intel_ring_begin(rq, 2); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + cs = __gen6_emit_bb_start(cs, offset, security); + intel_ring_advance(rq, cs); + + return 0; +} + +static int gen7_stall_cs(struct i915_request *rq) +{ + u32 *cs; + + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = GFX_OP_PIPE_CONTROL(4); + *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD; + *cs++ = 0; + *cs++ = 0; + intel_ring_advance(rq, cs); + + return 0; +} + +int gen7_emit_flush_rcs(struct i915_request *rq, u32 mode) +{ + u32 scratch_addr = + intel_gt_scratch_offset(rq->engine->gt, + INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH); + u32 *cs, flags = 0; + + /* + * Ensure that any following seqno writes only happen when the render + * cache is indeed flushed. + * + * Workaround: 4th PIPE_CONTROL command (except the ones with only + * read-cache invalidate bits set) must have the CS_STALL bit set. We + * don't try to be clever and just set it unconditionally. + */ + flags |= PIPE_CONTROL_CS_STALL; + + /* + * CS_STALL suggests at least a post-sync write. + */ + flags |= PIPE_CONTROL_QW_WRITE; + flags |= PIPE_CONTROL_GLOBAL_GTT_IVB; + + /* + * Just flush everything. Experiments have shown that reducing the + * number of bits based on the write domains has little performance + * impact. + */ + if (mode & EMIT_FLUSH) { + flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; + flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; + flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; + flags |= PIPE_CONTROL_FLUSH_ENABLE; + } + if (mode & EMIT_INVALIDATE) { + flags |= PIPE_CONTROL_TLB_INVALIDATE; + flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR; + + /* + * Workaround: we must issue a pipe_control with CS-stall bit + * set before a pipe_control command that has the state cache + * invalidate bit set. + */ + gen7_stall_cs(rq); + } + + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = GFX_OP_PIPE_CONTROL(4); + *cs++ = flags; + *cs++ = scratch_addr; + *cs++ = 0; + intel_ring_advance(rq, cs); + + return 0; +} + +u32 *gen7_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs) +{ + *cs++ = GFX_OP_PIPE_CONTROL(4); + *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_DC_FLUSH_ENABLE | + PIPE_CONTROL_FLUSH_ENABLE | + PIPE_CONTROL_QW_WRITE | + PIPE_CONTROL_GLOBAL_GTT_IVB | + PIPE_CONTROL_CS_STALL); + *cs++ = i915_request_active_timeline(rq)->hwsp_offset; + *cs++ = rq->fence.seqno; + + *cs++ = MI_USER_INTERRUPT; + *cs++ = MI_NOOP; + + rq->tail = intel_ring_offset(rq, cs); + assert_ring_tail_valid(rq->ring, rq->tail); + + return cs; +} + +u32 *gen6_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs) +{ + GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma); + GEM_BUG_ON(offset_in_page(i915_request_active_timeline(rq)->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR); + + *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX; + *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT; + *cs++ = rq->fence.seqno; + + *cs++ = MI_USER_INTERRUPT; + + rq->tail = intel_ring_offset(rq, cs); + assert_ring_tail_valid(rq->ring, rq->tail); + + return cs; +} + +#define GEN7_XCS_WA 32 +u32 *gen7_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs) +{ + int i; + + GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma); + GEM_BUG_ON(offset_in_page(i915_request_active_timeline(rq)->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR); + + *cs++ = MI_FLUSH_DW | MI_INVALIDATE_TLB | + MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX; + *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT; + *cs++ = rq->fence.seqno; + + for (i = 0; i < GEN7_XCS_WA; i++) { + *cs++ = MI_STORE_DWORD_INDEX; + *cs++ = I915_GEM_HWS_SEQNO_ADDR; + *cs++ = rq->fence.seqno; + } + + *cs++ = MI_FLUSH_DW; + *cs++ = 0; + *cs++ = 0; + + *cs++ = MI_USER_INTERRUPT; + *cs++ = MI_NOOP; + + rq->tail = intel_ring_offset(rq, cs); + assert_ring_tail_valid(rq->ring, rq->tail); + + return cs; +} +#undef GEN7_XCS_WA + +void gen6_irq_enable(struct intel_engine_cs *engine) +{ + ENGINE_WRITE(engine, RING_IMR, + ~(engine->irq_enable_mask | engine->irq_keep_mask)); + + /* Flush/delay to ensure the RING_IMR is active before the GT IMR */ + ENGINE_POSTING_READ(engine, RING_IMR); + + gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask); +} + +void gen6_irq_disable(struct intel_engine_cs *engine) +{ + ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask); + gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask); +} + +void hsw_irq_enable_vecs(struct intel_engine_cs *engine) +{ + ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask); + + /* Flush/delay to ensure the RING_IMR is active before the GT IMR */ + ENGINE_POSTING_READ(engine, RING_IMR); + + gen6_gt_pm_unmask_irq(engine->gt, engine->irq_enable_mask); +} + +void hsw_irq_disable_vecs(struct intel_engine_cs *engine) +{ + ENGINE_WRITE(engine, RING_IMR, ~0); + gen6_gt_pm_mask_irq(engine->gt, engine->irq_enable_mask); +} diff --git a/drivers/gpu/drm/i915/gt/gen6_engine_cs.h b/drivers/gpu/drm/i915/gt/gen6_engine_cs.h new file mode 100644 index 000000000..76c6bc9f3 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/gen6_engine_cs.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2020 Intel Corporation + */ + +#ifndef __GEN6_ENGINE_CS_H__ +#define __GEN6_ENGINE_CS_H__ + +#include <linux/types.h> + +#include "intel_gpu_commands.h" + +struct i915_request; +struct intel_engine_cs; + +int gen6_emit_flush_rcs(struct i915_request *rq, u32 mode); +int gen6_emit_flush_vcs(struct i915_request *rq, u32 mode); +int gen6_emit_flush_xcs(struct i915_request *rq, u32 mode); +u32 *gen6_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs); +u32 *gen6_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs); + +int gen7_emit_flush_rcs(struct i915_request *rq, u32 mode); +u32 *gen7_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs); +u32 *gen7_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs); + +int gen6_emit_bb_start(struct i915_request *rq, + u64 offset, u32 len, + unsigned int dispatch_flags); +int hsw_emit_bb_start(struct i915_request *rq, + u64 offset, u32 len, + unsigned int dispatch_flags); + +void gen6_irq_enable(struct intel_engine_cs *engine); +void gen6_irq_disable(struct intel_engine_cs *engine); + +void hsw_irq_enable_vecs(struct intel_engine_cs *engine); +void hsw_irq_disable_vecs(struct intel_engine_cs *engine); + +#endif /* __GEN6_ENGINE_CS_H__ */ diff --git a/drivers/gpu/drm/i915/gt/gen6_ppgtt.c b/drivers/gpu/drm/i915/gt/gen6_ppgtt.c new file mode 100644 index 000000000..c30adc05f --- /dev/null +++ b/drivers/gpu/drm/i915/gt/gen6_ppgtt.c @@ -0,0 +1,483 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2020 Intel Corporation + */ + +#include <linux/log2.h> + +#include "gen6_ppgtt.h" +#include "i915_scatterlist.h" +#include "i915_trace.h" +#include "i915_vgpu.h" +#include "intel_gt.h" + +/* Write pde (index) from the page directory @pd to the page table @pt */ +static inline void gen6_write_pde(const struct gen6_ppgtt *ppgtt, + const unsigned int pde, + const struct i915_page_table *pt) +{ + dma_addr_t addr = pt ? px_dma(pt) : px_dma(ppgtt->base.vm.scratch[1]); + + /* Caller needs to make sure the write completes if necessary */ + iowrite32(GEN6_PDE_ADDR_ENCODE(addr) | GEN6_PDE_VALID, + ppgtt->pd_addr + pde); +} + +void gen7_ppgtt_enable(struct intel_gt *gt) +{ + struct drm_i915_private *i915 = gt->i915; + struct intel_uncore *uncore = gt->uncore; + struct intel_engine_cs *engine; + enum intel_engine_id id; + u32 ecochk; + + intel_uncore_rmw(uncore, GAC_ECO_BITS, 0, ECOBITS_PPGTT_CACHE64B); + + ecochk = intel_uncore_read(uncore, GAM_ECOCHK); + if (IS_HASWELL(i915)) { + ecochk |= ECOCHK_PPGTT_WB_HSW; + } else { + ecochk |= ECOCHK_PPGTT_LLC_IVB; + ecochk &= ~ECOCHK_PPGTT_GFDT_IVB; + } + intel_uncore_write(uncore, GAM_ECOCHK, ecochk); + + for_each_engine(engine, gt, id) { + /* GFX_MODE is per-ring on gen7+ */ + ENGINE_WRITE(engine, + RING_MODE_GEN7, + _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE)); + } +} + +void gen6_ppgtt_enable(struct intel_gt *gt) +{ + struct intel_uncore *uncore = gt->uncore; + + intel_uncore_rmw(uncore, + GAC_ECO_BITS, + 0, + ECOBITS_SNB_BIT | ECOBITS_PPGTT_CACHE64B); + + intel_uncore_rmw(uncore, + GAB_CTL, + 0, + GAB_CTL_CONT_AFTER_PAGEFAULT); + + intel_uncore_rmw(uncore, + GAM_ECOCHK, + 0, + ECOCHK_SNB_BIT | ECOCHK_PPGTT_CACHE64B); + + if (HAS_PPGTT(uncore->i915)) /* may be disabled for VT-d */ + intel_uncore_write(uncore, + GFX_MODE, + _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE)); +} + +/* PPGTT support for Sandybdrige/Gen6 and later */ +static void gen6_ppgtt_clear_range(struct i915_address_space *vm, + u64 start, u64 length) +{ + struct gen6_ppgtt * const ppgtt = to_gen6_ppgtt(i915_vm_to_ppgtt(vm)); + const unsigned int first_entry = start / I915_GTT_PAGE_SIZE; + const gen6_pte_t scratch_pte = vm->scratch[0]->encode; + unsigned int pde = first_entry / GEN6_PTES; + unsigned int pte = first_entry % GEN6_PTES; + unsigned int num_entries = length / I915_GTT_PAGE_SIZE; + + while (num_entries) { + struct i915_page_table * const pt = + i915_pt_entry(ppgtt->base.pd, pde++); + const unsigned int count = min(num_entries, GEN6_PTES - pte); + gen6_pte_t *vaddr; + + num_entries -= count; + + GEM_BUG_ON(count > atomic_read(&pt->used)); + if (!atomic_sub_return(count, &pt->used)) + ppgtt->scan_for_unused_pt = true; + + /* + * Note that the hw doesn't support removing PDE on the fly + * (they are cached inside the context with no means to + * invalidate the cache), so we can only reset the PTE + * entries back to scratch. + */ + + vaddr = kmap_atomic_px(pt); + memset32(vaddr + pte, scratch_pte, count); + kunmap_atomic(vaddr); + + pte = 0; + } +} + +static void gen6_ppgtt_insert_entries(struct i915_address_space *vm, + struct i915_vma *vma, + enum i915_cache_level cache_level, + u32 flags) +{ + struct i915_ppgtt *ppgtt = i915_vm_to_ppgtt(vm); + struct i915_page_directory * const pd = ppgtt->pd; + unsigned int first_entry = vma->node.start / I915_GTT_PAGE_SIZE; + unsigned int act_pt = first_entry / GEN6_PTES; + unsigned int act_pte = first_entry % GEN6_PTES; + const u32 pte_encode = vm->pte_encode(0, cache_level, flags); + struct sgt_dma iter = sgt_dma(vma); + gen6_pte_t *vaddr; + + GEM_BUG_ON(!pd->entry[act_pt]); + + vaddr = kmap_atomic_px(i915_pt_entry(pd, act_pt)); + do { + GEM_BUG_ON(iter.sg->length < I915_GTT_PAGE_SIZE); + vaddr[act_pte] = pte_encode | GEN6_PTE_ADDR_ENCODE(iter.dma); + + iter.dma += I915_GTT_PAGE_SIZE; + if (iter.dma == iter.max) { + iter.sg = __sg_next(iter.sg); + if (!iter.sg) + break; + + iter.dma = sg_dma_address(iter.sg); + iter.max = iter.dma + iter.sg->length; + } + + if (++act_pte == GEN6_PTES) { + kunmap_atomic(vaddr); + vaddr = kmap_atomic_px(i915_pt_entry(pd, ++act_pt)); + act_pte = 0; + } + } while (1); + kunmap_atomic(vaddr); + + vma->page_sizes.gtt = I915_GTT_PAGE_SIZE; +} + +static void gen6_flush_pd(struct gen6_ppgtt *ppgtt, u64 start, u64 end) +{ + struct i915_page_directory * const pd = ppgtt->base.pd; + struct i915_page_table *pt; + unsigned int pde; + + start = round_down(start, SZ_64K); + end = round_up(end, SZ_64K) - start; + + mutex_lock(&ppgtt->flush); + + gen6_for_each_pde(pt, pd, start, end, pde) + gen6_write_pde(ppgtt, pde, pt); + + mb(); + ioread32(ppgtt->pd_addr + pde - 1); + gen6_ggtt_invalidate(ppgtt->base.vm.gt->ggtt); + mb(); + + mutex_unlock(&ppgtt->flush); +} + +static void gen6_alloc_va_range(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + u64 start, u64 length) +{ + struct gen6_ppgtt *ppgtt = to_gen6_ppgtt(i915_vm_to_ppgtt(vm)); + struct i915_page_directory * const pd = ppgtt->base.pd; + struct i915_page_table *pt; + bool flush = false; + u64 from = start; + unsigned int pde; + + spin_lock(&pd->lock); + gen6_for_each_pde(pt, pd, start, length, pde) { + const unsigned int count = gen6_pte_count(start, length); + + if (!pt) { + spin_unlock(&pd->lock); + + pt = stash->pt[0]; + __i915_gem_object_pin_pages(pt->base); + i915_gem_object_make_unshrinkable(pt->base); + + fill32_px(pt, vm->scratch[0]->encode); + + spin_lock(&pd->lock); + if (!pd->entry[pde]) { + stash->pt[0] = pt->stash; + atomic_set(&pt->used, 0); + pd->entry[pde] = pt; + } else { + pt = pd->entry[pde]; + } + + flush = true; + } + + atomic_add(count, &pt->used); + } + spin_unlock(&pd->lock); + + if (flush && i915_vma_is_bound(ppgtt->vma, I915_VMA_GLOBAL_BIND)) { + intel_wakeref_t wakeref; + + with_intel_runtime_pm(&vm->i915->runtime_pm, wakeref) + gen6_flush_pd(ppgtt, from, start); + } +} + +static int gen6_ppgtt_init_scratch(struct gen6_ppgtt *ppgtt) +{ + struct i915_address_space * const vm = &ppgtt->base.vm; + int ret; + + ret = setup_scratch_page(vm); + if (ret) + return ret; + + vm->scratch[0]->encode = + vm->pte_encode(px_dma(vm->scratch[0]), + I915_CACHE_NONE, PTE_READ_ONLY); + + vm->scratch[1] = vm->alloc_pt_dma(vm, I915_GTT_PAGE_SIZE_4K); + if (IS_ERR(vm->scratch[1])) { + ret = PTR_ERR(vm->scratch[1]); + goto err_scratch0; + } + + ret = pin_pt_dma(vm, vm->scratch[1]); + if (ret) + goto err_scratch1; + + fill32_px(vm->scratch[1], vm->scratch[0]->encode); + + return 0; + +err_scratch1: + i915_gem_object_put(vm->scratch[1]); +err_scratch0: + i915_gem_object_put(vm->scratch[0]); + return ret; +} + +static void gen6_ppgtt_free_pd(struct gen6_ppgtt *ppgtt) +{ + struct i915_page_directory * const pd = ppgtt->base.pd; + struct i915_page_table *pt; + u32 pde; + + gen6_for_all_pdes(pt, pd, pde) + if (pt) + free_pt(&ppgtt->base.vm, pt); +} + +static void gen6_ppgtt_cleanup(struct i915_address_space *vm) +{ + struct gen6_ppgtt *ppgtt = to_gen6_ppgtt(i915_vm_to_ppgtt(vm)); + + __i915_vma_put(ppgtt->vma); + + gen6_ppgtt_free_pd(ppgtt); + free_scratch(vm); + + mutex_destroy(&ppgtt->flush); + mutex_destroy(&ppgtt->pin_mutex); + + free_pd(&ppgtt->base.vm, ppgtt->base.pd); +} + +static int pd_vma_set_pages(struct i915_vma *vma) +{ + vma->pages = ERR_PTR(-ENODEV); + return 0; +} + +static void pd_vma_clear_pages(struct i915_vma *vma) +{ + GEM_BUG_ON(!vma->pages); + + vma->pages = NULL; +} + +static void pd_vma_bind(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + struct i915_vma *vma, + enum i915_cache_level cache_level, + u32 unused) +{ + struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm); + struct gen6_ppgtt *ppgtt = vma->private; + u32 ggtt_offset = i915_ggtt_offset(vma) / I915_GTT_PAGE_SIZE; + + ppgtt->pp_dir = ggtt_offset * sizeof(gen6_pte_t) << 10; + ppgtt->pd_addr = (gen6_pte_t __iomem *)ggtt->gsm + ggtt_offset; + + gen6_flush_pd(ppgtt, 0, ppgtt->base.vm.total); +} + +static void pd_vma_unbind(struct i915_address_space *vm, struct i915_vma *vma) +{ + struct gen6_ppgtt *ppgtt = vma->private; + struct i915_page_directory * const pd = ppgtt->base.pd; + struct i915_page_table *pt; + unsigned int pde; + + if (!ppgtt->scan_for_unused_pt) + return; + + /* Free all no longer used page tables */ + gen6_for_all_pdes(pt, ppgtt->base.pd, pde) { + if (!pt || atomic_read(&pt->used)) + continue; + + free_pt(&ppgtt->base.vm, pt); + pd->entry[pde] = NULL; + } + + ppgtt->scan_for_unused_pt = false; +} + +static const struct i915_vma_ops pd_vma_ops = { + .set_pages = pd_vma_set_pages, + .clear_pages = pd_vma_clear_pages, + .bind_vma = pd_vma_bind, + .unbind_vma = pd_vma_unbind, +}; + +static struct i915_vma *pd_vma_create(struct gen6_ppgtt *ppgtt, int size) +{ + struct i915_ggtt *ggtt = ppgtt->base.vm.gt->ggtt; + struct i915_vma *vma; + + GEM_BUG_ON(!IS_ALIGNED(size, I915_GTT_PAGE_SIZE)); + GEM_BUG_ON(size > ggtt->vm.total); + + vma = i915_vma_alloc(); + if (!vma) + return ERR_PTR(-ENOMEM); + + i915_active_init(&vma->active, NULL, NULL); + + kref_init(&vma->ref); + mutex_init(&vma->pages_mutex); + vma->vm = i915_vm_get(&ggtt->vm); + vma->ops = &pd_vma_ops; + vma->private = ppgtt; + + vma->size = size; + vma->fence_size = size; + atomic_set(&vma->flags, I915_VMA_GGTT); + vma->ggtt_view.type = I915_GGTT_VIEW_ROTATED; /* prevent fencing */ + + INIT_LIST_HEAD(&vma->obj_link); + INIT_LIST_HEAD(&vma->closed_link); + + return vma; +} + +int gen6_ppgtt_pin(struct i915_ppgtt *base, struct i915_gem_ww_ctx *ww) +{ + struct gen6_ppgtt *ppgtt = to_gen6_ppgtt(base); + int err; + + GEM_BUG_ON(!atomic_read(&ppgtt->base.vm.open)); + + /* + * Workaround the limited maximum vma->pin_count and the aliasing_ppgtt + * which will be pinned into every active context. + * (When vma->pin_count becomes atomic, I expect we will naturally + * need a larger, unpacked, type and kill this redundancy.) + */ + if (atomic_add_unless(&ppgtt->pin_count, 1, 0)) + return 0; + + if (mutex_lock_interruptible(&ppgtt->pin_mutex)) + return -EINTR; + + /* + * PPGTT PDEs reside in the GGTT and consists of 512 entries. The + * allocator works in address space sizes, so it's multiplied by page + * size. We allocate at the top of the GTT to avoid fragmentation. + */ + err = 0; + if (!atomic_read(&ppgtt->pin_count)) + err = i915_ggtt_pin(ppgtt->vma, ww, GEN6_PD_ALIGN, PIN_HIGH); + if (!err) + atomic_inc(&ppgtt->pin_count); + mutex_unlock(&ppgtt->pin_mutex); + + return err; +} + +void gen6_ppgtt_unpin(struct i915_ppgtt *base) +{ + struct gen6_ppgtt *ppgtt = to_gen6_ppgtt(base); + + GEM_BUG_ON(!atomic_read(&ppgtt->pin_count)); + if (atomic_dec_and_test(&ppgtt->pin_count)) + i915_vma_unpin(ppgtt->vma); +} + +void gen6_ppgtt_unpin_all(struct i915_ppgtt *base) +{ + struct gen6_ppgtt *ppgtt = to_gen6_ppgtt(base); + + if (!atomic_read(&ppgtt->pin_count)) + return; + + i915_vma_unpin(ppgtt->vma); + atomic_set(&ppgtt->pin_count, 0); +} + +struct i915_ppgtt *gen6_ppgtt_create(struct intel_gt *gt) +{ + struct i915_ggtt * const ggtt = gt->ggtt; + struct gen6_ppgtt *ppgtt; + int err; + + ppgtt = kzalloc(sizeof(*ppgtt), GFP_KERNEL); + if (!ppgtt) + return ERR_PTR(-ENOMEM); + + mutex_init(&ppgtt->flush); + mutex_init(&ppgtt->pin_mutex); + + ppgtt_init(&ppgtt->base, gt); + ppgtt->base.vm.pd_shift = ilog2(SZ_4K * SZ_4K / sizeof(gen6_pte_t)); + ppgtt->base.vm.top = 1; + + ppgtt->base.vm.bind_async_flags = I915_VMA_LOCAL_BIND; + ppgtt->base.vm.allocate_va_range = gen6_alloc_va_range; + ppgtt->base.vm.clear_range = gen6_ppgtt_clear_range; + ppgtt->base.vm.insert_entries = gen6_ppgtt_insert_entries; + ppgtt->base.vm.cleanup = gen6_ppgtt_cleanup; + + ppgtt->base.vm.alloc_pt_dma = alloc_pt_dma; + ppgtt->base.vm.pte_encode = ggtt->vm.pte_encode; + + ppgtt->base.pd = __alloc_pd(I915_PDES); + if (!ppgtt->base.pd) { + err = -ENOMEM; + goto err_free; + } + + err = gen6_ppgtt_init_scratch(ppgtt); + if (err) + goto err_pd; + + ppgtt->vma = pd_vma_create(ppgtt, GEN6_PD_SIZE); + if (IS_ERR(ppgtt->vma)) { + err = PTR_ERR(ppgtt->vma); + goto err_scratch; + } + + return &ppgtt->base; + +err_scratch: + free_scratch(&ppgtt->base.vm); +err_pd: + free_pd(&ppgtt->base.vm, ppgtt->base.pd); +err_free: + mutex_destroy(&ppgtt->pin_mutex); + kfree(ppgtt); + return ERR_PTR(err); +} diff --git a/drivers/gpu/drm/i915/gt/gen6_ppgtt.h b/drivers/gpu/drm/i915/gt/gen6_ppgtt.h new file mode 100644 index 000000000..3357228f3 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/gen6_ppgtt.h @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2020 Intel Corporation + */ + +#ifndef __GEN6_PPGTT_H__ +#define __GEN6_PPGTT_H__ + +#include "intel_gtt.h" + +struct i915_gem_ww_ctx; + +struct gen6_ppgtt { + struct i915_ppgtt base; + + struct mutex flush; + struct i915_vma *vma; + gen6_pte_t __iomem *pd_addr; + u32 pp_dir; + + atomic_t pin_count; + struct mutex pin_mutex; + + bool scan_for_unused_pt; +}; + +static inline u32 gen6_pte_index(u32 addr) +{ + return i915_pte_index(addr, GEN6_PDE_SHIFT); +} + +static inline u32 gen6_pte_count(u32 addr, u32 length) +{ + return i915_pte_count(addr, length, GEN6_PDE_SHIFT); +} + +static inline u32 gen6_pde_index(u32 addr) +{ + return i915_pde_index(addr, GEN6_PDE_SHIFT); +} + +#define __to_gen6_ppgtt(base) container_of(base, struct gen6_ppgtt, base) + +static inline struct gen6_ppgtt *to_gen6_ppgtt(struct i915_ppgtt *base) +{ + BUILD_BUG_ON(offsetof(struct gen6_ppgtt, base)); + return __to_gen6_ppgtt(base); +} + +/* + * gen6_for_each_pde() iterates over every pde from start until start+length. + * If start and start+length are not perfectly divisible, the macro will round + * down and up as needed. Start=0 and length=2G effectively iterates over + * every PDE in the system. The macro modifies ALL its parameters except 'pd', + * so each of the other parameters should preferably be a simple variable, or + * at most an lvalue with no side-effects! + */ +#define gen6_for_each_pde(pt, pd, start, length, iter) \ + for (iter = gen6_pde_index(start); \ + length > 0 && iter < I915_PDES && \ + (pt = i915_pt_entry(pd, iter), true); \ + ({ u32 temp = ALIGN(start+1, 1 << GEN6_PDE_SHIFT); \ + temp = min(temp - start, length); \ + start += temp, length -= temp; }), ++iter) + +#define gen6_for_all_pdes(pt, pd, iter) \ + for (iter = 0; \ + iter < I915_PDES && \ + (pt = i915_pt_entry(pd, iter), true); \ + ++iter) + +int gen6_ppgtt_pin(struct i915_ppgtt *base, struct i915_gem_ww_ctx *ww); +void gen6_ppgtt_unpin(struct i915_ppgtt *base); +void gen6_ppgtt_unpin_all(struct i915_ppgtt *base); +void gen6_ppgtt_enable(struct intel_gt *gt); +void gen7_ppgtt_enable(struct intel_gt *gt); +struct i915_ppgtt *gen6_ppgtt_create(struct intel_gt *gt); + +#endif diff --git a/drivers/gpu/drm/i915/gt/gen6_renderstate.c b/drivers/gpu/drm/i915/gt/gen6_renderstate.c new file mode 100644 index 000000000..11c8e7b3d --- /dev/null +++ b/drivers/gpu/drm/i915/gt/gen6_renderstate.c @@ -0,0 +1,315 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Generated by: intel-gpu-tools-1.8-220-g01153e7 + */ + +#include "intel_renderstate.h" + +static const u32 gen6_null_state_relocs[] = { + 0x00000020, + 0x00000024, + 0x0000002c, + 0x000001e0, + 0x000001e4, + -1, +}; + +static const u32 gen6_null_state_batch[] = { + 0x69040000, + 0x790d0001, + 0x00000000, + 0x00000000, + 0x78180000, + 0x00000001, + 0x61010008, + 0x00000000, + 0x00000001, /* reloc */ + 0x00000001, /* reloc */ + 0x00000000, + 0x00000001, /* reloc */ + 0x00000000, + 0x00000001, + 0x00000000, + 0x00000001, + 0x61020000, + 0x00000000, + 0x78050001, + 0x00000018, + 0x00000000, + 0x780d1002, + 0x00000000, + 0x00000000, + 0x00000420, + 0x78150003, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78100004, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78160003, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78110005, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78120002, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78170003, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x79050005, + 0xe0040000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x79100000, + 0x00000000, + 0x79000002, + 0xffffffff, + 0x00000000, + 0x00000000, + 0x780e0002, + 0x00000441, + 0x00000401, + 0x00000401, + 0x78021002, + 0x00000000, + 0x00000000, + 0x00000400, + 0x78130012, + 0x00400810, + 0x00000000, + 0x20000000, + 0x04000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78140007, + 0x00000280, + 0x08080000, + 0x00000000, + 0x00060000, + 0x4e080002, + 0x00100400, + 0x00000000, + 0x00000000, + 0x78090005, + 0x02000000, + 0x22220000, + 0x02f60000, + 0x11330000, + 0x02850004, + 0x11220000, + 0x78011002, + 0x00000000, + 0x00000000, + 0x00000200, + 0x78080003, + 0x00002000, + 0x00000448, /* reloc */ + 0x00000448, /* reloc */ + 0x00000000, + 0x05000000, /* cmds end */ + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000220, /* state start */ + 0x00000240, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x0060005a, + 0x204077be, + 0x000000c0, + 0x008d0040, + 0x0060005a, + 0x206077be, + 0x000000c0, + 0x008d0080, + 0x0060005a, + 0x208077be, + 0x000000d0, + 0x008d0040, + 0x0060005a, + 0x20a077be, + 0x000000d0, + 0x008d0080, + 0x00000201, + 0x20080061, + 0x00000000, + 0x00000000, + 0x00600001, + 0x20200022, + 0x008d0000, + 0x00000000, + 0x02800031, + 0x21c01cc9, + 0x00000020, + 0x0a8a0001, + 0x00600001, + 0x204003be, + 0x008d01c0, + 0x00000000, + 0x00600001, + 0x206003be, + 0x008d01e0, + 0x00000000, + 0x00600001, + 0x208003be, + 0x008d0200, + 0x00000000, + 0x00600001, + 0x20a003be, + 0x008d0220, + 0x00000000, + 0x00600001, + 0x20c003be, + 0x008d0240, + 0x00000000, + 0x00600001, + 0x20e003be, + 0x008d0260, + 0x00000000, + 0x00600001, + 0x210003be, + 0x008d0280, + 0x00000000, + 0x00600001, + 0x212003be, + 0x008d02a0, + 0x00000000, + 0x05800031, + 0x24001cc8, + 0x00000040, + 0x90019000, + 0x0000007e, + 0x00000000, + 0x00000000, + 0x00000000, + 0x0000007e, + 0x00000000, + 0x00000000, + 0x00000000, + 0x0000007e, + 0x00000000, + 0x00000000, + 0x00000000, + 0x0000007e, + 0x00000000, + 0x00000000, + 0x00000000, + 0x0000007e, + 0x00000000, + 0x00000000, + 0x00000000, + 0x0000007e, + 0x00000000, + 0x00000000, + 0x00000000, + 0x0000007e, + 0x00000000, + 0x00000000, + 0x00000000, + 0x0000007e, + 0x00000000, + 0x00000000, + 0x00000000, + 0x30000000, + 0x00000124, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0xf99a130c, + 0x799a130c, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x80000031, + 0x00000003, + 0x00000000, /* state end */ +}; + +RO_RENDERSTATE(6); diff --git a/drivers/gpu/drm/i915/gt/gen7_renderclear.c b/drivers/gpu/drm/i915/gt/gen7_renderclear.c new file mode 100644 index 000000000..c724fba8a --- /dev/null +++ b/drivers/gpu/drm/i915/gt/gen7_renderclear.c @@ -0,0 +1,452 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2019 Intel Corporation + */ + +#include "gen7_renderclear.h" +#include "i915_drv.h" +#include "intel_gpu_commands.h" + +#define GT3_INLINE_DATA_DELAYS 0x1E00 +#define batch_advance(Y, CS) GEM_BUG_ON((Y)->end != (CS)) + +struct cb_kernel { + const void *data; + u32 size; +}; + +#define CB_KERNEL(name) { .data = (name), .size = sizeof(name) } + +#include "ivb_clear_kernel.c" +static const struct cb_kernel cb_kernel_ivb = CB_KERNEL(ivb_clear_kernel); + +#include "hsw_clear_kernel.c" +static const struct cb_kernel cb_kernel_hsw = CB_KERNEL(hsw_clear_kernel); + +struct batch_chunk { + struct i915_vma *vma; + u32 offset; + u32 *start; + u32 *end; + u32 max_items; +}; + +struct batch_vals { + u32 max_threads; + u32 state_start; + u32 surface_start; + u32 surface_height; + u32 surface_width; + u32 size; +}; + +static inline int num_primitives(const struct batch_vals *bv) +{ + /* + * We need to saturate the GPU with work in order to dispatch + * a shader on every HW thread, and clear the thread-local registers. + * In short, we have to dispatch work faster than the shaders can + * run in order to fill the EU and occupy each HW thread. + */ + return bv->max_threads; +} + +static void +batch_get_defaults(struct drm_i915_private *i915, struct batch_vals *bv) +{ + if (IS_HASWELL(i915)) { + switch (INTEL_INFO(i915)->gt) { + default: + case 1: + bv->max_threads = 70; + break; + case 2: + bv->max_threads = 140; + break; + case 3: + bv->max_threads = 280; + break; + } + bv->surface_height = 16 * 16; + bv->surface_width = 32 * 2 * 16; + } else { + switch (INTEL_INFO(i915)->gt) { + default: + case 1: /* including vlv */ + bv->max_threads = 36; + break; + case 2: + bv->max_threads = 128; + break; + } + bv->surface_height = 16 * 8; + bv->surface_width = 32 * 16; + } + bv->state_start = round_up(SZ_1K + num_primitives(bv) * 64, SZ_4K); + bv->surface_start = bv->state_start + SZ_4K; + bv->size = bv->surface_start + bv->surface_height * bv->surface_width; +} + +static void batch_init(struct batch_chunk *bc, + struct i915_vma *vma, + u32 *start, u32 offset, u32 max_bytes) +{ + bc->vma = vma; + bc->offset = offset; + bc->start = start + bc->offset / sizeof(*bc->start); + bc->end = bc->start; + bc->max_items = max_bytes / sizeof(*bc->start); +} + +static u32 batch_offset(const struct batch_chunk *bc, u32 *cs) +{ + return (cs - bc->start) * sizeof(*bc->start) + bc->offset; +} + +static u32 batch_addr(const struct batch_chunk *bc) +{ + return bc->vma->node.start; +} + +static void batch_add(struct batch_chunk *bc, const u32 d) +{ + GEM_BUG_ON((bc->end - bc->start) >= bc->max_items); + *bc->end++ = d; +} + +static u32 *batch_alloc_items(struct batch_chunk *bc, u32 align, u32 items) +{ + u32 *map; + + if (align) { + u32 *end = PTR_ALIGN(bc->end, align); + + memset32(bc->end, 0, end - bc->end); + bc->end = end; + } + + map = bc->end; + bc->end += items; + + return map; +} + +static u32 *batch_alloc_bytes(struct batch_chunk *bc, u32 align, u32 bytes) +{ + GEM_BUG_ON(!IS_ALIGNED(bytes, sizeof(*bc->start))); + return batch_alloc_items(bc, align, bytes / sizeof(*bc->start)); +} + +static u32 +gen7_fill_surface_state(struct batch_chunk *state, + const u32 dst_offset, + const struct batch_vals *bv) +{ + u32 surface_h = bv->surface_height; + u32 surface_w = bv->surface_width; + u32 *cs = batch_alloc_items(state, 32, 8); + u32 offset = batch_offset(state, cs); + +#define SURFACE_2D 1 +#define SURFACEFORMAT_B8G8R8A8_UNORM 0x0C0 +#define RENDER_CACHE_READ_WRITE 1 + + *cs++ = SURFACE_2D << 29 | + (SURFACEFORMAT_B8G8R8A8_UNORM << 18) | + (RENDER_CACHE_READ_WRITE << 8); + + *cs++ = batch_addr(state) + dst_offset; + + *cs++ = ((surface_h / 4 - 1) << 16) | (surface_w / 4 - 1); + *cs++ = surface_w; + *cs++ = 0; + *cs++ = 0; + *cs++ = 0; +#define SHADER_CHANNELS(r, g, b, a) \ + (((r) << 25) | ((g) << 22) | ((b) << 19) | ((a) << 16)) + *cs++ = SHADER_CHANNELS(4, 5, 6, 7); + batch_advance(state, cs); + + return offset; +} + +static u32 +gen7_fill_binding_table(struct batch_chunk *state, + const struct batch_vals *bv) +{ + u32 surface_start = + gen7_fill_surface_state(state, bv->surface_start, bv); + u32 *cs = batch_alloc_items(state, 32, 8); + u32 offset = batch_offset(state, cs); + + *cs++ = surface_start - state->offset; + *cs++ = 0; + *cs++ = 0; + *cs++ = 0; + *cs++ = 0; + *cs++ = 0; + *cs++ = 0; + *cs++ = 0; + batch_advance(state, cs); + + return offset; +} + +static u32 +gen7_fill_kernel_data(struct batch_chunk *state, + const u32 *data, + const u32 size) +{ + return batch_offset(state, + memcpy(batch_alloc_bytes(state, 64, size), + data, size)); +} + +static u32 +gen7_fill_interface_descriptor(struct batch_chunk *state, + const struct batch_vals *bv, + const struct cb_kernel *kernel, + unsigned int count) +{ + u32 kernel_offset = + gen7_fill_kernel_data(state, kernel->data, kernel->size); + u32 binding_table = gen7_fill_binding_table(state, bv); + u32 *cs = batch_alloc_items(state, 32, 8 * count); + u32 offset = batch_offset(state, cs); + + *cs++ = kernel_offset; + *cs++ = (1 << 7) | (1 << 13); + *cs++ = 0; + *cs++ = (binding_table - state->offset) | 1; + *cs++ = 0; + *cs++ = 0; + *cs++ = 0; + *cs++ = 0; + + /* 1 - 63dummy idds */ + memset32(cs, 0x00, (count - 1) * 8); + batch_advance(state, cs + (count - 1) * 8); + + return offset; +} + +static void +gen7_emit_state_base_address(struct batch_chunk *batch, + u32 surface_state_base) +{ + u32 *cs = batch_alloc_items(batch, 0, 10); + + *cs++ = STATE_BASE_ADDRESS | (10 - 2); + /* general */ + *cs++ = batch_addr(batch) | BASE_ADDRESS_MODIFY; + /* surface */ + *cs++ = (batch_addr(batch) + surface_state_base) | BASE_ADDRESS_MODIFY; + /* dynamic */ + *cs++ = batch_addr(batch) | BASE_ADDRESS_MODIFY; + /* indirect */ + *cs++ = batch_addr(batch) | BASE_ADDRESS_MODIFY; + /* instruction */ + *cs++ = batch_addr(batch) | BASE_ADDRESS_MODIFY; + + /* general/dynamic/indirect/instruction access Bound */ + *cs++ = 0; + *cs++ = BASE_ADDRESS_MODIFY; + *cs++ = 0; + *cs++ = BASE_ADDRESS_MODIFY; + batch_advance(batch, cs); +} + +static void +gen7_emit_vfe_state(struct batch_chunk *batch, + const struct batch_vals *bv, + u32 urb_size, u32 curbe_size, + u32 mode) +{ + u32 threads = bv->max_threads - 1; + u32 *cs = batch_alloc_items(batch, 32, 8); + + *cs++ = MEDIA_VFE_STATE | (8 - 2); + + /* scratch buffer */ + *cs++ = 0; + + /* number of threads & urb entries for GPGPU vs Media Mode */ + *cs++ = threads << 16 | 1 << 8 | mode << 2; + + *cs++ = 0; + + /* urb entry size & curbe size in 256 bits unit */ + *cs++ = urb_size << 16 | curbe_size; + + /* scoreboard */ + *cs++ = 0; + *cs++ = 0; + *cs++ = 0; + batch_advance(batch, cs); +} + +static void +gen7_emit_interface_descriptor_load(struct batch_chunk *batch, + const u32 interface_descriptor, + unsigned int count) +{ + u32 *cs = batch_alloc_items(batch, 8, 4); + + *cs++ = MEDIA_INTERFACE_DESCRIPTOR_LOAD | (4 - 2); + *cs++ = 0; + *cs++ = count * 8 * sizeof(*cs); + + /* + * interface descriptor address - it is relative to the dynamics base + * address + */ + *cs++ = interface_descriptor; + batch_advance(batch, cs); +} + +static void +gen7_emit_media_object(struct batch_chunk *batch, + unsigned int media_object_index) +{ + unsigned int x_offset = (media_object_index % 16) * 64; + unsigned int y_offset = (media_object_index / 16) * 16; + unsigned int pkt = 6 + 3; + u32 *cs; + + cs = batch_alloc_items(batch, 8, pkt); + + *cs++ = MEDIA_OBJECT | (pkt - 2); + + /* interface descriptor offset */ + *cs++ = 0; + + /* without indirect data */ + *cs++ = 0; + *cs++ = 0; + + /* scoreboard */ + *cs++ = 0; + *cs++ = 0; + + /* inline */ + *cs++ = y_offset << 16 | x_offset; + *cs++ = 0; + *cs++ = GT3_INLINE_DATA_DELAYS; + + batch_advance(batch, cs); +} + +static void gen7_emit_pipeline_flush(struct batch_chunk *batch) +{ + u32 *cs = batch_alloc_items(batch, 0, 4); + + *cs++ = GFX_OP_PIPE_CONTROL(4); + *cs++ = PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_DC_FLUSH_ENABLE | + PIPE_CONTROL_CS_STALL; + *cs++ = 0; + *cs++ = 0; + + batch_advance(batch, cs); +} + +static void gen7_emit_pipeline_invalidate(struct batch_chunk *batch) +{ + u32 *cs = batch_alloc_items(batch, 0, 10); + + /* ivb: Stall before STATE_CACHE_INVALIDATE */ + *cs++ = GFX_OP_PIPE_CONTROL(5); + *cs++ = PIPE_CONTROL_STALL_AT_SCOREBOARD | + PIPE_CONTROL_CS_STALL; + *cs++ = 0; + *cs++ = 0; + *cs++ = 0; + + *cs++ = GFX_OP_PIPE_CONTROL(5); + *cs++ = PIPE_CONTROL_STATE_CACHE_INVALIDATE; + *cs++ = 0; + *cs++ = 0; + *cs++ = 0; + + batch_advance(batch, cs); +} + +static void emit_batch(struct i915_vma * const vma, + u32 *start, + const struct batch_vals *bv) +{ + struct drm_i915_private *i915 = vma->vm->i915; + const unsigned int desc_count = 1; + const unsigned int urb_size = 1; + struct batch_chunk cmds, state; + u32 descriptors; + unsigned int i; + + batch_init(&cmds, vma, start, 0, bv->state_start); + batch_init(&state, vma, start, bv->state_start, SZ_4K); + + descriptors = gen7_fill_interface_descriptor(&state, bv, + IS_HASWELL(i915) ? + &cb_kernel_hsw : + &cb_kernel_ivb, + desc_count); + + /* Reset inherited context registers */ + gen7_emit_pipeline_flush(&cmds); + gen7_emit_pipeline_invalidate(&cmds); + batch_add(&cmds, MI_LOAD_REGISTER_IMM(2)); + batch_add(&cmds, i915_mmio_reg_offset(CACHE_MODE_0_GEN7)); + batch_add(&cmds, 0xffff0000 | + ((IS_IVB_GT1(i915) || IS_VALLEYVIEW(i915)) ? + HIZ_RAW_STALL_OPT_DISABLE : + 0)); + batch_add(&cmds, i915_mmio_reg_offset(CACHE_MODE_1)); + batch_add(&cmds, 0xffff0000 | PIXEL_SUBSPAN_COLLECT_OPT_DISABLE); + gen7_emit_pipeline_invalidate(&cmds); + gen7_emit_pipeline_flush(&cmds); + + /* Switch to the media pipeline and our base address */ + gen7_emit_pipeline_invalidate(&cmds); + batch_add(&cmds, PIPELINE_SELECT | PIPELINE_SELECT_MEDIA); + batch_add(&cmds, MI_NOOP); + gen7_emit_pipeline_invalidate(&cmds); + + gen7_emit_pipeline_flush(&cmds); + gen7_emit_state_base_address(&cmds, descriptors); + gen7_emit_pipeline_invalidate(&cmds); + + /* Set the clear-residual kernel state */ + gen7_emit_vfe_state(&cmds, bv, urb_size - 1, 0, 0); + gen7_emit_interface_descriptor_load(&cmds, descriptors, desc_count); + + /* Execute the kernel on all HW threads */ + for (i = 0; i < num_primitives(bv); i++) + gen7_emit_media_object(&cmds, i); + + batch_add(&cmds, MI_BATCH_BUFFER_END); +} + +int gen7_setup_clear_gpr_bb(struct intel_engine_cs * const engine, + struct i915_vma * const vma) +{ + struct batch_vals bv; + u32 *batch; + + batch_get_defaults(engine->i915, &bv); + if (!vma) + return bv.size; + + GEM_BUG_ON(vma->obj->base.size < bv.size); + + batch = i915_gem_object_pin_map(vma->obj, I915_MAP_WC); + if (IS_ERR(batch)) + return PTR_ERR(batch); + + emit_batch(vma, memset(batch, 0, bv.size), &bv); + + i915_gem_object_flush_map(vma->obj); + __i915_gem_object_release_map(vma->obj); + + return 0; +} diff --git a/drivers/gpu/drm/i915/gt/gen7_renderclear.h b/drivers/gpu/drm/i915/gt/gen7_renderclear.h new file mode 100644 index 000000000..bb100748e --- /dev/null +++ b/drivers/gpu/drm/i915/gt/gen7_renderclear.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2019 Intel Corporation + */ + +#ifndef __GEN7_RENDERCLEAR_H__ +#define __GEN7_RENDERCLEAR_H__ + +struct intel_engine_cs; +struct i915_vma; + +int gen7_setup_clear_gpr_bb(struct intel_engine_cs * const engine, + struct i915_vma * const vma); + +#endif /* __GEN7_RENDERCLEAR_H__ */ diff --git a/drivers/gpu/drm/i915/gt/gen7_renderstate.c b/drivers/gpu/drm/i915/gt/gen7_renderstate.c new file mode 100644 index 000000000..655180646 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/gen7_renderstate.c @@ -0,0 +1,279 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Generated by: intel-gpu-tools-1.8-220-g01153e7 + */ + +#include "intel_renderstate.h" + +static const u32 gen7_null_state_relocs[] = { + 0x0000000c, + 0x00000010, + 0x00000018, + 0x000001ec, + -1, +}; + +static const u32 gen7_null_state_batch[] = { + 0x69040000, + 0x61010008, + 0x00000000, + 0x00000001, /* reloc */ + 0x00000001, /* reloc */ + 0x00000000, + 0x00000001, /* reloc */ + 0x00000000, + 0x00000001, + 0x00000000, + 0x00000001, + 0x790d0002, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78180000, + 0x00000001, + 0x79160000, + 0x00000008, + 0x78300000, + 0x02010040, + 0x78310000, + 0x04000000, + 0x78320000, + 0x04000000, + 0x78330000, + 0x02000000, + 0x78100004, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x781b0005, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x781c0002, + 0x00000000, + 0x00000000, + 0x00000000, + 0x781d0004, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78110005, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78120002, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78210000, + 0x00000000, + 0x78130005, + 0x00000000, + 0x20000000, + 0x04000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78140001, + 0x20000800, + 0x00000000, + 0x781e0001, + 0x00000000, + 0x00000000, + 0x78050005, + 0xe0040000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78040001, + 0x00000000, + 0x00000000, + 0x78240000, + 0x00000240, + 0x78230000, + 0x00000260, + 0x782f0000, + 0x00000280, + 0x781f000c, + 0x00400810, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78200006, + 0x000002c0, + 0x08080000, + 0x00000000, + 0x28000402, + 0x00060000, + 0x00000000, + 0x00000000, + 0x78090005, + 0x02000000, + 0x22220000, + 0x02f60000, + 0x11230000, + 0x02f60004, + 0x11230000, + 0x78080003, + 0x00006008, + 0x00000340, /* reloc */ + 0xffffffff, + 0x00000000, + 0x782a0000, + 0x00000360, + 0x79000002, + 0xffffffff, + 0x00000000, + 0x00000000, + 0x7b000005, + 0x0000000f, + 0x00000003, + 0x00000000, + 0x00000001, + 0x00000000, + 0x00000000, + 0x05000000, /* cmds end */ + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000031, /* state start */ + 0x00000003, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0xf99a130c, + 0x799a130c, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000492, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x0080005a, + 0x2e2077bd, + 0x000000c0, + 0x008d0040, + 0x0080005a, + 0x2e6077bd, + 0x000000d0, + 0x008d0040, + 0x02800031, + 0x21801fa9, + 0x008d0e20, + 0x08840001, + 0x00800001, + 0x2e2003bd, + 0x008d0180, + 0x00000000, + 0x00800001, + 0x2e6003bd, + 0x008d01c0, + 0x00000000, + 0x00800001, + 0x2ea003bd, + 0x008d0200, + 0x00000000, + 0x00800001, + 0x2ee003bd, + 0x008d0240, + 0x00000000, + 0x05800031, + 0x20001fa8, + 0x008d0e20, + 0x90031000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000380, + 0x000003a0, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, /* state end */ +}; + +RO_RENDERSTATE(7); diff --git a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c new file mode 100644 index 000000000..be27f9889 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c @@ -0,0 +1,744 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2020 Intel Corporation + */ + +#include <linux/log2.h> + +#include "gen8_ppgtt.h" +#include "i915_scatterlist.h" +#include "i915_trace.h" +#include "i915_pvinfo.h" +#include "i915_vgpu.h" +#include "intel_gt.h" +#include "intel_gtt.h" + +static u64 gen8_pde_encode(const dma_addr_t addr, + const enum i915_cache_level level) +{ + u64 pde = addr | _PAGE_PRESENT | _PAGE_RW; + + if (level != I915_CACHE_NONE) + pde |= PPAT_CACHED_PDE; + else + pde |= PPAT_UNCACHED; + + return pde; +} + +static u64 gen8_pte_encode(dma_addr_t addr, + enum i915_cache_level level, + u32 flags) +{ + gen8_pte_t pte = addr | _PAGE_PRESENT | _PAGE_RW; + + if (unlikely(flags & PTE_READ_ONLY)) + pte &= ~_PAGE_RW; + + switch (level) { + case I915_CACHE_NONE: + pte |= PPAT_UNCACHED; + break; + case I915_CACHE_WT: + pte |= PPAT_DISPLAY_ELLC; + break; + default: + pte |= PPAT_CACHED; + break; + } + + return pte; +} + +static void gen8_ppgtt_notify_vgt(struct i915_ppgtt *ppgtt, bool create) +{ + struct drm_i915_private *i915 = ppgtt->vm.i915; + struct intel_uncore *uncore = ppgtt->vm.gt->uncore; + enum vgt_g2v_type msg; + int i; + + if (create) + atomic_inc(px_used(ppgtt->pd)); /* never remove */ + else + atomic_dec(px_used(ppgtt->pd)); + + mutex_lock(&i915->vgpu.lock); + + if (i915_vm_is_4lvl(&ppgtt->vm)) { + const u64 daddr = px_dma(ppgtt->pd); + + intel_uncore_write(uncore, + vgtif_reg(pdp[0].lo), lower_32_bits(daddr)); + intel_uncore_write(uncore, + vgtif_reg(pdp[0].hi), upper_32_bits(daddr)); + + msg = create ? + VGT_G2V_PPGTT_L4_PAGE_TABLE_CREATE : + VGT_G2V_PPGTT_L4_PAGE_TABLE_DESTROY; + } else { + for (i = 0; i < GEN8_3LVL_PDPES; i++) { + const u64 daddr = i915_page_dir_dma_addr(ppgtt, i); + + intel_uncore_write(uncore, + vgtif_reg(pdp[i].lo), + lower_32_bits(daddr)); + intel_uncore_write(uncore, + vgtif_reg(pdp[i].hi), + upper_32_bits(daddr)); + } + + msg = create ? + VGT_G2V_PPGTT_L3_PAGE_TABLE_CREATE : + VGT_G2V_PPGTT_L3_PAGE_TABLE_DESTROY; + } + + /* g2v_notify atomically (via hv trap) consumes the message packet. */ + intel_uncore_write(uncore, vgtif_reg(g2v_notify), msg); + + mutex_unlock(&i915->vgpu.lock); +} + +/* Index shifts into the pagetable are offset by GEN8_PTE_SHIFT [12] */ +#define GEN8_PAGE_SIZE (SZ_4K) /* page and page-directory sizes are the same */ +#define GEN8_PTE_SHIFT (ilog2(GEN8_PAGE_SIZE)) +#define GEN8_PDES (GEN8_PAGE_SIZE / sizeof(u64)) +#define gen8_pd_shift(lvl) ((lvl) * ilog2(GEN8_PDES)) +#define gen8_pd_index(i, lvl) i915_pde_index((i), gen8_pd_shift(lvl)) +#define __gen8_pte_shift(lvl) (GEN8_PTE_SHIFT + gen8_pd_shift(lvl)) +#define __gen8_pte_index(a, lvl) i915_pde_index((a), __gen8_pte_shift(lvl)) + +#define as_pd(x) container_of((x), typeof(struct i915_page_directory), pt) + +static inline unsigned int +gen8_pd_range(u64 start, u64 end, int lvl, unsigned int *idx) +{ + const int shift = gen8_pd_shift(lvl); + const u64 mask = ~0ull << gen8_pd_shift(lvl + 1); + + GEM_BUG_ON(start >= end); + end += ~mask >> gen8_pd_shift(1); + + *idx = i915_pde_index(start, shift); + if ((start ^ end) & mask) + return GEN8_PDES - *idx; + else + return i915_pde_index(end, shift) - *idx; +} + +static inline bool gen8_pd_contains(u64 start, u64 end, int lvl) +{ + const u64 mask = ~0ull << gen8_pd_shift(lvl + 1); + + GEM_BUG_ON(start >= end); + return (start ^ end) & mask && (start & ~mask) == 0; +} + +static inline unsigned int gen8_pt_count(u64 start, u64 end) +{ + GEM_BUG_ON(start >= end); + if ((start ^ end) >> gen8_pd_shift(1)) + return GEN8_PDES - (start & (GEN8_PDES - 1)); + else + return end - start; +} + +static inline unsigned int +gen8_pd_top_count(const struct i915_address_space *vm) +{ + unsigned int shift = __gen8_pte_shift(vm->top); + return (vm->total + (1ull << shift) - 1) >> shift; +} + +static inline struct i915_page_directory * +gen8_pdp_for_page_index(struct i915_address_space * const vm, const u64 idx) +{ + struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(vm); + + if (vm->top == 2) + return ppgtt->pd; + else + return i915_pd_entry(ppgtt->pd, gen8_pd_index(idx, vm->top)); +} + +static inline struct i915_page_directory * +gen8_pdp_for_page_address(struct i915_address_space * const vm, const u64 addr) +{ + return gen8_pdp_for_page_index(vm, addr >> GEN8_PTE_SHIFT); +} + +static void __gen8_ppgtt_cleanup(struct i915_address_space *vm, + struct i915_page_directory *pd, + int count, int lvl) +{ + if (lvl) { + void **pde = pd->entry; + + do { + if (!*pde) + continue; + + __gen8_ppgtt_cleanup(vm, *pde, GEN8_PDES, lvl - 1); + } while (pde++, --count); + } + + free_px(vm, &pd->pt, lvl); +} + +static void gen8_ppgtt_cleanup(struct i915_address_space *vm) +{ + struct i915_ppgtt *ppgtt = i915_vm_to_ppgtt(vm); + + if (intel_vgpu_active(vm->i915)) + gen8_ppgtt_notify_vgt(ppgtt, false); + + __gen8_ppgtt_cleanup(vm, ppgtt->pd, gen8_pd_top_count(vm), vm->top); + free_scratch(vm); +} + +static u64 __gen8_ppgtt_clear(struct i915_address_space * const vm, + struct i915_page_directory * const pd, + u64 start, const u64 end, int lvl) +{ + const struct drm_i915_gem_object * const scratch = vm->scratch[lvl]; + unsigned int idx, len; + + GEM_BUG_ON(end > vm->total >> GEN8_PTE_SHIFT); + + len = gen8_pd_range(start, end, lvl--, &idx); + DBG("%s(%p):{ lvl:%d, start:%llx, end:%llx, idx:%d, len:%d, used:%d }\n", + __func__, vm, lvl + 1, start, end, + idx, len, atomic_read(px_used(pd))); + GEM_BUG_ON(!len || len >= atomic_read(px_used(pd))); + + do { + struct i915_page_table *pt = pd->entry[idx]; + + if (atomic_fetch_inc(&pt->used) >> gen8_pd_shift(1) && + gen8_pd_contains(start, end, lvl)) { + DBG("%s(%p):{ lvl:%d, idx:%d, start:%llx, end:%llx } removing pd\n", + __func__, vm, lvl + 1, idx, start, end); + clear_pd_entry(pd, idx, scratch); + __gen8_ppgtt_cleanup(vm, as_pd(pt), I915_PDES, lvl); + start += (u64)I915_PDES << gen8_pd_shift(lvl); + continue; + } + + if (lvl) { + start = __gen8_ppgtt_clear(vm, as_pd(pt), + start, end, lvl); + } else { + unsigned int count; + u64 *vaddr; + + count = gen8_pt_count(start, end); + DBG("%s(%p):{ lvl:%d, start:%llx, end:%llx, idx:%d, len:%d, used:%d } removing pte\n", + __func__, vm, lvl, start, end, + gen8_pd_index(start, 0), count, + atomic_read(&pt->used)); + GEM_BUG_ON(!count || count >= atomic_read(&pt->used)); + + vaddr = kmap_atomic_px(pt); + memset64(vaddr + gen8_pd_index(start, 0), + vm->scratch[0]->encode, + count); + kunmap_atomic(vaddr); + + atomic_sub(count, &pt->used); + start += count; + } + + if (release_pd_entry(pd, idx, pt, scratch)) + free_px(vm, pt, lvl); + } while (idx++, --len); + + return start; +} + +static void gen8_ppgtt_clear(struct i915_address_space *vm, + u64 start, u64 length) +{ + GEM_BUG_ON(!IS_ALIGNED(start, BIT_ULL(GEN8_PTE_SHIFT))); + GEM_BUG_ON(!IS_ALIGNED(length, BIT_ULL(GEN8_PTE_SHIFT))); + GEM_BUG_ON(range_overflows(start, length, vm->total)); + + start >>= GEN8_PTE_SHIFT; + length >>= GEN8_PTE_SHIFT; + GEM_BUG_ON(length == 0); + + __gen8_ppgtt_clear(vm, i915_vm_to_ppgtt(vm)->pd, + start, start + length, vm->top); +} + +static void __gen8_ppgtt_alloc(struct i915_address_space * const vm, + struct i915_vm_pt_stash *stash, + struct i915_page_directory * const pd, + u64 * const start, const u64 end, int lvl) +{ + unsigned int idx, len; + + GEM_BUG_ON(end > vm->total >> GEN8_PTE_SHIFT); + + len = gen8_pd_range(*start, end, lvl--, &idx); + DBG("%s(%p):{ lvl:%d, start:%llx, end:%llx, idx:%d, len:%d, used:%d }\n", + __func__, vm, lvl + 1, *start, end, + idx, len, atomic_read(px_used(pd))); + GEM_BUG_ON(!len || (idx + len - 1) >> gen8_pd_shift(1)); + + spin_lock(&pd->lock); + GEM_BUG_ON(!atomic_read(px_used(pd))); /* Must be pinned! */ + do { + struct i915_page_table *pt = pd->entry[idx]; + + if (!pt) { + spin_unlock(&pd->lock); + + DBG("%s(%p):{ lvl:%d, idx:%d } allocating new tree\n", + __func__, vm, lvl + 1, idx); + + pt = stash->pt[!!lvl]; + __i915_gem_object_pin_pages(pt->base); + i915_gem_object_make_unshrinkable(pt->base); + + fill_px(pt, vm->scratch[lvl]->encode); + + spin_lock(&pd->lock); + if (likely(!pd->entry[idx])) { + stash->pt[!!lvl] = pt->stash; + atomic_set(&pt->used, 0); + set_pd_entry(pd, idx, pt); + } else { + pt = pd->entry[idx]; + } + } + + if (lvl) { + atomic_inc(&pt->used); + spin_unlock(&pd->lock); + + __gen8_ppgtt_alloc(vm, stash, + as_pd(pt), start, end, lvl); + + spin_lock(&pd->lock); + atomic_dec(&pt->used); + GEM_BUG_ON(!atomic_read(&pt->used)); + } else { + unsigned int count = gen8_pt_count(*start, end); + + DBG("%s(%p):{ lvl:%d, start:%llx, end:%llx, idx:%d, len:%d, used:%d } inserting pte\n", + __func__, vm, lvl, *start, end, + gen8_pd_index(*start, 0), count, + atomic_read(&pt->used)); + + atomic_add(count, &pt->used); + /* All other pdes may be simultaneously removed */ + GEM_BUG_ON(atomic_read(&pt->used) > NALLOC * I915_PDES); + *start += count; + } + } while (idx++, --len); + spin_unlock(&pd->lock); +} + +static void gen8_ppgtt_alloc(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + u64 start, u64 length) +{ + GEM_BUG_ON(!IS_ALIGNED(start, BIT_ULL(GEN8_PTE_SHIFT))); + GEM_BUG_ON(!IS_ALIGNED(length, BIT_ULL(GEN8_PTE_SHIFT))); + GEM_BUG_ON(range_overflows(start, length, vm->total)); + + start >>= GEN8_PTE_SHIFT; + length >>= GEN8_PTE_SHIFT; + GEM_BUG_ON(length == 0); + + __gen8_ppgtt_alloc(vm, stash, i915_vm_to_ppgtt(vm)->pd, + &start, start + length, vm->top); +} + +static __always_inline u64 +gen8_ppgtt_insert_pte(struct i915_ppgtt *ppgtt, + struct i915_page_directory *pdp, + struct sgt_dma *iter, + u64 idx, + enum i915_cache_level cache_level, + u32 flags) +{ + struct i915_page_directory *pd; + const gen8_pte_t pte_encode = gen8_pte_encode(0, cache_level, flags); + gen8_pte_t *vaddr; + + pd = i915_pd_entry(pdp, gen8_pd_index(idx, 2)); + vaddr = kmap_atomic_px(i915_pt_entry(pd, gen8_pd_index(idx, 1))); + do { + GEM_BUG_ON(iter->sg->length < I915_GTT_PAGE_SIZE); + vaddr[gen8_pd_index(idx, 0)] = pte_encode | iter->dma; + + iter->dma += I915_GTT_PAGE_SIZE; + if (iter->dma >= iter->max) { + iter->sg = __sg_next(iter->sg); + if (!iter->sg) { + idx = 0; + break; + } + + iter->dma = sg_dma_address(iter->sg); + iter->max = iter->dma + iter->sg->length; + } + + if (gen8_pd_index(++idx, 0) == 0) { + if (gen8_pd_index(idx, 1) == 0) { + /* Limited by sg length for 3lvl */ + if (gen8_pd_index(idx, 2) == 0) + break; + + pd = pdp->entry[gen8_pd_index(idx, 2)]; + } + + clflush_cache_range(vaddr, PAGE_SIZE); + kunmap_atomic(vaddr); + vaddr = kmap_atomic_px(i915_pt_entry(pd, gen8_pd_index(idx, 1))); + } + } while (1); + clflush_cache_range(vaddr, PAGE_SIZE); + kunmap_atomic(vaddr); + + return idx; +} + +static void gen8_ppgtt_insert_huge(struct i915_vma *vma, + struct sgt_dma *iter, + enum i915_cache_level cache_level, + u32 flags) +{ + const gen8_pte_t pte_encode = gen8_pte_encode(0, cache_level, flags); + u64 start = vma->node.start; + dma_addr_t rem = iter->sg->length; + + GEM_BUG_ON(!i915_vm_is_4lvl(vma->vm)); + + do { + struct i915_page_directory * const pdp = + gen8_pdp_for_page_address(vma->vm, start); + struct i915_page_directory * const pd = + i915_pd_entry(pdp, __gen8_pte_index(start, 2)); + gen8_pte_t encode = pte_encode; + unsigned int maybe_64K = -1; + unsigned int page_size; + gen8_pte_t *vaddr; + u16 index; + + if (vma->page_sizes.sg & I915_GTT_PAGE_SIZE_2M && + IS_ALIGNED(iter->dma, I915_GTT_PAGE_SIZE_2M) && + rem >= I915_GTT_PAGE_SIZE_2M && + !__gen8_pte_index(start, 0)) { + index = __gen8_pte_index(start, 1); + encode |= GEN8_PDE_PS_2M; + page_size = I915_GTT_PAGE_SIZE_2M; + + vaddr = kmap_atomic_px(pd); + } else { + struct i915_page_table *pt = + i915_pt_entry(pd, __gen8_pte_index(start, 1)); + + index = __gen8_pte_index(start, 0); + page_size = I915_GTT_PAGE_SIZE; + + if (!index && + vma->page_sizes.sg & I915_GTT_PAGE_SIZE_64K && + IS_ALIGNED(iter->dma, I915_GTT_PAGE_SIZE_64K) && + (IS_ALIGNED(rem, I915_GTT_PAGE_SIZE_64K) || + rem >= (I915_PDES - index) * I915_GTT_PAGE_SIZE)) + maybe_64K = __gen8_pte_index(start, 1); + + vaddr = kmap_atomic_px(pt); + } + + do { + GEM_BUG_ON(iter->sg->length < page_size); + vaddr[index++] = encode | iter->dma; + + start += page_size; + iter->dma += page_size; + rem -= page_size; + if (iter->dma >= iter->max) { + iter->sg = __sg_next(iter->sg); + if (!iter->sg) + break; + + rem = iter->sg->length; + iter->dma = sg_dma_address(iter->sg); + iter->max = iter->dma + rem; + + if (maybe_64K != -1 && index < I915_PDES && + !(IS_ALIGNED(iter->dma, I915_GTT_PAGE_SIZE_64K) && + (IS_ALIGNED(rem, I915_GTT_PAGE_SIZE_64K) || + rem >= (I915_PDES - index) * I915_GTT_PAGE_SIZE))) + maybe_64K = -1; + + if (unlikely(!IS_ALIGNED(iter->dma, page_size))) + break; + } + } while (rem >= page_size && index < I915_PDES); + + clflush_cache_range(vaddr, PAGE_SIZE); + kunmap_atomic(vaddr); + + /* + * Is it safe to mark the 2M block as 64K? -- Either we have + * filled whole page-table with 64K entries, or filled part of + * it and have reached the end of the sg table and we have + * enough padding. + */ + if (maybe_64K != -1 && + (index == I915_PDES || + (i915_vm_has_scratch_64K(vma->vm) && + !iter->sg && IS_ALIGNED(vma->node.start + + vma->node.size, + I915_GTT_PAGE_SIZE_2M)))) { + vaddr = kmap_atomic_px(pd); + vaddr[maybe_64K] |= GEN8_PDE_IPS_64K; + kunmap_atomic(vaddr); + page_size = I915_GTT_PAGE_SIZE_64K; + + /* + * We write all 4K page entries, even when using 64K + * pages. In order to verify that the HW isn't cheating + * by using the 4K PTE instead of the 64K PTE, we want + * to remove all the surplus entries. If the HW skipped + * the 64K PTE, it will read/write into the scratch page + * instead - which we detect as missing results during + * selftests. + */ + if (I915_SELFTEST_ONLY(vma->vm->scrub_64K)) { + u16 i; + + encode = vma->vm->scratch[0]->encode; + vaddr = kmap_atomic_px(i915_pt_entry(pd, maybe_64K)); + + for (i = 1; i < index; i += 16) + memset64(vaddr + i, encode, 15); + + kunmap_atomic(vaddr); + } + } + + vma->page_sizes.gtt |= page_size; + } while (iter->sg); +} + +static void gen8_ppgtt_insert(struct i915_address_space *vm, + struct i915_vma *vma, + enum i915_cache_level cache_level, + u32 flags) +{ + struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(vm); + struct sgt_dma iter = sgt_dma(vma); + + if (vma->page_sizes.sg > I915_GTT_PAGE_SIZE) { + gen8_ppgtt_insert_huge(vma, &iter, cache_level, flags); + } else { + u64 idx = vma->node.start >> GEN8_PTE_SHIFT; + + do { + struct i915_page_directory * const pdp = + gen8_pdp_for_page_index(vm, idx); + + idx = gen8_ppgtt_insert_pte(ppgtt, pdp, &iter, idx, + cache_level, flags); + } while (idx); + + vma->page_sizes.gtt = I915_GTT_PAGE_SIZE; + } +} + +static int gen8_init_scratch(struct i915_address_space *vm) +{ + int ret; + int i; + + /* + * If everybody agrees to not to write into the scratch page, + * we can reuse it for all vm, keeping contexts and processes separate. + */ + if (vm->has_read_only && vm->gt->vm && !i915_is_ggtt(vm->gt->vm)) { + struct i915_address_space *clone = vm->gt->vm; + + GEM_BUG_ON(!clone->has_read_only); + + vm->scratch_order = clone->scratch_order; + for (i = 0; i <= vm->top; i++) + vm->scratch[i] = i915_gem_object_get(clone->scratch[i]); + + return 0; + } + + ret = setup_scratch_page(vm); + if (ret) + return ret; + + vm->scratch[0]->encode = + gen8_pte_encode(px_dma(vm->scratch[0]), + I915_CACHE_LLC, vm->has_read_only); + + for (i = 1; i <= vm->top; i++) { + struct drm_i915_gem_object *obj; + + obj = vm->alloc_pt_dma(vm, I915_GTT_PAGE_SIZE_4K); + if (IS_ERR(obj)) + goto free_scratch; + + ret = pin_pt_dma(vm, obj); + if (ret) { + i915_gem_object_put(obj); + goto free_scratch; + } + + fill_px(obj, vm->scratch[i - 1]->encode); + obj->encode = gen8_pde_encode(px_dma(obj), I915_CACHE_LLC); + + vm->scratch[i] = obj; + } + + return 0; + +free_scratch: + while (i--) + i915_gem_object_put(vm->scratch[i]); + return -ENOMEM; +} + +static int gen8_preallocate_top_level_pdp(struct i915_ppgtt *ppgtt) +{ + struct i915_address_space *vm = &ppgtt->vm; + struct i915_page_directory *pd = ppgtt->pd; + unsigned int idx; + + GEM_BUG_ON(vm->top != 2); + GEM_BUG_ON(gen8_pd_top_count(vm) != GEN8_3LVL_PDPES); + + for (idx = 0; idx < GEN8_3LVL_PDPES; idx++) { + struct i915_page_directory *pde; + int err; + + pde = alloc_pd(vm); + if (IS_ERR(pde)) + return PTR_ERR(pde); + + err = pin_pt_dma(vm, pde->pt.base); + if (err) { + free_pd(vm, pde); + return err; + } + + fill_px(pde, vm->scratch[1]->encode); + set_pd_entry(pd, idx, pde); + atomic_inc(px_used(pde)); /* keep pinned */ + } + wmb(); + + return 0; +} + +static struct i915_page_directory * +gen8_alloc_top_pd(struct i915_address_space *vm) +{ + const unsigned int count = gen8_pd_top_count(vm); + struct i915_page_directory *pd; + int err; + + GEM_BUG_ON(count > I915_PDES); + + pd = __alloc_pd(count); + if (unlikely(!pd)) + return ERR_PTR(-ENOMEM); + + pd->pt.base = vm->alloc_pt_dma(vm, I915_GTT_PAGE_SIZE_4K); + if (IS_ERR(pd->pt.base)) { + err = PTR_ERR(pd->pt.base); + pd->pt.base = NULL; + goto err_pd; + } + + err = pin_pt_dma(vm, pd->pt.base); + if (err) + goto err_pd; + + fill_page_dma(px_base(pd), vm->scratch[vm->top]->encode, count); + atomic_inc(px_used(pd)); /* mark as pinned */ + return pd; + +err_pd: + free_pd(vm, pd); + return ERR_PTR(err); +} + +/* + * GEN8 legacy ppgtt programming is accomplished through a max 4 PDP registers + * with a net effect resembling a 2-level page table in normal x86 terms. Each + * PDP represents 1GB of memory 4 * 512 * 512 * 4096 = 4GB legacy 32b address + * space. + * + */ +struct i915_ppgtt *gen8_ppgtt_create(struct intel_gt *gt) +{ + struct i915_ppgtt *ppgtt; + int err; + + ppgtt = kzalloc(sizeof(*ppgtt), GFP_KERNEL); + if (!ppgtt) + return ERR_PTR(-ENOMEM); + + ppgtt_init(ppgtt, gt); + ppgtt->vm.top = i915_vm_is_4lvl(&ppgtt->vm) ? 3 : 2; + ppgtt->vm.pd_shift = ilog2(SZ_4K * SZ_4K / sizeof(gen8_pte_t)); + + /* + * From bdw, there is hw support for read-only pages in the PPGTT. + * + * Gen11 has HSDES#:1807136187 unresolved. Disable ro support + * for now. + * + * Gen12 has inherited the same read-only fault issue from gen11. + */ + ppgtt->vm.has_read_only = !IS_GEN_RANGE(gt->i915, 11, 12); + + ppgtt->vm.alloc_pt_dma = alloc_pt_dma; + + err = gen8_init_scratch(&ppgtt->vm); + if (err) + goto err_free; + + ppgtt->pd = gen8_alloc_top_pd(&ppgtt->vm); + if (IS_ERR(ppgtt->pd)) { + err = PTR_ERR(ppgtt->pd); + goto err_free_scratch; + } + + if (!i915_vm_is_4lvl(&ppgtt->vm)) { + err = gen8_preallocate_top_level_pdp(ppgtt); + if (err) + goto err_free_pd; + } + + ppgtt->vm.bind_async_flags = I915_VMA_LOCAL_BIND; + ppgtt->vm.insert_entries = gen8_ppgtt_insert; + ppgtt->vm.allocate_va_range = gen8_ppgtt_alloc; + ppgtt->vm.clear_range = gen8_ppgtt_clear; + + ppgtt->vm.pte_encode = gen8_pte_encode; + + if (intel_vgpu_active(gt->i915)) + gen8_ppgtt_notify_vgt(ppgtt, true); + + ppgtt->vm.cleanup = gen8_ppgtt_cleanup; + + return ppgtt; + +err_free_pd: + __gen8_ppgtt_cleanup(&ppgtt->vm, ppgtt->pd, + gen8_pd_top_count(&ppgtt->vm), ppgtt->vm.top); +err_free_scratch: + free_scratch(&ppgtt->vm); +err_free: + kfree(ppgtt); + return ERR_PTR(err); +} diff --git a/drivers/gpu/drm/i915/gt/gen8_ppgtt.h b/drivers/gpu/drm/i915/gt/gen8_ppgtt.h new file mode 100644 index 000000000..76a08b9c1 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2020 Intel Corporation + */ + +#ifndef __GEN8_PPGTT_H__ +#define __GEN8_PPGTT_H__ + +struct intel_gt; + +struct i915_ppgtt *gen8_ppgtt_create(struct intel_gt *gt); + +#endif diff --git a/drivers/gpu/drm/i915/gt/gen8_renderstate.c b/drivers/gpu/drm/i915/gt/gen8_renderstate.c new file mode 100644 index 000000000..95288a34c --- /dev/null +++ b/drivers/gpu/drm/i915/gt/gen8_renderstate.c @@ -0,0 +1,983 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Generated by: intel-gpu-tools-1.8-220-g01153e7 + */ + +#include "intel_renderstate.h" + +static const u32 gen8_null_state_relocs[] = { + 0x00000798, + 0x000007a4, + 0x000007ac, + 0x000007bc, + -1, +}; + +static const u32 gen8_null_state_batch[] = { + 0x7a000004, + 0x01000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x69040000, + 0x78140000, + 0x04000000, + 0x7820000a, + 0x00000000, + 0x00000000, + 0x80000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78130002, + 0x00000000, + 0x00000000, + 0x02001808, + 0x781f0002, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78510009, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78100007, + 0x00000000, + 0x00000000, + 0x00010000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x781b0007, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000800, + 0x00000000, + 0x78110008, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x781e0003, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x781d0007, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78120002, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78500003, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x781c0002, + 0x00000000, + 0x00000000, + 0x00000000, + 0x780c0000, + 0x00000000, + 0x78520003, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78300000, + 0x08010040, + 0x78310000, + 0x1e000000, + 0x78320000, + 0x1e000000, + 0x78330000, + 0x1e000000, + 0x79190002, + 0x00000000, + 0x00000000, + 0x00000000, + 0x791a0002, + 0x00000000, + 0x00000000, + 0x00000000, + 0x791b0002, + 0x00000000, + 0x00000000, + 0x00000000, + 0x79120000, + 0x00000000, + 0x79130000, + 0x00000000, + 0x79140000, + 0x00000000, + 0x79150000, + 0x00000000, + 0x79160000, + 0x00000000, + 0x78150009, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78190009, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x781a0009, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78160009, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78170009, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78490001, + 0x00000000, + 0x00000000, + 0x784a0000, + 0x00000000, + 0x784b0000, + 0x00000004, + 0x79170101, + 0x00000000, + 0x00000080, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x79180006, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x79180006, + 0x20000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x79180006, + 0x40000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x79180006, + 0x60000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x6101000e, + 0x00000001, /* reloc */ + 0x00000000, + 0x00000000, + 0x00000001, /* reloc */ + 0x00000000, + 0x00000001, /* reloc */ + 0x00000000, + 0x00000001, + 0x00000000, + 0x00000001, /* reloc */ + 0x00000000, + 0x00001001, + 0x00001001, + 0x00000001, + 0x00001001, + 0x61020001, + 0x00000000, + 0x00000000, + 0x79000002, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78050006, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x79040002, + 0x00000000, + 0x00000000, + 0x00000000, + 0x79040002, + 0x40000000, + 0x00000000, + 0x00000000, + 0x79040002, + 0x80000000, + 0x00000000, + 0x00000000, + 0x79040002, + 0xc0000000, + 0x00000000, + 0x00000000, + 0x79080001, + 0x00000000, + 0x00000000, + 0x790a0001, + 0x00000000, + 0x00000000, + 0x78060003, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78070003, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78040001, + 0x00000000, + 0x00000000, + 0x79110000, + 0x00000000, + 0x780d0000, + 0x00000000, + 0x79060000, + 0x00000000, + 0x7907001f, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x7902000f, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x790c000f, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x780a0003, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78080083, + 0x00004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x04004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x08004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x0c004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x10004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x14004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x18004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x1c004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x20004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x24004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x28004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x2c004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x30004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x34004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x38004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x3c004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x40004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x44004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x48004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x4c004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x50004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x54004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x58004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x5c004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x60004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x64004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x68004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x6c004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x70004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x74004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x7c004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x80004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78090043, + 0x02000000, + 0x22220000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x680b0001, + 0x78260000, + 0x00000000, + 0x78270000, + 0x00000000, + 0x78280000, + 0x00000000, + 0x78290000, + 0x00000000, + 0x782a0000, + 0x00000000, + 0x780e0000, + 0x00000dc1, + 0x78240000, + 0x00000e01, + 0x784f0000, + 0x80000100, + 0x784d0000, + 0x40000000, + 0x782b0000, + 0x00000000, + 0x782c0000, + 0x00000000, + 0x782d0000, + 0x00000000, + 0x782e0000, + 0x00000000, + 0x782f0000, + 0x00000000, + 0x780f0000, + 0x00000000, + 0x78230000, + 0x00000e60, + 0x78210000, + 0x00000e80, + 0x7b000005, + 0x00000004, + 0x00000001, + 0x00000000, + 0x00000001, + 0x00000000, + 0x00000000, + 0x05000000, /* cmds end */ + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, /* state start */ + 0x00000000, + 0x3f800000, + 0x3f800000, + 0x3f800000, + 0x3f800000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, /* state end */ +}; + +RO_RENDERSTATE(8); diff --git a/drivers/gpu/drm/i915/gt/gen9_renderstate.c b/drivers/gpu/drm/i915/gt/gen9_renderstate.c new file mode 100644 index 000000000..7d3ac02f0 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/gen9_renderstate.c @@ -0,0 +1,999 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Generated by: intel-gpu-tools-1.19-177-g68e2eab2 + */ + +#include "intel_renderstate.h" + +static const u32 gen9_null_state_relocs[] = { + 0x000007a8, + 0x000007b4, + 0x000007bc, + 0x000007cc, + -1, +}; + +static const u32 gen9_null_state_batch[] = { + 0x7a000004, + 0x01000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x69040300, + 0x78140000, + 0x04000000, + 0x7820000a, + 0x00000000, + 0x00000000, + 0x80000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78130002, + 0x00000000, + 0x00000000, + 0x02001808, + 0x781f0004, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78510009, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78100007, + 0x00000000, + 0x00000000, + 0x00010000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x781b0007, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000800, + 0x00000000, + 0x78110008, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x781e0003, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x781d0009, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78120002, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78500003, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x781c0002, + 0x00000000, + 0x00000000, + 0x00000000, + 0x780c0000, + 0x00000000, + 0x78520003, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78300000, + 0x08010040, + 0x78310000, + 0x1e000000, + 0x78320000, + 0x1e000000, + 0x78330000, + 0x1e000000, + 0x79190002, + 0x00000000, + 0x00000000, + 0x00000000, + 0x791a0002, + 0x00000000, + 0x00000000, + 0x00000000, + 0x791b0002, + 0x00000000, + 0x00000000, + 0x00000000, + 0x79120000, + 0x00000000, + 0x79130000, + 0x00000000, + 0x79140000, + 0x00000000, + 0x79150000, + 0x00000000, + 0x79160000, + 0x00000000, + 0x78150009, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78190009, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x781a0009, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78160009, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78170009, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78490001, + 0x00000000, + 0x00000000, + 0x784a0000, + 0x00000000, + 0x784b0000, + 0x00000004, + 0x79170101, + 0x00000000, + 0x00000080, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x79180006, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x79180006, + 0x20000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x79180006, + 0x40000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x79180006, + 0x60000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x61010011, + 0x00000001, /* reloc */ + 0x00000000, + 0x00000000, + 0x00000001, /* reloc */ + 0x00000000, + 0x00000001, /* reloc */ + 0x00000000, + 0x00000001, + 0x00000000, + 0x00000001, /* reloc */ + 0x00000000, + 0x00001001, + 0x00001001, + 0x00000001, + 0x00001001, + 0x00000000, + 0x00000000, + 0x00000000, + 0x61020001, + 0x00000000, + 0x00000000, + 0x79000002, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78050006, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x79040002, + 0x00000000, + 0x00000000, + 0x00000000, + 0x79040002, + 0x40000000, + 0x00000000, + 0x00000000, + 0x79040002, + 0x80000000, + 0x00000000, + 0x00000000, + 0x79040002, + 0xc0000000, + 0x00000000, + 0x00000000, + 0x79080001, + 0x00000000, + 0x00000000, + 0x790a0001, + 0x00000000, + 0x00000000, + 0x78060003, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78070003, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78040001, + 0x00000000, + 0x00000000, + 0x79110000, + 0x00000000, + 0x780d0000, + 0x00000000, + 0x79060000, + 0x00000000, + 0x7907001f, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x7902000f, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x790c000f, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x780a0003, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78080083, + 0x00004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x04004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x08004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x0c004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x10004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x14004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x18004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x1c004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x20004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x24004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x28004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x2c004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x30004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x34004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x38004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x3c004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x40004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x44004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x48004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x4c004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x50004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x54004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x58004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x5c004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x60004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x64004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x68004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x6c004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x70004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x74004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x7c004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x80004000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78090043, + 0x02000000, + 0x22220000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x78550003, + 0x0000000f, + 0x00000000, + 0x00000000, + 0x00000000, + 0x680b0001, + 0x780e0000, + 0x00000e01, + 0x78240000, + 0x00000e41, + 0x784f0000, + 0x80000100, + 0x784d0000, + 0x40000000, + 0x782b0000, + 0x00000000, + 0x782c0000, + 0x00000000, + 0x782d0000, + 0x00000000, + 0x782e0000, + 0x00000000, + 0x782f0000, + 0x00000000, + 0x780f0000, + 0x00000000, + 0x78230000, + 0x00000ea0, + 0x78210000, + 0x00000ec0, + 0x78260000, + 0x00000000, + 0x78270000, + 0x00000000, + 0x78280000, + 0x00000000, + 0x78290000, + 0x00000000, + 0x782a0000, + 0x00000000, + 0x7b000005, + 0x00000004, + 0x00000001, + 0x00000000, + 0x00000001, + 0x00000000, + 0x00000000, + 0x05000000, /* cmds end */ + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, /* state start */ + 0x00000000, + 0x3f800000, + 0x3f800000, + 0x3f800000, + 0x3f800000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, /* state end */ +}; + +RO_RENDERSTATE(9); diff --git a/drivers/gpu/drm/i915/gt/hsw_clear_kernel.c b/drivers/gpu/drm/i915/gt/hsw_clear_kernel.c new file mode 100644 index 000000000..b47f9d4a0 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/hsw_clear_kernel.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2020 Intel Corporation + * + * Generated by: IGT Gpu Tools on Fri 21 Feb 2020 05:30:13 AM UTC + */ + +static const u32 hsw_clear_kernel[] = { + 0x00000001, 0x26020128, 0x00000024, 0x00000000, + 0x00000040, 0x20280c21, 0x00000028, 0x00000001, + 0x01000010, 0x20000c20, 0x0000002c, 0x00000000, + 0x00010220, 0x34001c00, 0x00001400, 0x00000160, + 0x00600001, 0x20600061, 0x00000000, 0x00000000, + 0x00000008, 0x20601c85, 0x00000e00, 0x0000000c, + 0x00000005, 0x20601ca5, 0x00000060, 0x00000001, + 0x00000008, 0x20641c85, 0x00000e00, 0x0000000d, + 0x00000005, 0x20641ca5, 0x00000064, 0x00000003, + 0x00000041, 0x207424a5, 0x00000064, 0x00000034, + 0x00000040, 0x206014a5, 0x00000060, 0x00000074, + 0x00000008, 0x20681c85, 0x00000e00, 0x00000008, + 0x00000005, 0x20681ca5, 0x00000068, 0x0000000f, + 0x00000041, 0x20701ca5, 0x00000060, 0x00000010, + 0x00000040, 0x206814a5, 0x00000068, 0x00000070, + 0x00600001, 0x20a00061, 0x00000000, 0x00000000, + 0x00000005, 0x206c1c85, 0x00000e00, 0x00000007, + 0x00000041, 0x206c1ca5, 0x0000006c, 0x00000004, + 0x00600001, 0x20800021, 0x008d0000, 0x00000000, + 0x00000001, 0x20800021, 0x0000006c, 0x00000000, + 0x00000001, 0x20840021, 0x00000068, 0x00000000, + 0x00000001, 0x20880061, 0x00000000, 0x00000003, + 0x00000005, 0x208c0d21, 0x00000086, 0xffffffff, + 0x05600032, 0x20a00fa1, 0x008d0080, 0x02190001, + 0x00000040, 0x20a01ca5, 0x000000a0, 0x00000001, + 0x05600032, 0x20a00fa1, 0x008d0080, 0x040a8001, + 0x02000040, 0x20281c21, 0x00000028, 0xffffffff, + 0x00010220, 0x34001c00, 0x00001400, 0xffffffe0, + 0x00000001, 0x26020128, 0x00000024, 0x00000000, + 0x00000001, 0x220010e4, 0x00000000, 0x00000000, + 0x00000001, 0x220831ec, 0x00000000, 0x007f007f, + 0x00600001, 0x20400021, 0x008d0000, 0x00000000, + 0x00600001, 0x2fe00021, 0x008d0000, 0x00000000, + 0x00200001, 0x20400121, 0x00450020, 0x00000000, + 0x00000001, 0x20480061, 0x00000000, 0x000f000f, + 0x00000005, 0x204c0d21, 0x00000046, 0xffffffef, + 0x00800001, 0x20600061, 0x00000000, 0x00000000, + 0x00800001, 0x20800061, 0x00000000, 0x00000000, + 0x00800001, 0x20a00061, 0x00000000, 0x00000000, + 0x00800001, 0x20c00061, 0x00000000, 0x00000000, + 0x00800001, 0x20e00061, 0x00000000, 0x00000000, + 0x00800001, 0x21000061, 0x00000000, 0x00000000, + 0x00800001, 0x21200061, 0x00000000, 0x00000000, + 0x00800001, 0x21400061, 0x00000000, 0x00000000, + 0x05600032, 0x20000fa0, 0x008d0040, 0x120a8000, + 0x00000040, 0x20402d21, 0x00000020, 0x00100010, + 0x05600032, 0x20000fa0, 0x008d0040, 0x120a8000, + 0x02000040, 0x22083d8c, 0x00000208, 0xffffffff, + 0x00800001, 0xa0000109, 0x00000602, 0x00000000, + 0x00000040, 0x22001c84, 0x00000200, 0x00000020, + 0x00010220, 0x34001c00, 0x00001400, 0xffffffc0, + 0x07600032, 0x20000fa0, 0x008d0fe0, 0x82000010, +}; diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c new file mode 100644 index 000000000..0040b4765 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c @@ -0,0 +1,501 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include <linux/kthread.h> +#include <trace/events/dma_fence.h> +#include <uapi/linux/sched/types.h> + +#include "i915_drv.h" +#include "i915_trace.h" +#include "intel_breadcrumbs.h" +#include "intel_context.h" +#include "intel_engine_pm.h" +#include "intel_gt_pm.h" +#include "intel_gt_requests.h" + +static bool irq_enable(struct intel_engine_cs *engine) +{ + if (!engine->irq_enable) + return false; + + /* Caller disables interrupts */ + spin_lock(&engine->gt->irq_lock); + engine->irq_enable(engine); + spin_unlock(&engine->gt->irq_lock); + + return true; +} + +static void irq_disable(struct intel_engine_cs *engine) +{ + if (!engine->irq_disable) + return; + + /* Caller disables interrupts */ + spin_lock(&engine->gt->irq_lock); + engine->irq_disable(engine); + spin_unlock(&engine->gt->irq_lock); +} + +static void __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b) +{ + /* + * Since we are waiting on a request, the GPU should be busy + * and should have its own rpm reference. + */ + if (GEM_WARN_ON(!intel_gt_pm_get_if_awake(b->irq_engine->gt))) + return; + + /* + * The breadcrumb irq will be disarmed on the interrupt after the + * waiters are signaled. This gives us a single interrupt window in + * which we can add a new waiter and avoid the cost of re-enabling + * the irq. + */ + WRITE_ONCE(b->irq_armed, true); + + /* Requests may have completed before we could enable the interrupt. */ + if (!b->irq_enabled++ && irq_enable(b->irq_engine)) + irq_work_queue(&b->irq_work); +} + +static void intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b) +{ + if (!b->irq_engine) + return; + + spin_lock(&b->irq_lock); + if (!b->irq_armed) + __intel_breadcrumbs_arm_irq(b); + spin_unlock(&b->irq_lock); +} + +static void __intel_breadcrumbs_disarm_irq(struct intel_breadcrumbs *b) +{ + GEM_BUG_ON(!b->irq_enabled); + if (!--b->irq_enabled) + irq_disable(b->irq_engine); + + WRITE_ONCE(b->irq_armed, false); + intel_gt_pm_put_async(b->irq_engine->gt); +} + +static void intel_breadcrumbs_disarm_irq(struct intel_breadcrumbs *b) +{ + spin_lock(&b->irq_lock); + if (b->irq_armed) + __intel_breadcrumbs_disarm_irq(b); + spin_unlock(&b->irq_lock); +} + +static void add_signaling_context(struct intel_breadcrumbs *b, + struct intel_context *ce) +{ + lockdep_assert_held(&ce->signal_lock); + + spin_lock(&b->signalers_lock); + list_add_rcu(&ce->signal_link, &b->signalers); + spin_unlock(&b->signalers_lock); +} + +static bool remove_signaling_context(struct intel_breadcrumbs *b, + struct intel_context *ce) +{ + lockdep_assert_held(&ce->signal_lock); + + if (!list_empty(&ce->signals)) + return false; + + spin_lock(&b->signalers_lock); + list_del_rcu(&ce->signal_link); + spin_unlock(&b->signalers_lock); + + return true; +} + +__maybe_unused static bool +check_signal_order(struct intel_context *ce, struct i915_request *rq) +{ + if (rq->context != ce) + return false; + + if (!list_is_last(&rq->signal_link, &ce->signals) && + i915_seqno_passed(rq->fence.seqno, + list_next_entry(rq, signal_link)->fence.seqno)) + return false; + + if (!list_is_first(&rq->signal_link, &ce->signals) && + i915_seqno_passed(list_prev_entry(rq, signal_link)->fence.seqno, + rq->fence.seqno)) + return false; + + return true; +} + +static bool +__dma_fence_signal(struct dma_fence *fence) +{ + return !test_and_set_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags); +} + +static void +__dma_fence_signal__timestamp(struct dma_fence *fence, ktime_t timestamp) +{ + fence->timestamp = timestamp; + set_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->flags); + trace_dma_fence_signaled(fence); +} + +static void +__dma_fence_signal__notify(struct dma_fence *fence, + const struct list_head *list) +{ + struct dma_fence_cb *cur, *tmp; + + lockdep_assert_held(fence->lock); + + list_for_each_entry_safe(cur, tmp, list, node) { + INIT_LIST_HEAD(&cur->node); + cur->func(fence, cur); + } +} + +static void add_retire(struct intel_breadcrumbs *b, struct intel_timeline *tl) +{ + if (b->irq_engine) + intel_engine_add_retire(b->irq_engine, tl); +} + +static bool __signal_request(struct i915_request *rq) +{ + GEM_BUG_ON(test_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags)); + + if (!__dma_fence_signal(&rq->fence)) { + i915_request_put(rq); + return false; + } + + return true; +} + +static struct llist_node * +slist_add(struct llist_node *node, struct llist_node *head) +{ + node->next = head; + return node; +} + +static void signal_irq_work(struct irq_work *work) +{ + struct intel_breadcrumbs *b = container_of(work, typeof(*b), irq_work); + const ktime_t timestamp = ktime_get(); + struct llist_node *signal, *sn; + struct intel_context *ce; + + signal = NULL; + if (unlikely(!llist_empty(&b->signaled_requests))) + signal = llist_del_all(&b->signaled_requests); + + /* + * Keep the irq armed until the interrupt after all listeners are gone. + * + * Enabling/disabling the interrupt is rather costly, roughly a couple + * of hundred microseconds. If we are proactive and enable/disable + * the interrupt around every request that wants a breadcrumb, we + * quickly drown in the extra orders of magnitude of latency imposed + * on request submission. + * + * So we try to be lazy, and keep the interrupts enabled until no + * more listeners appear within a breadcrumb interrupt interval (that + * is until a request completes that no one cares about). The + * observation is that listeners come in batches, and will often + * listen to a bunch of requests in succession. Though note on icl+, + * interrupts are always enabled due to concerns with rc6 being + * dysfunctional with per-engine interrupt masking. + * + * We also try to avoid raising too many interrupts, as they may + * be generated by userspace batches and it is unfortunately rather + * too easy to drown the CPU under a flood of GPU interrupts. Thus + * whenever no one appears to be listening, we turn off the interrupts. + * Fewer interrupts should conserve power -- at the very least, fewer + * interrupt draw less ire from other users of the system and tools + * like powertop. + */ + if (!signal && READ_ONCE(b->irq_armed) && list_empty(&b->signalers)) + intel_breadcrumbs_disarm_irq(b); + + rcu_read_lock(); + list_for_each_entry_rcu(ce, &b->signalers, signal_link) { + struct i915_request *rq; + + list_for_each_entry_rcu(rq, &ce->signals, signal_link) { + bool release; + + if (!__i915_request_is_complete(rq)) + break; + + if (!test_and_clear_bit(I915_FENCE_FLAG_SIGNAL, + &rq->fence.flags)) + break; + + /* + * Queue for execution after dropping the signaling + * spinlock as the callback chain may end up adding + * more signalers to the same context or engine. + */ + spin_lock(&ce->signal_lock); + list_del_rcu(&rq->signal_link); + release = remove_signaling_context(b, ce); + spin_unlock(&ce->signal_lock); + + if (__signal_request(rq)) + /* We own signal_node now, xfer to local list */ + signal = slist_add(&rq->signal_node, signal); + + if (release) { + add_retire(b, ce->timeline); + intel_context_put(ce); + } + } + } + rcu_read_unlock(); + + llist_for_each_safe(signal, sn, signal) { + struct i915_request *rq = + llist_entry(signal, typeof(*rq), signal_node); + struct list_head cb_list; + + spin_lock(&rq->lock); + list_replace(&rq->fence.cb_list, &cb_list); + __dma_fence_signal__timestamp(&rq->fence, timestamp); + __dma_fence_signal__notify(&rq->fence, &cb_list); + spin_unlock(&rq->lock); + + i915_request_put(rq); + } + + if (!READ_ONCE(b->irq_armed) && !list_empty(&b->signalers)) + intel_breadcrumbs_arm_irq(b); +} + +struct intel_breadcrumbs * +intel_breadcrumbs_create(struct intel_engine_cs *irq_engine) +{ + struct intel_breadcrumbs *b; + + b = kzalloc(sizeof(*b), GFP_KERNEL); + if (!b) + return NULL; + + b->irq_engine = irq_engine; + + spin_lock_init(&b->signalers_lock); + INIT_LIST_HEAD(&b->signalers); + init_llist_head(&b->signaled_requests); + + spin_lock_init(&b->irq_lock); + init_irq_work(&b->irq_work, signal_irq_work); + + return b; +} + +void intel_breadcrumbs_reset(struct intel_breadcrumbs *b) +{ + unsigned long flags; + + if (!b->irq_engine) + return; + + spin_lock_irqsave(&b->irq_lock, flags); + + if (b->irq_enabled) + irq_enable(b->irq_engine); + else + irq_disable(b->irq_engine); + + spin_unlock_irqrestore(&b->irq_lock, flags); +} + +void intel_breadcrumbs_park(struct intel_breadcrumbs *b) +{ + /* Kick the work once more to drain the signalers */ + irq_work_sync(&b->irq_work); + while (unlikely(READ_ONCE(b->irq_armed))) { + local_irq_disable(); + signal_irq_work(&b->irq_work); + local_irq_enable(); + cond_resched(); + } + GEM_BUG_ON(!list_empty(&b->signalers)); +} + +void intel_breadcrumbs_free(struct intel_breadcrumbs *b) +{ + irq_work_sync(&b->irq_work); + GEM_BUG_ON(!list_empty(&b->signalers)); + GEM_BUG_ON(b->irq_armed); + kfree(b); +} + +static void insert_breadcrumb(struct i915_request *rq) +{ + struct intel_breadcrumbs *b = READ_ONCE(rq->engine)->breadcrumbs; + struct intel_context *ce = rq->context; + struct list_head *pos; + + if (test_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags)) + return; + + i915_request_get(rq); + + /* + * If the request is already completed, we can transfer it + * straight onto a signaled list, and queue the irq worker for + * its signal completion. + */ + if (__i915_request_is_complete(rq)) { + if (__signal_request(rq) && + llist_add(&rq->signal_node, &b->signaled_requests)) + irq_work_queue(&b->irq_work); + return; + } + + if (list_empty(&ce->signals)) { + intel_context_get(ce); + add_signaling_context(b, ce); + pos = &ce->signals; + } else { + /* + * We keep the seqno in retirement order, so we can break + * inside intel_engine_signal_breadcrumbs as soon as we've + * passed the last completed request (or seen a request that + * hasn't event started). We could walk the timeline->requests, + * but keeping a separate signalers_list has the advantage of + * hopefully being much smaller than the full list and so + * provides faster iteration and detection when there are no + * more interrupts required for this context. + * + * We typically expect to add new signalers in order, so we + * start looking for our insertion point from the tail of + * the list. + */ + list_for_each_prev(pos, &ce->signals) { + struct i915_request *it = + list_entry(pos, typeof(*it), signal_link); + + if (i915_seqno_passed(rq->fence.seqno, it->fence.seqno)) + break; + } + } + list_add_rcu(&rq->signal_link, pos); + GEM_BUG_ON(!check_signal_order(ce, rq)); + GEM_BUG_ON(test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags)); + set_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags); + + /* + * Defer enabling the interrupt to after HW submission and recheck + * the request as it may have completed and raised the interrupt as + * we were attaching it into the lists. + */ + irq_work_queue(&b->irq_work); +} + +bool i915_request_enable_breadcrumb(struct i915_request *rq) +{ + struct intel_context *ce = rq->context; + + /* Serialises with i915_request_retire() using rq->lock */ + if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags)) + return true; + + /* + * Peek at i915_request_submit()/i915_request_unsubmit() status. + * + * If the request is not yet active (and not signaled), we will + * attach the breadcrumb later. + */ + if (!test_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags)) + return true; + + spin_lock(&ce->signal_lock); + if (test_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags)) + insert_breadcrumb(rq); + spin_unlock(&ce->signal_lock); + + return true; +} + +void i915_request_cancel_breadcrumb(struct i915_request *rq) +{ + struct intel_context *ce = rq->context; + bool release; + + spin_lock(&ce->signal_lock); + if (!test_and_clear_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags)) { + spin_unlock(&ce->signal_lock); + return; + } + + list_del_rcu(&rq->signal_link); + release = remove_signaling_context(rq->engine->breadcrumbs, ce); + spin_unlock(&ce->signal_lock); + if (release) + intel_context_put(ce); + + i915_request_put(rq); +} + +static void print_signals(struct intel_breadcrumbs *b, struct drm_printer *p) +{ + struct intel_context *ce; + struct i915_request *rq; + + drm_printf(p, "Signals:\n"); + + rcu_read_lock(); + list_for_each_entry_rcu(ce, &b->signalers, signal_link) { + list_for_each_entry_rcu(rq, &ce->signals, signal_link) + drm_printf(p, "\t[%llx:%llx%s] @ %dms\n", + rq->fence.context, rq->fence.seqno, + i915_request_completed(rq) ? "!" : + i915_request_started(rq) ? "*" : + "", + jiffies_to_msecs(jiffies - rq->emitted_jiffies)); + } + rcu_read_unlock(); +} + +void intel_engine_print_breadcrumbs(struct intel_engine_cs *engine, + struct drm_printer *p) +{ + struct intel_breadcrumbs *b; + + b = engine->breadcrumbs; + if (!b) + return; + + drm_printf(p, "IRQ: %s\n", enableddisabled(b->irq_armed)); + if (!list_empty(&b->signalers)) + print_signals(b, p); +} diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.h b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.h new file mode 100644 index 000000000..ed3d1deab --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2019 Intel Corporation + */ + +#ifndef __INTEL_BREADCRUMBS__ +#define __INTEL_BREADCRUMBS__ + +#include <linux/irq_work.h> + +#include "intel_engine_types.h" + +struct drm_printer; +struct i915_request; +struct intel_breadcrumbs; + +struct intel_breadcrumbs * +intel_breadcrumbs_create(struct intel_engine_cs *irq_engine); +void intel_breadcrumbs_free(struct intel_breadcrumbs *b); + +void intel_breadcrumbs_reset(struct intel_breadcrumbs *b); +void intel_breadcrumbs_park(struct intel_breadcrumbs *b); + +static inline void +intel_engine_signal_breadcrumbs(struct intel_engine_cs *engine) +{ + irq_work_queue(&engine->breadcrumbs->irq_work); +} + +void intel_engine_print_breadcrumbs(struct intel_engine_cs *engine, + struct drm_printer *p); + +bool i915_request_enable_breadcrumb(struct i915_request *request); +void i915_request_cancel_breadcrumb(struct i915_request *request); + +#endif /* __INTEL_BREADCRUMBS__ */ diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs_types.h b/drivers/gpu/drm/i915/gt/intel_breadcrumbs_types.h new file mode 100644 index 000000000..a74bb3062 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs_types.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2019 Intel Corporation + */ + +#ifndef __INTEL_BREADCRUMBS_TYPES__ +#define __INTEL_BREADCRUMBS_TYPES__ + +#include <linux/irq_work.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/types.h> + +/* + * Rather than have every client wait upon all user interrupts, + * with the herd waking after every interrupt and each doing the + * heavyweight seqno dance, we delegate the task (of being the + * bottom-half of the user interrupt) to the first client. After + * every interrupt, we wake up one client, who does the heavyweight + * coherent seqno read and either goes back to sleep (if incomplete), + * or wakes up all the completed clients in parallel, before then + * transferring the bottom-half status to the next client in the queue. + * + * Compared to walking the entire list of waiters in a single dedicated + * bottom-half, we reduce the latency of the first waiter by avoiding + * a context switch, but incur additional coherent seqno reads when + * following the chain of request breadcrumbs. Since it is most likely + * that we have a single client waiting on each seqno, then reducing + * the overhead of waking that client is much preferred. + */ +struct intel_breadcrumbs { + /* Not all breadcrumbs are attached to physical HW */ + struct intel_engine_cs *irq_engine; + + spinlock_t signalers_lock; /* protects the list of signalers */ + struct list_head signalers; + struct llist_head signaled_requests; + + spinlock_t irq_lock; /* protects the interrupt from hardirq context */ + struct irq_work irq_work; /* for use from inside irq_lock */ + unsigned int irq_enabled; + bool irq_armed; +}; + +#endif /* __INTEL_BREADCRUMBS_TYPES__ */ diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c new file mode 100644 index 000000000..349e7fa14 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_context.c @@ -0,0 +1,506 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#include "gem/i915_gem_context.h" +#include "gem/i915_gem_pm.h" + +#include "i915_drv.h" +#include "i915_globals.h" + +#include "intel_context.h" +#include "intel_engine.h" +#include "intel_engine_pm.h" +#include "intel_ring.h" + +static struct i915_global_context { + struct i915_global base; + struct kmem_cache *slab_ce; +} global; + +static struct intel_context *intel_context_alloc(void) +{ + return kmem_cache_zalloc(global.slab_ce, GFP_KERNEL); +} + +static void rcu_context_free(struct rcu_head *rcu) +{ + struct intel_context *ce = container_of(rcu, typeof(*ce), rcu); + + kmem_cache_free(global.slab_ce, ce); +} + +void intel_context_free(struct intel_context *ce) +{ + call_rcu(&ce->rcu, rcu_context_free); +} + +struct intel_context * +intel_context_create(struct intel_engine_cs *engine) +{ + struct intel_context *ce; + + ce = intel_context_alloc(); + if (!ce) + return ERR_PTR(-ENOMEM); + + intel_context_init(ce, engine); + return ce; +} + +int intel_context_alloc_state(struct intel_context *ce) +{ + int err = 0; + + if (mutex_lock_interruptible(&ce->pin_mutex)) + return -EINTR; + + if (!test_bit(CONTEXT_ALLOC_BIT, &ce->flags)) { + if (intel_context_is_banned(ce)) { + err = -EIO; + goto unlock; + } + + err = ce->ops->alloc(ce); + if (unlikely(err)) + goto unlock; + + set_bit(CONTEXT_ALLOC_BIT, &ce->flags); + } + +unlock: + mutex_unlock(&ce->pin_mutex); + return err; +} + +static int intel_context_active_acquire(struct intel_context *ce) +{ + int err; + + __i915_active_acquire(&ce->active); + + if (intel_context_is_barrier(ce)) + return 0; + + /* Preallocate tracking nodes */ + err = i915_active_acquire_preallocate_barrier(&ce->active, + ce->engine); + if (err) + i915_active_release(&ce->active); + + return err; +} + +static void intel_context_active_release(struct intel_context *ce) +{ + /* Nodes preallocated in intel_context_active() */ + i915_active_acquire_barrier(&ce->active); + i915_active_release(&ce->active); +} + +static int __context_pin_state(struct i915_vma *vma, struct i915_gem_ww_ctx *ww) +{ + unsigned int bias = i915_ggtt_pin_bias(vma) | PIN_OFFSET_BIAS; + int err; + + err = i915_ggtt_pin(vma, ww, 0, bias | PIN_HIGH); + if (err) + return err; + + err = i915_active_acquire(&vma->active); + if (err) + goto err_unpin; + + /* + * And mark it as a globally pinned object to let the shrinker know + * it cannot reclaim the object until we release it. + */ + i915_vma_make_unshrinkable(vma); + vma->obj->mm.dirty = true; + + return 0; + +err_unpin: + i915_vma_unpin(vma); + return err; +} + +static void __context_unpin_state(struct i915_vma *vma) +{ + i915_vma_make_shrinkable(vma); + i915_active_release(&vma->active); + __i915_vma_unpin(vma); +} + +static int __ring_active(struct intel_ring *ring, + struct i915_gem_ww_ctx *ww) +{ + int err; + + err = intel_ring_pin(ring, ww); + if (err) + return err; + + err = i915_active_acquire(&ring->vma->active); + if (err) + goto err_pin; + + return 0; + +err_pin: + intel_ring_unpin(ring); + return err; +} + +static void __ring_retire(struct intel_ring *ring) +{ + i915_active_release(&ring->vma->active); + intel_ring_unpin(ring); +} + +static int intel_context_pre_pin(struct intel_context *ce, + struct i915_gem_ww_ctx *ww) +{ + int err; + + CE_TRACE(ce, "active\n"); + + err = __ring_active(ce->ring, ww); + if (err) + return err; + + err = intel_timeline_pin(ce->timeline, ww); + if (err) + goto err_ring; + + if (!ce->state) + return 0; + + err = __context_pin_state(ce->state, ww); + if (err) + goto err_timeline; + + + return 0; + +err_timeline: + intel_timeline_unpin(ce->timeline); +err_ring: + __ring_retire(ce->ring); + return err; +} + +static void intel_context_post_unpin(struct intel_context *ce) +{ + if (ce->state) + __context_unpin_state(ce->state); + + intel_timeline_unpin(ce->timeline); + __ring_retire(ce->ring); +} + +int __intel_context_do_pin_ww(struct intel_context *ce, + struct i915_gem_ww_ctx *ww) +{ + bool handoff = false; + void *vaddr; + int err = 0; + + if (unlikely(!test_bit(CONTEXT_ALLOC_BIT, &ce->flags))) { + err = intel_context_alloc_state(ce); + if (err) + return err; + } + + /* + * We always pin the context/ring/timeline here, to ensure a pin + * refcount for __intel_context_active(), which prevent a lock + * inversion of ce->pin_mutex vs dma_resv_lock(). + */ + + err = i915_gem_object_lock(ce->timeline->hwsp_ggtt->obj, ww); + if (!err && ce->ring->vma->obj) + err = i915_gem_object_lock(ce->ring->vma->obj, ww); + if (!err && ce->state) + err = i915_gem_object_lock(ce->state->obj, ww); + if (!err) + err = intel_context_pre_pin(ce, ww); + if (err) + return err; + + err = i915_active_acquire(&ce->active); + if (err) + goto err_ctx_unpin; + + err = ce->ops->pre_pin(ce, ww, &vaddr); + if (err) + goto err_release; + + err = mutex_lock_interruptible(&ce->pin_mutex); + if (err) + goto err_post_unpin; + + if (unlikely(intel_context_is_closed(ce))) { + err = -ENOENT; + goto err_unlock; + } + + if (likely(!atomic_add_unless(&ce->pin_count, 1, 0))) { + err = intel_context_active_acquire(ce); + if (unlikely(err)) + goto err_unlock; + + err = ce->ops->pin(ce, vaddr); + if (err) { + intel_context_active_release(ce); + goto err_unlock; + } + + CE_TRACE(ce, "pin ring:{start:%08x, head:%04x, tail:%04x}\n", + i915_ggtt_offset(ce->ring->vma), + ce->ring->head, ce->ring->tail); + + handoff = true; + smp_mb__before_atomic(); /* flush pin before it is visible */ + atomic_inc(&ce->pin_count); + } + + GEM_BUG_ON(!intel_context_is_pinned(ce)); /* no overflow! */ + +err_unlock: + mutex_unlock(&ce->pin_mutex); +err_post_unpin: + if (!handoff) + ce->ops->post_unpin(ce); +err_release: + i915_active_release(&ce->active); +err_ctx_unpin: + intel_context_post_unpin(ce); + + /* + * Unlock the hwsp_ggtt object since it's shared. + * In principle we can unlock all the global state locked above + * since it's pinned and doesn't need fencing, and will + * thus remain resident until it is explicitly unpinned. + */ + i915_gem_ww_unlock_single(ce->timeline->hwsp_ggtt->obj); + + return err; +} + +int __intel_context_do_pin(struct intel_context *ce) +{ + struct i915_gem_ww_ctx ww; + int err; + + i915_gem_ww_ctx_init(&ww, true); +retry: + err = __intel_context_do_pin_ww(ce, &ww); + if (err == -EDEADLK) { + err = i915_gem_ww_ctx_backoff(&ww); + if (!err) + goto retry; + } + i915_gem_ww_ctx_fini(&ww); + return err; +} + +void intel_context_unpin(struct intel_context *ce) +{ + if (!atomic_dec_and_test(&ce->pin_count)) + return; + + CE_TRACE(ce, "unpin\n"); + ce->ops->unpin(ce); + ce->ops->post_unpin(ce); + + /* + * Once released, we may asynchronously drop the active reference. + * As that may be the only reference keeping the context alive, + * take an extra now so that it is not freed before we finish + * dereferencing it. + */ + intel_context_get(ce); + intel_context_active_release(ce); + intel_context_put(ce); +} + +__i915_active_call +static void __intel_context_retire(struct i915_active *active) +{ + struct intel_context *ce = container_of(active, typeof(*ce), active); + + CE_TRACE(ce, "retire runtime: { total:%lluns, avg:%lluns }\n", + intel_context_get_total_runtime_ns(ce), + intel_context_get_avg_runtime_ns(ce)); + + set_bit(CONTEXT_VALID_BIT, &ce->flags); + intel_context_post_unpin(ce); + intel_context_put(ce); +} + +static int __intel_context_active(struct i915_active *active) +{ + struct intel_context *ce = container_of(active, typeof(*ce), active); + + intel_context_get(ce); + + /* everything should already be activated by intel_context_pre_pin() */ + GEM_WARN_ON(!i915_active_acquire_if_busy(&ce->ring->vma->active)); + __intel_ring_pin(ce->ring); + + __intel_timeline_pin(ce->timeline); + + if (ce->state) { + GEM_WARN_ON(!i915_active_acquire_if_busy(&ce->state->active)); + __i915_vma_pin(ce->state); + i915_vma_make_unshrinkable(ce->state); + } + + return 0; +} + +void +intel_context_init(struct intel_context *ce, struct intel_engine_cs *engine) +{ + GEM_BUG_ON(!engine->cops); + GEM_BUG_ON(!engine->gt->vm); + + kref_init(&ce->ref); + + ce->engine = engine; + ce->ops = engine->cops; + ce->sseu = engine->sseu; + ce->ring = __intel_context_ring_size(SZ_4K); + + ewma_runtime_init(&ce->runtime.avg); + + ce->vm = i915_vm_get(engine->gt->vm); + + /* NB ce->signal_link/lock is used under RCU */ + spin_lock_init(&ce->signal_lock); + INIT_LIST_HEAD(&ce->signals); + + mutex_init(&ce->pin_mutex); + + i915_active_init(&ce->active, + __intel_context_active, __intel_context_retire); +} + +void intel_context_fini(struct intel_context *ce) +{ + if (ce->timeline) + intel_timeline_put(ce->timeline); + i915_vm_put(ce->vm); + + mutex_destroy(&ce->pin_mutex); + i915_active_fini(&ce->active); +} + +static void i915_global_context_shrink(void) +{ + kmem_cache_shrink(global.slab_ce); +} + +static void i915_global_context_exit(void) +{ + kmem_cache_destroy(global.slab_ce); +} + +static struct i915_global_context global = { { + .shrink = i915_global_context_shrink, + .exit = i915_global_context_exit, +} }; + +int __init i915_global_context_init(void) +{ + global.slab_ce = KMEM_CACHE(intel_context, SLAB_HWCACHE_ALIGN); + if (!global.slab_ce) + return -ENOMEM; + + i915_global_register(&global.base); + return 0; +} + +void intel_context_enter_engine(struct intel_context *ce) +{ + intel_engine_pm_get(ce->engine); + intel_timeline_enter(ce->timeline); +} + +void intel_context_exit_engine(struct intel_context *ce) +{ + intel_timeline_exit(ce->timeline); + intel_engine_pm_put(ce->engine); +} + +int intel_context_prepare_remote_request(struct intel_context *ce, + struct i915_request *rq) +{ + struct intel_timeline *tl = ce->timeline; + int err; + + /* Only suitable for use in remotely modifying this context */ + GEM_BUG_ON(rq->context == ce); + + if (rcu_access_pointer(rq->timeline) != tl) { /* timeline sharing! */ + /* Queue this switch after current activity by this context. */ + err = i915_active_fence_set(&tl->last_request, rq); + if (err) + return err; + } + + /* + * Guarantee context image and the timeline remains pinned until the + * modifying request is retired by setting the ce activity tracker. + * + * But we only need to take one pin on the account of it. Or in other + * words transfer the pinned ce object to tracked active request. + */ + GEM_BUG_ON(i915_active_is_idle(&ce->active)); + return i915_active_add_request(&ce->active, rq); +} + +struct i915_request *intel_context_create_request(struct intel_context *ce) +{ + struct i915_gem_ww_ctx ww; + struct i915_request *rq; + int err; + + i915_gem_ww_ctx_init(&ww, true); +retry: + err = intel_context_pin_ww(ce, &ww); + if (!err) { + rq = i915_request_create(ce); + intel_context_unpin(ce); + } else if (err == -EDEADLK) { + err = i915_gem_ww_ctx_backoff(&ww); + if (!err) + goto retry; + rq = ERR_PTR(err); + } else { + rq = ERR_PTR(err); + } + + i915_gem_ww_ctx_fini(&ww); + + if (IS_ERR(rq)) + return rq; + + /* + * timeline->mutex should be the inner lock, but is used as outer lock. + * Hack around this to shut up lockdep in selftests.. + */ + lockdep_unpin_lock(&ce->timeline->mutex, rq->cookie); + mutex_release(&ce->timeline->mutex.dep_map, _RET_IP_); + mutex_acquire(&ce->timeline->mutex.dep_map, SINGLE_DEPTH_NESTING, 0, _RET_IP_); + rq->cookie = lockdep_pin_lock(&ce->timeline->mutex); + + return rq; +} + +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) +#include "selftest_context.c" +#endif diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h new file mode 100644 index 000000000..fda2eba81 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_context.h @@ -0,0 +1,265 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#ifndef __INTEL_CONTEXT_H__ +#define __INTEL_CONTEXT_H__ + +#include <linux/bitops.h> +#include <linux/lockdep.h> +#include <linux/types.h> + +#include "i915_active.h" +#include "i915_drv.h" +#include "intel_context_types.h" +#include "intel_engine_types.h" +#include "intel_ring_types.h" +#include "intel_timeline_types.h" + +#define CE_TRACE(ce, fmt, ...) do { \ + const struct intel_context *ce__ = (ce); \ + ENGINE_TRACE(ce__->engine, "context:%llx " fmt, \ + ce__->timeline->fence_context, \ + ##__VA_ARGS__); \ +} while (0) + +struct i915_gem_ww_ctx; + +void intel_context_init(struct intel_context *ce, + struct intel_engine_cs *engine); +void intel_context_fini(struct intel_context *ce); + +struct intel_context * +intel_context_create(struct intel_engine_cs *engine); + +int intel_context_alloc_state(struct intel_context *ce); + +void intel_context_free(struct intel_context *ce); + +int intel_context_reconfigure_sseu(struct intel_context *ce, + const struct intel_sseu sseu); + +/** + * intel_context_lock_pinned - Stablises the 'pinned' status of the HW context + * @ce - the context + * + * Acquire a lock on the pinned status of the HW context, such that the context + * can neither be bound to the GPU or unbound whilst the lock is held, i.e. + * intel_context_is_pinned() remains stable. + */ +static inline int intel_context_lock_pinned(struct intel_context *ce) + __acquires(ce->pin_mutex) +{ + return mutex_lock_interruptible(&ce->pin_mutex); +} + +/** + * intel_context_is_pinned - Reports the 'pinned' status + * @ce - the context + * + * While in use by the GPU, the context, along with its ring and page + * tables is pinned into memory and the GTT. + * + * Returns: true if the context is currently pinned for use by the GPU. + */ +static inline bool +intel_context_is_pinned(struct intel_context *ce) +{ + return atomic_read(&ce->pin_count); +} + +/** + * intel_context_unlock_pinned - Releases the earlier locking of 'pinned' status + * @ce - the context + * + * Releases the lock earlier acquired by intel_context_unlock_pinned(). + */ +static inline void intel_context_unlock_pinned(struct intel_context *ce) + __releases(ce->pin_mutex) +{ + mutex_unlock(&ce->pin_mutex); +} + +int __intel_context_do_pin(struct intel_context *ce); +int __intel_context_do_pin_ww(struct intel_context *ce, + struct i915_gem_ww_ctx *ww); + +static inline bool intel_context_pin_if_active(struct intel_context *ce) +{ + return atomic_inc_not_zero(&ce->pin_count); +} + +static inline int intel_context_pin(struct intel_context *ce) +{ + if (likely(intel_context_pin_if_active(ce))) + return 0; + + return __intel_context_do_pin(ce); +} + +static inline int intel_context_pin_ww(struct intel_context *ce, + struct i915_gem_ww_ctx *ww) +{ + if (likely(intel_context_pin_if_active(ce))) + return 0; + + return __intel_context_do_pin_ww(ce, ww); +} + +static inline void __intel_context_pin(struct intel_context *ce) +{ + GEM_BUG_ON(!intel_context_is_pinned(ce)); + atomic_inc(&ce->pin_count); +} + +void intel_context_unpin(struct intel_context *ce); + +void intel_context_enter_engine(struct intel_context *ce); +void intel_context_exit_engine(struct intel_context *ce); + +static inline void intel_context_enter(struct intel_context *ce) +{ + lockdep_assert_held(&ce->timeline->mutex); + if (!ce->active_count++) + ce->ops->enter(ce); +} + +static inline void intel_context_mark_active(struct intel_context *ce) +{ + lockdep_assert_held(&ce->timeline->mutex); + ++ce->active_count; +} + +static inline void intel_context_exit(struct intel_context *ce) +{ + lockdep_assert_held(&ce->timeline->mutex); + GEM_BUG_ON(!ce->active_count); + if (!--ce->active_count) + ce->ops->exit(ce); +} + +static inline struct intel_context *intel_context_get(struct intel_context *ce) +{ + kref_get(&ce->ref); + return ce; +} + +static inline void intel_context_put(struct intel_context *ce) +{ + kref_put(&ce->ref, ce->ops->destroy); +} + +static inline struct intel_timeline *__must_check +intel_context_timeline_lock(struct intel_context *ce) + __acquires(&ce->timeline->mutex) +{ + struct intel_timeline *tl = ce->timeline; + int err; + + err = mutex_lock_interruptible(&tl->mutex); + if (err) + return ERR_PTR(err); + + return tl; +} + +static inline void intel_context_timeline_unlock(struct intel_timeline *tl) + __releases(&tl->mutex) +{ + mutex_unlock(&tl->mutex); +} + +int intel_context_prepare_remote_request(struct intel_context *ce, + struct i915_request *rq); + +struct i915_request *intel_context_create_request(struct intel_context *ce); + +static inline struct intel_ring *__intel_context_ring_size(u64 sz) +{ + return u64_to_ptr(struct intel_ring, sz); +} + +static inline bool intel_context_is_barrier(const struct intel_context *ce) +{ + return test_bit(CONTEXT_BARRIER_BIT, &ce->flags); +} + +static inline bool intel_context_is_closed(const struct intel_context *ce) +{ + return test_bit(CONTEXT_CLOSED_BIT, &ce->flags); +} + +static inline bool intel_context_use_semaphores(const struct intel_context *ce) +{ + return test_bit(CONTEXT_USE_SEMAPHORES, &ce->flags); +} + +static inline void intel_context_set_use_semaphores(struct intel_context *ce) +{ + set_bit(CONTEXT_USE_SEMAPHORES, &ce->flags); +} + +static inline void intel_context_clear_use_semaphores(struct intel_context *ce) +{ + clear_bit(CONTEXT_USE_SEMAPHORES, &ce->flags); +} + +static inline bool intel_context_is_banned(const struct intel_context *ce) +{ + return test_bit(CONTEXT_BANNED, &ce->flags); +} + +static inline bool intel_context_set_banned(struct intel_context *ce) +{ + return test_and_set_bit(CONTEXT_BANNED, &ce->flags); +} + +static inline bool +intel_context_force_single_submission(const struct intel_context *ce) +{ + return test_bit(CONTEXT_FORCE_SINGLE_SUBMISSION, &ce->flags); +} + +static inline void +intel_context_set_single_submission(struct intel_context *ce) +{ + __set_bit(CONTEXT_FORCE_SINGLE_SUBMISSION, &ce->flags); +} + +static inline bool +intel_context_nopreempt(const struct intel_context *ce) +{ + return test_bit(CONTEXT_NOPREEMPT, &ce->flags); +} + +static inline void +intel_context_set_nopreempt(struct intel_context *ce) +{ + set_bit(CONTEXT_NOPREEMPT, &ce->flags); +} + +static inline void +intel_context_clear_nopreempt(struct intel_context *ce) +{ + clear_bit(CONTEXT_NOPREEMPT, &ce->flags); +} + +static inline u64 intel_context_get_total_runtime_ns(struct intel_context *ce) +{ + const u32 period = + RUNTIME_INFO(ce->engine->i915)->cs_timestamp_period_ns; + + return READ_ONCE(ce->runtime.total) * period; +} + +static inline u64 intel_context_get_avg_runtime_ns(struct intel_context *ce) +{ + const u32 period = + RUNTIME_INFO(ce->engine->i915)->cs_timestamp_period_ns; + + return mul_u32_u32(ewma_runtime_read(&ce->runtime.avg), period); +} + +#endif /* __INTEL_CONTEXT_H__ */ diff --git a/drivers/gpu/drm/i915/gt/intel_context_param.c b/drivers/gpu/drm/i915/gt/intel_context_param.c new file mode 100644 index 000000000..65dcd0902 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_context_param.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2019 Intel Corporation + */ + +#include "i915_active.h" +#include "intel_context.h" +#include "intel_context_param.h" +#include "intel_ring.h" + +int intel_context_set_ring_size(struct intel_context *ce, long sz) +{ + int err; + + if (intel_context_lock_pinned(ce)) + return -EINTR; + + err = i915_active_wait(&ce->active); + if (err < 0) + goto unlock; + + if (intel_context_is_pinned(ce)) { + err = -EBUSY; /* In active use, come back later! */ + goto unlock; + } + + if (test_bit(CONTEXT_ALLOC_BIT, &ce->flags)) { + struct intel_ring *ring; + + /* Replace the existing ringbuffer */ + ring = intel_engine_create_ring(ce->engine, sz); + if (IS_ERR(ring)) { + err = PTR_ERR(ring); + goto unlock; + } + + intel_ring_put(ce->ring); + ce->ring = ring; + + /* Context image will be updated on next pin */ + } else { + ce->ring = __intel_context_ring_size(sz); + } + +unlock: + intel_context_unlock_pinned(ce); + return err; +} + +long intel_context_get_ring_size(struct intel_context *ce) +{ + long sz = (unsigned long)READ_ONCE(ce->ring); + + if (test_bit(CONTEXT_ALLOC_BIT, &ce->flags)) { + if (intel_context_lock_pinned(ce)) + return -EINTR; + + sz = ce->ring->size; + intel_context_unlock_pinned(ce); + } + + return sz; +} diff --git a/drivers/gpu/drm/i915/gt/intel_context_param.h b/drivers/gpu/drm/i915/gt/intel_context_param.h new file mode 100644 index 000000000..f053d8633 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_context_param.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2019 Intel Corporation + */ + +#ifndef INTEL_CONTEXT_PARAM_H +#define INTEL_CONTEXT_PARAM_H + +struct intel_context; + +int intel_context_set_ring_size(struct intel_context *ce, long sz); +long intel_context_get_ring_size(struct intel_context *ce); + +#endif /* INTEL_CONTEXT_PARAM_H */ diff --git a/drivers/gpu/drm/i915/gt/intel_context_sseu.c b/drivers/gpu/drm/i915/gt/intel_context_sseu.c new file mode 100644 index 000000000..b9c816397 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_context_sseu.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2019 Intel Corporation + */ + +#include "i915_drv.h" +#include "i915_vma.h" +#include "intel_context.h" +#include "intel_engine_pm.h" +#include "intel_gpu_commands.h" +#include "intel_lrc.h" +#include "intel_lrc_reg.h" +#include "intel_ring.h" +#include "intel_sseu.h" + +static int gen8_emit_rpcs_config(struct i915_request *rq, + const struct intel_context *ce, + const struct intel_sseu sseu) +{ + u64 offset; + u32 *cs; + + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + offset = i915_ggtt_offset(ce->state) + + LRC_STATE_OFFSET + CTX_R_PWR_CLK_STATE * 4; + + *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; + *cs++ = lower_32_bits(offset); + *cs++ = upper_32_bits(offset); + *cs++ = intel_sseu_make_rpcs(rq->engine->gt, &sseu); + + intel_ring_advance(rq, cs); + + return 0; +} + +static int +gen8_modify_rpcs(struct intel_context *ce, const struct intel_sseu sseu) +{ + struct i915_request *rq; + int ret; + + lockdep_assert_held(&ce->pin_mutex); + + /* + * If the context is not idle, we have to submit an ordered request to + * modify its context image via the kernel context (writing to our own + * image, or into the registers directory, does not stick). Pristine + * and idle contexts will be configured on pinning. + */ + if (!intel_context_pin_if_active(ce)) + return 0; + + rq = intel_engine_create_kernel_request(ce->engine); + if (IS_ERR(rq)) { + ret = PTR_ERR(rq); + goto out_unpin; + } + + /* Serialise with the remote context */ + ret = intel_context_prepare_remote_request(ce, rq); + if (ret == 0) + ret = gen8_emit_rpcs_config(rq, ce, sseu); + + i915_request_add(rq); +out_unpin: + intel_context_unpin(ce); + return ret; +} + +int +intel_context_reconfigure_sseu(struct intel_context *ce, + const struct intel_sseu sseu) +{ + int ret; + + GEM_BUG_ON(INTEL_GEN(ce->engine->i915) < 8); + + ret = intel_context_lock_pinned(ce); + if (ret) + return ret; + + /* Nothing to do if unmodified. */ + if (!memcmp(&ce->sseu, &sseu, sizeof(sseu))) + goto unlock; + + ret = gen8_modify_rpcs(ce, sseu); + if (!ret) + ce->sseu = sseu; + +unlock: + intel_context_unlock_pinned(ce); + return ret; +} diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h new file mode 100644 index 000000000..52fa9c132 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h @@ -0,0 +1,129 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#ifndef __INTEL_CONTEXT_TYPES__ +#define __INTEL_CONTEXT_TYPES__ + +#include <linux/average.h> +#include <linux/kref.h> +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/types.h> + +#include "i915_active_types.h" +#include "i915_utils.h" +#include "intel_engine_types.h" +#include "intel_sseu.h" + +#define CONTEXT_REDZONE POISON_INUSE + +DECLARE_EWMA(runtime, 3, 8); + +struct i915_gem_context; +struct i915_gem_ww_ctx; +struct i915_vma; +struct intel_breadcrumbs; +struct intel_context; +struct intel_ring; + +struct intel_context_ops { + int (*alloc)(struct intel_context *ce); + + int (*pre_pin)(struct intel_context *ce, struct i915_gem_ww_ctx *ww, void **vaddr); + int (*pin)(struct intel_context *ce, void *vaddr); + void (*unpin)(struct intel_context *ce); + void (*post_unpin)(struct intel_context *ce); + + void (*enter)(struct intel_context *ce); + void (*exit)(struct intel_context *ce); + + void (*reset)(struct intel_context *ce); + void (*destroy)(struct kref *kref); +}; + +struct intel_context { + /* + * Note: Some fields may be accessed under RCU. + * + * Unless otherwise noted a field can safely be assumed to be protected + * by strong reference counting. + */ + union { + struct kref ref; /* no kref_get_unless_zero()! */ + struct rcu_head rcu; + }; + + struct intel_engine_cs *engine; + struct intel_engine_cs *inflight; +#define intel_context_inflight(ce) ptr_mask_bits(READ_ONCE((ce)->inflight), 2) +#define intel_context_inflight_count(ce) ptr_unmask_bits(READ_ONCE((ce)->inflight), 2) + + struct i915_address_space *vm; + struct i915_gem_context __rcu *gem_context; + + /* + * @signal_lock protects the list of requests that need signaling, + * @signals. While there are any requests that need signaling, + * we add the context to the breadcrumbs worker, and remove it + * upon completion/cancellation of the last request. + */ + struct list_head signal_link; /* Accessed under RCU */ + struct list_head signals; /* Guarded by signal_lock */ + spinlock_t signal_lock; /* protects signals, the list of requests */ + + struct i915_vma *state; + struct intel_ring *ring; + struct intel_timeline *timeline; + + unsigned long flags; +#define CONTEXT_BARRIER_BIT 0 +#define CONTEXT_ALLOC_BIT 1 +#define CONTEXT_VALID_BIT 2 +#define CONTEXT_CLOSED_BIT 3 +#define CONTEXT_USE_SEMAPHORES 4 +#define CONTEXT_BANNED 5 +#define CONTEXT_FORCE_SINGLE_SUBMISSION 6 +#define CONTEXT_NOPREEMPT 7 + + u32 *lrc_reg_state; + union { + struct { + u32 lrca; + u32 ccid; + }; + u64 desc; + } lrc; + u32 tag; /* cookie passed to HW to track this context on submission */ + + /* Time on GPU as tracked by the hw. */ + struct { + struct ewma_runtime avg; + u64 total; + u32 last; + I915_SELFTEST_DECLARE(u32 num_underflow); + I915_SELFTEST_DECLARE(u32 max_underflow); + } runtime; + + unsigned int active_count; /* protected by timeline->mutex */ + + atomic_t pin_count; + struct mutex pin_mutex; /* guards pinning and associated on-gpuing */ + + /** + * active: Active tracker for the rq activity (inc. external) on this + * intel_context object. + */ + struct i915_active active; + + const struct intel_context_ops *ops; + + /** sseu: Control eu/slice partitioning */ + struct intel_sseu sseu; + + u8 wa_bb_page; /* if set, page num reserved for context workarounds */ +}; + +#endif /* __INTEL_CONTEXT_TYPES__ */ diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h new file mode 100644 index 000000000..760fefdfe --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_engine.h @@ -0,0 +1,364 @@ +/* SPDX-License-Identifier: MIT */ +#ifndef _INTEL_RINGBUFFER_H_ +#define _INTEL_RINGBUFFER_H_ + +#include <drm/drm_util.h> + +#include <linux/hashtable.h> +#include <linux/irq_work.h> +#include <linux/random.h> +#include <linux/seqlock.h> + +#include "i915_pmu.h" +#include "i915_reg.h" +#include "i915_request.h" +#include "i915_selftest.h" +#include "gt/intel_timeline.h" +#include "intel_engine_types.h" +#include "intel_gpu_commands.h" +#include "intel_workarounds.h" + +struct drm_printer; +struct intel_gt; + +/* Early gen2 devices have a cacheline of just 32 bytes, using 64 is overkill, + * but keeps the logic simple. Indeed, the whole purpose of this macro is just + * to give some inclination as to some of the magic values used in the various + * workarounds! + */ +#define CACHELINE_BYTES 64 +#define CACHELINE_DWORDS (CACHELINE_BYTES / sizeof(u32)) + +#define ENGINE_TRACE(e, fmt, ...) do { \ + const struct intel_engine_cs *e__ __maybe_unused = (e); \ + GEM_TRACE("%s %s: " fmt, \ + dev_name(e__->i915->drm.dev), e__->name, \ + ##__VA_ARGS__); \ +} while (0) + +/* + * The register defines to be used with the following macros need to accept a + * base param, e.g: + * + * REG_FOO(base) _MMIO((base) + <relative offset>) + * ENGINE_READ(engine, REG_FOO); + * + * register arrays are to be defined and accessed as follows: + * + * REG_BAR(base, i) _MMIO((base) + <relative offset> + (i) * <shift>) + * ENGINE_READ_IDX(engine, REG_BAR, i) + */ + +#define __ENGINE_REG_OP(op__, engine__, ...) \ + intel_uncore_##op__((engine__)->uncore, __VA_ARGS__) + +#define __ENGINE_READ_OP(op__, engine__, reg__) \ + __ENGINE_REG_OP(op__, (engine__), reg__((engine__)->mmio_base)) + +#define ENGINE_READ16(...) __ENGINE_READ_OP(read16, __VA_ARGS__) +#define ENGINE_READ(...) __ENGINE_READ_OP(read, __VA_ARGS__) +#define ENGINE_READ_FW(...) __ENGINE_READ_OP(read_fw, __VA_ARGS__) +#define ENGINE_POSTING_READ(...) __ENGINE_READ_OP(posting_read_fw, __VA_ARGS__) +#define ENGINE_POSTING_READ16(...) __ENGINE_READ_OP(posting_read16, __VA_ARGS__) + +#define ENGINE_READ64(engine__, lower_reg__, upper_reg__) \ + __ENGINE_REG_OP(read64_2x32, (engine__), \ + lower_reg__((engine__)->mmio_base), \ + upper_reg__((engine__)->mmio_base)) + +#define ENGINE_READ_IDX(engine__, reg__, idx__) \ + __ENGINE_REG_OP(read, (engine__), reg__((engine__)->mmio_base, (idx__))) + +#define __ENGINE_WRITE_OP(op__, engine__, reg__, val__) \ + __ENGINE_REG_OP(op__, (engine__), reg__((engine__)->mmio_base), (val__)) + +#define ENGINE_WRITE16(...) __ENGINE_WRITE_OP(write16, __VA_ARGS__) +#define ENGINE_WRITE(...) __ENGINE_WRITE_OP(write, __VA_ARGS__) +#define ENGINE_WRITE_FW(...) __ENGINE_WRITE_OP(write_fw, __VA_ARGS__) + +#define GEN6_RING_FAULT_REG_READ(engine__) \ + intel_uncore_read((engine__)->uncore, RING_FAULT_REG(engine__)) + +#define GEN6_RING_FAULT_REG_POSTING_READ(engine__) \ + intel_uncore_posting_read((engine__)->uncore, RING_FAULT_REG(engine__)) + +#define GEN6_RING_FAULT_REG_RMW(engine__, clear__, set__) \ +({ \ + u32 __val; \ +\ + __val = intel_uncore_read((engine__)->uncore, \ + RING_FAULT_REG(engine__)); \ + __val &= ~(clear__); \ + __val |= (set__); \ + intel_uncore_write((engine__)->uncore, RING_FAULT_REG(engine__), \ + __val); \ +}) + +/* seqno size is actually only a uint32, but since we plan to use MI_FLUSH_DW to + * do the writes, and that must have qw aligned offsets, simply pretend it's 8b. + */ + +static inline unsigned int +execlists_num_ports(const struct intel_engine_execlists * const execlists) +{ + return execlists->port_mask + 1; +} + +static inline struct i915_request * +execlists_active(const struct intel_engine_execlists *execlists) +{ + struct i915_request * const *cur, * const *old, *active; + + cur = READ_ONCE(execlists->active); + smp_rmb(); /* pairs with overwrite protection in process_csb() */ + do { + old = cur; + + active = READ_ONCE(*cur); + cur = READ_ONCE(execlists->active); + + smp_rmb(); /* and complete the seqlock retry */ + } while (unlikely(cur != old)); + + return active; +} + +static inline void +execlists_active_lock_bh(struct intel_engine_execlists *execlists) +{ + local_bh_disable(); /* prevent local softirq and lock recursion */ + tasklet_lock(&execlists->tasklet); +} + +static inline void +execlists_active_unlock_bh(struct intel_engine_execlists *execlists) +{ + tasklet_unlock(&execlists->tasklet); + local_bh_enable(); /* restore softirq, and kick ksoftirqd! */ +} + +struct i915_request * +execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists); + +static inline u32 +intel_read_status_page(const struct intel_engine_cs *engine, int reg) +{ + /* Ensure that the compiler doesn't optimize away the load. */ + return READ_ONCE(engine->status_page.addr[reg]); +} + +static inline void +intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value) +{ + /* Writing into the status page should be done sparingly. Since + * we do when we are uncertain of the device state, we take a bit + * of extra paranoia to try and ensure that the HWS takes the value + * we give and that it doesn't end up trapped inside the CPU! + */ + if (static_cpu_has(X86_FEATURE_CLFLUSH)) { + mb(); + clflush(&engine->status_page.addr[reg]); + engine->status_page.addr[reg] = value; + clflush(&engine->status_page.addr[reg]); + mb(); + } else { + WRITE_ONCE(engine->status_page.addr[reg], value); + } +} + +/* + * Reads a dword out of the status page, which is written to from the command + * queue by automatic updates, MI_REPORT_HEAD, MI_STORE_DATA_INDEX, or + * MI_STORE_DATA_IMM. + * + * The following dwords have a reserved meaning: + * 0x00: ISR copy, updated when an ISR bit not set in the HWSTAM changes. + * 0x04: ring 0 head pointer + * 0x05: ring 1 head pointer (915-class) + * 0x06: ring 2 head pointer (915-class) + * 0x10-0x1b: Context status DWords (GM45) + * 0x1f: Last written status offset. (GM45) + * 0x20-0x2f: Reserved (Gen6+) + * + * The area from dword 0x30 to 0x3ff is available for driver usage. + */ +#define I915_GEM_HWS_PREEMPT 0x32 +#define I915_GEM_HWS_PREEMPT_ADDR (I915_GEM_HWS_PREEMPT * sizeof(u32)) +#define I915_GEM_HWS_SEQNO 0x40 +#define I915_GEM_HWS_SEQNO_ADDR (I915_GEM_HWS_SEQNO * sizeof(u32)) +#define I915_GEM_HWS_SCRATCH 0x80 + +#define I915_HWS_CSB_BUF0_INDEX 0x10 +#define I915_HWS_CSB_WRITE_INDEX 0x1f +#define CNL_HWS_CSB_WRITE_INDEX 0x2f + +void intel_engine_stop(struct intel_engine_cs *engine); +void intel_engine_cleanup(struct intel_engine_cs *engine); + +int intel_engines_init_mmio(struct intel_gt *gt); +int intel_engines_init(struct intel_gt *gt); + +void intel_engine_free_request_pool(struct intel_engine_cs *engine); + +void intel_engines_release(struct intel_gt *gt); +void intel_engines_free(struct intel_gt *gt); + +int intel_engine_init_common(struct intel_engine_cs *engine); +void intel_engine_cleanup_common(struct intel_engine_cs *engine); + +int intel_engine_resume(struct intel_engine_cs *engine); + +int intel_ring_submission_setup(struct intel_engine_cs *engine); + +int intel_engine_stop_cs(struct intel_engine_cs *engine); +void intel_engine_cancel_stop_cs(struct intel_engine_cs *engine); + +void intel_engine_set_hwsp_writemask(struct intel_engine_cs *engine, u32 mask); + +u64 intel_engine_get_active_head(const struct intel_engine_cs *engine); +u64 intel_engine_get_last_batch_head(const struct intel_engine_cs *engine); + +void intel_engine_get_instdone(const struct intel_engine_cs *engine, + struct intel_instdone *instdone); + +void intel_engine_init_execlists(struct intel_engine_cs *engine); + +static inline u32 *__gen8_emit_pipe_control(u32 *batch, u32 flags0, u32 flags1, u32 offset) +{ + memset(batch, 0, 6 * sizeof(u32)); + + batch[0] = GFX_OP_PIPE_CONTROL(6) | flags0; + batch[1] = flags1; + batch[2] = offset; + + return batch + 6; +} + +static inline u32 *gen8_emit_pipe_control(u32 *batch, u32 flags, u32 offset) +{ + return __gen8_emit_pipe_control(batch, 0, flags, offset); +} + +static inline u32 *gen12_emit_pipe_control(u32 *batch, u32 flags0, u32 flags1, u32 offset) +{ + return __gen8_emit_pipe_control(batch, flags0, flags1, offset); +} + +static inline u32 * +__gen8_emit_write_rcs(u32 *cs, u32 value, u32 offset, u32 flags0, u32 flags1) +{ + *cs++ = GFX_OP_PIPE_CONTROL(6) | flags0; + *cs++ = flags1 | PIPE_CONTROL_QW_WRITE; + *cs++ = offset; + *cs++ = 0; + *cs++ = value; + *cs++ = 0; /* We're thrashing one extra dword. */ + + return cs; +} + +static inline u32* +gen8_emit_ggtt_write_rcs(u32 *cs, u32 value, u32 gtt_offset, u32 flags) +{ + /* We're using qword write, offset should be aligned to 8 bytes. */ + GEM_BUG_ON(!IS_ALIGNED(gtt_offset, 8)); + + return __gen8_emit_write_rcs(cs, + value, + gtt_offset, + 0, + flags | PIPE_CONTROL_GLOBAL_GTT_IVB); +} + +static inline u32* +gen12_emit_ggtt_write_rcs(u32 *cs, u32 value, u32 gtt_offset, u32 flags0, u32 flags1) +{ + /* We're using qword write, offset should be aligned to 8 bytes. */ + GEM_BUG_ON(!IS_ALIGNED(gtt_offset, 8)); + + return __gen8_emit_write_rcs(cs, + value, + gtt_offset, + flags0, + flags1 | PIPE_CONTROL_GLOBAL_GTT_IVB); +} + +static inline u32 * +__gen8_emit_flush_dw(u32 *cs, u32 value, u32 gtt_offset, u32 flags) +{ + *cs++ = (MI_FLUSH_DW + 1) | flags; + *cs++ = gtt_offset; + *cs++ = 0; + *cs++ = value; + + return cs; +} + +static inline u32 * +gen8_emit_ggtt_write(u32 *cs, u32 value, u32 gtt_offset, u32 flags) +{ + /* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */ + GEM_BUG_ON(gtt_offset & (1 << 5)); + /* Offset should be aligned to 8 bytes for both (QW/DW) write types */ + GEM_BUG_ON(!IS_ALIGNED(gtt_offset, 8)); + + return __gen8_emit_flush_dw(cs, + value, + gtt_offset | MI_FLUSH_DW_USE_GTT, + flags | MI_FLUSH_DW_OP_STOREDW); +} + +static inline void __intel_engine_reset(struct intel_engine_cs *engine, + bool stalled) +{ + if (engine->reset.rewind) + engine->reset.rewind(engine, stalled); + engine->serial++; /* contexts lost */ +} + +bool intel_engines_are_idle(struct intel_gt *gt); +bool intel_engine_is_idle(struct intel_engine_cs *engine); +void intel_engine_flush_submission(struct intel_engine_cs *engine); + +void intel_engines_reset_default_submission(struct intel_gt *gt); + +bool intel_engine_can_store_dword(struct intel_engine_cs *engine); + +__printf(3, 4) +void intel_engine_dump(struct intel_engine_cs *engine, + struct drm_printer *m, + const char *header, ...); + +ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, + ktime_t *now); + +struct i915_request * +intel_engine_find_active_request(struct intel_engine_cs *engine); + +u32 intel_engine_context_size(struct intel_gt *gt, u8 class); + +void intel_engine_init_active(struct intel_engine_cs *engine, + unsigned int subclass); +#define ENGINE_PHYSICAL 0 +#define ENGINE_MOCK 1 +#define ENGINE_VIRTUAL 2 + +static inline bool +intel_engine_has_preempt_reset(const struct intel_engine_cs *engine) +{ + if (!IS_ACTIVE(CONFIG_DRM_I915_PREEMPT_TIMEOUT)) + return false; + + return intel_engine_has_preemption(engine); +} + +static inline bool +intel_engine_has_heartbeat(const struct intel_engine_cs *engine) +{ + if (!IS_ACTIVE(CONFIG_DRM_I915_HEARTBEAT_INTERVAL)) + return false; + + return READ_ONCE(engine->props.heartbeat_interval_ms); +} + +#endif /* _INTEL_RINGBUFFER_H_ */ diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c new file mode 100644 index 000000000..a19537706 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -0,0 +1,1805 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include <drm/drm_print.h> + +#include "gem/i915_gem_context.h" + +#include "i915_drv.h" + +#include "intel_breadcrumbs.h" +#include "intel_context.h" +#include "intel_engine.h" +#include "intel_engine_pm.h" +#include "intel_engine_user.h" +#include "intel_gt.h" +#include "intel_gt_requests.h" +#include "intel_gt_pm.h" +#include "intel_lrc.h" +#include "intel_reset.h" +#include "intel_ring.h" + +/* Haswell does have the CXT_SIZE register however it does not appear to be + * valid. Now, docs explain in dwords what is in the context object. The full + * size is 70720 bytes, however, the power context and execlist context will + * never be saved (power context is stored elsewhere, and execlists don't work + * on HSW) - so the final size, including the extra state required for the + * Resource Streamer, is 66944 bytes, which rounds to 17 pages. + */ +#define HSW_CXT_TOTAL_SIZE (17 * PAGE_SIZE) + +#define DEFAULT_LR_CONTEXT_RENDER_SIZE (22 * PAGE_SIZE) +#define GEN8_LR_CONTEXT_RENDER_SIZE (20 * PAGE_SIZE) +#define GEN9_LR_CONTEXT_RENDER_SIZE (22 * PAGE_SIZE) +#define GEN10_LR_CONTEXT_RENDER_SIZE (18 * PAGE_SIZE) +#define GEN11_LR_CONTEXT_RENDER_SIZE (14 * PAGE_SIZE) + +#define GEN8_LR_CONTEXT_OTHER_SIZE ( 2 * PAGE_SIZE) + +#define MAX_MMIO_BASES 3 +struct engine_info { + unsigned int hw_id; + u8 class; + u8 instance; + /* mmio bases table *must* be sorted in reverse gen order */ + struct engine_mmio_base { + u32 gen : 8; + u32 base : 24; + } mmio_bases[MAX_MMIO_BASES]; +}; + +static const struct engine_info intel_engines[] = { + [RCS0] = { + .hw_id = RCS0_HW, + .class = RENDER_CLASS, + .instance = 0, + .mmio_bases = { + { .gen = 1, .base = RENDER_RING_BASE } + }, + }, + [BCS0] = { + .hw_id = BCS0_HW, + .class = COPY_ENGINE_CLASS, + .instance = 0, + .mmio_bases = { + { .gen = 6, .base = BLT_RING_BASE } + }, + }, + [VCS0] = { + .hw_id = VCS0_HW, + .class = VIDEO_DECODE_CLASS, + .instance = 0, + .mmio_bases = { + { .gen = 11, .base = GEN11_BSD_RING_BASE }, + { .gen = 6, .base = GEN6_BSD_RING_BASE }, + { .gen = 4, .base = BSD_RING_BASE } + }, + }, + [VCS1] = { + .hw_id = VCS1_HW, + .class = VIDEO_DECODE_CLASS, + .instance = 1, + .mmio_bases = { + { .gen = 11, .base = GEN11_BSD2_RING_BASE }, + { .gen = 8, .base = GEN8_BSD2_RING_BASE } + }, + }, + [VCS2] = { + .hw_id = VCS2_HW, + .class = VIDEO_DECODE_CLASS, + .instance = 2, + .mmio_bases = { + { .gen = 11, .base = GEN11_BSD3_RING_BASE } + }, + }, + [VCS3] = { + .hw_id = VCS3_HW, + .class = VIDEO_DECODE_CLASS, + .instance = 3, + .mmio_bases = { + { .gen = 11, .base = GEN11_BSD4_RING_BASE } + }, + }, + [VECS0] = { + .hw_id = VECS0_HW, + .class = VIDEO_ENHANCEMENT_CLASS, + .instance = 0, + .mmio_bases = { + { .gen = 11, .base = GEN11_VEBOX_RING_BASE }, + { .gen = 7, .base = VEBOX_RING_BASE } + }, + }, + [VECS1] = { + .hw_id = VECS1_HW, + .class = VIDEO_ENHANCEMENT_CLASS, + .instance = 1, + .mmio_bases = { + { .gen = 11, .base = GEN11_VEBOX2_RING_BASE } + }, + }, +}; + +/** + * intel_engine_context_size() - return the size of the context for an engine + * @gt: the gt + * @class: engine class + * + * Each engine class may require a different amount of space for a context + * image. + * + * Return: size (in bytes) of an engine class specific context image + * + * Note: this size includes the HWSP, which is part of the context image + * in LRC mode, but does not include the "shared data page" used with + * GuC submission. The caller should account for this if using the GuC. + */ +u32 intel_engine_context_size(struct intel_gt *gt, u8 class) +{ + struct intel_uncore *uncore = gt->uncore; + u32 cxt_size; + + BUILD_BUG_ON(I915_GTT_PAGE_SIZE != PAGE_SIZE); + + switch (class) { + case RENDER_CLASS: + switch (INTEL_GEN(gt->i915)) { + default: + MISSING_CASE(INTEL_GEN(gt->i915)); + return DEFAULT_LR_CONTEXT_RENDER_SIZE; + case 12: + case 11: + return GEN11_LR_CONTEXT_RENDER_SIZE; + case 10: + return GEN10_LR_CONTEXT_RENDER_SIZE; + case 9: + return GEN9_LR_CONTEXT_RENDER_SIZE; + case 8: + return GEN8_LR_CONTEXT_RENDER_SIZE; + case 7: + if (IS_HASWELL(gt->i915)) + return HSW_CXT_TOTAL_SIZE; + + cxt_size = intel_uncore_read(uncore, GEN7_CXT_SIZE); + return round_up(GEN7_CXT_TOTAL_SIZE(cxt_size) * 64, + PAGE_SIZE); + case 6: + cxt_size = intel_uncore_read(uncore, CXT_SIZE); + return round_up(GEN6_CXT_TOTAL_SIZE(cxt_size) * 64, + PAGE_SIZE); + case 5: + case 4: + /* + * There is a discrepancy here between the size reported + * by the register and the size of the context layout + * in the docs. Both are described as authorative! + * + * The discrepancy is on the order of a few cachelines, + * but the total is under one page (4k), which is our + * minimum allocation anyway so it should all come + * out in the wash. + */ + cxt_size = intel_uncore_read(uncore, CXT_SIZE) + 1; + drm_dbg(>->i915->drm, + "gen%d CXT_SIZE = %d bytes [0x%08x]\n", + INTEL_GEN(gt->i915), cxt_size * 64, + cxt_size - 1); + return round_up(cxt_size * 64, PAGE_SIZE); + case 3: + case 2: + /* For the special day when i810 gets merged. */ + case 1: + return 0; + } + break; + default: + MISSING_CASE(class); + fallthrough; + case VIDEO_DECODE_CLASS: + case VIDEO_ENHANCEMENT_CLASS: + case COPY_ENGINE_CLASS: + if (INTEL_GEN(gt->i915) < 8) + return 0; + return GEN8_LR_CONTEXT_OTHER_SIZE; + } +} + +static u32 __engine_mmio_base(struct drm_i915_private *i915, + const struct engine_mmio_base *bases) +{ + int i; + + for (i = 0; i < MAX_MMIO_BASES; i++) + if (INTEL_GEN(i915) >= bases[i].gen) + break; + + GEM_BUG_ON(i == MAX_MMIO_BASES); + GEM_BUG_ON(!bases[i].base); + + return bases[i].base; +} + +static void __sprint_engine_name(struct intel_engine_cs *engine) +{ + /* + * Before we know what the uABI name for this engine will be, + * we still would like to keep track of this engine in the debug logs. + * We throw in a ' here as a reminder that this isn't its final name. + */ + GEM_WARN_ON(snprintf(engine->name, sizeof(engine->name), "%s'%u", + intel_engine_class_repr(engine->class), + engine->instance) >= sizeof(engine->name)); +} + +void intel_engine_set_hwsp_writemask(struct intel_engine_cs *engine, u32 mask) +{ + /* + * Though they added more rings on g4x/ilk, they did not add + * per-engine HWSTAM until gen6. + */ + if (INTEL_GEN(engine->i915) < 6 && engine->class != RENDER_CLASS) + return; + + if (INTEL_GEN(engine->i915) >= 3) + ENGINE_WRITE(engine, RING_HWSTAM, mask); + else + ENGINE_WRITE16(engine, RING_HWSTAM, mask); +} + +static void intel_engine_sanitize_mmio(struct intel_engine_cs *engine) +{ + /* Mask off all writes into the unknown HWSP */ + intel_engine_set_hwsp_writemask(engine, ~0u); +} + +static int intel_engine_setup(struct intel_gt *gt, enum intel_engine_id id) +{ + const struct engine_info *info = &intel_engines[id]; + struct drm_i915_private *i915 = gt->i915; + struct intel_engine_cs *engine; + + BUILD_BUG_ON(MAX_ENGINE_CLASS >= BIT(GEN11_ENGINE_CLASS_WIDTH)); + BUILD_BUG_ON(MAX_ENGINE_INSTANCE >= BIT(GEN11_ENGINE_INSTANCE_WIDTH)); + + if (GEM_DEBUG_WARN_ON(id >= ARRAY_SIZE(gt->engine))) + return -EINVAL; + + if (GEM_DEBUG_WARN_ON(info->class > MAX_ENGINE_CLASS)) + return -EINVAL; + + if (GEM_DEBUG_WARN_ON(info->instance > MAX_ENGINE_INSTANCE)) + return -EINVAL; + + if (GEM_DEBUG_WARN_ON(gt->engine_class[info->class][info->instance])) + return -EINVAL; + + engine = kzalloc(sizeof(*engine), GFP_KERNEL); + if (!engine) + return -ENOMEM; + + BUILD_BUG_ON(BITS_PER_TYPE(engine->mask) < I915_NUM_ENGINES); + + engine->id = id; + engine->legacy_idx = INVALID_ENGINE; + engine->mask = BIT(id); + engine->i915 = i915; + engine->gt = gt; + engine->uncore = gt->uncore; + engine->hw_id = engine->guc_id = info->hw_id; + engine->mmio_base = __engine_mmio_base(i915, info->mmio_bases); + + engine->class = info->class; + engine->instance = info->instance; + __sprint_engine_name(engine); + + engine->props.heartbeat_interval_ms = + CONFIG_DRM_I915_HEARTBEAT_INTERVAL; + engine->props.max_busywait_duration_ns = + CONFIG_DRM_I915_MAX_REQUEST_BUSYWAIT; + engine->props.preempt_timeout_ms = + CONFIG_DRM_I915_PREEMPT_TIMEOUT; + engine->props.stop_timeout_ms = + CONFIG_DRM_I915_STOP_TIMEOUT; + engine->props.timeslice_duration_ms = + CONFIG_DRM_I915_TIMESLICE_DURATION; + + /* Override to uninterruptible for OpenCL workloads. */ + if (INTEL_GEN(i915) == 12 && engine->class == RENDER_CLASS) + engine->props.preempt_timeout_ms = 0; + + engine->defaults = engine->props; /* never to change again */ + + engine->context_size = intel_engine_context_size(gt, engine->class); + if (WARN_ON(engine->context_size > BIT(20))) + engine->context_size = 0; + if (engine->context_size) + DRIVER_CAPS(i915)->has_logical_contexts = true; + + /* Nothing to do here, execute in order of dependencies */ + engine->schedule = NULL; + + ewma__engine_latency_init(&engine->latency); + seqlock_init(&engine->stats.lock); + + ATOMIC_INIT_NOTIFIER_HEAD(&engine->context_status_notifier); + + /* Scrub mmio state on takeover */ + intel_engine_sanitize_mmio(engine); + + gt->engine_class[info->class][info->instance] = engine; + gt->engine[id] = engine; + + return 0; +} + +static void __setup_engine_capabilities(struct intel_engine_cs *engine) +{ + struct drm_i915_private *i915 = engine->i915; + + if (engine->class == VIDEO_DECODE_CLASS) { + /* + * HEVC support is present on first engine instance + * before Gen11 and on all instances afterwards. + */ + if (INTEL_GEN(i915) >= 11 || + (INTEL_GEN(i915) >= 9 && engine->instance == 0)) + engine->uabi_capabilities |= + I915_VIDEO_CLASS_CAPABILITY_HEVC; + + /* + * SFC block is present only on even logical engine + * instances. + */ + if ((INTEL_GEN(i915) >= 11 && + (engine->gt->info.vdbox_sfc_access & + BIT(engine->instance))) || + (INTEL_GEN(i915) >= 9 && engine->instance == 0)) + engine->uabi_capabilities |= + I915_VIDEO_AND_ENHANCE_CLASS_CAPABILITY_SFC; + } else if (engine->class == VIDEO_ENHANCEMENT_CLASS) { + if (INTEL_GEN(i915) >= 9) + engine->uabi_capabilities |= + I915_VIDEO_AND_ENHANCE_CLASS_CAPABILITY_SFC; + } +} + +static void intel_setup_engine_capabilities(struct intel_gt *gt) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + + for_each_engine(engine, gt, id) + __setup_engine_capabilities(engine); +} + +/** + * intel_engines_release() - free the resources allocated for Command Streamers + * @gt: pointer to struct intel_gt + */ +void intel_engines_release(struct intel_gt *gt) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + + /* + * Before we release the resources held by engine, we must be certain + * that the HW is no longer accessing them -- having the GPU scribble + * to or read from a page being used for something else causes no end + * of fun. + * + * The GPU should be reset by this point, but assume the worst just + * in case we aborted before completely initialising the engines. + */ + GEM_BUG_ON(intel_gt_pm_is_awake(gt)); + if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display) + __intel_gt_reset(gt, ALL_ENGINES); + + /* Decouple the backend; but keep the layout for late GPU resets */ + for_each_engine(engine, gt, id) { + if (!engine->release) + continue; + + intel_wakeref_wait_for_idle(&engine->wakeref); + GEM_BUG_ON(intel_engine_pm_is_awake(engine)); + + engine->release(engine); + engine->release = NULL; + + memset(&engine->reset, 0, sizeof(engine->reset)); + } +} + +void intel_engine_free_request_pool(struct intel_engine_cs *engine) +{ + if (!engine->request_pool) + return; + + kmem_cache_free(i915_request_slab_cache(), engine->request_pool); +} + +void intel_engines_free(struct intel_gt *gt) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + + /* Free the requests! dma-resv keeps fences around for an eternity */ + rcu_barrier(); + + for_each_engine(engine, gt, id) { + intel_engine_free_request_pool(engine); + kfree(engine); + gt->engine[id] = NULL; + } +} + +/* + * Determine which engines are fused off in our particular hardware. + * Note that we have a catch-22 situation where we need to be able to access + * the blitter forcewake domain to read the engine fuses, but at the same time + * we need to know which engines are available on the system to know which + * forcewake domains are present. We solve this by intializing the forcewake + * domains based on the full engine mask in the platform capabilities before + * calling this function and pruning the domains for fused-off engines + * afterwards. + */ +static intel_engine_mask_t init_engine_mask(struct intel_gt *gt) +{ + struct drm_i915_private *i915 = gt->i915; + struct intel_gt_info *info = >->info; + struct intel_uncore *uncore = gt->uncore; + unsigned int logical_vdbox = 0; + unsigned int i; + u32 media_fuse; + u16 vdbox_mask; + u16 vebox_mask; + + info->engine_mask = INTEL_INFO(i915)->platform_engine_mask; + + if (INTEL_GEN(i915) < 11) + return info->engine_mask; + + media_fuse = ~intel_uncore_read(uncore, GEN11_GT_VEBOX_VDBOX_DISABLE); + + vdbox_mask = media_fuse & GEN11_GT_VDBOX_DISABLE_MASK; + vebox_mask = (media_fuse & GEN11_GT_VEBOX_DISABLE_MASK) >> + GEN11_GT_VEBOX_DISABLE_SHIFT; + + for (i = 0; i < I915_MAX_VCS; i++) { + if (!HAS_ENGINE(gt, _VCS(i))) { + vdbox_mask &= ~BIT(i); + continue; + } + + if (!(BIT(i) & vdbox_mask)) { + info->engine_mask &= ~BIT(_VCS(i)); + drm_dbg(&i915->drm, "vcs%u fused off\n", i); + continue; + } + + /* + * In Gen11, only even numbered logical VDBOXes are + * hooked up to an SFC (Scaler & Format Converter) unit. + * In TGL each VDBOX has access to an SFC. + */ + if (INTEL_GEN(i915) >= 12 || logical_vdbox++ % 2 == 0) + gt->info.vdbox_sfc_access |= BIT(i); + } + drm_dbg(&i915->drm, "vdbox enable: %04x, instances: %04lx\n", + vdbox_mask, VDBOX_MASK(gt)); + GEM_BUG_ON(vdbox_mask != VDBOX_MASK(gt)); + + for (i = 0; i < I915_MAX_VECS; i++) { + if (!HAS_ENGINE(gt, _VECS(i))) { + vebox_mask &= ~BIT(i); + continue; + } + + if (!(BIT(i) & vebox_mask)) { + info->engine_mask &= ~BIT(_VECS(i)); + drm_dbg(&i915->drm, "vecs%u fused off\n", i); + } + } + drm_dbg(&i915->drm, "vebox enable: %04x, instances: %04lx\n", + vebox_mask, VEBOX_MASK(gt)); + GEM_BUG_ON(vebox_mask != VEBOX_MASK(gt)); + + return info->engine_mask; +} + +/** + * intel_engines_init_mmio() - allocate and prepare the Engine Command Streamers + * @gt: pointer to struct intel_gt + * + * Return: non-zero if the initialization failed. + */ +int intel_engines_init_mmio(struct intel_gt *gt) +{ + struct drm_i915_private *i915 = gt->i915; + const unsigned int engine_mask = init_engine_mask(gt); + unsigned int mask = 0; + unsigned int i; + int err; + + drm_WARN_ON(&i915->drm, engine_mask == 0); + drm_WARN_ON(&i915->drm, engine_mask & + GENMASK(BITS_PER_TYPE(mask) - 1, I915_NUM_ENGINES)); + + if (i915_inject_probe_failure(i915)) + return -ENODEV; + + for (i = 0; i < ARRAY_SIZE(intel_engines); i++) { + if (!HAS_ENGINE(gt, i)) + continue; + + err = intel_engine_setup(gt, i); + if (err) + goto cleanup; + + mask |= BIT(i); + } + + /* + * Catch failures to update intel_engines table when the new engines + * are added to the driver by a warning and disabling the forgotten + * engines. + */ + if (drm_WARN_ON(&i915->drm, mask != engine_mask)) + gt->info.engine_mask = mask; + + gt->info.num_engines = hweight32(mask); + + intel_gt_check_and_clear_faults(gt); + + intel_setup_engine_capabilities(gt); + + intel_uncore_prune_engine_fw_domains(gt->uncore, gt); + + return 0; + +cleanup: + intel_engines_free(gt); + return err; +} + +void intel_engine_init_execlists(struct intel_engine_cs *engine) +{ + struct intel_engine_execlists * const execlists = &engine->execlists; + + execlists->port_mask = 1; + GEM_BUG_ON(!is_power_of_2(execlists_num_ports(execlists))); + GEM_BUG_ON(execlists_num_ports(execlists) > EXECLIST_MAX_PORTS); + + memset(execlists->pending, 0, sizeof(execlists->pending)); + execlists->active = + memset(execlists->inflight, 0, sizeof(execlists->inflight)); + + execlists->queue_priority_hint = INT_MIN; + execlists->queue = RB_ROOT_CACHED; +} + +static void cleanup_status_page(struct intel_engine_cs *engine) +{ + struct i915_vma *vma; + + /* Prevent writes into HWSP after returning the page to the system */ + intel_engine_set_hwsp_writemask(engine, ~0u); + + vma = fetch_and_zero(&engine->status_page.vma); + if (!vma) + return; + + if (!HWS_NEEDS_PHYSICAL(engine->i915)) + i915_vma_unpin(vma); + + i915_gem_object_unpin_map(vma->obj); + i915_gem_object_put(vma->obj); +} + +static int pin_ggtt_status_page(struct intel_engine_cs *engine, + struct i915_vma *vma) +{ + unsigned int flags; + + if (!HAS_LLC(engine->i915) && i915_ggtt_has_aperture(engine->gt->ggtt)) + /* + * On g33, we cannot place HWS above 256MiB, so + * restrict its pinning to the low mappable arena. + * Though this restriction is not documented for + * gen4, gen5, or byt, they also behave similarly + * and hang if the HWS is placed at the top of the + * GTT. To generalise, it appears that all !llc + * platforms have issues with us placing the HWS + * above the mappable region (even though we never + * actually map it). + */ + flags = PIN_MAPPABLE; + else + flags = PIN_HIGH; + + return i915_ggtt_pin(vma, NULL, 0, flags); +} + +static int init_status_page(struct intel_engine_cs *engine) +{ + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + void *vaddr; + int ret; + + /* + * Though the HWS register does support 36bit addresses, historically + * we have had hangs and corruption reported due to wild writes if + * the HWS is placed above 4G. We only allow objects to be allocated + * in GFP_DMA32 for i965, and no earlier physical address users had + * access to more than 4G. + */ + obj = i915_gem_object_create_internal(engine->i915, PAGE_SIZE); + if (IS_ERR(obj)) { + drm_err(&engine->i915->drm, + "Failed to allocate status page\n"); + return PTR_ERR(obj); + } + + i915_gem_object_set_cache_coherency(obj, I915_CACHE_LLC); + + vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto err; + } + + vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB); + if (IS_ERR(vaddr)) { + ret = PTR_ERR(vaddr); + goto err; + } + + engine->status_page.addr = memset(vaddr, 0, PAGE_SIZE); + engine->status_page.vma = vma; + + if (!HWS_NEEDS_PHYSICAL(engine->i915)) { + ret = pin_ggtt_status_page(engine, vma); + if (ret) + goto err_unpin; + } + + return 0; + +err_unpin: + i915_gem_object_unpin_map(obj); +err: + i915_gem_object_put(obj); + return ret; +} + +static int engine_setup_common(struct intel_engine_cs *engine) +{ + int err; + + init_llist_head(&engine->barrier_tasks); + + err = init_status_page(engine); + if (err) + return err; + + engine->breadcrumbs = intel_breadcrumbs_create(engine); + if (!engine->breadcrumbs) { + err = -ENOMEM; + goto err_status; + } + + err = intel_engine_init_cmd_parser(engine); + if (err) + goto err_cmd_parser; + + intel_engine_init_active(engine, ENGINE_PHYSICAL); + intel_engine_init_execlists(engine); + intel_engine_init__pm(engine); + intel_engine_init_retire(engine); + + /* Use the whole device by default */ + engine->sseu = + intel_sseu_from_device_info(&engine->gt->info.sseu); + + intel_engine_init_workarounds(engine); + intel_engine_init_whitelist(engine); + intel_engine_init_ctx_wa(engine); + + return 0; + +err_cmd_parser: + intel_breadcrumbs_free(engine->breadcrumbs); +err_status: + cleanup_status_page(engine); + return err; +} + +struct measure_breadcrumb { + struct i915_request rq; + struct intel_ring ring; + u32 cs[2048]; +}; + +static int measure_breadcrumb_dw(struct intel_context *ce) +{ + struct intel_engine_cs *engine = ce->engine; + struct measure_breadcrumb *frame; + int dw; + + GEM_BUG_ON(!engine->gt->scratch); + + frame = kzalloc(sizeof(*frame), GFP_KERNEL); + if (!frame) + return -ENOMEM; + + frame->rq.engine = engine; + frame->rq.context = ce; + rcu_assign_pointer(frame->rq.timeline, ce->timeline); + + frame->ring.vaddr = frame->cs; + frame->ring.size = sizeof(frame->cs); + frame->ring.wrap = + BITS_PER_TYPE(frame->ring.size) - ilog2(frame->ring.size); + frame->ring.effective_size = frame->ring.size; + intel_ring_update_space(&frame->ring); + frame->rq.ring = &frame->ring; + + mutex_lock(&ce->timeline->mutex); + spin_lock_irq(&engine->active.lock); + + dw = engine->emit_fini_breadcrumb(&frame->rq, frame->cs) - frame->cs; + + spin_unlock_irq(&engine->active.lock); + mutex_unlock(&ce->timeline->mutex); + + GEM_BUG_ON(dw & 1); /* RING_TAIL must be qword aligned */ + + kfree(frame); + return dw; +} + +void +intel_engine_init_active(struct intel_engine_cs *engine, unsigned int subclass) +{ + INIT_LIST_HEAD(&engine->active.requests); + INIT_LIST_HEAD(&engine->active.hold); + + spin_lock_init(&engine->active.lock); + lockdep_set_subclass(&engine->active.lock, subclass); + + /* + * Due to an interesting quirk in lockdep's internal debug tracking, + * after setting a subclass we must ensure the lock is used. Otherwise, + * nr_unused_locks is incremented once too often. + */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + local_irq_disable(); + lock_map_acquire(&engine->active.lock.dep_map); + lock_map_release(&engine->active.lock.dep_map); + local_irq_enable(); +#endif +} + +static struct intel_context * +create_pinned_context(struct intel_engine_cs *engine, + unsigned int hwsp, + struct lock_class_key *key, + const char *name) +{ + struct intel_context *ce; + int err; + + ce = intel_context_create(engine); + if (IS_ERR(ce)) + return ce; + + __set_bit(CONTEXT_BARRIER_BIT, &ce->flags); + ce->timeline = page_pack_bits(NULL, hwsp); + + err = intel_context_pin(ce); /* perma-pin so it is always available */ + if (err) { + intel_context_put(ce); + return ERR_PTR(err); + } + + /* + * Give our perma-pinned kernel timelines a separate lockdep class, + * so that we can use them from within the normal user timelines + * should we need to inject GPU operations during their request + * construction. + */ + lockdep_set_class_and_name(&ce->timeline->mutex, key, name); + + return ce; +} + +static struct intel_context * +create_kernel_context(struct intel_engine_cs *engine) +{ + static struct lock_class_key kernel; + + return create_pinned_context(engine, I915_GEM_HWS_SEQNO_ADDR, + &kernel, "kernel_context"); +} + +/** + * intel_engines_init_common - initialize cengine state which might require hw access + * @engine: Engine to initialize. + * + * Initializes @engine@ structure members shared between legacy and execlists + * submission modes which do require hardware access. + * + * Typcally done at later stages of submission mode specific engine setup. + * + * Returns zero on success or an error code on failure. + */ +static int engine_init_common(struct intel_engine_cs *engine) +{ + struct intel_context *ce; + int ret; + + engine->set_default_submission(engine); + + /* + * We may need to do things with the shrinker which + * require us to immediately switch back to the default + * context. This can cause a problem as pinning the + * default context also requires GTT space which may not + * be available. To avoid this we always pin the default + * context. + */ + ce = create_kernel_context(engine); + if (IS_ERR(ce)) + return PTR_ERR(ce); + + ret = measure_breadcrumb_dw(ce); + if (ret < 0) + goto err_context; + + engine->emit_fini_breadcrumb_dw = ret; + engine->kernel_context = ce; + + return 0; + +err_context: + intel_context_put(ce); + return ret; +} + +int intel_engines_init(struct intel_gt *gt) +{ + int (*setup)(struct intel_engine_cs *engine); + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err; + + if (HAS_EXECLISTS(gt->i915)) + setup = intel_execlists_submission_setup; + else + setup = intel_ring_submission_setup; + + for_each_engine(engine, gt, id) { + err = engine_setup_common(engine); + if (err) + return err; + + err = setup(engine); + if (err) + return err; + + err = engine_init_common(engine); + if (err) + return err; + + intel_engine_add_user(engine); + } + + return 0; +} + +/** + * intel_engines_cleanup_common - cleans up the engine state created by + * the common initiailizers. + * @engine: Engine to cleanup. + * + * This cleans up everything created by the common helpers. + */ +void intel_engine_cleanup_common(struct intel_engine_cs *engine) +{ + GEM_BUG_ON(!list_empty(&engine->active.requests)); + tasklet_kill(&engine->execlists.tasklet); /* flush the callback */ + + cleanup_status_page(engine); + intel_breadcrumbs_free(engine->breadcrumbs); + + intel_engine_fini_retire(engine); + intel_engine_cleanup_cmd_parser(engine); + + if (engine->default_state) + fput(engine->default_state); + + if (engine->kernel_context) { + intel_context_unpin(engine->kernel_context); + intel_context_put(engine->kernel_context); + } + GEM_BUG_ON(!llist_empty(&engine->barrier_tasks)); + + intel_wa_list_free(&engine->ctx_wa_list); + intel_wa_list_free(&engine->wa_list); + intel_wa_list_free(&engine->whitelist); +} + +/** + * intel_engine_resume - re-initializes the HW state of the engine + * @engine: Engine to resume. + * + * Returns zero on success or an error code on failure. + */ +int intel_engine_resume(struct intel_engine_cs *engine) +{ + intel_engine_apply_workarounds(engine); + intel_engine_apply_whitelist(engine); + + return engine->resume(engine); +} + +u64 intel_engine_get_active_head(const struct intel_engine_cs *engine) +{ + struct drm_i915_private *i915 = engine->i915; + + u64 acthd; + + if (INTEL_GEN(i915) >= 8) + acthd = ENGINE_READ64(engine, RING_ACTHD, RING_ACTHD_UDW); + else if (INTEL_GEN(i915) >= 4) + acthd = ENGINE_READ(engine, RING_ACTHD); + else + acthd = ENGINE_READ(engine, ACTHD); + + return acthd; +} + +u64 intel_engine_get_last_batch_head(const struct intel_engine_cs *engine) +{ + u64 bbaddr; + + if (INTEL_GEN(engine->i915) >= 8) + bbaddr = ENGINE_READ64(engine, RING_BBADDR, RING_BBADDR_UDW); + else + bbaddr = ENGINE_READ(engine, RING_BBADDR); + + return bbaddr; +} + +static unsigned long stop_timeout(const struct intel_engine_cs *engine) +{ + if (in_atomic() || irqs_disabled()) /* inside atomic preempt-reset? */ + return 0; + + /* + * If we are doing a normal GPU reset, we can take our time and allow + * the engine to quiesce. We've stopped submission to the engine, and + * if we wait long enough an innocent context should complete and + * leave the engine idle. So they should not be caught unaware by + * the forthcoming GPU reset (which usually follows the stop_cs)! + */ + return READ_ONCE(engine->props.stop_timeout_ms); +} + +int intel_engine_stop_cs(struct intel_engine_cs *engine) +{ + struct intel_uncore *uncore = engine->uncore; + const u32 base = engine->mmio_base; + const i915_reg_t mode = RING_MI_MODE(base); + int err; + + if (INTEL_GEN(engine->i915) < 3) + return -ENODEV; + + ENGINE_TRACE(engine, "\n"); + + intel_uncore_write_fw(uncore, mode, _MASKED_BIT_ENABLE(STOP_RING)); + + err = 0; + if (__intel_wait_for_register_fw(uncore, + mode, MODE_IDLE, MODE_IDLE, + 1000, stop_timeout(engine), + NULL)) { + ENGINE_TRACE(engine, "timed out on STOP_RING -> IDLE\n"); + err = -ETIMEDOUT; + } + + /* A final mmio read to let GPU writes be hopefully flushed to memory */ + intel_uncore_posting_read_fw(uncore, mode); + + return err; +} + +void intel_engine_cancel_stop_cs(struct intel_engine_cs *engine) +{ + ENGINE_TRACE(engine, "\n"); + + ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING)); +} + +const char *i915_cache_level_str(struct drm_i915_private *i915, int type) +{ + switch (type) { + case I915_CACHE_NONE: return " uncached"; + case I915_CACHE_LLC: return HAS_LLC(i915) ? " LLC" : " snooped"; + case I915_CACHE_L3_LLC: return " L3+LLC"; + case I915_CACHE_WT: return " WT"; + default: return ""; + } +} + +static u32 +read_subslice_reg(const struct intel_engine_cs *engine, + int slice, int subslice, i915_reg_t reg) +{ + struct drm_i915_private *i915 = engine->i915; + struct intel_uncore *uncore = engine->uncore; + u32 mcr_mask, mcr_ss, mcr, old_mcr, val; + enum forcewake_domains fw_domains; + + if (INTEL_GEN(i915) >= 11) { + mcr_mask = GEN11_MCR_SLICE_MASK | GEN11_MCR_SUBSLICE_MASK; + mcr_ss = GEN11_MCR_SLICE(slice) | GEN11_MCR_SUBSLICE(subslice); + } else { + mcr_mask = GEN8_MCR_SLICE_MASK | GEN8_MCR_SUBSLICE_MASK; + mcr_ss = GEN8_MCR_SLICE(slice) | GEN8_MCR_SUBSLICE(subslice); + } + + fw_domains = intel_uncore_forcewake_for_reg(uncore, reg, + FW_REG_READ); + fw_domains |= intel_uncore_forcewake_for_reg(uncore, + GEN8_MCR_SELECTOR, + FW_REG_READ | FW_REG_WRITE); + + spin_lock_irq(&uncore->lock); + intel_uncore_forcewake_get__locked(uncore, fw_domains); + + old_mcr = mcr = intel_uncore_read_fw(uncore, GEN8_MCR_SELECTOR); + + mcr &= ~mcr_mask; + mcr |= mcr_ss; + intel_uncore_write_fw(uncore, GEN8_MCR_SELECTOR, mcr); + + val = intel_uncore_read_fw(uncore, reg); + + mcr &= ~mcr_mask; + mcr |= old_mcr & mcr_mask; + + intel_uncore_write_fw(uncore, GEN8_MCR_SELECTOR, mcr); + + intel_uncore_forcewake_put__locked(uncore, fw_domains); + spin_unlock_irq(&uncore->lock); + + return val; +} + +/* NB: please notice the memset */ +void intel_engine_get_instdone(const struct intel_engine_cs *engine, + struct intel_instdone *instdone) +{ + struct drm_i915_private *i915 = engine->i915; + const struct sseu_dev_info *sseu = &engine->gt->info.sseu; + struct intel_uncore *uncore = engine->uncore; + u32 mmio_base = engine->mmio_base; + int slice; + int subslice; + + memset(instdone, 0, sizeof(*instdone)); + + switch (INTEL_GEN(i915)) { + default: + instdone->instdone = + intel_uncore_read(uncore, RING_INSTDONE(mmio_base)); + + if (engine->id != RCS0) + break; + + instdone->slice_common = + intel_uncore_read(uncore, GEN7_SC_INSTDONE); + if (INTEL_GEN(i915) >= 12) { + instdone->slice_common_extra[0] = + intel_uncore_read(uncore, GEN12_SC_INSTDONE_EXTRA); + instdone->slice_common_extra[1] = + intel_uncore_read(uncore, GEN12_SC_INSTDONE_EXTRA2); + } + for_each_instdone_slice_subslice(i915, sseu, slice, subslice) { + instdone->sampler[slice][subslice] = + read_subslice_reg(engine, slice, subslice, + GEN7_SAMPLER_INSTDONE); + instdone->row[slice][subslice] = + read_subslice_reg(engine, slice, subslice, + GEN7_ROW_INSTDONE); + } + break; + case 7: + instdone->instdone = + intel_uncore_read(uncore, RING_INSTDONE(mmio_base)); + + if (engine->id != RCS0) + break; + + instdone->slice_common = + intel_uncore_read(uncore, GEN7_SC_INSTDONE); + instdone->sampler[0][0] = + intel_uncore_read(uncore, GEN7_SAMPLER_INSTDONE); + instdone->row[0][0] = + intel_uncore_read(uncore, GEN7_ROW_INSTDONE); + + break; + case 6: + case 5: + case 4: + instdone->instdone = + intel_uncore_read(uncore, RING_INSTDONE(mmio_base)); + if (engine->id == RCS0) + /* HACK: Using the wrong struct member */ + instdone->slice_common = + intel_uncore_read(uncore, GEN4_INSTDONE1); + break; + case 3: + case 2: + instdone->instdone = intel_uncore_read(uncore, GEN2_INSTDONE); + break; + } +} + +static bool ring_is_idle(struct intel_engine_cs *engine) +{ + bool idle = true; + + if (I915_SELFTEST_ONLY(!engine->mmio_base)) + return true; + + if (!intel_engine_pm_get_if_awake(engine)) + return true; + + /* First check that no commands are left in the ring */ + if ((ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR) != + (ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR)) + idle = false; + + /* No bit for gen2, so assume the CS parser is idle */ + if (INTEL_GEN(engine->i915) > 2 && + !(ENGINE_READ(engine, RING_MI_MODE) & MODE_IDLE)) + idle = false; + + intel_engine_pm_put(engine); + + return idle; +} + +void intel_engine_flush_submission(struct intel_engine_cs *engine) +{ + struct tasklet_struct *t = &engine->execlists.tasklet; + + if (!t->func) + return; + + /* Synchronise and wait for the tasklet on another CPU */ + tasklet_kill(t); + + /* Having cancelled the tasklet, ensure that is run */ + local_bh_disable(); + if (tasklet_trylock(t)) { + /* Must wait for any GPU reset in progress. */ + if (__tasklet_is_enabled(t)) + t->func(t->data); + tasklet_unlock(t); + } + local_bh_enable(); +} + +/** + * intel_engine_is_idle() - Report if the engine has finished process all work + * @engine: the intel_engine_cs + * + * Return true if there are no requests pending, nothing left to be submitted + * to hardware, and that the engine is idle. + */ +bool intel_engine_is_idle(struct intel_engine_cs *engine) +{ + /* More white lies, if wedged, hw state is inconsistent */ + if (intel_gt_is_wedged(engine->gt)) + return true; + + if (!intel_engine_pm_is_awake(engine)) + return true; + + /* Waiting to drain ELSP? */ + if (execlists_active(&engine->execlists)) { + synchronize_hardirq(engine->i915->drm.pdev->irq); + + intel_engine_flush_submission(engine); + + if (execlists_active(&engine->execlists)) + return false; + } + + /* ELSP is empty, but there are ready requests? E.g. after reset */ + if (!RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)) + return false; + + /* Ring stopped? */ + return ring_is_idle(engine); +} + +bool intel_engines_are_idle(struct intel_gt *gt) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + + /* + * If the driver is wedged, HW state may be very inconsistent and + * report that it is still busy, even though we have stopped using it. + */ + if (intel_gt_is_wedged(gt)) + return true; + + /* Already parked (and passed an idleness test); must still be idle */ + if (!READ_ONCE(gt->awake)) + return true; + + for_each_engine(engine, gt, id) { + if (!intel_engine_is_idle(engine)) + return false; + } + + return true; +} + +void intel_engines_reset_default_submission(struct intel_gt *gt) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + + for_each_engine(engine, gt, id) + engine->set_default_submission(engine); +} + +bool intel_engine_can_store_dword(struct intel_engine_cs *engine) +{ + switch (INTEL_GEN(engine->i915)) { + case 2: + return false; /* uses physical not virtual addresses */ + case 3: + /* maybe only uses physical not virtual addresses */ + return !(IS_I915G(engine->i915) || IS_I915GM(engine->i915)); + case 4: + return !IS_I965G(engine->i915); /* who knows! */ + case 6: + return engine->class != VIDEO_DECODE_CLASS; /* b0rked */ + default: + return true; + } +} + +static int print_sched_attr(const struct i915_sched_attr *attr, + char *buf, int x, int len) +{ + if (attr->priority == I915_PRIORITY_INVALID) + return x; + + x += snprintf(buf + x, len - x, + " prio=%d", attr->priority); + + return x; +} + +static void print_request(struct drm_printer *m, + struct i915_request *rq, + const char *prefix) +{ + const char *name = rq->fence.ops->get_timeline_name(&rq->fence); + char buf[80] = ""; + int x = 0; + + x = print_sched_attr(&rq->sched.attr, buf, x, sizeof(buf)); + + drm_printf(m, "%s %llx:%llx%s%s %s @ %dms: %s\n", + prefix, + rq->fence.context, rq->fence.seqno, + i915_request_completed(rq) ? "!" : + i915_request_started(rq) ? "*" : + "", + test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, + &rq->fence.flags) ? "+" : + test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, + &rq->fence.flags) ? "-" : + "", + buf, + jiffies_to_msecs(jiffies - rq->emitted_jiffies), + name); +} + +static struct intel_timeline *get_timeline(struct i915_request *rq) +{ + struct intel_timeline *tl; + + /* + * Even though we are holding the engine->active.lock here, there + * is no control over the submission queue per-se and we are + * inspecting the active state at a random point in time, with an + * unknown queue. Play safe and make sure the timeline remains valid. + * (Only being used for pretty printing, one extra kref shouldn't + * cause a camel stampede!) + */ + rcu_read_lock(); + tl = rcu_dereference(rq->timeline); + if (!kref_get_unless_zero(&tl->kref)) + tl = NULL; + rcu_read_unlock(); + + return tl; +} + +static int print_ring(char *buf, int sz, struct i915_request *rq) +{ + int len = 0; + + if (!i915_request_signaled(rq)) { + struct intel_timeline *tl = get_timeline(rq); + + len = scnprintf(buf, sz, + "ring:{start:%08x, hwsp:%08x, seqno:%08x, runtime:%llums}, ", + i915_ggtt_offset(rq->ring->vma), + tl ? tl->hwsp_offset : 0, + hwsp_seqno(rq), + DIV_ROUND_CLOSEST_ULL(intel_context_get_total_runtime_ns(rq->context), + 1000 * 1000)); + + if (tl) + intel_timeline_put(tl); + } + + return len; +} + +static void hexdump(struct drm_printer *m, const void *buf, size_t len) +{ + const size_t rowsize = 8 * sizeof(u32); + const void *prev = NULL; + bool skip = false; + size_t pos; + + for (pos = 0; pos < len; pos += rowsize) { + char line[128]; + + if (prev && !memcmp(prev, buf + pos, rowsize)) { + if (!skip) { + drm_printf(m, "*\n"); + skip = true; + } + continue; + } + + WARN_ON_ONCE(hex_dump_to_buffer(buf + pos, len - pos, + rowsize, sizeof(u32), + line, sizeof(line), + false) >= sizeof(line)); + drm_printf(m, "[%04zx] %s\n", pos, line); + + prev = buf + pos; + skip = false; + } +} + +static const char *repr_timer(const struct timer_list *t) +{ + if (!READ_ONCE(t->expires)) + return "inactive"; + + if (timer_pending(t)) + return "active"; + + return "expired"; +} + +static void intel_engine_print_registers(struct intel_engine_cs *engine, + struct drm_printer *m) +{ + struct drm_i915_private *dev_priv = engine->i915; + struct intel_engine_execlists * const execlists = &engine->execlists; + u64 addr; + + if (engine->id == RENDER_CLASS && IS_GEN_RANGE(dev_priv, 4, 7)) + drm_printf(m, "\tCCID: 0x%08x\n", ENGINE_READ(engine, CCID)); + if (HAS_EXECLISTS(dev_priv)) { + drm_printf(m, "\tEL_STAT_HI: 0x%08x\n", + ENGINE_READ(engine, RING_EXECLIST_STATUS_HI)); + drm_printf(m, "\tEL_STAT_LO: 0x%08x\n", + ENGINE_READ(engine, RING_EXECLIST_STATUS_LO)); + } + drm_printf(m, "\tRING_START: 0x%08x\n", + ENGINE_READ(engine, RING_START)); + drm_printf(m, "\tRING_HEAD: 0x%08x\n", + ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR); + drm_printf(m, "\tRING_TAIL: 0x%08x\n", + ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR); + drm_printf(m, "\tRING_CTL: 0x%08x%s\n", + ENGINE_READ(engine, RING_CTL), + ENGINE_READ(engine, RING_CTL) & (RING_WAIT | RING_WAIT_SEMAPHORE) ? " [waiting]" : ""); + if (INTEL_GEN(engine->i915) > 2) { + drm_printf(m, "\tRING_MODE: 0x%08x%s\n", + ENGINE_READ(engine, RING_MI_MODE), + ENGINE_READ(engine, RING_MI_MODE) & (MODE_IDLE) ? " [idle]" : ""); + } + + if (INTEL_GEN(dev_priv) >= 6) { + drm_printf(m, "\tRING_IMR: 0x%08x\n", + ENGINE_READ(engine, RING_IMR)); + drm_printf(m, "\tRING_ESR: 0x%08x\n", + ENGINE_READ(engine, RING_ESR)); + drm_printf(m, "\tRING_EMR: 0x%08x\n", + ENGINE_READ(engine, RING_EMR)); + drm_printf(m, "\tRING_EIR: 0x%08x\n", + ENGINE_READ(engine, RING_EIR)); + } + + addr = intel_engine_get_active_head(engine); + drm_printf(m, "\tACTHD: 0x%08x_%08x\n", + upper_32_bits(addr), lower_32_bits(addr)); + addr = intel_engine_get_last_batch_head(engine); + drm_printf(m, "\tBBADDR: 0x%08x_%08x\n", + upper_32_bits(addr), lower_32_bits(addr)); + if (INTEL_GEN(dev_priv) >= 8) + addr = ENGINE_READ64(engine, RING_DMA_FADD, RING_DMA_FADD_UDW); + else if (INTEL_GEN(dev_priv) >= 4) + addr = ENGINE_READ(engine, RING_DMA_FADD); + else + addr = ENGINE_READ(engine, DMA_FADD_I8XX); + drm_printf(m, "\tDMA_FADDR: 0x%08x_%08x\n", + upper_32_bits(addr), lower_32_bits(addr)); + if (INTEL_GEN(dev_priv) >= 4) { + drm_printf(m, "\tIPEIR: 0x%08x\n", + ENGINE_READ(engine, RING_IPEIR)); + drm_printf(m, "\tIPEHR: 0x%08x\n", + ENGINE_READ(engine, RING_IPEHR)); + } else { + drm_printf(m, "\tIPEIR: 0x%08x\n", ENGINE_READ(engine, IPEIR)); + drm_printf(m, "\tIPEHR: 0x%08x\n", ENGINE_READ(engine, IPEHR)); + } + + if (HAS_EXECLISTS(dev_priv)) { + struct i915_request * const *port, *rq; + const u32 *hws = + &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX]; + const u8 num_entries = execlists->csb_size; + unsigned int idx; + u8 read, write; + + drm_printf(m, "\tExeclist tasklet queued? %s (%s), preempt? %s, timeslice? %s\n", + yesno(test_bit(TASKLET_STATE_SCHED, + &engine->execlists.tasklet.state)), + enableddisabled(!atomic_read(&engine->execlists.tasklet.count)), + repr_timer(&engine->execlists.preempt), + repr_timer(&engine->execlists.timer)); + + read = execlists->csb_head; + write = READ_ONCE(*execlists->csb_write); + + drm_printf(m, "\tExeclist status: 0x%08x %08x; CSB read:%d, write:%d, entries:%d\n", + ENGINE_READ(engine, RING_EXECLIST_STATUS_LO), + ENGINE_READ(engine, RING_EXECLIST_STATUS_HI), + read, write, num_entries); + + if (read >= num_entries) + read = 0; + if (write >= num_entries) + write = 0; + if (read > write) + write += num_entries; + while (read < write) { + idx = ++read % num_entries; + drm_printf(m, "\tExeclist CSB[%d]: 0x%08x, context: %d\n", + idx, hws[idx * 2], hws[idx * 2 + 1]); + } + + execlists_active_lock_bh(execlists); + rcu_read_lock(); + for (port = execlists->active; (rq = *port); port++) { + char hdr[160]; + int len; + + len = scnprintf(hdr, sizeof(hdr), + "\t\tActive[%d]: ccid:%08x%s%s, ", + (int)(port - execlists->active), + rq->context->lrc.ccid, + intel_context_is_closed(rq->context) ? "!" : "", + intel_context_is_banned(rq->context) ? "*" : ""); + len += print_ring(hdr + len, sizeof(hdr) - len, rq); + scnprintf(hdr + len, sizeof(hdr) - len, "rq: "); + print_request(m, rq, hdr); + } + for (port = execlists->pending; (rq = *port); port++) { + char hdr[160]; + int len; + + len = scnprintf(hdr, sizeof(hdr), + "\t\tPending[%d]: ccid:%08x%s%s, ", + (int)(port - execlists->pending), + rq->context->lrc.ccid, + intel_context_is_closed(rq->context) ? "!" : "", + intel_context_is_banned(rq->context) ? "*" : ""); + len += print_ring(hdr + len, sizeof(hdr) - len, rq); + scnprintf(hdr + len, sizeof(hdr) - len, "rq: "); + print_request(m, rq, hdr); + } + rcu_read_unlock(); + execlists_active_unlock_bh(execlists); + } else if (INTEL_GEN(dev_priv) > 6) { + drm_printf(m, "\tPP_DIR_BASE: 0x%08x\n", + ENGINE_READ(engine, RING_PP_DIR_BASE)); + drm_printf(m, "\tPP_DIR_BASE_READ: 0x%08x\n", + ENGINE_READ(engine, RING_PP_DIR_BASE_READ)); + drm_printf(m, "\tPP_DIR_DCLV: 0x%08x\n", + ENGINE_READ(engine, RING_PP_DIR_DCLV)); + } +} + +static void print_request_ring(struct drm_printer *m, struct i915_request *rq) +{ + void *ring; + int size; + + drm_printf(m, + "[head %04x, postfix %04x, tail %04x, batch 0x%08x_%08x]:\n", + rq->head, rq->postfix, rq->tail, + rq->batch ? upper_32_bits(rq->batch->node.start) : ~0u, + rq->batch ? lower_32_bits(rq->batch->node.start) : ~0u); + + size = rq->tail - rq->head; + if (rq->tail < rq->head) + size += rq->ring->size; + + ring = kmalloc(size, GFP_ATOMIC); + if (ring) { + const void *vaddr = rq->ring->vaddr; + unsigned int head = rq->head; + unsigned int len = 0; + + if (rq->tail < head) { + len = rq->ring->size - head; + memcpy(ring, vaddr + head, len); + head = 0; + } + memcpy(ring + len, vaddr + head, size - len); + + hexdump(m, ring, size); + kfree(ring); + } +} + +static unsigned long list_count(struct list_head *list) +{ + struct list_head *pos; + unsigned long count = 0; + + list_for_each(pos, list) + count++; + + return count; +} + +void intel_engine_dump(struct intel_engine_cs *engine, + struct drm_printer *m, + const char *header, ...) +{ + struct i915_gpu_error * const error = &engine->i915->gpu_error; + struct i915_request *rq; + intel_wakeref_t wakeref; + unsigned long flags; + ktime_t dummy; + + if (header) { + va_list ap; + + va_start(ap, header); + drm_vprintf(m, header, &ap); + va_end(ap); + } + + if (intel_gt_is_wedged(engine->gt)) + drm_printf(m, "*** WEDGED ***\n"); + + drm_printf(m, "\tAwake? %d\n", atomic_read(&engine->wakeref.count)); + drm_printf(m, "\tBarriers?: %s\n", + yesno(!llist_empty(&engine->barrier_tasks))); + drm_printf(m, "\tLatency: %luus\n", + ewma__engine_latency_read(&engine->latency)); + if (intel_engine_supports_stats(engine)) + drm_printf(m, "\tRuntime: %llums\n", + ktime_to_ms(intel_engine_get_busy_time(engine, + &dummy))); + drm_printf(m, "\tForcewake: %x domains, %d active\n", + engine->fw_domain, atomic_read(&engine->fw_active)); + + rcu_read_lock(); + rq = READ_ONCE(engine->heartbeat.systole); + if (rq) + drm_printf(m, "\tHeartbeat: %d ms ago\n", + jiffies_to_msecs(jiffies - rq->emitted_jiffies)); + rcu_read_unlock(); + drm_printf(m, "\tReset count: %d (global %d)\n", + i915_reset_engine_count(error, engine), + i915_reset_count(error)); + + drm_printf(m, "\tRequests:\n"); + + spin_lock_irqsave(&engine->active.lock, flags); + rq = intel_engine_find_active_request(engine); + if (rq) { + struct intel_timeline *tl = get_timeline(rq); + + print_request(m, rq, "\t\tactive "); + + drm_printf(m, "\t\tring->start: 0x%08x\n", + i915_ggtt_offset(rq->ring->vma)); + drm_printf(m, "\t\tring->head: 0x%08x\n", + rq->ring->head); + drm_printf(m, "\t\tring->tail: 0x%08x\n", + rq->ring->tail); + drm_printf(m, "\t\tring->emit: 0x%08x\n", + rq->ring->emit); + drm_printf(m, "\t\tring->space: 0x%08x\n", + rq->ring->space); + + if (tl) { + drm_printf(m, "\t\tring->hwsp: 0x%08x\n", + tl->hwsp_offset); + intel_timeline_put(tl); + } + + print_request_ring(m, rq); + + if (rq->context->lrc_reg_state) { + drm_printf(m, "Logical Ring Context:\n"); + hexdump(m, rq->context->lrc_reg_state, PAGE_SIZE); + } + } + drm_printf(m, "\tOn hold?: %lu\n", list_count(&engine->active.hold)); + spin_unlock_irqrestore(&engine->active.lock, flags); + + drm_printf(m, "\tMMIO base: 0x%08x\n", engine->mmio_base); + wakeref = intel_runtime_pm_get_if_in_use(engine->uncore->rpm); + if (wakeref) { + intel_engine_print_registers(engine, m); + intel_runtime_pm_put(engine->uncore->rpm, wakeref); + } else { + drm_printf(m, "\tDevice is asleep; skipping register dump\n"); + } + + intel_execlists_show_requests(engine, m, print_request, 8); + + drm_printf(m, "HWSP:\n"); + hexdump(m, engine->status_page.addr, PAGE_SIZE); + + drm_printf(m, "Idle? %s\n", yesno(intel_engine_is_idle(engine))); + + intel_engine_print_breadcrumbs(engine, m); +} + +static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine, + ktime_t *now) +{ + ktime_t total = engine->stats.total; + + /* + * If the engine is executing something at the moment + * add it to the total. + */ + *now = ktime_get(); + if (atomic_read(&engine->stats.active)) + total = ktime_add(total, ktime_sub(*now, engine->stats.start)); + + return total; +} + +/** + * intel_engine_get_busy_time() - Return current accumulated engine busyness + * @engine: engine to report on + * @now: monotonic timestamp of sampling + * + * Returns accumulated time @engine was busy since engine stats were enabled. + */ +ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, ktime_t *now) +{ + unsigned int seq; + ktime_t total; + + do { + seq = read_seqbegin(&engine->stats.lock); + total = __intel_engine_get_busy_time(engine, now); + } while (read_seqretry(&engine->stats.lock, seq)); + + return total; +} + +static bool match_ring(struct i915_request *rq) +{ + u32 ring = ENGINE_READ(rq->engine, RING_START); + + return ring == i915_ggtt_offset(rq->ring->vma); +} + +struct i915_request * +intel_engine_find_active_request(struct intel_engine_cs *engine) +{ + struct i915_request *request, *active = NULL; + + /* + * We are called by the error capture, reset and to dump engine + * state at random points in time. In particular, note that neither is + * crucially ordered with an interrupt. After a hang, the GPU is dead + * and we assume that no more writes can happen (we waited long enough + * for all writes that were in transaction to be flushed) - adding an + * extra delay for a recent interrupt is pointless. Hence, we do + * not need an engine->irq_seqno_barrier() before the seqno reads. + * At all other times, we must assume the GPU is still running, but + * we only care about the snapshot of this moment. + */ + lockdep_assert_held(&engine->active.lock); + + rcu_read_lock(); + request = execlists_active(&engine->execlists); + if (request) { + struct intel_timeline *tl = request->context->timeline; + + list_for_each_entry_from_reverse(request, &tl->requests, link) { + if (i915_request_completed(request)) + break; + + active = request; + } + } + rcu_read_unlock(); + if (active) + return active; + + list_for_each_entry(request, &engine->active.requests, sched.link) { + if (i915_request_completed(request)) + continue; + + if (!i915_request_started(request)) + continue; + + /* More than one preemptible request may match! */ + if (!match_ring(request)) + continue; + + active = request; + break; + } + + return active; +} + +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) +#include "mock_engine.c" +#include "selftest_engine.c" +#include "selftest_engine_cs.c" +#endif diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c new file mode 100644 index 000000000..5067d0524 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c @@ -0,0 +1,301 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#include "i915_drv.h" +#include "i915_request.h" + +#include "intel_context.h" +#include "intel_engine_heartbeat.h" +#include "intel_engine_pm.h" +#include "intel_engine.h" +#include "intel_gt.h" +#include "intel_reset.h" + +/* + * While the engine is active, we send a periodic pulse along the engine + * to check on its health and to flush any idle-barriers. If that request + * is stuck, and we fail to preempt it, we declare the engine hung and + * issue a reset -- in the hope that restores progress. + */ + +static bool next_heartbeat(struct intel_engine_cs *engine) +{ + long delay; + + delay = READ_ONCE(engine->props.heartbeat_interval_ms); + if (!delay) + return false; + + delay = msecs_to_jiffies_timeout(delay); + if (delay >= HZ) + delay = round_jiffies_up_relative(delay); + mod_delayed_work(system_highpri_wq, &engine->heartbeat.work, delay); + + return true; +} + +static void idle_pulse(struct intel_engine_cs *engine, struct i915_request *rq) +{ + engine->wakeref_serial = READ_ONCE(engine->serial) + 1; + i915_request_add_active_barriers(rq); +} + +static void show_heartbeat(const struct i915_request *rq, + struct intel_engine_cs *engine) +{ + struct drm_printer p = drm_debug_printer("heartbeat"); + + intel_engine_dump(engine, &p, + "%s heartbeat {seqno:%llx:%lld, prio:%d} not ticking\n", + engine->name, + rq->fence.context, + rq->fence.seqno, + rq->sched.attr.priority); +} + +static void heartbeat(struct work_struct *wrk) +{ + struct i915_sched_attr attr = { + .priority = I915_USER_PRIORITY(I915_PRIORITY_MIN), + }; + struct intel_engine_cs *engine = + container_of(wrk, typeof(*engine), heartbeat.work.work); + struct intel_context *ce = engine->kernel_context; + struct i915_request *rq; + unsigned long serial; + + /* Just in case everything has gone horribly wrong, give it a kick */ + intel_engine_flush_submission(engine); + + rq = engine->heartbeat.systole; + if (rq && i915_request_completed(rq)) { + i915_request_put(rq); + engine->heartbeat.systole = NULL; + } + + if (!intel_engine_pm_get_if_awake(engine)) + return; + + if (intel_gt_is_wedged(engine->gt)) + goto out; + + if (engine->heartbeat.systole) { + if (!i915_sw_fence_signaled(&rq->submit)) { + /* + * Not yet submitted, system is stalled. + * + * This more often happens for ring submission, + * where all contexts are funnelled into a common + * ringbuffer. If one context is blocked on an + * external fence, not only is it not submitted, + * but all other contexts, including the kernel + * context are stuck waiting for the signal. + */ + } else if (engine->schedule && + rq->sched.attr.priority < I915_PRIORITY_BARRIER) { + /* + * Gradually raise the priority of the heartbeat to + * give high priority work [which presumably desires + * low latency and no jitter] the chance to naturally + * complete before being preempted. + */ + attr.priority = I915_PRIORITY_MASK; + if (rq->sched.attr.priority >= attr.priority) + attr.priority |= I915_USER_PRIORITY(I915_PRIORITY_HEARTBEAT); + if (rq->sched.attr.priority >= attr.priority) + attr.priority = I915_PRIORITY_BARRIER; + + local_bh_disable(); + engine->schedule(rq, &attr); + local_bh_enable(); + } else { + if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) + show_heartbeat(rq, engine); + + intel_gt_handle_error(engine->gt, engine->mask, + I915_ERROR_CAPTURE, + "stopped heartbeat on %s", + engine->name); + } + goto out; + } + + serial = READ_ONCE(engine->serial); + if (engine->wakeref_serial == serial) + goto out; + + if (!mutex_trylock(&ce->timeline->mutex)) { + /* Unable to lock the kernel timeline, is the engine stuck? */ + if (xchg(&engine->heartbeat.blocked, serial) == serial) + intel_gt_handle_error(engine->gt, engine->mask, + I915_ERROR_CAPTURE, + "no heartbeat on %s", + engine->name); + goto out; + } + + intel_context_enter(ce); + rq = __i915_request_create(ce, GFP_NOWAIT | __GFP_NOWARN); + intel_context_exit(ce); + if (IS_ERR(rq)) + goto unlock; + + idle_pulse(engine, rq); + if (engine->i915->params.enable_hangcheck) + engine->heartbeat.systole = i915_request_get(rq); + + __i915_request_commit(rq); + __i915_request_queue(rq, &attr); + +unlock: + mutex_unlock(&ce->timeline->mutex); +out: + if (!next_heartbeat(engine)) + i915_request_put(fetch_and_zero(&engine->heartbeat.systole)); + intel_engine_pm_put(engine); +} + +void intel_engine_unpark_heartbeat(struct intel_engine_cs *engine) +{ + if (!IS_ACTIVE(CONFIG_DRM_I915_HEARTBEAT_INTERVAL)) + return; + + next_heartbeat(engine); +} + +void intel_engine_park_heartbeat(struct intel_engine_cs *engine) +{ + if (cancel_delayed_work(&engine->heartbeat.work)) + i915_request_put(fetch_and_zero(&engine->heartbeat.systole)); +} + +void intel_engine_init_heartbeat(struct intel_engine_cs *engine) +{ + INIT_DELAYED_WORK(&engine->heartbeat.work, heartbeat); +} + +static int __intel_engine_pulse(struct intel_engine_cs *engine) +{ + struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER }; + struct intel_context *ce = engine->kernel_context; + struct i915_request *rq; + + lockdep_assert_held(&ce->timeline->mutex); + GEM_BUG_ON(!intel_engine_has_preemption(engine)); + GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); + + intel_context_enter(ce); + rq = __i915_request_create(ce, GFP_NOWAIT | __GFP_NOWARN); + intel_context_exit(ce); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + __set_bit(I915_FENCE_FLAG_SENTINEL, &rq->fence.flags); + idle_pulse(engine, rq); + + __i915_request_commit(rq); + __i915_request_queue(rq, &attr); + GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER); + + return 0; +} + +static unsigned long set_heartbeat(struct intel_engine_cs *engine, + unsigned long delay) +{ + unsigned long old; + + old = xchg(&engine->props.heartbeat_interval_ms, delay); + if (delay) + intel_engine_unpark_heartbeat(engine); + else + intel_engine_park_heartbeat(engine); + + return old; +} + +int intel_engine_set_heartbeat(struct intel_engine_cs *engine, + unsigned long delay) +{ + struct intel_context *ce = engine->kernel_context; + int err = 0; + + if (!delay && !intel_engine_has_preempt_reset(engine)) + return -ENODEV; + + intel_engine_pm_get(engine); + + err = mutex_lock_interruptible(&ce->timeline->mutex); + if (err) + goto out_rpm; + + if (delay != engine->props.heartbeat_interval_ms) { + unsigned long saved = set_heartbeat(engine, delay); + + /* recheck current execution */ + if (intel_engine_has_preemption(engine)) { + err = __intel_engine_pulse(engine); + if (err) + set_heartbeat(engine, saved); + } + } + + mutex_unlock(&ce->timeline->mutex); + +out_rpm: + intel_engine_pm_put(engine); + return err; +} + +int intel_engine_pulse(struct intel_engine_cs *engine) +{ + struct intel_context *ce = engine->kernel_context; + int err; + + if (!intel_engine_has_preemption(engine)) + return -ENODEV; + + if (!intel_engine_pm_get_if_awake(engine)) + return 0; + + err = -EINTR; + if (!mutex_lock_interruptible(&ce->timeline->mutex)) { + err = __intel_engine_pulse(engine); + mutex_unlock(&ce->timeline->mutex); + } + + intel_engine_pm_put(engine); + return err; +} + +int intel_engine_flush_barriers(struct intel_engine_cs *engine) +{ + struct i915_request *rq; + int err = 0; + + if (llist_empty(&engine->barrier_tasks)) + return 0; + + if (!intel_engine_pm_get_if_awake(engine)) + return 0; + + rq = i915_request_create(engine->kernel_context); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out_rpm; + } + + idle_pulse(engine, rq); + i915_request_add(rq); + +out_rpm: + intel_engine_pm_put(engine); + return err; +} + +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) +#include "selftest_engine_heartbeat.c" +#endif diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h new file mode 100644 index 000000000..a7b8c0f9e --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h @@ -0,0 +1,23 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#ifndef INTEL_ENGINE_HEARTBEAT_H +#define INTEL_ENGINE_HEARTBEAT_H + +struct intel_engine_cs; + +void intel_engine_init_heartbeat(struct intel_engine_cs *engine); + +int intel_engine_set_heartbeat(struct intel_engine_cs *engine, + unsigned long delay); + +void intel_engine_park_heartbeat(struct intel_engine_cs *engine); +void intel_engine_unpark_heartbeat(struct intel_engine_cs *engine); + +int intel_engine_pulse(struct intel_engine_cs *engine); +int intel_engine_flush_barriers(struct intel_engine_cs *engine); + +#endif /* INTEL_ENGINE_HEARTBEAT_H */ diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c new file mode 100644 index 000000000..f7b2e07e2 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c @@ -0,0 +1,281 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#include "i915_drv.h" + +#include "intel_breadcrumbs.h" +#include "intel_context.h" +#include "intel_engine.h" +#include "intel_engine_heartbeat.h" +#include "intel_engine_pm.h" +#include "intel_gt.h" +#include "intel_gt_pm.h" +#include "intel_rc6.h" +#include "intel_ring.h" +#include "shmem_utils.h" + +static int __engine_unpark(struct intel_wakeref *wf) +{ + struct intel_engine_cs *engine = + container_of(wf, typeof(*engine), wakeref); + struct intel_context *ce; + + ENGINE_TRACE(engine, "\n"); + + intel_gt_pm_get(engine->gt); + + /* Discard stale context state from across idling */ + ce = engine->kernel_context; + if (ce) { + GEM_BUG_ON(test_bit(CONTEXT_VALID_BIT, &ce->flags)); + + /* First poison the image to verify we never fully trust it */ + if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM) && ce->state) { + struct drm_i915_gem_object *obj = ce->state->obj; + int type = i915_coherent_map_type(engine->i915); + void *map; + + map = i915_gem_object_pin_map(obj, type); + if (!IS_ERR(map)) { + memset(map, CONTEXT_REDZONE, obj->base.size); + i915_gem_object_flush_map(obj); + i915_gem_object_unpin_map(obj); + } + } + + ce->ops->reset(ce); + } + + if (engine->unpark) + engine->unpark(engine); + + intel_engine_unpark_heartbeat(engine); + return 0; +} + +#if IS_ENABLED(CONFIG_LOCKDEP) + +static inline unsigned long __timeline_mark_lock(struct intel_context *ce) +{ + unsigned long flags; + + local_irq_save(flags); + mutex_acquire(&ce->timeline->mutex.dep_map, 2, 0, _THIS_IP_); + + return flags; +} + +static inline void __timeline_mark_unlock(struct intel_context *ce, + unsigned long flags) +{ + mutex_release(&ce->timeline->mutex.dep_map, _THIS_IP_); + local_irq_restore(flags); +} + +#else + +static inline unsigned long __timeline_mark_lock(struct intel_context *ce) +{ + return 0; +} + +static inline void __timeline_mark_unlock(struct intel_context *ce, + unsigned long flags) +{ +} + +#endif /* !IS_ENABLED(CONFIG_LOCKDEP) */ + +static void duration(struct dma_fence *fence, struct dma_fence_cb *cb) +{ + struct i915_request *rq = to_request(fence); + + ewma__engine_latency_add(&rq->engine->latency, + ktime_us_delta(rq->fence.timestamp, + rq->duration.emitted)); +} + +static void +__queue_and_release_pm(struct i915_request *rq, + struct intel_timeline *tl, + struct intel_engine_cs *engine) +{ + struct intel_gt_timelines *timelines = &engine->gt->timelines; + + ENGINE_TRACE(engine, "parking\n"); + + /* + * We have to serialise all potential retirement paths with our + * submission, as we don't want to underflow either the + * engine->wakeref.counter or our timeline->active_count. + * + * Equally, we cannot allow a new submission to start until + * after we finish queueing, nor could we allow that submitter + * to retire us before we are ready! + */ + spin_lock(&timelines->lock); + + /* Let intel_gt_retire_requests() retire us (acquired under lock) */ + if (!atomic_fetch_inc(&tl->active_count)) + list_add_tail(&tl->link, &timelines->active_list); + + /* Hand the request over to HW and so engine_retire() */ + __i915_request_queue(rq, NULL); + + /* Let new submissions commence (and maybe retire this timeline) */ + __intel_wakeref_defer_park(&engine->wakeref); + + spin_unlock(&timelines->lock); +} + +static bool switch_to_kernel_context(struct intel_engine_cs *engine) +{ + struct intel_context *ce = engine->kernel_context; + struct i915_request *rq; + unsigned long flags; + bool result = true; + + /* GPU is pointing to the void, as good as in the kernel context. */ + if (intel_gt_is_wedged(engine->gt)) + return true; + + GEM_BUG_ON(!intel_context_is_barrier(ce)); + GEM_BUG_ON(ce->timeline->hwsp_ggtt != engine->status_page.vma); + + /* Already inside the kernel context, safe to power down. */ + if (engine->wakeref_serial == engine->serial) + return true; + + /* + * Note, we do this without taking the timeline->mutex. We cannot + * as we may be called while retiring the kernel context and so + * already underneath the timeline->mutex. Instead we rely on the + * exclusive property of the __engine_park that prevents anyone + * else from creating a request on this engine. This also requires + * that the ring is empty and we avoid any waits while constructing + * the context, as they assume protection by the timeline->mutex. + * This should hold true as we can only park the engine after + * retiring the last request, thus all rings should be empty and + * all timelines idle. + * + * For unlocking, there are 2 other parties and the GPU who have a + * stake here. + * + * A new gpu user will be waiting on the engine-pm to start their + * engine_unpark. New waiters are predicated on engine->wakeref.count + * and so intel_wakeref_defer_park() acts like a mutex_unlock of the + * engine->wakeref. + * + * The other party is intel_gt_retire_requests(), which is walking the + * list of active timelines looking for completions. Meanwhile as soon + * as we call __i915_request_queue(), the GPU may complete our request. + * Ergo, if we put ourselves on the timelines.active_list + * (se intel_timeline_enter()) before we increment the + * engine->wakeref.count, we may see the request completion and retire + * it causing an underflow of the engine->wakeref. + */ + flags = __timeline_mark_lock(ce); + GEM_BUG_ON(atomic_read(&ce->timeline->active_count) < 0); + + rq = __i915_request_create(ce, GFP_NOWAIT); + if (IS_ERR(rq)) + /* Context switch failed, hope for the best! Maybe reset? */ + goto out_unlock; + + /* Check again on the next retirement. */ + engine->wakeref_serial = engine->serial + 1; + i915_request_add_active_barriers(rq); + + /* Install ourselves as a preemption barrier */ + rq->sched.attr.priority = I915_PRIORITY_BARRIER; + if (likely(!__i915_request_commit(rq))) { /* engine should be idle! */ + /* + * Use an interrupt for precise measurement of duration, + * otherwise we rely on someone else retiring all the requests + * which may delay the signaling (i.e. we will likely wait + * until the background request retirement running every + * second or two). + */ + BUILD_BUG_ON(sizeof(rq->duration) > sizeof(rq->submitq)); + dma_fence_add_callback(&rq->fence, &rq->duration.cb, duration); + rq->duration.emitted = ktime_get(); + } + + /* Expose ourselves to the world */ + __queue_and_release_pm(rq, ce->timeline, engine); + + result = false; +out_unlock: + __timeline_mark_unlock(ce, flags); + return result; +} + +static void call_idle_barriers(struct intel_engine_cs *engine) +{ + struct llist_node *node, *next; + + llist_for_each_safe(node, next, llist_del_all(&engine->barrier_tasks)) { + struct dma_fence_cb *cb = + container_of((struct list_head *)node, + typeof(*cb), node); + + cb->func(ERR_PTR(-EAGAIN), cb); + } +} + +static int __engine_park(struct intel_wakeref *wf) +{ + struct intel_engine_cs *engine = + container_of(wf, typeof(*engine), wakeref); + + engine->saturated = 0; + + /* + * If one and only one request is completed between pm events, + * we know that we are inside the kernel context and it is + * safe to power down. (We are paranoid in case that runtime + * suspend causes corruption to the active context image, and + * want to avoid that impacting userspace.) + */ + if (!switch_to_kernel_context(engine)) + return -EBUSY; + + ENGINE_TRACE(engine, "parked\n"); + + call_idle_barriers(engine); /* cleanup after wedging */ + + intel_engine_park_heartbeat(engine); + intel_breadcrumbs_park(engine->breadcrumbs); + + /* Must be reset upon idling, or we may miss the busy wakeup. */ + GEM_BUG_ON(engine->execlists.queue_priority_hint != INT_MIN); + + if (engine->park) + engine->park(engine); + + engine->execlists.no_priolist = false; + + /* While gt calls i915_vma_parked(), we have to break the lock cycle */ + intel_gt_pm_put_async(engine->gt); + return 0; +} + +static const struct intel_wakeref_ops wf_ops = { + .get = __engine_unpark, + .put = __engine_park, +}; + +void intel_engine_init__pm(struct intel_engine_cs *engine) +{ + struct intel_runtime_pm *rpm = engine->uncore->rpm; + + intel_wakeref_init(&engine->wakeref, rpm, &wf_ops); + intel_engine_init_heartbeat(engine); +} + +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) +#include "selftest_engine_pm.c" +#endif diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.h b/drivers/gpu/drm/i915/gt/intel_engine_pm.h new file mode 100644 index 000000000..418df0a13 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.h @@ -0,0 +1,73 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#ifndef INTEL_ENGINE_PM_H +#define INTEL_ENGINE_PM_H + +#include "i915_request.h" +#include "intel_engine_types.h" +#include "intel_wakeref.h" + +static inline bool +intel_engine_pm_is_awake(const struct intel_engine_cs *engine) +{ + return intel_wakeref_is_active(&engine->wakeref); +} + +static inline void intel_engine_pm_get(struct intel_engine_cs *engine) +{ + intel_wakeref_get(&engine->wakeref); +} + +static inline bool intel_engine_pm_get_if_awake(struct intel_engine_cs *engine) +{ + return intel_wakeref_get_if_active(&engine->wakeref); +} + +static inline void intel_engine_pm_put(struct intel_engine_cs *engine) +{ + intel_wakeref_put(&engine->wakeref); +} + +static inline void intel_engine_pm_put_async(struct intel_engine_cs *engine) +{ + intel_wakeref_put_async(&engine->wakeref); +} + +static inline void intel_engine_pm_put_delay(struct intel_engine_cs *engine, + unsigned long delay) +{ + intel_wakeref_put_delay(&engine->wakeref, delay); +} + +static inline void intel_engine_pm_flush(struct intel_engine_cs *engine) +{ + intel_wakeref_unlock_wait(&engine->wakeref); +} + +static inline struct i915_request * +intel_engine_create_kernel_request(struct intel_engine_cs *engine) +{ + struct i915_request *rq; + + /* + * The engine->kernel_context is special as it is used inside + * the engine-pm barrier (see __engine_park()), circumventing + * the usual mutexes and relying on the engine-pm barrier + * instead. So whenever we use the engine->kernel_context + * outside of the barrier, we must manually handle the + * engine wakeref to serialise with the use inside. + */ + intel_engine_pm_get(engine); + rq = i915_request_create(engine->kernel_context); + intel_engine_pm_put(engine); + + return rq; +} + +void intel_engine_init__pm(struct intel_engine_cs *engine); + +#endif /* INTEL_ENGINE_PM_H */ diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h new file mode 100644 index 000000000..ee6312601 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h @@ -0,0 +1,635 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#ifndef __INTEL_ENGINE_TYPES__ +#define __INTEL_ENGINE_TYPES__ + +#include <linux/average.h> +#include <linux/hashtable.h> +#include <linux/irq_work.h> +#include <linux/kref.h> +#include <linux/list.h> +#include <linux/llist.h> +#include <linux/rbtree.h> +#include <linux/timer.h> +#include <linux/types.h> +#include <linux/workqueue.h> + +#include "i915_gem.h" +#include "i915_pmu.h" +#include "i915_priolist_types.h" +#include "i915_selftest.h" +#include "intel_breadcrumbs_types.h" +#include "intel_sseu.h" +#include "intel_timeline_types.h" +#include "intel_uncore.h" +#include "intel_wakeref.h" +#include "intel_workarounds_types.h" + +/* Legacy HW Engine ID */ + +#define RCS0_HW 0 +#define VCS0_HW 1 +#define BCS0_HW 2 +#define VECS0_HW 3 +#define VCS1_HW 4 +#define VCS2_HW 6 +#define VCS3_HW 7 +#define VECS1_HW 12 + +/* Gen11+ HW Engine class + instance */ +#define RENDER_CLASS 0 +#define VIDEO_DECODE_CLASS 1 +#define VIDEO_ENHANCEMENT_CLASS 2 +#define COPY_ENGINE_CLASS 3 +#define OTHER_CLASS 4 +#define MAX_ENGINE_CLASS 4 +#define MAX_ENGINE_INSTANCE 3 + +#define I915_MAX_SLICES 3 +#define I915_MAX_SUBSLICES 8 + +#define I915_CMD_HASH_ORDER 9 + +struct dma_fence; +struct drm_i915_gem_object; +struct drm_i915_reg_table; +struct i915_gem_context; +struct i915_request; +struct i915_sched_attr; +struct intel_gt; +struct intel_ring; +struct intel_uncore; + +typedef u8 intel_engine_mask_t; +#define ALL_ENGINES ((intel_engine_mask_t)~0ul) + +struct intel_hw_status_page { + struct i915_vma *vma; + u32 *addr; +}; + +struct intel_instdone { + u32 instdone; + /* The following exist only in the RCS engine */ + u32 slice_common; + u32 slice_common_extra[2]; + u32 sampler[I915_MAX_SLICES][I915_MAX_SUBSLICES]; + u32 row[I915_MAX_SLICES][I915_MAX_SUBSLICES]; +}; + +/* + * we use a single page to load ctx workarounds so all of these + * values are referred in terms of dwords + * + * struct i915_wa_ctx_bb: + * offset: specifies batch starting position, also helpful in case + * if we want to have multiple batches at different offsets based on + * some criteria. It is not a requirement at the moment but provides + * an option for future use. + * size: size of the batch in DWORDS + */ +struct i915_ctx_workarounds { + struct i915_wa_ctx_bb { + u32 offset; + u32 size; + } indirect_ctx, per_ctx; + struct i915_vma *vma; +}; + +#define I915_MAX_VCS 4 +#define I915_MAX_VECS 2 + +/* + * Engine IDs definitions. + * Keep instances of the same type engine together. + */ +enum intel_engine_id { + RCS0 = 0, + BCS0, + VCS0, + VCS1, + VCS2, + VCS3, +#define _VCS(n) (VCS0 + (n)) + VECS0, + VECS1, +#define _VECS(n) (VECS0 + (n)) + I915_NUM_ENGINES +#define INVALID_ENGINE ((enum intel_engine_id)-1) +}; + +/* A simple estimator for the round-trip latency of an engine */ +DECLARE_EWMA(_engine_latency, 6, 4) + +struct st_preempt_hang { + struct completion completion; + unsigned int count; +}; + +/** + * struct intel_engine_execlists - execlist submission queue and port state + * + * The struct intel_engine_execlists represents the combined logical state of + * driver and the hardware state for execlist mode of submission. + */ +struct intel_engine_execlists { + /** + * @tasklet: softirq tasklet for bottom handler + */ + struct tasklet_struct tasklet; + + /** + * @timer: kick the current context if its timeslice expires + */ + struct timer_list timer; + + /** + * @preempt: reset the current context if it fails to give way + */ + struct timer_list preempt; + + /** + * @default_priolist: priority list for I915_PRIORITY_NORMAL + */ + struct i915_priolist default_priolist; + + /** + * @ccid: identifier for contexts submitted to this engine + */ + u32 ccid; + + /** + * @yield: CCID at the time of the last semaphore-wait interrupt. + * + * Instead of leaving a semaphore busy-spinning on an engine, we would + * like to switch to another ready context, i.e. yielding the semaphore + * timeslice. + */ + u32 yield; + + /** + * @error_interrupt: CS Master EIR + * + * The CS generates an interrupt when it detects an error. We capture + * the first error interrupt, record the EIR and schedule the tasklet. + * In the tasklet, we process the pending CS events to ensure we have + * the guilty request, and then reset the engine. + * + * Low 16b are used by HW, with the upper 16b used as the enabling mask. + * Reserve the upper 16b for tracking internal errors. + */ + u32 error_interrupt; +#define ERROR_CSB BIT(31) + + /** + * @reset_ccid: Active CCID [EXECLISTS_STATUS_HI] at the time of reset + */ + u32 reset_ccid; + + /** + * @no_priolist: priority lists disabled + */ + bool no_priolist; + + /** + * @submit_reg: gen-specific execlist submission register + * set to the ExecList Submission Port (elsp) register pre-Gen11 and to + * the ExecList Submission Queue Contents register array for Gen11+ + */ + u32 __iomem *submit_reg; + + /** + * @ctrl_reg: the enhanced execlists control register, used to load the + * submit queue on the HW and to request preemptions to idle + */ + u32 __iomem *ctrl_reg; + +#define EXECLIST_MAX_PORTS 2 + /** + * @active: the currently known context executing on HW + */ + struct i915_request * const *active; + /** + * @inflight: the set of contexts submitted and acknowleged by HW + * + * The set of inflight contexts is managed by reading CS events + * from the HW. On a context-switch event (not preemption), we + * know the HW has transitioned from port0 to port1, and we + * advance our inflight/active tracking accordingly. + */ + struct i915_request *inflight[EXECLIST_MAX_PORTS + 1 /* sentinel */]; + /** + * @pending: the next set of contexts submitted to ELSP + * + * We store the array of contexts that we submit to HW (via ELSP) and + * promote them to the inflight array once HW has signaled the + * preemption or idle-to-active event. + */ + struct i915_request *pending[EXECLIST_MAX_PORTS + 1]; + + /** + * @port_mask: number of execlist ports - 1 + */ + unsigned int port_mask; + + /** + * @switch_priority_hint: Second context priority. + * + * We submit multiple contexts to the HW simultaneously and would + * like to occasionally switch between them to emulate timeslicing. + * To know when timeslicing is suitable, we track the priority of + * the context submitted second. + */ + int switch_priority_hint; + + /** + * @queue_priority_hint: Highest pending priority. + * + * When we add requests into the queue, or adjust the priority of + * executing requests, we compute the maximum priority of those + * pending requests. We can then use this value to determine if + * we need to preempt the executing requests to service the queue. + * However, since the we may have recorded the priority of an inflight + * request we wanted to preempt but since completed, at the time of + * dequeuing the priority hint may no longer may match the highest + * available request priority. + */ + int queue_priority_hint; + + /** + * @queue: queue of requests, in priority lists + */ + struct rb_root_cached queue; + struct rb_root_cached virtual; + + /** + * @csb_write: control register for Context Switch buffer + * + * Note this register may be either mmio or HWSP shadow. + */ + u32 *csb_write; + + /** + * @csb_status: status array for Context Switch buffer + * + * Note these register may be either mmio or HWSP shadow. + */ + u64 *csb_status; + + /** + * @csb_size: context status buffer FIFO size + */ + u8 csb_size; + + /** + * @csb_head: context status buffer head + */ + u8 csb_head; + + I915_SELFTEST_DECLARE(struct st_preempt_hang preempt_hang;) +}; + +#define INTEL_ENGINE_CS_MAX_NAME 8 + +struct intel_engine_cs { + struct drm_i915_private *i915; + struct intel_gt *gt; + struct intel_uncore *uncore; + char name[INTEL_ENGINE_CS_MAX_NAME]; + + enum intel_engine_id id; + enum intel_engine_id legacy_idx; + + unsigned int hw_id; + unsigned int guc_id; + + intel_engine_mask_t mask; + + u8 class; + u8 instance; + + u16 uabi_class; + u16 uabi_instance; + + u32 uabi_capabilities; + u32 context_size; + u32 mmio_base; + + /* + * Some w/a require forcewake to be held (which prevents RC6) while + * a particular engine is active. If so, we set fw_domain to which + * domains need to be held for the duration of request activity, + * and 0 if none. We try to limit the duration of the hold as much + * as possible. + */ + enum forcewake_domains fw_domain; + atomic_t fw_active; + + unsigned long context_tag; + + struct rb_node uabi_node; + + struct intel_sseu sseu; + + struct { + spinlock_t lock; + struct list_head requests; + struct list_head hold; /* ready requests, but on hold */ + } active; + + /* keep a request in reserve for a [pm] barrier under oom */ + struct i915_request *request_pool; + + struct llist_head barrier_tasks; + + struct intel_context *kernel_context; /* pinned */ + + intel_engine_mask_t saturated; /* submitting semaphores too late? */ + + struct { + struct delayed_work work; + struct i915_request *systole; + unsigned long blocked; + } heartbeat; + + unsigned long serial; + + unsigned long wakeref_serial; + struct intel_wakeref wakeref; + struct file *default_state; + + struct { + struct intel_ring *ring; + struct intel_timeline *timeline; + } legacy; + + /* + * We track the average duration of the idle pulse on parking the + * engine to keep an estimate of the how the fast the engine is + * under ideal conditions. + */ + struct ewma__engine_latency latency; + + /* Keep track of all the seqno used, a trail of breadcrumbs */ + struct intel_breadcrumbs *breadcrumbs; + + struct intel_engine_pmu { + /** + * @enable: Bitmask of enable sample events on this engine. + * + * Bits correspond to sample event types, for instance + * I915_SAMPLE_QUEUED is bit 0 etc. + */ + u32 enable; + /** + * @enable_count: Reference count for the enabled samplers. + * + * Index number corresponds to @enum drm_i915_pmu_engine_sample. + */ + unsigned int enable_count[I915_ENGINE_SAMPLE_COUNT]; + /** + * @sample: Counter values for sampling events. + * + * Our internal timer stores the current counters in this field. + * + * Index number corresponds to @enum drm_i915_pmu_engine_sample. + */ + struct i915_pmu_sample sample[I915_ENGINE_SAMPLE_COUNT]; + } pmu; + + struct intel_hw_status_page status_page; + struct i915_ctx_workarounds wa_ctx; + struct i915_wa_list ctx_wa_list; + struct i915_wa_list wa_list; + struct i915_wa_list whitelist; + + u32 irq_keep_mask; /* always keep these interrupts */ + u32 irq_enable_mask; /* bitmask to enable ring interrupt */ + void (*irq_enable)(struct intel_engine_cs *engine); + void (*irq_disable)(struct intel_engine_cs *engine); + + void (*sanitize)(struct intel_engine_cs *engine); + int (*resume)(struct intel_engine_cs *engine); + + struct { + void (*prepare)(struct intel_engine_cs *engine); + + void (*rewind)(struct intel_engine_cs *engine, bool stalled); + void (*cancel)(struct intel_engine_cs *engine); + + void (*finish)(struct intel_engine_cs *engine); + } reset; + + void (*park)(struct intel_engine_cs *engine); + void (*unpark)(struct intel_engine_cs *engine); + + void (*set_default_submission)(struct intel_engine_cs *engine); + + const struct intel_context_ops *cops; + + int (*request_alloc)(struct i915_request *rq); + + int (*emit_flush)(struct i915_request *request, u32 mode); +#define EMIT_INVALIDATE BIT(0) +#define EMIT_FLUSH BIT(1) +#define EMIT_BARRIER (EMIT_INVALIDATE | EMIT_FLUSH) + int (*emit_bb_start)(struct i915_request *rq, + u64 offset, u32 length, + unsigned int dispatch_flags); +#define I915_DISPATCH_SECURE BIT(0) +#define I915_DISPATCH_PINNED BIT(1) + int (*emit_init_breadcrumb)(struct i915_request *rq); + u32 *(*emit_fini_breadcrumb)(struct i915_request *rq, + u32 *cs); + unsigned int emit_fini_breadcrumb_dw; + + /* Pass the request to the hardware queue (e.g. directly into + * the legacy ringbuffer or to the end of an execlist). + * + * This is called from an atomic context with irqs disabled; must + * be irq safe. + */ + void (*submit_request)(struct i915_request *rq); + + /* + * Called on signaling of a SUBMIT_FENCE, passing along the signaling + * request down to the bonded pairs. + */ + void (*bond_execute)(struct i915_request *rq, + struct dma_fence *signal); + + /* + * Call when the priority on a request has changed and it and its + * dependencies may need rescheduling. Note the request itself may + * not be ready to run! + */ + void (*schedule)(struct i915_request *request, + const struct i915_sched_attr *attr); + + void (*release)(struct intel_engine_cs *engine); + + struct intel_engine_execlists execlists; + + /* + * Keep track of completed timelines on this engine for early + * retirement with the goal of quickly enabling powersaving as + * soon as the engine is idle. + */ + struct intel_timeline *retire; + struct work_struct retire_work; + + /* status_notifier: list of callbacks for context-switch changes */ + struct atomic_notifier_head context_status_notifier; + +#define I915_ENGINE_USING_CMD_PARSER BIT(0) +#define I915_ENGINE_SUPPORTS_STATS BIT(1) +#define I915_ENGINE_HAS_PREEMPTION BIT(2) +#define I915_ENGINE_HAS_SEMAPHORES BIT(3) +#define I915_ENGINE_HAS_TIMESLICES BIT(4) +#define I915_ENGINE_NEEDS_BREADCRUMB_TASKLET BIT(5) +#define I915_ENGINE_IS_VIRTUAL BIT(6) +#define I915_ENGINE_HAS_RELATIVE_MMIO BIT(7) +#define I915_ENGINE_REQUIRES_CMD_PARSER BIT(8) + unsigned int flags; + + /* + * Table of commands the command parser needs to know about + * for this engine. + */ + DECLARE_HASHTABLE(cmd_hash, I915_CMD_HASH_ORDER); + + /* + * Table of registers allowed in commands that read/write registers. + */ + const struct drm_i915_reg_table *reg_tables; + int reg_table_count; + + /* + * Returns the bitmask for the length field of the specified command. + * Return 0 for an unrecognized/invalid command. + * + * If the command parser finds an entry for a command in the engine's + * cmd_tables, it gets the command's length based on the table entry. + * If not, it calls this function to determine the per-engine length + * field encoding for the command (i.e. different opcode ranges use + * certain bits to encode the command length in the header). + */ + u32 (*get_cmd_length_mask)(u32 cmd_header); + + struct { + /** + * @active: Number of contexts currently scheduled in. + */ + atomic_t active; + + /** + * @lock: Lock protecting the below fields. + */ + seqlock_t lock; + + /** + * @total: Total time this engine was busy. + * + * Accumulated time not counting the most recent block in cases + * where engine is currently busy (active > 0). + */ + ktime_t total; + + /** + * @start: Timestamp of the last idle to active transition. + * + * Idle is defined as active == 0, active is active > 0. + */ + ktime_t start; + + /** + * @rps: Utilisation at last RPS sampling. + */ + ktime_t rps; + } stats; + + struct { + unsigned long heartbeat_interval_ms; + unsigned long max_busywait_duration_ns; + unsigned long preempt_timeout_ms; + unsigned long stop_timeout_ms; + unsigned long timeslice_duration_ms; + } props, defaults; +}; + +static inline bool +intel_engine_using_cmd_parser(const struct intel_engine_cs *engine) +{ + return engine->flags & I915_ENGINE_USING_CMD_PARSER; +} + +static inline bool +intel_engine_requires_cmd_parser(const struct intel_engine_cs *engine) +{ + return engine->flags & I915_ENGINE_REQUIRES_CMD_PARSER; +} + +static inline bool +intel_engine_supports_stats(const struct intel_engine_cs *engine) +{ + return engine->flags & I915_ENGINE_SUPPORTS_STATS; +} + +static inline bool +intel_engine_has_preemption(const struct intel_engine_cs *engine) +{ + return engine->flags & I915_ENGINE_HAS_PREEMPTION; +} + +static inline bool +intel_engine_has_semaphores(const struct intel_engine_cs *engine) +{ + return engine->flags & I915_ENGINE_HAS_SEMAPHORES; +} + +static inline bool +intel_engine_has_timeslices(const struct intel_engine_cs *engine) +{ + if (!IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION)) + return false; + + return engine->flags & I915_ENGINE_HAS_TIMESLICES; +} + +static inline bool +intel_engine_needs_breadcrumb_tasklet(const struct intel_engine_cs *engine) +{ + return engine->flags & I915_ENGINE_NEEDS_BREADCRUMB_TASKLET; +} + +static inline bool +intel_engine_is_virtual(const struct intel_engine_cs *engine) +{ + return engine->flags & I915_ENGINE_IS_VIRTUAL; +} + +static inline bool +intel_engine_has_relative_mmio(const struct intel_engine_cs * const engine) +{ + return engine->flags & I915_ENGINE_HAS_RELATIVE_MMIO; +} + +#define instdone_has_slice(dev_priv___, sseu___, slice___) \ + ((IS_GEN(dev_priv___, 7) ? 1 : ((sseu___)->slice_mask)) & BIT(slice___)) + +#define instdone_has_subslice(dev_priv__, sseu__, slice__, subslice__) \ + (IS_GEN(dev_priv__, 7) ? (1 & BIT(subslice__)) : \ + intel_sseu_has_subslice(sseu__, 0, subslice__)) + +#define for_each_instdone_slice_subslice(dev_priv_, sseu_, slice_, subslice_) \ + for ((slice_) = 0, (subslice_) = 0; (slice_) < I915_MAX_SLICES; \ + (subslice_) = ((subslice_) + 1) % I915_MAX_SUBSLICES, \ + (slice_) += ((subslice_) == 0)) \ + for_each_if((instdone_has_slice(dev_priv_, sseu_, slice_)) && \ + (instdone_has_subslice(dev_priv_, sseu_, slice_, \ + subslice_))) +#endif /* __INTEL_ENGINE_TYPES_H__ */ diff --git a/drivers/gpu/drm/i915/gt/intel_engine_user.c b/drivers/gpu/drm/i915/gt/intel_engine_user.c new file mode 100644 index 000000000..da21d2a10 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_engine_user.c @@ -0,0 +1,301 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#include <linux/list.h> +#include <linux/list_sort.h> +#include <linux/llist.h> + +#include "i915_drv.h" +#include "intel_engine.h" +#include "intel_engine_user.h" +#include "intel_gt.h" + +struct intel_engine_cs * +intel_engine_lookup_user(struct drm_i915_private *i915, u8 class, u8 instance) +{ + struct rb_node *p = i915->uabi_engines.rb_node; + + while (p) { + struct intel_engine_cs *it = + rb_entry(p, typeof(*it), uabi_node); + + if (class < it->uabi_class) + p = p->rb_left; + else if (class > it->uabi_class || + instance > it->uabi_instance) + p = p->rb_right; + else if (instance < it->uabi_instance) + p = p->rb_left; + else + return it; + } + + return NULL; +} + +void intel_engine_add_user(struct intel_engine_cs *engine) +{ + llist_add((struct llist_node *)&engine->uabi_node, + (struct llist_head *)&engine->i915->uabi_engines); +} + +static const u8 uabi_classes[] = { + [RENDER_CLASS] = I915_ENGINE_CLASS_RENDER, + [COPY_ENGINE_CLASS] = I915_ENGINE_CLASS_COPY, + [VIDEO_DECODE_CLASS] = I915_ENGINE_CLASS_VIDEO, + [VIDEO_ENHANCEMENT_CLASS] = I915_ENGINE_CLASS_VIDEO_ENHANCE, +}; + +static int engine_cmp(void *priv, const struct list_head *A, + const struct list_head *B) +{ + const struct intel_engine_cs *a = + container_of((struct rb_node *)A, typeof(*a), uabi_node); + const struct intel_engine_cs *b = + container_of((struct rb_node *)B, typeof(*b), uabi_node); + + if (uabi_classes[a->class] < uabi_classes[b->class]) + return -1; + if (uabi_classes[a->class] > uabi_classes[b->class]) + return 1; + + if (a->instance < b->instance) + return -1; + if (a->instance > b->instance) + return 1; + + return 0; +} + +static struct llist_node *get_engines(struct drm_i915_private *i915) +{ + return llist_del_all((struct llist_head *)&i915->uabi_engines); +} + +static void sort_engines(struct drm_i915_private *i915, + struct list_head *engines) +{ + struct llist_node *pos, *next; + + llist_for_each_safe(pos, next, get_engines(i915)) { + struct intel_engine_cs *engine = + container_of((struct rb_node *)pos, typeof(*engine), + uabi_node); + list_add((struct list_head *)&engine->uabi_node, engines); + } + list_sort(NULL, engines, engine_cmp); +} + +static void set_scheduler_caps(struct drm_i915_private *i915) +{ + static const struct { + u8 engine; + u8 sched; + } map[] = { +#define MAP(x, y) { ilog2(I915_ENGINE_##x), ilog2(I915_SCHEDULER_CAP_##y) } + MAP(HAS_PREEMPTION, PREEMPTION), + MAP(HAS_SEMAPHORES, SEMAPHORES), + MAP(SUPPORTS_STATS, ENGINE_BUSY_STATS), +#undef MAP + }; + struct intel_engine_cs *engine; + u32 enabled, disabled; + + enabled = 0; + disabled = 0; + for_each_uabi_engine(engine, i915) { /* all engines must agree! */ + int i; + + if (engine->schedule) + enabled |= (I915_SCHEDULER_CAP_ENABLED | + I915_SCHEDULER_CAP_PRIORITY); + else + disabled |= (I915_SCHEDULER_CAP_ENABLED | + I915_SCHEDULER_CAP_PRIORITY); + + for (i = 0; i < ARRAY_SIZE(map); i++) { + if (engine->flags & BIT(map[i].engine)) + enabled |= BIT(map[i].sched); + else + disabled |= BIT(map[i].sched); + } + } + + i915->caps.scheduler = enabled & ~disabled; + if (!(i915->caps.scheduler & I915_SCHEDULER_CAP_ENABLED)) + i915->caps.scheduler = 0; +} + +const char *intel_engine_class_repr(u8 class) +{ + static const char * const uabi_names[] = { + [RENDER_CLASS] = "rcs", + [COPY_ENGINE_CLASS] = "bcs", + [VIDEO_DECODE_CLASS] = "vcs", + [VIDEO_ENHANCEMENT_CLASS] = "vecs", + }; + + if (class >= ARRAY_SIZE(uabi_names) || !uabi_names[class]) + return "xxx"; + + return uabi_names[class]; +} + +struct legacy_ring { + struct intel_gt *gt; + u8 class; + u8 instance; +}; + +static int legacy_ring_idx(const struct legacy_ring *ring) +{ + static const struct { + u8 base, max; + } map[] = { + [RENDER_CLASS] = { RCS0, 1 }, + [COPY_ENGINE_CLASS] = { BCS0, 1 }, + [VIDEO_DECODE_CLASS] = { VCS0, I915_MAX_VCS }, + [VIDEO_ENHANCEMENT_CLASS] = { VECS0, I915_MAX_VECS }, + }; + + if (GEM_DEBUG_WARN_ON(ring->class >= ARRAY_SIZE(map))) + return INVALID_ENGINE; + + if (GEM_DEBUG_WARN_ON(ring->instance >= map[ring->class].max)) + return INVALID_ENGINE; + + return map[ring->class].base + ring->instance; +} + +static void add_legacy_ring(struct legacy_ring *ring, + struct intel_engine_cs *engine) +{ + if (engine->gt != ring->gt || engine->class != ring->class) { + ring->gt = engine->gt; + ring->class = engine->class; + ring->instance = 0; + } + + engine->legacy_idx = legacy_ring_idx(ring); + if (engine->legacy_idx != INVALID_ENGINE) + ring->instance++; +} + +void intel_engines_driver_register(struct drm_i915_private *i915) +{ + struct legacy_ring ring = {}; + u8 uabi_instances[4] = {}; + struct list_head *it, *next; + struct rb_node **p, *prev; + LIST_HEAD(engines); + + sort_engines(i915, &engines); + + prev = NULL; + p = &i915->uabi_engines.rb_node; + list_for_each_safe(it, next, &engines) { + struct intel_engine_cs *engine = + container_of((struct rb_node *)it, typeof(*engine), + uabi_node); + char old[sizeof(engine->name)]; + + if (intel_gt_has_unrecoverable_error(engine->gt)) + continue; /* ignore incomplete engines */ + + GEM_BUG_ON(engine->class >= ARRAY_SIZE(uabi_classes)); + engine->uabi_class = uabi_classes[engine->class]; + + GEM_BUG_ON(engine->uabi_class >= ARRAY_SIZE(uabi_instances)); + engine->uabi_instance = uabi_instances[engine->uabi_class]++; + + /* Replace the internal name with the final user facing name */ + memcpy(old, engine->name, sizeof(engine->name)); + scnprintf(engine->name, sizeof(engine->name), "%s%u", + intel_engine_class_repr(engine->class), + engine->uabi_instance); + DRM_DEBUG_DRIVER("renamed %s to %s\n", old, engine->name); + + rb_link_node(&engine->uabi_node, prev, p); + rb_insert_color(&engine->uabi_node, &i915->uabi_engines); + + GEM_BUG_ON(intel_engine_lookup_user(i915, + engine->uabi_class, + engine->uabi_instance) != engine); + + /* Fix up the mapping to match default execbuf::user_map[] */ + add_legacy_ring(&ring, engine); + + prev = &engine->uabi_node; + p = &prev->rb_right; + } + + if (IS_ENABLED(CONFIG_DRM_I915_SELFTESTS) && + IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) { + struct intel_engine_cs *engine; + unsigned int isolation; + int class, inst; + int errors = 0; + + for (class = 0; class < ARRAY_SIZE(uabi_instances); class++) { + for (inst = 0; inst < uabi_instances[class]; inst++) { + engine = intel_engine_lookup_user(i915, + class, inst); + if (!engine) { + pr_err("UABI engine not found for { class:%d, instance:%d }\n", + class, inst); + errors++; + continue; + } + + if (engine->uabi_class != class || + engine->uabi_instance != inst) { + pr_err("Wrong UABI engine:%s { class:%d, instance:%d } found for { class:%d, instance:%d }\n", + engine->name, + engine->uabi_class, + engine->uabi_instance, + class, inst); + errors++; + continue; + } + } + } + + /* + * Make sure that classes with multiple engine instances all + * share the same basic configuration. + */ + isolation = intel_engines_has_context_isolation(i915); + for_each_uabi_engine(engine, i915) { + unsigned int bit = BIT(engine->uabi_class); + unsigned int expected = engine->default_state ? bit : 0; + + if ((isolation & bit) != expected) { + pr_err("mismatching default context state for class %d on engine %s\n", + engine->uabi_class, engine->name); + errors++; + } + } + + if (drm_WARN(&i915->drm, errors, + "Invalid UABI engine mapping found")) + i915->uabi_engines = RB_ROOT; + } + + set_scheduler_caps(i915); +} + +unsigned int intel_engines_has_context_isolation(struct drm_i915_private *i915) +{ + struct intel_engine_cs *engine; + unsigned int which; + + which = 0; + for_each_uabi_engine(engine, i915) + if (engine->default_state) + which |= BIT(engine->uabi_class); + + return which; +} diff --git a/drivers/gpu/drm/i915/gt/intel_engine_user.h b/drivers/gpu/drm/i915/gt/intel_engine_user.h new file mode 100644 index 000000000..f845ea1cb --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_engine_user.h @@ -0,0 +1,25 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#ifndef INTEL_ENGINE_USER_H +#define INTEL_ENGINE_USER_H + +#include <linux/types.h> + +struct drm_i915_private; +struct intel_engine_cs; + +struct intel_engine_cs * +intel_engine_lookup_user(struct drm_i915_private *i915, u8 class, u8 instance); + +unsigned int intel_engines_has_context_isolation(struct drm_i915_private *i915); + +void intel_engine_add_user(struct intel_engine_cs *engine); +void intel_engines_driver_register(struct drm_i915_private *i915); + +const char *intel_engine_class_repr(u8 class); + +#endif /* INTEL_ENGINE_USER_H */ diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c b/drivers/gpu/drm/i915/gt/intel_ggtt.c new file mode 100644 index 000000000..060f826b1 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c @@ -0,0 +1,1489 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2020 Intel Corporation + */ + +#include <linux/stop_machine.h> + +#include <asm/set_memory.h> +#include <asm/smp.h> + +#include <drm/i915_drm.h> + +#include "intel_gt.h" +#include "i915_drv.h" +#include "i915_scatterlist.h" +#include "i915_vgpu.h" + +#include "intel_gtt.h" + +static int +i915_get_ggtt_vma_pages(struct i915_vma *vma); + +static void i915_ggtt_color_adjust(const struct drm_mm_node *node, + unsigned long color, + u64 *start, + u64 *end) +{ + if (i915_node_color_differs(node, color)) + *start += I915_GTT_PAGE_SIZE; + + /* + * Also leave a space between the unallocated reserved node after the + * GTT and any objects within the GTT, i.e. we use the color adjustment + * to insert a guard page to prevent prefetches crossing over the + * GTT boundary. + */ + node = list_next_entry(node, node_list); + if (node->color != color) + *end -= I915_GTT_PAGE_SIZE; +} + +static int ggtt_init_hw(struct i915_ggtt *ggtt) +{ + struct drm_i915_private *i915 = ggtt->vm.i915; + + i915_address_space_init(&ggtt->vm, VM_CLASS_GGTT); + + ggtt->vm.is_ggtt = true; + + /* Only VLV supports read-only GGTT mappings */ + ggtt->vm.has_read_only = IS_VALLEYVIEW(i915); + + if (!HAS_LLC(i915) && !HAS_PPGTT(i915)) + ggtt->vm.mm.color_adjust = i915_ggtt_color_adjust; + + if (ggtt->mappable_end) { + if (!io_mapping_init_wc(&ggtt->iomap, + ggtt->gmadr.start, + ggtt->mappable_end)) { + ggtt->vm.cleanup(&ggtt->vm); + return -EIO; + } + + ggtt->mtrr = arch_phys_wc_add(ggtt->gmadr.start, + ggtt->mappable_end); + } + + intel_ggtt_init_fences(ggtt); + + return 0; +} + +/** + * i915_ggtt_init_hw - Initialize GGTT hardware + * @i915: i915 device + */ +int i915_ggtt_init_hw(struct drm_i915_private *i915) +{ + int ret; + + /* + * Note that we use page colouring to enforce a guard page at the + * end of the address space. This is required as the CS may prefetch + * beyond the end of the batch buffer, across the page boundary, + * and beyond the end of the GTT if we do not provide a guard. + */ + ret = ggtt_init_hw(&i915->ggtt); + if (ret) + return ret; + + return 0; +} + +/* + * Certain Gen5 chipsets require require idling the GPU before + * unmapping anything from the GTT when VT-d is enabled. + */ +static bool needs_idle_maps(struct drm_i915_private *i915) +{ + /* + * Query intel_iommu to see if we need the workaround. Presumably that + * was loaded first. + */ + return IS_GEN(i915, 5) && IS_MOBILE(i915) && intel_vtd_active(); +} + +void i915_ggtt_suspend(struct i915_ggtt *ggtt) +{ + struct i915_vma *vma, *vn; + int open; + + mutex_lock(&ggtt->vm.mutex); + + /* Skip rewriting PTE on VMA unbind. */ + open = atomic_xchg(&ggtt->vm.open, 0); + + list_for_each_entry_safe(vma, vn, &ggtt->vm.bound_list, vm_link) { + GEM_BUG_ON(!drm_mm_node_allocated(&vma->node)); + i915_vma_wait_for_bind(vma); + + if (i915_vma_is_pinned(vma)) + continue; + + if (!i915_vma_is_bound(vma, I915_VMA_GLOBAL_BIND)) { + __i915_vma_evict(vma); + drm_mm_remove_node(&vma->node); + } + } + + ggtt->vm.clear_range(&ggtt->vm, 0, ggtt->vm.total); + ggtt->invalidate(ggtt); + atomic_set(&ggtt->vm.open, open); + + mutex_unlock(&ggtt->vm.mutex); + + intel_gt_check_and_clear_faults(ggtt->vm.gt); +} + +void gen6_ggtt_invalidate(struct i915_ggtt *ggtt) +{ + struct intel_uncore *uncore = ggtt->vm.gt->uncore; + + spin_lock_irq(&uncore->lock); + intel_uncore_write_fw(uncore, GFX_FLSH_CNTL_GEN6, GFX_FLSH_CNTL_EN); + intel_uncore_read_fw(uncore, GFX_FLSH_CNTL_GEN6); + spin_unlock_irq(&uncore->lock); +} + +static void gen8_ggtt_invalidate(struct i915_ggtt *ggtt) +{ + struct intel_uncore *uncore = ggtt->vm.gt->uncore; + + /* + * Note that as an uncached mmio write, this will flush the + * WCB of the writes into the GGTT before it triggers the invalidate. + */ + intel_uncore_write_fw(uncore, GFX_FLSH_CNTL_GEN6, GFX_FLSH_CNTL_EN); +} + +static void guc_ggtt_invalidate(struct i915_ggtt *ggtt) +{ + struct intel_uncore *uncore = ggtt->vm.gt->uncore; + struct drm_i915_private *i915 = ggtt->vm.i915; + + gen8_ggtt_invalidate(ggtt); + + if (INTEL_GEN(i915) >= 12) + intel_uncore_write_fw(uncore, GEN12_GUC_TLB_INV_CR, + GEN12_GUC_TLB_INV_CR_INVALIDATE); + else + intel_uncore_write_fw(uncore, GEN8_GTCR, GEN8_GTCR_INVALIDATE); +} + +static void gmch_ggtt_invalidate(struct i915_ggtt *ggtt) +{ + intel_gtt_chipset_flush(); +} + +static u64 gen8_ggtt_pte_encode(dma_addr_t addr, + enum i915_cache_level level, + u32 flags) +{ + return addr | _PAGE_PRESENT; +} + +static void gen8_set_pte(void __iomem *addr, gen8_pte_t pte) +{ + writeq(pte, addr); +} + +static void gen8_ggtt_insert_page(struct i915_address_space *vm, + dma_addr_t addr, + u64 offset, + enum i915_cache_level level, + u32 unused) +{ + struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm); + gen8_pte_t __iomem *pte = + (gen8_pte_t __iomem *)ggtt->gsm + offset / I915_GTT_PAGE_SIZE; + + gen8_set_pte(pte, gen8_ggtt_pte_encode(addr, level, 0)); + + ggtt->invalidate(ggtt); +} + +static void gen8_ggtt_insert_entries(struct i915_address_space *vm, + struct i915_vma *vma, + enum i915_cache_level level, + u32 flags) +{ + const gen8_pte_t pte_encode = gen8_ggtt_pte_encode(0, level, 0); + struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm); + gen8_pte_t __iomem *gte; + gen8_pte_t __iomem *end; + struct sgt_iter iter; + dma_addr_t addr; + + /* + * Note that we ignore PTE_READ_ONLY here. The caller must be careful + * not to allow the user to override access to a read only page. + */ + + gte = (gen8_pte_t __iomem *)ggtt->gsm; + gte += vma->node.start / I915_GTT_PAGE_SIZE; + end = gte + vma->node.size / I915_GTT_PAGE_SIZE; + + for_each_sgt_daddr(addr, iter, vma->pages) + gen8_set_pte(gte++, pte_encode | addr); + GEM_BUG_ON(gte > end); + + /* Fill the allocated but "unused" space beyond the end of the buffer */ + while (gte < end) + gen8_set_pte(gte++, vm->scratch[0]->encode); + + /* + * We want to flush the TLBs only after we're certain all the PTE + * updates have finished. + */ + ggtt->invalidate(ggtt); +} + +static void gen6_ggtt_insert_page(struct i915_address_space *vm, + dma_addr_t addr, + u64 offset, + enum i915_cache_level level, + u32 flags) +{ + struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm); + gen6_pte_t __iomem *pte = + (gen6_pte_t __iomem *)ggtt->gsm + offset / I915_GTT_PAGE_SIZE; + + iowrite32(vm->pte_encode(addr, level, flags), pte); + + ggtt->invalidate(ggtt); +} + +/* + * Binds an object into the global gtt with the specified cache level. + * The object will be accessible to the GPU via commands whose operands + * reference offsets within the global GTT as well as accessible by the GPU + * through the GMADR mapped BAR (i915->mm.gtt->gtt). + */ +static void gen6_ggtt_insert_entries(struct i915_address_space *vm, + struct i915_vma *vma, + enum i915_cache_level level, + u32 flags) +{ + struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm); + gen6_pte_t __iomem *gte; + gen6_pte_t __iomem *end; + struct sgt_iter iter; + dma_addr_t addr; + + gte = (gen6_pte_t __iomem *)ggtt->gsm; + gte += vma->node.start / I915_GTT_PAGE_SIZE; + end = gte + vma->node.size / I915_GTT_PAGE_SIZE; + + for_each_sgt_daddr(addr, iter, vma->pages) + iowrite32(vm->pte_encode(addr, level, flags), gte++); + GEM_BUG_ON(gte > end); + + /* Fill the allocated but "unused" space beyond the end of the buffer */ + while (gte < end) + iowrite32(vm->scratch[0]->encode, gte++); + + /* + * We want to flush the TLBs only after we're certain all the PTE + * updates have finished. + */ + ggtt->invalidate(ggtt); +} + +static void nop_clear_range(struct i915_address_space *vm, + u64 start, u64 length) +{ +} + +static void gen8_ggtt_clear_range(struct i915_address_space *vm, + u64 start, u64 length) +{ + struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm); + unsigned int first_entry = start / I915_GTT_PAGE_SIZE; + unsigned int num_entries = length / I915_GTT_PAGE_SIZE; + const gen8_pte_t scratch_pte = vm->scratch[0]->encode; + gen8_pte_t __iomem *gtt_base = + (gen8_pte_t __iomem *)ggtt->gsm + first_entry; + const int max_entries = ggtt_total_entries(ggtt) - first_entry; + int i; + + if (WARN(num_entries > max_entries, + "First entry = %d; Num entries = %d (max=%d)\n", + first_entry, num_entries, max_entries)) + num_entries = max_entries; + + for (i = 0; i < num_entries; i++) + gen8_set_pte(>t_base[i], scratch_pte); +} + +static void bxt_vtd_ggtt_wa(struct i915_address_space *vm) +{ + /* + * Make sure the internal GAM fifo has been cleared of all GTT + * writes before exiting stop_machine(). This guarantees that + * any aperture accesses waiting to start in another process + * cannot back up behind the GTT writes causing a hang. + * The register can be any arbitrary GAM register. + */ + intel_uncore_posting_read_fw(vm->gt->uncore, GFX_FLSH_CNTL_GEN6); +} + +struct insert_page { + struct i915_address_space *vm; + dma_addr_t addr; + u64 offset; + enum i915_cache_level level; +}; + +static int bxt_vtd_ggtt_insert_page__cb(void *_arg) +{ + struct insert_page *arg = _arg; + + gen8_ggtt_insert_page(arg->vm, arg->addr, arg->offset, arg->level, 0); + bxt_vtd_ggtt_wa(arg->vm); + + return 0; +} + +static void bxt_vtd_ggtt_insert_page__BKL(struct i915_address_space *vm, + dma_addr_t addr, + u64 offset, + enum i915_cache_level level, + u32 unused) +{ + struct insert_page arg = { vm, addr, offset, level }; + + stop_machine(bxt_vtd_ggtt_insert_page__cb, &arg, NULL); +} + +struct insert_entries { + struct i915_address_space *vm; + struct i915_vma *vma; + enum i915_cache_level level; + u32 flags; +}; + +static int bxt_vtd_ggtt_insert_entries__cb(void *_arg) +{ + struct insert_entries *arg = _arg; + + gen8_ggtt_insert_entries(arg->vm, arg->vma, arg->level, arg->flags); + bxt_vtd_ggtt_wa(arg->vm); + + return 0; +} + +static void bxt_vtd_ggtt_insert_entries__BKL(struct i915_address_space *vm, + struct i915_vma *vma, + enum i915_cache_level level, + u32 flags) +{ + struct insert_entries arg = { vm, vma, level, flags }; + + stop_machine(bxt_vtd_ggtt_insert_entries__cb, &arg, NULL); +} + +static void gen6_ggtt_clear_range(struct i915_address_space *vm, + u64 start, u64 length) +{ + struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm); + unsigned int first_entry = start / I915_GTT_PAGE_SIZE; + unsigned int num_entries = length / I915_GTT_PAGE_SIZE; + gen6_pte_t scratch_pte, __iomem *gtt_base = + (gen6_pte_t __iomem *)ggtt->gsm + first_entry; + const int max_entries = ggtt_total_entries(ggtt) - first_entry; + int i; + + if (WARN(num_entries > max_entries, + "First entry = %d; Num entries = %d (max=%d)\n", + first_entry, num_entries, max_entries)) + num_entries = max_entries; + + scratch_pte = vm->scratch[0]->encode; + for (i = 0; i < num_entries; i++) + iowrite32(scratch_pte, >t_base[i]); +} + +static void i915_ggtt_insert_page(struct i915_address_space *vm, + dma_addr_t addr, + u64 offset, + enum i915_cache_level cache_level, + u32 unused) +{ + unsigned int flags = (cache_level == I915_CACHE_NONE) ? + AGP_USER_MEMORY : AGP_USER_CACHED_MEMORY; + + intel_gtt_insert_page(addr, offset >> PAGE_SHIFT, flags); +} + +static void i915_ggtt_insert_entries(struct i915_address_space *vm, + struct i915_vma *vma, + enum i915_cache_level cache_level, + u32 unused) +{ + unsigned int flags = (cache_level == I915_CACHE_NONE) ? + AGP_USER_MEMORY : AGP_USER_CACHED_MEMORY; + + intel_gtt_insert_sg_entries(vma->pages, vma->node.start >> PAGE_SHIFT, + flags); +} + +static void i915_ggtt_clear_range(struct i915_address_space *vm, + u64 start, u64 length) +{ + intel_gtt_clear_range(start >> PAGE_SHIFT, length >> PAGE_SHIFT); +} + +static void ggtt_bind_vma(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + struct i915_vma *vma, + enum i915_cache_level cache_level, + u32 flags) +{ + struct drm_i915_gem_object *obj = vma->obj; + u32 pte_flags; + + if (i915_vma_is_bound(vma, ~flags & I915_VMA_BIND_MASK)) + return; + + /* Applicable to VLV (gen8+ do not support RO in the GGTT) */ + pte_flags = 0; + if (i915_gem_object_is_readonly(obj)) + pte_flags |= PTE_READ_ONLY; + + vm->insert_entries(vm, vma, cache_level, pte_flags); + vma->page_sizes.gtt = I915_GTT_PAGE_SIZE; +} + +static void ggtt_unbind_vma(struct i915_address_space *vm, struct i915_vma *vma) +{ + vm->clear_range(vm, vma->node.start, vma->size); +} + +static int ggtt_reserve_guc_top(struct i915_ggtt *ggtt) +{ + u64 size; + int ret; + + if (!intel_uc_uses_guc(&ggtt->vm.gt->uc)) + return 0; + + GEM_BUG_ON(ggtt->vm.total <= GUC_GGTT_TOP); + size = ggtt->vm.total - GUC_GGTT_TOP; + + ret = i915_gem_gtt_reserve(&ggtt->vm, &ggtt->uc_fw, size, + GUC_GGTT_TOP, I915_COLOR_UNEVICTABLE, + PIN_NOEVICT); + if (ret) + drm_dbg(&ggtt->vm.i915->drm, + "Failed to reserve top of GGTT for GuC\n"); + + return ret; +} + +static void ggtt_release_guc_top(struct i915_ggtt *ggtt) +{ + if (drm_mm_node_allocated(&ggtt->uc_fw)) + drm_mm_remove_node(&ggtt->uc_fw); +} + +static void cleanup_init_ggtt(struct i915_ggtt *ggtt) +{ + ggtt_release_guc_top(ggtt); + if (drm_mm_node_allocated(&ggtt->error_capture)) + drm_mm_remove_node(&ggtt->error_capture); + mutex_destroy(&ggtt->error_mutex); +} + +static int init_ggtt(struct i915_ggtt *ggtt) +{ + /* + * Let GEM Manage all of the aperture. + * + * However, leave one page at the end still bound to the scratch page. + * There are a number of places where the hardware apparently prefetches + * past the end of the object, and we've seen multiple hangs with the + * GPU head pointer stuck in a batchbuffer bound at the last page of the + * aperture. One page should be enough to keep any prefetching inside + * of the aperture. + */ + unsigned long hole_start, hole_end; + struct drm_mm_node *entry; + int ret; + + /* + * GuC requires all resources that we're sharing with it to be placed in + * non-WOPCM memory. If GuC is not present or not in use we still need a + * small bias as ring wraparound at offset 0 sometimes hangs. No idea + * why. + */ + ggtt->pin_bias = max_t(u32, I915_GTT_PAGE_SIZE, + intel_wopcm_guc_size(&ggtt->vm.i915->wopcm)); + + ret = intel_vgt_balloon(ggtt); + if (ret) + return ret; + + mutex_init(&ggtt->error_mutex); + if (ggtt->mappable_end) { + /* + * Reserve a mappable slot for our lockless error capture. + * + * We strongly prefer taking address 0x0 in order to protect + * other critical buffers against accidental overwrites, + * as writing to address 0 is a very common mistake. + * + * Since 0 may already be in use by the system (e.g. the BIOS + * framebuffer), we let the reservation fail quietly and hope + * 0 remains reserved always. + * + * If we fail to reserve 0, and then fail to find any space + * for an error-capture, remain silent. We can afford not + * to reserve an error_capture node as we have fallback + * paths, and we trust that 0 will remain reserved. However, + * the only likely reason for failure to insert is a driver + * bug, which we expect to cause other failures... + */ + ggtt->error_capture.size = I915_GTT_PAGE_SIZE; + ggtt->error_capture.color = I915_COLOR_UNEVICTABLE; + if (drm_mm_reserve_node(&ggtt->vm.mm, &ggtt->error_capture)) + drm_mm_insert_node_in_range(&ggtt->vm.mm, + &ggtt->error_capture, + ggtt->error_capture.size, 0, + ggtt->error_capture.color, + 0, ggtt->mappable_end, + DRM_MM_INSERT_LOW); + } + if (drm_mm_node_allocated(&ggtt->error_capture)) + drm_dbg(&ggtt->vm.i915->drm, + "Reserved GGTT:[%llx, %llx] for use by error capture\n", + ggtt->error_capture.start, + ggtt->error_capture.start + ggtt->error_capture.size); + + /* + * The upper portion of the GuC address space has a sizeable hole + * (several MB) that is inaccessible by GuC. Reserve this range within + * GGTT as it can comfortably hold GuC/HuC firmware images. + */ + ret = ggtt_reserve_guc_top(ggtt); + if (ret) + goto err; + + /* Clear any non-preallocated blocks */ + drm_mm_for_each_hole(entry, &ggtt->vm.mm, hole_start, hole_end) { + drm_dbg(&ggtt->vm.i915->drm, + "clearing unused GTT space: [%lx, %lx]\n", + hole_start, hole_end); + ggtt->vm.clear_range(&ggtt->vm, hole_start, + hole_end - hole_start); + } + + /* And finally clear the reserved guard page */ + ggtt->vm.clear_range(&ggtt->vm, ggtt->vm.total - PAGE_SIZE, PAGE_SIZE); + + return 0; + +err: + cleanup_init_ggtt(ggtt); + return ret; +} + +static void aliasing_gtt_bind_vma(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + struct i915_vma *vma, + enum i915_cache_level cache_level, + u32 flags) +{ + u32 pte_flags; + + /* Currently applicable only to VLV */ + pte_flags = 0; + if (i915_gem_object_is_readonly(vma->obj)) + pte_flags |= PTE_READ_ONLY; + + if (flags & I915_VMA_LOCAL_BIND) + ppgtt_bind_vma(&i915_vm_to_ggtt(vm)->alias->vm, + stash, vma, cache_level, flags); + + if (flags & I915_VMA_GLOBAL_BIND) + vm->insert_entries(vm, vma, cache_level, pte_flags); +} + +static void aliasing_gtt_unbind_vma(struct i915_address_space *vm, + struct i915_vma *vma) +{ + if (i915_vma_is_bound(vma, I915_VMA_GLOBAL_BIND)) + vm->clear_range(vm, vma->node.start, vma->size); + + if (i915_vma_is_bound(vma, I915_VMA_LOCAL_BIND)) + ppgtt_unbind_vma(&i915_vm_to_ggtt(vm)->alias->vm, vma); +} + +static int init_aliasing_ppgtt(struct i915_ggtt *ggtt) +{ + struct i915_vm_pt_stash stash = {}; + struct i915_ppgtt *ppgtt; + int err; + + ppgtt = i915_ppgtt_create(ggtt->vm.gt); + if (IS_ERR(ppgtt)) + return PTR_ERR(ppgtt); + + if (GEM_WARN_ON(ppgtt->vm.total < ggtt->vm.total)) { + err = -ENODEV; + goto err_ppgtt; + } + + err = i915_vm_alloc_pt_stash(&ppgtt->vm, &stash, ggtt->vm.total); + if (err) + goto err_ppgtt; + + err = i915_vm_pin_pt_stash(&ppgtt->vm, &stash); + if (err) + goto err_stash; + + /* + * Note we only pre-allocate as far as the end of the global + * GTT. On 48b / 4-level page-tables, the difference is very, + * very significant! We have to preallocate as GVT/vgpu does + * not like the page directory disappearing. + */ + ppgtt->vm.allocate_va_range(&ppgtt->vm, &stash, 0, ggtt->vm.total); + + ggtt->alias = ppgtt; + ggtt->vm.bind_async_flags |= ppgtt->vm.bind_async_flags; + + GEM_BUG_ON(ggtt->vm.vma_ops.bind_vma != ggtt_bind_vma); + ggtt->vm.vma_ops.bind_vma = aliasing_gtt_bind_vma; + + GEM_BUG_ON(ggtt->vm.vma_ops.unbind_vma != ggtt_unbind_vma); + ggtt->vm.vma_ops.unbind_vma = aliasing_gtt_unbind_vma; + + i915_vm_free_pt_stash(&ppgtt->vm, &stash); + return 0; + +err_stash: + i915_vm_free_pt_stash(&ppgtt->vm, &stash); +err_ppgtt: + i915_vm_put(&ppgtt->vm); + return err; +} + +static void fini_aliasing_ppgtt(struct i915_ggtt *ggtt) +{ + struct i915_ppgtt *ppgtt; + + ppgtt = fetch_and_zero(&ggtt->alias); + if (!ppgtt) + return; + + i915_vm_put(&ppgtt->vm); + + ggtt->vm.vma_ops.bind_vma = ggtt_bind_vma; + ggtt->vm.vma_ops.unbind_vma = ggtt_unbind_vma; +} + +int i915_init_ggtt(struct drm_i915_private *i915) +{ + int ret; + + ret = init_ggtt(&i915->ggtt); + if (ret) + return ret; + + if (INTEL_PPGTT(i915) == INTEL_PPGTT_ALIASING) { + ret = init_aliasing_ppgtt(&i915->ggtt); + if (ret) + cleanup_init_ggtt(&i915->ggtt); + } + + return 0; +} + +static void ggtt_cleanup_hw(struct i915_ggtt *ggtt) +{ + struct i915_vma *vma, *vn; + + atomic_set(&ggtt->vm.open, 0); + + rcu_barrier(); /* flush the RCU'ed__i915_vm_release */ + flush_workqueue(ggtt->vm.i915->wq); + + mutex_lock(&ggtt->vm.mutex); + + list_for_each_entry_safe(vma, vn, &ggtt->vm.bound_list, vm_link) + WARN_ON(__i915_vma_unbind(vma)); + + if (drm_mm_node_allocated(&ggtt->error_capture)) + drm_mm_remove_node(&ggtt->error_capture); + mutex_destroy(&ggtt->error_mutex); + + ggtt_release_guc_top(ggtt); + intel_vgt_deballoon(ggtt); + + ggtt->vm.cleanup(&ggtt->vm); + + mutex_unlock(&ggtt->vm.mutex); + i915_address_space_fini(&ggtt->vm); + + arch_phys_wc_del(ggtt->mtrr); + + if (ggtt->iomap.size) + io_mapping_fini(&ggtt->iomap); +} + +/** + * i915_ggtt_driver_release - Clean up GGTT hardware initialization + * @i915: i915 device + */ +void i915_ggtt_driver_release(struct drm_i915_private *i915) +{ + struct i915_ggtt *ggtt = &i915->ggtt; + + fini_aliasing_ppgtt(ggtt); + + intel_ggtt_fini_fences(ggtt); + ggtt_cleanup_hw(ggtt); +} + +static unsigned int gen6_get_total_gtt_size(u16 snb_gmch_ctl) +{ + snb_gmch_ctl >>= SNB_GMCH_GGMS_SHIFT; + snb_gmch_ctl &= SNB_GMCH_GGMS_MASK; + return snb_gmch_ctl << 20; +} + +static unsigned int gen8_get_total_gtt_size(u16 bdw_gmch_ctl) +{ + bdw_gmch_ctl >>= BDW_GMCH_GGMS_SHIFT; + bdw_gmch_ctl &= BDW_GMCH_GGMS_MASK; + if (bdw_gmch_ctl) + bdw_gmch_ctl = 1 << bdw_gmch_ctl; + +#ifdef CONFIG_X86_32 + /* Limit 32b platforms to a 2GB GGTT: 4 << 20 / pte size * I915_GTT_PAGE_SIZE */ + if (bdw_gmch_ctl > 4) + bdw_gmch_ctl = 4; +#endif + + return bdw_gmch_ctl << 20; +} + +static unsigned int chv_get_total_gtt_size(u16 gmch_ctrl) +{ + gmch_ctrl >>= SNB_GMCH_GGMS_SHIFT; + gmch_ctrl &= SNB_GMCH_GGMS_MASK; + + if (gmch_ctrl) + return 1 << (20 + gmch_ctrl); + + return 0; +} + +static int ggtt_probe_common(struct i915_ggtt *ggtt, u64 size) +{ + struct drm_i915_private *i915 = ggtt->vm.i915; + struct pci_dev *pdev = i915->drm.pdev; + phys_addr_t phys_addr; + int ret; + + /* For Modern GENs the PTEs and register space are split in the BAR */ + phys_addr = pci_resource_start(pdev, 0) + pci_resource_len(pdev, 0) / 2; + + /* + * On BXT+/CNL+ writes larger than 64 bit to the GTT pagetable range + * will be dropped. For WC mappings in general we have 64 byte burst + * writes when the WC buffer is flushed, so we can't use it, but have to + * resort to an uncached mapping. The WC issue is easily caught by the + * readback check when writing GTT PTE entries. + */ + if (IS_GEN9_LP(i915) || INTEL_GEN(i915) >= 10) + ggtt->gsm = ioremap(phys_addr, size); + else + ggtt->gsm = ioremap_wc(phys_addr, size); + if (!ggtt->gsm) { + drm_err(&i915->drm, "Failed to map the ggtt page table\n"); + return -ENOMEM; + } + + ret = setup_scratch_page(&ggtt->vm); + if (ret) { + drm_err(&i915->drm, "Scratch setup failed\n"); + /* iounmap will also get called at remove, but meh */ + iounmap(ggtt->gsm); + return ret; + } + + ggtt->vm.scratch[0]->encode = + ggtt->vm.pte_encode(px_dma(ggtt->vm.scratch[0]), + I915_CACHE_NONE, 0); + + return 0; +} + +int ggtt_set_pages(struct i915_vma *vma) +{ + int ret; + + GEM_BUG_ON(vma->pages); + + ret = i915_get_ggtt_vma_pages(vma); + if (ret) + return ret; + + vma->page_sizes = vma->obj->mm.page_sizes; + + return 0; +} + +static void gen6_gmch_remove(struct i915_address_space *vm) +{ + struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm); + + iounmap(ggtt->gsm); + free_scratch(vm); +} + +static struct resource pci_resource(struct pci_dev *pdev, int bar) +{ + return (struct resource)DEFINE_RES_MEM(pci_resource_start(pdev, bar), + pci_resource_len(pdev, bar)); +} + +static int gen8_gmch_probe(struct i915_ggtt *ggtt) +{ + struct drm_i915_private *i915 = ggtt->vm.i915; + struct pci_dev *pdev = i915->drm.pdev; + unsigned int size; + u16 snb_gmch_ctl; + + /* TODO: We're not aware of mappable constraints on gen8 yet */ + if (!IS_DGFX(i915)) { + ggtt->gmadr = pci_resource(pdev, 2); + ggtt->mappable_end = resource_size(&ggtt->gmadr); + } + + pci_read_config_word(pdev, SNB_GMCH_CTRL, &snb_gmch_ctl); + if (IS_CHERRYVIEW(i915)) + size = chv_get_total_gtt_size(snb_gmch_ctl); + else + size = gen8_get_total_gtt_size(snb_gmch_ctl); + + ggtt->vm.alloc_pt_dma = alloc_pt_dma; + + ggtt->vm.total = (size / sizeof(gen8_pte_t)) * I915_GTT_PAGE_SIZE; + ggtt->vm.cleanup = gen6_gmch_remove; + ggtt->vm.insert_page = gen8_ggtt_insert_page; + ggtt->vm.clear_range = nop_clear_range; + if (intel_scanout_needs_vtd_wa(i915)) + ggtt->vm.clear_range = gen8_ggtt_clear_range; + + ggtt->vm.insert_entries = gen8_ggtt_insert_entries; + + /* Serialize GTT updates with aperture access on BXT if VT-d is on. */ + if (intel_ggtt_update_needs_vtd_wa(i915) || + IS_CHERRYVIEW(i915) /* fails with concurrent use/update */) { + ggtt->vm.insert_entries = bxt_vtd_ggtt_insert_entries__BKL; + ggtt->vm.insert_page = bxt_vtd_ggtt_insert_page__BKL; + ggtt->vm.bind_async_flags = + I915_VMA_GLOBAL_BIND | I915_VMA_LOCAL_BIND; + } + + ggtt->invalidate = gen8_ggtt_invalidate; + + ggtt->vm.vma_ops.bind_vma = ggtt_bind_vma; + ggtt->vm.vma_ops.unbind_vma = ggtt_unbind_vma; + ggtt->vm.vma_ops.set_pages = ggtt_set_pages; + ggtt->vm.vma_ops.clear_pages = clear_pages; + + ggtt->vm.pte_encode = gen8_ggtt_pte_encode; + + setup_private_pat(ggtt->vm.gt->uncore); + + return ggtt_probe_common(ggtt, size); +} + +static u64 snb_pte_encode(dma_addr_t addr, + enum i915_cache_level level, + u32 flags) +{ + gen6_pte_t pte = GEN6_PTE_ADDR_ENCODE(addr) | GEN6_PTE_VALID; + + switch (level) { + case I915_CACHE_L3_LLC: + case I915_CACHE_LLC: + pte |= GEN6_PTE_CACHE_LLC; + break; + case I915_CACHE_NONE: + pte |= GEN6_PTE_UNCACHED; + break; + default: + MISSING_CASE(level); + } + + return pte; +} + +static u64 ivb_pte_encode(dma_addr_t addr, + enum i915_cache_level level, + u32 flags) +{ + gen6_pte_t pte = GEN6_PTE_ADDR_ENCODE(addr) | GEN6_PTE_VALID; + + switch (level) { + case I915_CACHE_L3_LLC: + pte |= GEN7_PTE_CACHE_L3_LLC; + break; + case I915_CACHE_LLC: + pte |= GEN6_PTE_CACHE_LLC; + break; + case I915_CACHE_NONE: + pte |= GEN6_PTE_UNCACHED; + break; + default: + MISSING_CASE(level); + } + + return pte; +} + +static u64 byt_pte_encode(dma_addr_t addr, + enum i915_cache_level level, + u32 flags) +{ + gen6_pte_t pte = GEN6_PTE_ADDR_ENCODE(addr) | GEN6_PTE_VALID; + + if (!(flags & PTE_READ_ONLY)) + pte |= BYT_PTE_WRITEABLE; + + if (level != I915_CACHE_NONE) + pte |= BYT_PTE_SNOOPED_BY_CPU_CACHES; + + return pte; +} + +static u64 hsw_pte_encode(dma_addr_t addr, + enum i915_cache_level level, + u32 flags) +{ + gen6_pte_t pte = HSW_PTE_ADDR_ENCODE(addr) | GEN6_PTE_VALID; + + if (level != I915_CACHE_NONE) + pte |= HSW_WB_LLC_AGE3; + + return pte; +} + +static u64 iris_pte_encode(dma_addr_t addr, + enum i915_cache_level level, + u32 flags) +{ + gen6_pte_t pte = HSW_PTE_ADDR_ENCODE(addr) | GEN6_PTE_VALID; + + switch (level) { + case I915_CACHE_NONE: + break; + case I915_CACHE_WT: + pte |= HSW_WT_ELLC_LLC_AGE3; + break; + default: + pte |= HSW_WB_ELLC_LLC_AGE3; + break; + } + + return pte; +} + +static int gen6_gmch_probe(struct i915_ggtt *ggtt) +{ + struct drm_i915_private *i915 = ggtt->vm.i915; + struct pci_dev *pdev = i915->drm.pdev; + unsigned int size; + u16 snb_gmch_ctl; + + ggtt->gmadr = pci_resource(pdev, 2); + ggtt->mappable_end = resource_size(&ggtt->gmadr); + + /* + * 64/512MB is the current min/max we actually know of, but this is + * just a coarse sanity check. + */ + if (ggtt->mappable_end < (64<<20) || ggtt->mappable_end > (512<<20)) { + drm_err(&i915->drm, "Unknown GMADR size (%pa)\n", + &ggtt->mappable_end); + return -ENXIO; + } + + pci_read_config_word(pdev, SNB_GMCH_CTRL, &snb_gmch_ctl); + + size = gen6_get_total_gtt_size(snb_gmch_ctl); + ggtt->vm.total = (size / sizeof(gen6_pte_t)) * I915_GTT_PAGE_SIZE; + + ggtt->vm.alloc_pt_dma = alloc_pt_dma; + + ggtt->vm.clear_range = nop_clear_range; + if (!HAS_FULL_PPGTT(i915) || intel_scanout_needs_vtd_wa(i915)) + ggtt->vm.clear_range = gen6_ggtt_clear_range; + ggtt->vm.insert_page = gen6_ggtt_insert_page; + ggtt->vm.insert_entries = gen6_ggtt_insert_entries; + ggtt->vm.cleanup = gen6_gmch_remove; + + ggtt->invalidate = gen6_ggtt_invalidate; + + if (HAS_EDRAM(i915)) + ggtt->vm.pte_encode = iris_pte_encode; + else if (IS_HASWELL(i915)) + ggtt->vm.pte_encode = hsw_pte_encode; + else if (IS_VALLEYVIEW(i915)) + ggtt->vm.pte_encode = byt_pte_encode; + else if (INTEL_GEN(i915) >= 7) + ggtt->vm.pte_encode = ivb_pte_encode; + else + ggtt->vm.pte_encode = snb_pte_encode; + + ggtt->vm.vma_ops.bind_vma = ggtt_bind_vma; + ggtt->vm.vma_ops.unbind_vma = ggtt_unbind_vma; + ggtt->vm.vma_ops.set_pages = ggtt_set_pages; + ggtt->vm.vma_ops.clear_pages = clear_pages; + + return ggtt_probe_common(ggtt, size); +} + +static void i915_gmch_remove(struct i915_address_space *vm) +{ + intel_gmch_remove(); +} + +static int i915_gmch_probe(struct i915_ggtt *ggtt) +{ + struct drm_i915_private *i915 = ggtt->vm.i915; + phys_addr_t gmadr_base; + int ret; + + ret = intel_gmch_probe(i915->bridge_dev, i915->drm.pdev, NULL); + if (!ret) { + drm_err(&i915->drm, "failed to set up gmch\n"); + return -EIO; + } + + intel_gtt_get(&ggtt->vm.total, &gmadr_base, &ggtt->mappable_end); + + ggtt->gmadr = + (struct resource)DEFINE_RES_MEM(gmadr_base, ggtt->mappable_end); + + ggtt->vm.alloc_pt_dma = alloc_pt_dma; + + ggtt->do_idle_maps = needs_idle_maps(i915); + ggtt->vm.insert_page = i915_ggtt_insert_page; + ggtt->vm.insert_entries = i915_ggtt_insert_entries; + ggtt->vm.clear_range = i915_ggtt_clear_range; + ggtt->vm.cleanup = i915_gmch_remove; + + ggtt->invalidate = gmch_ggtt_invalidate; + + ggtt->vm.vma_ops.bind_vma = ggtt_bind_vma; + ggtt->vm.vma_ops.unbind_vma = ggtt_unbind_vma; + ggtt->vm.vma_ops.set_pages = ggtt_set_pages; + ggtt->vm.vma_ops.clear_pages = clear_pages; + + if (unlikely(ggtt->do_idle_maps)) + drm_notice(&i915->drm, + "Applying Ironlake quirks for intel_iommu\n"); + + return 0; +} + +static int ggtt_probe_hw(struct i915_ggtt *ggtt, struct intel_gt *gt) +{ + struct drm_i915_private *i915 = gt->i915; + int ret; + + ggtt->vm.gt = gt; + ggtt->vm.i915 = i915; + ggtt->vm.dma = &i915->drm.pdev->dev; + + if (INTEL_GEN(i915) <= 5) + ret = i915_gmch_probe(ggtt); + else if (INTEL_GEN(i915) < 8) + ret = gen6_gmch_probe(ggtt); + else + ret = gen8_gmch_probe(ggtt); + if (ret) + return ret; + + if ((ggtt->vm.total - 1) >> 32) { + drm_err(&i915->drm, + "We never expected a Global GTT with more than 32bits" + " of address space! Found %lldM!\n", + ggtt->vm.total >> 20); + ggtt->vm.total = 1ULL << 32; + ggtt->mappable_end = + min_t(u64, ggtt->mappable_end, ggtt->vm.total); + } + + if (ggtt->mappable_end > ggtt->vm.total) { + drm_err(&i915->drm, + "mappable aperture extends past end of GGTT," + " aperture=%pa, total=%llx\n", + &ggtt->mappable_end, ggtt->vm.total); + ggtt->mappable_end = ggtt->vm.total; + } + + /* GMADR is the PCI mmio aperture into the global GTT. */ + drm_dbg(&i915->drm, "GGTT size = %lluM\n", ggtt->vm.total >> 20); + drm_dbg(&i915->drm, "GMADR size = %lluM\n", + (u64)ggtt->mappable_end >> 20); + drm_dbg(&i915->drm, "DSM size = %lluM\n", + (u64)resource_size(&intel_graphics_stolen_res) >> 20); + + return 0; +} + +/** + * i915_ggtt_probe_hw - Probe GGTT hardware location + * @i915: i915 device + */ +int i915_ggtt_probe_hw(struct drm_i915_private *i915) +{ + int ret; + + ret = ggtt_probe_hw(&i915->ggtt, &i915->gt); + if (ret) + return ret; + + if (intel_vtd_active()) + drm_info(&i915->drm, "VT-d active for gfx access\n"); + + return 0; +} + +int i915_ggtt_enable_hw(struct drm_i915_private *i915) +{ + if (INTEL_GEN(i915) < 6 && !intel_enable_gtt()) + return -EIO; + + return 0; +} + +void i915_ggtt_enable_guc(struct i915_ggtt *ggtt) +{ + GEM_BUG_ON(ggtt->invalidate != gen8_ggtt_invalidate); + + ggtt->invalidate = guc_ggtt_invalidate; + + ggtt->invalidate(ggtt); +} + +void i915_ggtt_disable_guc(struct i915_ggtt *ggtt) +{ + /* XXX Temporary pardon for error unload */ + if (ggtt->invalidate == gen8_ggtt_invalidate) + return; + + /* We should only be called after i915_ggtt_enable_guc() */ + GEM_BUG_ON(ggtt->invalidate != guc_ggtt_invalidate); + + ggtt->invalidate = gen8_ggtt_invalidate; + + ggtt->invalidate(ggtt); +} + +void i915_ggtt_resume(struct i915_ggtt *ggtt) +{ + struct i915_vma *vma; + bool flush = false; + int open; + + intel_gt_check_and_clear_faults(ggtt->vm.gt); + + /* First fill our portion of the GTT with scratch pages */ + ggtt->vm.clear_range(&ggtt->vm, 0, ggtt->vm.total); + + /* Skip rewriting PTE on VMA unbind. */ + open = atomic_xchg(&ggtt->vm.open, 0); + + /* clflush objects bound into the GGTT and rebind them. */ + list_for_each_entry(vma, &ggtt->vm.bound_list, vm_link) { + struct drm_i915_gem_object *obj = vma->obj; + unsigned int was_bound = + atomic_read(&vma->flags) & I915_VMA_BIND_MASK; + + GEM_BUG_ON(!was_bound); + vma->ops->bind_vma(&ggtt->vm, NULL, vma, + obj ? obj->cache_level : 0, + was_bound); + if (obj) { /* only used during resume => exclusive access */ + flush |= fetch_and_zero(&obj->write_domain); + obj->read_domains |= I915_GEM_DOMAIN_GTT; + } + } + + atomic_set(&ggtt->vm.open, open); + ggtt->invalidate(ggtt); + + if (flush) + wbinvd_on_all_cpus(); + + if (INTEL_GEN(ggtt->vm.i915) >= 8) + setup_private_pat(ggtt->vm.gt->uncore); + + intel_ggtt_restore_fences(ggtt); +} + +static struct scatterlist * +rotate_pages(struct drm_i915_gem_object *obj, unsigned int offset, + unsigned int width, unsigned int height, + unsigned int stride, + struct sg_table *st, struct scatterlist *sg) +{ + unsigned int column, row; + unsigned int src_idx; + + for (column = 0; column < width; column++) { + src_idx = stride * (height - 1) + column + offset; + for (row = 0; row < height; row++) { + st->nents++; + /* + * We don't need the pages, but need to initialize + * the entries so the sg list can be happily traversed. + * The only thing we need are DMA addresses. + */ + sg_set_page(sg, NULL, I915_GTT_PAGE_SIZE, 0); + sg_dma_address(sg) = + i915_gem_object_get_dma_address(obj, src_idx); + sg_dma_len(sg) = I915_GTT_PAGE_SIZE; + sg = sg_next(sg); + src_idx -= stride; + } + } + + return sg; +} + +static noinline struct sg_table * +intel_rotate_pages(struct intel_rotation_info *rot_info, + struct drm_i915_gem_object *obj) +{ + unsigned int size = intel_rotation_info_size(rot_info); + struct drm_i915_private *i915 = to_i915(obj->base.dev); + struct sg_table *st; + struct scatterlist *sg; + int ret = -ENOMEM; + int i; + + /* Allocate target SG list. */ + st = kmalloc(sizeof(*st), GFP_KERNEL); + if (!st) + goto err_st_alloc; + + ret = sg_alloc_table(st, size, GFP_KERNEL); + if (ret) + goto err_sg_alloc; + + st->nents = 0; + sg = st->sgl; + + for (i = 0 ; i < ARRAY_SIZE(rot_info->plane); i++) { + sg = rotate_pages(obj, rot_info->plane[i].offset, + rot_info->plane[i].width, rot_info->plane[i].height, + rot_info->plane[i].stride, st, sg); + } + + return st; + +err_sg_alloc: + kfree(st); +err_st_alloc: + + drm_dbg(&i915->drm, "Failed to create rotated mapping for object size %zu! (%ux%u tiles, %u pages)\n", + obj->base.size, rot_info->plane[0].width, + rot_info->plane[0].height, size); + + return ERR_PTR(ret); +} + +static struct scatterlist * +remap_pages(struct drm_i915_gem_object *obj, unsigned int offset, + unsigned int width, unsigned int height, + unsigned int stride, + struct sg_table *st, struct scatterlist *sg) +{ + unsigned int row; + + for (row = 0; row < height; row++) { + unsigned int left = width * I915_GTT_PAGE_SIZE; + + while (left) { + dma_addr_t addr; + unsigned int length; + + /* + * We don't need the pages, but need to initialize + * the entries so the sg list can be happily traversed. + * The only thing we need are DMA addresses. + */ + + addr = i915_gem_object_get_dma_address_len(obj, offset, &length); + + length = min(left, length); + + st->nents++; + + sg_set_page(sg, NULL, length, 0); + sg_dma_address(sg) = addr; + sg_dma_len(sg) = length; + sg = sg_next(sg); + + offset += length / I915_GTT_PAGE_SIZE; + left -= length; + } + + offset += stride - width; + } + + return sg; +} + +static noinline struct sg_table * +intel_remap_pages(struct intel_remapped_info *rem_info, + struct drm_i915_gem_object *obj) +{ + unsigned int size = intel_remapped_info_size(rem_info); + struct drm_i915_private *i915 = to_i915(obj->base.dev); + struct sg_table *st; + struct scatterlist *sg; + int ret = -ENOMEM; + int i; + + /* Allocate target SG list. */ + st = kmalloc(sizeof(*st), GFP_KERNEL); + if (!st) + goto err_st_alloc; + + ret = sg_alloc_table(st, size, GFP_KERNEL); + if (ret) + goto err_sg_alloc; + + st->nents = 0; + sg = st->sgl; + + for (i = 0 ; i < ARRAY_SIZE(rem_info->plane); i++) { + sg = remap_pages(obj, rem_info->plane[i].offset, + rem_info->plane[i].width, rem_info->plane[i].height, + rem_info->plane[i].stride, st, sg); + } + + i915_sg_trim(st); + + return st; + +err_sg_alloc: + kfree(st); +err_st_alloc: + + drm_dbg(&i915->drm, "Failed to create remapped mapping for object size %zu! (%ux%u tiles, %u pages)\n", + obj->base.size, rem_info->plane[0].width, + rem_info->plane[0].height, size); + + return ERR_PTR(ret); +} + +static noinline struct sg_table * +intel_partial_pages(const struct i915_ggtt_view *view, + struct drm_i915_gem_object *obj) +{ + struct sg_table *st; + struct scatterlist *sg, *iter; + unsigned int count = view->partial.size; + unsigned int offset; + int ret = -ENOMEM; + + st = kmalloc(sizeof(*st), GFP_KERNEL); + if (!st) + goto err_st_alloc; + + ret = sg_alloc_table(st, count, GFP_KERNEL); + if (ret) + goto err_sg_alloc; + + iter = i915_gem_object_get_sg(obj, view->partial.offset, &offset); + GEM_BUG_ON(!iter); + + sg = st->sgl; + st->nents = 0; + do { + unsigned int len; + + len = min(iter->length - (offset << PAGE_SHIFT), + count << PAGE_SHIFT); + sg_set_page(sg, NULL, len, 0); + sg_dma_address(sg) = + sg_dma_address(iter) + (offset << PAGE_SHIFT); + sg_dma_len(sg) = len; + + st->nents++; + count -= len >> PAGE_SHIFT; + if (count == 0) { + sg_mark_end(sg); + i915_sg_trim(st); /* Drop any unused tail entries. */ + + return st; + } + + sg = __sg_next(sg); + iter = __sg_next(iter); + offset = 0; + } while (1); + +err_sg_alloc: + kfree(st); +err_st_alloc: + return ERR_PTR(ret); +} + +static int +i915_get_ggtt_vma_pages(struct i915_vma *vma) +{ + int ret; + + /* + * The vma->pages are only valid within the lifespan of the borrowed + * obj->mm.pages. When the obj->mm.pages sg_table is regenerated, so + * must be the vma->pages. A simple rule is that vma->pages must only + * be accessed when the obj->mm.pages are pinned. + */ + GEM_BUG_ON(!i915_gem_object_has_pinned_pages(vma->obj)); + + switch (vma->ggtt_view.type) { + default: + GEM_BUG_ON(vma->ggtt_view.type); + fallthrough; + case I915_GGTT_VIEW_NORMAL: + vma->pages = vma->obj->mm.pages; + return 0; + + case I915_GGTT_VIEW_ROTATED: + vma->pages = + intel_rotate_pages(&vma->ggtt_view.rotated, vma->obj); + break; + + case I915_GGTT_VIEW_REMAPPED: + vma->pages = + intel_remap_pages(&vma->ggtt_view.remapped, vma->obj); + break; + + case I915_GGTT_VIEW_PARTIAL: + vma->pages = intel_partial_pages(&vma->ggtt_view, vma->obj); + break; + } + + ret = 0; + if (IS_ERR(vma->pages)) { + ret = PTR_ERR(vma->pages); + vma->pages = NULL; + drm_err(&vma->vm->i915->drm, + "Failed to get pages for VMA view type %u (%d)!\n", + vma->ggtt_view.type, ret); + } + return ret; +} diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c b/drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c new file mode 100644 index 000000000..cd71631be --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c @@ -0,0 +1,920 @@ +/* + * Copyright © 2008-2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "i915_drv.h" +#include "i915_scatterlist.h" +#include "i915_pvinfo.h" +#include "i915_vgpu.h" + +/** + * DOC: fence register handling + * + * Important to avoid confusions: "fences" in the i915 driver are not execution + * fences used to track command completion but hardware detiler objects which + * wrap a given range of the global GTT. Each platform has only a fairly limited + * set of these objects. + * + * Fences are used to detile GTT memory mappings. They're also connected to the + * hardware frontbuffer render tracking and hence interact with frontbuffer + * compression. Furthermore on older platforms fences are required for tiled + * objects used by the display engine. They can also be used by the render + * engine - they're required for blitter commands and are optional for render + * commands. But on gen4+ both display (with the exception of fbc) and rendering + * have their own tiling state bits and don't need fences. + * + * Also note that fences only support X and Y tiling and hence can't be used for + * the fancier new tiling formats like W, Ys and Yf. + * + * Finally note that because fences are such a restricted resource they're + * dynamically associated with objects. Furthermore fence state is committed to + * the hardware lazily to avoid unnecessary stalls on gen2/3. Therefore code must + * explicitly call i915_gem_object_get_fence() to synchronize fencing status + * for cpu access. Also note that some code wants an unfenced view, for those + * cases the fence can be removed forcefully with i915_gem_object_put_fence(). + * + * Internally these functions will synchronize with userspace access by removing + * CPU ptes into GTT mmaps (not the GTT ptes themselves) as needed. + */ + +#define pipelined 0 + +static struct drm_i915_private *fence_to_i915(struct i915_fence_reg *fence) +{ + return fence->ggtt->vm.i915; +} + +static struct intel_uncore *fence_to_uncore(struct i915_fence_reg *fence) +{ + return fence->ggtt->vm.gt->uncore; +} + +static void i965_write_fence_reg(struct i915_fence_reg *fence) +{ + i915_reg_t fence_reg_lo, fence_reg_hi; + int fence_pitch_shift; + u64 val; + + if (INTEL_GEN(fence_to_i915(fence)) >= 6) { + fence_reg_lo = FENCE_REG_GEN6_LO(fence->id); + fence_reg_hi = FENCE_REG_GEN6_HI(fence->id); + fence_pitch_shift = GEN6_FENCE_PITCH_SHIFT; + + } else { + fence_reg_lo = FENCE_REG_965_LO(fence->id); + fence_reg_hi = FENCE_REG_965_HI(fence->id); + fence_pitch_shift = I965_FENCE_PITCH_SHIFT; + } + + val = 0; + if (fence->tiling) { + unsigned int stride = fence->stride; + + GEM_BUG_ON(!IS_ALIGNED(stride, 128)); + + val = fence->start + fence->size - I965_FENCE_PAGE; + val <<= 32; + val |= fence->start; + val |= (u64)((stride / 128) - 1) << fence_pitch_shift; + if (fence->tiling == I915_TILING_Y) + val |= BIT(I965_FENCE_TILING_Y_SHIFT); + val |= I965_FENCE_REG_VALID; + } + + if (!pipelined) { + struct intel_uncore *uncore = fence_to_uncore(fence); + + /* + * To w/a incoherency with non-atomic 64-bit register updates, + * we split the 64-bit update into two 32-bit writes. In order + * for a partial fence not to be evaluated between writes, we + * precede the update with write to turn off the fence register, + * and only enable the fence as the last step. + * + * For extra levels of paranoia, we make sure each step lands + * before applying the next step. + */ + intel_uncore_write_fw(uncore, fence_reg_lo, 0); + intel_uncore_posting_read_fw(uncore, fence_reg_lo); + + intel_uncore_write_fw(uncore, fence_reg_hi, upper_32_bits(val)); + intel_uncore_write_fw(uncore, fence_reg_lo, lower_32_bits(val)); + intel_uncore_posting_read_fw(uncore, fence_reg_lo); + } +} + +static void i915_write_fence_reg(struct i915_fence_reg *fence) +{ + u32 val; + + val = 0; + if (fence->tiling) { + unsigned int stride = fence->stride; + unsigned int tiling = fence->tiling; + bool is_y_tiled = tiling == I915_TILING_Y; + + if (is_y_tiled && HAS_128_BYTE_Y_TILING(fence_to_i915(fence))) + stride /= 128; + else + stride /= 512; + GEM_BUG_ON(!is_power_of_2(stride)); + + val = fence->start; + if (is_y_tiled) + val |= BIT(I830_FENCE_TILING_Y_SHIFT); + val |= I915_FENCE_SIZE_BITS(fence->size); + val |= ilog2(stride) << I830_FENCE_PITCH_SHIFT; + + val |= I830_FENCE_REG_VALID; + } + + if (!pipelined) { + struct intel_uncore *uncore = fence_to_uncore(fence); + i915_reg_t reg = FENCE_REG(fence->id); + + intel_uncore_write_fw(uncore, reg, val); + intel_uncore_posting_read_fw(uncore, reg); + } +} + +static void i830_write_fence_reg(struct i915_fence_reg *fence) +{ + u32 val; + + val = 0; + if (fence->tiling) { + unsigned int stride = fence->stride; + + val = fence->start; + if (fence->tiling == I915_TILING_Y) + val |= BIT(I830_FENCE_TILING_Y_SHIFT); + val |= I830_FENCE_SIZE_BITS(fence->size); + val |= ilog2(stride / 128) << I830_FENCE_PITCH_SHIFT; + val |= I830_FENCE_REG_VALID; + } + + if (!pipelined) { + struct intel_uncore *uncore = fence_to_uncore(fence); + i915_reg_t reg = FENCE_REG(fence->id); + + intel_uncore_write_fw(uncore, reg, val); + intel_uncore_posting_read_fw(uncore, reg); + } +} + +static void fence_write(struct i915_fence_reg *fence) +{ + struct drm_i915_private *i915 = fence_to_i915(fence); + + /* + * Previous access through the fence register is marshalled by + * the mb() inside the fault handlers (i915_gem_release_mmaps) + * and explicitly managed for internal users. + */ + + if (IS_GEN(i915, 2)) + i830_write_fence_reg(fence); + else if (IS_GEN(i915, 3)) + i915_write_fence_reg(fence); + else + i965_write_fence_reg(fence); + + /* + * Access through the fenced region afterwards is + * ordered by the posting reads whilst writing the registers. + */ +} + +static bool gpu_uses_fence_registers(struct i915_fence_reg *fence) +{ + return INTEL_GEN(fence_to_i915(fence)) < 4; +} + +static int fence_update(struct i915_fence_reg *fence, + struct i915_vma *vma) +{ + struct i915_ggtt *ggtt = fence->ggtt; + struct intel_uncore *uncore = fence_to_uncore(fence); + intel_wakeref_t wakeref; + struct i915_vma *old; + int ret; + + fence->tiling = 0; + if (vma) { + GEM_BUG_ON(!i915_gem_object_get_stride(vma->obj) || + !i915_gem_object_get_tiling(vma->obj)); + + if (!i915_vma_is_map_and_fenceable(vma)) + return -EINVAL; + + if (gpu_uses_fence_registers(fence)) { + /* implicit 'unfenced' GPU blits */ + ret = i915_vma_sync(vma); + if (ret) + return ret; + } + + fence->start = vma->node.start; + fence->size = vma->fence_size; + fence->stride = i915_gem_object_get_stride(vma->obj); + fence->tiling = i915_gem_object_get_tiling(vma->obj); + } + WRITE_ONCE(fence->dirty, false); + + old = xchg(&fence->vma, NULL); + if (old) { + /* XXX Ideally we would move the waiting to outside the mutex */ + ret = i915_active_wait(&fence->active); + if (ret) { + fence->vma = old; + return ret; + } + + i915_vma_flush_writes(old); + + /* + * Ensure that all userspace CPU access is completed before + * stealing the fence. + */ + if (old != vma) { + GEM_BUG_ON(old->fence != fence); + i915_vma_revoke_mmap(old); + old->fence = NULL; + } + + list_move(&fence->link, &ggtt->fence_list); + } + + /* + * We only need to update the register itself if the device is awake. + * If the device is currently powered down, we will defer the write + * to the runtime resume, see intel_ggtt_restore_fences(). + * + * This only works for removing the fence register, on acquisition + * the caller must hold the rpm wakeref. The fence register must + * be cleared before we can use any other fences to ensure that + * the new fences do not overlap the elided clears, confusing HW. + */ + wakeref = intel_runtime_pm_get_if_in_use(uncore->rpm); + if (!wakeref) { + GEM_BUG_ON(vma); + return 0; + } + + WRITE_ONCE(fence->vma, vma); + fence_write(fence); + + if (vma) { + vma->fence = fence; + list_move_tail(&fence->link, &ggtt->fence_list); + } + + intel_runtime_pm_put(uncore->rpm, wakeref); + return 0; +} + +/** + * i915_vma_revoke_fence - force-remove fence for a VMA + * @vma: vma to map linearly (not through a fence reg) + * + * This function force-removes any fence from the given object, which is useful + * if the kernel wants to do untiled GTT access. + */ +void i915_vma_revoke_fence(struct i915_vma *vma) +{ + struct i915_fence_reg *fence = vma->fence; + intel_wakeref_t wakeref; + + lockdep_assert_held(&vma->vm->mutex); + if (!fence) + return; + + GEM_BUG_ON(fence->vma != vma); + GEM_BUG_ON(!i915_active_is_idle(&fence->active)); + GEM_BUG_ON(atomic_read(&fence->pin_count)); + + fence->tiling = 0; + WRITE_ONCE(fence->vma, NULL); + vma->fence = NULL; + + /* + * Skip the write to HW if and only if the device is currently + * suspended. + * + * If the driver does not currently hold a wakeref (if_in_use == 0), + * the device may currently be runtime suspended, or it may be woken + * up before the suspend takes place. If the device is not suspended + * (powered down) and we skip clearing the fence register, the HW is + * left in an undefined state where we may end up with multiple + * registers overlapping. + */ + with_intel_runtime_pm_if_active(fence_to_uncore(fence)->rpm, wakeref) + fence_write(fence); +} + +static struct i915_fence_reg *fence_find(struct i915_ggtt *ggtt) +{ + struct i915_fence_reg *fence; + + list_for_each_entry(fence, &ggtt->fence_list, link) { + GEM_BUG_ON(fence->vma && fence->vma->fence != fence); + + if (atomic_read(&fence->pin_count)) + continue; + + return fence; + } + + /* Wait for completion of pending flips which consume fences */ + if (intel_has_pending_fb_unpin(ggtt->vm.i915)) + return ERR_PTR(-EAGAIN); + + return ERR_PTR(-ENOBUFS); +} + +int __i915_vma_pin_fence(struct i915_vma *vma) +{ + struct i915_ggtt *ggtt = i915_vm_to_ggtt(vma->vm); + struct i915_fence_reg *fence; + struct i915_vma *set = i915_gem_object_is_tiled(vma->obj) ? vma : NULL; + int err; + + lockdep_assert_held(&vma->vm->mutex); + + /* Just update our place in the LRU if our fence is getting reused. */ + if (vma->fence) { + fence = vma->fence; + GEM_BUG_ON(fence->vma != vma); + atomic_inc(&fence->pin_count); + if (!fence->dirty) { + list_move_tail(&fence->link, &ggtt->fence_list); + return 0; + } + } else if (set) { + fence = fence_find(ggtt); + if (IS_ERR(fence)) + return PTR_ERR(fence); + + GEM_BUG_ON(atomic_read(&fence->pin_count)); + atomic_inc(&fence->pin_count); + } else { + return 0; + } + + err = fence_update(fence, set); + if (err) + goto out_unpin; + + GEM_BUG_ON(fence->vma != set); + GEM_BUG_ON(vma->fence != (set ? fence : NULL)); + + if (set) + return 0; + +out_unpin: + atomic_dec(&fence->pin_count); + return err; +} + +/** + * i915_vma_pin_fence - set up fencing for a vma + * @vma: vma to map through a fence reg + * + * When mapping objects through the GTT, userspace wants to be able to write + * to them without having to worry about swizzling if the object is tiled. + * This function walks the fence regs looking for a free one for @obj, + * stealing one if it can't find any. + * + * It then sets up the reg based on the object's properties: address, pitch + * and tiling format. + * + * For an untiled surface, this removes any existing fence. + * + * Returns: + * + * 0 on success, negative error code on failure. + */ +int i915_vma_pin_fence(struct i915_vma *vma) +{ + int err; + + if (!vma->fence && !i915_gem_object_is_tiled(vma->obj)) + return 0; + + /* + * Note that we revoke fences on runtime suspend. Therefore the user + * must keep the device awake whilst using the fence. + */ + assert_rpm_wakelock_held(vma->vm->gt->uncore->rpm); + GEM_BUG_ON(!i915_vma_is_pinned(vma)); + GEM_BUG_ON(!i915_vma_is_ggtt(vma)); + + err = mutex_lock_interruptible(&vma->vm->mutex); + if (err) + return err; + + err = __i915_vma_pin_fence(vma); + mutex_unlock(&vma->vm->mutex); + + return err; +} + +/** + * i915_reserve_fence - Reserve a fence for vGPU + * @ggtt: Global GTT + * + * This function walks the fence regs looking for a free one and remove + * it from the fence_list. It is used to reserve fence for vGPU to use. + */ +struct i915_fence_reg *i915_reserve_fence(struct i915_ggtt *ggtt) +{ + struct i915_fence_reg *fence; + int count; + int ret; + + lockdep_assert_held(&ggtt->vm.mutex); + + /* Keep at least one fence available for the display engine. */ + count = 0; + list_for_each_entry(fence, &ggtt->fence_list, link) + count += !atomic_read(&fence->pin_count); + if (count <= 1) + return ERR_PTR(-ENOSPC); + + fence = fence_find(ggtt); + if (IS_ERR(fence)) + return fence; + + if (fence->vma) { + /* Force-remove fence from VMA */ + ret = fence_update(fence, NULL); + if (ret) + return ERR_PTR(ret); + } + + list_del(&fence->link); + + return fence; +} + +/** + * i915_unreserve_fence - Reclaim a reserved fence + * @fence: the fence reg + * + * This function add a reserved fence register from vGPU to the fence_list. + */ +void i915_unreserve_fence(struct i915_fence_reg *fence) +{ + struct i915_ggtt *ggtt = fence->ggtt; + + lockdep_assert_held(&ggtt->vm.mutex); + + list_add(&fence->link, &ggtt->fence_list); +} + +/** + * intel_ggtt_restore_fences - restore fence state + * @ggtt: Global GTT + * + * Restore the hw fence state to match the software tracking again, to be called + * after a gpu reset and on resume. Note that on runtime suspend we only cancel + * the fences, to be reacquired by the user later. + */ +void intel_ggtt_restore_fences(struct i915_ggtt *ggtt) +{ + int i; + + for (i = 0; i < ggtt->num_fences; i++) + fence_write(&ggtt->fence_regs[i]); +} + +/** + * DOC: tiling swizzling details + * + * The idea behind tiling is to increase cache hit rates by rearranging + * pixel data so that a group of pixel accesses are in the same cacheline. + * Performance improvement from doing this on the back/depth buffer are on + * the order of 30%. + * + * Intel architectures make this somewhat more complicated, though, by + * adjustments made to addressing of data when the memory is in interleaved + * mode (matched pairs of DIMMS) to improve memory bandwidth. + * For interleaved memory, the CPU sends every sequential 64 bytes + * to an alternate memory channel so it can get the bandwidth from both. + * + * The GPU also rearranges its accesses for increased bandwidth to interleaved + * memory, and it matches what the CPU does for non-tiled. However, when tiled + * it does it a little differently, since one walks addresses not just in the + * X direction but also Y. So, along with alternating channels when bit + * 6 of the address flips, it also alternates when other bits flip -- Bits 9 + * (every 512 bytes, an X tile scanline) and 10 (every two X tile scanlines) + * are common to both the 915 and 965-class hardware. + * + * The CPU also sometimes XORs in higher bits as well, to improve + * bandwidth doing strided access like we do so frequently in graphics. This + * is called "Channel XOR Randomization" in the MCH documentation. The result + * is that the CPU is XORing in either bit 11 or bit 17 to bit 6 of its address + * decode. + * + * All of this bit 6 XORing has an effect on our memory management, + * as we need to make sure that the 3d driver can correctly address object + * contents. + * + * If we don't have interleaved memory, all tiling is safe and no swizzling is + * required. + * + * When bit 17 is XORed in, we simply refuse to tile at all. Bit + * 17 is not just a page offset, so as we page an object out and back in, + * individual pages in it will have different bit 17 addresses, resulting in + * each 64 bytes being swapped with its neighbor! + * + * Otherwise, if interleaved, we have to tell the 3d driver what the address + * swizzling it needs to do is, since it's writing with the CPU to the pages + * (bit 6 and potentially bit 11 XORed in), and the GPU is reading from the + * pages (bit 6, 9, and 10 XORed in), resulting in a cumulative bit swizzling + * required by the CPU of XORing in bit 6, 9, 10, and potentially 11, in order + * to match what the GPU expects. + */ + +/** + * detect_bit_6_swizzle - detect bit 6 swizzling pattern + * @ggtt: Global GGTT + * + * Detects bit 6 swizzling of address lookup between IGD access and CPU + * access through main memory. + */ +static void detect_bit_6_swizzle(struct i915_ggtt *ggtt) +{ + struct intel_uncore *uncore = ggtt->vm.gt->uncore; + struct drm_i915_private *i915 = ggtt->vm.i915; + u32 swizzle_x = I915_BIT_6_SWIZZLE_UNKNOWN; + u32 swizzle_y = I915_BIT_6_SWIZZLE_UNKNOWN; + + if (INTEL_GEN(i915) >= 8 || IS_VALLEYVIEW(i915)) { + /* + * On BDW+, swizzling is not used. We leave the CPU memory + * controller in charge of optimizing memory accesses without + * the extra address manipulation GPU side. + * + * VLV and CHV don't have GPU swizzling. + */ + swizzle_x = I915_BIT_6_SWIZZLE_NONE; + swizzle_y = I915_BIT_6_SWIZZLE_NONE; + } else if (INTEL_GEN(i915) >= 6) { + if (i915->preserve_bios_swizzle) { + if (intel_uncore_read(uncore, DISP_ARB_CTL) & + DISP_TILE_SURFACE_SWIZZLING) { + swizzle_x = I915_BIT_6_SWIZZLE_9_10; + swizzle_y = I915_BIT_6_SWIZZLE_9; + } else { + swizzle_x = I915_BIT_6_SWIZZLE_NONE; + swizzle_y = I915_BIT_6_SWIZZLE_NONE; + } + } else { + u32 dimm_c0, dimm_c1; + dimm_c0 = intel_uncore_read(uncore, MAD_DIMM_C0); + dimm_c1 = intel_uncore_read(uncore, MAD_DIMM_C1); + dimm_c0 &= MAD_DIMM_A_SIZE_MASK | MAD_DIMM_B_SIZE_MASK; + dimm_c1 &= MAD_DIMM_A_SIZE_MASK | MAD_DIMM_B_SIZE_MASK; + /* + * Enable swizzling when the channels are populated + * with identically sized dimms. We don't need to check + * the 3rd channel because no cpu with gpu attached + * ships in that configuration. Also, swizzling only + * makes sense for 2 channels anyway. + */ + if (dimm_c0 == dimm_c1) { + swizzle_x = I915_BIT_6_SWIZZLE_9_10; + swizzle_y = I915_BIT_6_SWIZZLE_9; + } else { + swizzle_x = I915_BIT_6_SWIZZLE_NONE; + swizzle_y = I915_BIT_6_SWIZZLE_NONE; + } + } + } else if (IS_GEN(i915, 5)) { + /* + * On Ironlake whatever DRAM config, GPU always do + * same swizzling setup. + */ + swizzle_x = I915_BIT_6_SWIZZLE_9_10; + swizzle_y = I915_BIT_6_SWIZZLE_9; + } else if (IS_GEN(i915, 2)) { + /* + * As far as we know, the 865 doesn't have these bit 6 + * swizzling issues. + */ + swizzle_x = I915_BIT_6_SWIZZLE_NONE; + swizzle_y = I915_BIT_6_SWIZZLE_NONE; + } else if (IS_G45(i915) || IS_I965G(i915) || IS_G33(i915)) { + /* + * The 965, G33, and newer, have a very flexible memory + * configuration. It will enable dual-channel mode + * (interleaving) on as much memory as it can, and the GPU + * will additionally sometimes enable different bit 6 + * swizzling for tiled objects from the CPU. + * + * Here's what I found on the G965: + * slot fill memory size swizzling + * 0A 0B 1A 1B 1-ch 2-ch + * 512 0 0 0 512 0 O + * 512 0 512 0 16 1008 X + * 512 0 0 512 16 1008 X + * 0 512 0 512 16 1008 X + * 1024 1024 1024 0 2048 1024 O + * + * We could probably detect this based on either the DRB + * matching, which was the case for the swizzling required in + * the table above, or from the 1-ch value being less than + * the minimum size of a rank. + * + * Reports indicate that the swizzling actually + * varies depending upon page placement inside the + * channels, i.e. we see swizzled pages where the + * banks of memory are paired and unswizzled on the + * uneven portion, so leave that as unknown. + */ + if (intel_uncore_read16(uncore, C0DRB3) == + intel_uncore_read16(uncore, C1DRB3)) { + swizzle_x = I915_BIT_6_SWIZZLE_9_10; + swizzle_y = I915_BIT_6_SWIZZLE_9; + } + } else { + u32 dcc = intel_uncore_read(uncore, DCC); + + /* + * On 9xx chipsets, channel interleave by the CPU is + * determined by DCC. For single-channel, neither the CPU + * nor the GPU do swizzling. For dual channel interleaved, + * the GPU's interleave is bit 9 and 10 for X tiled, and bit + * 9 for Y tiled. The CPU's interleave is independent, and + * can be based on either bit 11 (haven't seen this yet) or + * bit 17 (common). + */ + switch (dcc & DCC_ADDRESSING_MODE_MASK) { + case DCC_ADDRESSING_MODE_SINGLE_CHANNEL: + case DCC_ADDRESSING_MODE_DUAL_CHANNEL_ASYMMETRIC: + swizzle_x = I915_BIT_6_SWIZZLE_NONE; + swizzle_y = I915_BIT_6_SWIZZLE_NONE; + break; + case DCC_ADDRESSING_MODE_DUAL_CHANNEL_INTERLEAVED: + if (dcc & DCC_CHANNEL_XOR_DISABLE) { + /* + * This is the base swizzling by the GPU for + * tiled buffers. + */ + swizzle_x = I915_BIT_6_SWIZZLE_9_10; + swizzle_y = I915_BIT_6_SWIZZLE_9; + } else if ((dcc & DCC_CHANNEL_XOR_BIT_17) == 0) { + /* Bit 11 swizzling by the CPU in addition. */ + swizzle_x = I915_BIT_6_SWIZZLE_9_10_11; + swizzle_y = I915_BIT_6_SWIZZLE_9_11; + } else { + /* Bit 17 swizzling by the CPU in addition. */ + swizzle_x = I915_BIT_6_SWIZZLE_9_10_17; + swizzle_y = I915_BIT_6_SWIZZLE_9_17; + } + break; + } + + /* check for L-shaped memory aka modified enhanced addressing */ + if (IS_GEN(i915, 4) && + !(intel_uncore_read(uncore, DCC2) & DCC2_MODIFIED_ENHANCED_DISABLE)) { + swizzle_x = I915_BIT_6_SWIZZLE_UNKNOWN; + swizzle_y = I915_BIT_6_SWIZZLE_UNKNOWN; + } + + if (dcc == 0xffffffff) { + drm_err(&i915->drm, "Couldn't read from MCHBAR. " + "Disabling tiling.\n"); + swizzle_x = I915_BIT_6_SWIZZLE_UNKNOWN; + swizzle_y = I915_BIT_6_SWIZZLE_UNKNOWN; + } + } + + if (swizzle_x == I915_BIT_6_SWIZZLE_UNKNOWN || + swizzle_y == I915_BIT_6_SWIZZLE_UNKNOWN) { + /* + * Userspace likes to explode if it sees unknown swizzling, + * so lie. We will finish the lie when reporting through + * the get-tiling-ioctl by reporting the physical swizzle + * mode as unknown instead. + * + * As we don't strictly know what the swizzling is, it may be + * bit17 dependent, and so we need to also prevent the pages + * from being moved. + */ + i915->quirks |= QUIRK_PIN_SWIZZLED_PAGES; + swizzle_x = I915_BIT_6_SWIZZLE_NONE; + swizzle_y = I915_BIT_6_SWIZZLE_NONE; + } + + i915->ggtt.bit_6_swizzle_x = swizzle_x; + i915->ggtt.bit_6_swizzle_y = swizzle_y; +} + +/* + * Swap every 64 bytes of this page around, to account for it having a new + * bit 17 of its physical address and therefore being interpreted differently + * by the GPU. + */ +static void swizzle_page(struct page *page) +{ + char temp[64]; + char *vaddr; + int i; + + vaddr = kmap(page); + + for (i = 0; i < PAGE_SIZE; i += 128) { + memcpy(temp, &vaddr[i], 64); + memcpy(&vaddr[i], &vaddr[i + 64], 64); + memcpy(&vaddr[i + 64], temp, 64); + } + + kunmap(page); +} + +/** + * i915_gem_object_do_bit_17_swizzle - fixup bit 17 swizzling + * @obj: i915 GEM buffer object + * @pages: the scattergather list of physical pages + * + * This function fixes up the swizzling in case any page frame number for this + * object has changed in bit 17 since that state has been saved with + * i915_gem_object_save_bit_17_swizzle(). + * + * This is called when pinning backing storage again, since the kernel is free + * to move unpinned backing storage around (either by directly moving pages or + * by swapping them out and back in again). + */ +void +i915_gem_object_do_bit_17_swizzle(struct drm_i915_gem_object *obj, + struct sg_table *pages) +{ + struct sgt_iter sgt_iter; + struct page *page; + int i; + + if (obj->bit_17 == NULL) + return; + + i = 0; + for_each_sgt_page(page, sgt_iter, pages) { + char new_bit_17 = page_to_phys(page) >> 17; + if ((new_bit_17 & 0x1) != (test_bit(i, obj->bit_17) != 0)) { + swizzle_page(page); + set_page_dirty(page); + } + i++; + } +} + +/** + * i915_gem_object_save_bit_17_swizzle - save bit 17 swizzling + * @obj: i915 GEM buffer object + * @pages: the scattergather list of physical pages + * + * This function saves the bit 17 of each page frame number so that swizzling + * can be fixed up later on with i915_gem_object_do_bit_17_swizzle(). This must + * be called before the backing storage can be unpinned. + */ +void +i915_gem_object_save_bit_17_swizzle(struct drm_i915_gem_object *obj, + struct sg_table *pages) +{ + const unsigned int page_count = obj->base.size >> PAGE_SHIFT; + struct sgt_iter sgt_iter; + struct page *page; + int i; + + if (obj->bit_17 == NULL) { + obj->bit_17 = bitmap_zalloc(page_count, GFP_KERNEL); + if (obj->bit_17 == NULL) { + DRM_ERROR("Failed to allocate memory for bit 17 " + "record\n"); + return; + } + } + + i = 0; + + for_each_sgt_page(page, sgt_iter, pages) { + if (page_to_phys(page) & (1 << 17)) + __set_bit(i, obj->bit_17); + else + __clear_bit(i, obj->bit_17); + i++; + } +} + +void intel_ggtt_init_fences(struct i915_ggtt *ggtt) +{ + struct drm_i915_private *i915 = ggtt->vm.i915; + struct intel_uncore *uncore = ggtt->vm.gt->uncore; + int num_fences; + int i; + + INIT_LIST_HEAD(&ggtt->fence_list); + INIT_LIST_HEAD(&ggtt->userfault_list); + intel_wakeref_auto_init(&ggtt->userfault_wakeref, uncore->rpm); + + detect_bit_6_swizzle(ggtt); + + if (!i915_ggtt_has_aperture(ggtt)) + num_fences = 0; + else if (INTEL_GEN(i915) >= 7 && + !(IS_VALLEYVIEW(i915) || IS_CHERRYVIEW(i915))) + num_fences = 32; + else if (INTEL_GEN(i915) >= 4 || + IS_I945G(i915) || IS_I945GM(i915) || + IS_G33(i915) || IS_PINEVIEW(i915)) + num_fences = 16; + else + num_fences = 8; + + if (intel_vgpu_active(i915)) + num_fences = intel_uncore_read(uncore, + vgtif_reg(avail_rs.fence_num)); + ggtt->fence_regs = kcalloc(num_fences, + sizeof(*ggtt->fence_regs), + GFP_KERNEL); + if (!ggtt->fence_regs) + num_fences = 0; + + /* Initialize fence registers to zero */ + for (i = 0; i < num_fences; i++) { + struct i915_fence_reg *fence = &ggtt->fence_regs[i]; + + i915_active_init(&fence->active, NULL, NULL); + fence->ggtt = ggtt; + fence->id = i; + list_add_tail(&fence->link, &ggtt->fence_list); + } + ggtt->num_fences = num_fences; + + intel_ggtt_restore_fences(ggtt); +} + +void intel_ggtt_fini_fences(struct i915_ggtt *ggtt) +{ + int i; + + for (i = 0; i < ggtt->num_fences; i++) { + struct i915_fence_reg *fence = &ggtt->fence_regs[i]; + + i915_active_fini(&fence->active); + } + + kfree(ggtt->fence_regs); +} + +void intel_gt_init_swizzling(struct intel_gt *gt) +{ + struct drm_i915_private *i915 = gt->i915; + struct intel_uncore *uncore = gt->uncore; + + if (INTEL_GEN(i915) < 5 || + i915->ggtt.bit_6_swizzle_x == I915_BIT_6_SWIZZLE_NONE) + return; + + intel_uncore_rmw(uncore, DISP_ARB_CTL, 0, DISP_TILE_SURFACE_SWIZZLING); + + if (IS_GEN(i915, 5)) + return; + + intel_uncore_rmw(uncore, TILECTL, 0, TILECTL_SWZCTL); + + if (IS_GEN(i915, 6)) + intel_uncore_write(uncore, + ARB_MODE, + _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_SNB)); + else if (IS_GEN(i915, 7)) + intel_uncore_write(uncore, + ARB_MODE, + _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_IVB)); + else if (IS_GEN(i915, 8)) + intel_uncore_write(uncore, + GAMTARBMODE, + _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_BDW)); + else + MISSING_CASE(INTEL_GEN(i915)); +} diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt_fencing.h b/drivers/gpu/drm/i915/gt/intel_ggtt_fencing.h new file mode 100644 index 000000000..9eef679e1 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_ggtt_fencing.h @@ -0,0 +1,78 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#ifndef __INTEL_GGTT_FENCING_H__ +#define __INTEL_GGTT_FENCING_H__ + +#include <linux/list.h> +#include <linux/types.h> + +#include "i915_active.h" + +struct drm_i915_gem_object; +struct i915_ggtt; +struct i915_vma; +struct intel_gt; +struct sg_table; + +#define I965_FENCE_PAGE 4096UL + +struct i915_fence_reg { + struct list_head link; + struct i915_ggtt *ggtt; + struct i915_vma *vma; + atomic_t pin_count; + struct i915_active active; + int id; + /** + * Whether the tiling parameters for the currently + * associated fence register have changed. Note that + * for the purposes of tracking tiling changes we also + * treat the unfenced register, the register slot that + * the object occupies whilst it executes a fenced + * command (such as BLT on gen2/3), as a "fence". + */ + bool dirty; + u32 start; + u32 size; + u32 tiling; + u32 stride; +}; + +struct i915_fence_reg *i915_reserve_fence(struct i915_ggtt *ggtt); +void i915_unreserve_fence(struct i915_fence_reg *fence); + +void intel_ggtt_restore_fences(struct i915_ggtt *ggtt); + +void i915_gem_object_do_bit_17_swizzle(struct drm_i915_gem_object *obj, + struct sg_table *pages); +void i915_gem_object_save_bit_17_swizzle(struct drm_i915_gem_object *obj, + struct sg_table *pages); + +void intel_ggtt_init_fences(struct i915_ggtt *ggtt); +void intel_ggtt_fini_fences(struct i915_ggtt *ggtt); + +void intel_gt_init_swizzling(struct intel_gt *gt); + +#endif diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h new file mode 100644 index 000000000..534e435f2 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h @@ -0,0 +1,362 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright � 2003-2018 Intel Corporation + */ + +#ifndef _INTEL_GPU_COMMANDS_H_ +#define _INTEL_GPU_COMMANDS_H_ + +#include <linux/bitops.h> + +/* + * Target address alignments required for GPU access e.g. + * MI_STORE_DWORD_IMM. + */ +#define alignof_dword 4 +#define alignof_qword 8 + +/* + * Instruction field definitions used by the command parser + */ +#define INSTR_CLIENT_SHIFT 29 +#define INSTR_MI_CLIENT 0x0 +#define INSTR_BC_CLIENT 0x2 +#define INSTR_RC_CLIENT 0x3 +#define INSTR_SUBCLIENT_SHIFT 27 +#define INSTR_SUBCLIENT_MASK 0x18000000 +#define INSTR_MEDIA_SUBCLIENT 0x2 +#define INSTR_26_TO_24_MASK 0x7000000 +#define INSTR_26_TO_24_SHIFT 24 + +/* + * Memory interface instructions used by the kernel + */ +#define MI_INSTR(opcode, flags) (((opcode) << 23) | (flags)) +/* Many MI commands use bit 22 of the header dword for GGTT vs PPGTT */ +#define MI_GLOBAL_GTT (1<<22) + +#define MI_NOOP MI_INSTR(0, 0) +#define MI_USER_INTERRUPT MI_INSTR(0x02, 0) +#define MI_WAIT_FOR_EVENT MI_INSTR(0x03, 0) +#define MI_WAIT_FOR_OVERLAY_FLIP (1<<16) +#define MI_WAIT_FOR_PLANE_B_FLIP (1<<6) +#define MI_WAIT_FOR_PLANE_A_FLIP (1<<2) +#define MI_WAIT_FOR_PLANE_A_SCANLINES (1<<1) +#define MI_FLUSH MI_INSTR(0x04, 0) +#define MI_READ_FLUSH (1 << 0) +#define MI_EXE_FLUSH (1 << 1) +#define MI_NO_WRITE_FLUSH (1 << 2) +#define MI_SCENE_COUNT (1 << 3) /* just increment scene count */ +#define MI_END_SCENE (1 << 4) /* flush binner and incr scene count */ +#define MI_INVALIDATE_ISP (1 << 5) /* invalidate indirect state pointers */ +#define MI_REPORT_HEAD MI_INSTR(0x07, 0) +#define MI_ARB_ON_OFF MI_INSTR(0x08, 0) +#define MI_ARB_ENABLE (1<<0) +#define MI_ARB_DISABLE (0<<0) +#define MI_BATCH_BUFFER_END MI_INSTR(0x0a, 0) +#define MI_SUSPEND_FLUSH MI_INSTR(0x0b, 0) +#define MI_SUSPEND_FLUSH_EN (1<<0) +#define MI_SET_APPID MI_INSTR(0x0e, 0) +#define MI_OVERLAY_FLIP MI_INSTR(0x11, 0) +#define MI_OVERLAY_CONTINUE (0x0<<21) +#define MI_OVERLAY_ON (0x1<<21) +#define MI_OVERLAY_OFF (0x2<<21) +#define MI_LOAD_SCAN_LINES_INCL MI_INSTR(0x12, 0) +#define MI_DISPLAY_FLIP MI_INSTR(0x14, 2) +#define MI_DISPLAY_FLIP_I915 MI_INSTR(0x14, 1) +#define MI_DISPLAY_FLIP_PLANE(n) ((n) << 20) +/* IVB has funny definitions for which plane to flip. */ +#define MI_DISPLAY_FLIP_IVB_PLANE_A (0 << 19) +#define MI_DISPLAY_FLIP_IVB_PLANE_B (1 << 19) +#define MI_DISPLAY_FLIP_IVB_SPRITE_A (2 << 19) +#define MI_DISPLAY_FLIP_IVB_SPRITE_B (3 << 19) +#define MI_DISPLAY_FLIP_IVB_PLANE_C (4 << 19) +#define MI_DISPLAY_FLIP_IVB_SPRITE_C (5 << 19) +/* SKL ones */ +#define MI_DISPLAY_FLIP_SKL_PLANE_1_A (0 << 8) +#define MI_DISPLAY_FLIP_SKL_PLANE_1_B (1 << 8) +#define MI_DISPLAY_FLIP_SKL_PLANE_1_C (2 << 8) +#define MI_DISPLAY_FLIP_SKL_PLANE_2_A (4 << 8) +#define MI_DISPLAY_FLIP_SKL_PLANE_2_B (5 << 8) +#define MI_DISPLAY_FLIP_SKL_PLANE_2_C (6 << 8) +#define MI_DISPLAY_FLIP_SKL_PLANE_3_A (7 << 8) +#define MI_DISPLAY_FLIP_SKL_PLANE_3_B (8 << 8) +#define MI_DISPLAY_FLIP_SKL_PLANE_3_C (9 << 8) +#define MI_SEMAPHORE_MBOX MI_INSTR(0x16, 1) /* gen6, gen7 */ +#define MI_SEMAPHORE_GLOBAL_GTT (1<<22) +#define MI_SEMAPHORE_UPDATE (1<<21) +#define MI_SEMAPHORE_COMPARE (1<<20) +#define MI_SEMAPHORE_REGISTER (1<<18) +#define MI_SEMAPHORE_SYNC_VR (0<<16) /* RCS wait for VCS (RVSYNC) */ +#define MI_SEMAPHORE_SYNC_VER (1<<16) /* RCS wait for VECS (RVESYNC) */ +#define MI_SEMAPHORE_SYNC_BR (2<<16) /* RCS wait for BCS (RBSYNC) */ +#define MI_SEMAPHORE_SYNC_BV (0<<16) /* VCS wait for BCS (VBSYNC) */ +#define MI_SEMAPHORE_SYNC_VEV (1<<16) /* VCS wait for VECS (VVESYNC) */ +#define MI_SEMAPHORE_SYNC_RV (2<<16) /* VCS wait for RCS (VRSYNC) */ +#define MI_SEMAPHORE_SYNC_RB (0<<16) /* BCS wait for RCS (BRSYNC) */ +#define MI_SEMAPHORE_SYNC_VEB (1<<16) /* BCS wait for VECS (BVESYNC) */ +#define MI_SEMAPHORE_SYNC_VB (2<<16) /* BCS wait for VCS (BVSYNC) */ +#define MI_SEMAPHORE_SYNC_BVE (0<<16) /* VECS wait for BCS (VEBSYNC) */ +#define MI_SEMAPHORE_SYNC_VVE (1<<16) /* VECS wait for VCS (VEVSYNC) */ +#define MI_SEMAPHORE_SYNC_RVE (2<<16) /* VECS wait for RCS (VERSYNC) */ +#define MI_SEMAPHORE_SYNC_INVALID (3<<16) +#define MI_SEMAPHORE_SYNC_MASK (3<<16) +#define MI_SET_CONTEXT MI_INSTR(0x18, 0) +#define MI_MM_SPACE_GTT (1<<8) +#define MI_MM_SPACE_PHYSICAL (0<<8) +#define MI_SAVE_EXT_STATE_EN (1<<3) +#define MI_RESTORE_EXT_STATE_EN (1<<2) +#define MI_FORCE_RESTORE (1<<1) +#define MI_RESTORE_INHIBIT (1<<0) +#define HSW_MI_RS_SAVE_STATE_EN (1<<3) +#define HSW_MI_RS_RESTORE_STATE_EN (1<<2) +#define MI_SEMAPHORE_SIGNAL MI_INSTR(0x1b, 0) /* GEN8+ */ +#define MI_SEMAPHORE_TARGET(engine) ((engine)<<15) +#define MI_SEMAPHORE_WAIT MI_INSTR(0x1c, 2) /* GEN8+ */ +#define MI_SEMAPHORE_WAIT_TOKEN MI_INSTR(0x1c, 3) /* GEN12+ */ +#define MI_SEMAPHORE_POLL (1 << 15) +#define MI_SEMAPHORE_SAD_GT_SDD (0 << 12) +#define MI_SEMAPHORE_SAD_GTE_SDD (1 << 12) +#define MI_SEMAPHORE_SAD_LT_SDD (2 << 12) +#define MI_SEMAPHORE_SAD_LTE_SDD (3 << 12) +#define MI_SEMAPHORE_SAD_EQ_SDD (4 << 12) +#define MI_SEMAPHORE_SAD_NEQ_SDD (5 << 12) +#define MI_SEMAPHORE_TOKEN_MASK REG_GENMASK(9, 5) +#define MI_SEMAPHORE_TOKEN_SHIFT 5 +#define MI_STORE_DWORD_IMM MI_INSTR(0x20, 1) +#define MI_STORE_DWORD_IMM_GEN4 MI_INSTR(0x20, 2) +#define MI_MEM_VIRTUAL (1 << 22) /* 945,g33,965 */ +#define MI_USE_GGTT (1 << 22) /* g4x+ */ +#define MI_STORE_DWORD_INDEX MI_INSTR(0x21, 1) +/* + * Official intel docs are somewhat sloppy concerning MI_LOAD_REGISTER_IMM: + * - Always issue a MI_NOOP _before_ the MI_LOAD_REGISTER_IMM - otherwise hw + * simply ignores the register load under certain conditions. + * - One can actually load arbitrary many arbitrary registers: Simply issue x + * address/value pairs. Don't overdue it, though, x <= 2^4 must hold! + */ +#define MI_LOAD_REGISTER_IMM(x) MI_INSTR(0x22, 2*(x)-1) +/* Gen11+. addr = base + (ctx_restore ? offset & GENMASK(12,2) : offset) */ +#define MI_LRI_LRM_CS_MMIO REG_BIT(19) +#define MI_LRI_FORCE_POSTED (1<<12) +#define MI_LOAD_REGISTER_IMM_MAX_REGS (126) +#define MI_STORE_REGISTER_MEM MI_INSTR(0x24, 1) +#define MI_STORE_REGISTER_MEM_GEN8 MI_INSTR(0x24, 2) +#define MI_SRM_LRM_GLOBAL_GTT (1<<22) +#define MI_FLUSH_DW MI_INSTR(0x26, 1) /* for GEN6 */ +#define MI_FLUSH_DW_STORE_INDEX (1<<21) +#define MI_INVALIDATE_TLB (1<<18) +#define MI_FLUSH_DW_OP_STOREDW (1<<14) +#define MI_FLUSH_DW_OP_MASK (3<<14) +#define MI_FLUSH_DW_NOTIFY (1<<8) +#define MI_INVALIDATE_BSD (1<<7) +#define MI_FLUSH_DW_USE_GTT (1<<2) +#define MI_FLUSH_DW_USE_PPGTT (0<<2) +#define MI_LOAD_REGISTER_MEM MI_INSTR(0x29, 1) +#define MI_LOAD_REGISTER_MEM_GEN8 MI_INSTR(0x29, 2) +#define MI_LOAD_REGISTER_REG MI_INSTR(0x2A, 1) +#define MI_LRR_SOURCE_CS_MMIO REG_BIT(18) +#define MI_BATCH_BUFFER MI_INSTR(0x30, 1) +#define MI_BATCH_NON_SECURE (1) +/* for snb/ivb/vlv this also means "batch in ppgtt" when ppgtt is enabled. */ +#define MI_BATCH_NON_SECURE_I965 (1<<8) +#define MI_BATCH_PPGTT_HSW (1<<8) +#define MI_BATCH_NON_SECURE_HSW (1<<13) +#define MI_BATCH_BUFFER_START MI_INSTR(0x31, 0) +#define MI_BATCH_GTT (2<<6) /* aliased with (1<<7) on gen4 */ +#define MI_BATCH_BUFFER_START_GEN8 MI_INSTR(0x31, 1) +#define MI_BATCH_RESOURCE_STREAMER REG_BIT(10) +#define MI_BATCH_PREDICATE REG_BIT(15) /* HSW+ on RCS only*/ + +/* + * 3D instructions used by the kernel + */ +#define GFX_INSTR(opcode, flags) ((0x3 << 29) | ((opcode) << 24) | (flags)) + +#define GEN9_MEDIA_POOL_STATE ((0x3 << 29) | (0x2 << 27) | (0x5 << 16) | 4) +#define GEN9_MEDIA_POOL_ENABLE (1 << 31) +#define GFX_OP_RASTER_RULES ((0x3<<29)|(0x7<<24)) +#define GFX_OP_SCISSOR ((0x3<<29)|(0x1c<<24)|(0x10<<19)) +#define SC_UPDATE_SCISSOR (0x1<<1) +#define SC_ENABLE_MASK (0x1<<0) +#define SC_ENABLE (0x1<<0) +#define GFX_OP_LOAD_INDIRECT ((0x3<<29)|(0x1d<<24)|(0x7<<16)) +#define GFX_OP_SCISSOR_INFO ((0x3<<29)|(0x1d<<24)|(0x81<<16)|(0x1)) +#define SCI_YMIN_MASK (0xffff<<16) +#define SCI_XMIN_MASK (0xffff<<0) +#define SCI_YMAX_MASK (0xffff<<16) +#define SCI_XMAX_MASK (0xffff<<0) +#define GFX_OP_SCISSOR_ENABLE ((0x3<<29)|(0x1c<<24)|(0x10<<19)) +#define GFX_OP_SCISSOR_RECT ((0x3<<29)|(0x1d<<24)|(0x81<<16)|1) +#define GFX_OP_COLOR_FACTOR ((0x3<<29)|(0x1d<<24)|(0x1<<16)|0x0) +#define GFX_OP_STIPPLE ((0x3<<29)|(0x1d<<24)|(0x83<<16)) +#define GFX_OP_MAP_INFO ((0x3<<29)|(0x1d<<24)|0x4) +#define GFX_OP_DESTBUFFER_VARS ((0x3<<29)|(0x1d<<24)|(0x85<<16)|0x0) +#define GFX_OP_DESTBUFFER_INFO ((0x3<<29)|(0x1d<<24)|(0x8e<<16)|1) +#define GFX_OP_DRAWRECT_INFO ((0x3<<29)|(0x1d<<24)|(0x80<<16)|(0x3)) +#define GFX_OP_DRAWRECT_INFO_I965 ((0x7900<<16)|0x2) + +#define COLOR_BLT_CMD (2 << 29 | 0x40 << 22 | (5 - 2)) +#define XY_COLOR_BLT_CMD (2 << 29 | 0x50 << 22) +#define SRC_COPY_BLT_CMD (2 << 29 | 0x43 << 22) +#define GEN9_XY_FAST_COPY_BLT_CMD (2 << 29 | 0x42 << 22) +#define XY_SRC_COPY_BLT_CMD (2 << 29 | 0x53 << 22) +#define XY_MONO_SRC_COPY_IMM_BLT (2 << 29 | 0x71 << 22 | 5) +#define BLT_WRITE_A (2<<20) +#define BLT_WRITE_RGB (1<<20) +#define BLT_WRITE_RGBA (BLT_WRITE_RGB | BLT_WRITE_A) +#define BLT_DEPTH_8 (0<<24) +#define BLT_DEPTH_16_565 (1<<24) +#define BLT_DEPTH_16_1555 (2<<24) +#define BLT_DEPTH_32 (3<<24) +#define BLT_ROP_SRC_COPY (0xcc<<16) +#define BLT_ROP_COLOR_COPY (0xf0<<16) +#define XY_SRC_COPY_BLT_SRC_TILED (1<<15) /* 965+ only */ +#define XY_SRC_COPY_BLT_DST_TILED (1<<11) /* 965+ only */ +#define CMD_OP_DISPLAYBUFFER_INFO ((0x0<<29)|(0x14<<23)|2) +#define ASYNC_FLIP (1<<22) +#define DISPLAY_PLANE_A (0<<20) +#define DISPLAY_PLANE_B (1<<20) +#define GFX_OP_PIPE_CONTROL(len) ((0x3<<29)|(0x3<<27)|(0x2<<24)|((len)-2)) +#define PIPE_CONTROL_COMMAND_CACHE_INVALIDATE (1<<29) /* gen11+ */ +#define PIPE_CONTROL_TILE_CACHE_FLUSH (1<<28) /* gen11+ */ +#define PIPE_CONTROL_FLUSH_L3 (1<<27) +#define PIPE_CONTROL_GLOBAL_GTT_IVB (1<<24) /* gen7+ */ +#define PIPE_CONTROL_MMIO_WRITE (1<<23) +#define PIPE_CONTROL_STORE_DATA_INDEX (1<<21) +#define PIPE_CONTROL_CS_STALL (1<<20) +#define PIPE_CONTROL_TLB_INVALIDATE (1<<18) +#define PIPE_CONTROL_MEDIA_STATE_CLEAR (1<<16) +#define PIPE_CONTROL_WRITE_TIMESTAMP (3<<14) +#define PIPE_CONTROL_QW_WRITE (1<<14) +#define PIPE_CONTROL_POST_SYNC_OP_MASK (3<<14) +#define PIPE_CONTROL_DEPTH_STALL (1<<13) +#define PIPE_CONTROL_WRITE_FLUSH (1<<12) +#define PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH (1<<12) /* gen6+ */ +#define PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE (1<<11) /* MBZ on ILK */ +#define PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE (1<<10) /* GM45+ only */ +#define PIPE_CONTROL_INDIRECT_STATE_DISABLE (1<<9) +#define PIPE_CONTROL0_HDC_PIPELINE_FLUSH REG_BIT(9) /* gen12 */ +#define PIPE_CONTROL_NOTIFY (1<<8) +#define PIPE_CONTROL_FLUSH_ENABLE (1<<7) /* gen7+ */ +#define PIPE_CONTROL_DC_FLUSH_ENABLE (1<<5) +#define PIPE_CONTROL_VF_CACHE_INVALIDATE (1<<4) +#define PIPE_CONTROL_CONST_CACHE_INVALIDATE (1<<3) +#define PIPE_CONTROL_STATE_CACHE_INVALIDATE (1<<2) +#define PIPE_CONTROL_STALL_AT_SCOREBOARD (1<<1) +#define PIPE_CONTROL_DEPTH_CACHE_FLUSH (1<<0) +#define PIPE_CONTROL_GLOBAL_GTT (1<<2) /* in addr dword */ + +#define MI_MATH(x) MI_INSTR(0x1a, (x) - 1) +#define MI_MATH_INSTR(opcode, op1, op2) ((opcode) << 20 | (op1) << 10 | (op2)) +/* Opcodes for MI_MATH_INSTR */ +#define MI_MATH_NOOP MI_MATH_INSTR(0x000, 0x0, 0x0) +#define MI_MATH_LOAD(op1, op2) MI_MATH_INSTR(0x080, op1, op2) +#define MI_MATH_LOADINV(op1, op2) MI_MATH_INSTR(0x480, op1, op2) +#define MI_MATH_LOAD0(op1) MI_MATH_INSTR(0x081, op1) +#define MI_MATH_LOAD1(op1) MI_MATH_INSTR(0x481, op1) +#define MI_MATH_ADD MI_MATH_INSTR(0x100, 0x0, 0x0) +#define MI_MATH_SUB MI_MATH_INSTR(0x101, 0x0, 0x0) +#define MI_MATH_AND MI_MATH_INSTR(0x102, 0x0, 0x0) +#define MI_MATH_OR MI_MATH_INSTR(0x103, 0x0, 0x0) +#define MI_MATH_XOR MI_MATH_INSTR(0x104, 0x0, 0x0) +#define MI_MATH_STORE(op1, op2) MI_MATH_INSTR(0x180, op1, op2) +#define MI_MATH_STOREINV(op1, op2) MI_MATH_INSTR(0x580, op1, op2) +/* Registers used as operands in MI_MATH_INSTR */ +#define MI_MATH_REG(x) (x) +#define MI_MATH_REG_SRCA 0x20 +#define MI_MATH_REG_SRCB 0x21 +#define MI_MATH_REG_ACCU 0x31 +#define MI_MATH_REG_ZF 0x32 +#define MI_MATH_REG_CF 0x33 + +/* + * Commands used only by the command parser + */ +#define MI_SET_PREDICATE MI_INSTR(0x01, 0) +#define MI_ARB_CHECK MI_INSTR(0x05, 0) +#define MI_RS_CONTROL MI_INSTR(0x06, 0) +#define MI_URB_ATOMIC_ALLOC MI_INSTR(0x09, 0) +#define MI_PREDICATE MI_INSTR(0x0C, 0) +#define MI_RS_CONTEXT MI_INSTR(0x0F, 0) +#define MI_TOPOLOGY_FILTER MI_INSTR(0x0D, 0) +#define MI_LOAD_SCAN_LINES_EXCL MI_INSTR(0x13, 0) +#define MI_URB_CLEAR MI_INSTR(0x19, 0) +#define MI_UPDATE_GTT MI_INSTR(0x23, 0) +#define MI_CLFLUSH MI_INSTR(0x27, 0) +#define MI_REPORT_PERF_COUNT MI_INSTR(0x28, 0) +#define MI_REPORT_PERF_COUNT_GGTT (1<<0) +#define MI_RS_STORE_DATA_IMM MI_INSTR(0x2B, 0) +#define MI_LOAD_URB_MEM MI_INSTR(0x2C, 0) +#define MI_STORE_URB_MEM MI_INSTR(0x2D, 0) +#define MI_CONDITIONAL_BATCH_BUFFER_END MI_INSTR(0x36, 0) + +#define STATE_BASE_ADDRESS \ + ((0x3 << 29) | (0x0 << 27) | (0x1 << 24) | (0x1 << 16)) +#define BASE_ADDRESS_MODIFY REG_BIT(0) +#define PIPELINE_SELECT \ + ((0x3 << 29) | (0x1 << 27) | (0x1 << 24) | (0x4 << 16)) +#define PIPELINE_SELECT_MEDIA REG_BIT(0) +#define GFX_OP_3DSTATE_VF_STATISTICS \ + ((0x3 << 29) | (0x1 << 27) | (0x0 << 24) | (0xB << 16)) +#define MEDIA_VFE_STATE \ + ((0x3 << 29) | (0x2 << 27) | (0x0 << 24) | (0x0 << 16)) +#define MEDIA_VFE_STATE_MMIO_ACCESS_MASK (0x18) +#define MEDIA_INTERFACE_DESCRIPTOR_LOAD \ + ((0x3 << 29) | (0x2 << 27) | (0x0 << 24) | (0x2 << 16)) +#define MEDIA_OBJECT \ + ((0x3 << 29) | (0x2 << 27) | (0x1 << 24) | (0x0 << 16)) +#define GPGPU_OBJECT ((0x3<<29)|(0x2<<27)|(0x1<<24)|(0x4<<16)) +#define GPGPU_WALKER ((0x3<<29)|(0x2<<27)|(0x1<<24)|(0x5<<16)) +#define GFX_OP_3DSTATE_DX9_CONSTANTF_VS \ + ((0x3<<29)|(0x3<<27)|(0x0<<24)|(0x39<<16)) +#define GFX_OP_3DSTATE_DX9_CONSTANTF_PS \ + ((0x3<<29)|(0x3<<27)|(0x0<<24)|(0x3A<<16)) +#define GFX_OP_3DSTATE_SO_DECL_LIST \ + ((0x3<<29)|(0x3<<27)|(0x1<<24)|(0x17<<16)) + +#define GFX_OP_3DSTATE_BINDING_TABLE_EDIT_VS \ + ((0x3<<29)|(0x3<<27)|(0x0<<24)|(0x43<<16)) +#define GFX_OP_3DSTATE_BINDING_TABLE_EDIT_GS \ + ((0x3<<29)|(0x3<<27)|(0x0<<24)|(0x44<<16)) +#define GFX_OP_3DSTATE_BINDING_TABLE_EDIT_HS \ + ((0x3<<29)|(0x3<<27)|(0x0<<24)|(0x45<<16)) +#define GFX_OP_3DSTATE_BINDING_TABLE_EDIT_DS \ + ((0x3<<29)|(0x3<<27)|(0x0<<24)|(0x46<<16)) +#define GFX_OP_3DSTATE_BINDING_TABLE_EDIT_PS \ + ((0x3<<29)|(0x3<<27)|(0x0<<24)|(0x47<<16)) + +#define MFX_WAIT ((0x3<<29)|(0x1<<27)|(0x0<<16)) + +#define COLOR_BLT ((0x2<<29)|(0x40<<22)) +#define SRC_COPY_BLT ((0x2<<29)|(0x43<<22)) + +/* + * Used to convert any address to canonical form. + * Starting from gen8, some commands (e.g. STATE_BASE_ADDRESS, + * MI_LOAD_REGISTER_MEM and others, see Broadwell PRM Vol2a) require the + * addresses to be in a canonical form: + * "GraphicsAddress[63:48] are ignored by the HW and assumed to be in correct + * canonical form [63:48] == [47]." + */ +#define GEN8_HIGH_ADDRESS_BIT 47 +static inline u64 gen8_canonical_addr(u64 address) +{ + return sign_extend64(address, GEN8_HIGH_ADDRESS_BIT); +} + +static inline u64 gen8_noncanonical_addr(u64 address) +{ + return address & GENMASK_ULL(GEN8_HIGH_ADDRESS_BIT, 0); +} + +static inline u32 *__gen6_emit_bb_start(u32 *cs, u32 addr, unsigned int flags) +{ + *cs++ = MI_BATCH_BUFFER_START | flags; + *cs++ = addr; + + return cs; +} + +#endif /* _INTEL_GPU_COMMANDS_H_ */ diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c new file mode 100644 index 000000000..5f86d9aac --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_gt.c @@ -0,0 +1,782 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2019 Intel Corporation + */ + +#include "debugfs_gt.h" +#include "i915_drv.h" +#include "intel_context.h" +#include "intel_gt.h" +#include "intel_gt_buffer_pool.h" +#include "intel_gt_clock_utils.h" +#include "intel_gt_pm.h" +#include "intel_gt_requests.h" +#include "intel_mocs.h" +#include "intel_rc6.h" +#include "intel_renderstate.h" +#include "intel_rps.h" +#include "intel_uncore.h" +#include "intel_pm.h" +#include "shmem_utils.h" + +void intel_gt_init_early(struct intel_gt *gt, struct drm_i915_private *i915) +{ + gt->i915 = i915; + gt->uncore = &i915->uncore; + + spin_lock_init(>->irq_lock); + + mutex_init(>->tlb_invalidate_lock); + + INIT_LIST_HEAD(>->closed_vma); + spin_lock_init(>->closed_lock); + + intel_gt_init_buffer_pool(gt); + intel_gt_init_reset(gt); + intel_gt_init_requests(gt); + intel_gt_init_timelines(gt); + intel_gt_pm_init_early(gt); + + intel_rps_init_early(>->rps); + intel_uc_init_early(>->uc); +} + +void intel_gt_init_hw_early(struct intel_gt *gt, struct i915_ggtt *ggtt) +{ + gt->ggtt = ggtt; +} + +int intel_gt_init_mmio(struct intel_gt *gt) +{ + intel_uc_init_mmio(>->uc); + intel_sseu_info_init(gt); + + return intel_engines_init_mmio(gt); +} + +static void init_unused_ring(struct intel_gt *gt, u32 base) +{ + struct intel_uncore *uncore = gt->uncore; + + intel_uncore_write(uncore, RING_CTL(base), 0); + intel_uncore_write(uncore, RING_HEAD(base), 0); + intel_uncore_write(uncore, RING_TAIL(base), 0); + intel_uncore_write(uncore, RING_START(base), 0); +} + +static void init_unused_rings(struct intel_gt *gt) +{ + struct drm_i915_private *i915 = gt->i915; + + if (IS_I830(i915)) { + init_unused_ring(gt, PRB1_BASE); + init_unused_ring(gt, SRB0_BASE); + init_unused_ring(gt, SRB1_BASE); + init_unused_ring(gt, SRB2_BASE); + init_unused_ring(gt, SRB3_BASE); + } else if (IS_GEN(i915, 2)) { + init_unused_ring(gt, SRB0_BASE); + init_unused_ring(gt, SRB1_BASE); + } else if (IS_GEN(i915, 3)) { + init_unused_ring(gt, PRB1_BASE); + init_unused_ring(gt, PRB2_BASE); + } +} + +int intel_gt_init_hw(struct intel_gt *gt) +{ + struct drm_i915_private *i915 = gt->i915; + struct intel_uncore *uncore = gt->uncore; + int ret; + + gt->last_init_time = ktime_get(); + + /* Double layer security blanket, see i915_gem_init() */ + intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL); + + if (HAS_EDRAM(i915) && INTEL_GEN(i915) < 9) + intel_uncore_rmw(uncore, HSW_IDICR, 0, IDIHASHMSK(0xf)); + + if (IS_HASWELL(i915)) + intel_uncore_write(uncore, + MI_PREDICATE_RESULT_2, + IS_HSW_GT3(i915) ? + LOWER_SLICE_ENABLED : LOWER_SLICE_DISABLED); + + /* Apply the GT workarounds... */ + intel_gt_apply_workarounds(gt); + /* ...and determine whether they are sticking. */ + intel_gt_verify_workarounds(gt, "init"); + + intel_gt_init_swizzling(gt); + + /* + * At least 830 can leave some of the unused rings + * "active" (ie. head != tail) after resume which + * will prevent c3 entry. Makes sure all unused rings + * are totally idle. + */ + init_unused_rings(gt); + + ret = i915_ppgtt_init_hw(gt); + if (ret) { + DRM_ERROR("Enabling PPGTT failed (%d)\n", ret); + goto out; + } + + /* We can't enable contexts until all firmware is loaded */ + ret = intel_uc_init_hw(>->uc); + if (ret) { + i915_probe_error(i915, "Enabling uc failed (%d)\n", ret); + goto out; + } + + intel_mocs_init(gt); + +out: + intel_uncore_forcewake_put(uncore, FORCEWAKE_ALL); + return ret; +} + +static void rmw_set(struct intel_uncore *uncore, i915_reg_t reg, u32 set) +{ + intel_uncore_rmw(uncore, reg, 0, set); +} + +static void rmw_clear(struct intel_uncore *uncore, i915_reg_t reg, u32 clr) +{ + intel_uncore_rmw(uncore, reg, clr, 0); +} + +static void clear_register(struct intel_uncore *uncore, i915_reg_t reg) +{ + intel_uncore_rmw(uncore, reg, 0, 0); +} + +static void gen8_clear_engine_error_register(struct intel_engine_cs *engine) +{ + GEN6_RING_FAULT_REG_RMW(engine, RING_FAULT_VALID, 0); + GEN6_RING_FAULT_REG_POSTING_READ(engine); +} + +void +intel_gt_clear_error_registers(struct intel_gt *gt, + intel_engine_mask_t engine_mask) +{ + struct drm_i915_private *i915 = gt->i915; + struct intel_uncore *uncore = gt->uncore; + u32 eir; + + if (!IS_GEN(i915, 2)) + clear_register(uncore, PGTBL_ER); + + if (INTEL_GEN(i915) < 4) + clear_register(uncore, IPEIR(RENDER_RING_BASE)); + else + clear_register(uncore, IPEIR_I965); + + clear_register(uncore, EIR); + eir = intel_uncore_read(uncore, EIR); + if (eir) { + /* + * some errors might have become stuck, + * mask them. + */ + DRM_DEBUG_DRIVER("EIR stuck: 0x%08x, masking\n", eir); + rmw_set(uncore, EMR, eir); + intel_uncore_write(uncore, GEN2_IIR, + I915_MASTER_ERROR_INTERRUPT); + } + + if (INTEL_GEN(i915) >= 12) { + rmw_clear(uncore, GEN12_RING_FAULT_REG, RING_FAULT_VALID); + intel_uncore_posting_read(uncore, GEN12_RING_FAULT_REG); + } else if (INTEL_GEN(i915) >= 8) { + rmw_clear(uncore, GEN8_RING_FAULT_REG, RING_FAULT_VALID); + intel_uncore_posting_read(uncore, GEN8_RING_FAULT_REG); + } else if (INTEL_GEN(i915) >= 6) { + struct intel_engine_cs *engine; + enum intel_engine_id id; + + for_each_engine_masked(engine, gt, engine_mask, id) + gen8_clear_engine_error_register(engine); + } +} + +static void gen6_check_faults(struct intel_gt *gt) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + u32 fault; + + for_each_engine(engine, gt, id) { + fault = GEN6_RING_FAULT_REG_READ(engine); + if (fault & RING_FAULT_VALID) { + drm_dbg(&engine->i915->drm, "Unexpected fault\n" + "\tAddr: 0x%08lx\n" + "\tAddress space: %s\n" + "\tSource ID: %d\n" + "\tType: %d\n", + fault & PAGE_MASK, + fault & RING_FAULT_GTTSEL_MASK ? + "GGTT" : "PPGTT", + RING_FAULT_SRCID(fault), + RING_FAULT_FAULT_TYPE(fault)); + } + } +} + +static void gen8_check_faults(struct intel_gt *gt) +{ + struct intel_uncore *uncore = gt->uncore; + i915_reg_t fault_reg, fault_data0_reg, fault_data1_reg; + u32 fault; + + if (INTEL_GEN(gt->i915) >= 12) { + fault_reg = GEN12_RING_FAULT_REG; + fault_data0_reg = GEN12_FAULT_TLB_DATA0; + fault_data1_reg = GEN12_FAULT_TLB_DATA1; + } else { + fault_reg = GEN8_RING_FAULT_REG; + fault_data0_reg = GEN8_FAULT_TLB_DATA0; + fault_data1_reg = GEN8_FAULT_TLB_DATA1; + } + + fault = intel_uncore_read(uncore, fault_reg); + if (fault & RING_FAULT_VALID) { + u32 fault_data0, fault_data1; + u64 fault_addr; + + fault_data0 = intel_uncore_read(uncore, fault_data0_reg); + fault_data1 = intel_uncore_read(uncore, fault_data1_reg); + + fault_addr = ((u64)(fault_data1 & FAULT_VA_HIGH_BITS) << 44) | + ((u64)fault_data0 << 12); + + drm_dbg(&uncore->i915->drm, "Unexpected fault\n" + "\tAddr: 0x%08x_%08x\n" + "\tAddress space: %s\n" + "\tEngine ID: %d\n" + "\tSource ID: %d\n" + "\tType: %d\n", + upper_32_bits(fault_addr), lower_32_bits(fault_addr), + fault_data1 & FAULT_GTT_SEL ? "GGTT" : "PPGTT", + GEN8_RING_FAULT_ENGINE_ID(fault), + RING_FAULT_SRCID(fault), + RING_FAULT_FAULT_TYPE(fault)); + } +} + +void intel_gt_check_and_clear_faults(struct intel_gt *gt) +{ + struct drm_i915_private *i915 = gt->i915; + + /* From GEN8 onwards we only have one 'All Engine Fault Register' */ + if (INTEL_GEN(i915) >= 8) + gen8_check_faults(gt); + else if (INTEL_GEN(i915) >= 6) + gen6_check_faults(gt); + else + return; + + intel_gt_clear_error_registers(gt, ALL_ENGINES); +} + +void intel_gt_flush_ggtt_writes(struct intel_gt *gt) +{ + struct intel_uncore *uncore = gt->uncore; + intel_wakeref_t wakeref; + + /* + * No actual flushing is required for the GTT write domain for reads + * from the GTT domain. Writes to it "immediately" go to main memory + * as far as we know, so there's no chipset flush. It also doesn't + * land in the GPU render cache. + * + * However, we do have to enforce the order so that all writes through + * the GTT land before any writes to the device, such as updates to + * the GATT itself. + * + * We also have to wait a bit for the writes to land from the GTT. + * An uncached read (i.e. mmio) seems to be ideal for the round-trip + * timing. This issue has only been observed when switching quickly + * between GTT writes and CPU reads from inside the kernel on recent hw, + * and it appears to only affect discrete GTT blocks (i.e. on LLC + * system agents we cannot reproduce this behaviour, until Cannonlake + * that was!). + */ + + wmb(); + + if (INTEL_INFO(gt->i915)->has_coherent_ggtt) + return; + + intel_gt_chipset_flush(gt); + + with_intel_runtime_pm_if_in_use(uncore->rpm, wakeref) { + unsigned long flags; + + spin_lock_irqsave(&uncore->lock, flags); + intel_uncore_posting_read_fw(uncore, + RING_HEAD(RENDER_RING_BASE)); + spin_unlock_irqrestore(&uncore->lock, flags); + } +} + +void intel_gt_chipset_flush(struct intel_gt *gt) +{ + wmb(); + if (INTEL_GEN(gt->i915) < 6) + intel_gtt_chipset_flush(); +} + +void intel_gt_driver_register(struct intel_gt *gt) +{ + intel_rps_driver_register(>->rps); + + debugfs_gt_register(gt); +} + +static int intel_gt_init_scratch(struct intel_gt *gt, unsigned int size) +{ + struct drm_i915_private *i915 = gt->i915; + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + int ret; + + obj = i915_gem_object_create_stolen(i915, size); + if (IS_ERR(obj)) + obj = i915_gem_object_create_internal(i915, size); + if (IS_ERR(obj)) { + DRM_ERROR("Failed to allocate scratch page\n"); + return PTR_ERR(obj); + } + + vma = i915_vma_instance(obj, >->ggtt->vm, NULL); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto err_unref; + } + + ret = i915_ggtt_pin(vma, NULL, 0, PIN_HIGH); + if (ret) + goto err_unref; + + gt->scratch = i915_vma_make_unshrinkable(vma); + + return 0; + +err_unref: + i915_gem_object_put(obj); + return ret; +} + +static void intel_gt_fini_scratch(struct intel_gt *gt) +{ + i915_vma_unpin_and_release(>->scratch, 0); +} + +static struct i915_address_space *kernel_vm(struct intel_gt *gt) +{ + if (INTEL_PPGTT(gt->i915) > INTEL_PPGTT_ALIASING) + return &i915_ppgtt_create(gt)->vm; + else + return i915_vm_get(>->ggtt->vm); +} + +static int __engines_record_defaults(struct intel_gt *gt) +{ + struct i915_request *requests[I915_NUM_ENGINES] = {}; + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err = 0; + + /* + * As we reset the gpu during very early sanitisation, the current + * register state on the GPU should reflect its defaults values. + * We load a context onto the hw (with restore-inhibit), then switch + * over to a second context to save that default register state. We + * can then prime every new context with that state so they all start + * from the same default HW values. + */ + + for_each_engine(engine, gt, id) { + struct intel_renderstate so; + struct intel_context *ce; + struct i915_request *rq; + + /* We must be able to switch to something! */ + GEM_BUG_ON(!engine->kernel_context); + + ce = intel_context_create(engine); + if (IS_ERR(ce)) { + err = PTR_ERR(ce); + goto out; + } + + err = intel_renderstate_init(&so, ce); + if (err) + goto err; + + rq = i915_request_create(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_fini; + } + + err = intel_engine_emit_ctx_wa(rq); + if (err) + goto err_rq; + + err = intel_renderstate_emit(&so, rq); + if (err) + goto err_rq; + +err_rq: + requests[id] = i915_request_get(rq); + i915_request_add(rq); +err_fini: + intel_renderstate_fini(&so, ce); +err: + if (err) { + intel_context_put(ce); + goto out; + } + } + + /* Flush the default context image to memory, and enable powersaving. */ + if (intel_gt_wait_for_idle(gt, I915_GEM_IDLE_TIMEOUT) == -ETIME) { + err = -EIO; + goto out; + } + + for (id = 0; id < ARRAY_SIZE(requests); id++) { + struct i915_request *rq; + struct file *state; + + rq = requests[id]; + if (!rq) + continue; + + if (rq->fence.error) { + err = -EIO; + goto out; + } + + GEM_BUG_ON(!test_bit(CONTEXT_ALLOC_BIT, &rq->context->flags)); + if (!rq->context->state) + continue; + + /* Keep a copy of the state's backing pages; free the obj */ + state = shmem_create_from_object(rq->context->state->obj); + if (IS_ERR(state)) { + err = PTR_ERR(state); + goto out; + } + rq->engine->default_state = state; + } + +out: + /* + * If we have to abandon now, we expect the engines to be idle + * and ready to be torn-down. The quickest way we can accomplish + * this is by declaring ourselves wedged. + */ + if (err) + intel_gt_set_wedged(gt); + + for (id = 0; id < ARRAY_SIZE(requests); id++) { + struct intel_context *ce; + struct i915_request *rq; + + rq = requests[id]; + if (!rq) + continue; + + ce = rq->context; + i915_request_put(rq); + intel_context_put(ce); + } + return err; +} + +static int __engines_verify_workarounds(struct intel_gt *gt) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err = 0; + + if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) + return 0; + + for_each_engine(engine, gt, id) { + if (intel_engine_verify_workarounds(engine, "load")) + err = -EIO; + } + + /* Flush and restore the kernel context for safety */ + if (intel_gt_wait_for_idle(gt, I915_GEM_IDLE_TIMEOUT) == -ETIME) + err = -EIO; + + return err; +} + +static void __intel_gt_disable(struct intel_gt *gt) +{ + intel_gt_set_wedged_on_fini(gt); + + intel_gt_suspend_prepare(gt); + intel_gt_suspend_late(gt); + + GEM_BUG_ON(intel_gt_pm_is_awake(gt)); +} + +int intel_gt_init(struct intel_gt *gt) +{ + int err; + + err = i915_inject_probe_error(gt->i915, -ENODEV); + if (err) + return err; + + /* + * This is just a security blanket to placate dragons. + * On some systems, we very sporadically observe that the first TLBs + * used by the CS may be stale, despite us poking the TLB reset. If + * we hold the forcewake during initialisation these problems + * just magically go away. + */ + intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL); + + intel_gt_init_clock_frequency(gt); + + err = intel_gt_init_scratch(gt, IS_GEN(gt->i915, 2) ? SZ_256K : SZ_4K); + if (err) + goto out_fw; + + intel_gt_pm_init(gt); + + gt->vm = kernel_vm(gt); + if (!gt->vm) { + err = -ENOMEM; + goto err_pm; + } + + err = intel_engines_init(gt); + if (err) + goto err_engines; + + err = intel_uc_init(>->uc); + if (err) + goto err_engines; + + err = intel_gt_resume(gt); + if (err) + goto err_uc_init; + + err = __engines_record_defaults(gt); + if (err) + goto err_gt; + + err = __engines_verify_workarounds(gt); + if (err) + goto err_gt; + + err = i915_inject_probe_error(gt->i915, -EIO); + if (err) + goto err_gt; + + goto out_fw; +err_gt: + __intel_gt_disable(gt); + intel_uc_fini_hw(>->uc); +err_uc_init: + intel_uc_fini(>->uc); +err_engines: + intel_engines_release(gt); + i915_vm_put(fetch_and_zero(>->vm)); +err_pm: + intel_gt_pm_fini(gt); + intel_gt_fini_scratch(gt); +out_fw: + if (err) + intel_gt_set_wedged_on_init(gt); + intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL); + return err; +} + +void intel_gt_driver_remove(struct intel_gt *gt) +{ + __intel_gt_disable(gt); + + intel_uc_driver_remove(>->uc); + + intel_engines_release(gt); +} + +void intel_gt_driver_unregister(struct intel_gt *gt) +{ + intel_rps_driver_unregister(>->rps); + + /* + * Upon unregistering the device to prevent any new users, cancel + * all in-flight requests so that we can quickly unbind the active + * resources. + */ + intel_gt_set_wedged(gt); +} + +void intel_gt_driver_release(struct intel_gt *gt) +{ + struct i915_address_space *vm; + intel_wakeref_t wakeref; + + /* Scrub all HW state upon release */ + with_intel_runtime_pm(gt->uncore->rpm, wakeref) + __intel_gt_reset(gt, ALL_ENGINES); + + vm = fetch_and_zero(>->vm); + if (vm) /* FIXME being called twice on error paths :( */ + i915_vm_put(vm); + + intel_gt_pm_fini(gt); + intel_gt_fini_scratch(gt); + intel_gt_fini_buffer_pool(gt); +} + +void intel_gt_driver_late_release(struct intel_gt *gt) +{ + /* We need to wait for inflight RCU frees to release their grip */ + rcu_barrier(); + + intel_uc_driver_late_release(>->uc); + intel_gt_fini_requests(gt); + intel_gt_fini_reset(gt); + intel_gt_fini_timelines(gt); + intel_engines_free(gt); +} + +void intel_gt_info_print(const struct intel_gt_info *info, + struct drm_printer *p) +{ + drm_printf(p, "available engines: %x\n", info->engine_mask); + + intel_sseu_dump(&info->sseu, p); +} + +struct reg_and_bit { + i915_reg_t reg; + u32 bit; +}; + +static struct reg_and_bit +get_reg_and_bit(const struct intel_engine_cs *engine, const bool gen8, + const i915_reg_t *regs, const unsigned int num) +{ + const unsigned int class = engine->class; + struct reg_and_bit rb = { }; + + if (drm_WARN_ON_ONCE(&engine->i915->drm, + class >= num || !regs[class].reg)) + return rb; + + rb.reg = regs[class]; + if (gen8 && class == VIDEO_DECODE_CLASS) + rb.reg.reg += 4 * engine->instance; /* GEN8_M2TCR */ + else + rb.bit = engine->instance; + + rb.bit = BIT(rb.bit); + + return rb; +} + +void intel_gt_invalidate_tlbs(struct intel_gt *gt) +{ + static const i915_reg_t gen8_regs[] = { + [RENDER_CLASS] = GEN8_RTCR, + [VIDEO_DECODE_CLASS] = GEN8_M1TCR, /* , GEN8_M2TCR */ + [VIDEO_ENHANCEMENT_CLASS] = GEN8_VTCR, + [COPY_ENGINE_CLASS] = GEN8_BTCR, + }; + static const i915_reg_t gen12_regs[] = { + [RENDER_CLASS] = GEN12_GFX_TLB_INV_CR, + [VIDEO_DECODE_CLASS] = GEN12_VD_TLB_INV_CR, + [VIDEO_ENHANCEMENT_CLASS] = GEN12_VE_TLB_INV_CR, + [COPY_ENGINE_CLASS] = GEN12_BLT_TLB_INV_CR, + }; + struct drm_i915_private *i915 = gt->i915; + struct intel_uncore *uncore = gt->uncore; + struct intel_engine_cs *engine; + enum intel_engine_id id; + const i915_reg_t *regs; + unsigned int num = 0; + + if (I915_SELFTEST_ONLY(gt->awake == -ENODEV)) + return; + + if (INTEL_GEN(i915) == 12) { + regs = gen12_regs; + num = ARRAY_SIZE(gen12_regs); + } else if (INTEL_GEN(i915) >= 8 && INTEL_GEN(i915) <= 11) { + regs = gen8_regs; + num = ARRAY_SIZE(gen8_regs); + } else if (INTEL_GEN(i915) < 8) { + return; + } + + if (drm_WARN_ONCE(&i915->drm, !num, + "Platform does not implement TLB invalidation!")) + return; + + GEM_TRACE("\n"); + + assert_rpm_wakelock_held(&i915->runtime_pm); + + mutex_lock(>->tlb_invalidate_lock); + intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL); + + spin_lock_irq(&uncore->lock); /* serialise invalidate with GT reset */ + + for_each_engine(engine, gt, id) { + struct reg_and_bit rb; + + rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num); + if (!i915_mmio_reg_offset(rb.reg)) + continue; + + if (INTEL_GEN(i915) == 12 && (engine->class == VIDEO_DECODE_CLASS || + engine->class == VIDEO_ENHANCEMENT_CLASS)) + rb.bit = _MASKED_BIT_ENABLE(rb.bit); + + intel_uncore_write_fw(uncore, rb.reg, rb.bit); + } + + spin_unlock_irq(&uncore->lock); + + for_each_engine(engine, gt, id) { + /* + * HW architecture suggest typical invalidation time at 40us, + * with pessimistic cases up to 100us and a recommendation to + * cap at 1ms. We go a bit higher just in case. + */ + const unsigned int timeout_us = 100; + const unsigned int timeout_ms = 4; + struct reg_and_bit rb; + + rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num); + if (!i915_mmio_reg_offset(rb.reg)) + continue; + + if (__intel_wait_for_register_fw(uncore, + rb.reg, rb.bit, 0, + timeout_us, timeout_ms, + NULL)) + drm_err_ratelimited(>->i915->drm, + "%s TLB invalidation did not complete in %ums!\n", + engine->name, timeout_ms); + } + + intel_uncore_forcewake_put_delayed(uncore, FORCEWAKE_ALL); + mutex_unlock(>->tlb_invalidate_lock); +} diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h new file mode 100644 index 000000000..d9a116817 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_gt.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2019 Intel Corporation + */ + +#ifndef __INTEL_GT__ +#define __INTEL_GT__ + +#include "intel_engine_types.h" +#include "intel_gt_types.h" +#include "intel_reset.h" + +struct drm_i915_private; +struct drm_printer; + +#define GT_TRACE(gt, fmt, ...) do { \ + const struct intel_gt *gt__ __maybe_unused = (gt); \ + GEM_TRACE("%s " fmt, dev_name(gt__->i915->drm.dev), \ + ##__VA_ARGS__); \ +} while (0) + +static inline struct intel_gt *uc_to_gt(struct intel_uc *uc) +{ + return container_of(uc, struct intel_gt, uc); +} + +static inline struct intel_gt *guc_to_gt(struct intel_guc *guc) +{ + return container_of(guc, struct intel_gt, uc.guc); +} + +static inline struct intel_gt *huc_to_gt(struct intel_huc *huc) +{ + return container_of(huc, struct intel_gt, uc.huc); +} + +void intel_gt_init_early(struct intel_gt *gt, struct drm_i915_private *i915); +void intel_gt_init_hw_early(struct intel_gt *gt, struct i915_ggtt *ggtt); +int intel_gt_init_mmio(struct intel_gt *gt); +int __must_check intel_gt_init_hw(struct intel_gt *gt); +int intel_gt_init(struct intel_gt *gt); +void intel_gt_driver_register(struct intel_gt *gt); + +void intel_gt_driver_unregister(struct intel_gt *gt); +void intel_gt_driver_remove(struct intel_gt *gt); +void intel_gt_driver_release(struct intel_gt *gt); + +void intel_gt_driver_late_release(struct intel_gt *gt); + +void intel_gt_check_and_clear_faults(struct intel_gt *gt); +void intel_gt_clear_error_registers(struct intel_gt *gt, + intel_engine_mask_t engine_mask); + +void intel_gt_flush_ggtt_writes(struct intel_gt *gt); +void intel_gt_chipset_flush(struct intel_gt *gt); + +static inline u32 intel_gt_scratch_offset(const struct intel_gt *gt, + enum intel_gt_scratch_field field) +{ + return i915_ggtt_offset(gt->scratch) + field; +} + +static inline bool intel_gt_has_unrecoverable_error(const struct intel_gt *gt) +{ + return test_bit(I915_WEDGED_ON_INIT, >->reset.flags) || + test_bit(I915_WEDGED_ON_FINI, >->reset.flags); +} + +static inline bool intel_gt_is_wedged(const struct intel_gt *gt) +{ + GEM_BUG_ON(intel_gt_has_unrecoverable_error(gt) && + !test_bit(I915_WEDGED, >->reset.flags)); + + return unlikely(test_bit(I915_WEDGED, >->reset.flags)); +} + +void intel_gt_info_print(const struct intel_gt_info *info, + struct drm_printer *p); + +void intel_gt_invalidate_tlbs(struct intel_gt *gt); + +#endif /* __INTEL_GT_H__ */ diff --git a/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool.c b/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool.c new file mode 100644 index 000000000..104cb30e8 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool.c @@ -0,0 +1,252 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2014-2018 Intel Corporation + */ + +#include "gem/i915_gem_object.h" + +#include "i915_drv.h" +#include "intel_engine_pm.h" +#include "intel_gt_buffer_pool.h" + +static struct intel_gt *to_gt(struct intel_gt_buffer_pool *pool) +{ + return container_of(pool, struct intel_gt, buffer_pool); +} + +static struct list_head * +bucket_for_size(struct intel_gt_buffer_pool *pool, size_t sz) +{ + int n; + + /* + * Compute a power-of-two bucket, but throw everything greater than + * 16KiB into the same bucket: i.e. the buckets hold objects of + * (1 page, 2 pages, 4 pages, 8+ pages). + */ + n = fls(sz >> PAGE_SHIFT) - 1; + if (n >= ARRAY_SIZE(pool->cache_list)) + n = ARRAY_SIZE(pool->cache_list) - 1; + + return &pool->cache_list[n]; +} + +static void node_free(struct intel_gt_buffer_pool_node *node) +{ + i915_gem_object_put(node->obj); + i915_active_fini(&node->active); + kfree_rcu(node, rcu); +} + +static bool pool_free_older_than(struct intel_gt_buffer_pool *pool, long keep) +{ + struct intel_gt_buffer_pool_node *node, *stale = NULL; + bool active = false; + int n; + + /* Free buffers that have not been used in the past second */ + for (n = 0; n < ARRAY_SIZE(pool->cache_list); n++) { + struct list_head *list = &pool->cache_list[n]; + + if (list_empty(list)) + continue; + + if (spin_trylock_irq(&pool->lock)) { + struct list_head *pos; + + /* Most recent at head; oldest at tail */ + list_for_each_prev(pos, list) { + unsigned long age; + + node = list_entry(pos, typeof(*node), link); + + age = READ_ONCE(node->age); + if (!age || jiffies - age < keep) + break; + + /* Check we are the first to claim this node */ + if (!xchg(&node->age, 0)) + break; + + node->free = stale; + stale = node; + } + if (!list_is_last(pos, list)) + __list_del_many(pos, list); + + spin_unlock_irq(&pool->lock); + } + + active |= !list_empty(list); + } + + while ((node = stale)) { + stale = stale->free; + node_free(node); + } + + return active; +} + +static void pool_free_work(struct work_struct *wrk) +{ + struct intel_gt_buffer_pool *pool = + container_of(wrk, typeof(*pool), work.work); + + if (pool_free_older_than(pool, HZ)) + schedule_delayed_work(&pool->work, + round_jiffies_up_relative(HZ)); +} + +static int pool_active(struct i915_active *ref) +{ + struct intel_gt_buffer_pool_node *node = + container_of(ref, typeof(*node), active); + struct dma_resv *resv = node->obj->base.resv; + int err; + + if (dma_resv_trylock(resv)) { + dma_resv_add_excl_fence(resv, NULL); + dma_resv_unlock(resv); + } + + err = i915_gem_object_pin_pages(node->obj); + if (err) + return err; + + /* Hide this pinned object from the shrinker until retired */ + i915_gem_object_make_unshrinkable(node->obj); + + return 0; +} + +__i915_active_call +static void pool_retire(struct i915_active *ref) +{ + struct intel_gt_buffer_pool_node *node = + container_of(ref, typeof(*node), active); + struct intel_gt_buffer_pool *pool = node->pool; + struct list_head *list = bucket_for_size(pool, node->obj->base.size); + unsigned long flags; + + i915_gem_object_unpin_pages(node->obj); + + /* Return this object to the shrinker pool */ + i915_gem_object_make_purgeable(node->obj); + + GEM_BUG_ON(node->age); + spin_lock_irqsave(&pool->lock, flags); + list_add_rcu(&node->link, list); + WRITE_ONCE(node->age, jiffies ?: 1); /* 0 reserved for active nodes */ + spin_unlock_irqrestore(&pool->lock, flags); + + schedule_delayed_work(&pool->work, + round_jiffies_up_relative(HZ)); +} + +static struct intel_gt_buffer_pool_node * +node_create(struct intel_gt_buffer_pool *pool, size_t sz) +{ + struct intel_gt *gt = to_gt(pool); + struct intel_gt_buffer_pool_node *node; + struct drm_i915_gem_object *obj; + + node = kmalloc(sizeof(*node), + GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN); + if (!node) + return ERR_PTR(-ENOMEM); + + node->age = 0; + node->pool = pool; + i915_active_init(&node->active, pool_active, pool_retire); + + obj = i915_gem_object_create_internal(gt->i915, sz); + if (IS_ERR(obj)) { + i915_active_fini(&node->active); + kfree(node); + return ERR_CAST(obj); + } + + i915_gem_object_set_readonly(obj); + + node->obj = obj; + return node; +} + +struct intel_gt_buffer_pool_node * +intel_gt_get_buffer_pool(struct intel_gt *gt, size_t size) +{ + struct intel_gt_buffer_pool *pool = >->buffer_pool; + struct intel_gt_buffer_pool_node *node; + struct list_head *list; + int ret; + + size = PAGE_ALIGN(size); + list = bucket_for_size(pool, size); + + rcu_read_lock(); + list_for_each_entry_rcu(node, list, link) { + unsigned long age; + + if (node->obj->base.size < size) + continue; + + age = READ_ONCE(node->age); + if (!age) + continue; + + if (cmpxchg(&node->age, age, 0) == age) { + spin_lock_irq(&pool->lock); + list_del_rcu(&node->link); + spin_unlock_irq(&pool->lock); + break; + } + } + rcu_read_unlock(); + + if (&node->link == list) { + node = node_create(pool, size); + if (IS_ERR(node)) + return node; + } + + ret = i915_active_acquire(&node->active); + if (ret) { + node_free(node); + return ERR_PTR(ret); + } + + return node; +} + +void intel_gt_init_buffer_pool(struct intel_gt *gt) +{ + struct intel_gt_buffer_pool *pool = >->buffer_pool; + int n; + + spin_lock_init(&pool->lock); + for (n = 0; n < ARRAY_SIZE(pool->cache_list); n++) + INIT_LIST_HEAD(&pool->cache_list[n]); + INIT_DELAYED_WORK(&pool->work, pool_free_work); +} + +void intel_gt_flush_buffer_pool(struct intel_gt *gt) +{ + struct intel_gt_buffer_pool *pool = >->buffer_pool; + + do { + while (pool_free_older_than(pool, 0)) + ; + } while (cancel_delayed_work_sync(&pool->work)); +} + +void intel_gt_fini_buffer_pool(struct intel_gt *gt) +{ + struct intel_gt_buffer_pool *pool = >->buffer_pool; + int n; + + intel_gt_flush_buffer_pool(gt); + + for (n = 0; n < ARRAY_SIZE(pool->cache_list); n++) + GEM_BUG_ON(!list_empty(&pool->cache_list[n])); +} diff --git a/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool.h b/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool.h new file mode 100644 index 000000000..42cbac003 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2014-2018 Intel Corporation + */ + +#ifndef INTEL_GT_BUFFER_POOL_H +#define INTEL_GT_BUFFER_POOL_H + +#include <linux/types.h> + +#include "i915_active.h" +#include "intel_gt_buffer_pool_types.h" + +struct intel_gt; +struct i915_request; + +struct intel_gt_buffer_pool_node * +intel_gt_get_buffer_pool(struct intel_gt *gt, size_t size); + +static inline int +intel_gt_buffer_pool_mark_active(struct intel_gt_buffer_pool_node *node, + struct i915_request *rq) +{ + return i915_active_add_request(&node->active, rq); +} + +static inline void +intel_gt_buffer_pool_put(struct intel_gt_buffer_pool_node *node) +{ + i915_active_release(&node->active); +} + +void intel_gt_init_buffer_pool(struct intel_gt *gt); +void intel_gt_flush_buffer_pool(struct intel_gt *gt); +void intel_gt_fini_buffer_pool(struct intel_gt *gt); + +#endif /* INTEL_GT_BUFFER_POOL_H */ diff --git a/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool_types.h b/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool_types.h new file mode 100644 index 000000000..bcf1658c9 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool_types.h @@ -0,0 +1,36 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2014-2018 Intel Corporation + */ + +#ifndef INTEL_GT_BUFFER_POOL_TYPES_H +#define INTEL_GT_BUFFER_POOL_TYPES_H + +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/workqueue.h> + +#include "i915_active_types.h" + +struct drm_i915_gem_object; + +struct intel_gt_buffer_pool { + spinlock_t lock; + struct list_head cache_list[4]; + struct delayed_work work; +}; + +struct intel_gt_buffer_pool_node { + struct i915_active active; + struct drm_i915_gem_object *obj; + struct list_head link; + union { + struct intel_gt_buffer_pool *pool; + struct intel_gt_buffer_pool_node *free; + struct rcu_head rcu; + }; + unsigned long age; +}; + +#endif /* INTEL_GT_BUFFER_POOL_TYPES_H */ diff --git a/drivers/gpu/drm/i915/gt/intel_gt_clock_utils.c b/drivers/gpu/drm/i915/gt/intel_gt_clock_utils.c new file mode 100644 index 000000000..999079686 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_gt_clock_utils.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2020 Intel Corporation + */ + +#include "i915_drv.h" +#include "intel_gt.h" +#include "intel_gt_clock_utils.h" + +#define MHZ_12 12000000 /* 12MHz (24MHz/2), 83.333ns */ +#define MHZ_12_5 12500000 /* 12.5MHz (25MHz/2), 80ns */ +#define MHZ_19_2 19200000 /* 19.2MHz, 52.083ns */ + +static u32 read_clock_frequency(const struct intel_gt *gt) +{ + if (INTEL_GEN(gt->i915) >= 11) { + u32 config; + + config = intel_uncore_read(gt->uncore, RPM_CONFIG0); + config &= GEN11_RPM_CONFIG0_CRYSTAL_CLOCK_FREQ_MASK; + config >>= GEN11_RPM_CONFIG0_CRYSTAL_CLOCK_FREQ_SHIFT; + + switch (config) { + case 0: return MHZ_12; + case 1: + case 2: return MHZ_19_2; + default: + case 3: return MHZ_12_5; + } + } else if (INTEL_GEN(gt->i915) >= 9) { + if (IS_GEN9_LP(gt->i915)) + return MHZ_19_2; + else + return MHZ_12; + } else { + return MHZ_12_5; + } +} + +void intel_gt_init_clock_frequency(struct intel_gt *gt) +{ + /* + * Note that on gen11+, the clock frequency may be reconfigured. + * We do not, and we assume nobody else does. + */ + gt->clock_frequency = read_clock_frequency(gt); + GT_TRACE(gt, + "Using clock frequency: %dkHz\n", + gt->clock_frequency / 1000); +} + +#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM) +void intel_gt_check_clock_frequency(const struct intel_gt *gt) +{ + if (gt->clock_frequency != read_clock_frequency(gt)) { + dev_err(gt->i915->drm.dev, + "GT clock frequency changed, was %uHz, now %uHz!\n", + gt->clock_frequency, + read_clock_frequency(gt)); + } +} +#endif + +static u64 div_u64_roundup(u64 nom, u32 den) +{ + return div_u64(nom + den - 1, den); +} + +u32 intel_gt_clock_interval_to_ns(const struct intel_gt *gt, u32 count) +{ + return div_u64_roundup(mul_u32_u32(count, 1000 * 1000 * 1000), + gt->clock_frequency); +} + +u32 intel_gt_pm_interval_to_ns(const struct intel_gt *gt, u32 count) +{ + return intel_gt_clock_interval_to_ns(gt, 16 * count); +} + +u32 intel_gt_ns_to_clock_interval(const struct intel_gt *gt, u32 ns) +{ + return div_u64_roundup(mul_u32_u32(gt->clock_frequency, ns), + 1000 * 1000 * 1000); +} + +u32 intel_gt_ns_to_pm_interval(const struct intel_gt *gt, u32 ns) +{ + u32 val; + + /* + * Make these a multiple of magic 25 to avoid SNB (eg. Dell XPS + * 8300) freezing up around GPU hangs. Looks as if even + * scheduling/timer interrupts start misbehaving if the RPS + * EI/thresholds are "bad", leading to a very sluggish or even + * frozen machine. + */ + val = DIV_ROUND_UP(intel_gt_ns_to_clock_interval(gt, ns), 16); + if (IS_GEN(gt->i915, 6)) + val = roundup(val, 25); + + return val; +} diff --git a/drivers/gpu/drm/i915/gt/intel_gt_clock_utils.h b/drivers/gpu/drm/i915/gt/intel_gt_clock_utils.h new file mode 100644 index 000000000..f793c89f2 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_gt_clock_utils.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2020 Intel Corporation + */ + +#ifndef __INTEL_GT_CLOCK_UTILS_H__ +#define __INTEL_GT_CLOCK_UTILS_H__ + +#include <linux/types.h> + +struct intel_gt; + +void intel_gt_init_clock_frequency(struct intel_gt *gt); + +#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM) +void intel_gt_check_clock_frequency(const struct intel_gt *gt); +#else +static inline void intel_gt_check_clock_frequency(const struct intel_gt *gt) {} +#endif + +u32 intel_gt_clock_interval_to_ns(const struct intel_gt *gt, u32 count); +u32 intel_gt_pm_interval_to_ns(const struct intel_gt *gt, u32 count); + +u32 intel_gt_ns_to_clock_interval(const struct intel_gt *gt, u32 ns); +u32 intel_gt_ns_to_pm_interval(const struct intel_gt *gt, u32 ns); + +#endif /* __INTEL_GT_CLOCK_UTILS_H__ */ diff --git a/drivers/gpu/drm/i915/gt/intel_gt_irq.c b/drivers/gpu/drm/i915/gt/intel_gt_irq.c new file mode 100644 index 000000000..257063a57 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_gt_irq.c @@ -0,0 +1,470 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#include <linux/sched/clock.h> + +#include "i915_drv.h" +#include "i915_irq.h" +#include "intel_breadcrumbs.h" +#include "intel_gt.h" +#include "intel_gt_irq.h" +#include "intel_uncore.h" +#include "intel_rps.h" + +static void guc_irq_handler(struct intel_guc *guc, u16 iir) +{ + if (iir & GUC_INTR_GUC2HOST) + intel_guc_to_host_event_handler(guc); +} + +static void +cs_irq_handler(struct intel_engine_cs *engine, u32 iir) +{ + bool tasklet = false; + + if (unlikely(iir & GT_CS_MASTER_ERROR_INTERRUPT)) { + u32 eir; + + /* Upper 16b are the enabling mask, rsvd for internal errors */ + eir = ENGINE_READ(engine, RING_EIR) & GENMASK(15, 0); + ENGINE_TRACE(engine, "CS error: %x\n", eir); + + /* Disable the error interrupt until after the reset */ + if (likely(eir)) { + ENGINE_WRITE(engine, RING_EMR, ~0u); + ENGINE_WRITE(engine, RING_EIR, eir); + WRITE_ONCE(engine->execlists.error_interrupt, eir); + tasklet = true; + } + } + + if (iir & GT_WAIT_SEMAPHORE_INTERRUPT) { + WRITE_ONCE(engine->execlists.yield, + ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI)); + ENGINE_TRACE(engine, "semaphore yield: %08x\n", + engine->execlists.yield); + if (del_timer(&engine->execlists.timer)) + tasklet = true; + } + + if (iir & GT_CONTEXT_SWITCH_INTERRUPT) + tasklet = true; + + if (iir & GT_RENDER_USER_INTERRUPT) { + intel_engine_signal_breadcrumbs(engine); + tasklet |= intel_engine_needs_breadcrumb_tasklet(engine); + } + + if (tasklet) + tasklet_hi_schedule(&engine->execlists.tasklet); +} + +static u32 +gen11_gt_engine_identity(struct intel_gt *gt, + const unsigned int bank, const unsigned int bit) +{ + void __iomem * const regs = gt->uncore->regs; + u32 timeout_ts; + u32 ident; + + lockdep_assert_held(>->irq_lock); + + raw_reg_write(regs, GEN11_IIR_REG_SELECTOR(bank), BIT(bit)); + + /* + * NB: Specs do not specify how long to spin wait, + * so we do ~100us as an educated guess. + */ + timeout_ts = (local_clock() >> 10) + 100; + do { + ident = raw_reg_read(regs, GEN11_INTR_IDENTITY_REG(bank)); + } while (!(ident & GEN11_INTR_DATA_VALID) && + !time_after32(local_clock() >> 10, timeout_ts)); + + if (unlikely(!(ident & GEN11_INTR_DATA_VALID))) { + DRM_ERROR("INTR_IDENTITY_REG%u:%u 0x%08x not valid!\n", + bank, bit, ident); + return 0; + } + + raw_reg_write(regs, GEN11_INTR_IDENTITY_REG(bank), + GEN11_INTR_DATA_VALID); + + return ident; +} + +static void +gen11_other_irq_handler(struct intel_gt *gt, const u8 instance, + const u16 iir) +{ + if (instance == OTHER_GUC_INSTANCE) + return guc_irq_handler(>->uc.guc, iir); + + if (instance == OTHER_GTPM_INSTANCE) + return gen11_rps_irq_handler(>->rps, iir); + + WARN_ONCE(1, "unhandled other interrupt instance=0x%x, iir=0x%x\n", + instance, iir); +} + +static void +gen11_engine_irq_handler(struct intel_gt *gt, const u8 class, + const u8 instance, const u16 iir) +{ + struct intel_engine_cs *engine; + + if (instance <= MAX_ENGINE_INSTANCE) + engine = gt->engine_class[class][instance]; + else + engine = NULL; + + if (likely(engine)) + return cs_irq_handler(engine, iir); + + WARN_ONCE(1, "unhandled engine interrupt class=0x%x, instance=0x%x\n", + class, instance); +} + +static void +gen11_gt_identity_handler(struct intel_gt *gt, const u32 identity) +{ + const u8 class = GEN11_INTR_ENGINE_CLASS(identity); + const u8 instance = GEN11_INTR_ENGINE_INSTANCE(identity); + const u16 intr = GEN11_INTR_ENGINE_INTR(identity); + + if (unlikely(!intr)) + return; + + if (class <= COPY_ENGINE_CLASS) + return gen11_engine_irq_handler(gt, class, instance, intr); + + if (class == OTHER_CLASS) + return gen11_other_irq_handler(gt, instance, intr); + + WARN_ONCE(1, "unknown interrupt class=0x%x, instance=0x%x, intr=0x%x\n", + class, instance, intr); +} + +static void +gen11_gt_bank_handler(struct intel_gt *gt, const unsigned int bank) +{ + void __iomem * const regs = gt->uncore->regs; + unsigned long intr_dw; + unsigned int bit; + + lockdep_assert_held(>->irq_lock); + + intr_dw = raw_reg_read(regs, GEN11_GT_INTR_DW(bank)); + + for_each_set_bit(bit, &intr_dw, 32) { + const u32 ident = gen11_gt_engine_identity(gt, bank, bit); + + gen11_gt_identity_handler(gt, ident); + } + + /* Clear must be after shared has been served for engine */ + raw_reg_write(regs, GEN11_GT_INTR_DW(bank), intr_dw); +} + +void gen11_gt_irq_handler(struct intel_gt *gt, const u32 master_ctl) +{ + unsigned int bank; + + spin_lock(>->irq_lock); + + for (bank = 0; bank < 2; bank++) { + if (master_ctl & GEN11_GT_DW_IRQ(bank)) + gen11_gt_bank_handler(gt, bank); + } + + spin_unlock(>->irq_lock); +} + +bool gen11_gt_reset_one_iir(struct intel_gt *gt, + const unsigned int bank, const unsigned int bit) +{ + void __iomem * const regs = gt->uncore->regs; + u32 dw; + + lockdep_assert_held(>->irq_lock); + + dw = raw_reg_read(regs, GEN11_GT_INTR_DW(bank)); + if (dw & BIT(bit)) { + /* + * According to the BSpec, DW_IIR bits cannot be cleared without + * first servicing the Selector & Shared IIR registers. + */ + gen11_gt_engine_identity(gt, bank, bit); + + /* + * We locked GT INT DW by reading it. If we want to (try + * to) recover from this successfully, we need to clear + * our bit, otherwise we are locking the register for + * everybody. + */ + raw_reg_write(regs, GEN11_GT_INTR_DW(bank), BIT(bit)); + + return true; + } + + return false; +} + +void gen11_gt_irq_reset(struct intel_gt *gt) +{ + struct intel_uncore *uncore = gt->uncore; + + /* Disable RCS, BCS, VCS and VECS class engines. */ + intel_uncore_write(uncore, GEN11_RENDER_COPY_INTR_ENABLE, 0); + intel_uncore_write(uncore, GEN11_VCS_VECS_INTR_ENABLE, 0); + + /* Restore masks irqs on RCS, BCS, VCS and VECS engines. */ + intel_uncore_write(uncore, GEN11_RCS0_RSVD_INTR_MASK, ~0); + intel_uncore_write(uncore, GEN11_BCS_RSVD_INTR_MASK, ~0); + intel_uncore_write(uncore, GEN11_VCS0_VCS1_INTR_MASK, ~0); + intel_uncore_write(uncore, GEN11_VCS2_VCS3_INTR_MASK, ~0); + intel_uncore_write(uncore, GEN11_VECS0_VECS1_INTR_MASK, ~0); + + intel_uncore_write(uncore, GEN11_GPM_WGBOXPERF_INTR_ENABLE, 0); + intel_uncore_write(uncore, GEN11_GPM_WGBOXPERF_INTR_MASK, ~0); + intel_uncore_write(uncore, GEN11_GUC_SG_INTR_ENABLE, 0); + intel_uncore_write(uncore, GEN11_GUC_SG_INTR_MASK, ~0); +} + +void gen11_gt_irq_postinstall(struct intel_gt *gt) +{ + const u32 irqs = + GT_CS_MASTER_ERROR_INTERRUPT | + GT_RENDER_USER_INTERRUPT | + GT_CONTEXT_SWITCH_INTERRUPT | + GT_WAIT_SEMAPHORE_INTERRUPT; + struct intel_uncore *uncore = gt->uncore; + const u32 dmask = irqs << 16 | irqs; + const u32 smask = irqs << 16; + + BUILD_BUG_ON(irqs & 0xffff0000); + + /* Enable RCS, BCS, VCS and VECS class interrupts. */ + intel_uncore_write(uncore, GEN11_RENDER_COPY_INTR_ENABLE, dmask); + intel_uncore_write(uncore, GEN11_VCS_VECS_INTR_ENABLE, dmask); + + /* Unmask irqs on RCS, BCS, VCS and VECS engines. */ + intel_uncore_write(uncore, GEN11_RCS0_RSVD_INTR_MASK, ~smask); + intel_uncore_write(uncore, GEN11_BCS_RSVD_INTR_MASK, ~smask); + intel_uncore_write(uncore, GEN11_VCS0_VCS1_INTR_MASK, ~dmask); + intel_uncore_write(uncore, GEN11_VCS2_VCS3_INTR_MASK, ~dmask); + intel_uncore_write(uncore, GEN11_VECS0_VECS1_INTR_MASK, ~dmask); + + /* + * RPS interrupts will get enabled/disabled on demand when RPS itself + * is enabled/disabled. + */ + gt->pm_ier = 0x0; + gt->pm_imr = ~gt->pm_ier; + intel_uncore_write(uncore, GEN11_GPM_WGBOXPERF_INTR_ENABLE, 0); + intel_uncore_write(uncore, GEN11_GPM_WGBOXPERF_INTR_MASK, ~0); + + /* Same thing for GuC interrupts */ + intel_uncore_write(uncore, GEN11_GUC_SG_INTR_ENABLE, 0); + intel_uncore_write(uncore, GEN11_GUC_SG_INTR_MASK, ~0); +} + +void gen5_gt_irq_handler(struct intel_gt *gt, u32 gt_iir) +{ + if (gt_iir & GT_RENDER_USER_INTERRUPT) + intel_engine_signal_breadcrumbs(gt->engine_class[RENDER_CLASS][0]); + if (gt_iir & ILK_BSD_USER_INTERRUPT) + intel_engine_signal_breadcrumbs(gt->engine_class[VIDEO_DECODE_CLASS][0]); +} + +static void gen7_parity_error_irq_handler(struct intel_gt *gt, u32 iir) +{ + if (!HAS_L3_DPF(gt->i915)) + return; + + spin_lock(>->irq_lock); + gen5_gt_disable_irq(gt, GT_PARITY_ERROR(gt->i915)); + spin_unlock(>->irq_lock); + + if (iir & GT_RENDER_L3_PARITY_ERROR_INTERRUPT_S1) + gt->i915->l3_parity.which_slice |= 1 << 1; + + if (iir & GT_RENDER_L3_PARITY_ERROR_INTERRUPT) + gt->i915->l3_parity.which_slice |= 1 << 0; + + schedule_work(>->i915->l3_parity.error_work); +} + +void gen6_gt_irq_handler(struct intel_gt *gt, u32 gt_iir) +{ + if (gt_iir & GT_RENDER_USER_INTERRUPT) + intel_engine_signal_breadcrumbs(gt->engine_class[RENDER_CLASS][0]); + if (gt_iir & GT_BSD_USER_INTERRUPT) + intel_engine_signal_breadcrumbs(gt->engine_class[VIDEO_DECODE_CLASS][0]); + if (gt_iir & GT_BLT_USER_INTERRUPT) + intel_engine_signal_breadcrumbs(gt->engine_class[COPY_ENGINE_CLASS][0]); + + if (gt_iir & (GT_BLT_CS_ERROR_INTERRUPT | + GT_BSD_CS_ERROR_INTERRUPT | + GT_CS_MASTER_ERROR_INTERRUPT)) + DRM_DEBUG("Command parser error, gt_iir 0x%08x\n", gt_iir); + + if (gt_iir & GT_PARITY_ERROR(gt->i915)) + gen7_parity_error_irq_handler(gt, gt_iir); +} + +void gen8_gt_irq_handler(struct intel_gt *gt, u32 master_ctl) +{ + void __iomem * const regs = gt->uncore->regs; + u32 iir; + + if (master_ctl & (GEN8_GT_RCS_IRQ | GEN8_GT_BCS_IRQ)) { + iir = raw_reg_read(regs, GEN8_GT_IIR(0)); + if (likely(iir)) { + cs_irq_handler(gt->engine_class[RENDER_CLASS][0], + iir >> GEN8_RCS_IRQ_SHIFT); + cs_irq_handler(gt->engine_class[COPY_ENGINE_CLASS][0], + iir >> GEN8_BCS_IRQ_SHIFT); + raw_reg_write(regs, GEN8_GT_IIR(0), iir); + } + } + + if (master_ctl & (GEN8_GT_VCS0_IRQ | GEN8_GT_VCS1_IRQ)) { + iir = raw_reg_read(regs, GEN8_GT_IIR(1)); + if (likely(iir)) { + cs_irq_handler(gt->engine_class[VIDEO_DECODE_CLASS][0], + iir >> GEN8_VCS0_IRQ_SHIFT); + cs_irq_handler(gt->engine_class[VIDEO_DECODE_CLASS][1], + iir >> GEN8_VCS1_IRQ_SHIFT); + raw_reg_write(regs, GEN8_GT_IIR(1), iir); + } + } + + if (master_ctl & GEN8_GT_VECS_IRQ) { + iir = raw_reg_read(regs, GEN8_GT_IIR(3)); + if (likely(iir)) { + cs_irq_handler(gt->engine_class[VIDEO_ENHANCEMENT_CLASS][0], + iir >> GEN8_VECS_IRQ_SHIFT); + raw_reg_write(regs, GEN8_GT_IIR(3), iir); + } + } + + if (master_ctl & (GEN8_GT_PM_IRQ | GEN8_GT_GUC_IRQ)) { + iir = raw_reg_read(regs, GEN8_GT_IIR(2)); + if (likely(iir)) { + gen6_rps_irq_handler(>->rps, iir); + guc_irq_handler(>->uc.guc, iir >> 16); + raw_reg_write(regs, GEN8_GT_IIR(2), iir); + } + } +} + +void gen8_gt_irq_reset(struct intel_gt *gt) +{ + struct intel_uncore *uncore = gt->uncore; + + GEN8_IRQ_RESET_NDX(uncore, GT, 0); + GEN8_IRQ_RESET_NDX(uncore, GT, 1); + GEN8_IRQ_RESET_NDX(uncore, GT, 2); + GEN8_IRQ_RESET_NDX(uncore, GT, 3); +} + +void gen8_gt_irq_postinstall(struct intel_gt *gt) +{ + /* These are interrupts we'll toggle with the ring mask register */ + const u32 irqs = + GT_CS_MASTER_ERROR_INTERRUPT | + GT_RENDER_USER_INTERRUPT | + GT_CONTEXT_SWITCH_INTERRUPT | + GT_WAIT_SEMAPHORE_INTERRUPT; + const u32 gt_interrupts[] = { + irqs << GEN8_RCS_IRQ_SHIFT | irqs << GEN8_BCS_IRQ_SHIFT, + irqs << GEN8_VCS0_IRQ_SHIFT | irqs << GEN8_VCS1_IRQ_SHIFT, + 0, + irqs << GEN8_VECS_IRQ_SHIFT, + }; + struct intel_uncore *uncore = gt->uncore; + + gt->pm_ier = 0x0; + gt->pm_imr = ~gt->pm_ier; + GEN8_IRQ_INIT_NDX(uncore, GT, 0, ~gt_interrupts[0], gt_interrupts[0]); + GEN8_IRQ_INIT_NDX(uncore, GT, 1, ~gt_interrupts[1], gt_interrupts[1]); + /* + * RPS interrupts will get enabled/disabled on demand when RPS itself + * is enabled/disabled. Same wil be the case for GuC interrupts. + */ + GEN8_IRQ_INIT_NDX(uncore, GT, 2, gt->pm_imr, gt->pm_ier); + GEN8_IRQ_INIT_NDX(uncore, GT, 3, ~gt_interrupts[3], gt_interrupts[3]); +} + +static void gen5_gt_update_irq(struct intel_gt *gt, + u32 interrupt_mask, + u32 enabled_irq_mask) +{ + lockdep_assert_held(>->irq_lock); + + GEM_BUG_ON(enabled_irq_mask & ~interrupt_mask); + + gt->gt_imr &= ~interrupt_mask; + gt->gt_imr |= (~enabled_irq_mask & interrupt_mask); + intel_uncore_write(gt->uncore, GTIMR, gt->gt_imr); +} + +void gen5_gt_enable_irq(struct intel_gt *gt, u32 mask) +{ + gen5_gt_update_irq(gt, mask, mask); + intel_uncore_posting_read_fw(gt->uncore, GTIMR); +} + +void gen5_gt_disable_irq(struct intel_gt *gt, u32 mask) +{ + gen5_gt_update_irq(gt, mask, 0); +} + +void gen5_gt_irq_reset(struct intel_gt *gt) +{ + struct intel_uncore *uncore = gt->uncore; + + GEN3_IRQ_RESET(uncore, GT); + if (INTEL_GEN(gt->i915) >= 6) + GEN3_IRQ_RESET(uncore, GEN6_PM); +} + +void gen5_gt_irq_postinstall(struct intel_gt *gt) +{ + struct intel_uncore *uncore = gt->uncore; + u32 pm_irqs = 0; + u32 gt_irqs = 0; + + gt->gt_imr = ~0; + if (HAS_L3_DPF(gt->i915)) { + /* L3 parity interrupt is always unmasked. */ + gt->gt_imr = ~GT_PARITY_ERROR(gt->i915); + gt_irqs |= GT_PARITY_ERROR(gt->i915); + } + + gt_irqs |= GT_RENDER_USER_INTERRUPT; + if (IS_GEN(gt->i915, 5)) + gt_irqs |= ILK_BSD_USER_INTERRUPT; + else + gt_irqs |= GT_BLT_USER_INTERRUPT | GT_BSD_USER_INTERRUPT; + + GEN3_IRQ_INIT(uncore, GT, gt->gt_imr, gt_irqs); + + if (INTEL_GEN(gt->i915) >= 6) { + /* + * RPS interrupts will get enabled/disabled on demand when RPS + * itself is enabled/disabled. + */ + if (HAS_ENGINE(gt, VECS0)) { + pm_irqs |= PM_VEBOX_USER_INTERRUPT; + gt->pm_ier |= PM_VEBOX_USER_INTERRUPT; + } + + gt->pm_imr = 0xffffffff; + GEN3_IRQ_INIT(uncore, GEN6_PM, gt->pm_imr, pm_irqs); + } +} diff --git a/drivers/gpu/drm/i915/gt/intel_gt_irq.h b/drivers/gpu/drm/i915/gt/intel_gt_irq.h new file mode 100644 index 000000000..886c5cf40 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_gt_irq.h @@ -0,0 +1,43 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#ifndef INTEL_GT_IRQ_H +#define INTEL_GT_IRQ_H + +#include <linux/types.h> + +struct intel_gt; + +#define GEN8_GT_IRQS (GEN8_GT_RCS_IRQ | \ + GEN8_GT_BCS_IRQ | \ + GEN8_GT_VCS0_IRQ | \ + GEN8_GT_VCS1_IRQ | \ + GEN8_GT_VECS_IRQ | \ + GEN8_GT_PM_IRQ | \ + GEN8_GT_GUC_IRQ) + +void gen11_gt_irq_reset(struct intel_gt *gt); +void gen11_gt_irq_postinstall(struct intel_gt *gt); +void gen11_gt_irq_handler(struct intel_gt *gt, const u32 master_ctl); + +bool gen11_gt_reset_one_iir(struct intel_gt *gt, + const unsigned int bank, + const unsigned int bit); + +void gen5_gt_irq_handler(struct intel_gt *gt, u32 gt_iir); + +void gen5_gt_irq_postinstall(struct intel_gt *gt); +void gen5_gt_irq_reset(struct intel_gt *gt); +void gen5_gt_disable_irq(struct intel_gt *gt, u32 mask); +void gen5_gt_enable_irq(struct intel_gt *gt, u32 mask); + +void gen6_gt_irq_handler(struct intel_gt *gt, u32 gt_iir); + +void gen8_gt_irq_handler(struct intel_gt *gt, u32 master_ctl); +void gen8_gt_irq_reset(struct intel_gt *gt); +void gen8_gt_irq_postinstall(struct intel_gt *gt); + +#endif /* INTEL_GT_IRQ_H */ diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c new file mode 100644 index 000000000..274aa0dd7 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c @@ -0,0 +1,344 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#include <linux/suspend.h> + +#include "i915_drv.h" +#include "i915_globals.h" +#include "i915_params.h" +#include "intel_context.h" +#include "intel_engine_pm.h" +#include "intel_gt.h" +#include "intel_gt_clock_utils.h" +#include "intel_gt_pm.h" +#include "intel_gt_requests.h" +#include "intel_llc.h" +#include "intel_pm.h" +#include "intel_rc6.h" +#include "intel_rps.h" +#include "intel_wakeref.h" + +static void user_forcewake(struct intel_gt *gt, bool suspend) +{ + int count = atomic_read(>->user_wakeref); + + /* Inside suspend/resume so single threaded, no races to worry about. */ + if (likely(!count)) + return; + + intel_gt_pm_get(gt); + if (suspend) { + GEM_BUG_ON(count > atomic_read(>->wakeref.count)); + atomic_sub(count, >->wakeref.count); + } else { + atomic_add(count, >->wakeref.count); + } + intel_gt_pm_put(gt); +} + +static int __gt_unpark(struct intel_wakeref *wf) +{ + struct intel_gt *gt = container_of(wf, typeof(*gt), wakeref); + struct drm_i915_private *i915 = gt->i915; + + GT_TRACE(gt, "\n"); + + i915_globals_unpark(); + + /* + * It seems that the DMC likes to transition between the DC states a lot + * when there are no connected displays (no active power domains) during + * command submission. + * + * This activity has negative impact on the performance of the chip with + * huge latencies observed in the interrupt handler and elsewhere. + * + * Work around it by grabbing a GT IRQ power domain whilst there is any + * GT activity, preventing any DC state transitions. + */ + gt->awake = intel_display_power_get(i915, POWER_DOMAIN_GT_IRQ); + GEM_BUG_ON(!gt->awake); + + intel_rc6_unpark(>->rc6); + intel_rps_unpark(>->rps); + i915_pmu_gt_unparked(i915); + + intel_gt_unpark_requests(gt); + + return 0; +} + +static int __gt_park(struct intel_wakeref *wf) +{ + struct intel_gt *gt = container_of(wf, typeof(*gt), wakeref); + intel_wakeref_t wakeref = fetch_and_zero(>->awake); + struct drm_i915_private *i915 = gt->i915; + + GT_TRACE(gt, "\n"); + + intel_gt_park_requests(gt); + + i915_vma_parked(gt); + i915_pmu_gt_parked(i915); + intel_rps_park(>->rps); + intel_rc6_park(>->rc6); + + /* Everything switched off, flush any residual interrupt just in case */ + intel_synchronize_irq(i915); + + /* Defer dropping the display power well for 100ms, it's slow! */ + GEM_BUG_ON(!wakeref); + intel_display_power_put_async(i915, POWER_DOMAIN_GT_IRQ, wakeref); + + i915_globals_park(); + + return 0; +} + +static const struct intel_wakeref_ops wf_ops = { + .get = __gt_unpark, + .put = __gt_park, +}; + +void intel_gt_pm_init_early(struct intel_gt *gt) +{ + intel_wakeref_init(>->wakeref, gt->uncore->rpm, &wf_ops); +} + +void intel_gt_pm_init(struct intel_gt *gt) +{ + /* + * Enabling power-management should be "self-healing". If we cannot + * enable a feature, simply leave it disabled with a notice to the + * user. + */ + intel_rc6_init(>->rc6); + intel_rps_init(>->rps); +} + +static bool reset_engines(struct intel_gt *gt) +{ + if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display) + return false; + + return __intel_gt_reset(gt, ALL_ENGINES) == 0; +} + +static void gt_sanitize(struct intel_gt *gt, bool force) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + intel_wakeref_t wakeref; + + GT_TRACE(gt, "force:%s", yesno(force)); + + /* Use a raw wakeref to avoid calling intel_display_power_get early */ + wakeref = intel_runtime_pm_get(gt->uncore->rpm); + intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL); + + intel_gt_check_clock_frequency(gt); + + /* + * As we have just resumed the machine and woken the device up from + * deep PCI sleep (presumably D3_cold), assume the HW has been reset + * back to defaults, recovering from whatever wedged state we left it + * in and so worth trying to use the device once more. + */ + if (intel_gt_is_wedged(gt)) + intel_gt_unset_wedged(gt); + + intel_uc_sanitize(>->uc); + + for_each_engine(engine, gt, id) + if (engine->reset.prepare) + engine->reset.prepare(engine); + + intel_uc_reset_prepare(>->uc); + + for_each_engine(engine, gt, id) + if (engine->sanitize) + engine->sanitize(engine); + + if (reset_engines(gt) || force) { + for_each_engine(engine, gt, id) + __intel_engine_reset(engine, false); + } + + for_each_engine(engine, gt, id) + if (engine->reset.finish) + engine->reset.finish(engine); + + intel_rps_sanitize(>->rps); + + intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL); + intel_runtime_pm_put(gt->uncore->rpm, wakeref); +} + +void intel_gt_pm_fini(struct intel_gt *gt) +{ + intel_rc6_fini(>->rc6); +} + +int intel_gt_resume(struct intel_gt *gt) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err; + + err = intel_gt_has_unrecoverable_error(gt); + if (err) + return err; + + GT_TRACE(gt, "\n"); + + /* + * After resume, we may need to poke into the pinned kernel + * contexts to paper over any damage caused by the sudden suspend. + * Only the kernel contexts should remain pinned over suspend, + * allowing us to fixup the user contexts on their first pin. + */ + gt_sanitize(gt, true); + + intel_gt_pm_get(gt); + + intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL); + intel_rc6_sanitize(>->rc6); + if (intel_gt_is_wedged(gt)) { + err = -EIO; + goto out_fw; + } + + /* Only when the HW is re-initialised, can we replay the requests */ + err = intel_gt_init_hw(gt); + if (err) { + i915_probe_error(gt->i915, + "Failed to initialize GPU, declaring it wedged!\n"); + goto err_wedged; + } + + intel_rps_enable(>->rps); + intel_llc_enable(>->llc); + + for_each_engine(engine, gt, id) { + intel_engine_pm_get(engine); + + engine->serial++; /* kernel context lost */ + err = intel_engine_resume(engine); + + intel_engine_pm_put(engine); + if (err) { + drm_err(>->i915->drm, + "Failed to restart %s (%d)\n", + engine->name, err); + goto err_wedged; + } + } + + intel_rc6_enable(>->rc6); + + intel_uc_resume(>->uc); + + user_forcewake(gt, false); + +out_fw: + intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL); + intel_gt_pm_put(gt); + return err; + +err_wedged: + intel_gt_set_wedged(gt); + goto out_fw; +} + +static void wait_for_suspend(struct intel_gt *gt) +{ + if (!intel_gt_pm_is_awake(gt)) + return; + + if (intel_gt_wait_for_idle(gt, I915_GEM_IDLE_TIMEOUT) == -ETIME) { + /* + * Forcibly cancel outstanding work and leave + * the gpu quiet. + */ + intel_gt_set_wedged(gt); + intel_gt_retire_requests(gt); + } + + intel_gt_pm_wait_for_idle(gt); +} + +void intel_gt_suspend_prepare(struct intel_gt *gt) +{ + user_forcewake(gt, true); + wait_for_suspend(gt); + + intel_uc_suspend(>->uc); +} + +static suspend_state_t pm_suspend_target(void) +{ +#if IS_ENABLED(CONFIG_SUSPEND) && IS_ENABLED(CONFIG_PM_SLEEP) + return pm_suspend_target_state; +#else + return PM_SUSPEND_TO_IDLE; +#endif +} + +void intel_gt_suspend_late(struct intel_gt *gt) +{ + intel_wakeref_t wakeref; + + /* We expect to be idle already; but also want to be independent */ + wait_for_suspend(gt); + + if (is_mock_gt(gt)) + return; + + GEM_BUG_ON(gt->awake); + + /* + * On disabling the device, we want to turn off HW access to memory + * that we no longer own. + * + * However, not all suspend-states disable the device. S0 (s2idle) + * is effectively runtime-suspend, the device is left powered on + * but needs to be put into a low power state. We need to keep + * powermanagement enabled, but we also retain system state and so + * it remains safe to keep on using our allocated memory. + */ + if (pm_suspend_target() == PM_SUSPEND_TO_IDLE) + return; + + with_intel_runtime_pm(gt->uncore->rpm, wakeref) { + intel_rps_disable(>->rps); + intel_rc6_disable(>->rc6); + intel_llc_disable(>->llc); + } + + gt_sanitize(gt, false); + + GT_TRACE(gt, "\n"); +} + +void intel_gt_runtime_suspend(struct intel_gt *gt) +{ + intel_uc_runtime_suspend(>->uc); + + GT_TRACE(gt, "\n"); +} + +int intel_gt_runtime_resume(struct intel_gt *gt) +{ + GT_TRACE(gt, "\n"); + intel_gt_init_swizzling(gt); + intel_ggtt_restore_fences(gt->ggtt); + + return intel_uc_runtime_resume(>->uc); +} + +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) +#include "selftest_gt_pm.c" +#endif diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.h b/drivers/gpu/drm/i915/gt/intel_gt_pm.h new file mode 100644 index 000000000..60f0e2fbe --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.h @@ -0,0 +1,66 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#ifndef INTEL_GT_PM_H +#define INTEL_GT_PM_H + +#include <linux/types.h> + +#include "intel_gt_types.h" +#include "intel_wakeref.h" + +static inline bool intel_gt_pm_is_awake(const struct intel_gt *gt) +{ + return intel_wakeref_is_active(>->wakeref); +} + +static inline void intel_gt_pm_get(struct intel_gt *gt) +{ + intel_wakeref_get(>->wakeref); +} + +static inline void __intel_gt_pm_get(struct intel_gt *gt) +{ + __intel_wakeref_get(>->wakeref); +} + +static inline bool intel_gt_pm_get_if_awake(struct intel_gt *gt) +{ + return intel_wakeref_get_if_active(>->wakeref); +} + +static inline void intel_gt_pm_put(struct intel_gt *gt) +{ + intel_wakeref_put(>->wakeref); +} + +static inline void intel_gt_pm_put_async(struct intel_gt *gt) +{ + intel_wakeref_put_async(>->wakeref); +} + +static inline int intel_gt_pm_wait_for_idle(struct intel_gt *gt) +{ + return intel_wakeref_wait_for_idle(>->wakeref); +} + +void intel_gt_pm_init_early(struct intel_gt *gt); +void intel_gt_pm_init(struct intel_gt *gt); +void intel_gt_pm_fini(struct intel_gt *gt); + +void intel_gt_suspend_prepare(struct intel_gt *gt); +void intel_gt_suspend_late(struct intel_gt *gt); +int intel_gt_resume(struct intel_gt *gt); + +void intel_gt_runtime_suspend(struct intel_gt *gt); +int intel_gt_runtime_resume(struct intel_gt *gt); + +static inline bool is_mock_gt(const struct intel_gt *gt) +{ + return I915_SELFTEST_ONLY(gt->awake == -ENODEV); +} + +#endif /* INTEL_GT_PM_H */ diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm_irq.c b/drivers/gpu/drm/i915/gt/intel_gt_pm_irq.c new file mode 100644 index 000000000..babe86612 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm_irq.c @@ -0,0 +1,109 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#include "i915_drv.h" +#include "i915_reg.h" +#include "intel_gt.h" +#include "intel_gt_irq.h" +#include "intel_gt_pm_irq.h" + +static void write_pm_imr(struct intel_gt *gt) +{ + struct drm_i915_private *i915 = gt->i915; + struct intel_uncore *uncore = gt->uncore; + u32 mask = gt->pm_imr; + i915_reg_t reg; + + if (INTEL_GEN(i915) >= 11) { + reg = GEN11_GPM_WGBOXPERF_INTR_MASK; + mask <<= 16; /* pm is in upper half */ + } else if (INTEL_GEN(i915) >= 8) { + reg = GEN8_GT_IMR(2); + } else { + reg = GEN6_PMIMR; + } + + intel_uncore_write(uncore, reg, mask); +} + +static void gen6_gt_pm_update_irq(struct intel_gt *gt, + u32 interrupt_mask, + u32 enabled_irq_mask) +{ + u32 new_val; + + WARN_ON(enabled_irq_mask & ~interrupt_mask); + + lockdep_assert_held(>->irq_lock); + + new_val = gt->pm_imr; + new_val &= ~interrupt_mask; + new_val |= ~enabled_irq_mask & interrupt_mask; + + if (new_val != gt->pm_imr) { + gt->pm_imr = new_val; + write_pm_imr(gt); + } +} + +void gen6_gt_pm_unmask_irq(struct intel_gt *gt, u32 mask) +{ + gen6_gt_pm_update_irq(gt, mask, mask); +} + +void gen6_gt_pm_mask_irq(struct intel_gt *gt, u32 mask) +{ + gen6_gt_pm_update_irq(gt, mask, 0); +} + +void gen6_gt_pm_reset_iir(struct intel_gt *gt, u32 reset_mask) +{ + struct intel_uncore *uncore = gt->uncore; + i915_reg_t reg = INTEL_GEN(gt->i915) >= 8 ? GEN8_GT_IIR(2) : GEN6_PMIIR; + + lockdep_assert_held(>->irq_lock); + + intel_uncore_write(uncore, reg, reset_mask); + intel_uncore_write(uncore, reg, reset_mask); + intel_uncore_posting_read(uncore, reg); +} + +static void write_pm_ier(struct intel_gt *gt) +{ + struct drm_i915_private *i915 = gt->i915; + struct intel_uncore *uncore = gt->uncore; + u32 mask = gt->pm_ier; + i915_reg_t reg; + + if (INTEL_GEN(i915) >= 11) { + reg = GEN11_GPM_WGBOXPERF_INTR_ENABLE; + mask <<= 16; /* pm is in upper half */ + } else if (INTEL_GEN(i915) >= 8) { + reg = GEN8_GT_IER(2); + } else { + reg = GEN6_PMIER; + } + + intel_uncore_write(uncore, reg, mask); +} + +void gen6_gt_pm_enable_irq(struct intel_gt *gt, u32 enable_mask) +{ + lockdep_assert_held(>->irq_lock); + + gt->pm_ier |= enable_mask; + write_pm_ier(gt); + gen6_gt_pm_unmask_irq(gt, enable_mask); +} + +void gen6_gt_pm_disable_irq(struct intel_gt *gt, u32 disable_mask) +{ + lockdep_assert_held(>->irq_lock); + + gt->pm_ier &= ~disable_mask; + gen6_gt_pm_mask_irq(gt, disable_mask); + write_pm_ier(gt); +} diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm_irq.h b/drivers/gpu/drm/i915/gt/intel_gt_pm_irq.h new file mode 100644 index 000000000..b29816a04 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm_irq.h @@ -0,0 +1,22 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#ifndef INTEL_GT_PM_IRQ_H +#define INTEL_GT_PM_IRQ_H + +#include <linux/types.h> + +struct intel_gt; + +void gen6_gt_pm_unmask_irq(struct intel_gt *gt, u32 mask); +void gen6_gt_pm_mask_irq(struct intel_gt *gt, u32 mask); + +void gen6_gt_pm_enable_irq(struct intel_gt *gt, u32 enable_mask); +void gen6_gt_pm_disable_irq(struct intel_gt *gt, u32 disable_mask); + +void gen6_gt_pm_reset_iir(struct intel_gt *gt, u32 reset_mask); + +#endif /* INTEL_GT_PM_IRQ_H */ diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.c b/drivers/gpu/drm/i915/gt/intel_gt_requests.c new file mode 100644 index 000000000..cca285185 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.c @@ -0,0 +1,251 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#include <linux/workqueue.h> + +#include "i915_drv.h" /* for_each_engine() */ +#include "i915_request.h" +#include "intel_engine_heartbeat.h" +#include "intel_gt.h" +#include "intel_gt_pm.h" +#include "intel_gt_requests.h" +#include "intel_timeline.h" + +static bool retire_requests(struct intel_timeline *tl) +{ + struct i915_request *rq, *rn; + + list_for_each_entry_safe(rq, rn, &tl->requests, link) + if (!i915_request_retire(rq)) + return false; + + /* And check nothing new was submitted */ + return !i915_active_fence_isset(&tl->last_request); +} + +static bool engine_active(const struct intel_engine_cs *engine) +{ + return !list_empty(&engine->kernel_context->timeline->requests); +} + +static bool flush_submission(struct intel_gt *gt, long timeout) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + bool active = false; + + if (!timeout) + return false; + + if (!intel_gt_pm_is_awake(gt)) + return false; + + for_each_engine(engine, gt, id) { + intel_engine_flush_submission(engine); + + /* Flush the background retirement and idle barriers */ + flush_work(&engine->retire_work); + flush_delayed_work(&engine->wakeref.work); + + /* Is the idle barrier still outstanding? */ + active |= engine_active(engine); + } + + return active; +} + +static void engine_retire(struct work_struct *work) +{ + struct intel_engine_cs *engine = + container_of(work, typeof(*engine), retire_work); + struct intel_timeline *tl = xchg(&engine->retire, NULL); + + do { + struct intel_timeline *next = xchg(&tl->retire, NULL); + + /* + * Our goal here is to retire _idle_ timelines as soon as + * possible (as they are idle, we do not expect userspace + * to be cleaning up anytime soon). + * + * If the timeline is currently locked, either it is being + * retired elsewhere or about to be! + */ + if (mutex_trylock(&tl->mutex)) { + retire_requests(tl); + mutex_unlock(&tl->mutex); + } + intel_timeline_put(tl); + + GEM_BUG_ON(!next); + tl = ptr_mask_bits(next, 1); + } while (tl); +} + +static bool add_retire(struct intel_engine_cs *engine, + struct intel_timeline *tl) +{ +#define STUB ((struct intel_timeline *)1) + struct intel_timeline *first; + + /* + * We open-code a llist here to include the additional tag [BIT(0)] + * so that we know when the timeline is already on a + * retirement queue: either this engine or another. + */ + + if (cmpxchg(&tl->retire, NULL, STUB)) /* already queued */ + return false; + + intel_timeline_get(tl); + first = READ_ONCE(engine->retire); + do + tl->retire = ptr_pack_bits(first, 1, 1); + while (!try_cmpxchg(&engine->retire, &first, tl)); + + return !first; +} + +void intel_engine_add_retire(struct intel_engine_cs *engine, + struct intel_timeline *tl) +{ + /* We don't deal well with the engine disappearing beneath us */ + GEM_BUG_ON(intel_engine_is_virtual(engine)); + + if (add_retire(engine, tl)) + schedule_work(&engine->retire_work); +} + +void intel_engine_init_retire(struct intel_engine_cs *engine) +{ + INIT_WORK(&engine->retire_work, engine_retire); +} + +void intel_engine_fini_retire(struct intel_engine_cs *engine) +{ + flush_work(&engine->retire_work); + GEM_BUG_ON(engine->retire); +} + +long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout) +{ + struct intel_gt_timelines *timelines = >->timelines; + struct intel_timeline *tl, *tn; + unsigned long active_count = 0; + bool interruptible; + LIST_HEAD(free); + + interruptible = true; + if (unlikely(timeout < 0)) + timeout = -timeout, interruptible = false; + + flush_submission(gt, timeout); /* kick the ksoftirqd tasklets */ + spin_lock(&timelines->lock); + list_for_each_entry_safe(tl, tn, &timelines->active_list, link) { + if (!mutex_trylock(&tl->mutex)) { + active_count++; /* report busy to caller, try again? */ + continue; + } + + intel_timeline_get(tl); + GEM_BUG_ON(!atomic_read(&tl->active_count)); + atomic_inc(&tl->active_count); /* pin the list element */ + spin_unlock(&timelines->lock); + + if (timeout > 0) { + struct dma_fence *fence; + + fence = i915_active_fence_get(&tl->last_request); + if (fence) { + mutex_unlock(&tl->mutex); + + timeout = dma_fence_wait_timeout(fence, + interruptible, + timeout); + dma_fence_put(fence); + + /* Retirement is best effort */ + if (!mutex_trylock(&tl->mutex)) { + active_count++; + goto out_active; + } + } + } + + if (!retire_requests(tl)) + active_count++; + mutex_unlock(&tl->mutex); + +out_active: spin_lock(&timelines->lock); + + /* Resume list iteration after reacquiring spinlock */ + list_safe_reset_next(tl, tn, link); + if (atomic_dec_and_test(&tl->active_count)) + list_del(&tl->link); + + /* Defer the final release to after the spinlock */ + if (refcount_dec_and_test(&tl->kref.refcount)) { + GEM_BUG_ON(atomic_read(&tl->active_count)); + list_add(&tl->link, &free); + } + } + spin_unlock(&timelines->lock); + + list_for_each_entry_safe(tl, tn, &free, link) + __intel_timeline_free(&tl->kref); + + if (flush_submission(gt, timeout)) /* Wait, there's more! */ + active_count++; + + return active_count ? timeout ?: -ETIME : 0; +} + +int intel_gt_wait_for_idle(struct intel_gt *gt, long timeout) +{ + /* If the device is asleep, we have no requests outstanding */ + if (!intel_gt_pm_is_awake(gt)) + return 0; + + while ((timeout = intel_gt_retire_requests_timeout(gt, timeout)) > 0) { + cond_resched(); + if (signal_pending(current)) + return -EINTR; + } + + return timeout; +} + +static void retire_work_handler(struct work_struct *work) +{ + struct intel_gt *gt = + container_of(work, typeof(*gt), requests.retire_work.work); + + schedule_delayed_work(>->requests.retire_work, + round_jiffies_up_relative(HZ)); + intel_gt_retire_requests(gt); +} + +void intel_gt_init_requests(struct intel_gt *gt) +{ + INIT_DELAYED_WORK(>->requests.retire_work, retire_work_handler); +} + +void intel_gt_park_requests(struct intel_gt *gt) +{ + cancel_delayed_work(>->requests.retire_work); +} + +void intel_gt_unpark_requests(struct intel_gt *gt) +{ + schedule_delayed_work(>->requests.retire_work, + round_jiffies_up_relative(HZ)); +} + +void intel_gt_fini_requests(struct intel_gt *gt) +{ + /* Wait until the work is marked as finished before unloading! */ + cancel_delayed_work_sync(>->requests.retire_work); +} diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.h b/drivers/gpu/drm/i915/gt/intel_gt_requests.h new file mode 100644 index 000000000..dbac53baf --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.h @@ -0,0 +1,32 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#ifndef INTEL_GT_REQUESTS_H +#define INTEL_GT_REQUESTS_H + +struct intel_engine_cs; +struct intel_gt; +struct intel_timeline; + +long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout); +static inline void intel_gt_retire_requests(struct intel_gt *gt) +{ + intel_gt_retire_requests_timeout(gt, 0); +} + +void intel_engine_init_retire(struct intel_engine_cs *engine); +void intel_engine_add_retire(struct intel_engine_cs *engine, + struct intel_timeline *tl); +void intel_engine_fini_retire(struct intel_engine_cs *engine); + +int intel_gt_wait_for_idle(struct intel_gt *gt, long timeout); + +void intel_gt_init_requests(struct intel_gt *gt); +void intel_gt_park_requests(struct intel_gt *gt); +void intel_gt_unpark_requests(struct intel_gt *gt); +void intel_gt_fini_requests(struct intel_gt *gt); + +#endif /* INTEL_GT_REQUESTS_H */ diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h new file mode 100644 index 000000000..78c061614 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h @@ -0,0 +1,144 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2019 Intel Corporation + */ + +#ifndef __INTEL_GT_TYPES__ +#define __INTEL_GT_TYPES__ + +#include <linux/ktime.h> +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/notifier.h> +#include <linux/spinlock.h> +#include <linux/types.h> + +#include "uc/intel_uc.h" + +#include "i915_vma.h" +#include "intel_engine_types.h" +#include "intel_gt_buffer_pool_types.h" +#include "intel_llc_types.h" +#include "intel_reset_types.h" +#include "intel_rc6_types.h" +#include "intel_rps_types.h" +#include "intel_wakeref.h" + +struct drm_i915_private; +struct i915_ggtt; +struct intel_engine_cs; +struct intel_uncore; + +struct intel_gt { + struct drm_i915_private *i915; + struct intel_uncore *uncore; + struct i915_ggtt *ggtt; + + struct intel_uc uc; + + struct mutex tlb_invalidate_lock; + + struct intel_gt_timelines { + spinlock_t lock; /* protects active_list */ + struct list_head active_list; + + /* Pack multiple timelines' seqnos into the same page */ + spinlock_t hwsp_lock; + struct list_head hwsp_free_list; + } timelines; + + struct intel_gt_requests { + /** + * We leave the user IRQ off as much as possible, + * but this means that requests will finish and never + * be retired once the system goes idle. Set a timer to + * fire periodically while the ring is running. When it + * fires, go retire requests. + */ + struct delayed_work retire_work; + } requests; + + struct intel_wakeref wakeref; + atomic_t user_wakeref; + + struct list_head closed_vma; + spinlock_t closed_lock; /* guards the list of closed_vma */ + + ktime_t last_init_time; + struct intel_reset reset; + + /** + * Is the GPU currently considered idle, or busy executing + * userspace requests? Whilst idle, we allow runtime power + * management to power down the hardware and display clocks. + * In order to reduce the effect on performance, there + * is a slight delay before we do so. + */ + intel_wakeref_t awake; + + u32 clock_frequency; + + struct intel_llc llc; + struct intel_rc6 rc6; + struct intel_rps rps; + + spinlock_t irq_lock; + u32 gt_imr; + u32 pm_ier; + u32 pm_imr; + + u32 pm_guc_events; + + struct intel_engine_cs *engine[I915_NUM_ENGINES]; + struct intel_engine_cs *engine_class[MAX_ENGINE_CLASS + 1] + [MAX_ENGINE_INSTANCE + 1]; + + /* + * Default address space (either GGTT or ppGTT depending on arch). + * + * Reserved for exclusive use by the kernel. + */ + struct i915_address_space *vm; + + /* + * A pool of objects to use as shadow copies of client batch buffers + * when the command parser is enabled. Prevents the client from + * modifying the batch contents after software parsing. + * + * Buffers older than 1s are periodically reaped from the pool, + * or may be reclaimed by the shrinker before then. + */ + struct intel_gt_buffer_pool buffer_pool; + + struct i915_vma *scratch; + + struct intel_gt_info { + intel_engine_mask_t engine_mask; + u8 num_engines; + + /* Media engine access to SFC per instance */ + u8 vdbox_sfc_access; + + /* Slice/subslice/EU info */ + struct sseu_dev_info sseu; + } info; +}; + +enum intel_gt_scratch_field { + /* 8 bytes */ + INTEL_GT_SCRATCH_FIELD_DEFAULT = 0, + + /* 8 bytes */ + INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH = 128, + + /* 8 bytes */ + INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA = 256, + + /* 6 * 8 bytes */ + INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR = 2048, + + /* 4 bytes */ + INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1 = 2096, +}; + +#endif /* __INTEL_GT_TYPES_H__ */ diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c b/drivers/gpu/drm/i915/gt/intel_gtt.c new file mode 100644 index 000000000..3f1114b58 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_gtt.c @@ -0,0 +1,421 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2020 Intel Corporation + */ + +#include <linux/slab.h> /* fault-inject.h is not standalone! */ + +#include <linux/fault-inject.h> + +#include "i915_trace.h" +#include "intel_gt.h" +#include "intel_gtt.h" + +struct drm_i915_gem_object *alloc_pt_dma(struct i915_address_space *vm, int sz) +{ + if (I915_SELFTEST_ONLY(should_fail(&vm->fault_attr, 1))) + i915_gem_shrink_all(vm->i915); + + return i915_gem_object_create_internal(vm->i915, sz); +} + +int pin_pt_dma(struct i915_address_space *vm, struct drm_i915_gem_object *obj) +{ + int err; + + err = i915_gem_object_pin_pages(obj); + if (err) + return err; + + i915_gem_object_make_unshrinkable(obj); + return 0; +} + +void __i915_vm_close(struct i915_address_space *vm) +{ + struct i915_vma *vma, *vn; + + if (!atomic_dec_and_mutex_lock(&vm->open, &vm->mutex)) + return; + + list_for_each_entry_safe(vma, vn, &vm->bound_list, vm_link) { + struct drm_i915_gem_object *obj = vma->obj; + + /* Keep the obj (and hence the vma) alive as _we_ destroy it */ + if (!kref_get_unless_zero(&obj->base.refcount)) + continue; + + atomic_and(~I915_VMA_PIN_MASK, &vma->flags); + WARN_ON(__i915_vma_unbind(vma)); + __i915_vma_put(vma); + + i915_gem_object_put(obj); + } + GEM_BUG_ON(!list_empty(&vm->bound_list)); + + mutex_unlock(&vm->mutex); +} + +void i915_address_space_fini(struct i915_address_space *vm) +{ + drm_mm_takedown(&vm->mm); + mutex_destroy(&vm->mutex); +} + +static void __i915_vm_release(struct work_struct *work) +{ + struct i915_address_space *vm = + container_of(work, struct i915_address_space, rcu.work); + + vm->cleanup(vm); + i915_address_space_fini(vm); + + kfree(vm); +} + +void i915_vm_release(struct kref *kref) +{ + struct i915_address_space *vm = + container_of(kref, struct i915_address_space, ref); + + GEM_BUG_ON(i915_is_ggtt(vm)); + trace_i915_ppgtt_release(vm); + + queue_rcu_work(vm->i915->wq, &vm->rcu); +} + +void i915_address_space_init(struct i915_address_space *vm, int subclass) +{ + kref_init(&vm->ref); + INIT_RCU_WORK(&vm->rcu, __i915_vm_release); + atomic_set(&vm->open, 1); + + /* + * The vm->mutex must be reclaim safe (for use in the shrinker). + * Do a dummy acquire now under fs_reclaim so that any allocation + * attempt holding the lock is immediately reported by lockdep. + */ + mutex_init(&vm->mutex); + lockdep_set_subclass(&vm->mutex, subclass); + i915_gem_shrinker_taints_mutex(vm->i915, &vm->mutex); + + GEM_BUG_ON(!vm->total); + drm_mm_init(&vm->mm, 0, vm->total); + vm->mm.head_node.color = I915_COLOR_UNEVICTABLE; + + INIT_LIST_HEAD(&vm->bound_list); +} + +void clear_pages(struct i915_vma *vma) +{ + GEM_BUG_ON(!vma->pages); + + if (vma->pages != vma->obj->mm.pages) { + sg_free_table(vma->pages); + kfree(vma->pages); + } + vma->pages = NULL; + + memset(&vma->page_sizes, 0, sizeof(vma->page_sizes)); +} + +dma_addr_t __px_dma(struct drm_i915_gem_object *p) +{ + GEM_BUG_ON(!i915_gem_object_has_pages(p)); + return sg_dma_address(p->mm.pages->sgl); +} + +struct page *__px_page(struct drm_i915_gem_object *p) +{ + GEM_BUG_ON(!i915_gem_object_has_pages(p)); + return sg_page(p->mm.pages->sgl); +} + +void +fill_page_dma(struct drm_i915_gem_object *p, const u64 val, unsigned int count) +{ + struct page *page = __px_page(p); + void *vaddr; + + vaddr = kmap(page); + memset64(vaddr, val, count); + clflush_cache_range(vaddr, PAGE_SIZE); + kunmap(page); +} + +static void poison_scratch_page(struct drm_i915_gem_object *scratch) +{ + struct sgt_iter sgt; + struct page *page; + u8 val; + + val = 0; + if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) + val = POISON_FREE; + + for_each_sgt_page(page, sgt, scratch->mm.pages) { + void *vaddr; + + vaddr = kmap(page); + memset(vaddr, val, PAGE_SIZE); + kunmap(page); + } +} + +int setup_scratch_page(struct i915_address_space *vm) +{ + unsigned long size; + + /* + * In order to utilize 64K pages for an object with a size < 2M, we will + * need to support a 64K scratch page, given that every 16th entry for a + * page-table operating in 64K mode must point to a properly aligned 64K + * region, including any PTEs which happen to point to scratch. + * + * This is only relevant for the 48b PPGTT where we support + * huge-gtt-pages, see also i915_vma_insert(). However, as we share the + * scratch (read-only) between all vm, we create one 64k scratch page + * for all. + */ + size = I915_GTT_PAGE_SIZE_4K; + if (i915_vm_is_4lvl(vm) && + HAS_PAGE_SIZES(vm->i915, I915_GTT_PAGE_SIZE_64K)) + size = I915_GTT_PAGE_SIZE_64K; + + do { + struct drm_i915_gem_object *obj; + + obj = vm->alloc_pt_dma(vm, size); + if (IS_ERR(obj)) + goto skip; + + if (pin_pt_dma(vm, obj)) + goto skip_obj; + + /* We need a single contiguous page for our scratch */ + if (obj->mm.page_sizes.sg < size) + goto skip_obj; + + /* And it needs to be correspondingly aligned */ + if (__px_dma(obj) & (size - 1)) + goto skip_obj; + + /* + * Use a non-zero scratch page for debugging. + * + * We want a value that should be reasonably obvious + * to spot in the error state, while also causing a GPU hang + * if executed. We prefer using a clear page in production, so + * should it ever be accidentally used, the effect should be + * fairly benign. + */ + poison_scratch_page(obj); + + vm->scratch[0] = obj; + vm->scratch_order = get_order(size); + return 0; + +skip_obj: + i915_gem_object_put(obj); +skip: + if (size == I915_GTT_PAGE_SIZE_4K) + return -ENOMEM; + + size = I915_GTT_PAGE_SIZE_4K; + } while (1); +} + +void free_scratch(struct i915_address_space *vm) +{ + int i; + + for (i = 0; i <= vm->top; i++) + i915_gem_object_put(vm->scratch[i]); +} + +void gtt_write_workarounds(struct intel_gt *gt) +{ + struct drm_i915_private *i915 = gt->i915; + struct intel_uncore *uncore = gt->uncore; + + /* + * This function is for gtt related workarounds. This function is + * called on driver load and after a GPU reset, so you can place + * workarounds here even if they get overwritten by GPU reset. + */ + /* WaIncreaseDefaultTLBEntries:chv,bdw,skl,bxt,kbl,glk,cfl,cnl,icl */ + if (IS_BROADWELL(i915)) + intel_uncore_write(uncore, + GEN8_L3_LRA_1_GPGPU, + GEN8_L3_LRA_1_GPGPU_DEFAULT_VALUE_BDW); + else if (IS_CHERRYVIEW(i915)) + intel_uncore_write(uncore, + GEN8_L3_LRA_1_GPGPU, + GEN8_L3_LRA_1_GPGPU_DEFAULT_VALUE_CHV); + else if (IS_GEN9_LP(i915)) + intel_uncore_write(uncore, + GEN8_L3_LRA_1_GPGPU, + GEN9_L3_LRA_1_GPGPU_DEFAULT_VALUE_BXT); + else if (INTEL_GEN(i915) >= 9 && INTEL_GEN(i915) <= 11) + intel_uncore_write(uncore, + GEN8_L3_LRA_1_GPGPU, + GEN9_L3_LRA_1_GPGPU_DEFAULT_VALUE_SKL); + + /* + * To support 64K PTEs we need to first enable the use of the + * Intermediate-Page-Size(IPS) bit of the PDE field via some magical + * mmio, otherwise the page-walker will simply ignore the IPS bit. This + * shouldn't be needed after GEN10. + * + * 64K pages were first introduced from BDW+, although technically they + * only *work* from gen9+. For pre-BDW we instead have the option for + * 32K pages, but we don't currently have any support for it in our + * driver. + */ + if (HAS_PAGE_SIZES(i915, I915_GTT_PAGE_SIZE_64K) && + INTEL_GEN(i915) <= 10) + intel_uncore_rmw(uncore, + GEN8_GAMW_ECO_DEV_RW_IA, + 0, + GAMW_ECO_ENABLE_64K_IPS_FIELD); + + if (IS_GEN_RANGE(i915, 8, 11)) { + bool can_use_gtt_cache = true; + + /* + * According to the BSpec if we use 2M/1G pages then we also + * need to disable the GTT cache. At least on BDW we can see + * visual corruption when using 2M pages, and not disabling the + * GTT cache. + */ + if (HAS_PAGE_SIZES(i915, I915_GTT_PAGE_SIZE_2M)) + can_use_gtt_cache = false; + + /* WaGttCachingOffByDefault */ + intel_uncore_write(uncore, + HSW_GTT_CACHE_EN, + can_use_gtt_cache ? GTT_CACHE_EN_ALL : 0); + drm_WARN_ON_ONCE(&i915->drm, can_use_gtt_cache && + intel_uncore_read(uncore, + HSW_GTT_CACHE_EN) == 0); + } +} + +static void tgl_setup_private_ppat(struct intel_uncore *uncore) +{ + /* TGL doesn't support LLC or AGE settings */ + intel_uncore_write(uncore, GEN12_PAT_INDEX(0), GEN8_PPAT_WB); + intel_uncore_write(uncore, GEN12_PAT_INDEX(1), GEN8_PPAT_WC); + intel_uncore_write(uncore, GEN12_PAT_INDEX(2), GEN8_PPAT_WT); + intel_uncore_write(uncore, GEN12_PAT_INDEX(3), GEN8_PPAT_UC); + intel_uncore_write(uncore, GEN12_PAT_INDEX(4), GEN8_PPAT_WB); + intel_uncore_write(uncore, GEN12_PAT_INDEX(5), GEN8_PPAT_WB); + intel_uncore_write(uncore, GEN12_PAT_INDEX(6), GEN8_PPAT_WB); + intel_uncore_write(uncore, GEN12_PAT_INDEX(7), GEN8_PPAT_WB); +} + +static void cnl_setup_private_ppat(struct intel_uncore *uncore) +{ + intel_uncore_write(uncore, + GEN10_PAT_INDEX(0), + GEN8_PPAT_WB | GEN8_PPAT_LLC); + intel_uncore_write(uncore, + GEN10_PAT_INDEX(1), + GEN8_PPAT_WC | GEN8_PPAT_LLCELLC); + intel_uncore_write(uncore, + GEN10_PAT_INDEX(2), + GEN8_PPAT_WT | GEN8_PPAT_LLCELLC); + intel_uncore_write(uncore, + GEN10_PAT_INDEX(3), + GEN8_PPAT_UC); + intel_uncore_write(uncore, + GEN10_PAT_INDEX(4), + GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0)); + intel_uncore_write(uncore, + GEN10_PAT_INDEX(5), + GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(1)); + intel_uncore_write(uncore, + GEN10_PAT_INDEX(6), + GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(2)); + intel_uncore_write(uncore, + GEN10_PAT_INDEX(7), + GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(3)); +} + +/* + * The GGTT and PPGTT need a private PPAT setup in order to handle cacheability + * bits. When using advanced contexts each context stores its own PAT, but + * writing this data shouldn't be harmful even in those cases. + */ +static void bdw_setup_private_ppat(struct intel_uncore *uncore) +{ + u64 pat; + + pat = GEN8_PPAT(0, GEN8_PPAT_WB | GEN8_PPAT_LLC) | /* for normal objects, no eLLC */ + GEN8_PPAT(1, GEN8_PPAT_WC | GEN8_PPAT_LLCELLC) | /* for something pointing to ptes? */ + GEN8_PPAT(2, GEN8_PPAT_WT | GEN8_PPAT_LLCELLC) | /* for scanout with eLLC */ + GEN8_PPAT(3, GEN8_PPAT_UC) | /* Uncached objects, mostly for scanout */ + GEN8_PPAT(4, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0)) | + GEN8_PPAT(5, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(1)) | + GEN8_PPAT(6, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(2)) | + GEN8_PPAT(7, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(3)); + + intel_uncore_write(uncore, GEN8_PRIVATE_PAT_LO, lower_32_bits(pat)); + intel_uncore_write(uncore, GEN8_PRIVATE_PAT_HI, upper_32_bits(pat)); +} + +static void chv_setup_private_ppat(struct intel_uncore *uncore) +{ + u64 pat; + + /* + * Map WB on BDW to snooped on CHV. + * + * Only the snoop bit has meaning for CHV, the rest is + * ignored. + * + * The hardware will never snoop for certain types of accesses: + * - CPU GTT (GMADR->GGTT->no snoop->memory) + * - PPGTT page tables + * - some other special cycles + * + * As with BDW, we also need to consider the following for GT accesses: + * "For GGTT, there is NO pat_sel[2:0] from the entry, + * so RTL will always use the value corresponding to + * pat_sel = 000". + * Which means we must set the snoop bit in PAT entry 0 + * in order to keep the global status page working. + */ + + pat = GEN8_PPAT(0, CHV_PPAT_SNOOP) | + GEN8_PPAT(1, 0) | + GEN8_PPAT(2, 0) | + GEN8_PPAT(3, 0) | + GEN8_PPAT(4, CHV_PPAT_SNOOP) | + GEN8_PPAT(5, CHV_PPAT_SNOOP) | + GEN8_PPAT(6, CHV_PPAT_SNOOP) | + GEN8_PPAT(7, CHV_PPAT_SNOOP); + + intel_uncore_write(uncore, GEN8_PRIVATE_PAT_LO, lower_32_bits(pat)); + intel_uncore_write(uncore, GEN8_PRIVATE_PAT_HI, upper_32_bits(pat)); +} + +void setup_private_pat(struct intel_uncore *uncore) +{ + struct drm_i915_private *i915 = uncore->i915; + + GEM_BUG_ON(INTEL_GEN(i915) < 8); + + if (INTEL_GEN(i915) >= 12) + tgl_setup_private_ppat(uncore); + else if (INTEL_GEN(i915) >= 10) + cnl_setup_private_ppat(uncore); + else if (IS_CHERRYVIEW(i915) || IS_GEN9_LP(i915)) + chv_setup_private_ppat(uncore); + else + bdw_setup_private_ppat(uncore); +} + +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) +#include "selftests/mock_gtt.c" +#endif diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.h b/drivers/gpu/drm/i915/gt/intel_gtt.h new file mode 100644 index 000000000..c13c650ce --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_gtt.h @@ -0,0 +1,586 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2020 Intel Corporation + * + * Please try to maintain the following order within this file unless it makes + * sense to do otherwise. From top to bottom: + * 1. typedefs + * 2. #defines, and macros + * 3. structure definitions + * 4. function prototypes + * + * Within each section, please try to order by generation in ascending order, + * from top to bottom (ie. gen6 on the top, gen8 on the bottom). + */ + +#ifndef __INTEL_GTT_H__ +#define __INTEL_GTT_H__ + +#include <linux/io-mapping.h> +#include <linux/kref.h> +#include <linux/mm.h> +#include <linux/pagevec.h> +#include <linux/scatterlist.h> +#include <linux/workqueue.h> + +#include <drm/drm_mm.h> + +#include "gt/intel_reset.h" +#include "i915_selftest.h" +#include "i915_vma_types.h" + +#define I915_GFP_ALLOW_FAIL (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN) + +#if IS_ENABLED(CONFIG_DRM_I915_TRACE_GTT) +#define DBG(...) trace_printk(__VA_ARGS__) +#else +#define DBG(...) +#endif + +#define NALLOC 3 /* 1 normal, 1 for concurrent threads, 1 for preallocation */ + +#define I915_GTT_PAGE_SIZE_4K BIT_ULL(12) +#define I915_GTT_PAGE_SIZE_64K BIT_ULL(16) +#define I915_GTT_PAGE_SIZE_2M BIT_ULL(21) + +#define I915_GTT_PAGE_SIZE I915_GTT_PAGE_SIZE_4K +#define I915_GTT_MAX_PAGE_SIZE I915_GTT_PAGE_SIZE_2M + +#define I915_GTT_PAGE_MASK -I915_GTT_PAGE_SIZE + +#define I915_GTT_MIN_ALIGNMENT I915_GTT_PAGE_SIZE + +#define I915_FENCE_REG_NONE -1 +#define I915_MAX_NUM_FENCES 32 +/* 32 fences + sign bit for FENCE_REG_NONE */ +#define I915_MAX_NUM_FENCE_BITS 6 + +typedef u32 gen6_pte_t; +typedef u64 gen8_pte_t; + +#define ggtt_total_entries(ggtt) ((ggtt)->vm.total >> PAGE_SHIFT) + +#define I915_PTES(pte_len) ((unsigned int)(PAGE_SIZE / (pte_len))) +#define I915_PTE_MASK(pte_len) (I915_PTES(pte_len) - 1) +#define I915_PDES 512 +#define I915_PDE_MASK (I915_PDES - 1) + +/* gen6-hsw has bit 11-4 for physical addr bit 39-32 */ +#define GEN6_GTT_ADDR_ENCODE(addr) ((addr) | (((addr) >> 28) & 0xff0)) +#define GEN6_PTE_ADDR_ENCODE(addr) GEN6_GTT_ADDR_ENCODE(addr) +#define GEN6_PDE_ADDR_ENCODE(addr) GEN6_GTT_ADDR_ENCODE(addr) +#define GEN6_PTE_CACHE_LLC (2 << 1) +#define GEN6_PTE_UNCACHED (1 << 1) +#define GEN6_PTE_VALID REG_BIT(0) + +#define GEN6_PTES I915_PTES(sizeof(gen6_pte_t)) +#define GEN6_PD_SIZE (I915_PDES * PAGE_SIZE) +#define GEN6_PD_ALIGN (PAGE_SIZE * 16) +#define GEN6_PDE_SHIFT 22 +#define GEN6_PDE_VALID REG_BIT(0) +#define NUM_PTE(pde_shift) (1 << (pde_shift - PAGE_SHIFT)) + +#define GEN7_PTE_CACHE_L3_LLC (3 << 1) + +#define BYT_PTE_SNOOPED_BY_CPU_CACHES REG_BIT(2) +#define BYT_PTE_WRITEABLE REG_BIT(1) + +/* + * Cacheability Control is a 4-bit value. The low three bits are stored in bits + * 3:1 of the PTE, while the fourth bit is stored in bit 11 of the PTE. + */ +#define HSW_CACHEABILITY_CONTROL(bits) ((((bits) & 0x7) << 1) | \ + (((bits) & 0x8) << (11 - 3))) +#define HSW_WB_LLC_AGE3 HSW_CACHEABILITY_CONTROL(0x2) +#define HSW_WB_LLC_AGE0 HSW_CACHEABILITY_CONTROL(0x3) +#define HSW_WB_ELLC_LLC_AGE3 HSW_CACHEABILITY_CONTROL(0x8) +#define HSW_WB_ELLC_LLC_AGE0 HSW_CACHEABILITY_CONTROL(0xb) +#define HSW_WT_ELLC_LLC_AGE3 HSW_CACHEABILITY_CONTROL(0x7) +#define HSW_WT_ELLC_LLC_AGE0 HSW_CACHEABILITY_CONTROL(0x6) +#define HSW_PTE_UNCACHED (0) +#define HSW_GTT_ADDR_ENCODE(addr) ((addr) | (((addr) >> 28) & 0x7f0)) +#define HSW_PTE_ADDR_ENCODE(addr) HSW_GTT_ADDR_ENCODE(addr) + +/* + * GEN8 32b style address is defined as a 3 level page table: + * 31:30 | 29:21 | 20:12 | 11:0 + * PDPE | PDE | PTE | offset + * The difference as compared to normal x86 3 level page table is the PDPEs are + * programmed via register. + * + * GEN8 48b style address is defined as a 4 level page table: + * 47:39 | 38:30 | 29:21 | 20:12 | 11:0 + * PML4E | PDPE | PDE | PTE | offset + */ +#define GEN8_3LVL_PDPES 4 + +#define PPAT_UNCACHED (_PAGE_PWT | _PAGE_PCD) +#define PPAT_CACHED_PDE 0 /* WB LLC */ +#define PPAT_CACHED _PAGE_PAT /* WB LLCeLLC */ +#define PPAT_DISPLAY_ELLC _PAGE_PCD /* WT eLLC */ + +#define CHV_PPAT_SNOOP REG_BIT(6) +#define GEN8_PPAT_AGE(x) ((x)<<4) +#define GEN8_PPAT_LLCeLLC (3<<2) +#define GEN8_PPAT_LLCELLC (2<<2) +#define GEN8_PPAT_LLC (1<<2) +#define GEN8_PPAT_WB (3<<0) +#define GEN8_PPAT_WT (2<<0) +#define GEN8_PPAT_WC (1<<0) +#define GEN8_PPAT_UC (0<<0) +#define GEN8_PPAT_ELLC_OVERRIDE (0<<2) +#define GEN8_PPAT(i, x) ((u64)(x) << ((i) * 8)) + +#define GEN8_PDE_IPS_64K BIT(11) +#define GEN8_PDE_PS_2M BIT(7) + +enum i915_cache_level; + +struct drm_i915_file_private; +struct drm_i915_gem_object; +struct i915_fence_reg; +struct i915_vma; +struct intel_gt; + +#define for_each_sgt_daddr(__dp, __iter, __sgt) \ + __for_each_sgt_daddr(__dp, __iter, __sgt, I915_GTT_PAGE_SIZE) + +struct i915_page_table { + struct drm_i915_gem_object *base; + union { + atomic_t used; + struct i915_page_table *stash; + }; +}; + +struct i915_page_directory { + struct i915_page_table pt; + spinlock_t lock; + void **entry; +}; + +#define __px_choose_expr(x, type, expr, other) \ + __builtin_choose_expr( \ + __builtin_types_compatible_p(typeof(x), type) || \ + __builtin_types_compatible_p(typeof(x), const type), \ + ({ type __x = (type)(x); expr; }), \ + other) + +#define px_base(px) \ + __px_choose_expr(px, struct drm_i915_gem_object *, __x, \ + __px_choose_expr(px, struct i915_page_table *, __x->base, \ + __px_choose_expr(px, struct i915_page_directory *, __x->pt.base, \ + (void)0))) + +struct page *__px_page(struct drm_i915_gem_object *p); +dma_addr_t __px_dma(struct drm_i915_gem_object *p); +#define px_dma(px) (__px_dma(px_base(px))) + +#define px_pt(px) \ + __px_choose_expr(px, struct i915_page_table *, __x, \ + __px_choose_expr(px, struct i915_page_directory *, &__x->pt, \ + (void)0)) +#define px_used(px) (&px_pt(px)->used) + +struct i915_vm_pt_stash { + /* preallocated chains of page tables/directories */ + struct i915_page_table *pt[2]; +}; + +struct i915_vma_ops { + /* Map an object into an address space with the given cache flags. */ + void (*bind_vma)(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + struct i915_vma *vma, + enum i915_cache_level cache_level, + u32 flags); + /* + * Unmap an object from an address space. This usually consists of + * setting the valid PTE entries to a reserved scratch page. + */ + void (*unbind_vma)(struct i915_address_space *vm, + struct i915_vma *vma); + + int (*set_pages)(struct i915_vma *vma); + void (*clear_pages)(struct i915_vma *vma); +}; + +struct i915_address_space { + struct kref ref; + struct rcu_work rcu; + + struct drm_mm mm; + struct intel_gt *gt; + struct drm_i915_private *i915; + struct device *dma; + /* + * Every address space belongs to a struct file - except for the global + * GTT that is owned by the driver (and so @file is set to NULL). In + * principle, no information should leak from one context to another + * (or between files/processes etc) unless explicitly shared by the + * owner. Tracking the owner is important in order to free up per-file + * objects along with the file, to aide resource tracking, and to + * assign blame. + */ + struct drm_i915_file_private *file; + u64 total; /* size addr space maps (ex. 2GB for ggtt) */ + u64 reserved; /* size addr space reserved */ + + unsigned int bind_async_flags; + + /* + * Each active user context has its own address space (in full-ppgtt). + * Since the vm may be shared between multiple contexts, we count how + * many contexts keep us "open". Once open hits zero, we are closed + * and do not allow any new attachments, and proceed to shutdown our + * vma and page directories. + */ + atomic_t open; + + struct mutex mutex; /* protects vma and our lists */ +#define VM_CLASS_GGTT 0 +#define VM_CLASS_PPGTT 1 + + struct drm_i915_gem_object *scratch[4]; + /** + * List of vma currently bound. + */ + struct list_head bound_list; + + /* Global GTT */ + bool is_ggtt:1; + + /* Some systems support read-only mappings for GGTT and/or PPGTT */ + bool has_read_only:1; + + u8 top; + u8 pd_shift; + u8 scratch_order; + + struct drm_i915_gem_object * + (*alloc_pt_dma)(struct i915_address_space *vm, int sz); + + u64 (*pte_encode)(dma_addr_t addr, + enum i915_cache_level level, + u32 flags); /* Create a valid PTE */ +#define PTE_READ_ONLY BIT(0) + + void (*allocate_va_range)(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + u64 start, u64 length); + void (*clear_range)(struct i915_address_space *vm, + u64 start, u64 length); + void (*insert_page)(struct i915_address_space *vm, + dma_addr_t addr, + u64 offset, + enum i915_cache_level cache_level, + u32 flags); + void (*insert_entries)(struct i915_address_space *vm, + struct i915_vma *vma, + enum i915_cache_level cache_level, + u32 flags); + void (*cleanup)(struct i915_address_space *vm); + + struct i915_vma_ops vma_ops; + + I915_SELFTEST_DECLARE(struct fault_attr fault_attr); + I915_SELFTEST_DECLARE(bool scrub_64K); +}; + +/* + * The Graphics Translation Table is the way in which GEN hardware translates a + * Graphics Virtual Address into a Physical Address. In addition to the normal + * collateral associated with any va->pa translations GEN hardware also has a + * portion of the GTT which can be mapped by the CPU and remain both coherent + * and correct (in cases like swizzling). That region is referred to as GMADR in + * the spec. + */ +struct i915_ggtt { + struct i915_address_space vm; + + struct io_mapping iomap; /* Mapping to our CPU mappable region */ + struct resource gmadr; /* GMADR resource */ + resource_size_t mappable_end; /* End offset that we can CPU map */ + + /** "Graphics Stolen Memory" holds the global PTEs */ + void __iomem *gsm; + void (*invalidate)(struct i915_ggtt *ggtt); + + /** PPGTT used for aliasing the PPGTT with the GTT */ + struct i915_ppgtt *alias; + + bool do_idle_maps; + + int mtrr; + + /** Bit 6 swizzling required for X tiling */ + u32 bit_6_swizzle_x; + /** Bit 6 swizzling required for Y tiling */ + u32 bit_6_swizzle_y; + + u32 pin_bias; + + unsigned int num_fences; + struct i915_fence_reg *fence_regs; + struct list_head fence_list; + + /** + * List of all objects in gtt_space, currently mmaped by userspace. + * All objects within this list must also be on bound_list. + */ + struct list_head userfault_list; + + /* Manual runtime pm autosuspend delay for user GGTT mmaps */ + struct intel_wakeref_auto userfault_wakeref; + + struct mutex error_mutex; + struct drm_mm_node error_capture; + struct drm_mm_node uc_fw; +}; + +struct i915_ppgtt { + struct i915_address_space vm; + + struct i915_page_directory *pd; +}; + +#define i915_is_ggtt(vm) ((vm)->is_ggtt) + +static inline bool +i915_vm_is_4lvl(const struct i915_address_space *vm) +{ + return (vm->total - 1) >> 32; +} + +static inline bool +i915_vm_has_scratch_64K(struct i915_address_space *vm) +{ + return vm->scratch_order == get_order(I915_GTT_PAGE_SIZE_64K); +} + +static inline bool +i915_vm_has_cache_coloring(struct i915_address_space *vm) +{ + return i915_is_ggtt(vm) && vm->mm.color_adjust; +} + +static inline struct i915_ggtt * +i915_vm_to_ggtt(struct i915_address_space *vm) +{ + BUILD_BUG_ON(offsetof(struct i915_ggtt, vm)); + GEM_BUG_ON(!i915_is_ggtt(vm)); + return container_of(vm, struct i915_ggtt, vm); +} + +static inline struct i915_ppgtt * +i915_vm_to_ppgtt(struct i915_address_space *vm) +{ + BUILD_BUG_ON(offsetof(struct i915_ppgtt, vm)); + GEM_BUG_ON(i915_is_ggtt(vm)); + return container_of(vm, struct i915_ppgtt, vm); +} + +static inline struct i915_address_space * +i915_vm_get(struct i915_address_space *vm) +{ + kref_get(&vm->ref); + return vm; +} + +void i915_vm_release(struct kref *kref); + +static inline void i915_vm_put(struct i915_address_space *vm) +{ + kref_put(&vm->ref, i915_vm_release); +} + +static inline struct i915_address_space * +i915_vm_open(struct i915_address_space *vm) +{ + GEM_BUG_ON(!atomic_read(&vm->open)); + atomic_inc(&vm->open); + return i915_vm_get(vm); +} + +static inline bool +i915_vm_tryopen(struct i915_address_space *vm) +{ + if (atomic_add_unless(&vm->open, 1, 0)) + return i915_vm_get(vm); + + return false; +} + +void __i915_vm_close(struct i915_address_space *vm); + +static inline void +i915_vm_close(struct i915_address_space *vm) +{ + GEM_BUG_ON(!atomic_read(&vm->open)); + __i915_vm_close(vm); + + i915_vm_put(vm); +} + +void i915_address_space_init(struct i915_address_space *vm, int subclass); +void i915_address_space_fini(struct i915_address_space *vm); + +static inline u32 i915_pte_index(u64 address, unsigned int pde_shift) +{ + const u32 mask = NUM_PTE(pde_shift) - 1; + + return (address >> PAGE_SHIFT) & mask; +} + +/* + * Helper to counts the number of PTEs within the given length. This count + * does not cross a page table boundary, so the max value would be + * GEN6_PTES for GEN6, and GEN8_PTES for GEN8. + */ +static inline u32 i915_pte_count(u64 addr, u64 length, unsigned int pde_shift) +{ + const u64 mask = ~((1ULL << pde_shift) - 1); + u64 end; + + GEM_BUG_ON(length == 0); + GEM_BUG_ON(offset_in_page(addr | length)); + + end = addr + length; + + if ((addr & mask) != (end & mask)) + return NUM_PTE(pde_shift) - i915_pte_index(addr, pde_shift); + + return i915_pte_index(end, pde_shift) - i915_pte_index(addr, pde_shift); +} + +static inline u32 i915_pde_index(u64 addr, u32 shift) +{ + return (addr >> shift) & I915_PDE_MASK; +} + +static inline struct i915_page_table * +i915_pt_entry(const struct i915_page_directory * const pd, + const unsigned short n) +{ + return pd->entry[n]; +} + +static inline struct i915_page_directory * +i915_pd_entry(const struct i915_page_directory * const pdp, + const unsigned short n) +{ + return pdp->entry[n]; +} + +static inline dma_addr_t +i915_page_dir_dma_addr(const struct i915_ppgtt *ppgtt, const unsigned int n) +{ + struct i915_page_table *pt = ppgtt->pd->entry[n]; + + return __px_dma(pt ? px_base(pt) : ppgtt->vm.scratch[ppgtt->vm.top]); +} + +void ppgtt_init(struct i915_ppgtt *ppgtt, struct intel_gt *gt); + +int i915_ggtt_probe_hw(struct drm_i915_private *i915); +int i915_ggtt_init_hw(struct drm_i915_private *i915); +int i915_ggtt_enable_hw(struct drm_i915_private *i915); +void i915_ggtt_enable_guc(struct i915_ggtt *ggtt); +void i915_ggtt_disable_guc(struct i915_ggtt *ggtt); +int i915_init_ggtt(struct drm_i915_private *i915); +void i915_ggtt_driver_release(struct drm_i915_private *i915); + +static inline bool i915_ggtt_has_aperture(const struct i915_ggtt *ggtt) +{ + return ggtt->mappable_end > 0; +} + +int i915_ppgtt_init_hw(struct intel_gt *gt); + +struct i915_ppgtt *i915_ppgtt_create(struct intel_gt *gt); + +void i915_ggtt_suspend(struct i915_ggtt *gtt); +void i915_ggtt_resume(struct i915_ggtt *ggtt); + +#define kmap_atomic_px(px) kmap_atomic(__px_page(px_base(px))) + +void +fill_page_dma(struct drm_i915_gem_object *p, const u64 val, unsigned int count); + +#define fill_px(px, v) fill_page_dma(px_base(px), (v), PAGE_SIZE / sizeof(u64)) +#define fill32_px(px, v) do { \ + u64 v__ = lower_32_bits(v); \ + fill_px((px), v__ << 32 | v__); \ +} while (0) + +int setup_scratch_page(struct i915_address_space *vm); +void free_scratch(struct i915_address_space *vm); + +struct drm_i915_gem_object *alloc_pt_dma(struct i915_address_space *vm, int sz); +struct i915_page_table *alloc_pt(struct i915_address_space *vm); +struct i915_page_directory *alloc_pd(struct i915_address_space *vm); +struct i915_page_directory *__alloc_pd(int npde); + +int pin_pt_dma(struct i915_address_space *vm, struct drm_i915_gem_object *obj); + +void free_px(struct i915_address_space *vm, + struct i915_page_table *pt, int lvl); +#define free_pt(vm, px) free_px(vm, px, 0) +#define free_pd(vm, px) free_px(vm, px_pt(px), 1) + +void +__set_pd_entry(struct i915_page_directory * const pd, + const unsigned short idx, + struct i915_page_table *pt, + u64 (*encode)(const dma_addr_t, const enum i915_cache_level)); + +#define set_pd_entry(pd, idx, to) \ + __set_pd_entry((pd), (idx), px_pt(to), gen8_pde_encode) + +void +clear_pd_entry(struct i915_page_directory * const pd, + const unsigned short idx, + const struct drm_i915_gem_object * const scratch); + +bool +release_pd_entry(struct i915_page_directory * const pd, + const unsigned short idx, + struct i915_page_table * const pt, + const struct drm_i915_gem_object * const scratch); +void gen6_ggtt_invalidate(struct i915_ggtt *ggtt); + +int ggtt_set_pages(struct i915_vma *vma); +int ppgtt_set_pages(struct i915_vma *vma); +void clear_pages(struct i915_vma *vma); + +void ppgtt_bind_vma(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + struct i915_vma *vma, + enum i915_cache_level cache_level, + u32 flags); +void ppgtt_unbind_vma(struct i915_address_space *vm, + struct i915_vma *vma); + +void gtt_write_workarounds(struct intel_gt *gt); + +void setup_private_pat(struct intel_uncore *uncore); + +int i915_vm_alloc_pt_stash(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + u64 size); +int i915_vm_pin_pt_stash(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash); +void i915_vm_free_pt_stash(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash); + +static inline struct sgt_dma { + struct scatterlist *sg; + dma_addr_t dma, max; +} sgt_dma(struct i915_vma *vma) { + struct scatterlist *sg = vma->pages->sgl; + dma_addr_t addr = sg_dma_address(sg); + + return (struct sgt_dma){ sg, addr, addr + sg->length }; +} + +#endif diff --git a/drivers/gpu/drm/i915/gt/intel_llc.c b/drivers/gpu/drm/i915/gt/intel_llc.c new file mode 100644 index 000000000..e3f637b36 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_llc.c @@ -0,0 +1,163 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#include <linux/cpufreq.h> + +#include "i915_drv.h" +#include "intel_gt.h" +#include "intel_llc.h" +#include "intel_sideband.h" + +struct ia_constants { + unsigned int min_gpu_freq; + unsigned int max_gpu_freq; + + unsigned int min_ring_freq; + unsigned int max_ia_freq; +}; + +static struct intel_gt *llc_to_gt(struct intel_llc *llc) +{ + return container_of(llc, struct intel_gt, llc); +} + +static unsigned int cpu_max_MHz(void) +{ + struct cpufreq_policy *policy; + unsigned int max_khz; + + policy = cpufreq_cpu_get(0); + if (policy) { + max_khz = policy->cpuinfo.max_freq; + cpufreq_cpu_put(policy); + } else { + /* + * Default to measured freq if none found, PCU will ensure we + * don't go over + */ + max_khz = tsc_khz; + } + + return max_khz / 1000; +} + +static bool get_ia_constants(struct intel_llc *llc, + struct ia_constants *consts) +{ + struct drm_i915_private *i915 = llc_to_gt(llc)->i915; + struct intel_rps *rps = &llc_to_gt(llc)->rps; + + if (!HAS_LLC(i915) || IS_DGFX(i915)) + return false; + + if (rps->max_freq <= rps->min_freq) + return false; + + consts->max_ia_freq = cpu_max_MHz(); + + consts->min_ring_freq = + intel_uncore_read(llc_to_gt(llc)->uncore, DCLK) & 0xf; + /* convert DDR frequency from units of 266.6MHz to bandwidth */ + consts->min_ring_freq = mult_frac(consts->min_ring_freq, 8, 3); + + consts->min_gpu_freq = rps->min_freq; + consts->max_gpu_freq = rps->max_freq; + if (INTEL_GEN(i915) >= 9) { + /* Convert GT frequency to 50 HZ units */ + consts->min_gpu_freq /= GEN9_FREQ_SCALER; + consts->max_gpu_freq /= GEN9_FREQ_SCALER; + } + + return true; +} + +static void calc_ia_freq(struct intel_llc *llc, + unsigned int gpu_freq, + const struct ia_constants *consts, + unsigned int *out_ia_freq, + unsigned int *out_ring_freq) +{ + struct drm_i915_private *i915 = llc_to_gt(llc)->i915; + const int diff = consts->max_gpu_freq - gpu_freq; + unsigned int ia_freq = 0, ring_freq = 0; + + if (INTEL_GEN(i915) >= 9) { + /* + * ring_freq = 2 * GT. ring_freq is in 100MHz units + * No floor required for ring frequency on SKL. + */ + ring_freq = gpu_freq; + } else if (INTEL_GEN(i915) >= 8) { + /* max(2 * GT, DDR). NB: GT is 50MHz units */ + ring_freq = max(consts->min_ring_freq, gpu_freq); + } else if (IS_HASWELL(i915)) { + ring_freq = mult_frac(gpu_freq, 5, 4); + ring_freq = max(consts->min_ring_freq, ring_freq); + /* leave ia_freq as the default, chosen by cpufreq */ + } else { + const int min_freq = 15; + const int scale = 180; + + /* + * On older processors, there is no separate ring + * clock domain, so in order to boost the bandwidth + * of the ring, we need to upclock the CPU (ia_freq). + * + * For GPU frequencies less than 750MHz, + * just use the lowest ring freq. + */ + if (gpu_freq < min_freq) + ia_freq = 800; + else + ia_freq = consts->max_ia_freq - diff * scale / 2; + ia_freq = DIV_ROUND_CLOSEST(ia_freq, 100); + } + + *out_ia_freq = ia_freq; + *out_ring_freq = ring_freq; +} + +static void gen6_update_ring_freq(struct intel_llc *llc) +{ + struct drm_i915_private *i915 = llc_to_gt(llc)->i915; + struct ia_constants consts; + unsigned int gpu_freq; + + if (!get_ia_constants(llc, &consts)) + return; + + /* + * For each potential GPU frequency, load a ring frequency we'd like + * to use for memory access. We do this by specifying the IA frequency + * the PCU should use as a reference to determine the ring frequency. + */ + for (gpu_freq = consts.max_gpu_freq; + gpu_freq >= consts.min_gpu_freq; + gpu_freq--) { + unsigned int ia_freq, ring_freq; + + calc_ia_freq(llc, gpu_freq, &consts, &ia_freq, &ring_freq); + sandybridge_pcode_write(i915, + GEN6_PCODE_WRITE_MIN_FREQ_TABLE, + ia_freq << GEN6_PCODE_FREQ_IA_RATIO_SHIFT | + ring_freq << GEN6_PCODE_FREQ_RING_RATIO_SHIFT | + gpu_freq); + } +} + +void intel_llc_enable(struct intel_llc *llc) +{ + gen6_update_ring_freq(llc); +} + +void intel_llc_disable(struct intel_llc *llc) +{ + /* Currently there is no HW configuration to be done to disable. */ +} + +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) +#include "selftest_llc.c" +#endif diff --git a/drivers/gpu/drm/i915/gt/intel_llc.h b/drivers/gpu/drm/i915/gt/intel_llc.h new file mode 100644 index 000000000..ef09a890d --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_llc.h @@ -0,0 +1,15 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#ifndef INTEL_LLC_H +#define INTEL_LLC_H + +struct intel_llc; + +void intel_llc_enable(struct intel_llc *llc); +void intel_llc_disable(struct intel_llc *llc); + +#endif /* INTEL_LLC_H */ diff --git a/drivers/gpu/drm/i915/gt/intel_llc_types.h b/drivers/gpu/drm/i915/gt/intel_llc_types.h new file mode 100644 index 000000000..ecad4687b --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_llc_types.h @@ -0,0 +1,13 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#ifndef INTEL_LLC_TYPES_H +#define INTEL_LLC_TYPES_H + +struct intel_llc { +}; + +#endif /* INTEL_LLC_TYPES_H */ diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c new file mode 100644 index 000000000..ee9b33c3a --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -0,0 +1,6110 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Ben Widawsky <ben@bwidawsk.net> + * Michel Thierry <michel.thierry@intel.com> + * Thomas Daniel <thomas.daniel@intel.com> + * Oscar Mateo <oscar.mateo@intel.com> + * + */ + +/** + * DOC: Logical Rings, Logical Ring Contexts and Execlists + * + * Motivation: + * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts". + * These expanded contexts enable a number of new abilities, especially + * "Execlists" (also implemented in this file). + * + * One of the main differences with the legacy HW contexts is that logical + * ring contexts incorporate many more things to the context's state, like + * PDPs or ringbuffer control registers: + * + * The reason why PDPs are included in the context is straightforward: as + * PPGTTs (per-process GTTs) are actually per-context, having the PDPs + * contained there mean you don't need to do a ppgtt->switch_mm yourself, + * instead, the GPU will do it for you on the context switch. + * + * But, what about the ringbuffer control registers (head, tail, etc..)? + * shouldn't we just need a set of those per engine command streamer? This is + * where the name "Logical Rings" starts to make sense: by virtualizing the + * rings, the engine cs shifts to a new "ring buffer" with every context + * switch. When you want to submit a workload to the GPU you: A) choose your + * context, B) find its appropriate virtualized ring, C) write commands to it + * and then, finally, D) tell the GPU to switch to that context. + * + * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch + * to a contexts is via a context execution list, ergo "Execlists". + * + * LRC implementation: + * Regarding the creation of contexts, we have: + * + * - One global default context. + * - One local default context for each opened fd. + * - One local extra context for each context create ioctl call. + * + * Now that ringbuffers belong per-context (and not per-engine, like before) + * and that contexts are uniquely tied to a given engine (and not reusable, + * like before) we need: + * + * - One ringbuffer per-engine inside each context. + * - One backing object per-engine inside each context. + * + * The global default context starts its life with these new objects fully + * allocated and populated. The local default context for each opened fd is + * more complex, because we don't know at creation time which engine is going + * to use them. To handle this, we have implemented a deferred creation of LR + * contexts: + * + * The local context starts its life as a hollow or blank holder, that only + * gets populated for a given engine once we receive an execbuffer. If later + * on we receive another execbuffer ioctl for the same context but a different + * engine, we allocate/populate a new ringbuffer and context backing object and + * so on. + * + * Finally, regarding local contexts created using the ioctl call: as they are + * only allowed with the render ring, we can allocate & populate them right + * away (no need to defer anything, at least for now). + * + * Execlists implementation: + * Execlists are the new method by which, on gen8+ hardware, workloads are + * submitted for execution (as opposed to the legacy, ringbuffer-based, method). + * This method works as follows: + * + * When a request is committed, its commands (the BB start and any leading or + * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer + * for the appropriate context. The tail pointer in the hardware context is not + * updated at this time, but instead, kept by the driver in the ringbuffer + * structure. A structure representing this request is added to a request queue + * for the appropriate engine: this structure contains a copy of the context's + * tail after the request was written to the ring buffer and a pointer to the + * context itself. + * + * If the engine's request queue was empty before the request was added, the + * queue is processed immediately. Otherwise the queue will be processed during + * a context switch interrupt. In any case, elements on the queue will get sent + * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a + * globally unique 20-bits submission ID. + * + * When execution of a request completes, the GPU updates the context status + * buffer with a context complete event and generates a context switch interrupt. + * During the interrupt handling, the driver examines the events in the buffer: + * for each context complete event, if the announced ID matches that on the head + * of the request queue, then that request is retired and removed from the queue. + * + * After processing, if any requests were retired and the queue is not empty + * then a new execution list can be submitted. The two requests at the front of + * the queue are next to be submitted but since a context may not occur twice in + * an execution list, if subsequent requests have the same ID as the first then + * the two requests must be combined. This is done simply by discarding requests + * at the head of the queue until either only one requests is left (in which case + * we use a NULL second context) or the first two requests have unique IDs. + * + * By always executing the first two requests in the queue the driver ensures + * that the GPU is kept as busy as possible. In the case where a single context + * completes but a second context is still executing, the request for this second + * context will be at the head of the queue when we remove the first one. This + * request will then be resubmitted along with a new request for a different context, + * which will cause the hardware to continue executing the second request and queue + * the new request (the GPU detects the condition of a context getting preempted + * with the same context and optimizes the context switch flow by not doing + * preemption, but just sampling the new tail pointer). + * + */ +#include <linux/interrupt.h> + +#include "i915_drv.h" +#include "i915_perf.h" +#include "i915_trace.h" +#include "i915_vgpu.h" +#include "intel_breadcrumbs.h" +#include "intel_context.h" +#include "intel_engine_pm.h" +#include "intel_gt.h" +#include "intel_gt_pm.h" +#include "intel_gt_requests.h" +#include "intel_lrc_reg.h" +#include "intel_mocs.h" +#include "intel_reset.h" +#include "intel_ring.h" +#include "intel_workarounds.h" +#include "shmem_utils.h" + +#define RING_EXECLIST_QFULL (1 << 0x2) +#define RING_EXECLIST1_VALID (1 << 0x3) +#define RING_EXECLIST0_VALID (1 << 0x4) +#define RING_EXECLIST_ACTIVE_STATUS (3 << 0xE) +#define RING_EXECLIST1_ACTIVE (1 << 0x11) +#define RING_EXECLIST0_ACTIVE (1 << 0x12) + +#define GEN8_CTX_STATUS_IDLE_ACTIVE (1 << 0) +#define GEN8_CTX_STATUS_PREEMPTED (1 << 1) +#define GEN8_CTX_STATUS_ELEMENT_SWITCH (1 << 2) +#define GEN8_CTX_STATUS_ACTIVE_IDLE (1 << 3) +#define GEN8_CTX_STATUS_COMPLETE (1 << 4) +#define GEN8_CTX_STATUS_LITE_RESTORE (1 << 15) + +#define GEN8_CTX_STATUS_COMPLETED_MASK \ + (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED) + +#define CTX_DESC_FORCE_RESTORE BIT_ULL(2) + +#define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE (0x1) /* lower csb dword */ +#define GEN12_CTX_SWITCH_DETAIL(csb_dw) ((csb_dw) & 0xF) /* upper csb dword */ +#define GEN12_CSB_SW_CTX_ID_MASK GENMASK(25, 15) +#define GEN12_IDLE_CTX_ID 0x7FF +#define GEN12_CSB_CTX_VALID(csb_dw) \ + (FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID) + +/* Typical size of the average request (2 pipecontrols and a MI_BB) */ +#define EXECLISTS_REQUEST_SIZE 64 /* bytes */ + +struct virtual_engine { + struct intel_engine_cs base; + struct intel_context context; + struct rcu_work rcu; + + /* + * We allow only a single request through the virtual engine at a time + * (each request in the timeline waits for the completion fence of + * the previous before being submitted). By restricting ourselves to + * only submitting a single request, each request is placed on to a + * physical to maximise load spreading (by virtue of the late greedy + * scheduling -- each real engine takes the next available request + * upon idling). + */ + struct i915_request *request; + + /* + * We keep a rbtree of available virtual engines inside each physical + * engine, sorted by priority. Here we preallocate the nodes we need + * for the virtual engine, indexed by physical_engine->id. + */ + struct ve_node { + struct rb_node rb; + int prio; + } nodes[I915_NUM_ENGINES]; + + /* + * Keep track of bonded pairs -- restrictions upon on our selection + * of physical engines any particular request may be submitted to. + * If we receive a submit-fence from a master engine, we will only + * use one of sibling_mask physical engines. + */ + struct ve_bond { + const struct intel_engine_cs *master; + intel_engine_mask_t sibling_mask; + } *bonds; + unsigned int num_bonds; + + /* And finally, which physical engines this virtual engine maps onto. */ + unsigned int num_siblings; + struct intel_engine_cs *siblings[]; +}; + +static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine) +{ + GEM_BUG_ON(!intel_engine_is_virtual(engine)); + return container_of(engine, struct virtual_engine, base); +} + +static int __execlists_context_alloc(struct intel_context *ce, + struct intel_engine_cs *engine); + +static void execlists_init_reg_state(u32 *reg_state, + const struct intel_context *ce, + const struct intel_engine_cs *engine, + const struct intel_ring *ring, + bool close); +static void +__execlists_update_reg_state(const struct intel_context *ce, + const struct intel_engine_cs *engine, + u32 head); + +static int lrc_ring_mi_mode(const struct intel_engine_cs *engine) +{ + if (INTEL_GEN(engine->i915) >= 12) + return 0x60; + else if (INTEL_GEN(engine->i915) >= 9) + return 0x54; + else if (engine->class == RENDER_CLASS) + return 0x58; + else + return -1; +} + +static int lrc_ring_gpr0(const struct intel_engine_cs *engine) +{ + if (INTEL_GEN(engine->i915) >= 12) + return 0x74; + else if (INTEL_GEN(engine->i915) >= 9) + return 0x68; + else if (engine->class == RENDER_CLASS) + return 0xd8; + else + return -1; +} + +static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine) +{ + if (INTEL_GEN(engine->i915) >= 12) + return 0x12; + else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS) + return 0x18; + else + return -1; +} + +static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine) +{ + int x; + + x = lrc_ring_wa_bb_per_ctx(engine); + if (x < 0) + return x; + + return x + 2; +} + +static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine) +{ + int x; + + x = lrc_ring_indirect_ptr(engine); + if (x < 0) + return x; + + return x + 2; +} + +static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine) +{ + if (engine->class != RENDER_CLASS) + return -1; + + if (INTEL_GEN(engine->i915) >= 12) + return 0xb6; + else if (INTEL_GEN(engine->i915) >= 11) + return 0xaa; + else + return -1; +} + +static u32 +lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine) +{ + switch (INTEL_GEN(engine->i915)) { + default: + MISSING_CASE(INTEL_GEN(engine->i915)); + fallthrough; + case 12: + return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; + case 11: + return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; + case 10: + return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; + case 9: + return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; + case 8: + return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; + } +} + +static void +lrc_ring_setup_indirect_ctx(u32 *regs, + const struct intel_engine_cs *engine, + u32 ctx_bb_ggtt_addr, + u32 size) +{ + GEM_BUG_ON(!size); + GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES)); + GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1); + regs[lrc_ring_indirect_ptr(engine) + 1] = + ctx_bb_ggtt_addr | (size / CACHELINE_BYTES); + + GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1); + regs[lrc_ring_indirect_offset(engine) + 1] = + lrc_ring_indirect_offset_default(engine) << 6; +} + +static u32 intel_context_get_runtime(const struct intel_context *ce) +{ + /* + * We can use either ppHWSP[16] which is recorded before the context + * switch (and so excludes the cost of context switches) or use the + * value from the context image itself, which is saved/restored earlier + * and so includes the cost of the save. + */ + return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]); +} + +static void mark_eio(struct i915_request *rq) +{ + if (i915_request_completed(rq)) + return; + + GEM_BUG_ON(i915_request_signaled(rq)); + + i915_request_set_error_once(rq, -EIO); + i915_request_mark_complete(rq); +} + +static struct i915_request * +active_request(const struct intel_timeline * const tl, struct i915_request *rq) +{ + struct i915_request *active = rq; + + rcu_read_lock(); + list_for_each_entry_continue_reverse(rq, &tl->requests, link) { + if (i915_request_completed(rq)) + break; + + active = rq; + } + rcu_read_unlock(); + + return active; +} + +static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine) +{ + return (i915_ggtt_offset(engine->status_page.vma) + + I915_GEM_HWS_PREEMPT_ADDR); +} + +static inline void +ring_set_paused(const struct intel_engine_cs *engine, int state) +{ + /* + * We inspect HWS_PREEMPT with a semaphore inside + * engine->emit_fini_breadcrumb. If the dword is true, + * the ring is paused as the semaphore will busywait + * until the dword is false. + */ + engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state; + if (state) + wmb(); +} + +static inline struct i915_priolist *to_priolist(struct rb_node *rb) +{ + return rb_entry(rb, struct i915_priolist, node); +} + +static inline int rq_prio(const struct i915_request *rq) +{ + return READ_ONCE(rq->sched.attr.priority); +} + +static int effective_prio(const struct i915_request *rq) +{ + int prio = rq_prio(rq); + + /* + * If this request is special and must not be interrupted at any + * cost, so be it. Note we are only checking the most recent request + * in the context and so may be masking an earlier vip request. It + * is hoped that under the conditions where nopreempt is used, this + * will not matter (i.e. all requests to that context will be + * nopreempt for as long as desired). + */ + if (i915_request_has_nopreempt(rq)) + prio = I915_PRIORITY_UNPREEMPTABLE; + + return prio; +} + +static int queue_prio(const struct intel_engine_execlists *execlists) +{ + struct i915_priolist *p; + struct rb_node *rb; + + rb = rb_first_cached(&execlists->queue); + if (!rb) + return INT_MIN; + + /* + * As the priolist[] are inverted, with the highest priority in [0], + * we have to flip the index value to become priority. + */ + p = to_priolist(rb); + if (!I915_USER_PRIORITY_SHIFT) + return p->priority; + + return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used); +} + +static inline bool need_preempt(const struct intel_engine_cs *engine, + const struct i915_request *rq, + struct rb_node *rb) +{ + int last_prio; + + if (!intel_engine_has_semaphores(engine)) + return false; + + /* + * Check if the current priority hint merits a preemption attempt. + * + * We record the highest value priority we saw during rescheduling + * prior to this dequeue, therefore we know that if it is strictly + * less than the current tail of ESLP[0], we do not need to force + * a preempt-to-idle cycle. + * + * However, the priority hint is a mere hint that we may need to + * preempt. If that hint is stale or we may be trying to preempt + * ourselves, ignore the request. + * + * More naturally we would write + * prio >= max(0, last); + * except that we wish to prevent triggering preemption at the same + * priority level: the task that is running should remain running + * to preserve FIFO ordering of dependencies. + */ + last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1); + if (engine->execlists.queue_priority_hint <= last_prio) + return false; + + /* + * Check against the first request in ELSP[1], it will, thanks to the + * power of PI, be the highest priority of that context. + */ + if (!list_is_last(&rq->sched.link, &engine->active.requests) && + rq_prio(list_next_entry(rq, sched.link)) > last_prio) + return true; + + if (rb) { + struct virtual_engine *ve = + rb_entry(rb, typeof(*ve), nodes[engine->id].rb); + bool preempt = false; + + if (engine == ve->siblings[0]) { /* only preempt one sibling */ + struct i915_request *next; + + rcu_read_lock(); + next = READ_ONCE(ve->request); + if (next) + preempt = rq_prio(next) > last_prio; + rcu_read_unlock(); + } + + if (preempt) + return preempt; + } + + /* + * If the inflight context did not trigger the preemption, then maybe + * it was the set of queued requests? Pick the highest priority in + * the queue (the first active priolist) and see if it deserves to be + * running instead of ELSP[0]. + * + * The highest priority request in the queue can not be either + * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same + * context, it's priority would not exceed ELSP[0] aka last_prio. + */ + return queue_prio(&engine->execlists) > last_prio; +} + +__maybe_unused static inline bool +assert_priority_queue(const struct i915_request *prev, + const struct i915_request *next) +{ + /* + * Without preemption, the prev may refer to the still active element + * which we refuse to let go. + * + * Even with preemption, there are times when we think it is better not + * to preempt and leave an ostensibly lower priority request in flight. + */ + if (i915_request_is_active(prev)) + return true; + + return rq_prio(prev) >= rq_prio(next); +} + +/* + * The context descriptor encodes various attributes of a context, + * including its GTT address and some flags. Because it's fairly + * expensive to calculate, we'll just do it once and cache the result, + * which remains valid until the context is unpinned. + * + * This is what a descriptor looks like, from LSB to MSB:: + * + * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template) + * bits 12-31: LRCA, GTT address of (the HWSP of) this context + * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC) + * bits 53-54: mbz, reserved for use by hardware + * bits 55-63: group ID, currently unused and set to 0 + * + * Starting from Gen11, the upper dword of the descriptor has a new format: + * + * bits 32-36: reserved + * bits 37-47: SW context ID + * bits 48:53: engine instance + * bit 54: mbz, reserved for use by hardware + * bits 55-60: SW counter + * bits 61-63: engine class + * + * engine info, SW context ID and SW counter need to form a unique number + * (Context ID) per lrc. + */ +static u32 +lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine) +{ + u32 desc; + + desc = INTEL_LEGACY_32B_CONTEXT; + if (i915_vm_is_4lvl(ce->vm)) + desc = INTEL_LEGACY_64B_CONTEXT; + desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT; + + desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE; + if (IS_GEN(engine->i915, 8)) + desc |= GEN8_CTX_L3LLC_COHERENT; + + return i915_ggtt_offset(ce->state) | desc; +} + +static inline unsigned int dword_in_page(void *addr) +{ + return offset_in_page(addr) / sizeof(u32); +} + +static void set_offsets(u32 *regs, + const u8 *data, + const struct intel_engine_cs *engine, + bool clear) +#define NOP(x) (BIT(7) | (x)) +#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6))) +#define POSTED BIT(0) +#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200)) +#define REG16(x) \ + (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \ + (((x) >> 2) & 0x7f) +#define END(total_state_size) 0, (total_state_size) +{ + const u32 base = engine->mmio_base; + + while (*data) { + u8 count, flags; + + if (*data & BIT(7)) { /* skip */ + count = *data++ & ~BIT(7); + if (clear) + memset32(regs, MI_NOOP, count); + regs += count; + continue; + } + + count = *data & 0x3f; + flags = *data >> 6; + data++; + + *regs = MI_LOAD_REGISTER_IMM(count); + if (flags & POSTED) + *regs |= MI_LRI_FORCE_POSTED; + if (INTEL_GEN(engine->i915) >= 11) + *regs |= MI_LRI_LRM_CS_MMIO; + regs++; + + GEM_BUG_ON(!count); + do { + u32 offset = 0; + u8 v; + + do { + v = *data++; + offset <<= 7; + offset |= v & ~BIT(7); + } while (v & BIT(7)); + + regs[0] = base + (offset << 2); + if (clear) + regs[1] = 0; + regs += 2; + } while (--count); + } + + if (clear) { + u8 count = *++data; + + /* Clear past the tail for HW access */ + GEM_BUG_ON(dword_in_page(regs) > count); + memset32(regs, MI_NOOP, count - dword_in_page(regs)); + + /* Close the batch; used mainly by live_lrc_layout() */ + *regs = MI_BATCH_BUFFER_END; + if (INTEL_GEN(engine->i915) >= 10) + *regs |= BIT(0); + } +} + +static const u8 gen8_xcs_offsets[] = { + NOP(1), + LRI(11, 0), + REG16(0x244), + REG(0x034), + REG(0x030), + REG(0x038), + REG(0x03c), + REG(0x168), + REG(0x140), + REG(0x110), + REG(0x11c), + REG(0x114), + REG(0x118), + + NOP(9), + LRI(9, 0), + REG16(0x3a8), + REG16(0x28c), + REG16(0x288), + REG16(0x284), + REG16(0x280), + REG16(0x27c), + REG16(0x278), + REG16(0x274), + REG16(0x270), + + NOP(13), + LRI(2, 0), + REG16(0x200), + REG(0x028), + + END(80) +}; + +static const u8 gen9_xcs_offsets[] = { + NOP(1), + LRI(14, POSTED), + REG16(0x244), + REG(0x034), + REG(0x030), + REG(0x038), + REG(0x03c), + REG(0x168), + REG(0x140), + REG(0x110), + REG(0x11c), + REG(0x114), + REG(0x118), + REG(0x1c0), + REG(0x1c4), + REG(0x1c8), + + NOP(3), + LRI(9, POSTED), + REG16(0x3a8), + REG16(0x28c), + REG16(0x288), + REG16(0x284), + REG16(0x280), + REG16(0x27c), + REG16(0x278), + REG16(0x274), + REG16(0x270), + + NOP(13), + LRI(1, POSTED), + REG16(0x200), + + NOP(13), + LRI(44, POSTED), + REG(0x028), + REG(0x09c), + REG(0x0c0), + REG(0x178), + REG(0x17c), + REG16(0x358), + REG(0x170), + REG(0x150), + REG(0x154), + REG(0x158), + REG16(0x41c), + REG16(0x600), + REG16(0x604), + REG16(0x608), + REG16(0x60c), + REG16(0x610), + REG16(0x614), + REG16(0x618), + REG16(0x61c), + REG16(0x620), + REG16(0x624), + REG16(0x628), + REG16(0x62c), + REG16(0x630), + REG16(0x634), + REG16(0x638), + REG16(0x63c), + REG16(0x640), + REG16(0x644), + REG16(0x648), + REG16(0x64c), + REG16(0x650), + REG16(0x654), + REG16(0x658), + REG16(0x65c), + REG16(0x660), + REG16(0x664), + REG16(0x668), + REG16(0x66c), + REG16(0x670), + REG16(0x674), + REG16(0x678), + REG16(0x67c), + REG(0x068), + + END(176) +}; + +static const u8 gen12_xcs_offsets[] = { + NOP(1), + LRI(13, POSTED), + REG16(0x244), + REG(0x034), + REG(0x030), + REG(0x038), + REG(0x03c), + REG(0x168), + REG(0x140), + REG(0x110), + REG(0x1c0), + REG(0x1c4), + REG(0x1c8), + REG(0x180), + REG16(0x2b4), + + NOP(5), + LRI(9, POSTED), + REG16(0x3a8), + REG16(0x28c), + REG16(0x288), + REG16(0x284), + REG16(0x280), + REG16(0x27c), + REG16(0x278), + REG16(0x274), + REG16(0x270), + + END(80) +}; + +static const u8 gen8_rcs_offsets[] = { + NOP(1), + LRI(14, POSTED), + REG16(0x244), + REG(0x034), + REG(0x030), + REG(0x038), + REG(0x03c), + REG(0x168), + REG(0x140), + REG(0x110), + REG(0x11c), + REG(0x114), + REG(0x118), + REG(0x1c0), + REG(0x1c4), + REG(0x1c8), + + NOP(3), + LRI(9, POSTED), + REG16(0x3a8), + REG16(0x28c), + REG16(0x288), + REG16(0x284), + REG16(0x280), + REG16(0x27c), + REG16(0x278), + REG16(0x274), + REG16(0x270), + + NOP(13), + LRI(1, 0), + REG(0x0c8), + + END(80) +}; + +static const u8 gen9_rcs_offsets[] = { + NOP(1), + LRI(14, POSTED), + REG16(0x244), + REG(0x34), + REG(0x30), + REG(0x38), + REG(0x3c), + REG(0x168), + REG(0x140), + REG(0x110), + REG(0x11c), + REG(0x114), + REG(0x118), + REG(0x1c0), + REG(0x1c4), + REG(0x1c8), + + NOP(3), + LRI(9, POSTED), + REG16(0x3a8), + REG16(0x28c), + REG16(0x288), + REG16(0x284), + REG16(0x280), + REG16(0x27c), + REG16(0x278), + REG16(0x274), + REG16(0x270), + + NOP(13), + LRI(1, 0), + REG(0xc8), + + NOP(13), + LRI(44, POSTED), + REG(0x28), + REG(0x9c), + REG(0xc0), + REG(0x178), + REG(0x17c), + REG16(0x358), + REG(0x170), + REG(0x150), + REG(0x154), + REG(0x158), + REG16(0x41c), + REG16(0x600), + REG16(0x604), + REG16(0x608), + REG16(0x60c), + REG16(0x610), + REG16(0x614), + REG16(0x618), + REG16(0x61c), + REG16(0x620), + REG16(0x624), + REG16(0x628), + REG16(0x62c), + REG16(0x630), + REG16(0x634), + REG16(0x638), + REG16(0x63c), + REG16(0x640), + REG16(0x644), + REG16(0x648), + REG16(0x64c), + REG16(0x650), + REG16(0x654), + REG16(0x658), + REG16(0x65c), + REG16(0x660), + REG16(0x664), + REG16(0x668), + REG16(0x66c), + REG16(0x670), + REG16(0x674), + REG16(0x678), + REG16(0x67c), + REG(0x68), + + END(176) +}; + +static const u8 gen11_rcs_offsets[] = { + NOP(1), + LRI(15, POSTED), + REG16(0x244), + REG(0x034), + REG(0x030), + REG(0x038), + REG(0x03c), + REG(0x168), + REG(0x140), + REG(0x110), + REG(0x11c), + REG(0x114), + REG(0x118), + REG(0x1c0), + REG(0x1c4), + REG(0x1c8), + REG(0x180), + + NOP(1), + LRI(9, POSTED), + REG16(0x3a8), + REG16(0x28c), + REG16(0x288), + REG16(0x284), + REG16(0x280), + REG16(0x27c), + REG16(0x278), + REG16(0x274), + REG16(0x270), + + LRI(1, POSTED), + REG(0x1b0), + + NOP(10), + LRI(1, 0), + REG(0x0c8), + + END(80) +}; + +static const u8 gen12_rcs_offsets[] = { + NOP(1), + LRI(13, POSTED), + REG16(0x244), + REG(0x034), + REG(0x030), + REG(0x038), + REG(0x03c), + REG(0x168), + REG(0x140), + REG(0x110), + REG(0x1c0), + REG(0x1c4), + REG(0x1c8), + REG(0x180), + REG16(0x2b4), + + NOP(5), + LRI(9, POSTED), + REG16(0x3a8), + REG16(0x28c), + REG16(0x288), + REG16(0x284), + REG16(0x280), + REG16(0x27c), + REG16(0x278), + REG16(0x274), + REG16(0x270), + + LRI(3, POSTED), + REG(0x1b0), + REG16(0x5a8), + REG16(0x5ac), + + NOP(6), + LRI(1, 0), + REG(0x0c8), + NOP(3 + 9 + 1), + + LRI(51, POSTED), + REG16(0x588), + REG16(0x588), + REG16(0x588), + REG16(0x588), + REG16(0x588), + REG16(0x588), + REG(0x028), + REG(0x09c), + REG(0x0c0), + REG(0x178), + REG(0x17c), + REG16(0x358), + REG(0x170), + REG(0x150), + REG(0x154), + REG(0x158), + REG16(0x41c), + REG16(0x600), + REG16(0x604), + REG16(0x608), + REG16(0x60c), + REG16(0x610), + REG16(0x614), + REG16(0x618), + REG16(0x61c), + REG16(0x620), + REG16(0x624), + REG16(0x628), + REG16(0x62c), + REG16(0x630), + REG16(0x634), + REG16(0x638), + REG16(0x63c), + REG16(0x640), + REG16(0x644), + REG16(0x648), + REG16(0x64c), + REG16(0x650), + REG16(0x654), + REG16(0x658), + REG16(0x65c), + REG16(0x660), + REG16(0x664), + REG16(0x668), + REG16(0x66c), + REG16(0x670), + REG16(0x674), + REG16(0x678), + REG16(0x67c), + REG(0x068), + REG(0x084), + NOP(1), + + END(192) +}; + +#undef END +#undef REG16 +#undef REG +#undef LRI +#undef NOP + +static const u8 *reg_offsets(const struct intel_engine_cs *engine) +{ + /* + * The gen12+ lists only have the registers we program in the basic + * default state. We rely on the context image using relative + * addressing to automatic fixup the register state between the + * physical engines for virtual engine. + */ + GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 && + !intel_engine_has_relative_mmio(engine)); + + if (engine->class == RENDER_CLASS) { + if (INTEL_GEN(engine->i915) >= 12) + return gen12_rcs_offsets; + else if (INTEL_GEN(engine->i915) >= 11) + return gen11_rcs_offsets; + else if (INTEL_GEN(engine->i915) >= 9) + return gen9_rcs_offsets; + else + return gen8_rcs_offsets; + } else { + if (INTEL_GEN(engine->i915) >= 12) + return gen12_xcs_offsets; + else if (INTEL_GEN(engine->i915) >= 9) + return gen9_xcs_offsets; + else + return gen8_xcs_offsets; + } +} + +static struct i915_request * +__unwind_incomplete_requests(struct intel_engine_cs *engine) +{ + struct i915_request *rq, *rn, *active = NULL; + struct list_head *pl; + int prio = I915_PRIORITY_INVALID; + + lockdep_assert_held(&engine->active.lock); + + list_for_each_entry_safe_reverse(rq, rn, + &engine->active.requests, + sched.link) { + if (i915_request_completed(rq)) + continue; /* XXX */ + + __i915_request_unsubmit(rq); + + /* + * Push the request back into the queue for later resubmission. + * If this request is not native to this physical engine (i.e. + * it came from a virtual source), push it back onto the virtual + * engine so that it can be moved across onto another physical + * engine as load dictates. + */ + if (likely(rq->execution_mask == engine->mask)) { + GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID); + if (rq_prio(rq) != prio) { + prio = rq_prio(rq); + pl = i915_sched_lookup_priolist(engine, prio); + } + GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); + + list_move(&rq->sched.link, pl); + set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); + + /* Check in case we rollback so far we wrap [size/2] */ + if (intel_ring_direction(rq->ring, + rq->tail, + rq->ring->tail + 8) > 0) + rq->context->lrc.desc |= CTX_DESC_FORCE_RESTORE; + + active = rq; + } else { + struct intel_engine_cs *owner = rq->context->engine; + + WRITE_ONCE(rq->engine, owner); + owner->submit_request(rq); + active = NULL; + } + } + + return active; +} + +struct i915_request * +execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists) +{ + struct intel_engine_cs *engine = + container_of(execlists, typeof(*engine), execlists); + + return __unwind_incomplete_requests(engine); +} + +static inline void +execlists_context_status_change(struct i915_request *rq, unsigned long status) +{ + /* + * Only used when GVT-g is enabled now. When GVT-g is disabled, + * The compiler should eliminate this function as dead-code. + */ + if (!IS_ENABLED(CONFIG_DRM_I915_GVT)) + return; + + atomic_notifier_call_chain(&rq->engine->context_status_notifier, + status, rq); +} + +static void intel_engine_context_in(struct intel_engine_cs *engine) +{ + unsigned long flags; + + if (atomic_add_unless(&engine->stats.active, 1, 0)) + return; + + write_seqlock_irqsave(&engine->stats.lock, flags); + if (!atomic_add_unless(&engine->stats.active, 1, 0)) { + engine->stats.start = ktime_get(); + atomic_inc(&engine->stats.active); + } + write_sequnlock_irqrestore(&engine->stats.lock, flags); +} + +static void intel_engine_context_out(struct intel_engine_cs *engine) +{ + unsigned long flags; + + GEM_BUG_ON(!atomic_read(&engine->stats.active)); + + if (atomic_add_unless(&engine->stats.active, -1, 1)) + return; + + write_seqlock_irqsave(&engine->stats.lock, flags); + if (atomic_dec_and_test(&engine->stats.active)) { + engine->stats.total = + ktime_add(engine->stats.total, + ktime_sub(ktime_get(), engine->stats.start)); + } + write_sequnlock_irqrestore(&engine->stats.lock, flags); +} + +static void +execlists_check_context(const struct intel_context *ce, + const struct intel_engine_cs *engine) +{ + const struct intel_ring *ring = ce->ring; + u32 *regs = ce->lrc_reg_state; + bool valid = true; + int x; + + if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) { + pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n", + engine->name, + regs[CTX_RING_START], + i915_ggtt_offset(ring->vma)); + regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); + valid = false; + } + + if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) != + (RING_CTL_SIZE(ring->size) | RING_VALID)) { + pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n", + engine->name, + regs[CTX_RING_CTL], + (u32)(RING_CTL_SIZE(ring->size) | RING_VALID)); + regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; + valid = false; + } + + x = lrc_ring_mi_mode(engine); + if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) { + pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n", + engine->name, regs[x + 1]); + regs[x + 1] &= ~STOP_RING; + regs[x + 1] |= STOP_RING << 16; + valid = false; + } + + WARN_ONCE(!valid, "Invalid lrc state found before submission\n"); +} + +static void restore_default_state(struct intel_context *ce, + struct intel_engine_cs *engine) +{ + u32 *regs; + + regs = memset(ce->lrc_reg_state, 0, engine->context_size - PAGE_SIZE); + execlists_init_reg_state(regs, ce, engine, ce->ring, true); + + ce->runtime.last = intel_context_get_runtime(ce); +} + +static void reset_active(struct i915_request *rq, + struct intel_engine_cs *engine) +{ + struct intel_context * const ce = rq->context; + u32 head; + + /* + * The executing context has been cancelled. We want to prevent + * further execution along this context and propagate the error on + * to anything depending on its results. + * + * In __i915_request_submit(), we apply the -EIO and remove the + * requests' payloads for any banned requests. But first, we must + * rewind the context back to the start of the incomplete request so + * that we do not jump back into the middle of the batch. + * + * We preserve the breadcrumbs and semaphores of the incomplete + * requests so that inter-timeline dependencies (i.e other timelines) + * remain correctly ordered. And we defer to __i915_request_submit() + * so that all asynchronous waits are correctly handled. + */ + ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n", + rq->fence.context, rq->fence.seqno); + + /* On resubmission of the active request, payload will be scrubbed */ + if (i915_request_completed(rq)) + head = rq->tail; + else + head = active_request(ce->timeline, rq)->head; + head = intel_ring_wrap(ce->ring, head); + + /* Scrub the context image to prevent replaying the previous batch */ + restore_default_state(ce, engine); + __execlists_update_reg_state(ce, engine, head); + + /* We've switched away, so this should be a no-op, but intent matters */ + ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; +} + +static void st_update_runtime_underflow(struct intel_context *ce, s32 dt) +{ +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) + ce->runtime.num_underflow += dt < 0; + ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt); +#endif +} + +static void intel_context_update_runtime(struct intel_context *ce) +{ + u32 old; + s32 dt; + + if (intel_context_is_barrier(ce)) + return; + + old = ce->runtime.last; + ce->runtime.last = intel_context_get_runtime(ce); + dt = ce->runtime.last - old; + + if (unlikely(dt <= 0)) { + CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n", + old, ce->runtime.last, dt); + st_update_runtime_underflow(ce, dt); + return; + } + + ewma_runtime_add(&ce->runtime.avg, dt); + ce->runtime.total += dt; +} + +static inline struct intel_engine_cs * +__execlists_schedule_in(struct i915_request *rq) +{ + struct intel_engine_cs * const engine = rq->engine; + struct intel_context * const ce = rq->context; + + intel_context_get(ce); + + if (unlikely(intel_context_is_banned(ce))) + reset_active(rq, engine); + + if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) + execlists_check_context(ce, engine); + + if (ce->tag) { + /* Use a fixed tag for OA and friends */ + GEM_BUG_ON(ce->tag <= BITS_PER_LONG); + ce->lrc.ccid = ce->tag; + } else { + /* We don't need a strict matching tag, just different values */ + unsigned int tag = ffs(READ_ONCE(engine->context_tag)); + + GEM_BUG_ON(tag == 0 || tag >= BITS_PER_LONG); + clear_bit(tag - 1, &engine->context_tag); + ce->lrc.ccid = tag << (GEN11_SW_CTX_ID_SHIFT - 32); + + BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID); + } + + ce->lrc.ccid |= engine->execlists.ccid; + + __intel_gt_pm_get(engine->gt); + if (engine->fw_domain && !atomic_fetch_inc(&engine->fw_active)) + intel_uncore_forcewake_get(engine->uncore, engine->fw_domain); + execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN); + intel_engine_context_in(engine); + + return engine; +} + +static inline struct i915_request * +execlists_schedule_in(struct i915_request *rq, int idx) +{ + struct intel_context * const ce = rq->context; + struct intel_engine_cs *old; + + GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine)); + trace_i915_request_in(rq, idx); + + old = READ_ONCE(ce->inflight); + do { + if (!old) { + WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq)); + break; + } + } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old))); + + GEM_BUG_ON(intel_context_inflight(ce) != rq->engine); + return i915_request_get(rq); +} + +static void kick_siblings(struct i915_request *rq, struct intel_context *ce) +{ + struct virtual_engine *ve = container_of(ce, typeof(*ve), context); + struct i915_request *next = READ_ONCE(ve->request); + + if (next == rq || (next && next->execution_mask & ~rq->execution_mask)) + tasklet_hi_schedule(&ve->base.execlists.tasklet); +} + +static inline void +__execlists_schedule_out(struct i915_request *rq, + struct intel_engine_cs * const engine, + unsigned int ccid) +{ + struct intel_context * const ce = rq->context; + + /* + * NB process_csb() is not under the engine->active.lock and hence + * schedule_out can race with schedule_in meaning that we should + * refrain from doing non-trivial work here. + */ + + /* + * If we have just completed this context, the engine may now be + * idle and we want to re-enter powersaving. + */ + if (list_is_last_rcu(&rq->link, &ce->timeline->requests) && + i915_request_completed(rq)) + intel_engine_add_retire(engine, ce->timeline); + + ccid >>= GEN11_SW_CTX_ID_SHIFT - 32; + ccid &= GEN12_MAX_CONTEXT_HW_ID; + if (ccid < BITS_PER_LONG) { + GEM_BUG_ON(ccid == 0); + GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag)); + set_bit(ccid - 1, &engine->context_tag); + } + + intel_context_update_runtime(ce); + intel_engine_context_out(engine); + execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT); + if (engine->fw_domain && !atomic_dec_return(&engine->fw_active)) + intel_uncore_forcewake_put(engine->uncore, engine->fw_domain); + intel_gt_pm_put_async(engine->gt); + + /* + * If this is part of a virtual engine, its next request may + * have been blocked waiting for access to the active context. + * We have to kick all the siblings again in case we need to + * switch (e.g. the next request is not runnable on this + * engine). Hopefully, we will already have submitted the next + * request before the tasklet runs and do not need to rebuild + * each virtual tree and kick everyone again. + */ + if (ce->engine != engine) + kick_siblings(rq, ce); + + intel_context_put(ce); +} + +static inline void +execlists_schedule_out(struct i915_request *rq) +{ + struct intel_context * const ce = rq->context; + struct intel_engine_cs *cur, *old; + u32 ccid; + + trace_i915_request_out(rq); + + ccid = rq->context->lrc.ccid; + old = READ_ONCE(ce->inflight); + do + cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL; + while (!try_cmpxchg(&ce->inflight, &old, cur)); + if (!cur) + __execlists_schedule_out(rq, old, ccid); + + i915_request_put(rq); +} + +static u64 execlists_update_context(struct i915_request *rq) +{ + struct intel_context *ce = rq->context; + u64 desc = ce->lrc.desc; + u32 tail, prev; + + /* + * WaIdleLiteRestore:bdw,skl + * + * We should never submit the context with the same RING_TAIL twice + * just in case we submit an empty ring, which confuses the HW. + * + * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of + * the normal request to be able to always advance the RING_TAIL on + * subsequent resubmissions (for lite restore). Should that fail us, + * and we try and submit the same tail again, force the context + * reload. + * + * If we need to return to a preempted context, we need to skip the + * lite-restore and force it to reload the RING_TAIL. Otherwise, the + * HW has a tendency to ignore us rewinding the TAIL to the end of + * an earlier request. + */ + GEM_BUG_ON(ce->lrc_reg_state[CTX_RING_TAIL] != rq->ring->tail); + prev = rq->ring->tail; + tail = intel_ring_set_tail(rq->ring, rq->tail); + if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0)) + desc |= CTX_DESC_FORCE_RESTORE; + ce->lrc_reg_state[CTX_RING_TAIL] = tail; + rq->tail = rq->wa_tail; + + /* + * Make sure the context image is complete before we submit it to HW. + * + * Ostensibly, writes (including the WCB) should be flushed prior to + * an uncached write such as our mmio register access, the empirical + * evidence (esp. on Braswell) suggests that the WC write into memory + * may not be visible to the HW prior to the completion of the UC + * register write and that we may begin execution from the context + * before its image is complete leading to invalid PD chasing. + */ + wmb(); + + ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE; + return desc; +} + +static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port) +{ + if (execlists->ctrl_reg) { + writel(lower_32_bits(desc), execlists->submit_reg + port * 2); + writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1); + } else { + writel(upper_32_bits(desc), execlists->submit_reg); + writel(lower_32_bits(desc), execlists->submit_reg); + } +} + +static __maybe_unused char * +dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq) +{ + if (!rq) + return ""; + + snprintf(buf, buflen, "%sccid:%x %llx:%lld%s prio %d", + prefix, + rq->context->lrc.ccid, + rq->fence.context, rq->fence.seqno, + i915_request_completed(rq) ? "!" : + i915_request_started(rq) ? "*" : + "", + rq_prio(rq)); + + return buf; +} + +static __maybe_unused void +trace_ports(const struct intel_engine_execlists *execlists, + const char *msg, + struct i915_request * const *ports) +{ + const struct intel_engine_cs *engine = + container_of(execlists, typeof(*engine), execlists); + char __maybe_unused p0[40], p1[40]; + + if (!ports[0]) + return; + + ENGINE_TRACE(engine, "%s { %s%s }\n", msg, + dump_port(p0, sizeof(p0), "", ports[0]), + dump_port(p1, sizeof(p1), ", ", ports[1])); +} + +static inline bool +reset_in_progress(const struct intel_engine_execlists *execlists) +{ + return unlikely(!__tasklet_is_enabled(&execlists->tasklet)); +} + +static __maybe_unused bool +assert_pending_valid(const struct intel_engine_execlists *execlists, + const char *msg) +{ + struct intel_engine_cs *engine = + container_of(execlists, typeof(*engine), execlists); + struct i915_request * const *port, *rq; + struct intel_context *ce = NULL; + bool sentinel = false; + u32 ccid = -1; + + trace_ports(execlists, msg, execlists->pending); + + /* We may be messing around with the lists during reset, lalala */ + if (reset_in_progress(execlists)) + return true; + + if (!execlists->pending[0]) { + GEM_TRACE_ERR("%s: Nothing pending for promotion!\n", + engine->name); + return false; + } + + if (execlists->pending[execlists_num_ports(execlists)]) { + GEM_TRACE_ERR("%s: Excess pending[%d] for promotion!\n", + engine->name, execlists_num_ports(execlists)); + return false; + } + + for (port = execlists->pending; (rq = *port); port++) { + unsigned long flags; + bool ok = true; + + GEM_BUG_ON(!kref_read(&rq->fence.refcount)); + GEM_BUG_ON(!i915_request_is_active(rq)); + + if (ce == rq->context) { + GEM_TRACE_ERR("%s: Dup context:%llx in pending[%zd]\n", + engine->name, + ce->timeline->fence_context, + port - execlists->pending); + return false; + } + ce = rq->context; + + if (ccid == ce->lrc.ccid) { + GEM_TRACE_ERR("%s: Dup ccid:%x context:%llx in pending[%zd]\n", + engine->name, + ccid, ce->timeline->fence_context, + port - execlists->pending); + return false; + } + ccid = ce->lrc.ccid; + + /* + * Sentinels are supposed to be the last request so they flush + * the current execution off the HW. Check that they are the only + * request in the pending submission. + */ + if (sentinel) { + GEM_TRACE_ERR("%s: context:%llx after sentinel in pending[%zd]\n", + engine->name, + ce->timeline->fence_context, + port - execlists->pending); + return false; + } + sentinel = i915_request_has_sentinel(rq); + + /* Hold tightly onto the lock to prevent concurrent retires! */ + if (!spin_trylock_irqsave(&rq->lock, flags)) + continue; + + if (i915_request_completed(rq)) + goto unlock; + + if (i915_active_is_idle(&ce->active) && + !intel_context_is_barrier(ce)) { + GEM_TRACE_ERR("%s: Inactive context:%llx in pending[%zd]\n", + engine->name, + ce->timeline->fence_context, + port - execlists->pending); + ok = false; + goto unlock; + } + + if (!i915_vma_is_pinned(ce->state)) { + GEM_TRACE_ERR("%s: Unpinned context:%llx in pending[%zd]\n", + engine->name, + ce->timeline->fence_context, + port - execlists->pending); + ok = false; + goto unlock; + } + + if (!i915_vma_is_pinned(ce->ring->vma)) { + GEM_TRACE_ERR("%s: Unpinned ring:%llx in pending[%zd]\n", + engine->name, + ce->timeline->fence_context, + port - execlists->pending); + ok = false; + goto unlock; + } + +unlock: + spin_unlock_irqrestore(&rq->lock, flags); + if (!ok) + return false; + } + + return ce; +} + +static void execlists_submit_ports(struct intel_engine_cs *engine) +{ + struct intel_engine_execlists *execlists = &engine->execlists; + unsigned int n; + + GEM_BUG_ON(!assert_pending_valid(execlists, "submit")); + + /* + * We can skip acquiring intel_runtime_pm_get() here as it was taken + * on our behalf by the request (see i915_gem_mark_busy()) and it will + * not be relinquished until the device is idle (see + * i915_gem_idle_work_handler()). As a precaution, we make sure + * that all ELSP are drained i.e. we have processed the CSB, + * before allowing ourselves to idle and calling intel_runtime_pm_put(). + */ + GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); + + /* + * ELSQ note: the submit queue is not cleared after being submitted + * to the HW so we need to make sure we always clean it up. This is + * currently ensured by the fact that we always write the same number + * of elsq entries, keep this in mind before changing the loop below. + */ + for (n = execlists_num_ports(execlists); n--; ) { + struct i915_request *rq = execlists->pending[n]; + + write_desc(execlists, + rq ? execlists_update_context(rq) : 0, + n); + } + + /* we need to manually load the submit queue */ + if (execlists->ctrl_reg) + writel(EL_CTRL_LOAD, execlists->ctrl_reg); +} + +static bool ctx_single_port_submission(const struct intel_context *ce) +{ + return (IS_ENABLED(CONFIG_DRM_I915_GVT) && + intel_context_force_single_submission(ce)); +} + +static bool can_merge_ctx(const struct intel_context *prev, + const struct intel_context *next) +{ + if (prev != next) + return false; + + if (ctx_single_port_submission(prev)) + return false; + + return true; +} + +static unsigned long i915_request_flags(const struct i915_request *rq) +{ + return READ_ONCE(rq->fence.flags); +} + +static bool can_merge_rq(const struct i915_request *prev, + const struct i915_request *next) +{ + GEM_BUG_ON(prev == next); + GEM_BUG_ON(!assert_priority_queue(prev, next)); + + /* + * We do not submit known completed requests. Therefore if the next + * request is already completed, we can pretend to merge it in + * with the previous context (and we will skip updating the ELSP + * and tracking). Thus hopefully keeping the ELSP full with active + * contexts, despite the best efforts of preempt-to-busy to confuse + * us. + */ + if (i915_request_completed(next)) + return true; + + if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) & + (BIT(I915_FENCE_FLAG_NOPREEMPT) | + BIT(I915_FENCE_FLAG_SENTINEL)))) + return false; + + if (!can_merge_ctx(prev->context, next->context)) + return false; + + GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno)); + return true; +} + +static void virtual_update_register_offsets(u32 *regs, + struct intel_engine_cs *engine) +{ + set_offsets(regs, reg_offsets(engine), engine, false); +} + +static bool virtual_matches(const struct virtual_engine *ve, + const struct i915_request *rq, + const struct intel_engine_cs *engine) +{ + const struct intel_engine_cs *inflight; + + if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */ + return false; + + /* + * We track when the HW has completed saving the context image + * (i.e. when we have seen the final CS event switching out of + * the context) and must not overwrite the context image before + * then. This restricts us to only using the active engine + * while the previous virtualized request is inflight (so + * we reuse the register offsets). This is a very small + * hystersis on the greedy seelction algorithm. + */ + inflight = intel_context_inflight(&ve->context); + if (inflight && inflight != engine) + return false; + + return true; +} + +static void virtual_xfer_context(struct virtual_engine *ve, + struct intel_engine_cs *engine) +{ + unsigned int n; + + if (likely(engine == ve->siblings[0])) + return; + + GEM_BUG_ON(READ_ONCE(ve->context.inflight)); + if (!intel_engine_has_relative_mmio(engine)) + virtual_update_register_offsets(ve->context.lrc_reg_state, + engine); + + /* + * Move the bound engine to the top of the list for + * future execution. We then kick this tasklet first + * before checking others, so that we preferentially + * reuse this set of bound registers. + */ + for (n = 1; n < ve->num_siblings; n++) { + if (ve->siblings[n] == engine) { + swap(ve->siblings[n], ve->siblings[0]); + break; + } + } +} + +#define for_each_waiter(p__, rq__) \ + list_for_each_entry_lockless(p__, \ + &(rq__)->sched.waiters_list, \ + wait_link) + +#define for_each_signaler(p__, rq__) \ + list_for_each_entry_rcu(p__, \ + &(rq__)->sched.signalers_list, \ + signal_link) + +static void defer_request(struct i915_request *rq, struct list_head * const pl) +{ + LIST_HEAD(list); + + /* + * We want to move the interrupted request to the back of + * the round-robin list (i.e. its priority level), but + * in doing so, we must then move all requests that were in + * flight and were waiting for the interrupted request to + * be run after it again. + */ + do { + struct i915_dependency *p; + + GEM_BUG_ON(i915_request_is_active(rq)); + list_move_tail(&rq->sched.link, pl); + + for_each_waiter(p, rq) { + struct i915_request *w = + container_of(p->waiter, typeof(*w), sched); + + if (p->flags & I915_DEPENDENCY_WEAK) + continue; + + /* Leave semaphores spinning on the other engines */ + if (w->engine != rq->engine) + continue; + + /* No waiter should start before its signaler */ + GEM_BUG_ON(i915_request_has_initial_breadcrumb(w) && + i915_request_started(w) && + !i915_request_completed(rq)); + + GEM_BUG_ON(i915_request_is_active(w)); + if (!i915_request_is_ready(w)) + continue; + + if (rq_prio(w) < rq_prio(rq)) + continue; + + GEM_BUG_ON(rq_prio(w) > rq_prio(rq)); + list_move_tail(&w->sched.link, &list); + } + + rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); + } while (rq); +} + +static void defer_active(struct intel_engine_cs *engine) +{ + struct i915_request *rq; + + rq = __unwind_incomplete_requests(engine); + if (!rq) + return; + + defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq))); +} + +static bool +need_timeslice(const struct intel_engine_cs *engine, + const struct i915_request *rq, + const struct rb_node *rb) +{ + int hint; + + if (!intel_engine_has_timeslices(engine)) + return false; + + hint = engine->execlists.queue_priority_hint; + + if (rb) { + const struct virtual_engine *ve = + rb_entry(rb, typeof(*ve), nodes[engine->id].rb); + const struct intel_engine_cs *inflight = + intel_context_inflight(&ve->context); + + if (!inflight || inflight == engine) { + struct i915_request *next; + + rcu_read_lock(); + next = READ_ONCE(ve->request); + if (next) + hint = max(hint, rq_prio(next)); + rcu_read_unlock(); + } + } + + if (!list_is_last(&rq->sched.link, &engine->active.requests)) + hint = max(hint, rq_prio(list_next_entry(rq, sched.link))); + + GEM_BUG_ON(hint >= I915_PRIORITY_UNPREEMPTABLE); + return hint >= effective_prio(rq); +} + +static bool +timeslice_yield(const struct intel_engine_execlists *el, + const struct i915_request *rq) +{ + /* + * Once bitten, forever smitten! + * + * If the active context ever busy-waited on a semaphore, + * it will be treated as a hog until the end of its timeslice (i.e. + * until it is scheduled out and replaced by a new submission, + * possibly even its own lite-restore). The HW only sends an interrupt + * on the first miss, and we do know if that semaphore has been + * signaled, or even if it is now stuck on another semaphore. Play + * safe, yield if it might be stuck -- it will be given a fresh + * timeslice in the near future. + */ + return rq->context->lrc.ccid == READ_ONCE(el->yield); +} + +static bool +timeslice_expired(const struct intel_engine_execlists *el, + const struct i915_request *rq) +{ + return timer_expired(&el->timer) || timeslice_yield(el, rq); +} + +static int +switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq) +{ + if (list_is_last(&rq->sched.link, &engine->active.requests)) + return engine->execlists.queue_priority_hint; + + return rq_prio(list_next_entry(rq, sched.link)); +} + +static inline unsigned long +timeslice(const struct intel_engine_cs *engine) +{ + return READ_ONCE(engine->props.timeslice_duration_ms); +} + +static unsigned long active_timeslice(const struct intel_engine_cs *engine) +{ + const struct intel_engine_execlists *execlists = &engine->execlists; + const struct i915_request *rq = *execlists->active; + + if (!rq || i915_request_completed(rq)) + return 0; + + if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq)) + return 0; + + return timeslice(engine); +} + +static void set_timeslice(struct intel_engine_cs *engine) +{ + unsigned long duration; + + if (!intel_engine_has_timeslices(engine)) + return; + + duration = active_timeslice(engine); + ENGINE_TRACE(engine, "bump timeslicing, interval:%lu", duration); + + set_timer_ms(&engine->execlists.timer, duration); +} + +static void start_timeslice(struct intel_engine_cs *engine, int prio) +{ + struct intel_engine_execlists *execlists = &engine->execlists; + unsigned long duration; + + if (!intel_engine_has_timeslices(engine)) + return; + + WRITE_ONCE(execlists->switch_priority_hint, prio); + if (prio == INT_MIN) + return; + + if (timer_pending(&execlists->timer)) + return; + + duration = timeslice(engine); + ENGINE_TRACE(engine, + "start timeslicing, prio:%d, interval:%lu", + prio, duration); + + set_timer_ms(&execlists->timer, duration); +} + +static void record_preemption(struct intel_engine_execlists *execlists) +{ + (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++); +} + +static unsigned long active_preempt_timeout(struct intel_engine_cs *engine, + const struct i915_request *rq) +{ + if (!rq) + return 0; + + /* Force a fast reset for terminated contexts (ignoring sysfs!) */ + if (unlikely(intel_context_is_banned(rq->context))) + return 1; + + return READ_ONCE(engine->props.preempt_timeout_ms); +} + +static void set_preempt_timeout(struct intel_engine_cs *engine, + const struct i915_request *rq) +{ + if (!intel_engine_has_preempt_reset(engine)) + return; + + set_timer_ms(&engine->execlists.preempt, + active_preempt_timeout(engine, rq)); +} + +static inline void clear_ports(struct i915_request **ports, int count) +{ + memset_p((void **)ports, NULL, count); +} + +static inline void +copy_ports(struct i915_request **dst, struct i915_request **src, int count) +{ + /* A memcpy_p() would be very useful here! */ + while (count--) + WRITE_ONCE(*dst++, *src++); /* avoid write tearing */ +} + +static void execlists_dequeue(struct intel_engine_cs *engine) +{ + struct intel_engine_execlists * const execlists = &engine->execlists; + struct i915_request **port = execlists->pending; + struct i915_request ** const last_port = port + execlists->port_mask; + struct i915_request * const *active; + struct i915_request *last; + struct rb_node *rb; + bool submit = false; + + /* + * Hardware submission is through 2 ports. Conceptually each port + * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is + * static for a context, and unique to each, so we only execute + * requests belonging to a single context from each ring. RING_HEAD + * is maintained by the CS in the context image, it marks the place + * where it got up to last time, and through RING_TAIL we tell the CS + * where we want to execute up to this time. + * + * In this list the requests are in order of execution. Consecutive + * requests from the same context are adjacent in the ringbuffer. We + * can combine these requests into a single RING_TAIL update: + * + * RING_HEAD...req1...req2 + * ^- RING_TAIL + * since to execute req2 the CS must first execute req1. + * + * Our goal then is to point each port to the end of a consecutive + * sequence of requests as being the most optimal (fewest wake ups + * and context switches) submission. + */ + + for (rb = rb_first_cached(&execlists->virtual); rb; ) { + struct virtual_engine *ve = + rb_entry(rb, typeof(*ve), nodes[engine->id].rb); + struct i915_request *rq = READ_ONCE(ve->request); + + if (!rq) { /* lazily cleanup after another engine handled rq */ + rb_erase_cached(rb, &execlists->virtual); + RB_CLEAR_NODE(rb); + rb = rb_first_cached(&execlists->virtual); + continue; + } + + if (!virtual_matches(ve, rq, engine)) { + rb = rb_next(rb); + continue; + } + + break; + } + + /* + * If the queue is higher priority than the last + * request in the currently active context, submit afresh. + * We will resubmit again afterwards in case we need to split + * the active context to interject the preemption request, + * i.e. we will retrigger preemption following the ack in case + * of trouble. + */ + active = READ_ONCE(execlists->active); + + /* + * In theory we can skip over completed contexts that have not + * yet been processed by events (as those events are in flight): + * + * while ((last = *active) && i915_request_completed(last)) + * active++; + * + * However, the GPU cannot handle this as it will ultimately + * find itself trying to jump back into a context it has just + * completed and barf. + */ + + if ((last = *active)) { + if (need_preempt(engine, last, rb)) { + if (i915_request_completed(last)) { + tasklet_hi_schedule(&execlists->tasklet); + return; + } + + ENGINE_TRACE(engine, + "preempting last=%llx:%lld, prio=%d, hint=%d\n", + last->fence.context, + last->fence.seqno, + last->sched.attr.priority, + execlists->queue_priority_hint); + record_preemption(execlists); + + /* + * Don't let the RING_HEAD advance past the breadcrumb + * as we unwind (and until we resubmit) so that we do + * not accidentally tell it to go backwards. + */ + ring_set_paused(engine, 1); + + /* + * Note that we have not stopped the GPU at this point, + * so we are unwinding the incomplete requests as they + * remain inflight and so by the time we do complete + * the preemption, some of the unwound requests may + * complete! + */ + __unwind_incomplete_requests(engine); + + last = NULL; + } else if (need_timeslice(engine, last, rb) && + timeslice_expired(execlists, last)) { + if (i915_request_completed(last)) { + tasklet_hi_schedule(&execlists->tasklet); + return; + } + + ENGINE_TRACE(engine, + "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n", + last->fence.context, + last->fence.seqno, + last->sched.attr.priority, + execlists->queue_priority_hint, + yesno(timeslice_yield(execlists, last))); + + ring_set_paused(engine, 1); + defer_active(engine); + + /* + * Unlike for preemption, if we rewind and continue + * executing the same context as previously active, + * the order of execution will remain the same and + * the tail will only advance. We do not need to + * force a full context restore, as a lite-restore + * is sufficient to resample the monotonic TAIL. + * + * If we switch to any other context, similarly we + * will not rewind TAIL of current context, and + * normal save/restore will preserve state and allow + * us to later continue executing the same request. + */ + last = NULL; + } else { + /* + * Otherwise if we already have a request pending + * for execution after the current one, we can + * just wait until the next CS event before + * queuing more. In either case we will force a + * lite-restore preemption event, but if we wait + * we hopefully coalesce several updates into a single + * submission. + */ + if (!list_is_last(&last->sched.link, + &engine->active.requests)) { + /* + * Even if ELSP[1] is occupied and not worthy + * of timeslices, our queue might be. + */ + start_timeslice(engine, queue_prio(execlists)); + return; + } + } + } + + while (rb) { /* XXX virtual is always taking precedence */ + struct virtual_engine *ve = + rb_entry(rb, typeof(*ve), nodes[engine->id].rb); + struct i915_request *rq; + + spin_lock(&ve->base.active.lock); + + rq = ve->request; + if (unlikely(!rq)) { /* lost the race to a sibling */ + spin_unlock(&ve->base.active.lock); + rb_erase_cached(rb, &execlists->virtual); + RB_CLEAR_NODE(rb); + rb = rb_first_cached(&execlists->virtual); + continue; + } + + GEM_BUG_ON(rq != ve->request); + GEM_BUG_ON(rq->engine != &ve->base); + GEM_BUG_ON(rq->context != &ve->context); + + if (rq_prio(rq) >= queue_prio(execlists)) { + if (!virtual_matches(ve, rq, engine)) { + spin_unlock(&ve->base.active.lock); + rb = rb_next(rb); + continue; + } + + if (last && !can_merge_rq(last, rq)) { + spin_unlock(&ve->base.active.lock); + start_timeslice(engine, rq_prio(rq)); + return; /* leave this for another sibling */ + } + + ENGINE_TRACE(engine, + "virtual rq=%llx:%lld%s, new engine? %s\n", + rq->fence.context, + rq->fence.seqno, + i915_request_completed(rq) ? "!" : + i915_request_started(rq) ? "*" : + "", + yesno(engine != ve->siblings[0])); + + WRITE_ONCE(ve->request, NULL); + WRITE_ONCE(ve->base.execlists.queue_priority_hint, + INT_MIN); + rb_erase_cached(rb, &execlists->virtual); + RB_CLEAR_NODE(rb); + + GEM_BUG_ON(!(rq->execution_mask & engine->mask)); + WRITE_ONCE(rq->engine, engine); + + if (__i915_request_submit(rq)) { + /* + * Only after we confirm that we will submit + * this request (i.e. it has not already + * completed), do we want to update the context. + * + * This serves two purposes. It avoids + * unnecessary work if we are resubmitting an + * already completed request after timeslicing. + * But more importantly, it prevents us altering + * ve->siblings[] on an idle context, where + * we may be using ve->siblings[] in + * virtual_context_enter / virtual_context_exit. + */ + virtual_xfer_context(ve, engine); + GEM_BUG_ON(ve->siblings[0] != engine); + + submit = true; + last = rq; + } + i915_request_put(rq); + + /* + * Hmm, we have a bunch of virtual engine requests, + * but the first one was already completed (thanks + * preempt-to-busy!). Keep looking at the veng queue + * until we have no more relevant requests (i.e. + * the normal submit queue has higher priority). + */ + if (!submit) { + spin_unlock(&ve->base.active.lock); + rb = rb_first_cached(&execlists->virtual); + continue; + } + } + + spin_unlock(&ve->base.active.lock); + break; + } + + while ((rb = rb_first_cached(&execlists->queue))) { + struct i915_priolist *p = to_priolist(rb); + struct i915_request *rq, *rn; + int i; + + priolist_for_each_request_consume(rq, rn, p, i) { + bool merge = true; + + /* + * Can we combine this request with the current port? + * It has to be the same context/ringbuffer and not + * have any exceptions (e.g. GVT saying never to + * combine contexts). + * + * If we can combine the requests, we can execute both + * by updating the RING_TAIL to point to the end of the + * second request, and so we never need to tell the + * hardware about the first. + */ + if (last && !can_merge_rq(last, rq)) { + /* + * If we are on the second port and cannot + * combine this request with the last, then we + * are done. + */ + if (port == last_port) + goto done; + + /* + * We must not populate both ELSP[] with the + * same LRCA, i.e. we must submit 2 different + * contexts if we submit 2 ELSP. + */ + if (last->context == rq->context) + goto done; + + if (i915_request_has_sentinel(last)) + goto done; + + /* + * If GVT overrides us we only ever submit + * port[0], leaving port[1] empty. Note that we + * also have to be careful that we don't queue + * the same context (even though a different + * request) to the second port. + */ + if (ctx_single_port_submission(last->context) || + ctx_single_port_submission(rq->context)) + goto done; + + merge = false; + } + + if (__i915_request_submit(rq)) { + if (!merge) { + *port = execlists_schedule_in(last, port - execlists->pending); + port++; + last = NULL; + } + + GEM_BUG_ON(last && + !can_merge_ctx(last->context, + rq->context)); + GEM_BUG_ON(last && + i915_seqno_passed(last->fence.seqno, + rq->fence.seqno)); + + submit = true; + last = rq; + } + } + + rb_erase_cached(&p->node, &execlists->queue); + i915_priolist_free(p); + } + +done: + /* + * Here be a bit of magic! Or sleight-of-hand, whichever you prefer. + * + * We choose the priority hint such that if we add a request of greater + * priority than this, we kick the submission tasklet to decide on + * the right order of submitting the requests to hardware. We must + * also be prepared to reorder requests as they are in-flight on the + * HW. We derive the priority hint then as the first "hole" in + * the HW submission ports and if there are no available slots, + * the priority of the lowest executing request, i.e. last. + * + * When we do receive a higher priority request ready to run from the + * user, see queue_request(), the priority hint is bumped to that + * request triggering preemption on the next dequeue (or subsequent + * interrupt for secondary ports). + */ + execlists->queue_priority_hint = queue_prio(execlists); + + if (submit) { + *port = execlists_schedule_in(last, port - execlists->pending); + execlists->switch_priority_hint = + switch_prio(engine, *execlists->pending); + + /* + * Skip if we ended up with exactly the same set of requests, + * e.g. trying to timeslice a pair of ordered contexts + */ + if (!memcmp(active, execlists->pending, + (port - execlists->pending + 1) * sizeof(*port))) { + do + execlists_schedule_out(fetch_and_zero(port)); + while (port-- != execlists->pending); + + goto skip_submit; + } + clear_ports(port + 1, last_port - port); + + WRITE_ONCE(execlists->yield, -1); + set_preempt_timeout(engine, *active); + execlists_submit_ports(engine); + } else { + start_timeslice(engine, execlists->queue_priority_hint); +skip_submit: + ring_set_paused(engine, 0); + } +} + +static void +cancel_port_requests(struct intel_engine_execlists * const execlists) +{ + struct i915_request * const *port; + + for (port = execlists->pending; *port; port++) + execlists_schedule_out(*port); + clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending)); + + /* Mark the end of active before we overwrite *active */ + for (port = xchg(&execlists->active, execlists->pending); *port; port++) + execlists_schedule_out(*port); + clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight)); + + smp_wmb(); /* complete the seqlock for execlists_active() */ + WRITE_ONCE(execlists->active, execlists->inflight); +} + +static inline void +invalidate_csb_entries(const u64 *first, const u64 *last) +{ + clflush((void *)first); + clflush((void *)last); +} + +/* + * Starting with Gen12, the status has a new format: + * + * bit 0: switched to new queue + * bit 1: reserved + * bit 2: semaphore wait mode (poll or signal), only valid when + * switch detail is set to "wait on semaphore" + * bits 3-5: engine class + * bits 6-11: engine instance + * bits 12-14: reserved + * bits 15-25: sw context id of the lrc the GT switched to + * bits 26-31: sw counter of the lrc the GT switched to + * bits 32-35: context switch detail + * - 0: ctx complete + * - 1: wait on sync flip + * - 2: wait on vblank + * - 3: wait on scanline + * - 4: wait on semaphore + * - 5: context preempted (not on SEMAPHORE_WAIT or + * WAIT_FOR_EVENT) + * bit 36: reserved + * bits 37-43: wait detail (for switch detail 1 to 4) + * bits 44-46: reserved + * bits 47-57: sw context id of the lrc the GT switched away from + * bits 58-63: sw counter of the lrc the GT switched away from + */ +static inline bool gen12_csb_parse(const u64 *csb) +{ + bool ctx_away_valid; + bool new_queue; + u64 entry; + + /* HSD#22011248461 */ + entry = READ_ONCE(*csb); + if (unlikely(entry == -1)) { + preempt_disable(); + if (wait_for_atomic_us((entry = READ_ONCE(*csb)) != -1, 50)) + GEM_WARN_ON("50us CSB timeout"); + preempt_enable(); + } + WRITE_ONCE(*(u64 *)csb, -1); + + ctx_away_valid = GEN12_CSB_CTX_VALID(upper_32_bits(entry)); + new_queue = + lower_32_bits(entry) & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE; + + /* + * The context switch detail is not guaranteed to be 5 when a preemption + * occurs, so we can't just check for that. The check below works for + * all the cases we care about, including preemptions of WAIT + * instructions and lite-restore. Preempt-to-idle via the CTRL register + * would require some extra handling, but we don't support that. + */ + if (!ctx_away_valid || new_queue) { + GEM_BUG_ON(!GEN12_CSB_CTX_VALID(lower_32_bits(entry))); + return true; + } + + /* + * switch detail = 5 is covered by the case above and we do not expect a + * context switch on an unsuccessful wait instruction since we always + * use polling mode. + */ + GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_32_bits(entry))); + return false; +} + +static inline bool gen8_csb_parse(const u64 *csb) +{ + return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED); +} + +static void process_csb(struct intel_engine_cs *engine) +{ + struct intel_engine_execlists * const execlists = &engine->execlists; + const u64 * const buf = execlists->csb_status; + const u8 num_entries = execlists->csb_size; + u8 head, tail; + + /* + * As we modify our execlists state tracking we require exclusive + * access. Either we are inside the tasklet, or the tasklet is disabled + * and we assume that is only inside the reset paths and so serialised. + */ + GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) && + !reset_in_progress(execlists)); + GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine)); + + /* + * Note that csb_write, csb_status may be either in HWSP or mmio. + * When reading from the csb_write mmio register, we have to be + * careful to only use the GEN8_CSB_WRITE_PTR portion, which is + * the low 4bits. As it happens we know the next 4bits are always + * zero and so we can simply masked off the low u8 of the register + * and treat it identically to reading from the HWSP (without having + * to use explicit shifting and masking, and probably bifurcating + * the code to handle the legacy mmio read). + */ + head = execlists->csb_head; + tail = READ_ONCE(*execlists->csb_write); + if (unlikely(head == tail)) + return; + + /* + * We will consume all events from HW, or at least pretend to. + * + * The sequence of events from the HW is deterministic, and derived + * from our writes to the ELSP, with a smidgen of variability for + * the arrival of the asynchronous requests wrt to the inflight + * execution. If the HW sends an event that does not correspond with + * the one we are expecting, we have to abandon all hope as we lose + * all tracking of what the engine is actually executing. We will + * only detect we are out of sequence with the HW when we get an + * 'impossible' event because we have already drained our own + * preemption/promotion queue. If this occurs, we know that we likely + * lost track of execution earlier and must unwind and restart, the + * simplest way is by stop processing the event queue and force the + * engine to reset. + */ + execlists->csb_head = tail; + ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail); + + /* + * Hopefully paired with a wmb() in HW! + * + * We must complete the read of the write pointer before any reads + * from the CSB, so that we do not see stale values. Without an rmb + * (lfence) the HW may speculatively perform the CSB[] reads *before* + * we perform the READ_ONCE(*csb_write). + */ + rmb(); + do { + bool promote; + + if (++head == num_entries) + head = 0; + + /* + * We are flying near dragons again. + * + * We hold a reference to the request in execlist_port[] + * but no more than that. We are operating in softirq + * context and so cannot hold any mutex or sleep. That + * prevents us stopping the requests we are processing + * in port[] from being retired simultaneously (the + * breadcrumb will be complete before we see the + * context-switch). As we only hold the reference to the + * request, any pointer chasing underneath the request + * is subject to a potential use-after-free. Thus we + * store all of the bookkeeping within port[] as + * required, and avoid using unguarded pointers beneath + * request itself. The same applies to the atomic + * status notifier. + */ + + ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n", + head, + upper_32_bits(buf[head]), + lower_32_bits(buf[head])); + + if (INTEL_GEN(engine->i915) >= 12) + promote = gen12_csb_parse(buf + head); + else + promote = gen8_csb_parse(buf + head); + if (promote) { + struct i915_request * const *old = execlists->active; + + if (GEM_WARN_ON(!*execlists->pending)) { + execlists->error_interrupt |= ERROR_CSB; + break; + } + + ring_set_paused(engine, 0); + + /* Point active to the new ELSP; prevent overwriting */ + WRITE_ONCE(execlists->active, execlists->pending); + smp_wmb(); /* notify execlists_active() */ + + /* cancel old inflight, prepare for switch */ + trace_ports(execlists, "preempted", old); + while (*old) + execlists_schedule_out(*old++); + + /* switch pending to inflight */ + GEM_BUG_ON(!assert_pending_valid(execlists, "promote")); + copy_ports(execlists->inflight, + execlists->pending, + execlists_num_ports(execlists)); + smp_wmb(); /* complete the seqlock */ + WRITE_ONCE(execlists->active, execlists->inflight); + + /* XXX Magic delay for tgl */ + ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); + + WRITE_ONCE(execlists->pending[0], NULL); + } else { + if (GEM_WARN_ON(!*execlists->active)) { + execlists->error_interrupt |= ERROR_CSB; + break; + } + + /* port0 completed, advanced to port1 */ + trace_ports(execlists, "completed", execlists->active); + + /* + * We rely on the hardware being strongly + * ordered, that the breadcrumb write is + * coherent (visible from the CPU) before the + * user interrupt is processed. One might assume + * that the breadcrumb write being before the + * user interrupt and the CS event for the context + * switch would therefore be before the CS event + * itself... + */ + if (GEM_SHOW_DEBUG() && + !i915_request_completed(*execlists->active)) { + struct i915_request *rq = *execlists->active; + const u32 *regs __maybe_unused = + rq->context->lrc_reg_state; + + ENGINE_TRACE(engine, + "context completed before request!\n"); + ENGINE_TRACE(engine, + "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n", + ENGINE_READ(engine, RING_START), + ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR, + ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR, + ENGINE_READ(engine, RING_CTL), + ENGINE_READ(engine, RING_MI_MODE)); + ENGINE_TRACE(engine, + "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ", + i915_ggtt_offset(rq->ring->vma), + rq->head, rq->tail, + rq->fence.context, + lower_32_bits(rq->fence.seqno), + hwsp_seqno(rq)); + ENGINE_TRACE(engine, + "ctx:{start:%08x, head:%04x, tail:%04x}, ", + regs[CTX_RING_START], + regs[CTX_RING_HEAD], + regs[CTX_RING_TAIL]); + } + + execlists_schedule_out(*execlists->active++); + + GEM_BUG_ON(execlists->active - execlists->inflight > + execlists_num_ports(execlists)); + } + } while (head != tail); + + set_timeslice(engine); + + /* + * Gen11 has proven to fail wrt global observation point between + * entry and tail update, failing on the ordering and thus + * we see an old entry in the context status buffer. + * + * Forcibly evict out entries for the next gpu csb update, + * to increase the odds that we get a fresh entries with non + * working hardware. The cost for doing so comes out mostly with + * the wash as hardware, working or not, will need to do the + * invalidation before. + */ + invalidate_csb_entries(&buf[0], &buf[num_entries - 1]); +} + +static void __execlists_submission_tasklet(struct intel_engine_cs *const engine) +{ + lockdep_assert_held(&engine->active.lock); + if (!READ_ONCE(engine->execlists.pending[0])) { + rcu_read_lock(); /* protect peeking at execlists->active */ + execlists_dequeue(engine); + rcu_read_unlock(); + } +} + +static void __execlists_hold(struct i915_request *rq) +{ + LIST_HEAD(list); + + do { + struct i915_dependency *p; + + if (i915_request_is_active(rq)) + __i915_request_unsubmit(rq); + + clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); + list_move_tail(&rq->sched.link, &rq->engine->active.hold); + i915_request_set_hold(rq); + RQ_TRACE(rq, "on hold\n"); + + for_each_waiter(p, rq) { + struct i915_request *w = + container_of(p->waiter, typeof(*w), sched); + + /* Leave semaphores spinning on the other engines */ + if (w->engine != rq->engine) + continue; + + if (!i915_request_is_ready(w)) + continue; + + if (i915_request_completed(w)) + continue; + + if (i915_request_on_hold(w)) + continue; + + list_move_tail(&w->sched.link, &list); + } + + rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); + } while (rq); +} + +static bool execlists_hold(struct intel_engine_cs *engine, + struct i915_request *rq) +{ + if (i915_request_on_hold(rq)) + return false; + + spin_lock_irq(&engine->active.lock); + + if (i915_request_completed(rq)) { /* too late! */ + rq = NULL; + goto unlock; + } + + if (rq->engine != engine) { /* preempted virtual engine */ + struct virtual_engine *ve = to_virtual_engine(rq->engine); + + /* + * intel_context_inflight() is only protected by virtue + * of process_csb() being called only by the tasklet (or + * directly from inside reset while the tasklet is suspended). + * Assert that neither of those are allowed to run while we + * poke at the request queues. + */ + GEM_BUG_ON(!reset_in_progress(&engine->execlists)); + + /* + * An unsubmitted request along a virtual engine will + * remain on the active (this) engine until we are able + * to process the context switch away (and so mark the + * context as no longer in flight). That cannot have happened + * yet, otherwise we would not be hanging! + */ + spin_lock(&ve->base.active.lock); + GEM_BUG_ON(intel_context_inflight(rq->context) != engine); + GEM_BUG_ON(ve->request != rq); + ve->request = NULL; + spin_unlock(&ve->base.active.lock); + i915_request_put(rq); + + rq->engine = engine; + } + + /* + * Transfer this request onto the hold queue to prevent it + * being resumbitted to HW (and potentially completed) before we have + * released it. Since we may have already submitted following + * requests, we need to remove those as well. + */ + GEM_BUG_ON(i915_request_on_hold(rq)); + GEM_BUG_ON(rq->engine != engine); + __execlists_hold(rq); + GEM_BUG_ON(list_empty(&engine->active.hold)); + +unlock: + spin_unlock_irq(&engine->active.lock); + return rq; +} + +static bool hold_request(const struct i915_request *rq) +{ + struct i915_dependency *p; + bool result = false; + + /* + * If one of our ancestors is on hold, we must also be on hold, + * otherwise we will bypass it and execute before it. + */ + rcu_read_lock(); + for_each_signaler(p, rq) { + const struct i915_request *s = + container_of(p->signaler, typeof(*s), sched); + + if (s->engine != rq->engine) + continue; + + result = i915_request_on_hold(s); + if (result) + break; + } + rcu_read_unlock(); + + return result; +} + +static void __execlists_unhold(struct i915_request *rq) +{ + LIST_HEAD(list); + + do { + struct i915_dependency *p; + + RQ_TRACE(rq, "hold release\n"); + + GEM_BUG_ON(!i915_request_on_hold(rq)); + GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit)); + + i915_request_clear_hold(rq); + list_move_tail(&rq->sched.link, + i915_sched_lookup_priolist(rq->engine, + rq_prio(rq))); + set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); + + /* Also release any children on this engine that are ready */ + for_each_waiter(p, rq) { + struct i915_request *w = + container_of(p->waiter, typeof(*w), sched); + + /* Propagate any change in error status */ + if (rq->fence.error) + i915_request_set_error_once(w, rq->fence.error); + + if (w->engine != rq->engine) + continue; + + if (!i915_request_on_hold(w)) + continue; + + /* Check that no other parents are also on hold */ + if (hold_request(w)) + continue; + + list_move_tail(&w->sched.link, &list); + } + + rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); + } while (rq); +} + +static void execlists_unhold(struct intel_engine_cs *engine, + struct i915_request *rq) +{ + spin_lock_irq(&engine->active.lock); + + /* + * Move this request back to the priority queue, and all of its + * children and grandchildren that were suspended along with it. + */ + __execlists_unhold(rq); + + if (rq_prio(rq) > engine->execlists.queue_priority_hint) { + engine->execlists.queue_priority_hint = rq_prio(rq); + tasklet_hi_schedule(&engine->execlists.tasklet); + } + + spin_unlock_irq(&engine->active.lock); +} + +struct execlists_capture { + struct work_struct work; + struct i915_request *rq; + struct i915_gpu_coredump *error; +}; + +static void execlists_capture_work(struct work_struct *work) +{ + struct execlists_capture *cap = container_of(work, typeof(*cap), work); + const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN; + struct intel_engine_cs *engine = cap->rq->engine; + struct intel_gt_coredump *gt = cap->error->gt; + struct intel_engine_capture_vma *vma; + + /* Compress all the objects attached to the request, slow! */ + vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp); + if (vma) { + struct i915_vma_compress *compress = + i915_vma_capture_prepare(gt); + + intel_engine_coredump_add_vma(gt->engine, vma, compress); + i915_vma_capture_finish(gt, compress); + } + + gt->simulated = gt->engine->simulated; + cap->error->simulated = gt->simulated; + + /* Publish the error state, and announce it to the world */ + i915_error_state_store(cap->error); + i915_gpu_coredump_put(cap->error); + + /* Return this request and all that depend upon it for signaling */ + execlists_unhold(engine, cap->rq); + i915_request_put(cap->rq); + + kfree(cap); +} + +static struct execlists_capture *capture_regs(struct intel_engine_cs *engine) +{ + const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; + struct execlists_capture *cap; + + cap = kmalloc(sizeof(*cap), gfp); + if (!cap) + return NULL; + + cap->error = i915_gpu_coredump_alloc(engine->i915, gfp); + if (!cap->error) + goto err_cap; + + cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp); + if (!cap->error->gt) + goto err_gpu; + + cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp); + if (!cap->error->gt->engine) + goto err_gt; + + return cap; + +err_gt: + kfree(cap->error->gt); +err_gpu: + kfree(cap->error); +err_cap: + kfree(cap); + return NULL; +} + +static struct i915_request * +active_context(struct intel_engine_cs *engine, u32 ccid) +{ + const struct intel_engine_execlists * const el = &engine->execlists; + struct i915_request * const *port, *rq; + + /* + * Use the most recent result from process_csb(), but just in case + * we trigger an error (via interrupt) before the first CS event has + * been written, peek at the next submission. + */ + + for (port = el->active; (rq = *port); port++) { + if (rq->context->lrc.ccid == ccid) { + ENGINE_TRACE(engine, + "ccid found at active:%zd\n", + port - el->active); + return rq; + } + } + + for (port = el->pending; (rq = *port); port++) { + if (rq->context->lrc.ccid == ccid) { + ENGINE_TRACE(engine, + "ccid found at pending:%zd\n", + port - el->pending); + return rq; + } + } + + ENGINE_TRACE(engine, "ccid:%x not found\n", ccid); + return NULL; +} + +static u32 active_ccid(struct intel_engine_cs *engine) +{ + return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI); +} + +static void execlists_capture(struct intel_engine_cs *engine) +{ + struct execlists_capture *cap; + + if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)) + return; + + /* + * We need to _quickly_ capture the engine state before we reset. + * We are inside an atomic section (softirq) here and we are delaying + * the forced preemption event. + */ + cap = capture_regs(engine); + if (!cap) + return; + + spin_lock_irq(&engine->active.lock); + cap->rq = active_context(engine, active_ccid(engine)); + if (cap->rq) { + cap->rq = active_request(cap->rq->context->timeline, cap->rq); + cap->rq = i915_request_get_rcu(cap->rq); + } + spin_unlock_irq(&engine->active.lock); + if (!cap->rq) + goto err_free; + + /* + * Remove the request from the execlists queue, and take ownership + * of the request. We pass it to our worker who will _slowly_ compress + * all the pages the _user_ requested for debugging their batch, after + * which we return it to the queue for signaling. + * + * By removing them from the execlists queue, we also remove the + * requests from being processed by __unwind_incomplete_requests() + * during the intel_engine_reset(), and so they will *not* be replayed + * afterwards. + * + * Note that because we have not yet reset the engine at this point, + * it is possible for the request that we have identified as being + * guilty, did in fact complete and we will then hit an arbitration + * point allowing the outstanding preemption to succeed. The likelihood + * of that is very low (as capturing of the engine registers should be + * fast enough to run inside an irq-off atomic section!), so we will + * simply hold that request accountable for being non-preemptible + * long enough to force the reset. + */ + if (!execlists_hold(engine, cap->rq)) + goto err_rq; + + INIT_WORK(&cap->work, execlists_capture_work); + schedule_work(&cap->work); + return; + +err_rq: + i915_request_put(cap->rq); +err_free: + i915_gpu_coredump_put(cap->error); + kfree(cap); +} + +static void execlists_reset(struct intel_engine_cs *engine, const char *msg) +{ + const unsigned int bit = I915_RESET_ENGINE + engine->id; + unsigned long *lock = &engine->gt->reset.flags; + + if (!intel_has_reset_engine(engine->gt)) + return; + + if (test_and_set_bit(bit, lock)) + return; + + ENGINE_TRACE(engine, "reset for %s\n", msg); + + /* Mark this tasklet as disabled to avoid waiting for it to complete */ + tasklet_disable_nosync(&engine->execlists.tasklet); + + ring_set_paused(engine, 1); /* Freeze the current request in place */ + execlists_capture(engine); + intel_engine_reset(engine, msg); + + tasklet_enable(&engine->execlists.tasklet); + clear_and_wake_up_bit(bit, lock); +} + +static bool preempt_timeout(const struct intel_engine_cs *const engine) +{ + const struct timer_list *t = &engine->execlists.preempt; + + if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT) + return false; + + if (!timer_expired(t)) + return false; + + return READ_ONCE(engine->execlists.pending[0]); +} + +/* + * Check the unread Context Status Buffers and manage the submission of new + * contexts to the ELSP accordingly. + */ +static void execlists_submission_tasklet(unsigned long data) +{ + struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; + bool timeout = preempt_timeout(engine); + + process_csb(engine); + + if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) { + const char *msg; + + /* Generate the error message in priority wrt to the user! */ + if (engine->execlists.error_interrupt & GENMASK(15, 0)) + msg = "CS error"; /* thrown by a user payload */ + else if (engine->execlists.error_interrupt & ERROR_CSB) + msg = "invalid CSB event"; + else + msg = "internal error"; + + engine->execlists.error_interrupt = 0; + execlists_reset(engine, msg); + } + + if (!READ_ONCE(engine->execlists.pending[0]) || timeout) { + unsigned long flags; + + spin_lock_irqsave(&engine->active.lock, flags); + __execlists_submission_tasklet(engine); + spin_unlock_irqrestore(&engine->active.lock, flags); + + /* Recheck after serialising with direct-submission */ + if (unlikely(timeout && preempt_timeout(engine))) { + cancel_timer(&engine->execlists.preempt); + execlists_reset(engine, "preemption time out"); + } + } +} + +static void __execlists_kick(struct intel_engine_execlists *execlists) +{ + /* Kick the tasklet for some interrupt coalescing and reset handling */ + tasklet_hi_schedule(&execlists->tasklet); +} + +#define execlists_kick(t, member) \ + __execlists_kick(container_of(t, struct intel_engine_execlists, member)) + +static void execlists_timeslice(struct timer_list *timer) +{ + execlists_kick(timer, timer); +} + +static void execlists_preempt(struct timer_list *timer) +{ + execlists_kick(timer, preempt); +} + +static void queue_request(struct intel_engine_cs *engine, + struct i915_request *rq) +{ + GEM_BUG_ON(!list_empty(&rq->sched.link)); + list_add_tail(&rq->sched.link, + i915_sched_lookup_priolist(engine, rq_prio(rq))); + set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); +} + +static void __submit_queue_imm(struct intel_engine_cs *engine) +{ + struct intel_engine_execlists * const execlists = &engine->execlists; + + if (reset_in_progress(execlists)) + return; /* defer until we restart the engine following reset */ + + __execlists_submission_tasklet(engine); +} + +static void submit_queue(struct intel_engine_cs *engine, + const struct i915_request *rq) +{ + struct intel_engine_execlists *execlists = &engine->execlists; + + if (rq_prio(rq) <= execlists->queue_priority_hint) + return; + + execlists->queue_priority_hint = rq_prio(rq); + __submit_queue_imm(engine); +} + +static bool ancestor_on_hold(const struct intel_engine_cs *engine, + const struct i915_request *rq) +{ + GEM_BUG_ON(i915_request_on_hold(rq)); + return !list_empty(&engine->active.hold) && hold_request(rq); +} + +static void flush_csb(struct intel_engine_cs *engine) +{ + struct intel_engine_execlists *el = &engine->execlists; + + if (READ_ONCE(el->pending[0]) && tasklet_trylock(&el->tasklet)) { + if (!reset_in_progress(el)) + process_csb(engine); + tasklet_unlock(&el->tasklet); + } +} + +static void execlists_submit_request(struct i915_request *request) +{ + struct intel_engine_cs *engine = request->engine; + unsigned long flags; + + /* Hopefully we clear execlists->pending[] to let us through */ + flush_csb(engine); + + /* Will be called from irq-context when using foreign fences. */ + spin_lock_irqsave(&engine->active.lock, flags); + + if (unlikely(ancestor_on_hold(engine, request))) { + RQ_TRACE(request, "ancestor on hold\n"); + list_add_tail(&request->sched.link, &engine->active.hold); + i915_request_set_hold(request); + } else { + queue_request(engine, request); + + GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); + GEM_BUG_ON(list_empty(&request->sched.link)); + + submit_queue(engine, request); + } + + spin_unlock_irqrestore(&engine->active.lock, flags); +} + +static void __execlists_context_fini(struct intel_context *ce) +{ + intel_ring_put(ce->ring); + i915_vma_put(ce->state); +} + +static void execlists_context_destroy(struct kref *kref) +{ + struct intel_context *ce = container_of(kref, typeof(*ce), ref); + + GEM_BUG_ON(!i915_active_is_idle(&ce->active)); + GEM_BUG_ON(intel_context_is_pinned(ce)); + + if (ce->state) + __execlists_context_fini(ce); + + intel_context_fini(ce); + intel_context_free(ce); +} + +static void +set_redzone(void *vaddr, const struct intel_engine_cs *engine) +{ + if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) + return; + + vaddr += engine->context_size; + + memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE); +} + +static void +check_redzone(const void *vaddr, const struct intel_engine_cs *engine) +{ + if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) + return; + + vaddr += engine->context_size; + + if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE)) + drm_err_once(&engine->i915->drm, + "%s context redzone overwritten!\n", + engine->name); +} + +static void execlists_context_unpin(struct intel_context *ce) +{ + check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET, + ce->engine); +} + +static void execlists_context_post_unpin(struct intel_context *ce) +{ + i915_gem_object_unpin_map(ce->state->obj); +} + +static u32 * +gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs) +{ + *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | + MI_SRM_LRM_GLOBAL_GTT | + MI_LRI_LRM_CS_MMIO; + *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); + *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + + CTX_TIMESTAMP * sizeof(u32); + *cs++ = 0; + + *cs++ = MI_LOAD_REGISTER_REG | + MI_LRR_SOURCE_CS_MMIO | + MI_LRI_LRM_CS_MMIO; + *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); + *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0)); + + *cs++ = MI_LOAD_REGISTER_REG | + MI_LRR_SOURCE_CS_MMIO | + MI_LRI_LRM_CS_MMIO; + *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); + *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0)); + + return cs; +} + +static u32 * +gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs) +{ + GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1); + + *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | + MI_SRM_LRM_GLOBAL_GTT | + MI_LRI_LRM_CS_MMIO; + *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); + *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + + (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32); + *cs++ = 0; + + return cs; +} + +static u32 * +gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs) +{ + GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1); + + *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | + MI_SRM_LRM_GLOBAL_GTT | + MI_LRI_LRM_CS_MMIO; + *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); + *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + + (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32); + *cs++ = 0; + + *cs++ = MI_LOAD_REGISTER_REG | + MI_LRR_SOURCE_CS_MMIO | + MI_LRI_LRM_CS_MMIO; + *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); + *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0)); + + return cs; +} + +static u32 * +gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs) +{ + cs = gen12_emit_timestamp_wa(ce, cs); + cs = gen12_emit_cmd_buf_wa(ce, cs); + cs = gen12_emit_restore_scratch(ce, cs); + + return cs; +} + +static u32 * +gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs) +{ + cs = gen12_emit_timestamp_wa(ce, cs); + cs = gen12_emit_restore_scratch(ce, cs); + + return cs; +} + +static inline u32 context_wa_bb_offset(const struct intel_context *ce) +{ + return PAGE_SIZE * ce->wa_bb_page; +} + +static u32 *context_indirect_bb(const struct intel_context *ce) +{ + void *ptr; + + GEM_BUG_ON(!ce->wa_bb_page); + + ptr = ce->lrc_reg_state; + ptr -= LRC_STATE_OFFSET; /* back to start of context image */ + ptr += context_wa_bb_offset(ce); + + return ptr; +} + +static void +setup_indirect_ctx_bb(const struct intel_context *ce, + const struct intel_engine_cs *engine, + u32 *(*emit)(const struct intel_context *, u32 *)) +{ + u32 * const start = context_indirect_bb(ce); + u32 *cs; + + cs = emit(ce, start); + GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs)); + while ((unsigned long)cs % CACHELINE_BYTES) + *cs++ = MI_NOOP; + + lrc_ring_setup_indirect_ctx(ce->lrc_reg_state, engine, + i915_ggtt_offset(ce->state) + + context_wa_bb_offset(ce), + (cs - start) * sizeof(*cs)); +} + +static void +__execlists_update_reg_state(const struct intel_context *ce, + const struct intel_engine_cs *engine, + u32 head) +{ + struct intel_ring *ring = ce->ring; + u32 *regs = ce->lrc_reg_state; + + GEM_BUG_ON(!intel_ring_offset_valid(ring, head)); + GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail)); + + regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); + regs[CTX_RING_HEAD] = head; + regs[CTX_RING_TAIL] = ring->tail; + regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; + + /* RPCS */ + if (engine->class == RENDER_CLASS) { + regs[CTX_R_PWR_CLK_STATE] = + intel_sseu_make_rpcs(engine->gt, &ce->sseu); + + i915_oa_init_reg_state(ce, engine); + } + + if (ce->wa_bb_page) { + u32 *(*fn)(const struct intel_context *ce, u32 *cs); + + fn = gen12_emit_indirect_ctx_xcs; + if (ce->engine->class == RENDER_CLASS) + fn = gen12_emit_indirect_ctx_rcs; + + /* Mutually exclusive wrt to global indirect bb */ + GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size); + setup_indirect_ctx_bb(ce, engine, fn); + } +} + +static int +execlists_context_pre_pin(struct intel_context *ce, + struct i915_gem_ww_ctx *ww, void **vaddr) +{ + GEM_BUG_ON(!ce->state); + GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); + + *vaddr = i915_gem_object_pin_map(ce->state->obj, + i915_coherent_map_type(ce->engine->i915) | + I915_MAP_OVERRIDE); + + return PTR_ERR_OR_ZERO(*vaddr); +} + +static int +__execlists_context_pin(struct intel_context *ce, + struct intel_engine_cs *engine, + void *vaddr) +{ + ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE; + ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET; + __execlists_update_reg_state(ce, engine, ce->ring->tail); + + return 0; +} + +static int execlists_context_pin(struct intel_context *ce, void *vaddr) +{ + return __execlists_context_pin(ce, ce->engine, vaddr); +} + +static int execlists_context_alloc(struct intel_context *ce) +{ + return __execlists_context_alloc(ce, ce->engine); +} + +static void execlists_context_reset(struct intel_context *ce) +{ + CE_TRACE(ce, "reset\n"); + GEM_BUG_ON(!intel_context_is_pinned(ce)); + + intel_ring_reset(ce->ring, ce->ring->emit); + + /* Scrub away the garbage */ + execlists_init_reg_state(ce->lrc_reg_state, + ce, ce->engine, ce->ring, true); + __execlists_update_reg_state(ce, ce->engine, ce->ring->tail); + + ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; +} + +static const struct intel_context_ops execlists_context_ops = { + .alloc = execlists_context_alloc, + + .pre_pin = execlists_context_pre_pin, + .pin = execlists_context_pin, + .unpin = execlists_context_unpin, + .post_unpin = execlists_context_post_unpin, + + .enter = intel_context_enter_engine, + .exit = intel_context_exit_engine, + + .reset = execlists_context_reset, + .destroy = execlists_context_destroy, +}; + +static u32 hwsp_offset(const struct i915_request *rq) +{ + const struct intel_timeline_cacheline *cl; + + /* Before the request is executed, the timeline/cachline is fixed */ + + cl = rcu_dereference_protected(rq->hwsp_cacheline, 1); + if (cl) + return cl->ggtt_offset; + + return rcu_dereference_protected(rq->timeline, 1)->hwsp_offset; +} + +static int gen8_emit_init_breadcrumb(struct i915_request *rq) +{ + u32 *cs; + + GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq)); + if (!i915_request_timeline(rq)->has_initial_breadcrumb) + return 0; + + cs = intel_ring_begin(rq, 6); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + /* + * Check if we have been preempted before we even get started. + * + * After this point i915_request_started() reports true, even if + * we get preempted and so are no longer running. + */ + *cs++ = MI_ARB_CHECK; + *cs++ = MI_NOOP; + + *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; + *cs++ = hwsp_offset(rq); + *cs++ = 0; + *cs++ = rq->fence.seqno - 1; + + intel_ring_advance(rq, cs); + + /* Record the updated position of the request's payload */ + rq->infix = intel_ring_offset(rq, cs); + + __set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags); + + return 0; +} + +static int emit_pdps(struct i915_request *rq) +{ + const struct intel_engine_cs * const engine = rq->engine; + struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->context->vm); + int err, i; + u32 *cs; + + GEM_BUG_ON(intel_vgpu_active(rq->engine->i915)); + + /* + * Beware ye of the dragons, this sequence is magic! + * + * Small changes to this sequence can cause anything from + * GPU hangs to forcewake errors and machine lockups! + */ + + /* Flush any residual operations from the context load */ + err = engine->emit_flush(rq, EMIT_FLUSH); + if (err) + return err; + + /* Magic required to prevent forcewake errors! */ + err = engine->emit_flush(rq, EMIT_INVALIDATE); + if (err) + return err; + + cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + /* Ensure the LRI have landed before we invalidate & continue */ + *cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED; + for (i = GEN8_3LVL_PDPES; i--; ) { + const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i); + u32 base = engine->mmio_base; + + *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i)); + *cs++ = upper_32_bits(pd_daddr); + *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i)); + *cs++ = lower_32_bits(pd_daddr); + } + *cs++ = MI_NOOP; + + intel_ring_advance(rq, cs); + + return 0; +} + +static int execlists_request_alloc(struct i915_request *request) +{ + int ret; + + GEM_BUG_ON(!intel_context_is_pinned(request->context)); + + /* + * Flush enough space to reduce the likelihood of waiting after + * we start building the request - in which case we will just + * have to repeat work. + */ + request->reserved_space += EXECLISTS_REQUEST_SIZE; + + /* + * Note that after this point, we have committed to using + * this request as it is being used to both track the + * state of engine initialisation and liveness of the + * golden renderstate above. Think twice before you try + * to cancel/unwind this request now. + */ + + if (!i915_vm_is_4lvl(request->context->vm)) { + ret = emit_pdps(request); + if (ret) + return ret; + } + + /* Unconditionally invalidate GPU caches and TLBs. */ + ret = request->engine->emit_flush(request, EMIT_INVALIDATE); + if (ret) + return ret; + + request->reserved_space -= EXECLISTS_REQUEST_SIZE; + return 0; +} + +/* + * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after + * PIPE_CONTROL instruction. This is required for the flush to happen correctly + * but there is a slight complication as this is applied in WA batch where the + * values are only initialized once so we cannot take register value at the + * beginning and reuse it further; hence we save its value to memory, upload a + * constant value with bit21 set and then we restore it back with the saved value. + * To simplify the WA, a constant value is formed by using the default value + * of this register. This shouldn't be a problem because we are only modifying + * it for a short period and this batch in non-premptible. We can ofcourse + * use additional instructions that read the actual value of the register + * at that time and set our bit of interest but it makes the WA complicated. + * + * This WA is also required for Gen9 so extracting as a function avoids + * code duplication. + */ +static u32 * +gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch) +{ + /* NB no one else is allowed to scribble over scratch + 256! */ + *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; + *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); + *batch++ = intel_gt_scratch_offset(engine->gt, + INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); + *batch++ = 0; + + *batch++ = MI_LOAD_REGISTER_IMM(1); + *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); + *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES; + + batch = gen8_emit_pipe_control(batch, + PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_DC_FLUSH_ENABLE, + 0); + + *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; + *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); + *batch++ = intel_gt_scratch_offset(engine->gt, + INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); + *batch++ = 0; + + return batch; +} + +/* + * Typically we only have one indirect_ctx and per_ctx batch buffer which are + * initialized at the beginning and shared across all contexts but this field + * helps us to have multiple batches at different offsets and select them based + * on a criteria. At the moment this batch always start at the beginning of the page + * and at this point we don't have multiple wa_ctx batch buffers. + * + * The number of WA applied are not known at the beginning; we use this field + * to return the no of DWORDS written. + * + * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END + * so it adds NOOPs as padding to make it cacheline aligned. + * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together + * makes a complete batch buffer. + */ +static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) +{ + /* WaDisableCtxRestoreArbitration:bdw,chv */ + *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; + + /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */ + if (IS_BROADWELL(engine->i915)) + batch = gen8_emit_flush_coherentl3_wa(engine, batch); + + /* WaClearSlmSpaceAtContextSwitch:bdw,chv */ + /* Actual scratch location is at 128 bytes offset */ + batch = gen8_emit_pipe_control(batch, + PIPE_CONTROL_FLUSH_L3 | + PIPE_CONTROL_STORE_DATA_INDEX | + PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_QW_WRITE, + LRC_PPHWSP_SCRATCH_ADDR); + + *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + + /* Pad to end of cacheline */ + while ((unsigned long)batch % CACHELINE_BYTES) + *batch++ = MI_NOOP; + + /* + * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because + * execution depends on the length specified in terms of cache lines + * in the register CTX_RCS_INDIRECT_CTX + */ + + return batch; +} + +struct lri { + i915_reg_t reg; + u32 value; +}; + +static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count) +{ + GEM_BUG_ON(!count || count > 63); + + *batch++ = MI_LOAD_REGISTER_IMM(count); + do { + *batch++ = i915_mmio_reg_offset(lri->reg); + *batch++ = lri->value; + } while (lri++, --count); + *batch++ = MI_NOOP; + + return batch; +} + +static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) +{ + static const struct lri lri[] = { + /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */ + { + COMMON_SLICE_CHICKEN2, + __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE, + 0), + }, + + /* BSpec: 11391 */ + { + FF_SLICE_CHICKEN, + __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX, + FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX), + }, + + /* BSpec: 11299 */ + { + _3D_CHICKEN3, + __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX, + _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX), + } + }; + + *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; + + /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */ + batch = gen8_emit_flush_coherentl3_wa(engine, batch); + + /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */ + batch = gen8_emit_pipe_control(batch, + PIPE_CONTROL_FLUSH_L3 | + PIPE_CONTROL_STORE_DATA_INDEX | + PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_QW_WRITE, + LRC_PPHWSP_SCRATCH_ADDR); + + batch = emit_lri(batch, lri, ARRAY_SIZE(lri)); + + /* WaMediaPoolStateCmdInWABB:bxt,glk */ + if (HAS_POOLED_EU(engine->i915)) { + /* + * EU pool configuration is setup along with golden context + * during context initialization. This value depends on + * device type (2x6 or 3x6) and needs to be updated based + * on which subslice is disabled especially for 2x6 + * devices, however it is safe to load default + * configuration of 3x6 device instead of masking off + * corresponding bits because HW ignores bits of a disabled + * subslice and drops down to appropriate config. Please + * see render_state_setup() in i915_gem_render_state.c for + * possible configurations, to avoid duplication they are + * not shown here again. + */ + *batch++ = GEN9_MEDIA_POOL_STATE; + *batch++ = GEN9_MEDIA_POOL_ENABLE; + *batch++ = 0x00777000; + *batch++ = 0; + *batch++ = 0; + *batch++ = 0; + } + + *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + + /* Pad to end of cacheline */ + while ((unsigned long)batch % CACHELINE_BYTES) + *batch++ = MI_NOOP; + + return batch; +} + +static u32 * +gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) +{ + int i; + + /* + * WaPipeControlBefore3DStateSamplePattern: cnl + * + * Ensure the engine is idle prior to programming a + * 3DSTATE_SAMPLE_PATTERN during a context restore. + */ + batch = gen8_emit_pipe_control(batch, + PIPE_CONTROL_CS_STALL, + 0); + /* + * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for + * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in + * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is + * confusing. Since gen8_emit_pipe_control() already advances the + * batch by 6 dwords, we advance the other 10 here, completing a + * cacheline. It's not clear if the workaround requires this padding + * before other commands, or if it's just the regular padding we would + * already have for the workaround bb, so leave it here for now. + */ + for (i = 0; i < 10; i++) + *batch++ = MI_NOOP; + + /* Pad to end of cacheline */ + while ((unsigned long)batch % CACHELINE_BYTES) + *batch++ = MI_NOOP; + + return batch; +} + +#define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE) + +static int lrc_setup_wa_ctx(struct intel_engine_cs *engine) +{ + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + int err; + + obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE); + if (IS_ERR(obj)) + return PTR_ERR(obj); + + vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL); + if (IS_ERR(vma)) { + err = PTR_ERR(vma); + goto err; + } + + err = i915_ggtt_pin(vma, NULL, 0, PIN_HIGH); + if (err) + goto err; + + engine->wa_ctx.vma = vma; + return 0; + +err: + i915_gem_object_put(obj); + return err; +} + +static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine) +{ + i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0); + + /* Called on error unwind, clear all flags to prevent further use */ + memset(&engine->wa_ctx, 0, sizeof(engine->wa_ctx)); +} + +typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch); + +static int intel_init_workaround_bb(struct intel_engine_cs *engine) +{ + struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; + struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx, + &wa_ctx->per_ctx }; + wa_bb_func_t wa_bb_fn[2]; + void *batch, *batch_ptr; + unsigned int i; + int ret; + + if (engine->class != RENDER_CLASS) + return 0; + + switch (INTEL_GEN(engine->i915)) { + case 12: + case 11: + return 0; + case 10: + wa_bb_fn[0] = gen10_init_indirectctx_bb; + wa_bb_fn[1] = NULL; + break; + case 9: + wa_bb_fn[0] = gen9_init_indirectctx_bb; + wa_bb_fn[1] = NULL; + break; + case 8: + wa_bb_fn[0] = gen8_init_indirectctx_bb; + wa_bb_fn[1] = NULL; + break; + default: + MISSING_CASE(INTEL_GEN(engine->i915)); + return 0; + } + + ret = lrc_setup_wa_ctx(engine); + if (ret) { + drm_dbg(&engine->i915->drm, + "Failed to setup context WA page: %d\n", ret); + return ret; + } + + batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB); + + /* + * Emit the two workaround batch buffers, recording the offset from the + * start of the workaround batch buffer object for each and their + * respective sizes. + */ + batch_ptr = batch; + for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) { + wa_bb[i]->offset = batch_ptr - batch; + if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset, + CACHELINE_BYTES))) { + ret = -EINVAL; + break; + } + if (wa_bb_fn[i]) + batch_ptr = wa_bb_fn[i](engine, batch_ptr); + wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset); + } + GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE); + + __i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch); + __i915_gem_object_release_map(wa_ctx->vma->obj); + if (ret) + lrc_destroy_wa_ctx(engine); + + return ret; +} + +static void reset_csb_pointers(struct intel_engine_cs *engine) +{ + struct intel_engine_execlists * const execlists = &engine->execlists; + const unsigned int reset_value = execlists->csb_size - 1; + + ring_set_paused(engine, 0); + + /* + * Sometimes Icelake forgets to reset its pointers on a GPU reset. + * Bludgeon them with a mmio update to be sure. + */ + ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR, + 0xffff << 16 | reset_value << 8 | reset_value); + ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); + + /* + * After a reset, the HW starts writing into CSB entry [0]. We + * therefore have to set our HEAD pointer back one entry so that + * the *first* entry we check is entry 0. To complicate this further, + * as we don't wait for the first interrupt after reset, we have to + * fake the HW write to point back to the last entry so that our + * inline comparison of our cached head position against the last HW + * write works even before the first interrupt. + */ + execlists->csb_head = reset_value; + WRITE_ONCE(*execlists->csb_write, reset_value); + wmb(); /* Make sure this is visible to HW (paranoia?) */ + + /* Check that the GPU does indeed update the CSB entries! */ + memset(execlists->csb_status, -1, (reset_value + 1) * sizeof(u64)); + invalidate_csb_entries(&execlists->csb_status[0], + &execlists->csb_status[reset_value]); + + /* Once more for luck and our trusty paranoia */ + ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR, + 0xffff << 16 | reset_value << 8 | reset_value); + ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); + + GEM_BUG_ON(READ_ONCE(*execlists->csb_write) != reset_value); +} + +static void execlists_sanitize(struct intel_engine_cs *engine) +{ + /* + * Poison residual state on resume, in case the suspend didn't! + * + * We have to assume that across suspend/resume (or other loss + * of control) that the contents of our pinned buffers has been + * lost, replaced by garbage. Since this doesn't always happen, + * let's poison such state so that we more quickly spot when + * we falsely assume it has been preserved. + */ + if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) + memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE); + + reset_csb_pointers(engine); + + /* + * The kernel_context HWSP is stored in the status_page. As above, + * that may be lost on resume/initialisation, and so we need to + * reset the value in the HWSP. + */ + intel_timeline_reset_seqno(engine->kernel_context->timeline); + + /* And scrub the dirty cachelines for the HWSP */ + clflush_cache_range(engine->status_page.addr, PAGE_SIZE); +} + +static void enable_error_interrupt(struct intel_engine_cs *engine) +{ + u32 status; + + engine->execlists.error_interrupt = 0; + ENGINE_WRITE(engine, RING_EMR, ~0u); + ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */ + + status = ENGINE_READ(engine, RING_ESR); + if (unlikely(status)) { + drm_err(&engine->i915->drm, + "engine '%s' resumed still in error: %08x\n", + engine->name, status); + __intel_gt_reset(engine->gt, engine->mask); + } + + /* + * On current gen8+, we have 2 signals to play with + * + * - I915_ERROR_INSTUCTION (bit 0) + * + * Generate an error if the command parser encounters an invalid + * instruction + * + * This is a fatal error. + * + * - CP_PRIV (bit 2) + * + * Generate an error on privilege violation (where the CP replaces + * the instruction with a no-op). This also fires for writes into + * read-only scratch pages. + * + * This is a non-fatal error, parsing continues. + * + * * there are a few others defined for odd HW that we do not use + * + * Since CP_PRIV fires for cases where we have chosen to ignore the + * error (as the HW is validating and suppressing the mistakes), we + * only unmask the instruction error bit. + */ + ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION); +} + +static void enable_execlists(struct intel_engine_cs *engine) +{ + u32 mode; + + assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL); + + intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */ + + if (INTEL_GEN(engine->i915) >= 11) + mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE); + else + mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE); + ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode); + + ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING)); + + ENGINE_WRITE_FW(engine, + RING_HWS_PGA, + i915_ggtt_offset(engine->status_page.vma)); + ENGINE_POSTING_READ(engine, RING_HWS_PGA); + + enable_error_interrupt(engine); + + engine->context_tag = GENMASK(BITS_PER_LONG - 2, 0); +} + +static bool unexpected_starting_state(struct intel_engine_cs *engine) +{ + bool unexpected = false; + + if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) { + drm_dbg(&engine->i915->drm, + "STOP_RING still set in RING_MI_MODE\n"); + unexpected = true; + } + + return unexpected; +} + +static int execlists_resume(struct intel_engine_cs *engine) +{ + intel_mocs_init_engine(engine); + + intel_breadcrumbs_reset(engine->breadcrumbs); + + if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) { + struct drm_printer p = drm_debug_printer(__func__); + + intel_engine_dump(engine, &p, NULL); + } + + enable_execlists(engine); + + return 0; +} + +static void execlists_reset_prepare(struct intel_engine_cs *engine) +{ + struct intel_engine_execlists * const execlists = &engine->execlists; + unsigned long flags; + + ENGINE_TRACE(engine, "depth<-%d\n", + atomic_read(&execlists->tasklet.count)); + + /* + * Prevent request submission to the hardware until we have + * completed the reset in i915_gem_reset_finish(). If a request + * is completed by one engine, it may then queue a request + * to a second via its execlists->tasklet *just* as we are + * calling engine->resume() and also writing the ELSP. + * Turning off the execlists->tasklet until the reset is over + * prevents the race. + */ + __tasklet_disable_sync_once(&execlists->tasklet); + GEM_BUG_ON(!reset_in_progress(execlists)); + + /* And flush any current direct submission. */ + spin_lock_irqsave(&engine->active.lock, flags); + spin_unlock_irqrestore(&engine->active.lock, flags); + + /* + * We stop engines, otherwise we might get failed reset and a + * dead gpu (on elk). Also as modern gpu as kbl can suffer + * from system hang if batchbuffer is progressing when + * the reset is issued, regardless of READY_TO_RESET ack. + * Thus assume it is best to stop engines on all gens + * where we have a gpu reset. + * + * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES) + * + * FIXME: Wa for more modern gens needs to be validated + */ + ring_set_paused(engine, 1); + intel_engine_stop_cs(engine); + + engine->execlists.reset_ccid = active_ccid(engine); +} + +static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine) +{ + int x; + + x = lrc_ring_mi_mode(engine); + if (x != -1) { + regs[x + 1] &= ~STOP_RING; + regs[x + 1] |= STOP_RING << 16; + } +} + +static void __execlists_reset_reg_state(const struct intel_context *ce, + const struct intel_engine_cs *engine) +{ + u32 *regs = ce->lrc_reg_state; + + __reset_stop_ring(regs, engine); +} + +static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) +{ + struct intel_engine_execlists * const execlists = &engine->execlists; + struct intel_context *ce; + struct i915_request *rq; + u32 head; + + mb(); /* paranoia: read the CSB pointers from after the reset */ + clflush(execlists->csb_write); + mb(); + + process_csb(engine); /* drain preemption events */ + + /* Following the reset, we need to reload the CSB read/write pointers */ + reset_csb_pointers(engine); + + /* + * Save the currently executing context, even if we completed + * its request, it was still running at the time of the + * reset and will have been clobbered. + */ + rq = active_context(engine, engine->execlists.reset_ccid); + if (!rq) + goto unwind; + + ce = rq->context; + GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); + + if (i915_request_completed(rq)) { + /* Idle context; tidy up the ring so we can restart afresh */ + head = intel_ring_wrap(ce->ring, rq->tail); + goto out_replay; + } + + /* We still have requests in-flight; the engine should be active */ + GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); + + /* Context has requests still in-flight; it should not be idle! */ + GEM_BUG_ON(i915_active_is_idle(&ce->active)); + + rq = active_request(ce->timeline, rq); + head = intel_ring_wrap(ce->ring, rq->head); + GEM_BUG_ON(head == ce->ring->tail); + + /* + * If this request hasn't started yet, e.g. it is waiting on a + * semaphore, we need to avoid skipping the request or else we + * break the signaling chain. However, if the context is corrupt + * the request will not restart and we will be stuck with a wedged + * device. It is quite often the case that if we issue a reset + * while the GPU is loading the context image, that the context + * image becomes corrupt. + * + * Otherwise, if we have not started yet, the request should replay + * perfectly and we do not need to flag the result as being erroneous. + */ + if (!i915_request_started(rq)) + goto out_replay; + + /* + * If the request was innocent, we leave the request in the ELSP + * and will try to replay it on restarting. The context image may + * have been corrupted by the reset, in which case we may have + * to service a new GPU hang, but more likely we can continue on + * without impact. + * + * If the request was guilty, we presume the context is corrupt + * and have to at least restore the RING register in the context + * image back to the expected values to skip over the guilty request. + */ + __i915_request_reset(rq, stalled); + + /* + * We want a simple context + ring to execute the breadcrumb update. + * We cannot rely on the context being intact across the GPU hang, + * so clear it and rebuild just what we need for the breadcrumb. + * All pending requests for this context will be zapped, and any + * future request will be after userspace has had the opportunity + * to recreate its own state. + */ +out_replay: + ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n", + head, ce->ring->tail); + __execlists_reset_reg_state(ce, engine); + __execlists_update_reg_state(ce, engine, head); + ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */ + +unwind: + /* Push back any incomplete requests for replay after the reset. */ + cancel_port_requests(execlists); + __unwind_incomplete_requests(engine); +} + +static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled) +{ + unsigned long flags; + + ENGINE_TRACE(engine, "\n"); + + spin_lock_irqsave(&engine->active.lock, flags); + + __execlists_reset(engine, stalled); + + spin_unlock_irqrestore(&engine->active.lock, flags); +} + +static void nop_submission_tasklet(unsigned long data) +{ + struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; + + /* The driver is wedged; don't process any more events. */ + WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN); +} + +static void execlists_reset_cancel(struct intel_engine_cs *engine) +{ + struct intel_engine_execlists * const execlists = &engine->execlists; + struct i915_request *rq, *rn; + struct rb_node *rb; + unsigned long flags; + + ENGINE_TRACE(engine, "\n"); + + /* + * Before we call engine->cancel_requests(), we should have exclusive + * access to the submission state. This is arranged for us by the + * caller disabling the interrupt generation, the tasklet and other + * threads that may then access the same state, giving us a free hand + * to reset state. However, we still need to let lockdep be aware that + * we know this state may be accessed in hardirq context, so we + * disable the irq around this manipulation and we want to keep + * the spinlock focused on its duties and not accidentally conflate + * coverage to the submission's irq state. (Similarly, although we + * shouldn't need to disable irq around the manipulation of the + * submission's irq state, we also wish to remind ourselves that + * it is irq state.) + */ + spin_lock_irqsave(&engine->active.lock, flags); + + __execlists_reset(engine, true); + + /* Mark all executing requests as skipped. */ + list_for_each_entry(rq, &engine->active.requests, sched.link) + mark_eio(rq); + + /* Flush the queued requests to the timeline list (for retiring). */ + while ((rb = rb_first_cached(&execlists->queue))) { + struct i915_priolist *p = to_priolist(rb); + int i; + + priolist_for_each_request_consume(rq, rn, p, i) { + mark_eio(rq); + __i915_request_submit(rq); + } + + rb_erase_cached(&p->node, &execlists->queue); + i915_priolist_free(p); + } + + /* On-hold requests will be flushed to timeline upon their release */ + list_for_each_entry(rq, &engine->active.hold, sched.link) + mark_eio(rq); + + /* Cancel all attached virtual engines */ + while ((rb = rb_first_cached(&execlists->virtual))) { + struct virtual_engine *ve = + rb_entry(rb, typeof(*ve), nodes[engine->id].rb); + + rb_erase_cached(rb, &execlists->virtual); + RB_CLEAR_NODE(rb); + + spin_lock(&ve->base.active.lock); + rq = fetch_and_zero(&ve->request); + if (rq) { + mark_eio(rq); + + rq->engine = engine; + __i915_request_submit(rq); + i915_request_put(rq); + + ve->base.execlists.queue_priority_hint = INT_MIN; + } + spin_unlock(&ve->base.active.lock); + } + + /* Remaining _unready_ requests will be nop'ed when submitted */ + + execlists->queue_priority_hint = INT_MIN; + execlists->queue = RB_ROOT_CACHED; + + GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet)); + execlists->tasklet.func = nop_submission_tasklet; + + spin_unlock_irqrestore(&engine->active.lock, flags); +} + +static void execlists_reset_finish(struct intel_engine_cs *engine) +{ + struct intel_engine_execlists * const execlists = &engine->execlists; + + /* + * After a GPU reset, we may have requests to replay. Do so now while + * we still have the forcewake to be sure that the GPU is not allowed + * to sleep before we restart and reload a context. + */ + GEM_BUG_ON(!reset_in_progress(execlists)); + if (!RB_EMPTY_ROOT(&execlists->queue.rb_root)) + execlists->tasklet.func(execlists->tasklet.data); + + if (__tasklet_enable(&execlists->tasklet)) + /* And kick in case we missed a new request submission. */ + tasklet_hi_schedule(&execlists->tasklet); + ENGINE_TRACE(engine, "depth->%d\n", + atomic_read(&execlists->tasklet.count)); +} + +static int gen8_emit_bb_start_noarb(struct i915_request *rq, + u64 offset, u32 len, + const unsigned int flags) +{ + u32 *cs; + + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + /* + * WaDisableCtxRestoreArbitration:bdw,chv + * + * We don't need to perform MI_ARB_ENABLE as often as we do (in + * particular all the gen that do not need the w/a at all!), if we + * took care to make sure that on every switch into this context + * (both ordinary and for preemption) that arbitrartion was enabled + * we would be fine. However, for gen8 there is another w/a that + * requires us to not preempt inside GPGPU execution, so we keep + * arbitration disabled for gen8 batches. Arbitration will be + * re-enabled before we close the request + * (engine->emit_fini_breadcrumb). + */ + *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; + + /* FIXME(BDW+): Address space and security selectors. */ + *cs++ = MI_BATCH_BUFFER_START_GEN8 | + (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); + *cs++ = lower_32_bits(offset); + *cs++ = upper_32_bits(offset); + + intel_ring_advance(rq, cs); + + return 0; +} + +static int gen8_emit_bb_start(struct i915_request *rq, + u64 offset, u32 len, + const unsigned int flags) +{ + u32 *cs; + + cs = intel_ring_begin(rq, 6); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + + *cs++ = MI_BATCH_BUFFER_START_GEN8 | + (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); + *cs++ = lower_32_bits(offset); + *cs++ = upper_32_bits(offset); + + *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; + *cs++ = MI_NOOP; + + intel_ring_advance(rq, cs); + + return 0; +} + +static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine) +{ + ENGINE_WRITE(engine, RING_IMR, + ~(engine->irq_enable_mask | engine->irq_keep_mask)); + ENGINE_POSTING_READ(engine, RING_IMR); +} + +static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine) +{ + ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask); +} + +static int gen8_emit_flush(struct i915_request *request, u32 mode) +{ + u32 cmd, *cs; + + cs = intel_ring_begin(request, 4); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + cmd = MI_FLUSH_DW + 1; + + /* We always require a command barrier so that subsequent + * commands, such as breadcrumb interrupts, are strictly ordered + * wrt the contents of the write cache being flushed to memory + * (and thus being coherent from the CPU). + */ + cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; + + if (mode & EMIT_INVALIDATE) { + cmd |= MI_INVALIDATE_TLB; + if (request->engine->class == VIDEO_DECODE_CLASS) + cmd |= MI_INVALIDATE_BSD; + } + + *cs++ = cmd; + *cs++ = LRC_PPHWSP_SCRATCH_ADDR; + *cs++ = 0; /* upper addr */ + *cs++ = 0; /* value */ + intel_ring_advance(request, cs); + + return 0; +} + +static int gen8_emit_flush_render(struct i915_request *request, + u32 mode) +{ + bool vf_flush_wa = false, dc_flush_wa = false; + u32 *cs, flags = 0; + int len; + + flags |= PIPE_CONTROL_CS_STALL; + + if (mode & EMIT_FLUSH) { + flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; + flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; + flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; + flags |= PIPE_CONTROL_FLUSH_ENABLE; + } + + if (mode & EMIT_INVALIDATE) { + flags |= PIPE_CONTROL_TLB_INVALIDATE; + flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_QW_WRITE; + flags |= PIPE_CONTROL_STORE_DATA_INDEX; + + /* + * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL + * pipe control. + */ + if (IS_GEN(request->engine->i915, 9)) + vf_flush_wa = true; + + /* WaForGAMHang:kbl */ + if (IS_KBL_GT_REVID(request->engine->i915, 0, KBL_REVID_B0)) + dc_flush_wa = true; + } + + len = 6; + + if (vf_flush_wa) + len += 6; + + if (dc_flush_wa) + len += 12; + + cs = intel_ring_begin(request, len); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + if (vf_flush_wa) + cs = gen8_emit_pipe_control(cs, 0, 0); + + if (dc_flush_wa) + cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE, + 0); + + cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); + + if (dc_flush_wa) + cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0); + + intel_ring_advance(request, cs); + + return 0; +} + +static int gen11_emit_flush_render(struct i915_request *request, + u32 mode) +{ + if (mode & EMIT_FLUSH) { + u32 *cs; + u32 flags = 0; + + flags |= PIPE_CONTROL_CS_STALL; + + flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; + flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; + flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; + flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; + flags |= PIPE_CONTROL_FLUSH_ENABLE; + flags |= PIPE_CONTROL_QW_WRITE; + flags |= PIPE_CONTROL_STORE_DATA_INDEX; + + cs = intel_ring_begin(request, 6); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); + intel_ring_advance(request, cs); + } + + if (mode & EMIT_INVALIDATE) { + u32 *cs; + u32 flags = 0; + + flags |= PIPE_CONTROL_CS_STALL; + + flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_TLB_INVALIDATE; + flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_QW_WRITE; + flags |= PIPE_CONTROL_STORE_DATA_INDEX; + + cs = intel_ring_begin(request, 6); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); + intel_ring_advance(request, cs); + } + + return 0; +} + +static u32 preparser_disable(bool state) +{ + return MI_ARB_CHECK | 1 << 8 | state; +} + +static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine) +{ + static const i915_reg_t vd[] = { + GEN12_VD0_AUX_NV, + GEN12_VD1_AUX_NV, + GEN12_VD2_AUX_NV, + GEN12_VD3_AUX_NV, + }; + + static const i915_reg_t ve[] = { + GEN12_VE0_AUX_NV, + GEN12_VE1_AUX_NV, + }; + + if (engine->class == VIDEO_DECODE_CLASS) + return vd[engine->instance]; + + if (engine->class == VIDEO_ENHANCEMENT_CLASS) + return ve[engine->instance]; + + GEM_BUG_ON("unknown aux_inv_reg\n"); + + return INVALID_MMIO_REG; +} + +static u32 * +gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs) +{ + *cs++ = MI_LOAD_REGISTER_IMM(1); + *cs++ = i915_mmio_reg_offset(inv_reg); + *cs++ = AUX_INV; + *cs++ = MI_NOOP; + + return cs; +} + +static int gen12_emit_flush_render(struct i915_request *request, + u32 mode) +{ + if (mode & EMIT_FLUSH) { + u32 flags = 0; + u32 *cs; + + flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; + flags |= PIPE_CONTROL_FLUSH_L3; + flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; + flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; + /* Wa_1409600907:tgl */ + flags |= PIPE_CONTROL_DEPTH_STALL; + flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; + flags |= PIPE_CONTROL_FLUSH_ENABLE; + + flags |= PIPE_CONTROL_STORE_DATA_INDEX; + flags |= PIPE_CONTROL_QW_WRITE; + + flags |= PIPE_CONTROL_CS_STALL; + + cs = intel_ring_begin(request, 6); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + cs = gen12_emit_pipe_control(cs, + PIPE_CONTROL0_HDC_PIPELINE_FLUSH, + flags, LRC_PPHWSP_SCRATCH_ADDR); + intel_ring_advance(request, cs); + } + + if (mode & EMIT_INVALIDATE) { + u32 flags = 0; + u32 *cs; + + flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_TLB_INVALIDATE; + flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; + + flags |= PIPE_CONTROL_STORE_DATA_INDEX; + flags |= PIPE_CONTROL_QW_WRITE; + + flags |= PIPE_CONTROL_CS_STALL; + + cs = intel_ring_begin(request, 8 + 4); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + /* + * Prevent the pre-parser from skipping past the TLB + * invalidate and loading a stale page for the batch + * buffer / request payload. + */ + *cs++ = preparser_disable(true); + + cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); + + /* hsdes: 1809175790 */ + cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs); + + *cs++ = preparser_disable(false); + intel_ring_advance(request, cs); + } + + return 0; +} + +static int gen12_emit_flush(struct i915_request *request, u32 mode) +{ + intel_engine_mask_t aux_inv = 0; + u32 cmd, *cs; + + cmd = 4; + if (mode & EMIT_INVALIDATE) + cmd += 2; + if (mode & EMIT_INVALIDATE) + aux_inv = request->engine->mask & ~BIT(BCS0); + if (aux_inv) + cmd += 2 * hweight8(aux_inv) + 2; + + cs = intel_ring_begin(request, cmd); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + if (mode & EMIT_INVALIDATE) + *cs++ = preparser_disable(true); + + cmd = MI_FLUSH_DW + 1; + + /* We always require a command barrier so that subsequent + * commands, such as breadcrumb interrupts, are strictly ordered + * wrt the contents of the write cache being flushed to memory + * (and thus being coherent from the CPU). + */ + cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; + + if (mode & EMIT_INVALIDATE) { + cmd |= MI_INVALIDATE_TLB; + if (request->engine->class == VIDEO_DECODE_CLASS) + cmd |= MI_INVALIDATE_BSD; + } + + *cs++ = cmd; + *cs++ = LRC_PPHWSP_SCRATCH_ADDR; + *cs++ = 0; /* upper addr */ + *cs++ = 0; /* value */ + + if (aux_inv) { /* hsdes: 1809175790 */ + struct intel_engine_cs *engine; + unsigned int tmp; + + *cs++ = MI_LOAD_REGISTER_IMM(hweight8(aux_inv)); + for_each_engine_masked(engine, request->engine->gt, + aux_inv, tmp) { + *cs++ = i915_mmio_reg_offset(aux_inv_reg(engine)); + *cs++ = AUX_INV; + } + *cs++ = MI_NOOP; + } + + if (mode & EMIT_INVALIDATE) + *cs++ = preparser_disable(false); + + intel_ring_advance(request, cs); + + return 0; +} + +static void assert_request_valid(struct i915_request *rq) +{ + struct intel_ring *ring __maybe_unused = rq->ring; + + /* Can we unwind this request without appearing to go forwards? */ + GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0); +} + +/* + * Reserve space for 2 NOOPs at the end of each request to be + * used as a workaround for not being allowed to do lite + * restore with HEAD==TAIL (WaIdleLiteRestore). + */ +static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs) +{ + /* Ensure there's always at least one preemption point per-request. */ + *cs++ = MI_ARB_CHECK; + *cs++ = MI_NOOP; + request->wa_tail = intel_ring_offset(request, cs); + + /* Check that entire request is less than half the ring */ + assert_request_valid(request); + + return cs; +} + +static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs) +{ + *cs++ = MI_SEMAPHORE_WAIT | + MI_SEMAPHORE_GLOBAL_GTT | + MI_SEMAPHORE_POLL | + MI_SEMAPHORE_SAD_EQ_SDD; + *cs++ = 0; + *cs++ = intel_hws_preempt_address(request->engine); + *cs++ = 0; + + return cs; +} + +static __always_inline u32* +gen8_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs) +{ + *cs++ = MI_USER_INTERRUPT; + + *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + if (intel_engine_has_semaphores(request->engine)) + cs = emit_preempt_busywait(request, cs); + + request->tail = intel_ring_offset(request, cs); + assert_ring_tail_valid(request->ring, request->tail); + + return gen8_emit_wa_tail(request, cs); +} + +static u32 *emit_xcs_breadcrumb(struct i915_request *rq, u32 *cs) +{ + return gen8_emit_ggtt_write(cs, rq->fence.seqno, hwsp_offset(rq), 0); +} + +static u32 *gen8_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs) +{ + return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs)); +} + +static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) +{ + cs = gen8_emit_pipe_control(cs, + PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_DC_FLUSH_ENABLE, + 0); + + /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */ + cs = gen8_emit_ggtt_write_rcs(cs, + request->fence.seqno, + hwsp_offset(request), + PIPE_CONTROL_FLUSH_ENABLE | + PIPE_CONTROL_CS_STALL); + + return gen8_emit_fini_breadcrumb_tail(request, cs); +} + +static u32 * +gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) +{ + cs = gen8_emit_ggtt_write_rcs(cs, + request->fence.seqno, + hwsp_offset(request), + PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_TILE_CACHE_FLUSH | + PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_DC_FLUSH_ENABLE | + PIPE_CONTROL_FLUSH_ENABLE); + + return gen8_emit_fini_breadcrumb_tail(request, cs); +} + +/* + * Note that the CS instruction pre-parser will not stall on the breadcrumb + * flush and will continue pre-fetching the instructions after it before the + * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at + * BB_START/END instructions, so, even though we might pre-fetch the pre-amble + * of the next request before the memory has been flushed, we're guaranteed that + * we won't access the batch itself too early. + * However, on gen12+ the parser can pre-fetch across the BB_START/END commands, + * so, if the current request is modifying an instruction in the next request on + * the same intel_context, we might pre-fetch and then execute the pre-update + * instruction. To avoid this, the users of self-modifying code should either + * disable the parser around the code emitting the memory writes, via a new flag + * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For + * the in-kernel use-cases we've opted to use a separate context, see + * reloc_gpu() as an example. + * All the above applies only to the instructions themselves. Non-inline data + * used by the instructions is not pre-fetched. + */ + +static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs) +{ + *cs++ = MI_SEMAPHORE_WAIT_TOKEN | + MI_SEMAPHORE_GLOBAL_GTT | + MI_SEMAPHORE_POLL | + MI_SEMAPHORE_SAD_EQ_SDD; + *cs++ = 0; + *cs++ = intel_hws_preempt_address(request->engine); + *cs++ = 0; + *cs++ = 0; + *cs++ = MI_NOOP; + + return cs; +} + +static __always_inline u32* +gen12_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs) +{ + *cs++ = MI_USER_INTERRUPT; + + *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + if (intel_engine_has_semaphores(request->engine)) + cs = gen12_emit_preempt_busywait(request, cs); + + request->tail = intel_ring_offset(request, cs); + assert_ring_tail_valid(request->ring, request->tail); + + return gen8_emit_wa_tail(request, cs); +} + +static u32 *gen12_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs) +{ + /* XXX Stalling flush before seqno write; post-sync not */ + cs = emit_xcs_breadcrumb(rq, __gen8_emit_flush_dw(cs, 0, 0, 0)); + return gen12_emit_fini_breadcrumb_tail(rq, cs); +} + +static u32 * +gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) +{ + cs = gen12_emit_ggtt_write_rcs(cs, + request->fence.seqno, + hwsp_offset(request), + PIPE_CONTROL0_HDC_PIPELINE_FLUSH, + PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_TILE_CACHE_FLUSH | + PIPE_CONTROL_FLUSH_L3 | + PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + /* Wa_1409600907:tgl */ + PIPE_CONTROL_DEPTH_STALL | + PIPE_CONTROL_DC_FLUSH_ENABLE | + PIPE_CONTROL_FLUSH_ENABLE); + + return gen12_emit_fini_breadcrumb_tail(request, cs); +} + +static void execlists_park(struct intel_engine_cs *engine) +{ + cancel_timer(&engine->execlists.timer); + cancel_timer(&engine->execlists.preempt); +} + +void intel_execlists_set_default_submission(struct intel_engine_cs *engine) +{ + engine->submit_request = execlists_submit_request; + engine->schedule = i915_schedule; + engine->execlists.tasklet.func = execlists_submission_tasklet; + + engine->reset.prepare = execlists_reset_prepare; + engine->reset.rewind = execlists_reset_rewind; + engine->reset.cancel = execlists_reset_cancel; + engine->reset.finish = execlists_reset_finish; + + engine->park = execlists_park; + engine->unpark = NULL; + + engine->flags |= I915_ENGINE_SUPPORTS_STATS; + if (!intel_vgpu_active(engine->i915)) { + engine->flags |= I915_ENGINE_HAS_SEMAPHORES; + if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) { + engine->flags |= I915_ENGINE_HAS_PREEMPTION; + if (IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION)) + engine->flags |= I915_ENGINE_HAS_TIMESLICES; + } + } + + if (INTEL_GEN(engine->i915) >= 12) + engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO; + + if (intel_engine_has_preemption(engine)) + engine->emit_bb_start = gen8_emit_bb_start; + else + engine->emit_bb_start = gen8_emit_bb_start_noarb; +} + +static void execlists_shutdown(struct intel_engine_cs *engine) +{ + /* Synchronise with residual timers and any softirq they raise */ + del_timer_sync(&engine->execlists.timer); + del_timer_sync(&engine->execlists.preempt); + tasklet_kill(&engine->execlists.tasklet); +} + +static void execlists_release(struct intel_engine_cs *engine) +{ + engine->sanitize = NULL; /* no longer in control, nothing to sanitize */ + + execlists_shutdown(engine); + + intel_engine_cleanup_common(engine); + lrc_destroy_wa_ctx(engine); +} + +static void +logical_ring_default_vfuncs(struct intel_engine_cs *engine) +{ + /* Default vfuncs which can be overriden by each engine. */ + + engine->resume = execlists_resume; + + engine->cops = &execlists_context_ops; + engine->request_alloc = execlists_request_alloc; + + engine->emit_flush = gen8_emit_flush; + engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb; + engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb; + if (INTEL_GEN(engine->i915) >= 12) { + engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb; + engine->emit_flush = gen12_emit_flush; + } + engine->set_default_submission = intel_execlists_set_default_submission; + + if (INTEL_GEN(engine->i915) < 11) { + engine->irq_enable = gen8_logical_ring_enable_irq; + engine->irq_disable = gen8_logical_ring_disable_irq; + } else { + /* + * TODO: On Gen11 interrupt masks need to be clear + * to allow C6 entry. Keep interrupts enabled at + * and take the hit of generating extra interrupts + * until a more refined solution exists. + */ + } +} + +static inline void +logical_ring_default_irqs(struct intel_engine_cs *engine) +{ + unsigned int shift = 0; + + if (INTEL_GEN(engine->i915) < 11) { + const u8 irq_shifts[] = { + [RCS0] = GEN8_RCS_IRQ_SHIFT, + [BCS0] = GEN8_BCS_IRQ_SHIFT, + [VCS0] = GEN8_VCS0_IRQ_SHIFT, + [VCS1] = GEN8_VCS1_IRQ_SHIFT, + [VECS0] = GEN8_VECS_IRQ_SHIFT, + }; + + shift = irq_shifts[engine->id]; + } + + engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift; + engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift; + engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift; + engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift; +} + +static void rcs_submission_override(struct intel_engine_cs *engine) +{ + switch (INTEL_GEN(engine->i915)) { + case 12: + engine->emit_flush = gen12_emit_flush_render; + engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs; + break; + case 11: + engine->emit_flush = gen11_emit_flush_render; + engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs; + break; + default: + engine->emit_flush = gen8_emit_flush_render; + engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs; + break; + } +} + +int intel_execlists_submission_setup(struct intel_engine_cs *engine) +{ + struct intel_engine_execlists * const execlists = &engine->execlists; + struct drm_i915_private *i915 = engine->i915; + struct intel_uncore *uncore = engine->uncore; + u32 base = engine->mmio_base; + + tasklet_init(&engine->execlists.tasklet, + execlists_submission_tasklet, (unsigned long)engine); + timer_setup(&engine->execlists.timer, execlists_timeslice, 0); + timer_setup(&engine->execlists.preempt, execlists_preempt, 0); + + logical_ring_default_vfuncs(engine); + logical_ring_default_irqs(engine); + + if (engine->class == RENDER_CLASS) + rcs_submission_override(engine); + + if (intel_init_workaround_bb(engine)) + /* + * We continue even if we fail to initialize WA batch + * because we only expect rare glitches but nothing + * critical to prevent us from using GPU + */ + drm_err(&i915->drm, "WA batch buffer initialization failed\n"); + + if (HAS_LOGICAL_RING_ELSQ(i915)) { + execlists->submit_reg = uncore->regs + + i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base)); + execlists->ctrl_reg = uncore->regs + + i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base)); + } else { + execlists->submit_reg = uncore->regs + + i915_mmio_reg_offset(RING_ELSP(base)); + } + + execlists->csb_status = + (u64 *)&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX]; + + execlists->csb_write = + &engine->status_page.addr[intel_hws_csb_write_index(i915)]; + + if (INTEL_GEN(i915) < 11) + execlists->csb_size = GEN8_CSB_ENTRIES; + else + execlists->csb_size = GEN11_CSB_ENTRIES; + + if (INTEL_GEN(engine->i915) >= 11) { + execlists->ccid |= engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32); + execlists->ccid |= engine->class << (GEN11_ENGINE_CLASS_SHIFT - 32); + } + + /* Finally, take ownership and responsibility for cleanup! */ + engine->sanitize = execlists_sanitize; + engine->release = execlists_release; + + return 0; +} + +static void init_common_reg_state(u32 * const regs, + const struct intel_engine_cs *engine, + const struct intel_ring *ring, + bool inhibit) +{ + u32 ctl; + + ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH); + ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT); + if (inhibit) + ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT; + if (INTEL_GEN(engine->i915) < 11) + ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT | + CTX_CTRL_RS_CTX_ENABLE); + regs[CTX_CONTEXT_CONTROL] = ctl; + + regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; + regs[CTX_TIMESTAMP] = 0; +} + +static void init_wa_bb_reg_state(u32 * const regs, + const struct intel_engine_cs *engine) +{ + const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx; + + if (wa_ctx->per_ctx.size) { + const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); + + GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1); + regs[lrc_ring_wa_bb_per_ctx(engine) + 1] = + (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01; + } + + if (wa_ctx->indirect_ctx.size) { + lrc_ring_setup_indirect_ctx(regs, engine, + i915_ggtt_offset(wa_ctx->vma) + + wa_ctx->indirect_ctx.offset, + wa_ctx->indirect_ctx.size); + } +} + +static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt) +{ + if (i915_vm_is_4lvl(&ppgtt->vm)) { + /* 64b PPGTT (48bit canonical) + * PDP0_DESCRIPTOR contains the base address to PML4 and + * other PDP Descriptors are ignored. + */ + ASSIGN_CTX_PML4(ppgtt, regs); + } else { + ASSIGN_CTX_PDP(ppgtt, regs, 3); + ASSIGN_CTX_PDP(ppgtt, regs, 2); + ASSIGN_CTX_PDP(ppgtt, regs, 1); + ASSIGN_CTX_PDP(ppgtt, regs, 0); + } +} + +static struct i915_ppgtt *vm_alias(struct i915_address_space *vm) +{ + if (i915_is_ggtt(vm)) + return i915_vm_to_ggtt(vm)->alias; + else + return i915_vm_to_ppgtt(vm); +} + +static void execlists_init_reg_state(u32 *regs, + const struct intel_context *ce, + const struct intel_engine_cs *engine, + const struct intel_ring *ring, + bool inhibit) +{ + /* + * A context is actually a big batch buffer with several + * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The + * values we are setting here are only for the first context restore: + * on a subsequent save, the GPU will recreate this batchbuffer with new + * values (including all the missing MI_LOAD_REGISTER_IMM commands that + * we are not initializing here). + * + * Must keep consistent with virtual_update_register_offsets(). + */ + set_offsets(regs, reg_offsets(engine), engine, inhibit); + + init_common_reg_state(regs, engine, ring, inhibit); + init_ppgtt_reg_state(regs, vm_alias(ce->vm)); + + init_wa_bb_reg_state(regs, engine); + + __reset_stop_ring(regs, engine); +} + +static int +populate_lr_context(struct intel_context *ce, + struct drm_i915_gem_object *ctx_obj, + struct intel_engine_cs *engine, + struct intel_ring *ring) +{ + bool inhibit = true; + void *vaddr; + + vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB); + if (IS_ERR(vaddr)) { + drm_dbg(&engine->i915->drm, "Could not map object pages!\n"); + return PTR_ERR(vaddr); + } + + set_redzone(vaddr, engine); + + if (engine->default_state) { + shmem_read(engine->default_state, 0, + vaddr, engine->context_size); + __set_bit(CONTEXT_VALID_BIT, &ce->flags); + inhibit = false; + } + + /* Clear the ppHWSP (inc. per-context counters) */ + memset(vaddr, 0, PAGE_SIZE); + + /* + * The second page of the context object contains some registers which + * must be set up prior to the first execution. + */ + execlists_init_reg_state(vaddr + LRC_STATE_OFFSET, + ce, engine, ring, inhibit); + + __i915_gem_object_flush_map(ctx_obj, 0, engine->context_size); + i915_gem_object_unpin_map(ctx_obj); + return 0; +} + +static struct intel_timeline *pinned_timeline(struct intel_context *ce) +{ + struct intel_timeline *tl = fetch_and_zero(&ce->timeline); + + return intel_timeline_create_from_engine(ce->engine, + page_unmask_bits(tl)); +} + +static int __execlists_context_alloc(struct intel_context *ce, + struct intel_engine_cs *engine) +{ + struct drm_i915_gem_object *ctx_obj; + struct intel_ring *ring; + struct i915_vma *vma; + u32 context_size; + int ret; + + GEM_BUG_ON(ce->state); + context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE); + + if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) + context_size += I915_GTT_PAGE_SIZE; /* for redzone */ + + if (INTEL_GEN(engine->i915) == 12) { + ce->wa_bb_page = context_size / PAGE_SIZE; + context_size += PAGE_SIZE; + } + + ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size); + if (IS_ERR(ctx_obj)) + return PTR_ERR(ctx_obj); + + vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto error_deref_obj; + } + + if (!page_mask_bits(ce->timeline)) { + struct intel_timeline *tl; + + /* + * Use the static global HWSP for the kernel context, and + * a dynamically allocated cacheline for everyone else. + */ + if (unlikely(ce->timeline)) + tl = pinned_timeline(ce); + else + tl = intel_timeline_create(engine->gt); + if (IS_ERR(tl)) { + ret = PTR_ERR(tl); + goto error_deref_obj; + } + + ce->timeline = tl; + } + + ring = intel_engine_create_ring(engine, (unsigned long)ce->ring); + if (IS_ERR(ring)) { + ret = PTR_ERR(ring); + goto error_deref_obj; + } + + ret = populate_lr_context(ce, ctx_obj, engine, ring); + if (ret) { + drm_dbg(&engine->i915->drm, + "Failed to populate LRC: %d\n", ret); + goto error_ring_free; + } + + ce->ring = ring; + ce->state = vma; + + return 0; + +error_ring_free: + intel_ring_put(ring); +error_deref_obj: + i915_gem_object_put(ctx_obj); + return ret; +} + +static struct list_head *virtual_queue(struct virtual_engine *ve) +{ + return &ve->base.execlists.default_priolist.requests[0]; +} + +static void rcu_virtual_context_destroy(struct work_struct *wrk) +{ + struct virtual_engine *ve = + container_of(wrk, typeof(*ve), rcu.work); + unsigned int n; + + GEM_BUG_ON(ve->context.inflight); + + /* Preempt-to-busy may leave a stale request behind. */ + if (unlikely(ve->request)) { + struct i915_request *old; + + spin_lock_irq(&ve->base.active.lock); + + old = fetch_and_zero(&ve->request); + if (old) { + GEM_BUG_ON(!i915_request_completed(old)); + __i915_request_submit(old); + i915_request_put(old); + } + + spin_unlock_irq(&ve->base.active.lock); + } + + /* + * Flush the tasklet in case it is still running on another core. + * + * This needs to be done before we remove ourselves from the siblings' + * rbtrees as in the case it is running in parallel, it may reinsert + * the rb_node into a sibling. + */ + tasklet_kill(&ve->base.execlists.tasklet); + + /* Decouple ourselves from the siblings, no more access allowed. */ + for (n = 0; n < ve->num_siblings; n++) { + struct intel_engine_cs *sibling = ve->siblings[n]; + struct rb_node *node = &ve->nodes[sibling->id].rb; + + if (RB_EMPTY_NODE(node)) + continue; + + spin_lock_irq(&sibling->active.lock); + + /* Detachment is lazily performed in the execlists tasklet */ + if (!RB_EMPTY_NODE(node)) + rb_erase_cached(node, &sibling->execlists.virtual); + + spin_unlock_irq(&sibling->active.lock); + } + GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet)); + GEM_BUG_ON(!list_empty(virtual_queue(ve))); + + if (ve->context.state) + __execlists_context_fini(&ve->context); + intel_context_fini(&ve->context); + + intel_breadcrumbs_free(ve->base.breadcrumbs); + intel_engine_free_request_pool(&ve->base); + + kfree(ve->bonds); + kfree(ve); +} + +static void virtual_context_destroy(struct kref *kref) +{ + struct virtual_engine *ve = + container_of(kref, typeof(*ve), context.ref); + + GEM_BUG_ON(!list_empty(&ve->context.signals)); + + /* + * When destroying the virtual engine, we have to be aware that + * it may still be in use from an hardirq/softirq context causing + * the resubmission of a completed request (background completion + * due to preempt-to-busy). Before we can free the engine, we need + * to flush the submission code and tasklets that are still potentially + * accessing the engine. Flushing the tasklets requires process context, + * and since we can guard the resubmit onto the engine with an RCU read + * lock, we can delegate the free of the engine to an RCU worker. + */ + INIT_RCU_WORK(&ve->rcu, rcu_virtual_context_destroy); + queue_rcu_work(system_wq, &ve->rcu); +} + +static void virtual_engine_initial_hint(struct virtual_engine *ve) +{ + int swp; + + /* + * Pick a random sibling on starting to help spread the load around. + * + * New contexts are typically created with exactly the same order + * of siblings, and often started in batches. Due to the way we iterate + * the array of sibling when submitting requests, sibling[0] is + * prioritised for dequeuing. If we make sure that sibling[0] is fairly + * randomised across the system, we also help spread the load by the + * first engine we inspect being different each time. + * + * NB This does not force us to execute on this engine, it will just + * typically be the first we inspect for submission. + */ + swp = prandom_u32_max(ve->num_siblings); + if (swp) + swap(ve->siblings[swp], ve->siblings[0]); +} + +static int virtual_context_alloc(struct intel_context *ce) +{ + struct virtual_engine *ve = container_of(ce, typeof(*ve), context); + + return __execlists_context_alloc(ce, ve->siblings[0]); +} + +static int virtual_context_pin(struct intel_context *ce, void *vaddr) +{ + struct virtual_engine *ve = container_of(ce, typeof(*ve), context); + + /* Note: we must use a real engine class for setting up reg state */ + return __execlists_context_pin(ce, ve->siblings[0], vaddr); +} + +static void virtual_context_enter(struct intel_context *ce) +{ + struct virtual_engine *ve = container_of(ce, typeof(*ve), context); + unsigned int n; + + for (n = 0; n < ve->num_siblings; n++) + intel_engine_pm_get(ve->siblings[n]); + + intel_timeline_enter(ce->timeline); +} + +static void virtual_context_exit(struct intel_context *ce) +{ + struct virtual_engine *ve = container_of(ce, typeof(*ve), context); + unsigned int n; + + intel_timeline_exit(ce->timeline); + + for (n = 0; n < ve->num_siblings; n++) + intel_engine_pm_put(ve->siblings[n]); +} + +static const struct intel_context_ops virtual_context_ops = { + .alloc = virtual_context_alloc, + + .pre_pin = execlists_context_pre_pin, + .pin = virtual_context_pin, + .unpin = execlists_context_unpin, + .post_unpin = execlists_context_post_unpin, + + .enter = virtual_context_enter, + .exit = virtual_context_exit, + + .destroy = virtual_context_destroy, +}; + +static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve) +{ + struct i915_request *rq; + intel_engine_mask_t mask; + + rq = READ_ONCE(ve->request); + if (!rq) + return 0; + + /* The rq is ready for submission; rq->execution_mask is now stable. */ + mask = rq->execution_mask; + if (unlikely(!mask)) { + /* Invalid selection, submit to a random engine in error */ + i915_request_set_error_once(rq, -ENODEV); + mask = ve->siblings[0]->mask; + } + + ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n", + rq->fence.context, rq->fence.seqno, + mask, ve->base.execlists.queue_priority_hint); + + return mask; +} + +static void virtual_submission_tasklet(unsigned long data) +{ + struct virtual_engine * const ve = (struct virtual_engine *)data; + const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint); + intel_engine_mask_t mask; + unsigned int n; + + rcu_read_lock(); + mask = virtual_submission_mask(ve); + rcu_read_unlock(); + if (unlikely(!mask)) + return; + + local_irq_disable(); + for (n = 0; n < ve->num_siblings; n++) { + struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]); + struct ve_node * const node = &ve->nodes[sibling->id]; + struct rb_node **parent, *rb; + bool first; + + if (!READ_ONCE(ve->request)) + break; /* already handled by a sibling's tasklet */ + + if (unlikely(!(mask & sibling->mask))) { + if (!RB_EMPTY_NODE(&node->rb)) { + spin_lock(&sibling->active.lock); + rb_erase_cached(&node->rb, + &sibling->execlists.virtual); + RB_CLEAR_NODE(&node->rb); + spin_unlock(&sibling->active.lock); + } + continue; + } + + spin_lock(&sibling->active.lock); + + if (!RB_EMPTY_NODE(&node->rb)) { + /* + * Cheat and avoid rebalancing the tree if we can + * reuse this node in situ. + */ + first = rb_first_cached(&sibling->execlists.virtual) == + &node->rb; + if (prio == node->prio || (prio > node->prio && first)) + goto submit_engine; + + rb_erase_cached(&node->rb, &sibling->execlists.virtual); + } + + rb = NULL; + first = true; + parent = &sibling->execlists.virtual.rb_root.rb_node; + while (*parent) { + struct ve_node *other; + + rb = *parent; + other = rb_entry(rb, typeof(*other), rb); + if (prio > other->prio) { + parent = &rb->rb_left; + } else { + parent = &rb->rb_right; + first = false; + } + } + + rb_link_node(&node->rb, rb, parent); + rb_insert_color_cached(&node->rb, + &sibling->execlists.virtual, + first); + +submit_engine: + GEM_BUG_ON(RB_EMPTY_NODE(&node->rb)); + node->prio = prio; + if (first && prio > sibling->execlists.queue_priority_hint) + tasklet_hi_schedule(&sibling->execlists.tasklet); + + spin_unlock(&sibling->active.lock); + } + local_irq_enable(); +} + +static void virtual_submit_request(struct i915_request *rq) +{ + struct virtual_engine *ve = to_virtual_engine(rq->engine); + struct i915_request *old; + unsigned long flags; + + ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n", + rq->fence.context, + rq->fence.seqno); + + GEM_BUG_ON(ve->base.submit_request != virtual_submit_request); + + spin_lock_irqsave(&ve->base.active.lock, flags); + + old = ve->request; + if (old) { /* background completion event from preempt-to-busy */ + GEM_BUG_ON(!i915_request_completed(old)); + __i915_request_submit(old); + i915_request_put(old); + } + + if (i915_request_completed(rq)) { + __i915_request_submit(rq); + + ve->base.execlists.queue_priority_hint = INT_MIN; + ve->request = NULL; + } else { + ve->base.execlists.queue_priority_hint = rq_prio(rq); + ve->request = i915_request_get(rq); + + GEM_BUG_ON(!list_empty(virtual_queue(ve))); + list_move_tail(&rq->sched.link, virtual_queue(ve)); + + tasklet_hi_schedule(&ve->base.execlists.tasklet); + } + + spin_unlock_irqrestore(&ve->base.active.lock, flags); +} + +static struct ve_bond * +virtual_find_bond(struct virtual_engine *ve, + const struct intel_engine_cs *master) +{ + int i; + + for (i = 0; i < ve->num_bonds; i++) { + if (ve->bonds[i].master == master) + return &ve->bonds[i]; + } + + return NULL; +} + +static void +virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal) +{ + struct virtual_engine *ve = to_virtual_engine(rq->engine); + intel_engine_mask_t allowed, exec; + struct ve_bond *bond; + + allowed = ~to_request(signal)->engine->mask; + + bond = virtual_find_bond(ve, to_request(signal)->engine); + if (bond) + allowed &= bond->sibling_mask; + + /* Restrict the bonded request to run on only the available engines */ + exec = READ_ONCE(rq->execution_mask); + while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed)) + ; + + /* Prevent the master from being re-run on the bonded engines */ + to_request(signal)->execution_mask &= ~allowed; +} + +struct intel_context * +intel_execlists_create_virtual(struct intel_engine_cs **siblings, + unsigned int count) +{ + struct virtual_engine *ve; + unsigned int n; + int err; + + if (count == 0) + return ERR_PTR(-EINVAL); + + if (count == 1) + return intel_context_create(siblings[0]); + + ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL); + if (!ve) + return ERR_PTR(-ENOMEM); + + ve->base.i915 = siblings[0]->i915; + ve->base.gt = siblings[0]->gt; + ve->base.uncore = siblings[0]->uncore; + ve->base.id = -1; + + ve->base.class = OTHER_CLASS; + ve->base.uabi_class = I915_ENGINE_CLASS_INVALID; + ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL; + ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL; + + /* + * The decision on whether to submit a request using semaphores + * depends on the saturated state of the engine. We only compute + * this during HW submission of the request, and we need for this + * state to be globally applied to all requests being submitted + * to this engine. Virtual engines encompass more than one physical + * engine and so we cannot accurately tell in advance if one of those + * engines is already saturated and so cannot afford to use a semaphore + * and be pessimized in priority for doing so -- if we are the only + * context using semaphores after all other clients have stopped, we + * will be starved on the saturated system. Such a global switch for + * semaphores is less than ideal, but alas is the current compromise. + */ + ve->base.saturated = ALL_ENGINES; + + snprintf(ve->base.name, sizeof(ve->base.name), "virtual"); + + intel_engine_init_active(&ve->base, ENGINE_VIRTUAL); + intel_engine_init_execlists(&ve->base); + + ve->base.cops = &virtual_context_ops; + ve->base.request_alloc = execlists_request_alloc; + + ve->base.schedule = i915_schedule; + ve->base.submit_request = virtual_submit_request; + ve->base.bond_execute = virtual_bond_execute; + + INIT_LIST_HEAD(virtual_queue(ve)); + ve->base.execlists.queue_priority_hint = INT_MIN; + tasklet_init(&ve->base.execlists.tasklet, + virtual_submission_tasklet, + (unsigned long)ve); + + intel_context_init(&ve->context, &ve->base); + + ve->base.breadcrumbs = intel_breadcrumbs_create(NULL); + if (!ve->base.breadcrumbs) { + err = -ENOMEM; + goto err_put; + } + + for (n = 0; n < count; n++) { + struct intel_engine_cs *sibling = siblings[n]; + + GEM_BUG_ON(!is_power_of_2(sibling->mask)); + if (sibling->mask & ve->base.mask) { + DRM_DEBUG("duplicate %s entry in load balancer\n", + sibling->name); + err = -EINVAL; + goto err_put; + } + + /* + * The virtual engine implementation is tightly coupled to + * the execlists backend -- we push out request directly + * into a tree inside each physical engine. We could support + * layering if we handle cloning of the requests and + * submitting a copy into each backend. + */ + if (sibling->execlists.tasklet.func != + execlists_submission_tasklet) { + err = -ENODEV; + goto err_put; + } + + GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb)); + RB_CLEAR_NODE(&ve->nodes[sibling->id].rb); + + ve->siblings[ve->num_siblings++] = sibling; + ve->base.mask |= sibling->mask; + + /* + * All physical engines must be compatible for their emission + * functions (as we build the instructions during request + * construction and do not alter them before submission + * on the physical engine). We use the engine class as a guide + * here, although that could be refined. + */ + if (ve->base.class != OTHER_CLASS) { + if (ve->base.class != sibling->class) { + DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n", + sibling->class, ve->base.class); + err = -EINVAL; + goto err_put; + } + continue; + } + + ve->base.class = sibling->class; + ve->base.uabi_class = sibling->uabi_class; + snprintf(ve->base.name, sizeof(ve->base.name), + "v%dx%d", ve->base.class, count); + ve->base.context_size = sibling->context_size; + + ve->base.emit_bb_start = sibling->emit_bb_start; + ve->base.emit_flush = sibling->emit_flush; + ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb; + ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb; + ve->base.emit_fini_breadcrumb_dw = + sibling->emit_fini_breadcrumb_dw; + + ve->base.flags = sibling->flags; + } + + ve->base.flags |= I915_ENGINE_IS_VIRTUAL; + + virtual_engine_initial_hint(ve); + return &ve->context; + +err_put: + intel_context_put(&ve->context); + return ERR_PTR(err); +} + +struct intel_context * +intel_execlists_clone_virtual(struct intel_engine_cs *src) +{ + struct virtual_engine *se = to_virtual_engine(src); + struct intel_context *dst; + + dst = intel_execlists_create_virtual(se->siblings, + se->num_siblings); + if (IS_ERR(dst)) + return dst; + + if (se->num_bonds) { + struct virtual_engine *de = to_virtual_engine(dst->engine); + + de->bonds = kmemdup(se->bonds, + sizeof(*se->bonds) * se->num_bonds, + GFP_KERNEL); + if (!de->bonds) { + intel_context_put(dst); + return ERR_PTR(-ENOMEM); + } + + de->num_bonds = se->num_bonds; + } + + return dst; +} + +int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine, + const struct intel_engine_cs *master, + const struct intel_engine_cs *sibling) +{ + struct virtual_engine *ve = to_virtual_engine(engine); + struct ve_bond *bond; + int n; + + /* Sanity check the sibling is part of the virtual engine */ + for (n = 0; n < ve->num_siblings; n++) + if (sibling == ve->siblings[n]) + break; + if (n == ve->num_siblings) + return -EINVAL; + + bond = virtual_find_bond(ve, master); + if (bond) { + bond->sibling_mask |= sibling->mask; + return 0; + } + + bond = krealloc(ve->bonds, + sizeof(*bond) * (ve->num_bonds + 1), + GFP_KERNEL); + if (!bond) + return -ENOMEM; + + bond[ve->num_bonds].master = master; + bond[ve->num_bonds].sibling_mask = sibling->mask; + + ve->bonds = bond; + ve->num_bonds++; + + return 0; +} + +struct intel_engine_cs * +intel_virtual_engine_get_sibling(struct intel_engine_cs *engine, + unsigned int sibling) +{ + struct virtual_engine *ve = to_virtual_engine(engine); + + if (sibling >= ve->num_siblings) + return NULL; + + return ve->siblings[sibling]; +} + +void intel_execlists_show_requests(struct intel_engine_cs *engine, + struct drm_printer *m, + void (*show_request)(struct drm_printer *m, + struct i915_request *rq, + const char *prefix), + unsigned int max) +{ + const struct intel_engine_execlists *execlists = &engine->execlists; + struct i915_request *rq, *last; + unsigned long flags; + unsigned int count; + struct rb_node *rb; + + spin_lock_irqsave(&engine->active.lock, flags); + + last = NULL; + count = 0; + list_for_each_entry(rq, &engine->active.requests, sched.link) { + if (count++ < max - 1) + show_request(m, rq, "\t\tE "); + else + last = rq; + } + if (last) { + if (count > max) { + drm_printf(m, + "\t\t...skipping %d executing requests...\n", + count - max); + } + show_request(m, last, "\t\tE "); + } + + if (execlists->switch_priority_hint != INT_MIN) + drm_printf(m, "\t\tSwitch priority hint: %d\n", + READ_ONCE(execlists->switch_priority_hint)); + if (execlists->queue_priority_hint != INT_MIN) + drm_printf(m, "\t\tQueue priority hint: %d\n", + READ_ONCE(execlists->queue_priority_hint)); + + last = NULL; + count = 0; + for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) { + struct i915_priolist *p = rb_entry(rb, typeof(*p), node); + int i; + + priolist_for_each_request(rq, p, i) { + if (count++ < max - 1) + show_request(m, rq, "\t\tQ "); + else + last = rq; + } + } + if (last) { + if (count > max) { + drm_printf(m, + "\t\t...skipping %d queued requests...\n", + count - max); + } + show_request(m, last, "\t\tQ "); + } + + last = NULL; + count = 0; + for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) { + struct virtual_engine *ve = + rb_entry(rb, typeof(*ve), nodes[engine->id].rb); + struct i915_request *rq = READ_ONCE(ve->request); + + if (rq) { + if (count++ < max - 1) + show_request(m, rq, "\t\tV "); + else + last = rq; + } + } + if (last) { + if (count > max) { + drm_printf(m, + "\t\t...skipping %d virtual requests...\n", + count - max); + } + show_request(m, last, "\t\tV "); + } + + spin_unlock_irqrestore(&engine->active.lock, flags); +} + +void intel_lr_context_reset(struct intel_engine_cs *engine, + struct intel_context *ce, + u32 head, + bool scrub) +{ + GEM_BUG_ON(!intel_context_is_pinned(ce)); + + /* + * We want a simple context + ring to execute the breadcrumb update. + * We cannot rely on the context being intact across the GPU hang, + * so clear it and rebuild just what we need for the breadcrumb. + * All pending requests for this context will be zapped, and any + * future request will be after userspace has had the opportunity + * to recreate its own state. + */ + if (scrub) + restore_default_state(ce, engine); + + /* Rerun the request; its payload has been neutered (if guilty). */ + __execlists_update_reg_state(ce, engine, head); +} + +bool +intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine) +{ + return engine->set_default_submission == + intel_execlists_set_default_submission; +} + +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) +#include "selftest_lrc.c" +#endif diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.h b/drivers/gpu/drm/i915/gt/intel_lrc.h new file mode 100644 index 000000000..91fd8e452 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_lrc.h @@ -0,0 +1,131 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef _INTEL_LRC_H_ +#define _INTEL_LRC_H_ + +#include <linux/types.h> + +struct drm_printer; + +struct drm_i915_private; +struct i915_gem_context; +struct i915_request; +struct intel_context; +struct intel_engine_cs; + +/* Execlists regs */ +#define RING_ELSP(base) _MMIO((base) + 0x230) +#define RING_EXECLIST_STATUS_LO(base) _MMIO((base) + 0x234) +#define RING_EXECLIST_STATUS_HI(base) _MMIO((base) + 0x234 + 4) +#define RING_CONTEXT_CONTROL(base) _MMIO((base) + 0x244) +#define CTX_CTRL_INHIBIT_SYN_CTX_SWITCH (1 << 3) +#define CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT (1 << 0) +#define CTX_CTRL_RS_CTX_ENABLE (1 << 1) +#define CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT (1 << 2) +#define GEN12_CTX_CTRL_OAR_CONTEXT_ENABLE (1 << 8) +#define RING_CONTEXT_STATUS_PTR(base) _MMIO((base) + 0x3a0) +#define RING_EXECLIST_SQ_CONTENTS(base) _MMIO((base) + 0x510) +#define RING_EXECLIST_CONTROL(base) _MMIO((base) + 0x550) + +#define EL_CTRL_LOAD (1 << 0) + +/* The docs specify that the write pointer wraps around after 5h, "After status + * is written out to the last available status QW at offset 5h, this pointer + * wraps to 0." + * + * Therefore, one must infer than even though there are 3 bits available, 6 and + * 7 appear to be * reserved. + */ +#define GEN8_CSB_ENTRIES 6 +#define GEN8_CSB_PTR_MASK 0x7 +#define GEN8_CSB_READ_PTR_MASK (GEN8_CSB_PTR_MASK << 8) +#define GEN8_CSB_WRITE_PTR_MASK (GEN8_CSB_PTR_MASK << 0) + +#define GEN11_CSB_ENTRIES 12 +#define GEN11_CSB_PTR_MASK 0xf +#define GEN11_CSB_READ_PTR_MASK (GEN11_CSB_PTR_MASK << 8) +#define GEN11_CSB_WRITE_PTR_MASK (GEN11_CSB_PTR_MASK << 0) + +#define MAX_CONTEXT_HW_ID (1<<21) /* exclusive */ +#define MAX_GUC_CONTEXT_HW_ID (1 << 20) /* exclusive */ +#define GEN11_MAX_CONTEXT_HW_ID (1<<11) /* exclusive */ +/* in Gen12 ID 0x7FF is reserved to indicate idle */ +#define GEN12_MAX_CONTEXT_HW_ID (GEN11_MAX_CONTEXT_HW_ID - 1) + +enum { + INTEL_CONTEXT_SCHEDULE_IN = 0, + INTEL_CONTEXT_SCHEDULE_OUT, + INTEL_CONTEXT_SCHEDULE_PREEMPTED, +}; + +/* Logical Rings */ +void intel_logical_ring_cleanup(struct intel_engine_cs *engine); + +int intel_execlists_submission_setup(struct intel_engine_cs *engine); + +/* Logical Ring Contexts */ +/* At the start of the context image is its per-process HWS page */ +#define LRC_PPHWSP_PN (0) +#define LRC_PPHWSP_SZ (1) +/* After the PPHWSP we have the logical state for the context */ +#define LRC_STATE_PN (LRC_PPHWSP_PN + LRC_PPHWSP_SZ) +#define LRC_STATE_OFFSET (LRC_STATE_PN * PAGE_SIZE) + +/* Space within PPHWSP reserved to be used as scratch */ +#define LRC_PPHWSP_SCRATCH 0x34 +#define LRC_PPHWSP_SCRATCH_ADDR (LRC_PPHWSP_SCRATCH * sizeof(u32)) + +void intel_execlists_set_default_submission(struct intel_engine_cs *engine); + +void intel_lr_context_reset(struct intel_engine_cs *engine, + struct intel_context *ce, + u32 head, + bool scrub); + +void intel_execlists_show_requests(struct intel_engine_cs *engine, + struct drm_printer *m, + void (*show_request)(struct drm_printer *m, + struct i915_request *rq, + const char *prefix), + unsigned int max); + +struct intel_context * +intel_execlists_create_virtual(struct intel_engine_cs **siblings, + unsigned int count); + +struct intel_context * +intel_execlists_clone_virtual(struct intel_engine_cs *src); + +int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine, + const struct intel_engine_cs *master, + const struct intel_engine_cs *sibling); + +struct intel_engine_cs * +intel_virtual_engine_get_sibling(struct intel_engine_cs *engine, + unsigned int sibling); + +bool +intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine); + +#endif /* _INTEL_LRC_H_ */ diff --git a/drivers/gpu/drm/i915/gt/intel_lrc_reg.h b/drivers/gpu/drm/i915/gt/intel_lrc_reg.h new file mode 100644 index 000000000..93cb6c460 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_lrc_reg.h @@ -0,0 +1,52 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2014-2018 Intel Corporation + */ + +#ifndef _INTEL_LRC_REG_H_ +#define _INTEL_LRC_REG_H_ + +#include <linux/types.h> + +/* GEN8 to GEN12 Reg State Context */ +#define CTX_CONTEXT_CONTROL (0x02 + 1) +#define CTX_RING_HEAD (0x04 + 1) +#define CTX_RING_TAIL (0x06 + 1) +#define CTX_RING_START (0x08 + 1) +#define CTX_RING_CTL (0x0a + 1) +#define CTX_BB_STATE (0x10 + 1) +#define CTX_TIMESTAMP (0x22 + 1) +#define CTX_PDP3_UDW (0x24 + 1) +#define CTX_PDP3_LDW (0x26 + 1) +#define CTX_PDP2_UDW (0x28 + 1) +#define CTX_PDP2_LDW (0x2a + 1) +#define CTX_PDP1_UDW (0x2c + 1) +#define CTX_PDP1_LDW (0x2e + 1) +#define CTX_PDP0_UDW (0x30 + 1) +#define CTX_PDP0_LDW (0x32 + 1) +#define CTX_R_PWR_CLK_STATE (0x42 + 1) + +#define GEN9_CTX_RING_MI_MODE 0x54 + +#define ASSIGN_CTX_PDP(ppgtt, reg_state, n) do { \ + u32 *reg_state__ = (reg_state); \ + const u64 addr__ = i915_page_dir_dma_addr((ppgtt), (n)); \ + (reg_state__)[CTX_PDP ## n ## _UDW] = upper_32_bits(addr__); \ + (reg_state__)[CTX_PDP ## n ## _LDW] = lower_32_bits(addr__); \ +} while (0) + +#define ASSIGN_CTX_PML4(ppgtt, reg_state) do { \ + u32 *reg_state__ = (reg_state); \ + const u64 addr__ = px_dma(ppgtt->pd); \ + (reg_state__)[CTX_PDP0_UDW] = upper_32_bits(addr__); \ + (reg_state__)[CTX_PDP0_LDW] = lower_32_bits(addr__); \ +} while (0) + +#define GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT 0x17 +#define GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT 0x26 +#define GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT 0x19 +#define GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT 0x1A +#define GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT 0xD + +#endif /* _INTEL_LRC_REG_H_ */ diff --git a/drivers/gpu/drm/i915/gt/intel_mocs.c b/drivers/gpu/drm/i915/gt/intel_mocs.c new file mode 100644 index 000000000..413dadfac --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_mocs.c @@ -0,0 +1,502 @@ +/* + * Copyright (c) 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "i915_drv.h" + +#include "intel_engine.h" +#include "intel_gt.h" +#include "intel_mocs.h" +#include "intel_lrc.h" +#include "intel_ring.h" + +/* structures required */ +struct drm_i915_mocs_entry { + u32 control_value; + u16 l3cc_value; + u16 used; +}; + +struct drm_i915_mocs_table { + unsigned int size; + unsigned int n_entries; + const struct drm_i915_mocs_entry *table; +}; + +/* Defines for the tables (XXX_MOCS_0 - XXX_MOCS_63) */ +#define _LE_CACHEABILITY(value) ((value) << 0) +#define _LE_TGT_CACHE(value) ((value) << 2) +#define LE_LRUM(value) ((value) << 4) +#define LE_AOM(value) ((value) << 6) +#define LE_RSC(value) ((value) << 7) +#define LE_SCC(value) ((value) << 8) +#define LE_PFM(value) ((value) << 11) +#define LE_SCF(value) ((value) << 14) +#define LE_COS(value) ((value) << 15) +#define LE_SSE(value) ((value) << 17) + +/* Defines for the tables (LNCFMOCS0 - LNCFMOCS31) - two entries per word */ +#define L3_ESC(value) ((value) << 0) +#define L3_SCC(value) ((value) << 1) +#define _L3_CACHEABILITY(value) ((value) << 4) + +/* Helper defines */ +#define GEN9_NUM_MOCS_ENTRIES 64 /* 63-64 are reserved, but configured. */ + +/* (e)LLC caching options */ +/* + * Note: LE_0_PAGETABLE works only up to Gen11; for newer gens it means + * the same as LE_UC + */ +#define LE_0_PAGETABLE _LE_CACHEABILITY(0) +#define LE_1_UC _LE_CACHEABILITY(1) +#define LE_2_WT _LE_CACHEABILITY(2) +#define LE_3_WB _LE_CACHEABILITY(3) + +/* Target cache */ +#define LE_TC_0_PAGETABLE _LE_TGT_CACHE(0) +#define LE_TC_1_LLC _LE_TGT_CACHE(1) +#define LE_TC_2_LLC_ELLC _LE_TGT_CACHE(2) +#define LE_TC_3_LLC_ELLC_ALT _LE_TGT_CACHE(3) + +/* L3 caching options */ +#define L3_0_DIRECT _L3_CACHEABILITY(0) +#define L3_1_UC _L3_CACHEABILITY(1) +#define L3_2_RESERVED _L3_CACHEABILITY(2) +#define L3_3_WB _L3_CACHEABILITY(3) + +#define MOCS_ENTRY(__idx, __control_value, __l3cc_value) \ + [__idx] = { \ + .control_value = __control_value, \ + .l3cc_value = __l3cc_value, \ + .used = 1, \ + } + +/* + * MOCS tables + * + * These are the MOCS tables that are programmed across all the rings. + * The control value is programmed to all the rings that support the + * MOCS registers. While the l3cc_values are only programmed to the + * LNCFCMOCS0 - LNCFCMOCS32 registers. + * + * These tables are intended to be kept reasonably consistent across + * HW platforms, and for ICL+, be identical across OSes. To achieve + * that, for Icelake and above, list of entries is published as part + * of bspec. + * + * Entries not part of the following tables are undefined as far as + * userspace is concerned and shouldn't be relied upon. For Gen < 12 + * they will be initialized to PTE. Gen >= 12 onwards don't have a setting for + * PTE and will be initialized to an invalid value. + * + * The last two entries are reserved by the hardware. For ICL+ they + * should be initialized according to bspec and never used, for older + * platforms they should never be written to. + * + * NOTE: These tables are part of bspec and defined as part of hardware + * interface for ICL+. For older platforms, they are part of kernel + * ABI. It is expected that, for specific hardware platform, existing + * entries will remain constant and the table will only be updated by + * adding new entries, filling unused positions. + */ +#define GEN9_MOCS_ENTRIES \ + MOCS_ENTRY(I915_MOCS_UNCACHED, \ + LE_1_UC | LE_TC_2_LLC_ELLC, \ + L3_1_UC), \ + MOCS_ENTRY(I915_MOCS_PTE, \ + LE_0_PAGETABLE | LE_TC_2_LLC_ELLC | LE_LRUM(3), \ + L3_3_WB) + +static const struct drm_i915_mocs_entry skl_mocs_table[] = { + GEN9_MOCS_ENTRIES, + MOCS_ENTRY(I915_MOCS_CACHED, + LE_3_WB | LE_TC_2_LLC_ELLC | LE_LRUM(3), + L3_3_WB), + + /* + * mocs:63 + * - used by the L3 for all of its evictions. + * Thus it is expected to allow LLC cacheability to enable coherent + * flows to be maintained. + * - used to force L3 uncachable cycles. + * Thus it is expected to make the surface L3 uncacheable. + */ + MOCS_ENTRY(63, + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3), + L3_1_UC) +}; + +/* NOTE: the LE_TGT_CACHE is not used on Broxton */ +static const struct drm_i915_mocs_entry broxton_mocs_table[] = { + GEN9_MOCS_ENTRIES, + MOCS_ENTRY(I915_MOCS_CACHED, + LE_1_UC | LE_TC_2_LLC_ELLC | LE_LRUM(3), + L3_3_WB) +}; + +#define GEN11_MOCS_ENTRIES \ + /* Entries 0 and 1 are defined per-platform */ \ + /* Base - L3 + LLC */ \ + MOCS_ENTRY(2, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3), \ + L3_3_WB), \ + /* Base - Uncached */ \ + MOCS_ENTRY(3, \ + LE_1_UC | LE_TC_1_LLC, \ + L3_1_UC), \ + /* Base - L3 */ \ + MOCS_ENTRY(4, \ + LE_1_UC | LE_TC_1_LLC, \ + L3_3_WB), \ + /* Base - LLC */ \ + MOCS_ENTRY(5, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3), \ + L3_1_UC), \ + /* Age 0 - LLC */ \ + MOCS_ENTRY(6, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(1), \ + L3_1_UC), \ + /* Age 0 - L3 + LLC */ \ + MOCS_ENTRY(7, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(1), \ + L3_3_WB), \ + /* Age: Don't Chg. - LLC */ \ + MOCS_ENTRY(8, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(2), \ + L3_1_UC), \ + /* Age: Don't Chg. - L3 + LLC */ \ + MOCS_ENTRY(9, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(2), \ + L3_3_WB), \ + /* No AOM - LLC */ \ + MOCS_ENTRY(10, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3) | LE_AOM(1), \ + L3_1_UC), \ + /* No AOM - L3 + LLC */ \ + MOCS_ENTRY(11, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3) | LE_AOM(1), \ + L3_3_WB), \ + /* No AOM; Age 0 - LLC */ \ + MOCS_ENTRY(12, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(1) | LE_AOM(1), \ + L3_1_UC), \ + /* No AOM; Age 0 - L3 + LLC */ \ + MOCS_ENTRY(13, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(1) | LE_AOM(1), \ + L3_3_WB), \ + /* No AOM; Age:DC - LLC */ \ + MOCS_ENTRY(14, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(2) | LE_AOM(1), \ + L3_1_UC), \ + /* No AOM; Age:DC - L3 + LLC */ \ + MOCS_ENTRY(15, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(2) | LE_AOM(1), \ + L3_3_WB), \ + /* Self-Snoop - L3 + LLC */ \ + MOCS_ENTRY(18, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3) | LE_SSE(3), \ + L3_3_WB), \ + /* Skip Caching - L3 + LLC(12.5%) */ \ + MOCS_ENTRY(19, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3) | LE_SCC(7), \ + L3_3_WB), \ + /* Skip Caching - L3 + LLC(25%) */ \ + MOCS_ENTRY(20, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3) | LE_SCC(3), \ + L3_3_WB), \ + /* Skip Caching - L3 + LLC(50%) */ \ + MOCS_ENTRY(21, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3) | LE_SCC(1), \ + L3_3_WB), \ + /* Skip Caching - L3 + LLC(75%) */ \ + MOCS_ENTRY(22, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3) | LE_RSC(1) | LE_SCC(3), \ + L3_3_WB), \ + /* Skip Caching - L3 + LLC(87.5%) */ \ + MOCS_ENTRY(23, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3) | LE_RSC(1) | LE_SCC(7), \ + L3_3_WB), \ + /* HW Reserved - SW program but never use */ \ + MOCS_ENTRY(62, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3), \ + L3_1_UC), \ + /* HW Reserved - SW program but never use */ \ + MOCS_ENTRY(63, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3), \ + L3_1_UC) + +static const struct drm_i915_mocs_entry tgl_mocs_table[] = { + /* + * NOTE: + * Reserved and unspecified MOCS indices have been set to (L3 + LCC). + * These reserved entries should never be used, they may be changed + * to low performant variants with better coherency in the future if + * more entries are needed. We are programming index I915_MOCS_PTE(1) + * only, __init_mocs_table() take care to program unused index with + * this entry. + */ + MOCS_ENTRY(I915_MOCS_PTE, + LE_0_PAGETABLE | LE_TC_0_PAGETABLE, + L3_1_UC), + GEN11_MOCS_ENTRIES, + + /* Implicitly enable L1 - HDC:L1 + L3 + LLC */ + MOCS_ENTRY(48, + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3), + L3_3_WB), + /* Implicitly enable L1 - HDC:L1 + L3 */ + MOCS_ENTRY(49, + LE_1_UC | LE_TC_1_LLC, + L3_3_WB), + /* Implicitly enable L1 - HDC:L1 + LLC */ + MOCS_ENTRY(50, + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3), + L3_1_UC), + /* Implicitly enable L1 - HDC:L1 */ + MOCS_ENTRY(51, + LE_1_UC | LE_TC_1_LLC, + L3_1_UC), + /* HW Special Case (CCS) */ + MOCS_ENTRY(60, + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3), + L3_1_UC), + /* HW Special Case (Displayable) */ + MOCS_ENTRY(61, + LE_1_UC | LE_TC_1_LLC, + L3_3_WB), +}; + +static const struct drm_i915_mocs_entry icl_mocs_table[] = { + /* Base - Uncached (Deprecated) */ + MOCS_ENTRY(I915_MOCS_UNCACHED, + LE_1_UC | LE_TC_1_LLC, + L3_1_UC), + /* Base - L3 + LeCC:PAT (Deprecated) */ + MOCS_ENTRY(I915_MOCS_PTE, + LE_0_PAGETABLE | LE_TC_1_LLC, + L3_3_WB), + + GEN11_MOCS_ENTRIES +}; + +enum { + HAS_GLOBAL_MOCS = BIT(0), + HAS_ENGINE_MOCS = BIT(1), + HAS_RENDER_L3CC = BIT(2), +}; + +static bool has_l3cc(const struct drm_i915_private *i915) +{ + return true; +} + +static bool has_global_mocs(const struct drm_i915_private *i915) +{ + return HAS_GLOBAL_MOCS_REGISTERS(i915); +} + +static bool has_mocs(const struct drm_i915_private *i915) +{ + return !IS_DGFX(i915); +} + +static unsigned int get_mocs_settings(const struct drm_i915_private *i915, + struct drm_i915_mocs_table *table) +{ + unsigned int flags; + + if (INTEL_GEN(i915) >= 12) { + table->size = ARRAY_SIZE(tgl_mocs_table); + table->table = tgl_mocs_table; + table->n_entries = GEN9_NUM_MOCS_ENTRIES; + } else if (IS_GEN(i915, 11)) { + table->size = ARRAY_SIZE(icl_mocs_table); + table->table = icl_mocs_table; + table->n_entries = GEN9_NUM_MOCS_ENTRIES; + } else if (IS_GEN9_BC(i915) || IS_CANNONLAKE(i915)) { + table->size = ARRAY_SIZE(skl_mocs_table); + table->n_entries = GEN9_NUM_MOCS_ENTRIES; + table->table = skl_mocs_table; + } else if (IS_GEN9_LP(i915)) { + table->size = ARRAY_SIZE(broxton_mocs_table); + table->n_entries = GEN9_NUM_MOCS_ENTRIES; + table->table = broxton_mocs_table; + } else { + drm_WARN_ONCE(&i915->drm, INTEL_GEN(i915) >= 9, + "Platform that should have a MOCS table does not.\n"); + return 0; + } + + if (GEM_DEBUG_WARN_ON(table->size > table->n_entries)) + return 0; + + /* WaDisableSkipCaching:skl,bxt,kbl,glk */ + if (IS_GEN(i915, 9)) { + int i; + + for (i = 0; i < table->size; i++) + if (GEM_DEBUG_WARN_ON(table->table[i].l3cc_value & + (L3_ESC(1) | L3_SCC(0x7)))) + return 0; + } + + flags = 0; + if (has_mocs(i915)) { + if (has_global_mocs(i915)) + flags |= HAS_GLOBAL_MOCS; + else + flags |= HAS_ENGINE_MOCS; + } + if (has_l3cc(i915)) + flags |= HAS_RENDER_L3CC; + + return flags; +} + +/* + * Get control_value from MOCS entry taking into account when it's not used: + * I915_MOCS_PTE's value is returned in this case. + */ +static u32 get_entry_control(const struct drm_i915_mocs_table *table, + unsigned int index) +{ + if (index < table->size && table->table[index].used) + return table->table[index].control_value; + + return table->table[I915_MOCS_PTE].control_value; +} + +#define for_each_mocs(mocs, t, i) \ + for (i = 0; \ + i < (t)->n_entries ? (mocs = get_entry_control((t), i)), 1 : 0;\ + i++) + +static void __init_mocs_table(struct intel_uncore *uncore, + const struct drm_i915_mocs_table *table, + u32 addr) +{ + unsigned int i; + u32 mocs; + + for_each_mocs(mocs, table, i) + intel_uncore_write_fw(uncore, _MMIO(addr + i * 4), mocs); +} + +static u32 mocs_offset(const struct intel_engine_cs *engine) +{ + static const u32 offset[] = { + [RCS0] = __GEN9_RCS0_MOCS0, + [VCS0] = __GEN9_VCS0_MOCS0, + [VCS1] = __GEN9_VCS1_MOCS0, + [VECS0] = __GEN9_VECS0_MOCS0, + [BCS0] = __GEN9_BCS0_MOCS0, + [VCS2] = __GEN11_VCS2_MOCS0, + }; + + GEM_BUG_ON(engine->id >= ARRAY_SIZE(offset)); + return offset[engine->id]; +} + +static void init_mocs_table(struct intel_engine_cs *engine, + const struct drm_i915_mocs_table *table) +{ + __init_mocs_table(engine->uncore, table, mocs_offset(engine)); +} + +/* + * Get l3cc_value from MOCS entry taking into account when it's not used: + * I915_MOCS_PTE's value is returned in this case. + */ +static u16 get_entry_l3cc(const struct drm_i915_mocs_table *table, + unsigned int index) +{ + if (index < table->size && table->table[index].used) + return table->table[index].l3cc_value; + + return table->table[I915_MOCS_PTE].l3cc_value; +} + +static inline u32 l3cc_combine(u16 low, u16 high) +{ + return low | (u32)high << 16; +} + +#define for_each_l3cc(l3cc, t, i) \ + for (i = 0; \ + i < ((t)->n_entries + 1) / 2 ? \ + (l3cc = l3cc_combine(get_entry_l3cc((t), 2 * i), \ + get_entry_l3cc((t), 2 * i + 1))), 1 : \ + 0; \ + i++) + +static void init_l3cc_table(struct intel_engine_cs *engine, + const struct drm_i915_mocs_table *table) +{ + struct intel_uncore *uncore = engine->uncore; + unsigned int i; + u32 l3cc; + + for_each_l3cc(l3cc, table, i) + intel_uncore_write_fw(uncore, GEN9_LNCFCMOCS(i), l3cc); +} + +void intel_mocs_init_engine(struct intel_engine_cs *engine) +{ + struct drm_i915_mocs_table table; + unsigned int flags; + + /* Called under a blanket forcewake */ + assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL); + + flags = get_mocs_settings(engine->i915, &table); + if (!flags) + return; + + /* Platforms with global MOCS do not need per-engine initialization. */ + if (flags & HAS_ENGINE_MOCS) + init_mocs_table(engine, &table); + + if (flags & HAS_RENDER_L3CC && engine->class == RENDER_CLASS) + init_l3cc_table(engine, &table); +} + +static u32 global_mocs_offset(void) +{ + return i915_mmio_reg_offset(GEN12_GLOBAL_MOCS(0)); +} + +void intel_mocs_init(struct intel_gt *gt) +{ + struct drm_i915_mocs_table table; + unsigned int flags; + + /* + * LLC and eDRAM control values are not applicable to dgfx + */ + flags = get_mocs_settings(gt->i915, &table); + if (flags & HAS_GLOBAL_MOCS) + __init_mocs_table(gt->uncore, &table, global_mocs_offset()); +} + +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) +#include "selftest_mocs.c" +#endif diff --git a/drivers/gpu/drm/i915/gt/intel_mocs.h b/drivers/gpu/drm/i915/gt/intel_mocs.h new file mode 100644 index 000000000..83371f3e6 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_mocs.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef INTEL_MOCS_H +#define INTEL_MOCS_H + +/** + * DOC: Memory Objects Control State (MOCS) + * + * Motivation: + * In previous Gens the MOCS settings was a value that was set by user land as + * part of the batch. In Gen9 this has changed to be a single table (per ring) + * that all batches now reference by index instead of programming the MOCS + * directly. + * + * The one wrinkle in this is that only PART of the MOCS tables are included + * in context (The GFX_MOCS_0 - GFX_MOCS_64 and the LNCFCMOCS0 - LNCFCMOCS32 + * registers). The rest are not (the settings for the other rings). + * + * This table needs to be set at system start-up because the way the table + * interacts with the contexts and the GmmLib interface. + * + * + * Implementation: + * + * The tables (one per supported platform) are defined in intel_mocs.c + * and are programmed in the first batch after the context is loaded + * (with the hardware workarounds). This will then let the usual + * context handling keep the MOCS in step. + */ + +struct intel_engine_cs; +struct intel_gt; + +void intel_mocs_init(struct intel_gt *gt); +void intel_mocs_init_engine(struct intel_engine_cs *engine); + +#endif diff --git a/drivers/gpu/drm/i915/gt/intel_ppgtt.c b/drivers/gpu/drm/i915/gt/intel_ppgtt.c new file mode 100644 index 000000000..46d9aceda --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_ppgtt.c @@ -0,0 +1,313 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2020 Intel Corporation + */ + +#include <linux/slab.h> + +#include "i915_trace.h" +#include "intel_gtt.h" +#include "gen6_ppgtt.h" +#include "gen8_ppgtt.h" + +struct i915_page_table *alloc_pt(struct i915_address_space *vm) +{ + struct i915_page_table *pt; + + pt = kmalloc(sizeof(*pt), I915_GFP_ALLOW_FAIL); + if (unlikely(!pt)) + return ERR_PTR(-ENOMEM); + + pt->base = vm->alloc_pt_dma(vm, I915_GTT_PAGE_SIZE_4K); + if (IS_ERR(pt->base)) { + kfree(pt); + return ERR_PTR(-ENOMEM); + } + + atomic_set(&pt->used, 0); + return pt; +} + +struct i915_page_directory *__alloc_pd(int count) +{ + struct i915_page_directory *pd; + + pd = kzalloc(sizeof(*pd), I915_GFP_ALLOW_FAIL); + if (unlikely(!pd)) + return NULL; + + pd->entry = kcalloc(count, sizeof(*pd->entry), I915_GFP_ALLOW_FAIL); + if (unlikely(!pd->entry)) { + kfree(pd); + return NULL; + } + + spin_lock_init(&pd->lock); + return pd; +} + +struct i915_page_directory *alloc_pd(struct i915_address_space *vm) +{ + struct i915_page_directory *pd; + + pd = __alloc_pd(I915_PDES); + if (unlikely(!pd)) + return ERR_PTR(-ENOMEM); + + pd->pt.base = vm->alloc_pt_dma(vm, I915_GTT_PAGE_SIZE_4K); + if (IS_ERR(pd->pt.base)) { + kfree(pd->entry); + kfree(pd); + return ERR_PTR(-ENOMEM); + } + + return pd; +} + +void free_px(struct i915_address_space *vm, struct i915_page_table *pt, int lvl) +{ + BUILD_BUG_ON(offsetof(struct i915_page_directory, pt)); + + if (lvl) { + struct i915_page_directory *pd = + container_of(pt, typeof(*pd), pt); + kfree(pd->entry); + } + + if (pt->base) + i915_gem_object_put(pt->base); + + kfree(pt); +} + +static inline void +write_dma_entry(struct drm_i915_gem_object * const pdma, + const unsigned short idx, + const u64 encoded_entry) +{ + u64 * const vaddr = kmap_atomic(__px_page(pdma)); + + vaddr[idx] = encoded_entry; + clflush_cache_range(&vaddr[idx], sizeof(u64)); + kunmap_atomic(vaddr); +} + +void +__set_pd_entry(struct i915_page_directory * const pd, + const unsigned short idx, + struct i915_page_table * const to, + u64 (*encode)(const dma_addr_t, const enum i915_cache_level)) +{ + /* Each thread pre-pins the pd, and we may have a thread per pde. */ + GEM_BUG_ON(atomic_read(px_used(pd)) > NALLOC * I915_PDES); + + atomic_inc(px_used(pd)); + pd->entry[idx] = to; + write_dma_entry(px_base(pd), idx, encode(px_dma(to), I915_CACHE_LLC)); +} + +void +clear_pd_entry(struct i915_page_directory * const pd, + const unsigned short idx, + const struct drm_i915_gem_object * const scratch) +{ + GEM_BUG_ON(atomic_read(px_used(pd)) == 0); + + write_dma_entry(px_base(pd), idx, scratch->encode); + pd->entry[idx] = NULL; + atomic_dec(px_used(pd)); +} + +bool +release_pd_entry(struct i915_page_directory * const pd, + const unsigned short idx, + struct i915_page_table * const pt, + const struct drm_i915_gem_object * const scratch) +{ + bool free = false; + + if (atomic_add_unless(&pt->used, -1, 1)) + return false; + + spin_lock(&pd->lock); + if (atomic_dec_and_test(&pt->used)) { + clear_pd_entry(pd, idx, scratch); + free = true; + } + spin_unlock(&pd->lock); + + return free; +} + +int i915_ppgtt_init_hw(struct intel_gt *gt) +{ + struct drm_i915_private *i915 = gt->i915; + + gtt_write_workarounds(gt); + + if (IS_GEN(i915, 6)) + gen6_ppgtt_enable(gt); + else if (IS_GEN(i915, 7)) + gen7_ppgtt_enable(gt); + + return 0; +} + +static struct i915_ppgtt * +__ppgtt_create(struct intel_gt *gt) +{ + if (INTEL_GEN(gt->i915) < 8) + return gen6_ppgtt_create(gt); + else + return gen8_ppgtt_create(gt); +} + +struct i915_ppgtt *i915_ppgtt_create(struct intel_gt *gt) +{ + struct i915_ppgtt *ppgtt; + + ppgtt = __ppgtt_create(gt); + if (IS_ERR(ppgtt)) + return ppgtt; + + trace_i915_ppgtt_create(&ppgtt->vm); + + return ppgtt; +} + +void ppgtt_bind_vma(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + struct i915_vma *vma, + enum i915_cache_level cache_level, + u32 flags) +{ + u32 pte_flags; + + if (!test_bit(I915_VMA_ALLOC_BIT, __i915_vma_flags(vma))) { + vm->allocate_va_range(vm, stash, vma->node.start, vma->size); + set_bit(I915_VMA_ALLOC_BIT, __i915_vma_flags(vma)); + } + + /* Applicable to VLV, and gen8+ */ + pte_flags = 0; + if (i915_gem_object_is_readonly(vma->obj)) + pte_flags |= PTE_READ_ONLY; + + vm->insert_entries(vm, vma, cache_level, pte_flags); + wmb(); +} + +void ppgtt_unbind_vma(struct i915_address_space *vm, struct i915_vma *vma) +{ + if (test_and_clear_bit(I915_VMA_ALLOC_BIT, __i915_vma_flags(vma))) + vm->clear_range(vm, vma->node.start, vma->size); +} + +static unsigned long pd_count(u64 size, int shift) +{ + /* Beware later misalignment */ + return (size + 2 * (BIT_ULL(shift) - 1)) >> shift; +} + +int i915_vm_alloc_pt_stash(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash, + u64 size) +{ + unsigned long count; + int shift, n; + + shift = vm->pd_shift; + if (!shift) + return 0; + + count = pd_count(size, shift); + while (count--) { + struct i915_page_table *pt; + + pt = alloc_pt(vm); + if (IS_ERR(pt)) { + i915_vm_free_pt_stash(vm, stash); + return PTR_ERR(pt); + } + + pt->stash = stash->pt[0]; + stash->pt[0] = pt; + } + + for (n = 1; n < vm->top; n++) { + shift += ilog2(I915_PDES); /* Each PD holds 512 entries */ + count = pd_count(size, shift); + while (count--) { + struct i915_page_directory *pd; + + pd = alloc_pd(vm); + if (IS_ERR(pd)) { + i915_vm_free_pt_stash(vm, stash); + return PTR_ERR(pd); + } + + pd->pt.stash = stash->pt[1]; + stash->pt[1] = &pd->pt; + } + } + + return 0; +} + +int i915_vm_pin_pt_stash(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash) +{ + struct i915_page_table *pt; + int n, err; + + for (n = 0; n < ARRAY_SIZE(stash->pt); n++) { + for (pt = stash->pt[n]; pt; pt = pt->stash) { + err = pin_pt_dma(vm, pt->base); + if (err) + return err; + } + } + + return 0; +} + +void i915_vm_free_pt_stash(struct i915_address_space *vm, + struct i915_vm_pt_stash *stash) +{ + struct i915_page_table *pt; + int n; + + for (n = 0; n < ARRAY_SIZE(stash->pt); n++) { + while ((pt = stash->pt[n])) { + stash->pt[n] = pt->stash; + free_px(vm, pt, n); + } + } +} + +int ppgtt_set_pages(struct i915_vma *vma) +{ + GEM_BUG_ON(vma->pages); + + vma->pages = vma->obj->mm.pages; + vma->page_sizes = vma->obj->mm.page_sizes; + + return 0; +} + +void ppgtt_init(struct i915_ppgtt *ppgtt, struct intel_gt *gt) +{ + struct drm_i915_private *i915 = gt->i915; + + ppgtt->vm.gt = gt; + ppgtt->vm.i915 = i915; + ppgtt->vm.dma = &i915->drm.pdev->dev; + ppgtt->vm.total = BIT_ULL(INTEL_INFO(i915)->ppgtt_size); + + i915_address_space_init(&ppgtt->vm, VM_CLASS_PPGTT); + + ppgtt->vm.vma_ops.bind_vma = ppgtt_bind_vma; + ppgtt->vm.vma_ops.unbind_vma = ppgtt_unbind_vma; + ppgtt->vm.vma_ops.set_pages = ppgtt_set_pages; + ppgtt->vm.vma_ops.clear_pages = clear_pages; +} diff --git a/drivers/gpu/drm/i915/gt/intel_rc6.c b/drivers/gpu/drm/i915/gt/intel_rc6.c new file mode 100644 index 000000000..d7b8e4457 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_rc6.c @@ -0,0 +1,794 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#include <linux/pm_runtime.h> + +#include "i915_drv.h" +#include "i915_vgpu.h" +#include "intel_gt.h" +#include "intel_gt_pm.h" +#include "intel_rc6.h" +#include "intel_sideband.h" + +/** + * DOC: RC6 + * + * RC6 is a special power stage which allows the GPU to enter an very + * low-voltage mode when idle, using down to 0V while at this stage. This + * stage is entered automatically when the GPU is idle when RC6 support is + * enabled, and as soon as new workload arises GPU wakes up automatically as + * well. + * + * There are different RC6 modes available in Intel GPU, which differentiate + * among each other with the latency required to enter and leave RC6 and + * voltage consumed by the GPU in different states. + * + * The combination of the following flags define which states GPU is allowed + * to enter, while RC6 is the normal RC6 state, RC6p is the deep RC6, and + * RC6pp is deepest RC6. Their support by hardware varies according to the + * GPU, BIOS, chipset and platform. RC6 is usually the safest one and the one + * which brings the most power savings; deeper states save more power, but + * require higher latency to switch to and wake up. + */ + +static struct intel_gt *rc6_to_gt(struct intel_rc6 *rc6) +{ + return container_of(rc6, struct intel_gt, rc6); +} + +static struct intel_uncore *rc6_to_uncore(struct intel_rc6 *rc) +{ + return rc6_to_gt(rc)->uncore; +} + +static struct drm_i915_private *rc6_to_i915(struct intel_rc6 *rc) +{ + return rc6_to_gt(rc)->i915; +} + +static inline void set(struct intel_uncore *uncore, i915_reg_t reg, u32 val) +{ + intel_uncore_write_fw(uncore, reg, val); +} + +static void gen11_rc6_enable(struct intel_rc6 *rc6) +{ + struct intel_gt *gt = rc6_to_gt(rc6); + struct intel_uncore *uncore = gt->uncore; + struct intel_engine_cs *engine; + enum intel_engine_id id; + u32 pg_enable; + int i; + + /* 2b: Program RC6 thresholds.*/ + set(uncore, GEN6_RC6_WAKE_RATE_LIMIT, 54 << 16 | 85); + set(uncore, GEN10_MEDIA_WAKE_RATE_LIMIT, 150); + + set(uncore, GEN6_RC_EVALUATION_INTERVAL, 125000); /* 12500 * 1280ns */ + set(uncore, GEN6_RC_IDLE_HYSTERSIS, 25); /* 25 * 1280ns */ + for_each_engine(engine, rc6_to_gt(rc6), id) + set(uncore, RING_MAX_IDLE(engine->mmio_base), 10); + + set(uncore, GUC_MAX_IDLE_COUNT, 0xA); + + set(uncore, GEN6_RC_SLEEP, 0); + + set(uncore, GEN6_RC6_THRESHOLD, 50000); /* 50/125ms per EI */ + + /* + * 2c: Program Coarse Power Gating Policies. + * + * Bspec's guidance is to use 25us (really 25 * 1280ns) here. What we + * use instead is a more conservative estimate for the maximum time + * it takes us to service a CS interrupt and submit a new ELSP - that + * is the time which the GPU is idle waiting for the CPU to select the + * next request to execute. If the idle hysteresis is less than that + * interrupt service latency, the hardware will automatically gate + * the power well and we will then incur the wake up cost on top of + * the service latency. A similar guide from plane_state is that we + * do not want the enable hysteresis to less than the wakeup latency. + * + * igt/gem_exec_nop/sequential provides a rough estimate for the + * service latency, and puts it under 10us for Icelake, similar to + * Broadwell+, To be conservative, we want to factor in a context + * switch on top (due to ksoftirqd). + */ + set(uncore, GEN9_MEDIA_PG_IDLE_HYSTERESIS, 60); + set(uncore, GEN9_RENDER_PG_IDLE_HYSTERESIS, 60); + + /* 3a: Enable RC6 */ + rc6->ctl_enable = + GEN6_RC_CTL_HW_ENABLE | + GEN6_RC_CTL_RC6_ENABLE | + GEN6_RC_CTL_EI_MODE(1); + + pg_enable = + GEN9_RENDER_PG_ENABLE | + GEN9_MEDIA_PG_ENABLE | + GEN11_MEDIA_SAMPLER_PG_ENABLE; + + if (INTEL_GEN(gt->i915) >= 12) { + for (i = 0; i < I915_MAX_VCS; i++) + if (HAS_ENGINE(gt, _VCS(i))) + pg_enable |= (VDN_HCP_POWERGATE_ENABLE(i) | + VDN_MFX_POWERGATE_ENABLE(i)); + } + + set(uncore, GEN9_PG_ENABLE, pg_enable); +} + +static void gen9_rc6_enable(struct intel_rc6 *rc6) +{ + struct intel_uncore *uncore = rc6_to_uncore(rc6); + struct intel_engine_cs *engine; + enum intel_engine_id id; + + /* 2b: Program RC6 thresholds.*/ + if (INTEL_GEN(rc6_to_i915(rc6)) >= 10) { + set(uncore, GEN6_RC6_WAKE_RATE_LIMIT, 54 << 16 | 85); + set(uncore, GEN10_MEDIA_WAKE_RATE_LIMIT, 150); + } else if (IS_SKYLAKE(rc6_to_i915(rc6))) { + /* + * WaRsDoubleRc6WrlWithCoarsePowerGating:skl Doubling WRL only + * when CPG is enabled + */ + set(uncore, GEN6_RC6_WAKE_RATE_LIMIT, 108 << 16); + } else { + set(uncore, GEN6_RC6_WAKE_RATE_LIMIT, 54 << 16); + } + + set(uncore, GEN6_RC_EVALUATION_INTERVAL, 125000); /* 12500 * 1280ns */ + set(uncore, GEN6_RC_IDLE_HYSTERSIS, 25); /* 25 * 1280ns */ + for_each_engine(engine, rc6_to_gt(rc6), id) + set(uncore, RING_MAX_IDLE(engine->mmio_base), 10); + + set(uncore, GUC_MAX_IDLE_COUNT, 0xA); + + set(uncore, GEN6_RC_SLEEP, 0); + + /* + * 2c: Program Coarse Power Gating Policies. + * + * Bspec's guidance is to use 25us (really 25 * 1280ns) here. What we + * use instead is a more conservative estimate for the maximum time + * it takes us to service a CS interrupt and submit a new ELSP - that + * is the time which the GPU is idle waiting for the CPU to select the + * next request to execute. If the idle hysteresis is less than that + * interrupt service latency, the hardware will automatically gate + * the power well and we will then incur the wake up cost on top of + * the service latency. A similar guide from plane_state is that we + * do not want the enable hysteresis to less than the wakeup latency. + * + * igt/gem_exec_nop/sequential provides a rough estimate for the + * service latency, and puts it around 10us for Broadwell (and other + * big core) and around 40us for Broxton (and other low power cores). + * [Note that for legacy ringbuffer submission, this is less than 1us!] + * However, the wakeup latency on Broxton is closer to 100us. To be + * conservative, we have to factor in a context switch on top (due + * to ksoftirqd). + */ + set(uncore, GEN9_MEDIA_PG_IDLE_HYSTERESIS, 250); + set(uncore, GEN9_RENDER_PG_IDLE_HYSTERESIS, 250); + + /* 3a: Enable RC6 */ + set(uncore, GEN6_RC6_THRESHOLD, 37500); /* 37.5/125ms per EI */ + + + rc6->ctl_enable = + GEN6_RC_CTL_HW_ENABLE | + GEN6_RC_CTL_RC6_ENABLE | + GEN6_RC_CTL_EI_MODE(1); + + /* + * WaRsDisableCoarsePowerGating:skl,cnl + * - Render/Media PG need to be disabled with RC6. + */ + if (!NEEDS_WaRsDisableCoarsePowerGating(rc6_to_i915(rc6))) + set(uncore, GEN9_PG_ENABLE, + GEN9_RENDER_PG_ENABLE | GEN9_MEDIA_PG_ENABLE); +} + +static void gen8_rc6_enable(struct intel_rc6 *rc6) +{ + struct intel_uncore *uncore = rc6_to_uncore(rc6); + struct intel_engine_cs *engine; + enum intel_engine_id id; + + /* 2b: Program RC6 thresholds.*/ + set(uncore, GEN6_RC6_WAKE_RATE_LIMIT, 40 << 16); + set(uncore, GEN6_RC_EVALUATION_INTERVAL, 125000); /* 12500 * 1280ns */ + set(uncore, GEN6_RC_IDLE_HYSTERSIS, 25); /* 25 * 1280ns */ + for_each_engine(engine, rc6_to_gt(rc6), id) + set(uncore, RING_MAX_IDLE(engine->mmio_base), 10); + set(uncore, GEN6_RC_SLEEP, 0); + set(uncore, GEN6_RC6_THRESHOLD, 625); /* 800us/1.28 for TO */ + + /* 3: Enable RC6 */ + rc6->ctl_enable = + GEN6_RC_CTL_HW_ENABLE | + GEN7_RC_CTL_TO_MODE | + GEN6_RC_CTL_RC6_ENABLE; +} + +static void gen6_rc6_enable(struct intel_rc6 *rc6) +{ + struct intel_uncore *uncore = rc6_to_uncore(rc6); + struct drm_i915_private *i915 = rc6_to_i915(rc6); + struct intel_engine_cs *engine; + enum intel_engine_id id; + u32 rc6vids, rc6_mask; + int ret; + + set(uncore, GEN6_RC1_WAKE_RATE_LIMIT, 1000 << 16); + set(uncore, GEN6_RC6_WAKE_RATE_LIMIT, 40 << 16 | 30); + set(uncore, GEN6_RC6pp_WAKE_RATE_LIMIT, 30); + set(uncore, GEN6_RC_EVALUATION_INTERVAL, 125000); + set(uncore, GEN6_RC_IDLE_HYSTERSIS, 25); + + for_each_engine(engine, rc6_to_gt(rc6), id) + set(uncore, RING_MAX_IDLE(engine->mmio_base), 10); + + set(uncore, GEN6_RC_SLEEP, 0); + set(uncore, GEN6_RC1e_THRESHOLD, 1000); + set(uncore, GEN6_RC6_THRESHOLD, 50000); + set(uncore, GEN6_RC6p_THRESHOLD, 150000); + set(uncore, GEN6_RC6pp_THRESHOLD, 64000); /* unused */ + + /* We don't use those on Haswell */ + rc6_mask = GEN6_RC_CTL_RC6_ENABLE; + if (HAS_RC6p(i915)) + rc6_mask |= GEN6_RC_CTL_RC6p_ENABLE; + if (HAS_RC6pp(i915)) + rc6_mask |= GEN6_RC_CTL_RC6pp_ENABLE; + rc6->ctl_enable = + rc6_mask | + GEN6_RC_CTL_EI_MODE(1) | + GEN6_RC_CTL_HW_ENABLE; + + rc6vids = 0; + ret = sandybridge_pcode_read(i915, GEN6_PCODE_READ_RC6VIDS, + &rc6vids, NULL); + if (IS_GEN(i915, 6) && ret) { + drm_dbg(&i915->drm, "Couldn't check for BIOS workaround\n"); + } else if (IS_GEN(i915, 6) && + (GEN6_DECODE_RC6_VID(rc6vids & 0xff) < 450)) { + drm_dbg(&i915->drm, + "You should update your BIOS. Correcting minimum rc6 voltage (%dmV->%dmV)\n", + GEN6_DECODE_RC6_VID(rc6vids & 0xff), 450); + rc6vids &= 0xffff00; + rc6vids |= GEN6_ENCODE_RC6_VID(450); + ret = sandybridge_pcode_write(i915, GEN6_PCODE_WRITE_RC6VIDS, rc6vids); + if (ret) + drm_err(&i915->drm, + "Couldn't fix incorrect rc6 voltage\n"); + } +} + +/* Check that the pcbr address is not empty. */ +static int chv_rc6_init(struct intel_rc6 *rc6) +{ + struct intel_uncore *uncore = rc6_to_uncore(rc6); + struct drm_i915_private *i915 = rc6_to_i915(rc6); + resource_size_t pctx_paddr, paddr; + resource_size_t pctx_size = 32 * SZ_1K; + u32 pcbr; + + pcbr = intel_uncore_read(uncore, VLV_PCBR); + if ((pcbr >> VLV_PCBR_ADDR_SHIFT) == 0) { + drm_dbg(&i915->drm, "BIOS didn't set up PCBR, fixing up\n"); + paddr = i915->dsm.end + 1 - pctx_size; + GEM_BUG_ON(paddr > U32_MAX); + + pctx_paddr = (paddr & ~4095); + intel_uncore_write(uncore, VLV_PCBR, pctx_paddr); + } + + return 0; +} + +static int vlv_rc6_init(struct intel_rc6 *rc6) +{ + struct drm_i915_private *i915 = rc6_to_i915(rc6); + struct intel_uncore *uncore = rc6_to_uncore(rc6); + struct drm_i915_gem_object *pctx; + resource_size_t pctx_paddr; + resource_size_t pctx_size = 24 * SZ_1K; + u32 pcbr; + + pcbr = intel_uncore_read(uncore, VLV_PCBR); + if (pcbr) { + /* BIOS set it up already, grab the pre-alloc'd space */ + resource_size_t pcbr_offset; + + pcbr_offset = (pcbr & ~4095) - i915->dsm.start; + pctx = i915_gem_object_create_stolen_for_preallocated(i915, + pcbr_offset, + pctx_size); + if (IS_ERR(pctx)) + return PTR_ERR(pctx); + + goto out; + } + + drm_dbg(&i915->drm, "BIOS didn't set up PCBR, fixing up\n"); + + /* + * From the Gunit register HAS: + * The Gfx driver is expected to program this register and ensure + * proper allocation within Gfx stolen memory. For example, this + * register should be programmed such than the PCBR range does not + * overlap with other ranges, such as the frame buffer, protected + * memory, or any other relevant ranges. + */ + pctx = i915_gem_object_create_stolen(i915, pctx_size); + if (IS_ERR(pctx)) { + drm_dbg(&i915->drm, + "not enough stolen space for PCTX, disabling\n"); + return PTR_ERR(pctx); + } + + GEM_BUG_ON(range_overflows_end_t(u64, + i915->dsm.start, + pctx->stolen->start, + U32_MAX)); + pctx_paddr = i915->dsm.start + pctx->stolen->start; + intel_uncore_write(uncore, VLV_PCBR, pctx_paddr); + +out: + rc6->pctx = pctx; + return 0; +} + +static void chv_rc6_enable(struct intel_rc6 *rc6) +{ + struct intel_uncore *uncore = rc6_to_uncore(rc6); + struct intel_engine_cs *engine; + enum intel_engine_id id; + + /* 2a: Program RC6 thresholds.*/ + set(uncore, GEN6_RC6_WAKE_RATE_LIMIT, 40 << 16); + set(uncore, GEN6_RC_EVALUATION_INTERVAL, 125000); /* 12500 * 1280ns */ + set(uncore, GEN6_RC_IDLE_HYSTERSIS, 25); /* 25 * 1280ns */ + + for_each_engine(engine, rc6_to_gt(rc6), id) + set(uncore, RING_MAX_IDLE(engine->mmio_base), 10); + set(uncore, GEN6_RC_SLEEP, 0); + + /* TO threshold set to 500 us (0x186 * 1.28 us) */ + set(uncore, GEN6_RC6_THRESHOLD, 0x186); + + /* Allows RC6 residency counter to work */ + set(uncore, VLV_COUNTER_CONTROL, + _MASKED_BIT_ENABLE(VLV_COUNT_RANGE_HIGH | + VLV_MEDIA_RC6_COUNT_EN | + VLV_RENDER_RC6_COUNT_EN)); + + /* 3: Enable RC6 */ + rc6->ctl_enable = GEN7_RC_CTL_TO_MODE; +} + +static void vlv_rc6_enable(struct intel_rc6 *rc6) +{ + struct intel_uncore *uncore = rc6_to_uncore(rc6); + struct intel_engine_cs *engine; + enum intel_engine_id id; + + set(uncore, GEN6_RC6_WAKE_RATE_LIMIT, 0x00280000); + set(uncore, GEN6_RC_EVALUATION_INTERVAL, 125000); + set(uncore, GEN6_RC_IDLE_HYSTERSIS, 25); + + for_each_engine(engine, rc6_to_gt(rc6), id) + set(uncore, RING_MAX_IDLE(engine->mmio_base), 10); + + set(uncore, GEN6_RC6_THRESHOLD, 0x557); + + /* Allows RC6 residency counter to work */ + set(uncore, VLV_COUNTER_CONTROL, + _MASKED_BIT_ENABLE(VLV_COUNT_RANGE_HIGH | + VLV_MEDIA_RC0_COUNT_EN | + VLV_RENDER_RC0_COUNT_EN | + VLV_MEDIA_RC6_COUNT_EN | + VLV_RENDER_RC6_COUNT_EN)); + + rc6->ctl_enable = + GEN7_RC_CTL_TO_MODE | VLV_RC_CTL_CTX_RST_PARALLEL; +} + +static bool bxt_check_bios_rc6_setup(struct intel_rc6 *rc6) +{ + struct intel_uncore *uncore = rc6_to_uncore(rc6); + struct drm_i915_private *i915 = rc6_to_i915(rc6); + u32 rc6_ctx_base, rc_ctl, rc_sw_target; + bool enable_rc6 = true; + + rc_ctl = intel_uncore_read(uncore, GEN6_RC_CONTROL); + rc_sw_target = intel_uncore_read(uncore, GEN6_RC_STATE); + rc_sw_target &= RC_SW_TARGET_STATE_MASK; + rc_sw_target >>= RC_SW_TARGET_STATE_SHIFT; + drm_dbg(&i915->drm, "BIOS enabled RC states: " + "HW_CTRL %s HW_RC6 %s SW_TARGET_STATE %x\n", + onoff(rc_ctl & GEN6_RC_CTL_HW_ENABLE), + onoff(rc_ctl & GEN6_RC_CTL_RC6_ENABLE), + rc_sw_target); + + if (!(intel_uncore_read(uncore, RC6_LOCATION) & RC6_CTX_IN_DRAM)) { + drm_dbg(&i915->drm, "RC6 Base location not set properly.\n"); + enable_rc6 = false; + } + + /* + * The exact context size is not known for BXT, so assume a page size + * for this check. + */ + rc6_ctx_base = + intel_uncore_read(uncore, RC6_CTX_BASE) & RC6_CTX_BASE_MASK; + if (!(rc6_ctx_base >= i915->dsm_reserved.start && + rc6_ctx_base + PAGE_SIZE < i915->dsm_reserved.end)) { + drm_dbg(&i915->drm, "RC6 Base address not as expected.\n"); + enable_rc6 = false; + } + + if (!((intel_uncore_read(uncore, PWRCTX_MAXCNT_RCSUNIT) & IDLE_TIME_MASK) > 1 && + (intel_uncore_read(uncore, PWRCTX_MAXCNT_VCSUNIT0) & IDLE_TIME_MASK) > 1 && + (intel_uncore_read(uncore, PWRCTX_MAXCNT_BCSUNIT) & IDLE_TIME_MASK) > 1 && + (intel_uncore_read(uncore, PWRCTX_MAXCNT_VECSUNIT) & IDLE_TIME_MASK) > 1)) { + drm_dbg(&i915->drm, + "Engine Idle wait time not set properly.\n"); + enable_rc6 = false; + } + + if (!intel_uncore_read(uncore, GEN8_PUSHBUS_CONTROL) || + !intel_uncore_read(uncore, GEN8_PUSHBUS_ENABLE) || + !intel_uncore_read(uncore, GEN8_PUSHBUS_SHIFT)) { + drm_dbg(&i915->drm, "Pushbus not setup properly.\n"); + enable_rc6 = false; + } + + if (!intel_uncore_read(uncore, GEN6_GFXPAUSE)) { + drm_dbg(&i915->drm, "GFX pause not setup properly.\n"); + enable_rc6 = false; + } + + if (!intel_uncore_read(uncore, GEN8_MISC_CTRL0)) { + drm_dbg(&i915->drm, "GPM control not setup properly.\n"); + enable_rc6 = false; + } + + return enable_rc6; +} + +static bool rc6_supported(struct intel_rc6 *rc6) +{ + struct drm_i915_private *i915 = rc6_to_i915(rc6); + + if (!HAS_RC6(i915)) + return false; + + if (intel_vgpu_active(i915)) + return false; + + if (is_mock_gt(rc6_to_gt(rc6))) + return false; + + if (IS_GEN9_LP(i915) && !bxt_check_bios_rc6_setup(rc6)) { + drm_notice(&i915->drm, + "RC6 and powersaving disabled by BIOS\n"); + return false; + } + + return true; +} + +static void rpm_get(struct intel_rc6 *rc6) +{ + GEM_BUG_ON(rc6->wakeref); + pm_runtime_get_sync(&rc6_to_i915(rc6)->drm.pdev->dev); + rc6->wakeref = true; +} + +static void rpm_put(struct intel_rc6 *rc6) +{ + GEM_BUG_ON(!rc6->wakeref); + pm_runtime_put(&rc6_to_i915(rc6)->drm.pdev->dev); + rc6->wakeref = false; +} + +static bool pctx_corrupted(struct intel_rc6 *rc6) +{ + struct drm_i915_private *i915 = rc6_to_i915(rc6); + + if (!NEEDS_RC6_CTX_CORRUPTION_WA(i915)) + return false; + + if (intel_uncore_read(rc6_to_uncore(rc6), GEN8_RC6_CTX_INFO)) + return false; + + drm_notice(&i915->drm, + "RC6 context corruption, disabling runtime power management\n"); + return true; +} + +static void __intel_rc6_disable(struct intel_rc6 *rc6) +{ + struct drm_i915_private *i915 = rc6_to_i915(rc6); + struct intel_uncore *uncore = rc6_to_uncore(rc6); + + intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL); + if (INTEL_GEN(i915) >= 9) + set(uncore, GEN9_PG_ENABLE, 0); + set(uncore, GEN6_RC_CONTROL, 0); + set(uncore, GEN6_RC_STATE, 0); + intel_uncore_forcewake_put(uncore, FORCEWAKE_ALL); +} + +void intel_rc6_init(struct intel_rc6 *rc6) +{ + struct drm_i915_private *i915 = rc6_to_i915(rc6); + int err; + + /* Disable runtime-pm until we can save the GPU state with rc6 pctx */ + rpm_get(rc6); + + if (!rc6_supported(rc6)) + return; + + if (IS_CHERRYVIEW(i915)) + err = chv_rc6_init(rc6); + else if (IS_VALLEYVIEW(i915)) + err = vlv_rc6_init(rc6); + else + err = 0; + + /* Sanitize rc6, ensure it is disabled before we are ready. */ + __intel_rc6_disable(rc6); + + rc6->supported = err == 0; +} + +void intel_rc6_sanitize(struct intel_rc6 *rc6) +{ + memset(rc6->prev_hw_residency, 0, sizeof(rc6->prev_hw_residency)); + + if (rc6->enabled) { /* unbalanced suspend/resume */ + rpm_get(rc6); + rc6->enabled = false; + } + + if (rc6->supported) + __intel_rc6_disable(rc6); +} + +void intel_rc6_enable(struct intel_rc6 *rc6) +{ + struct drm_i915_private *i915 = rc6_to_i915(rc6); + struct intel_uncore *uncore = rc6_to_uncore(rc6); + + if (!rc6->supported) + return; + + GEM_BUG_ON(rc6->enabled); + + intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL); + + if (IS_CHERRYVIEW(i915)) + chv_rc6_enable(rc6); + else if (IS_VALLEYVIEW(i915)) + vlv_rc6_enable(rc6); + else if (INTEL_GEN(i915) >= 11) + gen11_rc6_enable(rc6); + else if (INTEL_GEN(i915) >= 9) + gen9_rc6_enable(rc6); + else if (IS_BROADWELL(i915)) + gen8_rc6_enable(rc6); + else if (INTEL_GEN(i915) >= 6) + gen6_rc6_enable(rc6); + + rc6->manual = rc6->ctl_enable & GEN6_RC_CTL_RC6_ENABLE; + if (NEEDS_RC6_CTX_CORRUPTION_WA(i915)) + rc6->ctl_enable = 0; + + intel_uncore_forcewake_put(uncore, FORCEWAKE_ALL); + + if (unlikely(pctx_corrupted(rc6))) + return; + + /* rc6 is ready, runtime-pm is go! */ + rpm_put(rc6); + rc6->enabled = true; +} + +void intel_rc6_unpark(struct intel_rc6 *rc6) +{ + struct intel_uncore *uncore = rc6_to_uncore(rc6); + + if (!rc6->enabled) + return; + + /* Restore HW timers for automatic RC6 entry while busy */ + set(uncore, GEN6_RC_CONTROL, rc6->ctl_enable); +} + +void intel_rc6_park(struct intel_rc6 *rc6) +{ + struct intel_uncore *uncore = rc6_to_uncore(rc6); + unsigned int target; + + if (!rc6->enabled) + return; + + if (unlikely(pctx_corrupted(rc6))) { + intel_rc6_disable(rc6); + return; + } + + if (!rc6->manual) + return; + + /* Turn off the HW timers and go directly to rc6 */ + set(uncore, GEN6_RC_CONTROL, GEN6_RC_CTL_RC6_ENABLE); + + if (HAS_RC6pp(rc6_to_i915(rc6))) + target = 0x6; /* deepest rc6 */ + else if (HAS_RC6p(rc6_to_i915(rc6))) + target = 0x5; /* deep rc6 */ + else + target = 0x4; /* normal rc6 */ + set(uncore, GEN6_RC_STATE, target << RC_SW_TARGET_STATE_SHIFT); +} + +void intel_rc6_disable(struct intel_rc6 *rc6) +{ + if (!rc6->enabled) + return; + + rpm_get(rc6); + rc6->enabled = false; + + __intel_rc6_disable(rc6); +} + +void intel_rc6_fini(struct intel_rc6 *rc6) +{ + struct drm_i915_gem_object *pctx; + + intel_rc6_disable(rc6); + + pctx = fetch_and_zero(&rc6->pctx); + if (pctx) + i915_gem_object_put(pctx); + + if (rc6->wakeref) + rpm_put(rc6); +} + +static u64 vlv_residency_raw(struct intel_uncore *uncore, const i915_reg_t reg) +{ + u32 lower, upper, tmp; + int loop = 2; + + /* + * The register accessed do not need forcewake. We borrow + * uncore lock to prevent concurrent access to range reg. + */ + lockdep_assert_held(&uncore->lock); + + /* + * vlv and chv residency counters are 40 bits in width. + * With a control bit, we can choose between upper or lower + * 32bit window into this counter. + * + * Although we always use the counter in high-range mode elsewhere, + * userspace may attempt to read the value before rc6 is initialised, + * before we have set the default VLV_COUNTER_CONTROL value. So always + * set the high bit to be safe. + */ + set(uncore, VLV_COUNTER_CONTROL, + _MASKED_BIT_ENABLE(VLV_COUNT_RANGE_HIGH)); + upper = intel_uncore_read_fw(uncore, reg); + do { + tmp = upper; + + set(uncore, VLV_COUNTER_CONTROL, + _MASKED_BIT_DISABLE(VLV_COUNT_RANGE_HIGH)); + lower = intel_uncore_read_fw(uncore, reg); + + set(uncore, VLV_COUNTER_CONTROL, + _MASKED_BIT_ENABLE(VLV_COUNT_RANGE_HIGH)); + upper = intel_uncore_read_fw(uncore, reg); + } while (upper != tmp && --loop); + + /* + * Everywhere else we always use VLV_COUNTER_CONTROL with the + * VLV_COUNT_RANGE_HIGH bit set - so it is safe to leave it set + * now. + */ + + return lower | (u64)upper << 8; +} + +u64 intel_rc6_residency_ns(struct intel_rc6 *rc6, const i915_reg_t reg) +{ + struct drm_i915_private *i915 = rc6_to_i915(rc6); + struct intel_uncore *uncore = rc6_to_uncore(rc6); + u64 time_hw, prev_hw, overflow_hw; + unsigned int fw_domains; + unsigned long flags; + unsigned int i; + u32 mul, div; + + if (!rc6->supported) + return 0; + + /* + * Store previous hw counter values for counter wrap-around handling. + * + * There are only four interesting registers and they live next to each + * other so we can use the relative address, compared to the smallest + * one as the index into driver storage. + */ + i = (i915_mmio_reg_offset(reg) - + i915_mmio_reg_offset(GEN6_GT_GFX_RC6_LOCKED)) / sizeof(u32); + if (drm_WARN_ON_ONCE(&i915->drm, i >= ARRAY_SIZE(rc6->cur_residency))) + return 0; + + fw_domains = intel_uncore_forcewake_for_reg(uncore, reg, FW_REG_READ); + + spin_lock_irqsave(&uncore->lock, flags); + intel_uncore_forcewake_get__locked(uncore, fw_domains); + + /* On VLV and CHV, residency time is in CZ units rather than 1.28us */ + if (IS_VALLEYVIEW(i915) || IS_CHERRYVIEW(i915)) { + mul = 1000000; + div = i915->czclk_freq; + overflow_hw = BIT_ULL(40); + time_hw = vlv_residency_raw(uncore, reg); + } else { + /* 833.33ns units on Gen9LP, 1.28us elsewhere. */ + if (IS_GEN9_LP(i915)) { + mul = 10000; + div = 12; + } else { + mul = 1280; + div = 1; + } + + overflow_hw = BIT_ULL(32); + time_hw = intel_uncore_read_fw(uncore, reg); + } + + /* + * Counter wrap handling. + * + * But relying on a sufficient frequency of queries otherwise counters + * can still wrap. + */ + prev_hw = rc6->prev_hw_residency[i]; + rc6->prev_hw_residency[i] = time_hw; + + /* RC6 delta from last sample. */ + if (time_hw >= prev_hw) + time_hw -= prev_hw; + else + time_hw += overflow_hw - prev_hw; + + /* Add delta to RC6 extended raw driver copy. */ + time_hw += rc6->cur_residency[i]; + rc6->cur_residency[i] = time_hw; + + intel_uncore_forcewake_put__locked(uncore, fw_domains); + spin_unlock_irqrestore(&uncore->lock, flags); + + return mul_u64_u32_div(time_hw, mul, div); +} + +u64 intel_rc6_residency_us(struct intel_rc6 *rc6, i915_reg_t reg) +{ + return DIV_ROUND_UP_ULL(intel_rc6_residency_ns(rc6, reg), 1000); +} + +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) +#include "selftest_rc6.c" +#endif diff --git a/drivers/gpu/drm/i915/gt/intel_rc6.h b/drivers/gpu/drm/i915/gt/intel_rc6.h new file mode 100644 index 000000000..9f0f23fca --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_rc6.h @@ -0,0 +1,28 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#ifndef INTEL_RC6_H +#define INTEL_RC6_H + +#include "i915_reg.h" + +struct intel_engine_cs; +struct intel_rc6; + +void intel_rc6_init(struct intel_rc6 *rc6); +void intel_rc6_fini(struct intel_rc6 *rc6); + +void intel_rc6_unpark(struct intel_rc6 *rc6); +void intel_rc6_park(struct intel_rc6 *rc6); + +void intel_rc6_sanitize(struct intel_rc6 *rc6); +void intel_rc6_enable(struct intel_rc6 *rc6); +void intel_rc6_disable(struct intel_rc6 *rc6); + +u64 intel_rc6_residency_ns(struct intel_rc6 *rc6, i915_reg_t reg); +u64 intel_rc6_residency_us(struct intel_rc6 *rc6, i915_reg_t reg); + +#endif /* INTEL_RC6_H */ diff --git a/drivers/gpu/drm/i915/gt/intel_rc6_types.h b/drivers/gpu/drm/i915/gt/intel_rc6_types.h new file mode 100644 index 000000000..bfbb623f7 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_rc6_types.h @@ -0,0 +1,31 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#ifndef INTEL_RC6_TYPES_H +#define INTEL_RC6_TYPES_H + +#include <linux/spinlock.h> +#include <linux/types.h> + +#include "intel_engine_types.h" + +struct drm_i915_gem_object; + +struct intel_rc6 { + u64 prev_hw_residency[4]; + u64 cur_residency[4]; + + u32 ctl_enable; + + struct drm_i915_gem_object *pctx; + + bool supported : 1; + bool enabled : 1; + bool manual : 1; + bool wakeref : 1; +}; + +#endif /* INTEL_RC6_TYPES_H */ diff --git a/drivers/gpu/drm/i915/gt/intel_renderstate.c b/drivers/gpu/drm/i915/gt/intel_renderstate.c new file mode 100644 index 000000000..ea2a77c7b --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_renderstate.c @@ -0,0 +1,272 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Mika Kuoppala <mika.kuoppala@intel.com> + * + */ + +#include "i915_drv.h" +#include "intel_renderstate.h" +#include "gt/intel_context.h" +#include "intel_ring.h" + +static const struct intel_renderstate_rodata * +render_state_get_rodata(const struct intel_engine_cs *engine) +{ + if (engine->class != RENDER_CLASS) + return NULL; + + switch (INTEL_GEN(engine->i915)) { + case 6: + return &gen6_null_state; + case 7: + return &gen7_null_state; + case 8: + return &gen8_null_state; + case 9: + return &gen9_null_state; + } + + return NULL; +} + +/* + * Macro to add commands to auxiliary batch. + * This macro only checks for page overflow before inserting the commands, + * this is sufficient as the null state generator makes the final batch + * with two passes to build command and state separately. At this point + * the size of both are known and it compacts them by relocating the state + * right after the commands taking care of alignment so we should sufficient + * space below them for adding new commands. + */ +#define OUT_BATCH(batch, i, val) \ + do { \ + if ((i) >= PAGE_SIZE / sizeof(u32)) \ + goto out; \ + (batch)[(i)++] = (val); \ + } while(0) + +static int render_state_setup(struct intel_renderstate *so, + struct drm_i915_private *i915) +{ + const struct intel_renderstate_rodata *rodata = so->rodata; + unsigned int i = 0, reloc_index = 0; + int ret = -EINVAL; + u32 *d; + + d = i915_gem_object_pin_map(so->vma->obj, I915_MAP_WB); + if (IS_ERR(d)) + return PTR_ERR(d); + + while (i < rodata->batch_items) { + u32 s = rodata->batch[i]; + + if (i * 4 == rodata->reloc[reloc_index]) { + u64 r = s + so->vma->node.start; + s = lower_32_bits(r); + if (HAS_64BIT_RELOC(i915)) { + if (i + 1 >= rodata->batch_items || + rodata->batch[i + 1] != 0) + goto out; + + d[i++] = s; + s = upper_32_bits(r); + } + + reloc_index++; + } + + d[i++] = s; + } + + if (rodata->reloc[reloc_index] != -1) { + drm_err(&i915->drm, "only %d relocs resolved\n", reloc_index); + goto out; + } + + so->batch_offset = i915_ggtt_offset(so->vma); + so->batch_size = rodata->batch_items * sizeof(u32); + + while (i % CACHELINE_DWORDS) + OUT_BATCH(d, i, MI_NOOP); + + so->aux_offset = i * sizeof(u32); + + if (HAS_POOLED_EU(i915)) { + /* + * We always program 3x6 pool config but depending upon which + * subslice is disabled HW drops down to appropriate config + * shown below. + * + * In the below table 2x6 config always refers to + * fused-down version, native 2x6 is not available and can + * be ignored + * + * SNo subslices config eu pool configuration + * ----------------------------------------------------------- + * 1 3 subslices enabled (3x6) - 0x00777000 (9+9) + * 2 ss0 disabled (2x6) - 0x00777000 (3+9) + * 3 ss1 disabled (2x6) - 0x00770000 (6+6) + * 4 ss2 disabled (2x6) - 0x00007000 (9+3) + */ + u32 eu_pool_config = 0x00777000; + + OUT_BATCH(d, i, GEN9_MEDIA_POOL_STATE); + OUT_BATCH(d, i, GEN9_MEDIA_POOL_ENABLE); + OUT_BATCH(d, i, eu_pool_config); + OUT_BATCH(d, i, 0); + OUT_BATCH(d, i, 0); + OUT_BATCH(d, i, 0); + } + + OUT_BATCH(d, i, MI_BATCH_BUFFER_END); + so->aux_size = i * sizeof(u32) - so->aux_offset; + so->aux_offset += so->batch_offset; + /* + * Since we are sending length, we need to strictly conform to + * all requirements. For Gen2 this must be a multiple of 8. + */ + so->aux_size = ALIGN(so->aux_size, 8); + + ret = 0; +out: + __i915_gem_object_flush_map(so->vma->obj, 0, i * sizeof(u32)); + __i915_gem_object_release_map(so->vma->obj); + return ret; +} + +#undef OUT_BATCH + +int intel_renderstate_init(struct intel_renderstate *so, + struct intel_context *ce) +{ + struct intel_engine_cs *engine = ce->engine; + struct drm_i915_gem_object *obj = NULL; + int err; + + memset(so, 0, sizeof(*so)); + + so->rodata = render_state_get_rodata(engine); + if (so->rodata) { + if (so->rodata->batch_items * 4 > PAGE_SIZE) + return -EINVAL; + + obj = i915_gem_object_create_internal(engine->i915, PAGE_SIZE); + if (IS_ERR(obj)) + return PTR_ERR(obj); + + so->vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL); + if (IS_ERR(so->vma)) { + err = PTR_ERR(so->vma); + goto err_obj; + } + } + + i915_gem_ww_ctx_init(&so->ww, true); +retry: + err = intel_context_pin_ww(ce, &so->ww); + if (err) + goto err_fini; + + /* return early if there's nothing to setup */ + if (!err && !so->rodata) + return 0; + + err = i915_gem_object_lock(so->vma->obj, &so->ww); + if (err) + goto err_context; + + err = i915_vma_pin(so->vma, 0, 0, PIN_GLOBAL | PIN_HIGH); + if (err) + goto err_context; + + err = render_state_setup(so, engine->i915); + if (err) + goto err_unpin; + + return 0; + +err_unpin: + i915_vma_unpin(so->vma); +err_context: + intel_context_unpin(ce); +err_fini: + if (err == -EDEADLK) { + err = i915_gem_ww_ctx_backoff(&so->ww); + if (!err) + goto retry; + } + i915_gem_ww_ctx_fini(&so->ww); +err_obj: + if (obj) + i915_gem_object_put(obj); + so->vma = NULL; + return err; +} + +int intel_renderstate_emit(struct intel_renderstate *so, + struct i915_request *rq) +{ + struct intel_engine_cs *engine = rq->engine; + int err; + + if (!so->vma) + return 0; + + err = i915_request_await_object(rq, so->vma->obj, false); + if (err == 0) + err = i915_vma_move_to_active(so->vma, rq, 0); + if (err) + return err; + + err = engine->emit_bb_start(rq, + so->batch_offset, so->batch_size, + I915_DISPATCH_SECURE); + if (err) + return err; + + if (so->aux_size > 8) { + err = engine->emit_bb_start(rq, + so->aux_offset, so->aux_size, + I915_DISPATCH_SECURE); + if (err) + return err; + } + + return 0; +} + +void intel_renderstate_fini(struct intel_renderstate *so, + struct intel_context *ce) +{ + if (so->vma) { + i915_vma_unpin(so->vma); + i915_vma_close(so->vma); + } + + intel_context_unpin(ce); + i915_gem_ww_ctx_fini(&so->ww); + + if (so->vma) + i915_gem_object_put(so->vma->obj); +} diff --git a/drivers/gpu/drm/i915/gt/intel_renderstate.h b/drivers/gpu/drm/i915/gt/intel_renderstate.h new file mode 100644 index 000000000..713aa1e86 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_renderstate.h @@ -0,0 +1,69 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef _INTEL_RENDERSTATE_H_ +#define _INTEL_RENDERSTATE_H_ + +#include <linux/types.h> +#include "i915_gem.h" + +struct i915_request; +struct intel_context; +struct i915_vma; + +struct intel_renderstate_rodata { + const u32 *reloc; + const u32 *batch; + const u32 batch_items; +}; + +#define RO_RENDERSTATE(_g) \ + const struct intel_renderstate_rodata gen ## _g ## _null_state = { \ + .reloc = gen ## _g ## _null_state_relocs, \ + .batch = gen ## _g ## _null_state_batch, \ + .batch_items = sizeof(gen ## _g ## _null_state_batch)/4, \ + } + +extern const struct intel_renderstate_rodata gen6_null_state; +extern const struct intel_renderstate_rodata gen7_null_state; +extern const struct intel_renderstate_rodata gen8_null_state; +extern const struct intel_renderstate_rodata gen9_null_state; + +struct intel_renderstate { + struct i915_gem_ww_ctx ww; + const struct intel_renderstate_rodata *rodata; + struct i915_vma *vma; + u32 batch_offset; + u32 batch_size; + u32 aux_offset; + u32 aux_size; +}; + +int intel_renderstate_init(struct intel_renderstate *so, + struct intel_context *ce); +int intel_renderstate_emit(struct intel_renderstate *so, + struct i915_request *rq); +void intel_renderstate_fini(struct intel_renderstate *so, + struct intel_context *ce); + +#endif /* _INTEL_RENDERSTATE_H_ */ diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c new file mode 100644 index 000000000..00b5912a8 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -0,0 +1,1444 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2008-2018 Intel Corporation + */ + +#include <linux/sched/mm.h> +#include <linux/stop_machine.h> + +#include "display/intel_display_types.h" +#include "display/intel_overlay.h" + +#include "gem/i915_gem_context.h" + +#include "i915_drv.h" +#include "i915_gpu_error.h" +#include "i915_irq.h" +#include "intel_breadcrumbs.h" +#include "intel_engine_pm.h" +#include "intel_gt.h" +#include "intel_gt_pm.h" +#include "intel_reset.h" + +#include "uc/intel_guc.h" +#include "uc/intel_guc_submission.h" + +#define RESET_MAX_RETRIES 3 + +/* XXX How to handle concurrent GGTT updates using tiling registers? */ +#define RESET_UNDER_STOP_MACHINE 0 + +static void rmw_set_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 set) +{ + intel_uncore_rmw_fw(uncore, reg, 0, set); +} + +static void rmw_clear_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 clr) +{ + intel_uncore_rmw_fw(uncore, reg, clr, 0); +} + +static void engine_skip_context(struct i915_request *rq) +{ + struct intel_engine_cs *engine = rq->engine; + struct intel_context *hung_ctx = rq->context; + + if (!i915_request_is_active(rq)) + return; + + lockdep_assert_held(&engine->active.lock); + list_for_each_entry_continue(rq, &engine->active.requests, sched.link) + if (rq->context == hung_ctx) { + i915_request_set_error_once(rq, -EIO); + __i915_request_skip(rq); + } +} + +static void client_mark_guilty(struct i915_gem_context *ctx, bool banned) +{ + struct drm_i915_file_private *file_priv = ctx->file_priv; + unsigned long prev_hang; + unsigned int score; + + if (IS_ERR_OR_NULL(file_priv)) + return; + + score = 0; + if (banned) + score = I915_CLIENT_SCORE_CONTEXT_BAN; + + prev_hang = xchg(&file_priv->hang_timestamp, jiffies); + if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES)) + score += I915_CLIENT_SCORE_HANG_FAST; + + if (score) { + atomic_add(score, &file_priv->ban_score); + + drm_dbg(&ctx->i915->drm, + "client %s: gained %u ban score, now %u\n", + ctx->name, score, + atomic_read(&file_priv->ban_score)); + } +} + +static bool mark_guilty(struct i915_request *rq) +{ + struct i915_gem_context *ctx; + unsigned long prev_hang; + bool banned; + int i; + + if (intel_context_is_closed(rq->context)) { + intel_context_set_banned(rq->context); + return true; + } + + rcu_read_lock(); + ctx = rcu_dereference(rq->context->gem_context); + if (ctx && !kref_get_unless_zero(&ctx->ref)) + ctx = NULL; + rcu_read_unlock(); + if (!ctx) + return intel_context_is_banned(rq->context); + + atomic_inc(&ctx->guilty_count); + + /* Cool contexts are too cool to be banned! (Used for reset testing.) */ + if (!i915_gem_context_is_bannable(ctx)) { + banned = false; + goto out; + } + + drm_notice(&ctx->i915->drm, + "%s context reset due to GPU hang\n", + ctx->name); + + /* Record the timestamp for the last N hangs */ + prev_hang = ctx->hang_timestamp[0]; + for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp) - 1; i++) + ctx->hang_timestamp[i] = ctx->hang_timestamp[i + 1]; + ctx->hang_timestamp[i] = jiffies; + + /* If we have hung N+1 times in rapid succession, we ban the context! */ + banned = !i915_gem_context_is_recoverable(ctx); + if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES)) + banned = true; + if (banned) { + drm_dbg(&ctx->i915->drm, "context %s: guilty %d, banned\n", + ctx->name, atomic_read(&ctx->guilty_count)); + intel_context_set_banned(rq->context); + } + + client_mark_guilty(ctx, banned); + +out: + i915_gem_context_put(ctx); + return banned; +} + +static void mark_innocent(struct i915_request *rq) +{ + struct i915_gem_context *ctx; + + rcu_read_lock(); + ctx = rcu_dereference(rq->context->gem_context); + if (ctx) + atomic_inc(&ctx->active_count); + rcu_read_unlock(); +} + +void __i915_request_reset(struct i915_request *rq, bool guilty) +{ + RQ_TRACE(rq, "guilty? %s\n", yesno(guilty)); + + GEM_BUG_ON(i915_request_completed(rq)); + + rcu_read_lock(); /* protect the GEM context */ + if (guilty) { + i915_request_set_error_once(rq, -EIO); + __i915_request_skip(rq); + if (mark_guilty(rq)) + engine_skip_context(rq); + } else { + i915_request_set_error_once(rq, -EAGAIN); + mark_innocent(rq); + } + rcu_read_unlock(); +} + +static bool i915_in_reset(struct pci_dev *pdev) +{ + u8 gdrst; + + pci_read_config_byte(pdev, I915_GDRST, &gdrst); + return gdrst & GRDOM_RESET_STATUS; +} + +static int i915_do_reset(struct intel_gt *gt, + intel_engine_mask_t engine_mask, + unsigned int retry) +{ + struct pci_dev *pdev = gt->i915->drm.pdev; + int err; + + /* Assert reset for at least 20 usec, and wait for acknowledgement. */ + pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE); + udelay(50); + err = wait_for_atomic(i915_in_reset(pdev), 50); + + /* Clear the reset request. */ + pci_write_config_byte(pdev, I915_GDRST, 0); + udelay(50); + if (!err) + err = wait_for_atomic(!i915_in_reset(pdev), 50); + + return err; +} + +static bool g4x_reset_complete(struct pci_dev *pdev) +{ + u8 gdrst; + + pci_read_config_byte(pdev, I915_GDRST, &gdrst); + return (gdrst & GRDOM_RESET_ENABLE) == 0; +} + +static int g33_do_reset(struct intel_gt *gt, + intel_engine_mask_t engine_mask, + unsigned int retry) +{ + struct pci_dev *pdev = gt->i915->drm.pdev; + + pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE); + return wait_for_atomic(g4x_reset_complete(pdev), 50); +} + +static int g4x_do_reset(struct intel_gt *gt, + intel_engine_mask_t engine_mask, + unsigned int retry) +{ + struct pci_dev *pdev = gt->i915->drm.pdev; + struct intel_uncore *uncore = gt->uncore; + int ret; + + /* WaVcpClkGateDisableForMediaReset:ctg,elk */ + rmw_set_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE); + intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D); + + pci_write_config_byte(pdev, I915_GDRST, + GRDOM_MEDIA | GRDOM_RESET_ENABLE); + ret = wait_for_atomic(g4x_reset_complete(pdev), 50); + if (ret) { + drm_dbg(>->i915->drm, "Wait for media reset failed\n"); + goto out; + } + + pci_write_config_byte(pdev, I915_GDRST, + GRDOM_RENDER | GRDOM_RESET_ENABLE); + ret = wait_for_atomic(g4x_reset_complete(pdev), 50); + if (ret) { + drm_dbg(>->i915->drm, "Wait for render reset failed\n"); + goto out; + } + +out: + pci_write_config_byte(pdev, I915_GDRST, 0); + + rmw_clear_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE); + intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D); + + return ret; +} + +static int ilk_do_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask, + unsigned int retry) +{ + struct intel_uncore *uncore = gt->uncore; + int ret; + + intel_uncore_write_fw(uncore, ILK_GDSR, + ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE); + ret = __intel_wait_for_register_fw(uncore, ILK_GDSR, + ILK_GRDOM_RESET_ENABLE, 0, + 5000, 0, + NULL); + if (ret) { + drm_dbg(>->i915->drm, "Wait for render reset failed\n"); + goto out; + } + + intel_uncore_write_fw(uncore, ILK_GDSR, + ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE); + ret = __intel_wait_for_register_fw(uncore, ILK_GDSR, + ILK_GRDOM_RESET_ENABLE, 0, + 5000, 0, + NULL); + if (ret) { + drm_dbg(>->i915->drm, "Wait for media reset failed\n"); + goto out; + } + +out: + intel_uncore_write_fw(uncore, ILK_GDSR, 0); + intel_uncore_posting_read_fw(uncore, ILK_GDSR); + return ret; +} + +/* Reset the hardware domains (GENX_GRDOM_*) specified by mask */ +static int gen6_hw_domain_reset(struct intel_gt *gt, u32 hw_domain_mask) +{ + struct intel_uncore *uncore = gt->uncore; + int loops = 2; + int err; + + /* + * GEN6_GDRST is not in the gt power well, no need to check + * for fifo space for the write or forcewake the chip for + * the read + */ + do { + intel_uncore_write_fw(uncore, GEN6_GDRST, hw_domain_mask); + + /* + * Wait for the device to ack the reset requests. + * + * On some platforms, e.g. Jasperlake, we see that the + * engine register state is not cleared until shortly after + * GDRST reports completion, causing a failure as we try + * to immediately resume while the internal state is still + * in flux. If we immediately repeat the reset, the second + * reset appears to serialise with the first, and since + * it is a no-op, the registers should retain their reset + * value. However, there is still a concern that upon + * leaving the second reset, the internal engine state + * is still in flux and not ready for resuming. + */ + err = __intel_wait_for_register_fw(uncore, GEN6_GDRST, + hw_domain_mask, 0, + 2000, 0, + NULL); + } while (err == 0 && --loops); + if (err) + drm_dbg(>->i915->drm, + "Wait for 0x%08x engines reset failed\n", + hw_domain_mask); + + /* + * As we have observed that the engine state is still volatile + * after GDRST is acked, impose a small delay to let everything settle. + */ + udelay(50); + + return err; +} + +static int gen6_reset_engines(struct intel_gt *gt, + intel_engine_mask_t engine_mask, + unsigned int retry) +{ + static const u32 hw_engine_mask[] = { + [RCS0] = GEN6_GRDOM_RENDER, + [BCS0] = GEN6_GRDOM_BLT, + [VCS0] = GEN6_GRDOM_MEDIA, + [VCS1] = GEN8_GRDOM_MEDIA2, + [VECS0] = GEN6_GRDOM_VECS, + }; + struct intel_engine_cs *engine; + u32 hw_mask; + + if (engine_mask == ALL_ENGINES) { + hw_mask = GEN6_GRDOM_FULL; + } else { + intel_engine_mask_t tmp; + + hw_mask = 0; + for_each_engine_masked(engine, gt, engine_mask, tmp) { + GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask)); + hw_mask |= hw_engine_mask[engine->id]; + } + } + + return gen6_hw_domain_reset(gt, hw_mask); +} + +static int gen11_lock_sfc(struct intel_engine_cs *engine, u32 *hw_mask) +{ + struct intel_uncore *uncore = engine->uncore; + u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access; + i915_reg_t sfc_forced_lock, sfc_forced_lock_ack; + u32 sfc_forced_lock_bit, sfc_forced_lock_ack_bit; + i915_reg_t sfc_usage; + u32 sfc_usage_bit; + u32 sfc_reset_bit; + int ret; + + switch (engine->class) { + case VIDEO_DECODE_CLASS: + if ((BIT(engine->instance) & vdbox_sfc_access) == 0) + return 0; + + sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine); + sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT; + + sfc_forced_lock_ack = GEN11_VCS_SFC_LOCK_STATUS(engine); + sfc_forced_lock_ack_bit = GEN11_VCS_SFC_LOCK_ACK_BIT; + + sfc_usage = GEN11_VCS_SFC_LOCK_STATUS(engine); + sfc_usage_bit = GEN11_VCS_SFC_USAGE_BIT; + sfc_reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance); + break; + + case VIDEO_ENHANCEMENT_CLASS: + sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine); + sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT; + + sfc_forced_lock_ack = GEN11_VECS_SFC_LOCK_ACK(engine); + sfc_forced_lock_ack_bit = GEN11_VECS_SFC_LOCK_ACK_BIT; + + sfc_usage = GEN11_VECS_SFC_USAGE(engine); + sfc_usage_bit = GEN11_VECS_SFC_USAGE_BIT; + sfc_reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance); + break; + + default: + return 0; + } + + /* + * If the engine is using a SFC, tell the engine that a software reset + * is going to happen. The engine will then try to force lock the SFC. + * If SFC ends up being locked to the engine we want to reset, we have + * to reset it as well (we will unlock it once the reset sequence is + * completed). + */ + if (!(intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit)) + return 0; + + rmw_set_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit); + + ret = __intel_wait_for_register_fw(uncore, + sfc_forced_lock_ack, + sfc_forced_lock_ack_bit, + sfc_forced_lock_ack_bit, + 1000, 0, NULL); + + /* Was the SFC released while we were trying to lock it? */ + if (!(intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit)) + return 0; + + if (ret) { + drm_dbg(&engine->i915->drm, + "Wait for SFC forced lock ack failed\n"); + return ret; + } + + *hw_mask |= sfc_reset_bit; + return 0; +} + +static void gen11_unlock_sfc(struct intel_engine_cs *engine) +{ + struct intel_uncore *uncore = engine->uncore; + u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access; + i915_reg_t sfc_forced_lock; + u32 sfc_forced_lock_bit; + + switch (engine->class) { + case VIDEO_DECODE_CLASS: + if ((BIT(engine->instance) & vdbox_sfc_access) == 0) + return; + + sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine); + sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT; + break; + + case VIDEO_ENHANCEMENT_CLASS: + sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine); + sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT; + break; + + default: + return; + } + + rmw_clear_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit); +} + +static int gen11_reset_engines(struct intel_gt *gt, + intel_engine_mask_t engine_mask, + unsigned int retry) +{ + static const u32 hw_engine_mask[] = { + [RCS0] = GEN11_GRDOM_RENDER, + [BCS0] = GEN11_GRDOM_BLT, + [VCS0] = GEN11_GRDOM_MEDIA, + [VCS1] = GEN11_GRDOM_MEDIA2, + [VCS2] = GEN11_GRDOM_MEDIA3, + [VCS3] = GEN11_GRDOM_MEDIA4, + [VECS0] = GEN11_GRDOM_VECS, + [VECS1] = GEN11_GRDOM_VECS2, + }; + struct intel_engine_cs *engine; + intel_engine_mask_t tmp; + u32 hw_mask; + int ret; + + if (engine_mask == ALL_ENGINES) { + hw_mask = GEN11_GRDOM_FULL; + } else { + hw_mask = 0; + for_each_engine_masked(engine, gt, engine_mask, tmp) { + GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask)); + hw_mask |= hw_engine_mask[engine->id]; + ret = gen11_lock_sfc(engine, &hw_mask); + if (ret) + goto sfc_unlock; + } + } + + ret = gen6_hw_domain_reset(gt, hw_mask); + +sfc_unlock: + /* + * We unlock the SFC based on the lock status and not the result of + * gen11_lock_sfc to make sure that we clean properly if something + * wrong happened during the lock (e.g. lock acquired after timeout + * expiration). + */ + if (engine_mask != ALL_ENGINES) + for_each_engine_masked(engine, gt, engine_mask, tmp) + gen11_unlock_sfc(engine); + + return ret; +} + +static int gen8_engine_reset_prepare(struct intel_engine_cs *engine) +{ + struct intel_uncore *uncore = engine->uncore; + const i915_reg_t reg = RING_RESET_CTL(engine->mmio_base); + u32 request, mask, ack; + int ret; + + ack = intel_uncore_read_fw(uncore, reg); + if (ack & RESET_CTL_CAT_ERROR) { + /* + * For catastrophic errors, ready-for-reset sequence + * needs to be bypassed: HAS#396813 + */ + request = RESET_CTL_CAT_ERROR; + mask = RESET_CTL_CAT_ERROR; + + /* Catastrophic errors need to be cleared by HW */ + ack = 0; + } else if (!(ack & RESET_CTL_READY_TO_RESET)) { + request = RESET_CTL_REQUEST_RESET; + mask = RESET_CTL_READY_TO_RESET; + ack = RESET_CTL_READY_TO_RESET; + } else { + return 0; + } + + intel_uncore_write_fw(uncore, reg, _MASKED_BIT_ENABLE(request)); + ret = __intel_wait_for_register_fw(uncore, reg, mask, ack, + 700, 0, NULL); + if (ret) + drm_err(&engine->i915->drm, + "%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n", + engine->name, request, + intel_uncore_read_fw(uncore, reg)); + + return ret; +} + +static void gen8_engine_reset_cancel(struct intel_engine_cs *engine) +{ + intel_uncore_write_fw(engine->uncore, + RING_RESET_CTL(engine->mmio_base), + _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET)); +} + +static int gen8_reset_engines(struct intel_gt *gt, + intel_engine_mask_t engine_mask, + unsigned int retry) +{ + struct intel_engine_cs *engine; + const bool reset_non_ready = retry >= 1; + intel_engine_mask_t tmp; + int ret; + + for_each_engine_masked(engine, gt, engine_mask, tmp) { + ret = gen8_engine_reset_prepare(engine); + if (ret && !reset_non_ready) + goto skip_reset; + + /* + * If this is not the first failed attempt to prepare, + * we decide to proceed anyway. + * + * By doing so we risk context corruption and with + * some gens (kbl), possible system hang if reset + * happens during active bb execution. + * + * We rather take context corruption instead of + * failed reset with a wedged driver/gpu. And + * active bb execution case should be covered by + * stop_engines() we have before the reset. + */ + } + + if (INTEL_GEN(gt->i915) >= 11) + ret = gen11_reset_engines(gt, engine_mask, retry); + else + ret = gen6_reset_engines(gt, engine_mask, retry); + +skip_reset: + for_each_engine_masked(engine, gt, engine_mask, tmp) + gen8_engine_reset_cancel(engine); + + return ret; +} + +static int mock_reset(struct intel_gt *gt, + intel_engine_mask_t mask, + unsigned int retry) +{ + return 0; +} + +typedef int (*reset_func)(struct intel_gt *, + intel_engine_mask_t engine_mask, + unsigned int retry); + +static reset_func intel_get_gpu_reset(const struct intel_gt *gt) +{ + struct drm_i915_private *i915 = gt->i915; + + if (is_mock_gt(gt)) + return mock_reset; + else if (INTEL_GEN(i915) >= 8) + return gen8_reset_engines; + else if (INTEL_GEN(i915) >= 6) + return gen6_reset_engines; + else if (INTEL_GEN(i915) >= 5) + return ilk_do_reset; + else if (IS_G4X(i915)) + return g4x_do_reset; + else if (IS_G33(i915) || IS_PINEVIEW(i915)) + return g33_do_reset; + else if (INTEL_GEN(i915) >= 3) + return i915_do_reset; + else + return NULL; +} + +int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask) +{ + const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1; + reset_func reset; + int ret = -ETIMEDOUT; + int retry; + + reset = intel_get_gpu_reset(gt); + if (!reset) + return -ENODEV; + + /* + * If the power well sleeps during the reset, the reset + * request may be dropped and never completes (causing -EIO). + */ + intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL); + for (retry = 0; ret == -ETIMEDOUT && retry < retries; retry++) { + GT_TRACE(gt, "engine_mask=%x\n", engine_mask); + preempt_disable(); + ret = reset(gt, engine_mask, retry); + preempt_enable(); + } + intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL); + + return ret; +} + +bool intel_has_gpu_reset(const struct intel_gt *gt) +{ + if (!gt->i915->params.reset) + return NULL; + + return intel_get_gpu_reset(gt); +} + +bool intel_has_reset_engine(const struct intel_gt *gt) +{ + if (gt->i915->params.reset < 2) + return false; + + return INTEL_INFO(gt->i915)->has_reset_engine; +} + +int intel_reset_guc(struct intel_gt *gt) +{ + u32 guc_domain = + INTEL_GEN(gt->i915) >= 11 ? GEN11_GRDOM_GUC : GEN9_GRDOM_GUC; + int ret; + + GEM_BUG_ON(!HAS_GT_UC(gt->i915)); + + intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL); + ret = gen6_hw_domain_reset(gt, guc_domain); + intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL); + + return ret; +} + +/* + * Ensure irq handler finishes, and not run again. + * Also return the active request so that we only search for it once. + */ +static void reset_prepare_engine(struct intel_engine_cs *engine) +{ + /* + * During the reset sequence, we must prevent the engine from + * entering RC6. As the context state is undefined until we restart + * the engine, if it does enter RC6 during the reset, the state + * written to the powercontext is undefined and so we may lose + * GPU state upon resume, i.e. fail to restart after a reset. + */ + intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL); + if (engine->reset.prepare) + engine->reset.prepare(engine); +} + +static void revoke_mmaps(struct intel_gt *gt) +{ + int i; + + for (i = 0; i < gt->ggtt->num_fences; i++) { + struct drm_vma_offset_node *node; + struct i915_vma *vma; + u64 vma_offset; + + vma = READ_ONCE(gt->ggtt->fence_regs[i].vma); + if (!vma) + continue; + + if (!i915_vma_has_userfault(vma)) + continue; + + GEM_BUG_ON(vma->fence != >->ggtt->fence_regs[i]); + + if (!vma->mmo) + continue; + + node = &vma->mmo->vma_node; + vma_offset = vma->ggtt_view.partial.offset << PAGE_SHIFT; + + unmap_mapping_range(gt->i915->drm.anon_inode->i_mapping, + drm_vma_node_offset_addr(node) + vma_offset, + vma->size, + 1); + } +} + +static intel_engine_mask_t reset_prepare(struct intel_gt *gt) +{ + struct intel_engine_cs *engine; + intel_engine_mask_t awake = 0; + enum intel_engine_id id; + + for_each_engine(engine, gt, id) { + if (intel_engine_pm_get_if_awake(engine)) + awake |= engine->mask; + reset_prepare_engine(engine); + } + + intel_uc_reset_prepare(>->uc); + + return awake; +} + +static void gt_revoke(struct intel_gt *gt) +{ + revoke_mmaps(gt); +} + +static int gt_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err; + + /* + * Everything depends on having the GTT running, so we need to start + * there. + */ + err = i915_ggtt_enable_hw(gt->i915); + if (err) + return err; + + for_each_engine(engine, gt, id) + __intel_engine_reset(engine, stalled_mask & engine->mask); + + intel_ggtt_restore_fences(gt->ggtt); + + return err; +} + +static void reset_finish_engine(struct intel_engine_cs *engine) +{ + if (engine->reset.finish) + engine->reset.finish(engine); + intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL); + + intel_engine_signal_breadcrumbs(engine); +} + +static void reset_finish(struct intel_gt *gt, intel_engine_mask_t awake) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + + for_each_engine(engine, gt, id) { + reset_finish_engine(engine); + if (awake & engine->mask) + intel_engine_pm_put(engine); + } +} + +static void nop_submit_request(struct i915_request *request) +{ + struct intel_engine_cs *engine = request->engine; + unsigned long flags; + + RQ_TRACE(request, "-EIO\n"); + i915_request_set_error_once(request, -EIO); + + spin_lock_irqsave(&engine->active.lock, flags); + __i915_request_submit(request); + i915_request_mark_complete(request); + spin_unlock_irqrestore(&engine->active.lock, flags); + + intel_engine_signal_breadcrumbs(engine); +} + +static void __intel_gt_set_wedged(struct intel_gt *gt) +{ + struct intel_engine_cs *engine; + intel_engine_mask_t awake; + enum intel_engine_id id; + + if (test_bit(I915_WEDGED, >->reset.flags)) + return; + + GT_TRACE(gt, "start\n"); + + /* + * First, stop submission to hw, but do not yet complete requests by + * rolling the global seqno forward (since this would complete requests + * for which we haven't set the fence error to EIO yet). + */ + awake = reset_prepare(gt); + + /* Even if the GPU reset fails, it should still stop the engines */ + if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display) + __intel_gt_reset(gt, ALL_ENGINES); + + for_each_engine(engine, gt, id) + engine->submit_request = nop_submit_request; + + /* + * Make sure no request can slip through without getting completed by + * either this call here to intel_engine_write_global_seqno, or the one + * in nop_submit_request. + */ + synchronize_rcu_expedited(); + set_bit(I915_WEDGED, >->reset.flags); + + /* Mark all executing requests as skipped */ + for_each_engine(engine, gt, id) + if (engine->reset.cancel) + engine->reset.cancel(engine); + + reset_finish(gt, awake); + + GT_TRACE(gt, "end\n"); +} + +void intel_gt_set_wedged(struct intel_gt *gt) +{ + intel_wakeref_t wakeref; + + if (test_bit(I915_WEDGED, >->reset.flags)) + return; + + wakeref = intel_runtime_pm_get(gt->uncore->rpm); + mutex_lock(>->reset.mutex); + + if (GEM_SHOW_DEBUG()) { + struct drm_printer p = drm_debug_printer(__func__); + struct intel_engine_cs *engine; + enum intel_engine_id id; + + drm_printf(&p, "called from %pS\n", (void *)_RET_IP_); + for_each_engine(engine, gt, id) { + if (intel_engine_is_idle(engine)) + continue; + + intel_engine_dump(engine, &p, "%s\n", engine->name); + } + } + + __intel_gt_set_wedged(gt); + + mutex_unlock(>->reset.mutex); + intel_runtime_pm_put(gt->uncore->rpm, wakeref); +} + +static bool __intel_gt_unset_wedged(struct intel_gt *gt) +{ + struct intel_gt_timelines *timelines = >->timelines; + struct intel_timeline *tl; + bool ok; + + if (!test_bit(I915_WEDGED, >->reset.flags)) + return true; + + /* Never fully initialised, recovery impossible */ + if (intel_gt_has_unrecoverable_error(gt)) + return false; + + GT_TRACE(gt, "start\n"); + + /* + * Before unwedging, make sure that all pending operations + * are flushed and errored out - we may have requests waiting upon + * third party fences. We marked all inflight requests as EIO, and + * every execbuf since returned EIO, for consistency we want all + * the currently pending requests to also be marked as EIO, which + * is done inside our nop_submit_request - and so we must wait. + * + * No more can be submitted until we reset the wedged bit. + */ + spin_lock(&timelines->lock); + list_for_each_entry(tl, &timelines->active_list, link) { + struct dma_fence *fence; + + fence = i915_active_fence_get(&tl->last_request); + if (!fence) + continue; + + spin_unlock(&timelines->lock); + + /* + * All internal dependencies (i915_requests) will have + * been flushed by the set-wedge, but we may be stuck waiting + * for external fences. These should all be capped to 10s + * (I915_FENCE_TIMEOUT) so this wait should not be unbounded + * in the worst case. + */ + dma_fence_default_wait(fence, false, MAX_SCHEDULE_TIMEOUT); + dma_fence_put(fence); + + /* Restart iteration after droping lock */ + spin_lock(&timelines->lock); + tl = list_entry(&timelines->active_list, typeof(*tl), link); + } + spin_unlock(&timelines->lock); + + /* We must reset pending GPU events before restoring our submission */ + ok = !HAS_EXECLISTS(gt->i915); /* XXX better agnosticism desired */ + if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display) + ok = __intel_gt_reset(gt, ALL_ENGINES) == 0; + if (!ok) { + /* + * Warn CI about the unrecoverable wedged condition. + * Time for a reboot. + */ + add_taint_for_CI(gt->i915, TAINT_WARN); + return false; + } + + /* + * Undo nop_submit_request. We prevent all new i915 requests from + * being queued (by disallowing execbuf whilst wedged) so having + * waited for all active requests above, we know the system is idle + * and do not have to worry about a thread being inside + * engine->submit_request() as we swap over. So unlike installing + * the nop_submit_request on reset, we can do this from normal + * context and do not require stop_machine(). + */ + intel_engines_reset_default_submission(gt); + + GT_TRACE(gt, "end\n"); + + smp_mb__before_atomic(); /* complete takeover before enabling execbuf */ + clear_bit(I915_WEDGED, >->reset.flags); + + return true; +} + +bool intel_gt_unset_wedged(struct intel_gt *gt) +{ + bool result; + + mutex_lock(>->reset.mutex); + result = __intel_gt_unset_wedged(gt); + mutex_unlock(>->reset.mutex); + + return result; +} + +static int do_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask) +{ + int err, i; + + gt_revoke(gt); + + err = __intel_gt_reset(gt, ALL_ENGINES); + for (i = 0; err && i < RESET_MAX_RETRIES; i++) { + msleep(10 * (i + 1)); + err = __intel_gt_reset(gt, ALL_ENGINES); + } + if (err) + return err; + + return gt_reset(gt, stalled_mask); +} + +static int resume(struct intel_gt *gt) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + int ret; + + for_each_engine(engine, gt, id) { + ret = intel_engine_resume(engine); + if (ret) + return ret; + } + + return 0; +} + +/** + * intel_gt_reset - reset chip after a hang + * @gt: #intel_gt to reset + * @stalled_mask: mask of the stalled engines with the guilty requests + * @reason: user error message for why we are resetting + * + * Reset the chip. Useful if a hang is detected. Marks the device as wedged + * on failure. + * + * Procedure is fairly simple: + * - reset the chip using the reset reg + * - re-init context state + * - re-init hardware status page + * - re-init ring buffer + * - re-init interrupt state + * - re-init display + */ +void intel_gt_reset(struct intel_gt *gt, + intel_engine_mask_t stalled_mask, + const char *reason) +{ + intel_engine_mask_t awake; + int ret; + + GT_TRACE(gt, "flags=%lx\n", gt->reset.flags); + + might_sleep(); + GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, >->reset.flags)); + mutex_lock(>->reset.mutex); + + /* Clear any previous failed attempts at recovery. Time to try again. */ + if (!__intel_gt_unset_wedged(gt)) + goto unlock; + + if (reason) + drm_notice(>->i915->drm, + "Resetting chip for %s\n", reason); + atomic_inc(>->i915->gpu_error.reset_count); + + awake = reset_prepare(gt); + + if (!intel_has_gpu_reset(gt)) { + if (gt->i915->params.reset) + drm_err(>->i915->drm, "GPU reset not supported\n"); + else + drm_dbg(>->i915->drm, "GPU reset disabled\n"); + goto error; + } + + if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display) + intel_runtime_pm_disable_interrupts(gt->i915); + + if (do_reset(gt, stalled_mask)) { + drm_err(>->i915->drm, "Failed to reset chip\n"); + goto taint; + } + + if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display) + intel_runtime_pm_enable_interrupts(gt->i915); + + intel_overlay_reset(gt->i915); + + /* + * Next we need to restore the context, but we don't use those + * yet either... + * + * Ring buffer needs to be re-initialized in the KMS case, or if X + * was running at the time of the reset (i.e. we weren't VT + * switched away). + */ + ret = intel_gt_init_hw(gt); + if (ret) { + drm_err(>->i915->drm, + "Failed to initialise HW following reset (%d)\n", + ret); + goto taint; + } + + ret = resume(gt); + if (ret) + goto taint; + +finish: + reset_finish(gt, awake); +unlock: + mutex_unlock(>->reset.mutex); + return; + +taint: + /* + * History tells us that if we cannot reset the GPU now, we + * never will. This then impacts everything that is run + * subsequently. On failing the reset, we mark the driver + * as wedged, preventing further execution on the GPU. + * We also want to go one step further and add a taint to the + * kernel so that any subsequent faults can be traced back to + * this failure. This is important for CI, where if the + * GPU/driver fails we would like to reboot and restart testing + * rather than continue on into oblivion. For everyone else, + * the system should still plod along, but they have been warned! + */ + add_taint_for_CI(gt->i915, TAINT_WARN); +error: + __intel_gt_set_wedged(gt); + goto finish; +} + +static inline int intel_gt_reset_engine(struct intel_engine_cs *engine) +{ + return __intel_gt_reset(engine->gt, engine->mask); +} + +/** + * intel_engine_reset - reset GPU engine to recover from a hang + * @engine: engine to reset + * @msg: reason for GPU reset; or NULL for no drm_notice() + * + * Reset a specific GPU engine. Useful if a hang is detected. + * Returns zero on successful reset or otherwise an error code. + * + * Procedure is: + * - identifies the request that caused the hang and it is dropped + * - reset engine (which will force the engine to idle) + * - re-init/configure engine + */ +int intel_engine_reset(struct intel_engine_cs *engine, const char *msg) +{ + struct intel_gt *gt = engine->gt; + bool uses_guc = intel_engine_in_guc_submission_mode(engine); + int ret; + + ENGINE_TRACE(engine, "flags=%lx\n", gt->reset.flags); + GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, >->reset.flags)); + + if (!intel_engine_pm_get_if_awake(engine)) + return 0; + + reset_prepare_engine(engine); + + if (msg) + drm_notice(&engine->i915->drm, + "Resetting %s for %s\n", engine->name, msg); + atomic_inc(&engine->i915->gpu_error.reset_engine_count[engine->uabi_class]); + + if (!uses_guc) + ret = intel_gt_reset_engine(engine); + else + ret = intel_guc_reset_engine(&engine->gt->uc.guc, engine); + if (ret) { + /* If we fail here, we expect to fallback to a global reset */ + drm_dbg(>->i915->drm, "%sFailed to reset %s, ret=%d\n", + uses_guc ? "GuC " : "", engine->name, ret); + goto out; + } + + /* + * The request that caused the hang is stuck on elsp, we know the + * active request and can drop it, adjust head to skip the offending + * request to resume executing remaining requests in the queue. + */ + __intel_engine_reset(engine, true); + + /* + * The engine and its registers (and workarounds in case of render) + * have been reset to their default values. Follow the init_ring + * process to program RING_MODE, HWSP and re-enable submission. + */ + ret = intel_engine_resume(engine); + +out: + intel_engine_cancel_stop_cs(engine); + reset_finish_engine(engine); + intel_engine_pm_put_async(engine); + return ret; +} + +static void intel_gt_reset_global(struct intel_gt *gt, + u32 engine_mask, + const char *reason) +{ + struct kobject *kobj = >->i915->drm.primary->kdev->kobj; + char *error_event[] = { I915_ERROR_UEVENT "=1", NULL }; + char *reset_event[] = { I915_RESET_UEVENT "=1", NULL }; + char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL }; + struct intel_wedge_me w; + + kobject_uevent_env(kobj, KOBJ_CHANGE, error_event); + + drm_dbg(>->i915->drm, "resetting chip, engines=%x\n", engine_mask); + kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event); + + /* Use a watchdog to ensure that our reset completes */ + intel_wedge_on_timeout(&w, gt, 5 * HZ) { + intel_prepare_reset(gt->i915); + + /* Flush everyone using a resource about to be clobbered */ + synchronize_srcu_expedited(>->reset.backoff_srcu); + + intel_gt_reset(gt, engine_mask, reason); + + intel_finish_reset(gt->i915); + } + + if (!test_bit(I915_WEDGED, >->reset.flags)) + kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event); +} + +/** + * intel_gt_handle_error - handle a gpu error + * @gt: the intel_gt + * @engine_mask: mask representing engines that are hung + * @flags: control flags + * @fmt: Error message format string + * + * Do some basic checking of register state at error time and + * dump it to the syslog. Also call i915_capture_error_state() to make + * sure we get a record and make it available in debugfs. Fire a uevent + * so userspace knows something bad happened (should trigger collection + * of a ring dump etc.). + */ +void intel_gt_handle_error(struct intel_gt *gt, + intel_engine_mask_t engine_mask, + unsigned long flags, + const char *fmt, ...) +{ + struct intel_engine_cs *engine; + intel_wakeref_t wakeref; + intel_engine_mask_t tmp; + char error_msg[80]; + char *msg = NULL; + + if (fmt) { + va_list args; + + va_start(args, fmt); + vscnprintf(error_msg, sizeof(error_msg), fmt, args); + va_end(args); + + msg = error_msg; + } + + /* + * In most cases it's guaranteed that we get here with an RPM + * reference held, for example because there is a pending GPU + * request that won't finish until the reset is done. This + * isn't the case at least when we get here by doing a + * simulated reset via debugfs, so get an RPM reference. + */ + wakeref = intel_runtime_pm_get(gt->uncore->rpm); + + engine_mask &= gt->info.engine_mask; + + if (flags & I915_ERROR_CAPTURE) { + i915_capture_error_state(gt->i915); + intel_gt_clear_error_registers(gt, engine_mask); + } + + /* + * Try engine reset when available. We fall back to full reset if + * single reset fails. + */ + if (intel_has_reset_engine(gt) && !intel_gt_is_wedged(gt)) { + for_each_engine_masked(engine, gt, engine_mask, tmp) { + BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE); + if (test_and_set_bit(I915_RESET_ENGINE + engine->id, + >->reset.flags)) + continue; + + if (intel_engine_reset(engine, msg) == 0) + engine_mask &= ~engine->mask; + + clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id, + >->reset.flags); + } + } + + if (!engine_mask) + goto out; + + /* Full reset needs the mutex, stop any other user trying to do so. */ + if (test_and_set_bit(I915_RESET_BACKOFF, >->reset.flags)) { + wait_event(gt->reset.queue, + !test_bit(I915_RESET_BACKOFF, >->reset.flags)); + goto out; /* piggy-back on the other reset */ + } + + /* Make sure i915_reset_trylock() sees the I915_RESET_BACKOFF */ + synchronize_rcu_expedited(); + + /* Prevent any other reset-engine attempt. */ + for_each_engine(engine, gt, tmp) { + while (test_and_set_bit(I915_RESET_ENGINE + engine->id, + >->reset.flags)) + wait_on_bit(>->reset.flags, + I915_RESET_ENGINE + engine->id, + TASK_UNINTERRUPTIBLE); + } + + intel_gt_reset_global(gt, engine_mask, msg); + + for_each_engine(engine, gt, tmp) + clear_bit_unlock(I915_RESET_ENGINE + engine->id, + >->reset.flags); + clear_bit_unlock(I915_RESET_BACKOFF, >->reset.flags); + smp_mb__after_atomic(); + wake_up_all(>->reset.queue); + +out: + intel_runtime_pm_put(gt->uncore->rpm, wakeref); +} + +int intel_gt_reset_trylock(struct intel_gt *gt, int *srcu) +{ + might_lock(>->reset.backoff_srcu); + might_sleep(); + + rcu_read_lock(); + while (test_bit(I915_RESET_BACKOFF, >->reset.flags)) { + rcu_read_unlock(); + + if (wait_event_interruptible(gt->reset.queue, + !test_bit(I915_RESET_BACKOFF, + >->reset.flags))) + return -EINTR; + + rcu_read_lock(); + } + *srcu = srcu_read_lock(>->reset.backoff_srcu); + rcu_read_unlock(); + + return 0; +} + +void intel_gt_reset_unlock(struct intel_gt *gt, int tag) +__releases(>->reset.backoff_srcu) +{ + srcu_read_unlock(>->reset.backoff_srcu, tag); +} + +int intel_gt_terminally_wedged(struct intel_gt *gt) +{ + might_sleep(); + + if (!intel_gt_is_wedged(gt)) + return 0; + + if (intel_gt_has_unrecoverable_error(gt)) + return -EIO; + + /* Reset still in progress? Maybe we will recover? */ + if (wait_event_interruptible(gt->reset.queue, + !test_bit(I915_RESET_BACKOFF, + >->reset.flags))) + return -EINTR; + + return intel_gt_is_wedged(gt) ? -EIO : 0; +} + +void intel_gt_set_wedged_on_init(struct intel_gt *gt) +{ + BUILD_BUG_ON(I915_RESET_ENGINE + I915_NUM_ENGINES > + I915_WEDGED_ON_INIT); + intel_gt_set_wedged(gt); + set_bit(I915_WEDGED_ON_INIT, >->reset.flags); + + /* Wedged on init is non-recoverable */ + add_taint_for_CI(gt->i915, TAINT_WARN); +} + +void intel_gt_set_wedged_on_fini(struct intel_gt *gt) +{ + intel_gt_set_wedged(gt); + set_bit(I915_WEDGED_ON_FINI, >->reset.flags); +} + +void intel_gt_init_reset(struct intel_gt *gt) +{ + init_waitqueue_head(>->reset.queue); + mutex_init(>->reset.mutex); + init_srcu_struct(>->reset.backoff_srcu); + + /* no GPU until we are ready! */ + __set_bit(I915_WEDGED, >->reset.flags); +} + +void intel_gt_fini_reset(struct intel_gt *gt) +{ + cleanup_srcu_struct(>->reset.backoff_srcu); +} + +static void intel_wedge_me(struct work_struct *work) +{ + struct intel_wedge_me *w = container_of(work, typeof(*w), work.work); + + drm_err(&w->gt->i915->drm, + "%s timed out, cancelling all in-flight rendering.\n", + w->name); + intel_gt_set_wedged(w->gt); +} + +void __intel_init_wedge(struct intel_wedge_me *w, + struct intel_gt *gt, + long timeout, + const char *name) +{ + w->gt = gt; + w->name = name; + + INIT_DELAYED_WORK_ONSTACK(&w->work, intel_wedge_me); + schedule_delayed_work(&w->work, timeout); +} + +void __intel_fini_wedge(struct intel_wedge_me *w) +{ + cancel_delayed_work_sync(&w->work); + destroy_delayed_work_on_stack(&w->work); + w->gt = NULL; +} + +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) +#include "selftest_reset.c" +#include "selftest_hangcheck.c" +#endif diff --git a/drivers/gpu/drm/i915/gt/intel_reset.h b/drivers/gpu/drm/i915/gt/intel_reset.h new file mode 100644 index 000000000..a0eec7c11 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_reset.h @@ -0,0 +1,79 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2008-2018 Intel Corporation + */ + +#ifndef I915_RESET_H +#define I915_RESET_H + +#include <linux/compiler.h> +#include <linux/types.h> +#include <linux/srcu.h> + +#include "intel_engine_types.h" +#include "intel_reset_types.h" + +struct i915_request; +struct intel_engine_cs; +struct intel_gt; +struct intel_guc; + +void intel_gt_init_reset(struct intel_gt *gt); +void intel_gt_fini_reset(struct intel_gt *gt); + +__printf(4, 5) +void intel_gt_handle_error(struct intel_gt *gt, + intel_engine_mask_t engine_mask, + unsigned long flags, + const char *fmt, ...); +#define I915_ERROR_CAPTURE BIT(0) + +void intel_gt_reset(struct intel_gt *gt, + intel_engine_mask_t stalled_mask, + const char *reason); +int intel_engine_reset(struct intel_engine_cs *engine, + const char *reason); + +void __i915_request_reset(struct i915_request *rq, bool guilty); + +int __must_check intel_gt_reset_trylock(struct intel_gt *gt, int *srcu); +void intel_gt_reset_unlock(struct intel_gt *gt, int tag); + +void intel_gt_set_wedged(struct intel_gt *gt); +bool intel_gt_unset_wedged(struct intel_gt *gt); +int intel_gt_terminally_wedged(struct intel_gt *gt); + +/* + * There's no unset_wedged_on_init paired with this one. + * Once we're wedged on init, there's no going back. + * Same thing for unset_wedged_on_fini. + */ +void intel_gt_set_wedged_on_init(struct intel_gt *gt); +void intel_gt_set_wedged_on_fini(struct intel_gt *gt); + +int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask); + +int intel_reset_guc(struct intel_gt *gt); + +struct intel_wedge_me { + struct delayed_work work; + struct intel_gt *gt; + const char *name; +}; + +void __intel_init_wedge(struct intel_wedge_me *w, + struct intel_gt *gt, + long timeout, + const char *name); +void __intel_fini_wedge(struct intel_wedge_me *w); + +#define intel_wedge_on_timeout(W, GT, TIMEOUT) \ + for (__intel_init_wedge((W), (GT), (TIMEOUT), __func__); \ + (W)->gt; \ + __intel_fini_wedge((W))) + +bool intel_has_gpu_reset(const struct intel_gt *gt); +bool intel_has_reset_engine(const struct intel_gt *gt); + +#endif /* I915_RESET_H */ diff --git a/drivers/gpu/drm/i915/gt/intel_reset_types.h b/drivers/gpu/drm/i915/gt/intel_reset_types.h new file mode 100644 index 000000000..add6b86d9 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_reset_types.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2019 Intel Corporation + */ + +#ifndef __INTEL_RESET_TYPES_H_ +#define __INTEL_RESET_TYPES_H_ + +#include <linux/mutex.h> +#include <linux/wait.h> +#include <linux/srcu.h> + +struct intel_reset { + /** + * flags: Control various stages of the GPU reset + * + * #I915_RESET_BACKOFF - When we start a global reset, we need to + * serialise with any other users attempting to do the same, and + * any global resources that may be clobber by the reset (such as + * FENCE registers). + * + * #I915_RESET_ENGINE[num_engines] - Since the driver doesn't need to + * acquire the struct_mutex to reset an engine, we need an explicit + * flag to prevent two concurrent reset attempts in the same engine. + * As the number of engines continues to grow, allocate the flags from + * the most significant bits. + * + * #I915_WEDGED - If reset fails and we can no longer use the GPU, + * we set the #I915_WEDGED bit. Prior to command submission, e.g. + * i915_request_alloc(), this bit is checked and the sequence + * aborted (with -EIO reported to userspace) if set. + * + * #I915_WEDGED_ON_INIT - If we fail to initialize the GPU we can no + * longer use the GPU - similar to #I915_WEDGED bit. The difference in + * in the way we're handling "forced" unwedged (e.g. through debugfs), + * which is not allowed in case we failed to initialize. + * + * #I915_WEDGED_ON_FINI - Similar to #I915_WEDGED_ON_INIT, except we + * use it to mark that the GPU is no longer available (and prevent + * users from using it). + */ + unsigned long flags; +#define I915_RESET_BACKOFF 0 +#define I915_RESET_MODESET 1 +#define I915_RESET_ENGINE 2 +#define I915_WEDGED_ON_INIT (BITS_PER_LONG - 3) +#define I915_WEDGED_ON_FINI (BITS_PER_LONG - 2) +#define I915_WEDGED (BITS_PER_LONG - 1) + + struct mutex mutex; /* serialises wedging/unwedging */ + + /** + * Waitqueue to signal when the reset has completed. Used by clients + * that wait for dev_priv->mm.wedged to settle. + */ + wait_queue_head_t queue; + + struct srcu_struct backoff_srcu; +}; + +#endif /* _INTEL_RESET_TYPES_H_ */ diff --git a/drivers/gpu/drm/i915/gt/intel_ring.c b/drivers/gpu/drm/i915/gt/intel_ring.c new file mode 100644 index 000000000..de67b2745 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_ring.c @@ -0,0 +1,327 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#include "gem/i915_gem_object.h" +#include "i915_drv.h" +#include "i915_vma.h" +#include "intel_engine.h" +#include "intel_ring.h" +#include "intel_timeline.h" + +unsigned int intel_ring_update_space(struct intel_ring *ring) +{ + unsigned int space; + + space = __intel_ring_space(ring->head, ring->emit, ring->size); + + ring->space = space; + return space; +} + +void __intel_ring_pin(struct intel_ring *ring) +{ + GEM_BUG_ON(!atomic_read(&ring->pin_count)); + atomic_inc(&ring->pin_count); +} + +int intel_ring_pin(struct intel_ring *ring, struct i915_gem_ww_ctx *ww) +{ + struct i915_vma *vma = ring->vma; + unsigned int flags; + void *addr; + int ret; + + if (atomic_fetch_inc(&ring->pin_count)) + return 0; + + /* Ring wraparound at offset 0 sometimes hangs. No idea why. */ + flags = PIN_OFFSET_BIAS | i915_ggtt_pin_bias(vma); + + if (vma->obj->stolen) + flags |= PIN_MAPPABLE; + else + flags |= PIN_HIGH; + + ret = i915_ggtt_pin(vma, ww, 0, flags); + if (unlikely(ret)) + goto err_unpin; + + if (i915_vma_is_map_and_fenceable(vma) && !HAS_LLC(vma->vm->i915)) + addr = (void __force *)i915_vma_pin_iomap(vma); + else + addr = i915_gem_object_pin_map(vma->obj, + i915_coherent_map_type(vma->vm->i915)); + if (IS_ERR(addr)) { + ret = PTR_ERR(addr); + goto err_ring; + } + + i915_vma_make_unshrinkable(vma); + + /* Discard any unused bytes beyond that submitted to hw. */ + intel_ring_reset(ring, ring->emit); + + ring->vaddr = addr; + return 0; + +err_ring: + i915_vma_unpin(vma); +err_unpin: + atomic_dec(&ring->pin_count); + return ret; +} + +void intel_ring_reset(struct intel_ring *ring, u32 tail) +{ + tail = intel_ring_wrap(ring, tail); + ring->tail = tail; + ring->head = tail; + ring->emit = tail; + intel_ring_update_space(ring); +} + +void intel_ring_unpin(struct intel_ring *ring) +{ + struct i915_vma *vma = ring->vma; + + if (!atomic_dec_and_test(&ring->pin_count)) + return; + + i915_vma_unset_ggtt_write(vma); + if (i915_vma_is_map_and_fenceable(vma) && !HAS_LLC(vma->vm->i915)) + i915_vma_unpin_iomap(vma); + else + i915_gem_object_unpin_map(vma->obj); + + i915_vma_make_purgeable(vma); + i915_vma_unpin(vma); +} + +static struct i915_vma *create_ring_vma(struct i915_ggtt *ggtt, int size) +{ + struct i915_address_space *vm = &ggtt->vm; + struct drm_i915_private *i915 = vm->i915; + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + + obj = ERR_PTR(-ENODEV); + if (i915_ggtt_has_aperture(ggtt) && !HAS_LLC(i915)) + obj = i915_gem_object_create_stolen(i915, size); + if (IS_ERR(obj)) + obj = i915_gem_object_create_internal(i915, size); + if (IS_ERR(obj)) + return ERR_CAST(obj); + + /* + * Mark ring buffers as read-only from GPU side (so no stray overwrites) + * if supported by the platform's GGTT. + */ + if (vm->has_read_only) + i915_gem_object_set_readonly(obj); + + vma = i915_vma_instance(obj, vm, NULL); + if (IS_ERR(vma)) + goto err; + + return vma; + +err: + i915_gem_object_put(obj); + return vma; +} + +struct intel_ring * +intel_engine_create_ring(struct intel_engine_cs *engine, int size) +{ + struct drm_i915_private *i915 = engine->i915; + struct intel_ring *ring; + struct i915_vma *vma; + + GEM_BUG_ON(!is_power_of_2(size)); + GEM_BUG_ON(RING_CTL_SIZE(size) & ~RING_NR_PAGES); + + ring = kzalloc(sizeof(*ring), GFP_KERNEL); + if (!ring) + return ERR_PTR(-ENOMEM); + + kref_init(&ring->ref); + ring->size = size; + ring->wrap = BITS_PER_TYPE(ring->size) - ilog2(size); + + /* + * Workaround an erratum on the i830 which causes a hang if + * the TAIL pointer points to within the last 2 cachelines + * of the buffer. + */ + ring->effective_size = size; + if (IS_I830(i915) || IS_I845G(i915)) + ring->effective_size -= 2 * CACHELINE_BYTES; + + intel_ring_update_space(ring); + + vma = create_ring_vma(engine->gt->ggtt, size); + if (IS_ERR(vma)) { + kfree(ring); + return ERR_CAST(vma); + } + ring->vma = vma; + + return ring; +} + +void intel_ring_free(struct kref *ref) +{ + struct intel_ring *ring = container_of(ref, typeof(*ring), ref); + + i915_vma_put(ring->vma); + kfree(ring); +} + +static noinline int +wait_for_space(struct intel_ring *ring, + struct intel_timeline *tl, + unsigned int bytes) +{ + struct i915_request *target; + long timeout; + + if (intel_ring_update_space(ring) >= bytes) + return 0; + + GEM_BUG_ON(list_empty(&tl->requests)); + list_for_each_entry(target, &tl->requests, link) { + if (target->ring != ring) + continue; + + /* Would completion of this request free enough space? */ + if (bytes <= __intel_ring_space(target->postfix, + ring->emit, ring->size)) + break; + } + + if (GEM_WARN_ON(&target->link == &tl->requests)) + return -ENOSPC; + + timeout = i915_request_wait(target, + I915_WAIT_INTERRUPTIBLE, + MAX_SCHEDULE_TIMEOUT); + if (timeout < 0) + return timeout; + + i915_request_retire_upto(target); + + intel_ring_update_space(ring); + GEM_BUG_ON(ring->space < bytes); + return 0; +} + +u32 *intel_ring_begin(struct i915_request *rq, unsigned int num_dwords) +{ + struct intel_ring *ring = rq->ring; + const unsigned int remain_usable = ring->effective_size - ring->emit; + const unsigned int bytes = num_dwords * sizeof(u32); + unsigned int need_wrap = 0; + unsigned int total_bytes; + u32 *cs; + + /* Packets must be qword aligned. */ + GEM_BUG_ON(num_dwords & 1); + + total_bytes = bytes + rq->reserved_space; + GEM_BUG_ON(total_bytes > ring->effective_size); + + if (unlikely(total_bytes > remain_usable)) { + const int remain_actual = ring->size - ring->emit; + + if (bytes > remain_usable) { + /* + * Not enough space for the basic request. So need to + * flush out the remainder and then wait for + * base + reserved. + */ + total_bytes += remain_actual; + need_wrap = remain_actual | 1; + } else { + /* + * The base request will fit but the reserved space + * falls off the end. So we don't need an immediate + * wrap and only need to effectively wait for the + * reserved size from the start of ringbuffer. + */ + total_bytes = rq->reserved_space + remain_actual; + } + } + + if (unlikely(total_bytes > ring->space)) { + int ret; + + /* + * Space is reserved in the ringbuffer for finalising the + * request, as that cannot be allowed to fail. During request + * finalisation, reserved_space is set to 0 to stop the + * overallocation and the assumption is that then we never need + * to wait (which has the risk of failing with EINTR). + * + * See also i915_request_alloc() and i915_request_add(). + */ + GEM_BUG_ON(!rq->reserved_space); + + ret = wait_for_space(ring, + i915_request_timeline(rq), + total_bytes); + if (unlikely(ret)) + return ERR_PTR(ret); + } + + if (unlikely(need_wrap)) { + need_wrap &= ~1; + GEM_BUG_ON(need_wrap > ring->space); + GEM_BUG_ON(ring->emit + need_wrap > ring->size); + GEM_BUG_ON(!IS_ALIGNED(need_wrap, sizeof(u64))); + + /* Fill the tail with MI_NOOP */ + memset64(ring->vaddr + ring->emit, 0, need_wrap / sizeof(u64)); + ring->space -= need_wrap; + ring->emit = 0; + } + + GEM_BUG_ON(ring->emit > ring->size - bytes); + GEM_BUG_ON(ring->space < bytes); + cs = ring->vaddr + ring->emit; + GEM_DEBUG_EXEC(memset32(cs, POISON_INUSE, bytes / sizeof(*cs))); + ring->emit += bytes; + ring->space -= bytes; + + return cs; +} + +/* Align the ring tail to a cacheline boundary */ +int intel_ring_cacheline_align(struct i915_request *rq) +{ + int num_dwords; + void *cs; + + num_dwords = (rq->ring->emit & (CACHELINE_BYTES - 1)) / sizeof(u32); + if (num_dwords == 0) + return 0; + + num_dwords = CACHELINE_DWORDS - num_dwords; + GEM_BUG_ON(num_dwords & 1); + + cs = intel_ring_begin(rq, num_dwords); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + memset64(cs, (u64)MI_NOOP << 32 | MI_NOOP, num_dwords / 2); + intel_ring_advance(rq, cs + num_dwords); + + GEM_BUG_ON(rq->ring->emit & (CACHELINE_BYTES - 1)); + return 0; +} + +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) +#include "selftest_ring.c" +#endif diff --git a/drivers/gpu/drm/i915/gt/intel_ring.h b/drivers/gpu/drm/i915/gt/intel_ring.h new file mode 100644 index 000000000..1700579bd --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_ring.h @@ -0,0 +1,141 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#ifndef INTEL_RING_H +#define INTEL_RING_H + +#include "i915_gem.h" /* GEM_BUG_ON */ +#include "i915_request.h" +#include "intel_ring_types.h" + +struct intel_engine_cs; + +struct intel_ring * +intel_engine_create_ring(struct intel_engine_cs *engine, int size); + +u32 *intel_ring_begin(struct i915_request *rq, unsigned int num_dwords); +int intel_ring_cacheline_align(struct i915_request *rq); + +unsigned int intel_ring_update_space(struct intel_ring *ring); + +void __intel_ring_pin(struct intel_ring *ring); +int intel_ring_pin(struct intel_ring *ring, struct i915_gem_ww_ctx *ww); +void intel_ring_unpin(struct intel_ring *ring); +void intel_ring_reset(struct intel_ring *ring, u32 tail); + +void intel_ring_free(struct kref *ref); + +static inline struct intel_ring *intel_ring_get(struct intel_ring *ring) +{ + kref_get(&ring->ref); + return ring; +} + +static inline void intel_ring_put(struct intel_ring *ring) +{ + kref_put(&ring->ref, intel_ring_free); +} + +static inline void intel_ring_advance(struct i915_request *rq, u32 *cs) +{ + /* Dummy function. + * + * This serves as a placeholder in the code so that the reader + * can compare against the preceding intel_ring_begin() and + * check that the number of dwords emitted matches the space + * reserved for the command packet (i.e. the value passed to + * intel_ring_begin()). + */ + GEM_BUG_ON((rq->ring->vaddr + rq->ring->emit) != cs); +} + +static inline u32 intel_ring_wrap(const struct intel_ring *ring, u32 pos) +{ + return pos & (ring->size - 1); +} + +static inline int intel_ring_direction(const struct intel_ring *ring, + u32 next, u32 prev) +{ + typecheck(typeof(ring->size), next); + typecheck(typeof(ring->size), prev); + return (next - prev) << ring->wrap; +} + +static inline bool +intel_ring_offset_valid(const struct intel_ring *ring, + unsigned int pos) +{ + if (pos & -ring->size) /* must be strictly within the ring */ + return false; + + if (!IS_ALIGNED(pos, 8)) /* must be qword aligned */ + return false; + + return true; +} + +static inline u32 intel_ring_offset(const struct i915_request *rq, void *addr) +{ + /* Don't write ring->size (equivalent to 0) as that hangs some GPUs. */ + u32 offset = addr - rq->ring->vaddr; + GEM_BUG_ON(offset > rq->ring->size); + return intel_ring_wrap(rq->ring, offset); +} + +static inline void +assert_ring_tail_valid(const struct intel_ring *ring, unsigned int tail) +{ + unsigned int head = READ_ONCE(ring->head); + + GEM_BUG_ON(!intel_ring_offset_valid(ring, tail)); + + /* + * "Ring Buffer Use" + * Gen2 BSpec "1. Programming Environment" / 1.4.4.6 + * Gen3 BSpec "1c Memory Interface Functions" / 2.3.4.5 + * Gen4+ BSpec "1c Memory Interface and Command Stream" / 5.3.4.5 + * "If the Ring Buffer Head Pointer and the Tail Pointer are on the + * same cacheline, the Head Pointer must not be greater than the Tail + * Pointer." + * + * We use ring->head as the last known location of the actual RING_HEAD, + * it may have advanced but in the worst case it is equally the same + * as ring->head and so we should never program RING_TAIL to advance + * into the same cacheline as ring->head. + */ +#define cacheline(a) round_down(a, CACHELINE_BYTES) + GEM_BUG_ON(cacheline(tail) == cacheline(head) && tail < head); +#undef cacheline +} + +static inline unsigned int +intel_ring_set_tail(struct intel_ring *ring, unsigned int tail) +{ + /* Whilst writes to the tail are strictly order, there is no + * serialisation between readers and the writers. The tail may be + * read by i915_request_retire() just as it is being updated + * by execlists, as although the breadcrumb is complete, the context + * switch hasn't been seen. + */ + assert_ring_tail_valid(ring, tail); + ring->tail = tail; + return tail; +} + +static inline unsigned int +__intel_ring_space(unsigned int head, unsigned int tail, unsigned int size) +{ + /* + * "If the Ring Buffer Head Pointer and the Tail Pointer are on the + * same cacheline, the Head Pointer must not be greater than the Tail + * Pointer." + */ + GEM_BUG_ON(!is_power_of_2(size)); + return (head - tail - CACHELINE_BYTES) & (size - 1); +} + +#endif /* INTEL_RING_H */ diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c new file mode 100644 index 000000000..6aaca73ea --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c @@ -0,0 +1,1320 @@ +/* + * Copyright © 2008-2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Eric Anholt <eric@anholt.net> + * Zou Nan hai <nanhai.zou@intel.com> + * Xiang Hai hao<haihao.xiang@intel.com> + * + */ + +#include "gen2_engine_cs.h" +#include "gen6_engine_cs.h" +#include "gen6_ppgtt.h" +#include "gen7_renderclear.h" +#include "i915_drv.h" +#include "i915_mitigations.h" +#include "intel_breadcrumbs.h" +#include "intel_context.h" +#include "intel_gt.h" +#include "intel_reset.h" +#include "intel_ring.h" +#include "shmem_utils.h" + +/* Rough estimate of the typical request size, performing a flush, + * set-context and then emitting the batch. + */ +#define LEGACY_REQUEST_SIZE 200 + +static void set_hwstam(struct intel_engine_cs *engine, u32 mask) +{ + /* + * Keep the render interrupt unmasked as this papers over + * lost interrupts following a reset. + */ + if (engine->class == RENDER_CLASS) { + if (INTEL_GEN(engine->i915) >= 6) + mask &= ~BIT(0); + else + mask &= ~I915_USER_INTERRUPT; + } + + intel_engine_set_hwsp_writemask(engine, mask); +} + +static void set_hws_pga(struct intel_engine_cs *engine, phys_addr_t phys) +{ + u32 addr; + + addr = lower_32_bits(phys); + if (INTEL_GEN(engine->i915) >= 4) + addr |= (phys >> 28) & 0xf0; + + intel_uncore_write(engine->uncore, HWS_PGA, addr); +} + +static struct page *status_page(struct intel_engine_cs *engine) +{ + struct drm_i915_gem_object *obj = engine->status_page.vma->obj; + + GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj)); + return sg_page(obj->mm.pages->sgl); +} + +static void ring_setup_phys_status_page(struct intel_engine_cs *engine) +{ + set_hws_pga(engine, PFN_PHYS(page_to_pfn(status_page(engine)))); + set_hwstam(engine, ~0u); +} + +static void set_hwsp(struct intel_engine_cs *engine, u32 offset) +{ + i915_reg_t hwsp; + + /* + * The ring status page addresses are no longer next to the rest of + * the ring registers as of gen7. + */ + if (IS_GEN(engine->i915, 7)) { + switch (engine->id) { + /* + * No more rings exist on Gen7. Default case is only to shut up + * gcc switch check warning. + */ + default: + GEM_BUG_ON(engine->id); + fallthrough; + case RCS0: + hwsp = RENDER_HWS_PGA_GEN7; + break; + case BCS0: + hwsp = BLT_HWS_PGA_GEN7; + break; + case VCS0: + hwsp = BSD_HWS_PGA_GEN7; + break; + case VECS0: + hwsp = VEBOX_HWS_PGA_GEN7; + break; + } + } else if (IS_GEN(engine->i915, 6)) { + hwsp = RING_HWS_PGA_GEN6(engine->mmio_base); + } else { + hwsp = RING_HWS_PGA(engine->mmio_base); + } + + intel_uncore_write(engine->uncore, hwsp, offset); + intel_uncore_posting_read(engine->uncore, hwsp); +} + +static void flush_cs_tlb(struct intel_engine_cs *engine) +{ + struct drm_i915_private *dev_priv = engine->i915; + + if (!IS_GEN_RANGE(dev_priv, 6, 7)) + return; + + /* ring should be idle before issuing a sync flush*/ + drm_WARN_ON(&dev_priv->drm, + (ENGINE_READ(engine, RING_MI_MODE) & MODE_IDLE) == 0); + + ENGINE_WRITE(engine, RING_INSTPM, + _MASKED_BIT_ENABLE(INSTPM_TLB_INVALIDATE | + INSTPM_SYNC_FLUSH)); + if (intel_wait_for_register(engine->uncore, + RING_INSTPM(engine->mmio_base), + INSTPM_SYNC_FLUSH, 0, + 1000)) + drm_err(&dev_priv->drm, + "%s: wait for SyncFlush to complete for TLB invalidation timed out\n", + engine->name); +} + +static void ring_setup_status_page(struct intel_engine_cs *engine) +{ + set_hwsp(engine, i915_ggtt_offset(engine->status_page.vma)); + set_hwstam(engine, ~0u); + + flush_cs_tlb(engine); +} + +static bool stop_ring(struct intel_engine_cs *engine) +{ + struct drm_i915_private *dev_priv = engine->i915; + + if (INTEL_GEN(dev_priv) > 2) { + ENGINE_WRITE(engine, + RING_MI_MODE, _MASKED_BIT_ENABLE(STOP_RING)); + if (intel_wait_for_register(engine->uncore, + RING_MI_MODE(engine->mmio_base), + MODE_IDLE, + MODE_IDLE, + 1000)) { + drm_err(&dev_priv->drm, + "%s : timed out trying to stop ring\n", + engine->name); + + /* + * Sometimes we observe that the idle flag is not + * set even though the ring is empty. So double + * check before giving up. + */ + if (ENGINE_READ(engine, RING_HEAD) != + ENGINE_READ(engine, RING_TAIL)) + return false; + } + } + + ENGINE_WRITE(engine, RING_HEAD, ENGINE_READ(engine, RING_TAIL)); + + ENGINE_WRITE(engine, RING_HEAD, 0); + ENGINE_WRITE(engine, RING_TAIL, 0); + + /* The ring must be empty before it is disabled */ + ENGINE_WRITE(engine, RING_CTL, 0); + + return (ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR) == 0; +} + +static struct i915_address_space *vm_alias(struct i915_address_space *vm) +{ + if (i915_is_ggtt(vm)) + vm = &i915_vm_to_ggtt(vm)->alias->vm; + + return vm; +} + +static u32 pp_dir(struct i915_address_space *vm) +{ + return to_gen6_ppgtt(i915_vm_to_ppgtt(vm))->pp_dir; +} + +static void set_pp_dir(struct intel_engine_cs *engine) +{ + struct i915_address_space *vm = vm_alias(engine->gt->vm); + + if (vm) { + ENGINE_WRITE(engine, RING_PP_DIR_DCLV, PP_DIR_DCLV_2G); + ENGINE_WRITE(engine, RING_PP_DIR_BASE, pp_dir(vm)); + } +} + +static int xcs_resume(struct intel_engine_cs *engine) +{ + struct drm_i915_private *dev_priv = engine->i915; + struct intel_ring *ring = engine->legacy.ring; + int ret = 0; + + ENGINE_TRACE(engine, "ring:{HEAD:%04x, TAIL:%04x}\n", + ring->head, ring->tail); + + intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL); + + /* WaClearRingBufHeadRegAtInit:ctg,elk */ + if (!stop_ring(engine)) { + /* G45 ring initialization often fails to reset head to zero */ + drm_dbg(&dev_priv->drm, "%s head not reset to zero " + "ctl %08x head %08x tail %08x start %08x\n", + engine->name, + ENGINE_READ(engine, RING_CTL), + ENGINE_READ(engine, RING_HEAD), + ENGINE_READ(engine, RING_TAIL), + ENGINE_READ(engine, RING_START)); + + if (!stop_ring(engine)) { + drm_err(&dev_priv->drm, + "failed to set %s head to zero " + "ctl %08x head %08x tail %08x start %08x\n", + engine->name, + ENGINE_READ(engine, RING_CTL), + ENGINE_READ(engine, RING_HEAD), + ENGINE_READ(engine, RING_TAIL), + ENGINE_READ(engine, RING_START)); + ret = -EIO; + goto out; + } + } + + if (HWS_NEEDS_PHYSICAL(dev_priv)) + ring_setup_phys_status_page(engine); + else + ring_setup_status_page(engine); + + intel_breadcrumbs_reset(engine->breadcrumbs); + + /* Enforce ordering by reading HEAD register back */ + ENGINE_POSTING_READ(engine, RING_HEAD); + + /* + * Initialize the ring. This must happen _after_ we've cleared the ring + * registers with the above sequence (the readback of the HEAD registers + * also enforces ordering), otherwise the hw might lose the new ring + * register values. + */ + ENGINE_WRITE(engine, RING_START, i915_ggtt_offset(ring->vma)); + + /* Check that the ring offsets point within the ring! */ + GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head)); + GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail)); + intel_ring_update_space(ring); + + set_pp_dir(engine); + + /* First wake the ring up to an empty/idle ring */ + ENGINE_WRITE(engine, RING_HEAD, ring->head); + ENGINE_WRITE(engine, RING_TAIL, ring->head); + ENGINE_POSTING_READ(engine, RING_TAIL); + + ENGINE_WRITE(engine, RING_CTL, RING_CTL_SIZE(ring->size) | RING_VALID); + + /* If the head is still not zero, the ring is dead */ + if (intel_wait_for_register(engine->uncore, + RING_CTL(engine->mmio_base), + RING_VALID, RING_VALID, + 50)) { + drm_err(&dev_priv->drm, "%s initialization failed " + "ctl %08x (valid? %d) head %08x [%08x] tail %08x [%08x] start %08x [expected %08x]\n", + engine->name, + ENGINE_READ(engine, RING_CTL), + ENGINE_READ(engine, RING_CTL) & RING_VALID, + ENGINE_READ(engine, RING_HEAD), ring->head, + ENGINE_READ(engine, RING_TAIL), ring->tail, + ENGINE_READ(engine, RING_START), + i915_ggtt_offset(ring->vma)); + ret = -EIO; + goto out; + } + + if (INTEL_GEN(dev_priv) > 2) + ENGINE_WRITE(engine, + RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING)); + + /* Now awake, let it get started */ + if (ring->tail != ring->head) { + ENGINE_WRITE(engine, RING_TAIL, ring->tail); + ENGINE_POSTING_READ(engine, RING_TAIL); + } + + /* Papering over lost _interrupts_ immediately following the restart */ + intel_engine_signal_breadcrumbs(engine); +out: + intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL); + + return ret; +} + +static void reset_prepare(struct intel_engine_cs *engine) +{ + struct intel_uncore *uncore = engine->uncore; + const u32 base = engine->mmio_base; + + /* + * We stop engines, otherwise we might get failed reset and a + * dead gpu (on elk). Also as modern gpu as kbl can suffer + * from system hang if batchbuffer is progressing when + * the reset is issued, regardless of READY_TO_RESET ack. + * Thus assume it is best to stop engines on all gens + * where we have a gpu reset. + * + * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES) + * + * WaMediaResetMainRingCleanup:ctg,elk (presumably) + * + * FIXME: Wa for more modern gens needs to be validated + */ + ENGINE_TRACE(engine, "\n"); + + if (intel_engine_stop_cs(engine)) + ENGINE_TRACE(engine, "timed out on STOP_RING\n"); + + intel_uncore_write_fw(uncore, + RING_HEAD(base), + intel_uncore_read_fw(uncore, RING_TAIL(base))); + intel_uncore_posting_read_fw(uncore, RING_HEAD(base)); /* paranoia */ + + intel_uncore_write_fw(uncore, RING_HEAD(base), 0); + intel_uncore_write_fw(uncore, RING_TAIL(base), 0); + intel_uncore_posting_read_fw(uncore, RING_TAIL(base)); + + /* The ring must be empty before it is disabled */ + intel_uncore_write_fw(uncore, RING_CTL(base), 0); + + /* Check acts as a post */ + if (intel_uncore_read_fw(uncore, RING_HEAD(base))) + ENGINE_TRACE(engine, "ring head [%x] not parked\n", + intel_uncore_read_fw(uncore, RING_HEAD(base))); +} + +static void reset_rewind(struct intel_engine_cs *engine, bool stalled) +{ + struct i915_request *pos, *rq; + unsigned long flags; + u32 head; + + rq = NULL; + spin_lock_irqsave(&engine->active.lock, flags); + list_for_each_entry(pos, &engine->active.requests, sched.link) { + if (!i915_request_completed(pos)) { + rq = pos; + break; + } + } + + /* + * The guilty request will get skipped on a hung engine. + * + * Users of client default contexts do not rely on logical + * state preserved between batches so it is safe to execute + * queued requests following the hang. Non default contexts + * rely on preserved state, so skipping a batch loses the + * evolution of the state and it needs to be considered corrupted. + * Executing more queued batches on top of corrupted state is + * risky. But we take the risk by trying to advance through + * the queued requests in order to make the client behaviour + * more predictable around resets, by not throwing away random + * amount of batches it has prepared for execution. Sophisticated + * clients can use gem_reset_stats_ioctl and dma fence status + * (exported via sync_file info ioctl on explicit fences) to observe + * when it loses the context state and should rebuild accordingly. + * + * The context ban, and ultimately the client ban, mechanism are safety + * valves if client submission ends up resulting in nothing more than + * subsequent hangs. + */ + + if (rq) { + /* + * Try to restore the logical GPU state to match the + * continuation of the request queue. If we skip the + * context/PD restore, then the next request may try to execute + * assuming that its context is valid and loaded on the GPU and + * so may try to access invalid memory, prompting repeated GPU + * hangs. + * + * If the request was guilty, we still restore the logical + * state in case the next request requires it (e.g. the + * aliasing ppgtt), but skip over the hung batch. + * + * If the request was innocent, we try to replay the request + * with the restored context. + */ + __i915_request_reset(rq, stalled); + + GEM_BUG_ON(rq->ring != engine->legacy.ring); + head = rq->head; + } else { + head = engine->legacy.ring->tail; + } + engine->legacy.ring->head = intel_ring_wrap(engine->legacy.ring, head); + + spin_unlock_irqrestore(&engine->active.lock, flags); +} + +static void reset_finish(struct intel_engine_cs *engine) +{ +} + +static void reset_cancel(struct intel_engine_cs *engine) +{ + struct i915_request *request; + unsigned long flags; + + spin_lock_irqsave(&engine->active.lock, flags); + + /* Mark all submitted requests as skipped. */ + list_for_each_entry(request, &engine->active.requests, sched.link) { + i915_request_set_error_once(request, -EIO); + i915_request_mark_complete(request); + } + + /* Remaining _unready_ requests will be nop'ed when submitted */ + + spin_unlock_irqrestore(&engine->active.lock, flags); +} + +static void i9xx_submit_request(struct i915_request *request) +{ + i915_request_submit(request); + wmb(); /* paranoid flush writes out of the WCB before mmio */ + + ENGINE_WRITE(request->engine, RING_TAIL, + intel_ring_set_tail(request->ring, request->tail)); +} + +static void __ring_context_fini(struct intel_context *ce) +{ + i915_vma_put(ce->state); +} + +static void ring_context_destroy(struct kref *ref) +{ + struct intel_context *ce = container_of(ref, typeof(*ce), ref); + + GEM_BUG_ON(intel_context_is_pinned(ce)); + + if (ce->state) + __ring_context_fini(ce); + + intel_context_fini(ce); + intel_context_free(ce); +} + +static int ring_context_pre_pin(struct intel_context *ce, + struct i915_gem_ww_ctx *ww, + void **unused) +{ + struct i915_address_space *vm; + int err = 0; + + vm = vm_alias(ce->vm); + if (vm) + err = gen6_ppgtt_pin(i915_vm_to_ppgtt((vm)), ww); + + return err; +} + +static void __context_unpin_ppgtt(struct intel_context *ce) +{ + struct i915_address_space *vm; + + vm = vm_alias(ce->vm); + if (vm) + gen6_ppgtt_unpin(i915_vm_to_ppgtt(vm)); +} + +static void ring_context_unpin(struct intel_context *ce) +{ +} + +static void ring_context_post_unpin(struct intel_context *ce) +{ + __context_unpin_ppgtt(ce); +} + +static struct i915_vma * +alloc_context_vma(struct intel_engine_cs *engine) +{ + struct drm_i915_private *i915 = engine->i915; + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + int err; + + obj = i915_gem_object_create_shmem(i915, engine->context_size); + if (IS_ERR(obj)) + return ERR_CAST(obj); + + /* + * Try to make the context utilize L3 as well as LLC. + * + * On VLV we don't have L3 controls in the PTEs so we + * shouldn't touch the cache level, especially as that + * would make the object snooped which might have a + * negative performance impact. + * + * Snooping is required on non-llc platforms in execlist + * mode, but since all GGTT accesses use PAT entry 0 we + * get snooping anyway regardless of cache_level. + * + * This is only applicable for Ivy Bridge devices since + * later platforms don't have L3 control bits in the PTE. + */ + if (IS_IVYBRIDGE(i915)) + i915_gem_object_set_cache_coherency(obj, I915_CACHE_L3_LLC); + + if (engine->default_state) { + void *vaddr; + + vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB); + if (IS_ERR(vaddr)) { + err = PTR_ERR(vaddr); + goto err_obj; + } + + shmem_read(engine->default_state, 0, + vaddr, engine->context_size); + + i915_gem_object_flush_map(obj); + __i915_gem_object_release_map(obj); + } + + vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL); + if (IS_ERR(vma)) { + err = PTR_ERR(vma); + goto err_obj; + } + + return vma; + +err_obj: + i915_gem_object_put(obj); + return ERR_PTR(err); +} + +static int ring_context_alloc(struct intel_context *ce) +{ + struct intel_engine_cs *engine = ce->engine; + + /* One ringbuffer to rule them all */ + GEM_BUG_ON(!engine->legacy.ring); + ce->ring = engine->legacy.ring; + ce->timeline = intel_timeline_get(engine->legacy.timeline); + + GEM_BUG_ON(ce->state); + if (engine->context_size) { + struct i915_vma *vma; + + vma = alloc_context_vma(engine); + if (IS_ERR(vma)) + return PTR_ERR(vma); + + ce->state = vma; + if (engine->default_state) + __set_bit(CONTEXT_VALID_BIT, &ce->flags); + } + + return 0; +} + +static int ring_context_pin(struct intel_context *ce, void *unused) +{ + return 0; +} + +static void ring_context_reset(struct intel_context *ce) +{ + intel_ring_reset(ce->ring, ce->ring->emit); +} + +static const struct intel_context_ops ring_context_ops = { + .alloc = ring_context_alloc, + + .pre_pin = ring_context_pre_pin, + .pin = ring_context_pin, + .unpin = ring_context_unpin, + .post_unpin = ring_context_post_unpin, + + .enter = intel_context_enter_engine, + .exit = intel_context_exit_engine, + + .reset = ring_context_reset, + .destroy = ring_context_destroy, +}; + +static int load_pd_dir(struct i915_request *rq, + struct i915_address_space *vm, + u32 valid) +{ + const struct intel_engine_cs * const engine = rq->engine; + u32 *cs; + + cs = intel_ring_begin(rq, 12); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = MI_LOAD_REGISTER_IMM(1); + *cs++ = i915_mmio_reg_offset(RING_PP_DIR_DCLV(engine->mmio_base)); + *cs++ = valid; + + *cs++ = MI_LOAD_REGISTER_IMM(1); + *cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine->mmio_base)); + *cs++ = pp_dir(vm); + + /* Stall until the page table load is complete? */ + *cs++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT; + *cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine->mmio_base)); + *cs++ = intel_gt_scratch_offset(engine->gt, + INTEL_GT_SCRATCH_FIELD_DEFAULT); + + *cs++ = MI_LOAD_REGISTER_IMM(1); + *cs++ = i915_mmio_reg_offset(RING_INSTPM(engine->mmio_base)); + *cs++ = _MASKED_BIT_ENABLE(INSTPM_TLB_INVALIDATE); + + intel_ring_advance(rq, cs); + + return rq->engine->emit_flush(rq, EMIT_FLUSH); +} + +static inline int mi_set_context(struct i915_request *rq, + struct intel_context *ce, + u32 flags) +{ + struct intel_engine_cs *engine = rq->engine; + struct drm_i915_private *i915 = engine->i915; + enum intel_engine_id id; + const int num_engines = + IS_HASWELL(i915) ? engine->gt->info.num_engines - 1 : 0; + bool force_restore = false; + int len; + u32 *cs; + + len = 4; + if (IS_GEN(i915, 7)) + len += 2 + (num_engines ? 4 * num_engines + 6 : 0); + else if (IS_GEN(i915, 5)) + len += 2; + if (flags & MI_FORCE_RESTORE) { + GEM_BUG_ON(flags & MI_RESTORE_INHIBIT); + flags &= ~MI_FORCE_RESTORE; + force_restore = true; + len += 2; + } + + cs = intel_ring_begin(rq, len); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + /* WaProgramMiArbOnOffAroundMiSetContext:ivb,vlv,hsw,bdw,chv */ + if (IS_GEN(i915, 7)) { + *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; + if (num_engines) { + struct intel_engine_cs *signaller; + + *cs++ = MI_LOAD_REGISTER_IMM(num_engines); + for_each_engine(signaller, engine->gt, id) { + if (signaller == engine) + continue; + + *cs++ = i915_mmio_reg_offset( + RING_PSMI_CTL(signaller->mmio_base)); + *cs++ = _MASKED_BIT_ENABLE( + GEN6_PSMI_SLEEP_MSG_DISABLE); + } + } + } else if (IS_GEN(i915, 5)) { + /* + * This w/a is only listed for pre-production ilk a/b steppings, + * but is also mentioned for programming the powerctx. To be + * safe, just apply the workaround; we do not use SyncFlush so + * this should never take effect and so be a no-op! + */ + *cs++ = MI_SUSPEND_FLUSH | MI_SUSPEND_FLUSH_EN; + } + + if (force_restore) { + /* + * The HW doesn't handle being told to restore the current + * context very well. Quite often it likes goes to go off and + * sulk, especially when it is meant to be reloading PP_DIR. + * A very simple fix to force the reload is to simply switch + * away from the current context and back again. + * + * Note that the kernel_context will contain random state + * following the INHIBIT_RESTORE. We accept this since we + * never use the kernel_context state; it is merely a + * placeholder we use to flush other contexts. + */ + *cs++ = MI_SET_CONTEXT; + *cs++ = i915_ggtt_offset(engine->kernel_context->state) | + MI_MM_SPACE_GTT | + MI_RESTORE_INHIBIT; + } + + *cs++ = MI_NOOP; + *cs++ = MI_SET_CONTEXT; + *cs++ = i915_ggtt_offset(ce->state) | flags; + /* + * w/a: MI_SET_CONTEXT must always be followed by MI_NOOP + * WaMiSetContext_Hang:snb,ivb,vlv + */ + *cs++ = MI_NOOP; + + if (IS_GEN(i915, 7)) { + if (num_engines) { + struct intel_engine_cs *signaller; + i915_reg_t last_reg = {}; /* keep gcc quiet */ + + *cs++ = MI_LOAD_REGISTER_IMM(num_engines); + for_each_engine(signaller, engine->gt, id) { + if (signaller == engine) + continue; + + last_reg = RING_PSMI_CTL(signaller->mmio_base); + *cs++ = i915_mmio_reg_offset(last_reg); + *cs++ = _MASKED_BIT_DISABLE( + GEN6_PSMI_SLEEP_MSG_DISABLE); + } + + /* Insert a delay before the next switch! */ + *cs++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT; + *cs++ = i915_mmio_reg_offset(last_reg); + *cs++ = intel_gt_scratch_offset(engine->gt, + INTEL_GT_SCRATCH_FIELD_DEFAULT); + *cs++ = MI_NOOP; + } + *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + } else if (IS_GEN(i915, 5)) { + *cs++ = MI_SUSPEND_FLUSH; + } + + intel_ring_advance(rq, cs); + + return 0; +} + +static int remap_l3_slice(struct i915_request *rq, int slice) +{ + u32 *cs, *remap_info = rq->engine->i915->l3_parity.remap_info[slice]; + int i; + + if (!remap_info) + return 0; + + cs = intel_ring_begin(rq, GEN7_L3LOG_SIZE/4 * 2 + 2); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + /* + * Note: We do not worry about the concurrent register cacheline hang + * here because no other code should access these registers other than + * at initialization time. + */ + *cs++ = MI_LOAD_REGISTER_IMM(GEN7_L3LOG_SIZE/4); + for (i = 0; i < GEN7_L3LOG_SIZE/4; i++) { + *cs++ = i915_mmio_reg_offset(GEN7_L3LOG(slice, i)); + *cs++ = remap_info[i]; + } + *cs++ = MI_NOOP; + intel_ring_advance(rq, cs); + + return 0; +} + +static int remap_l3(struct i915_request *rq) +{ + struct i915_gem_context *ctx = i915_request_gem_context(rq); + int i, err; + + if (!ctx || !ctx->remap_slice) + return 0; + + for (i = 0; i < MAX_L3_SLICES; i++) { + if (!(ctx->remap_slice & BIT(i))) + continue; + + err = remap_l3_slice(rq, i); + if (err) + return err; + } + + ctx->remap_slice = 0; + return 0; +} + +static int switch_mm(struct i915_request *rq, struct i915_address_space *vm) +{ + int ret; + + if (!vm) + return 0; + + ret = rq->engine->emit_flush(rq, EMIT_FLUSH); + if (ret) + return ret; + + /* + * Not only do we need a full barrier (post-sync write) after + * invalidating the TLBs, but we need to wait a little bit + * longer. Whether this is merely delaying us, or the + * subsequent flush is a key part of serialising with the + * post-sync op, this extra pass appears vital before a + * mm switch! + */ + ret = load_pd_dir(rq, vm, PP_DIR_DCLV_2G); + if (ret) + return ret; + + return rq->engine->emit_flush(rq, EMIT_INVALIDATE); +} + +static int clear_residuals(struct i915_request *rq) +{ + struct intel_engine_cs *engine = rq->engine; + int ret; + + ret = switch_mm(rq, vm_alias(engine->kernel_context->vm)); + if (ret) + return ret; + + if (engine->kernel_context->state) { + ret = mi_set_context(rq, + engine->kernel_context, + MI_MM_SPACE_GTT | MI_RESTORE_INHIBIT); + if (ret) + return ret; + } + + ret = engine->emit_bb_start(rq, + engine->wa_ctx.vma->node.start, 0, + 0); + if (ret) + return ret; + + ret = engine->emit_flush(rq, EMIT_FLUSH); + if (ret) + return ret; + + /* Always invalidate before the next switch_mm() */ + return engine->emit_flush(rq, EMIT_INVALIDATE); +} + +static int switch_context(struct i915_request *rq) +{ + struct intel_engine_cs *engine = rq->engine; + struct intel_context *ce = rq->context; + void **residuals = NULL; + int ret; + + GEM_BUG_ON(HAS_EXECLISTS(engine->i915)); + + if (engine->wa_ctx.vma && ce != engine->kernel_context) { + if (engine->wa_ctx.vma->private != ce && + i915_mitigate_clear_residuals()) { + ret = clear_residuals(rq); + if (ret) + return ret; + + residuals = &engine->wa_ctx.vma->private; + } + } + + ret = switch_mm(rq, vm_alias(ce->vm)); + if (ret) + return ret; + + if (ce->state) { + u32 flags; + + GEM_BUG_ON(engine->id != RCS0); + + /* For resource streamer on HSW+ and power context elsewhere */ + BUILD_BUG_ON(HSW_MI_RS_SAVE_STATE_EN != MI_SAVE_EXT_STATE_EN); + BUILD_BUG_ON(HSW_MI_RS_RESTORE_STATE_EN != MI_RESTORE_EXT_STATE_EN); + + flags = MI_SAVE_EXT_STATE_EN | MI_MM_SPACE_GTT; + if (test_bit(CONTEXT_VALID_BIT, &ce->flags)) + flags |= MI_RESTORE_EXT_STATE_EN; + else + flags |= MI_RESTORE_INHIBIT; + + ret = mi_set_context(rq, ce, flags); + if (ret) + return ret; + } + + ret = remap_l3(rq); + if (ret) + return ret; + + /* + * Now past the point of no return, this request _will_ be emitted. + * + * Or at least this preamble will be emitted, the request may be + * interrupted prior to submitting the user payload. If so, we + * still submit the "empty" request in order to preserve global + * state tracking such as this, our tracking of the current + * dirty context. + */ + if (residuals) { + intel_context_put(*residuals); + *residuals = intel_context_get(ce); + } + + return 0; +} + +static int ring_request_alloc(struct i915_request *request) +{ + int ret; + + GEM_BUG_ON(!intel_context_is_pinned(request->context)); + GEM_BUG_ON(i915_request_timeline(request)->has_initial_breadcrumb); + + /* + * Flush enough space to reduce the likelihood of waiting after + * we start building the request - in which case we will just + * have to repeat work. + */ + request->reserved_space += LEGACY_REQUEST_SIZE; + + /* Unconditionally invalidate GPU caches and TLBs. */ + ret = request->engine->emit_flush(request, EMIT_INVALIDATE); + if (ret) + return ret; + + ret = switch_context(request); + if (ret) + return ret; + + request->reserved_space -= LEGACY_REQUEST_SIZE; + return 0; +} + +static void gen6_bsd_submit_request(struct i915_request *request) +{ + struct intel_uncore *uncore = request->engine->uncore; + + intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL); + + /* Every tail move must follow the sequence below */ + + /* Disable notification that the ring is IDLE. The GT + * will then assume that it is busy and bring it out of rc6. + */ + intel_uncore_write_fw(uncore, GEN6_BSD_SLEEP_PSMI_CONTROL, + _MASKED_BIT_ENABLE(GEN6_BSD_SLEEP_MSG_DISABLE)); + + /* Clear the context id. Here be magic! */ + intel_uncore_write64_fw(uncore, GEN6_BSD_RNCID, 0x0); + + /* Wait for the ring not to be idle, i.e. for it to wake up. */ + if (__intel_wait_for_register_fw(uncore, + GEN6_BSD_SLEEP_PSMI_CONTROL, + GEN6_BSD_SLEEP_INDICATOR, + 0, + 1000, 0, NULL)) + drm_err(&uncore->i915->drm, + "timed out waiting for the BSD ring to wake up\n"); + + /* Now that the ring is fully powered up, update the tail */ + i9xx_submit_request(request); + + /* Let the ring send IDLE messages to the GT again, + * and so let it sleep to conserve power when idle. + */ + intel_uncore_write_fw(uncore, GEN6_BSD_SLEEP_PSMI_CONTROL, + _MASKED_BIT_DISABLE(GEN6_BSD_SLEEP_MSG_DISABLE)); + + intel_uncore_forcewake_put(uncore, FORCEWAKE_ALL); +} + +static void i9xx_set_default_submission(struct intel_engine_cs *engine) +{ + engine->submit_request = i9xx_submit_request; + + engine->park = NULL; + engine->unpark = NULL; +} + +static void gen6_bsd_set_default_submission(struct intel_engine_cs *engine) +{ + i9xx_set_default_submission(engine); + engine->submit_request = gen6_bsd_submit_request; +} + +static void ring_release(struct intel_engine_cs *engine) +{ + struct drm_i915_private *dev_priv = engine->i915; + + drm_WARN_ON(&dev_priv->drm, INTEL_GEN(dev_priv) > 2 && + (ENGINE_READ(engine, RING_MI_MODE) & MODE_IDLE) == 0); + + intel_engine_cleanup_common(engine); + + if (engine->wa_ctx.vma) { + intel_context_put(engine->wa_ctx.vma->private); + i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0); + } + + intel_ring_unpin(engine->legacy.ring); + intel_ring_put(engine->legacy.ring); + + intel_timeline_unpin(engine->legacy.timeline); + intel_timeline_put(engine->legacy.timeline); +} + +static void setup_irq(struct intel_engine_cs *engine) +{ + struct drm_i915_private *i915 = engine->i915; + + if (INTEL_GEN(i915) >= 6) { + engine->irq_enable = gen6_irq_enable; + engine->irq_disable = gen6_irq_disable; + } else if (INTEL_GEN(i915) >= 5) { + engine->irq_enable = gen5_irq_enable; + engine->irq_disable = gen5_irq_disable; + } else if (INTEL_GEN(i915) >= 3) { + engine->irq_enable = gen3_irq_enable; + engine->irq_disable = gen3_irq_disable; + } else { + engine->irq_enable = gen2_irq_enable; + engine->irq_disable = gen2_irq_disable; + } +} + +static void setup_common(struct intel_engine_cs *engine) +{ + struct drm_i915_private *i915 = engine->i915; + + /* gen8+ are only supported with execlists */ + GEM_BUG_ON(INTEL_GEN(i915) >= 8); + + setup_irq(engine); + + engine->resume = xcs_resume; + engine->reset.prepare = reset_prepare; + engine->reset.rewind = reset_rewind; + engine->reset.cancel = reset_cancel; + engine->reset.finish = reset_finish; + + engine->cops = &ring_context_ops; + engine->request_alloc = ring_request_alloc; + + /* + * Using a global execution timeline; the previous final breadcrumb is + * equivalent to our next initial bread so we can elide + * engine->emit_init_breadcrumb(). + */ + engine->emit_fini_breadcrumb = gen3_emit_breadcrumb; + if (IS_GEN(i915, 5)) + engine->emit_fini_breadcrumb = gen5_emit_breadcrumb; + + engine->set_default_submission = i9xx_set_default_submission; + + if (INTEL_GEN(i915) >= 6) + engine->emit_bb_start = gen6_emit_bb_start; + else if (INTEL_GEN(i915) >= 4) + engine->emit_bb_start = gen4_emit_bb_start; + else if (IS_I830(i915) || IS_I845G(i915)) + engine->emit_bb_start = i830_emit_bb_start; + else + engine->emit_bb_start = gen3_emit_bb_start; +} + +static void setup_rcs(struct intel_engine_cs *engine) +{ + struct drm_i915_private *i915 = engine->i915; + + if (HAS_L3_DPF(i915)) + engine->irq_keep_mask = GT_RENDER_L3_PARITY_ERROR_INTERRUPT; + + engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT; + + if (INTEL_GEN(i915) >= 7) { + engine->emit_flush = gen7_emit_flush_rcs; + engine->emit_fini_breadcrumb = gen7_emit_breadcrumb_rcs; + } else if (IS_GEN(i915, 6)) { + engine->emit_flush = gen6_emit_flush_rcs; + engine->emit_fini_breadcrumb = gen6_emit_breadcrumb_rcs; + } else if (IS_GEN(i915, 5)) { + engine->emit_flush = gen4_emit_flush_rcs; + } else { + if (INTEL_GEN(i915) < 4) + engine->emit_flush = gen2_emit_flush; + else + engine->emit_flush = gen4_emit_flush_rcs; + engine->irq_enable_mask = I915_USER_INTERRUPT; + } + + if (IS_HASWELL(i915)) + engine->emit_bb_start = hsw_emit_bb_start; +} + +static void setup_vcs(struct intel_engine_cs *engine) +{ + struct drm_i915_private *i915 = engine->i915; + + if (INTEL_GEN(i915) >= 6) { + /* gen6 bsd needs a special wa for tail updates */ + if (IS_GEN(i915, 6)) + engine->set_default_submission = gen6_bsd_set_default_submission; + engine->emit_flush = gen6_emit_flush_vcs; + engine->irq_enable_mask = GT_BSD_USER_INTERRUPT; + + if (IS_GEN(i915, 6)) + engine->emit_fini_breadcrumb = gen6_emit_breadcrumb_xcs; + else + engine->emit_fini_breadcrumb = gen7_emit_breadcrumb_xcs; + } else { + engine->emit_flush = gen4_emit_flush_vcs; + if (IS_GEN(i915, 5)) + engine->irq_enable_mask = ILK_BSD_USER_INTERRUPT; + else + engine->irq_enable_mask = I915_BSD_USER_INTERRUPT; + } +} + +static void setup_bcs(struct intel_engine_cs *engine) +{ + struct drm_i915_private *i915 = engine->i915; + + engine->emit_flush = gen6_emit_flush_xcs; + engine->irq_enable_mask = GT_BLT_USER_INTERRUPT; + + if (IS_GEN(i915, 6)) + engine->emit_fini_breadcrumb = gen6_emit_breadcrumb_xcs; + else + engine->emit_fini_breadcrumb = gen7_emit_breadcrumb_xcs; +} + +static void setup_vecs(struct intel_engine_cs *engine) +{ + struct drm_i915_private *i915 = engine->i915; + + GEM_BUG_ON(INTEL_GEN(i915) < 7); + + engine->emit_flush = gen6_emit_flush_xcs; + engine->irq_enable_mask = PM_VEBOX_USER_INTERRUPT; + engine->irq_enable = hsw_irq_enable_vecs; + engine->irq_disable = hsw_irq_disable_vecs; + + engine->emit_fini_breadcrumb = gen7_emit_breadcrumb_xcs; +} + +static int gen7_ctx_switch_bb_setup(struct intel_engine_cs * const engine, + struct i915_vma * const vma) +{ + return gen7_setup_clear_gpr_bb(engine, vma); +} + +static int gen7_ctx_switch_bb_init(struct intel_engine_cs *engine) +{ + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + int size; + int err; + + size = gen7_ctx_switch_bb_setup(engine, NULL /* probe size */); + if (size <= 0) + return size; + + size = ALIGN(size, PAGE_SIZE); + obj = i915_gem_object_create_internal(engine->i915, size); + if (IS_ERR(obj)) + return PTR_ERR(obj); + + vma = i915_vma_instance(obj, engine->gt->vm, NULL); + if (IS_ERR(vma)) { + err = PTR_ERR(vma); + goto err_obj; + } + + vma->private = intel_context_create(engine); /* dummy residuals */ + if (IS_ERR(vma->private)) { + err = PTR_ERR(vma->private); + goto err_obj; + } + + err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_HIGH); + if (err) + goto err_private; + + err = i915_vma_sync(vma); + if (err) + goto err_unpin; + + err = gen7_ctx_switch_bb_setup(engine, vma); + if (err) + goto err_unpin; + + engine->wa_ctx.vma = vma; + return 0; + +err_unpin: + i915_vma_unpin(vma); +err_private: + intel_context_put(vma->private); +err_obj: + i915_gem_object_put(obj); + return err; +} + +int intel_ring_submission_setup(struct intel_engine_cs *engine) +{ + struct intel_timeline *timeline; + struct intel_ring *ring; + int err; + + setup_common(engine); + + switch (engine->class) { + case RENDER_CLASS: + setup_rcs(engine); + break; + case VIDEO_DECODE_CLASS: + setup_vcs(engine); + break; + case COPY_ENGINE_CLASS: + setup_bcs(engine); + break; + case VIDEO_ENHANCEMENT_CLASS: + setup_vecs(engine); + break; + default: + MISSING_CASE(engine->class); + return -ENODEV; + } + + timeline = intel_timeline_create_from_engine(engine, + I915_GEM_HWS_SEQNO_ADDR); + if (IS_ERR(timeline)) { + err = PTR_ERR(timeline); + goto err; + } + GEM_BUG_ON(timeline->has_initial_breadcrumb); + + err = intel_timeline_pin(timeline, NULL); + if (err) + goto err_timeline; + + ring = intel_engine_create_ring(engine, SZ_16K); + if (IS_ERR(ring)) { + err = PTR_ERR(ring); + goto err_timeline_unpin; + } + + err = intel_ring_pin(ring, NULL); + if (err) + goto err_ring; + + GEM_BUG_ON(engine->legacy.ring); + engine->legacy.ring = ring; + engine->legacy.timeline = timeline; + + GEM_BUG_ON(timeline->hwsp_ggtt != engine->status_page.vma); + + if (IS_GEN(engine->i915, 7) && engine->class == RENDER_CLASS) { + err = gen7_ctx_switch_bb_init(engine); + if (err) + goto err_ring_unpin; + } + + /* Finally, take ownership and responsibility for cleanup! */ + engine->release = ring_release; + + return 0; + +err_ring_unpin: + intel_ring_unpin(ring); +err_ring: + intel_ring_put(ring); +err_timeline_unpin: + intel_timeline_unpin(timeline); +err_timeline: + intel_timeline_put(timeline); +err: + intel_engine_cleanup_common(engine); + return err; +} + +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) +#include "selftest_ring_submission.c" +#endif diff --git a/drivers/gpu/drm/i915/gt/intel_ring_types.h b/drivers/gpu/drm/i915/gt/intel_ring_types.h new file mode 100644 index 000000000..1a189ea00 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_ring_types.h @@ -0,0 +1,52 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#ifndef INTEL_RING_TYPES_H +#define INTEL_RING_TYPES_H + +#include <linux/atomic.h> +#include <linux/kref.h> +#include <linux/types.h> + +/* + * Early gen2 devices have a cacheline of just 32 bytes, using 64 is overkill, + * but keeps the logic simple. Indeed, the whole purpose of this macro is just + * to give some inclination as to some of the magic values used in the various + * workarounds! + */ +#define CACHELINE_BYTES 64 +#define CACHELINE_DWORDS (CACHELINE_BYTES / sizeof(u32)) + +struct i915_vma; + +struct intel_ring { + struct kref ref; + struct i915_vma *vma; + void *vaddr; + + /* + * As we have two types of rings, one global to the engine used + * by ringbuffer submission and those that are exclusive to a + * context used by execlists, we have to play safe and allow + * atomic updates to the pin_count. However, the actual pinning + * of the context is either done during initialisation for + * ringbuffer submission or serialised as part of the context + * pinning for execlists, and so we do not need a mutex ourselves + * to serialise intel_ring_pin/intel_ring_unpin. + */ + atomic_t pin_count; + + u32 head; /* updated during retire, loosely tracks RING_HEAD */ + u32 tail; /* updated on submission, used for RING_TAIL */ + u32 emit; /* updated during request construction */ + + u32 space; + u32 size; + u32 wrap; + u32 effective_size; +}; + +#endif /* INTEL_RING_TYPES_H */ diff --git a/drivers/gpu/drm/i915/gt/intel_rps.c b/drivers/gpu/drm/i915/gt/intel_rps.c new file mode 100644 index 000000000..c1c9cc0ad --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_rps.c @@ -0,0 +1,2098 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#include <drm/i915_drm.h> + +#include "i915_drv.h" +#include "intel_breadcrumbs.h" +#include "intel_gt.h" +#include "intel_gt_clock_utils.h" +#include "intel_gt_irq.h" +#include "intel_gt_pm_irq.h" +#include "intel_rps.h" +#include "intel_sideband.h" +#include "../../../platform/x86/intel_ips.h" + +#define BUSY_MAX_EI 20u /* ms */ + +/* + * Lock protecting IPS related data structures + */ +static DEFINE_SPINLOCK(mchdev_lock); + +static struct intel_gt *rps_to_gt(struct intel_rps *rps) +{ + return container_of(rps, struct intel_gt, rps); +} + +static struct drm_i915_private *rps_to_i915(struct intel_rps *rps) +{ + return rps_to_gt(rps)->i915; +} + +static struct intel_uncore *rps_to_uncore(struct intel_rps *rps) +{ + return rps_to_gt(rps)->uncore; +} + +static u32 rps_pm_sanitize_mask(struct intel_rps *rps, u32 mask) +{ + return mask & ~rps->pm_intrmsk_mbz; +} + +static inline void set(struct intel_uncore *uncore, i915_reg_t reg, u32 val) +{ + intel_uncore_write_fw(uncore, reg, val); +} + +static void rps_timer(struct timer_list *t) +{ + struct intel_rps *rps = from_timer(rps, t, timer); + struct intel_engine_cs *engine; + ktime_t dt, last, timestamp; + enum intel_engine_id id; + s64 max_busy[3] = {}; + + timestamp = 0; + for_each_engine(engine, rps_to_gt(rps), id) { + s64 busy; + int i; + + dt = intel_engine_get_busy_time(engine, ×tamp); + last = engine->stats.rps; + engine->stats.rps = dt; + + busy = ktime_to_ns(ktime_sub(dt, last)); + for (i = 0; i < ARRAY_SIZE(max_busy); i++) { + if (busy > max_busy[i]) + swap(busy, max_busy[i]); + } + } + last = rps->pm_timestamp; + rps->pm_timestamp = timestamp; + + if (intel_rps_is_active(rps)) { + s64 busy; + int i; + + dt = ktime_sub(timestamp, last); + + /* + * Our goal is to evaluate each engine independently, so we run + * at the lowest clocks required to sustain the heaviest + * workload. However, a task may be split into sequential + * dependent operations across a set of engines, such that + * the independent contributions do not account for high load, + * but overall the task is GPU bound. For example, consider + * video decode on vcs followed by colour post-processing + * on vecs, followed by general post-processing on rcs. + * Since multi-engines being active does imply a single + * continuous workload across all engines, we hedge our + * bets by only contributing a factor of the distributed + * load into our busyness calculation. + */ + busy = max_busy[0]; + for (i = 1; i < ARRAY_SIZE(max_busy); i++) { + if (!max_busy[i]) + break; + + busy += div_u64(max_busy[i], 1 << i); + } + GT_TRACE(rps_to_gt(rps), + "busy:%lld [%d%%], max:[%lld, %lld, %lld], interval:%d\n", + busy, (int)div64_u64(100 * busy, dt), + max_busy[0], max_busy[1], max_busy[2], + rps->pm_interval); + + if (100 * busy > rps->power.up_threshold * dt && + rps->cur_freq < rps->max_freq_softlimit) { + rps->pm_iir |= GEN6_PM_RP_UP_THRESHOLD; + rps->pm_interval = 1; + schedule_work(&rps->work); + } else if (100 * busy < rps->power.down_threshold * dt && + rps->cur_freq > rps->min_freq_softlimit) { + rps->pm_iir |= GEN6_PM_RP_DOWN_THRESHOLD; + rps->pm_interval = 1; + schedule_work(&rps->work); + } else { + rps->last_adj = 0; + } + + mod_timer(&rps->timer, + jiffies + msecs_to_jiffies(rps->pm_interval)); + rps->pm_interval = min(rps->pm_interval * 2, BUSY_MAX_EI); + } +} + +static void rps_start_timer(struct intel_rps *rps) +{ + rps->pm_timestamp = ktime_sub(ktime_get(), rps->pm_timestamp); + rps->pm_interval = 1; + mod_timer(&rps->timer, jiffies + 1); +} + +static void rps_stop_timer(struct intel_rps *rps) +{ + del_timer_sync(&rps->timer); + rps->pm_timestamp = ktime_sub(ktime_get(), rps->pm_timestamp); + cancel_work_sync(&rps->work); +} + +static u32 rps_pm_mask(struct intel_rps *rps, u8 val) +{ + u32 mask = 0; + + /* We use UP_EI_EXPIRED interrupts for both up/down in manual mode */ + if (val > rps->min_freq_softlimit) + mask |= (GEN6_PM_RP_UP_EI_EXPIRED | + GEN6_PM_RP_DOWN_THRESHOLD | + GEN6_PM_RP_DOWN_TIMEOUT); + + if (val < rps->max_freq_softlimit) + mask |= GEN6_PM_RP_UP_EI_EXPIRED | GEN6_PM_RP_UP_THRESHOLD; + + mask &= rps->pm_events; + + return rps_pm_sanitize_mask(rps, ~mask); +} + +static void rps_reset_ei(struct intel_rps *rps) +{ + memset(&rps->ei, 0, sizeof(rps->ei)); +} + +static void rps_enable_interrupts(struct intel_rps *rps) +{ + struct intel_gt *gt = rps_to_gt(rps); + + GT_TRACE(gt, "interrupts:on rps->pm_events: %x, rps_pm_mask:%x\n", + rps->pm_events, rps_pm_mask(rps, rps->last_freq)); + + rps_reset_ei(rps); + + spin_lock_irq(>->irq_lock); + gen6_gt_pm_enable_irq(gt, rps->pm_events); + spin_unlock_irq(>->irq_lock); + + intel_uncore_write(gt->uncore, + GEN6_PMINTRMSK, rps_pm_mask(rps, rps->last_freq)); +} + +static void gen6_rps_reset_interrupts(struct intel_rps *rps) +{ + gen6_gt_pm_reset_iir(rps_to_gt(rps), GEN6_PM_RPS_EVENTS); +} + +static void gen11_rps_reset_interrupts(struct intel_rps *rps) +{ + while (gen11_gt_reset_one_iir(rps_to_gt(rps), 0, GEN11_GTPM)) + ; +} + +static void rps_reset_interrupts(struct intel_rps *rps) +{ + struct intel_gt *gt = rps_to_gt(rps); + + spin_lock_irq(>->irq_lock); + if (INTEL_GEN(gt->i915) >= 11) + gen11_rps_reset_interrupts(rps); + else + gen6_rps_reset_interrupts(rps); + + rps->pm_iir = 0; + spin_unlock_irq(>->irq_lock); +} + +static void rps_disable_interrupts(struct intel_rps *rps) +{ + struct intel_gt *gt = rps_to_gt(rps); + + intel_uncore_write(gt->uncore, + GEN6_PMINTRMSK, rps_pm_sanitize_mask(rps, ~0u)); + + spin_lock_irq(>->irq_lock); + gen6_gt_pm_disable_irq(gt, GEN6_PM_RPS_EVENTS); + spin_unlock_irq(>->irq_lock); + + intel_synchronize_irq(gt->i915); + + /* + * Now that we will not be generating any more work, flush any + * outstanding tasks. As we are called on the RPS idle path, + * we will reset the GPU to minimum frequencies, so the current + * state of the worker can be discarded. + */ + cancel_work_sync(&rps->work); + + rps_reset_interrupts(rps); + GT_TRACE(gt, "interrupts:off\n"); +} + +static const struct cparams { + u16 i; + u16 t; + u16 m; + u16 c; +} cparams[] = { + { 1, 1333, 301, 28664 }, + { 1, 1066, 294, 24460 }, + { 1, 800, 294, 25192 }, + { 0, 1333, 276, 27605 }, + { 0, 1066, 276, 27605 }, + { 0, 800, 231, 23784 }, +}; + +static void gen5_rps_init(struct intel_rps *rps) +{ + struct drm_i915_private *i915 = rps_to_i915(rps); + struct intel_uncore *uncore = rps_to_uncore(rps); + u8 fmax, fmin, fstart; + u32 rgvmodectl; + int c_m, i; + + if (i915->fsb_freq <= 3200) + c_m = 0; + else if (i915->fsb_freq <= 4800) + c_m = 1; + else + c_m = 2; + + for (i = 0; i < ARRAY_SIZE(cparams); i++) { + if (cparams[i].i == c_m && cparams[i].t == i915->mem_freq) { + rps->ips.m = cparams[i].m; + rps->ips.c = cparams[i].c; + break; + } + } + + rgvmodectl = intel_uncore_read(uncore, MEMMODECTL); + + /* Set up min, max, and cur for interrupt handling */ + fmax = (rgvmodectl & MEMMODE_FMAX_MASK) >> MEMMODE_FMAX_SHIFT; + fmin = (rgvmodectl & MEMMODE_FMIN_MASK); + fstart = (rgvmodectl & MEMMODE_FSTART_MASK) >> + MEMMODE_FSTART_SHIFT; + drm_dbg(&i915->drm, "fmax: %d, fmin: %d, fstart: %d\n", + fmax, fmin, fstart); + + rps->min_freq = fmax; + rps->efficient_freq = fstart; + rps->max_freq = fmin; +} + +static unsigned long +__ips_chipset_val(struct intel_ips *ips) +{ + struct intel_uncore *uncore = + rps_to_uncore(container_of(ips, struct intel_rps, ips)); + unsigned long now = jiffies_to_msecs(jiffies), dt; + unsigned long result; + u64 total, delta; + + lockdep_assert_held(&mchdev_lock); + + /* + * Prevent division-by-zero if we are asking too fast. + * Also, we don't get interesting results if we are polling + * faster than once in 10ms, so just return the saved value + * in such cases. + */ + dt = now - ips->last_time1; + if (dt <= 10) + return ips->chipset_power; + + /* FIXME: handle per-counter overflow */ + total = intel_uncore_read(uncore, DMIEC); + total += intel_uncore_read(uncore, DDREC); + total += intel_uncore_read(uncore, CSIEC); + + delta = total - ips->last_count1; + + result = div_u64(div_u64(ips->m * delta, dt) + ips->c, 10); + + ips->last_count1 = total; + ips->last_time1 = now; + + ips->chipset_power = result; + + return result; +} + +static unsigned long ips_mch_val(struct intel_uncore *uncore) +{ + unsigned int m, x, b; + u32 tsfs; + + tsfs = intel_uncore_read(uncore, TSFS); + x = intel_uncore_read8(uncore, TR1); + + b = tsfs & TSFS_INTR_MASK; + m = (tsfs & TSFS_SLOPE_MASK) >> TSFS_SLOPE_SHIFT; + + return m * x / 127 - b; +} + +static int _pxvid_to_vd(u8 pxvid) +{ + if (pxvid == 0) + return 0; + + if (pxvid >= 8 && pxvid < 31) + pxvid = 31; + + return (pxvid + 2) * 125; +} + +static u32 pvid_to_extvid(struct drm_i915_private *i915, u8 pxvid) +{ + const int vd = _pxvid_to_vd(pxvid); + + if (INTEL_INFO(i915)->is_mobile) + return max(vd - 1125, 0); + + return vd; +} + +static void __gen5_ips_update(struct intel_ips *ips) +{ + struct intel_uncore *uncore = + rps_to_uncore(container_of(ips, struct intel_rps, ips)); + u64 now, delta, dt; + u32 count; + + lockdep_assert_held(&mchdev_lock); + + now = ktime_get_raw_ns(); + dt = now - ips->last_time2; + do_div(dt, NSEC_PER_MSEC); + + /* Don't divide by 0 */ + if (dt <= 10) + return; + + count = intel_uncore_read(uncore, GFXEC); + delta = count - ips->last_count2; + + ips->last_count2 = count; + ips->last_time2 = now; + + /* More magic constants... */ + ips->gfx_power = div_u64(delta * 1181, dt * 10); +} + +static void gen5_rps_update(struct intel_rps *rps) +{ + spin_lock_irq(&mchdev_lock); + __gen5_ips_update(&rps->ips); + spin_unlock_irq(&mchdev_lock); +} + +static bool gen5_rps_set(struct intel_rps *rps, u8 val) +{ + struct intel_uncore *uncore = rps_to_uncore(rps); + u16 rgvswctl; + + lockdep_assert_held(&mchdev_lock); + + rgvswctl = intel_uncore_read16(uncore, MEMSWCTL); + if (rgvswctl & MEMCTL_CMD_STS) { + DRM_DEBUG("gpu busy, RCS change rejected\n"); + return false; /* still busy with another command */ + } + + /* Invert the frequency bin into an ips delay */ + val = rps->max_freq - val; + val = rps->min_freq + val; + + rgvswctl = + (MEMCTL_CMD_CHFREQ << MEMCTL_CMD_SHIFT) | + (val << MEMCTL_FREQ_SHIFT) | + MEMCTL_SFCAVM; + intel_uncore_write16(uncore, MEMSWCTL, rgvswctl); + intel_uncore_posting_read16(uncore, MEMSWCTL); + + rgvswctl |= MEMCTL_CMD_STS; + intel_uncore_write16(uncore, MEMSWCTL, rgvswctl); + + return true; +} + +static unsigned long intel_pxfreq(u32 vidfreq) +{ + int div = (vidfreq & 0x3f0000) >> 16; + int post = (vidfreq & 0x3000) >> 12; + int pre = (vidfreq & 0x7); + + if (!pre) + return 0; + + return div * 133333 / (pre << post); +} + +static unsigned int init_emon(struct intel_uncore *uncore) +{ + u8 pxw[16]; + int i; + + /* Disable to program */ + intel_uncore_write(uncore, ECR, 0); + intel_uncore_posting_read(uncore, ECR); + + /* Program energy weights for various events */ + intel_uncore_write(uncore, SDEW, 0x15040d00); + intel_uncore_write(uncore, CSIEW0, 0x007f0000); + intel_uncore_write(uncore, CSIEW1, 0x1e220004); + intel_uncore_write(uncore, CSIEW2, 0x04000004); + + for (i = 0; i < 5; i++) + intel_uncore_write(uncore, PEW(i), 0); + for (i = 0; i < 3; i++) + intel_uncore_write(uncore, DEW(i), 0); + + /* Program P-state weights to account for frequency power adjustment */ + for (i = 0; i < 16; i++) { + u32 pxvidfreq = intel_uncore_read(uncore, PXVFREQ(i)); + unsigned int freq = intel_pxfreq(pxvidfreq); + unsigned int vid = + (pxvidfreq & PXVFREQ_PX_MASK) >> PXVFREQ_PX_SHIFT; + unsigned int val; + + val = vid * vid * freq / 1000 * 255; + val /= 127 * 127 * 900; + + pxw[i] = val; + } + /* Render standby states get 0 weight */ + pxw[14] = 0; + pxw[15] = 0; + + for (i = 0; i < 4; i++) { + intel_uncore_write(uncore, PXW(i), + pxw[i * 4 + 0] << 24 | + pxw[i * 4 + 1] << 16 | + pxw[i * 4 + 2] << 8 | + pxw[i * 4 + 3] << 0); + } + + /* Adjust magic regs to magic values (more experimental results) */ + intel_uncore_write(uncore, OGW0, 0); + intel_uncore_write(uncore, OGW1, 0); + intel_uncore_write(uncore, EG0, 0x00007f00); + intel_uncore_write(uncore, EG1, 0x0000000e); + intel_uncore_write(uncore, EG2, 0x000e0000); + intel_uncore_write(uncore, EG3, 0x68000300); + intel_uncore_write(uncore, EG4, 0x42000000); + intel_uncore_write(uncore, EG5, 0x00140031); + intel_uncore_write(uncore, EG6, 0); + intel_uncore_write(uncore, EG7, 0); + + for (i = 0; i < 8; i++) + intel_uncore_write(uncore, PXWL(i), 0); + + /* Enable PMON + select events */ + intel_uncore_write(uncore, ECR, 0x80000019); + + return intel_uncore_read(uncore, LCFUSE02) & LCFUSE_HIV_MASK; +} + +static bool gen5_rps_enable(struct intel_rps *rps) +{ + struct intel_uncore *uncore = rps_to_uncore(rps); + u8 fstart, vstart; + u32 rgvmodectl; + + spin_lock_irq(&mchdev_lock); + + rgvmodectl = intel_uncore_read(uncore, MEMMODECTL); + + /* Enable temp reporting */ + intel_uncore_write16(uncore, PMMISC, + intel_uncore_read16(uncore, PMMISC) | MCPPCE_EN); + intel_uncore_write16(uncore, TSC1, + intel_uncore_read16(uncore, TSC1) | TSE); + + /* 100ms RC evaluation intervals */ + intel_uncore_write(uncore, RCUPEI, 100000); + intel_uncore_write(uncore, RCDNEI, 100000); + + /* Set max/min thresholds to 90ms and 80ms respectively */ + intel_uncore_write(uncore, RCBMAXAVG, 90000); + intel_uncore_write(uncore, RCBMINAVG, 80000); + + intel_uncore_write(uncore, MEMIHYST, 1); + + /* Set up min, max, and cur for interrupt handling */ + fstart = (rgvmodectl & MEMMODE_FSTART_MASK) >> + MEMMODE_FSTART_SHIFT; + + vstart = (intel_uncore_read(uncore, PXVFREQ(fstart)) & + PXVFREQ_PX_MASK) >> PXVFREQ_PX_SHIFT; + + intel_uncore_write(uncore, + MEMINTREN, + MEMINT_CX_SUPR_EN | MEMINT_EVAL_CHG_EN); + + intel_uncore_write(uncore, VIDSTART, vstart); + intel_uncore_posting_read(uncore, VIDSTART); + + rgvmodectl |= MEMMODE_SWMODE_EN; + intel_uncore_write(uncore, MEMMODECTL, rgvmodectl); + + if (wait_for_atomic((intel_uncore_read(uncore, MEMSWCTL) & + MEMCTL_CMD_STS) == 0, 10)) + drm_err(&uncore->i915->drm, + "stuck trying to change perf mode\n"); + mdelay(1); + + gen5_rps_set(rps, rps->cur_freq); + + rps->ips.last_count1 = intel_uncore_read(uncore, DMIEC); + rps->ips.last_count1 += intel_uncore_read(uncore, DDREC); + rps->ips.last_count1 += intel_uncore_read(uncore, CSIEC); + rps->ips.last_time1 = jiffies_to_msecs(jiffies); + + rps->ips.last_count2 = intel_uncore_read(uncore, GFXEC); + rps->ips.last_time2 = ktime_get_raw_ns(); + + spin_unlock_irq(&mchdev_lock); + + rps->ips.corr = init_emon(uncore); + + return true; +} + +static void gen5_rps_disable(struct intel_rps *rps) +{ + struct intel_uncore *uncore = rps_to_uncore(rps); + u16 rgvswctl; + + spin_lock_irq(&mchdev_lock); + + rgvswctl = intel_uncore_read16(uncore, MEMSWCTL); + + /* Ack interrupts, disable EFC interrupt */ + intel_uncore_write(uncore, MEMINTREN, + intel_uncore_read(uncore, MEMINTREN) & + ~MEMINT_EVAL_CHG_EN); + intel_uncore_write(uncore, MEMINTRSTS, MEMINT_EVAL_CHG); + intel_uncore_write(uncore, DEIER, + intel_uncore_read(uncore, DEIER) & ~DE_PCU_EVENT); + intel_uncore_write(uncore, DEIIR, DE_PCU_EVENT); + intel_uncore_write(uncore, DEIMR, + intel_uncore_read(uncore, DEIMR) | DE_PCU_EVENT); + + /* Go back to the starting frequency */ + gen5_rps_set(rps, rps->idle_freq); + mdelay(1); + rgvswctl |= MEMCTL_CMD_STS; + intel_uncore_write(uncore, MEMSWCTL, rgvswctl); + mdelay(1); + + spin_unlock_irq(&mchdev_lock); +} + +static u32 rps_limits(struct intel_rps *rps, u8 val) +{ + u32 limits; + + /* + * Only set the down limit when we've reached the lowest level to avoid + * getting more interrupts, otherwise leave this clear. This prevents a + * race in the hw when coming out of rc6: There's a tiny window where + * the hw runs at the minimal clock before selecting the desired + * frequency, if the down threshold expires in that window we will not + * receive a down interrupt. + */ + if (INTEL_GEN(rps_to_i915(rps)) >= 9) { + limits = rps->max_freq_softlimit << 23; + if (val <= rps->min_freq_softlimit) + limits |= rps->min_freq_softlimit << 14; + } else { + limits = rps->max_freq_softlimit << 24; + if (val <= rps->min_freq_softlimit) + limits |= rps->min_freq_softlimit << 16; + } + + return limits; +} + +static void rps_set_power(struct intel_rps *rps, int new_power) +{ + struct intel_gt *gt = rps_to_gt(rps); + struct intel_uncore *uncore = gt->uncore; + u32 threshold_up = 0, threshold_down = 0; /* in % */ + u32 ei_up = 0, ei_down = 0; + + lockdep_assert_held(&rps->power.mutex); + + if (new_power == rps->power.mode) + return; + + threshold_up = 95; + threshold_down = 85; + + /* Note the units here are not exactly 1us, but 1280ns. */ + switch (new_power) { + case LOW_POWER: + ei_up = 16000; + ei_down = 32000; + break; + + case BETWEEN: + ei_up = 13000; + ei_down = 32000; + break; + + case HIGH_POWER: + ei_up = 10000; + ei_down = 32000; + break; + } + + /* When byt can survive without system hang with dynamic + * sw freq adjustments, this restriction can be lifted. + */ + if (IS_VALLEYVIEW(gt->i915)) + goto skip_hw_write; + + GT_TRACE(gt, + "changing power mode [%d], up %d%% @ %dus, down %d%% @ %dus\n", + new_power, threshold_up, ei_up, threshold_down, ei_down); + + set(uncore, GEN6_RP_UP_EI, + intel_gt_ns_to_pm_interval(gt, ei_up * 1000)); + set(uncore, GEN6_RP_UP_THRESHOLD, + intel_gt_ns_to_pm_interval(gt, ei_up * threshold_up * 10)); + + set(uncore, GEN6_RP_DOWN_EI, + intel_gt_ns_to_pm_interval(gt, ei_down * 1000)); + set(uncore, GEN6_RP_DOWN_THRESHOLD, + intel_gt_ns_to_pm_interval(gt, ei_down * threshold_down * 10)); + + set(uncore, GEN6_RP_CONTROL, + (INTEL_GEN(gt->i915) > 9 ? 0 : GEN6_RP_MEDIA_TURBO) | + GEN6_RP_MEDIA_HW_NORMAL_MODE | + GEN6_RP_MEDIA_IS_GFX | + GEN6_RP_ENABLE | + GEN6_RP_UP_BUSY_AVG | + GEN6_RP_DOWN_IDLE_AVG); + +skip_hw_write: + rps->power.mode = new_power; + rps->power.up_threshold = threshold_up; + rps->power.down_threshold = threshold_down; +} + +static void gen6_rps_set_thresholds(struct intel_rps *rps, u8 val) +{ + int new_power; + + new_power = rps->power.mode; + switch (rps->power.mode) { + case LOW_POWER: + if (val > rps->efficient_freq + 1 && + val > rps->cur_freq) + new_power = BETWEEN; + break; + + case BETWEEN: + if (val <= rps->efficient_freq && + val < rps->cur_freq) + new_power = LOW_POWER; + else if (val >= rps->rp0_freq && + val > rps->cur_freq) + new_power = HIGH_POWER; + break; + + case HIGH_POWER: + if (val < (rps->rp1_freq + rps->rp0_freq) >> 1 && + val < rps->cur_freq) + new_power = BETWEEN; + break; + } + /* Max/min bins are special */ + if (val <= rps->min_freq_softlimit) + new_power = LOW_POWER; + if (val >= rps->max_freq_softlimit) + new_power = HIGH_POWER; + + mutex_lock(&rps->power.mutex); + if (rps->power.interactive) + new_power = HIGH_POWER; + rps_set_power(rps, new_power); + mutex_unlock(&rps->power.mutex); +} + +void intel_rps_mark_interactive(struct intel_rps *rps, bool interactive) +{ + GT_TRACE(rps_to_gt(rps), "mark interactive: %s\n", yesno(interactive)); + + mutex_lock(&rps->power.mutex); + if (interactive) { + if (!rps->power.interactive++ && intel_rps_is_active(rps)) + rps_set_power(rps, HIGH_POWER); + } else { + GEM_BUG_ON(!rps->power.interactive); + rps->power.interactive--; + } + mutex_unlock(&rps->power.mutex); +} + +static int gen6_rps_set(struct intel_rps *rps, u8 val) +{ + struct intel_uncore *uncore = rps_to_uncore(rps); + struct drm_i915_private *i915 = rps_to_i915(rps); + u32 swreq; + + if (INTEL_GEN(i915) >= 9) + swreq = GEN9_FREQUENCY(val); + else if (IS_HASWELL(i915) || IS_BROADWELL(i915)) + swreq = HSW_FREQUENCY(val); + else + swreq = (GEN6_FREQUENCY(val) | + GEN6_OFFSET(0) | + GEN6_AGGRESSIVE_TURBO); + set(uncore, GEN6_RPNSWREQ, swreq); + + GT_TRACE(rps_to_gt(rps), "set val:%x, freq:%d, swreq:%x\n", + val, intel_gpu_freq(rps, val), swreq); + + return 0; +} + +static int vlv_rps_set(struct intel_rps *rps, u8 val) +{ + struct drm_i915_private *i915 = rps_to_i915(rps); + int err; + + vlv_punit_get(i915); + err = vlv_punit_write(i915, PUNIT_REG_GPU_FREQ_REQ, val); + vlv_punit_put(i915); + + GT_TRACE(rps_to_gt(rps), "set val:%x, freq:%d\n", + val, intel_gpu_freq(rps, val)); + + return err; +} + +static int rps_set(struct intel_rps *rps, u8 val, bool update) +{ + struct drm_i915_private *i915 = rps_to_i915(rps); + int err; + + if (INTEL_GEN(i915) < 6) + return 0; + + if (val == rps->last_freq) + return 0; + + if (IS_VALLEYVIEW(i915) || IS_CHERRYVIEW(i915)) + err = vlv_rps_set(rps, val); + else + err = gen6_rps_set(rps, val); + if (err) + return err; + + if (update) + gen6_rps_set_thresholds(rps, val); + rps->last_freq = val; + + return 0; +} + +void intel_rps_unpark(struct intel_rps *rps) +{ + if (!intel_rps_is_enabled(rps)) + return; + + GT_TRACE(rps_to_gt(rps), "unpark:%x\n", rps->cur_freq); + + /* + * Use the user's desired frequency as a guide, but for better + * performance, jump directly to RPe as our starting frequency. + */ + mutex_lock(&rps->lock); + + intel_rps_set_active(rps); + intel_rps_set(rps, + clamp(rps->cur_freq, + rps->min_freq_softlimit, + rps->max_freq_softlimit)); + + mutex_unlock(&rps->lock); + + rps->pm_iir = 0; + if (intel_rps_has_interrupts(rps)) + rps_enable_interrupts(rps); + if (intel_rps_uses_timer(rps)) + rps_start_timer(rps); + + if (IS_GEN(rps_to_i915(rps), 5)) + gen5_rps_update(rps); +} + +void intel_rps_park(struct intel_rps *rps) +{ + int adj; + + if (!intel_rps_clear_active(rps)) + return; + + if (intel_rps_uses_timer(rps)) + rps_stop_timer(rps); + if (intel_rps_has_interrupts(rps)) + rps_disable_interrupts(rps); + + if (rps->last_freq <= rps->idle_freq) + return; + + /* + * The punit delays the write of the frequency and voltage until it + * determines the GPU is awake. During normal usage we don't want to + * waste power changing the frequency if the GPU is sleeping (rc6). + * However, the GPU and driver is now idle and we do not want to delay + * switching to minimum voltage (reducing power whilst idle) as we do + * not expect to be woken in the near future and so must flush the + * change by waking the device. + * + * We choose to take the media powerwell (either would do to trick the + * punit into committing the voltage change) as that takes a lot less + * power than the render powerwell. + */ + intel_uncore_forcewake_get(rps_to_uncore(rps), FORCEWAKE_MEDIA); + rps_set(rps, rps->idle_freq, false); + intel_uncore_forcewake_put(rps_to_uncore(rps), FORCEWAKE_MEDIA); + + /* + * Since we will try and restart from the previously requested + * frequency on unparking, treat this idle point as a downclock + * interrupt and reduce the frequency for resume. If we park/unpark + * more frequently than the rps worker can run, we will not respond + * to any EI and never see a change in frequency. + * + * (Note we accommodate Cherryview's limitation of only using an + * even bin by applying it to all.) + */ + adj = rps->last_adj; + if (adj < 0) + adj *= 2; + else /* CHV needs even encode values */ + adj = -2; + rps->last_adj = adj; + rps->cur_freq = max_t(int, rps->cur_freq + adj, rps->min_freq); + if (rps->cur_freq < rps->efficient_freq) { + rps->cur_freq = rps->efficient_freq; + rps->last_adj = 0; + } + + GT_TRACE(rps_to_gt(rps), "park:%x\n", rps->cur_freq); +} + +void intel_rps_boost(struct i915_request *rq) +{ + struct intel_rps *rps = &READ_ONCE(rq->engine)->gt->rps; + unsigned long flags; + + if (i915_request_signaled(rq) || !intel_rps_is_active(rps)) + return; + + /* Serializes with i915_request_retire() */ + spin_lock_irqsave(&rq->lock, flags); + if (!i915_request_has_waitboost(rq) && + !dma_fence_is_signaled_locked(&rq->fence)) { + set_bit(I915_FENCE_FLAG_BOOST, &rq->fence.flags); + + GT_TRACE(rps_to_gt(rps), "boost fence:%llx:%llx\n", + rq->fence.context, rq->fence.seqno); + + if (!atomic_fetch_inc(&rps->num_waiters) && + READ_ONCE(rps->cur_freq) < rps->boost_freq) + schedule_work(&rps->work); + + atomic_inc(&rps->boosts); + } + spin_unlock_irqrestore(&rq->lock, flags); +} + +int intel_rps_set(struct intel_rps *rps, u8 val) +{ + int err; + + lockdep_assert_held(&rps->lock); + GEM_BUG_ON(val > rps->max_freq); + GEM_BUG_ON(val < rps->min_freq); + + if (intel_rps_is_active(rps)) { + err = rps_set(rps, val, true); + if (err) + return err; + + /* + * Make sure we continue to get interrupts + * until we hit the minimum or maximum frequencies. + */ + if (intel_rps_has_interrupts(rps)) { + struct intel_uncore *uncore = rps_to_uncore(rps); + + set(uncore, + GEN6_RP_INTERRUPT_LIMITS, rps_limits(rps, val)); + + set(uncore, GEN6_PMINTRMSK, rps_pm_mask(rps, val)); + } + } + + rps->cur_freq = val; + return 0; +} + +static void gen6_rps_init(struct intel_rps *rps) +{ + struct drm_i915_private *i915 = rps_to_i915(rps); + struct intel_uncore *uncore = rps_to_uncore(rps); + + /* All of these values are in units of 50MHz */ + + /* static values from HW: RP0 > RP1 > RPn (min_freq) */ + if (IS_GEN9_LP(i915)) { + u32 rp_state_cap = intel_uncore_read(uncore, BXT_RP_STATE_CAP); + + rps->rp0_freq = (rp_state_cap >> 16) & 0xff; + rps->rp1_freq = (rp_state_cap >> 8) & 0xff; + rps->min_freq = (rp_state_cap >> 0) & 0xff; + } else { + u32 rp_state_cap = intel_uncore_read(uncore, GEN6_RP_STATE_CAP); + + rps->rp0_freq = (rp_state_cap >> 0) & 0xff; + rps->rp1_freq = (rp_state_cap >> 8) & 0xff; + rps->min_freq = (rp_state_cap >> 16) & 0xff; + } + + /* hw_max = RP0 until we check for overclocking */ + rps->max_freq = rps->rp0_freq; + + rps->efficient_freq = rps->rp1_freq; + if (IS_HASWELL(i915) || IS_BROADWELL(i915) || + IS_GEN9_BC(i915) || INTEL_GEN(i915) >= 10) { + u32 ddcc_status = 0; + + if (sandybridge_pcode_read(i915, + HSW_PCODE_DYNAMIC_DUTY_CYCLE_CONTROL, + &ddcc_status, NULL) == 0) + rps->efficient_freq = + clamp_t(u8, + (ddcc_status >> 8) & 0xff, + rps->min_freq, + rps->max_freq); + } + + if (IS_GEN9_BC(i915) || INTEL_GEN(i915) >= 10) { + /* Store the frequency values in 16.66 MHZ units, which is + * the natural hardware unit for SKL + */ + rps->rp0_freq *= GEN9_FREQ_SCALER; + rps->rp1_freq *= GEN9_FREQ_SCALER; + rps->min_freq *= GEN9_FREQ_SCALER; + rps->max_freq *= GEN9_FREQ_SCALER; + rps->efficient_freq *= GEN9_FREQ_SCALER; + } +} + +static bool rps_reset(struct intel_rps *rps) +{ + struct drm_i915_private *i915 = rps_to_i915(rps); + + /* force a reset */ + rps->power.mode = -1; + rps->last_freq = -1; + + if (rps_set(rps, rps->min_freq, true)) { + drm_err(&i915->drm, "Failed to reset RPS to initial values\n"); + return false; + } + + rps->cur_freq = rps->min_freq; + return true; +} + +/* See the Gen9_GT_PM_Programming_Guide doc for the below */ +static bool gen9_rps_enable(struct intel_rps *rps) +{ + struct intel_gt *gt = rps_to_gt(rps); + struct intel_uncore *uncore = gt->uncore; + + /* Program defaults and thresholds for RPS */ + if (IS_GEN(gt->i915, 9)) + intel_uncore_write_fw(uncore, GEN6_RC_VIDEO_FREQ, + GEN9_FREQUENCY(rps->rp1_freq)); + + intel_uncore_write_fw(uncore, GEN6_RP_IDLE_HYSTERSIS, 0xa); + + rps->pm_events = GEN6_PM_RP_UP_THRESHOLD | GEN6_PM_RP_DOWN_THRESHOLD; + + return rps_reset(rps); +} + +static bool gen8_rps_enable(struct intel_rps *rps) +{ + struct intel_uncore *uncore = rps_to_uncore(rps); + + intel_uncore_write_fw(uncore, GEN6_RC_VIDEO_FREQ, + HSW_FREQUENCY(rps->rp1_freq)); + + intel_uncore_write_fw(uncore, GEN6_RP_IDLE_HYSTERSIS, 10); + + rps->pm_events = GEN6_PM_RP_UP_THRESHOLD | GEN6_PM_RP_DOWN_THRESHOLD; + + return rps_reset(rps); +} + +static bool gen6_rps_enable(struct intel_rps *rps) +{ + struct intel_uncore *uncore = rps_to_uncore(rps); + + /* Power down if completely idle for over 50ms */ + intel_uncore_write_fw(uncore, GEN6_RP_DOWN_TIMEOUT, 50000); + intel_uncore_write_fw(uncore, GEN6_RP_IDLE_HYSTERSIS, 10); + + rps->pm_events = (GEN6_PM_RP_UP_THRESHOLD | + GEN6_PM_RP_DOWN_THRESHOLD | + GEN6_PM_RP_DOWN_TIMEOUT); + + return rps_reset(rps); +} + +static int chv_rps_max_freq(struct intel_rps *rps) +{ + struct drm_i915_private *i915 = rps_to_i915(rps); + struct intel_gt *gt = rps_to_gt(rps); + u32 val; + + val = vlv_punit_read(i915, FB_GFX_FMAX_AT_VMAX_FUSE); + + switch (gt->info.sseu.eu_total) { + case 8: + /* (2 * 4) config */ + val >>= FB_GFX_FMAX_AT_VMAX_2SS4EU_FUSE_SHIFT; + break; + case 12: + /* (2 * 6) config */ + val >>= FB_GFX_FMAX_AT_VMAX_2SS6EU_FUSE_SHIFT; + break; + case 16: + /* (2 * 8) config */ + default: + /* Setting (2 * 8) Min RP0 for any other combination */ + val >>= FB_GFX_FMAX_AT_VMAX_2SS8EU_FUSE_SHIFT; + break; + } + + return val & FB_GFX_FREQ_FUSE_MASK; +} + +static int chv_rps_rpe_freq(struct intel_rps *rps) +{ + struct drm_i915_private *i915 = rps_to_i915(rps); + u32 val; + + val = vlv_punit_read(i915, PUNIT_GPU_DUTYCYCLE_REG); + val >>= PUNIT_GPU_DUTYCYCLE_RPE_FREQ_SHIFT; + + return val & PUNIT_GPU_DUTYCYCLE_RPE_FREQ_MASK; +} + +static int chv_rps_guar_freq(struct intel_rps *rps) +{ + struct drm_i915_private *i915 = rps_to_i915(rps); + u32 val; + + val = vlv_punit_read(i915, FB_GFX_FMAX_AT_VMAX_FUSE); + + return val & FB_GFX_FREQ_FUSE_MASK; +} + +static u32 chv_rps_min_freq(struct intel_rps *rps) +{ + struct drm_i915_private *i915 = rps_to_i915(rps); + u32 val; + + val = vlv_punit_read(i915, FB_GFX_FMIN_AT_VMIN_FUSE); + val >>= FB_GFX_FMIN_AT_VMIN_FUSE_SHIFT; + + return val & FB_GFX_FREQ_FUSE_MASK; +} + +static bool chv_rps_enable(struct intel_rps *rps) +{ + struct intel_uncore *uncore = rps_to_uncore(rps); + struct drm_i915_private *i915 = rps_to_i915(rps); + u32 val; + + /* 1: Program defaults and thresholds for RPS*/ + intel_uncore_write_fw(uncore, GEN6_RP_DOWN_TIMEOUT, 1000000); + intel_uncore_write_fw(uncore, GEN6_RP_UP_THRESHOLD, 59400); + intel_uncore_write_fw(uncore, GEN6_RP_DOWN_THRESHOLD, 245000); + intel_uncore_write_fw(uncore, GEN6_RP_UP_EI, 66000); + intel_uncore_write_fw(uncore, GEN6_RP_DOWN_EI, 350000); + + intel_uncore_write_fw(uncore, GEN6_RP_IDLE_HYSTERSIS, 10); + + /* 2: Enable RPS */ + intel_uncore_write_fw(uncore, GEN6_RP_CONTROL, + GEN6_RP_MEDIA_HW_NORMAL_MODE | + GEN6_RP_MEDIA_IS_GFX | + GEN6_RP_ENABLE | + GEN6_RP_UP_BUSY_AVG | + GEN6_RP_DOWN_IDLE_AVG); + + rps->pm_events = (GEN6_PM_RP_UP_THRESHOLD | + GEN6_PM_RP_DOWN_THRESHOLD | + GEN6_PM_RP_DOWN_TIMEOUT); + + /* Setting Fixed Bias */ + vlv_punit_get(i915); + + val = VLV_OVERRIDE_EN | VLV_SOC_TDP_EN | CHV_BIAS_CPU_50_SOC_50; + vlv_punit_write(i915, VLV_TURBO_SOC_OVERRIDE, val); + + val = vlv_punit_read(i915, PUNIT_REG_GPU_FREQ_STS); + + vlv_punit_put(i915); + + /* RPS code assumes GPLL is used */ + drm_WARN_ONCE(&i915->drm, (val & GPLLENABLE) == 0, + "GPLL not enabled\n"); + + drm_dbg(&i915->drm, "GPLL enabled? %s\n", yesno(val & GPLLENABLE)); + drm_dbg(&i915->drm, "GPU status: 0x%08x\n", val); + + return rps_reset(rps); +} + +static int vlv_rps_guar_freq(struct intel_rps *rps) +{ + struct drm_i915_private *i915 = rps_to_i915(rps); + u32 val, rp1; + + val = vlv_nc_read(i915, IOSF_NC_FB_GFX_FREQ_FUSE); + + rp1 = val & FB_GFX_FGUARANTEED_FREQ_FUSE_MASK; + rp1 >>= FB_GFX_FGUARANTEED_FREQ_FUSE_SHIFT; + + return rp1; +} + +static int vlv_rps_max_freq(struct intel_rps *rps) +{ + struct drm_i915_private *i915 = rps_to_i915(rps); + u32 val, rp0; + + val = vlv_nc_read(i915, IOSF_NC_FB_GFX_FREQ_FUSE); + + rp0 = (val & FB_GFX_MAX_FREQ_FUSE_MASK) >> FB_GFX_MAX_FREQ_FUSE_SHIFT; + /* Clamp to max */ + rp0 = min_t(u32, rp0, 0xea); + + return rp0; +} + +static int vlv_rps_rpe_freq(struct intel_rps *rps) +{ + struct drm_i915_private *i915 = rps_to_i915(rps); + u32 val, rpe; + + val = vlv_nc_read(i915, IOSF_NC_FB_GFX_FMAX_FUSE_LO); + rpe = (val & FB_FMAX_VMIN_FREQ_LO_MASK) >> FB_FMAX_VMIN_FREQ_LO_SHIFT; + val = vlv_nc_read(i915, IOSF_NC_FB_GFX_FMAX_FUSE_HI); + rpe |= (val & FB_FMAX_VMIN_FREQ_HI_MASK) << 5; + + return rpe; +} + +static int vlv_rps_min_freq(struct intel_rps *rps) +{ + struct drm_i915_private *i915 = rps_to_i915(rps); + u32 val; + + val = vlv_punit_read(i915, PUNIT_REG_GPU_LFM) & 0xff; + /* + * According to the BYT Punit GPU turbo HAS 1.1.6.3 the minimum value + * for the minimum frequency in GPLL mode is 0xc1. Contrary to this on + * a BYT-M B0 the above register contains 0xbf. Moreover when setting + * a frequency Punit will not allow values below 0xc0. Clamp it 0xc0 + * to make sure it matches what Punit accepts. + */ + return max_t(u32, val, 0xc0); +} + +static bool vlv_rps_enable(struct intel_rps *rps) +{ + struct intel_uncore *uncore = rps_to_uncore(rps); + struct drm_i915_private *i915 = rps_to_i915(rps); + u32 val; + + intel_uncore_write_fw(uncore, GEN6_RP_DOWN_TIMEOUT, 1000000); + intel_uncore_write_fw(uncore, GEN6_RP_UP_THRESHOLD, 59400); + intel_uncore_write_fw(uncore, GEN6_RP_DOWN_THRESHOLD, 245000); + intel_uncore_write_fw(uncore, GEN6_RP_UP_EI, 66000); + intel_uncore_write_fw(uncore, GEN6_RP_DOWN_EI, 350000); + + intel_uncore_write_fw(uncore, GEN6_RP_IDLE_HYSTERSIS, 10); + + intel_uncore_write_fw(uncore, GEN6_RP_CONTROL, + GEN6_RP_MEDIA_TURBO | + GEN6_RP_MEDIA_HW_NORMAL_MODE | + GEN6_RP_MEDIA_IS_GFX | + GEN6_RP_ENABLE | + GEN6_RP_UP_BUSY_AVG | + GEN6_RP_DOWN_IDLE_CONT); + + /* WaGsvRC0ResidencyMethod:vlv */ + rps->pm_events = GEN6_PM_RP_UP_EI_EXPIRED; + + vlv_punit_get(i915); + + /* Setting Fixed Bias */ + val = VLV_OVERRIDE_EN | VLV_SOC_TDP_EN | VLV_BIAS_CPU_125_SOC_875; + vlv_punit_write(i915, VLV_TURBO_SOC_OVERRIDE, val); + + val = vlv_punit_read(i915, PUNIT_REG_GPU_FREQ_STS); + + vlv_punit_put(i915); + + /* RPS code assumes GPLL is used */ + drm_WARN_ONCE(&i915->drm, (val & GPLLENABLE) == 0, + "GPLL not enabled\n"); + + drm_dbg(&i915->drm, "GPLL enabled? %s\n", yesno(val & GPLLENABLE)); + drm_dbg(&i915->drm, "GPU status: 0x%08x\n", val); + + return rps_reset(rps); +} + +static unsigned long __ips_gfx_val(struct intel_ips *ips) +{ + struct intel_rps *rps = container_of(ips, typeof(*rps), ips); + struct intel_uncore *uncore = rps_to_uncore(rps); + unsigned long t, corr, state1, corr2, state2; + u32 pxvid, ext_v; + + lockdep_assert_held(&mchdev_lock); + + pxvid = intel_uncore_read(uncore, PXVFREQ(rps->cur_freq)); + pxvid = (pxvid >> 24) & 0x7f; + ext_v = pvid_to_extvid(rps_to_i915(rps), pxvid); + + state1 = ext_v; + + /* Revel in the empirically derived constants */ + + /* Correction factor in 1/100000 units */ + t = ips_mch_val(uncore); + if (t > 80) + corr = t * 2349 + 135940; + else if (t >= 50) + corr = t * 964 + 29317; + else /* < 50 */ + corr = t * 301 + 1004; + + corr = corr * 150142 * state1 / 10000 - 78642; + corr /= 100000; + corr2 = corr * ips->corr; + + state2 = corr2 * state1 / 10000; + state2 /= 100; /* convert to mW */ + + __gen5_ips_update(ips); + + return ips->gfx_power + state2; +} + +static bool has_busy_stats(struct intel_rps *rps) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + + for_each_engine(engine, rps_to_gt(rps), id) { + if (!intel_engine_supports_stats(engine)) + return false; + } + + return true; +} + +void intel_rps_enable(struct intel_rps *rps) +{ + struct drm_i915_private *i915 = rps_to_i915(rps); + struct intel_uncore *uncore = rps_to_uncore(rps); + bool enabled = false; + + if (!HAS_RPS(i915)) + return; + + intel_gt_check_clock_frequency(rps_to_gt(rps)); + + intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL); + if (rps->max_freq <= rps->min_freq) + /* leave disabled, no room for dynamic reclocking */; + else if (IS_CHERRYVIEW(i915)) + enabled = chv_rps_enable(rps); + else if (IS_VALLEYVIEW(i915)) + enabled = vlv_rps_enable(rps); + else if (INTEL_GEN(i915) >= 9) + enabled = gen9_rps_enable(rps); + else if (INTEL_GEN(i915) >= 8) + enabled = gen8_rps_enable(rps); + else if (INTEL_GEN(i915) >= 6) + enabled = gen6_rps_enable(rps); + else if (IS_IRONLAKE_M(i915)) + enabled = gen5_rps_enable(rps); + else + MISSING_CASE(INTEL_GEN(i915)); + intel_uncore_forcewake_put(uncore, FORCEWAKE_ALL); + if (!enabled) + return; + + GT_TRACE(rps_to_gt(rps), + "min:%x, max:%x, freq:[%d, %d]\n", + rps->min_freq, rps->max_freq, + intel_gpu_freq(rps, rps->min_freq), + intel_gpu_freq(rps, rps->max_freq)); + + GEM_BUG_ON(rps->max_freq < rps->min_freq); + GEM_BUG_ON(rps->idle_freq > rps->max_freq); + + GEM_BUG_ON(rps->efficient_freq < rps->min_freq); + GEM_BUG_ON(rps->efficient_freq > rps->max_freq); + + if (has_busy_stats(rps)) + intel_rps_set_timer(rps); + else if (INTEL_GEN(i915) >= 6) + intel_rps_set_interrupts(rps); + else + /* Ironlake currently uses intel_ips.ko */ {} + + intel_rps_set_enabled(rps); +} + +static void gen6_rps_disable(struct intel_rps *rps) +{ + set(rps_to_uncore(rps), GEN6_RP_CONTROL, 0); +} + +void intel_rps_disable(struct intel_rps *rps) +{ + struct drm_i915_private *i915 = rps_to_i915(rps); + + intel_rps_clear_enabled(rps); + intel_rps_clear_interrupts(rps); + intel_rps_clear_timer(rps); + + if (INTEL_GEN(i915) >= 6) + gen6_rps_disable(rps); + else if (IS_IRONLAKE_M(i915)) + gen5_rps_disable(rps); +} + +static int byt_gpu_freq(struct intel_rps *rps, int val) +{ + /* + * N = val - 0xb7 + * Slow = Fast = GPLL ref * N + */ + return DIV_ROUND_CLOSEST(rps->gpll_ref_freq * (val - 0xb7), 1000); +} + +static int byt_freq_opcode(struct intel_rps *rps, int val) +{ + return DIV_ROUND_CLOSEST(1000 * val, rps->gpll_ref_freq) + 0xb7; +} + +static int chv_gpu_freq(struct intel_rps *rps, int val) +{ + /* + * N = val / 2 + * CU (slow) = CU2x (fast) / 2 = GPLL ref * N / 2 + */ + return DIV_ROUND_CLOSEST(rps->gpll_ref_freq * val, 2 * 2 * 1000); +} + +static int chv_freq_opcode(struct intel_rps *rps, int val) +{ + /* CHV needs even values */ + return DIV_ROUND_CLOSEST(2 * 1000 * val, rps->gpll_ref_freq) * 2; +} + +int intel_gpu_freq(struct intel_rps *rps, int val) +{ + struct drm_i915_private *i915 = rps_to_i915(rps); + + if (INTEL_GEN(i915) >= 9) + return DIV_ROUND_CLOSEST(val * GT_FREQUENCY_MULTIPLIER, + GEN9_FREQ_SCALER); + else if (IS_CHERRYVIEW(i915)) + return chv_gpu_freq(rps, val); + else if (IS_VALLEYVIEW(i915)) + return byt_gpu_freq(rps, val); + else + return val * GT_FREQUENCY_MULTIPLIER; +} + +int intel_freq_opcode(struct intel_rps *rps, int val) +{ + struct drm_i915_private *i915 = rps_to_i915(rps); + + if (INTEL_GEN(i915) >= 9) + return DIV_ROUND_CLOSEST(val * GEN9_FREQ_SCALER, + GT_FREQUENCY_MULTIPLIER); + else if (IS_CHERRYVIEW(i915)) + return chv_freq_opcode(rps, val); + else if (IS_VALLEYVIEW(i915)) + return byt_freq_opcode(rps, val); + else + return DIV_ROUND_CLOSEST(val, GT_FREQUENCY_MULTIPLIER); +} + +static void vlv_init_gpll_ref_freq(struct intel_rps *rps) +{ + struct drm_i915_private *i915 = rps_to_i915(rps); + + rps->gpll_ref_freq = + vlv_get_cck_clock(i915, "GPLL ref", + CCK_GPLL_CLOCK_CONTROL, + i915->czclk_freq); + + drm_dbg(&i915->drm, "GPLL reference freq: %d kHz\n", + rps->gpll_ref_freq); +} + +static void vlv_rps_init(struct intel_rps *rps) +{ + struct drm_i915_private *i915 = rps_to_i915(rps); + u32 val; + + vlv_iosf_sb_get(i915, + BIT(VLV_IOSF_SB_PUNIT) | + BIT(VLV_IOSF_SB_NC) | + BIT(VLV_IOSF_SB_CCK)); + + vlv_init_gpll_ref_freq(rps); + + val = vlv_punit_read(i915, PUNIT_REG_GPU_FREQ_STS); + switch ((val >> 6) & 3) { + case 0: + case 1: + i915->mem_freq = 800; + break; + case 2: + i915->mem_freq = 1066; + break; + case 3: + i915->mem_freq = 1333; + break; + } + drm_dbg(&i915->drm, "DDR speed: %d MHz\n", i915->mem_freq); + + rps->max_freq = vlv_rps_max_freq(rps); + rps->rp0_freq = rps->max_freq; + drm_dbg(&i915->drm, "max GPU freq: %d MHz (%u)\n", + intel_gpu_freq(rps, rps->max_freq), rps->max_freq); + + rps->efficient_freq = vlv_rps_rpe_freq(rps); + drm_dbg(&i915->drm, "RPe GPU freq: %d MHz (%u)\n", + intel_gpu_freq(rps, rps->efficient_freq), rps->efficient_freq); + + rps->rp1_freq = vlv_rps_guar_freq(rps); + drm_dbg(&i915->drm, "RP1(Guar Freq) GPU freq: %d MHz (%u)\n", + intel_gpu_freq(rps, rps->rp1_freq), rps->rp1_freq); + + rps->min_freq = vlv_rps_min_freq(rps); + drm_dbg(&i915->drm, "min GPU freq: %d MHz (%u)\n", + intel_gpu_freq(rps, rps->min_freq), rps->min_freq); + + vlv_iosf_sb_put(i915, + BIT(VLV_IOSF_SB_PUNIT) | + BIT(VLV_IOSF_SB_NC) | + BIT(VLV_IOSF_SB_CCK)); +} + +static void chv_rps_init(struct intel_rps *rps) +{ + struct drm_i915_private *i915 = rps_to_i915(rps); + u32 val; + + vlv_iosf_sb_get(i915, + BIT(VLV_IOSF_SB_PUNIT) | + BIT(VLV_IOSF_SB_NC) | + BIT(VLV_IOSF_SB_CCK)); + + vlv_init_gpll_ref_freq(rps); + + val = vlv_cck_read(i915, CCK_FUSE_REG); + + switch ((val >> 2) & 0x7) { + case 3: + i915->mem_freq = 2000; + break; + default: + i915->mem_freq = 1600; + break; + } + drm_dbg(&i915->drm, "DDR speed: %d MHz\n", i915->mem_freq); + + rps->max_freq = chv_rps_max_freq(rps); + rps->rp0_freq = rps->max_freq; + drm_dbg(&i915->drm, "max GPU freq: %d MHz (%u)\n", + intel_gpu_freq(rps, rps->max_freq), rps->max_freq); + + rps->efficient_freq = chv_rps_rpe_freq(rps); + drm_dbg(&i915->drm, "RPe GPU freq: %d MHz (%u)\n", + intel_gpu_freq(rps, rps->efficient_freq), rps->efficient_freq); + + rps->rp1_freq = chv_rps_guar_freq(rps); + drm_dbg(&i915->drm, "RP1(Guar) GPU freq: %d MHz (%u)\n", + intel_gpu_freq(rps, rps->rp1_freq), rps->rp1_freq); + + rps->min_freq = chv_rps_min_freq(rps); + drm_dbg(&i915->drm, "min GPU freq: %d MHz (%u)\n", + intel_gpu_freq(rps, rps->min_freq), rps->min_freq); + + vlv_iosf_sb_put(i915, + BIT(VLV_IOSF_SB_PUNIT) | + BIT(VLV_IOSF_SB_NC) | + BIT(VLV_IOSF_SB_CCK)); + + drm_WARN_ONCE(&i915->drm, (rps->max_freq | rps->efficient_freq | + rps->rp1_freq | rps->min_freq) & 1, + "Odd GPU freq values\n"); +} + +static void vlv_c0_read(struct intel_uncore *uncore, struct intel_rps_ei *ei) +{ + ei->ktime = ktime_get_raw(); + ei->render_c0 = intel_uncore_read(uncore, VLV_RENDER_C0_COUNT); + ei->media_c0 = intel_uncore_read(uncore, VLV_MEDIA_C0_COUNT); +} + +static u32 vlv_wa_c0_ei(struct intel_rps *rps, u32 pm_iir) +{ + struct intel_uncore *uncore = rps_to_uncore(rps); + const struct intel_rps_ei *prev = &rps->ei; + struct intel_rps_ei now; + u32 events = 0; + + if ((pm_iir & GEN6_PM_RP_UP_EI_EXPIRED) == 0) + return 0; + + vlv_c0_read(uncore, &now); + + if (prev->ktime) { + u64 time, c0; + u32 render, media; + + time = ktime_us_delta(now.ktime, prev->ktime); + + time *= rps_to_i915(rps)->czclk_freq; + + /* Workload can be split between render + media, + * e.g. SwapBuffers being blitted in X after being rendered in + * mesa. To account for this we need to combine both engines + * into our activity counter. + */ + render = now.render_c0 - prev->render_c0; + media = now.media_c0 - prev->media_c0; + c0 = max(render, media); + c0 *= 1000 * 100 << 8; /* to usecs and scale to threshold% */ + + if (c0 > time * rps->power.up_threshold) + events = GEN6_PM_RP_UP_THRESHOLD; + else if (c0 < time * rps->power.down_threshold) + events = GEN6_PM_RP_DOWN_THRESHOLD; + } + + rps->ei = now; + return events; +} + +static void rps_work(struct work_struct *work) +{ + struct intel_rps *rps = container_of(work, typeof(*rps), work); + struct intel_gt *gt = rps_to_gt(rps); + struct drm_i915_private *i915 = rps_to_i915(rps); + bool client_boost = false; + int new_freq, adj, min, max; + u32 pm_iir = 0; + + spin_lock_irq(>->irq_lock); + pm_iir = fetch_and_zero(&rps->pm_iir) & rps->pm_events; + client_boost = atomic_read(&rps->num_waiters); + spin_unlock_irq(>->irq_lock); + + /* Make sure we didn't queue anything we're not going to process. */ + if (!pm_iir && !client_boost) + goto out; + + mutex_lock(&rps->lock); + if (!intel_rps_is_active(rps)) { + mutex_unlock(&rps->lock); + return; + } + + pm_iir |= vlv_wa_c0_ei(rps, pm_iir); + + adj = rps->last_adj; + new_freq = rps->cur_freq; + min = rps->min_freq_softlimit; + max = rps->max_freq_softlimit; + if (client_boost) + max = rps->max_freq; + + GT_TRACE(gt, + "pm_iir:%x, client_boost:%s, last:%d, cur:%x, min:%x, max:%x\n", + pm_iir, yesno(client_boost), + adj, new_freq, min, max); + + if (client_boost && new_freq < rps->boost_freq) { + new_freq = rps->boost_freq; + adj = 0; + } else if (pm_iir & GEN6_PM_RP_UP_THRESHOLD) { + if (adj > 0) + adj *= 2; + else /* CHV needs even encode values */ + adj = IS_CHERRYVIEW(gt->i915) ? 2 : 1; + + if (new_freq >= rps->max_freq_softlimit) + adj = 0; + } else if (client_boost) { + adj = 0; + } else if (pm_iir & GEN6_PM_RP_DOWN_TIMEOUT) { + if (rps->cur_freq > rps->efficient_freq) + new_freq = rps->efficient_freq; + else if (rps->cur_freq > rps->min_freq_softlimit) + new_freq = rps->min_freq_softlimit; + adj = 0; + } else if (pm_iir & GEN6_PM_RP_DOWN_THRESHOLD) { + if (adj < 0) + adj *= 2; + else /* CHV needs even encode values */ + adj = IS_CHERRYVIEW(gt->i915) ? -2 : -1; + + if (new_freq <= rps->min_freq_softlimit) + adj = 0; + } else { /* unknown event */ + adj = 0; + } + + /* + * sysfs frequency limits may have snuck in while + * servicing the interrupt + */ + new_freq += adj; + new_freq = clamp_t(int, new_freq, min, max); + + if (intel_rps_set(rps, new_freq)) { + drm_dbg(&i915->drm, "Failed to set new GPU frequency\n"); + adj = 0; + } + rps->last_adj = adj; + + mutex_unlock(&rps->lock); + +out: + spin_lock_irq(>->irq_lock); + gen6_gt_pm_unmask_irq(gt, rps->pm_events); + spin_unlock_irq(>->irq_lock); +} + +void gen11_rps_irq_handler(struct intel_rps *rps, u32 pm_iir) +{ + struct intel_gt *gt = rps_to_gt(rps); + const u32 events = rps->pm_events & pm_iir; + + lockdep_assert_held(>->irq_lock); + + if (unlikely(!events)) + return; + + GT_TRACE(gt, "irq events:%x\n", events); + + gen6_gt_pm_mask_irq(gt, events); + + rps->pm_iir |= events; + schedule_work(&rps->work); +} + +void gen6_rps_irq_handler(struct intel_rps *rps, u32 pm_iir) +{ + struct intel_gt *gt = rps_to_gt(rps); + u32 events; + + events = pm_iir & rps->pm_events; + if (events) { + spin_lock(>->irq_lock); + + GT_TRACE(gt, "irq events:%x\n", events); + + gen6_gt_pm_mask_irq(gt, events); + rps->pm_iir |= events; + + schedule_work(&rps->work); + spin_unlock(>->irq_lock); + } + + if (INTEL_GEN(gt->i915) >= 8) + return; + + if (pm_iir & PM_VEBOX_USER_INTERRUPT) + intel_engine_signal_breadcrumbs(gt->engine[VECS0]); + + if (pm_iir & PM_VEBOX_CS_ERROR_INTERRUPT) + DRM_DEBUG("Command parser error, pm_iir 0x%08x\n", pm_iir); +} + +void gen5_rps_irq_handler(struct intel_rps *rps) +{ + struct intel_uncore *uncore = rps_to_uncore(rps); + u32 busy_up, busy_down, max_avg, min_avg; + u8 new_freq; + + spin_lock(&mchdev_lock); + + intel_uncore_write16(uncore, + MEMINTRSTS, + intel_uncore_read(uncore, MEMINTRSTS)); + + intel_uncore_write16(uncore, MEMINTRSTS, MEMINT_EVAL_CHG); + busy_up = intel_uncore_read(uncore, RCPREVBSYTUPAVG); + busy_down = intel_uncore_read(uncore, RCPREVBSYTDNAVG); + max_avg = intel_uncore_read(uncore, RCBMAXAVG); + min_avg = intel_uncore_read(uncore, RCBMINAVG); + + /* Handle RCS change request from hw */ + new_freq = rps->cur_freq; + if (busy_up > max_avg) + new_freq++; + else if (busy_down < min_avg) + new_freq--; + new_freq = clamp(new_freq, + rps->min_freq_softlimit, + rps->max_freq_softlimit); + + if (new_freq != rps->cur_freq && gen5_rps_set(rps, new_freq)) + rps->cur_freq = new_freq; + + spin_unlock(&mchdev_lock); +} + +void intel_rps_init_early(struct intel_rps *rps) +{ + mutex_init(&rps->lock); + mutex_init(&rps->power.mutex); + + INIT_WORK(&rps->work, rps_work); + timer_setup(&rps->timer, rps_timer, 0); + + atomic_set(&rps->num_waiters, 0); +} + +void intel_rps_init(struct intel_rps *rps) +{ + struct drm_i915_private *i915 = rps_to_i915(rps); + + if (IS_CHERRYVIEW(i915)) + chv_rps_init(rps); + else if (IS_VALLEYVIEW(i915)) + vlv_rps_init(rps); + else if (INTEL_GEN(i915) >= 6) + gen6_rps_init(rps); + else if (IS_IRONLAKE_M(i915)) + gen5_rps_init(rps); + + /* Derive initial user preferences/limits from the hardware limits */ + rps->max_freq_softlimit = rps->max_freq; + rps->min_freq_softlimit = rps->min_freq; + + /* After setting max-softlimit, find the overclock max freq */ + if (IS_GEN(i915, 6) || IS_IVYBRIDGE(i915) || IS_HASWELL(i915)) { + u32 params = 0; + + sandybridge_pcode_read(i915, GEN6_READ_OC_PARAMS, + ¶ms, NULL); + if (params & BIT(31)) { /* OC supported */ + drm_dbg(&i915->drm, + "Overclocking supported, max: %dMHz, overclock: %dMHz\n", + (rps->max_freq & 0xff) * 50, + (params & 0xff) * 50); + rps->max_freq = params & 0xff; + } + } + + /* Finally allow us to boost to max by default */ + rps->boost_freq = rps->max_freq; + rps->idle_freq = rps->min_freq; + + /* Start in the middle, from here we will autotune based on workload */ + rps->cur_freq = rps->efficient_freq; + + rps->pm_intrmsk_mbz = 0; + + /* + * SNB,IVB,HSW can while VLV,CHV may hard hang on looping batchbuffer + * if GEN6_PM_UP_EI_EXPIRED is masked. + * + * TODO: verify if this can be reproduced on VLV,CHV. + */ + if (INTEL_GEN(i915) <= 7) + rps->pm_intrmsk_mbz |= GEN6_PM_RP_UP_EI_EXPIRED; + + if (INTEL_GEN(i915) >= 8 && INTEL_GEN(i915) < 11) + rps->pm_intrmsk_mbz |= GEN8_PMINTR_DISABLE_REDIRECT_TO_GUC; +} + +void intel_rps_sanitize(struct intel_rps *rps) +{ + if (INTEL_GEN(rps_to_i915(rps)) >= 6) + rps_disable_interrupts(rps); +} + +u32 intel_rps_get_cagf(struct intel_rps *rps, u32 rpstat) +{ + struct drm_i915_private *i915 = rps_to_i915(rps); + u32 cagf; + + if (IS_VALLEYVIEW(i915) || IS_CHERRYVIEW(i915)) + cagf = (rpstat >> 8) & 0xff; + else if (INTEL_GEN(i915) >= 9) + cagf = (rpstat & GEN9_CAGF_MASK) >> GEN9_CAGF_SHIFT; + else if (IS_HASWELL(i915) || IS_BROADWELL(i915)) + cagf = (rpstat & HSW_CAGF_MASK) >> HSW_CAGF_SHIFT; + else + cagf = (rpstat & GEN6_CAGF_MASK) >> GEN6_CAGF_SHIFT; + + return cagf; +} + +static u32 read_cagf(struct intel_rps *rps) +{ + struct drm_i915_private *i915 = rps_to_i915(rps); + u32 freq; + + if (IS_VALLEYVIEW(i915) || IS_CHERRYVIEW(i915)) { + vlv_punit_get(i915); + freq = vlv_punit_read(i915, PUNIT_REG_GPU_FREQ_STS); + vlv_punit_put(i915); + } else { + freq = intel_uncore_read(rps_to_uncore(rps), GEN6_RPSTAT1); + } + + return intel_rps_get_cagf(rps, freq); +} + +u32 intel_rps_read_actual_frequency(struct intel_rps *rps) +{ + struct intel_runtime_pm *rpm = rps_to_uncore(rps)->rpm; + intel_wakeref_t wakeref; + u32 freq = 0; + + with_intel_runtime_pm_if_in_use(rpm, wakeref) + freq = intel_gpu_freq(rps, read_cagf(rps)); + + return freq; +} + +/* External interface for intel_ips.ko */ + +static struct drm_i915_private __rcu *ips_mchdev; + +/** + * Tells the intel_ips driver that the i915 driver is now loaded, if + * IPS got loaded first. + * + * This awkward dance is so that neither module has to depend on the + * other in order for IPS to do the appropriate communication of + * GPU turbo limits to i915. + */ +static void +ips_ping_for_i915_load(void) +{ + void (*link)(void); + + link = symbol_get(ips_link_to_i915_driver); + if (link) { + link(); + symbol_put(ips_link_to_i915_driver); + } +} + +void intel_rps_driver_register(struct intel_rps *rps) +{ + struct intel_gt *gt = rps_to_gt(rps); + + /* + * We only register the i915 ips part with intel-ips once everything is + * set up, to avoid intel-ips sneaking in and reading bogus values. + */ + if (IS_GEN(gt->i915, 5)) { + GEM_BUG_ON(ips_mchdev); + rcu_assign_pointer(ips_mchdev, gt->i915); + ips_ping_for_i915_load(); + } +} + +void intel_rps_driver_unregister(struct intel_rps *rps) +{ + if (rcu_access_pointer(ips_mchdev) == rps_to_i915(rps)) + rcu_assign_pointer(ips_mchdev, NULL); +} + +static struct drm_i915_private *mchdev_get(void) +{ + struct drm_i915_private *i915; + + rcu_read_lock(); + i915 = rcu_dereference(ips_mchdev); + if (!kref_get_unless_zero(&i915->drm.ref)) + i915 = NULL; + rcu_read_unlock(); + + return i915; +} + +/** + * i915_read_mch_val - return value for IPS use + * + * Calculate and return a value for the IPS driver to use when deciding whether + * we have thermal and power headroom to increase CPU or GPU power budget. + */ +unsigned long i915_read_mch_val(void) +{ + struct drm_i915_private *i915; + unsigned long chipset_val = 0; + unsigned long graphics_val = 0; + intel_wakeref_t wakeref; + + i915 = mchdev_get(); + if (!i915) + return 0; + + with_intel_runtime_pm(&i915->runtime_pm, wakeref) { + struct intel_ips *ips = &i915->gt.rps.ips; + + spin_lock_irq(&mchdev_lock); + chipset_val = __ips_chipset_val(ips); + graphics_val = __ips_gfx_val(ips); + spin_unlock_irq(&mchdev_lock); + } + + drm_dev_put(&i915->drm); + return chipset_val + graphics_val; +} +EXPORT_SYMBOL_GPL(i915_read_mch_val); + +/** + * i915_gpu_raise - raise GPU frequency limit + * + * Raise the limit; IPS indicates we have thermal headroom. + */ +bool i915_gpu_raise(void) +{ + struct drm_i915_private *i915; + struct intel_rps *rps; + + i915 = mchdev_get(); + if (!i915) + return false; + + rps = &i915->gt.rps; + + spin_lock_irq(&mchdev_lock); + if (rps->max_freq_softlimit < rps->max_freq) + rps->max_freq_softlimit++; + spin_unlock_irq(&mchdev_lock); + + drm_dev_put(&i915->drm); + return true; +} +EXPORT_SYMBOL_GPL(i915_gpu_raise); + +/** + * i915_gpu_lower - lower GPU frequency limit + * + * IPS indicates we're close to a thermal limit, so throttle back the GPU + * frequency maximum. + */ +bool i915_gpu_lower(void) +{ + struct drm_i915_private *i915; + struct intel_rps *rps; + + i915 = mchdev_get(); + if (!i915) + return false; + + rps = &i915->gt.rps; + + spin_lock_irq(&mchdev_lock); + if (rps->max_freq_softlimit > rps->min_freq) + rps->max_freq_softlimit--; + spin_unlock_irq(&mchdev_lock); + + drm_dev_put(&i915->drm); + return true; +} +EXPORT_SYMBOL_GPL(i915_gpu_lower); + +/** + * i915_gpu_busy - indicate GPU business to IPS + * + * Tell the IPS driver whether or not the GPU is busy. + */ +bool i915_gpu_busy(void) +{ + struct drm_i915_private *i915; + bool ret; + + i915 = mchdev_get(); + if (!i915) + return false; + + ret = i915->gt.awake; + + drm_dev_put(&i915->drm); + return ret; +} +EXPORT_SYMBOL_GPL(i915_gpu_busy); + +/** + * i915_gpu_turbo_disable - disable graphics turbo + * + * Disable graphics turbo by resetting the max frequency and setting the + * current frequency to the default. + */ +bool i915_gpu_turbo_disable(void) +{ + struct drm_i915_private *i915; + struct intel_rps *rps; + bool ret; + + i915 = mchdev_get(); + if (!i915) + return false; + + rps = &i915->gt.rps; + + spin_lock_irq(&mchdev_lock); + rps->max_freq_softlimit = rps->min_freq; + ret = gen5_rps_set(&i915->gt.rps, rps->min_freq); + spin_unlock_irq(&mchdev_lock); + + drm_dev_put(&i915->drm); + return ret; +} +EXPORT_SYMBOL_GPL(i915_gpu_turbo_disable); + +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) +#include "selftest_rps.c" +#endif diff --git a/drivers/gpu/drm/i915/gt/intel_rps.h b/drivers/gpu/drm/i915/gt/intel_rps.h new file mode 100644 index 000000000..8d3c9d663 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_rps.h @@ -0,0 +1,100 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#ifndef INTEL_RPS_H +#define INTEL_RPS_H + +#include "intel_rps_types.h" + +struct i915_request; + +void intel_rps_init_early(struct intel_rps *rps); +void intel_rps_init(struct intel_rps *rps); +void intel_rps_sanitize(struct intel_rps *rps); + +void intel_rps_driver_register(struct intel_rps *rps); +void intel_rps_driver_unregister(struct intel_rps *rps); + +void intel_rps_enable(struct intel_rps *rps); +void intel_rps_disable(struct intel_rps *rps); + +void intel_rps_park(struct intel_rps *rps); +void intel_rps_unpark(struct intel_rps *rps); +void intel_rps_boost(struct i915_request *rq); + +int intel_rps_set(struct intel_rps *rps, u8 val); +void intel_rps_mark_interactive(struct intel_rps *rps, bool interactive); + +int intel_gpu_freq(struct intel_rps *rps, int val); +int intel_freq_opcode(struct intel_rps *rps, int val); +u32 intel_rps_get_cagf(struct intel_rps *rps, u32 rpstat1); +u32 intel_rps_read_actual_frequency(struct intel_rps *rps); + +void gen5_rps_irq_handler(struct intel_rps *rps); +void gen6_rps_irq_handler(struct intel_rps *rps, u32 pm_iir); +void gen11_rps_irq_handler(struct intel_rps *rps, u32 pm_iir); + +static inline bool intel_rps_is_enabled(const struct intel_rps *rps) +{ + return test_bit(INTEL_RPS_ENABLED, &rps->flags); +} + +static inline void intel_rps_set_enabled(struct intel_rps *rps) +{ + set_bit(INTEL_RPS_ENABLED, &rps->flags); +} + +static inline void intel_rps_clear_enabled(struct intel_rps *rps) +{ + clear_bit(INTEL_RPS_ENABLED, &rps->flags); +} + +static inline bool intel_rps_is_active(const struct intel_rps *rps) +{ + return test_bit(INTEL_RPS_ACTIVE, &rps->flags); +} + +static inline void intel_rps_set_active(struct intel_rps *rps) +{ + set_bit(INTEL_RPS_ACTIVE, &rps->flags); +} + +static inline bool intel_rps_clear_active(struct intel_rps *rps) +{ + return test_and_clear_bit(INTEL_RPS_ACTIVE, &rps->flags); +} + +static inline bool intel_rps_has_interrupts(const struct intel_rps *rps) +{ + return test_bit(INTEL_RPS_INTERRUPTS, &rps->flags); +} + +static inline void intel_rps_set_interrupts(struct intel_rps *rps) +{ + set_bit(INTEL_RPS_INTERRUPTS, &rps->flags); +} + +static inline void intel_rps_clear_interrupts(struct intel_rps *rps) +{ + clear_bit(INTEL_RPS_INTERRUPTS, &rps->flags); +} + +static inline bool intel_rps_uses_timer(const struct intel_rps *rps) +{ + return test_bit(INTEL_RPS_TIMER, &rps->flags); +} + +static inline void intel_rps_set_timer(struct intel_rps *rps) +{ + set_bit(INTEL_RPS_TIMER, &rps->flags); +} + +static inline void intel_rps_clear_timer(struct intel_rps *rps) +{ + clear_bit(INTEL_RPS_TIMER, &rps->flags); +} + +#endif /* INTEL_RPS_H */ diff --git a/drivers/gpu/drm/i915/gt/intel_rps_types.h b/drivers/gpu/drm/i915/gt/intel_rps_types.h new file mode 100644 index 000000000..38083f040 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_rps_types.h @@ -0,0 +1,103 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#ifndef INTEL_RPS_TYPES_H +#define INTEL_RPS_TYPES_H + +#include <linux/atomic.h> +#include <linux/ktime.h> +#include <linux/mutex.h> +#include <linux/types.h> +#include <linux/workqueue.h> + +struct intel_ips { + u64 last_count1; + unsigned long last_time1; + unsigned long chipset_power; + u64 last_count2; + u64 last_time2; + unsigned long gfx_power; + u8 corr; + + int c, m; +}; + +struct intel_rps_ei { + ktime_t ktime; + u32 render_c0; + u32 media_c0; +}; + +enum { + INTEL_RPS_ENABLED = 0, + INTEL_RPS_ACTIVE, + INTEL_RPS_INTERRUPTS, + INTEL_RPS_TIMER, +}; + +struct intel_rps { + struct mutex lock; /* protects enabling and the worker */ + + /* + * work, interrupts_enabled and pm_iir are protected by + * dev_priv->irq_lock + */ + struct timer_list timer; + struct work_struct work; + unsigned long flags; + + ktime_t pm_timestamp; + u32 pm_interval; + u32 pm_iir; + + /* PM interrupt bits that should never be masked */ + u32 pm_intrmsk_mbz; + u32 pm_events; + + /* Frequencies are stored in potentially platform dependent multiples. + * In other words, *_freq needs to be multiplied by X to be interesting. + * Soft limits are those which are used for the dynamic reclocking done + * by the driver (raise frequencies under heavy loads, and lower for + * lighter loads). Hard limits are those imposed by the hardware. + * + * A distinction is made for overclocking, which is never enabled by + * default, and is considered to be above the hard limit if it's + * possible at all. + */ + u8 cur_freq; /* Current frequency (cached, may not == HW) */ + u8 last_freq; /* Last SWREQ frequency */ + u8 min_freq_softlimit; /* Minimum frequency permitted by the driver */ + u8 max_freq_softlimit; /* Max frequency permitted by the driver */ + u8 max_freq; /* Maximum frequency, RP0 if not overclocking */ + u8 min_freq; /* AKA RPn. Minimum frequency */ + u8 boost_freq; /* Frequency to request when wait boosting */ + u8 idle_freq; /* Frequency to request when we are idle */ + u8 efficient_freq; /* AKA RPe. Pre-determined balanced frequency */ + u8 rp1_freq; /* "less than" RP0 power/freqency */ + u8 rp0_freq; /* Non-overclocked max frequency. */ + u16 gpll_ref_freq; /* vlv/chv GPLL reference frequency */ + + int last_adj; + + struct { + struct mutex mutex; + + enum { LOW_POWER, BETWEEN, HIGH_POWER } mode; + unsigned int interactive; + + u8 up_threshold; /* Current %busy required to uplock */ + u8 down_threshold; /* Current %busy required to downclock */ + } power; + + atomic_t num_waiters; + atomic_t boosts; + + /* manual wa residency calculations */ + struct intel_rps_ei ei; + struct intel_ips ips; +}; + +#endif /* INTEL_RPS_TYPES_H */ diff --git a/drivers/gpu/drm/i915/gt/intel_sseu.c b/drivers/gpu/drm/i915/gt/intel_sseu.c new file mode 100644 index 000000000..f1c039e1b --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_sseu.c @@ -0,0 +1,762 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#include "i915_drv.h" +#include "intel_lrc_reg.h" +#include "intel_sseu.h" + +void intel_sseu_set_info(struct sseu_dev_info *sseu, u8 max_slices, + u8 max_subslices, u8 max_eus_per_subslice) +{ + sseu->max_slices = max_slices; + sseu->max_subslices = max_subslices; + sseu->max_eus_per_subslice = max_eus_per_subslice; + + sseu->ss_stride = GEN_SSEU_STRIDE(sseu->max_subslices); + GEM_BUG_ON(sseu->ss_stride > GEN_MAX_SUBSLICE_STRIDE); + sseu->eu_stride = GEN_SSEU_STRIDE(sseu->max_eus_per_subslice); + GEM_BUG_ON(sseu->eu_stride > GEN_MAX_EU_STRIDE); +} + +unsigned int +intel_sseu_subslice_total(const struct sseu_dev_info *sseu) +{ + unsigned int i, total = 0; + + for (i = 0; i < ARRAY_SIZE(sseu->subslice_mask); i++) + total += hweight8(sseu->subslice_mask[i]); + + return total; +} + +u32 intel_sseu_get_subslices(const struct sseu_dev_info *sseu, u8 slice) +{ + int i, offset = slice * sseu->ss_stride; + u32 mask = 0; + + GEM_BUG_ON(slice >= sseu->max_slices); + + for (i = 0; i < sseu->ss_stride; i++) + mask |= (u32)sseu->subslice_mask[offset + i] << + i * BITS_PER_BYTE; + + return mask; +} + +void intel_sseu_set_subslices(struct sseu_dev_info *sseu, int slice, + u32 ss_mask) +{ + int offset = slice * sseu->ss_stride; + + memcpy(&sseu->subslice_mask[offset], &ss_mask, sseu->ss_stride); +} + +unsigned int +intel_sseu_subslices_per_slice(const struct sseu_dev_info *sseu, u8 slice) +{ + return hweight32(intel_sseu_get_subslices(sseu, slice)); +} + +static int sseu_eu_idx(const struct sseu_dev_info *sseu, int slice, + int subslice) +{ + int slice_stride = sseu->max_subslices * sseu->eu_stride; + + return slice * slice_stride + subslice * sseu->eu_stride; +} + +static u16 sseu_get_eus(const struct sseu_dev_info *sseu, int slice, + int subslice) +{ + int i, offset = sseu_eu_idx(sseu, slice, subslice); + u16 eu_mask = 0; + + for (i = 0; i < sseu->eu_stride; i++) + eu_mask |= + ((u16)sseu->eu_mask[offset + i]) << (i * BITS_PER_BYTE); + + return eu_mask; +} + +static void sseu_set_eus(struct sseu_dev_info *sseu, int slice, int subslice, + u16 eu_mask) +{ + int i, offset = sseu_eu_idx(sseu, slice, subslice); + + for (i = 0; i < sseu->eu_stride; i++) + sseu->eu_mask[offset + i] = + (eu_mask >> (BITS_PER_BYTE * i)) & 0xff; +} + +static u16 compute_eu_total(const struct sseu_dev_info *sseu) +{ + u16 i, total = 0; + + for (i = 0; i < ARRAY_SIZE(sseu->eu_mask); i++) + total += hweight8(sseu->eu_mask[i]); + + return total; +} + +static void gen11_compute_sseu_info(struct sseu_dev_info *sseu, + u8 s_en, u32 ss_en, u16 eu_en) +{ + int s, ss; + + /* ss_en represents entire subslice mask across all slices */ + GEM_BUG_ON(sseu->max_slices * sseu->max_subslices > + sizeof(ss_en) * BITS_PER_BYTE); + + for (s = 0; s < sseu->max_slices; s++) { + if ((s_en & BIT(s)) == 0) + continue; + + sseu->slice_mask |= BIT(s); + + intel_sseu_set_subslices(sseu, s, ss_en); + + for (ss = 0; ss < sseu->max_subslices; ss++) + if (intel_sseu_has_subslice(sseu, s, ss)) + sseu_set_eus(sseu, s, ss, eu_en); + } + sseu->eu_per_subslice = hweight16(eu_en); + sseu->eu_total = compute_eu_total(sseu); +} + +static void gen12_sseu_info_init(struct intel_gt *gt) +{ + struct sseu_dev_info *sseu = >->info.sseu; + struct intel_uncore *uncore = gt->uncore; + u32 dss_en; + u16 eu_en = 0; + u8 eu_en_fuse; + u8 s_en; + int eu; + + /* + * Gen12 has Dual-Subslices, which behave similarly to 2 gen11 SS. + * Instead of splitting these, provide userspace with an array + * of DSS to more closely represent the hardware resource. + */ + intel_sseu_set_info(sseu, 1, 6, 16); + + s_en = intel_uncore_read(uncore, GEN11_GT_SLICE_ENABLE) & + GEN11_GT_S_ENA_MASK; + + dss_en = intel_uncore_read(uncore, GEN12_GT_DSS_ENABLE); + + /* one bit per pair of EUs */ + eu_en_fuse = ~(intel_uncore_read(uncore, GEN11_EU_DISABLE) & + GEN11_EU_DIS_MASK); + for (eu = 0; eu < sseu->max_eus_per_subslice / 2; eu++) + if (eu_en_fuse & BIT(eu)) + eu_en |= BIT(eu * 2) | BIT(eu * 2 + 1); + + gen11_compute_sseu_info(sseu, s_en, dss_en, eu_en); + + /* TGL only supports slice-level power gating */ + sseu->has_slice_pg = 1; +} + +static void gen11_sseu_info_init(struct intel_gt *gt) +{ + struct sseu_dev_info *sseu = >->info.sseu; + struct intel_uncore *uncore = gt->uncore; + u32 ss_en; + u8 eu_en; + u8 s_en; + + if (IS_ELKHARTLAKE(gt->i915)) + intel_sseu_set_info(sseu, 1, 4, 8); + else + intel_sseu_set_info(sseu, 1, 8, 8); + + s_en = intel_uncore_read(uncore, GEN11_GT_SLICE_ENABLE) & + GEN11_GT_S_ENA_MASK; + ss_en = ~intel_uncore_read(uncore, GEN11_GT_SUBSLICE_DISABLE); + + eu_en = ~(intel_uncore_read(uncore, GEN11_EU_DISABLE) & + GEN11_EU_DIS_MASK); + + gen11_compute_sseu_info(sseu, s_en, ss_en, eu_en); + + /* ICL has no power gating restrictions. */ + sseu->has_slice_pg = 1; + sseu->has_subslice_pg = 1; + sseu->has_eu_pg = 1; +} + +static void gen10_sseu_info_init(struct intel_gt *gt) +{ + struct intel_uncore *uncore = gt->uncore; + struct sseu_dev_info *sseu = >->info.sseu; + const u32 fuse2 = intel_uncore_read(uncore, GEN8_FUSE2); + const int eu_mask = 0xff; + u32 subslice_mask, eu_en; + int s, ss; + + intel_sseu_set_info(sseu, 6, 4, 8); + + sseu->slice_mask = (fuse2 & GEN10_F2_S_ENA_MASK) >> + GEN10_F2_S_ENA_SHIFT; + + /* Slice0 */ + eu_en = ~intel_uncore_read(uncore, GEN8_EU_DISABLE0); + for (ss = 0; ss < sseu->max_subslices; ss++) + sseu_set_eus(sseu, 0, ss, (eu_en >> (8 * ss)) & eu_mask); + /* Slice1 */ + sseu_set_eus(sseu, 1, 0, (eu_en >> 24) & eu_mask); + eu_en = ~intel_uncore_read(uncore, GEN8_EU_DISABLE1); + sseu_set_eus(sseu, 1, 1, eu_en & eu_mask); + /* Slice2 */ + sseu_set_eus(sseu, 2, 0, (eu_en >> 8) & eu_mask); + sseu_set_eus(sseu, 2, 1, (eu_en >> 16) & eu_mask); + /* Slice3 */ + sseu_set_eus(sseu, 3, 0, (eu_en >> 24) & eu_mask); + eu_en = ~intel_uncore_read(uncore, GEN8_EU_DISABLE2); + sseu_set_eus(sseu, 3, 1, eu_en & eu_mask); + /* Slice4 */ + sseu_set_eus(sseu, 4, 0, (eu_en >> 8) & eu_mask); + sseu_set_eus(sseu, 4, 1, (eu_en >> 16) & eu_mask); + /* Slice5 */ + sseu_set_eus(sseu, 5, 0, (eu_en >> 24) & eu_mask); + eu_en = ~intel_uncore_read(uncore, GEN10_EU_DISABLE3); + sseu_set_eus(sseu, 5, 1, eu_en & eu_mask); + + subslice_mask = (1 << 4) - 1; + subslice_mask &= ~((fuse2 & GEN10_F2_SS_DIS_MASK) >> + GEN10_F2_SS_DIS_SHIFT); + + for (s = 0; s < sseu->max_slices; s++) { + u32 subslice_mask_with_eus = subslice_mask; + + for (ss = 0; ss < sseu->max_subslices; ss++) { + if (sseu_get_eus(sseu, s, ss) == 0) + subslice_mask_with_eus &= ~BIT(ss); + } + + /* + * Slice0 can have up to 3 subslices, but there are only 2 in + * slice1/2. + */ + intel_sseu_set_subslices(sseu, s, s == 0 ? + subslice_mask_with_eus : + subslice_mask_with_eus & 0x3); + } + + sseu->eu_total = compute_eu_total(sseu); + + /* + * CNL is expected to always have a uniform distribution + * of EU across subslices with the exception that any one + * EU in any one subslice may be fused off for die + * recovery. + */ + sseu->eu_per_subslice = + intel_sseu_subslice_total(sseu) ? + DIV_ROUND_UP(sseu->eu_total, intel_sseu_subslice_total(sseu)) : + 0; + + /* No restrictions on Power Gating */ + sseu->has_slice_pg = 1; + sseu->has_subslice_pg = 1; + sseu->has_eu_pg = 1; +} + +static void cherryview_sseu_info_init(struct intel_gt *gt) +{ + struct sseu_dev_info *sseu = >->info.sseu; + u32 fuse; + u8 subslice_mask = 0; + + fuse = intel_uncore_read(gt->uncore, CHV_FUSE_GT); + + sseu->slice_mask = BIT(0); + intel_sseu_set_info(sseu, 1, 2, 8); + + if (!(fuse & CHV_FGT_DISABLE_SS0)) { + u8 disabled_mask = + ((fuse & CHV_FGT_EU_DIS_SS0_R0_MASK) >> + CHV_FGT_EU_DIS_SS0_R0_SHIFT) | + (((fuse & CHV_FGT_EU_DIS_SS0_R1_MASK) >> + CHV_FGT_EU_DIS_SS0_R1_SHIFT) << 4); + + subslice_mask |= BIT(0); + sseu_set_eus(sseu, 0, 0, ~disabled_mask); + } + + if (!(fuse & CHV_FGT_DISABLE_SS1)) { + u8 disabled_mask = + ((fuse & CHV_FGT_EU_DIS_SS1_R0_MASK) >> + CHV_FGT_EU_DIS_SS1_R0_SHIFT) | + (((fuse & CHV_FGT_EU_DIS_SS1_R1_MASK) >> + CHV_FGT_EU_DIS_SS1_R1_SHIFT) << 4); + + subslice_mask |= BIT(1); + sseu_set_eus(sseu, 0, 1, ~disabled_mask); + } + + intel_sseu_set_subslices(sseu, 0, subslice_mask); + + sseu->eu_total = compute_eu_total(sseu); + + /* + * CHV expected to always have a uniform distribution of EU + * across subslices. + */ + sseu->eu_per_subslice = intel_sseu_subslice_total(sseu) ? + sseu->eu_total / + intel_sseu_subslice_total(sseu) : + 0; + /* + * CHV supports subslice power gating on devices with more than + * one subslice, and supports EU power gating on devices with + * more than one EU pair per subslice. + */ + sseu->has_slice_pg = 0; + sseu->has_subslice_pg = intel_sseu_subslice_total(sseu) > 1; + sseu->has_eu_pg = (sseu->eu_per_subslice > 2); +} + +static void gen9_sseu_info_init(struct intel_gt *gt) +{ + struct drm_i915_private *i915 = gt->i915; + struct intel_device_info *info = mkwrite_device_info(i915); + struct sseu_dev_info *sseu = >->info.sseu; + struct intel_uncore *uncore = gt->uncore; + u32 fuse2, eu_disable, subslice_mask; + const u8 eu_mask = 0xff; + int s, ss; + + fuse2 = intel_uncore_read(uncore, GEN8_FUSE2); + sseu->slice_mask = (fuse2 & GEN8_F2_S_ENA_MASK) >> GEN8_F2_S_ENA_SHIFT; + + /* BXT has a single slice and at most 3 subslices. */ + intel_sseu_set_info(sseu, IS_GEN9_LP(i915) ? 1 : 3, + IS_GEN9_LP(i915) ? 3 : 4, 8); + + /* + * The subslice disable field is global, i.e. it applies + * to each of the enabled slices. + */ + subslice_mask = (1 << sseu->max_subslices) - 1; + subslice_mask &= ~((fuse2 & GEN9_F2_SS_DIS_MASK) >> + GEN9_F2_SS_DIS_SHIFT); + + /* + * Iterate through enabled slices and subslices to + * count the total enabled EU. + */ + for (s = 0; s < sseu->max_slices; s++) { + if (!(sseu->slice_mask & BIT(s))) + /* skip disabled slice */ + continue; + + intel_sseu_set_subslices(sseu, s, subslice_mask); + + eu_disable = intel_uncore_read(uncore, GEN9_EU_DISABLE(s)); + for (ss = 0; ss < sseu->max_subslices; ss++) { + int eu_per_ss; + u8 eu_disabled_mask; + + if (!intel_sseu_has_subslice(sseu, s, ss)) + /* skip disabled subslice */ + continue; + + eu_disabled_mask = (eu_disable >> (ss * 8)) & eu_mask; + + sseu_set_eus(sseu, s, ss, ~eu_disabled_mask); + + eu_per_ss = sseu->max_eus_per_subslice - + hweight8(eu_disabled_mask); + + /* + * Record which subslice(s) has(have) 7 EUs. we + * can tune the hash used to spread work among + * subslices if they are unbalanced. + */ + if (eu_per_ss == 7) + sseu->subslice_7eu[s] |= BIT(ss); + } + } + + sseu->eu_total = compute_eu_total(sseu); + + /* + * SKL is expected to always have a uniform distribution + * of EU across subslices with the exception that any one + * EU in any one subslice may be fused off for die + * recovery. BXT is expected to be perfectly uniform in EU + * distribution. + */ + sseu->eu_per_subslice = + intel_sseu_subslice_total(sseu) ? + DIV_ROUND_UP(sseu->eu_total, intel_sseu_subslice_total(sseu)) : + 0; + + /* + * SKL+ supports slice power gating on devices with more than + * one slice, and supports EU power gating on devices with + * more than one EU pair per subslice. BXT+ supports subslice + * power gating on devices with more than one subslice, and + * supports EU power gating on devices with more than one EU + * pair per subslice. + */ + sseu->has_slice_pg = + !IS_GEN9_LP(i915) && hweight8(sseu->slice_mask) > 1; + sseu->has_subslice_pg = + IS_GEN9_LP(i915) && intel_sseu_subslice_total(sseu) > 1; + sseu->has_eu_pg = sseu->eu_per_subslice > 2; + + if (IS_GEN9_LP(i915)) { +#define IS_SS_DISABLED(ss) (!(sseu->subslice_mask[0] & BIT(ss))) + info->has_pooled_eu = hweight8(sseu->subslice_mask[0]) == 3; + + sseu->min_eu_in_pool = 0; + if (info->has_pooled_eu) { + if (IS_SS_DISABLED(2) || IS_SS_DISABLED(0)) + sseu->min_eu_in_pool = 3; + else if (IS_SS_DISABLED(1)) + sseu->min_eu_in_pool = 6; + else + sseu->min_eu_in_pool = 9; + } +#undef IS_SS_DISABLED + } +} + +static void bdw_sseu_info_init(struct intel_gt *gt) +{ + struct sseu_dev_info *sseu = >->info.sseu; + struct intel_uncore *uncore = gt->uncore; + int s, ss; + u32 fuse2, subslice_mask, eu_disable[3]; /* s_max */ + u32 eu_disable0, eu_disable1, eu_disable2; + + fuse2 = intel_uncore_read(uncore, GEN8_FUSE2); + sseu->slice_mask = (fuse2 & GEN8_F2_S_ENA_MASK) >> GEN8_F2_S_ENA_SHIFT; + intel_sseu_set_info(sseu, 3, 3, 8); + + /* + * The subslice disable field is global, i.e. it applies + * to each of the enabled slices. + */ + subslice_mask = GENMASK(sseu->max_subslices - 1, 0); + subslice_mask &= ~((fuse2 & GEN8_F2_SS_DIS_MASK) >> + GEN8_F2_SS_DIS_SHIFT); + eu_disable0 = intel_uncore_read(uncore, GEN8_EU_DISABLE0); + eu_disable1 = intel_uncore_read(uncore, GEN8_EU_DISABLE1); + eu_disable2 = intel_uncore_read(uncore, GEN8_EU_DISABLE2); + eu_disable[0] = eu_disable0 & GEN8_EU_DIS0_S0_MASK; + eu_disable[1] = (eu_disable0 >> GEN8_EU_DIS0_S1_SHIFT) | + ((eu_disable1 & GEN8_EU_DIS1_S1_MASK) << + (32 - GEN8_EU_DIS0_S1_SHIFT)); + eu_disable[2] = (eu_disable1 >> GEN8_EU_DIS1_S2_SHIFT) | + ((eu_disable2 & GEN8_EU_DIS2_S2_MASK) << + (32 - GEN8_EU_DIS1_S2_SHIFT)); + + /* + * Iterate through enabled slices and subslices to + * count the total enabled EU. + */ + for (s = 0; s < sseu->max_slices; s++) { + if (!(sseu->slice_mask & BIT(s))) + /* skip disabled slice */ + continue; + + intel_sseu_set_subslices(sseu, s, subslice_mask); + + for (ss = 0; ss < sseu->max_subslices; ss++) { + u8 eu_disabled_mask; + u32 n_disabled; + + if (!intel_sseu_has_subslice(sseu, s, ss)) + /* skip disabled subslice */ + continue; + + eu_disabled_mask = + eu_disable[s] >> (ss * sseu->max_eus_per_subslice); + + sseu_set_eus(sseu, s, ss, ~eu_disabled_mask); + + n_disabled = hweight8(eu_disabled_mask); + + /* + * Record which subslices have 7 EUs. + */ + if (sseu->max_eus_per_subslice - n_disabled == 7) + sseu->subslice_7eu[s] |= 1 << ss; + } + } + + sseu->eu_total = compute_eu_total(sseu); + + /* + * BDW is expected to always have a uniform distribution of EU across + * subslices with the exception that any one EU in any one subslice may + * be fused off for die recovery. + */ + sseu->eu_per_subslice = + intel_sseu_subslice_total(sseu) ? + DIV_ROUND_UP(sseu->eu_total, intel_sseu_subslice_total(sseu)) : + 0; + + /* + * BDW supports slice power gating on devices with more than + * one slice. + */ + sseu->has_slice_pg = hweight8(sseu->slice_mask) > 1; + sseu->has_subslice_pg = 0; + sseu->has_eu_pg = 0; +} + +static void hsw_sseu_info_init(struct intel_gt *gt) +{ + struct drm_i915_private *i915 = gt->i915; + struct sseu_dev_info *sseu = >->info.sseu; + u32 fuse1; + u8 subslice_mask = 0; + int s, ss; + + /* + * There isn't a register to tell us how many slices/subslices. We + * work off the PCI-ids here. + */ + switch (INTEL_INFO(i915)->gt) { + default: + MISSING_CASE(INTEL_INFO(i915)->gt); + fallthrough; + case 1: + sseu->slice_mask = BIT(0); + subslice_mask = BIT(0); + break; + case 2: + sseu->slice_mask = BIT(0); + subslice_mask = BIT(0) | BIT(1); + break; + case 3: + sseu->slice_mask = BIT(0) | BIT(1); + subslice_mask = BIT(0) | BIT(1); + break; + } + + fuse1 = intel_uncore_read(gt->uncore, HSW_PAVP_FUSE1); + switch ((fuse1 & HSW_F1_EU_DIS_MASK) >> HSW_F1_EU_DIS_SHIFT) { + default: + MISSING_CASE((fuse1 & HSW_F1_EU_DIS_MASK) >> + HSW_F1_EU_DIS_SHIFT); + fallthrough; + case HSW_F1_EU_DIS_10EUS: + sseu->eu_per_subslice = 10; + break; + case HSW_F1_EU_DIS_8EUS: + sseu->eu_per_subslice = 8; + break; + case HSW_F1_EU_DIS_6EUS: + sseu->eu_per_subslice = 6; + break; + } + + intel_sseu_set_info(sseu, hweight8(sseu->slice_mask), + hweight8(subslice_mask), + sseu->eu_per_subslice); + + for (s = 0; s < sseu->max_slices; s++) { + intel_sseu_set_subslices(sseu, s, subslice_mask); + + for (ss = 0; ss < sseu->max_subslices; ss++) { + sseu_set_eus(sseu, s, ss, + (1UL << sseu->eu_per_subslice) - 1); + } + } + + sseu->eu_total = compute_eu_total(sseu); + + /* No powergating for you. */ + sseu->has_slice_pg = 0; + sseu->has_subslice_pg = 0; + sseu->has_eu_pg = 0; +} + +void intel_sseu_info_init(struct intel_gt *gt) +{ + struct drm_i915_private *i915 = gt->i915; + + if (IS_HASWELL(i915)) + hsw_sseu_info_init(gt); + else if (IS_CHERRYVIEW(i915)) + cherryview_sseu_info_init(gt); + else if (IS_BROADWELL(i915)) + bdw_sseu_info_init(gt); + else if (IS_GEN(i915, 9)) + gen9_sseu_info_init(gt); + else if (IS_GEN(i915, 10)) + gen10_sseu_info_init(gt); + else if (IS_GEN(i915, 11)) + gen11_sseu_info_init(gt); + else if (INTEL_GEN(i915) >= 12) + gen12_sseu_info_init(gt); +} + +u32 intel_sseu_make_rpcs(struct intel_gt *gt, + const struct intel_sseu *req_sseu) +{ + struct drm_i915_private *i915 = gt->i915; + const struct sseu_dev_info *sseu = >->info.sseu; + bool subslice_pg = sseu->has_subslice_pg; + u8 slices, subslices; + u32 rpcs = 0; + + /* + * No explicit RPCS request is needed to ensure full + * slice/subslice/EU enablement prior to Gen9. + */ + if (INTEL_GEN(i915) < 9) + return 0; + + /* + * If i915/perf is active, we want a stable powergating configuration + * on the system. Use the configuration pinned by i915/perf. + */ + if (i915->perf.exclusive_stream) + req_sseu = &i915->perf.sseu; + + slices = hweight8(req_sseu->slice_mask); + subslices = hweight8(req_sseu->subslice_mask); + + /* + * Since the SScount bitfield in GEN8_R_PWR_CLK_STATE is only three bits + * wide and Icelake has up to eight subslices, specfial programming is + * needed in order to correctly enable all subslices. + * + * According to documentation software must consider the configuration + * as 2x4x8 and hardware will translate this to 1x8x8. + * + * Furthemore, even though SScount is three bits, maximum documented + * value for it is four. From this some rules/restrictions follow: + * + * 1. + * If enabled subslice count is greater than four, two whole slices must + * be enabled instead. + * + * 2. + * When more than one slice is enabled, hardware ignores the subslice + * count altogether. + * + * From these restrictions it follows that it is not possible to enable + * a count of subslices between the SScount maximum of four restriction, + * and the maximum available number on a particular SKU. Either all + * subslices are enabled, or a count between one and four on the first + * slice. + */ + if (IS_GEN(i915, 11) && + slices == 1 && + subslices > min_t(u8, 4, hweight8(sseu->subslice_mask[0]) / 2)) { + GEM_BUG_ON(subslices & 1); + + subslice_pg = false; + slices *= 2; + } + + /* + * Starting in Gen9, render power gating can leave + * slice/subslice/EU in a partially enabled state. We + * must make an explicit request through RPCS for full + * enablement. + */ + if (sseu->has_slice_pg) { + u32 mask, val = slices; + + if (INTEL_GEN(i915) >= 11) { + mask = GEN11_RPCS_S_CNT_MASK; + val <<= GEN11_RPCS_S_CNT_SHIFT; + } else { + mask = GEN8_RPCS_S_CNT_MASK; + val <<= GEN8_RPCS_S_CNT_SHIFT; + } + + GEM_BUG_ON(val & ~mask); + val &= mask; + + rpcs |= GEN8_RPCS_ENABLE | GEN8_RPCS_S_CNT_ENABLE | val; + } + + if (subslice_pg) { + u32 val = subslices; + + val <<= GEN8_RPCS_SS_CNT_SHIFT; + + GEM_BUG_ON(val & ~GEN8_RPCS_SS_CNT_MASK); + val &= GEN8_RPCS_SS_CNT_MASK; + + rpcs |= GEN8_RPCS_ENABLE | GEN8_RPCS_SS_CNT_ENABLE | val; + } + + if (sseu->has_eu_pg) { + u32 val; + + val = req_sseu->min_eus_per_subslice << GEN8_RPCS_EU_MIN_SHIFT; + GEM_BUG_ON(val & ~GEN8_RPCS_EU_MIN_MASK); + val &= GEN8_RPCS_EU_MIN_MASK; + + rpcs |= val; + + val = req_sseu->max_eus_per_subslice << GEN8_RPCS_EU_MAX_SHIFT; + GEM_BUG_ON(val & ~GEN8_RPCS_EU_MAX_MASK); + val &= GEN8_RPCS_EU_MAX_MASK; + + rpcs |= val; + + rpcs |= GEN8_RPCS_ENABLE; + } + + return rpcs; +} + +void intel_sseu_dump(const struct sseu_dev_info *sseu, struct drm_printer *p) +{ + int s; + + drm_printf(p, "slice total: %u, mask=%04x\n", + hweight8(sseu->slice_mask), sseu->slice_mask); + drm_printf(p, "subslice total: %u\n", intel_sseu_subslice_total(sseu)); + for (s = 0; s < sseu->max_slices; s++) { + drm_printf(p, "slice%d: %u subslices, mask=%08x\n", + s, intel_sseu_subslices_per_slice(sseu, s), + intel_sseu_get_subslices(sseu, s)); + } + drm_printf(p, "EU total: %u\n", sseu->eu_total); + drm_printf(p, "EU per subslice: %u\n", sseu->eu_per_subslice); + drm_printf(p, "has slice power gating: %s\n", + yesno(sseu->has_slice_pg)); + drm_printf(p, "has subslice power gating: %s\n", + yesno(sseu->has_subslice_pg)); + drm_printf(p, "has EU power gating: %s\n", yesno(sseu->has_eu_pg)); +} + +void intel_sseu_print_topology(const struct sseu_dev_info *sseu, + struct drm_printer *p) +{ + int s, ss; + + if (sseu->max_slices == 0) { + drm_printf(p, "Unavailable\n"); + return; + } + + for (s = 0; s < sseu->max_slices; s++) { + drm_printf(p, "slice%d: %u subslice(s) (0x%08x):\n", + s, intel_sseu_subslices_per_slice(sseu, s), + intel_sseu_get_subslices(sseu, s)); + + for (ss = 0; ss < sseu->max_subslices; ss++) { + u16 enabled_eus = sseu_get_eus(sseu, s, ss); + + drm_printf(p, "\tsubslice%d: %u EUs (0x%hx)\n", + ss, hweight16(enabled_eus), enabled_eus); + } + } +} diff --git a/drivers/gpu/drm/i915/gt/intel_sseu.h b/drivers/gpu/drm/i915/gt/intel_sseu.h new file mode 100644 index 000000000..23ba6c2eb --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_sseu.h @@ -0,0 +1,108 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#ifndef __INTEL_SSEU_H__ +#define __INTEL_SSEU_H__ + +#include <linux/types.h> +#include <linux/kernel.h> + +#include "i915_gem.h" + +struct drm_i915_private; +struct intel_gt; +struct drm_printer; + +#define GEN_MAX_SLICES (6) /* CNL upper bound */ +#define GEN_MAX_SUBSLICES (8) /* ICL upper bound */ +#define GEN_SSEU_STRIDE(max_entries) DIV_ROUND_UP(max_entries, BITS_PER_BYTE) +#define GEN_MAX_SUBSLICE_STRIDE GEN_SSEU_STRIDE(GEN_MAX_SUBSLICES) +#define GEN_MAX_EUS (16) /* TGL upper bound */ +#define GEN_MAX_EU_STRIDE GEN_SSEU_STRIDE(GEN_MAX_EUS) + +struct sseu_dev_info { + u8 slice_mask; + u8 subslice_mask[GEN_MAX_SLICES * GEN_MAX_SUBSLICE_STRIDE]; + u8 eu_mask[GEN_MAX_SLICES * GEN_MAX_SUBSLICES * GEN_MAX_EU_STRIDE]; + u16 eu_total; + u8 eu_per_subslice; + u8 min_eu_in_pool; + /* For each slice, which subslice(s) has(have) 7 EUs (bitfield)? */ + u8 subslice_7eu[3]; + u8 has_slice_pg:1; + u8 has_subslice_pg:1; + u8 has_eu_pg:1; + + /* Topology fields */ + u8 max_slices; + u8 max_subslices; + u8 max_eus_per_subslice; + + u8 ss_stride; + u8 eu_stride; +}; + +/* + * Powergating configuration for a particular (context,engine). + */ +struct intel_sseu { + u8 slice_mask; + u8 subslice_mask; + u8 min_eus_per_subslice; + u8 max_eus_per_subslice; +}; + +static inline struct intel_sseu +intel_sseu_from_device_info(const struct sseu_dev_info *sseu) +{ + struct intel_sseu value = { + .slice_mask = sseu->slice_mask, + .subslice_mask = sseu->subslice_mask[0], + .min_eus_per_subslice = sseu->max_eus_per_subslice, + .max_eus_per_subslice = sseu->max_eus_per_subslice, + }; + + return value; +} + +static inline bool +intel_sseu_has_subslice(const struct sseu_dev_info *sseu, int slice, + int subslice) +{ + u8 mask; + int ss_idx = subslice / BITS_PER_BYTE; + + GEM_BUG_ON(ss_idx >= sseu->ss_stride); + + mask = sseu->subslice_mask[slice * sseu->ss_stride + ss_idx]; + + return mask & BIT(subslice % BITS_PER_BYTE); +} + +void intel_sseu_set_info(struct sseu_dev_info *sseu, u8 max_slices, + u8 max_subslices, u8 max_eus_per_subslice); + +unsigned int +intel_sseu_subslice_total(const struct sseu_dev_info *sseu); + +unsigned int +intel_sseu_subslices_per_slice(const struct sseu_dev_info *sseu, u8 slice); + +u32 intel_sseu_get_subslices(const struct sseu_dev_info *sseu, u8 slice); + +void intel_sseu_set_subslices(struct sseu_dev_info *sseu, int slice, + u32 ss_mask); + +void intel_sseu_info_init(struct intel_gt *gt); + +u32 intel_sseu_make_rpcs(struct intel_gt *gt, + const struct intel_sseu *req_sseu); + +void intel_sseu_dump(const struct sseu_dev_info *sseu, struct drm_printer *p); +void intel_sseu_print_topology(const struct sseu_dev_info *sseu, + struct drm_printer *p); + +#endif /* __INTEL_SSEU_H__ */ diff --git a/drivers/gpu/drm/i915/gt/intel_sseu_debugfs.c b/drivers/gpu/drm/i915/gt/intel_sseu_debugfs.c new file mode 100644 index 000000000..51780282d --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_sseu_debugfs.c @@ -0,0 +1,306 @@ +// SPDX-License-Identifier: MIT + +/* + * Copyright © 2020 Intel Corporation + */ + +#include "debugfs_gt.h" +#include "intel_sseu_debugfs.h" +#include "i915_drv.h" + +static void sseu_copy_subslices(const struct sseu_dev_info *sseu, + int slice, u8 *to_mask) +{ + int offset = slice * sseu->ss_stride; + + memcpy(&to_mask[offset], &sseu->subslice_mask[offset], sseu->ss_stride); +} + +static void cherryview_sseu_device_status(struct intel_gt *gt, + struct sseu_dev_info *sseu) +{ +#define SS_MAX 2 + struct intel_uncore *uncore = gt->uncore; + const int ss_max = SS_MAX; + u32 sig1[SS_MAX], sig2[SS_MAX]; + int ss; + + sig1[0] = intel_uncore_read(uncore, CHV_POWER_SS0_SIG1); + sig1[1] = intel_uncore_read(uncore, CHV_POWER_SS1_SIG1); + sig2[0] = intel_uncore_read(uncore, CHV_POWER_SS0_SIG2); + sig2[1] = intel_uncore_read(uncore, CHV_POWER_SS1_SIG2); + + for (ss = 0; ss < ss_max; ss++) { + unsigned int eu_cnt; + + if (sig1[ss] & CHV_SS_PG_ENABLE) + /* skip disabled subslice */ + continue; + + sseu->slice_mask = BIT(0); + sseu->subslice_mask[0] |= BIT(ss); + eu_cnt = ((sig1[ss] & CHV_EU08_PG_ENABLE) ? 0 : 2) + + ((sig1[ss] & CHV_EU19_PG_ENABLE) ? 0 : 2) + + ((sig1[ss] & CHV_EU210_PG_ENABLE) ? 0 : 2) + + ((sig2[ss] & CHV_EU311_PG_ENABLE) ? 0 : 2); + sseu->eu_total += eu_cnt; + sseu->eu_per_subslice = max_t(unsigned int, + sseu->eu_per_subslice, eu_cnt); + } +#undef SS_MAX +} + +static void gen10_sseu_device_status(struct intel_gt *gt, + struct sseu_dev_info *sseu) +{ +#define SS_MAX 6 + struct intel_uncore *uncore = gt->uncore; + const struct intel_gt_info *info = >->info; + u32 s_reg[SS_MAX], eu_reg[2 * SS_MAX], eu_mask[2]; + int s, ss; + + for (s = 0; s < info->sseu.max_slices; s++) { + /* + * FIXME: Valid SS Mask respects the spec and read + * only valid bits for those registers, excluding reserved + * although this seems wrong because it would leave many + * subslices without ACK. + */ + s_reg[s] = intel_uncore_read(uncore, GEN10_SLICE_PGCTL_ACK(s)) & + GEN10_PGCTL_VALID_SS_MASK(s); + eu_reg[2 * s] = intel_uncore_read(uncore, + GEN10_SS01_EU_PGCTL_ACK(s)); + eu_reg[2 * s + 1] = intel_uncore_read(uncore, + GEN10_SS23_EU_PGCTL_ACK(s)); + } + + eu_mask[0] = GEN9_PGCTL_SSA_EU08_ACK | + GEN9_PGCTL_SSA_EU19_ACK | + GEN9_PGCTL_SSA_EU210_ACK | + GEN9_PGCTL_SSA_EU311_ACK; + eu_mask[1] = GEN9_PGCTL_SSB_EU08_ACK | + GEN9_PGCTL_SSB_EU19_ACK | + GEN9_PGCTL_SSB_EU210_ACK | + GEN9_PGCTL_SSB_EU311_ACK; + + for (s = 0; s < info->sseu.max_slices; s++) { + if ((s_reg[s] & GEN9_PGCTL_SLICE_ACK) == 0) + /* skip disabled slice */ + continue; + + sseu->slice_mask |= BIT(s); + sseu_copy_subslices(&info->sseu, s, sseu->subslice_mask); + + for (ss = 0; ss < info->sseu.max_subslices; ss++) { + unsigned int eu_cnt; + + if (info->sseu.has_subslice_pg && + !(s_reg[s] & (GEN9_PGCTL_SS_ACK(ss)))) + /* skip disabled subslice */ + continue; + + eu_cnt = 2 * hweight32(eu_reg[2 * s + ss / 2] & + eu_mask[ss % 2]); + sseu->eu_total += eu_cnt; + sseu->eu_per_subslice = max_t(unsigned int, + sseu->eu_per_subslice, + eu_cnt); + } + } +#undef SS_MAX +} + +static void gen9_sseu_device_status(struct intel_gt *gt, + struct sseu_dev_info *sseu) +{ +#define SS_MAX 3 + struct intel_uncore *uncore = gt->uncore; + const struct intel_gt_info *info = >->info; + u32 s_reg[SS_MAX], eu_reg[2 * SS_MAX], eu_mask[2]; + int s, ss; + + for (s = 0; s < info->sseu.max_slices; s++) { + s_reg[s] = intel_uncore_read(uncore, GEN9_SLICE_PGCTL_ACK(s)); + eu_reg[2 * s] = + intel_uncore_read(uncore, GEN9_SS01_EU_PGCTL_ACK(s)); + eu_reg[2 * s + 1] = + intel_uncore_read(uncore, GEN9_SS23_EU_PGCTL_ACK(s)); + } + + eu_mask[0] = GEN9_PGCTL_SSA_EU08_ACK | + GEN9_PGCTL_SSA_EU19_ACK | + GEN9_PGCTL_SSA_EU210_ACK | + GEN9_PGCTL_SSA_EU311_ACK; + eu_mask[1] = GEN9_PGCTL_SSB_EU08_ACK | + GEN9_PGCTL_SSB_EU19_ACK | + GEN9_PGCTL_SSB_EU210_ACK | + GEN9_PGCTL_SSB_EU311_ACK; + + for (s = 0; s < info->sseu.max_slices; s++) { + if ((s_reg[s] & GEN9_PGCTL_SLICE_ACK) == 0) + /* skip disabled slice */ + continue; + + sseu->slice_mask |= BIT(s); + + if (IS_GEN9_BC(gt->i915)) + sseu_copy_subslices(&info->sseu, s, + sseu->subslice_mask); + + for (ss = 0; ss < info->sseu.max_subslices; ss++) { + unsigned int eu_cnt; + u8 ss_idx = s * info->sseu.ss_stride + + ss / BITS_PER_BYTE; + + if (IS_GEN9_LP(gt->i915)) { + if (!(s_reg[s] & (GEN9_PGCTL_SS_ACK(ss)))) + /* skip disabled subslice */ + continue; + + sseu->subslice_mask[ss_idx] |= + BIT(ss % BITS_PER_BYTE); + } + + eu_cnt = eu_reg[2 * s + ss / 2] & eu_mask[ss % 2]; + eu_cnt = 2 * hweight32(eu_cnt); + + sseu->eu_total += eu_cnt; + sseu->eu_per_subslice = max_t(unsigned int, + sseu->eu_per_subslice, + eu_cnt); + } + } +#undef SS_MAX +} + +static void bdw_sseu_device_status(struct intel_gt *gt, + struct sseu_dev_info *sseu) +{ + const struct intel_gt_info *info = >->info; + u32 slice_info = intel_uncore_read(gt->uncore, GEN8_GT_SLICE_INFO); + int s; + + sseu->slice_mask = slice_info & GEN8_LSLICESTAT_MASK; + + if (sseu->slice_mask) { + sseu->eu_per_subslice = info->sseu.eu_per_subslice; + for (s = 0; s < fls(sseu->slice_mask); s++) + sseu_copy_subslices(&info->sseu, s, + sseu->subslice_mask); + sseu->eu_total = sseu->eu_per_subslice * + intel_sseu_subslice_total(sseu); + + /* subtract fused off EU(s) from enabled slice(s) */ + for (s = 0; s < fls(sseu->slice_mask); s++) { + u8 subslice_7eu = info->sseu.subslice_7eu[s]; + + sseu->eu_total -= hweight8(subslice_7eu); + } + } +} + +static void i915_print_sseu_info(struct seq_file *m, + bool is_available_info, + bool has_pooled_eu, + const struct sseu_dev_info *sseu) +{ + const char *type = is_available_info ? "Available" : "Enabled"; + int s; + + seq_printf(m, " %s Slice Mask: %04x\n", type, + sseu->slice_mask); + seq_printf(m, " %s Slice Total: %u\n", type, + hweight8(sseu->slice_mask)); + seq_printf(m, " %s Subslice Total: %u\n", type, + intel_sseu_subslice_total(sseu)); + for (s = 0; s < fls(sseu->slice_mask); s++) { + seq_printf(m, " %s Slice%i subslices: %u\n", type, + s, intel_sseu_subslices_per_slice(sseu, s)); + } + seq_printf(m, " %s EU Total: %u\n", type, + sseu->eu_total); + seq_printf(m, " %s EU Per Subslice: %u\n", type, + sseu->eu_per_subslice); + + if (!is_available_info) + return; + + seq_printf(m, " Has Pooled EU: %s\n", yesno(has_pooled_eu)); + if (has_pooled_eu) + seq_printf(m, " Min EU in pool: %u\n", sseu->min_eu_in_pool); + + seq_printf(m, " Has Slice Power Gating: %s\n", + yesno(sseu->has_slice_pg)); + seq_printf(m, " Has Subslice Power Gating: %s\n", + yesno(sseu->has_subslice_pg)); + seq_printf(m, " Has EU Power Gating: %s\n", + yesno(sseu->has_eu_pg)); +} + +/* + * this is called from top-level debugfs as well, so we can't get the gt from + * the seq_file. + */ +int intel_sseu_status(struct seq_file *m, struct intel_gt *gt) +{ + struct drm_i915_private *i915 = gt->i915; + const struct intel_gt_info *info = >->info; + struct sseu_dev_info sseu; + intel_wakeref_t wakeref; + + if (INTEL_GEN(i915) < 8) + return -ENODEV; + + seq_puts(m, "SSEU Device Info\n"); + i915_print_sseu_info(m, true, HAS_POOLED_EU(i915), &info->sseu); + + seq_puts(m, "SSEU Device Status\n"); + memset(&sseu, 0, sizeof(sseu)); + intel_sseu_set_info(&sseu, info->sseu.max_slices, + info->sseu.max_subslices, + info->sseu.max_eus_per_subslice); + + with_intel_runtime_pm(&i915->runtime_pm, wakeref) { + if (IS_CHERRYVIEW(i915)) + cherryview_sseu_device_status(gt, &sseu); + else if (IS_BROADWELL(i915)) + bdw_sseu_device_status(gt, &sseu); + else if (IS_GEN(i915, 9)) + gen9_sseu_device_status(gt, &sseu); + else if (INTEL_GEN(i915) >= 10) + gen10_sseu_device_status(gt, &sseu); + } + + i915_print_sseu_info(m, false, HAS_POOLED_EU(i915), &sseu); + + return 0; +} + +static int sseu_status_show(struct seq_file *m, void *unused) +{ + struct intel_gt *gt = m->private; + + return intel_sseu_status(m, gt); +} +DEFINE_GT_DEBUGFS_ATTRIBUTE(sseu_status); + +static int rcs_topology_show(struct seq_file *m, void *unused) +{ + struct intel_gt *gt = m->private; + struct drm_printer p = drm_seq_file_printer(m); + + intel_sseu_print_topology(>->info.sseu, &p); + + return 0; +} +DEFINE_GT_DEBUGFS_ATTRIBUTE(rcs_topology); + +void intel_sseu_debugfs_register(struct intel_gt *gt, struct dentry *root) +{ + static const struct debugfs_gt_file files[] = { + { "sseu_status", &sseu_status_fops, NULL }, + { "rcs_topology", &rcs_topology_fops, NULL }, + }; + + intel_gt_debugfs_register_files(root, files, ARRAY_SIZE(files), gt); +} diff --git a/drivers/gpu/drm/i915/gt/intel_sseu_debugfs.h b/drivers/gpu/drm/i915/gt/intel_sseu_debugfs.h new file mode 100644 index 000000000..73f001589 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_sseu_debugfs.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: MIT */ + +/* + * Copyright © 2020 Intel Corporation + */ + +#ifndef INTEL_SSEU_DEBUGFS_H +#define INTEL_SSEU_DEBUGFS_H + +struct intel_gt; +struct dentry; +struct seq_file; + +int intel_sseu_status(struct seq_file *m, struct intel_gt *gt); +void intel_sseu_debugfs_register(struct intel_gt *gt, struct dentry *root); + +#endif /* INTEL_SSEU_DEBUGFS_H */ diff --git a/drivers/gpu/drm/i915/gt/intel_timeline.c b/drivers/gpu/drm/i915/gt/intel_timeline.c new file mode 100644 index 000000000..e25385ad2 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_timeline.c @@ -0,0 +1,629 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2016-2018 Intel Corporation + */ + +#include "i915_drv.h" + +#include "i915_active.h" +#include "i915_syncmap.h" +#include "intel_gt.h" +#include "intel_ring.h" +#include "intel_timeline.h" + +#define ptr_set_bit(ptr, bit) ((typeof(ptr))((unsigned long)(ptr) | BIT(bit))) +#define ptr_test_bit(ptr, bit) ((unsigned long)(ptr) & BIT(bit)) + +#define CACHELINE_BITS 6 +#define CACHELINE_FREE CACHELINE_BITS + +struct intel_timeline_hwsp { + struct intel_gt *gt; + struct intel_gt_timelines *gt_timelines; + struct list_head free_link; + struct i915_vma *vma; + u64 free_bitmap; +}; + +static struct i915_vma *__hwsp_alloc(struct intel_gt *gt) +{ + struct drm_i915_private *i915 = gt->i915; + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + + obj = i915_gem_object_create_internal(i915, PAGE_SIZE); + if (IS_ERR(obj)) + return ERR_CAST(obj); + + i915_gem_object_set_cache_coherency(obj, I915_CACHE_LLC); + + vma = i915_vma_instance(obj, >->ggtt->vm, NULL); + if (IS_ERR(vma)) + i915_gem_object_put(obj); + + return vma; +} + +static struct i915_vma * +hwsp_alloc(struct intel_timeline *timeline, unsigned int *cacheline) +{ + struct intel_gt_timelines *gt = &timeline->gt->timelines; + struct intel_timeline_hwsp *hwsp; + + BUILD_BUG_ON(BITS_PER_TYPE(u64) * CACHELINE_BYTES > PAGE_SIZE); + + spin_lock_irq(>->hwsp_lock); + + /* hwsp_free_list only contains HWSP that have available cachelines */ + hwsp = list_first_entry_or_null(>->hwsp_free_list, + typeof(*hwsp), free_link); + if (!hwsp) { + struct i915_vma *vma; + + spin_unlock_irq(>->hwsp_lock); + + hwsp = kmalloc(sizeof(*hwsp), GFP_KERNEL); + if (!hwsp) + return ERR_PTR(-ENOMEM); + + vma = __hwsp_alloc(timeline->gt); + if (IS_ERR(vma)) { + kfree(hwsp); + return vma; + } + + GT_TRACE(timeline->gt, "new HWSP allocated\n"); + + vma->private = hwsp; + hwsp->gt = timeline->gt; + hwsp->vma = vma; + hwsp->free_bitmap = ~0ull; + hwsp->gt_timelines = gt; + + spin_lock_irq(>->hwsp_lock); + list_add(&hwsp->free_link, >->hwsp_free_list); + } + + GEM_BUG_ON(!hwsp->free_bitmap); + *cacheline = __ffs64(hwsp->free_bitmap); + hwsp->free_bitmap &= ~BIT_ULL(*cacheline); + if (!hwsp->free_bitmap) + list_del(&hwsp->free_link); + + spin_unlock_irq(>->hwsp_lock); + + GEM_BUG_ON(hwsp->vma->private != hwsp); + return hwsp->vma; +} + +static void __idle_hwsp_free(struct intel_timeline_hwsp *hwsp, int cacheline) +{ + struct intel_gt_timelines *gt = hwsp->gt_timelines; + unsigned long flags; + + spin_lock_irqsave(>->hwsp_lock, flags); + + /* As a cacheline becomes available, publish the HWSP on the freelist */ + if (!hwsp->free_bitmap) + list_add_tail(&hwsp->free_link, >->hwsp_free_list); + + GEM_BUG_ON(cacheline >= BITS_PER_TYPE(hwsp->free_bitmap)); + hwsp->free_bitmap |= BIT_ULL(cacheline); + + /* And if no one is left using it, give the page back to the system */ + if (hwsp->free_bitmap == ~0ull) { + i915_vma_put(hwsp->vma); + list_del(&hwsp->free_link); + kfree(hwsp); + } + + spin_unlock_irqrestore(>->hwsp_lock, flags); +} + +static void __rcu_cacheline_free(struct rcu_head *rcu) +{ + struct intel_timeline_cacheline *cl = + container_of(rcu, typeof(*cl), rcu); + + /* Must wait until after all *rq->hwsp are complete before removing */ + i915_gem_object_unpin_map(cl->hwsp->vma->obj); + __idle_hwsp_free(cl->hwsp, ptr_unmask_bits(cl->vaddr, CACHELINE_BITS)); + + i915_active_fini(&cl->active); + kfree(cl); +} + +static void __idle_cacheline_free(struct intel_timeline_cacheline *cl) +{ + GEM_BUG_ON(!i915_active_is_idle(&cl->active)); + call_rcu(&cl->rcu, __rcu_cacheline_free); +} + +__i915_active_call +static void __cacheline_retire(struct i915_active *active) +{ + struct intel_timeline_cacheline *cl = + container_of(active, typeof(*cl), active); + + i915_vma_unpin(cl->hwsp->vma); + if (ptr_test_bit(cl->vaddr, CACHELINE_FREE)) + __idle_cacheline_free(cl); +} + +static int __cacheline_active(struct i915_active *active) +{ + struct intel_timeline_cacheline *cl = + container_of(active, typeof(*cl), active); + + __i915_vma_pin(cl->hwsp->vma); + return 0; +} + +static struct intel_timeline_cacheline * +cacheline_alloc(struct intel_timeline_hwsp *hwsp, unsigned int cacheline) +{ + struct intel_timeline_cacheline *cl; + void *vaddr; + + GEM_BUG_ON(cacheline >= BIT(CACHELINE_BITS)); + + cl = kmalloc(sizeof(*cl), GFP_KERNEL); + if (!cl) + return ERR_PTR(-ENOMEM); + + vaddr = i915_gem_object_pin_map(hwsp->vma->obj, I915_MAP_WB); + if (IS_ERR(vaddr)) { + kfree(cl); + return ERR_CAST(vaddr); + } + + cl->hwsp = hwsp; + cl->vaddr = page_pack_bits(vaddr, cacheline); + + i915_active_init(&cl->active, __cacheline_active, __cacheline_retire); + + return cl; +} + +static void cacheline_acquire(struct intel_timeline_cacheline *cl, + u32 ggtt_offset) +{ + if (!cl) + return; + + cl->ggtt_offset = ggtt_offset; + i915_active_acquire(&cl->active); +} + +static void cacheline_release(struct intel_timeline_cacheline *cl) +{ + if (cl) + i915_active_release(&cl->active); +} + +static void cacheline_free(struct intel_timeline_cacheline *cl) +{ + if (!i915_active_acquire_if_busy(&cl->active)) { + __idle_cacheline_free(cl); + return; + } + + GEM_BUG_ON(ptr_test_bit(cl->vaddr, CACHELINE_FREE)); + cl->vaddr = ptr_set_bit(cl->vaddr, CACHELINE_FREE); + + i915_active_release(&cl->active); +} + +static int intel_timeline_init(struct intel_timeline *timeline, + struct intel_gt *gt, + struct i915_vma *hwsp, + unsigned int offset) +{ + void *vaddr; + + kref_init(&timeline->kref); + atomic_set(&timeline->pin_count, 0); + + timeline->gt = gt; + + timeline->has_initial_breadcrumb = !hwsp; + timeline->hwsp_cacheline = NULL; + + if (!hwsp) { + struct intel_timeline_cacheline *cl; + unsigned int cacheline; + + hwsp = hwsp_alloc(timeline, &cacheline); + if (IS_ERR(hwsp)) + return PTR_ERR(hwsp); + + cl = cacheline_alloc(hwsp->private, cacheline); + if (IS_ERR(cl)) { + __idle_hwsp_free(hwsp->private, cacheline); + return PTR_ERR(cl); + } + + timeline->hwsp_cacheline = cl; + timeline->hwsp_offset = cacheline * CACHELINE_BYTES; + + vaddr = page_mask_bits(cl->vaddr); + } else { + timeline->hwsp_offset = offset; + vaddr = i915_gem_object_pin_map(hwsp->obj, I915_MAP_WB); + if (IS_ERR(vaddr)) + return PTR_ERR(vaddr); + } + + timeline->hwsp_seqno = + memset(vaddr + timeline->hwsp_offset, 0, CACHELINE_BYTES); + + timeline->hwsp_ggtt = i915_vma_get(hwsp); + GEM_BUG_ON(timeline->hwsp_offset >= hwsp->size); + + timeline->fence_context = dma_fence_context_alloc(1); + + mutex_init(&timeline->mutex); + + INIT_ACTIVE_FENCE(&timeline->last_request); + INIT_LIST_HEAD(&timeline->requests); + + i915_syncmap_init(&timeline->sync); + + return 0; +} + +void intel_gt_init_timelines(struct intel_gt *gt) +{ + struct intel_gt_timelines *timelines = >->timelines; + + spin_lock_init(&timelines->lock); + INIT_LIST_HEAD(&timelines->active_list); + + spin_lock_init(&timelines->hwsp_lock); + INIT_LIST_HEAD(&timelines->hwsp_free_list); +} + +static void intel_timeline_fini(struct intel_timeline *timeline) +{ + GEM_BUG_ON(atomic_read(&timeline->pin_count)); + GEM_BUG_ON(!list_empty(&timeline->requests)); + GEM_BUG_ON(timeline->retire); + + if (timeline->hwsp_cacheline) + cacheline_free(timeline->hwsp_cacheline); + else + i915_gem_object_unpin_map(timeline->hwsp_ggtt->obj); + + i915_vma_put(timeline->hwsp_ggtt); + + /* + * A small race exists between intel_gt_retire_requests_timeout and + * intel_timeline_exit which could result in the syncmap not getting + * free'd. Rather than work to hard to seal this race, simply cleanup + * the syncmap on fini. + */ + i915_syncmap_free(&timeline->sync); +} + +struct intel_timeline * +__intel_timeline_create(struct intel_gt *gt, + struct i915_vma *global_hwsp, + unsigned int offset) +{ + struct intel_timeline *timeline; + int err; + + timeline = kzalloc(sizeof(*timeline), GFP_KERNEL); + if (!timeline) + return ERR_PTR(-ENOMEM); + + err = intel_timeline_init(timeline, gt, global_hwsp, offset); + if (err) { + kfree(timeline); + return ERR_PTR(err); + } + + return timeline; +} + +void __intel_timeline_pin(struct intel_timeline *tl) +{ + GEM_BUG_ON(!atomic_read(&tl->pin_count)); + atomic_inc(&tl->pin_count); +} + +int intel_timeline_pin(struct intel_timeline *tl, struct i915_gem_ww_ctx *ww) +{ + int err; + + if (atomic_add_unless(&tl->pin_count, 1, 0)) + return 0; + + err = i915_ggtt_pin(tl->hwsp_ggtt, ww, 0, PIN_HIGH); + if (err) + return err; + + tl->hwsp_offset = + i915_ggtt_offset(tl->hwsp_ggtt) + + offset_in_page(tl->hwsp_offset); + GT_TRACE(tl->gt, "timeline:%llx using HWSP offset:%x\n", + tl->fence_context, tl->hwsp_offset); + + cacheline_acquire(tl->hwsp_cacheline, tl->hwsp_offset); + if (atomic_fetch_inc(&tl->pin_count)) { + cacheline_release(tl->hwsp_cacheline); + __i915_vma_unpin(tl->hwsp_ggtt); + } + + return 0; +} + +void intel_timeline_reset_seqno(const struct intel_timeline *tl) +{ + /* Must be pinned to be writable, and no requests in flight. */ + GEM_BUG_ON(!atomic_read(&tl->pin_count)); + WRITE_ONCE(*(u32 *)tl->hwsp_seqno, tl->seqno); +} + +void intel_timeline_enter(struct intel_timeline *tl) +{ + struct intel_gt_timelines *timelines = &tl->gt->timelines; + + /* + * Pretend we are serialised by the timeline->mutex. + * + * While generally true, there are a few exceptions to the rule + * for the engine->kernel_context being used to manage power + * transitions. As the engine_park may be called from under any + * timeline, it uses the power mutex as a global serialisation + * lock to prevent any other request entering its timeline. + * + * The rule is generally tl->mutex, otherwise engine->wakeref.mutex. + * + * However, intel_gt_retire_request() does not know which engine + * it is retiring along and so cannot partake in the engine-pm + * barrier, and there we use the tl->active_count as a means to + * pin the timeline in the active_list while the locks are dropped. + * Ergo, as that is outside of the engine-pm barrier, we need to + * use atomic to manipulate tl->active_count. + */ + lockdep_assert_held(&tl->mutex); + + if (atomic_add_unless(&tl->active_count, 1, 0)) + return; + + spin_lock(&timelines->lock); + if (!atomic_fetch_inc(&tl->active_count)) { + /* + * The HWSP is volatile, and may have been lost while inactive, + * e.g. across suspend/resume. Be paranoid, and ensure that + * the HWSP value matches our seqno so we don't proclaim + * the next request as already complete. + */ + intel_timeline_reset_seqno(tl); + list_add_tail(&tl->link, &timelines->active_list); + } + spin_unlock(&timelines->lock); +} + +void intel_timeline_exit(struct intel_timeline *tl) +{ + struct intel_gt_timelines *timelines = &tl->gt->timelines; + + /* See intel_timeline_enter() */ + lockdep_assert_held(&tl->mutex); + + GEM_BUG_ON(!atomic_read(&tl->active_count)); + if (atomic_add_unless(&tl->active_count, -1, 1)) + return; + + spin_lock(&timelines->lock); + if (atomic_dec_and_test(&tl->active_count)) + list_del(&tl->link); + spin_unlock(&timelines->lock); + + /* + * Since this timeline is idle, all bariers upon which we were waiting + * must also be complete and so we can discard the last used barriers + * without loss of information. + */ + i915_syncmap_free(&tl->sync); +} + +static u32 timeline_advance(struct intel_timeline *tl) +{ + GEM_BUG_ON(!atomic_read(&tl->pin_count)); + GEM_BUG_ON(tl->seqno & tl->has_initial_breadcrumb); + + return tl->seqno += 1 + tl->has_initial_breadcrumb; +} + +static void timeline_rollback(struct intel_timeline *tl) +{ + tl->seqno -= 1 + tl->has_initial_breadcrumb; +} + +static noinline int +__intel_timeline_get_seqno(struct intel_timeline *tl, + struct i915_request *rq, + u32 *seqno) +{ + struct intel_timeline_cacheline *cl; + unsigned int cacheline; + struct i915_vma *vma; + void *vaddr; + int err; + + might_lock(&tl->gt->ggtt->vm.mutex); + GT_TRACE(tl->gt, "timeline:%llx wrapped\n", tl->fence_context); + + /* + * If there is an outstanding GPU reference to this cacheline, + * such as it being sampled by a HW semaphore on another timeline, + * we cannot wraparound our seqno value (the HW semaphore does + * a strict greater-than-or-equals compare, not i915_seqno_passed). + * So if the cacheline is still busy, we must detach ourselves + * from it and leave it inflight alongside its users. + * + * However, if nobody is watching and we can guarantee that nobody + * will, we could simply reuse the same cacheline. + * + * if (i915_active_request_is_signaled(&tl->last_request) && + * i915_active_is_signaled(&tl->hwsp_cacheline->active)) + * return 0; + * + * That seems unlikely for a busy timeline that needed to wrap in + * the first place, so just replace the cacheline. + */ + + vma = hwsp_alloc(tl, &cacheline); + if (IS_ERR(vma)) { + err = PTR_ERR(vma); + goto err_rollback; + } + + err = i915_ggtt_pin(vma, NULL, 0, PIN_HIGH); + if (err) { + __idle_hwsp_free(vma->private, cacheline); + goto err_rollback; + } + + cl = cacheline_alloc(vma->private, cacheline); + if (IS_ERR(cl)) { + err = PTR_ERR(cl); + __idle_hwsp_free(vma->private, cacheline); + goto err_unpin; + } + GEM_BUG_ON(cl->hwsp->vma != vma); + + /* + * Attach the old cacheline to the current request, so that we only + * free it after the current request is retired, which ensures that + * all writes into the cacheline from previous requests are complete. + */ + err = i915_active_ref(&tl->hwsp_cacheline->active, + tl->fence_context, + &rq->fence); + if (err) + goto err_cacheline; + + cacheline_release(tl->hwsp_cacheline); /* ownership now xfered to rq */ + cacheline_free(tl->hwsp_cacheline); + + i915_vma_unpin(tl->hwsp_ggtt); /* binding kept alive by old cacheline */ + i915_vma_put(tl->hwsp_ggtt); + + tl->hwsp_ggtt = i915_vma_get(vma); + + vaddr = page_mask_bits(cl->vaddr); + tl->hwsp_offset = cacheline * CACHELINE_BYTES; + tl->hwsp_seqno = + memset(vaddr + tl->hwsp_offset, 0, CACHELINE_BYTES); + + tl->hwsp_offset += i915_ggtt_offset(vma); + GT_TRACE(tl->gt, "timeline:%llx using HWSP offset:%x\n", + tl->fence_context, tl->hwsp_offset); + + cacheline_acquire(cl, tl->hwsp_offset); + tl->hwsp_cacheline = cl; + + *seqno = timeline_advance(tl); + GEM_BUG_ON(i915_seqno_passed(*tl->hwsp_seqno, *seqno)); + return 0; + +err_cacheline: + cacheline_free(cl); +err_unpin: + i915_vma_unpin(vma); +err_rollback: + timeline_rollback(tl); + return err; +} + +int intel_timeline_get_seqno(struct intel_timeline *tl, + struct i915_request *rq, + u32 *seqno) +{ + *seqno = timeline_advance(tl); + + /* Replace the HWSP on wraparound for HW semaphores */ + if (unlikely(!*seqno && tl->hwsp_cacheline)) + return __intel_timeline_get_seqno(tl, rq, seqno); + + return 0; +} + +static int cacheline_ref(struct intel_timeline_cacheline *cl, + struct i915_request *rq) +{ + return i915_active_add_request(&cl->active, rq); +} + +int intel_timeline_read_hwsp(struct i915_request *from, + struct i915_request *to, + u32 *hwsp) +{ + struct intel_timeline_cacheline *cl; + int err; + + GEM_BUG_ON(!rcu_access_pointer(from->hwsp_cacheline)); + + rcu_read_lock(); + cl = rcu_dereference(from->hwsp_cacheline); + if (i915_request_completed(from)) /* confirm cacheline is valid */ + goto unlock; + if (unlikely(!i915_active_acquire_if_busy(&cl->active))) + goto unlock; /* seqno wrapped and completed! */ + if (unlikely(i915_request_completed(from))) + goto release; + rcu_read_unlock(); + + err = cacheline_ref(cl, to); + if (err) + goto out; + + *hwsp = cl->ggtt_offset; +out: + i915_active_release(&cl->active); + return err; + +release: + i915_active_release(&cl->active); +unlock: + rcu_read_unlock(); + return 1; +} + +void intel_timeline_unpin(struct intel_timeline *tl) +{ + GEM_BUG_ON(!atomic_read(&tl->pin_count)); + if (!atomic_dec_and_test(&tl->pin_count)) + return; + + cacheline_release(tl->hwsp_cacheline); + + __i915_vma_unpin(tl->hwsp_ggtt); +} + +void __intel_timeline_free(struct kref *kref) +{ + struct intel_timeline *timeline = + container_of(kref, typeof(*timeline), kref); + + intel_timeline_fini(timeline); + kfree_rcu(timeline, rcu); +} + +void intel_gt_fini_timelines(struct intel_gt *gt) +{ + struct intel_gt_timelines *timelines = >->timelines; + + GEM_BUG_ON(!list_empty(&timelines->active_list)); + GEM_BUG_ON(!list_empty(&timelines->hwsp_free_list)); +} + +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) +#include "gt/selftests/mock_timeline.c" +#include "gt/selftest_timeline.c" +#endif diff --git a/drivers/gpu/drm/i915/gt/intel_timeline.h b/drivers/gpu/drm/i915/gt/intel_timeline.h new file mode 100644 index 000000000..9882cd911 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_timeline.h @@ -0,0 +1,109 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#ifndef I915_TIMELINE_H +#define I915_TIMELINE_H + +#include <linux/lockdep.h> + +#include "i915_active.h" +#include "i915_syncmap.h" +#include "intel_timeline_types.h" + +struct intel_timeline * +__intel_timeline_create(struct intel_gt *gt, + struct i915_vma *global_hwsp, + unsigned int offset); + +static inline struct intel_timeline * +intel_timeline_create(struct intel_gt *gt) +{ + return __intel_timeline_create(gt, NULL, 0); +} + +static inline struct intel_timeline * +intel_timeline_create_from_engine(struct intel_engine_cs *engine, + unsigned int offset) +{ + return __intel_timeline_create(engine->gt, + engine->status_page.vma, + offset); +} + +static inline struct intel_timeline * +intel_timeline_get(struct intel_timeline *timeline) +{ + kref_get(&timeline->kref); + return timeline; +} + +void __intel_timeline_free(struct kref *kref); +static inline void intel_timeline_put(struct intel_timeline *timeline) +{ + kref_put(&timeline->kref, __intel_timeline_free); +} + +static inline int __intel_timeline_sync_set(struct intel_timeline *tl, + u64 context, u32 seqno) +{ + return i915_syncmap_set(&tl->sync, context, seqno); +} + +static inline int intel_timeline_sync_set(struct intel_timeline *tl, + const struct dma_fence *fence) +{ + return __intel_timeline_sync_set(tl, fence->context, fence->seqno); +} + +static inline bool __intel_timeline_sync_is_later(struct intel_timeline *tl, + u64 context, u32 seqno) +{ + return i915_syncmap_is_later(&tl->sync, context, seqno); +} + +static inline bool intel_timeline_sync_is_later(struct intel_timeline *tl, + const struct dma_fence *fence) +{ + return __intel_timeline_sync_is_later(tl, fence->context, fence->seqno); +} + +void __intel_timeline_pin(struct intel_timeline *tl); +int intel_timeline_pin(struct intel_timeline *tl, struct i915_gem_ww_ctx *ww); +void intel_timeline_enter(struct intel_timeline *tl); +int intel_timeline_get_seqno(struct intel_timeline *tl, + struct i915_request *rq, + u32 *seqno); +void intel_timeline_exit(struct intel_timeline *tl); +void intel_timeline_unpin(struct intel_timeline *tl); + +void intel_timeline_reset_seqno(const struct intel_timeline *tl); + +int intel_timeline_read_hwsp(struct i915_request *from, + struct i915_request *until, + u32 *hwsp_offset); + +void intel_gt_init_timelines(struct intel_gt *gt); +void intel_gt_fini_timelines(struct intel_gt *gt); + +#endif diff --git a/drivers/gpu/drm/i915/gt/intel_timeline_types.h b/drivers/gpu/drm/i915/gt/intel_timeline_types.h new file mode 100644 index 000000000..4474f487f --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_timeline_types.h @@ -0,0 +1,102 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2016 Intel Corporation + */ + +#ifndef __I915_TIMELINE_TYPES_H__ +#define __I915_TIMELINE_TYPES_H__ + +#include <linux/list.h> +#include <linux/kref.h> +#include <linux/mutex.h> +#include <linux/rcupdate.h> +#include <linux/types.h> + +#include "i915_active_types.h" + +struct i915_vma; +struct i915_syncmap; +struct intel_gt; +struct intel_timeline_hwsp; + +struct intel_timeline { + u64 fence_context; + u32 seqno; + + struct mutex mutex; /* protects the flow of requests */ + + /* + * pin_count and active_count track essentially the same thing: + * How many requests are in flight or may be under construction. + * + * We need two distinct counters so that we can assign different + * lifetimes to the events for different use-cases. For example, + * we want to permanently keep the timeline pinned for the kernel + * context so that we can issue requests at any time without having + * to acquire space in the GGTT. However, we want to keep tracking + * the activity (to be able to detect when we become idle) along that + * permanently pinned timeline and so end up requiring two counters. + * + * Note that the active_count is protected by the intel_timeline.mutex, + * but the pin_count is protected by a combination of serialisation + * from the intel_context caller plus internal atomicity. + */ + atomic_t pin_count; + atomic_t active_count; + + const u32 *hwsp_seqno; + struct i915_vma *hwsp_ggtt; + u32 hwsp_offset; + + struct intel_timeline_cacheline *hwsp_cacheline; + + bool has_initial_breadcrumb; + + /** + * List of breadcrumbs associated with GPU requests currently + * outstanding. + */ + struct list_head requests; + + /* + * Contains an RCU guarded pointer to the last request. No reference is + * held to the request, users must carefully acquire a reference to + * the request using i915_active_fence_get(), or manage the RCU + * protection themselves (cf the i915_active_fence API). + */ + struct i915_active_fence last_request; + + /** A chain of completed timelines ready for early retirement. */ + struct intel_timeline *retire; + + /** + * We track the most recent seqno that we wait on in every context so + * that we only have to emit a new await and dependency on a more + * recent sync point. As the contexts may be executed out-of-order, we + * have to track each individually and can not rely on an absolute + * global_seqno. When we know that all tracked fences are completed + * (i.e. when the driver is idle), we know that the syncmap is + * redundant and we can discard it without loss of generality. + */ + struct i915_syncmap *sync; + + struct list_head link; + struct intel_gt *gt; + + struct kref kref; + struct rcu_head rcu; +}; + +struct intel_timeline_cacheline { + struct i915_active active; + + struct intel_timeline_hwsp *hwsp; + void *vaddr; + + u32 ggtt_offset; + + struct rcu_head rcu; +}; + +#endif /* __I915_TIMELINE_TYPES_H__ */ diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c new file mode 100644 index 000000000..ae5cf2b55 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c @@ -0,0 +1,2204 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2014-2018 Intel Corporation + */ + +#include "i915_drv.h" +#include "intel_context.h" +#include "intel_engine_pm.h" +#include "intel_gt.h" +#include "intel_ring.h" +#include "intel_workarounds.h" + +/** + * DOC: Hardware workarounds + * + * This file is intended as a central place to implement most [1]_ of the + * required workarounds for hardware to work as originally intended. They fall + * in five basic categories depending on how/when they are applied: + * + * - Workarounds that touch registers that are saved/restored to/from the HW + * context image. The list is emitted (via Load Register Immediate commands) + * everytime a new context is created. + * - GT workarounds. The list of these WAs is applied whenever these registers + * revert to default values (on GPU reset, suspend/resume [2]_, etc..). + * - Display workarounds. The list is applied during display clock-gating + * initialization. + * - Workarounds that whitelist a privileged register, so that UMDs can manage + * them directly. This is just a special case of a MMMIO workaround (as we + * write the list of these to/be-whitelisted registers to some special HW + * registers). + * - Workaround batchbuffers, that get executed automatically by the hardware + * on every HW context restore. + * + * .. [1] Please notice that there are other WAs that, due to their nature, + * cannot be applied from a central place. Those are peppered around the rest + * of the code, as needed. + * + * .. [2] Technically, some registers are powercontext saved & restored, so they + * survive a suspend/resume. In practice, writing them again is not too + * costly and simplifies things. We can revisit this in the future. + * + * Layout + * ~~~~~~ + * + * Keep things in this file ordered by WA type, as per the above (context, GT, + * display, register whitelist, batchbuffer). Then, inside each type, keep the + * following order: + * + * - Infrastructure functions and macros + * - WAs per platform in standard gen/chrono order + * - Public functions to init or apply the given workaround type. + */ + +/* + * KBL revision ID ordering is bizarre; higher revision ID's map to lower + * steppings in some cases. So rather than test against the revision ID + * directly, let's map that into our own range of increasing ID's that we + * can test against in a regular manner. + */ + +const struct i915_rev_steppings kbl_revids[] = { + [0] = { .gt_stepping = KBL_REVID_A0, .disp_stepping = KBL_REVID_A0 }, + [1] = { .gt_stepping = KBL_REVID_B0, .disp_stepping = KBL_REVID_B0 }, + [2] = { .gt_stepping = KBL_REVID_C0, .disp_stepping = KBL_REVID_B0 }, + [3] = { .gt_stepping = KBL_REVID_D0, .disp_stepping = KBL_REVID_B0 }, + [4] = { .gt_stepping = KBL_REVID_F0, .disp_stepping = KBL_REVID_C0 }, + [5] = { .gt_stepping = KBL_REVID_C0, .disp_stepping = KBL_REVID_B1 }, + [6] = { .gt_stepping = KBL_REVID_D1, .disp_stepping = KBL_REVID_B1 }, + [7] = { .gt_stepping = KBL_REVID_G0, .disp_stepping = KBL_REVID_C0 }, +}; + +const struct i915_rev_steppings tgl_uy_revids[] = { + [0] = { .gt_stepping = TGL_REVID_A0, .disp_stepping = TGL_REVID_A0 }, + [1] = { .gt_stepping = TGL_REVID_B0, .disp_stepping = TGL_REVID_C0 }, + [2] = { .gt_stepping = TGL_REVID_B1, .disp_stepping = TGL_REVID_C0 }, + [3] = { .gt_stepping = TGL_REVID_C0, .disp_stepping = TGL_REVID_D0 }, +}; + +/* Same GT stepping between tgl_uy_revids and tgl_revids don't mean the same HW */ +const struct i915_rev_steppings tgl_revids[] = { + [0] = { .gt_stepping = TGL_REVID_A0, .disp_stepping = TGL_REVID_B0 }, + [1] = { .gt_stepping = TGL_REVID_B0, .disp_stepping = TGL_REVID_D0 }, +}; + +static void wa_init_start(struct i915_wa_list *wal, const char *name, const char *engine_name) +{ + wal->name = name; + wal->engine_name = engine_name; +} + +#define WA_LIST_CHUNK (1 << 4) + +static void wa_init_finish(struct i915_wa_list *wal) +{ + /* Trim unused entries. */ + if (!IS_ALIGNED(wal->count, WA_LIST_CHUNK)) { + struct i915_wa *list = kmemdup(wal->list, + wal->count * sizeof(*list), + GFP_KERNEL); + + if (list) { + kfree(wal->list); + wal->list = list; + } + } + + if (!wal->count) + return; + + DRM_DEBUG_DRIVER("Initialized %u %s workarounds on %s\n", + wal->wa_count, wal->name, wal->engine_name); +} + +static void _wa_add(struct i915_wa_list *wal, const struct i915_wa *wa) +{ + unsigned int addr = i915_mmio_reg_offset(wa->reg); + unsigned int start = 0, end = wal->count; + const unsigned int grow = WA_LIST_CHUNK; + struct i915_wa *wa_; + + GEM_BUG_ON(!is_power_of_2(grow)); + + if (IS_ALIGNED(wal->count, grow)) { /* Either uninitialized or full. */ + struct i915_wa *list; + + list = kmalloc_array(ALIGN(wal->count + 1, grow), sizeof(*wa), + GFP_KERNEL); + if (!list) { + DRM_ERROR("No space for workaround init!\n"); + return; + } + + if (wal->list) { + memcpy(list, wal->list, sizeof(*wa) * wal->count); + kfree(wal->list); + } + + wal->list = list; + } + + while (start < end) { + unsigned int mid = start + (end - start) / 2; + + if (i915_mmio_reg_offset(wal->list[mid].reg) < addr) { + start = mid + 1; + } else if (i915_mmio_reg_offset(wal->list[mid].reg) > addr) { + end = mid; + } else { + wa_ = &wal->list[mid]; + + if ((wa->clr | wa_->clr) && !(wa->clr & ~wa_->clr)) { + DRM_ERROR("Discarding overwritten w/a for reg %04x (clear: %08x, set: %08x)\n", + i915_mmio_reg_offset(wa_->reg), + wa_->clr, wa_->set); + + wa_->set &= ~wa->clr; + } + + wal->wa_count++; + wa_->set |= wa->set; + wa_->clr |= wa->clr; + wa_->read |= wa->read; + return; + } + } + + wal->wa_count++; + wa_ = &wal->list[wal->count++]; + *wa_ = *wa; + + while (wa_-- > wal->list) { + GEM_BUG_ON(i915_mmio_reg_offset(wa_[0].reg) == + i915_mmio_reg_offset(wa_[1].reg)); + if (i915_mmio_reg_offset(wa_[1].reg) > + i915_mmio_reg_offset(wa_[0].reg)) + break; + + swap(wa_[1], wa_[0]); + } +} + +static void wa_add(struct i915_wa_list *wal, i915_reg_t reg, + u32 clear, u32 set, u32 read_mask) +{ + struct i915_wa wa = { + .reg = reg, + .clr = clear, + .set = set, + .read = read_mask, + }; + + _wa_add(wal, &wa); +} + +static void +wa_write_masked_or(struct i915_wa_list *wal, i915_reg_t reg, u32 clear, u32 set) +{ + wa_add(wal, reg, clear, set, clear); +} + +static void +wa_write(struct i915_wa_list *wal, i915_reg_t reg, u32 set) +{ + wa_write_masked_or(wal, reg, ~0, set); +} + +static void +wa_write_or(struct i915_wa_list *wal, i915_reg_t reg, u32 set) +{ + wa_write_masked_or(wal, reg, set, set); +} + +static void +wa_write_clr(struct i915_wa_list *wal, i915_reg_t reg, u32 clr) +{ + wa_write_masked_or(wal, reg, clr, 0); +} + +static void +wa_masked_en(struct i915_wa_list *wal, i915_reg_t reg, u32 val) +{ + wa_add(wal, reg, 0, _MASKED_BIT_ENABLE(val), val); +} + +static void +wa_masked_dis(struct i915_wa_list *wal, i915_reg_t reg, u32 val) +{ + wa_add(wal, reg, 0, _MASKED_BIT_DISABLE(val), val); +} + +#define WA_SET_BIT_MASKED(addr, mask) \ + wa_masked_en(wal, (addr), (mask)) + +#define WA_CLR_BIT_MASKED(addr, mask) \ + wa_masked_dis(wal, (addr), (mask)) + +#define WA_SET_FIELD_MASKED(addr, mask, value) \ + wa_write_masked_or(wal, (addr), 0, _MASKED_FIELD((mask), (value))) + +static void gen6_ctx_workarounds_init(struct intel_engine_cs *engine, + struct i915_wa_list *wal) +{ + WA_SET_BIT_MASKED(INSTPM, INSTPM_FORCE_ORDERING); +} + +static void gen7_ctx_workarounds_init(struct intel_engine_cs *engine, + struct i915_wa_list *wal) +{ + WA_SET_BIT_MASKED(INSTPM, INSTPM_FORCE_ORDERING); +} + +static void gen8_ctx_workarounds_init(struct intel_engine_cs *engine, + struct i915_wa_list *wal) +{ + WA_SET_BIT_MASKED(INSTPM, INSTPM_FORCE_ORDERING); + + /* WaDisableAsyncFlipPerfMode:bdw,chv */ + WA_SET_BIT_MASKED(MI_MODE, ASYNC_FLIP_PERF_DISABLE); + + /* WaDisablePartialInstShootdown:bdw,chv */ + WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, + PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE); + + /* Use Force Non-Coherent whenever executing a 3D context. This is a + * workaround for for a possible hang in the unlikely event a TLB + * invalidation occurs during a PSD flush. + */ + /* WaForceEnableNonCoherent:bdw,chv */ + /* WaHdcDisableFetchWhenMasked:bdw,chv */ + WA_SET_BIT_MASKED(HDC_CHICKEN0, + HDC_DONOT_FETCH_MEM_WHEN_MASKED | + HDC_FORCE_NON_COHERENT); + + /* From the Haswell PRM, Command Reference: Registers, CACHE_MODE_0: + * "The Hierarchical Z RAW Stall Optimization allows non-overlapping + * polygons in the same 8x4 pixel/sample area to be processed without + * stalling waiting for the earlier ones to write to Hierarchical Z + * buffer." + * + * This optimization is off by default for BDW and CHV; turn it on. + */ + WA_CLR_BIT_MASKED(CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE); + + /* Wa4x4STCOptimizationDisable:bdw,chv */ + WA_SET_BIT_MASKED(CACHE_MODE_1, GEN8_4x4_STC_OPTIMIZATION_DISABLE); + + /* + * BSpec recommends 8x4 when MSAA is used, + * however in practice 16x4 seems fastest. + * + * Note that PS/WM thread counts depend on the WIZ hashing + * disable bit, which we don't touch here, but it's good + * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM). + */ + WA_SET_FIELD_MASKED(GEN7_GT_MODE, + GEN6_WIZ_HASHING_MASK, + GEN6_WIZ_HASHING_16x4); +} + +static void bdw_ctx_workarounds_init(struct intel_engine_cs *engine, + struct i915_wa_list *wal) +{ + struct drm_i915_private *i915 = engine->i915; + + gen8_ctx_workarounds_init(engine, wal); + + /* WaDisableThreadStallDopClockGating:bdw (pre-production) */ + WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE); + + /* WaDisableDopClockGating:bdw + * + * Also see the related UCGTCL1 write in bdw_init_clock_gating() + * to disable EUTC clock gating. + */ + WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2, + DOP_CLOCK_GATING_DISABLE); + + WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3, + GEN8_SAMPLER_POWER_BYPASS_DIS); + + WA_SET_BIT_MASKED(HDC_CHICKEN0, + /* WaForceContextSaveRestoreNonCoherent:bdw */ + HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT | + /* WaDisableFenceDestinationToSLM:bdw (pre-prod) */ + (IS_BDW_GT3(i915) ? HDC_FENCE_DEST_SLM_DISABLE : 0)); +} + +static void chv_ctx_workarounds_init(struct intel_engine_cs *engine, + struct i915_wa_list *wal) +{ + gen8_ctx_workarounds_init(engine, wal); + + /* WaDisableThreadStallDopClockGating:chv */ + WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE); + + /* Improve HiZ throughput on CHV. */ + WA_SET_BIT_MASKED(HIZ_CHICKEN, CHV_HZ_8X8_MODE_IN_1X); +} + +static void gen9_ctx_workarounds_init(struct intel_engine_cs *engine, + struct i915_wa_list *wal) +{ + struct drm_i915_private *i915 = engine->i915; + + if (HAS_LLC(i915)) { + /* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl + * + * Must match Display Engine. See + * WaCompressedResourceDisplayNewHashMode. + */ + WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2, + GEN9_PBE_COMPRESSED_HASH_SELECTION); + WA_SET_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN7, + GEN9_SAMPLER_HASH_COMPRESSED_READ_ADDR); + } + + /* WaClearFlowControlGpgpuContextSave:skl,bxt,kbl,glk,cfl */ + /* WaDisablePartialInstShootdown:skl,bxt,kbl,glk,cfl */ + WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, + FLOW_CONTROL_ENABLE | + PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE); + + /* WaEnableYV12BugFixInHalfSliceChicken7:skl,bxt,kbl,glk,cfl */ + /* WaEnableSamplerGPGPUPreemptionSupport:skl,bxt,kbl,cfl */ + WA_SET_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN7, + GEN9_ENABLE_YV12_BUGFIX | + GEN9_ENABLE_GPGPU_PREEMPTION); + + /* Wa4x4STCOptimizationDisable:skl,bxt,kbl,glk,cfl */ + /* WaDisablePartialResolveInVc:skl,bxt,kbl,cfl */ + WA_SET_BIT_MASKED(CACHE_MODE_1, + GEN8_4x4_STC_OPTIMIZATION_DISABLE | + GEN9_PARTIAL_RESOLVE_IN_VC_DISABLE); + + /* WaCcsTlbPrefetchDisable:skl,bxt,kbl,glk,cfl */ + WA_CLR_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN5, + GEN9_CCS_TLB_PREFETCH_ENABLE); + + /* WaForceContextSaveRestoreNonCoherent:skl,bxt,kbl,cfl */ + WA_SET_BIT_MASKED(HDC_CHICKEN0, + HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT | + HDC_FORCE_CSR_NON_COHERENT_OVR_DISABLE); + + /* WaForceEnableNonCoherent and WaDisableHDCInvalidation are + * both tied to WaForceContextSaveRestoreNonCoherent + * in some hsds for skl. We keep the tie for all gen9. The + * documentation is a bit hazy and so we want to get common behaviour, + * even though there is no clear evidence we would need both on kbl/bxt. + * This area has been source of system hangs so we play it safe + * and mimic the skl regardless of what bspec says. + * + * Use Force Non-Coherent whenever executing a 3D context. This + * is a workaround for a possible hang in the unlikely event + * a TLB invalidation occurs during a PSD flush. + */ + + /* WaForceEnableNonCoherent:skl,bxt,kbl,cfl */ + WA_SET_BIT_MASKED(HDC_CHICKEN0, + HDC_FORCE_NON_COHERENT); + + /* WaDisableSamplerPowerBypassForSOPingPong:skl,bxt,kbl,cfl */ + if (IS_SKYLAKE(i915) || + IS_KABYLAKE(i915) || + IS_COFFEELAKE(i915) || + IS_COMETLAKE(i915)) + WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3, + GEN8_SAMPLER_POWER_BYPASS_DIS); + + /* WaDisableSTUnitPowerOptimization:skl,bxt,kbl,glk,cfl */ + WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN2, GEN8_ST_PO_DISABLE); + + /* + * Supporting preemption with fine-granularity requires changes in the + * batch buffer programming. Since we can't break old userspace, we + * need to set our default preemption level to safe value. Userspace is + * still able to use more fine-grained preemption levels, since in + * WaEnablePreemptionGranularityControlByUMD we're whitelisting the + * per-ctx register. As such, WaDisable{3D,GPGPU}MidCmdPreemption are + * not real HW workarounds, but merely a way to start using preemption + * while maintaining old contract with userspace. + */ + + /* WaDisable3DMidCmdPreemption:skl,bxt,glk,cfl,[cnl] */ + WA_CLR_BIT_MASKED(GEN8_CS_CHICKEN1, GEN9_PREEMPT_3D_OBJECT_LEVEL); + + /* WaDisableGPGPUMidCmdPreemption:skl,bxt,blk,cfl,[cnl] */ + WA_SET_FIELD_MASKED(GEN8_CS_CHICKEN1, + GEN9_PREEMPT_GPGPU_LEVEL_MASK, + GEN9_PREEMPT_GPGPU_COMMAND_LEVEL); + + /* WaClearHIZ_WM_CHICKEN3:bxt,glk */ + if (IS_GEN9_LP(i915)) + WA_SET_BIT_MASKED(GEN9_WM_CHICKEN3, GEN9_FACTOR_IN_CLR_VAL_HIZ); +} + +static void skl_tune_iz_hashing(struct intel_engine_cs *engine, + struct i915_wa_list *wal) +{ + struct intel_gt *gt = engine->gt; + u8 vals[3] = { 0, 0, 0 }; + unsigned int i; + + for (i = 0; i < 3; i++) { + u8 ss; + + /* + * Only consider slices where one, and only one, subslice has 7 + * EUs + */ + if (!is_power_of_2(gt->info.sseu.subslice_7eu[i])) + continue; + + /* + * subslice_7eu[i] != 0 (because of the check above) and + * ss_max == 4 (maximum number of subslices possible per slice) + * + * -> 0 <= ss <= 3; + */ + ss = ffs(gt->info.sseu.subslice_7eu[i]) - 1; + vals[i] = 3 - ss; + } + + if (vals[0] == 0 && vals[1] == 0 && vals[2] == 0) + return; + + /* Tune IZ hashing. See intel_device_info_runtime_init() */ + WA_SET_FIELD_MASKED(GEN7_GT_MODE, + GEN9_IZ_HASHING_MASK(2) | + GEN9_IZ_HASHING_MASK(1) | + GEN9_IZ_HASHING_MASK(0), + GEN9_IZ_HASHING(2, vals[2]) | + GEN9_IZ_HASHING(1, vals[1]) | + GEN9_IZ_HASHING(0, vals[0])); +} + +static void skl_ctx_workarounds_init(struct intel_engine_cs *engine, + struct i915_wa_list *wal) +{ + gen9_ctx_workarounds_init(engine, wal); + skl_tune_iz_hashing(engine, wal); +} + +static void bxt_ctx_workarounds_init(struct intel_engine_cs *engine, + struct i915_wa_list *wal) +{ + gen9_ctx_workarounds_init(engine, wal); + + /* WaDisableThreadStallDopClockGating:bxt */ + WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, + STALL_DOP_GATING_DISABLE); + + /* WaToEnableHwFixForPushConstHWBug:bxt */ + WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2, + GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION); +} + +static void kbl_ctx_workarounds_init(struct intel_engine_cs *engine, + struct i915_wa_list *wal) +{ + struct drm_i915_private *i915 = engine->i915; + + gen9_ctx_workarounds_init(engine, wal); + + /* WaToEnableHwFixForPushConstHWBug:kbl */ + if (IS_KBL_GT_REVID(i915, KBL_REVID_C0, REVID_FOREVER)) + WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2, + GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION); + + /* WaDisableSbeCacheDispatchPortSharing:kbl */ + WA_SET_BIT_MASKED(GEN7_HALF_SLICE_CHICKEN1, + GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE); +} + +static void glk_ctx_workarounds_init(struct intel_engine_cs *engine, + struct i915_wa_list *wal) +{ + gen9_ctx_workarounds_init(engine, wal); + + /* WaToEnableHwFixForPushConstHWBug:glk */ + WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2, + GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION); +} + +static void cfl_ctx_workarounds_init(struct intel_engine_cs *engine, + struct i915_wa_list *wal) +{ + gen9_ctx_workarounds_init(engine, wal); + + /* WaToEnableHwFixForPushConstHWBug:cfl */ + WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2, + GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION); + + /* WaDisableSbeCacheDispatchPortSharing:cfl */ + WA_SET_BIT_MASKED(GEN7_HALF_SLICE_CHICKEN1, + GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE); +} + +static void cnl_ctx_workarounds_init(struct intel_engine_cs *engine, + struct i915_wa_list *wal) +{ + /* WaForceContextSaveRestoreNonCoherent:cnl */ + WA_SET_BIT_MASKED(CNL_HDC_CHICKEN0, + HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT); + + /* WaDisableReplayBufferBankArbitrationOptimization:cnl */ + WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2, + GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION); + + /* WaPushConstantDereferenceHoldDisable:cnl */ + WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2, PUSH_CONSTANT_DEREF_DISABLE); + + /* FtrEnableFastAnisoL1BankingFix:cnl */ + WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3, CNL_FAST_ANISO_L1_BANKING_FIX); + + /* WaDisable3DMidCmdPreemption:cnl */ + WA_CLR_BIT_MASKED(GEN8_CS_CHICKEN1, GEN9_PREEMPT_3D_OBJECT_LEVEL); + + /* WaDisableGPGPUMidCmdPreemption:cnl */ + WA_SET_FIELD_MASKED(GEN8_CS_CHICKEN1, + GEN9_PREEMPT_GPGPU_LEVEL_MASK, + GEN9_PREEMPT_GPGPU_COMMAND_LEVEL); + + /* WaDisableEarlyEOT:cnl */ + WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, DISABLE_EARLY_EOT); +} + +static void icl_ctx_workarounds_init(struct intel_engine_cs *engine, + struct i915_wa_list *wal) +{ + struct drm_i915_private *i915 = engine->i915; + + /* WaDisableBankHangMode:icl */ + wa_write(wal, + GEN8_L3CNTLREG, + intel_uncore_read(engine->uncore, GEN8_L3CNTLREG) | + GEN8_ERRDETBCTRL); + + /* Wa_1604370585:icl (pre-prod) + * Formerly known as WaPushConstantDereferenceHoldDisable + */ + if (IS_ICL_REVID(i915, ICL_REVID_A0, ICL_REVID_B0)) + WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2, + PUSH_CONSTANT_DEREF_DISABLE); + + /* WaForceEnableNonCoherent:icl + * This is not the same workaround as in early Gen9 platforms, where + * lacking this could cause system hangs, but coherency performance + * overhead is high and only a few compute workloads really need it + * (the register is whitelisted in hardware now, so UMDs can opt in + * for coherency if they have a good reason). + */ + WA_SET_BIT_MASKED(ICL_HDC_MODE, HDC_FORCE_NON_COHERENT); + + /* Wa_2006611047:icl (pre-prod) + * Formerly known as WaDisableImprovedTdlClkGating + */ + if (IS_ICL_REVID(i915, ICL_REVID_A0, ICL_REVID_A0)) + WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2, + GEN11_TDL_CLOCK_GATING_FIX_DISABLE); + + /* Wa_2006665173:icl (pre-prod) */ + if (IS_ICL_REVID(i915, ICL_REVID_A0, ICL_REVID_A0)) + WA_SET_BIT_MASKED(GEN11_COMMON_SLICE_CHICKEN3, + GEN11_BLEND_EMB_FIX_DISABLE_IN_RCC); + + /* WaEnableFloatBlendOptimization:icl */ + wa_write_masked_or(wal, + GEN10_CACHE_MODE_SS, + 0, /* write-only, so skip validation */ + _MASKED_BIT_ENABLE(FLOAT_BLEND_OPTIMIZATION_ENABLE)); + + /* WaDisableGPGPUMidThreadPreemption:icl */ + WA_SET_FIELD_MASKED(GEN8_CS_CHICKEN1, + GEN9_PREEMPT_GPGPU_LEVEL_MASK, + GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL); + + /* allow headerless messages for preemptible GPGPU context */ + WA_SET_BIT_MASKED(GEN10_SAMPLER_MODE, + GEN11_SAMPLER_ENABLE_HEADLESS_MSG); + + /* Wa_1604278689:icl,ehl */ + wa_write(wal, IVB_FBC_RT_BASE, 0xFFFFFFFF & ~ILK_FBC_RT_VALID); + wa_write_masked_or(wal, IVB_FBC_RT_BASE_UPPER, + 0, /* write-only register; skip validation */ + 0xFFFFFFFF); + + /* Wa_1406306137:icl,ehl */ + wa_masked_en(wal, GEN9_ROW_CHICKEN4, GEN11_DIS_PICK_2ND_EU); +} + +static void gen12_ctx_workarounds_init(struct intel_engine_cs *engine, + struct i915_wa_list *wal) +{ + /* + * Wa_1409142259:tgl + * Wa_1409347922:tgl + * Wa_1409252684:tgl + * Wa_1409217633:tgl + * Wa_1409207793:tgl + * Wa_1409178076:tgl + * Wa_1408979724:tgl + * Wa_14010443199:rkl + * Wa_14010698770:rkl + */ + WA_SET_BIT_MASKED(GEN11_COMMON_SLICE_CHICKEN3, + GEN12_DISABLE_CPS_AWARE_COLOR_PIPE); + + /* WaDisableGPGPUMidThreadPreemption:gen12 */ + WA_SET_FIELD_MASKED(GEN8_CS_CHICKEN1, + GEN9_PREEMPT_GPGPU_LEVEL_MASK, + GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL); +} + +static void tgl_ctx_workarounds_init(struct intel_engine_cs *engine, + struct i915_wa_list *wal) +{ + gen12_ctx_workarounds_init(engine, wal); + + /* + * Wa_1604555607:tgl,rkl + * + * Note that the implementation of this workaround is further modified + * according to the FF_MODE2 guidance given by Wa_1608008084:gen12. + * FF_MODE2 register will return the wrong value when read. The default + * value for this register is zero for all fields and there are no bit + * masks. So instead of doing a RMW we should just write the GS Timer + * and TDS timer values for Wa_1604555607 and Wa_16011163337. + */ + wa_add(wal, + FF_MODE2, + FF_MODE2_GS_TIMER_MASK | FF_MODE2_TDS_TIMER_MASK, + FF_MODE2_GS_TIMER_224 | FF_MODE2_TDS_TIMER_128, + 0); +} + +static void +__intel_engine_init_ctx_wa(struct intel_engine_cs *engine, + struct i915_wa_list *wal, + const char *name) +{ + struct drm_i915_private *i915 = engine->i915; + + if (engine->class != RENDER_CLASS) + return; + + wa_init_start(wal, name, engine->name); + + if (IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) + tgl_ctx_workarounds_init(engine, wal); + else if (IS_GEN(i915, 12)) + gen12_ctx_workarounds_init(engine, wal); + else if (IS_GEN(i915, 11)) + icl_ctx_workarounds_init(engine, wal); + else if (IS_CANNONLAKE(i915)) + cnl_ctx_workarounds_init(engine, wal); + else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915)) + cfl_ctx_workarounds_init(engine, wal); + else if (IS_GEMINILAKE(i915)) + glk_ctx_workarounds_init(engine, wal); + else if (IS_KABYLAKE(i915)) + kbl_ctx_workarounds_init(engine, wal); + else if (IS_BROXTON(i915)) + bxt_ctx_workarounds_init(engine, wal); + else if (IS_SKYLAKE(i915)) + skl_ctx_workarounds_init(engine, wal); + else if (IS_CHERRYVIEW(i915)) + chv_ctx_workarounds_init(engine, wal); + else if (IS_BROADWELL(i915)) + bdw_ctx_workarounds_init(engine, wal); + else if (IS_GEN(i915, 7)) + gen7_ctx_workarounds_init(engine, wal); + else if (IS_GEN(i915, 6)) + gen6_ctx_workarounds_init(engine, wal); + else if (INTEL_GEN(i915) < 8) + return; + else + MISSING_CASE(INTEL_GEN(i915)); + + wa_init_finish(wal); +} + +void intel_engine_init_ctx_wa(struct intel_engine_cs *engine) +{ + __intel_engine_init_ctx_wa(engine, &engine->ctx_wa_list, "context"); +} + +int intel_engine_emit_ctx_wa(struct i915_request *rq) +{ + struct i915_wa_list *wal = &rq->engine->ctx_wa_list; + struct i915_wa *wa; + unsigned int i; + u32 *cs; + int ret; + + if (wal->count == 0) + return 0; + + ret = rq->engine->emit_flush(rq, EMIT_BARRIER); + if (ret) + return ret; + + cs = intel_ring_begin(rq, (wal->count * 2 + 2)); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = MI_LOAD_REGISTER_IMM(wal->count); + for (i = 0, wa = wal->list; i < wal->count; i++, wa++) { + *cs++ = i915_mmio_reg_offset(wa->reg); + *cs++ = wa->set; + } + *cs++ = MI_NOOP; + + intel_ring_advance(rq, cs); + + ret = rq->engine->emit_flush(rq, EMIT_BARRIER); + if (ret) + return ret; + + return 0; +} + +static void +gen4_gt_workarounds_init(struct drm_i915_private *i915, + struct i915_wa_list *wal) +{ + /* WaDisable_RenderCache_OperationalFlush:gen4,ilk */ + wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE); +} + +static void +g4x_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) +{ + gen4_gt_workarounds_init(i915, wal); + + /* WaDisableRenderCachePipelinedFlush:g4x,ilk */ + wa_masked_en(wal, CACHE_MODE_0, CM0_PIPELINED_RENDER_FLUSH_DISABLE); +} + +static void +ilk_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) +{ + g4x_gt_workarounds_init(i915, wal); + + wa_masked_en(wal, _3D_CHICKEN2, _3D_CHICKEN2_WM_READ_PIPELINED); +} + +static void +snb_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) +{ + /* WaDisableHiZPlanesWhenMSAAEnabled:snb */ + wa_masked_en(wal, + _3D_CHICKEN, + _3D_CHICKEN_HIZ_PLANE_DISABLE_MSAA_4X_SNB); + + /* WaDisable_RenderCache_OperationalFlush:snb */ + wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE); + + /* + * BSpec recommends 8x4 when MSAA is used, + * however in practice 16x4 seems fastest. + * + * Note that PS/WM thread counts depend on the WIZ hashing + * disable bit, which we don't touch here, but it's good + * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM). + */ + wa_add(wal, + GEN6_GT_MODE, 0, + _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4), + GEN6_WIZ_HASHING_16x4); + + wa_masked_dis(wal, CACHE_MODE_0, CM0_STC_EVICT_DISABLE_LRA_SNB); + + wa_masked_en(wal, + _3D_CHICKEN3, + /* WaStripsFansDisableFastClipPerformanceFix:snb */ + _3D_CHICKEN3_SF_DISABLE_FASTCLIP_CULL | + /* + * Bspec says: + * "This bit must be set if 3DSTATE_CLIP clip mode is set + * to normal and 3DSTATE_SF number of SF output attributes + * is more than 16." + */ + _3D_CHICKEN3_SF_DISABLE_PIPELINED_ATTR_FETCH); +} + +static void +ivb_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) +{ + /* WaDisableEarlyCull:ivb */ + wa_masked_en(wal, _3D_CHICKEN3, _3D_CHICKEN_SF_DISABLE_OBJEND_CULL); + + /* WaDisablePSDDualDispatchEnable:ivb */ + if (IS_IVB_GT1(i915)) + wa_masked_en(wal, + GEN7_HALF_SLICE_CHICKEN1, + GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE); + + /* WaDisable_RenderCache_OperationalFlush:ivb */ + wa_masked_dis(wal, CACHE_MODE_0_GEN7, RC_OP_FLUSH_ENABLE); + + /* Apply the WaDisableRHWOOptimizationForRenderHang:ivb workaround. */ + wa_masked_dis(wal, + GEN7_COMMON_SLICE_CHICKEN1, + GEN7_CSC1_RHWO_OPT_DISABLE_IN_RCC); + + /* WaApplyL3ControlAndL3ChickenMode:ivb */ + wa_write(wal, GEN7_L3CNTLREG1, GEN7_WA_FOR_GEN7_L3_CONTROL); + wa_write(wal, GEN7_L3_CHICKEN_MODE_REGISTER, GEN7_WA_L3_CHICKEN_MODE); + + /* WaForceL3Serialization:ivb */ + wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE); + + /* + * WaVSThreadDispatchOverride:ivb,vlv + * + * This actually overrides the dispatch + * mode for all thread types. + */ + wa_write_masked_or(wal, GEN7_FF_THREAD_MODE, + GEN7_FF_SCHED_MASK, + GEN7_FF_TS_SCHED_HW | + GEN7_FF_VS_SCHED_HW | + GEN7_FF_DS_SCHED_HW); + + if (0) { /* causes HiZ corruption on ivb:gt1 */ + /* enable HiZ Raw Stall Optimization */ + wa_masked_dis(wal, CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE); + } + + /* WaDisable4x2SubspanOptimization:ivb */ + wa_masked_en(wal, CACHE_MODE_1, PIXEL_SUBSPAN_COLLECT_OPT_DISABLE); + + /* + * BSpec recommends 8x4 when MSAA is used, + * however in practice 16x4 seems fastest. + * + * Note that PS/WM thread counts depend on the WIZ hashing + * disable bit, which we don't touch here, but it's good + * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM). + */ + wa_add(wal, GEN7_GT_MODE, 0, + _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4), + GEN6_WIZ_HASHING_16x4); +} + +static void +vlv_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) +{ + /* WaDisableEarlyCull:vlv */ + wa_masked_en(wal, _3D_CHICKEN3, _3D_CHICKEN_SF_DISABLE_OBJEND_CULL); + + /* WaPsdDispatchEnable:vlv */ + /* WaDisablePSDDualDispatchEnable:vlv */ + wa_masked_en(wal, + GEN7_HALF_SLICE_CHICKEN1, + GEN7_MAX_PS_THREAD_DEP | + GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE); + + /* WaDisable_RenderCache_OperationalFlush:vlv */ + wa_masked_dis(wal, CACHE_MODE_0_GEN7, RC_OP_FLUSH_ENABLE); + + /* WaForceL3Serialization:vlv */ + wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE); + + /* + * WaVSThreadDispatchOverride:ivb,vlv + * + * This actually overrides the dispatch + * mode for all thread types. + */ + wa_write_masked_or(wal, + GEN7_FF_THREAD_MODE, + GEN7_FF_SCHED_MASK, + GEN7_FF_TS_SCHED_HW | + GEN7_FF_VS_SCHED_HW | + GEN7_FF_DS_SCHED_HW); + + /* + * BSpec says this must be set, even though + * WaDisable4x2SubspanOptimization isn't listed for VLV. + */ + wa_masked_en(wal, CACHE_MODE_1, PIXEL_SUBSPAN_COLLECT_OPT_DISABLE); + + /* + * BSpec recommends 8x4 when MSAA is used, + * however in practice 16x4 seems fastest. + * + * Note that PS/WM thread counts depend on the WIZ hashing + * disable bit, which we don't touch here, but it's good + * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM). + */ + wa_add(wal, GEN7_GT_MODE, 0, + _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4), + GEN6_WIZ_HASHING_16x4); + + /* + * WaIncreaseL3CreditsForVLVB0:vlv + * This is the hardware default actually. + */ + wa_write(wal, GEN7_L3SQCREG1, VLV_B0_WA_L3SQCREG1_VALUE); +} + +static void +hsw_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) +{ + /* L3 caching of data atomics doesn't work -- disable it. */ + wa_write(wal, HSW_SCRATCH1, HSW_SCRATCH1_L3_DATA_ATOMICS_DISABLE); + + wa_add(wal, + HSW_ROW_CHICKEN3, 0, + _MASKED_BIT_ENABLE(HSW_ROW_CHICKEN3_L3_GLOBAL_ATOMICS_DISABLE), + 0 /* XXX does this reg exist? */); + + /* WaVSRefCountFullforceMissDisable:hsw */ + wa_write_clr(wal, GEN7_FF_THREAD_MODE, GEN7_FF_VS_REF_CNT_FFME); + + wa_masked_dis(wal, + CACHE_MODE_0_GEN7, + /* WaDisable_RenderCache_OperationalFlush:hsw */ + RC_OP_FLUSH_ENABLE | + /* enable HiZ Raw Stall Optimization */ + HIZ_RAW_STALL_OPT_DISABLE); + + /* WaDisable4x2SubspanOptimization:hsw */ + wa_masked_en(wal, CACHE_MODE_1, PIXEL_SUBSPAN_COLLECT_OPT_DISABLE); + + /* + * BSpec recommends 8x4 when MSAA is used, + * however in practice 16x4 seems fastest. + * + * Note that PS/WM thread counts depend on the WIZ hashing + * disable bit, which we don't touch here, but it's good + * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM). + */ + wa_add(wal, GEN7_GT_MODE, 0, + _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4), + GEN6_WIZ_HASHING_16x4); + + /* WaSampleCChickenBitEnable:hsw */ + wa_masked_en(wal, HALF_SLICE_CHICKEN3, HSW_SAMPLE_C_PERFORMANCE); +} + +static void +gen9_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) +{ + /* WaDisableKillLogic:bxt,skl,kbl */ + if (!IS_COFFEELAKE(i915) && !IS_COMETLAKE(i915)) + wa_write_or(wal, + GAM_ECOCHK, + ECOCHK_DIS_TLB); + + if (HAS_LLC(i915)) { + /* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl + * + * Must match Display Engine. See + * WaCompressedResourceDisplayNewHashMode. + */ + wa_write_or(wal, + MMCD_MISC_CTRL, + MMCD_PCLA | MMCD_HOTSPOT_EN); + } + + /* WaDisableHDCInvalidation:skl,bxt,kbl,cfl */ + wa_write_or(wal, + GAM_ECOCHK, + BDW_DISABLE_HDC_INVALIDATION); +} + +static void +skl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) +{ + gen9_gt_workarounds_init(i915, wal); + + /* WaDisableGafsUnitClkGating:skl */ + wa_write_or(wal, + GEN7_UCGCTL4, + GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE); + + /* WaInPlaceDecompressionHang:skl */ + if (IS_SKL_REVID(i915, SKL_REVID_H0, REVID_FOREVER)) + wa_write_or(wal, + GEN9_GAMT_ECO_REG_RW_IA, + GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS); +} + +static void +bxt_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) +{ + gen9_gt_workarounds_init(i915, wal); + + /* WaInPlaceDecompressionHang:bxt */ + wa_write_or(wal, + GEN9_GAMT_ECO_REG_RW_IA, + GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS); +} + +static void +kbl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) +{ + gen9_gt_workarounds_init(i915, wal); + + /* WaDisableDynamicCreditSharing:kbl */ + if (IS_KBL_GT_REVID(i915, 0, KBL_REVID_B0)) + wa_write_or(wal, + GAMT_CHKN_BIT_REG, + GAMT_CHKN_DISABLE_DYNAMIC_CREDIT_SHARING); + + /* WaDisableGafsUnitClkGating:kbl */ + wa_write_or(wal, + GEN7_UCGCTL4, + GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE); + + /* WaInPlaceDecompressionHang:kbl */ + wa_write_or(wal, + GEN9_GAMT_ECO_REG_RW_IA, + GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS); +} + +static void +glk_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) +{ + gen9_gt_workarounds_init(i915, wal); +} + +static void +cfl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) +{ + gen9_gt_workarounds_init(i915, wal); + + /* WaDisableGafsUnitClkGating:cfl */ + wa_write_or(wal, + GEN7_UCGCTL4, + GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE); + + /* WaInPlaceDecompressionHang:cfl */ + wa_write_or(wal, + GEN9_GAMT_ECO_REG_RW_IA, + GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS); +} + +static void +wa_init_mcr(struct drm_i915_private *i915, struct i915_wa_list *wal) +{ + const struct sseu_dev_info *sseu = &i915->gt.info.sseu; + unsigned int slice, subslice; + u32 l3_en, mcr, mcr_mask; + + GEM_BUG_ON(INTEL_GEN(i915) < 10); + + /* + * WaProgramMgsrForL3BankSpecificMmioReads: cnl,icl + * L3Banks could be fused off in single slice scenario. If that is + * the case, we might need to program MCR select to a valid L3Bank + * by default, to make sure we correctly read certain registers + * later on (in the range 0xB100 - 0xB3FF). + * + * WaProgramMgsrForCorrectSliceSpecificMmioReads:cnl,icl + * Before any MMIO read into slice/subslice specific registers, MCR + * packet control register needs to be programmed to point to any + * enabled s/ss pair. Otherwise, incorrect values will be returned. + * This means each subsequent MMIO read will be forwarded to an + * specific s/ss combination, but this is OK since these registers + * are consistent across s/ss in almost all cases. In the rare + * occasions, such as INSTDONE, where this value is dependent + * on s/ss combo, the read should be done with read_subslice_reg. + * + * Since GEN8_MCR_SELECTOR contains dual-purpose bits which select both + * to which subslice, or to which L3 bank, the respective mmio reads + * will go, we have to find a common index which works for both + * accesses. + * + * Case where we cannot find a common index fortunately should not + * happen in production hardware, so we only emit a warning instead of + * implementing something more complex that requires checking the range + * of every MMIO read. + */ + + if (INTEL_GEN(i915) >= 10 && is_power_of_2(sseu->slice_mask)) { + u32 l3_fuse = + intel_uncore_read(&i915->uncore, GEN10_MIRROR_FUSE3) & + GEN10_L3BANK_MASK; + + drm_dbg(&i915->drm, "L3 fuse = %x\n", l3_fuse); + l3_en = ~(l3_fuse << GEN10_L3BANK_PAIR_COUNT | l3_fuse); + } else { + l3_en = ~0; + } + + slice = fls(sseu->slice_mask) - 1; + subslice = fls(l3_en & intel_sseu_get_subslices(sseu, slice)); + if (!subslice) { + drm_warn(&i915->drm, + "No common index found between subslice mask %x and L3 bank mask %x!\n", + intel_sseu_get_subslices(sseu, slice), l3_en); + subslice = fls(l3_en); + drm_WARN_ON(&i915->drm, !subslice); + } + subslice--; + + if (INTEL_GEN(i915) >= 11) { + mcr = GEN11_MCR_SLICE(slice) | GEN11_MCR_SUBSLICE(subslice); + mcr_mask = GEN11_MCR_SLICE_MASK | GEN11_MCR_SUBSLICE_MASK; + } else { + mcr = GEN8_MCR_SLICE(slice) | GEN8_MCR_SUBSLICE(subslice); + mcr_mask = GEN8_MCR_SLICE_MASK | GEN8_MCR_SUBSLICE_MASK; + } + + drm_dbg(&i915->drm, "MCR slice/subslice = %x\n", mcr); + + wa_write_masked_or(wal, GEN8_MCR_SELECTOR, mcr_mask, mcr); +} + +static void +cnl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) +{ + wa_init_mcr(i915, wal); + + /* WaInPlaceDecompressionHang:cnl */ + wa_write_or(wal, + GEN9_GAMT_ECO_REG_RW_IA, + GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS); +} + +static void +icl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) +{ + wa_init_mcr(i915, wal); + + /* WaInPlaceDecompressionHang:icl */ + wa_write_or(wal, + GEN9_GAMT_ECO_REG_RW_IA, + GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS); + + /* WaModifyGamTlbPartitioning:icl */ + wa_write_masked_or(wal, + GEN11_GACB_PERF_CTRL, + GEN11_HASH_CTRL_MASK, + GEN11_HASH_CTRL_BIT0 | GEN11_HASH_CTRL_BIT4); + + /* Wa_1405766107:icl + * Formerly known as WaCL2SFHalfMaxAlloc + */ + wa_write_or(wal, + GEN11_LSN_UNSLCVC, + GEN11_LSN_UNSLCVC_GAFS_HALF_SF_MAXALLOC | + GEN11_LSN_UNSLCVC_GAFS_HALF_CL2_MAXALLOC); + + /* Wa_220166154:icl + * Formerly known as WaDisCtxReload + */ + wa_write_or(wal, + GEN8_GAMW_ECO_DEV_RW_IA, + GAMW_ECO_DEV_CTX_RELOAD_DISABLE); + + /* Wa_1405779004:icl (pre-prod) */ + if (IS_ICL_REVID(i915, ICL_REVID_A0, ICL_REVID_A0)) + wa_write_or(wal, + SLICE_UNIT_LEVEL_CLKGATE, + MSCUNIT_CLKGATE_DIS); + + /* Wa_1406838659:icl (pre-prod) */ + if (IS_ICL_REVID(i915, ICL_REVID_A0, ICL_REVID_B0)) + wa_write_or(wal, + INF_UNIT_LEVEL_CLKGATE, + CGPSF_CLKGATE_DIS); + + /* Wa_1406463099:icl + * Formerly known as WaGamTlbPendError + */ + wa_write_or(wal, + GAMT_CHKN_BIT_REG, + GAMT_CHKN_DISABLE_L3_COH_PIPE); + + /* + * Wa_1408615072:icl,ehl (vsunit) + * Wa_1407596294:icl,ehl (hsunit) + */ + wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE, + VSUNIT_CLKGATE_DIS | HSUNIT_CLKGATE_DIS); + + /* Wa_1407352427:icl,ehl */ + wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2, + PSDUNIT_CLKGATE_DIS); + + /* Wa_1406680159:icl,ehl */ + wa_write_or(wal, + SUBSLICE_UNIT_LEVEL_CLKGATE, + GWUNIT_CLKGATE_DIS); + + /* Wa_1607087056:icl,ehl,jsl */ + if (IS_ICELAKE(i915) || + IS_EHL_REVID(i915, EHL_REVID_A0, EHL_REVID_A0)) { + wa_write_or(wal, + SLICE_UNIT_LEVEL_CLKGATE, + L3_CLKGATE_DIS | L3_CR2X_CLKGATE_DIS); + } +} + +static void +gen12_gt_workarounds_init(struct drm_i915_private *i915, + struct i915_wa_list *wal) +{ + wa_init_mcr(i915, wal); +} + +static void +tgl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) +{ + gen12_gt_workarounds_init(i915, wal); + + /* Wa_1409420604:tgl */ + if (IS_TGL_UY_GT_REVID(i915, TGL_REVID_A0, TGL_REVID_A0)) + wa_write_or(wal, + SUBSLICE_UNIT_LEVEL_CLKGATE2, + CPSSUNIT_CLKGATE_DIS); + + /* Wa_1607087056:tgl also know as BUG:1409180338 */ + if (IS_TGL_UY_GT_REVID(i915, TGL_REVID_A0, TGL_REVID_A0)) + wa_write_or(wal, + SLICE_UNIT_LEVEL_CLKGATE, + L3_CLKGATE_DIS | L3_CR2X_CLKGATE_DIS); +} + +static void +gt_init_workarounds(struct drm_i915_private *i915, struct i915_wa_list *wal) +{ + if (IS_TIGERLAKE(i915)) + tgl_gt_workarounds_init(i915, wal); + else if (IS_GEN(i915, 12)) + gen12_gt_workarounds_init(i915, wal); + else if (IS_GEN(i915, 11)) + icl_gt_workarounds_init(i915, wal); + else if (IS_CANNONLAKE(i915)) + cnl_gt_workarounds_init(i915, wal); + else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915)) + cfl_gt_workarounds_init(i915, wal); + else if (IS_GEMINILAKE(i915)) + glk_gt_workarounds_init(i915, wal); + else if (IS_KABYLAKE(i915)) + kbl_gt_workarounds_init(i915, wal); + else if (IS_BROXTON(i915)) + bxt_gt_workarounds_init(i915, wal); + else if (IS_SKYLAKE(i915)) + skl_gt_workarounds_init(i915, wal); + else if (IS_HASWELL(i915)) + hsw_gt_workarounds_init(i915, wal); + else if (IS_VALLEYVIEW(i915)) + vlv_gt_workarounds_init(i915, wal); + else if (IS_IVYBRIDGE(i915)) + ivb_gt_workarounds_init(i915, wal); + else if (IS_GEN(i915, 6)) + snb_gt_workarounds_init(i915, wal); + else if (IS_GEN(i915, 5)) + ilk_gt_workarounds_init(i915, wal); + else if (IS_G4X(i915)) + g4x_gt_workarounds_init(i915, wal); + else if (IS_GEN(i915, 4)) + gen4_gt_workarounds_init(i915, wal); + else if (INTEL_GEN(i915) <= 8) + return; + else + MISSING_CASE(INTEL_GEN(i915)); +} + +void intel_gt_init_workarounds(struct drm_i915_private *i915) +{ + struct i915_wa_list *wal = &i915->gt_wa_list; + + wa_init_start(wal, "GT", "global"); + gt_init_workarounds(i915, wal); + wa_init_finish(wal); +} + +static enum forcewake_domains +wal_get_fw_for_rmw(struct intel_uncore *uncore, const struct i915_wa_list *wal) +{ + enum forcewake_domains fw = 0; + struct i915_wa *wa; + unsigned int i; + + for (i = 0, wa = wal->list; i < wal->count; i++, wa++) + fw |= intel_uncore_forcewake_for_reg(uncore, + wa->reg, + FW_REG_READ | + FW_REG_WRITE); + + return fw; +} + +static bool +wa_verify(const struct i915_wa *wa, u32 cur, const char *name, const char *from) +{ + if ((cur ^ wa->set) & wa->read) { + DRM_ERROR("%s workaround lost on %s! (%x=%x/%x, expected %x)\n", + name, from, i915_mmio_reg_offset(wa->reg), + cur, cur & wa->read, wa->set); + + return false; + } + + return true; +} + +static void +wa_list_apply(struct intel_uncore *uncore, const struct i915_wa_list *wal) +{ + enum forcewake_domains fw; + unsigned long flags; + struct i915_wa *wa; + unsigned int i; + + if (!wal->count) + return; + + fw = wal_get_fw_for_rmw(uncore, wal); + + spin_lock_irqsave(&uncore->lock, flags); + intel_uncore_forcewake_get__locked(uncore, fw); + + for (i = 0, wa = wal->list; i < wal->count; i++, wa++) { + if (wa->clr) + intel_uncore_rmw_fw(uncore, wa->reg, wa->clr, wa->set); + else + intel_uncore_write_fw(uncore, wa->reg, wa->set); + if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) + wa_verify(wa, + intel_uncore_read_fw(uncore, wa->reg), + wal->name, "application"); + } + + intel_uncore_forcewake_put__locked(uncore, fw); + spin_unlock_irqrestore(&uncore->lock, flags); +} + +void intel_gt_apply_workarounds(struct intel_gt *gt) +{ + wa_list_apply(gt->uncore, >->i915->gt_wa_list); +} + +static bool wa_list_verify(struct intel_uncore *uncore, + const struct i915_wa_list *wal, + const char *from) +{ + struct i915_wa *wa; + unsigned int i; + bool ok = true; + + for (i = 0, wa = wal->list; i < wal->count; i++, wa++) + ok &= wa_verify(wa, + intel_uncore_read(uncore, wa->reg), + wal->name, from); + + return ok; +} + +bool intel_gt_verify_workarounds(struct intel_gt *gt, const char *from) +{ + return wa_list_verify(gt->uncore, >->i915->gt_wa_list, from); +} + +static inline bool is_nonpriv_flags_valid(u32 flags) +{ + /* Check only valid flag bits are set */ + if (flags & ~RING_FORCE_TO_NONPRIV_MASK_VALID) + return false; + + /* NB: Only 3 out of 4 enum values are valid for access field */ + if ((flags & RING_FORCE_TO_NONPRIV_ACCESS_MASK) == + RING_FORCE_TO_NONPRIV_ACCESS_INVALID) + return false; + + return true; +} + +static void +whitelist_reg_ext(struct i915_wa_list *wal, i915_reg_t reg, u32 flags) +{ + struct i915_wa wa = { + .reg = reg + }; + + if (GEM_DEBUG_WARN_ON(wal->count >= RING_MAX_NONPRIV_SLOTS)) + return; + + if (GEM_DEBUG_WARN_ON(!is_nonpriv_flags_valid(flags))) + return; + + wa.reg.reg |= flags; + _wa_add(wal, &wa); +} + +static void +whitelist_reg(struct i915_wa_list *wal, i915_reg_t reg) +{ + whitelist_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW); +} + +static void gen9_whitelist_build(struct i915_wa_list *w) +{ + /* WaVFEStateAfterPipeControlwithMediaStateClear:skl,bxt,glk,cfl */ + whitelist_reg(w, GEN9_CTX_PREEMPT_REG); + + /* WaEnablePreemptionGranularityControlByUMD:skl,bxt,kbl,cfl,[cnl] */ + whitelist_reg(w, GEN8_CS_CHICKEN1); + + /* WaAllowUMDToModifyHDCChicken1:skl,bxt,kbl,glk,cfl */ + whitelist_reg(w, GEN8_HDC_CHICKEN1); + + /* WaSendPushConstantsFromMMIO:skl,bxt */ + whitelist_reg(w, COMMON_SLICE_CHICKEN2); +} + +static void skl_whitelist_build(struct intel_engine_cs *engine) +{ + struct i915_wa_list *w = &engine->whitelist; + + if (engine->class != RENDER_CLASS) + return; + + gen9_whitelist_build(w); + + /* WaDisableLSQCROPERFforOCL:skl */ + whitelist_reg(w, GEN8_L3SQCREG4); +} + +static void bxt_whitelist_build(struct intel_engine_cs *engine) +{ + if (engine->class != RENDER_CLASS) + return; + + gen9_whitelist_build(&engine->whitelist); +} + +static void kbl_whitelist_build(struct intel_engine_cs *engine) +{ + struct i915_wa_list *w = &engine->whitelist; + + if (engine->class != RENDER_CLASS) + return; + + gen9_whitelist_build(w); + + /* WaDisableLSQCROPERFforOCL:kbl */ + whitelist_reg(w, GEN8_L3SQCREG4); +} + +static void glk_whitelist_build(struct intel_engine_cs *engine) +{ + struct i915_wa_list *w = &engine->whitelist; + + if (engine->class != RENDER_CLASS) + return; + + gen9_whitelist_build(w); + + /* WA #0862: Userspace has to set "Barrier Mode" to avoid hangs. */ + whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1); +} + +static void cfl_whitelist_build(struct intel_engine_cs *engine) +{ + struct i915_wa_list *w = &engine->whitelist; + + if (engine->class != RENDER_CLASS) + return; + + gen9_whitelist_build(w); + + /* + * WaAllowPMDepthAndInvocationCountAccessFromUMD:cfl,whl,cml,aml + * + * This covers 4 register which are next to one another : + * - PS_INVOCATION_COUNT + * - PS_INVOCATION_COUNT_UDW + * - PS_DEPTH_COUNT + * - PS_DEPTH_COUNT_UDW + */ + whitelist_reg_ext(w, PS_INVOCATION_COUNT, + RING_FORCE_TO_NONPRIV_ACCESS_RD | + RING_FORCE_TO_NONPRIV_RANGE_4); +} + +static void cml_whitelist_build(struct intel_engine_cs *engine) +{ + struct i915_wa_list *w = &engine->whitelist; + + if (engine->class != RENDER_CLASS) + whitelist_reg_ext(w, + RING_CTX_TIMESTAMP(engine->mmio_base), + RING_FORCE_TO_NONPRIV_ACCESS_RD); + + cfl_whitelist_build(engine); +} + +static void cnl_whitelist_build(struct intel_engine_cs *engine) +{ + struct i915_wa_list *w = &engine->whitelist; + + if (engine->class != RENDER_CLASS) + return; + + /* WaEnablePreemptionGranularityControlByUMD:cnl */ + whitelist_reg(w, GEN8_CS_CHICKEN1); +} + +static void icl_whitelist_build(struct intel_engine_cs *engine) +{ + struct i915_wa_list *w = &engine->whitelist; + + switch (engine->class) { + case RENDER_CLASS: + /* WaAllowUMDToModifyHalfSliceChicken7:icl */ + whitelist_reg(w, GEN9_HALF_SLICE_CHICKEN7); + + /* WaAllowUMDToModifySamplerMode:icl */ + whitelist_reg(w, GEN10_SAMPLER_MODE); + + /* WaEnableStateCacheRedirectToCS:icl */ + whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1); + + /* + * WaAllowPMDepthAndInvocationCountAccessFromUMD:icl + * + * This covers 4 register which are next to one another : + * - PS_INVOCATION_COUNT + * - PS_INVOCATION_COUNT_UDW + * - PS_DEPTH_COUNT + * - PS_DEPTH_COUNT_UDW + */ + whitelist_reg_ext(w, PS_INVOCATION_COUNT, + RING_FORCE_TO_NONPRIV_ACCESS_RD | + RING_FORCE_TO_NONPRIV_RANGE_4); + break; + + case VIDEO_DECODE_CLASS: + /* hucStatusRegOffset */ + whitelist_reg_ext(w, _MMIO(0x2000 + engine->mmio_base), + RING_FORCE_TO_NONPRIV_ACCESS_RD); + /* hucUKernelHdrInfoRegOffset */ + whitelist_reg_ext(w, _MMIO(0x2014 + engine->mmio_base), + RING_FORCE_TO_NONPRIV_ACCESS_RD); + /* hucStatus2RegOffset */ + whitelist_reg_ext(w, _MMIO(0x23B0 + engine->mmio_base), + RING_FORCE_TO_NONPRIV_ACCESS_RD); + whitelist_reg_ext(w, + RING_CTX_TIMESTAMP(engine->mmio_base), + RING_FORCE_TO_NONPRIV_ACCESS_RD); + break; + + default: + whitelist_reg_ext(w, + RING_CTX_TIMESTAMP(engine->mmio_base), + RING_FORCE_TO_NONPRIV_ACCESS_RD); + break; + } +} + +static void tgl_whitelist_build(struct intel_engine_cs *engine) +{ + struct i915_wa_list *w = &engine->whitelist; + + switch (engine->class) { + case RENDER_CLASS: + /* + * WaAllowPMDepthAndInvocationCountAccessFromUMD:tgl + * Wa_1408556865:tgl + * + * This covers 4 registers which are next to one another : + * - PS_INVOCATION_COUNT + * - PS_INVOCATION_COUNT_UDW + * - PS_DEPTH_COUNT + * - PS_DEPTH_COUNT_UDW + */ + whitelist_reg_ext(w, PS_INVOCATION_COUNT, + RING_FORCE_TO_NONPRIV_ACCESS_RD | + RING_FORCE_TO_NONPRIV_RANGE_4); + + /* Wa_1808121037:tgl */ + whitelist_reg(w, GEN7_COMMON_SLICE_CHICKEN1); + + /* Wa_1806527549:tgl */ + whitelist_reg(w, HIZ_CHICKEN); + break; + default: + whitelist_reg_ext(w, + RING_CTX_TIMESTAMP(engine->mmio_base), + RING_FORCE_TO_NONPRIV_ACCESS_RD); + break; + } +} + +void intel_engine_init_whitelist(struct intel_engine_cs *engine) +{ + struct drm_i915_private *i915 = engine->i915; + struct i915_wa_list *w = &engine->whitelist; + + wa_init_start(w, "whitelist", engine->name); + + if (IS_GEN(i915, 12)) + tgl_whitelist_build(engine); + else if (IS_GEN(i915, 11)) + icl_whitelist_build(engine); + else if (IS_CANNONLAKE(i915)) + cnl_whitelist_build(engine); + else if (IS_COMETLAKE(i915)) + cml_whitelist_build(engine); + else if (IS_COFFEELAKE(i915)) + cfl_whitelist_build(engine); + else if (IS_GEMINILAKE(i915)) + glk_whitelist_build(engine); + else if (IS_KABYLAKE(i915)) + kbl_whitelist_build(engine); + else if (IS_BROXTON(i915)) + bxt_whitelist_build(engine); + else if (IS_SKYLAKE(i915)) + skl_whitelist_build(engine); + else if (INTEL_GEN(i915) <= 8) + return; + else + MISSING_CASE(INTEL_GEN(i915)); + + wa_init_finish(w); +} + +void intel_engine_apply_whitelist(struct intel_engine_cs *engine) +{ + const struct i915_wa_list *wal = &engine->whitelist; + struct intel_uncore *uncore = engine->uncore; + const u32 base = engine->mmio_base; + struct i915_wa *wa; + unsigned int i; + + if (!wal->count) + return; + + for (i = 0, wa = wal->list; i < wal->count; i++, wa++) + intel_uncore_write(uncore, + RING_FORCE_TO_NONPRIV(base, i), + i915_mmio_reg_offset(wa->reg)); + + /* And clear the rest just in case of garbage */ + for (; i < RING_MAX_NONPRIV_SLOTS; i++) + intel_uncore_write(uncore, + RING_FORCE_TO_NONPRIV(base, i), + i915_mmio_reg_offset(RING_NOPID(base))); +} + +static void +rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) +{ + struct drm_i915_private *i915 = engine->i915; + + if (IS_TGL_UY_GT_REVID(i915, TGL_REVID_A0, TGL_REVID_A0)) { + /* + * Wa_1607138336:tgl + * Wa_1607063988:tgl + */ + wa_write_or(wal, + GEN9_CTX_PREEMPT_REG, + GEN12_DISABLE_POSH_BUSY_FF_DOP_CG); + + /* + * Wa_1606679103:tgl + * (see also Wa_1606682166:icl) + */ + wa_write_or(wal, + GEN7_SARCHKMD, + GEN7_DISABLE_SAMPLER_PREFETCH); + + /* Wa_1408615072:tgl */ + wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2, + VSUNIT_CLKGATE_DIS_TGL); + } + + if (IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) { + /* Wa_1606931601:tgl,rkl */ + wa_masked_en(wal, GEN7_ROW_CHICKEN2, GEN12_DISABLE_EARLY_READ); + + /* Wa_1409804808:tgl,rkl */ + wa_masked_en(wal, GEN7_ROW_CHICKEN2, + GEN12_PUSH_CONST_DEREF_HOLD_DIS); + + /* + * Wa_1409085225:tgl + * Wa_14010229206:tgl,rkl + */ + wa_masked_en(wal, GEN9_ROW_CHICKEN4, GEN12_DISABLE_TDL_PUSH); + + /* + * Wa_1407928979:tgl A* + * Wa_18011464164:tgl B0+ + * Wa_22010931296:tgl B0+ + * Wa_14010919138:rkl,tgl + */ + wa_write_or(wal, GEN7_FF_THREAD_MODE, + GEN12_FF_TESSELATION_DOP_GATE_DISABLE); + + /* + * Wa_1607030317:tgl + * Wa_1607186500:tgl + * Wa_1607297627:tgl,rkl there are multiple entries for this + * WA in the BSpec; some indicate this is an A0-only WA, + * others indicate it applies to all steppings. + */ + wa_masked_en(wal, + GEN6_RC_SLEEP_PSMI_CONTROL, + GEN12_WAIT_FOR_EVENT_POWER_DOWN_DISABLE | + GEN8_RC_SEMA_IDLE_MSG_DISABLE); + + /* + * Wa_1606700617:tgl + * Wa_22010271021:tgl,rkl + */ + wa_masked_en(wal, + GEN9_CS_DEBUG_MODE1, + FF_DOP_CLOCK_GATE_DISABLE); + } + + if (IS_GEN(i915, 12)) { + /* Wa_1406941453:gen12 */ + wa_masked_en(wal, + GEN10_SAMPLER_MODE, + ENABLE_SMALLPL); + } + + if (IS_GEN(i915, 11)) { + /* This is not an Wa. Enable for better image quality */ + wa_masked_en(wal, + _3D_CHICKEN3, + _3D_CHICKEN3_AA_LINE_QUALITY_FIX_ENABLE); + + /* WaPipelineFlushCoherentLines:icl */ + wa_write_or(wal, + GEN8_L3SQCREG4, + GEN8_LQSC_FLUSH_COHERENT_LINES); + + /* + * Wa_1405543622:icl + * Formerly known as WaGAPZPriorityScheme + */ + wa_write_or(wal, + GEN8_GARBCNTL, + GEN11_ARBITRATION_PRIO_ORDER_MASK); + + /* + * Wa_1604223664:icl + * Formerly known as WaL3BankAddressHashing + */ + wa_write_masked_or(wal, + GEN8_GARBCNTL, + GEN11_HASH_CTRL_EXCL_MASK, + GEN11_HASH_CTRL_EXCL_BIT0); + wa_write_masked_or(wal, + GEN11_GLBLINVL, + GEN11_BANK_HASH_ADDR_EXCL_MASK, + GEN11_BANK_HASH_ADDR_EXCL_BIT0); + + /* + * Wa_1405733216:icl + * Formerly known as WaDisableCleanEvicts + */ + wa_write_or(wal, + GEN8_L3SQCREG4, + GEN11_LQSC_CLEAN_EVICT_DISABLE); + + /* WaForwardProgressSoftReset:icl */ + wa_write_or(wal, + GEN10_SCRATCH_LNCF2, + PMFLUSHDONE_LNICRSDROP | + PMFLUSH_GAPL3UNBLOCK | + PMFLUSHDONE_LNEBLK); + + /* Wa_1406609255:icl (pre-prod) */ + if (IS_ICL_REVID(i915, ICL_REVID_A0, ICL_REVID_B0)) + wa_write_or(wal, + GEN7_SARCHKMD, + GEN7_DISABLE_DEMAND_PREFETCH); + + /* Wa_1606682166:icl */ + wa_write_or(wal, + GEN7_SARCHKMD, + GEN7_DISABLE_SAMPLER_PREFETCH); + + /* Wa_1409178092:icl */ + wa_write_masked_or(wal, + GEN11_SCRATCH2, + GEN11_COHERENT_PARTIAL_WRITE_MERGE_ENABLE, + 0); + + /* WaEnable32PlaneMode:icl */ + wa_masked_en(wal, GEN9_CSFE_CHICKEN1_RCS, + GEN11_ENABLE_32_PLANE_MODE); + + /* + * Wa_1408767742:icl[a2..forever],ehl[all] + * Wa_1605460711:icl[a0..c0] + */ + wa_write_or(wal, + GEN7_FF_THREAD_MODE, + GEN12_FF_TESSELATION_DOP_GATE_DISABLE); + + /* Wa_22010271021:ehl */ + if (IS_ELKHARTLAKE(i915)) + wa_masked_en(wal, + GEN9_CS_DEBUG_MODE1, + FF_DOP_CLOCK_GATE_DISABLE); + } + + if (IS_GEN_RANGE(i915, 9, 12)) { + /* FtrPerCtxtPreemptionGranularityControl:skl,bxt,kbl,cfl,cnl,icl,tgl */ + wa_masked_en(wal, + GEN7_FF_SLICE_CS_CHICKEN1, + GEN9_FFSC_PERCTX_PREEMPT_CTRL); + } + + if (IS_SKYLAKE(i915) || + IS_KABYLAKE(i915) || + IS_COFFEELAKE(i915) || + IS_COMETLAKE(i915)) { + /* WaEnableGapsTsvCreditFix:skl,kbl,cfl */ + wa_write_or(wal, + GEN8_GARBCNTL, + GEN9_GAPS_TSV_CREDIT_DISABLE); + } + + if (IS_BROXTON(i915)) { + /* WaDisablePooledEuLoadBalancingFix:bxt */ + wa_masked_en(wal, + FF_SLICE_CS_CHICKEN2, + GEN9_POOLED_EU_LOAD_BALANCING_FIX_DISABLE); + } + + if (IS_GEN(i915, 9)) { + /* WaContextSwitchWithConcurrentTLBInvalidate:skl,bxt,kbl,glk,cfl */ + wa_masked_en(wal, + GEN9_CSFE_CHICKEN1_RCS, + GEN9_PREEMPT_GPGPU_SYNC_SWITCH_DISABLE); + + /* WaEnableLbsSlaRetryTimerDecrement:skl,bxt,kbl,glk,cfl */ + wa_write_or(wal, + BDW_SCRATCH1, + GEN9_LBS_SLA_RETRY_TIMER_DECREMENT_ENABLE); + + /* WaProgramL3SqcReg1DefaultForPerf:bxt,glk */ + if (IS_GEN9_LP(i915)) + wa_write_masked_or(wal, + GEN8_L3SQCREG1, + L3_PRIO_CREDITS_MASK, + L3_GENERAL_PRIO_CREDITS(62) | + L3_HIGH_PRIO_CREDITS(2)); + + /* WaOCLCoherentLineFlush:skl,bxt,kbl,cfl */ + wa_write_or(wal, + GEN8_L3SQCREG4, + GEN8_LQSC_FLUSH_COHERENT_LINES); + } + + if (IS_GEN(i915, 7)) + /* WaBCSVCSTlbInvalidationMode:ivb,vlv,hsw */ + wa_masked_en(wal, + GFX_MODE_GEN7, + GFX_TLB_INVALIDATE_EXPLICIT | GFX_REPLAY_MODE); + + if (IS_GEN_RANGE(i915, 6, 7)) + /* + * We need to disable the AsyncFlip performance optimisations in + * order to use MI_WAIT_FOR_EVENT within the CS. It should + * already be programmed to '1' on all products. + * + * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv + */ + wa_masked_en(wal, + MI_MODE, + ASYNC_FLIP_PERF_DISABLE); + + if (IS_GEN(i915, 6)) { + /* + * Required for the hardware to program scanline values for + * waiting + * WaEnableFlushTlbInvalidationMode:snb + */ + wa_masked_en(wal, + GFX_MODE, + GFX_TLB_INVALIDATE_EXPLICIT); + + /* + * From the Sandybridge PRM, volume 1 part 3, page 24: + * "If this bit is set, STCunit will have LRA as replacement + * policy. [...] This bit must be reset. LRA replacement + * policy is not supported." + */ + wa_masked_dis(wal, + CACHE_MODE_0, + CM0_STC_EVICT_DISABLE_LRA_SNB); + } + + if (IS_GEN_RANGE(i915, 4, 6)) + /* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */ + wa_add(wal, MI_MODE, + 0, _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH), + /* XXX bit doesn't stick on Broadwater */ + IS_I965G(i915) ? 0 : VS_TIMER_DISPATCH); + + if (IS_GEN(i915, 4)) + /* + * Disable CONSTANT_BUFFER before it is loaded from the context + * image. For as it is loaded, it is executed and the stored + * address may no longer be valid, leading to a GPU hang. + * + * This imposes the requirement that userspace reload their + * CONSTANT_BUFFER on every batch, fortunately a requirement + * they are already accustomed to from before contexts were + * enabled. + */ + wa_add(wal, ECOSKPD, + 0, _MASKED_BIT_ENABLE(ECO_CONSTANT_BUFFER_SR_DISABLE), + 0 /* XXX bit doesn't stick on Broadwater */); +} + +static void +xcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) +{ + struct drm_i915_private *i915 = engine->i915; + + /* WaKBLVECSSemaphoreWaitPoll:kbl */ + if (IS_KBL_GT_REVID(i915, KBL_REVID_A0, KBL_REVID_E0)) { + wa_write(wal, + RING_SEMA_WAIT_POLL(engine->mmio_base), + 1); + } +} + +static void +engine_init_workarounds(struct intel_engine_cs *engine, struct i915_wa_list *wal) +{ + if (I915_SELFTEST_ONLY(INTEL_GEN(engine->i915) < 4)) + return; + + if (engine->class == RENDER_CLASS) + rcs_engine_wa_init(engine, wal); + else + xcs_engine_wa_init(engine, wal); +} + +void intel_engine_init_workarounds(struct intel_engine_cs *engine) +{ + struct i915_wa_list *wal = &engine->wa_list; + + if (INTEL_GEN(engine->i915) < 4) + return; + + wa_init_start(wal, "engine", engine->name); + engine_init_workarounds(engine, wal); + wa_init_finish(wal); +} + +void intel_engine_apply_workarounds(struct intel_engine_cs *engine) +{ + wa_list_apply(engine->uncore, &engine->wa_list); +} + +static struct i915_vma * +create_scratch(struct i915_address_space *vm, int count) +{ + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + unsigned int size; + int err; + + size = round_up(count * sizeof(u32), PAGE_SIZE); + obj = i915_gem_object_create_internal(vm->i915, size); + if (IS_ERR(obj)) + return ERR_CAST(obj); + + i915_gem_object_set_cache_coherency(obj, I915_CACHE_LLC); + + vma = i915_vma_instance(obj, vm, NULL); + if (IS_ERR(vma)) { + err = PTR_ERR(vma); + goto err_obj; + } + + err = i915_vma_pin(vma, 0, 0, + i915_vma_is_ggtt(vma) ? PIN_GLOBAL : PIN_USER); + if (err) + goto err_obj; + + return vma; + +err_obj: + i915_gem_object_put(obj); + return ERR_PTR(err); +} + +static const struct { + u32 start; + u32 end; +} mcr_ranges_gen8[] = { + { .start = 0x5500, .end = 0x55ff }, + { .start = 0x7000, .end = 0x7fff }, + { .start = 0x9400, .end = 0x97ff }, + { .start = 0xb000, .end = 0xb3ff }, + { .start = 0xe000, .end = 0xe7ff }, + {}, +}; + +static bool mcr_range(struct drm_i915_private *i915, u32 offset) +{ + int i; + + if (INTEL_GEN(i915) < 8) + return false; + + /* + * Registers in these ranges are affected by the MCR selector + * which only controls CPU initiated MMIO. Routing does not + * work for CS access so we cannot verify them on this path. + */ + for (i = 0; mcr_ranges_gen8[i].start; i++) + if (offset >= mcr_ranges_gen8[i].start && + offset <= mcr_ranges_gen8[i].end) + return true; + + return false; +} + +static int +wa_list_srm(struct i915_request *rq, + const struct i915_wa_list *wal, + struct i915_vma *vma) +{ + struct drm_i915_private *i915 = rq->engine->i915; + unsigned int i, count = 0; + const struct i915_wa *wa; + u32 srm, *cs; + + srm = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT; + if (INTEL_GEN(i915) >= 8) + srm++; + + for (i = 0, wa = wal->list; i < wal->count; i++, wa++) { + if (!mcr_range(i915, i915_mmio_reg_offset(wa->reg))) + count++; + } + + cs = intel_ring_begin(rq, 4 * count); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + for (i = 0, wa = wal->list; i < wal->count; i++, wa++) { + u32 offset = i915_mmio_reg_offset(wa->reg); + + if (mcr_range(i915, offset)) + continue; + + *cs++ = srm; + *cs++ = offset; + *cs++ = i915_ggtt_offset(vma) + sizeof(u32) * i; + *cs++ = 0; + } + intel_ring_advance(rq, cs); + + return 0; +} + +static int engine_wa_list_verify(struct intel_context *ce, + const struct i915_wa_list * const wal, + const char *from) +{ + const struct i915_wa *wa; + struct i915_request *rq; + struct i915_vma *vma; + struct i915_gem_ww_ctx ww; + unsigned int i; + u32 *results; + int err; + + if (!wal->count) + return 0; + + vma = create_scratch(&ce->engine->gt->ggtt->vm, wal->count); + if (IS_ERR(vma)) + return PTR_ERR(vma); + + intel_engine_pm_get(ce->engine); + i915_gem_ww_ctx_init(&ww, false); +retry: + err = i915_gem_object_lock(vma->obj, &ww); + if (err == 0) + err = intel_context_pin_ww(ce, &ww); + if (err) + goto err_pm; + + rq = i915_request_create(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_unpin; + } + + err = i915_request_await_object(rq, vma->obj, true); + if (err == 0) + err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE); + if (err == 0) + err = wa_list_srm(rq, wal, vma); + + i915_request_get(rq); + if (err) + i915_request_set_error_once(rq, err); + i915_request_add(rq); + + if (err) + goto err_rq; + + if (i915_request_wait(rq, 0, HZ / 5) < 0) { + err = -ETIME; + goto err_rq; + } + + results = i915_gem_object_pin_map(vma->obj, I915_MAP_WB); + if (IS_ERR(results)) { + err = PTR_ERR(results); + goto err_rq; + } + + err = 0; + for (i = 0, wa = wal->list; i < wal->count; i++, wa++) { + if (mcr_range(rq->engine->i915, i915_mmio_reg_offset(wa->reg))) + continue; + + if (!wa_verify(wa, results[i], wal->name, from)) + err = -ENXIO; + } + + i915_gem_object_unpin_map(vma->obj); + +err_rq: + i915_request_put(rq); +err_unpin: + intel_context_unpin(ce); +err_pm: + if (err == -EDEADLK) { + err = i915_gem_ww_ctx_backoff(&ww); + if (!err) + goto retry; + } + i915_gem_ww_ctx_fini(&ww); + intel_engine_pm_put(ce->engine); + i915_vma_unpin(vma); + i915_vma_put(vma); + return err; +} + +int intel_engine_verify_workarounds(struct intel_engine_cs *engine, + const char *from) +{ + return engine_wa_list_verify(engine->kernel_context, + &engine->wa_list, + from); +} + +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) +#include "selftest_workarounds.c" +#endif diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.h b/drivers/gpu/drm/i915/gt/intel_workarounds.h new file mode 100644 index 000000000..8c9c769c2 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_workarounds.h @@ -0,0 +1,40 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2014-2018 Intel Corporation + */ + +#ifndef _INTEL_WORKAROUNDS_H_ +#define _INTEL_WORKAROUNDS_H_ + +#include <linux/slab.h> + +#include "intel_workarounds_types.h" + +struct drm_i915_private; +struct i915_request; +struct intel_engine_cs; +struct intel_gt; + +static inline void intel_wa_list_free(struct i915_wa_list *wal) +{ + kfree(wal->list); + memset(wal, 0, sizeof(*wal)); +} + +void intel_engine_init_ctx_wa(struct intel_engine_cs *engine); +int intel_engine_emit_ctx_wa(struct i915_request *rq); + +void intel_gt_init_workarounds(struct drm_i915_private *i915); +void intel_gt_apply_workarounds(struct intel_gt *gt); +bool intel_gt_verify_workarounds(struct intel_gt *gt, const char *from); + +void intel_engine_init_whitelist(struct intel_engine_cs *engine); +void intel_engine_apply_whitelist(struct intel_engine_cs *engine); + +void intel_engine_init_workarounds(struct intel_engine_cs *engine); +void intel_engine_apply_workarounds(struct intel_engine_cs *engine); +int intel_engine_verify_workarounds(struct intel_engine_cs *engine, + const char *from); + +#endif diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds_types.h b/drivers/gpu/drm/i915/gt/intel_workarounds_types.h new file mode 100644 index 000000000..d166a7145 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_workarounds_types.h @@ -0,0 +1,29 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2014-2018 Intel Corporation + */ + +#ifndef __INTEL_WORKAROUNDS_TYPES_H__ +#define __INTEL_WORKAROUNDS_TYPES_H__ + +#include <linux/types.h> + +#include "i915_reg.h" + +struct i915_wa { + i915_reg_t reg; + u32 clr; + u32 set; + u32 read; +}; + +struct i915_wa_list { + const char *name; + const char *engine_name; + struct i915_wa *list; + unsigned int count; + unsigned int wa_count; +}; + +#endif /* __INTEL_WORKAROUNDS_TYPES_H__ */ diff --git a/drivers/gpu/drm/i915/gt/ivb_clear_kernel.c b/drivers/gpu/drm/i915/gt/ivb_clear_kernel.c new file mode 100644 index 000000000..610ca7687 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/ivb_clear_kernel.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2020 Intel Corporation + * + * Generated by: IGT Gpu Tools on Fri 21 Feb 2020 05:29:32 AM UTC + */ + +static const u32 ivb_clear_kernel[] = { + 0x00000001, 0x26020128, 0x00000024, 0x00000000, + 0x00000040, 0x20280c21, 0x00000028, 0x00000001, + 0x01000010, 0x20000c20, 0x0000002c, 0x00000000, + 0x00010220, 0x34001c00, 0x00001400, 0x0000002c, + 0x00600001, 0x20600061, 0x00000000, 0x00000000, + 0x00000008, 0x20601c85, 0x00000e00, 0x0000000c, + 0x00000005, 0x20601ca5, 0x00000060, 0x00000001, + 0x00000008, 0x20641c85, 0x00000e00, 0x0000000d, + 0x00000005, 0x20641ca5, 0x00000064, 0x00000003, + 0x00000041, 0x207424a5, 0x00000064, 0x00000034, + 0x00000040, 0x206014a5, 0x00000060, 0x00000074, + 0x00000008, 0x20681c85, 0x00000e00, 0x00000008, + 0x00000005, 0x20681ca5, 0x00000068, 0x0000000f, + 0x00000041, 0x20701ca5, 0x00000060, 0x00000010, + 0x00000040, 0x206814a5, 0x00000068, 0x00000070, + 0x00600001, 0x20a00061, 0x00000000, 0x00000000, + 0x00000005, 0x206c1c85, 0x00000e00, 0x00000007, + 0x00000041, 0x206c1ca5, 0x0000006c, 0x00000004, + 0x00600001, 0x20800021, 0x008d0000, 0x00000000, + 0x00000001, 0x20800021, 0x0000006c, 0x00000000, + 0x00000001, 0x20840021, 0x00000068, 0x00000000, + 0x00000001, 0x20880061, 0x00000000, 0x00000003, + 0x00000005, 0x208c0d21, 0x00000086, 0xffffffff, + 0x05600032, 0x20a00fa1, 0x008d0080, 0x02190001, + 0x00000040, 0x20a01ca5, 0x000000a0, 0x00000001, + 0x05600032, 0x20a00fa1, 0x008d0080, 0x040a8001, + 0x02000040, 0x20281c21, 0x00000028, 0xffffffff, + 0x00010220, 0x34001c00, 0x00001400, 0xfffffffc, + 0x00000001, 0x26020128, 0x00000024, 0x00000000, + 0x00000001, 0x220010e4, 0x00000000, 0x00000000, + 0x00000001, 0x220831ec, 0x00000000, 0x007f007f, + 0x00600001, 0x20400021, 0x008d0000, 0x00000000, + 0x00600001, 0x2fe00021, 0x008d0000, 0x00000000, + 0x00200001, 0x20400121, 0x00450020, 0x00000000, + 0x00000001, 0x20480061, 0x00000000, 0x000f000f, + 0x00000005, 0x204c0d21, 0x00000046, 0xffffffef, + 0x00800001, 0x20600061, 0x00000000, 0x00000000, + 0x00800001, 0x20800061, 0x00000000, 0x00000000, + 0x00800001, 0x20a00061, 0x00000000, 0x00000000, + 0x00800001, 0x20c00061, 0x00000000, 0x00000000, + 0x00800001, 0x20e00061, 0x00000000, 0x00000000, + 0x00800001, 0x21000061, 0x00000000, 0x00000000, + 0x00800001, 0x21200061, 0x00000000, 0x00000000, + 0x00800001, 0x21400061, 0x00000000, 0x00000000, + 0x05600032, 0x20000fa0, 0x008d0040, 0x120a8000, + 0x00000040, 0x20402d21, 0x00000020, 0x00100010, + 0x05600032, 0x20000fa0, 0x008d0040, 0x120a8000, + 0x02000040, 0x22083d8c, 0x00000208, 0xffffffff, + 0x00800001, 0xa0000109, 0x00000602, 0x00000000, + 0x00000040, 0x22001c84, 0x00000200, 0x00000020, + 0x00010220, 0x34001c00, 0x00001400, 0xfffffff8, + 0x07600032, 0x20000fa0, 0x008d0fe0, 0x82000010, +}; diff --git a/drivers/gpu/drm/i915/gt/mock_engine.c b/drivers/gpu/drm/i915/gt/mock_engine.c new file mode 100644 index 000000000..dfd1cfb8a --- /dev/null +++ b/drivers/gpu/drm/i915/gt/mock_engine.c @@ -0,0 +1,377 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include "gem/i915_gem_context.h" +#include "gt/intel_ring.h" + +#include "i915_drv.h" +#include "intel_context.h" +#include "intel_engine_pm.h" + +#include "mock_engine.h" +#include "selftests/mock_request.h" + +static void mock_timeline_pin(struct intel_timeline *tl) +{ + atomic_inc(&tl->pin_count); +} + +static void mock_timeline_unpin(struct intel_timeline *tl) +{ + GEM_BUG_ON(!atomic_read(&tl->pin_count)); + atomic_dec(&tl->pin_count); +} + +static struct intel_ring *mock_ring(struct intel_engine_cs *engine) +{ + const unsigned long sz = PAGE_SIZE / 2; + struct intel_ring *ring; + + ring = kzalloc(sizeof(*ring) + sz, GFP_KERNEL); + if (!ring) + return NULL; + + kref_init(&ring->ref); + ring->size = sz; + ring->effective_size = sz; + ring->vaddr = (void *)(ring + 1); + atomic_set(&ring->pin_count, 1); + + ring->vma = i915_vma_alloc(); + if (!ring->vma) { + kfree(ring); + return NULL; + } + i915_active_init(&ring->vma->active, NULL, NULL); + __set_bit(I915_VMA_GGTT_BIT, __i915_vma_flags(ring->vma)); + __set_bit(DRM_MM_NODE_ALLOCATED_BIT, &ring->vma->node.flags); + ring->vma->node.size = sz; + + intel_ring_update_space(ring); + + return ring; +} + +static void mock_ring_free(struct intel_ring *ring) +{ + i915_active_fini(&ring->vma->active); + i915_vma_free(ring->vma); + + kfree(ring); +} + +static struct i915_request *first_request(struct mock_engine *engine) +{ + return list_first_entry_or_null(&engine->hw_queue, + struct i915_request, + mock.link); +} + +static void advance(struct i915_request *request) +{ + list_del_init(&request->mock.link); + i915_request_mark_complete(request); + GEM_BUG_ON(!i915_request_completed(request)); + + intel_engine_signal_breadcrumbs(request->engine); +} + +static void hw_delay_complete(struct timer_list *t) +{ + struct mock_engine *engine = from_timer(engine, t, hw_delay); + struct i915_request *request; + unsigned long flags; + + spin_lock_irqsave(&engine->hw_lock, flags); + + /* Timer fired, first request is complete */ + request = first_request(engine); + if (request) + advance(request); + + /* + * Also immediately signal any subsequent 0-delay requests, but + * requeue the timer for the next delayed request. + */ + while ((request = first_request(engine))) { + if (request->mock.delay) { + mod_timer(&engine->hw_delay, + jiffies + request->mock.delay); + break; + } + + advance(request); + } + + spin_unlock_irqrestore(&engine->hw_lock, flags); +} + +static void mock_context_unpin(struct intel_context *ce) +{ +} + +static void mock_context_post_unpin(struct intel_context *ce) +{ +} + +static void mock_context_destroy(struct kref *ref) +{ + struct intel_context *ce = container_of(ref, typeof(*ce), ref); + + GEM_BUG_ON(intel_context_is_pinned(ce)); + + if (test_bit(CONTEXT_ALLOC_BIT, &ce->flags)) { + mock_ring_free(ce->ring); + mock_timeline_unpin(ce->timeline); + } + + intel_context_fini(ce); + intel_context_free(ce); +} + +static int mock_context_alloc(struct intel_context *ce) +{ + ce->ring = mock_ring(ce->engine); + if (!ce->ring) + return -ENOMEM; + + ce->timeline = intel_timeline_create(ce->engine->gt); + if (IS_ERR(ce->timeline)) { + kfree(ce->engine); + return PTR_ERR(ce->timeline); + } + + mock_timeline_pin(ce->timeline); + + return 0; +} + +static int mock_context_pre_pin(struct intel_context *ce, + struct i915_gem_ww_ctx *ww, void **unused) +{ + return 0; +} + +static int mock_context_pin(struct intel_context *ce, void *unused) +{ + return 0; +} + +static void mock_context_reset(struct intel_context *ce) +{ +} + +static const struct intel_context_ops mock_context_ops = { + .alloc = mock_context_alloc, + + .pre_pin = mock_context_pre_pin, + .pin = mock_context_pin, + .unpin = mock_context_unpin, + .post_unpin = mock_context_post_unpin, + + .enter = intel_context_enter_engine, + .exit = intel_context_exit_engine, + + .reset = mock_context_reset, + .destroy = mock_context_destroy, +}; + +static int mock_request_alloc(struct i915_request *request) +{ + INIT_LIST_HEAD(&request->mock.link); + request->mock.delay = 0; + + return 0; +} + +static int mock_emit_flush(struct i915_request *request, + unsigned int flags) +{ + return 0; +} + +static u32 *mock_emit_breadcrumb(struct i915_request *request, u32 *cs) +{ + return cs; +} + +static void mock_submit_request(struct i915_request *request) +{ + struct mock_engine *engine = + container_of(request->engine, typeof(*engine), base); + unsigned long flags; + + i915_request_submit(request); + + spin_lock_irqsave(&engine->hw_lock, flags); + list_add_tail(&request->mock.link, &engine->hw_queue); + if (list_is_first(&request->mock.link, &engine->hw_queue)) { + if (request->mock.delay) + mod_timer(&engine->hw_delay, + jiffies + request->mock.delay); + else + advance(request); + } + spin_unlock_irqrestore(&engine->hw_lock, flags); +} + +static void mock_reset_prepare(struct intel_engine_cs *engine) +{ +} + +static void mock_reset_rewind(struct intel_engine_cs *engine, bool stalled) +{ + GEM_BUG_ON(stalled); +} + +static void mock_reset_cancel(struct intel_engine_cs *engine) +{ + struct i915_request *request; + unsigned long flags; + + spin_lock_irqsave(&engine->active.lock, flags); + + /* Mark all submitted requests as skipped. */ + list_for_each_entry(request, &engine->active.requests, sched.link) { + i915_request_set_error_once(request, -EIO); + i915_request_mark_complete(request); + } + + spin_unlock_irqrestore(&engine->active.lock, flags); +} + +static void mock_reset_finish(struct intel_engine_cs *engine) +{ +} + +static void mock_engine_release(struct intel_engine_cs *engine) +{ + struct mock_engine *mock = + container_of(engine, typeof(*mock), base); + + GEM_BUG_ON(timer_pending(&mock->hw_delay)); + + intel_breadcrumbs_free(engine->breadcrumbs); + + intel_context_unpin(engine->kernel_context); + intel_context_put(engine->kernel_context); + + intel_engine_fini_retire(engine); +} + +struct intel_engine_cs *mock_engine(struct drm_i915_private *i915, + const char *name, + int id) +{ + struct mock_engine *engine; + + GEM_BUG_ON(id >= I915_NUM_ENGINES); + GEM_BUG_ON(!i915->gt.uncore); + + engine = kzalloc(sizeof(*engine) + PAGE_SIZE, GFP_KERNEL); + if (!engine) + return NULL; + + /* minimal engine setup for requests */ + engine->base.i915 = i915; + engine->base.gt = &i915->gt; + engine->base.uncore = i915->gt.uncore; + snprintf(engine->base.name, sizeof(engine->base.name), "%s", name); + engine->base.id = id; + engine->base.mask = BIT(id); + engine->base.legacy_idx = INVALID_ENGINE; + engine->base.instance = id; + engine->base.status_page.addr = (void *)(engine + 1); + + engine->base.cops = &mock_context_ops; + engine->base.request_alloc = mock_request_alloc; + engine->base.emit_flush = mock_emit_flush; + engine->base.emit_fini_breadcrumb = mock_emit_breadcrumb; + engine->base.submit_request = mock_submit_request; + + engine->base.reset.prepare = mock_reset_prepare; + engine->base.reset.rewind = mock_reset_rewind; + engine->base.reset.cancel = mock_reset_cancel; + engine->base.reset.finish = mock_reset_finish; + + engine->base.release = mock_engine_release; + + i915->gt.engine[id] = &engine->base; + i915->gt.engine_class[0][id] = &engine->base; + + /* fake hw queue */ + spin_lock_init(&engine->hw_lock); + timer_setup(&engine->hw_delay, hw_delay_complete, 0); + INIT_LIST_HEAD(&engine->hw_queue); + + intel_engine_add_user(&engine->base); + + return &engine->base; +} + +int mock_engine_init(struct intel_engine_cs *engine) +{ + struct intel_context *ce; + + intel_engine_init_active(engine, ENGINE_MOCK); + intel_engine_init_execlists(engine); + intel_engine_init__pm(engine); + intel_engine_init_retire(engine); + + engine->breadcrumbs = intel_breadcrumbs_create(NULL); + if (!engine->breadcrumbs) + return -ENOMEM; + + ce = create_kernel_context(engine); + if (IS_ERR(ce)) + goto err_breadcrumbs; + + /* We insist the kernel context is using the status_page */ + engine->status_page.vma = ce->timeline->hwsp_ggtt; + + engine->kernel_context = ce; + return 0; + +err_breadcrumbs: + intel_breadcrumbs_free(engine->breadcrumbs); + return -ENOMEM; +} + +void mock_engine_flush(struct intel_engine_cs *engine) +{ + struct mock_engine *mock = + container_of(engine, typeof(*mock), base); + struct i915_request *request, *rn; + + del_timer_sync(&mock->hw_delay); + + spin_lock_irq(&mock->hw_lock); + list_for_each_entry_safe(request, rn, &mock->hw_queue, mock.link) + advance(request); + spin_unlock_irq(&mock->hw_lock); +} + +void mock_engine_reset(struct intel_engine_cs *engine) +{ +} diff --git a/drivers/gpu/drm/i915/gt/mock_engine.h b/drivers/gpu/drm/i915/gt/mock_engine.h new file mode 100644 index 000000000..3f9b698c4 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/mock_engine.h @@ -0,0 +1,51 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#ifndef __MOCK_ENGINE_H__ +#define __MOCK_ENGINE_H__ + +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/timer.h> + +#include "gt/intel_engine.h" + +struct mock_engine { + struct intel_engine_cs base; + + spinlock_t hw_lock; + struct list_head hw_queue; + struct timer_list hw_delay; +}; + +struct intel_engine_cs *mock_engine(struct drm_i915_private *i915, + const char *name, + int id); +int mock_engine_init(struct intel_engine_cs *engine); + +void mock_engine_flush(struct intel_engine_cs *engine); +void mock_engine_reset(struct intel_engine_cs *engine); +void mock_engine_free(struct intel_engine_cs *engine); + +#endif /* !__MOCK_ENGINE_H__ */ diff --git a/drivers/gpu/drm/i915/gt/selftest_context.c b/drivers/gpu/drm/i915/gt/selftest_context.c new file mode 100644 index 000000000..1f4020e90 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/selftest_context.c @@ -0,0 +1,441 @@ +/* + * SPDX-License-Identifier: GPL-2.0 + * + * Copyright © 2019 Intel Corporation + */ + +#include "i915_selftest.h" +#include "intel_engine_heartbeat.h" +#include "intel_engine_pm.h" +#include "intel_gt.h" + +#include "gem/selftests/mock_context.h" +#include "selftests/igt_flush_test.h" +#include "selftests/mock_drm.h" + +static int request_sync(struct i915_request *rq) +{ + struct intel_timeline *tl = i915_request_timeline(rq); + long timeout; + int err = 0; + + intel_timeline_get(tl); + i915_request_get(rq); + + /* Opencode i915_request_add() so we can keep the timeline locked. */ + __i915_request_commit(rq); + rq->sched.attr.priority = I915_PRIORITY_BARRIER; + __i915_request_queue(rq, NULL); + + timeout = i915_request_wait(rq, 0, HZ / 10); + if (timeout < 0) + err = timeout; + else + i915_request_retire_upto(rq); + + lockdep_unpin_lock(&tl->mutex, rq->cookie); + mutex_unlock(&tl->mutex); + + i915_request_put(rq); + intel_timeline_put(tl); + + return err; +} + +static int context_sync(struct intel_context *ce) +{ + struct intel_timeline *tl = ce->timeline; + int err = 0; + + mutex_lock(&tl->mutex); + do { + struct i915_request *rq; + long timeout; + + if (list_empty(&tl->requests)) + break; + + rq = list_last_entry(&tl->requests, typeof(*rq), link); + i915_request_get(rq); + + timeout = i915_request_wait(rq, 0, HZ / 10); + if (timeout < 0) + err = timeout; + else + i915_request_retire_upto(rq); + + i915_request_put(rq); + } while (!err); + mutex_unlock(&tl->mutex); + + /* Wait for all barriers to complete (remote CPU) before we check */ + i915_active_unlock_wait(&ce->active); + return err; +} + +static int __live_context_size(struct intel_engine_cs *engine) +{ + struct intel_context *ce; + struct i915_request *rq; + void *vaddr; + int err; + + ce = intel_context_create(engine); + if (IS_ERR(ce)) + return PTR_ERR(ce); + + err = intel_context_pin(ce); + if (err) + goto err; + + vaddr = i915_gem_object_pin_map(ce->state->obj, + i915_coherent_map_type(engine->i915)); + if (IS_ERR(vaddr)) { + err = PTR_ERR(vaddr); + intel_context_unpin(ce); + goto err; + } + + /* + * Note that execlists also applies a redzone which it checks on + * context unpin when debugging. We are using the same location + * and same poison value so that our checks overlap. Despite the + * redundancy, we want to keep this little selftest so that we + * get coverage of any and all submission backends, and we can + * always extend this test to ensure we trick the HW into a + * compromising position wrt to the various sections that need + * to be written into the context state. + * + * TLDR; this overlaps with the execlists redzone. + */ + vaddr += engine->context_size - I915_GTT_PAGE_SIZE; + memset(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE); + + rq = intel_context_create_request(ce); + intel_context_unpin(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_unpin; + } + + err = request_sync(rq); + if (err) + goto err_unpin; + + /* Force the context switch */ + rq = intel_engine_create_kernel_request(engine); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_unpin; + } + err = request_sync(rq); + if (err) + goto err_unpin; + + if (memchr_inv(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE)) { + pr_err("%s context overwrote trailing red-zone!", engine->name); + err = -EINVAL; + } + +err_unpin: + i915_gem_object_unpin_map(ce->state->obj); +err: + intel_context_put(ce); + return err; +} + +static int live_context_size(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err = 0; + + /* + * Check that our context sizes are correct by seeing if the + * HW tries to write past the end of one. + */ + + for_each_engine(engine, gt, id) { + struct file *saved; + + if (!engine->context_size) + continue; + + intel_engine_pm_get(engine); + + /* + * Hide the old default state -- we lie about the context size + * and get confused when the default state is smaller than + * expected. For our do nothing request, inheriting the + * active state is sufficient, we are only checking that we + * don't use more than we planned. + */ + saved = fetch_and_zero(&engine->default_state); + + /* Overlaps with the execlists redzone */ + engine->context_size += I915_GTT_PAGE_SIZE; + + err = __live_context_size(engine); + + engine->context_size -= I915_GTT_PAGE_SIZE; + + engine->default_state = saved; + + intel_engine_pm_put(engine); + + if (err) + break; + } + + return err; +} + +static int __live_active_context(struct intel_engine_cs *engine) +{ + unsigned long saved_heartbeat; + struct intel_context *ce; + int pass; + int err; + + /* + * We keep active contexts alive until after a subsequent context + * switch as the final write from the context-save will be after + * we retire the final request. We track when we unpin the context, + * under the presumption that the final pin is from the last request, + * and instead of immediately unpinning the context, we add a task + * to unpin the context from the next idle-barrier. + * + * This test makes sure that the context is kept alive until a + * subsequent idle-barrier (emitted when the engine wakeref hits 0 + * with no more outstanding requests). + */ + + if (intel_engine_pm_is_awake(engine)) { + pr_err("%s is awake before starting %s!\n", + engine->name, __func__); + return -EINVAL; + } + + ce = intel_context_create(engine); + if (IS_ERR(ce)) + return PTR_ERR(ce); + + saved_heartbeat = engine->props.heartbeat_interval_ms; + engine->props.heartbeat_interval_ms = 0; + + for (pass = 0; pass <= 2; pass++) { + struct i915_request *rq; + + intel_engine_pm_get(engine); + + rq = intel_context_create_request(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out_engine; + } + + err = request_sync(rq); + if (err) + goto out_engine; + + /* Context will be kept active until after an idle-barrier. */ + if (i915_active_is_idle(&ce->active)) { + pr_err("context is not active; expected idle-barrier (%s pass %d)\n", + engine->name, pass); + err = -EINVAL; + goto out_engine; + } + + if (!intel_engine_pm_is_awake(engine)) { + pr_err("%s is asleep before idle-barrier\n", + engine->name); + err = -EINVAL; + goto out_engine; + } + +out_engine: + intel_engine_pm_put(engine); + if (err) + goto err; + } + + /* Now make sure our idle-barriers are flushed */ + err = intel_engine_flush_barriers(engine); + if (err) + goto err; + + /* Wait for the barrier and in the process wait for engine to park */ + err = context_sync(engine->kernel_context); + if (err) + goto err; + + if (!i915_active_is_idle(&ce->active)) { + pr_err("context is still active!"); + err = -EINVAL; + } + + intel_engine_pm_flush(engine); + + if (intel_engine_pm_is_awake(engine)) { + struct drm_printer p = drm_debug_printer(__func__); + + intel_engine_dump(engine, &p, + "%s is still awake:%d after idle-barriers\n", + engine->name, + atomic_read(&engine->wakeref.count)); + GEM_TRACE_DUMP(); + + err = -EINVAL; + goto err; + } + +err: + engine->props.heartbeat_interval_ms = saved_heartbeat; + intel_context_put(ce); + return err; +} + +static int live_active_context(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err = 0; + + for_each_engine(engine, gt, id) { + err = __live_active_context(engine); + if (err) + break; + + err = igt_flush_test(gt->i915); + if (err) + break; + } + + return err; +} + +static int __remote_sync(struct intel_context *ce, struct intel_context *remote) +{ + struct i915_request *rq; + int err; + + err = intel_context_pin(remote); + if (err) + return err; + + rq = intel_context_create_request(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto unpin; + } + + err = intel_context_prepare_remote_request(remote, rq); + if (err) { + i915_request_add(rq); + goto unpin; + } + + err = request_sync(rq); + +unpin: + intel_context_unpin(remote); + return err; +} + +static int __live_remote_context(struct intel_engine_cs *engine) +{ + struct intel_context *local, *remote; + unsigned long saved_heartbeat; + int pass; + int err; + + /* + * Check that our idle barriers do not interfere with normal + * activity tracking. In particular, check that operating + * on the context image remotely (intel_context_prepare_remote_request), + * which inserts foreign fences into intel_context.active, does not + * clobber the idle-barrier. + */ + + if (intel_engine_pm_is_awake(engine)) { + pr_err("%s is awake before starting %s!\n", + engine->name, __func__); + return -EINVAL; + } + + remote = intel_context_create(engine); + if (IS_ERR(remote)) + return PTR_ERR(remote); + + local = intel_context_create(engine); + if (IS_ERR(local)) { + err = PTR_ERR(local); + goto err_remote; + } + + saved_heartbeat = engine->props.heartbeat_interval_ms; + engine->props.heartbeat_interval_ms = 0; + intel_engine_pm_get(engine); + + for (pass = 0; pass <= 2; pass++) { + err = __remote_sync(local, remote); + if (err) + break; + + err = __remote_sync(engine->kernel_context, remote); + if (err) + break; + + if (i915_active_is_idle(&remote->active)) { + pr_err("remote context is not active; expected idle-barrier (%s pass %d)\n", + engine->name, pass); + err = -EINVAL; + break; + } + } + + intel_engine_pm_put(engine); + engine->props.heartbeat_interval_ms = saved_heartbeat; + + intel_context_put(local); +err_remote: + intel_context_put(remote); + return err; +} + +static int live_remote_context(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err = 0; + + for_each_engine(engine, gt, id) { + err = __live_remote_context(engine); + if (err) + break; + + err = igt_flush_test(gt->i915); + if (err) + break; + } + + return err; +} + +int intel_context_live_selftests(struct drm_i915_private *i915) +{ + static const struct i915_subtest tests[] = { + SUBTEST(live_context_size), + SUBTEST(live_active_context), + SUBTEST(live_remote_context), + }; + struct intel_gt *gt = &i915->gt; + + if (intel_gt_is_wedged(gt)) + return 0; + + return intel_gt_live_subtests(tests, gt); +} diff --git a/drivers/gpu/drm/i915/gt/selftest_engine.c b/drivers/gpu/drm/i915/gt/selftest_engine.c new file mode 100644 index 000000000..f65b118e2 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/selftest_engine.c @@ -0,0 +1,28 @@ +/* + * SPDX-License-Identifier: GPL-2.0 + * + * Copyright © 2018 Intel Corporation + */ + +#include "i915_selftest.h" +#include "selftest_engine.h" + +int intel_engine_live_selftests(struct drm_i915_private *i915) +{ + static int (* const tests[])(struct intel_gt *) = { + live_engine_pm_selftests, + NULL, + }; + struct intel_gt *gt = &i915->gt; + typeof(*tests) *fn; + + for (fn = tests; *fn; fn++) { + int err; + + err = (*fn)(gt); + if (err) + return err; + } + + return 0; +} diff --git a/drivers/gpu/drm/i915/gt/selftest_engine.h b/drivers/gpu/drm/i915/gt/selftest_engine.h new file mode 100644 index 000000000..ab32d09ec --- /dev/null +++ b/drivers/gpu/drm/i915/gt/selftest_engine.h @@ -0,0 +1,14 @@ +/* + * SPDX-License-Identifier: GPL-2.0 + * + * Copyright © 2019 Intel Corporation + */ + +#ifndef SELFTEST_ENGINE_H +#define SELFTEST_ENGINE_H + +struct intel_gt; + +int live_engine_pm_selftests(struct intel_gt *gt); + +#endif diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_cs.c b/drivers/gpu/drm/i915/gt/selftest_engine_cs.c new file mode 100644 index 000000000..729c3c7b1 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/selftest_engine_cs.c @@ -0,0 +1,420 @@ +/* + * SPDX-License-Identifier: GPL-2.0 + * + * Copyright © 2018 Intel Corporation + */ + +#include <linux/sort.h> + +#include "intel_gt_pm.h" +#include "intel_rps.h" + +#include "i915_selftest.h" +#include "selftests/igt_flush_test.h" + +#define COUNT 5 + +static int cmp_u32(const void *A, const void *B) +{ + const u32 *a = A, *b = B; + + return *a - *b; +} + +static void perf_begin(struct intel_gt *gt) +{ + intel_gt_pm_get(gt); + + /* Boost gpufreq to max [waitboost] and keep it fixed */ + atomic_inc(>->rps.num_waiters); + schedule_work(>->rps.work); + flush_work(>->rps.work); +} + +static int perf_end(struct intel_gt *gt) +{ + atomic_dec(>->rps.num_waiters); + intel_gt_pm_put(gt); + + return igt_flush_test(gt->i915); +} + +static int write_timestamp(struct i915_request *rq, int slot) +{ + u32 cmd; + u32 *cs; + + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + cmd = MI_STORE_REGISTER_MEM | MI_USE_GGTT; + if (INTEL_GEN(rq->engine->i915) >= 8) + cmd++; + *cs++ = cmd; + *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(rq->engine->mmio_base)); + *cs++ = i915_request_timeline(rq)->hwsp_offset + slot * sizeof(u32); + *cs++ = 0; + + intel_ring_advance(rq, cs); + + return 0; +} + +static struct i915_vma *create_empty_batch(struct intel_context *ce) +{ + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + u32 *cs; + int err; + + obj = i915_gem_object_create_internal(ce->engine->i915, PAGE_SIZE); + if (IS_ERR(obj)) + return ERR_CAST(obj); + + cs = i915_gem_object_pin_map(obj, I915_MAP_WB); + if (IS_ERR(cs)) { + err = PTR_ERR(cs); + goto err_put; + } + + cs[0] = MI_BATCH_BUFFER_END; + + i915_gem_object_flush_map(obj); + + vma = i915_vma_instance(obj, ce->vm, NULL); + if (IS_ERR(vma)) { + err = PTR_ERR(vma); + goto err_unpin; + } + + err = i915_vma_pin(vma, 0, 0, PIN_USER); + if (err) + goto err_unpin; + + i915_gem_object_unpin_map(obj); + return vma; + +err_unpin: + i915_gem_object_unpin_map(obj); +err_put: + i915_gem_object_put(obj); + return ERR_PTR(err); +} + +static u32 trifilter(u32 *a) +{ + u64 sum; + + sort(a, COUNT, sizeof(*a), cmp_u32, NULL); + + sum = mul_u32_u32(a[2], 2); + sum += a[1]; + sum += a[3]; + + return sum >> 2; +} + +static int perf_mi_bb_start(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err = 0; + + if (INTEL_GEN(gt->i915) < 7) /* for per-engine CS_TIMESTAMP */ + return 0; + + perf_begin(gt); + for_each_engine(engine, gt, id) { + struct intel_context *ce = engine->kernel_context; + struct i915_vma *batch; + u32 cycles[COUNT]; + int i; + + intel_engine_pm_get(engine); + + batch = create_empty_batch(ce); + if (IS_ERR(batch)) { + err = PTR_ERR(batch); + intel_engine_pm_put(engine); + break; + } + + err = i915_vma_sync(batch); + if (err) { + intel_engine_pm_put(engine); + i915_vma_put(batch); + break; + } + + for (i = 0; i < ARRAY_SIZE(cycles); i++) { + struct i915_request *rq; + + rq = i915_request_create(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + break; + } + + err = write_timestamp(rq, 2); + if (err) + goto out; + + err = rq->engine->emit_bb_start(rq, + batch->node.start, 8, + 0); + if (err) + goto out; + + err = write_timestamp(rq, 3); + if (err) + goto out; + +out: + i915_request_get(rq); + i915_request_add(rq); + + if (i915_request_wait(rq, 0, HZ / 5) < 0) + err = -EIO; + i915_request_put(rq); + if (err) + break; + + cycles[i] = rq->hwsp_seqno[3] - rq->hwsp_seqno[2]; + } + i915_vma_put(batch); + intel_engine_pm_put(engine); + if (err) + break; + + pr_info("%s: MI_BB_START cycles: %u\n", + engine->name, trifilter(cycles)); + } + if (perf_end(gt)) + err = -EIO; + + return err; +} + +static struct i915_vma *create_nop_batch(struct intel_context *ce) +{ + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + u32 *cs; + int err; + + obj = i915_gem_object_create_internal(ce->engine->i915, SZ_64K); + if (IS_ERR(obj)) + return ERR_CAST(obj); + + cs = i915_gem_object_pin_map(obj, I915_MAP_WB); + if (IS_ERR(cs)) { + err = PTR_ERR(cs); + goto err_put; + } + + memset(cs, 0, SZ_64K); + cs[SZ_64K / sizeof(*cs) - 1] = MI_BATCH_BUFFER_END; + + i915_gem_object_flush_map(obj); + + vma = i915_vma_instance(obj, ce->vm, NULL); + if (IS_ERR(vma)) { + err = PTR_ERR(vma); + goto err_unpin; + } + + err = i915_vma_pin(vma, 0, 0, PIN_USER); + if (err) + goto err_unpin; + + i915_gem_object_unpin_map(obj); + return vma; + +err_unpin: + i915_gem_object_unpin_map(obj); +err_put: + i915_gem_object_put(obj); + return ERR_PTR(err); +} + +static int perf_mi_noop(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err = 0; + + if (INTEL_GEN(gt->i915) < 7) /* for per-engine CS_TIMESTAMP */ + return 0; + + perf_begin(gt); + for_each_engine(engine, gt, id) { + struct intel_context *ce = engine->kernel_context; + struct i915_vma *base, *nop; + u32 cycles[COUNT]; + int i; + + intel_engine_pm_get(engine); + + base = create_empty_batch(ce); + if (IS_ERR(base)) { + err = PTR_ERR(base); + intel_engine_pm_put(engine); + break; + } + + err = i915_vma_sync(base); + if (err) { + i915_vma_put(base); + intel_engine_pm_put(engine); + break; + } + + nop = create_nop_batch(ce); + if (IS_ERR(nop)) { + err = PTR_ERR(nop); + i915_vma_put(base); + intel_engine_pm_put(engine); + break; + } + + err = i915_vma_sync(nop); + if (err) { + i915_vma_put(nop); + i915_vma_put(base); + intel_engine_pm_put(engine); + break; + } + + for (i = 0; i < ARRAY_SIZE(cycles); i++) { + struct i915_request *rq; + + rq = i915_request_create(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + break; + } + + err = write_timestamp(rq, 2); + if (err) + goto out; + + err = rq->engine->emit_bb_start(rq, + base->node.start, 8, + 0); + if (err) + goto out; + + err = write_timestamp(rq, 3); + if (err) + goto out; + + err = rq->engine->emit_bb_start(rq, + nop->node.start, + nop->node.size, + 0); + if (err) + goto out; + + err = write_timestamp(rq, 4); + if (err) + goto out; + +out: + i915_request_get(rq); + i915_request_add(rq); + + if (i915_request_wait(rq, 0, HZ / 5) < 0) + err = -EIO; + i915_request_put(rq); + if (err) + break; + + cycles[i] = + (rq->hwsp_seqno[4] - rq->hwsp_seqno[3]) - + (rq->hwsp_seqno[3] - rq->hwsp_seqno[2]); + } + i915_vma_put(nop); + i915_vma_put(base); + intel_engine_pm_put(engine); + if (err) + break; + + pr_info("%s: 16K MI_NOOP cycles: %u\n", + engine->name, trifilter(cycles)); + } + if (perf_end(gt)) + err = -EIO; + + return err; +} + +int intel_engine_cs_perf_selftests(struct drm_i915_private *i915) +{ + static const struct i915_subtest tests[] = { + SUBTEST(perf_mi_bb_start), + SUBTEST(perf_mi_noop), + }; + + if (intel_gt_is_wedged(&i915->gt)) + return 0; + + return intel_gt_live_subtests(tests, &i915->gt); +} + +static int intel_mmio_bases_check(void *arg) +{ + int i, j; + + for (i = 0; i < ARRAY_SIZE(intel_engines); i++) { + const struct engine_info *info = &intel_engines[i]; + u8 prev = U8_MAX; + + for (j = 0; j < MAX_MMIO_BASES; j++) { + u8 gen = info->mmio_bases[j].gen; + u32 base = info->mmio_bases[j].base; + + if (gen >= prev) { + pr_err("%s(%s, class:%d, instance:%d): mmio base for gen %x is before the one for gen %x\n", + __func__, + intel_engine_class_repr(info->class), + info->class, info->instance, + prev, gen); + return -EINVAL; + } + + if (gen == 0) + break; + + if (!base) { + pr_err("%s(%s, class:%d, instance:%d): invalid mmio base (%x) for gen %x at entry %u\n", + __func__, + intel_engine_class_repr(info->class), + info->class, info->instance, + base, gen, j); + return -EINVAL; + } + + prev = gen; + } + + pr_debug("%s: min gen supported for %s%d is %d\n", + __func__, + intel_engine_class_repr(info->class), + info->instance, + prev); + } + + return 0; +} + +int intel_engine_cs_mock_selftests(void) +{ + static const struct i915_subtest tests[] = { + SUBTEST(intel_mmio_bases_check), + }; + + return i915_subtests(tests, NULL); +} diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c new file mode 100644 index 000000000..e73854dd2 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c @@ -0,0 +1,394 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2018 Intel Corporation + */ + +#include <linux/sort.h> + +#include "i915_drv.h" + +#include "intel_gt_requests.h" +#include "i915_selftest.h" +#include "selftest_engine_heartbeat.h" + +static int timeline_sync(struct intel_timeline *tl) +{ + struct dma_fence *fence; + long timeout; + + fence = i915_active_fence_get(&tl->last_request); + if (!fence) + return 0; + + timeout = dma_fence_wait_timeout(fence, true, HZ / 2); + dma_fence_put(fence); + if (timeout < 0) + return timeout; + + return 0; +} + +static int engine_sync_barrier(struct intel_engine_cs *engine) +{ + return timeline_sync(engine->kernel_context->timeline); +} + +struct pulse { + struct i915_active active; + struct kref kref; +}; + +static int pulse_active(struct i915_active *active) +{ + kref_get(&container_of(active, struct pulse, active)->kref); + return 0; +} + +static void pulse_free(struct kref *kref) +{ + struct pulse *p = container_of(kref, typeof(*p), kref); + + i915_active_fini(&p->active); + kfree(p); +} + +static void pulse_put(struct pulse *p) +{ + kref_put(&p->kref, pulse_free); +} + +static void pulse_retire(struct i915_active *active) +{ + pulse_put(container_of(active, struct pulse, active)); +} + +static struct pulse *pulse_create(void) +{ + struct pulse *p; + + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return p; + + kref_init(&p->kref); + i915_active_init(&p->active, pulse_active, pulse_retire); + + return p; +} + +static void pulse_unlock_wait(struct pulse *p) +{ + i915_active_unlock_wait(&p->active); +} + +static int __live_idle_pulse(struct intel_engine_cs *engine, + int (*fn)(struct intel_engine_cs *cs)) +{ + struct pulse *p; + int err; + + GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); + + p = pulse_create(); + if (!p) + return -ENOMEM; + + err = i915_active_acquire(&p->active); + if (err) + goto out; + + err = i915_active_acquire_preallocate_barrier(&p->active, engine); + if (err) { + i915_active_release(&p->active); + goto out; + } + + i915_active_acquire_barrier(&p->active); + i915_active_release(&p->active); + + GEM_BUG_ON(i915_active_is_idle(&p->active)); + GEM_BUG_ON(llist_empty(&engine->barrier_tasks)); + + err = fn(engine); + if (err) + goto out; + + GEM_BUG_ON(!llist_empty(&engine->barrier_tasks)); + + if (engine_sync_barrier(engine)) { + struct drm_printer m = drm_err_printer("pulse"); + + pr_err("%s: no heartbeat pulse?\n", engine->name); + intel_engine_dump(engine, &m, "%s", engine->name); + + err = -ETIME; + goto out; + } + + GEM_BUG_ON(READ_ONCE(engine->serial) != engine->wakeref_serial); + + pulse_unlock_wait(p); /* synchronize with the retirement callback */ + + if (!i915_active_is_idle(&p->active)) { + struct drm_printer m = drm_err_printer("pulse"); + + pr_err("%s: heartbeat pulse did not flush idle tasks\n", + engine->name); + i915_active_print(&p->active, &m); + + err = -EINVAL; + goto out; + } + +out: + pulse_put(p); + return err; +} + +static int live_idle_flush(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err = 0; + + /* Check that we can flush the idle barriers */ + + for_each_engine(engine, gt, id) { + st_engine_heartbeat_disable(engine); + err = __live_idle_pulse(engine, intel_engine_flush_barriers); + st_engine_heartbeat_enable(engine); + if (err) + break; + } + + return err; +} + +static int live_idle_pulse(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err = 0; + + /* Check that heartbeat pulses flush the idle barriers */ + + for_each_engine(engine, gt, id) { + st_engine_heartbeat_disable(engine); + err = __live_idle_pulse(engine, intel_engine_pulse); + st_engine_heartbeat_enable(engine); + if (err && err != -ENODEV) + break; + + err = 0; + } + + return err; +} + +static int cmp_u32(const void *_a, const void *_b) +{ + const u32 *a = _a, *b = _b; + + return *a - *b; +} + +static int __live_heartbeat_fast(struct intel_engine_cs *engine) +{ + struct intel_context *ce; + struct i915_request *rq; + ktime_t t0, t1; + u32 times[5]; + int err; + int i; + + ce = intel_context_create(engine); + if (IS_ERR(ce)) + return PTR_ERR(ce); + + intel_engine_pm_get(engine); + + err = intel_engine_set_heartbeat(engine, 1); + if (err) + goto err_pm; + + for (i = 0; i < ARRAY_SIZE(times); i++) { + /* Manufacture a tick */ + do { + while (READ_ONCE(engine->heartbeat.systole)) + flush_delayed_work(&engine->heartbeat.work); + + engine->serial++; /* quick, pretend we are not idle! */ + flush_delayed_work(&engine->heartbeat.work); + if (!delayed_work_pending(&engine->heartbeat.work)) { + pr_err("%s: heartbeat did not start\n", + engine->name); + err = -EINVAL; + goto err_pm; + } + + rcu_read_lock(); + rq = READ_ONCE(engine->heartbeat.systole); + if (rq) + rq = i915_request_get_rcu(rq); + rcu_read_unlock(); + } while (!rq); + + t0 = ktime_get(); + while (rq == READ_ONCE(engine->heartbeat.systole)) + yield(); /* work is on the local cpu! */ + t1 = ktime_get(); + + i915_request_put(rq); + times[i] = ktime_us_delta(t1, t0); + } + + sort(times, ARRAY_SIZE(times), sizeof(times[0]), cmp_u32, NULL); + + pr_info("%s: Heartbeat delay: %uus [%u, %u]\n", + engine->name, + times[ARRAY_SIZE(times) / 2], + times[0], + times[ARRAY_SIZE(times) - 1]); + + /* Min work delay is 2 * 2 (worst), +1 for scheduling, +1 for slack */ + if (times[ARRAY_SIZE(times) / 2] > jiffies_to_usecs(6)) { + pr_err("%s: Heartbeat delay was %uus, expected less than %dus\n", + engine->name, + times[ARRAY_SIZE(times) / 2], + jiffies_to_usecs(6)); + err = -EINVAL; + } + + intel_engine_set_heartbeat(engine, CONFIG_DRM_I915_HEARTBEAT_INTERVAL); +err_pm: + intel_engine_pm_put(engine); + intel_context_put(ce); + return err; +} + +static int live_heartbeat_fast(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err = 0; + + /* Check that the heartbeat ticks at the desired rate. */ + if (!CONFIG_DRM_I915_HEARTBEAT_INTERVAL) + return 0; + + for_each_engine(engine, gt, id) { + err = __live_heartbeat_fast(engine); + if (err) + break; + } + + return err; +} + +static int __live_heartbeat_off(struct intel_engine_cs *engine) +{ + int err; + + intel_engine_pm_get(engine); + + engine->serial++; + flush_delayed_work(&engine->heartbeat.work); + if (!delayed_work_pending(&engine->heartbeat.work)) { + pr_err("%s: heartbeat not running\n", + engine->name); + err = -EINVAL; + goto err_pm; + } + + err = intel_engine_set_heartbeat(engine, 0); + if (err) + goto err_pm; + + engine->serial++; + flush_delayed_work(&engine->heartbeat.work); + if (delayed_work_pending(&engine->heartbeat.work)) { + pr_err("%s: heartbeat still running\n", + engine->name); + err = -EINVAL; + goto err_beat; + } + + if (READ_ONCE(engine->heartbeat.systole)) { + pr_err("%s: heartbeat still allocated\n", + engine->name); + err = -EINVAL; + goto err_beat; + } + +err_beat: + intel_engine_set_heartbeat(engine, CONFIG_DRM_I915_HEARTBEAT_INTERVAL); +err_pm: + intel_engine_pm_put(engine); + return err; +} + +static int live_heartbeat_off(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err = 0; + + /* Check that we can turn off heartbeat and not interrupt VIP */ + if (!CONFIG_DRM_I915_HEARTBEAT_INTERVAL) + return 0; + + for_each_engine(engine, gt, id) { + if (!intel_engine_has_preemption(engine)) + continue; + + err = __live_heartbeat_off(engine); + if (err) + break; + } + + return err; +} + +int intel_heartbeat_live_selftests(struct drm_i915_private *i915) +{ + static const struct i915_subtest tests[] = { + SUBTEST(live_idle_flush), + SUBTEST(live_idle_pulse), + SUBTEST(live_heartbeat_fast), + SUBTEST(live_heartbeat_off), + }; + int saved_hangcheck; + int err; + + if (intel_gt_is_wedged(&i915->gt)) + return 0; + + saved_hangcheck = i915->params.enable_hangcheck; + i915->params.enable_hangcheck = INT_MAX; + + err = intel_gt_live_subtests(tests, &i915->gt); + + i915->params.enable_hangcheck = saved_hangcheck; + return err; +} + +void st_engine_heartbeat_disable(struct intel_engine_cs *engine) +{ + engine->props.heartbeat_interval_ms = 0; + + intel_engine_pm_get(engine); + intel_engine_park_heartbeat(engine); +} + +void st_engine_heartbeat_enable(struct intel_engine_cs *engine) +{ + intel_engine_pm_put(engine); + + engine->props.heartbeat_interval_ms = + engine->defaults.heartbeat_interval_ms; +} diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.h b/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.h new file mode 100644 index 000000000..cd27113d5 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2020 Intel Corporation + */ + +#ifndef SELFTEST_ENGINE_HEARTBEAT_H +#define SELFTEST_ENGINE_HEARTBEAT_H + +struct intel_engine_cs; + +void st_engine_heartbeat_disable(struct intel_engine_cs *engine); +void st_engine_heartbeat_enable(struct intel_engine_cs *engine); + +#endif /* SELFTEST_ENGINE_HEARTBEAT_H */ diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c new file mode 100644 index 000000000..b08fc5390 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c @@ -0,0 +1,185 @@ +/* + * SPDX-License-Identifier: GPL-2.0 + * + * Copyright © 2018 Intel Corporation + */ + +#include "i915_selftest.h" +#include "selftest_engine.h" +#include "selftest_engine_heartbeat.h" +#include "selftests/igt_atomic.h" +#include "selftests/igt_flush_test.h" +#include "selftests/igt_spinner.h" + +static int live_engine_busy_stats(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + struct igt_spinner spin; + int err = 0; + + /* + * Check that if an engine supports busy-stats, they tell the truth. + */ + + if (igt_spinner_init(&spin, gt)) + return -ENOMEM; + + GEM_BUG_ON(intel_gt_pm_is_awake(gt)); + for_each_engine(engine, gt, id) { + struct i915_request *rq; + ktime_t de, dt; + ktime_t t[2]; + + if (!intel_engine_supports_stats(engine)) + continue; + + if (!intel_engine_can_store_dword(engine)) + continue; + + if (intel_gt_pm_wait_for_idle(gt)) { + err = -EBUSY; + break; + } + + st_engine_heartbeat_disable(engine); + + ENGINE_TRACE(engine, "measuring idle time\n"); + preempt_disable(); + de = intel_engine_get_busy_time(engine, &t[0]); + udelay(100); + de = ktime_sub(intel_engine_get_busy_time(engine, &t[1]), de); + preempt_enable(); + dt = ktime_sub(t[1], t[0]); + if (de < 0 || de > 10) { + pr_err("%s: reported %lldns [%d%%] busyness while sleeping [for %lldns]\n", + engine->name, + de, (int)div64_u64(100 * de, dt), dt); + GEM_TRACE_DUMP(); + err = -EINVAL; + goto end; + } + + /* 100% busy */ + rq = igt_spinner_create_request(&spin, + engine->kernel_context, + MI_NOOP); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto end; + } + i915_request_add(rq); + + if (!igt_wait_for_spinner(&spin, rq)) { + intel_gt_set_wedged(engine->gt); + err = -ETIME; + goto end; + } + + ENGINE_TRACE(engine, "measuring busy time\n"); + preempt_disable(); + de = intel_engine_get_busy_time(engine, &t[0]); + udelay(100); + de = ktime_sub(intel_engine_get_busy_time(engine, &t[1]), de); + preempt_enable(); + dt = ktime_sub(t[1], t[0]); + if (100 * de < 95 * dt || 95 * de > 100 * dt) { + pr_err("%s: reported %lldns [%d%%] busyness while spinning [for %lldns]\n", + engine->name, + de, (int)div64_u64(100 * de, dt), dt); + GEM_TRACE_DUMP(); + err = -EINVAL; + goto end; + } + +end: + st_engine_heartbeat_enable(engine); + igt_spinner_end(&spin); + if (igt_flush_test(gt->i915)) + err = -EIO; + if (err) + break; + } + + igt_spinner_fini(&spin); + if (igt_flush_test(gt->i915)) + err = -EIO; + return err; +} + +static int live_engine_pm(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + + /* + * Check we can call intel_engine_pm_put from any context. No + * failures are reported directly, but if we mess up lockdep should + * tell us. + */ + if (intel_gt_pm_wait_for_idle(gt)) { + pr_err("Unable to flush GT pm before test\n"); + return -EBUSY; + } + + GEM_BUG_ON(intel_gt_pm_is_awake(gt)); + for_each_engine(engine, gt, id) { + const typeof(*igt_atomic_phases) *p; + + for (p = igt_atomic_phases; p->name; p++) { + /* + * Acquisition is always synchronous, except if we + * know that the engine is already awake, in which + * case we should use intel_engine_pm_get_if_awake() + * to atomically grab the wakeref. + * + * In practice, + * intel_engine_pm_get(); + * intel_engine_pm_put(); + * occurs in one thread, while simultaneously + * intel_engine_pm_get_if_awake(); + * intel_engine_pm_put(); + * occurs from atomic context in another. + */ + GEM_BUG_ON(intel_engine_pm_is_awake(engine)); + intel_engine_pm_get(engine); + + p->critical_section_begin(); + if (!intel_engine_pm_get_if_awake(engine)) + pr_err("intel_engine_pm_get_if_awake(%s) failed under %s\n", + engine->name, p->name); + else + intel_engine_pm_put_async(engine); + intel_engine_pm_put_async(engine); + p->critical_section_end(); + + intel_engine_pm_flush(engine); + + if (intel_engine_pm_is_awake(engine)) { + pr_err("%s is still awake after flushing pm\n", + engine->name); + return -EINVAL; + } + + /* gt wakeref is async (deferred to workqueue) */ + if (intel_gt_pm_wait_for_idle(gt)) { + pr_err("GT failed to idle\n"); + return -EINVAL; + } + } + } + + return 0; +} + +int live_engine_pm_selftests(struct intel_gt *gt) +{ + static const struct i915_subtest tests[] = { + SUBTEST(live_engine_busy_stats), + SUBTEST(live_engine_pm), + }; + + return intel_gt_live_subtests(tests, gt); +} diff --git a/drivers/gpu/drm/i915/gt/selftest_gt_pm.c b/drivers/gpu/drm/i915/gt/selftest_gt_pm.c new file mode 100644 index 000000000..6180a47c1 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/selftest_gt_pm.c @@ -0,0 +1,219 @@ + +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#include <linux/sort.h> + +#include "intel_gt_clock_utils.h" + +#include "selftest_llc.h" +#include "selftest_rc6.h" +#include "selftest_rps.h" + +static int cmp_u64(const void *A, const void *B) +{ + const u64 *a = A, *b = B; + + if (a < b) + return -1; + else if (a > b) + return 1; + else + return 0; +} + +static int cmp_u32(const void *A, const void *B) +{ + const u32 *a = A, *b = B; + + if (a < b) + return -1; + else if (a > b) + return 1; + else + return 0; +} + +static void measure_clocks(struct intel_engine_cs *engine, + u32 *out_cycles, ktime_t *out_dt) +{ + ktime_t dt[5]; + u32 cycles[5]; + int i; + + for (i = 0; i < 5; i++) { + preempt_disable(); + cycles[i] = -ENGINE_READ_FW(engine, RING_TIMESTAMP); + dt[i] = ktime_get(); + + udelay(1000); + + dt[i] = ktime_sub(ktime_get(), dt[i]); + cycles[i] += ENGINE_READ_FW(engine, RING_TIMESTAMP); + preempt_enable(); + } + + /* Use the median of both cycle/dt; close enough */ + sort(cycles, 5, sizeof(*cycles), cmp_u32, NULL); + *out_cycles = (cycles[1] + 2 * cycles[2] + cycles[3]) / 4; + + sort(dt, 5, sizeof(*dt), cmp_u64, NULL); + *out_dt = div_u64(dt[1] + 2 * dt[2] + dt[3], 4); +} + +static int live_gt_clocks(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err = 0; + + if (!RUNTIME_INFO(gt->i915)->cs_timestamp_frequency_hz) { /* unknown */ + pr_info("CS_TIMESTAMP frequency unknown\n"); + return 0; + } + + if (INTEL_GEN(gt->i915) < 4) /* Any CS_TIMESTAMP? */ + return 0; + + if (IS_GEN(gt->i915, 5)) + /* + * XXX CS_TIMESTAMP low dword is dysfunctional? + * + * Ville's experiments indicate the high dword still works, + * but at a correspondingly reduced frequency. + */ + return 0; + + if (IS_GEN(gt->i915, 4)) + /* + * XXX CS_TIMESTAMP appears gibberish + * + * Ville's experiments indicate that it mostly appears 'stuck' + * in that we see the register report the same cycle count + * for a couple of reads. + */ + return 0; + + intel_gt_pm_get(gt); + intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL); + + for_each_engine(engine, gt, id) { + u32 cycles; + u32 expected; + u64 time; + u64 dt; + + if (INTEL_GEN(engine->i915) < 7 && engine->id != RCS0) + continue; + + measure_clocks(engine, &cycles, &dt); + + time = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles); + expected = i915_cs_timestamp_ns_to_ticks(engine->i915, dt); + + pr_info("%s: TIMESTAMP %d cycles [%lldns] in %lldns [%d cycles], using CS clock frequency of %uKHz\n", + engine->name, cycles, time, dt, expected, + RUNTIME_INFO(engine->i915)->cs_timestamp_frequency_hz / 1000); + + if (9 * time < 8 * dt || 8 * time > 9 * dt) { + pr_err("%s: CS ticks did not match walltime!\n", + engine->name); + err = -EINVAL; + break; + } + + if (9 * expected < 8 * cycles || 8 * expected > 9 * cycles) { + pr_err("%s: walltime did not match CS ticks!\n", + engine->name); + err = -EINVAL; + break; + } + } + + intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL); + intel_gt_pm_put(gt); + + return err; +} + +static int live_gt_resume(void *arg) +{ + struct intel_gt *gt = arg; + IGT_TIMEOUT(end_time); + int err; + + /* Do several suspend/resume cycles to check we don't explode! */ + do { + intel_gt_suspend_prepare(gt); + intel_gt_suspend_late(gt); + + if (gt->rc6.enabled) { + pr_err("rc6 still enabled after suspend!\n"); + intel_gt_set_wedged_on_init(gt); + err = -EINVAL; + break; + } + + err = intel_gt_resume(gt); + if (err) + break; + + if (gt->rc6.supported && !gt->rc6.enabled) { + pr_err("rc6 not enabled upon resume!\n"); + intel_gt_set_wedged_on_init(gt); + err = -EINVAL; + break; + } + + err = st_llc_verify(>->llc); + if (err) { + pr_err("llc state not restored upon resume!\n"); + intel_gt_set_wedged_on_init(gt); + break; + } + } while (!__igt_timeout(end_time, NULL)); + + return err; +} + +int intel_gt_pm_live_selftests(struct drm_i915_private *i915) +{ + static const struct i915_subtest tests[] = { + SUBTEST(live_gt_clocks), + SUBTEST(live_rc6_manual), + SUBTEST(live_rps_clock_interval), + SUBTEST(live_rps_control), + SUBTEST(live_rps_frequency_cs), + SUBTEST(live_rps_frequency_srm), + SUBTEST(live_rps_power), + SUBTEST(live_rps_interrupt), + SUBTEST(live_rps_dynamic), + SUBTEST(live_gt_resume), + }; + + if (intel_gt_is_wedged(&i915->gt)) + return 0; + + return intel_gt_live_subtests(tests, &i915->gt); +} + +int intel_gt_pm_late_selftests(struct drm_i915_private *i915) +{ + static const struct i915_subtest tests[] = { + /* + * These tests may leave the system in an undesirable state. + * They are intended to be run last in CI and the system + * rebooted afterwards. + */ + SUBTEST(live_rc6_ctx_wa), + }; + + if (intel_gt_is_wedged(&i915->gt)) + return 0; + + return intel_gt_live_subtests(tests, &i915->gt); +} diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c new file mode 100644 index 000000000..fb5ebf930 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c @@ -0,0 +1,1716 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include <linux/kthread.h> + +#include "gem/i915_gem_context.h" + +#include "intel_gt.h" +#include "intel_engine_heartbeat.h" +#include "intel_engine_pm.h" +#include "selftest_engine_heartbeat.h" + +#include "i915_selftest.h" +#include "selftests/i915_random.h" +#include "selftests/igt_flush_test.h" +#include "selftests/igt_reset.h" +#include "selftests/igt_atomic.h" + +#include "selftests/mock_drm.h" + +#include "gem/selftests/mock_context.h" +#include "gem/selftests/igt_gem_utils.h" + +#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */ + +struct hang { + struct intel_gt *gt; + struct drm_i915_gem_object *hws; + struct drm_i915_gem_object *obj; + struct i915_gem_context *ctx; + u32 *seqno; + u32 *batch; +}; + +static int hang_init(struct hang *h, struct intel_gt *gt) +{ + void *vaddr; + int err; + + memset(h, 0, sizeof(*h)); + h->gt = gt; + + h->ctx = kernel_context(gt->i915); + if (IS_ERR(h->ctx)) + return PTR_ERR(h->ctx); + + GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx)); + + h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); + if (IS_ERR(h->hws)) { + err = PTR_ERR(h->hws); + goto err_ctx; + } + + h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); + if (IS_ERR(h->obj)) { + err = PTR_ERR(h->obj); + goto err_hws; + } + + i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC); + vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB); + if (IS_ERR(vaddr)) { + err = PTR_ERR(vaddr); + goto err_obj; + } + h->seqno = memset(vaddr, 0xff, PAGE_SIZE); + + vaddr = i915_gem_object_pin_map(h->obj, + i915_coherent_map_type(gt->i915)); + if (IS_ERR(vaddr)) { + err = PTR_ERR(vaddr); + goto err_unpin_hws; + } + h->batch = vaddr; + + return 0; + +err_unpin_hws: + i915_gem_object_unpin_map(h->hws); +err_obj: + i915_gem_object_put(h->obj); +err_hws: + i915_gem_object_put(h->hws); +err_ctx: + kernel_context_close(h->ctx); + return err; +} + +static u64 hws_address(const struct i915_vma *hws, + const struct i915_request *rq) +{ + return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context); +} + +static int move_to_active(struct i915_vma *vma, + struct i915_request *rq, + unsigned int flags) +{ + int err; + + i915_vma_lock(vma); + err = i915_request_await_object(rq, vma->obj, + flags & EXEC_OBJECT_WRITE); + if (err == 0) + err = i915_vma_move_to_active(vma, rq, flags); + i915_vma_unlock(vma); + + return err; +} + +static struct i915_request * +hang_create_request(struct hang *h, struct intel_engine_cs *engine) +{ + struct intel_gt *gt = h->gt; + struct i915_address_space *vm = i915_gem_context_get_vm_rcu(h->ctx); + struct drm_i915_gem_object *obj; + struct i915_request *rq = NULL; + struct i915_vma *hws, *vma; + unsigned int flags; + void *vaddr; + u32 *batch; + int err; + + obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); + if (IS_ERR(obj)) { + i915_vm_put(vm); + return ERR_CAST(obj); + } + + vaddr = i915_gem_object_pin_map(obj, i915_coherent_map_type(gt->i915)); + if (IS_ERR(vaddr)) { + i915_gem_object_put(obj); + i915_vm_put(vm); + return ERR_CAST(vaddr); + } + + i915_gem_object_unpin_map(h->obj); + i915_gem_object_put(h->obj); + + h->obj = obj; + h->batch = vaddr; + + vma = i915_vma_instance(h->obj, vm, NULL); + if (IS_ERR(vma)) { + i915_vm_put(vm); + return ERR_CAST(vma); + } + + hws = i915_vma_instance(h->hws, vm, NULL); + if (IS_ERR(hws)) { + i915_vm_put(vm); + return ERR_CAST(hws); + } + + err = i915_vma_pin(vma, 0, 0, PIN_USER); + if (err) { + i915_vm_put(vm); + return ERR_PTR(err); + } + + err = i915_vma_pin(hws, 0, 0, PIN_USER); + if (err) + goto unpin_vma; + + rq = igt_request_alloc(h->ctx, engine); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto unpin_hws; + } + + err = move_to_active(vma, rq, 0); + if (err) + goto cancel_rq; + + err = move_to_active(hws, rq, 0); + if (err) + goto cancel_rq; + + batch = h->batch; + if (INTEL_GEN(gt->i915) >= 8) { + *batch++ = MI_STORE_DWORD_IMM_GEN4; + *batch++ = lower_32_bits(hws_address(hws, rq)); + *batch++ = upper_32_bits(hws_address(hws, rq)); + *batch++ = rq->fence.seqno; + *batch++ = MI_NOOP; + + memset(batch, 0, 1024); + batch += 1024 / sizeof(*batch); + + *batch++ = MI_NOOP; + *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1; + *batch++ = lower_32_bits(vma->node.start); + *batch++ = upper_32_bits(vma->node.start); + } else if (INTEL_GEN(gt->i915) >= 6) { + *batch++ = MI_STORE_DWORD_IMM_GEN4; + *batch++ = 0; + *batch++ = lower_32_bits(hws_address(hws, rq)); + *batch++ = rq->fence.seqno; + *batch++ = MI_NOOP; + + memset(batch, 0, 1024); + batch += 1024 / sizeof(*batch); + + *batch++ = MI_NOOP; + *batch++ = MI_BATCH_BUFFER_START | 1 << 8; + *batch++ = lower_32_bits(vma->node.start); + } else if (INTEL_GEN(gt->i915) >= 4) { + *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; + *batch++ = 0; + *batch++ = lower_32_bits(hws_address(hws, rq)); + *batch++ = rq->fence.seqno; + *batch++ = MI_NOOP; + + memset(batch, 0, 1024); + batch += 1024 / sizeof(*batch); + + *batch++ = MI_NOOP; + *batch++ = MI_BATCH_BUFFER_START | 2 << 6; + *batch++ = lower_32_bits(vma->node.start); + } else { + *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL; + *batch++ = lower_32_bits(hws_address(hws, rq)); + *batch++ = rq->fence.seqno; + *batch++ = MI_NOOP; + + memset(batch, 0, 1024); + batch += 1024 / sizeof(*batch); + + *batch++ = MI_NOOP; + *batch++ = MI_BATCH_BUFFER_START | 2 << 6; + *batch++ = lower_32_bits(vma->node.start); + } + *batch++ = MI_BATCH_BUFFER_END; /* not reached */ + intel_gt_chipset_flush(engine->gt); + + if (rq->engine->emit_init_breadcrumb) { + err = rq->engine->emit_init_breadcrumb(rq); + if (err) + goto cancel_rq; + } + + flags = 0; + if (INTEL_GEN(gt->i915) <= 5) + flags |= I915_DISPATCH_SECURE; + + err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags); + +cancel_rq: + if (err) { + i915_request_set_error_once(rq, err); + i915_request_add(rq); + } +unpin_hws: + i915_vma_unpin(hws); +unpin_vma: + i915_vma_unpin(vma); + i915_vm_put(vm); + return err ? ERR_PTR(err) : rq; +} + +static u32 hws_seqno(const struct hang *h, const struct i915_request *rq) +{ + return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]); +} + +static void hang_fini(struct hang *h) +{ + *h->batch = MI_BATCH_BUFFER_END; + intel_gt_chipset_flush(h->gt); + + i915_gem_object_unpin_map(h->obj); + i915_gem_object_put(h->obj); + + i915_gem_object_unpin_map(h->hws); + i915_gem_object_put(h->hws); + + kernel_context_close(h->ctx); + + igt_flush_test(h->gt->i915); +} + +static bool wait_until_running(struct hang *h, struct i915_request *rq) +{ + return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq), + rq->fence.seqno), + 10) && + wait_for(i915_seqno_passed(hws_seqno(h, rq), + rq->fence.seqno), + 1000)); +} + +static int igt_hang_sanitycheck(void *arg) +{ + struct intel_gt *gt = arg; + struct i915_request *rq; + struct intel_engine_cs *engine; + enum intel_engine_id id; + struct hang h; + int err; + + /* Basic check that we can execute our hanging batch */ + + err = hang_init(&h, gt); + if (err) + return err; + + for_each_engine(engine, gt, id) { + struct intel_wedge_me w; + long timeout; + + if (!intel_engine_can_store_dword(engine)) + continue; + + rq = hang_create_request(&h, engine); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + pr_err("Failed to create request for %s, err=%d\n", + engine->name, err); + goto fini; + } + + i915_request_get(rq); + + *h.batch = MI_BATCH_BUFFER_END; + intel_gt_chipset_flush(engine->gt); + + i915_request_add(rq); + + timeout = 0; + intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */) + timeout = i915_request_wait(rq, 0, + MAX_SCHEDULE_TIMEOUT); + if (intel_gt_is_wedged(gt)) + timeout = -EIO; + + i915_request_put(rq); + + if (timeout < 0) { + err = timeout; + pr_err("Wait for request failed on %s, err=%d\n", + engine->name, err); + goto fini; + } + } + +fini: + hang_fini(&h); + return err; +} + +static bool wait_for_idle(struct intel_engine_cs *engine) +{ + return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0; +} + +static int igt_reset_nop(void *arg) +{ + struct intel_gt *gt = arg; + struct i915_gpu_error *global = >->i915->gpu_error; + struct intel_engine_cs *engine; + unsigned int reset_count, count; + enum intel_engine_id id; + IGT_TIMEOUT(end_time); + int err = 0; + + /* Check that we can reset during non-user portions of requests */ + + reset_count = i915_reset_count(global); + count = 0; + do { + for_each_engine(engine, gt, id) { + struct intel_context *ce; + int i; + + ce = intel_context_create(engine); + if (IS_ERR(ce)) { + err = PTR_ERR(ce); + break; + } + + for (i = 0; i < 16; i++) { + struct i915_request *rq; + + rq = intel_context_create_request(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + break; + } + + i915_request_add(rq); + } + + intel_context_put(ce); + } + + igt_global_reset_lock(gt); + intel_gt_reset(gt, ALL_ENGINES, NULL); + igt_global_reset_unlock(gt); + + if (intel_gt_is_wedged(gt)) { + err = -EIO; + break; + } + + if (i915_reset_count(global) != reset_count + ++count) { + pr_err("Full GPU reset not recorded!\n"); + err = -EINVAL; + break; + } + + err = igt_flush_test(gt->i915); + if (err) + break; + } while (time_before(jiffies, end_time)); + pr_info("%s: %d resets\n", __func__, count); + + if (igt_flush_test(gt->i915)) + err = -EIO; + return err; +} + +static int igt_reset_nop_engine(void *arg) +{ + struct intel_gt *gt = arg; + struct i915_gpu_error *global = >->i915->gpu_error; + struct intel_engine_cs *engine; + enum intel_engine_id id; + + /* Check that we can engine-reset during non-user portions */ + + if (!intel_has_reset_engine(gt)) + return 0; + + for_each_engine(engine, gt, id) { + unsigned int reset_count, reset_engine_count, count; + struct intel_context *ce; + IGT_TIMEOUT(end_time); + int err; + + ce = intel_context_create(engine); + if (IS_ERR(ce)) + return PTR_ERR(ce); + + reset_count = i915_reset_count(global); + reset_engine_count = i915_reset_engine_count(global, engine); + count = 0; + + st_engine_heartbeat_disable(engine); + set_bit(I915_RESET_ENGINE + id, >->reset.flags); + do { + int i; + + if (!wait_for_idle(engine)) { + pr_err("%s failed to idle before reset\n", + engine->name); + err = -EIO; + break; + } + + for (i = 0; i < 16; i++) { + struct i915_request *rq; + + rq = intel_context_create_request(ce); + if (IS_ERR(rq)) { + struct drm_printer p = + drm_info_printer(gt->i915->drm.dev); + intel_engine_dump(engine, &p, + "%s(%s): failed to submit request\n", + __func__, + engine->name); + + GEM_TRACE("%s(%s): failed to submit request\n", + __func__, + engine->name); + GEM_TRACE_DUMP(); + + intel_gt_set_wedged(gt); + + err = PTR_ERR(rq); + break; + } + + i915_request_add(rq); + } + err = intel_engine_reset(engine, NULL); + if (err) { + pr_err("i915_reset_engine failed\n"); + break; + } + + if (i915_reset_count(global) != reset_count) { + pr_err("Full GPU reset recorded! (engine reset expected)\n"); + err = -EINVAL; + break; + } + + if (i915_reset_engine_count(global, engine) != + reset_engine_count + ++count) { + pr_err("%s engine reset not recorded!\n", + engine->name); + err = -EINVAL; + break; + } + } while (time_before(jiffies, end_time)); + clear_bit(I915_RESET_ENGINE + id, >->reset.flags); + st_engine_heartbeat_enable(engine); + + pr_info("%s(%s): %d resets\n", __func__, engine->name, count); + + intel_context_put(ce); + if (igt_flush_test(gt->i915)) + err = -EIO; + if (err) + return err; + } + + return 0; +} + +static int __igt_reset_engine(struct intel_gt *gt, bool active) +{ + struct i915_gpu_error *global = >->i915->gpu_error; + struct intel_engine_cs *engine; + enum intel_engine_id id; + struct hang h; + int err = 0; + + /* Check that we can issue an engine reset on an idle engine (no-op) */ + + if (!intel_has_reset_engine(gt)) + return 0; + + if (active) { + err = hang_init(&h, gt); + if (err) + return err; + } + + for_each_engine(engine, gt, id) { + unsigned int reset_count, reset_engine_count; + IGT_TIMEOUT(end_time); + + if (active && !intel_engine_can_store_dword(engine)) + continue; + + if (!wait_for_idle(engine)) { + pr_err("%s failed to idle before reset\n", + engine->name); + err = -EIO; + break; + } + + reset_count = i915_reset_count(global); + reset_engine_count = i915_reset_engine_count(global, engine); + + st_engine_heartbeat_disable(engine); + set_bit(I915_RESET_ENGINE + id, >->reset.flags); + do { + if (active) { + struct i915_request *rq; + + rq = hang_create_request(&h, engine); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + break; + } + + i915_request_get(rq); + i915_request_add(rq); + + if (!wait_until_running(&h, rq)) { + struct drm_printer p = drm_info_printer(gt->i915->drm.dev); + + pr_err("%s: Failed to start request %llx, at %x\n", + __func__, rq->fence.seqno, hws_seqno(&h, rq)); + intel_engine_dump(engine, &p, + "%s\n", engine->name); + + i915_request_put(rq); + err = -EIO; + break; + } + + i915_request_put(rq); + } + + err = intel_engine_reset(engine, NULL); + if (err) { + pr_err("i915_reset_engine failed\n"); + break; + } + + if (i915_reset_count(global) != reset_count) { + pr_err("Full GPU reset recorded! (engine reset expected)\n"); + err = -EINVAL; + break; + } + + if (i915_reset_engine_count(global, engine) != + ++reset_engine_count) { + pr_err("%s engine reset not recorded!\n", + engine->name); + err = -EINVAL; + break; + } + } while (time_before(jiffies, end_time)); + clear_bit(I915_RESET_ENGINE + id, >->reset.flags); + st_engine_heartbeat_enable(engine); + + if (err) + break; + + err = igt_flush_test(gt->i915); + if (err) + break; + } + + if (intel_gt_is_wedged(gt)) + err = -EIO; + + if (active) + hang_fini(&h); + + return err; +} + +static int igt_reset_idle_engine(void *arg) +{ + return __igt_reset_engine(arg, false); +} + +static int igt_reset_active_engine(void *arg) +{ + return __igt_reset_engine(arg, true); +} + +struct active_engine { + struct task_struct *task; + struct intel_engine_cs *engine; + unsigned long resets; + unsigned int flags; +}; + +#define TEST_ACTIVE BIT(0) +#define TEST_OTHERS BIT(1) +#define TEST_SELF BIT(2) +#define TEST_PRIORITY BIT(3) + +static int active_request_put(struct i915_request *rq) +{ + int err = 0; + + if (!rq) + return 0; + + if (i915_request_wait(rq, 0, 5 * HZ) < 0) { + GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n", + rq->engine->name, + rq->fence.context, + rq->fence.seqno); + GEM_TRACE_DUMP(); + + intel_gt_set_wedged(rq->engine->gt); + err = -EIO; + } + + i915_request_put(rq); + + return err; +} + +static int active_engine(void *data) +{ + I915_RND_STATE(prng); + struct active_engine *arg = data; + struct intel_engine_cs *engine = arg->engine; + struct i915_request *rq[8] = {}; + struct intel_context *ce[ARRAY_SIZE(rq)]; + unsigned long count; + int err = 0; + + for (count = 0; count < ARRAY_SIZE(ce); count++) { + ce[count] = intel_context_create(engine); + if (IS_ERR(ce[count])) { + err = PTR_ERR(ce[count]); + while (--count) + intel_context_put(ce[count]); + return err; + } + } + + count = 0; + while (!kthread_should_stop()) { + unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1); + struct i915_request *old = rq[idx]; + struct i915_request *new; + + new = intel_context_create_request(ce[idx]); + if (IS_ERR(new)) { + err = PTR_ERR(new); + break; + } + + rq[idx] = i915_request_get(new); + i915_request_add(new); + + if (engine->schedule && arg->flags & TEST_PRIORITY) { + struct i915_sched_attr attr = { + .priority = + i915_prandom_u32_max_state(512, &prng), + }; + engine->schedule(rq[idx], &attr); + } + + err = active_request_put(old); + if (err) + break; + + cond_resched(); + } + + for (count = 0; count < ARRAY_SIZE(rq); count++) { + int err__ = active_request_put(rq[count]); + + /* Keep the first error */ + if (!err) + err = err__; + + intel_context_put(ce[count]); + } + + return err; +} + +static int __igt_reset_engines(struct intel_gt *gt, + const char *test_name, + unsigned int flags) +{ + struct i915_gpu_error *global = >->i915->gpu_error; + struct intel_engine_cs *engine, *other; + enum intel_engine_id id, tmp; + struct hang h; + int err = 0; + + /* Check that issuing a reset on one engine does not interfere + * with any other engine. + */ + + if (!intel_has_reset_engine(gt)) + return 0; + + if (flags & TEST_ACTIVE) { + err = hang_init(&h, gt); + if (err) + return err; + + if (flags & TEST_PRIORITY) + h.ctx->sched.priority = 1024; + } + + for_each_engine(engine, gt, id) { + struct active_engine threads[I915_NUM_ENGINES] = {}; + unsigned long device = i915_reset_count(global); + unsigned long count = 0, reported; + IGT_TIMEOUT(end_time); + + if (flags & TEST_ACTIVE && + !intel_engine_can_store_dword(engine)) + continue; + + if (!wait_for_idle(engine)) { + pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n", + engine->name, test_name); + err = -EIO; + break; + } + + memset(threads, 0, sizeof(threads)); + for_each_engine(other, gt, tmp) { + struct task_struct *tsk; + + threads[tmp].resets = + i915_reset_engine_count(global, other); + + if (other == engine && !(flags & TEST_SELF)) + continue; + + if (other != engine && !(flags & TEST_OTHERS)) + continue; + + threads[tmp].engine = other; + threads[tmp].flags = flags; + + tsk = kthread_run(active_engine, &threads[tmp], + "igt/%s", other->name); + if (IS_ERR(tsk)) { + err = PTR_ERR(tsk); + goto unwind; + } + + threads[tmp].task = tsk; + get_task_struct(tsk); + } + + yield(); /* start all threads before we begin */ + + st_engine_heartbeat_disable(engine); + set_bit(I915_RESET_ENGINE + id, >->reset.flags); + do { + struct i915_request *rq = NULL; + + if (flags & TEST_ACTIVE) { + rq = hang_create_request(&h, engine); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + break; + } + + i915_request_get(rq); + i915_request_add(rq); + + if (!wait_until_running(&h, rq)) { + struct drm_printer p = drm_info_printer(gt->i915->drm.dev); + + pr_err("%s: Failed to start request %llx, at %x\n", + __func__, rq->fence.seqno, hws_seqno(&h, rq)); + intel_engine_dump(engine, &p, + "%s\n", engine->name); + + i915_request_put(rq); + err = -EIO; + break; + } + } + + err = intel_engine_reset(engine, NULL); + if (err) { + pr_err("i915_reset_engine(%s:%s): failed, err=%d\n", + engine->name, test_name, err); + break; + } + + count++; + + if (rq) { + if (rq->fence.error != -EIO) { + pr_err("i915_reset_engine(%s:%s):" + " failed to reset request %llx:%lld\n", + engine->name, test_name, + rq->fence.context, + rq->fence.seqno); + i915_request_put(rq); + + GEM_TRACE_DUMP(); + intel_gt_set_wedged(gt); + err = -EIO; + break; + } + + if (i915_request_wait(rq, 0, HZ / 5) < 0) { + struct drm_printer p = + drm_info_printer(gt->i915->drm.dev); + + pr_err("i915_reset_engine(%s:%s):" + " failed to complete request %llx:%lld after reset\n", + engine->name, test_name, + rq->fence.context, + rq->fence.seqno); + intel_engine_dump(engine, &p, + "%s\n", engine->name); + i915_request_put(rq); + + GEM_TRACE_DUMP(); + intel_gt_set_wedged(gt); + err = -EIO; + break; + } + + i915_request_put(rq); + } + + if (!(flags & TEST_SELF) && !wait_for_idle(engine)) { + struct drm_printer p = + drm_info_printer(gt->i915->drm.dev); + + pr_err("i915_reset_engine(%s:%s):" + " failed to idle after reset\n", + engine->name, test_name); + intel_engine_dump(engine, &p, + "%s\n", engine->name); + + err = -EIO; + break; + } + } while (time_before(jiffies, end_time)); + clear_bit(I915_RESET_ENGINE + id, >->reset.flags); + st_engine_heartbeat_enable(engine); + + pr_info("i915_reset_engine(%s:%s): %lu resets\n", + engine->name, test_name, count); + + reported = i915_reset_engine_count(global, engine); + reported -= threads[engine->id].resets; + if (reported != count) { + pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n", + engine->name, test_name, count, reported); + if (!err) + err = -EINVAL; + } + +unwind: + for_each_engine(other, gt, tmp) { + int ret; + + if (!threads[tmp].task) + continue; + + ret = kthread_stop(threads[tmp].task); + if (ret) { + pr_err("kthread for other engine %s failed, err=%d\n", + other->name, ret); + if (!err) + err = ret; + } + put_task_struct(threads[tmp].task); + + if (other->uabi_class != engine->uabi_class && + threads[tmp].resets != + i915_reset_engine_count(global, other)) { + pr_err("Innocent engine %s was reset (count=%ld)\n", + other->name, + i915_reset_engine_count(global, other) - + threads[tmp].resets); + if (!err) + err = -EINVAL; + } + } + + if (device != i915_reset_count(global)) { + pr_err("Global reset (count=%ld)!\n", + i915_reset_count(global) - device); + if (!err) + err = -EINVAL; + } + + if (err) + break; + + err = igt_flush_test(gt->i915); + if (err) + break; + } + + if (intel_gt_is_wedged(gt)) + err = -EIO; + + if (flags & TEST_ACTIVE) + hang_fini(&h); + + return err; +} + +static int igt_reset_engines(void *arg) +{ + static const struct { + const char *name; + unsigned int flags; + } phases[] = { + { "idle", 0 }, + { "active", TEST_ACTIVE }, + { "others-idle", TEST_OTHERS }, + { "others-active", TEST_OTHERS | TEST_ACTIVE }, + { + "others-priority", + TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY + }, + { + "self-priority", + TEST_ACTIVE | TEST_PRIORITY | TEST_SELF, + }, + { } + }; + struct intel_gt *gt = arg; + typeof(*phases) *p; + int err; + + for (p = phases; p->name; p++) { + if (p->flags & TEST_PRIORITY) { + if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY)) + continue; + } + + err = __igt_reset_engines(arg, p->name, p->flags); + if (err) + return err; + } + + return 0; +} + +static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask) +{ + u32 count = i915_reset_count(>->i915->gpu_error); + + intel_gt_reset(gt, mask, NULL); + + return count; +} + +static int igt_reset_wait(void *arg) +{ + struct intel_gt *gt = arg; + struct i915_gpu_error *global = >->i915->gpu_error; + struct intel_engine_cs *engine = gt->engine[RCS0]; + struct i915_request *rq; + unsigned int reset_count; + struct hang h; + long timeout; + int err; + + if (!engine || !intel_engine_can_store_dword(engine)) + return 0; + + /* Check that we detect a stuck waiter and issue a reset */ + + igt_global_reset_lock(gt); + + err = hang_init(&h, gt); + if (err) + goto unlock; + + rq = hang_create_request(&h, engine); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto fini; + } + + i915_request_get(rq); + i915_request_add(rq); + + if (!wait_until_running(&h, rq)) { + struct drm_printer p = drm_info_printer(gt->i915->drm.dev); + + pr_err("%s: Failed to start request %llx, at %x\n", + __func__, rq->fence.seqno, hws_seqno(&h, rq)); + intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); + + intel_gt_set_wedged(gt); + + err = -EIO; + goto out_rq; + } + + reset_count = fake_hangcheck(gt, ALL_ENGINES); + + timeout = i915_request_wait(rq, 0, 10); + if (timeout < 0) { + pr_err("i915_request_wait failed on a stuck request: err=%ld\n", + timeout); + err = timeout; + goto out_rq; + } + + if (i915_reset_count(global) == reset_count) { + pr_err("No GPU reset recorded!\n"); + err = -EINVAL; + goto out_rq; + } + +out_rq: + i915_request_put(rq); +fini: + hang_fini(&h); +unlock: + igt_global_reset_unlock(gt); + + if (intel_gt_is_wedged(gt)) + return -EIO; + + return err; +} + +struct evict_vma { + struct completion completion; + struct i915_vma *vma; +}; + +static int evict_vma(void *data) +{ + struct evict_vma *arg = data; + struct i915_address_space *vm = arg->vma->vm; + struct drm_mm_node evict = arg->vma->node; + int err; + + complete(&arg->completion); + + mutex_lock(&vm->mutex); + err = i915_gem_evict_for_node(vm, &evict, 0); + mutex_unlock(&vm->mutex); + + return err; +} + +static int evict_fence(void *data) +{ + struct evict_vma *arg = data; + int err; + + complete(&arg->completion); + + /* Mark the fence register as dirty to force the mmio update. */ + err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512); + if (err) { + pr_err("Invalid Y-tiling settings; err:%d\n", err); + return err; + } + + err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE); + if (err) { + pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err); + return err; + } + + err = i915_vma_pin_fence(arg->vma); + i915_vma_unpin(arg->vma); + if (err) { + pr_err("Unable to pin Y-tiled fence; err:%d\n", err); + return err; + } + + i915_vma_unpin_fence(arg->vma); + + return 0; +} + +static int __igt_reset_evict_vma(struct intel_gt *gt, + struct i915_address_space *vm, + int (*fn)(void *), + unsigned int flags) +{ + struct intel_engine_cs *engine = gt->engine[RCS0]; + struct drm_i915_gem_object *obj; + struct task_struct *tsk = NULL; + struct i915_request *rq; + struct evict_vma arg; + struct hang h; + unsigned int pin_flags; + int err; + + if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE) + return 0; + + if (!engine || !intel_engine_can_store_dword(engine)) + return 0; + + /* Check that we can recover an unbind stuck on a hanging request */ + + err = hang_init(&h, gt); + if (err) + return err; + + obj = i915_gem_object_create_internal(gt->i915, SZ_1M); + if (IS_ERR(obj)) { + err = PTR_ERR(obj); + goto fini; + } + + if (flags & EXEC_OBJECT_NEEDS_FENCE) { + err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512); + if (err) { + pr_err("Invalid X-tiling settings; err:%d\n", err); + goto out_obj; + } + } + + arg.vma = i915_vma_instance(obj, vm, NULL); + if (IS_ERR(arg.vma)) { + err = PTR_ERR(arg.vma); + goto out_obj; + } + + rq = hang_create_request(&h, engine); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out_obj; + } + + pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER; + + if (flags & EXEC_OBJECT_NEEDS_FENCE) + pin_flags |= PIN_MAPPABLE; + + err = i915_vma_pin(arg.vma, 0, 0, pin_flags); + if (err) { + i915_request_add(rq); + goto out_obj; + } + + if (flags & EXEC_OBJECT_NEEDS_FENCE) { + err = i915_vma_pin_fence(arg.vma); + if (err) { + pr_err("Unable to pin X-tiled fence; err:%d\n", err); + i915_vma_unpin(arg.vma); + i915_request_add(rq); + goto out_obj; + } + } + + i915_vma_lock(arg.vma); + err = i915_request_await_object(rq, arg.vma->obj, + flags & EXEC_OBJECT_WRITE); + if (err == 0) + err = i915_vma_move_to_active(arg.vma, rq, flags); + i915_vma_unlock(arg.vma); + + if (flags & EXEC_OBJECT_NEEDS_FENCE) + i915_vma_unpin_fence(arg.vma); + i915_vma_unpin(arg.vma); + + i915_request_get(rq); + i915_request_add(rq); + if (err) + goto out_rq; + + if (!wait_until_running(&h, rq)) { + struct drm_printer p = drm_info_printer(gt->i915->drm.dev); + + pr_err("%s: Failed to start request %llx, at %x\n", + __func__, rq->fence.seqno, hws_seqno(&h, rq)); + intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); + + intel_gt_set_wedged(gt); + goto out_reset; + } + + init_completion(&arg.completion); + + tsk = kthread_run(fn, &arg, "igt/evict_vma"); + if (IS_ERR(tsk)) { + err = PTR_ERR(tsk); + tsk = NULL; + goto out_reset; + } + get_task_struct(tsk); + + wait_for_completion(&arg.completion); + + if (wait_for(!list_empty(&rq->fence.cb_list), 10)) { + struct drm_printer p = drm_info_printer(gt->i915->drm.dev); + + pr_err("igt/evict_vma kthread did not wait\n"); + intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); + + intel_gt_set_wedged(gt); + goto out_reset; + } + +out_reset: + igt_global_reset_lock(gt); + fake_hangcheck(gt, rq->engine->mask); + igt_global_reset_unlock(gt); + + if (tsk) { + struct intel_wedge_me w; + + /* The reset, even indirectly, should take less than 10ms. */ + intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */) + err = kthread_stop(tsk); + + put_task_struct(tsk); + } + +out_rq: + i915_request_put(rq); +out_obj: + i915_gem_object_put(obj); +fini: + hang_fini(&h); + if (intel_gt_is_wedged(gt)) + return -EIO; + + return err; +} + +static int igt_reset_evict_ggtt(void *arg) +{ + struct intel_gt *gt = arg; + + return __igt_reset_evict_vma(gt, >->ggtt->vm, + evict_vma, EXEC_OBJECT_WRITE); +} + +static int igt_reset_evict_ppgtt(void *arg) +{ + struct intel_gt *gt = arg; + struct i915_ppgtt *ppgtt; + int err; + + /* aliasing == global gtt locking, covered above */ + if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL) + return 0; + + ppgtt = i915_ppgtt_create(gt); + if (IS_ERR(ppgtt)) + return PTR_ERR(ppgtt); + + err = __igt_reset_evict_vma(gt, &ppgtt->vm, + evict_vma, EXEC_OBJECT_WRITE); + i915_vm_put(&ppgtt->vm); + + return err; +} + +static int igt_reset_evict_fence(void *arg) +{ + struct intel_gt *gt = arg; + + return __igt_reset_evict_vma(gt, >->ggtt->vm, + evict_fence, EXEC_OBJECT_NEEDS_FENCE); +} + +static int wait_for_others(struct intel_gt *gt, + struct intel_engine_cs *exclude) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + + for_each_engine(engine, gt, id) { + if (engine == exclude) + continue; + + if (!wait_for_idle(engine)) + return -EIO; + } + + return 0; +} + +static int igt_reset_queue(void *arg) +{ + struct intel_gt *gt = arg; + struct i915_gpu_error *global = >->i915->gpu_error; + struct intel_engine_cs *engine; + enum intel_engine_id id; + struct hang h; + int err; + + /* Check that we replay pending requests following a hang */ + + igt_global_reset_lock(gt); + + err = hang_init(&h, gt); + if (err) + goto unlock; + + for_each_engine(engine, gt, id) { + struct i915_request *prev; + IGT_TIMEOUT(end_time); + unsigned int count; + + if (!intel_engine_can_store_dword(engine)) + continue; + + prev = hang_create_request(&h, engine); + if (IS_ERR(prev)) { + err = PTR_ERR(prev); + goto fini; + } + + i915_request_get(prev); + i915_request_add(prev); + + count = 0; + do { + struct i915_request *rq; + unsigned int reset_count; + + rq = hang_create_request(&h, engine); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto fini; + } + + i915_request_get(rq); + i915_request_add(rq); + + /* + * XXX We don't handle resetting the kernel context + * very well. If we trigger a device reset twice in + * quick succession while the kernel context is + * executing, we may end up skipping the breadcrumb. + * This is really only a problem for the selftest as + * normally there is a large interlude between resets + * (hangcheck), or we focus on resetting just one + * engine and so avoid repeatedly resetting innocents. + */ + err = wait_for_others(gt, engine); + if (err) { + pr_err("%s(%s): Failed to idle other inactive engines after device reset\n", + __func__, engine->name); + i915_request_put(rq); + i915_request_put(prev); + + GEM_TRACE_DUMP(); + intel_gt_set_wedged(gt); + goto fini; + } + + if (!wait_until_running(&h, prev)) { + struct drm_printer p = drm_info_printer(gt->i915->drm.dev); + + pr_err("%s(%s): Failed to start request %llx, at %x\n", + __func__, engine->name, + prev->fence.seqno, hws_seqno(&h, prev)); + intel_engine_dump(engine, &p, + "%s\n", engine->name); + + i915_request_put(rq); + i915_request_put(prev); + + intel_gt_set_wedged(gt); + + err = -EIO; + goto fini; + } + + reset_count = fake_hangcheck(gt, BIT(id)); + + if (prev->fence.error != -EIO) { + pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n", + prev->fence.error); + i915_request_put(rq); + i915_request_put(prev); + err = -EINVAL; + goto fini; + } + + if (rq->fence.error) { + pr_err("Fence error status not zero [%d] after unrelated reset\n", + rq->fence.error); + i915_request_put(rq); + i915_request_put(prev); + err = -EINVAL; + goto fini; + } + + if (i915_reset_count(global) == reset_count) { + pr_err("No GPU reset recorded!\n"); + i915_request_put(rq); + i915_request_put(prev); + err = -EINVAL; + goto fini; + } + + i915_request_put(prev); + prev = rq; + count++; + } while (time_before(jiffies, end_time)); + pr_info("%s: Completed %d resets\n", engine->name, count); + + *h.batch = MI_BATCH_BUFFER_END; + intel_gt_chipset_flush(engine->gt); + + i915_request_put(prev); + + err = igt_flush_test(gt->i915); + if (err) + break; + } + +fini: + hang_fini(&h); +unlock: + igt_global_reset_unlock(gt); + + if (intel_gt_is_wedged(gt)) + return -EIO; + + return err; +} + +static int igt_handle_error(void *arg) +{ + struct intel_gt *gt = arg; + struct i915_gpu_error *global = >->i915->gpu_error; + struct intel_engine_cs *engine = gt->engine[RCS0]; + struct hang h; + struct i915_request *rq; + struct i915_gpu_coredump *error; + int err; + + /* Check that we can issue a global GPU and engine reset */ + + if (!intel_has_reset_engine(gt)) + return 0; + + if (!engine || !intel_engine_can_store_dword(engine)) + return 0; + + err = hang_init(&h, gt); + if (err) + return err; + + rq = hang_create_request(&h, engine); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_fini; + } + + i915_request_get(rq); + i915_request_add(rq); + + if (!wait_until_running(&h, rq)) { + struct drm_printer p = drm_info_printer(gt->i915->drm.dev); + + pr_err("%s: Failed to start request %llx, at %x\n", + __func__, rq->fence.seqno, hws_seqno(&h, rq)); + intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); + + intel_gt_set_wedged(gt); + + err = -EIO; + goto err_request; + } + + /* Temporarily disable error capture */ + error = xchg(&global->first_error, (void *)-1); + + intel_gt_handle_error(gt, engine->mask, 0, NULL); + + xchg(&global->first_error, error); + + if (rq->fence.error != -EIO) { + pr_err("Guilty request not identified!\n"); + err = -EINVAL; + goto err_request; + } + +err_request: + i915_request_put(rq); +err_fini: + hang_fini(&h); + return err; +} + +static int __igt_atomic_reset_engine(struct intel_engine_cs *engine, + const struct igt_atomic_section *p, + const char *mode) +{ + struct tasklet_struct * const t = &engine->execlists.tasklet; + int err; + + GEM_TRACE("i915_reset_engine(%s:%s) under %s\n", + engine->name, mode, p->name); + + tasklet_disable(t); + p->critical_section_begin(); + + err = intel_engine_reset(engine, NULL); + + p->critical_section_end(); + tasklet_enable(t); + + if (err) + pr_err("i915_reset_engine(%s:%s) failed under %s\n", + engine->name, mode, p->name); + + return err; +} + +static int igt_atomic_reset_engine(struct intel_engine_cs *engine, + const struct igt_atomic_section *p) +{ + struct i915_request *rq; + struct hang h; + int err; + + err = __igt_atomic_reset_engine(engine, p, "idle"); + if (err) + return err; + + err = hang_init(&h, engine->gt); + if (err) + return err; + + rq = hang_create_request(&h, engine); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out; + } + + i915_request_get(rq); + i915_request_add(rq); + + if (wait_until_running(&h, rq)) { + err = __igt_atomic_reset_engine(engine, p, "active"); + } else { + pr_err("%s(%s): Failed to start request %llx, at %x\n", + __func__, engine->name, + rq->fence.seqno, hws_seqno(&h, rq)); + intel_gt_set_wedged(engine->gt); + err = -EIO; + } + + if (err == 0) { + struct intel_wedge_me w; + + intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */) + i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT); + if (intel_gt_is_wedged(engine->gt)) + err = -EIO; + } + + i915_request_put(rq); +out: + hang_fini(&h); + return err; +} + +static int igt_reset_engines_atomic(void *arg) +{ + struct intel_gt *gt = arg; + const typeof(*igt_atomic_phases) *p; + int err = 0; + + /* Check that the engines resets are usable from atomic context */ + + if (!intel_has_reset_engine(gt)) + return 0; + + if (intel_uc_uses_guc_submission(>->uc)) + return 0; + + igt_global_reset_lock(gt); + + /* Flush any requests before we get started and check basics */ + if (!igt_force_reset(gt)) + goto unlock; + + for (p = igt_atomic_phases; p->name; p++) { + struct intel_engine_cs *engine; + enum intel_engine_id id; + + for_each_engine(engine, gt, id) { + err = igt_atomic_reset_engine(engine, p); + if (err) + goto out; + } + } + +out: + /* As we poke around the guts, do a full reset before continuing. */ + igt_force_reset(gt); +unlock: + igt_global_reset_unlock(gt); + + return err; +} + +int intel_hangcheck_live_selftests(struct drm_i915_private *i915) +{ + static const struct i915_subtest tests[] = { + SUBTEST(igt_hang_sanitycheck), + SUBTEST(igt_reset_nop), + SUBTEST(igt_reset_nop_engine), + SUBTEST(igt_reset_idle_engine), + SUBTEST(igt_reset_active_engine), + SUBTEST(igt_reset_engines), + SUBTEST(igt_reset_engines_atomic), + SUBTEST(igt_reset_queue), + SUBTEST(igt_reset_wait), + SUBTEST(igt_reset_evict_ggtt), + SUBTEST(igt_reset_evict_ppgtt), + SUBTEST(igt_reset_evict_fence), + SUBTEST(igt_handle_error), + }; + struct intel_gt *gt = &i915->gt; + intel_wakeref_t wakeref; + int err; + + if (!intel_has_gpu_reset(gt)) + return 0; + + if (intel_gt_is_wedged(gt)) + return -EIO; /* we're long past hope of a successful reset */ + + wakeref = intel_runtime_pm_get(gt->uncore->rpm); + + err = intel_gt_live_subtests(tests, gt); + + intel_runtime_pm_put(gt->uncore->rpm, wakeref); + + return err; +} diff --git a/drivers/gpu/drm/i915/gt/selftest_llc.c b/drivers/gpu/drm/i915/gt/selftest_llc.c new file mode 100644 index 000000000..a91215969 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/selftest_llc.c @@ -0,0 +1,73 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#include "intel_pm.h" /* intel_gpu_freq() */ +#include "selftest_llc.h" +#include "intel_rps.h" + +static int gen6_verify_ring_freq(struct intel_llc *llc) +{ + struct drm_i915_private *i915 = llc_to_gt(llc)->i915; + struct ia_constants consts; + intel_wakeref_t wakeref; + unsigned int gpu_freq; + int err = 0; + + wakeref = intel_runtime_pm_get(llc_to_gt(llc)->uncore->rpm); + + if (!get_ia_constants(llc, &consts)) + goto out_rpm; + + for (gpu_freq = consts.min_gpu_freq; + gpu_freq <= consts.max_gpu_freq; + gpu_freq++) { + struct intel_rps *rps = &llc_to_gt(llc)->rps; + + unsigned int ia_freq, ring_freq, found; + u32 val; + + calc_ia_freq(llc, gpu_freq, &consts, &ia_freq, &ring_freq); + + val = gpu_freq; + if (sandybridge_pcode_read(i915, + GEN6_PCODE_READ_MIN_FREQ_TABLE, + &val, NULL)) { + pr_err("Failed to read freq table[%d], range [%d, %d]\n", + gpu_freq, consts.min_gpu_freq, consts.max_gpu_freq); + err = -ENXIO; + break; + } + + found = (val >> 0) & 0xff; + if (found != ia_freq) { + pr_err("Min freq table(%d/[%d, %d]):%dMHz did not match expected CPU freq, found %d, expected %d\n", + gpu_freq, consts.min_gpu_freq, consts.max_gpu_freq, + intel_gpu_freq(rps, gpu_freq * (INTEL_GEN(i915) >= 9 ? GEN9_FREQ_SCALER : 1)), + found, ia_freq); + err = -EINVAL; + break; + } + + found = (val >> 8) & 0xff; + if (found != ring_freq) { + pr_err("Min freq table(%d/[%d, %d]):%dMHz did not match expected ring freq, found %d, expected %d\n", + gpu_freq, consts.min_gpu_freq, consts.max_gpu_freq, + intel_gpu_freq(rps, gpu_freq * (INTEL_GEN(i915) >= 9 ? GEN9_FREQ_SCALER : 1)), + found, ring_freq); + err = -EINVAL; + break; + } + } + +out_rpm: + intel_runtime_pm_put(llc_to_gt(llc)->uncore->rpm, wakeref); + return err; +} + +int st_llc_verify(struct intel_llc *llc) +{ + return gen6_verify_ring_freq(llc); +} diff --git a/drivers/gpu/drm/i915/gt/selftest_llc.h b/drivers/gpu/drm/i915/gt/selftest_llc.h new file mode 100644 index 000000000..873f896e7 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/selftest_llc.h @@ -0,0 +1,14 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#ifndef SELFTEST_LLC_H +#define SELFTEST_LLC_H + +struct intel_llc; + +int st_llc_verify(struct intel_llc *llc); + +#endif /* SELFTEST_LLC_H */ diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c new file mode 100644 index 000000000..35d55f98a --- /dev/null +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c @@ -0,0 +1,6481 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2018 Intel Corporation + */ + +#include <linux/prime_numbers.h> + +#include "gem/i915_gem_pm.h" +#include "gt/intel_engine_heartbeat.h" +#include "gt/intel_reset.h" +#include "gt/selftest_engine_heartbeat.h" + +#include "i915_selftest.h" +#include "selftests/i915_random.h" +#include "selftests/igt_flush_test.h" +#include "selftests/igt_live_test.h" +#include "selftests/igt_spinner.h" +#include "selftests/lib_sw_fence.h" + +#include "gem/selftests/igt_gem_utils.h" +#include "gem/selftests/mock_context.h" + +#define CS_GPR(engine, n) ((engine)->mmio_base + 0x600 + (n) * 4) +#define NUM_GPR 16 +#define NUM_GPR_DW (NUM_GPR * 2) /* each GPR is 2 dwords */ + +static struct i915_vma *create_scratch(struct intel_gt *gt) +{ + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + int err; + + obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); + if (IS_ERR(obj)) + return ERR_CAST(obj); + + i915_gem_object_set_cache_coherency(obj, I915_CACHING_CACHED); + + vma = i915_vma_instance(obj, >->ggtt->vm, NULL); + if (IS_ERR(vma)) { + i915_gem_object_put(obj); + return vma; + } + + err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL); + if (err) { + i915_gem_object_put(obj); + return ERR_PTR(err); + } + + return vma; +} + +static bool is_active(struct i915_request *rq) +{ + if (i915_request_is_active(rq)) + return true; + + if (i915_request_on_hold(rq)) + return true; + + if (i915_request_has_initial_breadcrumb(rq) && i915_request_started(rq)) + return true; + + return false; +} + +static int wait_for_submit(struct intel_engine_cs *engine, + struct i915_request *rq, + unsigned long timeout) +{ + timeout += jiffies; + do { + bool done = time_after(jiffies, timeout); + + if (i915_request_completed(rq)) /* that was quick! */ + return 0; + + /* Wait until the HW has acknowleged the submission (or err) */ + intel_engine_flush_submission(engine); + if (!READ_ONCE(engine->execlists.pending[0]) && is_active(rq)) + return 0; + + if (done) + return -ETIME; + + cond_resched(); + } while (1); +} + +static int wait_for_reset(struct intel_engine_cs *engine, + struct i915_request *rq, + unsigned long timeout) +{ + timeout += jiffies; + + do { + cond_resched(); + intel_engine_flush_submission(engine); + + if (READ_ONCE(engine->execlists.pending[0])) + continue; + + if (i915_request_completed(rq)) + break; + + if (READ_ONCE(rq->fence.error)) + break; + } while (time_before(jiffies, timeout)); + + flush_scheduled_work(); + + if (rq->fence.error != -EIO) { + pr_err("%s: hanging request %llx:%lld not reset\n", + engine->name, + rq->fence.context, + rq->fence.seqno); + return -EINVAL; + } + + /* Give the request a jiffie to complete after flushing the worker */ + if (i915_request_wait(rq, 0, + max(0l, (long)(timeout - jiffies)) + 1) < 0) { + pr_err("%s: hanging request %llx:%lld did not complete\n", + engine->name, + rq->fence.context, + rq->fence.seqno); + return -ETIME; + } + + return 0; +} + +static int live_sanitycheck(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + struct igt_spinner spin; + int err = 0; + + if (!HAS_LOGICAL_RING_CONTEXTS(gt->i915)) + return 0; + + if (igt_spinner_init(&spin, gt)) + return -ENOMEM; + + for_each_engine(engine, gt, id) { + struct intel_context *ce; + struct i915_request *rq; + + ce = intel_context_create(engine); + if (IS_ERR(ce)) { + err = PTR_ERR(ce); + break; + } + + rq = igt_spinner_create_request(&spin, ce, MI_NOOP); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out_ctx; + } + + i915_request_add(rq); + if (!igt_wait_for_spinner(&spin, rq)) { + GEM_TRACE("spinner failed to start\n"); + GEM_TRACE_DUMP(); + intel_gt_set_wedged(gt); + err = -EIO; + goto out_ctx; + } + + igt_spinner_end(&spin); + if (igt_flush_test(gt->i915)) { + err = -EIO; + goto out_ctx; + } + +out_ctx: + intel_context_put(ce); + if (err) + break; + } + + igt_spinner_fini(&spin); + return err; +} + +static int live_unlite_restore(struct intel_gt *gt, int prio) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + struct igt_spinner spin; + int err = -ENOMEM; + + /* + * Check that we can correctly context switch between 2 instances + * on the same engine from the same parent context. + */ + + if (igt_spinner_init(&spin, gt)) + return err; + + err = 0; + for_each_engine(engine, gt, id) { + struct intel_context *ce[2] = {}; + struct i915_request *rq[2]; + struct igt_live_test t; + int n; + + if (prio && !intel_engine_has_preemption(engine)) + continue; + + if (!intel_engine_can_store_dword(engine)) + continue; + + if (igt_live_test_begin(&t, gt->i915, __func__, engine->name)) { + err = -EIO; + break; + } + st_engine_heartbeat_disable(engine); + + for (n = 0; n < ARRAY_SIZE(ce); n++) { + struct intel_context *tmp; + + tmp = intel_context_create(engine); + if (IS_ERR(tmp)) { + err = PTR_ERR(tmp); + goto err_ce; + } + + err = intel_context_pin(tmp); + if (err) { + intel_context_put(tmp); + goto err_ce; + } + + /* + * Setup the pair of contexts such that if we + * lite-restore using the RING_TAIL from ce[1] it + * will execute garbage from ce[0]->ring. + */ + memset(tmp->ring->vaddr, + POISON_INUSE, /* IPEHR: 0x5a5a5a5a [hung!] */ + tmp->ring->vma->size); + + ce[n] = tmp; + } + GEM_BUG_ON(!ce[1]->ring->size); + intel_ring_reset(ce[1]->ring, ce[1]->ring->size / 2); + __execlists_update_reg_state(ce[1], engine, ce[1]->ring->head); + + rq[0] = igt_spinner_create_request(&spin, ce[0], MI_ARB_CHECK); + if (IS_ERR(rq[0])) { + err = PTR_ERR(rq[0]); + goto err_ce; + } + + i915_request_get(rq[0]); + i915_request_add(rq[0]); + GEM_BUG_ON(rq[0]->postfix > ce[1]->ring->emit); + + if (!igt_wait_for_spinner(&spin, rq[0])) { + i915_request_put(rq[0]); + goto err_ce; + } + + rq[1] = i915_request_create(ce[1]); + if (IS_ERR(rq[1])) { + err = PTR_ERR(rq[1]); + i915_request_put(rq[0]); + goto err_ce; + } + + if (!prio) { + /* + * Ensure we do the switch to ce[1] on completion. + * + * rq[0] is already submitted, so this should reduce + * to a no-op (a wait on a request on the same engine + * uses the submit fence, not the completion fence), + * but it will install a dependency on rq[1] for rq[0] + * that will prevent the pair being reordered by + * timeslicing. + */ + i915_request_await_dma_fence(rq[1], &rq[0]->fence); + } + + i915_request_get(rq[1]); + i915_request_add(rq[1]); + GEM_BUG_ON(rq[1]->postfix <= rq[0]->postfix); + i915_request_put(rq[0]); + + if (prio) { + struct i915_sched_attr attr = { + .priority = prio, + }; + + /* Alternatively preempt the spinner with ce[1] */ + engine->schedule(rq[1], &attr); + } + + /* And switch back to ce[0] for good measure */ + rq[0] = i915_request_create(ce[0]); + if (IS_ERR(rq[0])) { + err = PTR_ERR(rq[0]); + i915_request_put(rq[1]); + goto err_ce; + } + + i915_request_await_dma_fence(rq[0], &rq[1]->fence); + i915_request_get(rq[0]); + i915_request_add(rq[0]); + GEM_BUG_ON(rq[0]->postfix > rq[1]->postfix); + i915_request_put(rq[1]); + i915_request_put(rq[0]); + +err_ce: + intel_engine_flush_submission(engine); + igt_spinner_end(&spin); + for (n = 0; n < ARRAY_SIZE(ce); n++) { + if (IS_ERR_OR_NULL(ce[n])) + break; + + intel_context_unpin(ce[n]); + intel_context_put(ce[n]); + } + + st_engine_heartbeat_enable(engine); + if (igt_live_test_end(&t)) + err = -EIO; + if (err) + break; + } + + igt_spinner_fini(&spin); + return err; +} + +static int live_unlite_switch(void *arg) +{ + return live_unlite_restore(arg, 0); +} + +static int live_unlite_preempt(void *arg) +{ + return live_unlite_restore(arg, I915_USER_PRIORITY(I915_PRIORITY_MAX)); +} + +static int live_unlite_ring(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + struct igt_spinner spin; + enum intel_engine_id id; + int err = 0; + + /* + * Setup a preemption event that will cause almost the entire ring + * to be unwound, potentially fooling our intel_ring_direction() + * into emitting a forward lite-restore instead of the rollback. + */ + + if (igt_spinner_init(&spin, gt)) + return -ENOMEM; + + for_each_engine(engine, gt, id) { + struct intel_context *ce[2] = {}; + struct i915_request *rq; + struct igt_live_test t; + int n; + + if (!intel_engine_has_preemption(engine)) + continue; + + if (!intel_engine_can_store_dword(engine)) + continue; + + if (igt_live_test_begin(&t, gt->i915, __func__, engine->name)) { + err = -EIO; + break; + } + st_engine_heartbeat_disable(engine); + + for (n = 0; n < ARRAY_SIZE(ce); n++) { + struct intel_context *tmp; + + tmp = intel_context_create(engine); + if (IS_ERR(tmp)) { + err = PTR_ERR(tmp); + goto err_ce; + } + + err = intel_context_pin(tmp); + if (err) { + intel_context_put(tmp); + goto err_ce; + } + + memset32(tmp->ring->vaddr, + 0xdeadbeef, /* trigger a hang if executed */ + tmp->ring->vma->size / sizeof(u32)); + + ce[n] = tmp; + } + + /* Create max prio spinner, followed by N low prio nops */ + rq = igt_spinner_create_request(&spin, ce[0], MI_ARB_CHECK); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_ce; + } + + i915_request_get(rq); + rq->sched.attr.priority = I915_PRIORITY_BARRIER; + i915_request_add(rq); + + if (!igt_wait_for_spinner(&spin, rq)) { + intel_gt_set_wedged(gt); + i915_request_put(rq); + err = -ETIME; + goto err_ce; + } + + /* Fill the ring, until we will cause a wrap */ + n = 0; + while (intel_ring_direction(ce[0]->ring, + rq->wa_tail, + ce[0]->ring->tail) <= 0) { + struct i915_request *tmp; + + tmp = intel_context_create_request(ce[0]); + if (IS_ERR(tmp)) { + err = PTR_ERR(tmp); + i915_request_put(rq); + goto err_ce; + } + + i915_request_add(tmp); + intel_engine_flush_submission(engine); + n++; + } + intel_engine_flush_submission(engine); + pr_debug("%s: Filled ring with %d nop tails {size:%x, tail:%x, emit:%x, rq.tail:%x}\n", + engine->name, n, + ce[0]->ring->size, + ce[0]->ring->tail, + ce[0]->ring->emit, + rq->tail); + GEM_BUG_ON(intel_ring_direction(ce[0]->ring, + rq->tail, + ce[0]->ring->tail) <= 0); + i915_request_put(rq); + + /* Create a second ring to preempt the first ring after rq[0] */ + rq = intel_context_create_request(ce[1]); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_ce; + } + + rq->sched.attr.priority = I915_PRIORITY_BARRIER; + i915_request_get(rq); + i915_request_add(rq); + + err = wait_for_submit(engine, rq, HZ / 2); + i915_request_put(rq); + if (err) { + pr_err("%s: preemption request was not submitted\n", + engine->name); + err = -ETIME; + } + + pr_debug("%s: ring[0]:{ tail:%x, emit:%x }, ring[1]:{ tail:%x, emit:%x }\n", + engine->name, + ce[0]->ring->tail, ce[0]->ring->emit, + ce[1]->ring->tail, ce[1]->ring->emit); + +err_ce: + intel_engine_flush_submission(engine); + igt_spinner_end(&spin); + for (n = 0; n < ARRAY_SIZE(ce); n++) { + if (IS_ERR_OR_NULL(ce[n])) + break; + + intel_context_unpin(ce[n]); + intel_context_put(ce[n]); + } + st_engine_heartbeat_enable(engine); + if (igt_live_test_end(&t)) + err = -EIO; + if (err) + break; + } + + igt_spinner_fini(&spin); + return err; +} + +static int live_pin_rewind(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err = 0; + + /* + * We have to be careful not to trust intel_ring too much, for example + * ring->head is updated upon retire which is out of sync with pinning + * the context. Thus we cannot use ring->head to set CTX_RING_HEAD, + * or else we risk writing an older, stale value. + * + * To simulate this, let's apply a bit of deliberate sabotague. + */ + + for_each_engine(engine, gt, id) { + struct intel_context *ce; + struct i915_request *rq; + struct intel_ring *ring; + struct igt_live_test t; + + if (igt_live_test_begin(&t, gt->i915, __func__, engine->name)) { + err = -EIO; + break; + } + + ce = intel_context_create(engine); + if (IS_ERR(ce)) { + err = PTR_ERR(ce); + break; + } + + err = intel_context_pin(ce); + if (err) { + intel_context_put(ce); + break; + } + + /* Keep the context awake while we play games */ + err = i915_active_acquire(&ce->active); + if (err) { + intel_context_unpin(ce); + intel_context_put(ce); + break; + } + ring = ce->ring; + + /* Poison the ring, and offset the next request from HEAD */ + memset32(ring->vaddr, STACK_MAGIC, ring->size / sizeof(u32)); + ring->emit = ring->size / 2; + ring->tail = ring->emit; + GEM_BUG_ON(ring->head); + + intel_context_unpin(ce); + + /* Submit a simple nop request */ + GEM_BUG_ON(intel_context_is_pinned(ce)); + rq = intel_context_create_request(ce); + i915_active_release(&ce->active); /* e.g. async retire */ + intel_context_put(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + break; + } + GEM_BUG_ON(!rq->head); + i915_request_add(rq); + + /* Expect not to hang! */ + if (igt_live_test_end(&t)) { + err = -EIO; + break; + } + } + + return err; +} + +static int live_hold_reset(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + struct igt_spinner spin; + int err = 0; + + /* + * In order to support offline error capture for fast preempt reset, + * we need to decouple the guilty request and ensure that it and its + * descendents are not executed while the capture is in progress. + */ + + if (!intel_has_reset_engine(gt)) + return 0; + + if (igt_spinner_init(&spin, gt)) + return -ENOMEM; + + for_each_engine(engine, gt, id) { + struct intel_context *ce; + struct i915_request *rq; + + ce = intel_context_create(engine); + if (IS_ERR(ce)) { + err = PTR_ERR(ce); + break; + } + + st_engine_heartbeat_disable(engine); + + rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out; + } + i915_request_add(rq); + + if (!igt_wait_for_spinner(&spin, rq)) { + intel_gt_set_wedged(gt); + err = -ETIME; + goto out; + } + + /* We have our request executing, now remove it and reset */ + + if (test_and_set_bit(I915_RESET_ENGINE + id, + >->reset.flags)) { + intel_gt_set_wedged(gt); + err = -EBUSY; + goto out; + } + tasklet_disable(&engine->execlists.tasklet); + + engine->execlists.tasklet.func(engine->execlists.tasklet.data); + GEM_BUG_ON(execlists_active(&engine->execlists) != rq); + + i915_request_get(rq); + execlists_hold(engine, rq); + GEM_BUG_ON(!i915_request_on_hold(rq)); + + intel_engine_reset(engine, NULL); + GEM_BUG_ON(rq->fence.error != -EIO); + + tasklet_enable(&engine->execlists.tasklet); + clear_and_wake_up_bit(I915_RESET_ENGINE + id, + >->reset.flags); + + /* Check that we do not resubmit the held request */ + if (!i915_request_wait(rq, 0, HZ / 5)) { + pr_err("%s: on hold request completed!\n", + engine->name); + i915_request_put(rq); + err = -EIO; + goto out; + } + GEM_BUG_ON(!i915_request_on_hold(rq)); + + /* But is resubmitted on release */ + execlists_unhold(engine, rq); + if (i915_request_wait(rq, 0, HZ / 5) < 0) { + pr_err("%s: held request did not complete!\n", + engine->name); + intel_gt_set_wedged(gt); + err = -ETIME; + } + i915_request_put(rq); + +out: + st_engine_heartbeat_enable(engine); + intel_context_put(ce); + if (err) + break; + } + + igt_spinner_fini(&spin); + return err; +} + +static const char *error_repr(int err) +{ + return err ? "bad" : "good"; +} + +static int live_error_interrupt(void *arg) +{ + static const struct error_phase { + enum { GOOD = 0, BAD = -EIO } error[2]; + } phases[] = { + { { BAD, GOOD } }, + { { BAD, BAD } }, + { { BAD, GOOD } }, + { { GOOD, GOOD } }, /* sentinel */ + }; + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + + /* + * We hook up the CS_MASTER_ERROR_INTERRUPT to have forewarning + * of invalid commands in user batches that will cause a GPU hang. + * This is a faster mechanism than using hangcheck/heartbeats, but + * only detects problems the HW knows about -- it will not warn when + * we kill the HW! + * + * To verify our detection and reset, we throw some invalid commands + * at the HW and wait for the interrupt. + */ + + if (!intel_has_reset_engine(gt)) + return 0; + + for_each_engine(engine, gt, id) { + const struct error_phase *p; + int err = 0; + + st_engine_heartbeat_disable(engine); + + for (p = phases; p->error[0] != GOOD; p++) { + struct i915_request *client[ARRAY_SIZE(phases->error)]; + u32 *cs; + int i; + + memset(client, 0, sizeof(*client)); + for (i = 0; i < ARRAY_SIZE(client); i++) { + struct intel_context *ce; + struct i915_request *rq; + + ce = intel_context_create(engine); + if (IS_ERR(ce)) { + err = PTR_ERR(ce); + goto out; + } + + rq = intel_context_create_request(ce); + intel_context_put(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out; + } + + if (rq->engine->emit_init_breadcrumb) { + err = rq->engine->emit_init_breadcrumb(rq); + if (err) { + i915_request_add(rq); + goto out; + } + } + + cs = intel_ring_begin(rq, 2); + if (IS_ERR(cs)) { + i915_request_add(rq); + err = PTR_ERR(cs); + goto out; + } + + if (p->error[i]) { + *cs++ = 0xdeadbeef; + *cs++ = 0xdeadbeef; + } else { + *cs++ = MI_NOOP; + *cs++ = MI_NOOP; + } + + client[i] = i915_request_get(rq); + i915_request_add(rq); + } + + err = wait_for_submit(engine, client[0], HZ / 2); + if (err) { + pr_err("%s: first request did not start within time!\n", + engine->name); + err = -ETIME; + goto out; + } + + for (i = 0; i < ARRAY_SIZE(client); i++) { + if (i915_request_wait(client[i], 0, HZ / 5) < 0) + pr_debug("%s: %s request incomplete!\n", + engine->name, + error_repr(p->error[i])); + + if (!i915_request_started(client[i])) { + pr_err("%s: %s request not started!\n", + engine->name, + error_repr(p->error[i])); + err = -ETIME; + goto out; + } + + /* Kick the tasklet to process the error */ + intel_engine_flush_submission(engine); + if (client[i]->fence.error != p->error[i]) { + pr_err("%s: %s request (%s) with wrong error code: %d\n", + engine->name, + error_repr(p->error[i]), + i915_request_completed(client[i]) ? "completed" : "running", + client[i]->fence.error); + err = -EINVAL; + goto out; + } + } + +out: + for (i = 0; i < ARRAY_SIZE(client); i++) + if (client[i]) + i915_request_put(client[i]); + if (err) { + pr_err("%s: failed at phase[%zd] { %d, %d }\n", + engine->name, p - phases, + p->error[0], p->error[1]); + break; + } + } + + st_engine_heartbeat_enable(engine); + if (err) { + intel_gt_set_wedged(gt); + return err; + } + } + + return 0; +} + +static int +emit_semaphore_chain(struct i915_request *rq, struct i915_vma *vma, int idx) +{ + u32 *cs; + + cs = intel_ring_begin(rq, 10); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + + *cs++ = MI_SEMAPHORE_WAIT | + MI_SEMAPHORE_GLOBAL_GTT | + MI_SEMAPHORE_POLL | + MI_SEMAPHORE_SAD_NEQ_SDD; + *cs++ = 0; + *cs++ = i915_ggtt_offset(vma) + 4 * idx; + *cs++ = 0; + + if (idx > 0) { + *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; + *cs++ = i915_ggtt_offset(vma) + 4 * (idx - 1); + *cs++ = 0; + *cs++ = 1; + } else { + *cs++ = MI_NOOP; + *cs++ = MI_NOOP; + *cs++ = MI_NOOP; + *cs++ = MI_NOOP; + } + + *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; + + intel_ring_advance(rq, cs); + return 0; +} + +static struct i915_request * +semaphore_queue(struct intel_engine_cs *engine, struct i915_vma *vma, int idx) +{ + struct intel_context *ce; + struct i915_request *rq; + int err; + + ce = intel_context_create(engine); + if (IS_ERR(ce)) + return ERR_CAST(ce); + + rq = intel_context_create_request(ce); + if (IS_ERR(rq)) + goto out_ce; + + err = 0; + if (rq->engine->emit_init_breadcrumb) + err = rq->engine->emit_init_breadcrumb(rq); + if (err == 0) + err = emit_semaphore_chain(rq, vma, idx); + if (err == 0) + i915_request_get(rq); + i915_request_add(rq); + if (err) + rq = ERR_PTR(err); + +out_ce: + intel_context_put(ce); + return rq; +} + +static int +release_queue(struct intel_engine_cs *engine, + struct i915_vma *vma, + int idx, int prio) +{ + struct i915_sched_attr attr = { + .priority = prio, + }; + struct i915_request *rq; + u32 *cs; + + rq = intel_engine_create_kernel_request(engine); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) { + i915_request_add(rq); + return PTR_ERR(cs); + } + + *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; + *cs++ = i915_ggtt_offset(vma) + 4 * (idx - 1); + *cs++ = 0; + *cs++ = 1; + + intel_ring_advance(rq, cs); + + i915_request_get(rq); + i915_request_add(rq); + + local_bh_disable(); + engine->schedule(rq, &attr); + local_bh_enable(); /* kick tasklet */ + + i915_request_put(rq); + + return 0; +} + +static int +slice_semaphore_queue(struct intel_engine_cs *outer, + struct i915_vma *vma, + int count) +{ + struct intel_engine_cs *engine; + struct i915_request *head; + enum intel_engine_id id; + int err, i, n = 0; + + head = semaphore_queue(outer, vma, n++); + if (IS_ERR(head)) + return PTR_ERR(head); + + for_each_engine(engine, outer->gt, id) { + for (i = 0; i < count; i++) { + struct i915_request *rq; + + rq = semaphore_queue(engine, vma, n++); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out; + } + + i915_request_put(rq); + } + } + + err = release_queue(outer, vma, n, I915_PRIORITY_BARRIER); + if (err) + goto out; + + if (i915_request_wait(head, 0, + 2 * outer->gt->info.num_engines * (count + 2) * (count + 3)) < 0) { + pr_err("Failed to slice along semaphore chain of length (%d, %d)!\n", + count, n); + GEM_TRACE_DUMP(); + intel_gt_set_wedged(outer->gt); + err = -EIO; + } + +out: + i915_request_put(head); + return err; +} + +static int live_timeslice_preempt(void *arg) +{ + struct intel_gt *gt = arg; + struct drm_i915_gem_object *obj; + struct intel_engine_cs *engine; + enum intel_engine_id id; + struct i915_vma *vma; + void *vaddr; + int err = 0; + + /* + * If a request takes too long, we would like to give other users + * a fair go on the GPU. In particular, users may create batches + * that wait upon external input, where that input may even be + * supplied by another GPU job. To avoid blocking forever, we + * need to preempt the current task and replace it with another + * ready task. + */ + if (!IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION)) + return 0; + + obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); + if (IS_ERR(obj)) + return PTR_ERR(obj); + + vma = i915_vma_instance(obj, >->ggtt->vm, NULL); + if (IS_ERR(vma)) { + err = PTR_ERR(vma); + goto err_obj; + } + + vaddr = i915_gem_object_pin_map(obj, I915_MAP_WC); + if (IS_ERR(vaddr)) { + err = PTR_ERR(vaddr); + goto err_obj; + } + + err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL); + if (err) + goto err_map; + + err = i915_vma_sync(vma); + if (err) + goto err_pin; + + for_each_engine(engine, gt, id) { + if (!intel_engine_has_preemption(engine)) + continue; + + memset(vaddr, 0, PAGE_SIZE); + + st_engine_heartbeat_disable(engine); + err = slice_semaphore_queue(engine, vma, 5); + st_engine_heartbeat_enable(engine); + if (err) + goto err_pin; + + if (igt_flush_test(gt->i915)) { + err = -EIO; + goto err_pin; + } + } + +err_pin: + i915_vma_unpin(vma); +err_map: + i915_gem_object_unpin_map(obj); +err_obj: + i915_gem_object_put(obj); + return err; +} + +static struct i915_request * +create_rewinder(struct intel_context *ce, + struct i915_request *wait, + void *slot, int idx) +{ + const u32 offset = + i915_ggtt_offset(ce->engine->status_page.vma) + + offset_in_page(slot); + struct i915_request *rq; + u32 *cs; + int err; + + rq = intel_context_create_request(ce); + if (IS_ERR(rq)) + return rq; + + if (wait) { + err = i915_request_await_dma_fence(rq, &wait->fence); + if (err) + goto err; + } + + cs = intel_ring_begin(rq, 14); + if (IS_ERR(cs)) { + err = PTR_ERR(cs); + goto err; + } + + *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + *cs++ = MI_NOOP; + + *cs++ = MI_SEMAPHORE_WAIT | + MI_SEMAPHORE_GLOBAL_GTT | + MI_SEMAPHORE_POLL | + MI_SEMAPHORE_SAD_GTE_SDD; + *cs++ = idx; + *cs++ = offset; + *cs++ = 0; + + *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT; + *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(rq->engine->mmio_base)); + *cs++ = offset + idx * sizeof(u32); + *cs++ = 0; + + *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; + *cs++ = offset; + *cs++ = 0; + *cs++ = idx + 1; + + intel_ring_advance(rq, cs); + + rq->sched.attr.priority = I915_PRIORITY_MASK; + err = 0; +err: + i915_request_get(rq); + i915_request_add(rq); + if (err) { + i915_request_put(rq); + return ERR_PTR(err); + } + + return rq; +} + +static int live_timeslice_rewind(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + + /* + * The usual presumption on timeslice expiration is that we replace + * the active context with another. However, given a chain of + * dependencies we may end up with replacing the context with itself, + * but only a few of those requests, forcing us to rewind the + * RING_TAIL of the original request. + */ + if (!IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION)) + return 0; + + for_each_engine(engine, gt, id) { + enum { A1, A2, B1 }; + enum { X = 1, Z, Y }; + struct i915_request *rq[3] = {}; + struct intel_context *ce; + unsigned long timeslice; + int i, err = 0; + u32 *slot; + + if (!intel_engine_has_timeslices(engine)) + continue; + + /* + * A:rq1 -- semaphore wait, timestamp X + * A:rq2 -- write timestamp Y + * + * B:rq1 [await A:rq1] -- write timestamp Z + * + * Force timeslice, release semaphore. + * + * Expect execution/evaluation order XZY + */ + + st_engine_heartbeat_disable(engine); + timeslice = xchg(&engine->props.timeslice_duration_ms, 1); + + slot = memset32(engine->status_page.addr + 1000, 0, 4); + + ce = intel_context_create(engine); + if (IS_ERR(ce)) { + err = PTR_ERR(ce); + goto err; + } + + rq[A1] = create_rewinder(ce, NULL, slot, X); + if (IS_ERR(rq[A1])) { + intel_context_put(ce); + goto err; + } + + rq[A2] = create_rewinder(ce, NULL, slot, Y); + intel_context_put(ce); + if (IS_ERR(rq[A2])) + goto err; + + err = wait_for_submit(engine, rq[A2], HZ / 2); + if (err) { + pr_err("%s: failed to submit first context\n", + engine->name); + goto err; + } + + ce = intel_context_create(engine); + if (IS_ERR(ce)) { + err = PTR_ERR(ce); + goto err; + } + + rq[B1] = create_rewinder(ce, rq[A1], slot, Z); + intel_context_put(ce); + if (IS_ERR(rq[2])) + goto err; + + err = wait_for_submit(engine, rq[B1], HZ / 2); + if (err) { + pr_err("%s: failed to submit second context\n", + engine->name); + goto err; + } + + /* ELSP[] = { { A:rq1, A:rq2 }, { B:rq1 } } */ + ENGINE_TRACE(engine, "forcing tasklet for rewind\n"); + if (i915_request_is_active(rq[A2])) { /* semaphore yielded! */ + /* Wait for the timeslice to kick in */ + del_timer(&engine->execlists.timer); + tasklet_hi_schedule(&engine->execlists.tasklet); + intel_engine_flush_submission(engine); + } + /* -> ELSP[] = { { A:rq1 }, { B:rq1 } } */ + GEM_BUG_ON(!i915_request_is_active(rq[A1])); + GEM_BUG_ON(!i915_request_is_active(rq[B1])); + GEM_BUG_ON(i915_request_is_active(rq[A2])); + + /* Release the hounds! */ + slot[0] = 1; + wmb(); /* "pairs" with GPU; paranoid kick of internal CPU$ */ + + for (i = 1; i <= 3; i++) { + unsigned long timeout = jiffies + HZ / 2; + + while (!READ_ONCE(slot[i]) && + time_before(jiffies, timeout)) + ; + + if (!time_before(jiffies, timeout)) { + pr_err("%s: rq[%d] timed out\n", + engine->name, i - 1); + err = -ETIME; + goto err; + } + + pr_debug("%s: slot[%d]:%x\n", engine->name, i, slot[i]); + } + + /* XZY: XZ < XY */ + if (slot[Z] - slot[X] >= slot[Y] - slot[X]) { + pr_err("%s: timeslicing did not run context B [%u] before A [%u]!\n", + engine->name, + slot[Z] - slot[X], + slot[Y] - slot[X]); + err = -EINVAL; + } + +err: + memset32(&slot[0], -1, 4); + wmb(); + + engine->props.timeslice_duration_ms = timeslice; + st_engine_heartbeat_enable(engine); + for (i = 0; i < 3; i++) + i915_request_put(rq[i]); + if (igt_flush_test(gt->i915)) + err = -EIO; + if (err) + return err; + } + + return 0; +} + +static struct i915_request *nop_request(struct intel_engine_cs *engine) +{ + struct i915_request *rq; + + rq = intel_engine_create_kernel_request(engine); + if (IS_ERR(rq)) + return rq; + + i915_request_get(rq); + i915_request_add(rq); + + return rq; +} + +static long slice_timeout(struct intel_engine_cs *engine) +{ + long timeout; + + /* Enough time for a timeslice to kick in, and kick out */ + timeout = 2 * msecs_to_jiffies_timeout(timeslice(engine)); + + /* Enough time for the nop request to complete */ + timeout += HZ / 5; + + return timeout + 1; +} + +static int live_timeslice_queue(void *arg) +{ + struct intel_gt *gt = arg; + struct drm_i915_gem_object *obj; + struct intel_engine_cs *engine; + enum intel_engine_id id; + struct i915_vma *vma; + void *vaddr; + int err = 0; + + /* + * Make sure that even if ELSP[0] and ELSP[1] are filled with + * timeslicing between them disabled, we *do* enable timeslicing + * if the queue demands it. (Normally, we do not submit if + * ELSP[1] is already occupied, so must rely on timeslicing to + * eject ELSP[0] in favour of the queue.) + */ + if (!IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION)) + return 0; + + obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); + if (IS_ERR(obj)) + return PTR_ERR(obj); + + vma = i915_vma_instance(obj, >->ggtt->vm, NULL); + if (IS_ERR(vma)) { + err = PTR_ERR(vma); + goto err_obj; + } + + vaddr = i915_gem_object_pin_map(obj, I915_MAP_WC); + if (IS_ERR(vaddr)) { + err = PTR_ERR(vaddr); + goto err_obj; + } + + err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL); + if (err) + goto err_map; + + err = i915_vma_sync(vma); + if (err) + goto err_pin; + + for_each_engine(engine, gt, id) { + struct i915_sched_attr attr = { + .priority = I915_USER_PRIORITY(I915_PRIORITY_MAX), + }; + struct i915_request *rq, *nop; + + if (!intel_engine_has_preemption(engine)) + continue; + + st_engine_heartbeat_disable(engine); + memset(vaddr, 0, PAGE_SIZE); + + /* ELSP[0]: semaphore wait */ + rq = semaphore_queue(engine, vma, 0); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_heartbeat; + } + engine->schedule(rq, &attr); + err = wait_for_submit(engine, rq, HZ / 2); + if (err) { + pr_err("%s: Timed out trying to submit semaphores\n", + engine->name); + goto err_rq; + } + + /* ELSP[1]: nop request */ + nop = nop_request(engine); + if (IS_ERR(nop)) { + err = PTR_ERR(nop); + goto err_rq; + } + err = wait_for_submit(engine, nop, HZ / 2); + i915_request_put(nop); + if (err) { + pr_err("%s: Timed out trying to submit nop\n", + engine->name); + goto err_rq; + } + + GEM_BUG_ON(i915_request_completed(rq)); + GEM_BUG_ON(execlists_active(&engine->execlists) != rq); + + /* Queue: semaphore signal, matching priority as semaphore */ + err = release_queue(engine, vma, 1, effective_prio(rq)); + if (err) + goto err_rq; + + /* Wait until we ack the release_queue and start timeslicing */ + do { + cond_resched(); + intel_engine_flush_submission(engine); + } while (READ_ONCE(engine->execlists.pending[0])); + + /* Timeslice every jiffy, so within 2 we should signal */ + if (i915_request_wait(rq, 0, slice_timeout(engine)) < 0) { + struct drm_printer p = + drm_info_printer(gt->i915->drm.dev); + + pr_err("%s: Failed to timeslice into queue\n", + engine->name); + intel_engine_dump(engine, &p, + "%s\n", engine->name); + + memset(vaddr, 0xff, PAGE_SIZE); + err = -EIO; + } +err_rq: + i915_request_put(rq); +err_heartbeat: + st_engine_heartbeat_enable(engine); + if (err) + break; + } + +err_pin: + i915_vma_unpin(vma); +err_map: + i915_gem_object_unpin_map(obj); +err_obj: + i915_gem_object_put(obj); + return err; +} + +static int live_timeslice_nopreempt(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + struct igt_spinner spin; + int err = 0; + + /* + * We should not timeslice into a request that is marked with + * I915_REQUEST_NOPREEMPT. + */ + if (!IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION)) + return 0; + + if (igt_spinner_init(&spin, gt)) + return -ENOMEM; + + for_each_engine(engine, gt, id) { + struct intel_context *ce; + struct i915_request *rq; + unsigned long timeslice; + + if (!intel_engine_has_preemption(engine)) + continue; + + ce = intel_context_create(engine); + if (IS_ERR(ce)) { + err = PTR_ERR(ce); + break; + } + + st_engine_heartbeat_disable(engine); + timeslice = xchg(&engine->props.timeslice_duration_ms, 1); + + /* Create an unpreemptible spinner */ + + rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK); + intel_context_put(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out_heartbeat; + } + + i915_request_get(rq); + i915_request_add(rq); + + if (!igt_wait_for_spinner(&spin, rq)) { + i915_request_put(rq); + err = -ETIME; + goto out_spin; + } + + set_bit(I915_FENCE_FLAG_NOPREEMPT, &rq->fence.flags); + i915_request_put(rq); + + /* Followed by a maximum priority barrier (heartbeat) */ + + ce = intel_context_create(engine); + if (IS_ERR(ce)) { + err = PTR_ERR(ce); + goto out_spin; + } + + rq = intel_context_create_request(ce); + intel_context_put(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out_spin; + } + + rq->sched.attr.priority = I915_PRIORITY_BARRIER; + i915_request_get(rq); + i915_request_add(rq); + + /* + * Wait until the barrier is in ELSP, and we know timeslicing + * will have been activated. + */ + if (wait_for_submit(engine, rq, HZ / 2)) { + i915_request_put(rq); + err = -ETIME; + goto out_spin; + } + + /* + * Since the ELSP[0] request is unpreemptible, it should not + * allow the maximum priority barrier through. Wait long + * enough to see if it is timesliced in by mistake. + */ + if (i915_request_wait(rq, 0, slice_timeout(engine)) >= 0) { + pr_err("%s: I915_PRIORITY_BARRIER request completed, bypassing no-preempt request\n", + engine->name); + err = -EINVAL; + } + i915_request_put(rq); + +out_spin: + igt_spinner_end(&spin); +out_heartbeat: + xchg(&engine->props.timeslice_duration_ms, timeslice); + st_engine_heartbeat_enable(engine); + if (err) + break; + + if (igt_flush_test(gt->i915)) { + err = -EIO; + break; + } + } + + igt_spinner_fini(&spin); + return err; +} + +static int live_busywait_preempt(void *arg) +{ + struct intel_gt *gt = arg; + struct i915_gem_context *ctx_hi, *ctx_lo; + struct intel_engine_cs *engine; + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + enum intel_engine_id id; + int err = -ENOMEM; + u32 *map; + + /* + * Verify that even without HAS_LOGICAL_RING_PREEMPTION, we can + * preempt the busywaits used to synchronise between rings. + */ + + ctx_hi = kernel_context(gt->i915); + if (!ctx_hi) + return -ENOMEM; + ctx_hi->sched.priority = + I915_USER_PRIORITY(I915_CONTEXT_MAX_USER_PRIORITY); + + ctx_lo = kernel_context(gt->i915); + if (!ctx_lo) + goto err_ctx_hi; + ctx_lo->sched.priority = + I915_USER_PRIORITY(I915_CONTEXT_MIN_USER_PRIORITY); + + obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); + if (IS_ERR(obj)) { + err = PTR_ERR(obj); + goto err_ctx_lo; + } + + map = i915_gem_object_pin_map(obj, I915_MAP_WC); + if (IS_ERR(map)) { + err = PTR_ERR(map); + goto err_obj; + } + + vma = i915_vma_instance(obj, >->ggtt->vm, NULL); + if (IS_ERR(vma)) { + err = PTR_ERR(vma); + goto err_map; + } + + err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL); + if (err) + goto err_map; + + err = i915_vma_sync(vma); + if (err) + goto err_vma; + + for_each_engine(engine, gt, id) { + struct i915_request *lo, *hi; + struct igt_live_test t; + u32 *cs; + + if (!intel_engine_has_preemption(engine)) + continue; + + if (!intel_engine_can_store_dword(engine)) + continue; + + if (igt_live_test_begin(&t, gt->i915, __func__, engine->name)) { + err = -EIO; + goto err_vma; + } + + /* + * We create two requests. The low priority request + * busywaits on a semaphore (inside the ringbuffer where + * is should be preemptible) and the high priority requests + * uses a MI_STORE_DWORD_IMM to update the semaphore value + * allowing the first request to complete. If preemption + * fails, we hang instead. + */ + + lo = igt_request_alloc(ctx_lo, engine); + if (IS_ERR(lo)) { + err = PTR_ERR(lo); + goto err_vma; + } + + cs = intel_ring_begin(lo, 8); + if (IS_ERR(cs)) { + err = PTR_ERR(cs); + i915_request_add(lo); + goto err_vma; + } + + *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; + *cs++ = i915_ggtt_offset(vma); + *cs++ = 0; + *cs++ = 1; + + /* XXX Do we need a flush + invalidate here? */ + + *cs++ = MI_SEMAPHORE_WAIT | + MI_SEMAPHORE_GLOBAL_GTT | + MI_SEMAPHORE_POLL | + MI_SEMAPHORE_SAD_EQ_SDD; + *cs++ = 0; + *cs++ = i915_ggtt_offset(vma); + *cs++ = 0; + + intel_ring_advance(lo, cs); + + i915_request_get(lo); + i915_request_add(lo); + + if (wait_for(READ_ONCE(*map), 10)) { + i915_request_put(lo); + err = -ETIMEDOUT; + goto err_vma; + } + + /* Low priority request should be busywaiting now */ + if (i915_request_wait(lo, 0, 1) != -ETIME) { + i915_request_put(lo); + pr_err("%s: Busywaiting request did not!\n", + engine->name); + err = -EIO; + goto err_vma; + } + + hi = igt_request_alloc(ctx_hi, engine); + if (IS_ERR(hi)) { + err = PTR_ERR(hi); + i915_request_put(lo); + goto err_vma; + } + + cs = intel_ring_begin(hi, 4); + if (IS_ERR(cs)) { + err = PTR_ERR(cs); + i915_request_add(hi); + i915_request_put(lo); + goto err_vma; + } + + *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; + *cs++ = i915_ggtt_offset(vma); + *cs++ = 0; + *cs++ = 0; + + intel_ring_advance(hi, cs); + i915_request_add(hi); + + if (i915_request_wait(lo, 0, HZ / 5) < 0) { + struct drm_printer p = drm_info_printer(gt->i915->drm.dev); + + pr_err("%s: Failed to preempt semaphore busywait!\n", + engine->name); + + intel_engine_dump(engine, &p, "%s\n", engine->name); + GEM_TRACE_DUMP(); + + i915_request_put(lo); + intel_gt_set_wedged(gt); + err = -EIO; + goto err_vma; + } + GEM_BUG_ON(READ_ONCE(*map)); + i915_request_put(lo); + + if (igt_live_test_end(&t)) { + err = -EIO; + goto err_vma; + } + } + + err = 0; +err_vma: + i915_vma_unpin(vma); +err_map: + i915_gem_object_unpin_map(obj); +err_obj: + i915_gem_object_put(obj); +err_ctx_lo: + kernel_context_close(ctx_lo); +err_ctx_hi: + kernel_context_close(ctx_hi); + return err; +} + +static struct i915_request * +spinner_create_request(struct igt_spinner *spin, + struct i915_gem_context *ctx, + struct intel_engine_cs *engine, + u32 arb) +{ + struct intel_context *ce; + struct i915_request *rq; + + ce = i915_gem_context_get_engine(ctx, engine->legacy_idx); + if (IS_ERR(ce)) + return ERR_CAST(ce); + + rq = igt_spinner_create_request(spin, ce, arb); + intel_context_put(ce); + return rq; +} + +static int live_preempt(void *arg) +{ + struct intel_gt *gt = arg; + struct i915_gem_context *ctx_hi, *ctx_lo; + struct igt_spinner spin_hi, spin_lo; + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err = -ENOMEM; + + if (!HAS_LOGICAL_RING_PREEMPTION(gt->i915)) + return 0; + + if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PREEMPTION)) + pr_err("Logical preemption supported, but not exposed\n"); + + if (igt_spinner_init(&spin_hi, gt)) + return -ENOMEM; + + if (igt_spinner_init(&spin_lo, gt)) + goto err_spin_hi; + + ctx_hi = kernel_context(gt->i915); + if (!ctx_hi) + goto err_spin_lo; + ctx_hi->sched.priority = + I915_USER_PRIORITY(I915_CONTEXT_MAX_USER_PRIORITY); + + ctx_lo = kernel_context(gt->i915); + if (!ctx_lo) + goto err_ctx_hi; + ctx_lo->sched.priority = + I915_USER_PRIORITY(I915_CONTEXT_MIN_USER_PRIORITY); + + for_each_engine(engine, gt, id) { + struct igt_live_test t; + struct i915_request *rq; + + if (!intel_engine_has_preemption(engine)) + continue; + + if (igt_live_test_begin(&t, gt->i915, __func__, engine->name)) { + err = -EIO; + goto err_ctx_lo; + } + + rq = spinner_create_request(&spin_lo, ctx_lo, engine, + MI_ARB_CHECK); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_ctx_lo; + } + + i915_request_add(rq); + if (!igt_wait_for_spinner(&spin_lo, rq)) { + GEM_TRACE("lo spinner failed to start\n"); + GEM_TRACE_DUMP(); + intel_gt_set_wedged(gt); + err = -EIO; + goto err_ctx_lo; + } + + rq = spinner_create_request(&spin_hi, ctx_hi, engine, + MI_ARB_CHECK); + if (IS_ERR(rq)) { + igt_spinner_end(&spin_lo); + err = PTR_ERR(rq); + goto err_ctx_lo; + } + + i915_request_add(rq); + if (!igt_wait_for_spinner(&spin_hi, rq)) { + GEM_TRACE("hi spinner failed to start\n"); + GEM_TRACE_DUMP(); + intel_gt_set_wedged(gt); + err = -EIO; + goto err_ctx_lo; + } + + igt_spinner_end(&spin_hi); + igt_spinner_end(&spin_lo); + + if (igt_live_test_end(&t)) { + err = -EIO; + goto err_ctx_lo; + } + } + + err = 0; +err_ctx_lo: + kernel_context_close(ctx_lo); +err_ctx_hi: + kernel_context_close(ctx_hi); +err_spin_lo: + igt_spinner_fini(&spin_lo); +err_spin_hi: + igt_spinner_fini(&spin_hi); + return err; +} + +static int live_late_preempt(void *arg) +{ + struct intel_gt *gt = arg; + struct i915_gem_context *ctx_hi, *ctx_lo; + struct igt_spinner spin_hi, spin_lo; + struct intel_engine_cs *engine; + struct i915_sched_attr attr = {}; + enum intel_engine_id id; + int err = -ENOMEM; + + if (!HAS_LOGICAL_RING_PREEMPTION(gt->i915)) + return 0; + + if (igt_spinner_init(&spin_hi, gt)) + return -ENOMEM; + + if (igt_spinner_init(&spin_lo, gt)) + goto err_spin_hi; + + ctx_hi = kernel_context(gt->i915); + if (!ctx_hi) + goto err_spin_lo; + + ctx_lo = kernel_context(gt->i915); + if (!ctx_lo) + goto err_ctx_hi; + + /* Make sure ctx_lo stays before ctx_hi until we trigger preemption. */ + ctx_lo->sched.priority = I915_USER_PRIORITY(1); + + for_each_engine(engine, gt, id) { + struct igt_live_test t; + struct i915_request *rq; + + if (!intel_engine_has_preemption(engine)) + continue; + + if (igt_live_test_begin(&t, gt->i915, __func__, engine->name)) { + err = -EIO; + goto err_ctx_lo; + } + + rq = spinner_create_request(&spin_lo, ctx_lo, engine, + MI_ARB_CHECK); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_ctx_lo; + } + + i915_request_add(rq); + if (!igt_wait_for_spinner(&spin_lo, rq)) { + pr_err("First context failed to start\n"); + goto err_wedged; + } + + rq = spinner_create_request(&spin_hi, ctx_hi, engine, + MI_NOOP); + if (IS_ERR(rq)) { + igt_spinner_end(&spin_lo); + err = PTR_ERR(rq); + goto err_ctx_lo; + } + + i915_request_add(rq); + if (igt_wait_for_spinner(&spin_hi, rq)) { + pr_err("Second context overtook first?\n"); + goto err_wedged; + } + + attr.priority = I915_USER_PRIORITY(I915_PRIORITY_MAX); + engine->schedule(rq, &attr); + + if (!igt_wait_for_spinner(&spin_hi, rq)) { + pr_err("High priority context failed to preempt the low priority context\n"); + GEM_TRACE_DUMP(); + goto err_wedged; + } + + igt_spinner_end(&spin_hi); + igt_spinner_end(&spin_lo); + + if (igt_live_test_end(&t)) { + err = -EIO; + goto err_ctx_lo; + } + } + + err = 0; +err_ctx_lo: + kernel_context_close(ctx_lo); +err_ctx_hi: + kernel_context_close(ctx_hi); +err_spin_lo: + igt_spinner_fini(&spin_lo); +err_spin_hi: + igt_spinner_fini(&spin_hi); + return err; + +err_wedged: + igt_spinner_end(&spin_hi); + igt_spinner_end(&spin_lo); + intel_gt_set_wedged(gt); + err = -EIO; + goto err_ctx_lo; +} + +struct preempt_client { + struct igt_spinner spin; + struct i915_gem_context *ctx; +}; + +static int preempt_client_init(struct intel_gt *gt, struct preempt_client *c) +{ + c->ctx = kernel_context(gt->i915); + if (!c->ctx) + return -ENOMEM; + + if (igt_spinner_init(&c->spin, gt)) + goto err_ctx; + + return 0; + +err_ctx: + kernel_context_close(c->ctx); + return -ENOMEM; +} + +static void preempt_client_fini(struct preempt_client *c) +{ + igt_spinner_fini(&c->spin); + kernel_context_close(c->ctx); +} + +static int live_nopreempt(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + struct preempt_client a, b; + enum intel_engine_id id; + int err = -ENOMEM; + + /* + * Verify that we can disable preemption for an individual request + * that may be being observed and not want to be interrupted. + */ + + if (!HAS_LOGICAL_RING_PREEMPTION(gt->i915)) + return 0; + + if (preempt_client_init(gt, &a)) + return -ENOMEM; + if (preempt_client_init(gt, &b)) + goto err_client_a; + b.ctx->sched.priority = I915_USER_PRIORITY(I915_PRIORITY_MAX); + + for_each_engine(engine, gt, id) { + struct i915_request *rq_a, *rq_b; + + if (!intel_engine_has_preemption(engine)) + continue; + + engine->execlists.preempt_hang.count = 0; + + rq_a = spinner_create_request(&a.spin, + a.ctx, engine, + MI_ARB_CHECK); + if (IS_ERR(rq_a)) { + err = PTR_ERR(rq_a); + goto err_client_b; + } + + /* Low priority client, but unpreemptable! */ + __set_bit(I915_FENCE_FLAG_NOPREEMPT, &rq_a->fence.flags); + + i915_request_add(rq_a); + if (!igt_wait_for_spinner(&a.spin, rq_a)) { + pr_err("First client failed to start\n"); + goto err_wedged; + } + + rq_b = spinner_create_request(&b.spin, + b.ctx, engine, + MI_ARB_CHECK); + if (IS_ERR(rq_b)) { + err = PTR_ERR(rq_b); + goto err_client_b; + } + + i915_request_add(rq_b); + + /* B is much more important than A! (But A is unpreemptable.) */ + GEM_BUG_ON(rq_prio(rq_b) <= rq_prio(rq_a)); + + /* Wait long enough for preemption and timeslicing */ + if (igt_wait_for_spinner(&b.spin, rq_b)) { + pr_err("Second client started too early!\n"); + goto err_wedged; + } + + igt_spinner_end(&a.spin); + + if (!igt_wait_for_spinner(&b.spin, rq_b)) { + pr_err("Second client failed to start\n"); + goto err_wedged; + } + + igt_spinner_end(&b.spin); + + if (engine->execlists.preempt_hang.count) { + pr_err("Preemption recorded x%d; should have been suppressed!\n", + engine->execlists.preempt_hang.count); + err = -EINVAL; + goto err_wedged; + } + + if (igt_flush_test(gt->i915)) + goto err_wedged; + } + + err = 0; +err_client_b: + preempt_client_fini(&b); +err_client_a: + preempt_client_fini(&a); + return err; + +err_wedged: + igt_spinner_end(&b.spin); + igt_spinner_end(&a.spin); + intel_gt_set_wedged(gt); + err = -EIO; + goto err_client_b; +} + +struct live_preempt_cancel { + struct intel_engine_cs *engine; + struct preempt_client a, b; +}; + +static int __cancel_active0(struct live_preempt_cancel *arg) +{ + struct i915_request *rq; + struct igt_live_test t; + int err; + + /* Preempt cancel of ELSP0 */ + GEM_TRACE("%s(%s)\n", __func__, arg->engine->name); + if (igt_live_test_begin(&t, arg->engine->i915, + __func__, arg->engine->name)) + return -EIO; + + rq = spinner_create_request(&arg->a.spin, + arg->a.ctx, arg->engine, + MI_ARB_CHECK); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + clear_bit(CONTEXT_BANNED, &rq->context->flags); + i915_request_get(rq); + i915_request_add(rq); + if (!igt_wait_for_spinner(&arg->a.spin, rq)) { + err = -EIO; + goto out; + } + + intel_context_set_banned(rq->context); + err = intel_engine_pulse(arg->engine); + if (err) + goto out; + + err = wait_for_reset(arg->engine, rq, HZ / 2); + if (err) { + pr_err("Cancelled inflight0 request did not reset\n"); + goto out; + } + +out: + i915_request_put(rq); + if (igt_live_test_end(&t)) + err = -EIO; + return err; +} + +static int __cancel_active1(struct live_preempt_cancel *arg) +{ + struct i915_request *rq[2] = {}; + struct igt_live_test t; + int err; + + /* Preempt cancel of ELSP1 */ + GEM_TRACE("%s(%s)\n", __func__, arg->engine->name); + if (igt_live_test_begin(&t, arg->engine->i915, + __func__, arg->engine->name)) + return -EIO; + + rq[0] = spinner_create_request(&arg->a.spin, + arg->a.ctx, arg->engine, + MI_NOOP); /* no preemption */ + if (IS_ERR(rq[0])) + return PTR_ERR(rq[0]); + + clear_bit(CONTEXT_BANNED, &rq[0]->context->flags); + i915_request_get(rq[0]); + i915_request_add(rq[0]); + if (!igt_wait_for_spinner(&arg->a.spin, rq[0])) { + err = -EIO; + goto out; + } + + rq[1] = spinner_create_request(&arg->b.spin, + arg->b.ctx, arg->engine, + MI_ARB_CHECK); + if (IS_ERR(rq[1])) { + err = PTR_ERR(rq[1]); + goto out; + } + + clear_bit(CONTEXT_BANNED, &rq[1]->context->flags); + i915_request_get(rq[1]); + err = i915_request_await_dma_fence(rq[1], &rq[0]->fence); + i915_request_add(rq[1]); + if (err) + goto out; + + intel_context_set_banned(rq[1]->context); + err = intel_engine_pulse(arg->engine); + if (err) + goto out; + + igt_spinner_end(&arg->a.spin); + err = wait_for_reset(arg->engine, rq[1], HZ / 2); + if (err) + goto out; + + if (rq[0]->fence.error != 0) { + pr_err("Normal inflight0 request did not complete\n"); + err = -EINVAL; + goto out; + } + + if (rq[1]->fence.error != -EIO) { + pr_err("Cancelled inflight1 request did not report -EIO\n"); + err = -EINVAL; + goto out; + } + +out: + i915_request_put(rq[1]); + i915_request_put(rq[0]); + if (igt_live_test_end(&t)) + err = -EIO; + return err; +} + +static int __cancel_queued(struct live_preempt_cancel *arg) +{ + struct i915_request *rq[3] = {}; + struct igt_live_test t; + int err; + + /* Full ELSP and one in the wings */ + GEM_TRACE("%s(%s)\n", __func__, arg->engine->name); + if (igt_live_test_begin(&t, arg->engine->i915, + __func__, arg->engine->name)) + return -EIO; + + rq[0] = spinner_create_request(&arg->a.spin, + arg->a.ctx, arg->engine, + MI_ARB_CHECK); + if (IS_ERR(rq[0])) + return PTR_ERR(rq[0]); + + clear_bit(CONTEXT_BANNED, &rq[0]->context->flags); + i915_request_get(rq[0]); + i915_request_add(rq[0]); + if (!igt_wait_for_spinner(&arg->a.spin, rq[0])) { + err = -EIO; + goto out; + } + + rq[1] = igt_request_alloc(arg->b.ctx, arg->engine); + if (IS_ERR(rq[1])) { + err = PTR_ERR(rq[1]); + goto out; + } + + clear_bit(CONTEXT_BANNED, &rq[1]->context->flags); + i915_request_get(rq[1]); + err = i915_request_await_dma_fence(rq[1], &rq[0]->fence); + i915_request_add(rq[1]); + if (err) + goto out; + + rq[2] = spinner_create_request(&arg->b.spin, + arg->a.ctx, arg->engine, + MI_ARB_CHECK); + if (IS_ERR(rq[2])) { + err = PTR_ERR(rq[2]); + goto out; + } + + i915_request_get(rq[2]); + err = i915_request_await_dma_fence(rq[2], &rq[1]->fence); + i915_request_add(rq[2]); + if (err) + goto out; + + intel_context_set_banned(rq[2]->context); + err = intel_engine_pulse(arg->engine); + if (err) + goto out; + + err = wait_for_reset(arg->engine, rq[2], HZ / 2); + if (err) + goto out; + + if (rq[0]->fence.error != -EIO) { + pr_err("Cancelled inflight0 request did not report -EIO\n"); + err = -EINVAL; + goto out; + } + + if (rq[1]->fence.error != 0) { + pr_err("Normal inflight1 request did not complete\n"); + err = -EINVAL; + goto out; + } + + if (rq[2]->fence.error != -EIO) { + pr_err("Cancelled queued request did not report -EIO\n"); + err = -EINVAL; + goto out; + } + +out: + i915_request_put(rq[2]); + i915_request_put(rq[1]); + i915_request_put(rq[0]); + if (igt_live_test_end(&t)) + err = -EIO; + return err; +} + +static int __cancel_hostile(struct live_preempt_cancel *arg) +{ + struct i915_request *rq; + int err; + + /* Preempt cancel non-preemptible spinner in ELSP0 */ + if (!IS_ACTIVE(CONFIG_DRM_I915_PREEMPT_TIMEOUT)) + return 0; + + if (!intel_has_reset_engine(arg->engine->gt)) + return 0; + + GEM_TRACE("%s(%s)\n", __func__, arg->engine->name); + rq = spinner_create_request(&arg->a.spin, + arg->a.ctx, arg->engine, + MI_NOOP); /* preemption disabled */ + if (IS_ERR(rq)) + return PTR_ERR(rq); + + clear_bit(CONTEXT_BANNED, &rq->context->flags); + i915_request_get(rq); + i915_request_add(rq); + if (!igt_wait_for_spinner(&arg->a.spin, rq)) { + err = -EIO; + goto out; + } + + intel_context_set_banned(rq->context); + err = intel_engine_pulse(arg->engine); /* force reset */ + if (err) + goto out; + + err = wait_for_reset(arg->engine, rq, HZ / 2); + if (err) { + pr_err("Cancelled inflight0 request did not reset\n"); + goto out; + } + +out: + i915_request_put(rq); + if (igt_flush_test(arg->engine->i915)) + err = -EIO; + return err; +} + +static int live_preempt_cancel(void *arg) +{ + struct intel_gt *gt = arg; + struct live_preempt_cancel data; + enum intel_engine_id id; + int err = -ENOMEM; + + /* + * To cancel an inflight context, we need to first remove it from the + * GPU. That sounds like preemption! Plus a little bit of bookkeeping. + */ + + if (!HAS_LOGICAL_RING_PREEMPTION(gt->i915)) + return 0; + + if (preempt_client_init(gt, &data.a)) + return -ENOMEM; + if (preempt_client_init(gt, &data.b)) + goto err_client_a; + + for_each_engine(data.engine, gt, id) { + if (!intel_engine_has_preemption(data.engine)) + continue; + + err = __cancel_active0(&data); + if (err) + goto err_wedged; + + err = __cancel_active1(&data); + if (err) + goto err_wedged; + + err = __cancel_queued(&data); + if (err) + goto err_wedged; + + err = __cancel_hostile(&data); + if (err) + goto err_wedged; + } + + err = 0; +err_client_b: + preempt_client_fini(&data.b); +err_client_a: + preempt_client_fini(&data.a); + return err; + +err_wedged: + GEM_TRACE_DUMP(); + igt_spinner_end(&data.b.spin); + igt_spinner_end(&data.a.spin); + intel_gt_set_wedged(gt); + goto err_client_b; +} + +static int live_suppress_self_preempt(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + struct i915_sched_attr attr = { + .priority = I915_USER_PRIORITY(I915_PRIORITY_MAX) + }; + struct preempt_client a, b; + enum intel_engine_id id; + int err = -ENOMEM; + + /* + * Verify that if a preemption request does not cause a change in + * the current execution order, the preempt-to-idle injection is + * skipped and that we do not accidentally apply it after the CS + * completion event. + */ + + if (!HAS_LOGICAL_RING_PREEMPTION(gt->i915)) + return 0; + + if (intel_uc_uses_guc_submission(>->uc)) + return 0; /* presume black blox */ + + if (intel_vgpu_active(gt->i915)) + return 0; /* GVT forces single port & request submission */ + + if (preempt_client_init(gt, &a)) + return -ENOMEM; + if (preempt_client_init(gt, &b)) + goto err_client_a; + + for_each_engine(engine, gt, id) { + struct i915_request *rq_a, *rq_b; + int depth; + + if (!intel_engine_has_preemption(engine)) + continue; + + if (igt_flush_test(gt->i915)) + goto err_wedged; + + st_engine_heartbeat_disable(engine); + engine->execlists.preempt_hang.count = 0; + + rq_a = spinner_create_request(&a.spin, + a.ctx, engine, + MI_NOOP); + if (IS_ERR(rq_a)) { + err = PTR_ERR(rq_a); + st_engine_heartbeat_enable(engine); + goto err_client_b; + } + + i915_request_add(rq_a); + if (!igt_wait_for_spinner(&a.spin, rq_a)) { + pr_err("First client failed to start\n"); + st_engine_heartbeat_enable(engine); + goto err_wedged; + } + + /* Keep postponing the timer to avoid premature slicing */ + mod_timer(&engine->execlists.timer, jiffies + HZ); + for (depth = 0; depth < 8; depth++) { + rq_b = spinner_create_request(&b.spin, + b.ctx, engine, + MI_NOOP); + if (IS_ERR(rq_b)) { + err = PTR_ERR(rq_b); + st_engine_heartbeat_enable(engine); + goto err_client_b; + } + i915_request_add(rq_b); + + GEM_BUG_ON(i915_request_completed(rq_a)); + engine->schedule(rq_a, &attr); + igt_spinner_end(&a.spin); + + if (!igt_wait_for_spinner(&b.spin, rq_b)) { + pr_err("Second client failed to start\n"); + st_engine_heartbeat_enable(engine); + goto err_wedged; + } + + swap(a, b); + rq_a = rq_b; + } + igt_spinner_end(&a.spin); + + if (engine->execlists.preempt_hang.count) { + pr_err("Preemption on %s recorded x%d, depth %d; should have been suppressed!\n", + engine->name, + engine->execlists.preempt_hang.count, + depth); + st_engine_heartbeat_enable(engine); + err = -EINVAL; + goto err_client_b; + } + + st_engine_heartbeat_enable(engine); + if (igt_flush_test(gt->i915)) + goto err_wedged; + } + + err = 0; +err_client_b: + preempt_client_fini(&b); +err_client_a: + preempt_client_fini(&a); + return err; + +err_wedged: + igt_spinner_end(&b.spin); + igt_spinner_end(&a.spin); + intel_gt_set_wedged(gt); + err = -EIO; + goto err_client_b; +} + +static int live_chain_preempt(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + struct preempt_client hi, lo; + enum intel_engine_id id; + int err = -ENOMEM; + + /* + * Build a chain AB...BA between two contexts (A, B) and request + * preemption of the last request. It should then complete before + * the previously submitted spinner in B. + */ + + if (!HAS_LOGICAL_RING_PREEMPTION(gt->i915)) + return 0; + + if (preempt_client_init(gt, &hi)) + return -ENOMEM; + + if (preempt_client_init(gt, &lo)) + goto err_client_hi; + + for_each_engine(engine, gt, id) { + struct i915_sched_attr attr = { + .priority = I915_USER_PRIORITY(I915_PRIORITY_MAX), + }; + struct igt_live_test t; + struct i915_request *rq; + int ring_size, count, i; + + if (!intel_engine_has_preemption(engine)) + continue; + + rq = spinner_create_request(&lo.spin, + lo.ctx, engine, + MI_ARB_CHECK); + if (IS_ERR(rq)) + goto err_wedged; + + i915_request_get(rq); + i915_request_add(rq); + + ring_size = rq->wa_tail - rq->head; + if (ring_size < 0) + ring_size += rq->ring->size; + ring_size = rq->ring->size / ring_size; + pr_debug("%s(%s): Using maximum of %d requests\n", + __func__, engine->name, ring_size); + + igt_spinner_end(&lo.spin); + if (i915_request_wait(rq, 0, HZ / 2) < 0) { + pr_err("Timed out waiting to flush %s\n", engine->name); + i915_request_put(rq); + goto err_wedged; + } + i915_request_put(rq); + + if (igt_live_test_begin(&t, gt->i915, __func__, engine->name)) { + err = -EIO; + goto err_wedged; + } + + for_each_prime_number_from(count, 1, ring_size) { + rq = spinner_create_request(&hi.spin, + hi.ctx, engine, + MI_ARB_CHECK); + if (IS_ERR(rq)) + goto err_wedged; + i915_request_add(rq); + if (!igt_wait_for_spinner(&hi.spin, rq)) + goto err_wedged; + + rq = spinner_create_request(&lo.spin, + lo.ctx, engine, + MI_ARB_CHECK); + if (IS_ERR(rq)) + goto err_wedged; + i915_request_add(rq); + + for (i = 0; i < count; i++) { + rq = igt_request_alloc(lo.ctx, engine); + if (IS_ERR(rq)) + goto err_wedged; + i915_request_add(rq); + } + + rq = igt_request_alloc(hi.ctx, engine); + if (IS_ERR(rq)) + goto err_wedged; + + i915_request_get(rq); + i915_request_add(rq); + engine->schedule(rq, &attr); + + igt_spinner_end(&hi.spin); + if (i915_request_wait(rq, 0, HZ / 5) < 0) { + struct drm_printer p = + drm_info_printer(gt->i915->drm.dev); + + pr_err("Failed to preempt over chain of %d\n", + count); + intel_engine_dump(engine, &p, + "%s\n", engine->name); + i915_request_put(rq); + goto err_wedged; + } + igt_spinner_end(&lo.spin); + i915_request_put(rq); + + rq = igt_request_alloc(lo.ctx, engine); + if (IS_ERR(rq)) + goto err_wedged; + + i915_request_get(rq); + i915_request_add(rq); + + if (i915_request_wait(rq, 0, HZ / 5) < 0) { + struct drm_printer p = + drm_info_printer(gt->i915->drm.dev); + + pr_err("Failed to flush low priority chain of %d requests\n", + count); + intel_engine_dump(engine, &p, + "%s\n", engine->name); + + i915_request_put(rq); + goto err_wedged; + } + i915_request_put(rq); + } + + if (igt_live_test_end(&t)) { + err = -EIO; + goto err_wedged; + } + } + + err = 0; +err_client_lo: + preempt_client_fini(&lo); +err_client_hi: + preempt_client_fini(&hi); + return err; + +err_wedged: + igt_spinner_end(&hi.spin); + igt_spinner_end(&lo.spin); + intel_gt_set_wedged(gt); + err = -EIO; + goto err_client_lo; +} + +static int create_gang(struct intel_engine_cs *engine, + struct i915_request **prev) +{ + struct drm_i915_gem_object *obj; + struct intel_context *ce; + struct i915_request *rq; + struct i915_vma *vma; + u32 *cs; + int err; + + ce = intel_context_create(engine); + if (IS_ERR(ce)) + return PTR_ERR(ce); + + obj = i915_gem_object_create_internal(engine->i915, 4096); + if (IS_ERR(obj)) { + err = PTR_ERR(obj); + goto err_ce; + } + + vma = i915_vma_instance(obj, ce->vm, NULL); + if (IS_ERR(vma)) { + err = PTR_ERR(vma); + goto err_obj; + } + + err = i915_vma_pin(vma, 0, 0, PIN_USER); + if (err) + goto err_obj; + + cs = i915_gem_object_pin_map(obj, I915_MAP_WC); + if (IS_ERR(cs)) + goto err_obj; + + /* Semaphore target: spin until zero */ + *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + + *cs++ = MI_SEMAPHORE_WAIT | + MI_SEMAPHORE_POLL | + MI_SEMAPHORE_SAD_EQ_SDD; + *cs++ = 0; + *cs++ = lower_32_bits(vma->node.start); + *cs++ = upper_32_bits(vma->node.start); + + if (*prev) { + u64 offset = (*prev)->batch->node.start; + + /* Terminate the spinner in the next lower priority batch. */ + *cs++ = MI_STORE_DWORD_IMM_GEN4; + *cs++ = lower_32_bits(offset); + *cs++ = upper_32_bits(offset); + *cs++ = 0; + } + + *cs++ = MI_BATCH_BUFFER_END; + i915_gem_object_flush_map(obj); + i915_gem_object_unpin_map(obj); + + rq = intel_context_create_request(ce); + if (IS_ERR(rq)) + goto err_obj; + + rq->batch = i915_vma_get(vma); + i915_request_get(rq); + + i915_vma_lock(vma); + err = i915_request_await_object(rq, vma->obj, false); + if (!err) + err = i915_vma_move_to_active(vma, rq, 0); + if (!err) + err = rq->engine->emit_bb_start(rq, + vma->node.start, + PAGE_SIZE, 0); + i915_vma_unlock(vma); + i915_request_add(rq); + if (err) + goto err_rq; + + i915_gem_object_put(obj); + intel_context_put(ce); + + rq->mock.link.next = &(*prev)->mock.link; + *prev = rq; + return 0; + +err_rq: + i915_vma_put(rq->batch); + i915_request_put(rq); +err_obj: + i915_gem_object_put(obj); +err_ce: + intel_context_put(ce); + return err; +} + +static int __live_preempt_ring(struct intel_engine_cs *engine, + struct igt_spinner *spin, + int queue_sz, int ring_sz) +{ + struct intel_context *ce[2] = {}; + struct i915_request *rq; + struct igt_live_test t; + int err = 0; + int n; + + if (igt_live_test_begin(&t, engine->i915, __func__, engine->name)) + return -EIO; + + for (n = 0; n < ARRAY_SIZE(ce); n++) { + struct intel_context *tmp; + + tmp = intel_context_create(engine); + if (IS_ERR(tmp)) { + err = PTR_ERR(tmp); + goto err_ce; + } + + tmp->ring = __intel_context_ring_size(ring_sz); + + err = intel_context_pin(tmp); + if (err) { + intel_context_put(tmp); + goto err_ce; + } + + memset32(tmp->ring->vaddr, + 0xdeadbeef, /* trigger a hang if executed */ + tmp->ring->vma->size / sizeof(u32)); + + ce[n] = tmp; + } + + rq = igt_spinner_create_request(spin, ce[0], MI_ARB_CHECK); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_ce; + } + + i915_request_get(rq); + rq->sched.attr.priority = I915_PRIORITY_BARRIER; + i915_request_add(rq); + + if (!igt_wait_for_spinner(spin, rq)) { + intel_gt_set_wedged(engine->gt); + i915_request_put(rq); + err = -ETIME; + goto err_ce; + } + + /* Fill the ring, until we will cause a wrap */ + n = 0; + while (ce[0]->ring->tail - rq->wa_tail <= queue_sz) { + struct i915_request *tmp; + + tmp = intel_context_create_request(ce[0]); + if (IS_ERR(tmp)) { + err = PTR_ERR(tmp); + i915_request_put(rq); + goto err_ce; + } + + i915_request_add(tmp); + intel_engine_flush_submission(engine); + n++; + } + intel_engine_flush_submission(engine); + pr_debug("%s: Filled %d with %d nop tails {size:%x, tail:%x, emit:%x, rq.tail:%x}\n", + engine->name, queue_sz, n, + ce[0]->ring->size, + ce[0]->ring->tail, + ce[0]->ring->emit, + rq->tail); + i915_request_put(rq); + + /* Create a second request to preempt the first ring */ + rq = intel_context_create_request(ce[1]); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_ce; + } + + rq->sched.attr.priority = I915_PRIORITY_BARRIER; + i915_request_get(rq); + i915_request_add(rq); + + err = wait_for_submit(engine, rq, HZ / 2); + i915_request_put(rq); + if (err) { + pr_err("%s: preemption request was not submited\n", + engine->name); + err = -ETIME; + } + + pr_debug("%s: ring[0]:{ tail:%x, emit:%x }, ring[1]:{ tail:%x, emit:%x }\n", + engine->name, + ce[0]->ring->tail, ce[0]->ring->emit, + ce[1]->ring->tail, ce[1]->ring->emit); + +err_ce: + intel_engine_flush_submission(engine); + igt_spinner_end(spin); + for (n = 0; n < ARRAY_SIZE(ce); n++) { + if (IS_ERR_OR_NULL(ce[n])) + break; + + intel_context_unpin(ce[n]); + intel_context_put(ce[n]); + } + if (igt_live_test_end(&t)) + err = -EIO; + return err; +} + +static int live_preempt_ring(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + struct igt_spinner spin; + enum intel_engine_id id; + int err = 0; + + /* + * Check that we rollback large chunks of a ring in order to do a + * preemption event. Similar to live_unlite_ring, but looking at + * ring size rather than the impact of intel_ring_direction(). + */ + + if (igt_spinner_init(&spin, gt)) + return -ENOMEM; + + for_each_engine(engine, gt, id) { + int n; + + if (!intel_engine_has_preemption(engine)) + continue; + + if (!intel_engine_can_store_dword(engine)) + continue; + + st_engine_heartbeat_disable(engine); + + for (n = 0; n <= 3; n++) { + err = __live_preempt_ring(engine, &spin, + n * SZ_4K / 4, SZ_4K); + if (err) + break; + } + + st_engine_heartbeat_enable(engine); + if (err) + break; + } + + igt_spinner_fini(&spin); + return err; +} + +static int live_preempt_gang(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + + if (!HAS_LOGICAL_RING_PREEMPTION(gt->i915)) + return 0; + + /* + * Build as long a chain of preempters as we can, with each + * request higher priority than the last. Once we are ready, we release + * the last batch which then precolates down the chain, each releasing + * the next oldest in turn. The intent is to simply push as hard as we + * can with the number of preemptions, trying to exceed narrow HW + * limits. At a minimum, we insist that we can sort all the user + * high priority levels into execution order. + */ + + for_each_engine(engine, gt, id) { + struct i915_request *rq = NULL; + struct igt_live_test t; + IGT_TIMEOUT(end_time); + int prio = 0; + int err = 0; + u32 *cs; + + if (!intel_engine_has_preemption(engine)) + continue; + + if (igt_live_test_begin(&t, gt->i915, __func__, engine->name)) + return -EIO; + + do { + struct i915_sched_attr attr = { + .priority = I915_USER_PRIORITY(prio++), + }; + + err = create_gang(engine, &rq); + if (err) + break; + + /* Submit each spinner at increasing priority */ + engine->schedule(rq, &attr); + } while (prio <= I915_PRIORITY_MAX && + !__igt_timeout(end_time, NULL)); + pr_debug("%s: Preempt chain of %d requests\n", + engine->name, prio); + + /* + * Such that the last spinner is the highest priority and + * should execute first. When that spinner completes, + * it will terminate the next lowest spinner until there + * are no more spinners and the gang is complete. + */ + cs = i915_gem_object_pin_map(rq->batch->obj, I915_MAP_WC); + if (!IS_ERR(cs)) { + *cs = 0; + i915_gem_object_unpin_map(rq->batch->obj); + } else { + err = PTR_ERR(cs); + intel_gt_set_wedged(gt); + } + + while (rq) { /* wait for each rq from highest to lowest prio */ + struct i915_request *n = list_next_entry(rq, mock.link); + + if (err == 0 && i915_request_wait(rq, 0, HZ / 5) < 0) { + struct drm_printer p = + drm_info_printer(engine->i915->drm.dev); + + pr_err("Failed to flush chain of %d requests, at %d\n", + prio, rq_prio(rq) >> I915_USER_PRIORITY_SHIFT); + intel_engine_dump(engine, &p, + "%s\n", engine->name); + + err = -ETIME; + } + + i915_vma_put(rq->batch); + i915_request_put(rq); + rq = n; + } + + if (igt_live_test_end(&t)) + err = -EIO; + if (err) + return err; + } + + return 0; +} + +static struct i915_vma * +create_gpr_user(struct intel_engine_cs *engine, + struct i915_vma *result, + unsigned int offset) +{ + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + u32 *cs; + int err; + int i; + + obj = i915_gem_object_create_internal(engine->i915, 4096); + if (IS_ERR(obj)) + return ERR_CAST(obj); + + vma = i915_vma_instance(obj, result->vm, NULL); + if (IS_ERR(vma)) { + i915_gem_object_put(obj); + return vma; + } + + err = i915_vma_pin(vma, 0, 0, PIN_USER); + if (err) { + i915_vma_put(vma); + return ERR_PTR(err); + } + + cs = i915_gem_object_pin_map(obj, I915_MAP_WC); + if (IS_ERR(cs)) { + i915_vma_put(vma); + return ERR_CAST(cs); + } + + /* All GPR are clear for new contexts. We use GPR(0) as a constant */ + *cs++ = MI_LOAD_REGISTER_IMM(1); + *cs++ = CS_GPR(engine, 0); + *cs++ = 1; + + for (i = 1; i < NUM_GPR; i++) { + u64 addr; + + /* + * Perform: GPR[i]++ + * + * As we read and write into the context saved GPR[i], if + * we restart this batch buffer from an earlier point, we + * will repeat the increment and store a value > 1. + */ + *cs++ = MI_MATH(4); + *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(i)); + *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(0)); + *cs++ = MI_MATH_ADD; + *cs++ = MI_MATH_STORE(MI_MATH_REG(i), MI_MATH_REG_ACCU); + + addr = result->node.start + offset + i * sizeof(*cs); + *cs++ = MI_STORE_REGISTER_MEM_GEN8; + *cs++ = CS_GPR(engine, 2 * i); + *cs++ = lower_32_bits(addr); + *cs++ = upper_32_bits(addr); + + *cs++ = MI_SEMAPHORE_WAIT | + MI_SEMAPHORE_POLL | + MI_SEMAPHORE_SAD_GTE_SDD; + *cs++ = i; + *cs++ = lower_32_bits(result->node.start); + *cs++ = upper_32_bits(result->node.start); + } + + *cs++ = MI_BATCH_BUFFER_END; + i915_gem_object_flush_map(obj); + i915_gem_object_unpin_map(obj); + + return vma; +} + +static struct i915_vma *create_global(struct intel_gt *gt, size_t sz) +{ + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + int err; + + obj = i915_gem_object_create_internal(gt->i915, sz); + if (IS_ERR(obj)) + return ERR_CAST(obj); + + vma = i915_vma_instance(obj, >->ggtt->vm, NULL); + if (IS_ERR(vma)) { + i915_gem_object_put(obj); + return vma; + } + + err = i915_ggtt_pin(vma, NULL, 0, 0); + if (err) { + i915_vma_put(vma); + return ERR_PTR(err); + } + + return vma; +} + +static struct i915_request * +create_gpr_client(struct intel_engine_cs *engine, + struct i915_vma *global, + unsigned int offset) +{ + struct i915_vma *batch, *vma; + struct intel_context *ce; + struct i915_request *rq; + int err; + + ce = intel_context_create(engine); + if (IS_ERR(ce)) + return ERR_CAST(ce); + + vma = i915_vma_instance(global->obj, ce->vm, NULL); + if (IS_ERR(vma)) { + err = PTR_ERR(vma); + goto out_ce; + } + + err = i915_vma_pin(vma, 0, 0, PIN_USER); + if (err) + goto out_ce; + + batch = create_gpr_user(engine, vma, offset); + if (IS_ERR(batch)) { + err = PTR_ERR(batch); + goto out_vma; + } + + rq = intel_context_create_request(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out_batch; + } + + i915_vma_lock(vma); + err = i915_request_await_object(rq, vma->obj, false); + if (!err) + err = i915_vma_move_to_active(vma, rq, 0); + i915_vma_unlock(vma); + + i915_vma_lock(batch); + if (!err) + err = i915_request_await_object(rq, batch->obj, false); + if (!err) + err = i915_vma_move_to_active(batch, rq, 0); + if (!err) + err = rq->engine->emit_bb_start(rq, + batch->node.start, + PAGE_SIZE, 0); + i915_vma_unlock(batch); + i915_vma_unpin(batch); + + if (!err) + i915_request_get(rq); + i915_request_add(rq); + +out_batch: + i915_vma_put(batch); +out_vma: + i915_vma_unpin(vma); +out_ce: + intel_context_put(ce); + return err ? ERR_PTR(err) : rq; +} + +static int preempt_user(struct intel_engine_cs *engine, + struct i915_vma *global, + int id) +{ + struct i915_sched_attr attr = { + .priority = I915_PRIORITY_MAX + }; + struct i915_request *rq; + int err = 0; + u32 *cs; + + rq = intel_engine_create_kernel_request(engine); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) { + i915_request_add(rq); + return PTR_ERR(cs); + } + + *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; + *cs++ = i915_ggtt_offset(global); + *cs++ = 0; + *cs++ = id; + + intel_ring_advance(rq, cs); + + i915_request_get(rq); + i915_request_add(rq); + + engine->schedule(rq, &attr); + + if (i915_request_wait(rq, 0, HZ / 2) < 0) + err = -ETIME; + i915_request_put(rq); + + return err; +} + +static int live_preempt_user(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + struct i915_vma *global; + enum intel_engine_id id; + u32 *result; + int err = 0; + + if (!HAS_LOGICAL_RING_PREEMPTION(gt->i915)) + return 0; + + /* + * In our other tests, we look at preemption in carefully + * controlled conditions in the ringbuffer. Since most of the + * time is spent in user batches, most of our preemptions naturally + * occur there. We want to verify that when we preempt inside a batch + * we continue on from the current instruction and do not roll back + * to the start, or another earlier arbitration point. + * + * To verify this, we create a batch which is a mixture of + * MI_MATH (gpr++) MI_SRM (gpr) and preemption points. Then with + * a few preempting contexts thrown into the mix, we look for any + * repeated instructions (which show up as incorrect values). + */ + + global = create_global(gt, 4096); + if (IS_ERR(global)) + return PTR_ERR(global); + + result = i915_gem_object_pin_map(global->obj, I915_MAP_WC); + if (IS_ERR(result)) { + i915_vma_unpin_and_release(&global, 0); + return PTR_ERR(result); + } + + for_each_engine(engine, gt, id) { + struct i915_request *client[3] = {}; + struct igt_live_test t; + int i; + + if (!intel_engine_has_preemption(engine)) + continue; + + if (IS_GEN(gt->i915, 8) && engine->class != RENDER_CLASS) + continue; /* we need per-context GPR */ + + if (igt_live_test_begin(&t, gt->i915, __func__, engine->name)) { + err = -EIO; + break; + } + + memset(result, 0, 4096); + + for (i = 0; i < ARRAY_SIZE(client); i++) { + struct i915_request *rq; + + rq = create_gpr_client(engine, global, + NUM_GPR * i * sizeof(u32)); + if (IS_ERR(rq)) + goto end_test; + + client[i] = rq; + } + + /* Continuously preempt the set of 3 running contexts */ + for (i = 1; i <= NUM_GPR; i++) { + err = preempt_user(engine, global, i); + if (err) + goto end_test; + } + + if (READ_ONCE(result[0]) != NUM_GPR) { + pr_err("%s: Failed to release semaphore\n", + engine->name); + err = -EIO; + goto end_test; + } + + for (i = 0; i < ARRAY_SIZE(client); i++) { + int gpr; + + if (i915_request_wait(client[i], 0, HZ / 2) < 0) { + err = -ETIME; + goto end_test; + } + + for (gpr = 1; gpr < NUM_GPR; gpr++) { + if (result[NUM_GPR * i + gpr] != 1) { + pr_err("%s: Invalid result, client %d, gpr %d, result: %d\n", + engine->name, + i, gpr, result[NUM_GPR * i + gpr]); + err = -EINVAL; + goto end_test; + } + } + } + +end_test: + for (i = 0; i < ARRAY_SIZE(client); i++) { + if (!client[i]) + break; + + i915_request_put(client[i]); + } + + /* Flush the semaphores on error */ + smp_store_mb(result[0], -1); + if (igt_live_test_end(&t)) + err = -EIO; + if (err) + break; + } + + i915_vma_unpin_and_release(&global, I915_VMA_RELEASE_MAP); + return err; +} + +static int live_preempt_timeout(void *arg) +{ + struct intel_gt *gt = arg; + struct i915_gem_context *ctx_hi, *ctx_lo; + struct igt_spinner spin_lo; + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err = -ENOMEM; + + /* + * Check that we force preemption to occur by cancelling the previous + * context if it refuses to yield the GPU. + */ + if (!IS_ACTIVE(CONFIG_DRM_I915_PREEMPT_TIMEOUT)) + return 0; + + if (!HAS_LOGICAL_RING_PREEMPTION(gt->i915)) + return 0; + + if (!intel_has_reset_engine(gt)) + return 0; + + if (igt_spinner_init(&spin_lo, gt)) + return -ENOMEM; + + ctx_hi = kernel_context(gt->i915); + if (!ctx_hi) + goto err_spin_lo; + ctx_hi->sched.priority = + I915_USER_PRIORITY(I915_CONTEXT_MAX_USER_PRIORITY); + + ctx_lo = kernel_context(gt->i915); + if (!ctx_lo) + goto err_ctx_hi; + ctx_lo->sched.priority = + I915_USER_PRIORITY(I915_CONTEXT_MIN_USER_PRIORITY); + + for_each_engine(engine, gt, id) { + unsigned long saved_timeout; + struct i915_request *rq; + + if (!intel_engine_has_preemption(engine)) + continue; + + rq = spinner_create_request(&spin_lo, ctx_lo, engine, + MI_NOOP); /* preemption disabled */ + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_ctx_lo; + } + + i915_request_add(rq); + if (!igt_wait_for_spinner(&spin_lo, rq)) { + intel_gt_set_wedged(gt); + err = -EIO; + goto err_ctx_lo; + } + + rq = igt_request_alloc(ctx_hi, engine); + if (IS_ERR(rq)) { + igt_spinner_end(&spin_lo); + err = PTR_ERR(rq); + goto err_ctx_lo; + } + + /* Flush the previous CS ack before changing timeouts */ + while (READ_ONCE(engine->execlists.pending[0])) + cpu_relax(); + + saved_timeout = engine->props.preempt_timeout_ms; + engine->props.preempt_timeout_ms = 1; /* in ms, -> 1 jiffie */ + + i915_request_get(rq); + i915_request_add(rq); + + intel_engine_flush_submission(engine); + engine->props.preempt_timeout_ms = saved_timeout; + + if (i915_request_wait(rq, 0, HZ / 10) < 0) { + intel_gt_set_wedged(gt); + i915_request_put(rq); + err = -ETIME; + goto err_ctx_lo; + } + + igt_spinner_end(&spin_lo); + i915_request_put(rq); + } + + err = 0; +err_ctx_lo: + kernel_context_close(ctx_lo); +err_ctx_hi: + kernel_context_close(ctx_hi); +err_spin_lo: + igt_spinner_fini(&spin_lo); + return err; +} + +static int random_range(struct rnd_state *rnd, int min, int max) +{ + return i915_prandom_u32_max_state(max - min, rnd) + min; +} + +static int random_priority(struct rnd_state *rnd) +{ + return random_range(rnd, I915_PRIORITY_MIN, I915_PRIORITY_MAX); +} + +struct preempt_smoke { + struct intel_gt *gt; + struct i915_gem_context **contexts; + struct intel_engine_cs *engine; + struct drm_i915_gem_object *batch; + unsigned int ncontext; + struct rnd_state prng; + unsigned long count; +}; + +static struct i915_gem_context *smoke_context(struct preempt_smoke *smoke) +{ + return smoke->contexts[i915_prandom_u32_max_state(smoke->ncontext, + &smoke->prng)]; +} + +static int smoke_submit(struct preempt_smoke *smoke, + struct i915_gem_context *ctx, int prio, + struct drm_i915_gem_object *batch) +{ + struct i915_request *rq; + struct i915_vma *vma = NULL; + int err = 0; + + if (batch) { + struct i915_address_space *vm; + + vm = i915_gem_context_get_vm_rcu(ctx); + vma = i915_vma_instance(batch, vm, NULL); + i915_vm_put(vm); + if (IS_ERR(vma)) + return PTR_ERR(vma); + + err = i915_vma_pin(vma, 0, 0, PIN_USER); + if (err) + return err; + } + + ctx->sched.priority = prio; + + rq = igt_request_alloc(ctx, smoke->engine); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto unpin; + } + + if (vma) { + i915_vma_lock(vma); + err = i915_request_await_object(rq, vma->obj, false); + if (!err) + err = i915_vma_move_to_active(vma, rq, 0); + if (!err) + err = rq->engine->emit_bb_start(rq, + vma->node.start, + PAGE_SIZE, 0); + i915_vma_unlock(vma); + } + + i915_request_add(rq); + +unpin: + if (vma) + i915_vma_unpin(vma); + + return err; +} + +static int smoke_crescendo_thread(void *arg) +{ + struct preempt_smoke *smoke = arg; + IGT_TIMEOUT(end_time); + unsigned long count; + + count = 0; + do { + struct i915_gem_context *ctx = smoke_context(smoke); + int err; + + err = smoke_submit(smoke, + ctx, count % I915_PRIORITY_MAX, + smoke->batch); + if (err) + return err; + + count++; + } while (count < smoke->ncontext && !__igt_timeout(end_time, NULL)); + + smoke->count = count; + return 0; +} + +static int smoke_crescendo(struct preempt_smoke *smoke, unsigned int flags) +#define BATCH BIT(0) +{ + struct task_struct *tsk[I915_NUM_ENGINES] = {}; + struct preempt_smoke arg[I915_NUM_ENGINES]; + struct intel_engine_cs *engine; + enum intel_engine_id id; + unsigned long count; + int err = 0; + + for_each_engine(engine, smoke->gt, id) { + arg[id] = *smoke; + arg[id].engine = engine; + if (!(flags & BATCH)) + arg[id].batch = NULL; + arg[id].count = 0; + + tsk[id] = kthread_run(smoke_crescendo_thread, &arg, + "igt/smoke:%d", id); + if (IS_ERR(tsk[id])) { + err = PTR_ERR(tsk[id]); + break; + } + get_task_struct(tsk[id]); + } + + yield(); /* start all threads before we kthread_stop() */ + + count = 0; + for_each_engine(engine, smoke->gt, id) { + int status; + + if (IS_ERR_OR_NULL(tsk[id])) + continue; + + status = kthread_stop(tsk[id]); + if (status && !err) + err = status; + + count += arg[id].count; + + put_task_struct(tsk[id]); + } + + pr_info("Submitted %lu crescendo:%x requests across %d engines and %d contexts\n", + count, flags, smoke->gt->info.num_engines, smoke->ncontext); + return 0; +} + +static int smoke_random(struct preempt_smoke *smoke, unsigned int flags) +{ + enum intel_engine_id id; + IGT_TIMEOUT(end_time); + unsigned long count; + + count = 0; + do { + for_each_engine(smoke->engine, smoke->gt, id) { + struct i915_gem_context *ctx = smoke_context(smoke); + int err; + + err = smoke_submit(smoke, + ctx, random_priority(&smoke->prng), + flags & BATCH ? smoke->batch : NULL); + if (err) + return err; + + count++; + } + } while (count < smoke->ncontext && !__igt_timeout(end_time, NULL)); + + pr_info("Submitted %lu random:%x requests across %d engines and %d contexts\n", + count, flags, smoke->gt->info.num_engines, smoke->ncontext); + return 0; +} + +static int live_preempt_smoke(void *arg) +{ + struct preempt_smoke smoke = { + .gt = arg, + .prng = I915_RND_STATE_INITIALIZER(i915_selftest.random_seed), + .ncontext = 256, + }; + const unsigned int phase[] = { 0, BATCH }; + struct igt_live_test t; + int err = -ENOMEM; + u32 *cs; + int n; + + if (!HAS_LOGICAL_RING_PREEMPTION(smoke.gt->i915)) + return 0; + + smoke.contexts = kmalloc_array(smoke.ncontext, + sizeof(*smoke.contexts), + GFP_KERNEL); + if (!smoke.contexts) + return -ENOMEM; + + smoke.batch = + i915_gem_object_create_internal(smoke.gt->i915, PAGE_SIZE); + if (IS_ERR(smoke.batch)) { + err = PTR_ERR(smoke.batch); + goto err_free; + } + + cs = i915_gem_object_pin_map(smoke.batch, I915_MAP_WB); + if (IS_ERR(cs)) { + err = PTR_ERR(cs); + goto err_batch; + } + for (n = 0; n < PAGE_SIZE / sizeof(*cs) - 1; n++) + cs[n] = MI_ARB_CHECK; + cs[n] = MI_BATCH_BUFFER_END; + i915_gem_object_flush_map(smoke.batch); + i915_gem_object_unpin_map(smoke.batch); + + if (igt_live_test_begin(&t, smoke.gt->i915, __func__, "all")) { + err = -EIO; + goto err_batch; + } + + for (n = 0; n < smoke.ncontext; n++) { + smoke.contexts[n] = kernel_context(smoke.gt->i915); + if (!smoke.contexts[n]) + goto err_ctx; + } + + for (n = 0; n < ARRAY_SIZE(phase); n++) { + err = smoke_crescendo(&smoke, phase[n]); + if (err) + goto err_ctx; + + err = smoke_random(&smoke, phase[n]); + if (err) + goto err_ctx; + } + +err_ctx: + if (igt_live_test_end(&t)) + err = -EIO; + + for (n = 0; n < smoke.ncontext; n++) { + if (!smoke.contexts[n]) + break; + kernel_context_close(smoke.contexts[n]); + } + +err_batch: + i915_gem_object_put(smoke.batch); +err_free: + kfree(smoke.contexts); + + return err; +} + +static int nop_virtual_engine(struct intel_gt *gt, + struct intel_engine_cs **siblings, + unsigned int nsibling, + unsigned int nctx, + unsigned int flags) +#define CHAIN BIT(0) +{ + IGT_TIMEOUT(end_time); + struct i915_request *request[16] = {}; + struct intel_context *ve[16]; + unsigned long n, prime, nc; + struct igt_live_test t; + ktime_t times[2] = {}; + int err; + + GEM_BUG_ON(!nctx || nctx > ARRAY_SIZE(ve)); + + for (n = 0; n < nctx; n++) { + ve[n] = intel_execlists_create_virtual(siblings, nsibling); + if (IS_ERR(ve[n])) { + err = PTR_ERR(ve[n]); + nctx = n; + goto out; + } + + err = intel_context_pin(ve[n]); + if (err) { + intel_context_put(ve[n]); + nctx = n; + goto out; + } + } + + err = igt_live_test_begin(&t, gt->i915, __func__, ve[0]->engine->name); + if (err) + goto out; + + for_each_prime_number_from(prime, 1, 8192) { + times[1] = ktime_get_raw(); + + if (flags & CHAIN) { + for (nc = 0; nc < nctx; nc++) { + for (n = 0; n < prime; n++) { + struct i915_request *rq; + + rq = i915_request_create(ve[nc]); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out; + } + + if (request[nc]) + i915_request_put(request[nc]); + request[nc] = i915_request_get(rq); + i915_request_add(rq); + } + } + } else { + for (n = 0; n < prime; n++) { + for (nc = 0; nc < nctx; nc++) { + struct i915_request *rq; + + rq = i915_request_create(ve[nc]); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out; + } + + if (request[nc]) + i915_request_put(request[nc]); + request[nc] = i915_request_get(rq); + i915_request_add(rq); + } + } + } + + for (nc = 0; nc < nctx; nc++) { + if (i915_request_wait(request[nc], 0, HZ / 10) < 0) { + pr_err("%s(%s): wait for %llx:%lld timed out\n", + __func__, ve[0]->engine->name, + request[nc]->fence.context, + request[nc]->fence.seqno); + + GEM_TRACE("%s(%s) failed at request %llx:%lld\n", + __func__, ve[0]->engine->name, + request[nc]->fence.context, + request[nc]->fence.seqno); + GEM_TRACE_DUMP(); + intel_gt_set_wedged(gt); + break; + } + } + + times[1] = ktime_sub(ktime_get_raw(), times[1]); + if (prime == 1) + times[0] = times[1]; + + for (nc = 0; nc < nctx; nc++) { + i915_request_put(request[nc]); + request[nc] = NULL; + } + + if (__igt_timeout(end_time, NULL)) + break; + } + + err = igt_live_test_end(&t); + if (err) + goto out; + + pr_info("Requestx%d latencies on %s: 1 = %lluns, %lu = %lluns\n", + nctx, ve[0]->engine->name, ktime_to_ns(times[0]), + prime, div64_u64(ktime_to_ns(times[1]), prime)); + +out: + if (igt_flush_test(gt->i915)) + err = -EIO; + + for (nc = 0; nc < nctx; nc++) { + i915_request_put(request[nc]); + intel_context_unpin(ve[nc]); + intel_context_put(ve[nc]); + } + return err; +} + +static unsigned int +__select_siblings(struct intel_gt *gt, + unsigned int class, + struct intel_engine_cs **siblings, + bool (*filter)(const struct intel_engine_cs *)) +{ + unsigned int n = 0; + unsigned int inst; + + for (inst = 0; inst <= MAX_ENGINE_INSTANCE; inst++) { + if (!gt->engine_class[class][inst]) + continue; + + if (filter && !filter(gt->engine_class[class][inst])) + continue; + + siblings[n++] = gt->engine_class[class][inst]; + } + + return n; +} + +static unsigned int +select_siblings(struct intel_gt *gt, + unsigned int class, + struct intel_engine_cs **siblings) +{ + return __select_siblings(gt, class, siblings, NULL); +} + +static int live_virtual_engine(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1]; + struct intel_engine_cs *engine; + enum intel_engine_id id; + unsigned int class; + int err; + + if (intel_uc_uses_guc_submission(>->uc)) + return 0; + + for_each_engine(engine, gt, id) { + err = nop_virtual_engine(gt, &engine, 1, 1, 0); + if (err) { + pr_err("Failed to wrap engine %s: err=%d\n", + engine->name, err); + return err; + } + } + + for (class = 0; class <= MAX_ENGINE_CLASS; class++) { + int nsibling, n; + + nsibling = select_siblings(gt, class, siblings); + if (nsibling < 2) + continue; + + for (n = 1; n <= nsibling + 1; n++) { + err = nop_virtual_engine(gt, siblings, nsibling, + n, 0); + if (err) + return err; + } + + err = nop_virtual_engine(gt, siblings, nsibling, n, CHAIN); + if (err) + return err; + } + + return 0; +} + +static int mask_virtual_engine(struct intel_gt *gt, + struct intel_engine_cs **siblings, + unsigned int nsibling) +{ + struct i915_request *request[MAX_ENGINE_INSTANCE + 1]; + struct intel_context *ve; + struct igt_live_test t; + unsigned int n; + int err; + + /* + * Check that by setting the execution mask on a request, we can + * restrict it to our desired engine within the virtual engine. + */ + + ve = intel_execlists_create_virtual(siblings, nsibling); + if (IS_ERR(ve)) { + err = PTR_ERR(ve); + goto out_close; + } + + err = intel_context_pin(ve); + if (err) + goto out_put; + + err = igt_live_test_begin(&t, gt->i915, __func__, ve->engine->name); + if (err) + goto out_unpin; + + for (n = 0; n < nsibling; n++) { + request[n] = i915_request_create(ve); + if (IS_ERR(request[n])) { + err = PTR_ERR(request[n]); + nsibling = n; + goto out; + } + + /* Reverse order as it's more likely to be unnatural */ + request[n]->execution_mask = siblings[nsibling - n - 1]->mask; + + i915_request_get(request[n]); + i915_request_add(request[n]); + } + + for (n = 0; n < nsibling; n++) { + if (i915_request_wait(request[n], 0, HZ / 10) < 0) { + pr_err("%s(%s): wait for %llx:%lld timed out\n", + __func__, ve->engine->name, + request[n]->fence.context, + request[n]->fence.seqno); + + GEM_TRACE("%s(%s) failed at request %llx:%lld\n", + __func__, ve->engine->name, + request[n]->fence.context, + request[n]->fence.seqno); + GEM_TRACE_DUMP(); + intel_gt_set_wedged(gt); + err = -EIO; + goto out; + } + + if (request[n]->engine != siblings[nsibling - n - 1]) { + pr_err("Executed on wrong sibling '%s', expected '%s'\n", + request[n]->engine->name, + siblings[nsibling - n - 1]->name); + err = -EINVAL; + goto out; + } + } + + err = igt_live_test_end(&t); +out: + if (igt_flush_test(gt->i915)) + err = -EIO; + + for (n = 0; n < nsibling; n++) + i915_request_put(request[n]); + +out_unpin: + intel_context_unpin(ve); +out_put: + intel_context_put(ve); +out_close: + return err; +} + +static int live_virtual_mask(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1]; + unsigned int class; + int err; + + if (intel_uc_uses_guc_submission(>->uc)) + return 0; + + for (class = 0; class <= MAX_ENGINE_CLASS; class++) { + unsigned int nsibling; + + nsibling = select_siblings(gt, class, siblings); + if (nsibling < 2) + continue; + + err = mask_virtual_engine(gt, siblings, nsibling); + if (err) + return err; + } + + return 0; +} + +static int slicein_virtual_engine(struct intel_gt *gt, + struct intel_engine_cs **siblings, + unsigned int nsibling) +{ + const long timeout = slice_timeout(siblings[0]); + struct intel_context *ce; + struct i915_request *rq; + struct igt_spinner spin; + unsigned int n; + int err = 0; + + /* + * Virtual requests must take part in timeslicing on the target engines. + */ + + if (igt_spinner_init(&spin, gt)) + return -ENOMEM; + + for (n = 0; n < nsibling; n++) { + ce = intel_context_create(siblings[n]); + if (IS_ERR(ce)) { + err = PTR_ERR(ce); + goto out; + } + + rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK); + intel_context_put(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out; + } + + i915_request_add(rq); + } + + ce = intel_execlists_create_virtual(siblings, nsibling); + if (IS_ERR(ce)) { + err = PTR_ERR(ce); + goto out; + } + + rq = intel_context_create_request(ce); + intel_context_put(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out; + } + + i915_request_get(rq); + i915_request_add(rq); + if (i915_request_wait(rq, 0, timeout) < 0) { + GEM_TRACE_ERR("%s(%s) failed to slice in virtual request\n", + __func__, rq->engine->name); + GEM_TRACE_DUMP(); + intel_gt_set_wedged(gt); + err = -EIO; + } + i915_request_put(rq); + +out: + igt_spinner_end(&spin); + if (igt_flush_test(gt->i915)) + err = -EIO; + igt_spinner_fini(&spin); + return err; +} + +static int sliceout_virtual_engine(struct intel_gt *gt, + struct intel_engine_cs **siblings, + unsigned int nsibling) +{ + const long timeout = slice_timeout(siblings[0]); + struct intel_context *ce; + struct i915_request *rq; + struct igt_spinner spin; + unsigned int n; + int err = 0; + + /* + * Virtual requests must allow others a fair timeslice. + */ + + if (igt_spinner_init(&spin, gt)) + return -ENOMEM; + + /* XXX We do not handle oversubscription and fairness with normal rq */ + for (n = 0; n < nsibling; n++) { + ce = intel_execlists_create_virtual(siblings, nsibling); + if (IS_ERR(ce)) { + err = PTR_ERR(ce); + goto out; + } + + rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK); + intel_context_put(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out; + } + + i915_request_add(rq); + } + + for (n = 0; !err && n < nsibling; n++) { + ce = intel_context_create(siblings[n]); + if (IS_ERR(ce)) { + err = PTR_ERR(ce); + goto out; + } + + rq = intel_context_create_request(ce); + intel_context_put(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out; + } + + i915_request_get(rq); + i915_request_add(rq); + if (i915_request_wait(rq, 0, timeout) < 0) { + GEM_TRACE_ERR("%s(%s) failed to slice out virtual request\n", + __func__, siblings[n]->name); + GEM_TRACE_DUMP(); + intel_gt_set_wedged(gt); + err = -EIO; + } + i915_request_put(rq); + } + +out: + igt_spinner_end(&spin); + if (igt_flush_test(gt->i915)) + err = -EIO; + igt_spinner_fini(&spin); + return err; +} + +static int live_virtual_slice(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1]; + unsigned int class; + int err; + + if (intel_uc_uses_guc_submission(>->uc)) + return 0; + + for (class = 0; class <= MAX_ENGINE_CLASS; class++) { + unsigned int nsibling; + + nsibling = __select_siblings(gt, class, siblings, + intel_engine_has_timeslices); + if (nsibling < 2) + continue; + + err = slicein_virtual_engine(gt, siblings, nsibling); + if (err) + return err; + + err = sliceout_virtual_engine(gt, siblings, nsibling); + if (err) + return err; + } + + return 0; +} + +static int preserved_virtual_engine(struct intel_gt *gt, + struct intel_engine_cs **siblings, + unsigned int nsibling) +{ + struct i915_request *last = NULL; + struct intel_context *ve; + struct i915_vma *scratch; + struct igt_live_test t; + unsigned int n; + int err = 0; + u32 *cs; + + scratch = create_scratch(siblings[0]->gt); + if (IS_ERR(scratch)) + return PTR_ERR(scratch); + + err = i915_vma_sync(scratch); + if (err) + goto out_scratch; + + ve = intel_execlists_create_virtual(siblings, nsibling); + if (IS_ERR(ve)) { + err = PTR_ERR(ve); + goto out_scratch; + } + + err = intel_context_pin(ve); + if (err) + goto out_put; + + err = igt_live_test_begin(&t, gt->i915, __func__, ve->engine->name); + if (err) + goto out_unpin; + + for (n = 0; n < NUM_GPR_DW; n++) { + struct intel_engine_cs *engine = siblings[n % nsibling]; + struct i915_request *rq; + + rq = i915_request_create(ve); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out_end; + } + + i915_request_put(last); + last = i915_request_get(rq); + + cs = intel_ring_begin(rq, 8); + if (IS_ERR(cs)) { + i915_request_add(rq); + err = PTR_ERR(cs); + goto out_end; + } + + *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT; + *cs++ = CS_GPR(engine, n); + *cs++ = i915_ggtt_offset(scratch) + n * sizeof(u32); + *cs++ = 0; + + *cs++ = MI_LOAD_REGISTER_IMM(1); + *cs++ = CS_GPR(engine, (n + 1) % NUM_GPR_DW); + *cs++ = n + 1; + + *cs++ = MI_NOOP; + intel_ring_advance(rq, cs); + + /* Restrict this request to run on a particular engine */ + rq->execution_mask = engine->mask; + i915_request_add(rq); + } + + if (i915_request_wait(last, 0, HZ / 5) < 0) { + err = -ETIME; + goto out_end; + } + + cs = i915_gem_object_pin_map(scratch->obj, I915_MAP_WB); + if (IS_ERR(cs)) { + err = PTR_ERR(cs); + goto out_end; + } + + for (n = 0; n < NUM_GPR_DW; n++) { + if (cs[n] != n) { + pr_err("Incorrect value[%d] found for GPR[%d]\n", + cs[n], n); + err = -EINVAL; + break; + } + } + + i915_gem_object_unpin_map(scratch->obj); + +out_end: + if (igt_live_test_end(&t)) + err = -EIO; + i915_request_put(last); +out_unpin: + intel_context_unpin(ve); +out_put: + intel_context_put(ve); +out_scratch: + i915_vma_unpin_and_release(&scratch, 0); + return err; +} + +static int live_virtual_preserved(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1]; + unsigned int class; + + /* + * Check that the context image retains non-privileged (user) registers + * from one engine to the next. For this we check that the CS_GPR + * are preserved. + */ + + if (intel_uc_uses_guc_submission(>->uc)) + return 0; + + /* As we use CS_GPR we cannot run before they existed on all engines. */ + if (INTEL_GEN(gt->i915) < 9) + return 0; + + for (class = 0; class <= MAX_ENGINE_CLASS; class++) { + int nsibling, err; + + nsibling = select_siblings(gt, class, siblings); + if (nsibling < 2) + continue; + + err = preserved_virtual_engine(gt, siblings, nsibling); + if (err) + return err; + } + + return 0; +} + +static int bond_virtual_engine(struct intel_gt *gt, + unsigned int class, + struct intel_engine_cs **siblings, + unsigned int nsibling, + unsigned int flags) +#define BOND_SCHEDULE BIT(0) +{ + struct intel_engine_cs *master; + struct i915_request *rq[16]; + enum intel_engine_id id; + struct igt_spinner spin; + unsigned long n; + int err; + + /* + * A set of bonded requests is intended to be run concurrently + * across a number of engines. We use one request per-engine + * and a magic fence to schedule each of the bonded requests + * at the same time. A consequence of our current scheduler is that + * we only move requests to the HW ready queue when the request + * becomes ready, that is when all of its prerequisite fences have + * been signaled. As one of those fences is the master submit fence, + * there is a delay on all secondary fences as the HW may be + * currently busy. Equally, as all the requests are independent, + * they may have other fences that delay individual request + * submission to HW. Ergo, we do not guarantee that all requests are + * immediately submitted to HW at the same time, just that if the + * rules are abided by, they are ready at the same time as the + * first is submitted. Userspace can embed semaphores in its batch + * to ensure parallel execution of its phases as it requires. + * Though naturally it gets requested that perhaps the scheduler should + * take care of parallel execution, even across preemption events on + * different HW. (The proper answer is of course "lalalala".) + * + * With the submit-fence, we have identified three possible phases + * of synchronisation depending on the master fence: queued (not + * ready), executing, and signaled. The first two are quite simple + * and checked below. However, the signaled master fence handling is + * contentious. Currently we do not distinguish between a signaled + * fence and an expired fence, as once signaled it does not convey + * any information about the previous execution. It may even be freed + * and hence checking later it may not exist at all. Ergo we currently + * do not apply the bonding constraint for an already signaled fence, + * as our expectation is that it should not constrain the secondaries + * and is outside of the scope of the bonded request API (i.e. all + * userspace requests are meant to be running in parallel). As + * it imposes no constraint, and is effectively a no-op, we do not + * check below as normal execution flows are checked extensively above. + * + * XXX Is the degenerate handling of signaled submit fences the + * expected behaviour for userpace? + */ + + GEM_BUG_ON(nsibling >= ARRAY_SIZE(rq) - 1); + + if (igt_spinner_init(&spin, gt)) + return -ENOMEM; + + err = 0; + rq[0] = ERR_PTR(-ENOMEM); + for_each_engine(master, gt, id) { + struct i915_sw_fence fence = {}; + struct intel_context *ce; + + if (master->class == class) + continue; + + ce = intel_context_create(master); + if (IS_ERR(ce)) { + err = PTR_ERR(ce); + goto out; + } + + memset_p((void *)rq, ERR_PTR(-EINVAL), ARRAY_SIZE(rq)); + + rq[0] = igt_spinner_create_request(&spin, ce, MI_NOOP); + intel_context_put(ce); + if (IS_ERR(rq[0])) { + err = PTR_ERR(rq[0]); + goto out; + } + i915_request_get(rq[0]); + + if (flags & BOND_SCHEDULE) { + onstack_fence_init(&fence); + err = i915_sw_fence_await_sw_fence_gfp(&rq[0]->submit, + &fence, + GFP_KERNEL); + } + + i915_request_add(rq[0]); + if (err < 0) + goto out; + + if (!(flags & BOND_SCHEDULE) && + !igt_wait_for_spinner(&spin, rq[0])) { + err = -EIO; + goto out; + } + + for (n = 0; n < nsibling; n++) { + struct intel_context *ve; + + ve = intel_execlists_create_virtual(siblings, nsibling); + if (IS_ERR(ve)) { + err = PTR_ERR(ve); + onstack_fence_fini(&fence); + goto out; + } + + err = intel_virtual_engine_attach_bond(ve->engine, + master, + siblings[n]); + if (err) { + intel_context_put(ve); + onstack_fence_fini(&fence); + goto out; + } + + err = intel_context_pin(ve); + intel_context_put(ve); + if (err) { + onstack_fence_fini(&fence); + goto out; + } + + rq[n + 1] = i915_request_create(ve); + intel_context_unpin(ve); + if (IS_ERR(rq[n + 1])) { + err = PTR_ERR(rq[n + 1]); + onstack_fence_fini(&fence); + goto out; + } + i915_request_get(rq[n + 1]); + + err = i915_request_await_execution(rq[n + 1], + &rq[0]->fence, + ve->engine->bond_execute); + i915_request_add(rq[n + 1]); + if (err < 0) { + onstack_fence_fini(&fence); + goto out; + } + } + onstack_fence_fini(&fence); + intel_engine_flush_submission(master); + igt_spinner_end(&spin); + + if (i915_request_wait(rq[0], 0, HZ / 10) < 0) { + pr_err("Master request did not execute (on %s)!\n", + rq[0]->engine->name); + err = -EIO; + goto out; + } + + for (n = 0; n < nsibling; n++) { + if (i915_request_wait(rq[n + 1], 0, + MAX_SCHEDULE_TIMEOUT) < 0) { + err = -EIO; + goto out; + } + + if (rq[n + 1]->engine != siblings[n]) { + pr_err("Bonded request did not execute on target engine: expected %s, used %s; master was %s\n", + siblings[n]->name, + rq[n + 1]->engine->name, + rq[0]->engine->name); + err = -EINVAL; + goto out; + } + } + + for (n = 0; !IS_ERR(rq[n]); n++) + i915_request_put(rq[n]); + rq[0] = ERR_PTR(-ENOMEM); + } + +out: + for (n = 0; !IS_ERR(rq[n]); n++) + i915_request_put(rq[n]); + if (igt_flush_test(gt->i915)) + err = -EIO; + + igt_spinner_fini(&spin); + return err; +} + +static int live_virtual_bond(void *arg) +{ + static const struct phase { + const char *name; + unsigned int flags; + } phases[] = { + { "", 0 }, + { "schedule", BOND_SCHEDULE }, + { }, + }; + struct intel_gt *gt = arg; + struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1]; + unsigned int class; + int err; + + if (intel_uc_uses_guc_submission(>->uc)) + return 0; + + for (class = 0; class <= MAX_ENGINE_CLASS; class++) { + const struct phase *p; + int nsibling; + + nsibling = select_siblings(gt, class, siblings); + if (nsibling < 2) + continue; + + for (p = phases; p->name; p++) { + err = bond_virtual_engine(gt, + class, siblings, nsibling, + p->flags); + if (err) { + pr_err("%s(%s): failed class=%d, nsibling=%d, err=%d\n", + __func__, p->name, class, nsibling, err); + return err; + } + } + } + + return 0; +} + +static int reset_virtual_engine(struct intel_gt *gt, + struct intel_engine_cs **siblings, + unsigned int nsibling) +{ + struct intel_engine_cs *engine; + struct intel_context *ve; + struct igt_spinner spin; + struct i915_request *rq; + unsigned int n; + int err = 0; + + /* + * In order to support offline error capture for fast preempt reset, + * we need to decouple the guilty request and ensure that it and its + * descendents are not executed while the capture is in progress. + */ + + if (igt_spinner_init(&spin, gt)) + return -ENOMEM; + + ve = intel_execlists_create_virtual(siblings, nsibling); + if (IS_ERR(ve)) { + err = PTR_ERR(ve); + goto out_spin; + } + + for (n = 0; n < nsibling; n++) + st_engine_heartbeat_disable(siblings[n]); + + rq = igt_spinner_create_request(&spin, ve, MI_ARB_CHECK); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out_heartbeat; + } + i915_request_add(rq); + + if (!igt_wait_for_spinner(&spin, rq)) { + intel_gt_set_wedged(gt); + err = -ETIME; + goto out_heartbeat; + } + + engine = rq->engine; + GEM_BUG_ON(engine == ve->engine); + + /* Take ownership of the reset and tasklet */ + if (test_and_set_bit(I915_RESET_ENGINE + engine->id, + >->reset.flags)) { + intel_gt_set_wedged(gt); + err = -EBUSY; + goto out_heartbeat; + } + tasklet_disable(&engine->execlists.tasklet); + + engine->execlists.tasklet.func(engine->execlists.tasklet.data); + GEM_BUG_ON(execlists_active(&engine->execlists) != rq); + + /* Fake a preemption event; failed of course */ + spin_lock_irq(&engine->active.lock); + __unwind_incomplete_requests(engine); + spin_unlock_irq(&engine->active.lock); + GEM_BUG_ON(rq->engine != ve->engine); + + /* Reset the engine while keeping our active request on hold */ + execlists_hold(engine, rq); + GEM_BUG_ON(!i915_request_on_hold(rq)); + + intel_engine_reset(engine, NULL); + GEM_BUG_ON(rq->fence.error != -EIO); + + /* Release our grasp on the engine, letting CS flow again */ + tasklet_enable(&engine->execlists.tasklet); + clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id, >->reset.flags); + + /* Check that we do not resubmit the held request */ + i915_request_get(rq); + if (!i915_request_wait(rq, 0, HZ / 5)) { + pr_err("%s: on hold request completed!\n", + engine->name); + intel_gt_set_wedged(gt); + err = -EIO; + goto out_rq; + } + GEM_BUG_ON(!i915_request_on_hold(rq)); + + /* But is resubmitted on release */ + execlists_unhold(engine, rq); + if (i915_request_wait(rq, 0, HZ / 5) < 0) { + pr_err("%s: held request did not complete!\n", + engine->name); + intel_gt_set_wedged(gt); + err = -ETIME; + } + +out_rq: + i915_request_put(rq); +out_heartbeat: + for (n = 0; n < nsibling; n++) + st_engine_heartbeat_enable(siblings[n]); + + intel_context_put(ve); +out_spin: + igt_spinner_fini(&spin); + return err; +} + +static int live_virtual_reset(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1]; + unsigned int class; + + /* + * Check that we handle a reset event within a virtual engine. + * Only the physical engine is reset, but we have to check the flow + * of the virtual requests around the reset, and make sure it is not + * forgotten. + */ + + if (intel_uc_uses_guc_submission(>->uc)) + return 0; + + if (!intel_has_reset_engine(gt)) + return 0; + + for (class = 0; class <= MAX_ENGINE_CLASS; class++) { + int nsibling, err; + + nsibling = select_siblings(gt, class, siblings); + if (nsibling < 2) + continue; + + err = reset_virtual_engine(gt, siblings, nsibling); + if (err) + return err; + } + + return 0; +} + +int intel_execlists_live_selftests(struct drm_i915_private *i915) +{ + static const struct i915_subtest tests[] = { + SUBTEST(live_sanitycheck), + SUBTEST(live_unlite_switch), + SUBTEST(live_unlite_preempt), + SUBTEST(live_unlite_ring), + SUBTEST(live_pin_rewind), + SUBTEST(live_hold_reset), + SUBTEST(live_error_interrupt), + SUBTEST(live_timeslice_preempt), + SUBTEST(live_timeslice_rewind), + SUBTEST(live_timeslice_queue), + SUBTEST(live_timeslice_nopreempt), + SUBTEST(live_busywait_preempt), + SUBTEST(live_preempt), + SUBTEST(live_late_preempt), + SUBTEST(live_nopreempt), + SUBTEST(live_preempt_cancel), + SUBTEST(live_suppress_self_preempt), + SUBTEST(live_chain_preempt), + SUBTEST(live_preempt_ring), + SUBTEST(live_preempt_gang), + SUBTEST(live_preempt_timeout), + SUBTEST(live_preempt_user), + SUBTEST(live_preempt_smoke), + SUBTEST(live_virtual_engine), + SUBTEST(live_virtual_mask), + SUBTEST(live_virtual_preserved), + SUBTEST(live_virtual_slice), + SUBTEST(live_virtual_bond), + SUBTEST(live_virtual_reset), + }; + + if (!HAS_EXECLISTS(i915)) + return 0; + + if (intel_gt_is_wedged(&i915->gt)) + return 0; + + return intel_gt_live_subtests(tests, &i915->gt); +} + +static int emit_semaphore_signal(struct intel_context *ce, void *slot) +{ + const u32 offset = + i915_ggtt_offset(ce->engine->status_page.vma) + + offset_in_page(slot); + struct i915_request *rq; + u32 *cs; + + rq = intel_context_create_request(ce); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) { + i915_request_add(rq); + return PTR_ERR(cs); + } + + *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; + *cs++ = offset; + *cs++ = 0; + *cs++ = 1; + + intel_ring_advance(rq, cs); + + rq->sched.attr.priority = I915_PRIORITY_BARRIER; + i915_request_add(rq); + return 0; +} + +static int context_flush(struct intel_context *ce, long timeout) +{ + struct i915_request *rq; + struct dma_fence *fence; + int err = 0; + + rq = intel_engine_create_kernel_request(ce->engine); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + fence = i915_active_fence_get(&ce->timeline->last_request); + if (fence) { + i915_request_await_dma_fence(rq, fence); + dma_fence_put(fence); + } + + rq = i915_request_get(rq); + i915_request_add(rq); + if (i915_request_wait(rq, 0, timeout) < 0) + err = -ETIME; + i915_request_put(rq); + + rmb(); /* We know the request is written, make sure all state is too! */ + return err; +} + +static int live_lrc_layout(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + u32 *lrc; + int err; + + /* + * Check the registers offsets we use to create the initial reg state + * match the layout saved by HW. + */ + + lrc = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!lrc) + return -ENOMEM; + + err = 0; + for_each_engine(engine, gt, id) { + u32 *hw; + int dw; + + if (!engine->default_state) + continue; + + hw = shmem_pin_map(engine->default_state); + if (!hw) { + err = -ENOMEM; + break; + } + hw += LRC_STATE_OFFSET / sizeof(*hw); + + execlists_init_reg_state(memset(lrc, POISON_INUSE, PAGE_SIZE), + engine->kernel_context, + engine, + engine->kernel_context->ring, + true); + + dw = 0; + do { + u32 lri = hw[dw]; + + if (lri == 0) { + dw++; + continue; + } + + if (lrc[dw] == 0) { + pr_debug("%s: skipped instruction %x at dword %d\n", + engine->name, lri, dw); + dw++; + continue; + } + + if ((lri & GENMASK(31, 23)) != MI_INSTR(0x22, 0)) { + pr_err("%s: Expected LRI command at dword %d, found %08x\n", + engine->name, dw, lri); + err = -EINVAL; + break; + } + + if (lrc[dw] != lri) { + pr_err("%s: LRI command mismatch at dword %d, expected %08x found %08x\n", + engine->name, dw, lri, lrc[dw]); + err = -EINVAL; + break; + } + + lri &= 0x7f; + lri++; + dw++; + + while (lri) { + if (hw[dw] != lrc[dw]) { + pr_err("%s: Different registers found at dword %d, expected %x, found %x\n", + engine->name, dw, hw[dw], lrc[dw]); + err = -EINVAL; + break; + } + + /* + * Skip over the actual register value as we + * expect that to differ. + */ + dw += 2; + lri -= 2; + } + } while ((lrc[dw] & ~BIT(0)) != MI_BATCH_BUFFER_END); + + if (err) { + pr_info("%s: HW register image:\n", engine->name); + igt_hexdump(hw, PAGE_SIZE); + + pr_info("%s: SW register image:\n", engine->name); + igt_hexdump(lrc, PAGE_SIZE); + } + + shmem_unpin_map(engine->default_state, hw); + if (err) + break; + } + + kfree(lrc); + return err; +} + +static int find_offset(const u32 *lri, u32 offset) +{ + int i; + + for (i = 0; i < PAGE_SIZE / sizeof(u32); i++) + if (lri[i] == offset) + return i; + + return -1; +} + +static int live_lrc_fixed(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err = 0; + + /* + * Check the assumed register offsets match the actual locations in + * the context image. + */ + + for_each_engine(engine, gt, id) { + const struct { + u32 reg; + u32 offset; + const char *name; + } tbl[] = { + { + i915_mmio_reg_offset(RING_START(engine->mmio_base)), + CTX_RING_START - 1, + "RING_START" + }, + { + i915_mmio_reg_offset(RING_CTL(engine->mmio_base)), + CTX_RING_CTL - 1, + "RING_CTL" + }, + { + i915_mmio_reg_offset(RING_HEAD(engine->mmio_base)), + CTX_RING_HEAD - 1, + "RING_HEAD" + }, + { + i915_mmio_reg_offset(RING_TAIL(engine->mmio_base)), + CTX_RING_TAIL - 1, + "RING_TAIL" + }, + { + i915_mmio_reg_offset(RING_MI_MODE(engine->mmio_base)), + lrc_ring_mi_mode(engine), + "RING_MI_MODE" + }, + { + i915_mmio_reg_offset(RING_BBSTATE(engine->mmio_base)), + CTX_BB_STATE - 1, + "BB_STATE" + }, + { + i915_mmio_reg_offset(RING_BB_PER_CTX_PTR(engine->mmio_base)), + lrc_ring_wa_bb_per_ctx(engine), + "RING_BB_PER_CTX_PTR" + }, + { + i915_mmio_reg_offset(RING_INDIRECT_CTX(engine->mmio_base)), + lrc_ring_indirect_ptr(engine), + "RING_INDIRECT_CTX_PTR" + }, + { + i915_mmio_reg_offset(RING_INDIRECT_CTX_OFFSET(engine->mmio_base)), + lrc_ring_indirect_offset(engine), + "RING_INDIRECT_CTX_OFFSET" + }, + { + i915_mmio_reg_offset(RING_CTX_TIMESTAMP(engine->mmio_base)), + CTX_TIMESTAMP - 1, + "RING_CTX_TIMESTAMP" + }, + { + i915_mmio_reg_offset(GEN8_RING_CS_GPR(engine->mmio_base, 0)), + lrc_ring_gpr0(engine), + "RING_CS_GPR0" + }, + { + i915_mmio_reg_offset(RING_CMD_BUF_CCTL(engine->mmio_base)), + lrc_ring_cmd_buf_cctl(engine), + "RING_CMD_BUF_CCTL" + }, + { }, + }, *t; + u32 *hw; + + if (!engine->default_state) + continue; + + hw = shmem_pin_map(engine->default_state); + if (!hw) { + err = -ENOMEM; + break; + } + hw += LRC_STATE_OFFSET / sizeof(*hw); + + for (t = tbl; t->name; t++) { + int dw = find_offset(hw, t->reg); + + if (dw != t->offset) { + pr_err("%s: Offset for %s [0x%x] mismatch, found %x, expected %x\n", + engine->name, + t->name, + t->reg, + dw, + t->offset); + err = -EINVAL; + } + } + + shmem_unpin_map(engine->default_state, hw); + } + + return err; +} + +static int __live_lrc_state(struct intel_engine_cs *engine, + struct i915_vma *scratch) +{ + struct intel_context *ce; + struct i915_request *rq; + struct i915_gem_ww_ctx ww; + enum { + RING_START_IDX = 0, + RING_TAIL_IDX, + MAX_IDX + }; + u32 expected[MAX_IDX]; + u32 *cs; + int err; + int n; + + ce = intel_context_create(engine); + if (IS_ERR(ce)) + return PTR_ERR(ce); + + i915_gem_ww_ctx_init(&ww, false); +retry: + err = i915_gem_object_lock(scratch->obj, &ww); + if (!err) + err = intel_context_pin_ww(ce, &ww); + if (err) + goto err_put; + + rq = i915_request_create(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_unpin; + } + + cs = intel_ring_begin(rq, 4 * MAX_IDX); + if (IS_ERR(cs)) { + err = PTR_ERR(cs); + i915_request_add(rq); + goto err_unpin; + } + + *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT; + *cs++ = i915_mmio_reg_offset(RING_START(engine->mmio_base)); + *cs++ = i915_ggtt_offset(scratch) + RING_START_IDX * sizeof(u32); + *cs++ = 0; + + expected[RING_START_IDX] = i915_ggtt_offset(ce->ring->vma); + + *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT; + *cs++ = i915_mmio_reg_offset(RING_TAIL(engine->mmio_base)); + *cs++ = i915_ggtt_offset(scratch) + RING_TAIL_IDX * sizeof(u32); + *cs++ = 0; + + err = i915_request_await_object(rq, scratch->obj, true); + if (!err) + err = i915_vma_move_to_active(scratch, rq, EXEC_OBJECT_WRITE); + + i915_request_get(rq); + i915_request_add(rq); + if (err) + goto err_rq; + + intel_engine_flush_submission(engine); + expected[RING_TAIL_IDX] = ce->ring->tail; + + if (i915_request_wait(rq, 0, HZ / 5) < 0) { + err = -ETIME; + goto err_rq; + } + + cs = i915_gem_object_pin_map(scratch->obj, I915_MAP_WB); + if (IS_ERR(cs)) { + err = PTR_ERR(cs); + goto err_rq; + } + + for (n = 0; n < MAX_IDX; n++) { + if (cs[n] != expected[n]) { + pr_err("%s: Stored register[%d] value[0x%x] did not match expected[0x%x]\n", + engine->name, n, cs[n], expected[n]); + err = -EINVAL; + break; + } + } + + i915_gem_object_unpin_map(scratch->obj); + +err_rq: + i915_request_put(rq); +err_unpin: + intel_context_unpin(ce); +err_put: + if (err == -EDEADLK) { + err = i915_gem_ww_ctx_backoff(&ww); + if (!err) + goto retry; + } + i915_gem_ww_ctx_fini(&ww); + intel_context_put(ce); + return err; +} + +static int live_lrc_state(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + struct i915_vma *scratch; + enum intel_engine_id id; + int err = 0; + + /* + * Check the live register state matches what we expect for this + * intel_context. + */ + + scratch = create_scratch(gt); + if (IS_ERR(scratch)) + return PTR_ERR(scratch); + + for_each_engine(engine, gt, id) { + err = __live_lrc_state(engine, scratch); + if (err) + break; + } + + if (igt_flush_test(gt->i915)) + err = -EIO; + + i915_vma_unpin_and_release(&scratch, 0); + return err; +} + +static int gpr_make_dirty(struct intel_context *ce) +{ + struct i915_request *rq; + u32 *cs; + int n; + + rq = intel_context_create_request(ce); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + cs = intel_ring_begin(rq, 2 * NUM_GPR_DW + 2); + if (IS_ERR(cs)) { + i915_request_add(rq); + return PTR_ERR(cs); + } + + *cs++ = MI_LOAD_REGISTER_IMM(NUM_GPR_DW); + for (n = 0; n < NUM_GPR_DW; n++) { + *cs++ = CS_GPR(ce->engine, n); + *cs++ = STACK_MAGIC; + } + *cs++ = MI_NOOP; + + intel_ring_advance(rq, cs); + + rq->sched.attr.priority = I915_PRIORITY_BARRIER; + i915_request_add(rq); + + return 0; +} + +static struct i915_request * +__gpr_read(struct intel_context *ce, struct i915_vma *scratch, u32 *slot) +{ + const u32 offset = + i915_ggtt_offset(ce->engine->status_page.vma) + + offset_in_page(slot); + struct i915_request *rq; + u32 *cs; + int err; + int n; + + rq = intel_context_create_request(ce); + if (IS_ERR(rq)) + return rq; + + cs = intel_ring_begin(rq, 6 + 4 * NUM_GPR_DW); + if (IS_ERR(cs)) { + i915_request_add(rq); + return ERR_CAST(cs); + } + + *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + *cs++ = MI_NOOP; + + *cs++ = MI_SEMAPHORE_WAIT | + MI_SEMAPHORE_GLOBAL_GTT | + MI_SEMAPHORE_POLL | + MI_SEMAPHORE_SAD_NEQ_SDD; + *cs++ = 0; + *cs++ = offset; + *cs++ = 0; + + for (n = 0; n < NUM_GPR_DW; n++) { + *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT; + *cs++ = CS_GPR(ce->engine, n); + *cs++ = i915_ggtt_offset(scratch) + n * sizeof(u32); + *cs++ = 0; + } + + i915_vma_lock(scratch); + err = i915_request_await_object(rq, scratch->obj, true); + if (!err) + err = i915_vma_move_to_active(scratch, rq, EXEC_OBJECT_WRITE); + i915_vma_unlock(scratch); + + i915_request_get(rq); + i915_request_add(rq); + if (err) { + i915_request_put(rq); + rq = ERR_PTR(err); + } + + return rq; +} + +static int __live_lrc_gpr(struct intel_engine_cs *engine, + struct i915_vma *scratch, + bool preempt) +{ + u32 *slot = memset32(engine->status_page.addr + 1000, 0, 4); + struct intel_context *ce; + struct i915_request *rq; + u32 *cs; + int err; + int n; + + if (INTEL_GEN(engine->i915) < 9 && engine->class != RENDER_CLASS) + return 0; /* GPR only on rcs0 for gen8 */ + + err = gpr_make_dirty(engine->kernel_context); + if (err) + return err; + + ce = intel_context_create(engine); + if (IS_ERR(ce)) + return PTR_ERR(ce); + + rq = __gpr_read(ce, scratch, slot); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_put; + } + + err = wait_for_submit(engine, rq, HZ / 2); + if (err) + goto err_rq; + + if (preempt) { + err = gpr_make_dirty(engine->kernel_context); + if (err) + goto err_rq; + + err = emit_semaphore_signal(engine->kernel_context, slot); + if (err) + goto err_rq; + } else { + slot[0] = 1; + wmb(); + } + + if (i915_request_wait(rq, 0, HZ / 5) < 0) { + err = -ETIME; + goto err_rq; + } + + cs = i915_gem_object_pin_map(scratch->obj, I915_MAP_WB); + if (IS_ERR(cs)) { + err = PTR_ERR(cs); + goto err_rq; + } + + for (n = 0; n < NUM_GPR_DW; n++) { + if (cs[n]) { + pr_err("%s: GPR[%d].%s was not zero, found 0x%08x!\n", + engine->name, + n / 2, n & 1 ? "udw" : "ldw", + cs[n]); + err = -EINVAL; + break; + } + } + + i915_gem_object_unpin_map(scratch->obj); + +err_rq: + memset32(&slot[0], -1, 4); + wmb(); + i915_request_put(rq); +err_put: + intel_context_put(ce); + return err; +} + +static int live_lrc_gpr(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + struct i915_vma *scratch; + enum intel_engine_id id; + int err = 0; + + /* + * Check that GPR registers are cleared in new contexts as we need + * to avoid leaking any information from previous contexts. + */ + + scratch = create_scratch(gt); + if (IS_ERR(scratch)) + return PTR_ERR(scratch); + + for_each_engine(engine, gt, id) { + st_engine_heartbeat_disable(engine); + + err = __live_lrc_gpr(engine, scratch, false); + if (err) + goto err; + + err = __live_lrc_gpr(engine, scratch, true); + if (err) + goto err; + +err: + st_engine_heartbeat_enable(engine); + if (igt_flush_test(gt->i915)) + err = -EIO; + if (err) + break; + } + + i915_vma_unpin_and_release(&scratch, 0); + return err; +} + +static struct i915_request * +create_timestamp(struct intel_context *ce, void *slot, int idx) +{ + const u32 offset = + i915_ggtt_offset(ce->engine->status_page.vma) + + offset_in_page(slot); + struct i915_request *rq; + u32 *cs; + int err; + + rq = intel_context_create_request(ce); + if (IS_ERR(rq)) + return rq; + + cs = intel_ring_begin(rq, 10); + if (IS_ERR(cs)) { + err = PTR_ERR(cs); + goto err; + } + + *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + *cs++ = MI_NOOP; + + *cs++ = MI_SEMAPHORE_WAIT | + MI_SEMAPHORE_GLOBAL_GTT | + MI_SEMAPHORE_POLL | + MI_SEMAPHORE_SAD_NEQ_SDD; + *cs++ = 0; + *cs++ = offset; + *cs++ = 0; + + *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT; + *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(rq->engine->mmio_base)); + *cs++ = offset + idx * sizeof(u32); + *cs++ = 0; + + intel_ring_advance(rq, cs); + + rq->sched.attr.priority = I915_PRIORITY_MASK; + err = 0; +err: + i915_request_get(rq); + i915_request_add(rq); + if (err) { + i915_request_put(rq); + return ERR_PTR(err); + } + + return rq; +} + +struct lrc_timestamp { + struct intel_engine_cs *engine; + struct intel_context *ce[2]; + u32 poison; +}; + +static bool timestamp_advanced(u32 start, u32 end) +{ + return (s32)(end - start) > 0; +} + +static int __lrc_timestamp(const struct lrc_timestamp *arg, bool preempt) +{ + u32 *slot = memset32(arg->engine->status_page.addr + 1000, 0, 4); + struct i915_request *rq; + u32 timestamp; + int err = 0; + + arg->ce[0]->lrc_reg_state[CTX_TIMESTAMP] = arg->poison; + rq = create_timestamp(arg->ce[0], slot, 1); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + err = wait_for_submit(rq->engine, rq, HZ / 2); + if (err) + goto err; + + if (preempt) { + arg->ce[1]->lrc_reg_state[CTX_TIMESTAMP] = 0xdeadbeef; + err = emit_semaphore_signal(arg->ce[1], slot); + if (err) + goto err; + } else { + slot[0] = 1; + wmb(); + } + + /* And wait for switch to kernel (to save our context to memory) */ + err = context_flush(arg->ce[0], HZ / 2); + if (err) + goto err; + + if (!timestamp_advanced(arg->poison, slot[1])) { + pr_err("%s(%s): invalid timestamp on restore, context:%x, request:%x\n", + arg->engine->name, preempt ? "preempt" : "simple", + arg->poison, slot[1]); + err = -EINVAL; + } + + timestamp = READ_ONCE(arg->ce[0]->lrc_reg_state[CTX_TIMESTAMP]); + if (!timestamp_advanced(slot[1], timestamp)) { + pr_err("%s(%s): invalid timestamp on save, request:%x, context:%x\n", + arg->engine->name, preempt ? "preempt" : "simple", + slot[1], timestamp); + err = -EINVAL; + } + +err: + memset32(slot, -1, 4); + i915_request_put(rq); + return err; +} + +static int live_lrc_timestamp(void *arg) +{ + struct lrc_timestamp data = {}; + struct intel_gt *gt = arg; + enum intel_engine_id id; + const u32 poison[] = { + 0, + S32_MAX, + (u32)S32_MAX + 1, + U32_MAX, + }; + + /* + * We want to verify that the timestamp is saved and restore across + * context switches and is monotonic. + * + * So we do this with a little bit of LRC poisoning to check various + * boundary conditions, and see what happens if we preempt the context + * with a second request (carrying more poison into the timestamp). + */ + + for_each_engine(data.engine, gt, id) { + int i, err = 0; + + st_engine_heartbeat_disable(data.engine); + + for (i = 0; i < ARRAY_SIZE(data.ce); i++) { + struct intel_context *tmp; + + tmp = intel_context_create(data.engine); + if (IS_ERR(tmp)) { + err = PTR_ERR(tmp); + goto err; + } + + err = intel_context_pin(tmp); + if (err) { + intel_context_put(tmp); + goto err; + } + + data.ce[i] = tmp; + } + + for (i = 0; i < ARRAY_SIZE(poison); i++) { + data.poison = poison[i]; + + err = __lrc_timestamp(&data, false); + if (err) + break; + + err = __lrc_timestamp(&data, true); + if (err) + break; + } + +err: + st_engine_heartbeat_enable(data.engine); + for (i = 0; i < ARRAY_SIZE(data.ce); i++) { + if (!data.ce[i]) + break; + + intel_context_unpin(data.ce[i]); + intel_context_put(data.ce[i]); + } + + if (igt_flush_test(gt->i915)) + err = -EIO; + if (err) + return err; + } + + return 0; +} + +static struct i915_vma * +create_user_vma(struct i915_address_space *vm, unsigned long size) +{ + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + int err; + + obj = i915_gem_object_create_internal(vm->i915, size); + if (IS_ERR(obj)) + return ERR_CAST(obj); + + vma = i915_vma_instance(obj, vm, NULL); + if (IS_ERR(vma)) { + i915_gem_object_put(obj); + return vma; + } + + err = i915_vma_pin(vma, 0, 0, PIN_USER); + if (err) { + i915_gem_object_put(obj); + return ERR_PTR(err); + } + + return vma; +} + +static struct i915_vma * +store_context(struct intel_context *ce, struct i915_vma *scratch) +{ + struct i915_vma *batch; + u32 dw, x, *cs, *hw; + u32 *defaults; + + batch = create_user_vma(ce->vm, SZ_64K); + if (IS_ERR(batch)) + return batch; + + cs = i915_gem_object_pin_map(batch->obj, I915_MAP_WC); + if (IS_ERR(cs)) { + i915_vma_put(batch); + return ERR_CAST(cs); + } + + defaults = shmem_pin_map(ce->engine->default_state); + if (!defaults) { + i915_gem_object_unpin_map(batch->obj); + i915_vma_put(batch); + return ERR_PTR(-ENOMEM); + } + + x = 0; + dw = 0; + hw = defaults; + hw += LRC_STATE_OFFSET / sizeof(*hw); + do { + u32 len = hw[dw] & 0x7f; + + if (hw[dw] == 0) { + dw++; + continue; + } + + if ((hw[dw] & GENMASK(31, 23)) != MI_INSTR(0x22, 0)) { + dw += len + 2; + continue; + } + + dw++; + len = (len + 1) / 2; + while (len--) { + *cs++ = MI_STORE_REGISTER_MEM_GEN8; + *cs++ = hw[dw]; + *cs++ = lower_32_bits(scratch->node.start + x); + *cs++ = upper_32_bits(scratch->node.start + x); + + dw += 2; + x += 4; + } + } while (dw < PAGE_SIZE / sizeof(u32) && + (hw[dw] & ~BIT(0)) != MI_BATCH_BUFFER_END); + + *cs++ = MI_BATCH_BUFFER_END; + + shmem_unpin_map(ce->engine->default_state, defaults); + + i915_gem_object_flush_map(batch->obj); + i915_gem_object_unpin_map(batch->obj); + + return batch; +} + +static int move_to_active(struct i915_request *rq, + struct i915_vma *vma, + unsigned int flags) +{ + int err; + + i915_vma_lock(vma); + err = i915_request_await_object(rq, vma->obj, flags); + if (!err) + err = i915_vma_move_to_active(vma, rq, flags); + i915_vma_unlock(vma); + + return err; +} + +static struct i915_request * +record_registers(struct intel_context *ce, + struct i915_vma *before, + struct i915_vma *after, + u32 *sema) +{ + struct i915_vma *b_before, *b_after; + struct i915_request *rq; + u32 *cs; + int err; + + b_before = store_context(ce, before); + if (IS_ERR(b_before)) + return ERR_CAST(b_before); + + b_after = store_context(ce, after); + if (IS_ERR(b_after)) { + rq = ERR_CAST(b_after); + goto err_before; + } + + rq = intel_context_create_request(ce); + if (IS_ERR(rq)) + goto err_after; + + err = move_to_active(rq, before, EXEC_OBJECT_WRITE); + if (err) + goto err_rq; + + err = move_to_active(rq, b_before, 0); + if (err) + goto err_rq; + + err = move_to_active(rq, after, EXEC_OBJECT_WRITE); + if (err) + goto err_rq; + + err = move_to_active(rq, b_after, 0); + if (err) + goto err_rq; + + cs = intel_ring_begin(rq, 14); + if (IS_ERR(cs)) { + err = PTR_ERR(cs); + goto err_rq; + } + + *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; + *cs++ = MI_BATCH_BUFFER_START_GEN8 | BIT(8); + *cs++ = lower_32_bits(b_before->node.start); + *cs++ = upper_32_bits(b_before->node.start); + + *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + *cs++ = MI_SEMAPHORE_WAIT | + MI_SEMAPHORE_GLOBAL_GTT | + MI_SEMAPHORE_POLL | + MI_SEMAPHORE_SAD_NEQ_SDD; + *cs++ = 0; + *cs++ = i915_ggtt_offset(ce->engine->status_page.vma) + + offset_in_page(sema); + *cs++ = 0; + *cs++ = MI_NOOP; + + *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; + *cs++ = MI_BATCH_BUFFER_START_GEN8 | BIT(8); + *cs++ = lower_32_bits(b_after->node.start); + *cs++ = upper_32_bits(b_after->node.start); + + intel_ring_advance(rq, cs); + + WRITE_ONCE(*sema, 0); + i915_request_get(rq); + i915_request_add(rq); +err_after: + i915_vma_put(b_after); +err_before: + i915_vma_put(b_before); + return rq; + +err_rq: + i915_request_add(rq); + rq = ERR_PTR(err); + goto err_after; +} + +static struct i915_vma *load_context(struct intel_context *ce, u32 poison) +{ + struct i915_vma *batch; + u32 dw, *cs, *hw; + u32 *defaults; + + batch = create_user_vma(ce->vm, SZ_64K); + if (IS_ERR(batch)) + return batch; + + cs = i915_gem_object_pin_map(batch->obj, I915_MAP_WC); + if (IS_ERR(cs)) { + i915_vma_put(batch); + return ERR_CAST(cs); + } + + defaults = shmem_pin_map(ce->engine->default_state); + if (!defaults) { + i915_gem_object_unpin_map(batch->obj); + i915_vma_put(batch); + return ERR_PTR(-ENOMEM); + } + + dw = 0; + hw = defaults; + hw += LRC_STATE_OFFSET / sizeof(*hw); + do { + u32 len = hw[dw] & 0x7f; + + if (hw[dw] == 0) { + dw++; + continue; + } + + if ((hw[dw] & GENMASK(31, 23)) != MI_INSTR(0x22, 0)) { + dw += len + 2; + continue; + } + + dw++; + len = (len + 1) / 2; + *cs++ = MI_LOAD_REGISTER_IMM(len); + while (len--) { + *cs++ = hw[dw]; + *cs++ = poison; + dw += 2; + } + } while (dw < PAGE_SIZE / sizeof(u32) && + (hw[dw] & ~BIT(0)) != MI_BATCH_BUFFER_END); + + *cs++ = MI_BATCH_BUFFER_END; + + shmem_unpin_map(ce->engine->default_state, defaults); + + i915_gem_object_flush_map(batch->obj); + i915_gem_object_unpin_map(batch->obj); + + return batch; +} + +static int poison_registers(struct intel_context *ce, u32 poison, u32 *sema) +{ + struct i915_request *rq; + struct i915_vma *batch; + u32 *cs; + int err; + + batch = load_context(ce, poison); + if (IS_ERR(batch)) + return PTR_ERR(batch); + + rq = intel_context_create_request(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_batch; + } + + err = move_to_active(rq, batch, 0); + if (err) + goto err_rq; + + cs = intel_ring_begin(rq, 8); + if (IS_ERR(cs)) { + err = PTR_ERR(cs); + goto err_rq; + } + + *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; + *cs++ = MI_BATCH_BUFFER_START_GEN8 | BIT(8); + *cs++ = lower_32_bits(batch->node.start); + *cs++ = upper_32_bits(batch->node.start); + + *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; + *cs++ = i915_ggtt_offset(ce->engine->status_page.vma) + + offset_in_page(sema); + *cs++ = 0; + *cs++ = 1; + + intel_ring_advance(rq, cs); + + rq->sched.attr.priority = I915_PRIORITY_BARRIER; +err_rq: + i915_request_add(rq); +err_batch: + i915_vma_put(batch); + return err; +} + +static bool is_moving(u32 a, u32 b) +{ + return a != b; +} + +static int compare_isolation(struct intel_engine_cs *engine, + struct i915_vma *ref[2], + struct i915_vma *result[2], + struct intel_context *ce, + u32 poison) +{ + u32 x, dw, *hw, *lrc; + u32 *A[2], *B[2]; + u32 *defaults; + int err = 0; + + A[0] = i915_gem_object_pin_map(ref[0]->obj, I915_MAP_WC); + if (IS_ERR(A[0])) + return PTR_ERR(A[0]); + + A[1] = i915_gem_object_pin_map(ref[1]->obj, I915_MAP_WC); + if (IS_ERR(A[1])) { + err = PTR_ERR(A[1]); + goto err_A0; + } + + B[0] = i915_gem_object_pin_map(result[0]->obj, I915_MAP_WC); + if (IS_ERR(B[0])) { + err = PTR_ERR(B[0]); + goto err_A1; + } + + B[1] = i915_gem_object_pin_map(result[1]->obj, I915_MAP_WC); + if (IS_ERR(B[1])) { + err = PTR_ERR(B[1]); + goto err_B0; + } + + lrc = i915_gem_object_pin_map(ce->state->obj, + i915_coherent_map_type(engine->i915)); + if (IS_ERR(lrc)) { + err = PTR_ERR(lrc); + goto err_B1; + } + lrc += LRC_STATE_OFFSET / sizeof(*hw); + + defaults = shmem_pin_map(ce->engine->default_state); + if (!defaults) { + err = -ENOMEM; + goto err_lrc; + } + + x = 0; + dw = 0; + hw = defaults; + hw += LRC_STATE_OFFSET / sizeof(*hw); + do { + u32 len = hw[dw] & 0x7f; + + if (hw[dw] == 0) { + dw++; + continue; + } + + if ((hw[dw] & GENMASK(31, 23)) != MI_INSTR(0x22, 0)) { + dw += len + 2; + continue; + } + + dw++; + len = (len + 1) / 2; + while (len--) { + if (!is_moving(A[0][x], A[1][x]) && + (A[0][x] != B[0][x] || A[1][x] != B[1][x])) { + switch (hw[dw] & 4095) { + case 0x30: /* RING_HEAD */ + case 0x34: /* RING_TAIL */ + break; + + default: + pr_err("%s[%d]: Mismatch for register %4x, default %08x, reference %08x, result (%08x, %08x), poison %08x, context %08x\n", + engine->name, dw, + hw[dw], hw[dw + 1], + A[0][x], B[0][x], B[1][x], + poison, lrc[dw + 1]); + err = -EINVAL; + } + } + dw += 2; + x++; + } + } while (dw < PAGE_SIZE / sizeof(u32) && + (hw[dw] & ~BIT(0)) != MI_BATCH_BUFFER_END); + + shmem_unpin_map(ce->engine->default_state, defaults); +err_lrc: + i915_gem_object_unpin_map(ce->state->obj); +err_B1: + i915_gem_object_unpin_map(result[1]->obj); +err_B0: + i915_gem_object_unpin_map(result[0]->obj); +err_A1: + i915_gem_object_unpin_map(ref[1]->obj); +err_A0: + i915_gem_object_unpin_map(ref[0]->obj); + return err; +} + +static int __lrc_isolation(struct intel_engine_cs *engine, u32 poison) +{ + u32 *sema = memset32(engine->status_page.addr + 1000, 0, 1); + struct i915_vma *ref[2], *result[2]; + struct intel_context *A, *B; + struct i915_request *rq; + int err; + + A = intel_context_create(engine); + if (IS_ERR(A)) + return PTR_ERR(A); + + B = intel_context_create(engine); + if (IS_ERR(B)) { + err = PTR_ERR(B); + goto err_A; + } + + ref[0] = create_user_vma(A->vm, SZ_64K); + if (IS_ERR(ref[0])) { + err = PTR_ERR(ref[0]); + goto err_B; + } + + ref[1] = create_user_vma(A->vm, SZ_64K); + if (IS_ERR(ref[1])) { + err = PTR_ERR(ref[1]); + goto err_ref0; + } + + rq = record_registers(A, ref[0], ref[1], sema); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_ref1; + } + + WRITE_ONCE(*sema, 1); + wmb(); + + if (i915_request_wait(rq, 0, HZ / 2) < 0) { + i915_request_put(rq); + err = -ETIME; + goto err_ref1; + } + i915_request_put(rq); + + result[0] = create_user_vma(A->vm, SZ_64K); + if (IS_ERR(result[0])) { + err = PTR_ERR(result[0]); + goto err_ref1; + } + + result[1] = create_user_vma(A->vm, SZ_64K); + if (IS_ERR(result[1])) { + err = PTR_ERR(result[1]); + goto err_result0; + } + + rq = record_registers(A, result[0], result[1], sema); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_result1; + } + + err = poison_registers(B, poison, sema); + if (err) { + WRITE_ONCE(*sema, -1); + i915_request_put(rq); + goto err_result1; + } + + if (i915_request_wait(rq, 0, HZ / 2) < 0) { + i915_request_put(rq); + err = -ETIME; + goto err_result1; + } + i915_request_put(rq); + + err = compare_isolation(engine, ref, result, A, poison); + +err_result1: + i915_vma_put(result[1]); +err_result0: + i915_vma_put(result[0]); +err_ref1: + i915_vma_put(ref[1]); +err_ref0: + i915_vma_put(ref[0]); +err_B: + intel_context_put(B); +err_A: + intel_context_put(A); + return err; +} + +static bool skip_isolation(const struct intel_engine_cs *engine) +{ + if (engine->class == COPY_ENGINE_CLASS && INTEL_GEN(engine->i915) == 9) + return true; + + if (engine->class == RENDER_CLASS && INTEL_GEN(engine->i915) == 11) + return true; + + return false; +} + +static int live_lrc_isolation(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + const u32 poison[] = { + STACK_MAGIC, + 0x3a3a3a3a, + 0x5c5c5c5c, + 0xffffffff, + 0xffff0000, + }; + int err = 0; + + /* + * Our goal is try and verify that per-context state cannot be + * tampered with by another non-privileged client. + * + * We take the list of context registers from the LRI in the default + * context image and attempt to modify that list from a remote context. + */ + + for_each_engine(engine, gt, id) { + int i; + + /* Just don't even ask */ + if (!IS_ENABLED(CONFIG_DRM_I915_SELFTEST_BROKEN) && + skip_isolation(engine)) + continue; + + intel_engine_pm_get(engine); + for (i = 0; i < ARRAY_SIZE(poison); i++) { + int result; + + result = __lrc_isolation(engine, poison[i]); + if (result && !err) + err = result; + + result = __lrc_isolation(engine, ~poison[i]); + if (result && !err) + err = result; + } + intel_engine_pm_put(engine); + if (igt_flush_test(gt->i915)) { + err = -EIO; + break; + } + } + + return err; +} + +static int indirect_ctx_submit_req(struct intel_context *ce) +{ + struct i915_request *rq; + int err = 0; + + rq = intel_context_create_request(ce); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + i915_request_get(rq); + i915_request_add(rq); + + if (i915_request_wait(rq, 0, HZ / 5) < 0) + err = -ETIME; + + i915_request_put(rq); + + return err; +} + +#define CTX_BB_CANARY_OFFSET (3 * 1024) +#define CTX_BB_CANARY_INDEX (CTX_BB_CANARY_OFFSET / sizeof(u32)) + +static u32 * +emit_indirect_ctx_bb_canary(const struct intel_context *ce, u32 *cs) +{ + *cs++ = MI_STORE_REGISTER_MEM_GEN8 | + MI_SRM_LRM_GLOBAL_GTT | + MI_LRI_LRM_CS_MMIO; + *cs++ = i915_mmio_reg_offset(RING_START(0)); + *cs++ = i915_ggtt_offset(ce->state) + + context_wa_bb_offset(ce) + + CTX_BB_CANARY_OFFSET; + *cs++ = 0; + + return cs; +} + +static void +indirect_ctx_bb_setup(struct intel_context *ce) +{ + u32 *cs = context_indirect_bb(ce); + + cs[CTX_BB_CANARY_INDEX] = 0xdeadf00d; + + setup_indirect_ctx_bb(ce, ce->engine, emit_indirect_ctx_bb_canary); +} + +static bool check_ring_start(struct intel_context *ce) +{ + const u32 * const ctx_bb = (void *)(ce->lrc_reg_state) - + LRC_STATE_OFFSET + context_wa_bb_offset(ce); + + if (ctx_bb[CTX_BB_CANARY_INDEX] == ce->lrc_reg_state[CTX_RING_START]) + return true; + + pr_err("ring start mismatch: canary 0x%08x vs state 0x%08x\n", + ctx_bb[CTX_BB_CANARY_INDEX], + ce->lrc_reg_state[CTX_RING_START]); + + return false; +} + +static int indirect_ctx_bb_check(struct intel_context *ce) +{ + int err; + + err = indirect_ctx_submit_req(ce); + if (err) + return err; + + if (!check_ring_start(ce)) + return -EINVAL; + + return 0; +} + +static int __live_lrc_indirect_ctx_bb(struct intel_engine_cs *engine) +{ + struct intel_context *a, *b; + int err; + + a = intel_context_create(engine); + if (IS_ERR(a)) + return PTR_ERR(a); + err = intel_context_pin(a); + if (err) + goto put_a; + + b = intel_context_create(engine); + if (IS_ERR(b)) { + err = PTR_ERR(b); + goto unpin_a; + } + err = intel_context_pin(b); + if (err) + goto put_b; + + /* We use the already reserved extra page in context state */ + if (!a->wa_bb_page) { + GEM_BUG_ON(b->wa_bb_page); + GEM_BUG_ON(INTEL_GEN(engine->i915) == 12); + goto unpin_b; + } + + /* + * In order to test that our per context bb is truly per context, + * and executes at the intended spot on context restoring process, + * make the batch store the ring start value to memory. + * As ring start is restored apriori of starting the indirect ctx bb and + * as it will be different for each context, it fits to this purpose. + */ + indirect_ctx_bb_setup(a); + indirect_ctx_bb_setup(b); + + err = indirect_ctx_bb_check(a); + if (err) + goto unpin_b; + + err = indirect_ctx_bb_check(b); + +unpin_b: + intel_context_unpin(b); +put_b: + intel_context_put(b); +unpin_a: + intel_context_unpin(a); +put_a: + intel_context_put(a); + + return err; +} + +static int live_lrc_indirect_ctx_bb(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err = 0; + + for_each_engine(engine, gt, id) { + intel_engine_pm_get(engine); + err = __live_lrc_indirect_ctx_bb(engine); + intel_engine_pm_put(engine); + + if (igt_flush_test(gt->i915)) + err = -EIO; + + if (err) + break; + } + + return err; +} + +static void garbage_reset(struct intel_engine_cs *engine, + struct i915_request *rq) +{ + const unsigned int bit = I915_RESET_ENGINE + engine->id; + unsigned long *lock = &engine->gt->reset.flags; + + if (test_and_set_bit(bit, lock)) + return; + + tasklet_disable(&engine->execlists.tasklet); + + if (!rq->fence.error) + intel_engine_reset(engine, NULL); + + tasklet_enable(&engine->execlists.tasklet); + clear_and_wake_up_bit(bit, lock); +} + +static struct i915_request *garbage(struct intel_context *ce, + struct rnd_state *prng) +{ + struct i915_request *rq; + int err; + + err = intel_context_pin(ce); + if (err) + return ERR_PTR(err); + + prandom_bytes_state(prng, + ce->lrc_reg_state, + ce->engine->context_size - + LRC_STATE_OFFSET); + + rq = intel_context_create_request(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_unpin; + } + + i915_request_get(rq); + i915_request_add(rq); + return rq; + +err_unpin: + intel_context_unpin(ce); + return ERR_PTR(err); +} + +static int __lrc_garbage(struct intel_engine_cs *engine, struct rnd_state *prng) +{ + struct intel_context *ce; + struct i915_request *hang; + int err = 0; + + ce = intel_context_create(engine); + if (IS_ERR(ce)) + return PTR_ERR(ce); + + hang = garbage(ce, prng); + if (IS_ERR(hang)) { + err = PTR_ERR(hang); + goto err_ce; + } + + if (wait_for_submit(engine, hang, HZ / 2)) { + i915_request_put(hang); + err = -ETIME; + goto err_ce; + } + + intel_context_set_banned(ce); + garbage_reset(engine, hang); + + intel_engine_flush_submission(engine); + if (!hang->fence.error) { + i915_request_put(hang); + pr_err("%s: corrupted context was not reset\n", + engine->name); + err = -EINVAL; + goto err_ce; + } + + if (i915_request_wait(hang, 0, HZ / 2) < 0) { + pr_err("%s: corrupted context did not recover\n", + engine->name); + i915_request_put(hang); + err = -EIO; + goto err_ce; + } + i915_request_put(hang); + +err_ce: + intel_context_put(ce); + return err; +} + +static int live_lrc_garbage(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + + /* + * Verify that we can recover if one context state is completely + * corrupted. + */ + + if (!IS_ENABLED(CONFIG_DRM_I915_SELFTEST_BROKEN)) + return 0; + + for_each_engine(engine, gt, id) { + I915_RND_STATE(prng); + int err = 0, i; + + if (!intel_has_reset_engine(engine->gt)) + continue; + + intel_engine_pm_get(engine); + for (i = 0; i < 3; i++) { + err = __lrc_garbage(engine, &prng); + if (err) + break; + } + intel_engine_pm_put(engine); + + if (igt_flush_test(gt->i915)) + err = -EIO; + if (err) + return err; + } + + return 0; +} + +static int __live_pphwsp_runtime(struct intel_engine_cs *engine) +{ + struct intel_context *ce; + struct i915_request *rq; + IGT_TIMEOUT(end_time); + int err; + + ce = intel_context_create(engine); + if (IS_ERR(ce)) + return PTR_ERR(ce); + + ce->runtime.num_underflow = 0; + ce->runtime.max_underflow = 0; + + do { + unsigned int loop = 1024; + + while (loop) { + rq = intel_context_create_request(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_rq; + } + + if (--loop == 0) + i915_request_get(rq); + + i915_request_add(rq); + } + + if (__igt_timeout(end_time, NULL)) + break; + + i915_request_put(rq); + } while (1); + + err = i915_request_wait(rq, 0, HZ / 5); + if (err < 0) { + pr_err("%s: request not completed!\n", engine->name); + goto err_wait; + } + + igt_flush_test(engine->i915); + + pr_info("%s: pphwsp runtime %lluns, average %lluns\n", + engine->name, + intel_context_get_total_runtime_ns(ce), + intel_context_get_avg_runtime_ns(ce)); + + err = 0; + if (ce->runtime.num_underflow) { + pr_err("%s: pphwsp underflow %u time(s), max %u cycles!\n", + engine->name, + ce->runtime.num_underflow, + ce->runtime.max_underflow); + GEM_TRACE_DUMP(); + err = -EOVERFLOW; + } + +err_wait: + i915_request_put(rq); +err_rq: + intel_context_put(ce); + return err; +} + +static int live_pphwsp_runtime(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err = 0; + + /* + * Check that cumulative context runtime as stored in the pphwsp[16] + * is monotonic. + */ + + for_each_engine(engine, gt, id) { + err = __live_pphwsp_runtime(engine); + if (err) + break; + } + + if (igt_flush_test(gt->i915)) + err = -EIO; + + return err; +} + +int intel_lrc_live_selftests(struct drm_i915_private *i915) +{ + static const struct i915_subtest tests[] = { + SUBTEST(live_lrc_layout), + SUBTEST(live_lrc_fixed), + SUBTEST(live_lrc_state), + SUBTEST(live_lrc_gpr), + SUBTEST(live_lrc_isolation), + SUBTEST(live_lrc_timestamp), + SUBTEST(live_lrc_garbage), + SUBTEST(live_pphwsp_runtime), + SUBTEST(live_lrc_indirect_ctx_bb), + }; + + if (!HAS_LOGICAL_RING_CONTEXTS(i915)) + return 0; + + return intel_gt_live_subtests(tests, &i915->gt); +} diff --git a/drivers/gpu/drm/i915/gt/selftest_mocs.c b/drivers/gpu/drm/i915/gt/selftest_mocs.c new file mode 100644 index 000000000..b25eba50c --- /dev/null +++ b/drivers/gpu/drm/i915/gt/selftest_mocs.c @@ -0,0 +1,445 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#include "gt/intel_engine_pm.h" +#include "i915_selftest.h" + +#include "gem/selftests/mock_context.h" +#include "selftests/igt_reset.h" +#include "selftests/igt_spinner.h" + +struct live_mocs { + struct drm_i915_mocs_table mocs; + struct drm_i915_mocs_table l3cc; + struct i915_vma *scratch; + void *vaddr; +}; + +static struct intel_context *mocs_context_create(struct intel_engine_cs *engine) +{ + struct intel_context *ce; + + ce = intel_context_create(engine); + if (IS_ERR(ce)) + return ce; + + /* We build large requests to read the registers from the ring */ + ce->ring = __intel_context_ring_size(SZ_16K); + + return ce; +} + +static int request_add_sync(struct i915_request *rq, int err) +{ + i915_request_get(rq); + i915_request_add(rq); + if (i915_request_wait(rq, 0, HZ / 5) < 0) + err = -ETIME; + i915_request_put(rq); + + return err; +} + +static int request_add_spin(struct i915_request *rq, struct igt_spinner *spin) +{ + int err = 0; + + i915_request_get(rq); + i915_request_add(rq); + if (spin && !igt_wait_for_spinner(spin, rq)) + err = -ETIME; + i915_request_put(rq); + + return err; +} + +static struct i915_vma *create_scratch(struct intel_gt *gt) +{ + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + int err; + + obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); + if (IS_ERR(obj)) + return ERR_CAST(obj); + + i915_gem_object_set_cache_coherency(obj, I915_CACHING_CACHED); + + vma = i915_vma_instance(obj, >->ggtt->vm, NULL); + if (IS_ERR(vma)) { + i915_gem_object_put(obj); + return vma; + } + + err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL); + if (err) { + i915_gem_object_put(obj); + return ERR_PTR(err); + } + + return vma; +} + +static int live_mocs_init(struct live_mocs *arg, struct intel_gt *gt) +{ + struct drm_i915_mocs_table table; + unsigned int flags; + int err; + + memset(arg, 0, sizeof(*arg)); + + flags = get_mocs_settings(gt->i915, &table); + if (!flags) + return -EINVAL; + + if (flags & HAS_RENDER_L3CC) + arg->l3cc = table; + + if (flags & (HAS_GLOBAL_MOCS | HAS_ENGINE_MOCS)) + arg->mocs = table; + + arg->scratch = create_scratch(gt); + if (IS_ERR(arg->scratch)) + return PTR_ERR(arg->scratch); + + arg->vaddr = i915_gem_object_pin_map(arg->scratch->obj, I915_MAP_WB); + if (IS_ERR(arg->vaddr)) { + err = PTR_ERR(arg->vaddr); + goto err_scratch; + } + + return 0; + +err_scratch: + i915_vma_unpin_and_release(&arg->scratch, 0); + return err; +} + +static void live_mocs_fini(struct live_mocs *arg) +{ + i915_vma_unpin_and_release(&arg->scratch, I915_VMA_RELEASE_MAP); +} + +static int read_regs(struct i915_request *rq, + u32 addr, unsigned int count, + uint32_t *offset) +{ + unsigned int i; + u32 *cs; + + GEM_BUG_ON(!IS_ALIGNED(*offset, sizeof(u32))); + + cs = intel_ring_begin(rq, 4 * count); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + for (i = 0; i < count; i++) { + *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT; + *cs++ = addr; + *cs++ = *offset; + *cs++ = 0; + + addr += sizeof(u32); + *offset += sizeof(u32); + } + + intel_ring_advance(rq, cs); + + return 0; +} + +static int read_mocs_table(struct i915_request *rq, + const struct drm_i915_mocs_table *table, + uint32_t *offset) +{ + u32 addr; + + if (HAS_GLOBAL_MOCS_REGISTERS(rq->engine->i915)) + addr = global_mocs_offset(); + else + addr = mocs_offset(rq->engine); + + return read_regs(rq, addr, table->n_entries, offset); +} + +static int read_l3cc_table(struct i915_request *rq, + const struct drm_i915_mocs_table *table, + uint32_t *offset) +{ + u32 addr = i915_mmio_reg_offset(GEN9_LNCFCMOCS(0)); + + return read_regs(rq, addr, (table->n_entries + 1) / 2, offset); +} + +static int check_mocs_table(struct intel_engine_cs *engine, + const struct drm_i915_mocs_table *table, + uint32_t **vaddr) +{ + unsigned int i; + u32 expect; + + for_each_mocs(expect, table, i) { + if (**vaddr != expect) { + pr_err("%s: Invalid MOCS[%d] entry, found %08x, expected %08x\n", + engine->name, i, **vaddr, expect); + return -EINVAL; + } + ++*vaddr; + } + + return 0; +} + +static bool mcr_range(struct drm_i915_private *i915, u32 offset) +{ + /* + * Registers in this range are affected by the MCR selector + * which only controls CPU initiated MMIO. Routing does not + * work for CS access so we cannot verify them on this path. + */ + return INTEL_GEN(i915) >= 8 && offset >= 0xb000 && offset <= 0xb4ff; +} + +static int check_l3cc_table(struct intel_engine_cs *engine, + const struct drm_i915_mocs_table *table, + uint32_t **vaddr) +{ + /* Can we read the MCR range 0xb00 directly? See intel_workarounds! */ + u32 reg = i915_mmio_reg_offset(GEN9_LNCFCMOCS(0)); + unsigned int i; + u32 expect; + + for_each_l3cc(expect, table, i) { + if (!mcr_range(engine->i915, reg) && **vaddr != expect) { + pr_err("%s: Invalid L3CC[%d] entry, found %08x, expected %08x\n", + engine->name, i, **vaddr, expect); + return -EINVAL; + } + ++*vaddr; + reg += 4; + } + + return 0; +} + +static int check_mocs_engine(struct live_mocs *arg, + struct intel_context *ce) +{ + struct i915_vma *vma = arg->scratch; + struct i915_request *rq; + u32 offset; + u32 *vaddr; + int err; + + memset32(arg->vaddr, STACK_MAGIC, PAGE_SIZE / sizeof(u32)); + + rq = intel_context_create_request(ce); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + i915_vma_lock(vma); + err = i915_request_await_object(rq, vma->obj, true); + if (!err) + err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE); + i915_vma_unlock(vma); + + /* Read the mocs tables back using SRM */ + offset = i915_ggtt_offset(vma); + if (!err) + err = read_mocs_table(rq, &arg->mocs, &offset); + if (!err && ce->engine->class == RENDER_CLASS) + err = read_l3cc_table(rq, &arg->l3cc, &offset); + offset -= i915_ggtt_offset(vma); + GEM_BUG_ON(offset > PAGE_SIZE); + + err = request_add_sync(rq, err); + if (err) + return err; + + /* Compare the results against the expected tables */ + vaddr = arg->vaddr; + if (!err) + err = check_mocs_table(ce->engine, &arg->mocs, &vaddr); + if (!err && ce->engine->class == RENDER_CLASS) + err = check_l3cc_table(ce->engine, &arg->l3cc, &vaddr); + if (err) + return err; + + GEM_BUG_ON(arg->vaddr + offset != vaddr); + return 0; +} + +static int live_mocs_kernel(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + struct live_mocs mocs; + int err; + + /* Basic check the system is configured with the expected mocs table */ + + err = live_mocs_init(&mocs, gt); + if (err) + return err; + + for_each_engine(engine, gt, id) { + intel_engine_pm_get(engine); + err = check_mocs_engine(&mocs, engine->kernel_context); + intel_engine_pm_put(engine); + if (err) + break; + } + + live_mocs_fini(&mocs); + return err; +} + +static int live_mocs_clean(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + struct live_mocs mocs; + int err; + + /* Every new context should see the same mocs table */ + + err = live_mocs_init(&mocs, gt); + if (err) + return err; + + for_each_engine(engine, gt, id) { + struct intel_context *ce; + + ce = mocs_context_create(engine); + if (IS_ERR(ce)) { + err = PTR_ERR(ce); + break; + } + + err = check_mocs_engine(&mocs, ce); + intel_context_put(ce); + if (err) + break; + } + + live_mocs_fini(&mocs); + return err; +} + +static int active_engine_reset(struct intel_context *ce, + const char *reason) +{ + struct igt_spinner spin; + struct i915_request *rq; + int err; + + err = igt_spinner_init(&spin, ce->engine->gt); + if (err) + return err; + + rq = igt_spinner_create_request(&spin, ce, MI_NOOP); + if (IS_ERR(rq)) { + igt_spinner_fini(&spin); + return PTR_ERR(rq); + } + + err = request_add_spin(rq, &spin); + if (err == 0) + err = intel_engine_reset(ce->engine, reason); + + igt_spinner_end(&spin); + igt_spinner_fini(&spin); + + return err; +} + +static int __live_mocs_reset(struct live_mocs *mocs, + struct intel_context *ce) +{ + int err; + + err = intel_engine_reset(ce->engine, "mocs"); + if (err) + return err; + + err = check_mocs_engine(mocs, ce); + if (err) + return err; + + err = active_engine_reset(ce, "mocs"); + if (err) + return err; + + err = check_mocs_engine(mocs, ce); + if (err) + return err; + + intel_gt_reset(ce->engine->gt, ce->engine->mask, "mocs"); + + err = check_mocs_engine(mocs, ce); + if (err) + return err; + + return 0; +} + +static int live_mocs_reset(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + struct live_mocs mocs; + int err = 0; + + /* Check the mocs setup is retained over per-engine and global resets */ + + if (!intel_has_reset_engine(gt)) + return 0; + + err = live_mocs_init(&mocs, gt); + if (err) + return err; + + igt_global_reset_lock(gt); + for_each_engine(engine, gt, id) { + struct intel_context *ce; + + ce = mocs_context_create(engine); + if (IS_ERR(ce)) { + err = PTR_ERR(ce); + break; + } + + intel_engine_pm_get(engine); + err = __live_mocs_reset(&mocs, ce); + intel_engine_pm_put(engine); + + intel_context_put(ce); + if (err) + break; + } + igt_global_reset_unlock(gt); + + live_mocs_fini(&mocs); + return err; +} + +int intel_mocs_live_selftests(struct drm_i915_private *i915) +{ + static const struct i915_subtest tests[] = { + SUBTEST(live_mocs_kernel), + SUBTEST(live_mocs_clean), + SUBTEST(live_mocs_reset), + }; + struct drm_i915_mocs_table table; + + if (!get_mocs_settings(i915, &table)) + return 0; + + return intel_gt_live_subtests(tests, &i915->gt); +} diff --git a/drivers/gpu/drm/i915/gt/selftest_rc6.c b/drivers/gpu/drm/i915/gt/selftest_rc6.c new file mode 100644 index 000000000..64ef5ee5d --- /dev/null +++ b/drivers/gpu/drm/i915/gt/selftest_rc6.c @@ -0,0 +1,246 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#include "intel_context.h" +#include "intel_engine_pm.h" +#include "intel_gt_requests.h" +#include "intel_ring.h" +#include "selftest_rc6.h" + +#include "selftests/i915_random.h" +#include "selftests/librapl.h" + +static u64 rc6_residency(struct intel_rc6 *rc6) +{ + u64 result; + + /* XXX VLV_GT_MEDIA_RC6? */ + + result = intel_rc6_residency_ns(rc6, GEN6_GT_GFX_RC6); + if (HAS_RC6p(rc6_to_i915(rc6))) + result += intel_rc6_residency_ns(rc6, GEN6_GT_GFX_RC6p); + if (HAS_RC6pp(rc6_to_i915(rc6))) + result += intel_rc6_residency_ns(rc6, GEN6_GT_GFX_RC6pp); + + return result; +} + +int live_rc6_manual(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_rc6 *rc6 = >->rc6; + u64 rc0_power, rc6_power; + intel_wakeref_t wakeref; + ktime_t dt; + u64 res[2]; + int err = 0; + + /* + * Our claim is that we can "encourage" the GPU to enter rc6 at will. + * Let's try it! + */ + + if (!rc6->enabled) + return 0; + + /* bsw/byt use a PCU and decouple RC6 from our manual control */ + if (IS_VALLEYVIEW(gt->i915) || IS_CHERRYVIEW(gt->i915)) + return 0; + + wakeref = intel_runtime_pm_get(gt->uncore->rpm); + + /* Force RC6 off for starters */ + __intel_rc6_disable(rc6); + msleep(1); /* wakeup is not immediate, takes about 100us on icl */ + + res[0] = rc6_residency(rc6); + + dt = ktime_get(); + rc0_power = librapl_energy_uJ(); + msleep(250); + rc0_power = librapl_energy_uJ() - rc0_power; + dt = ktime_sub(ktime_get(), dt); + res[1] = rc6_residency(rc6); + if ((res[1] - res[0]) >> 10) { + pr_err("RC6 residency increased by %lldus while disabled for 250ms!\n", + (res[1] - res[0]) >> 10); + err = -EINVAL; + goto out_unlock; + } + + rc0_power = div64_u64(NSEC_PER_SEC * rc0_power, ktime_to_ns(dt)); + if (!rc0_power) { + pr_err("No power measured while in RC0\n"); + err = -EINVAL; + goto out_unlock; + } + + /* Manually enter RC6 */ + intel_rc6_park(rc6); + + res[0] = rc6_residency(rc6); + intel_uncore_forcewake_flush(rc6_to_uncore(rc6), FORCEWAKE_ALL); + dt = ktime_get(); + rc6_power = librapl_energy_uJ(); + msleep(100); + rc6_power = librapl_energy_uJ() - rc6_power; + dt = ktime_sub(ktime_get(), dt); + res[1] = rc6_residency(rc6); + if (res[1] == res[0]) { + pr_err("Did not enter RC6! RC6_STATE=%08x, RC6_CONTROL=%08x, residency=%lld\n", + intel_uncore_read_fw(gt->uncore, GEN6_RC_STATE), + intel_uncore_read_fw(gt->uncore, GEN6_RC_CONTROL), + res[0]); + err = -EINVAL; + } + + rc6_power = div64_u64(NSEC_PER_SEC * rc6_power, ktime_to_ns(dt)); + pr_info("GPU consumed %llduW in RC0 and %llduW in RC6\n", + rc0_power, rc6_power); + if (2 * rc6_power > rc0_power) { + pr_err("GPU leaked energy while in RC6!\n"); + err = -EINVAL; + goto out_unlock; + } + + /* Restore what should have been the original state! */ + intel_rc6_unpark(rc6); + +out_unlock: + intel_runtime_pm_put(gt->uncore->rpm, wakeref); + return err; +} + +static const u32 *__live_rc6_ctx(struct intel_context *ce) +{ + struct i915_request *rq; + const u32 *result; + u32 cmd; + u32 *cs; + + rq = intel_context_create_request(ce); + if (IS_ERR(rq)) + return ERR_CAST(rq); + + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) { + i915_request_add(rq); + return cs; + } + + cmd = MI_STORE_REGISTER_MEM | MI_USE_GGTT; + if (INTEL_GEN(rq->engine->i915) >= 8) + cmd++; + + *cs++ = cmd; + *cs++ = i915_mmio_reg_offset(GEN8_RC6_CTX_INFO); + *cs++ = ce->timeline->hwsp_offset + 8; + *cs++ = 0; + intel_ring_advance(rq, cs); + + result = rq->hwsp_seqno + 2; + i915_request_add(rq); + + return result; +} + +static struct intel_engine_cs ** +randomised_engines(struct intel_gt *gt, + struct rnd_state *prng, + unsigned int *count) +{ + struct intel_engine_cs *engine, **engines; + enum intel_engine_id id; + int n; + + n = 0; + for_each_engine(engine, gt, id) + n++; + if (!n) + return NULL; + + engines = kmalloc_array(n, sizeof(*engines), GFP_KERNEL); + if (!engines) + return NULL; + + n = 0; + for_each_engine(engine, gt, id) + engines[n++] = engine; + + i915_prandom_shuffle(engines, sizeof(*engines), n, prng); + + *count = n; + return engines; +} + +int live_rc6_ctx_wa(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs **engines; + unsigned int n, count; + I915_RND_STATE(prng); + int err = 0; + + /* A read of CTX_INFO upsets rc6. Poke the bear! */ + if (INTEL_GEN(gt->i915) < 8) + return 0; + + engines = randomised_engines(gt, &prng, &count); + if (!engines) + return 0; + + for (n = 0; n < count; n++) { + struct intel_engine_cs *engine = engines[n]; + int pass; + + for (pass = 0; pass < 2; pass++) { + struct i915_gpu_error *error = >->i915->gpu_error; + struct intel_context *ce; + unsigned int resets = + i915_reset_engine_count(error, engine); + const u32 *res; + + /* Use a sacrifical context */ + ce = intel_context_create(engine); + if (IS_ERR(ce)) { + err = PTR_ERR(ce); + goto out; + } + + intel_engine_pm_get(engine); + res = __live_rc6_ctx(ce); + intel_engine_pm_put(engine); + intel_context_put(ce); + if (IS_ERR(res)) { + err = PTR_ERR(res); + goto out; + } + + if (intel_gt_wait_for_idle(gt, HZ / 5) == -ETIME) { + intel_gt_set_wedged(gt); + err = -ETIME; + goto out; + } + + intel_gt_pm_wait_for_idle(gt); + pr_debug("%s: CTX_INFO=%0x\n", + engine->name, READ_ONCE(*res)); + + if (resets != + i915_reset_engine_count(error, engine)) { + pr_err("%s: GPU reset required\n", + engine->name); + add_taint_for_CI(gt->i915, TAINT_WARN); + err = -EIO; + goto out; + } + } + } + +out: + kfree(engines); + return err; +} diff --git a/drivers/gpu/drm/i915/gt/selftest_rc6.h b/drivers/gpu/drm/i915/gt/selftest_rc6.h new file mode 100644 index 000000000..762fd442d --- /dev/null +++ b/drivers/gpu/drm/i915/gt/selftest_rc6.h @@ -0,0 +1,13 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#ifndef SELFTEST_RC6_H +#define SELFTEST_RC6_H + +int live_rc6_ctx_wa(void *arg); +int live_rc6_manual(void *arg); + +#endif /* SELFTEST_RC6_H */ diff --git a/drivers/gpu/drm/i915/gt/selftest_reset.c b/drivers/gpu/drm/i915/gt/selftest_reset.c new file mode 100644 index 000000000..ef5aeebbe --- /dev/null +++ b/drivers/gpu/drm/i915/gt/selftest_reset.c @@ -0,0 +1,376 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2018 Intel Corporation + */ + +#include <linux/crc32.h> + +#include "gem/i915_gem_stolen.h" + +#include "i915_memcpy.h" +#include "i915_selftest.h" +#include "selftests/igt_reset.h" +#include "selftests/igt_atomic.h" +#include "selftests/igt_spinner.h" + +static int +__igt_reset_stolen(struct intel_gt *gt, + intel_engine_mask_t mask, + const char *msg) +{ + struct i915_ggtt *ggtt = >->i915->ggtt; + const struct resource *dsm = >->i915->dsm; + resource_size_t num_pages, page; + struct intel_engine_cs *engine; + intel_wakeref_t wakeref; + enum intel_engine_id id; + struct igt_spinner spin; + long max, count; + void *tmp; + u32 *crc; + int err; + + if (!drm_mm_node_allocated(&ggtt->error_capture)) + return 0; + + num_pages = resource_size(dsm) >> PAGE_SHIFT; + if (!num_pages) + return 0; + + crc = kmalloc_array(num_pages, sizeof(u32), GFP_KERNEL); + if (!crc) + return -ENOMEM; + + tmp = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!tmp) { + err = -ENOMEM; + goto err_crc; + } + + igt_global_reset_lock(gt); + wakeref = intel_runtime_pm_get(gt->uncore->rpm); + + err = igt_spinner_init(&spin, gt); + if (err) + goto err_lock; + + for_each_engine(engine, gt, id) { + struct intel_context *ce; + struct i915_request *rq; + + if (!(mask & engine->mask)) + continue; + + if (!intel_engine_can_store_dword(engine)) + continue; + + ce = intel_context_create(engine); + if (IS_ERR(ce)) { + err = PTR_ERR(ce); + goto err_spin; + } + rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK); + intel_context_put(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_spin; + } + i915_request_add(rq); + } + + for (page = 0; page < num_pages; page++) { + dma_addr_t dma = (dma_addr_t)dsm->start + (page << PAGE_SHIFT); + void __iomem *s; + void *in; + + ggtt->vm.insert_page(&ggtt->vm, dma, + ggtt->error_capture.start, + I915_CACHE_NONE, 0); + mb(); + + s = io_mapping_map_wc(&ggtt->iomap, + ggtt->error_capture.start, + PAGE_SIZE); + + if (!__drm_mm_interval_first(>->i915->mm.stolen, + page << PAGE_SHIFT, + ((page + 1) << PAGE_SHIFT) - 1)) + memset32(s, STACK_MAGIC, PAGE_SIZE / sizeof(u32)); + + in = s; + if (i915_memcpy_from_wc(tmp, s, PAGE_SIZE)) + in = tmp; + crc[page] = crc32_le(0, in, PAGE_SIZE); + + io_mapping_unmap(s); + } + mb(); + ggtt->vm.clear_range(&ggtt->vm, ggtt->error_capture.start, PAGE_SIZE); + + if (mask == ALL_ENGINES) { + intel_gt_reset(gt, mask, NULL); + } else { + for_each_engine(engine, gt, id) { + if (mask & engine->mask) + intel_engine_reset(engine, NULL); + } + } + + max = -1; + count = 0; + for (page = 0; page < num_pages; page++) { + dma_addr_t dma = (dma_addr_t)dsm->start + (page << PAGE_SHIFT); + void __iomem *s; + void *in; + u32 x; + + ggtt->vm.insert_page(&ggtt->vm, dma, + ggtt->error_capture.start, + I915_CACHE_NONE, 0); + mb(); + + s = io_mapping_map_wc(&ggtt->iomap, + ggtt->error_capture.start, + PAGE_SIZE); + + in = s; + if (i915_memcpy_from_wc(tmp, s, PAGE_SIZE)) + in = tmp; + x = crc32_le(0, in, PAGE_SIZE); + + if (x != crc[page] && + !__drm_mm_interval_first(>->i915->mm.stolen, + page << PAGE_SHIFT, + ((page + 1) << PAGE_SHIFT) - 1)) { + pr_debug("unused stolen page %pa modified by GPU reset\n", + &page); + if (count++ == 0) + igt_hexdump(in, PAGE_SIZE); + max = page; + } + + io_mapping_unmap(s); + } + mb(); + ggtt->vm.clear_range(&ggtt->vm, ggtt->error_capture.start, PAGE_SIZE); + + if (count > 0) { + pr_info("%s reset clobbered %ld pages of stolen, last clobber at page %ld\n", + msg, count, max); + } + if (max >= I915_GEM_STOLEN_BIAS >> PAGE_SHIFT) { + pr_err("%s reset clobbered unreserved area [above %x] of stolen; may cause severe faults\n", + msg, I915_GEM_STOLEN_BIAS); + err = -EINVAL; + } + +err_spin: + igt_spinner_fini(&spin); + +err_lock: + intel_runtime_pm_put(gt->uncore->rpm, wakeref); + igt_global_reset_unlock(gt); + + kfree(tmp); +err_crc: + kfree(crc); + return err; +} + +static int igt_reset_device_stolen(void *arg) +{ + return __igt_reset_stolen(arg, ALL_ENGINES, "device"); +} + +static int igt_reset_engines_stolen(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err; + + if (!intel_has_reset_engine(gt)) + return 0; + + for_each_engine(engine, gt, id) { + err = __igt_reset_stolen(gt, engine->mask, engine->name); + if (err) + return err; + } + + return 0; +} + +static int igt_global_reset(void *arg) +{ + struct intel_gt *gt = arg; + unsigned int reset_count; + intel_wakeref_t wakeref; + int err = 0; + + /* Check that we can issue a global GPU reset */ + + igt_global_reset_lock(gt); + wakeref = intel_runtime_pm_get(gt->uncore->rpm); + + reset_count = i915_reset_count(>->i915->gpu_error); + + intel_gt_reset(gt, ALL_ENGINES, NULL); + + if (i915_reset_count(>->i915->gpu_error) == reset_count) { + pr_err("No GPU reset recorded!\n"); + err = -EINVAL; + } + + intel_runtime_pm_put(gt->uncore->rpm, wakeref); + igt_global_reset_unlock(gt); + + if (intel_gt_is_wedged(gt)) + err = -EIO; + + return err; +} + +static int igt_wedged_reset(void *arg) +{ + struct intel_gt *gt = arg; + intel_wakeref_t wakeref; + + /* Check that we can recover a wedged device with a GPU reset */ + + igt_global_reset_lock(gt); + wakeref = intel_runtime_pm_get(gt->uncore->rpm); + + intel_gt_set_wedged(gt); + + GEM_BUG_ON(!intel_gt_is_wedged(gt)); + intel_gt_reset(gt, ALL_ENGINES, NULL); + + intel_runtime_pm_put(gt->uncore->rpm, wakeref); + igt_global_reset_unlock(gt); + + return intel_gt_is_wedged(gt) ? -EIO : 0; +} + +static int igt_atomic_reset(void *arg) +{ + struct intel_gt *gt = arg; + const typeof(*igt_atomic_phases) *p; + int err = 0; + + /* Check that the resets are usable from atomic context */ + + intel_gt_pm_get(gt); + igt_global_reset_lock(gt); + + /* Flush any requests before we get started and check basics */ + if (!igt_force_reset(gt)) + goto unlock; + + for (p = igt_atomic_phases; p->name; p++) { + intel_engine_mask_t awake; + + GEM_TRACE("__intel_gt_reset under %s\n", p->name); + + awake = reset_prepare(gt); + p->critical_section_begin(); + + err = __intel_gt_reset(gt, ALL_ENGINES); + + p->critical_section_end(); + reset_finish(gt, awake); + + if (err) { + pr_err("__intel_gt_reset failed under %s\n", p->name); + break; + } + } + + /* As we poke around the guts, do a full reset before continuing. */ + igt_force_reset(gt); + +unlock: + igt_global_reset_unlock(gt); + intel_gt_pm_put(gt); + + return err; +} + +static int igt_atomic_engine_reset(void *arg) +{ + struct intel_gt *gt = arg; + const typeof(*igt_atomic_phases) *p; + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err = 0; + + /* Check that the resets are usable from atomic context */ + + if (!intel_has_reset_engine(gt)) + return 0; + + if (intel_uc_uses_guc_submission(>->uc)) + return 0; + + intel_gt_pm_get(gt); + igt_global_reset_lock(gt); + + /* Flush any requests before we get started and check basics */ + if (!igt_force_reset(gt)) + goto out_unlock; + + for_each_engine(engine, gt, id) { + tasklet_disable(&engine->execlists.tasklet); + intel_engine_pm_get(engine); + + for (p = igt_atomic_phases; p->name; p++) { + GEM_TRACE("intel_engine_reset(%s) under %s\n", + engine->name, p->name); + + p->critical_section_begin(); + err = intel_engine_reset(engine, NULL); + p->critical_section_end(); + + if (err) { + pr_err("intel_engine_reset(%s) failed under %s\n", + engine->name, p->name); + break; + } + } + + intel_engine_pm_put(engine); + tasklet_enable(&engine->execlists.tasklet); + if (err) + break; + } + + /* As we poke around the guts, do a full reset before continuing. */ + igt_force_reset(gt); + +out_unlock: + igt_global_reset_unlock(gt); + intel_gt_pm_put(gt); + + return err; +} + +int intel_reset_live_selftests(struct drm_i915_private *i915) +{ + static const struct i915_subtest tests[] = { + SUBTEST(igt_global_reset), /* attempt to recover GPU first */ + SUBTEST(igt_reset_device_stolen), + SUBTEST(igt_reset_engines_stolen), + SUBTEST(igt_wedged_reset), + SUBTEST(igt_atomic_reset), + SUBTEST(igt_atomic_engine_reset), + }; + struct intel_gt *gt = &i915->gt; + + if (!intel_has_gpu_reset(gt)) + return 0; + + if (intel_gt_is_wedged(gt)) + return -EIO; /* we're long past hope of a successful reset */ + + return intel_gt_live_subtests(tests, gt); +} diff --git a/drivers/gpu/drm/i915/gt/selftest_ring.c b/drivers/gpu/drm/i915/gt/selftest_ring.c new file mode 100644 index 000000000..2a8c534dc --- /dev/null +++ b/drivers/gpu/drm/i915/gt/selftest_ring.c @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright © 2020 Intel Corporation + */ + +static struct intel_ring *mock_ring(unsigned long sz) +{ + struct intel_ring *ring; + + ring = kzalloc(sizeof(*ring) + sz, GFP_KERNEL); + if (!ring) + return NULL; + + kref_init(&ring->ref); + ring->size = sz; + ring->wrap = BITS_PER_TYPE(ring->size) - ilog2(sz); + ring->effective_size = sz; + ring->vaddr = (void *)(ring + 1); + atomic_set(&ring->pin_count, 1); + + intel_ring_update_space(ring); + + return ring; +} + +static void mock_ring_free(struct intel_ring *ring) +{ + kfree(ring); +} + +static int check_ring_direction(struct intel_ring *ring, + u32 next, u32 prev, + int expected) +{ + int result; + + result = intel_ring_direction(ring, next, prev); + if (result < 0) + result = -1; + else if (result > 0) + result = 1; + + if (result != expected) { + pr_err("intel_ring_direction(%u, %u):%d != %d\n", + next, prev, result, expected); + return -EINVAL; + } + + return 0; +} + +static int check_ring_step(struct intel_ring *ring, u32 x, u32 step) +{ + u32 prev = x, next = intel_ring_wrap(ring, x + step); + int err = 0; + + err |= check_ring_direction(ring, next, next, 0); + err |= check_ring_direction(ring, prev, prev, 0); + err |= check_ring_direction(ring, next, prev, 1); + err |= check_ring_direction(ring, prev, next, -1); + + return err; +} + +static int check_ring_offset(struct intel_ring *ring, u32 x, u32 step) +{ + int err = 0; + + err |= check_ring_step(ring, x, step); + err |= check_ring_step(ring, intel_ring_wrap(ring, x + 1), step); + err |= check_ring_step(ring, intel_ring_wrap(ring, x - 1), step); + + return err; +} + +static int igt_ring_direction(void *dummy) +{ + struct intel_ring *ring; + unsigned int half = 2048; + int step, err = 0; + + ring = mock_ring(2 * half); + if (!ring) + return -ENOMEM; + + GEM_BUG_ON(ring->size != 2 * half); + + /* Precision of wrap detection is limited to ring->size / 2 */ + for (step = 1; step < half; step <<= 1) { + err |= check_ring_offset(ring, 0, step); + err |= check_ring_offset(ring, half, step); + } + err |= check_ring_step(ring, 0, half - 64); + + /* And check unwrapped handling for good measure */ + err |= check_ring_offset(ring, 0, 2 * half + 64); + err |= check_ring_offset(ring, 3 * half, 1); + + mock_ring_free(ring); + return err; +} + +int intel_ring_mock_selftests(void) +{ + static const struct i915_subtest tests[] = { + SUBTEST(igt_ring_direction), + }; + + return i915_subtests(tests, NULL); +} diff --git a/drivers/gpu/drm/i915/gt/selftest_ring_submission.c b/drivers/gpu/drm/i915/gt/selftest_ring_submission.c new file mode 100644 index 000000000..3350e7c99 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/selftest_ring_submission.c @@ -0,0 +1,298 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2020 Intel Corporation + */ + +#include "intel_engine_pm.h" +#include "selftests/igt_flush_test.h" + +static struct i915_vma *create_wally(struct intel_engine_cs *engine) +{ + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + u32 *cs; + int err; + + obj = i915_gem_object_create_internal(engine->i915, 4096); + if (IS_ERR(obj)) + return ERR_CAST(obj); + + vma = i915_vma_instance(obj, engine->gt->vm, NULL); + if (IS_ERR(vma)) { + i915_gem_object_put(obj); + return vma; + } + + err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_HIGH); + if (err) { + i915_gem_object_put(obj); + return ERR_PTR(err); + } + + err = i915_vma_sync(vma); + if (err) { + i915_gem_object_put(obj); + return ERR_PTR(err); + } + + cs = i915_gem_object_pin_map(obj, I915_MAP_WC); + if (IS_ERR(cs)) { + i915_gem_object_put(obj); + return ERR_CAST(cs); + } + + if (INTEL_GEN(engine->i915) >= 6) { + *cs++ = MI_STORE_DWORD_IMM_GEN4; + *cs++ = 0; + } else if (INTEL_GEN(engine->i915) >= 4) { + *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; + *cs++ = 0; + } else { + *cs++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL; + } + *cs++ = vma->node.start + 4000; + *cs++ = STACK_MAGIC; + + *cs++ = MI_BATCH_BUFFER_END; + + i915_gem_object_flush_map(obj); + i915_gem_object_unpin_map(obj); + + vma->private = intel_context_create(engine); /* dummy residuals */ + if (IS_ERR(vma->private)) { + vma = ERR_CAST(vma->private); + i915_gem_object_put(obj); + } + + return vma; +} + +static int context_sync(struct intel_context *ce) +{ + struct i915_request *rq; + int err = 0; + + rq = intel_context_create_request(ce); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + i915_request_get(rq); + i915_request_add(rq); + + if (i915_request_wait(rq, 0, HZ / 5) < 0) + err = -ETIME; + i915_request_put(rq); + + return err; +} + +static int new_context_sync(struct intel_engine_cs *engine) +{ + struct intel_context *ce; + int err; + + ce = intel_context_create(engine); + if (IS_ERR(ce)) + return PTR_ERR(ce); + + err = context_sync(ce); + intel_context_put(ce); + + return err; +} + +static int mixed_contexts_sync(struct intel_engine_cs *engine, u32 *result) +{ + int pass; + int err; + + for (pass = 0; pass < 2; pass++) { + WRITE_ONCE(*result, 0); + err = context_sync(engine->kernel_context); + if (err || READ_ONCE(*result)) { + if (!err) { + pr_err("pass[%d] wa_bb emitted for the kernel context\n", + pass); + err = -EINVAL; + } + return err; + } + + WRITE_ONCE(*result, 0); + err = new_context_sync(engine); + if (READ_ONCE(*result) != STACK_MAGIC) { + if (!err) { + pr_err("pass[%d] wa_bb *NOT* emitted after the kernel context\n", + pass); + err = -EINVAL; + } + return err; + } + + WRITE_ONCE(*result, 0); + err = new_context_sync(engine); + if (READ_ONCE(*result) != STACK_MAGIC) { + if (!err) { + pr_err("pass[%d] wa_bb *NOT* emitted for the user context switch\n", + pass); + err = -EINVAL; + } + return err; + } + } + + return 0; +} + +static int double_context_sync_00(struct intel_engine_cs *engine, u32 *result) +{ + struct intel_context *ce; + int err, i; + + ce = intel_context_create(engine); + if (IS_ERR(ce)) + return PTR_ERR(ce); + + for (i = 0; i < 2; i++) { + WRITE_ONCE(*result, 0); + err = context_sync(ce); + if (err) + break; + } + intel_context_put(ce); + if (err) + return err; + + if (READ_ONCE(*result)) { + pr_err("wa_bb emitted between the same user context\n"); + return -EINVAL; + } + + return 0; +} + +static int kernel_context_sync_00(struct intel_engine_cs *engine, u32 *result) +{ + struct intel_context *ce; + int err, i; + + ce = intel_context_create(engine); + if (IS_ERR(ce)) + return PTR_ERR(ce); + + for (i = 0; i < 2; i++) { + WRITE_ONCE(*result, 0); + err = context_sync(ce); + if (err) + break; + + err = context_sync(engine->kernel_context); + if (err) + break; + } + intel_context_put(ce); + if (err) + return err; + + if (READ_ONCE(*result)) { + pr_err("wa_bb emitted between the same user context [with intervening kernel]\n"); + return -EINVAL; + } + + return 0; +} + +static int __live_ctx_switch_wa(struct intel_engine_cs *engine) +{ + struct i915_vma *bb; + u32 *result; + int err; + + bb = create_wally(engine); + if (IS_ERR(bb)) + return PTR_ERR(bb); + + result = i915_gem_object_pin_map(bb->obj, I915_MAP_WC); + if (IS_ERR(result)) { + intel_context_put(bb->private); + i915_vma_unpin_and_release(&bb, 0); + return PTR_ERR(result); + } + result += 1000; + + engine->wa_ctx.vma = bb; + + err = mixed_contexts_sync(engine, result); + if (err) + goto out; + + err = double_context_sync_00(engine, result); + if (err) + goto out; + + err = kernel_context_sync_00(engine, result); + if (err) + goto out; + +out: + intel_context_put(engine->wa_ctx.vma->private); + i915_vma_unpin_and_release(&engine->wa_ctx.vma, I915_VMA_RELEASE_MAP); + return err; +} + +static int live_ctx_switch_wa(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + + /* + * Exercise the inter-context wa batch. + * + * Between each user context we run a wa batch, and since it may + * have implications for user visible state, we have to check that + * we do actually execute it. + * + * The trick we use is to replace the normal wa batch with a custom + * one that writes to a marker within it, and we can then look for + * that marker to confirm if the batch was run when we expect it, + * and equally important it was wasn't run when we don't! + */ + + for_each_engine(engine, gt, id) { + struct i915_vma *saved_wa; + int err; + + if (!intel_engine_can_store_dword(engine)) + continue; + + if (IS_GEN_RANGE(gt->i915, 4, 5)) + continue; /* MI_STORE_DWORD is privileged! */ + + saved_wa = fetch_and_zero(&engine->wa_ctx.vma); + + intel_engine_pm_get(engine); + err = __live_ctx_switch_wa(engine); + intel_engine_pm_put(engine); + if (igt_flush_test(gt->i915)) + err = -EIO; + + engine->wa_ctx.vma = saved_wa; + if (err) + return err; + } + + return 0; +} + +int intel_ring_submission_live_selftests(struct drm_i915_private *i915) +{ + static const struct i915_subtest tests[] = { + SUBTEST(live_ctx_switch_wa), + }; + + if (HAS_EXECLISTS(i915)) + return 0; + + return intel_gt_live_subtests(tests, &i915->gt); +} diff --git a/drivers/gpu/drm/i915/gt/selftest_rps.c b/drivers/gpu/drm/i915/gt/selftest_rps.c new file mode 100644 index 000000000..3540ba9bd --- /dev/null +++ b/drivers/gpu/drm/i915/gt/selftest_rps.c @@ -0,0 +1,1314 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2020 Intel Corporation + */ + +#include <linux/pm_qos.h> +#include <linux/sort.h> + +#include "intel_engine_heartbeat.h" +#include "intel_engine_pm.h" +#include "intel_gpu_commands.h" +#include "intel_gt_clock_utils.h" +#include "intel_gt_pm.h" +#include "intel_rc6.h" +#include "selftest_engine_heartbeat.h" +#include "selftest_rps.h" +#include "selftests/igt_flush_test.h" +#include "selftests/igt_spinner.h" +#include "selftests/librapl.h" + +/* Try to isolate the impact of cstates from determing frequency response */ +#define CPU_LATENCY 0 /* -1 to disable pm_qos, 0 to disable cstates */ + +static void dummy_rps_work(struct work_struct *wrk) +{ +} + +static int cmp_u64(const void *A, const void *B) +{ + const u64 *a = A, *b = B; + + if (*a < *b) + return -1; + else if (*a > *b) + return 1; + else + return 0; +} + +static int cmp_u32(const void *A, const void *B) +{ + const u32 *a = A, *b = B; + + if (*a < *b) + return -1; + else if (*a > *b) + return 1; + else + return 0; +} + +static struct i915_vma * +create_spin_counter(struct intel_engine_cs *engine, + struct i915_address_space *vm, + bool srm, + u32 **cancel, + u32 **counter) +{ + enum { + COUNT, + INC, + __NGPR__, + }; +#define CS_GPR(x) GEN8_RING_CS_GPR(engine->mmio_base, x) + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + unsigned long end; + u32 *base, *cs; + int loop, i; + int err; + + obj = i915_gem_object_create_internal(vm->i915, 64 << 10); + if (IS_ERR(obj)) + return ERR_CAST(obj); + + end = obj->base.size / sizeof(u32) - 1; + + vma = i915_vma_instance(obj, vm, NULL); + if (IS_ERR(vma)) { + err = PTR_ERR(vma); + goto err_put; + } + + err = i915_vma_pin(vma, 0, 0, PIN_USER); + if (err) + goto err_unlock; + + i915_vma_lock(vma); + + base = i915_gem_object_pin_map(obj, I915_MAP_WC); + if (IS_ERR(base)) { + err = PTR_ERR(base); + goto err_unpin; + } + cs = base; + + *cs++ = MI_LOAD_REGISTER_IMM(__NGPR__ * 2); + for (i = 0; i < __NGPR__; i++) { + *cs++ = i915_mmio_reg_offset(CS_GPR(i)); + *cs++ = 0; + *cs++ = i915_mmio_reg_offset(CS_GPR(i)) + 4; + *cs++ = 0; + } + + *cs++ = MI_LOAD_REGISTER_IMM(1); + *cs++ = i915_mmio_reg_offset(CS_GPR(INC)); + *cs++ = 1; + + loop = cs - base; + + /* Unroll the loop to avoid MI_BB_START stalls impacting measurements */ + for (i = 0; i < 1024; i++) { + *cs++ = MI_MATH(4); + *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(COUNT)); + *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(INC)); + *cs++ = MI_MATH_ADD; + *cs++ = MI_MATH_STORE(MI_MATH_REG(COUNT), MI_MATH_REG_ACCU); + + if (srm) { + *cs++ = MI_STORE_REGISTER_MEM_GEN8; + *cs++ = i915_mmio_reg_offset(CS_GPR(COUNT)); + *cs++ = lower_32_bits(vma->node.start + end * sizeof(*cs)); + *cs++ = upper_32_bits(vma->node.start + end * sizeof(*cs)); + } + } + + *cs++ = MI_BATCH_BUFFER_START_GEN8; + *cs++ = lower_32_bits(vma->node.start + loop * sizeof(*cs)); + *cs++ = upper_32_bits(vma->node.start + loop * sizeof(*cs)); + GEM_BUG_ON(cs - base > end); + + i915_gem_object_flush_map(obj); + + *cancel = base + loop; + *counter = srm ? memset32(base + end, 0, 1) : NULL; + return vma; + +err_unpin: + i915_vma_unpin(vma); +err_unlock: + i915_vma_unlock(vma); +err_put: + i915_gem_object_put(obj); + return ERR_PTR(err); +} + +static u8 wait_for_freq(struct intel_rps *rps, u8 freq, int timeout_ms) +{ + u8 history[64], i; + unsigned long end; + int sleep; + + i = 0; + memset(history, freq, sizeof(history)); + sleep = 20; + + /* The PCU does not change instantly, but drifts towards the goal? */ + end = jiffies + msecs_to_jiffies(timeout_ms); + do { + u8 act; + + act = read_cagf(rps); + if (time_after(jiffies, end)) + return act; + + /* Target acquired */ + if (act == freq) + return act; + + /* Any change within the last N samples? */ + if (!memchr_inv(history, act, sizeof(history))) + return act; + + history[i] = act; + i = (i + 1) % ARRAY_SIZE(history); + + usleep_range(sleep, 2 * sleep); + sleep *= 2; + if (sleep > timeout_ms * 20) + sleep = timeout_ms * 20; + } while (1); +} + +static u8 rps_set_check(struct intel_rps *rps, u8 freq) +{ + mutex_lock(&rps->lock); + GEM_BUG_ON(!intel_rps_is_active(rps)); + intel_rps_set(rps, freq); + GEM_BUG_ON(rps->last_freq != freq); + mutex_unlock(&rps->lock); + + return wait_for_freq(rps, freq, 50); +} + +static void show_pstate_limits(struct intel_rps *rps) +{ + struct drm_i915_private *i915 = rps_to_i915(rps); + + if (IS_BROXTON(i915)) { + pr_info("P_STATE_CAP[%x]: 0x%08x\n", + i915_mmio_reg_offset(BXT_RP_STATE_CAP), + intel_uncore_read(rps_to_uncore(rps), + BXT_RP_STATE_CAP)); + } else if (IS_GEN(i915, 9)) { + pr_info("P_STATE_LIMITS[%x]: 0x%08x\n", + i915_mmio_reg_offset(GEN9_RP_STATE_LIMITS), + intel_uncore_read(rps_to_uncore(rps), + GEN9_RP_STATE_LIMITS)); + } +} + +int live_rps_clock_interval(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_rps *rps = >->rps; + void (*saved_work)(struct work_struct *wrk); + struct intel_engine_cs *engine; + enum intel_engine_id id; + struct igt_spinner spin; + int err = 0; + + if (!intel_rps_is_enabled(rps)) + return 0; + + if (igt_spinner_init(&spin, gt)) + return -ENOMEM; + + intel_gt_pm_wait_for_idle(gt); + saved_work = rps->work.func; + rps->work.func = dummy_rps_work; + + intel_gt_pm_get(gt); + intel_rps_disable(>->rps); + + intel_gt_check_clock_frequency(gt); + + for_each_engine(engine, gt, id) { + struct i915_request *rq; + u32 cycles; + u64 dt; + + if (!intel_engine_can_store_dword(engine)) + continue; + + st_engine_heartbeat_disable(engine); + + rq = igt_spinner_create_request(&spin, + engine->kernel_context, + MI_NOOP); + if (IS_ERR(rq)) { + st_engine_heartbeat_enable(engine); + err = PTR_ERR(rq); + break; + } + + i915_request_add(rq); + + if (!igt_wait_for_spinner(&spin, rq)) { + pr_err("%s: RPS spinner did not start\n", + engine->name); + igt_spinner_end(&spin); + st_engine_heartbeat_enable(engine); + intel_gt_set_wedged(engine->gt); + err = -EIO; + break; + } + + intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL); + + intel_uncore_write_fw(gt->uncore, GEN6_RP_CUR_UP_EI, 0); + + /* Set the evaluation interval to infinity! */ + intel_uncore_write_fw(gt->uncore, + GEN6_RP_UP_EI, 0xffffffff); + intel_uncore_write_fw(gt->uncore, + GEN6_RP_UP_THRESHOLD, 0xffffffff); + + intel_uncore_write_fw(gt->uncore, GEN6_RP_CONTROL, + GEN6_RP_ENABLE | GEN6_RP_UP_BUSY_AVG); + + if (wait_for(intel_uncore_read_fw(gt->uncore, + GEN6_RP_CUR_UP_EI), + 10)) { + /* Just skip the test; assume lack of HW support */ + pr_notice("%s: rps evaluation interval not ticking\n", + engine->name); + err = -ENODEV; + } else { + ktime_t dt_[5]; + u32 cycles_[5]; + int i; + + for (i = 0; i < 5; i++) { + preempt_disable(); + + dt_[i] = ktime_get(); + cycles_[i] = -intel_uncore_read_fw(gt->uncore, GEN6_RP_CUR_UP_EI); + + udelay(1000); + + dt_[i] = ktime_sub(ktime_get(), dt_[i]); + cycles_[i] += intel_uncore_read_fw(gt->uncore, GEN6_RP_CUR_UP_EI); + + preempt_enable(); + } + + /* Use the median of both cycle/dt; close enough */ + sort(cycles_, 5, sizeof(*cycles_), cmp_u32, NULL); + cycles = (cycles_[1] + 2 * cycles_[2] + cycles_[3]) / 4; + sort(dt_, 5, sizeof(*dt_), cmp_u64, NULL); + dt = div_u64(dt_[1] + 2 * dt_[2] + dt_[3], 4); + } + + intel_uncore_write_fw(gt->uncore, GEN6_RP_CONTROL, 0); + intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL); + + igt_spinner_end(&spin); + st_engine_heartbeat_enable(engine); + + if (err == 0) { + u64 time = intel_gt_pm_interval_to_ns(gt, cycles); + u32 expected = + intel_gt_ns_to_pm_interval(gt, dt); + + pr_info("%s: rps counted %d C0 cycles [%lldns] in %lldns [%d cycles], using GT clock frequency of %uKHz\n", + engine->name, cycles, time, dt, expected, + gt->clock_frequency / 1000); + + if (10 * time < 8 * dt || + 8 * time > 10 * dt) { + pr_err("%s: rps clock time does not match walltime!\n", + engine->name); + err = -EINVAL; + } + + if (10 * expected < 8 * cycles || + 8 * expected > 10 * cycles) { + pr_err("%s: walltime does not match rps clock ticks!\n", + engine->name); + err = -EINVAL; + } + } + + if (igt_flush_test(gt->i915)) + err = -EIO; + + break; /* once is enough */ + } + + intel_rps_enable(>->rps); + intel_gt_pm_put(gt); + + igt_spinner_fini(&spin); + + intel_gt_pm_wait_for_idle(gt); + rps->work.func = saved_work; + + if (err == -ENODEV) /* skipped, don't report a fail */ + err = 0; + + return err; +} + +int live_rps_control(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_rps *rps = >->rps; + void (*saved_work)(struct work_struct *wrk); + struct intel_engine_cs *engine; + enum intel_engine_id id; + struct igt_spinner spin; + int err = 0; + + /* + * Check that the actual frequency matches our requested frequency, + * to verify our control mechanism. We have to be careful that the + * PCU may throttle the GPU in which case the actual frequency used + * will be lowered than requested. + */ + + if (!intel_rps_is_enabled(rps)) + return 0; + + if (IS_CHERRYVIEW(gt->i915)) /* XXX fragile PCU */ + return 0; + + if (igt_spinner_init(&spin, gt)) + return -ENOMEM; + + intel_gt_pm_wait_for_idle(gt); + saved_work = rps->work.func; + rps->work.func = dummy_rps_work; + + intel_gt_pm_get(gt); + for_each_engine(engine, gt, id) { + struct i915_request *rq; + ktime_t min_dt, max_dt; + int f, limit; + int min, max; + + if (!intel_engine_can_store_dword(engine)) + continue; + + st_engine_heartbeat_disable(engine); + + rq = igt_spinner_create_request(&spin, + engine->kernel_context, + MI_NOOP); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + break; + } + + i915_request_add(rq); + + if (!igt_wait_for_spinner(&spin, rq)) { + pr_err("%s: RPS spinner did not start\n", + engine->name); + igt_spinner_end(&spin); + st_engine_heartbeat_enable(engine); + intel_gt_set_wedged(engine->gt); + err = -EIO; + break; + } + + if (rps_set_check(rps, rps->min_freq) != rps->min_freq) { + pr_err("%s: could not set minimum frequency [%x], only %x!\n", + engine->name, rps->min_freq, read_cagf(rps)); + igt_spinner_end(&spin); + st_engine_heartbeat_enable(engine); + show_pstate_limits(rps); + err = -EINVAL; + break; + } + + for (f = rps->min_freq + 1; f < rps->max_freq; f++) { + if (rps_set_check(rps, f) < f) + break; + } + + limit = rps_set_check(rps, f); + + if (rps_set_check(rps, rps->min_freq) != rps->min_freq) { + pr_err("%s: could not restore minimum frequency [%x], only %x!\n", + engine->name, rps->min_freq, read_cagf(rps)); + igt_spinner_end(&spin); + st_engine_heartbeat_enable(engine); + show_pstate_limits(rps); + err = -EINVAL; + break; + } + + max_dt = ktime_get(); + max = rps_set_check(rps, limit); + max_dt = ktime_sub(ktime_get(), max_dt); + + min_dt = ktime_get(); + min = rps_set_check(rps, rps->min_freq); + min_dt = ktime_sub(ktime_get(), min_dt); + + igt_spinner_end(&spin); + st_engine_heartbeat_enable(engine); + + pr_info("%s: range:[%x:%uMHz, %x:%uMHz] limit:[%x:%uMHz], %x:%x response %lluns:%lluns\n", + engine->name, + rps->min_freq, intel_gpu_freq(rps, rps->min_freq), + rps->max_freq, intel_gpu_freq(rps, rps->max_freq), + limit, intel_gpu_freq(rps, limit), + min, max, ktime_to_ns(min_dt), ktime_to_ns(max_dt)); + + if (limit == rps->min_freq) { + pr_err("%s: GPU throttled to minimum!\n", + engine->name); + show_pstate_limits(rps); + err = -ENODEV; + break; + } + + if (igt_flush_test(gt->i915)) { + err = -EIO; + break; + } + } + intel_gt_pm_put(gt); + + igt_spinner_fini(&spin); + + intel_gt_pm_wait_for_idle(gt); + rps->work.func = saved_work; + + return err; +} + +static void show_pcu_config(struct intel_rps *rps) +{ + struct drm_i915_private *i915 = rps_to_i915(rps); + unsigned int max_gpu_freq, min_gpu_freq; + intel_wakeref_t wakeref; + int gpu_freq; + + if (!HAS_LLC(i915)) + return; + + min_gpu_freq = rps->min_freq; + max_gpu_freq = rps->max_freq; + if (INTEL_GEN(i915) >= 9) { + /* Convert GT frequency to 50 HZ units */ + min_gpu_freq /= GEN9_FREQ_SCALER; + max_gpu_freq /= GEN9_FREQ_SCALER; + } + + wakeref = intel_runtime_pm_get(rps_to_uncore(rps)->rpm); + + pr_info("%5s %5s %5s\n", "GPU", "eCPU", "eRing"); + for (gpu_freq = min_gpu_freq; gpu_freq <= max_gpu_freq; gpu_freq++) { + int ia_freq = gpu_freq; + + sandybridge_pcode_read(i915, + GEN6_PCODE_READ_MIN_FREQ_TABLE, + &ia_freq, NULL); + + pr_info("%5d %5d %5d\n", + gpu_freq * 50, + ((ia_freq >> 0) & 0xff) * 100, + ((ia_freq >> 8) & 0xff) * 100); + } + + intel_runtime_pm_put(rps_to_uncore(rps)->rpm, wakeref); +} + +static u64 __measure_frequency(u32 *cntr, int duration_ms) +{ + u64 dc, dt; + + dt = ktime_get(); + dc = READ_ONCE(*cntr); + usleep_range(1000 * duration_ms, 2000 * duration_ms); + dc = READ_ONCE(*cntr) - dc; + dt = ktime_get() - dt; + + return div64_u64(1000 * 1000 * dc, dt); +} + +static u64 measure_frequency_at(struct intel_rps *rps, u32 *cntr, int *freq) +{ + u64 x[5]; + int i; + + *freq = rps_set_check(rps, *freq); + for (i = 0; i < 5; i++) + x[i] = __measure_frequency(cntr, 2); + *freq = (*freq + read_cagf(rps)) / 2; + + /* A simple triangle filter for better result stability */ + sort(x, 5, sizeof(*x), cmp_u64, NULL); + return div_u64(x[1] + 2 * x[2] + x[3], 4); +} + +static u64 __measure_cs_frequency(struct intel_engine_cs *engine, + int duration_ms) +{ + u64 dc, dt; + + dt = ktime_get(); + dc = intel_uncore_read_fw(engine->uncore, CS_GPR(0)); + usleep_range(1000 * duration_ms, 2000 * duration_ms); + dc = intel_uncore_read_fw(engine->uncore, CS_GPR(0)) - dc; + dt = ktime_get() - dt; + + return div64_u64(1000 * 1000 * dc, dt); +} + +static u64 measure_cs_frequency_at(struct intel_rps *rps, + struct intel_engine_cs *engine, + int *freq) +{ + u64 x[5]; + int i; + + *freq = rps_set_check(rps, *freq); + for (i = 0; i < 5; i++) + x[i] = __measure_cs_frequency(engine, 2); + *freq = (*freq + read_cagf(rps)) / 2; + + /* A simple triangle filter for better result stability */ + sort(x, 5, sizeof(*x), cmp_u64, NULL); + return div_u64(x[1] + 2 * x[2] + x[3], 4); +} + +static bool scaled_within(u64 x, u64 y, u32 f_n, u32 f_d) +{ + return f_d * x > f_n * y && f_n * x < f_d * y; +} + +int live_rps_frequency_cs(void *arg) +{ + void (*saved_work)(struct work_struct *wrk); + struct intel_gt *gt = arg; + struct intel_rps *rps = >->rps; + struct intel_engine_cs *engine; + struct pm_qos_request qos; + enum intel_engine_id id; + int err = 0; + + /* + * The premise is that the GPU does change freqency at our behest. + * Let's check there is a correspondence between the requested + * frequency, the actual frequency, and the observed clock rate. + */ + + if (!intel_rps_is_enabled(rps)) + return 0; + + if (INTEL_GEN(gt->i915) < 8) /* for CS simplicity */ + return 0; + + if (CPU_LATENCY >= 0) + cpu_latency_qos_add_request(&qos, CPU_LATENCY); + + intel_gt_pm_wait_for_idle(gt); + saved_work = rps->work.func; + rps->work.func = dummy_rps_work; + + for_each_engine(engine, gt, id) { + struct i915_request *rq; + struct i915_vma *vma; + u32 *cancel, *cntr; + struct { + u64 count; + int freq; + } min, max; + + st_engine_heartbeat_disable(engine); + + vma = create_spin_counter(engine, + engine->kernel_context->vm, false, + &cancel, &cntr); + if (IS_ERR(vma)) { + err = PTR_ERR(vma); + st_engine_heartbeat_enable(engine); + break; + } + + rq = intel_engine_create_kernel_request(engine); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_vma; + } + + err = i915_request_await_object(rq, vma->obj, false); + if (!err) + err = i915_vma_move_to_active(vma, rq, 0); + if (!err) + err = rq->engine->emit_bb_start(rq, + vma->node.start, + PAGE_SIZE, 0); + i915_request_add(rq); + if (err) + goto err_vma; + + if (wait_for(intel_uncore_read(engine->uncore, CS_GPR(0)), + 10)) { + pr_err("%s: timed loop did not start\n", + engine->name); + goto err_vma; + } + + min.freq = rps->min_freq; + min.count = measure_cs_frequency_at(rps, engine, &min.freq); + + max.freq = rps->max_freq; + max.count = measure_cs_frequency_at(rps, engine, &max.freq); + + pr_info("%s: min:%lluKHz @ %uMHz, max:%lluKHz @ %uMHz [%d%%]\n", + engine->name, + min.count, intel_gpu_freq(rps, min.freq), + max.count, intel_gpu_freq(rps, max.freq), + (int)DIV64_U64_ROUND_CLOSEST(100 * min.freq * max.count, + max.freq * min.count)); + + if (!scaled_within(max.freq * min.count, + min.freq * max.count, + 2, 3)) { + int f; + + pr_err("%s: CS did not scale with frequency! scaled min:%llu, max:%llu\n", + engine->name, + max.freq * min.count, + min.freq * max.count); + show_pcu_config(rps); + + for (f = min.freq + 1; f <= rps->max_freq; f++) { + int act = f; + u64 count; + + count = measure_cs_frequency_at(rps, engine, &act); + if (act < f) + break; + + pr_info("%s: %x:%uMHz: %lluKHz [%d%%]\n", + engine->name, + act, intel_gpu_freq(rps, act), count, + (int)DIV64_U64_ROUND_CLOSEST(100 * min.freq * count, + act * min.count)); + + f = act; /* may skip ahead [pcu granularity] */ + } + + err = -EINTR; /* ignore error, continue on with test */ + } + +err_vma: + *cancel = MI_BATCH_BUFFER_END; + i915_gem_object_flush_map(vma->obj); + i915_gem_object_unpin_map(vma->obj); + i915_vma_unpin(vma); + i915_vma_unlock(vma); + i915_vma_put(vma); + + st_engine_heartbeat_enable(engine); + if (igt_flush_test(gt->i915)) + err = -EIO; + if (err) + break; + } + + intel_gt_pm_wait_for_idle(gt); + rps->work.func = saved_work; + + if (CPU_LATENCY >= 0) + cpu_latency_qos_remove_request(&qos); + + return err; +} + +int live_rps_frequency_srm(void *arg) +{ + void (*saved_work)(struct work_struct *wrk); + struct intel_gt *gt = arg; + struct intel_rps *rps = >->rps; + struct intel_engine_cs *engine; + struct pm_qos_request qos; + enum intel_engine_id id; + int err = 0; + + /* + * The premise is that the GPU does change freqency at our behest. + * Let's check there is a correspondence between the requested + * frequency, the actual frequency, and the observed clock rate. + */ + + if (!intel_rps_is_enabled(rps)) + return 0; + + if (INTEL_GEN(gt->i915) < 8) /* for CS simplicity */ + return 0; + + if (CPU_LATENCY >= 0) + cpu_latency_qos_add_request(&qos, CPU_LATENCY); + + intel_gt_pm_wait_for_idle(gt); + saved_work = rps->work.func; + rps->work.func = dummy_rps_work; + + for_each_engine(engine, gt, id) { + struct i915_request *rq; + struct i915_vma *vma; + u32 *cancel, *cntr; + struct { + u64 count; + int freq; + } min, max; + + st_engine_heartbeat_disable(engine); + + vma = create_spin_counter(engine, + engine->kernel_context->vm, true, + &cancel, &cntr); + if (IS_ERR(vma)) { + err = PTR_ERR(vma); + st_engine_heartbeat_enable(engine); + break; + } + + rq = intel_engine_create_kernel_request(engine); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_vma; + } + + err = i915_request_await_object(rq, vma->obj, false); + if (!err) + err = i915_vma_move_to_active(vma, rq, 0); + if (!err) + err = rq->engine->emit_bb_start(rq, + vma->node.start, + PAGE_SIZE, 0); + i915_request_add(rq); + if (err) + goto err_vma; + + if (wait_for(READ_ONCE(*cntr), 10)) { + pr_err("%s: timed loop did not start\n", + engine->name); + goto err_vma; + } + + min.freq = rps->min_freq; + min.count = measure_frequency_at(rps, cntr, &min.freq); + + max.freq = rps->max_freq; + max.count = measure_frequency_at(rps, cntr, &max.freq); + + pr_info("%s: min:%lluKHz @ %uMHz, max:%lluKHz @ %uMHz [%d%%]\n", + engine->name, + min.count, intel_gpu_freq(rps, min.freq), + max.count, intel_gpu_freq(rps, max.freq), + (int)DIV64_U64_ROUND_CLOSEST(100 * min.freq * max.count, + max.freq * min.count)); + + if (!scaled_within(max.freq * min.count, + min.freq * max.count, + 1, 2)) { + int f; + + pr_err("%s: CS did not scale with frequency! scaled min:%llu, max:%llu\n", + engine->name, + max.freq * min.count, + min.freq * max.count); + show_pcu_config(rps); + + for (f = min.freq + 1; f <= rps->max_freq; f++) { + int act = f; + u64 count; + + count = measure_frequency_at(rps, cntr, &act); + if (act < f) + break; + + pr_info("%s: %x:%uMHz: %lluKHz [%d%%]\n", + engine->name, + act, intel_gpu_freq(rps, act), count, + (int)DIV64_U64_ROUND_CLOSEST(100 * min.freq * count, + act * min.count)); + + f = act; /* may skip ahead [pcu granularity] */ + } + + err = -EINTR; /* ignore error, continue on with test */ + } + +err_vma: + *cancel = MI_BATCH_BUFFER_END; + i915_gem_object_flush_map(vma->obj); + i915_gem_object_unpin_map(vma->obj); + i915_vma_unpin(vma); + i915_vma_unlock(vma); + i915_vma_put(vma); + + st_engine_heartbeat_enable(engine); + if (igt_flush_test(gt->i915)) + err = -EIO; + if (err) + break; + } + + intel_gt_pm_wait_for_idle(gt); + rps->work.func = saved_work; + + if (CPU_LATENCY >= 0) + cpu_latency_qos_remove_request(&qos); + + return err; +} + +static void sleep_for_ei(struct intel_rps *rps, int timeout_us) +{ + /* Flush any previous EI */ + usleep_range(timeout_us, 2 * timeout_us); + + /* Reset the interrupt status */ + rps_disable_interrupts(rps); + GEM_BUG_ON(rps->pm_iir); + rps_enable_interrupts(rps); + + /* And then wait for the timeout, for real this time */ + usleep_range(2 * timeout_us, 3 * timeout_us); +} + +static int __rps_up_interrupt(struct intel_rps *rps, + struct intel_engine_cs *engine, + struct igt_spinner *spin) +{ + struct intel_uncore *uncore = engine->uncore; + struct i915_request *rq; + u32 timeout; + + if (!intel_engine_can_store_dword(engine)) + return 0; + + rps_set_check(rps, rps->min_freq); + + rq = igt_spinner_create_request(spin, engine->kernel_context, MI_NOOP); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + i915_request_get(rq); + i915_request_add(rq); + + if (!igt_wait_for_spinner(spin, rq)) { + pr_err("%s: RPS spinner did not start\n", + engine->name); + i915_request_put(rq); + intel_gt_set_wedged(engine->gt); + return -EIO; + } + + if (!intel_rps_is_active(rps)) { + pr_err("%s: RPS not enabled on starting spinner\n", + engine->name); + igt_spinner_end(spin); + i915_request_put(rq); + return -EINVAL; + } + + if (!(rps->pm_events & GEN6_PM_RP_UP_THRESHOLD)) { + pr_err("%s: RPS did not register UP interrupt\n", + engine->name); + i915_request_put(rq); + return -EINVAL; + } + + if (rps->last_freq != rps->min_freq) { + pr_err("%s: RPS did not program min frequency\n", + engine->name); + i915_request_put(rq); + return -EINVAL; + } + + timeout = intel_uncore_read(uncore, GEN6_RP_UP_EI); + timeout = intel_gt_pm_interval_to_ns(engine->gt, timeout); + timeout = DIV_ROUND_UP(timeout, 1000); + + sleep_for_ei(rps, timeout); + GEM_BUG_ON(i915_request_completed(rq)); + + igt_spinner_end(spin); + i915_request_put(rq); + + if (rps->cur_freq != rps->min_freq) { + pr_err("%s: Frequency unexpectedly changed [up], now %d!\n", + engine->name, intel_rps_read_actual_frequency(rps)); + return -EINVAL; + } + + if (!(rps->pm_iir & GEN6_PM_RP_UP_THRESHOLD)) { + pr_err("%s: UP interrupt not recorded for spinner, pm_iir:%x, prev_up:%x, up_threshold:%x, up_ei:%x\n", + engine->name, rps->pm_iir, + intel_uncore_read(uncore, GEN6_RP_PREV_UP), + intel_uncore_read(uncore, GEN6_RP_UP_THRESHOLD), + intel_uncore_read(uncore, GEN6_RP_UP_EI)); + return -EINVAL; + } + + return 0; +} + +static int __rps_down_interrupt(struct intel_rps *rps, + struct intel_engine_cs *engine) +{ + struct intel_uncore *uncore = engine->uncore; + u32 timeout; + + rps_set_check(rps, rps->max_freq); + + if (!(rps->pm_events & GEN6_PM_RP_DOWN_THRESHOLD)) { + pr_err("%s: RPS did not register DOWN interrupt\n", + engine->name); + return -EINVAL; + } + + if (rps->last_freq != rps->max_freq) { + pr_err("%s: RPS did not program max frequency\n", + engine->name); + return -EINVAL; + } + + timeout = intel_uncore_read(uncore, GEN6_RP_DOWN_EI); + timeout = intel_gt_pm_interval_to_ns(engine->gt, timeout); + timeout = DIV_ROUND_UP(timeout, 1000); + + sleep_for_ei(rps, timeout); + + if (rps->cur_freq != rps->max_freq) { + pr_err("%s: Frequency unexpectedly changed [down], now %d!\n", + engine->name, + intel_rps_read_actual_frequency(rps)); + return -EINVAL; + } + + if (!(rps->pm_iir & (GEN6_PM_RP_DOWN_THRESHOLD | GEN6_PM_RP_DOWN_TIMEOUT))) { + pr_err("%s: DOWN interrupt not recorded for idle, pm_iir:%x, prev_down:%x, down_threshold:%x, down_ei:%x [prev_up:%x, up_threshold:%x, up_ei:%x]\n", + engine->name, rps->pm_iir, + intel_uncore_read(uncore, GEN6_RP_PREV_DOWN), + intel_uncore_read(uncore, GEN6_RP_DOWN_THRESHOLD), + intel_uncore_read(uncore, GEN6_RP_DOWN_EI), + intel_uncore_read(uncore, GEN6_RP_PREV_UP), + intel_uncore_read(uncore, GEN6_RP_UP_THRESHOLD), + intel_uncore_read(uncore, GEN6_RP_UP_EI)); + return -EINVAL; + } + + return 0; +} + +int live_rps_interrupt(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_rps *rps = >->rps; + void (*saved_work)(struct work_struct *wrk); + struct intel_engine_cs *engine; + enum intel_engine_id id; + struct igt_spinner spin; + u32 pm_events; + int err = 0; + + /* + * First, let's check whether or not we are receiving interrupts. + */ + + if (!intel_rps_has_interrupts(rps)) + return 0; + + intel_gt_pm_get(gt); + pm_events = rps->pm_events; + intel_gt_pm_put(gt); + if (!pm_events) { + pr_err("No RPS PM events registered, but RPS is enabled?\n"); + return -ENODEV; + } + + if (igt_spinner_init(&spin, gt)) + return -ENOMEM; + + intel_gt_pm_wait_for_idle(gt); + saved_work = rps->work.func; + rps->work.func = dummy_rps_work; + + for_each_engine(engine, gt, id) { + /* Keep the engine busy with a spinner; expect an UP! */ + if (pm_events & GEN6_PM_RP_UP_THRESHOLD) { + intel_gt_pm_wait_for_idle(engine->gt); + GEM_BUG_ON(intel_rps_is_active(rps)); + + st_engine_heartbeat_disable(engine); + + err = __rps_up_interrupt(rps, engine, &spin); + + st_engine_heartbeat_enable(engine); + if (err) + goto out; + + intel_gt_pm_wait_for_idle(engine->gt); + } + + /* Keep the engine awake but idle and check for DOWN */ + if (pm_events & GEN6_PM_RP_DOWN_THRESHOLD) { + st_engine_heartbeat_disable(engine); + intel_rc6_disable(>->rc6); + + err = __rps_down_interrupt(rps, engine); + + intel_rc6_enable(>->rc6); + st_engine_heartbeat_enable(engine); + if (err) + goto out; + } + } + +out: + if (igt_flush_test(gt->i915)) + err = -EIO; + + igt_spinner_fini(&spin); + + intel_gt_pm_wait_for_idle(gt); + rps->work.func = saved_work; + + return err; +} + +static u64 __measure_power(int duration_ms) +{ + u64 dE, dt; + + dt = ktime_get(); + dE = librapl_energy_uJ(); + usleep_range(1000 * duration_ms, 2000 * duration_ms); + dE = librapl_energy_uJ() - dE; + dt = ktime_get() - dt; + + return div64_u64(1000 * 1000 * dE, dt); +} + +static u64 measure_power_at(struct intel_rps *rps, int *freq) +{ + u64 x[5]; + int i; + + *freq = rps_set_check(rps, *freq); + for (i = 0; i < 5; i++) + x[i] = __measure_power(5); + *freq = (*freq + read_cagf(rps)) / 2; + + /* A simple triangle filter for better result stability */ + sort(x, 5, sizeof(*x), cmp_u64, NULL); + return div_u64(x[1] + 2 * x[2] + x[3], 4); +} + +int live_rps_power(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_rps *rps = >->rps; + void (*saved_work)(struct work_struct *wrk); + struct intel_engine_cs *engine; + enum intel_engine_id id; + struct igt_spinner spin; + int err = 0; + + /* + * Our fundamental assumption is that running at lower frequency + * actually saves power. Let's see if our RAPL measurement support + * that theory. + */ + + if (!intel_rps_is_enabled(rps)) + return 0; + + if (!librapl_energy_uJ()) + return 0; + + if (igt_spinner_init(&spin, gt)) + return -ENOMEM; + + intel_gt_pm_wait_for_idle(gt); + saved_work = rps->work.func; + rps->work.func = dummy_rps_work; + + for_each_engine(engine, gt, id) { + struct i915_request *rq; + struct { + u64 power; + int freq; + } min, max; + + if (!intel_engine_can_store_dword(engine)) + continue; + + st_engine_heartbeat_disable(engine); + + rq = igt_spinner_create_request(&spin, + engine->kernel_context, + MI_NOOP); + if (IS_ERR(rq)) { + st_engine_heartbeat_enable(engine); + err = PTR_ERR(rq); + break; + } + + i915_request_add(rq); + + if (!igt_wait_for_spinner(&spin, rq)) { + pr_err("%s: RPS spinner did not start\n", + engine->name); + igt_spinner_end(&spin); + st_engine_heartbeat_enable(engine); + intel_gt_set_wedged(engine->gt); + err = -EIO; + break; + } + + max.freq = rps->max_freq; + max.power = measure_power_at(rps, &max.freq); + + min.freq = rps->min_freq; + min.power = measure_power_at(rps, &min.freq); + + igt_spinner_end(&spin); + st_engine_heartbeat_enable(engine); + + pr_info("%s: min:%llumW @ %uMHz, max:%llumW @ %uMHz\n", + engine->name, + min.power, intel_gpu_freq(rps, min.freq), + max.power, intel_gpu_freq(rps, max.freq)); + + if (10 * min.freq >= 9 * max.freq) { + pr_notice("Could not control frequency, ran at [%d:%uMHz, %d:%uMhz]\n", + min.freq, intel_gpu_freq(rps, min.freq), + max.freq, intel_gpu_freq(rps, max.freq)); + continue; + } + + if (11 * min.power > 10 * max.power) { + pr_err("%s: did not conserve power when setting lower frequency!\n", + engine->name); + err = -EINVAL; + break; + } + + if (igt_flush_test(gt->i915)) { + err = -EIO; + break; + } + } + + igt_spinner_fini(&spin); + + intel_gt_pm_wait_for_idle(gt); + rps->work.func = saved_work; + + return err; +} + +int live_rps_dynamic(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_rps *rps = >->rps; + struct intel_engine_cs *engine; + enum intel_engine_id id; + struct igt_spinner spin; + int err = 0; + + /* + * We've looked at the bascs, and have established that we + * can change the clock frequency and that the HW will generate + * interrupts based on load. Now we check how we integrate those + * moving parts into dynamic reclocking based on load. + */ + + if (!intel_rps_is_enabled(rps)) + return 0; + + if (igt_spinner_init(&spin, gt)) + return -ENOMEM; + + if (intel_rps_has_interrupts(rps)) + pr_info("RPS has interrupt support\n"); + if (intel_rps_uses_timer(rps)) + pr_info("RPS has timer support\n"); + + for_each_engine(engine, gt, id) { + struct i915_request *rq; + struct { + ktime_t dt; + u8 freq; + } min, max; + + if (!intel_engine_can_store_dword(engine)) + continue; + + intel_gt_pm_wait_for_idle(gt); + GEM_BUG_ON(intel_rps_is_active(rps)); + rps->cur_freq = rps->min_freq; + + intel_engine_pm_get(engine); + intel_rc6_disable(>->rc6); + GEM_BUG_ON(rps->last_freq != rps->min_freq); + + rq = igt_spinner_create_request(&spin, + engine->kernel_context, + MI_NOOP); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err; + } + + i915_request_add(rq); + + max.dt = ktime_get(); + max.freq = wait_for_freq(rps, rps->max_freq, 500); + max.dt = ktime_sub(ktime_get(), max.dt); + + igt_spinner_end(&spin); + + min.dt = ktime_get(); + min.freq = wait_for_freq(rps, rps->min_freq, 2000); + min.dt = ktime_sub(ktime_get(), min.dt); + + pr_info("%s: dynamically reclocked to %u:%uMHz while busy in %lluns, and %u:%uMHz while idle in %lluns\n", + engine->name, + max.freq, intel_gpu_freq(rps, max.freq), + ktime_to_ns(max.dt), + min.freq, intel_gpu_freq(rps, min.freq), + ktime_to_ns(min.dt)); + if (min.freq >= max.freq) { + pr_err("%s: dynamic reclocking of spinner failed\n!", + engine->name); + err = -EINVAL; + } + +err: + intel_rc6_enable(>->rc6); + intel_engine_pm_put(engine); + + if (igt_flush_test(gt->i915)) + err = -EIO; + if (err) + break; + } + + igt_spinner_fini(&spin); + + return err; +} diff --git a/drivers/gpu/drm/i915/gt/selftest_rps.h b/drivers/gpu/drm/i915/gt/selftest_rps.h new file mode 100644 index 000000000..6e82a631c --- /dev/null +++ b/drivers/gpu/drm/i915/gt/selftest_rps.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2020 Intel Corporation + */ + +#ifndef SELFTEST_RPS_H +#define SELFTEST_RPS_H + +int live_rps_control(void *arg); +int live_rps_clock_interval(void *arg); +int live_rps_frequency_cs(void *arg); +int live_rps_frequency_srm(void *arg); +int live_rps_power(void *arg); +int live_rps_interrupt(void *arg); +int live_rps_dynamic(void *arg); + +#endif /* SELFTEST_RPS_H */ diff --git a/drivers/gpu/drm/i915/gt/selftest_timeline.c b/drivers/gpu/drm/i915/gt/selftest_timeline.c new file mode 100644 index 000000000..19c2cb166 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/selftest_timeline.c @@ -0,0 +1,1009 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2017-2018 Intel Corporation + */ + +#include <linux/prime_numbers.h> + +#include "intel_context.h" +#include "intel_engine_heartbeat.h" +#include "intel_engine_pm.h" +#include "intel_gt.h" +#include "intel_gt_requests.h" +#include "intel_ring.h" +#include "selftest_engine_heartbeat.h" + +#include "../selftests/i915_random.h" +#include "../i915_selftest.h" + +#include "../selftests/igt_flush_test.h" +#include "../selftests/mock_gem_device.h" +#include "selftests/mock_timeline.h" + +static struct page *hwsp_page(struct intel_timeline *tl) +{ + struct drm_i915_gem_object *obj = tl->hwsp_ggtt->obj; + + GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj)); + return sg_page(obj->mm.pages->sgl); +} + +static unsigned long hwsp_cacheline(struct intel_timeline *tl) +{ + unsigned long address = (unsigned long)page_address(hwsp_page(tl)); + + return (address + tl->hwsp_offset) / CACHELINE_BYTES; +} + +#define CACHELINES_PER_PAGE (PAGE_SIZE / CACHELINE_BYTES) + +struct mock_hwsp_freelist { + struct intel_gt *gt; + struct radix_tree_root cachelines; + struct intel_timeline **history; + unsigned long count, max; + struct rnd_state prng; +}; + +enum { + SHUFFLE = BIT(0), +}; + +static void __mock_hwsp_record(struct mock_hwsp_freelist *state, + unsigned int idx, + struct intel_timeline *tl) +{ + tl = xchg(&state->history[idx], tl); + if (tl) { + radix_tree_delete(&state->cachelines, hwsp_cacheline(tl)); + intel_timeline_put(tl); + } +} + +static int __mock_hwsp_timeline(struct mock_hwsp_freelist *state, + unsigned int count, + unsigned int flags) +{ + struct intel_timeline *tl; + unsigned int idx; + + while (count--) { + unsigned long cacheline; + int err; + + tl = intel_timeline_create(state->gt); + if (IS_ERR(tl)) + return PTR_ERR(tl); + + cacheline = hwsp_cacheline(tl); + err = radix_tree_insert(&state->cachelines, cacheline, tl); + if (err) { + if (err == -EEXIST) { + pr_err("HWSP cacheline %lu already used; duplicate allocation!\n", + cacheline); + } + intel_timeline_put(tl); + return err; + } + + idx = state->count++ % state->max; + __mock_hwsp_record(state, idx, tl); + } + + if (flags & SHUFFLE) + i915_prandom_shuffle(state->history, + sizeof(*state->history), + min(state->count, state->max), + &state->prng); + + count = i915_prandom_u32_max_state(min(state->count, state->max), + &state->prng); + while (count--) { + idx = --state->count % state->max; + __mock_hwsp_record(state, idx, NULL); + } + + return 0; +} + +static int mock_hwsp_freelist(void *arg) +{ + struct mock_hwsp_freelist state; + struct drm_i915_private *i915; + const struct { + const char *name; + unsigned int flags; + } phases[] = { + { "linear", 0 }, + { "shuffled", SHUFFLE }, + { }, + }, *p; + unsigned int na; + int err = 0; + + i915 = mock_gem_device(); + if (!i915) + return -ENOMEM; + + INIT_RADIX_TREE(&state.cachelines, GFP_KERNEL); + state.prng = I915_RND_STATE_INITIALIZER(i915_selftest.random_seed); + + state.gt = &i915->gt; + + /* + * Create a bunch of timelines and check that their HWSP do not overlap. + * Free some, and try again. + */ + + state.max = PAGE_SIZE / sizeof(*state.history); + state.count = 0; + state.history = kcalloc(state.max, sizeof(*state.history), GFP_KERNEL); + if (!state.history) { + err = -ENOMEM; + goto err_put; + } + + for (p = phases; p->name; p++) { + pr_debug("%s(%s)\n", __func__, p->name); + for_each_prime_number_from(na, 1, 2 * CACHELINES_PER_PAGE) { + err = __mock_hwsp_timeline(&state, na, p->flags); + if (err) + goto out; + } + } + +out: + for (na = 0; na < state.max; na++) + __mock_hwsp_record(&state, na, NULL); + kfree(state.history); +err_put: + mock_destroy_device(i915); + return err; +} + +struct __igt_sync { + const char *name; + u32 seqno; + bool expected; + bool set; +}; + +static int __igt_sync(struct intel_timeline *tl, + u64 ctx, + const struct __igt_sync *p, + const char *name) +{ + int ret; + + if (__intel_timeline_sync_is_later(tl, ctx, p->seqno) != p->expected) { + pr_err("%s: %s(ctx=%llu, seqno=%u) expected passed %s but failed\n", + name, p->name, ctx, p->seqno, yesno(p->expected)); + return -EINVAL; + } + + if (p->set) { + ret = __intel_timeline_sync_set(tl, ctx, p->seqno); + if (ret) + return ret; + } + + return 0; +} + +static int igt_sync(void *arg) +{ + const struct __igt_sync pass[] = { + { "unset", 0, false, false }, + { "new", 0, false, true }, + { "0a", 0, true, true }, + { "1a", 1, false, true }, + { "1b", 1, true, true }, + { "0b", 0, true, false }, + { "2a", 2, false, true }, + { "4", 4, false, true }, + { "INT_MAX", INT_MAX, false, true }, + { "INT_MAX-1", INT_MAX-1, true, false }, + { "INT_MAX+1", (u32)INT_MAX+1, false, true }, + { "INT_MAX", INT_MAX, true, false }, + { "UINT_MAX", UINT_MAX, false, true }, + { "wrap", 0, false, true }, + { "unwrap", UINT_MAX, true, false }, + {}, + }, *p; + struct intel_timeline tl; + int order, offset; + int ret = -ENODEV; + + mock_timeline_init(&tl, 0); + for (p = pass; p->name; p++) { + for (order = 1; order < 64; order++) { + for (offset = -1; offset <= (order > 1); offset++) { + u64 ctx = BIT_ULL(order) + offset; + + ret = __igt_sync(&tl, ctx, p, "1"); + if (ret) + goto out; + } + } + } + mock_timeline_fini(&tl); + + mock_timeline_init(&tl, 0); + for (order = 1; order < 64; order++) { + for (offset = -1; offset <= (order > 1); offset++) { + u64 ctx = BIT_ULL(order) + offset; + + for (p = pass; p->name; p++) { + ret = __igt_sync(&tl, ctx, p, "2"); + if (ret) + goto out; + } + } + } + +out: + mock_timeline_fini(&tl); + return ret; +} + +static unsigned int random_engine(struct rnd_state *rnd) +{ + return i915_prandom_u32_max_state(I915_NUM_ENGINES, rnd); +} + +static int bench_sync(void *arg) +{ + struct rnd_state prng; + struct intel_timeline tl; + unsigned long end_time, count; + u64 prng32_1M; + ktime_t kt; + int order, last_order; + + mock_timeline_init(&tl, 0); + + /* Lookups from cache are very fast and so the random number generation + * and the loop itself becomes a significant factor in the per-iteration + * timings. We try to compensate the results by measuring the overhead + * of the prng and subtract it from the reported results. + */ + prandom_seed_state(&prng, i915_selftest.random_seed); + count = 0; + kt = ktime_get(); + end_time = jiffies + HZ/10; + do { + u32 x; + + /* Make sure the compiler doesn't optimise away the prng call */ + WRITE_ONCE(x, prandom_u32_state(&prng)); + + count++; + } while (!time_after(jiffies, end_time)); + kt = ktime_sub(ktime_get(), kt); + pr_debug("%s: %lu random evaluations, %lluns/prng\n", + __func__, count, (long long)div64_ul(ktime_to_ns(kt), count)); + prng32_1M = div64_ul(ktime_to_ns(kt) << 20, count); + + /* Benchmark (only) setting random context ids */ + prandom_seed_state(&prng, i915_selftest.random_seed); + count = 0; + kt = ktime_get(); + end_time = jiffies + HZ/10; + do { + u64 id = i915_prandom_u64_state(&prng); + + __intel_timeline_sync_set(&tl, id, 0); + count++; + } while (!time_after(jiffies, end_time)); + kt = ktime_sub(ktime_get(), kt); + kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20); + pr_info("%s: %lu random insertions, %lluns/insert\n", + __func__, count, (long long)div64_ul(ktime_to_ns(kt), count)); + + /* Benchmark looking up the exact same context ids as we just set */ + prandom_seed_state(&prng, i915_selftest.random_seed); + end_time = count; + kt = ktime_get(); + while (end_time--) { + u64 id = i915_prandom_u64_state(&prng); + + if (!__intel_timeline_sync_is_later(&tl, id, 0)) { + mock_timeline_fini(&tl); + pr_err("Lookup of %llu failed\n", id); + return -EINVAL; + } + } + kt = ktime_sub(ktime_get(), kt); + kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20); + pr_info("%s: %lu random lookups, %lluns/lookup\n", + __func__, count, (long long)div64_ul(ktime_to_ns(kt), count)); + + mock_timeline_fini(&tl); + cond_resched(); + + mock_timeline_init(&tl, 0); + + /* Benchmark setting the first N (in order) contexts */ + count = 0; + kt = ktime_get(); + end_time = jiffies + HZ/10; + do { + __intel_timeline_sync_set(&tl, count++, 0); + } while (!time_after(jiffies, end_time)); + kt = ktime_sub(ktime_get(), kt); + pr_info("%s: %lu in-order insertions, %lluns/insert\n", + __func__, count, (long long)div64_ul(ktime_to_ns(kt), count)); + + /* Benchmark looking up the exact same context ids as we just set */ + end_time = count; + kt = ktime_get(); + while (end_time--) { + if (!__intel_timeline_sync_is_later(&tl, end_time, 0)) { + pr_err("Lookup of %lu failed\n", end_time); + mock_timeline_fini(&tl); + return -EINVAL; + } + } + kt = ktime_sub(ktime_get(), kt); + pr_info("%s: %lu in-order lookups, %lluns/lookup\n", + __func__, count, (long long)div64_ul(ktime_to_ns(kt), count)); + + mock_timeline_fini(&tl); + cond_resched(); + + mock_timeline_init(&tl, 0); + + /* Benchmark searching for a random context id and maybe changing it */ + prandom_seed_state(&prng, i915_selftest.random_seed); + count = 0; + kt = ktime_get(); + end_time = jiffies + HZ/10; + do { + u32 id = random_engine(&prng); + u32 seqno = prandom_u32_state(&prng); + + if (!__intel_timeline_sync_is_later(&tl, id, seqno)) + __intel_timeline_sync_set(&tl, id, seqno); + + count++; + } while (!time_after(jiffies, end_time)); + kt = ktime_sub(ktime_get(), kt); + kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20); + pr_info("%s: %lu repeated insert/lookups, %lluns/op\n", + __func__, count, (long long)div64_ul(ktime_to_ns(kt), count)); + mock_timeline_fini(&tl); + cond_resched(); + + /* Benchmark searching for a known context id and changing the seqno */ + for (last_order = 1, order = 1; order < 32; + ({ int tmp = last_order; last_order = order; order += tmp; })) { + unsigned int mask = BIT(order) - 1; + + mock_timeline_init(&tl, 0); + + count = 0; + kt = ktime_get(); + end_time = jiffies + HZ/10; + do { + /* Without assuming too many details of the underlying + * implementation, try to identify its phase-changes + * (if any)! + */ + u64 id = (u64)(count & mask) << order; + + __intel_timeline_sync_is_later(&tl, id, 0); + __intel_timeline_sync_set(&tl, id, 0); + + count++; + } while (!time_after(jiffies, end_time)); + kt = ktime_sub(ktime_get(), kt); + pr_info("%s: %lu cyclic/%d insert/lookups, %lluns/op\n", + __func__, count, order, + (long long)div64_ul(ktime_to_ns(kt), count)); + mock_timeline_fini(&tl); + cond_resched(); + } + + return 0; +} + +int intel_timeline_mock_selftests(void) +{ + static const struct i915_subtest tests[] = { + SUBTEST(mock_hwsp_freelist), + SUBTEST(igt_sync), + SUBTEST(bench_sync), + }; + + return i915_subtests(tests, NULL); +} + +static int emit_ggtt_store_dw(struct i915_request *rq, u32 addr, u32 value) +{ + u32 *cs; + + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + if (INTEL_GEN(rq->engine->i915) >= 8) { + *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; + *cs++ = addr; + *cs++ = 0; + *cs++ = value; + } else if (INTEL_GEN(rq->engine->i915) >= 4) { + *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; + *cs++ = 0; + *cs++ = addr; + *cs++ = value; + } else { + *cs++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL; + *cs++ = addr; + *cs++ = value; + *cs++ = MI_NOOP; + } + + intel_ring_advance(rq, cs); + + return 0; +} + +static struct i915_request * +tl_write(struct intel_timeline *tl, struct intel_engine_cs *engine, u32 value) +{ + struct i915_request *rq; + int err; + + err = intel_timeline_pin(tl, NULL); + if (err) { + rq = ERR_PTR(err); + goto out; + } + + rq = intel_engine_create_kernel_request(engine); + if (IS_ERR(rq)) + goto out_unpin; + + i915_request_get(rq); + + err = emit_ggtt_store_dw(rq, tl->hwsp_offset, value); + i915_request_add(rq); + if (err) { + i915_request_put(rq); + rq = ERR_PTR(err); + } + +out_unpin: + intel_timeline_unpin(tl); +out: + if (IS_ERR(rq)) + pr_err("Failed to write to timeline!\n"); + return rq; +} + +static struct intel_timeline * +checked_intel_timeline_create(struct intel_gt *gt) +{ + struct intel_timeline *tl; + + tl = intel_timeline_create(gt); + if (IS_ERR(tl)) + return tl; + + if (READ_ONCE(*tl->hwsp_seqno) != tl->seqno) { + pr_err("Timeline created with incorrect breadcrumb, found %x, expected %x\n", + *tl->hwsp_seqno, tl->seqno); + intel_timeline_put(tl); + return ERR_PTR(-EINVAL); + } + + return tl; +} + +static int live_hwsp_engine(void *arg) +{ +#define NUM_TIMELINES 4096 + struct intel_gt *gt = arg; + struct intel_timeline **timelines; + struct intel_engine_cs *engine; + enum intel_engine_id id; + unsigned long count, n; + int err = 0; + + /* + * Create a bunch of timelines and check we can write + * independently to each of their breadcrumb slots. + */ + + timelines = kvmalloc_array(NUM_TIMELINES * I915_NUM_ENGINES, + sizeof(*timelines), + GFP_KERNEL); + if (!timelines) + return -ENOMEM; + + count = 0; + for_each_engine(engine, gt, id) { + if (!intel_engine_can_store_dword(engine)) + continue; + + intel_engine_pm_get(engine); + + for (n = 0; n < NUM_TIMELINES; n++) { + struct intel_timeline *tl; + struct i915_request *rq; + + tl = checked_intel_timeline_create(gt); + if (IS_ERR(tl)) { + err = PTR_ERR(tl); + break; + } + + rq = tl_write(tl, engine, count); + if (IS_ERR(rq)) { + intel_timeline_put(tl); + err = PTR_ERR(rq); + break; + } + + timelines[count++] = tl; + i915_request_put(rq); + } + + intel_engine_pm_put(engine); + if (err) + break; + } + + if (igt_flush_test(gt->i915)) + err = -EIO; + + for (n = 0; n < count; n++) { + struct intel_timeline *tl = timelines[n]; + + if (!err && READ_ONCE(*tl->hwsp_seqno) != n) { + GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x, found 0x%x\n", + n, tl->fence_context, tl->hwsp_offset, *tl->hwsp_seqno); + GEM_TRACE_DUMP(); + err = -EINVAL; + } + intel_timeline_put(tl); + } + + kvfree(timelines); + return err; +#undef NUM_TIMELINES +} + +static int live_hwsp_alternate(void *arg) +{ +#define NUM_TIMELINES 4096 + struct intel_gt *gt = arg; + struct intel_timeline **timelines; + struct intel_engine_cs *engine; + enum intel_engine_id id; + unsigned long count, n; + int err = 0; + + /* + * Create a bunch of timelines and check we can write + * independently to each of their breadcrumb slots with adjacent + * engines. + */ + + timelines = kvmalloc_array(NUM_TIMELINES * I915_NUM_ENGINES, + sizeof(*timelines), + GFP_KERNEL); + if (!timelines) + return -ENOMEM; + + count = 0; + for (n = 0; n < NUM_TIMELINES; n++) { + for_each_engine(engine, gt, id) { + struct intel_timeline *tl; + struct i915_request *rq; + + if (!intel_engine_can_store_dword(engine)) + continue; + + tl = checked_intel_timeline_create(gt); + if (IS_ERR(tl)) { + err = PTR_ERR(tl); + goto out; + } + + intel_engine_pm_get(engine); + rq = tl_write(tl, engine, count); + intel_engine_pm_put(engine); + if (IS_ERR(rq)) { + intel_timeline_put(tl); + err = PTR_ERR(rq); + goto out; + } + + timelines[count++] = tl; + i915_request_put(rq); + } + } + +out: + if (igt_flush_test(gt->i915)) + err = -EIO; + + for (n = 0; n < count; n++) { + struct intel_timeline *tl = timelines[n]; + + if (!err && READ_ONCE(*tl->hwsp_seqno) != n) { + GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x, found 0x%x\n", + n, tl->fence_context, tl->hwsp_offset, *tl->hwsp_seqno); + GEM_TRACE_DUMP(); + err = -EINVAL; + } + intel_timeline_put(tl); + } + + kvfree(timelines); + return err; +#undef NUM_TIMELINES +} + +static int live_hwsp_wrap(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + struct intel_timeline *tl; + enum intel_engine_id id; + int err = 0; + + /* + * Across a seqno wrap, we need to keep the old cacheline alive for + * foreign GPU references. + */ + + tl = intel_timeline_create(gt); + if (IS_ERR(tl)) + return PTR_ERR(tl); + + if (!tl->has_initial_breadcrumb || !tl->hwsp_cacheline) + goto out_free; + + err = intel_timeline_pin(tl, NULL); + if (err) + goto out_free; + + for_each_engine(engine, gt, id) { + const u32 *hwsp_seqno[2]; + struct i915_request *rq; + u32 seqno[2]; + + if (!intel_engine_can_store_dword(engine)) + continue; + + rq = intel_engine_create_kernel_request(engine); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out; + } + + tl->seqno = -4u; + + mutex_lock_nested(&tl->mutex, SINGLE_DEPTH_NESTING); + err = intel_timeline_get_seqno(tl, rq, &seqno[0]); + mutex_unlock(&tl->mutex); + if (err) { + i915_request_add(rq); + goto out; + } + pr_debug("seqno[0]:%08x, hwsp_offset:%08x\n", + seqno[0], tl->hwsp_offset); + + err = emit_ggtt_store_dw(rq, tl->hwsp_offset, seqno[0]); + if (err) { + i915_request_add(rq); + goto out; + } + hwsp_seqno[0] = tl->hwsp_seqno; + + mutex_lock_nested(&tl->mutex, SINGLE_DEPTH_NESTING); + err = intel_timeline_get_seqno(tl, rq, &seqno[1]); + mutex_unlock(&tl->mutex); + if (err) { + i915_request_add(rq); + goto out; + } + pr_debug("seqno[1]:%08x, hwsp_offset:%08x\n", + seqno[1], tl->hwsp_offset); + + err = emit_ggtt_store_dw(rq, tl->hwsp_offset, seqno[1]); + if (err) { + i915_request_add(rq); + goto out; + } + hwsp_seqno[1] = tl->hwsp_seqno; + + /* With wrap should come a new hwsp */ + GEM_BUG_ON(seqno[1] >= seqno[0]); + GEM_BUG_ON(hwsp_seqno[0] == hwsp_seqno[1]); + + i915_request_add(rq); + + if (i915_request_wait(rq, 0, HZ / 5) < 0) { + pr_err("Wait for timeline writes timed out!\n"); + err = -EIO; + goto out; + } + + if (READ_ONCE(*hwsp_seqno[0]) != seqno[0] || + READ_ONCE(*hwsp_seqno[1]) != seqno[1]) { + pr_err("Bad timeline values: found (%x, %x), expected (%x, %x)\n", + *hwsp_seqno[0], *hwsp_seqno[1], + seqno[0], seqno[1]); + err = -EINVAL; + goto out; + } + + intel_gt_retire_requests(gt); /* recycle HWSP */ + } + +out: + if (igt_flush_test(gt->i915)) + err = -EIO; + + intel_timeline_unpin(tl); +out_free: + intel_timeline_put(tl); + return err; +} + +static int live_hwsp_rollover_kernel(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err = 0; + + /* + * Run the host for long enough, and even the kernel context will + * see a seqno rollover. + */ + + for_each_engine(engine, gt, id) { + struct intel_context *ce = engine->kernel_context; + struct intel_timeline *tl = ce->timeline; + struct i915_request *rq[3] = {}; + int i; + + st_engine_heartbeat_disable(engine); + if (intel_gt_wait_for_idle(gt, HZ / 2)) { + err = -EIO; + goto out; + } + + GEM_BUG_ON(i915_active_fence_isset(&tl->last_request)); + tl->seqno = 0; + timeline_rollback(tl); + timeline_rollback(tl); + WRITE_ONCE(*(u32 *)tl->hwsp_seqno, tl->seqno); + + for (i = 0; i < ARRAY_SIZE(rq); i++) { + struct i915_request *this; + + this = i915_request_create(ce); + if (IS_ERR(this)) { + err = PTR_ERR(this); + goto out; + } + + pr_debug("%s: create fence.seqnp:%d\n", + engine->name, + lower_32_bits(this->fence.seqno)); + + GEM_BUG_ON(rcu_access_pointer(this->timeline) != tl); + + rq[i] = i915_request_get(this); + i915_request_add(this); + } + + /* We expected a wrap! */ + GEM_BUG_ON(rq[2]->fence.seqno > rq[0]->fence.seqno); + + if (i915_request_wait(rq[2], 0, HZ / 5) < 0) { + pr_err("Wait for timeline wrap timed out!\n"); + err = -EIO; + goto out; + } + + for (i = 0; i < ARRAY_SIZE(rq); i++) { + if (!i915_request_completed(rq[i])) { + pr_err("Pre-wrap request not completed!\n"); + err = -EINVAL; + goto out; + } + } + +out: + for (i = 0; i < ARRAY_SIZE(rq); i++) + i915_request_put(rq[i]); + st_engine_heartbeat_enable(engine); + if (err) + break; + } + + if (igt_flush_test(gt->i915)) + err = -EIO; + + return err; +} + +static int live_hwsp_rollover_user(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err = 0; + + /* + * Simulate a long running user context, and force the seqno wrap + * on the user's timeline. + */ + + for_each_engine(engine, gt, id) { + struct i915_request *rq[3] = {}; + struct intel_timeline *tl; + struct intel_context *ce; + int i; + + ce = intel_context_create(engine); + if (IS_ERR(ce)) + return PTR_ERR(ce); + + err = intel_context_alloc_state(ce); + if (err) + goto out; + + tl = ce->timeline; + if (!tl->has_initial_breadcrumb || !tl->hwsp_cacheline) + goto out; + + timeline_rollback(tl); + timeline_rollback(tl); + WRITE_ONCE(*(u32 *)tl->hwsp_seqno, tl->seqno); + + for (i = 0; i < ARRAY_SIZE(rq); i++) { + struct i915_request *this; + + this = intel_context_create_request(ce); + if (IS_ERR(this)) { + err = PTR_ERR(this); + goto out; + } + + pr_debug("%s: create fence.seqnp:%d\n", + engine->name, + lower_32_bits(this->fence.seqno)); + + GEM_BUG_ON(rcu_access_pointer(this->timeline) != tl); + + rq[i] = i915_request_get(this); + i915_request_add(this); + } + + /* We expected a wrap! */ + GEM_BUG_ON(rq[2]->fence.seqno > rq[0]->fence.seqno); + + if (i915_request_wait(rq[2], 0, HZ / 5) < 0) { + pr_err("Wait for timeline wrap timed out!\n"); + err = -EIO; + goto out; + } + + for (i = 0; i < ARRAY_SIZE(rq); i++) { + if (!i915_request_completed(rq[i])) { + pr_err("Pre-wrap request not completed!\n"); + err = -EINVAL; + goto out; + } + } + +out: + for (i = 0; i < ARRAY_SIZE(rq); i++) + i915_request_put(rq[i]); + intel_context_put(ce); + if (err) + break; + } + + if (igt_flush_test(gt->i915)) + err = -EIO; + + return err; +} + +static int live_hwsp_recycle(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + unsigned long count; + int err = 0; + + /* + * Check seqno writes into one timeline at a time. We expect to + * recycle the breadcrumb slot between iterations and neither + * want to confuse ourselves or the GPU. + */ + + count = 0; + for_each_engine(engine, gt, id) { + IGT_TIMEOUT(end_time); + + if (!intel_engine_can_store_dword(engine)) + continue; + + intel_engine_pm_get(engine); + + do { + struct intel_timeline *tl; + struct i915_request *rq; + + tl = checked_intel_timeline_create(gt); + if (IS_ERR(tl)) { + err = PTR_ERR(tl); + break; + } + + rq = tl_write(tl, engine, count); + if (IS_ERR(rq)) { + intel_timeline_put(tl); + err = PTR_ERR(rq); + break; + } + + if (i915_request_wait(rq, 0, HZ / 5) < 0) { + pr_err("Wait for timeline writes timed out!\n"); + i915_request_put(rq); + intel_timeline_put(tl); + err = -EIO; + break; + } + + if (READ_ONCE(*tl->hwsp_seqno) != count) { + GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x found 0x%x\n", + count, tl->fence_context, + tl->hwsp_offset, *tl->hwsp_seqno); + GEM_TRACE_DUMP(); + err = -EINVAL; + } + + i915_request_put(rq); + intel_timeline_put(tl); + count++; + + if (err) + break; + } while (!__igt_timeout(end_time, NULL)); + + intel_engine_pm_put(engine); + if (err) + break; + } + + return err; +} + +int intel_timeline_live_selftests(struct drm_i915_private *i915) +{ + static const struct i915_subtest tests[] = { + SUBTEST(live_hwsp_recycle), + SUBTEST(live_hwsp_engine), + SUBTEST(live_hwsp_alternate), + SUBTEST(live_hwsp_wrap), + SUBTEST(live_hwsp_rollover_kernel), + SUBTEST(live_hwsp_rollover_user), + }; + + if (intel_gt_is_wedged(&i915->gt)) + return 0; + + return intel_gt_live_subtests(tests, &i915->gt); +} diff --git a/drivers/gpu/drm/i915/gt/selftest_workarounds.c b/drivers/gpu/drm/i915/gt/selftest_workarounds.c new file mode 100644 index 000000000..61a0532d0 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/selftest_workarounds.c @@ -0,0 +1,1299 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2018 Intel Corporation + */ + +#include "gem/i915_gem_pm.h" +#include "gt/intel_engine_user.h" +#include "gt/intel_gt.h" +#include "i915_selftest.h" +#include "intel_reset.h" + +#include "selftests/igt_flush_test.h" +#include "selftests/igt_reset.h" +#include "selftests/igt_spinner.h" +#include "selftests/mock_drm.h" + +#include "gem/selftests/igt_gem_utils.h" +#include "gem/selftests/mock_context.h" + +static const struct wo_register { + enum intel_platform platform; + u32 reg; +} wo_registers[] = { + { INTEL_GEMINILAKE, 0x731c } +}; + +struct wa_lists { + struct i915_wa_list gt_wa_list; + struct { + struct i915_wa_list wa_list; + struct i915_wa_list ctx_wa_list; + } engine[I915_NUM_ENGINES]; +}; + +static int request_add_sync(struct i915_request *rq, int err) +{ + i915_request_get(rq); + i915_request_add(rq); + if (i915_request_wait(rq, 0, HZ / 5) < 0) + err = -EIO; + i915_request_put(rq); + + return err; +} + +static int request_add_spin(struct i915_request *rq, struct igt_spinner *spin) +{ + int err = 0; + + i915_request_get(rq); + i915_request_add(rq); + if (spin && !igt_wait_for_spinner(spin, rq)) + err = -ETIMEDOUT; + i915_request_put(rq); + + return err; +} + +static void +reference_lists_init(struct intel_gt *gt, struct wa_lists *lists) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + + memset(lists, 0, sizeof(*lists)); + + wa_init_start(&lists->gt_wa_list, "GT_REF", "global"); + gt_init_workarounds(gt->i915, &lists->gt_wa_list); + wa_init_finish(&lists->gt_wa_list); + + for_each_engine(engine, gt, id) { + struct i915_wa_list *wal = &lists->engine[id].wa_list; + + wa_init_start(wal, "REF", engine->name); + engine_init_workarounds(engine, wal); + wa_init_finish(wal); + + __intel_engine_init_ctx_wa(engine, + &lists->engine[id].ctx_wa_list, + "CTX_REF"); + } +} + +static void +reference_lists_fini(struct intel_gt *gt, struct wa_lists *lists) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + + for_each_engine(engine, gt, id) + intel_wa_list_free(&lists->engine[id].wa_list); + + intel_wa_list_free(&lists->gt_wa_list); +} + +static struct drm_i915_gem_object * +read_nonprivs(struct i915_gem_context *ctx, struct intel_engine_cs *engine) +{ + const u32 base = engine->mmio_base; + struct drm_i915_gem_object *result; + struct i915_request *rq; + struct i915_vma *vma; + u32 srm, *cs; + int err; + int i; + + result = i915_gem_object_create_internal(engine->i915, PAGE_SIZE); + if (IS_ERR(result)) + return result; + + i915_gem_object_set_cache_coherency(result, I915_CACHE_LLC); + + cs = i915_gem_object_pin_map(result, I915_MAP_WB); + if (IS_ERR(cs)) { + err = PTR_ERR(cs); + goto err_obj; + } + memset(cs, 0xc5, PAGE_SIZE); + i915_gem_object_flush_map(result); + i915_gem_object_unpin_map(result); + + vma = i915_vma_instance(result, &engine->gt->ggtt->vm, NULL); + if (IS_ERR(vma)) { + err = PTR_ERR(vma); + goto err_obj; + } + + err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL); + if (err) + goto err_obj; + + rq = igt_request_alloc(ctx, engine); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_pin; + } + + i915_vma_lock(vma); + err = i915_request_await_object(rq, vma->obj, true); + if (err == 0) + err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE); + i915_vma_unlock(vma); + if (err) + goto err_req; + + srm = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT; + if (INTEL_GEN(ctx->i915) >= 8) + srm++; + + cs = intel_ring_begin(rq, 4 * RING_MAX_NONPRIV_SLOTS); + if (IS_ERR(cs)) { + err = PTR_ERR(cs); + goto err_req; + } + + for (i = 0; i < RING_MAX_NONPRIV_SLOTS; i++) { + *cs++ = srm; + *cs++ = i915_mmio_reg_offset(RING_FORCE_TO_NONPRIV(base, i)); + *cs++ = i915_ggtt_offset(vma) + sizeof(u32) * i; + *cs++ = 0; + } + intel_ring_advance(rq, cs); + + i915_request_add(rq); + i915_vma_unpin(vma); + + return result; + +err_req: + i915_request_add(rq); +err_pin: + i915_vma_unpin(vma); +err_obj: + i915_gem_object_put(result); + return ERR_PTR(err); +} + +static u32 +get_whitelist_reg(const struct intel_engine_cs *engine, unsigned int i) +{ + i915_reg_t reg = i < engine->whitelist.count ? + engine->whitelist.list[i].reg : + RING_NOPID(engine->mmio_base); + + return i915_mmio_reg_offset(reg); +} + +static void +print_results(const struct intel_engine_cs *engine, const u32 *results) +{ + unsigned int i; + + for (i = 0; i < RING_MAX_NONPRIV_SLOTS; i++) { + u32 expected = get_whitelist_reg(engine, i); + u32 actual = results[i]; + + pr_info("RING_NONPRIV[%d]: expected 0x%08x, found 0x%08x\n", + i, expected, actual); + } +} + +static int check_whitelist(struct i915_gem_context *ctx, + struct intel_engine_cs *engine) +{ + struct drm_i915_gem_object *results; + struct intel_wedge_me wedge; + u32 *vaddr; + int err; + int i; + + results = read_nonprivs(ctx, engine); + if (IS_ERR(results)) + return PTR_ERR(results); + + err = 0; + i915_gem_object_lock(results, NULL); + intel_wedge_on_timeout(&wedge, engine->gt, HZ / 5) /* safety net! */ + err = i915_gem_object_set_to_cpu_domain(results, false); + i915_gem_object_unlock(results); + if (intel_gt_is_wedged(engine->gt)) + err = -EIO; + if (err) + goto out_put; + + vaddr = i915_gem_object_pin_map(results, I915_MAP_WB); + if (IS_ERR(vaddr)) { + err = PTR_ERR(vaddr); + goto out_put; + } + + for (i = 0; i < RING_MAX_NONPRIV_SLOTS; i++) { + u32 expected = get_whitelist_reg(engine, i); + u32 actual = vaddr[i]; + + if (expected != actual) { + print_results(engine, vaddr); + pr_err("Invalid RING_NONPRIV[%d], expected 0x%08x, found 0x%08x\n", + i, expected, actual); + + err = -EINVAL; + break; + } + } + + i915_gem_object_unpin_map(results); +out_put: + i915_gem_object_put(results); + return err; +} + +static int do_device_reset(struct intel_engine_cs *engine) +{ + intel_gt_reset(engine->gt, engine->mask, "live_workarounds"); + return 0; +} + +static int do_engine_reset(struct intel_engine_cs *engine) +{ + return intel_engine_reset(engine, "live_workarounds"); +} + +static int +switch_to_scratch_context(struct intel_engine_cs *engine, + struct igt_spinner *spin) +{ + struct intel_context *ce; + struct i915_request *rq; + int err = 0; + + ce = intel_context_create(engine); + if (IS_ERR(ce)) + return PTR_ERR(ce); + + rq = igt_spinner_create_request(spin, ce, MI_NOOP); + intel_context_put(ce); + + if (IS_ERR(rq)) { + spin = NULL; + err = PTR_ERR(rq); + goto err; + } + + err = request_add_spin(rq, spin); +err: + if (err && spin) + igt_spinner_end(spin); + + return err; +} + +static int check_whitelist_across_reset(struct intel_engine_cs *engine, + int (*reset)(struct intel_engine_cs *), + const char *name) +{ + struct drm_i915_private *i915 = engine->i915; + struct i915_gem_context *ctx, *tmp; + struct igt_spinner spin; + intel_wakeref_t wakeref; + int err; + + pr_info("Checking %d whitelisted registers on %s (RING_NONPRIV) [%s]\n", + engine->whitelist.count, engine->name, name); + + ctx = kernel_context(i915); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + err = igt_spinner_init(&spin, engine->gt); + if (err) + goto out_ctx; + + err = check_whitelist(ctx, engine); + if (err) { + pr_err("Invalid whitelist *before* %s reset!\n", name); + goto out_spin; + } + + err = switch_to_scratch_context(engine, &spin); + if (err) + goto out_spin; + + with_intel_runtime_pm(engine->uncore->rpm, wakeref) + err = reset(engine); + + igt_spinner_end(&spin); + + if (err) { + pr_err("%s reset failed\n", name); + goto out_spin; + } + + err = check_whitelist(ctx, engine); + if (err) { + pr_err("Whitelist not preserved in context across %s reset!\n", + name); + goto out_spin; + } + + tmp = kernel_context(i915); + if (IS_ERR(tmp)) { + err = PTR_ERR(tmp); + goto out_spin; + } + kernel_context_close(ctx); + ctx = tmp; + + err = check_whitelist(ctx, engine); + if (err) { + pr_err("Invalid whitelist *after* %s reset in fresh context!\n", + name); + goto out_spin; + } + +out_spin: + igt_spinner_fini(&spin); +out_ctx: + kernel_context_close(ctx); + return err; +} + +static struct i915_vma *create_batch(struct i915_address_space *vm) +{ + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + int err; + + obj = i915_gem_object_create_internal(vm->i915, 16 * PAGE_SIZE); + if (IS_ERR(obj)) + return ERR_CAST(obj); + + vma = i915_vma_instance(obj, vm, NULL); + if (IS_ERR(vma)) { + err = PTR_ERR(vma); + goto err_obj; + } + + err = i915_vma_pin(vma, 0, 0, PIN_USER); + if (err) + goto err_obj; + + return vma; + +err_obj: + i915_gem_object_put(obj); + return ERR_PTR(err); +} + +static u32 reg_write(u32 old, u32 new, u32 rsvd) +{ + if (rsvd == 0x0000ffff) { + old &= ~(new >> 16); + old |= new & (new >> 16); + } else { + old &= ~rsvd; + old |= new & rsvd; + } + + return old; +} + +static bool wo_register(struct intel_engine_cs *engine, u32 reg) +{ + enum intel_platform platform = INTEL_INFO(engine->i915)->platform; + int i; + + if ((reg & RING_FORCE_TO_NONPRIV_ACCESS_MASK) == + RING_FORCE_TO_NONPRIV_ACCESS_WR) + return true; + + for (i = 0; i < ARRAY_SIZE(wo_registers); i++) { + if (wo_registers[i].platform == platform && + wo_registers[i].reg == reg) + return true; + } + + return false; +} + +static bool timestamp(const struct intel_engine_cs *engine, u32 reg) +{ + reg = (reg - engine->mmio_base) & ~RING_FORCE_TO_NONPRIV_ACCESS_MASK; + switch (reg) { + case 0x358: + case 0x35c: + case 0x3a8: + return true; + + default: + return false; + } +} + +static bool ro_register(u32 reg) +{ + if ((reg & RING_FORCE_TO_NONPRIV_ACCESS_MASK) == + RING_FORCE_TO_NONPRIV_ACCESS_RD) + return true; + + return false; +} + +static int whitelist_writable_count(struct intel_engine_cs *engine) +{ + int count = engine->whitelist.count; + int i; + + for (i = 0; i < engine->whitelist.count; i++) { + u32 reg = i915_mmio_reg_offset(engine->whitelist.list[i].reg); + + if (ro_register(reg)) + count--; + } + + return count; +} + +static int check_dirty_whitelist(struct intel_context *ce) +{ + const u32 values[] = { + 0x00000000, + 0x01010101, + 0x10100101, + 0x03030303, + 0x30300303, + 0x05050505, + 0x50500505, + 0x0f0f0f0f, + 0xf00ff00f, + 0x10101010, + 0xf0f01010, + 0x30303030, + 0xa0a03030, + 0x50505050, + 0xc0c05050, + 0xf0f0f0f0, + 0x11111111, + 0x33333333, + 0x55555555, + 0x0000ffff, + 0x00ff00ff, + 0xff0000ff, + 0xffff00ff, + 0xffffffff, + }; + struct intel_engine_cs *engine = ce->engine; + struct i915_vma *scratch; + struct i915_vma *batch; + int err = 0, i, v; + u32 *cs, *results; + + scratch = create_scratch(ce->vm, 2 * ARRAY_SIZE(values) + 1); + if (IS_ERR(scratch)) + return PTR_ERR(scratch); + + batch = create_batch(ce->vm); + if (IS_ERR(batch)) { + err = PTR_ERR(batch); + goto out_scratch; + } + + for (i = 0; i < engine->whitelist.count; i++) { + u32 reg = i915_mmio_reg_offset(engine->whitelist.list[i].reg); + u64 addr = scratch->node.start; + struct i915_request *rq; + u32 srm, lrm, rsvd; + u32 expect; + int idx; + bool ro_reg; + + if (wo_register(engine, reg)) + continue; + + if (timestamp(engine, reg)) + continue; /* timestamps are expected to autoincrement */ + + ro_reg = ro_register(reg); + + /* Clear non priv flags */ + reg &= RING_FORCE_TO_NONPRIV_ADDRESS_MASK; + + srm = MI_STORE_REGISTER_MEM; + lrm = MI_LOAD_REGISTER_MEM; + if (INTEL_GEN(engine->i915) >= 8) + lrm++, srm++; + + pr_debug("%s: Writing garbage to %x\n", + engine->name, reg); + + cs = i915_gem_object_pin_map(batch->obj, I915_MAP_WC); + if (IS_ERR(cs)) { + err = PTR_ERR(cs); + goto out_batch; + } + + /* SRM original */ + *cs++ = srm; + *cs++ = reg; + *cs++ = lower_32_bits(addr); + *cs++ = upper_32_bits(addr); + + idx = 1; + for (v = 0; v < ARRAY_SIZE(values); v++) { + /* LRI garbage */ + *cs++ = MI_LOAD_REGISTER_IMM(1); + *cs++ = reg; + *cs++ = values[v]; + + /* SRM result */ + *cs++ = srm; + *cs++ = reg; + *cs++ = lower_32_bits(addr + sizeof(u32) * idx); + *cs++ = upper_32_bits(addr + sizeof(u32) * idx); + idx++; + } + for (v = 0; v < ARRAY_SIZE(values); v++) { + /* LRI garbage */ + *cs++ = MI_LOAD_REGISTER_IMM(1); + *cs++ = reg; + *cs++ = ~values[v]; + + /* SRM result */ + *cs++ = srm; + *cs++ = reg; + *cs++ = lower_32_bits(addr + sizeof(u32) * idx); + *cs++ = upper_32_bits(addr + sizeof(u32) * idx); + idx++; + } + GEM_BUG_ON(idx * sizeof(u32) > scratch->size); + + /* LRM original -- don't leave garbage in the context! */ + *cs++ = lrm; + *cs++ = reg; + *cs++ = lower_32_bits(addr); + *cs++ = upper_32_bits(addr); + + *cs++ = MI_BATCH_BUFFER_END; + + i915_gem_object_flush_map(batch->obj); + i915_gem_object_unpin_map(batch->obj); + intel_gt_chipset_flush(engine->gt); + + rq = intel_context_create_request(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out_batch; + } + + if (engine->emit_init_breadcrumb) { /* Be nice if we hang */ + err = engine->emit_init_breadcrumb(rq); + if (err) + goto err_request; + } + + i915_vma_lock(batch); + err = i915_request_await_object(rq, batch->obj, false); + if (err == 0) + err = i915_vma_move_to_active(batch, rq, 0); + i915_vma_unlock(batch); + if (err) + goto err_request; + + i915_vma_lock(scratch); + err = i915_request_await_object(rq, scratch->obj, true); + if (err == 0) + err = i915_vma_move_to_active(scratch, rq, + EXEC_OBJECT_WRITE); + i915_vma_unlock(scratch); + if (err) + goto err_request; + + err = engine->emit_bb_start(rq, + batch->node.start, PAGE_SIZE, + 0); + if (err) + goto err_request; + +err_request: + err = request_add_sync(rq, err); + if (err) { + pr_err("%s: Futzing %x timedout; cancelling test\n", + engine->name, reg); + intel_gt_set_wedged(engine->gt); + goto out_batch; + } + + results = i915_gem_object_pin_map(scratch->obj, I915_MAP_WB); + if (IS_ERR(results)) { + err = PTR_ERR(results); + goto out_batch; + } + + GEM_BUG_ON(values[ARRAY_SIZE(values) - 1] != 0xffffffff); + if (!ro_reg) { + /* detect write masking */ + rsvd = results[ARRAY_SIZE(values)]; + if (!rsvd) { + pr_err("%s: Unable to write to whitelisted register %x\n", + engine->name, reg); + err = -EINVAL; + goto out_unpin; + } + } else { + rsvd = 0; + } + + expect = results[0]; + idx = 1; + for (v = 0; v < ARRAY_SIZE(values); v++) { + if (ro_reg) + expect = results[0]; + else + expect = reg_write(expect, values[v], rsvd); + + if (results[idx] != expect) + err++; + idx++; + } + for (v = 0; v < ARRAY_SIZE(values); v++) { + if (ro_reg) + expect = results[0]; + else + expect = reg_write(expect, ~values[v], rsvd); + + if (results[idx] != expect) + err++; + idx++; + } + if (err) { + pr_err("%s: %d mismatch between values written to whitelisted register [%x], and values read back!\n", + engine->name, err, reg); + + if (ro_reg) + pr_info("%s: Whitelisted read-only register: %x, original value %08x\n", + engine->name, reg, results[0]); + else + pr_info("%s: Whitelisted register: %x, original value %08x, rsvd %08x\n", + engine->name, reg, results[0], rsvd); + + expect = results[0]; + idx = 1; + for (v = 0; v < ARRAY_SIZE(values); v++) { + u32 w = values[v]; + + if (ro_reg) + expect = results[0]; + else + expect = reg_write(expect, w, rsvd); + pr_info("Wrote %08x, read %08x, expect %08x\n", + w, results[idx], expect); + idx++; + } + for (v = 0; v < ARRAY_SIZE(values); v++) { + u32 w = ~values[v]; + + if (ro_reg) + expect = results[0]; + else + expect = reg_write(expect, w, rsvd); + pr_info("Wrote %08x, read %08x, expect %08x\n", + w, results[idx], expect); + idx++; + } + + err = -EINVAL; + } +out_unpin: + i915_gem_object_unpin_map(scratch->obj); + if (err) + break; + } + + if (igt_flush_test(engine->i915)) + err = -EIO; +out_batch: + i915_vma_unpin_and_release(&batch, 0); +out_scratch: + i915_vma_unpin_and_release(&scratch, 0); + return err; +} + +static int live_dirty_whitelist(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + + /* Can the user write to the whitelisted registers? */ + + if (INTEL_GEN(gt->i915) < 7) /* minimum requirement for LRI, SRM, LRM */ + return 0; + + for_each_engine(engine, gt, id) { + struct intel_context *ce; + int err; + + if (engine->whitelist.count == 0) + continue; + + ce = intel_context_create(engine); + if (IS_ERR(ce)) + return PTR_ERR(ce); + + err = check_dirty_whitelist(ce); + intel_context_put(ce); + if (err) + return err; + } + + return 0; +} + +static int live_reset_whitelist(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err = 0; + + /* If we reset the gpu, we should not lose the RING_NONPRIV */ + igt_global_reset_lock(gt); + + for_each_engine(engine, gt, id) { + if (engine->whitelist.count == 0) + continue; + + if (intel_has_reset_engine(gt)) { + err = check_whitelist_across_reset(engine, + do_engine_reset, + "engine"); + if (err) + goto out; + } + + if (intel_has_gpu_reset(gt)) { + err = check_whitelist_across_reset(engine, + do_device_reset, + "device"); + if (err) + goto out; + } + } + +out: + igt_global_reset_unlock(gt); + return err; +} + +static int read_whitelisted_registers(struct i915_gem_context *ctx, + struct intel_engine_cs *engine, + struct i915_vma *results) +{ + struct i915_request *rq; + int i, err = 0; + u32 srm, *cs; + + rq = igt_request_alloc(ctx, engine); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + i915_vma_lock(results); + err = i915_request_await_object(rq, results->obj, true); + if (err == 0) + err = i915_vma_move_to_active(results, rq, EXEC_OBJECT_WRITE); + i915_vma_unlock(results); + if (err) + goto err_req; + + srm = MI_STORE_REGISTER_MEM; + if (INTEL_GEN(ctx->i915) >= 8) + srm++; + + cs = intel_ring_begin(rq, 4 * engine->whitelist.count); + if (IS_ERR(cs)) { + err = PTR_ERR(cs); + goto err_req; + } + + for (i = 0; i < engine->whitelist.count; i++) { + u64 offset = results->node.start + sizeof(u32) * i; + u32 reg = i915_mmio_reg_offset(engine->whitelist.list[i].reg); + + /* Clear non priv flags */ + reg &= RING_FORCE_TO_NONPRIV_ADDRESS_MASK; + + *cs++ = srm; + *cs++ = reg; + *cs++ = lower_32_bits(offset); + *cs++ = upper_32_bits(offset); + } + intel_ring_advance(rq, cs); + +err_req: + return request_add_sync(rq, err); +} + +static int scrub_whitelisted_registers(struct i915_gem_context *ctx, + struct intel_engine_cs *engine) +{ + struct i915_address_space *vm; + struct i915_request *rq; + struct i915_vma *batch; + int i, err = 0; + u32 *cs; + + vm = i915_gem_context_get_vm_rcu(ctx); + batch = create_batch(vm); + i915_vm_put(vm); + if (IS_ERR(batch)) + return PTR_ERR(batch); + + cs = i915_gem_object_pin_map(batch->obj, I915_MAP_WC); + if (IS_ERR(cs)) { + err = PTR_ERR(cs); + goto err_batch; + } + + *cs++ = MI_LOAD_REGISTER_IMM(whitelist_writable_count(engine)); + for (i = 0; i < engine->whitelist.count; i++) { + u32 reg = i915_mmio_reg_offset(engine->whitelist.list[i].reg); + + if (ro_register(reg)) + continue; + + /* Clear non priv flags */ + reg &= RING_FORCE_TO_NONPRIV_ADDRESS_MASK; + + *cs++ = reg; + *cs++ = 0xffffffff; + } + *cs++ = MI_BATCH_BUFFER_END; + + i915_gem_object_flush_map(batch->obj); + intel_gt_chipset_flush(engine->gt); + + rq = igt_request_alloc(ctx, engine); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_unpin; + } + + if (engine->emit_init_breadcrumb) { /* Be nice if we hang */ + err = engine->emit_init_breadcrumb(rq); + if (err) + goto err_request; + } + + i915_vma_lock(batch); + err = i915_request_await_object(rq, batch->obj, false); + if (err == 0) + err = i915_vma_move_to_active(batch, rq, 0); + i915_vma_unlock(batch); + if (err) + goto err_request; + + /* Perform the writes from an unprivileged "user" batch */ + err = engine->emit_bb_start(rq, batch->node.start, 0, 0); + +err_request: + err = request_add_sync(rq, err); + +err_unpin: + i915_gem_object_unpin_map(batch->obj); +err_batch: + i915_vma_unpin_and_release(&batch, 0); + return err; +} + +struct regmask { + i915_reg_t reg; + unsigned long gen_mask; +}; + +static bool find_reg(struct drm_i915_private *i915, + i915_reg_t reg, + const struct regmask *tbl, + unsigned long count) +{ + u32 offset = i915_mmio_reg_offset(reg); + + while (count--) { + if (INTEL_INFO(i915)->gen_mask & tbl->gen_mask && + i915_mmio_reg_offset(tbl->reg) == offset) + return true; + tbl++; + } + + return false; +} + +static bool pardon_reg(struct drm_i915_private *i915, i915_reg_t reg) +{ + /* Alas, we must pardon some whitelists. Mistakes already made */ + static const struct regmask pardon[] = { + { GEN9_CTX_PREEMPT_REG, INTEL_GEN_MASK(9, 9) }, + { GEN8_L3SQCREG4, INTEL_GEN_MASK(9, 9) }, + }; + + return find_reg(i915, reg, pardon, ARRAY_SIZE(pardon)); +} + +static bool result_eq(struct intel_engine_cs *engine, + u32 a, u32 b, i915_reg_t reg) +{ + if (a != b && !pardon_reg(engine->i915, reg)) { + pr_err("Whitelisted register 0x%4x not context saved: A=%08x, B=%08x\n", + i915_mmio_reg_offset(reg), a, b); + return false; + } + + return true; +} + +static bool writeonly_reg(struct drm_i915_private *i915, i915_reg_t reg) +{ + /* Some registers do not seem to behave and our writes unreadable */ + static const struct regmask wo[] = { + { GEN9_SLICE_COMMON_ECO_CHICKEN1, INTEL_GEN_MASK(9, 9) }, + }; + + return find_reg(i915, reg, wo, ARRAY_SIZE(wo)); +} + +static bool result_neq(struct intel_engine_cs *engine, + u32 a, u32 b, i915_reg_t reg) +{ + if (a == b && !writeonly_reg(engine->i915, reg)) { + pr_err("Whitelist register 0x%4x:%08x was unwritable\n", + i915_mmio_reg_offset(reg), a); + return false; + } + + return true; +} + +static int +check_whitelisted_registers(struct intel_engine_cs *engine, + struct i915_vma *A, + struct i915_vma *B, + bool (*fn)(struct intel_engine_cs *engine, + u32 a, u32 b, + i915_reg_t reg)) +{ + u32 *a, *b; + int i, err; + + a = i915_gem_object_pin_map(A->obj, I915_MAP_WB); + if (IS_ERR(a)) + return PTR_ERR(a); + + b = i915_gem_object_pin_map(B->obj, I915_MAP_WB); + if (IS_ERR(b)) { + err = PTR_ERR(b); + goto err_a; + } + + err = 0; + for (i = 0; i < engine->whitelist.count; i++) { + const struct i915_wa *wa = &engine->whitelist.list[i]; + + if (i915_mmio_reg_offset(wa->reg) & + RING_FORCE_TO_NONPRIV_ACCESS_RD) + continue; + + if (!fn(engine, a[i], b[i], wa->reg)) + err = -EINVAL; + } + + i915_gem_object_unpin_map(B->obj); +err_a: + i915_gem_object_unpin_map(A->obj); + return err; +} + +static int live_isolated_whitelist(void *arg) +{ + struct intel_gt *gt = arg; + struct { + struct i915_gem_context *ctx; + struct i915_vma *scratch[2]; + } client[2] = {}; + struct intel_engine_cs *engine; + enum intel_engine_id id; + int i, err = 0; + + /* + * Check that a write into a whitelist register works, but + * invisible to a second context. + */ + + if (!intel_engines_has_context_isolation(gt->i915)) + return 0; + + for (i = 0; i < ARRAY_SIZE(client); i++) { + struct i915_address_space *vm; + struct i915_gem_context *c; + + c = kernel_context(gt->i915); + if (IS_ERR(c)) { + err = PTR_ERR(c); + goto err; + } + + vm = i915_gem_context_get_vm_rcu(c); + + client[i].scratch[0] = create_scratch(vm, 1024); + if (IS_ERR(client[i].scratch[0])) { + err = PTR_ERR(client[i].scratch[0]); + i915_vm_put(vm); + kernel_context_close(c); + goto err; + } + + client[i].scratch[1] = create_scratch(vm, 1024); + if (IS_ERR(client[i].scratch[1])) { + err = PTR_ERR(client[i].scratch[1]); + i915_vma_unpin_and_release(&client[i].scratch[0], 0); + i915_vm_put(vm); + kernel_context_close(c); + goto err; + } + + client[i].ctx = c; + i915_vm_put(vm); + } + + for_each_engine(engine, gt, id) { + if (!engine->kernel_context->vm) + continue; + + if (!whitelist_writable_count(engine)) + continue; + + /* Read default values */ + err = read_whitelisted_registers(client[0].ctx, engine, + client[0].scratch[0]); + if (err) + goto err; + + /* Try to overwrite registers (should only affect ctx0) */ + err = scrub_whitelisted_registers(client[0].ctx, engine); + if (err) + goto err; + + /* Read values from ctx1, we expect these to be defaults */ + err = read_whitelisted_registers(client[1].ctx, engine, + client[1].scratch[0]); + if (err) + goto err; + + /* Verify that both reads return the same default values */ + err = check_whitelisted_registers(engine, + client[0].scratch[0], + client[1].scratch[0], + result_eq); + if (err) + goto err; + + /* Read back the updated values in ctx0 */ + err = read_whitelisted_registers(client[0].ctx, engine, + client[0].scratch[1]); + if (err) + goto err; + + /* User should be granted privilege to overwhite regs */ + err = check_whitelisted_registers(engine, + client[0].scratch[0], + client[0].scratch[1], + result_neq); + if (err) + goto err; + } + +err: + for (i = 0; i < ARRAY_SIZE(client); i++) { + if (!client[i].ctx) + break; + + i915_vma_unpin_and_release(&client[i].scratch[1], 0); + i915_vma_unpin_and_release(&client[i].scratch[0], 0); + kernel_context_close(client[i].ctx); + } + + if (igt_flush_test(gt->i915)) + err = -EIO; + + return err; +} + +static bool +verify_wa_lists(struct i915_gem_context *ctx, struct wa_lists *lists, + const char *str) +{ + struct drm_i915_private *i915 = ctx->i915; + struct i915_gem_engines_iter it; + struct intel_context *ce; + bool ok = true; + + ok &= wa_list_verify(&i915->uncore, &lists->gt_wa_list, str); + + for_each_gem_engine(ce, i915_gem_context_engines(ctx), it) { + enum intel_engine_id id = ce->engine->id; + + ok &= engine_wa_list_verify(ce, + &lists->engine[id].wa_list, + str) == 0; + + ok &= engine_wa_list_verify(ce, + &lists->engine[id].ctx_wa_list, + str) == 0; + } + + return ok; +} + +static int +live_gpu_reset_workarounds(void *arg) +{ + struct intel_gt *gt = arg; + struct i915_gem_context *ctx; + intel_wakeref_t wakeref; + struct wa_lists lists; + bool ok; + + if (!intel_has_gpu_reset(gt)) + return 0; + + ctx = kernel_context(gt->i915); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + i915_gem_context_lock_engines(ctx); + + pr_info("Verifying after GPU reset...\n"); + + igt_global_reset_lock(gt); + wakeref = intel_runtime_pm_get(gt->uncore->rpm); + + reference_lists_init(gt, &lists); + + ok = verify_wa_lists(ctx, &lists, "before reset"); + if (!ok) + goto out; + + intel_gt_reset(gt, ALL_ENGINES, "live_workarounds"); + + ok = verify_wa_lists(ctx, &lists, "after reset"); + +out: + i915_gem_context_unlock_engines(ctx); + kernel_context_close(ctx); + reference_lists_fini(gt, &lists); + intel_runtime_pm_put(gt->uncore->rpm, wakeref); + igt_global_reset_unlock(gt); + + return ok ? 0 : -ESRCH; +} + +static int +live_engine_reset_workarounds(void *arg) +{ + struct intel_gt *gt = arg; + struct i915_gem_engines_iter it; + struct i915_gem_context *ctx; + struct intel_context *ce; + struct igt_spinner spin; + struct i915_request *rq; + intel_wakeref_t wakeref; + struct wa_lists lists; + int ret = 0; + + if (!intel_has_reset_engine(gt)) + return 0; + + ctx = kernel_context(gt->i915); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + igt_global_reset_lock(gt); + wakeref = intel_runtime_pm_get(gt->uncore->rpm); + + reference_lists_init(gt, &lists); + + for_each_gem_engine(ce, i915_gem_context_lock_engines(ctx), it) { + struct intel_engine_cs *engine = ce->engine; + bool ok; + + pr_info("Verifying after %s reset...\n", engine->name); + + ok = verify_wa_lists(ctx, &lists, "before reset"); + if (!ok) { + ret = -ESRCH; + goto err; + } + + intel_engine_reset(engine, "live_workarounds"); + + ok = verify_wa_lists(ctx, &lists, "after idle reset"); + if (!ok) { + ret = -ESRCH; + goto err; + } + + ret = igt_spinner_init(&spin, engine->gt); + if (ret) + goto err; + + rq = igt_spinner_create_request(&spin, ce, MI_NOOP); + if (IS_ERR(rq)) { + ret = PTR_ERR(rq); + igt_spinner_fini(&spin); + goto err; + } + + ret = request_add_spin(rq, &spin); + if (ret) { + pr_err("Spinner failed to start\n"); + igt_spinner_fini(&spin); + goto err; + } + + intel_engine_reset(engine, "live_workarounds"); + + igt_spinner_end(&spin); + igt_spinner_fini(&spin); + + ok = verify_wa_lists(ctx, &lists, "after busy reset"); + if (!ok) { + ret = -ESRCH; + goto err; + } + } +err: + i915_gem_context_unlock_engines(ctx); + reference_lists_fini(gt, &lists); + intel_runtime_pm_put(gt->uncore->rpm, wakeref); + igt_global_reset_unlock(gt); + kernel_context_close(ctx); + + igt_flush_test(gt->i915); + + return ret; +} + +int intel_workarounds_live_selftests(struct drm_i915_private *i915) +{ + static const struct i915_subtest tests[] = { + SUBTEST(live_dirty_whitelist), + SUBTEST(live_reset_whitelist), + SUBTEST(live_isolated_whitelist), + SUBTEST(live_gpu_reset_workarounds), + SUBTEST(live_engine_reset_workarounds), + }; + + if (intel_gt_is_wedged(&i915->gt)) + return 0; + + return intel_gt_live_subtests(tests, &i915->gt); +} diff --git a/drivers/gpu/drm/i915/gt/selftests/mock_timeline.c b/drivers/gpu/drm/i915/gt/selftests/mock_timeline.c new file mode 100644 index 000000000..aeb1d1f61 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/selftests/mock_timeline.c @@ -0,0 +1,29 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2017-2018 Intel Corporation + */ + +#include "../intel_timeline.h" + +#include "mock_timeline.h" + +void mock_timeline_init(struct intel_timeline *timeline, u64 context) +{ + timeline->gt = NULL; + timeline->fence_context = context; + + mutex_init(&timeline->mutex); + + INIT_ACTIVE_FENCE(&timeline->last_request); + INIT_LIST_HEAD(&timeline->requests); + + i915_syncmap_init(&timeline->sync); + + INIT_LIST_HEAD(&timeline->link); +} + +void mock_timeline_fini(struct intel_timeline *timeline) +{ + i915_syncmap_free(&timeline->sync); +} diff --git a/drivers/gpu/drm/i915/gt/selftests/mock_timeline.h b/drivers/gpu/drm/i915/gt/selftests/mock_timeline.h new file mode 100644 index 000000000..d2bcc3df6 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/selftests/mock_timeline.h @@ -0,0 +1,17 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2017-2018 Intel Corporation + */ + +#ifndef __MOCK_TIMELINE__ +#define __MOCK_TIMELINE__ + +#include <linux/types.h> + +struct intel_timeline; + +void mock_timeline_init(struct intel_timeline *timeline, u64 context); +void mock_timeline_fini(struct intel_timeline *timeline); + +#endif /* !__MOCK_TIMELINE__ */ diff --git a/drivers/gpu/drm/i915/gt/shaders/README b/drivers/gpu/drm/i915/gt/shaders/README new file mode 100644 index 000000000..e7e96d707 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/shaders/README @@ -0,0 +1,46 @@ +ASM sources for auto generated shaders +====================================== + +The i915/gt/hsw_clear_kernel.c and i915/gt/ivb_clear_kernel.c files contain +pre-compiled batch chunks that will clear any residual render cache during +context switch. + +They are generated from their respective platform ASM files present on +i915/gt/shaders/clear_kernel directory. + +The generated .c files should never be modified directly. Instead, any modification +needs to be done on the on their respective ASM files and build instructions below +needes to be followed. + +Building +======== + +Environment +----------- + +IGT GPU tool scripts and the Mesa's i965 instruction assembler tool are used +on building. + +Please make sure your Mesa tool is compiled with "-Dtools=intel" and +"-Ddri-drivers=i965", and run this script from IGT source root directory" + +The instructions bellow assume: + * IGT gpu tools source code is located on your home directory (~) as ~/igt + * Mesa source code is located on your home directory (~) as ~/mesa + and built under the ~/mesa/build directory + * Linux kernel source code is under your home directory (~) as ~/linux + +Instructions +------------ + +~ $ cp ~/linux/drivers/gpu/drm/i915/gt/shaders/clear_kernel/ivb.asm \ + ~/igt/lib/i915/shaders/clear_kernel/ivb.asm +~ $ cd ~/igt +igt $ ./scripts/generate_clear_kernel.sh -g ivb \ + -m ~/mesa/build/src/intel/tools/i965_asm + +~ $ cp ~/linux/drivers/gpu/drm/i915/gt/shaders/clear_kernel/hsw.asm \ + ~/igt/lib/i915/shaders/clear_kernel/hsw.asm +~ $ cd ~/igt +igt $ ./scripts/generate_clear_kernel.sh -g hsw \ + -m ~/mesa/build/src/intel/tools/i965_asm
\ No newline at end of file diff --git a/drivers/gpu/drm/i915/gt/shaders/clear_kernel/hsw.asm b/drivers/gpu/drm/i915/gt/shaders/clear_kernel/hsw.asm new file mode 100644 index 000000000..5fdf384bb --- /dev/null +++ b/drivers/gpu/drm/i915/gt/shaders/clear_kernel/hsw.asm @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2020 Intel Corporation + */ + +/* + * Kernel for PAVP buffer clear. + * + * 1. Clear all 64 GRF registers assigned to the kernel with designated value; + * 2. Write 32x16 block of all "0" to render target buffer which indirectly clears + * 512 bytes of Render Cache. + */ + +/* Store designated "clear GRF" value */ +mov(1) f0.1<1>UW g1.2<0,1,0>UW { align1 1N }; + +/** + * Curbe Format + * + * DW 1.0 - Block Offset to write Render Cache + * DW 1.1 [15:0] - Clear Word + * DW 1.2 - Delay iterations + * DW 1.3 - Enable Instrumentation (only for debug) + * DW 1.4 - Rsvd (intended for context ID) + * DW 1.5 - [31:16]:SliceCount, [15:0]:SubSlicePerSliceCount + * DW 1.6 - Rsvd MBZ (intended for Enable Wait on Total Thread Count) + * DW 1.7 - Rsvd MBZ (inteded for Total Thread Count) + * + * Binding Table + * + * BTI 0: 2D Surface to help clear L3 (Render/Data Cache) + * BTI 1: Wait/Instrumentation Buffer + * Size : (SliceCount * SubSliceCount * 16 EUs/SubSlice) rows * (16 threads/EU) cols (Format R32_UINT) + * Expected to be initialized to 0 by driver/another kernel + * Layout: + * RowN: Histogram for EU-N: (SliceID*SubSlicePerSliceCount + SSID)*16 + EUID [assume max 16 EUs / SS] + * Col-k[DW-k]: Threads Executed on ThreadID-k for EU-N + */ +add(1) g1.2<1>UD g1.2<0,1,0>UD 0x00000001UD { align1 1N }; /* Loop count to delay kernel: Init to (g1.2 + 1) */ +cmp.z.f0.0(1) null<1>UD g1.3<0,1,0>UD 0x00000000UD { align1 1N }; +(+f0.0) jmpi(1) 352D { align1 WE_all 1N }; + +/** + * State Register has info on where this thread is running + * IVB: sr0.0 :: [15:13]: MBZ, 12: HSID (Half-Slice ID), [11:8]EUID, [2:0] ThreadSlotID + * HSW: sr0.0 :: 15: MBZ, [14:13]: SliceID, 12: HSID (Half-Slice ID), [11:8]EUID, [2:0] ThreadSlotID + */ +mov(8) g3<1>UD 0x00000000UD { align1 1Q }; +shr(1) g3<1>D sr0<0,1,0>D 12D { align1 1N }; +and(1) g3<1>D g3<0,1,0>D 1D { align1 1N }; /* g3 has HSID */ +shr(1) g3.1<1>D sr0<0,1,0>D 13D { align1 1N }; +and(1) g3.1<1>D g3.1<0,1,0>D 3D { align1 1N }; /* g3.1 has sliceID */ +mul(1) g3.5<1>D g3.1<0,1,0>D g1.10<0,1,0>UW { align1 1N }; +add(1) g3<1>D g3<0,1,0>D g3.5<0,1,0>D { align1 1N }; /* g3 = sliceID * SubSlicePerSliceCount + HSID */ +shr(1) g3.2<1>D sr0<0,1,0>D 8D { align1 1N }; +and(1) g3.2<1>D g3.2<0,1,0>D 15D { align1 1N }; /* g3.2 = EUID */ +mul(1) g3.4<1>D g3<0,1,0>D 16D { align1 1N }; +add(1) g3.2<1>D g3.2<0,1,0>D g3.4<0,1,0>D { align1 1N }; /* g3.2 now points to EU row number (Y-pixel = V address ) in instrumentation surf */ + +mov(8) g5<1>UD 0x00000000UD { align1 1Q }; +and(1) g3.3<1>D sr0<0,1,0>D 7D { align1 1N }; +mul(1) g3.3<1>D g3.3<0,1,0>D 4D { align1 1N }; + +mov(8) g4<1>UD g0<8,8,1>UD { align1 1Q }; /* Initialize message header with g0 */ +mov(1) g4<1>UD g3.3<0,1,0>UD { align1 1N }; /* Block offset */ +mov(1) g4.1<1>UD g3.2<0,1,0>UD { align1 1N }; /* Block offset */ +mov(1) g4.2<1>UD 0x00000003UD { align1 1N }; /* Block size (1 row x 4 bytes) */ +and(1) g4.3<1>UD g4.3<0,1,0>UW 0xffffffffUD { align1 1N }; + +/* Media block read to fetch current value at specified location in instrumentation buffer */ +sendc(8) g5<1>UD g4<8,8,1>F 0x02190001 + + render MsgDesc: media block read MsgCtrl = 0x0 Surface = 1 mlen 1 rlen 1 { align1 1Q }; +add(1) g5<1>D g5<0,1,0>D 1D { align1 1N }; + +/* Media block write for updated value at specified location in instrumentation buffer */ +sendc(8) g5<1>UD g4<8,8,1>F 0x040a8001 + render MsgDesc: media block write MsgCtrl = 0x0 Surface = 1 mlen 2 rlen 0 { align1 1Q }; + +/* Delay thread for specified parameter */ +add.nz.f0.0(1) g1.2<1>UD g1.2<0,1,0>UD -1D { align1 1N }; +(+f0.0) jmpi(1) -32D { align1 WE_all 1N }; + +/* Store designated "clear GRF" value */ +mov(1) f0.1<1>UW g1.2<0,1,0>UW { align1 1N }; + +/* Initialize looping parameters */ +mov(1) a0<1>D 0D { align1 1N }; /* Initialize a0.0:w=0 */ +mov(1) a0.4<1>W 127W { align1 1N }; /* Loop count. Each loop contains 16 GRF's */ + +/* Write 32x16 all "0" block */ +mov(8) g2<1>UD g0<8,8,1>UD { align1 1Q }; +mov(8) g127<1>UD g0<8,8,1>UD { align1 1Q }; +mov(2) g2<1>UD g1<2,2,1>UW { align1 1N }; +mov(1) g2.2<1>UD 0x000f000fUD { align1 1N }; /* Block size (16x16) */ +and(1) g2.3<1>UD g2.3<0,1,0>UW 0xffffffefUD { align1 1N }; +mov(16) g3<1>UD 0x00000000UD { align1 1H }; +mov(16) g4<1>UD 0x00000000UD { align1 1H }; +mov(16) g5<1>UD 0x00000000UD { align1 1H }; +mov(16) g6<1>UD 0x00000000UD { align1 1H }; +mov(16) g7<1>UD 0x00000000UD { align1 1H }; +mov(16) g8<1>UD 0x00000000UD { align1 1H }; +mov(16) g9<1>UD 0x00000000UD { align1 1H }; +mov(16) g10<1>UD 0x00000000UD { align1 1H }; +sendc(8) null<1>UD g2<8,8,1>F 0x120a8000 + render MsgDesc: media block write MsgCtrl = 0x0 Surface = 0 mlen 9 rlen 0 { align1 1Q }; +add(1) g2<1>UD g1<0,1,0>UW 0x0010UW { align1 1N }; +sendc(8) null<1>UD g2<8,8,1>F 0x120a8000 + render MsgDesc: media block write MsgCtrl = 0x0 Surface = 0 mlen 9 rlen 0 { align1 1Q }; + +/* Now, clear all GRF registers */ +add.nz.f0.0(1) a0.4<1>W a0.4<0,1,0>W -1W { align1 1N }; +mov(16) g[a0]<1>UW f0.1<0,1,0>UW { align1 1H }; +add(1) a0<1>D a0<0,1,0>D 32D { align1 1N }; +(+f0.0) jmpi(1) -64D { align1 WE_all 1N }; + +/* Terminante the thread */ +sendc(8) null<1>UD g127<8,8,1>F 0x82000010 + thread_spawner MsgDesc: mlen 1 rlen 0 { align1 1Q EOT }; diff --git a/drivers/gpu/drm/i915/gt/shaders/clear_kernel/ivb.asm b/drivers/gpu/drm/i915/gt/shaders/clear_kernel/ivb.asm new file mode 100644 index 000000000..97c7ac9e3 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/shaders/clear_kernel/ivb.asm @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2020 Intel Corporation + */ + +/* + * Kernel for PAVP buffer clear. + * + * 1. Clear all 64 GRF registers assigned to the kernel with designated value; + * 2. Write 32x16 block of all "0" to render target buffer which indirectly clears + * 512 bytes of Render Cache. + */ + +/* Store designated "clear GRF" value */ +mov(1) f0.1<1>UW g1.2<0,1,0>UW { align1 1N }; + +/** + * Curbe Format + * + * DW 1.0 - Block Offset to write Render Cache + * DW 1.1 [15:0] - Clear Word + * DW 1.2 - Delay iterations + * DW 1.3 - Enable Instrumentation (only for debug) + * DW 1.4 - Rsvd (intended for context ID) + * DW 1.5 - [31:16]:SliceCount, [15:0]:SubSlicePerSliceCount + * DW 1.6 - Rsvd MBZ (intended for Enable Wait on Total Thread Count) + * DW 1.7 - Rsvd MBZ (inteded for Total Thread Count) + * + * Binding Table + * + * BTI 0: 2D Surface to help clear L3 (Render/Data Cache) + * BTI 1: Wait/Instrumentation Buffer + * Size : (SliceCount * SubSliceCount * 16 EUs/SubSlice) rows * (16 threads/EU) cols (Format R32_UINT) + * Expected to be initialized to 0 by driver/another kernel + * Layout : + * RowN: Histogram for EU-N: (SliceID*SubSlicePerSliceCount + SSID)*16 + EUID [assume max 16 EUs / SS] + * Col-k[DW-k]: Threads Executed on ThreadID-k for EU-N + */ +add(1) g1.2<1>UD g1.2<0,1,0>UD 0x00000001UD { align1 1N }; /* Loop count to delay kernel: Init to (g1.2 + 1) */ +cmp.z.f0.0(1) null<1>UD g1.3<0,1,0>UD 0x00000000UD { align1 1N }; +(+f0.0) jmpi(1) 44D { align1 WE_all 1N }; + +/** + * State Register has info on where this thread is running + * IVB: sr0.0 :: [15:13]: MBZ, 12: HSID (Half-Slice ID), [11:8]EUID, [2:0] ThreadSlotID + * HSW: sr0.0 :: 15: MBZ, [14:13]: SliceID, 12: HSID (Half-Slice ID), [11:8]EUID, [2:0] ThreadSlotID + */ +mov(8) g3<1>UD 0x00000000UD { align1 1Q }; +shr(1) g3<1>D sr0<0,1,0>D 12D { align1 1N }; +and(1) g3<1>D g3<0,1,0>D 1D { align1 1N }; /* g3 has HSID */ +shr(1) g3.1<1>D sr0<0,1,0>D 13D { align1 1N }; +and(1) g3.1<1>D g3.1<0,1,0>D 3D { align1 1N }; /* g3.1 has sliceID */ +mul(1) g3.5<1>D g3.1<0,1,0>D g1.10<0,1,0>UW { align1 1N }; +add(1) g3<1>D g3<0,1,0>D g3.5<0,1,0>D { align1 1N }; /* g3 = sliceID * SubSlicePerSliceCount + HSID */ +shr(1) g3.2<1>D sr0<0,1,0>D 8D { align1 1N }; +and(1) g3.2<1>D g3.2<0,1,0>D 15D { align1 1N }; /* g3.2 = EUID */ +mul(1) g3.4<1>D g3<0,1,0>D 16D { align1 1N }; +add(1) g3.2<1>D g3.2<0,1,0>D g3.4<0,1,0>D { align1 1N }; /* g3.2 now points to EU row number (Y-pixel = V address ) in instrumentation surf */ + +mov(8) g5<1>UD 0x00000000UD { align1 1Q }; +and(1) g3.3<1>D sr0<0,1,0>D 7D { align1 1N }; +mul(1) g3.3<1>D g3.3<0,1,0>D 4D { align1 1N }; + +mov(8) g4<1>UD g0<8,8,1>UD { align1 1Q }; /* Initialize message header with g0 */ +mov(1) g4<1>UD g3.3<0,1,0>UD { align1 1N }; /* Block offset */ +mov(1) g4.1<1>UD g3.2<0,1,0>UD { align1 1N }; /* Block offset */ +mov(1) g4.2<1>UD 0x00000003UD { align1 1N }; /* Block size (1 row x 4 bytes) */ +and(1) g4.3<1>UD g4.3<0,1,0>UW 0xffffffffUD { align1 1N }; + +/* Media block read to fetch current value at specified location in instrumentation buffer */ +sendc(8) g5<1>UD g4<8,8,1>F 0x02190001 + render MsgDesc: media block read MsgCtrl = 0x0 Surface = 1 mlen 1 rlen 1 { align1 1Q }; +add(1) g5<1>D g5<0,1,0>D 1D { align1 1N }; + +/* Media block write for updated value at specified location in instrumentation buffer */ +sendc(8) g5<1>UD g4<8,8,1>F 0x040a8001 + render MsgDesc: media block write MsgCtrl = 0x0 Surface = 1 mlen 2 rlen 0 { align1 1Q }; +/* Delay thread for specified parameter */ +add.nz.f0.0(1) g1.2<1>UD g1.2<0,1,0>UD -1D { align1 1N }; +(+f0.0) jmpi(1) -4D { align1 WE_all 1N }; + +/* Store designated "clear GRF" value */ +mov(1) f0.1<1>UW g1.2<0,1,0>UW { align1 1N }; + +/* Initialize looping parameters */ +mov(1) a0<1>D 0D { align1 1N }; /* Initialize a0.0:w=0 */ +mov(1) a0.4<1>W 127W { align1 1N }; /* Loop count. Each loop contains 16 GRF's */ + +/* Write 32x16 all "0" block */ +mov(8) g2<1>UD g0<8,8,1>UD { align1 1Q }; +mov(8) g127<1>UD g0<8,8,1>UD { align1 1Q }; +mov(2) g2<1>UD g1<2,2,1>UW { align1 1N }; +mov(1) g2.2<1>UD 0x000f000fUD { align1 1N }; /* Block size (16x16) */ +and(1) g2.3<1>UD g2.3<0,1,0>UW 0xffffffefUD { align1 1N }; +mov(16) g3<1>UD 0x00000000UD { align1 1H }; +mov(16) g4<1>UD 0x00000000UD { align1 1H }; +mov(16) g5<1>UD 0x00000000UD { align1 1H }; +mov(16) g6<1>UD 0x00000000UD { align1 1H }; +mov(16) g7<1>UD 0x00000000UD { align1 1H }; +mov(16) g8<1>UD 0x00000000UD { align1 1H }; +mov(16) g9<1>UD 0x00000000UD { align1 1H }; +mov(16) g10<1>UD 0x00000000UD { align1 1H }; +sendc(8) null<1>UD g2<8,8,1>F 0x120a8000 + render MsgDesc: media block write MsgCtrl = 0x0 Surface = 0 mlen 9 rlen 0 { align1 1Q }; +add(1) g2<1>UD g1<0,1,0>UW 0x0010UW { align1 1N }; +sendc(8) null<1>UD g2<8,8,1>F 0x120a8000 + render MsgDesc: media block write MsgCtrl = 0x0 Surface = 0 mlen 9 rlen 0 { align1 1Q }; + +/* Now, clear all GRF registers */ +add.nz.f0.0(1) a0.4<1>W a0.4<0,1,0>W -1W { align1 1N }; +mov(16) g[a0]<1>UW f0.1<0,1,0>UW { align1 1H }; +add(1) a0<1>D a0<0,1,0>D 32D { align1 1N }; +(+f0.0) jmpi(1) -8D { align1 WE_all 1N }; + +/* Terminante the thread */ +sendc(8) null<1>UD g127<8,8,1>F 0x82000010 + thread_spawner MsgDesc: mlen 1 rlen 0 { align1 1Q EOT }; diff --git a/drivers/gpu/drm/i915/gt/shmem_utils.c b/drivers/gpu/drm/i915/gt/shmem_utils.c new file mode 100644 index 000000000..5982b62f9 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/shmem_utils.c @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2020 Intel Corporation + */ + +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/shmem_fs.h> + +#include "gem/i915_gem_object.h" +#include "shmem_utils.h" + +struct file *shmem_create_from_data(const char *name, void *data, size_t len) +{ + struct file *file; + int err; + + file = shmem_file_setup(name, PAGE_ALIGN(len), VM_NORESERVE); + if (IS_ERR(file)) + return file; + + err = shmem_write(file, 0, data, len); + if (err) { + fput(file); + return ERR_PTR(err); + } + + return file; +} + +struct file *shmem_create_from_object(struct drm_i915_gem_object *obj) +{ + struct file *file; + void *ptr; + + if (obj->ops == &i915_gem_shmem_ops) { + file = obj->base.filp; + atomic_long_inc(&file->f_count); + return file; + } + + ptr = i915_gem_object_pin_map(obj, I915_MAP_WB); + if (IS_ERR(ptr)) + return ERR_CAST(ptr); + + file = shmem_create_from_data("", ptr, obj->base.size); + i915_gem_object_unpin_map(obj); + + return file; +} + +void *shmem_pin_map(struct file *file) +{ + struct page **pages; + size_t n_pages, i; + void *vaddr; + + n_pages = file->f_mapping->host->i_size >> PAGE_SHIFT; + pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_KERNEL); + if (!pages) + return NULL; + + for (i = 0; i < n_pages; i++) { + pages[i] = shmem_read_mapping_page_gfp(file->f_mapping, i, + GFP_KERNEL); + if (IS_ERR(pages[i])) + goto err_page; + } + + vaddr = vmap(pages, n_pages, VM_MAP_PUT_PAGES, PAGE_KERNEL); + if (!vaddr) + goto err_page; + mapping_set_unevictable(file->f_mapping); + return vaddr; +err_page: + while (i--) + put_page(pages[i]); + kvfree(pages); + return NULL; +} + +void shmem_unpin_map(struct file *file, void *ptr) +{ + mapping_clear_unevictable(file->f_mapping); + vfree(ptr); +} + +static int __shmem_rw(struct file *file, loff_t off, + void *ptr, size_t len, + bool write) +{ + unsigned long pfn; + + for (pfn = off >> PAGE_SHIFT; len; pfn++) { + unsigned int this = + min_t(size_t, PAGE_SIZE - offset_in_page(off), len); + struct page *page; + void *vaddr; + + page = shmem_read_mapping_page_gfp(file->f_mapping, pfn, + GFP_KERNEL); + if (IS_ERR(page)) + return PTR_ERR(page); + + vaddr = kmap(page); + if (write) { + memcpy(vaddr + offset_in_page(off), ptr, this); + set_page_dirty(page); + } else { + memcpy(ptr, vaddr + offset_in_page(off), this); + } + mark_page_accessed(page); + kunmap(page); + put_page(page); + + len -= this; + ptr += this; + off = 0; + } + + return 0; +} + +int shmem_read(struct file *file, loff_t off, void *dst, size_t len) +{ + return __shmem_rw(file, off, dst, len, false); +} + +int shmem_write(struct file *file, loff_t off, void *src, size_t len) +{ + return __shmem_rw(file, off, src, len, true); +} + +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) +#include "st_shmem_utils.c" +#endif diff --git a/drivers/gpu/drm/i915/gt/shmem_utils.h b/drivers/gpu/drm/i915/gt/shmem_utils.h new file mode 100644 index 000000000..c1669170c --- /dev/null +++ b/drivers/gpu/drm/i915/gt/shmem_utils.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2020 Intel Corporation + */ + +#ifndef SHMEM_UTILS_H +#define SHMEM_UTILS_H + +#include <linux/types.h> + +struct drm_i915_gem_object; +struct file; + +struct file *shmem_create_from_data(const char *name, void *data, size_t len); +struct file *shmem_create_from_object(struct drm_i915_gem_object *obj); + +void *shmem_pin_map(struct file *file); +void shmem_unpin_map(struct file *file, void *ptr); + +int shmem_read(struct file *file, loff_t off, void *dst, size_t len); +int shmem_write(struct file *file, loff_t off, void *src, size_t len); + +#endif /* SHMEM_UTILS_H */ diff --git a/drivers/gpu/drm/i915/gt/st_shmem_utils.c b/drivers/gpu/drm/i915/gt/st_shmem_utils.c new file mode 100644 index 000000000..b279fe88b --- /dev/null +++ b/drivers/gpu/drm/i915/gt/st_shmem_utils.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2020 Intel Corporation + */ + +/* Just a quick and causal check of the shmem_utils API */ + +static int igt_shmem_basic(void *ignored) +{ + u32 datum = 0xdeadbeef, result; + struct file *file; + u32 *map; + int err; + + file = shmem_create_from_data("mock", &datum, sizeof(datum)); + if (IS_ERR(file)) + return PTR_ERR(file); + + result = 0; + err = shmem_read(file, 0, &result, sizeof(result)); + if (err) + goto out_file; + + if (result != datum) { + pr_err("Incorrect read back from shmemfs: %x != %x\n", + result, datum); + err = -EINVAL; + goto out_file; + } + + result = 0xc0ffee; + err = shmem_write(file, 0, &result, sizeof(result)); + if (err) + goto out_file; + + map = shmem_pin_map(file); + if (!map) { + err = -ENOMEM; + goto out_file; + } + + if (*map != result) { + pr_err("Incorrect read back via mmap of last write: %x != %x\n", + *map, result); + err = -EINVAL; + goto out_map; + } + +out_map: + shmem_unpin_map(file, map); +out_file: + fput(file); + return err; +} + +int shmem_utils_mock_selftests(void) +{ + static const struct i915_subtest tests[] = { + SUBTEST(igt_shmem_basic), + }; + + return i915_subtests(tests, NULL); +} diff --git a/drivers/gpu/drm/i915/gt/sysfs_engines.c b/drivers/gpu/drm/i915/gt/sysfs_engines.c new file mode 100644 index 000000000..535cc1169 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/sysfs_engines.c @@ -0,0 +1,539 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2019 Intel Corporation + */ + +#include <linux/kobject.h> +#include <linux/sysfs.h> + +#include "i915_drv.h" +#include "intel_engine.h" +#include "intel_engine_heartbeat.h" +#include "sysfs_engines.h" + +struct kobj_engine { + struct kobject base; + struct intel_engine_cs *engine; +}; + +static struct intel_engine_cs *kobj_to_engine(struct kobject *kobj) +{ + return container_of(kobj, struct kobj_engine, base)->engine; +} + +static ssize_t +name_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%s\n", kobj_to_engine(kobj)->name); +} + +static struct kobj_attribute name_attr = +__ATTR(name, 0444, name_show, NULL); + +static ssize_t +class_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", kobj_to_engine(kobj)->uabi_class); +} + +static struct kobj_attribute class_attr = +__ATTR(class, 0444, class_show, NULL); + +static ssize_t +inst_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", kobj_to_engine(kobj)->uabi_instance); +} + +static struct kobj_attribute inst_attr = +__ATTR(instance, 0444, inst_show, NULL); + +static ssize_t +mmio_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "0x%x\n", kobj_to_engine(kobj)->mmio_base); +} + +static struct kobj_attribute mmio_attr = +__ATTR(mmio_base, 0444, mmio_show, NULL); + +static const char * const vcs_caps[] = { + [ilog2(I915_VIDEO_CLASS_CAPABILITY_HEVC)] = "hevc", + [ilog2(I915_VIDEO_AND_ENHANCE_CLASS_CAPABILITY_SFC)] = "sfc", +}; + +static const char * const vecs_caps[] = { + [ilog2(I915_VIDEO_AND_ENHANCE_CLASS_CAPABILITY_SFC)] = "sfc", +}; + +static ssize_t repr_trim(char *buf, ssize_t len) +{ + /* Trim off the trailing space and replace with a newline */ + if (len > PAGE_SIZE) + len = PAGE_SIZE; + if (len > 0) + buf[len - 1] = '\n'; + + return len; +} + +static ssize_t +__caps_show(struct intel_engine_cs *engine, + u32 caps, char *buf, bool show_unknown) +{ + const char * const *repr; + int count, n; + ssize_t len; + + BUILD_BUG_ON(!typecheck(typeof(caps), engine->uabi_capabilities)); + + switch (engine->class) { + case VIDEO_DECODE_CLASS: + repr = vcs_caps; + count = ARRAY_SIZE(vcs_caps); + break; + + case VIDEO_ENHANCEMENT_CLASS: + repr = vecs_caps; + count = ARRAY_SIZE(vecs_caps); + break; + + default: + repr = NULL; + count = 0; + break; + } + GEM_BUG_ON(count > BITS_PER_TYPE(typeof(caps))); + + len = 0; + for_each_set_bit(n, + (unsigned long *)&caps, + show_unknown ? BITS_PER_TYPE(typeof(caps)) : count) { + if (n >= count || !repr[n]) { + if (GEM_WARN_ON(show_unknown)) + len += snprintf(buf + len, PAGE_SIZE - len, + "[%x] ", n); + } else { + len += snprintf(buf + len, PAGE_SIZE - len, + "%s ", repr[n]); + } + if (GEM_WARN_ON(len >= PAGE_SIZE)) + break; + } + return repr_trim(buf, len); +} + +static ssize_t +caps_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + struct intel_engine_cs *engine = kobj_to_engine(kobj); + + return __caps_show(engine, engine->uabi_capabilities, buf, true); +} + +static struct kobj_attribute caps_attr = +__ATTR(capabilities, 0444, caps_show, NULL); + +static ssize_t +all_caps_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + return __caps_show(kobj_to_engine(kobj), -1, buf, false); +} + +static struct kobj_attribute all_caps_attr = +__ATTR(known_capabilities, 0444, all_caps_show, NULL); + +static ssize_t +max_spin_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct intel_engine_cs *engine = kobj_to_engine(kobj); + unsigned long long duration; + int err; + + /* + * When waiting for a request, if is it currently being executed + * on the GPU, we busywait for a short while before sleeping. The + * premise is that most requests are short, and if it is already + * executing then there is a good chance that it will complete + * before we can setup the interrupt handler and go to sleep. + * We try to offset the cost of going to sleep, by first spinning + * on the request -- if it completed in less time than it would take + * to go sleep, process the interrupt and return back to the client, + * then we have saved the client some latency, albeit at the cost + * of spinning on an expensive CPU core. + * + * While we try to avoid waiting at all for a request that is unlikely + * to complete, deciding how long it is worth spinning is for is an + * arbitrary decision: trading off power vs latency. + */ + + err = kstrtoull(buf, 0, &duration); + if (err) + return err; + + if (duration > jiffies_to_nsecs(2)) + return -EINVAL; + + WRITE_ONCE(engine->props.max_busywait_duration_ns, duration); + + return count; +} + +static ssize_t +max_spin_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + struct intel_engine_cs *engine = kobj_to_engine(kobj); + + return sprintf(buf, "%lu\n", engine->props.max_busywait_duration_ns); +} + +static struct kobj_attribute max_spin_attr = +__ATTR(max_busywait_duration_ns, 0644, max_spin_show, max_spin_store); + +static ssize_t +max_spin_default(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + struct intel_engine_cs *engine = kobj_to_engine(kobj); + + return sprintf(buf, "%lu\n", engine->defaults.max_busywait_duration_ns); +} + +static struct kobj_attribute max_spin_def = +__ATTR(max_busywait_duration_ns, 0444, max_spin_default, NULL); + +static ssize_t +timeslice_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct intel_engine_cs *engine = kobj_to_engine(kobj); + unsigned long long duration; + int err; + + /* + * Execlists uses a scheduling quantum (a timeslice) to alternate + * execution between ready-to-run contexts of equal priority. This + * ensures that all users (though only if they of equal importance) + * have the opportunity to run and prevents livelocks where contexts + * may have implicit ordering due to userspace semaphores. + */ + + err = kstrtoull(buf, 0, &duration); + if (err) + return err; + + if (duration > jiffies_to_msecs(MAX_SCHEDULE_TIMEOUT)) + return -EINVAL; + + WRITE_ONCE(engine->props.timeslice_duration_ms, duration); + + if (execlists_active(&engine->execlists)) + set_timer_ms(&engine->execlists.timer, duration); + + return count; +} + +static ssize_t +timeslice_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + struct intel_engine_cs *engine = kobj_to_engine(kobj); + + return sprintf(buf, "%lu\n", engine->props.timeslice_duration_ms); +} + +static struct kobj_attribute timeslice_duration_attr = +__ATTR(timeslice_duration_ms, 0644, timeslice_show, timeslice_store); + +static ssize_t +timeslice_default(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + struct intel_engine_cs *engine = kobj_to_engine(kobj); + + return sprintf(buf, "%lu\n", engine->defaults.timeslice_duration_ms); +} + +static struct kobj_attribute timeslice_duration_def = +__ATTR(timeslice_duration_ms, 0444, timeslice_default, NULL); + +static ssize_t +stop_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct intel_engine_cs *engine = kobj_to_engine(kobj); + unsigned long long duration; + int err; + + /* + * When we allow ourselves to sleep before a GPU reset after disabling + * submission, even for a few milliseconds, gives an innocent context + * the opportunity to clear the GPU before the reset occurs. However, + * how long to sleep depends on the typical non-preemptible duration + * (a similar problem to determining the ideal preempt-reset timeout + * or even the heartbeat interval). + */ + + err = kstrtoull(buf, 0, &duration); + if (err) + return err; + + if (duration > jiffies_to_msecs(MAX_SCHEDULE_TIMEOUT)) + return -EINVAL; + + WRITE_ONCE(engine->props.stop_timeout_ms, duration); + return count; +} + +static ssize_t +stop_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + struct intel_engine_cs *engine = kobj_to_engine(kobj); + + return sprintf(buf, "%lu\n", engine->props.stop_timeout_ms); +} + +static struct kobj_attribute stop_timeout_attr = +__ATTR(stop_timeout_ms, 0644, stop_show, stop_store); + +static ssize_t +stop_default(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + struct intel_engine_cs *engine = kobj_to_engine(kobj); + + return sprintf(buf, "%lu\n", engine->defaults.stop_timeout_ms); +} + +static struct kobj_attribute stop_timeout_def = +__ATTR(stop_timeout_ms, 0444, stop_default, NULL); + +static ssize_t +preempt_timeout_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct intel_engine_cs *engine = kobj_to_engine(kobj); + unsigned long long timeout; + int err; + + /* + * After initialising a preemption request, we give the current + * resident a small amount of time to vacate the GPU. The preemption + * request is for a higher priority context and should be immediate to + * maintain high quality of service (and avoid priority inversion). + * However, the preemption granularity of the GPU can be quite coarse + * and so we need a compromise. + */ + + err = kstrtoull(buf, 0, &timeout); + if (err) + return err; + + if (timeout > jiffies_to_msecs(MAX_SCHEDULE_TIMEOUT)) + return -EINVAL; + + WRITE_ONCE(engine->props.preempt_timeout_ms, timeout); + + if (READ_ONCE(engine->execlists.pending[0])) + set_timer_ms(&engine->execlists.preempt, timeout); + + return count; +} + +static ssize_t +preempt_timeout_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct intel_engine_cs *engine = kobj_to_engine(kobj); + + return sprintf(buf, "%lu\n", engine->props.preempt_timeout_ms); +} + +static struct kobj_attribute preempt_timeout_attr = +__ATTR(preempt_timeout_ms, 0644, preempt_timeout_show, preempt_timeout_store); + +static ssize_t +preempt_timeout_default(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct intel_engine_cs *engine = kobj_to_engine(kobj); + + return sprintf(buf, "%lu\n", engine->defaults.preempt_timeout_ms); +} + +static struct kobj_attribute preempt_timeout_def = +__ATTR(preempt_timeout_ms, 0444, preempt_timeout_default, NULL); + +static ssize_t +heartbeat_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct intel_engine_cs *engine = kobj_to_engine(kobj); + unsigned long long delay; + int err; + + /* + * We monitor the health of the system via periodic heartbeat pulses. + * The pulses also provide the opportunity to perform garbage + * collection. However, we interpret an incomplete pulse (a missed + * heartbeat) as an indication that the system is no longer responsive, + * i.e. hung, and perform an engine or full GPU reset. Given that the + * preemption granularity can be very coarse on a system, the optimal + * value for any workload is unknowable! + */ + + err = kstrtoull(buf, 0, &delay); + if (err) + return err; + + if (delay >= jiffies_to_msecs(MAX_SCHEDULE_TIMEOUT)) + return -EINVAL; + + err = intel_engine_set_heartbeat(engine, delay); + if (err) + return err; + + return count; +} + +static ssize_t +heartbeat_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + struct intel_engine_cs *engine = kobj_to_engine(kobj); + + return sprintf(buf, "%lu\n", engine->props.heartbeat_interval_ms); +} + +static struct kobj_attribute heartbeat_interval_attr = +__ATTR(heartbeat_interval_ms, 0644, heartbeat_show, heartbeat_store); + +static ssize_t +heartbeat_default(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + struct intel_engine_cs *engine = kobj_to_engine(kobj); + + return sprintf(buf, "%lu\n", engine->defaults.heartbeat_interval_ms); +} + +static struct kobj_attribute heartbeat_interval_def = +__ATTR(heartbeat_interval_ms, 0444, heartbeat_default, NULL); + +static void kobj_engine_release(struct kobject *kobj) +{ + kfree(kobj); +} + +static struct kobj_type kobj_engine_type = { + .release = kobj_engine_release, + .sysfs_ops = &kobj_sysfs_ops +}; + +static struct kobject * +kobj_engine(struct kobject *dir, struct intel_engine_cs *engine) +{ + struct kobj_engine *ke; + + ke = kzalloc(sizeof(*ke), GFP_KERNEL); + if (!ke) + return NULL; + + kobject_init(&ke->base, &kobj_engine_type); + ke->engine = engine; + + if (kobject_add(&ke->base, dir, "%s", engine->name)) { + kobject_put(&ke->base); + return NULL; + } + + /* xfer ownership to sysfs tree */ + return &ke->base; +} + +static void add_defaults(struct kobj_engine *parent) +{ + static const struct attribute *files[] = { + &max_spin_def.attr, + &stop_timeout_def.attr, +#if CONFIG_DRM_I915_HEARTBEAT_INTERVAL + &heartbeat_interval_def.attr, +#endif + NULL + }; + struct kobj_engine *ke; + + ke = kzalloc(sizeof(*ke), GFP_KERNEL); + if (!ke) + return; + + kobject_init(&ke->base, &kobj_engine_type); + ke->engine = parent->engine; + + if (kobject_add(&ke->base, &parent->base, "%s", ".defaults")) { + kobject_put(&ke->base); + return; + } + + if (sysfs_create_files(&ke->base, files)) + return; + + if (intel_engine_has_timeslices(ke->engine) && + sysfs_create_file(&ke->base, ×lice_duration_def.attr)) + return; + + if (intel_engine_has_preempt_reset(ke->engine) && + sysfs_create_file(&ke->base, &preempt_timeout_def.attr)) + return; +} + +void intel_engines_add_sysfs(struct drm_i915_private *i915) +{ + static const struct attribute *files[] = { + &name_attr.attr, + &class_attr.attr, + &inst_attr.attr, + &mmio_attr.attr, + &caps_attr.attr, + &all_caps_attr.attr, + &max_spin_attr.attr, + &stop_timeout_attr.attr, +#if CONFIG_DRM_I915_HEARTBEAT_INTERVAL + &heartbeat_interval_attr.attr, +#endif + NULL + }; + + struct device *kdev = i915->drm.primary->kdev; + struct intel_engine_cs *engine; + struct kobject *dir; + + dir = kobject_create_and_add("engine", &kdev->kobj); + if (!dir) + return; + + for_each_uabi_engine(engine, i915) { + struct kobject *kobj; + + kobj = kobj_engine(dir, engine); + if (!kobj) + goto err_engine; + + if (sysfs_create_files(kobj, files)) + goto err_object; + + if (intel_engine_has_timeslices(engine) && + sysfs_create_file(kobj, ×lice_duration_attr.attr)) + goto err_engine; + + if (intel_engine_has_preempt_reset(engine) && + sysfs_create_file(kobj, &preempt_timeout_attr.attr)) + goto err_engine; + + add_defaults(container_of(kobj, struct kobj_engine, base)); + + if (0) { +err_object: + kobject_put(kobj); +err_engine: + dev_err(kdev, "Failed to add sysfs engine '%s'\n", + engine->name); + break; + } + } +} diff --git a/drivers/gpu/drm/i915/gt/sysfs_engines.h b/drivers/gpu/drm/i915/gt/sysfs_engines.h new file mode 100644 index 000000000..9546fffe0 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/sysfs_engines.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2019 Intel Corporation + */ + +#ifndef INTEL_ENGINE_SYSFS_H +#define INTEL_ENGINE_SYSFS_H + +struct drm_i915_private; + +void intel_engines_add_sysfs(struct drm_i915_private *i915); + +#endif /* INTEL_ENGINE_SYSFS_H */ diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.c b/drivers/gpu/drm/i915/gt/uc/intel_guc.c new file mode 100644 index 000000000..942c7c187 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.c @@ -0,0 +1,769 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2014-2019 Intel Corporation + */ + +#include "gt/intel_gt.h" +#include "gt/intel_gt_irq.h" +#include "gt/intel_gt_pm_irq.h" +#include "intel_guc.h" +#include "intel_guc_ads.h" +#include "intel_guc_submission.h" +#include "i915_drv.h" + +/** + * DOC: GuC + * + * The GuC is a microcontroller inside the GT HW, introduced in gen9. The GuC is + * designed to offload some of the functionality usually performed by the host + * driver; currently the main operations it can take care of are: + * + * - Authentication of the HuC, which is required to fully enable HuC usage. + * - Low latency graphics context scheduling (a.k.a. GuC submission). + * - GT Power management. + * + * The enable_guc module parameter can be used to select which of those + * operations to enable within GuC. Note that not all the operations are + * supported on all gen9+ platforms. + * + * Enabling the GuC is not mandatory and therefore the firmware is only loaded + * if at least one of the operations is selected. However, not loading the GuC + * might result in the loss of some features that do require the GuC (currently + * just the HuC, but more are expected to land in the future). + */ + +void intel_guc_notify(struct intel_guc *guc) +{ + struct intel_gt *gt = guc_to_gt(guc); + + /* + * On Gen11+, the value written to the register is passes as a payload + * to the FW. However, the FW currently treats all values the same way + * (H2G interrupt), so we can just write the value that the HW expects + * on older gens. + */ + intel_uncore_write(gt->uncore, guc->notify_reg, GUC_SEND_TRIGGER); +} + +static inline i915_reg_t guc_send_reg(struct intel_guc *guc, u32 i) +{ + GEM_BUG_ON(!guc->send_regs.base); + GEM_BUG_ON(!guc->send_regs.count); + GEM_BUG_ON(i >= guc->send_regs.count); + + return _MMIO(guc->send_regs.base + 4 * i); +} + +void intel_guc_init_send_regs(struct intel_guc *guc) +{ + struct intel_gt *gt = guc_to_gt(guc); + enum forcewake_domains fw_domains = 0; + unsigned int i; + + if (INTEL_GEN(gt->i915) >= 11) { + guc->send_regs.base = + i915_mmio_reg_offset(GEN11_SOFT_SCRATCH(0)); + guc->send_regs.count = GEN11_SOFT_SCRATCH_COUNT; + } else { + guc->send_regs.base = i915_mmio_reg_offset(SOFT_SCRATCH(0)); + guc->send_regs.count = GUC_MAX_MMIO_MSG_LEN; + BUILD_BUG_ON(GUC_MAX_MMIO_MSG_LEN > SOFT_SCRATCH_COUNT); + } + + for (i = 0; i < guc->send_regs.count; i++) { + fw_domains |= intel_uncore_forcewake_for_reg(gt->uncore, + guc_send_reg(guc, i), + FW_REG_READ | FW_REG_WRITE); + } + guc->send_regs.fw_domains = fw_domains; +} + +static void gen9_reset_guc_interrupts(struct intel_guc *guc) +{ + struct intel_gt *gt = guc_to_gt(guc); + + assert_rpm_wakelock_held(>->i915->runtime_pm); + + spin_lock_irq(>->irq_lock); + gen6_gt_pm_reset_iir(gt, gt->pm_guc_events); + spin_unlock_irq(>->irq_lock); +} + +static void gen9_enable_guc_interrupts(struct intel_guc *guc) +{ + struct intel_gt *gt = guc_to_gt(guc); + + assert_rpm_wakelock_held(>->i915->runtime_pm); + + spin_lock_irq(>->irq_lock); + if (!guc->interrupts.enabled) { + WARN_ON_ONCE(intel_uncore_read(gt->uncore, GEN8_GT_IIR(2)) & + gt->pm_guc_events); + guc->interrupts.enabled = true; + gen6_gt_pm_enable_irq(gt, gt->pm_guc_events); + } + spin_unlock_irq(>->irq_lock); +} + +static void gen9_disable_guc_interrupts(struct intel_guc *guc) +{ + struct intel_gt *gt = guc_to_gt(guc); + + assert_rpm_wakelock_held(>->i915->runtime_pm); + + spin_lock_irq(>->irq_lock); + guc->interrupts.enabled = false; + + gen6_gt_pm_disable_irq(gt, gt->pm_guc_events); + + spin_unlock_irq(>->irq_lock); + intel_synchronize_irq(gt->i915); + + gen9_reset_guc_interrupts(guc); +} + +static void gen11_reset_guc_interrupts(struct intel_guc *guc) +{ + struct intel_gt *gt = guc_to_gt(guc); + + spin_lock_irq(>->irq_lock); + gen11_gt_reset_one_iir(gt, 0, GEN11_GUC); + spin_unlock_irq(>->irq_lock); +} + +static void gen11_enable_guc_interrupts(struct intel_guc *guc) +{ + struct intel_gt *gt = guc_to_gt(guc); + + spin_lock_irq(>->irq_lock); + if (!guc->interrupts.enabled) { + u32 events = REG_FIELD_PREP(ENGINE1_MASK, GUC_INTR_GUC2HOST); + + WARN_ON_ONCE(gen11_gt_reset_one_iir(gt, 0, GEN11_GUC)); + intel_uncore_write(gt->uncore, + GEN11_GUC_SG_INTR_ENABLE, events); + intel_uncore_write(gt->uncore, + GEN11_GUC_SG_INTR_MASK, ~events); + guc->interrupts.enabled = true; + } + spin_unlock_irq(>->irq_lock); +} + +static void gen11_disable_guc_interrupts(struct intel_guc *guc) +{ + struct intel_gt *gt = guc_to_gt(guc); + + spin_lock_irq(>->irq_lock); + guc->interrupts.enabled = false; + + intel_uncore_write(gt->uncore, GEN11_GUC_SG_INTR_MASK, ~0); + intel_uncore_write(gt->uncore, GEN11_GUC_SG_INTR_ENABLE, 0); + + spin_unlock_irq(>->irq_lock); + intel_synchronize_irq(gt->i915); + + gen11_reset_guc_interrupts(guc); +} + +void intel_guc_init_early(struct intel_guc *guc) +{ + struct drm_i915_private *i915 = guc_to_gt(guc)->i915; + + intel_uc_fw_init_early(&guc->fw, INTEL_UC_FW_TYPE_GUC); + intel_guc_ct_init_early(&guc->ct); + intel_guc_log_init_early(&guc->log); + intel_guc_submission_init_early(guc); + + mutex_init(&guc->send_mutex); + spin_lock_init(&guc->irq_lock); + if (INTEL_GEN(i915) >= 11) { + guc->notify_reg = GEN11_GUC_HOST_INTERRUPT; + guc->interrupts.reset = gen11_reset_guc_interrupts; + guc->interrupts.enable = gen11_enable_guc_interrupts; + guc->interrupts.disable = gen11_disable_guc_interrupts; + } else { + guc->notify_reg = GUC_SEND_INTERRUPT; + guc->interrupts.reset = gen9_reset_guc_interrupts; + guc->interrupts.enable = gen9_enable_guc_interrupts; + guc->interrupts.disable = gen9_disable_guc_interrupts; + } +} + +static u32 guc_ctl_debug_flags(struct intel_guc *guc) +{ + u32 level = intel_guc_log_get_level(&guc->log); + u32 flags = 0; + + if (!GUC_LOG_LEVEL_IS_VERBOSE(level)) + flags |= GUC_LOG_DISABLED; + else + flags |= GUC_LOG_LEVEL_TO_VERBOSITY(level) << + GUC_LOG_VERBOSITY_SHIFT; + + return flags; +} + +static u32 guc_ctl_feature_flags(struct intel_guc *guc) +{ + u32 flags = 0; + + if (!intel_guc_submission_is_used(guc)) + flags |= GUC_CTL_DISABLE_SCHEDULER; + + return flags; +} + +static u32 guc_ctl_ctxinfo_flags(struct intel_guc *guc) +{ + u32 flags = 0; + + if (intel_guc_submission_is_used(guc)) { + u32 ctxnum, base; + + base = intel_guc_ggtt_offset(guc, guc->stage_desc_pool); + ctxnum = GUC_MAX_STAGE_DESCRIPTORS / 16; + + base >>= PAGE_SHIFT; + flags |= (base << GUC_CTL_BASE_ADDR_SHIFT) | + (ctxnum << GUC_CTL_CTXNUM_IN16_SHIFT); + } + return flags; +} + +static u32 guc_ctl_log_params_flags(struct intel_guc *guc) +{ + u32 offset = intel_guc_ggtt_offset(guc, guc->log.vma) >> PAGE_SHIFT; + u32 flags; + + #if (((CRASH_BUFFER_SIZE) % SZ_1M) == 0) + #define UNIT SZ_1M + #define FLAG GUC_LOG_ALLOC_IN_MEGABYTE + #else + #define UNIT SZ_4K + #define FLAG 0 + #endif + + BUILD_BUG_ON(!CRASH_BUFFER_SIZE); + BUILD_BUG_ON(!IS_ALIGNED(CRASH_BUFFER_SIZE, UNIT)); + BUILD_BUG_ON(!DPC_BUFFER_SIZE); + BUILD_BUG_ON(!IS_ALIGNED(DPC_BUFFER_SIZE, UNIT)); + BUILD_BUG_ON(!ISR_BUFFER_SIZE); + BUILD_BUG_ON(!IS_ALIGNED(ISR_BUFFER_SIZE, UNIT)); + + BUILD_BUG_ON((CRASH_BUFFER_SIZE / UNIT - 1) > + (GUC_LOG_CRASH_MASK >> GUC_LOG_CRASH_SHIFT)); + BUILD_BUG_ON((DPC_BUFFER_SIZE / UNIT - 1) > + (GUC_LOG_DPC_MASK >> GUC_LOG_DPC_SHIFT)); + BUILD_BUG_ON((ISR_BUFFER_SIZE / UNIT - 1) > + (GUC_LOG_ISR_MASK >> GUC_LOG_ISR_SHIFT)); + + flags = GUC_LOG_VALID | + GUC_LOG_NOTIFY_ON_HALF_FULL | + FLAG | + ((CRASH_BUFFER_SIZE / UNIT - 1) << GUC_LOG_CRASH_SHIFT) | + ((DPC_BUFFER_SIZE / UNIT - 1) << GUC_LOG_DPC_SHIFT) | + ((ISR_BUFFER_SIZE / UNIT - 1) << GUC_LOG_ISR_SHIFT) | + (offset << GUC_LOG_BUF_ADDR_SHIFT); + + #undef UNIT + #undef FLAG + + return flags; +} + +static u32 guc_ctl_ads_flags(struct intel_guc *guc) +{ + u32 ads = intel_guc_ggtt_offset(guc, guc->ads_vma) >> PAGE_SHIFT; + u32 flags = ads << GUC_ADS_ADDR_SHIFT; + + return flags; +} + +/* + * Initialise the GuC parameter block before starting the firmware + * transfer. These parameters are read by the firmware on startup + * and cannot be changed thereafter. + */ +static void guc_init_params(struct intel_guc *guc) +{ + u32 *params = guc->params; + int i; + + BUILD_BUG_ON(sizeof(guc->params) != GUC_CTL_MAX_DWORDS * sizeof(u32)); + + params[GUC_CTL_CTXINFO] = guc_ctl_ctxinfo_flags(guc); + params[GUC_CTL_LOG_PARAMS] = guc_ctl_log_params_flags(guc); + params[GUC_CTL_FEATURE] = guc_ctl_feature_flags(guc); + params[GUC_CTL_DEBUG] = guc_ctl_debug_flags(guc); + params[GUC_CTL_ADS] = guc_ctl_ads_flags(guc); + + for (i = 0; i < GUC_CTL_MAX_DWORDS; i++) + DRM_DEBUG_DRIVER("param[%2d] = %#x\n", i, params[i]); +} + +/* + * Initialise the GuC parameter block before starting the firmware + * transfer. These parameters are read by the firmware on startup + * and cannot be changed thereafter. + */ +void intel_guc_write_params(struct intel_guc *guc) +{ + struct intel_uncore *uncore = guc_to_gt(guc)->uncore; + int i; + + /* + * All SOFT_SCRATCH registers are in FORCEWAKE_BLITTER domain and + * they are power context saved so it's ok to release forcewake + * when we are done here and take it again at xfer time. + */ + intel_uncore_forcewake_get(uncore, FORCEWAKE_BLITTER); + + intel_uncore_write(uncore, SOFT_SCRATCH(0), 0); + + for (i = 0; i < GUC_CTL_MAX_DWORDS; i++) + intel_uncore_write(uncore, SOFT_SCRATCH(1 + i), guc->params[i]); + + intel_uncore_forcewake_put(uncore, FORCEWAKE_BLITTER); +} + +int intel_guc_init(struct intel_guc *guc) +{ + struct intel_gt *gt = guc_to_gt(guc); + int ret; + + ret = intel_uc_fw_init(&guc->fw); + if (ret) + goto out; + + ret = intel_guc_log_create(&guc->log); + if (ret) + goto err_fw; + + ret = intel_guc_ads_create(guc); + if (ret) + goto err_log; + GEM_BUG_ON(!guc->ads_vma); + + ret = intel_guc_ct_init(&guc->ct); + if (ret) + goto err_ads; + + if (intel_guc_submission_is_used(guc)) { + /* + * This is stuff we need to have available at fw load time + * if we are planning to enable submission later + */ + ret = intel_guc_submission_init(guc); + if (ret) + goto err_ct; + } + + /* now that everything is perma-pinned, initialize the parameters */ + guc_init_params(guc); + + /* We need to notify the guc whenever we change the GGTT */ + i915_ggtt_enable_guc(gt->ggtt); + + intel_uc_fw_change_status(&guc->fw, INTEL_UC_FIRMWARE_LOADABLE); + + return 0; + +err_ct: + intel_guc_ct_fini(&guc->ct); +err_ads: + intel_guc_ads_destroy(guc); +err_log: + intel_guc_log_destroy(&guc->log); +err_fw: + intel_uc_fw_fini(&guc->fw); +out: + i915_probe_error(gt->i915, "failed with %d\n", ret); + return ret; +} + +void intel_guc_fini(struct intel_guc *guc) +{ + struct intel_gt *gt = guc_to_gt(guc); + + if (!intel_uc_fw_is_loadable(&guc->fw)) + return; + + i915_ggtt_disable_guc(gt->ggtt); + + if (intel_guc_submission_is_used(guc)) + intel_guc_submission_fini(guc); + + intel_guc_ct_fini(&guc->ct); + + intel_guc_ads_destroy(guc); + intel_guc_log_destroy(&guc->log); + intel_uc_fw_fini(&guc->fw); +} + +/* + * This function implements the MMIO based host to GuC interface. + */ +int intel_guc_send_mmio(struct intel_guc *guc, const u32 *action, u32 len, + u32 *response_buf, u32 response_buf_size) +{ + struct intel_uncore *uncore = guc_to_gt(guc)->uncore; + u32 status; + int i; + int ret; + + GEM_BUG_ON(!len); + GEM_BUG_ON(len > guc->send_regs.count); + + /* We expect only action code */ + GEM_BUG_ON(*action & ~INTEL_GUC_MSG_CODE_MASK); + + /* If CT is available, we expect to use MMIO only during init/fini */ + GEM_BUG_ON(*action != INTEL_GUC_ACTION_REGISTER_COMMAND_TRANSPORT_BUFFER && + *action != INTEL_GUC_ACTION_DEREGISTER_COMMAND_TRANSPORT_BUFFER); + + mutex_lock(&guc->send_mutex); + intel_uncore_forcewake_get(uncore, guc->send_regs.fw_domains); + + for (i = 0; i < len; i++) + intel_uncore_write(uncore, guc_send_reg(guc, i), action[i]); + + intel_uncore_posting_read(uncore, guc_send_reg(guc, i - 1)); + + intel_guc_notify(guc); + + /* + * No GuC command should ever take longer than 10ms. + * Fast commands should still complete in 10us. + */ + ret = __intel_wait_for_register_fw(uncore, + guc_send_reg(guc, 0), + INTEL_GUC_MSG_TYPE_MASK, + INTEL_GUC_MSG_TYPE_RESPONSE << + INTEL_GUC_MSG_TYPE_SHIFT, + 10, 10, &status); + /* If GuC explicitly returned an error, convert it to -EIO */ + if (!ret && !INTEL_GUC_MSG_IS_RESPONSE_SUCCESS(status)) + ret = -EIO; + + if (ret) { + DRM_ERROR("MMIO: GuC action %#x failed with error %d %#x\n", + action[0], ret, status); + goto out; + } + + if (response_buf) { + int count = min(response_buf_size, guc->send_regs.count - 1); + + for (i = 0; i < count; i++) + response_buf[i] = intel_uncore_read(uncore, + guc_send_reg(guc, i + 1)); + } + + /* Use data from the GuC response as our return value */ + ret = INTEL_GUC_MSG_TO_DATA(status); + +out: + intel_uncore_forcewake_put(uncore, guc->send_regs.fw_domains); + mutex_unlock(&guc->send_mutex); + + return ret; +} + +int intel_guc_to_host_process_recv_msg(struct intel_guc *guc, + const u32 *payload, u32 len) +{ + u32 msg; + + if (unlikely(!len)) + return -EPROTO; + + /* Make sure to handle only enabled messages */ + msg = payload[0] & guc->msg_enabled_mask; + + if (msg & (INTEL_GUC_RECV_MSG_FLUSH_LOG_BUFFER | + INTEL_GUC_RECV_MSG_CRASH_DUMP_POSTED)) + intel_guc_log_handle_flush_event(&guc->log); + + return 0; +} + +int intel_guc_sample_forcewake(struct intel_guc *guc) +{ + struct drm_i915_private *dev_priv = guc_to_gt(guc)->i915; + u32 action[2]; + + action[0] = INTEL_GUC_ACTION_SAMPLE_FORCEWAKE; + /* WaRsDisableCoarsePowerGating:skl,cnl */ + if (!HAS_RC6(dev_priv) || NEEDS_WaRsDisableCoarsePowerGating(dev_priv)) + action[1] = 0; + else + /* bit 0 and 1 are for Render and Media domain separately */ + action[1] = GUC_FORCEWAKE_RENDER | GUC_FORCEWAKE_MEDIA; + + return intel_guc_send(guc, action, ARRAY_SIZE(action)); +} + +/** + * intel_guc_auth_huc() - Send action to GuC to authenticate HuC ucode + * @guc: intel_guc structure + * @rsa_offset: rsa offset w.r.t ggtt base of huc vma + * + * Triggers a HuC firmware authentication request to the GuC via intel_guc_send + * INTEL_GUC_ACTION_AUTHENTICATE_HUC interface. This function is invoked by + * intel_huc_auth(). + * + * Return: non-zero code on error + */ +int intel_guc_auth_huc(struct intel_guc *guc, u32 rsa_offset) +{ + u32 action[] = { + INTEL_GUC_ACTION_AUTHENTICATE_HUC, + rsa_offset + }; + + return intel_guc_send(guc, action, ARRAY_SIZE(action)); +} + +/** + * intel_guc_suspend() - notify GuC entering suspend state + * @guc: the guc + */ +int intel_guc_suspend(struct intel_guc *guc) +{ + struct intel_uncore *uncore = guc_to_gt(guc)->uncore; + int ret; + u32 status; + u32 action[] = { + INTEL_GUC_ACTION_ENTER_S_STATE, + GUC_POWER_D1, /* any value greater than GUC_POWER_D0 */ + }; + + /* + * If GuC communication is enabled but submission is not supported, + * we do not need to suspend the GuC. + */ + if (!intel_guc_submission_is_used(guc) || !intel_guc_is_ready(guc)) + return 0; + + /* + * The ENTER_S_STATE action queues the save/restore operation in GuC FW + * and then returns, so waiting on the H2G is not enough to guarantee + * GuC is done. When all the processing is done, GuC writes + * INTEL_GUC_SLEEP_STATE_SUCCESS to scratch register 14, so we can poll + * on that. Note that GuC does not ensure that the value in the register + * is different from INTEL_GUC_SLEEP_STATE_SUCCESS while the action is + * in progress so we need to take care of that ourselves as well. + */ + + intel_uncore_write(uncore, SOFT_SCRATCH(14), + INTEL_GUC_SLEEP_STATE_INVALID_MASK); + + ret = intel_guc_send(guc, action, ARRAY_SIZE(action)); + if (ret) + return ret; + + ret = __intel_wait_for_register(uncore, SOFT_SCRATCH(14), + INTEL_GUC_SLEEP_STATE_INVALID_MASK, + 0, 0, 10, &status); + if (ret) + return ret; + + if (status != INTEL_GUC_SLEEP_STATE_SUCCESS) { + DRM_ERROR("GuC failed to change sleep state. " + "action=0x%x, err=%u\n", + action[0], status); + return -EIO; + } + + return 0; +} + +/** + * intel_guc_reset_engine() - ask GuC to reset an engine + * @guc: intel_guc structure + * @engine: engine to be reset + */ +int intel_guc_reset_engine(struct intel_guc *guc, + struct intel_engine_cs *engine) +{ + /* XXX: to be implemented with submission interface rework */ + + return -ENODEV; +} + +/** + * intel_guc_resume() - notify GuC resuming from suspend state + * @guc: the guc + */ +int intel_guc_resume(struct intel_guc *guc) +{ + u32 action[] = { + INTEL_GUC_ACTION_EXIT_S_STATE, + GUC_POWER_D0, + }; + + /* + * If GuC communication is enabled but submission is not supported, + * we do not need to resume the GuC but we do need to enable the + * GuC communication on resume (above). + */ + if (!intel_guc_submission_is_used(guc) || !intel_guc_is_ready(guc)) + return 0; + + return intel_guc_send(guc, action, ARRAY_SIZE(action)); +} + +/** + * DOC: GuC Memory Management + * + * GuC can't allocate any memory for its own usage, so all the allocations must + * be handled by the host driver. GuC accesses the memory via the GGTT, with the + * exception of the top and bottom parts of the 4GB address space, which are + * instead re-mapped by the GuC HW to memory location of the FW itself (WOPCM) + * or other parts of the HW. The driver must take care not to place objects that + * the GuC is going to access in these reserved ranges. The layout of the GuC + * address space is shown below: + * + * :: + * + * +===========> +====================+ <== FFFF_FFFF + * ^ | Reserved | + * | +====================+ <== GUC_GGTT_TOP + * | | | + * | | DRAM | + * GuC | | + * Address +===> +====================+ <== GuC ggtt_pin_bias + * Space ^ | | + * | | | | + * | GuC | GuC | + * | WOPCM | WOPCM | + * | Size | | + * | | | | + * v v | | + * +=======+===> +====================+ <== 0000_0000 + * + * The lower part of GuC Address Space [0, ggtt_pin_bias) is mapped to GuC WOPCM + * while upper part of GuC Address Space [ggtt_pin_bias, GUC_GGTT_TOP) is mapped + * to DRAM. The value of the GuC ggtt_pin_bias is the GuC WOPCM size. + */ + +/** + * intel_guc_allocate_vma() - Allocate a GGTT VMA for GuC usage + * @guc: the guc + * @size: size of area to allocate (both virtual space and memory) + * + * This is a wrapper to create an object for use with the GuC. In order to + * use it inside the GuC, an object needs to be pinned lifetime, so we allocate + * both some backing storage and a range inside the Global GTT. We must pin + * it in the GGTT somewhere other than than [0, GUC ggtt_pin_bias) because that + * range is reserved inside GuC. + * + * Return: A i915_vma if successful, otherwise an ERR_PTR. + */ +struct i915_vma *intel_guc_allocate_vma(struct intel_guc *guc, u32 size) +{ + struct intel_gt *gt = guc_to_gt(guc); + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + u64 flags; + int ret; + + obj = i915_gem_object_create_shmem(gt->i915, size); + if (IS_ERR(obj)) + return ERR_CAST(obj); + + vma = i915_vma_instance(obj, >->ggtt->vm, NULL); + if (IS_ERR(vma)) + goto err; + + flags = PIN_OFFSET_BIAS | i915_ggtt_pin_bias(vma); + ret = i915_ggtt_pin(vma, NULL, 0, flags); + if (ret) { + vma = ERR_PTR(ret); + goto err; + } + + return i915_vma_make_unshrinkable(vma); + +err: + i915_gem_object_put(obj); + return vma; +} + +/** + * intel_guc_allocate_and_map_vma() - Allocate and map VMA for GuC usage + * @guc: the guc + * @size: size of area to allocate (both virtual space and memory) + * @out_vma: return variable for the allocated vma pointer + * @out_vaddr: return variable for the obj mapping + * + * This wrapper calls intel_guc_allocate_vma() and then maps the allocated + * object with I915_MAP_WB. + * + * Return: 0 if successful, a negative errno code otherwise. + */ +int intel_guc_allocate_and_map_vma(struct intel_guc *guc, u32 size, + struct i915_vma **out_vma, void **out_vaddr) +{ + struct i915_vma *vma; + void *vaddr; + + vma = intel_guc_allocate_vma(guc, size); + if (IS_ERR(vma)) + return PTR_ERR(vma); + + vaddr = i915_gem_object_pin_map(vma->obj, I915_MAP_WB); + if (IS_ERR(vaddr)) { + i915_vma_unpin_and_release(&vma, 0); + return PTR_ERR(vaddr); + } + + *out_vma = vma; + *out_vaddr = vaddr; + + return 0; +} + +/** + * intel_guc_load_status - dump information about GuC load status + * @guc: the GuC + * @p: the &drm_printer + * + * Pretty printer for GuC load status. + */ +void intel_guc_load_status(struct intel_guc *guc, struct drm_printer *p) +{ + struct intel_gt *gt = guc_to_gt(guc); + struct intel_uncore *uncore = gt->uncore; + intel_wakeref_t wakeref; + + if (!intel_guc_is_supported(guc)) { + drm_printf(p, "GuC not supported\n"); + return; + } + + if (!intel_guc_is_wanted(guc)) { + drm_printf(p, "GuC disabled\n"); + return; + } + + intel_uc_fw_dump(&guc->fw, p); + + with_intel_runtime_pm(uncore->rpm, wakeref) { + u32 status = intel_uncore_read(uncore, GUC_STATUS); + u32 i; + + drm_printf(p, "\nGuC status 0x%08x:\n", status); + drm_printf(p, "\tBootrom status = 0x%x\n", + (status & GS_BOOTROM_MASK) >> GS_BOOTROM_SHIFT); + drm_printf(p, "\tuKernel status = 0x%x\n", + (status & GS_UKERNEL_MASK) >> GS_UKERNEL_SHIFT); + drm_printf(p, "\tMIA Core status = 0x%x\n", + (status & GS_MIA_MASK) >> GS_MIA_SHIFT); + drm_puts(p, "\nScratch registers:\n"); + for (i = 0; i < 16; i++) { + drm_printf(p, "\t%2d: \t0x%x\n", + i, intel_uncore_read(uncore, SOFT_SCRATCH(i))); + } + } +} diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h new file mode 100644 index 000000000..e84ab67b3 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h @@ -0,0 +1,200 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2014-2019 Intel Corporation + */ + +#ifndef _INTEL_GUC_H_ +#define _INTEL_GUC_H_ + +#include "intel_uncore.h" +#include "intel_guc_fw.h" +#include "intel_guc_fwif.h" +#include "intel_guc_ct.h" +#include "intel_guc_log.h" +#include "intel_guc_reg.h" +#include "intel_uc_fw.h" +#include "i915_utils.h" +#include "i915_vma.h" + +struct __guc_ads_blob; + +/* + * Top level structure of GuC. It handles firmware loading and manages client + * pool. intel_guc owns a intel_guc_client to replace the legacy ExecList + * submission. + */ +struct intel_guc { + struct intel_uc_fw fw; + struct intel_guc_log log; + struct intel_guc_ct ct; + + /* intel_guc_recv interrupt related state */ + spinlock_t irq_lock; + unsigned int msg_enabled_mask; + + struct { + bool enabled; + void (*reset)(struct intel_guc *guc); + void (*enable)(struct intel_guc *guc); + void (*disable)(struct intel_guc *guc); + } interrupts; + + bool submission_selected; + + struct i915_vma *ads_vma; + struct __guc_ads_blob *ads_blob; + + struct i915_vma *stage_desc_pool; + void *stage_desc_pool_vaddr; + + struct i915_vma *workqueue; + void *workqueue_vaddr; + spinlock_t wq_lock; + + struct i915_vma *proc_desc; + void *proc_desc_vaddr; + + /* Control params for fw initialization */ + u32 params[GUC_CTL_MAX_DWORDS]; + + /* GuC's FW specific registers used in MMIO send */ + struct { + u32 base; + unsigned int count; + enum forcewake_domains fw_domains; + } send_regs; + + /* register used to send interrupts to the GuC FW */ + i915_reg_t notify_reg; + + /* Store msg (e.g. log flush) that we see while CTBs are disabled */ + u32 mmio_msg; + + /* To serialize the intel_guc_send actions */ + struct mutex send_mutex; +}; + +static inline struct intel_guc *log_to_guc(struct intel_guc_log *log) +{ + return container_of(log, struct intel_guc, log); +} + +static +inline int intel_guc_send(struct intel_guc *guc, const u32 *action, u32 len) +{ + return intel_guc_ct_send(&guc->ct, action, len, NULL, 0); +} + +static inline int +intel_guc_send_and_receive(struct intel_guc *guc, const u32 *action, u32 len, + u32 *response_buf, u32 response_buf_size) +{ + return intel_guc_ct_send(&guc->ct, action, len, + response_buf, response_buf_size); +} + +static inline void intel_guc_to_host_event_handler(struct intel_guc *guc) +{ + intel_guc_ct_event_handler(&guc->ct); +} + +/* GuC addresses above GUC_GGTT_TOP also don't map through the GTT */ +#define GUC_GGTT_TOP 0xFEE00000 + +/** + * intel_guc_ggtt_offset() - Get and validate the GGTT offset of @vma + * @guc: intel_guc structure. + * @vma: i915 graphics virtual memory area. + * + * GuC does not allow any gfx GGTT address that falls into range + * [0, ggtt.pin_bias), which is reserved for Boot ROM, SRAM and WOPCM. + * Currently, in order to exclude [0, ggtt.pin_bias) address space from + * GGTT, all gfx objects used by GuC are allocated with intel_guc_allocate_vma() + * and pinned with PIN_OFFSET_BIAS along with the value of ggtt.pin_bias. + * + * Return: GGTT offset of the @vma. + */ +static inline u32 intel_guc_ggtt_offset(struct intel_guc *guc, + struct i915_vma *vma) +{ + u32 offset = i915_ggtt_offset(vma); + + GEM_BUG_ON(offset < i915_ggtt_pin_bias(vma)); + GEM_BUG_ON(range_overflows_t(u64, offset, vma->size, GUC_GGTT_TOP)); + + return offset; +} + +void intel_guc_init_early(struct intel_guc *guc); +void intel_guc_init_send_regs(struct intel_guc *guc); +void intel_guc_write_params(struct intel_guc *guc); +int intel_guc_init(struct intel_guc *guc); +void intel_guc_fini(struct intel_guc *guc); +void intel_guc_notify(struct intel_guc *guc); +int intel_guc_send_mmio(struct intel_guc *guc, const u32 *action, u32 len, + u32 *response_buf, u32 response_buf_size); +int intel_guc_to_host_process_recv_msg(struct intel_guc *guc, + const u32 *payload, u32 len); +int intel_guc_sample_forcewake(struct intel_guc *guc); +int intel_guc_auth_huc(struct intel_guc *guc, u32 rsa_offset); +int intel_guc_suspend(struct intel_guc *guc); +int intel_guc_resume(struct intel_guc *guc); +struct i915_vma *intel_guc_allocate_vma(struct intel_guc *guc, u32 size); +int intel_guc_allocate_and_map_vma(struct intel_guc *guc, u32 size, + struct i915_vma **out_vma, void **out_vaddr); + +static inline bool intel_guc_is_supported(struct intel_guc *guc) +{ + return intel_uc_fw_is_supported(&guc->fw); +} + +static inline bool intel_guc_is_wanted(struct intel_guc *guc) +{ + return intel_uc_fw_is_enabled(&guc->fw); +} + +static inline bool intel_guc_is_used(struct intel_guc *guc) +{ + GEM_BUG_ON(__intel_uc_fw_status(&guc->fw) == INTEL_UC_FIRMWARE_SELECTED); + return intel_uc_fw_is_available(&guc->fw); +} + +static inline bool intel_guc_is_fw_running(struct intel_guc *guc) +{ + return intel_uc_fw_is_running(&guc->fw); +} + +static inline bool intel_guc_is_ready(struct intel_guc *guc) +{ + return intel_guc_is_fw_running(guc) && intel_guc_ct_enabled(&guc->ct); +} + +static inline int intel_guc_sanitize(struct intel_guc *guc) +{ + intel_uc_fw_sanitize(&guc->fw); + intel_guc_ct_sanitize(&guc->ct); + guc->mmio_msg = 0; + + return 0; +} + +static inline void intel_guc_enable_msg(struct intel_guc *guc, u32 mask) +{ + spin_lock_irq(&guc->irq_lock); + guc->msg_enabled_mask |= mask; + spin_unlock_irq(&guc->irq_lock); +} + +static inline void intel_guc_disable_msg(struct intel_guc *guc, u32 mask) +{ + spin_lock_irq(&guc->irq_lock); + guc->msg_enabled_mask &= ~mask; + spin_unlock_irq(&guc->irq_lock); +} + +int intel_guc_reset_engine(struct intel_guc *guc, + struct intel_engine_cs *engine); + +void intel_guc_load_status(struct intel_guc *guc, struct drm_printer *p); + +#endif diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c new file mode 100644 index 000000000..d44061033 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c @@ -0,0 +1,172 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2014-2019 Intel Corporation + */ + +#include "gt/intel_gt.h" +#include "intel_guc_ads.h" +#include "intel_uc.h" +#include "i915_drv.h" + +/* + * The Additional Data Struct (ADS) has pointers for different buffers used by + * the GuC. One single gem object contains the ADS struct itself (guc_ads), the + * scheduling policies (guc_policies), a structure describing a collection of + * register sets (guc_mmio_reg_state) and some extra pages for the GuC to save + * its internal state for sleep. + */ + +static void guc_policy_init(struct guc_policy *policy) +{ + policy->execution_quantum = POLICY_DEFAULT_EXECUTION_QUANTUM_US; + policy->preemption_time = POLICY_DEFAULT_PREEMPTION_TIME_US; + policy->fault_time = POLICY_DEFAULT_FAULT_TIME_US; + policy->policy_flags = 0; +} + +static void guc_policies_init(struct guc_policies *policies) +{ + struct guc_policy *policy; + u32 p, i; + + policies->dpc_promote_time = POLICY_DEFAULT_DPC_PROMOTE_TIME_US; + policies->max_num_work_items = POLICY_MAX_NUM_WI; + + for (p = 0; p < GUC_CLIENT_PRIORITY_NUM; p++) { + for (i = 0; i < GUC_MAX_ENGINE_CLASSES; i++) { + policy = &policies->policy[p][i]; + + guc_policy_init(policy); + } + } + + policies->is_valid = 1; +} + +static void guc_ct_pool_entries_init(struct guc_ct_pool_entry *pool, u32 num) +{ + memset(pool, 0, num * sizeof(*pool)); +} + +/* + * The first 80 dwords of the register state context, containing the + * execlists and ppgtt registers. + */ +#define LR_HW_CONTEXT_SIZE (80 * sizeof(u32)) + +/* The ads obj includes the struct itself and buffers passed to GuC */ +struct __guc_ads_blob { + struct guc_ads ads; + struct guc_policies policies; + struct guc_mmio_reg_state reg_state; + struct guc_gt_system_info system_info; + struct guc_clients_info clients_info; + struct guc_ct_pool_entry ct_pool[GUC_CT_POOL_SIZE]; + u8 reg_state_buffer[GUC_S3_SAVE_SPACE_PAGES * PAGE_SIZE]; +} __packed; + +static void __guc_ads_init(struct intel_guc *guc) +{ + struct intel_gt *gt = guc_to_gt(guc); + struct __guc_ads_blob *blob = guc->ads_blob; + const u32 skipped_size = LRC_PPHWSP_SZ * PAGE_SIZE + LR_HW_CONTEXT_SIZE; + u32 base; + u8 engine_class; + + /* GuC scheduling policies */ + guc_policies_init(&blob->policies); + + /* + * GuC expects a per-engine-class context image and size + * (minus hwsp and ring context). The context image will be + * used to reinitialize engines after a reset. It must exist + * and be pinned in the GGTT, so that the address won't change after + * we have told GuC where to find it. The context size will be used + * to validate that the LRC base + size fall within allowed GGTT. + */ + for (engine_class = 0; engine_class <= MAX_ENGINE_CLASS; ++engine_class) { + if (engine_class == OTHER_CLASS) + continue; + /* + * TODO: Set context pointer to default state to allow + * GuC to re-init guilty contexts after internal reset. + */ + blob->ads.golden_context_lrca[engine_class] = 0; + blob->ads.eng_state_size[engine_class] = + intel_engine_context_size(guc_to_gt(guc), + engine_class) - + skipped_size; + } + + /* System info */ + blob->system_info.slice_enabled = hweight8(gt->info.sseu.slice_mask); + blob->system_info.rcs_enabled = 1; + blob->system_info.bcs_enabled = 1; + + blob->system_info.vdbox_enable_mask = VDBOX_MASK(gt); + blob->system_info.vebox_enable_mask = VEBOX_MASK(gt); + blob->system_info.vdbox_sfc_support_mask = gt->info.vdbox_sfc_access; + + base = intel_guc_ggtt_offset(guc, guc->ads_vma); + + /* Clients info */ + guc_ct_pool_entries_init(blob->ct_pool, ARRAY_SIZE(blob->ct_pool)); + + blob->clients_info.clients_num = 1; + blob->clients_info.ct_pool_addr = base + ptr_offset(blob, ct_pool); + blob->clients_info.ct_pool_count = ARRAY_SIZE(blob->ct_pool); + + /* ADS */ + blob->ads.scheduler_policies = base + ptr_offset(blob, policies); + blob->ads.reg_state_buffer = base + ptr_offset(blob, reg_state_buffer); + blob->ads.reg_state_addr = base + ptr_offset(blob, reg_state); + blob->ads.gt_system_info = base + ptr_offset(blob, system_info); + blob->ads.clients_info = base + ptr_offset(blob, clients_info); + + i915_gem_object_flush_map(guc->ads_vma->obj); +} + +/** + * intel_guc_ads_create() - allocates and initializes GuC ADS. + * @guc: intel_guc struct + * + * GuC needs memory block (Additional Data Struct), where it will store + * some data. Allocate and initialize such memory block for GuC use. + */ +int intel_guc_ads_create(struct intel_guc *guc) +{ + const u32 size = PAGE_ALIGN(sizeof(struct __guc_ads_blob)); + int ret; + + GEM_BUG_ON(guc->ads_vma); + + ret = intel_guc_allocate_and_map_vma(guc, size, &guc->ads_vma, + (void **)&guc->ads_blob); + + if (ret) + return ret; + + __guc_ads_init(guc); + + return 0; +} + +void intel_guc_ads_destroy(struct intel_guc *guc) +{ + i915_vma_unpin_and_release(&guc->ads_vma, I915_VMA_RELEASE_MAP); +} + +/** + * intel_guc_ads_reset() - prepares GuC Additional Data Struct for reuse + * @guc: intel_guc struct + * + * GuC stores some data in ADS, which might be stale after a reset. + * Reinitialize whole ADS in case any part of it was corrupted during + * previous GuC run. + */ +void intel_guc_ads_reset(struct intel_guc *guc) +{ + if (!guc->ads_vma) + return; + __guc_ads_init(guc); +} diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h new file mode 100644 index 000000000..b00d3ae11 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2014-2019 Intel Corporation + */ + +#ifndef _INTEL_GUC_ADS_H_ +#define _INTEL_GUC_ADS_H_ + +struct intel_guc; + +int intel_guc_ads_create(struct intel_guc *guc); +void intel_guc_ads_destroy(struct intel_guc *guc); +void intel_guc_ads_reset(struct intel_guc *guc); + +#endif diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c new file mode 100644 index 000000000..11742fca0 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c @@ -0,0 +1,861 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2016-2019 Intel Corporation + */ + +#include "i915_drv.h" +#include "intel_guc_ct.h" +#include "gt/intel_gt.h" + +#define CT_ERROR(_ct, _fmt, ...) \ + DRM_DEV_ERROR(ct_to_dev(_ct), "CT: " _fmt, ##__VA_ARGS__) +#ifdef CONFIG_DRM_I915_DEBUG_GUC +#define CT_DEBUG(_ct, _fmt, ...) \ + DRM_DEV_DEBUG_DRIVER(ct_to_dev(_ct), "CT: " _fmt, ##__VA_ARGS__) +#else +#define CT_DEBUG(...) do { } while (0) +#endif + +struct ct_request { + struct list_head link; + u32 fence; + u32 status; + u32 response_len; + u32 *response_buf; +}; + +struct ct_incoming_request { + struct list_head link; + u32 msg[]; +}; + +enum { CTB_SEND = 0, CTB_RECV = 1 }; + +enum { CTB_OWNER_HOST = 0 }; + +static void ct_incoming_request_worker_func(struct work_struct *w); + +/** + * intel_guc_ct_init_early - Initialize CT state without requiring device access + * @ct: pointer to CT struct + */ +void intel_guc_ct_init_early(struct intel_guc_ct *ct) +{ + spin_lock_init(&ct->requests.lock); + INIT_LIST_HEAD(&ct->requests.pending); + INIT_LIST_HEAD(&ct->requests.incoming); + INIT_WORK(&ct->requests.worker, ct_incoming_request_worker_func); +} + +static inline struct intel_guc *ct_to_guc(struct intel_guc_ct *ct) +{ + return container_of(ct, struct intel_guc, ct); +} + +static inline struct intel_gt *ct_to_gt(struct intel_guc_ct *ct) +{ + return guc_to_gt(ct_to_guc(ct)); +} + +static inline struct drm_i915_private *ct_to_i915(struct intel_guc_ct *ct) +{ + return ct_to_gt(ct)->i915; +} + +static inline struct device *ct_to_dev(struct intel_guc_ct *ct) +{ + return ct_to_i915(ct)->drm.dev; +} + +static inline const char *guc_ct_buffer_type_to_str(u32 type) +{ + switch (type) { + case INTEL_GUC_CT_BUFFER_TYPE_SEND: + return "SEND"; + case INTEL_GUC_CT_BUFFER_TYPE_RECV: + return "RECV"; + default: + return "<invalid>"; + } +} + +static void guc_ct_buffer_desc_init(struct guc_ct_buffer_desc *desc, + u32 cmds_addr, u32 size) +{ + memset(desc, 0, sizeof(*desc)); + desc->addr = cmds_addr; + desc->size = size; + desc->owner = CTB_OWNER_HOST; +} + +static void guc_ct_buffer_desc_reset(struct guc_ct_buffer_desc *desc) +{ + desc->head = 0; + desc->tail = 0; + desc->is_in_error = 0; +} + +static int guc_action_register_ct_buffer(struct intel_guc *guc, + u32 desc_addr, + u32 type) +{ + u32 action[] = { + INTEL_GUC_ACTION_REGISTER_COMMAND_TRANSPORT_BUFFER, + desc_addr, + sizeof(struct guc_ct_buffer_desc), + type + }; + + /* Can't use generic send(), CT registration must go over MMIO */ + return intel_guc_send_mmio(guc, action, ARRAY_SIZE(action), NULL, 0); +} + +static int ct_register_buffer(struct intel_guc_ct *ct, u32 desc_addr, u32 type) +{ + int err = guc_action_register_ct_buffer(ct_to_guc(ct), desc_addr, type); + + if (unlikely(err)) + CT_ERROR(ct, "Failed to register %s buffer (err=%d)\n", + guc_ct_buffer_type_to_str(type), err); + return err; +} + +static int guc_action_deregister_ct_buffer(struct intel_guc *guc, u32 type) +{ + u32 action[] = { + INTEL_GUC_ACTION_DEREGISTER_COMMAND_TRANSPORT_BUFFER, + CTB_OWNER_HOST, + type + }; + + /* Can't use generic send(), CT deregistration must go over MMIO */ + return intel_guc_send_mmio(guc, action, ARRAY_SIZE(action), NULL, 0); +} + +static int ct_deregister_buffer(struct intel_guc_ct *ct, u32 type) +{ + int err = guc_action_deregister_ct_buffer(ct_to_guc(ct), type); + + if (unlikely(err)) + CT_ERROR(ct, "Failed to deregister %s buffer (err=%d)\n", + guc_ct_buffer_type_to_str(type), err); + return err; +} + +/** + * intel_guc_ct_init - Init buffer-based communication + * @ct: pointer to CT struct + * + * Allocate memory required for buffer-based communication. + * + * Return: 0 on success, a negative errno code on failure. + */ +int intel_guc_ct_init(struct intel_guc_ct *ct) +{ + struct intel_guc *guc = ct_to_guc(ct); + void *blob; + int err; + int i; + + GEM_BUG_ON(ct->vma); + + /* We allocate 1 page to hold both descriptors and both buffers. + * ___________..................... + * |desc (SEND)| : + * |___________| PAGE/4 + * :___________....................: + * |desc (RECV)| : + * |___________| PAGE/4 + * :_______________________________: + * |cmds (SEND) | + * | PAGE/4 + * |_______________________________| + * |cmds (RECV) | + * | PAGE/4 + * |_______________________________| + * + * Each message can use a maximum of 32 dwords and we don't expect to + * have more than 1 in flight at any time, so we have enough space. + * Some logic further ahead will rely on the fact that there is only 1 + * page and that it is always mapped, so if the size is changed the + * other code will need updating as well. + */ + + err = intel_guc_allocate_and_map_vma(guc, PAGE_SIZE, &ct->vma, &blob); + if (unlikely(err)) { + CT_ERROR(ct, "Failed to allocate CT channel (err=%d)\n", err); + return err; + } + + CT_DEBUG(ct, "vma base=%#x\n", intel_guc_ggtt_offset(guc, ct->vma)); + + /* store pointers to desc and cmds */ + for (i = 0; i < ARRAY_SIZE(ct->ctbs); i++) { + GEM_BUG_ON((i != CTB_SEND) && (i != CTB_RECV)); + ct->ctbs[i].desc = blob + PAGE_SIZE/4 * i; + ct->ctbs[i].cmds = blob + PAGE_SIZE/4 * i + PAGE_SIZE/2; + } + + return 0; +} + +/** + * intel_guc_ct_fini - Fini buffer-based communication + * @ct: pointer to CT struct + * + * Deallocate memory required for buffer-based communication. + */ +void intel_guc_ct_fini(struct intel_guc_ct *ct) +{ + GEM_BUG_ON(ct->enabled); + + i915_vma_unpin_and_release(&ct->vma, I915_VMA_RELEASE_MAP); +} + +/** + * intel_guc_ct_enable - Enable buffer based command transport. + * @ct: pointer to CT struct + * + * Return: 0 on success, a negative errno code on failure. + */ +int intel_guc_ct_enable(struct intel_guc_ct *ct) +{ + struct intel_guc *guc = ct_to_guc(ct); + u32 base, cmds, size; + int err; + int i; + + GEM_BUG_ON(ct->enabled); + + /* vma should be already allocated and map'ed */ + GEM_BUG_ON(!ct->vma); + base = intel_guc_ggtt_offset(guc, ct->vma); + + /* (re)initialize descriptors + * cmds buffers are in the second half of the blob page + */ + for (i = 0; i < ARRAY_SIZE(ct->ctbs); i++) { + GEM_BUG_ON((i != CTB_SEND) && (i != CTB_RECV)); + cmds = base + PAGE_SIZE / 4 * i + PAGE_SIZE / 2; + size = PAGE_SIZE / 4; + CT_DEBUG(ct, "%d: addr=%#x size=%u\n", i, cmds, size); + guc_ct_buffer_desc_init(ct->ctbs[i].desc, cmds, size); + } + + /* + * Register both CT buffers starting with RECV buffer. + * Descriptors are in first half of the blob. + */ + err = ct_register_buffer(ct, base + PAGE_SIZE / 4 * CTB_RECV, + INTEL_GUC_CT_BUFFER_TYPE_RECV); + if (unlikely(err)) + goto err_out; + + err = ct_register_buffer(ct, base + PAGE_SIZE / 4 * CTB_SEND, + INTEL_GUC_CT_BUFFER_TYPE_SEND); + if (unlikely(err)) + goto err_deregister; + + ct->enabled = true; + + return 0; + +err_deregister: + ct_deregister_buffer(ct, INTEL_GUC_CT_BUFFER_TYPE_RECV); +err_out: + CT_ERROR(ct, "Failed to open open CT channel (err=%d)\n", err); + return err; +} + +/** + * intel_guc_ct_disable - Disable buffer based command transport. + * @ct: pointer to CT struct + */ +void intel_guc_ct_disable(struct intel_guc_ct *ct) +{ + struct intel_guc *guc = ct_to_guc(ct); + + GEM_BUG_ON(!ct->enabled); + + ct->enabled = false; + + if (intel_guc_is_fw_running(guc)) { + ct_deregister_buffer(ct, INTEL_GUC_CT_BUFFER_TYPE_SEND); + ct_deregister_buffer(ct, INTEL_GUC_CT_BUFFER_TYPE_RECV); + } +} + +static u32 ct_get_next_fence(struct intel_guc_ct *ct) +{ + /* For now it's trivial */ + return ++ct->requests.last_fence; +} + +/** + * DOC: CTB Host to GuC request + * + * Format of the CTB Host to GuC request message is as follows:: + * + * +------------+---------+---------+---------+---------+ + * | msg[0] | [1] | [2] | ... | [n-1] | + * +------------+---------+---------+---------+---------+ + * | MESSAGE | MESSAGE PAYLOAD | + * + HEADER +---------+---------+---------+---------+ + * | | 0 | 1 | ... | n | + * +============+=========+=========+=========+=========+ + * | len >= 1 | FENCE | request specific data | + * +------+-----+---------+---------+---------+---------+ + * + * ^-----------------len-------------------^ + */ + +static int ct_write(struct intel_guc_ct *ct, + const u32 *action, + u32 len /* in dwords */, + u32 fence, + bool want_response) +{ + struct intel_guc_ct_buffer *ctb = &ct->ctbs[CTB_SEND]; + struct guc_ct_buffer_desc *desc = ctb->desc; + u32 head = desc->head; + u32 tail = desc->tail; + u32 size = desc->size; + u32 used; + u32 header; + u32 *cmds = ctb->cmds; + unsigned int i; + + if (unlikely(desc->is_in_error)) + return -EPIPE; + + if (unlikely(!IS_ALIGNED(head | tail | size, 4) || + (tail | head) >= size)) + goto corrupted; + + /* later calculations will be done in dwords */ + head /= 4; + tail /= 4; + size /= 4; + + /* + * tail == head condition indicates empty. GuC FW does not support + * using up the entire buffer to get tail == head meaning full. + */ + if (tail < head) + used = (size - head) + tail; + else + used = tail - head; + + /* make sure there is a space including extra dw for the fence */ + if (unlikely(used + len + 1 >= size)) + return -ENOSPC; + + /* + * Write the message. The format is the following: + * DW0: header (including action code) + * DW1: fence + * DW2+: action data + */ + header = (len << GUC_CT_MSG_LEN_SHIFT) | + (GUC_CT_MSG_WRITE_FENCE_TO_DESC) | + (want_response ? GUC_CT_MSG_SEND_STATUS : 0) | + (action[0] << GUC_CT_MSG_ACTION_SHIFT); + + CT_DEBUG(ct, "writing %*ph %*ph %*ph\n", + 4, &header, 4, &fence, 4 * (len - 1), &action[1]); + + cmds[tail] = header; + tail = (tail + 1) % size; + + cmds[tail] = fence; + tail = (tail + 1) % size; + + for (i = 1; i < len; i++) { + cmds[tail] = action[i]; + tail = (tail + 1) % size; + } + GEM_BUG_ON(tail > size); + + /* now update desc tail (back in bytes) */ + desc->tail = tail * 4; + return 0; + +corrupted: + CT_ERROR(ct, "Corrupted descriptor addr=%#x head=%u tail=%u size=%u\n", + desc->addr, desc->head, desc->tail, desc->size); + desc->is_in_error = 1; + return -EPIPE; +} + +/** + * wait_for_ctb_desc_update - Wait for the CT buffer descriptor update. + * @desc: buffer descriptor + * @fence: response fence + * @status: placeholder for status + * + * Guc will update CT buffer descriptor with new fence and status + * after processing the command identified by the fence. Wait for + * specified fence and then read from the descriptor status of the + * command. + * + * Return: + * * 0 response received (status is valid) + * * -ETIMEDOUT no response within hardcoded timeout + * * -EPROTO no response, CT buffer is in error + */ +static int wait_for_ctb_desc_update(struct guc_ct_buffer_desc *desc, + u32 fence, + u32 *status) +{ + int err; + + /* + * Fast commands should complete in less than 10us, so sample quickly + * up to that length of time, then switch to a slower sleep-wait loop. + * No GuC command should ever take longer than 10ms. + */ +#define done (READ_ONCE(desc->fence) == fence) + err = wait_for_us(done, 10); + if (err) + err = wait_for(done, 10); +#undef done + + if (unlikely(err)) { + DRM_ERROR("CT: fence %u failed; reported fence=%u\n", + fence, desc->fence); + + if (WARN_ON(desc->is_in_error)) { + /* Something went wrong with the messaging, try to reset + * the buffer and hope for the best + */ + guc_ct_buffer_desc_reset(desc); + err = -EPROTO; + } + } + + *status = desc->status; + return err; +} + +/** + * wait_for_ct_request_update - Wait for CT request state update. + * @req: pointer to pending request + * @status: placeholder for status + * + * For each sent request, Guc shall send bac CT response message. + * Our message handler will update status of tracked request once + * response message with given fence is received. Wait here and + * check for valid response status value. + * + * Return: + * * 0 response received (status is valid) + * * -ETIMEDOUT no response within hardcoded timeout + */ +static int wait_for_ct_request_update(struct ct_request *req, u32 *status) +{ + int err; + + /* + * Fast commands should complete in less than 10us, so sample quickly + * up to that length of time, then switch to a slower sleep-wait loop. + * No GuC command should ever take longer than 10ms. + */ +#define done INTEL_GUC_MSG_IS_RESPONSE(READ_ONCE(req->status)) + err = wait_for_us(done, 10); + if (err) + err = wait_for(done, 10); +#undef done + + if (unlikely(err)) + DRM_ERROR("CT: fence %u err %d\n", req->fence, err); + + *status = req->status; + return err; +} + +static int ct_send(struct intel_guc_ct *ct, + const u32 *action, + u32 len, + u32 *response_buf, + u32 response_buf_size, + u32 *status) +{ + struct intel_guc_ct_buffer *ctb = &ct->ctbs[CTB_SEND]; + struct guc_ct_buffer_desc *desc = ctb->desc; + struct ct_request request; + unsigned long flags; + u32 fence; + int err; + + GEM_BUG_ON(!ct->enabled); + GEM_BUG_ON(!len); + GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK); + GEM_BUG_ON(!response_buf && response_buf_size); + + fence = ct_get_next_fence(ct); + request.fence = fence; + request.status = 0; + request.response_len = response_buf_size; + request.response_buf = response_buf; + + spin_lock_irqsave(&ct->requests.lock, flags); + list_add_tail(&request.link, &ct->requests.pending); + spin_unlock_irqrestore(&ct->requests.lock, flags); + + err = ct_write(ct, action, len, fence, !!response_buf); + if (unlikely(err)) + goto unlink; + + intel_guc_notify(ct_to_guc(ct)); + + if (response_buf) + err = wait_for_ct_request_update(&request, status); + else + err = wait_for_ctb_desc_update(desc, fence, status); + if (unlikely(err)) + goto unlink; + + if (!INTEL_GUC_MSG_IS_RESPONSE_SUCCESS(*status)) { + err = -EIO; + goto unlink; + } + + if (response_buf) { + /* There shall be no data in the status */ + WARN_ON(INTEL_GUC_MSG_TO_DATA(request.status)); + /* Return actual response len */ + err = request.response_len; + } else { + /* There shall be no response payload */ + WARN_ON(request.response_len); + /* Return data decoded from the status dword */ + err = INTEL_GUC_MSG_TO_DATA(*status); + } + +unlink: + spin_lock_irqsave(&ct->requests.lock, flags); + list_del(&request.link); + spin_unlock_irqrestore(&ct->requests.lock, flags); + + return err; +} + +/* + * Command Transport (CT) buffer based GuC send function. + */ +int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len, + u32 *response_buf, u32 response_buf_size) +{ + struct intel_guc *guc = ct_to_guc(ct); + u32 status = ~0; /* undefined */ + int ret; + + if (unlikely(!ct->enabled)) { + WARN(1, "Unexpected send: action=%#x\n", *action); + return -ENODEV; + } + + mutex_lock(&guc->send_mutex); + + ret = ct_send(ct, action, len, response_buf, response_buf_size, &status); + if (unlikely(ret < 0)) { + CT_ERROR(ct, "Sending action %#x failed (err=%d status=%#X)\n", + action[0], ret, status); + } else if (unlikely(ret)) { + CT_DEBUG(ct, "send action %#x returned %d (%#x)\n", + action[0], ret, ret); + } + + mutex_unlock(&guc->send_mutex); + return ret; +} + +static inline unsigned int ct_header_get_len(u32 header) +{ + return (header >> GUC_CT_MSG_LEN_SHIFT) & GUC_CT_MSG_LEN_MASK; +} + +static inline unsigned int ct_header_get_action(u32 header) +{ + return (header >> GUC_CT_MSG_ACTION_SHIFT) & GUC_CT_MSG_ACTION_MASK; +} + +static inline bool ct_header_is_response(u32 header) +{ + return !!(header & GUC_CT_MSG_IS_RESPONSE); +} + +static int ct_read(struct intel_guc_ct *ct, u32 *data) +{ + struct intel_guc_ct_buffer *ctb = &ct->ctbs[CTB_RECV]; + struct guc_ct_buffer_desc *desc = ctb->desc; + u32 head = desc->head; + u32 tail = desc->tail; + u32 size = desc->size; + u32 *cmds = ctb->cmds; + s32 available; + unsigned int len; + unsigned int i; + + if (unlikely(desc->is_in_error)) + return -EPIPE; + + if (unlikely(!IS_ALIGNED(head | tail | size, 4) || + (tail | head) >= size)) + goto corrupted; + + /* later calculations will be done in dwords */ + head /= 4; + tail /= 4; + size /= 4; + + /* tail == head condition indicates empty */ + available = tail - head; + if (unlikely(available == 0)) + return -ENODATA; + + /* beware of buffer wrap case */ + if (unlikely(available < 0)) + available += size; + CT_DEBUG(ct, "available %d (%u:%u)\n", available, head, tail); + GEM_BUG_ON(available < 0); + + data[0] = cmds[head]; + head = (head + 1) % size; + + /* message len with header */ + len = ct_header_get_len(data[0]) + 1; + if (unlikely(len > (u32)available)) { + CT_ERROR(ct, "Incomplete message %*ph %*ph %*ph\n", + 4, data, + 4 * (head + available - 1 > size ? + size - head : available - 1), &cmds[head], + 4 * (head + available - 1 > size ? + available - 1 - size + head : 0), &cmds[0]); + goto corrupted; + } + + for (i = 1; i < len; i++) { + data[i] = cmds[head]; + head = (head + 1) % size; + } + CT_DEBUG(ct, "received %*ph\n", 4 * len, data); + + desc->head = head * 4; + return 0; + +corrupted: + CT_ERROR(ct, "Corrupted descriptor addr=%#x head=%u tail=%u size=%u\n", + desc->addr, desc->head, desc->tail, desc->size); + desc->is_in_error = 1; + return -EPIPE; +} + +/** + * DOC: CTB GuC to Host response + * + * Format of the CTB GuC to Host response message is as follows:: + * + * +------------+---------+---------+---------+---------+---------+ + * | msg[0] | [1] | [2] | [3] | ... | [n-1] | + * +------------+---------+---------+---------+---------+---------+ + * | MESSAGE | MESSAGE PAYLOAD | + * + HEADER +---------+---------+---------+---------+---------+ + * | | 0 | 1 | 2 | ... | n | + * +============+=========+=========+=========+=========+=========+ + * | len >= 2 | FENCE | STATUS | response specific data | + * +------+-----+---------+---------+---------+---------+---------+ + * + * ^-----------------------len-----------------------^ + */ + +static int ct_handle_response(struct intel_guc_ct *ct, const u32 *msg) +{ + u32 header = msg[0]; + u32 len = ct_header_get_len(header); + u32 msgsize = (len + 1) * sizeof(u32); /* msg size in bytes w/header */ + u32 fence; + u32 status; + u32 datalen; + struct ct_request *req; + bool found = false; + + GEM_BUG_ON(!ct_header_is_response(header)); + GEM_BUG_ON(!in_irq()); + + /* Response payload shall at least include fence and status */ + if (unlikely(len < 2)) { + CT_ERROR(ct, "Corrupted response %*ph\n", msgsize, msg); + return -EPROTO; + } + + fence = msg[1]; + status = msg[2]; + datalen = len - 2; + + /* Format of the status follows RESPONSE message */ + if (unlikely(!INTEL_GUC_MSG_IS_RESPONSE(status))) { + CT_ERROR(ct, "Corrupted response %*ph\n", msgsize, msg); + return -EPROTO; + } + + CT_DEBUG(ct, "response fence %u status %#x\n", fence, status); + + spin_lock(&ct->requests.lock); + list_for_each_entry(req, &ct->requests.pending, link) { + if (unlikely(fence != req->fence)) { + CT_DEBUG(ct, "request %u awaits response\n", + req->fence); + continue; + } + if (unlikely(datalen > req->response_len)) { + CT_ERROR(ct, "Response for %u is too long %*ph\n", + req->fence, msgsize, msg); + datalen = 0; + } + if (datalen) + memcpy(req->response_buf, msg + 3, 4 * datalen); + req->response_len = datalen; + WRITE_ONCE(req->status, status); + found = true; + break; + } + spin_unlock(&ct->requests.lock); + + if (!found) + CT_ERROR(ct, "Unsolicited response %*ph\n", msgsize, msg); + return 0; +} + +static void ct_process_request(struct intel_guc_ct *ct, + u32 action, u32 len, const u32 *payload) +{ + struct intel_guc *guc = ct_to_guc(ct); + int ret; + + CT_DEBUG(ct, "request %x %*ph\n", action, 4 * len, payload); + + switch (action) { + case INTEL_GUC_ACTION_DEFAULT: + ret = intel_guc_to_host_process_recv_msg(guc, payload, len); + if (unlikely(ret)) + goto fail_unexpected; + break; + + default: +fail_unexpected: + CT_ERROR(ct, "Unexpected request %x %*ph\n", + action, 4 * len, payload); + break; + } +} + +static bool ct_process_incoming_requests(struct intel_guc_ct *ct) +{ + unsigned long flags; + struct ct_incoming_request *request; + u32 header; + u32 *payload; + bool done; + + spin_lock_irqsave(&ct->requests.lock, flags); + request = list_first_entry_or_null(&ct->requests.incoming, + struct ct_incoming_request, link); + if (request) + list_del(&request->link); + done = !!list_empty(&ct->requests.incoming); + spin_unlock_irqrestore(&ct->requests.lock, flags); + + if (!request) + return true; + + header = request->msg[0]; + payload = &request->msg[1]; + ct_process_request(ct, + ct_header_get_action(header), + ct_header_get_len(header), + payload); + + kfree(request); + return done; +} + +static void ct_incoming_request_worker_func(struct work_struct *w) +{ + struct intel_guc_ct *ct = + container_of(w, struct intel_guc_ct, requests.worker); + bool done; + + done = ct_process_incoming_requests(ct); + if (!done) + queue_work(system_unbound_wq, &ct->requests.worker); +} + +/** + * DOC: CTB GuC to Host request + * + * Format of the CTB GuC to Host request message is as follows:: + * + * +------------+---------+---------+---------+---------+---------+ + * | msg[0] | [1] | [2] | [3] | ... | [n-1] | + * +------------+---------+---------+---------+---------+---------+ + * | MESSAGE | MESSAGE PAYLOAD | + * + HEADER +---------+---------+---------+---------+---------+ + * | | 0 | 1 | 2 | ... | n | + * +============+=========+=========+=========+=========+=========+ + * | len | request specific data | + * +------+-----+---------+---------+---------+---------+---------+ + * + * ^-----------------------len-----------------------^ + */ + +static int ct_handle_request(struct intel_guc_ct *ct, const u32 *msg) +{ + u32 header = msg[0]; + u32 len = ct_header_get_len(header); + u32 msgsize = (len + 1) * sizeof(u32); /* msg size in bytes w/header */ + struct ct_incoming_request *request; + unsigned long flags; + + GEM_BUG_ON(ct_header_is_response(header)); + + request = kmalloc(sizeof(*request) + msgsize, GFP_ATOMIC); + if (unlikely(!request)) { + CT_ERROR(ct, "Dropping request %*ph\n", msgsize, msg); + return 0; /* XXX: -ENOMEM ? */ + } + memcpy(request->msg, msg, msgsize); + + spin_lock_irqsave(&ct->requests.lock, flags); + list_add_tail(&request->link, &ct->requests.incoming); + spin_unlock_irqrestore(&ct->requests.lock, flags); + + queue_work(system_unbound_wq, &ct->requests.worker); + return 0; +} + +/* + * When we're communicating with the GuC over CT, GuC uses events + * to notify us about new messages being posted on the RECV buffer. + */ +void intel_guc_ct_event_handler(struct intel_guc_ct *ct) +{ + u32 msg[GUC_CT_MSG_LEN_MASK + 1]; /* one extra dw for the header */ + int err = 0; + + if (unlikely(!ct->enabled)) { + WARN(1, "Unexpected GuC event received while CT disabled!\n"); + return; + } + + do { + err = ct_read(ct, msg); + if (err) + break; + + if (ct_header_is_response(msg[0])) + err = ct_handle_response(ct, msg); + else + err = ct_handle_request(ct, msg); + } while (!err); +} diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h new file mode 100644 index 000000000..494a51a52 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2016-2019 Intel Corporation + */ + +#ifndef _INTEL_GUC_CT_H_ +#define _INTEL_GUC_CT_H_ + +#include <linux/spinlock.h> +#include <linux/workqueue.h> + +#include "intel_guc_fwif.h" + +struct i915_vma; +struct intel_guc; + +/** + * DOC: Command Transport (CT). + * + * Buffer based command transport is a replacement for MMIO based mechanism. + * It can be used to perform both host-2-guc and guc-to-host communication. + */ + +/** Represents single command transport buffer. + * + * A single command transport buffer consists of two parts, the header + * record (command transport buffer descriptor) and the actual buffer which + * holds the commands. + * + * @desc: pointer to the buffer descriptor + * @cmds: pointer to the commands buffer + */ +struct intel_guc_ct_buffer { + struct guc_ct_buffer_desc *desc; + u32 *cmds; +}; + + +/** Top-level structure for Command Transport related data + * + * Includes a pair of CT buffers for bi-directional communication and tracking + * for the H2G and G2H requests sent and received through the buffers. + */ +struct intel_guc_ct { + struct i915_vma *vma; + bool enabled; + + /* buffers for sending(0) and receiving(1) commands */ + struct intel_guc_ct_buffer ctbs[2]; + + struct { + u32 last_fence; /* last fence used to send request */ + + spinlock_t lock; /* protects pending requests list */ + struct list_head pending; /* requests waiting for response */ + + struct list_head incoming; /* incoming requests */ + struct work_struct worker; /* handler for incoming requests */ + } requests; +}; + +void intel_guc_ct_init_early(struct intel_guc_ct *ct); +int intel_guc_ct_init(struct intel_guc_ct *ct); +void intel_guc_ct_fini(struct intel_guc_ct *ct); +int intel_guc_ct_enable(struct intel_guc_ct *ct); +void intel_guc_ct_disable(struct intel_guc_ct *ct); + +static inline void intel_guc_ct_sanitize(struct intel_guc_ct *ct) +{ + ct->enabled = false; +} + +static inline bool intel_guc_ct_enabled(struct intel_guc_ct *ct) +{ + return ct->enabled; +} + +int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len, + u32 *response_buf, u32 response_buf_size); +void intel_guc_ct_event_handler(struct intel_guc_ct *ct); + +#endif /* _INTEL_GUC_CT_H_ */ diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_debugfs.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_debugfs.c new file mode 100644 index 000000000..fe7cb7b29 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_debugfs.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2020 Intel Corporation + */ + +#include <drm/drm_print.h> + +#include "gt/debugfs_gt.h" +#include "intel_guc.h" +#include "intel_guc_debugfs.h" +#include "intel_guc_log_debugfs.h" + +static int guc_info_show(struct seq_file *m, void *data) +{ + struct intel_guc *guc = m->private; + struct drm_printer p = drm_seq_file_printer(m); + + if (!intel_guc_is_supported(guc)) + return -ENODEV; + + intel_guc_load_status(guc, &p); + drm_puts(&p, "\n"); + intel_guc_log_info(&guc->log, &p); + + /* Add more as required ... */ + + return 0; +} +DEFINE_GT_DEBUGFS_ATTRIBUTE(guc_info); + +void intel_guc_debugfs_register(struct intel_guc *guc, struct dentry *root) +{ + static const struct debugfs_gt_file files[] = { + { "guc_info", &guc_info_fops, NULL }, + }; + + if (!intel_guc_is_supported(guc)) + return; + + intel_gt_debugfs_register_files(root, files, ARRAY_SIZE(files), guc); + intel_guc_log_debugfs_register(&guc->log, root); +} diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_debugfs.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_debugfs.h new file mode 100644 index 000000000..424c26665 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_debugfs.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2020 Intel Corporation + */ + +#ifndef DEBUGFS_GUC_H +#define DEBUGFS_GUC_H + +struct intel_guc; +struct dentry; + +void intel_guc_debugfs_register(struct intel_guc *guc, struct dentry *root); + +#endif /* DEBUGFS_GUC_H */ diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c new file mode 100644 index 000000000..d4a87f4c9 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2014-2019 Intel Corporation + * + * Authors: + * Vinit Azad <vinit.azad@intel.com> + * Ben Widawsky <ben@bwidawsk.net> + * Dave Gordon <david.s.gordon@intel.com> + * Alex Dai <yu.dai@intel.com> + */ + +#include "gt/intel_gt.h" +#include "intel_guc_fw.h" +#include "i915_drv.h" + +static void guc_prepare_xfer(struct intel_uncore *uncore) +{ + u32 shim_flags = GUC_DISABLE_SRAM_INIT_TO_ZEROES | + GUC_ENABLE_READ_CACHE_LOGIC | + GUC_ENABLE_MIA_CACHING | + GUC_ENABLE_READ_CACHE_FOR_SRAM_DATA | + GUC_ENABLE_READ_CACHE_FOR_WOPCM_DATA | + GUC_ENABLE_MIA_CLOCK_GATING; + + /* Must program this register before loading the ucode with DMA */ + intel_uncore_write(uncore, GUC_SHIM_CONTROL, shim_flags); + + if (IS_GEN9_LP(uncore->i915)) + intel_uncore_write(uncore, GEN9LP_GT_PM_CONFIG, GT_DOORBELL_ENABLE); + else + intel_uncore_write(uncore, GEN9_GT_PM_CONFIG, GT_DOORBELL_ENABLE); + + if (IS_GEN(uncore->i915, 9)) { + /* DOP Clock Gating Enable for GuC clocks */ + intel_uncore_rmw(uncore, GEN7_MISCCPCTL, + 0, GEN8_DOP_CLOCK_GATE_GUC_ENABLE); + + /* allows for 5us (in 10ns units) before GT can go to RC6 */ + intel_uncore_write(uncore, GUC_ARAT_C6DIS, 0x1FF); + } +} + +/* Copy RSA signature from the fw image to HW for verification */ +static void guc_xfer_rsa(struct intel_uc_fw *guc_fw, + struct intel_uncore *uncore) +{ + u32 rsa[UOS_RSA_SCRATCH_COUNT]; + size_t copied; + int i; + + copied = intel_uc_fw_copy_rsa(guc_fw, rsa, sizeof(rsa)); + GEM_BUG_ON(copied < sizeof(rsa)); + + for (i = 0; i < UOS_RSA_SCRATCH_COUNT; i++) + intel_uncore_write(uncore, UOS_RSA_SCRATCH(i), rsa[i]); +} + +/* + * Read the GuC status register (GUC_STATUS) and store it in the + * specified location; then return a boolean indicating whether + * the value matches either of two values representing completion + * of the GuC boot process. + * + * This is used for polling the GuC status in a wait_for() + * loop below. + */ +static inline bool guc_ready(struct intel_uncore *uncore, u32 *status) +{ + u32 val = intel_uncore_read(uncore, GUC_STATUS); + u32 uk_val = val & GS_UKERNEL_MASK; + + *status = val; + return (uk_val == GS_UKERNEL_READY) || + ((val & GS_MIA_CORE_STATE) && (uk_val == GS_UKERNEL_LAPIC_DONE)); +} + +static int guc_wait_ucode(struct intel_uncore *uncore) +{ + u32 status; + int ret; + + /* + * Wait for the GuC to start up. + * NB: Docs recommend not using the interrupt for completion. + * Measurements indicate this should take no more than 20ms, so a + * timeout here indicates that the GuC has failed and is unusable. + * (Higher levels of the driver may decide to reset the GuC and + * attempt the ucode load again if this happens.) + */ + ret = wait_for(guc_ready(uncore, &status), 100); + DRM_DEBUG_DRIVER("GuC status %#x\n", status); + + if ((status & GS_BOOTROM_MASK) == GS_BOOTROM_RSA_FAILED) { + DRM_ERROR("GuC firmware signature verification failed\n"); + ret = -ENOEXEC; + } + + if ((status & GS_UKERNEL_MASK) == GS_UKERNEL_EXCEPTION) { + DRM_ERROR("GuC firmware exception. EIP: %#x\n", + intel_uncore_read(uncore, SOFT_SCRATCH(13))); + ret = -ENXIO; + } + + return ret; +} + +/** + * intel_guc_fw_upload() - load GuC uCode to device + * @guc: intel_guc structure + * + * Called from intel_uc_init_hw() during driver load, resume from sleep and + * after a GPU reset. + * + * The firmware image should have already been fetched into memory, so only + * check that fetch succeeded, and then transfer the image to the h/w. + * + * Return: non-zero code on error + */ +int intel_guc_fw_upload(struct intel_guc *guc) +{ + struct intel_gt *gt = guc_to_gt(guc); + struct intel_uncore *uncore = gt->uncore; + int ret; + + guc_prepare_xfer(uncore); + + /* + * Note that GuC needs the CSS header plus uKernel code to be copied + * by the DMA engine in one operation, whereas the RSA signature is + * loaded via MMIO. + */ + guc_xfer_rsa(&guc->fw, uncore); + + /* + * Current uCode expects the code to be loaded at 8k; locations below + * this are used for the stack. + */ + ret = intel_uc_fw_upload(&guc->fw, 0x2000, UOS_MOVE); + if (ret) + goto out; + + ret = guc_wait_ucode(uncore); + if (ret) + goto out; + + intel_uc_fw_change_status(&guc->fw, INTEL_UC_FIRMWARE_RUNNING); + return 0; + +out: + intel_uc_fw_change_status(&guc->fw, INTEL_UC_FIRMWARE_FAIL); + return ret; +} diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fw.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fw.h new file mode 100644 index 000000000..0b4d2a9c9 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fw.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2017-2019 Intel Corporation + */ + +#ifndef _INTEL_GUC_FW_H_ +#define _INTEL_GUC_FW_H_ + +struct intel_guc; + +int intel_guc_fw_upload(struct intel_guc *guc); + +#endif diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h new file mode 100644 index 000000000..a6b733c14 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h @@ -0,0 +1,603 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2014-2019 Intel Corporation + */ + +#ifndef _INTEL_GUC_FWIF_H +#define _INTEL_GUC_FWIF_H + +#include <linux/bits.h> +#include <linux/compiler.h> +#include <linux/types.h> + +#define GUC_CLIENT_PRIORITY_KMD_HIGH 0 +#define GUC_CLIENT_PRIORITY_HIGH 1 +#define GUC_CLIENT_PRIORITY_KMD_NORMAL 2 +#define GUC_CLIENT_PRIORITY_NORMAL 3 +#define GUC_CLIENT_PRIORITY_NUM 4 + +#define GUC_MAX_STAGE_DESCRIPTORS 1024 +#define GUC_INVALID_STAGE_ID GUC_MAX_STAGE_DESCRIPTORS + +#define GUC_RENDER_ENGINE 0 +#define GUC_VIDEO_ENGINE 1 +#define GUC_BLITTER_ENGINE 2 +#define GUC_VIDEOENHANCE_ENGINE 3 +#define GUC_VIDEO_ENGINE2 4 +#define GUC_MAX_ENGINES_NUM (GUC_VIDEO_ENGINE2 + 1) + +#define GUC_MAX_ENGINE_CLASSES 5 +#define GUC_MAX_INSTANCES_PER_CLASS 16 + +#define GUC_DOORBELL_INVALID 256 + +#define GUC_WQ_SIZE (PAGE_SIZE * 2) + +/* Work queue item header definitions */ +#define WQ_STATUS_ACTIVE 1 +#define WQ_STATUS_SUSPENDED 2 +#define WQ_STATUS_CMD_ERROR 3 +#define WQ_STATUS_ENGINE_ID_NOT_USED 4 +#define WQ_STATUS_SUSPENDED_FROM_RESET 5 +#define WQ_TYPE_SHIFT 0 +#define WQ_TYPE_BATCH_BUF (0x1 << WQ_TYPE_SHIFT) +#define WQ_TYPE_PSEUDO (0x2 << WQ_TYPE_SHIFT) +#define WQ_TYPE_INORDER (0x3 << WQ_TYPE_SHIFT) +#define WQ_TYPE_NOOP (0x4 << WQ_TYPE_SHIFT) +#define WQ_TARGET_SHIFT 10 +#define WQ_LEN_SHIFT 16 +#define WQ_NO_WCFLUSH_WAIT (1 << 27) +#define WQ_PRESENT_WORKLOAD (1 << 28) + +#define WQ_RING_TAIL_SHIFT 20 +#define WQ_RING_TAIL_MAX 0x7FF /* 2^11 QWords */ +#define WQ_RING_TAIL_MASK (WQ_RING_TAIL_MAX << WQ_RING_TAIL_SHIFT) + +#define GUC_STAGE_DESC_ATTR_ACTIVE BIT(0) +#define GUC_STAGE_DESC_ATTR_PENDING_DB BIT(1) +#define GUC_STAGE_DESC_ATTR_KERNEL BIT(2) +#define GUC_STAGE_DESC_ATTR_PREEMPT BIT(3) +#define GUC_STAGE_DESC_ATTR_RESET BIT(4) +#define GUC_STAGE_DESC_ATTR_WQLOCKED BIT(5) +#define GUC_STAGE_DESC_ATTR_PCH BIT(6) +#define GUC_STAGE_DESC_ATTR_TERMINATED BIT(7) + +/* New GuC control data */ +#define GUC_CTL_CTXINFO 0 +#define GUC_CTL_CTXNUM_IN16_SHIFT 0 +#define GUC_CTL_BASE_ADDR_SHIFT 12 + +#define GUC_CTL_LOG_PARAMS 1 +#define GUC_LOG_VALID (1 << 0) +#define GUC_LOG_NOTIFY_ON_HALF_FULL (1 << 1) +#define GUC_LOG_ALLOC_IN_MEGABYTE (1 << 3) +#define GUC_LOG_CRASH_SHIFT 4 +#define GUC_LOG_CRASH_MASK (0x3 << GUC_LOG_CRASH_SHIFT) +#define GUC_LOG_DPC_SHIFT 6 +#define GUC_LOG_DPC_MASK (0x7 << GUC_LOG_DPC_SHIFT) +#define GUC_LOG_ISR_SHIFT 9 +#define GUC_LOG_ISR_MASK (0x7 << GUC_LOG_ISR_SHIFT) +#define GUC_LOG_BUF_ADDR_SHIFT 12 + +#define GUC_CTL_WA 2 +#define GUC_CTL_FEATURE 3 +#define GUC_CTL_DISABLE_SCHEDULER (1 << 14) + +#define GUC_CTL_DEBUG 4 +#define GUC_LOG_VERBOSITY_SHIFT 0 +#define GUC_LOG_VERBOSITY_LOW (0 << GUC_LOG_VERBOSITY_SHIFT) +#define GUC_LOG_VERBOSITY_MED (1 << GUC_LOG_VERBOSITY_SHIFT) +#define GUC_LOG_VERBOSITY_HIGH (2 << GUC_LOG_VERBOSITY_SHIFT) +#define GUC_LOG_VERBOSITY_ULTRA (3 << GUC_LOG_VERBOSITY_SHIFT) +/* Verbosity range-check limits, without the shift */ +#define GUC_LOG_VERBOSITY_MIN 0 +#define GUC_LOG_VERBOSITY_MAX 3 +#define GUC_LOG_VERBOSITY_MASK 0x0000000f +#define GUC_LOG_DESTINATION_MASK (3 << 4) +#define GUC_LOG_DISABLED (1 << 6) +#define GUC_PROFILE_ENABLED (1 << 7) + +#define GUC_CTL_ADS 5 +#define GUC_ADS_ADDR_SHIFT 1 +#define GUC_ADS_ADDR_MASK (0xFFFFF << GUC_ADS_ADDR_SHIFT) + +#define GUC_CTL_MAX_DWORDS (SOFT_SCRATCH_COUNT - 2) /* [1..14] */ + +/* Work item for submitting workloads into work queue of GuC. */ +struct guc_wq_item { + u32 header; + u32 context_desc; + u32 submit_element_info; + u32 fence_id; +} __packed; + +struct guc_process_desc { + u32 stage_id; + u64 db_base_addr; + u32 head; + u32 tail; + u32 error_offset; + u64 wq_base_addr; + u32 wq_size_bytes; + u32 wq_status; + u32 engine_presence; + u32 priority; + u32 reserved[30]; +} __packed; + +/* engine id and context id is packed into guc_execlist_context.context_id*/ +#define GUC_ELC_CTXID_OFFSET 0 +#define GUC_ELC_ENGINE_OFFSET 29 + +/* The execlist context including software and HW information */ +struct guc_execlist_context { + u32 context_desc; + u32 context_id; + u32 ring_status; + u32 ring_lrca; + u32 ring_begin; + u32 ring_end; + u32 ring_next_free_location; + u32 ring_current_tail_pointer_value; + u8 engine_state_submit_value; + u8 engine_state_wait_value; + u16 pagefault_count; + u16 engine_submit_queue_count; +} __packed; + +/* + * This structure describes a stage set arranged for a particular communication + * between uKernel (GuC) and Driver (KMD). Technically, this is known as a + * "GuC Context descriptor" in the specs, but we use the term "stage descriptor" + * to avoid confusion with all the other things already named "context" in the + * driver. A static pool of these descriptors are stored inside a GEM object + * (stage_desc_pool) which is held for the entire lifetime of our interaction + * with the GuC, being allocated before the GuC is loaded with its firmware. + */ +struct guc_stage_desc { + u32 sched_common_area; + u32 stage_id; + u32 pas_id; + u8 engines_used; + u64 db_trigger_cpu; + u32 db_trigger_uk; + u64 db_trigger_phy; + u16 db_id; + + struct guc_execlist_context lrc[GUC_MAX_ENGINES_NUM]; + + u8 attribute; + + u32 priority; + + u32 wq_sampled_tail_offset; + u32 wq_total_submit_enqueues; + + u32 process_desc; + u32 wq_addr; + u32 wq_size; + + u32 engine_presence; + + u8 engine_suspended; + + u8 reserved0[3]; + u64 reserved1[1]; + + u64 desc_private; +} __packed; + +/** + * DOC: CTB based communication + * + * The CTB (command transport buffer) communication between Host and GuC + * is based on u32 data stream written to the shared buffer. One buffer can + * be used to transmit data only in one direction (one-directional channel). + * + * Current status of the each buffer is stored in the buffer descriptor. + * Buffer descriptor holds tail and head fields that represents active data + * stream. The tail field is updated by the data producer (sender), and head + * field is updated by the data consumer (receiver):: + * + * +------------+ + * | DESCRIPTOR | +=================+============+========+ + * +============+ | | MESSAGE(s) | | + * | address |--------->+=================+============+========+ + * +------------+ + * | head | ^-----head--------^ + * +------------+ + * | tail | ^---------tail-----------------^ + * +------------+ + * | size | ^---------------size--------------------^ + * +------------+ + * + * Each message in data stream starts with the single u32 treated as a header, + * followed by optional set of u32 data that makes message specific payload:: + * + * +------------+---------+---------+---------+ + * | MESSAGE | + * +------------+---------+---------+---------+ + * | msg[0] | [1] | ... | [n-1] | + * +------------+---------+---------+---------+ + * | MESSAGE | MESSAGE PAYLOAD | + * + HEADER +---------+---------+---------+ + * | | 0 | ... | n | + * +======+=====+=========+=========+=========+ + * | 31:16| code| | | | + * +------+-----+ | | | + * | 15:5|flags| | | | + * +------+-----+ | | | + * | 4:0| len| | | | + * +------+-----+---------+---------+---------+ + * + * ^-------------len-------------^ + * + * The message header consists of: + * + * - **len**, indicates length of the message payload (in u32) + * - **code**, indicates message code + * - **flags**, holds various bits to control message handling + */ + +/* + * Describes single command transport buffer. + * Used by both guc-master and clients. + */ +struct guc_ct_buffer_desc { + u32 addr; /* gfx address */ + u64 host_private; /* host private data */ + u32 size; /* size in bytes */ + u32 head; /* offset updated by GuC*/ + u32 tail; /* offset updated by owner */ + u32 is_in_error; /* error indicator */ + u32 fence; /* fence updated by GuC */ + u32 status; /* status updated by GuC */ + u32 owner; /* id of the channel owner */ + u32 owner_sub_id; /* owner-defined field for extra tracking */ + u32 reserved[5]; +} __packed; + +/* Type of command transport buffer */ +#define INTEL_GUC_CT_BUFFER_TYPE_SEND 0x0u +#define INTEL_GUC_CT_BUFFER_TYPE_RECV 0x1u + +/* + * Definition of the command transport message header (DW0) + * + * bit[4..0] message len (in dwords) + * bit[7..5] reserved + * bit[8] response (G2H only) + * bit[8] write fence to desc (H2G only) + * bit[9] write status to H2G buff (H2G only) + * bit[10] send status back via G2H (H2G only) + * bit[15..11] reserved + * bit[31..16] action code + */ +#define GUC_CT_MSG_LEN_SHIFT 0 +#define GUC_CT_MSG_LEN_MASK 0x1F +#define GUC_CT_MSG_IS_RESPONSE (1 << 8) +#define GUC_CT_MSG_WRITE_FENCE_TO_DESC (1 << 8) +#define GUC_CT_MSG_WRITE_STATUS_TO_BUFF (1 << 9) +#define GUC_CT_MSG_SEND_STATUS (1 << 10) +#define GUC_CT_MSG_ACTION_SHIFT 16 +#define GUC_CT_MSG_ACTION_MASK 0xFFFF + +#define GUC_FORCEWAKE_RENDER (1 << 0) +#define GUC_FORCEWAKE_MEDIA (1 << 1) + +#define GUC_POWER_UNSPECIFIED 0 +#define GUC_POWER_D0 1 +#define GUC_POWER_D1 2 +#define GUC_POWER_D2 3 +#define GUC_POWER_D3 4 + +/* Scheduling policy settings */ + +/* Reset engine upon preempt failure */ +#define POLICY_RESET_ENGINE (1<<0) +/* Preempt to idle on quantum expiry */ +#define POLICY_PREEMPT_TO_IDLE (1<<1) + +#define POLICY_MAX_NUM_WI 15 +#define POLICY_DEFAULT_DPC_PROMOTE_TIME_US 500000 +#define POLICY_DEFAULT_EXECUTION_QUANTUM_US 1000000 +#define POLICY_DEFAULT_PREEMPTION_TIME_US 500000 +#define POLICY_DEFAULT_FAULT_TIME_US 250000 + +struct guc_policy { + /* Time for one workload to execute. (in micro seconds) */ + u32 execution_quantum; + /* Time to wait for a preemption request to completed before issuing a + * reset. (in micro seconds). */ + u32 preemption_time; + /* How much time to allow to run after the first fault is observed. + * Then preempt afterwards. (in micro seconds) */ + u32 fault_time; + u32 policy_flags; + u32 reserved[8]; +} __packed; + +struct guc_policies { + struct guc_policy policy[GUC_CLIENT_PRIORITY_NUM][GUC_MAX_ENGINE_CLASSES]; + u32 submission_queue_depth[GUC_MAX_ENGINE_CLASSES]; + /* In micro seconds. How much time to allow before DPC processing is + * called back via interrupt (to prevent DPC queue drain starving). + * Typically 1000s of micro seconds (example only, not granularity). */ + u32 dpc_promote_time; + + /* Must be set to take these new values. */ + u32 is_valid; + + /* Max number of WIs to process per call. A large value may keep CS + * idle. */ + u32 max_num_work_items; + + u32 reserved[4]; +} __packed; + +/* GuC MMIO reg state struct */ + + +#define GUC_REGSET_MAX_REGISTERS 64 +#define GUC_S3_SAVE_SPACE_PAGES 10 + +struct guc_mmio_reg { + u32 offset; + u32 value; + u32 flags; +#define GUC_REGSET_MASKED (1 << 0) +} __packed; + +struct guc_mmio_regset { + struct guc_mmio_reg registers[GUC_REGSET_MAX_REGISTERS]; + u32 values_valid; + u32 number_of_registers; +} __packed; + +/* GuC register sets */ +struct guc_mmio_reg_state { + struct guc_mmio_regset engine_reg[GUC_MAX_ENGINE_CLASSES][GUC_MAX_INSTANCES_PER_CLASS]; + u32 reserved[98]; +} __packed; + +/* HW info */ +struct guc_gt_system_info { + u32 slice_enabled; + u32 rcs_enabled; + u32 reserved0; + u32 bcs_enabled; + u32 vdbox_enable_mask; + u32 vdbox_sfc_support_mask; + u32 vebox_enable_mask; + u32 reserved[9]; +} __packed; + +/* Clients info */ +struct guc_ct_pool_entry { + struct guc_ct_buffer_desc desc; + u32 reserved[7]; +} __packed; + +#define GUC_CT_POOL_SIZE 2 + +struct guc_clients_info { + u32 clients_num; + u32 reserved0[13]; + u32 ct_pool_addr; + u32 ct_pool_count; + u32 reserved[4]; +} __packed; + +/* GuC Additional Data Struct */ +struct guc_ads { + u32 reg_state_addr; + u32 reg_state_buffer; + u32 scheduler_policies; + u32 gt_system_info; + u32 clients_info; + u32 control_data; + u32 golden_context_lrca[GUC_MAX_ENGINE_CLASSES]; + u32 eng_state_size[GUC_MAX_ENGINE_CLASSES]; + u32 reserved[16]; +} __packed; + +/* GuC logging structures */ + +enum guc_log_buffer_type { + GUC_ISR_LOG_BUFFER, + GUC_DPC_LOG_BUFFER, + GUC_CRASH_DUMP_LOG_BUFFER, + GUC_MAX_LOG_BUFFER +}; + +/** + * struct guc_log_buffer_state - GuC log buffer state + * + * Below state structure is used for coordination of retrieval of GuC firmware + * logs. Separate state is maintained for each log buffer type. + * read_ptr points to the location where i915 read last in log buffer and + * is read only for GuC firmware. write_ptr is incremented by GuC with number + * of bytes written for each log entry and is read only for i915. + * When any type of log buffer becomes half full, GuC sends a flush interrupt. + * GuC firmware expects that while it is writing to 2nd half of the buffer, + * first half would get consumed by Host and then get a flush completed + * acknowledgment from Host, so that it does not end up doing any overwrite + * causing loss of logs. So when buffer gets half filled & i915 has requested + * for interrupt, GuC will set flush_to_file field, set the sampled_write_ptr + * to the value of write_ptr and raise the interrupt. + * On receiving the interrupt i915 should read the buffer, clear flush_to_file + * field and also update read_ptr with the value of sample_write_ptr, before + * sending an acknowledgment to GuC. marker & version fields are for internal + * usage of GuC and opaque to i915. buffer_full_cnt field is incremented every + * time GuC detects the log buffer overflow. + */ +struct guc_log_buffer_state { + u32 marker[2]; + u32 read_ptr; + u32 write_ptr; + u32 size; + u32 sampled_write_ptr; + union { + struct { + u32 flush_to_file:1; + u32 buffer_full_cnt:4; + u32 reserved:27; + }; + u32 flags; + }; + u32 version; +} __packed; + +struct guc_ctx_report { + u32 report_return_status; + u32 reserved1[64]; + u32 affected_count; + u32 reserved2[2]; +} __packed; + +/* GuC Shared Context Data Struct */ +struct guc_shared_ctx_data { + u32 addr_of_last_preempted_data_low; + u32 addr_of_last_preempted_data_high; + u32 addr_of_last_preempted_data_high_tmp; + u32 padding; + u32 is_mapped_to_proxy; + u32 proxy_ctx_id; + u32 engine_reset_ctx_id; + u32 media_reset_count; + u32 reserved1[8]; + u32 uk_last_ctx_switch_reason; + u32 was_reset; + u32 lrca_gpu_addr; + u64 execlist_ctx; + u32 reserved2[66]; + struct guc_ctx_report preempt_ctx_report[GUC_MAX_ENGINES_NUM]; +} __packed; + +/** + * DOC: MMIO based communication + * + * The MMIO based communication between Host and GuC uses software scratch + * registers, where first register holds data treated as message header, + * and other registers are used to hold message payload. + * + * For Gen9+, GuC uses software scratch registers 0xC180-0xC1B8, + * but no H2G command takes more than 8 parameters and the GuC FW + * itself uses an 8-element array to store the H2G message. + * + * +-----------+---------+---------+---------+ + * | MMIO[0] | MMIO[1] | ... | MMIO[n] | + * +-----------+---------+---------+---------+ + * | header | optional payload | + * +======+====+=========+=========+=========+ + * | 31:28|type| | | | + * +------+----+ | | | + * | 27:16|data| | | | + * +------+----+ | | | + * | 15:0|code| | | | + * +------+----+---------+---------+---------+ + * + * The message header consists of: + * + * - **type**, indicates message type + * - **code**, indicates message code, is specific for **type** + * - **data**, indicates message data, optional, depends on **code** + * + * The following message **types** are supported: + * + * - **REQUEST**, indicates Host-to-GuC request, requested GuC action code + * must be priovided in **code** field. Optional action specific parameters + * can be provided in remaining payload registers or **data** field. + * + * - **RESPONSE**, indicates GuC-to-Host response from earlier GuC request, + * action response status will be provided in **code** field. Optional + * response data can be returned in remaining payload registers or **data** + * field. + */ + +#define GUC_MAX_MMIO_MSG_LEN 8 + +#define INTEL_GUC_MSG_TYPE_SHIFT 28 +#define INTEL_GUC_MSG_TYPE_MASK (0xF << INTEL_GUC_MSG_TYPE_SHIFT) +#define INTEL_GUC_MSG_DATA_SHIFT 16 +#define INTEL_GUC_MSG_DATA_MASK (0xFFF << INTEL_GUC_MSG_DATA_SHIFT) +#define INTEL_GUC_MSG_CODE_SHIFT 0 +#define INTEL_GUC_MSG_CODE_MASK (0xFFFF << INTEL_GUC_MSG_CODE_SHIFT) + +#define __INTEL_GUC_MSG_GET(T, m) \ + (((m) & INTEL_GUC_MSG_ ## T ## _MASK) >> INTEL_GUC_MSG_ ## T ## _SHIFT) +#define INTEL_GUC_MSG_TO_TYPE(m) __INTEL_GUC_MSG_GET(TYPE, m) +#define INTEL_GUC_MSG_TO_DATA(m) __INTEL_GUC_MSG_GET(DATA, m) +#define INTEL_GUC_MSG_TO_CODE(m) __INTEL_GUC_MSG_GET(CODE, m) + +enum intel_guc_msg_type { + INTEL_GUC_MSG_TYPE_REQUEST = 0x0, + INTEL_GUC_MSG_TYPE_RESPONSE = 0xF, +}; + +#define __INTEL_GUC_MSG_TYPE_IS(T, m) \ + (INTEL_GUC_MSG_TO_TYPE(m) == INTEL_GUC_MSG_TYPE_ ## T) +#define INTEL_GUC_MSG_IS_REQUEST(m) __INTEL_GUC_MSG_TYPE_IS(REQUEST, m) +#define INTEL_GUC_MSG_IS_RESPONSE(m) __INTEL_GUC_MSG_TYPE_IS(RESPONSE, m) + +enum intel_guc_action { + INTEL_GUC_ACTION_DEFAULT = 0x0, + INTEL_GUC_ACTION_REQUEST_PREEMPTION = 0x2, + INTEL_GUC_ACTION_REQUEST_ENGINE_RESET = 0x3, + INTEL_GUC_ACTION_ALLOCATE_DOORBELL = 0x10, + INTEL_GUC_ACTION_DEALLOCATE_DOORBELL = 0x20, + INTEL_GUC_ACTION_LOG_BUFFER_FILE_FLUSH_COMPLETE = 0x30, + INTEL_GUC_ACTION_UK_LOG_ENABLE_LOGGING = 0x40, + INTEL_GUC_ACTION_FORCE_LOG_BUFFER_FLUSH = 0x302, + INTEL_GUC_ACTION_ENTER_S_STATE = 0x501, + INTEL_GUC_ACTION_EXIT_S_STATE = 0x502, + INTEL_GUC_ACTION_SLPC_REQUEST = 0x3003, + INTEL_GUC_ACTION_SAMPLE_FORCEWAKE = 0x3005, + INTEL_GUC_ACTION_AUTHENTICATE_HUC = 0x4000, + INTEL_GUC_ACTION_REGISTER_COMMAND_TRANSPORT_BUFFER = 0x4505, + INTEL_GUC_ACTION_DEREGISTER_COMMAND_TRANSPORT_BUFFER = 0x4506, + INTEL_GUC_ACTION_LIMIT +}; + +enum intel_guc_preempt_options { + INTEL_GUC_PREEMPT_OPTION_DROP_WORK_Q = 0x4, + INTEL_GUC_PREEMPT_OPTION_DROP_SUBMIT_Q = 0x8, +}; + +enum intel_guc_report_status { + INTEL_GUC_REPORT_STATUS_UNKNOWN = 0x0, + INTEL_GUC_REPORT_STATUS_ACKED = 0x1, + INTEL_GUC_REPORT_STATUS_ERROR = 0x2, + INTEL_GUC_REPORT_STATUS_COMPLETE = 0x4, +}; + +enum intel_guc_sleep_state_status { + INTEL_GUC_SLEEP_STATE_SUCCESS = 0x1, + INTEL_GUC_SLEEP_STATE_PREEMPT_TO_IDLE_FAILED = 0x2, + INTEL_GUC_SLEEP_STATE_ENGINE_RESET_FAILED = 0x3 +#define INTEL_GUC_SLEEP_STATE_INVALID_MASK 0x80000000 +}; + +#define GUC_LOG_CONTROL_LOGGING_ENABLED (1 << 0) +#define GUC_LOG_CONTROL_VERBOSITY_SHIFT 4 +#define GUC_LOG_CONTROL_VERBOSITY_MASK (0xF << GUC_LOG_CONTROL_VERBOSITY_SHIFT) +#define GUC_LOG_CONTROL_DEFAULT_LOGGING (1 << 8) + +enum intel_guc_response_status { + INTEL_GUC_RESPONSE_STATUS_SUCCESS = 0x0, + INTEL_GUC_RESPONSE_STATUS_GENERIC_FAIL = 0xF000, +}; + +#define INTEL_GUC_MSG_IS_RESPONSE_SUCCESS(m) \ + (typecheck(u32, (m)) && \ + ((m) & (INTEL_GUC_MSG_TYPE_MASK | INTEL_GUC_MSG_CODE_MASK)) == \ + ((INTEL_GUC_MSG_TYPE_RESPONSE << INTEL_GUC_MSG_TYPE_SHIFT) | \ + (INTEL_GUC_RESPONSE_STATUS_SUCCESS << INTEL_GUC_MSG_CODE_SHIFT))) + +/* This action will be programmed in C1BC - SOFT_SCRATCH_15_REG */ +enum intel_guc_recv_message { + INTEL_GUC_RECV_MSG_CRASH_DUMP_POSTED = BIT(1), + INTEL_GUC_RECV_MSG_FLUSH_LOG_BUFFER = BIT(3) +}; + +#endif diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c new file mode 100644 index 000000000..9bbe8a795 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c @@ -0,0 +1,764 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2014-2019 Intel Corporation + */ + +#include <linux/debugfs.h> + +#include "gt/intel_gt.h" +#include "i915_drv.h" +#include "i915_memcpy.h" +#include "intel_guc_log.h" + +static void guc_log_capture_logs(struct intel_guc_log *log); + +/** + * DOC: GuC firmware log + * + * Firmware log is enabled by setting i915.guc_log_level to the positive level. + * Log data is printed out via reading debugfs i915_guc_log_dump. Reading from + * i915_guc_load_status will print out firmware loading status and scratch + * registers value. + */ + +static int guc_action_flush_log_complete(struct intel_guc *guc) +{ + u32 action[] = { + INTEL_GUC_ACTION_LOG_BUFFER_FILE_FLUSH_COMPLETE + }; + + return intel_guc_send(guc, action, ARRAY_SIZE(action)); +} + +static int guc_action_flush_log(struct intel_guc *guc) +{ + u32 action[] = { + INTEL_GUC_ACTION_FORCE_LOG_BUFFER_FLUSH, + 0 + }; + + return intel_guc_send(guc, action, ARRAY_SIZE(action)); +} + +static int guc_action_control_log(struct intel_guc *guc, bool enable, + bool default_logging, u32 verbosity) +{ + u32 action[] = { + INTEL_GUC_ACTION_UK_LOG_ENABLE_LOGGING, + (enable ? GUC_LOG_CONTROL_LOGGING_ENABLED : 0) | + (verbosity << GUC_LOG_CONTROL_VERBOSITY_SHIFT) | + (default_logging ? GUC_LOG_CONTROL_DEFAULT_LOGGING : 0) + }; + + GEM_BUG_ON(verbosity > GUC_LOG_VERBOSITY_MAX); + + return intel_guc_send(guc, action, ARRAY_SIZE(action)); +} + +static void guc_log_enable_flush_events(struct intel_guc_log *log) +{ + intel_guc_enable_msg(log_to_guc(log), + INTEL_GUC_RECV_MSG_FLUSH_LOG_BUFFER | + INTEL_GUC_RECV_MSG_CRASH_DUMP_POSTED); +} + +static void guc_log_disable_flush_events(struct intel_guc_log *log) +{ + intel_guc_disable_msg(log_to_guc(log), + INTEL_GUC_RECV_MSG_FLUSH_LOG_BUFFER | + INTEL_GUC_RECV_MSG_CRASH_DUMP_POSTED); +} + +/* + * Sub buffer switch callback. Called whenever relay has to switch to a new + * sub buffer, relay stays on the same sub buffer if 0 is returned. + */ +static int subbuf_start_callback(struct rchan_buf *buf, + void *subbuf, + void *prev_subbuf, + size_t prev_padding) +{ + /* + * Use no-overwrite mode by default, where relay will stop accepting + * new data if there are no empty sub buffers left. + * There is no strict synchronization enforced by relay between Consumer + * and Producer. In overwrite mode, there is a possibility of getting + * inconsistent/garbled data, the producer could be writing on to the + * same sub buffer from which Consumer is reading. This can't be avoided + * unless Consumer is fast enough and can always run in tandem with + * Producer. + */ + if (relay_buf_full(buf)) + return 0; + + return 1; +} + +/* + * file_create() callback. Creates relay file in debugfs. + */ +static struct dentry *create_buf_file_callback(const char *filename, + struct dentry *parent, + umode_t mode, + struct rchan_buf *buf, + int *is_global) +{ + struct dentry *buf_file; + + /* + * This to enable the use of a single buffer for the relay channel and + * correspondingly have a single file exposed to User, through which + * it can collect the logs in order without any post-processing. + * Need to set 'is_global' even if parent is NULL for early logging. + */ + *is_global = 1; + + if (!parent) + return NULL; + + buf_file = debugfs_create_file(filename, mode, + parent, buf, &relay_file_operations); + if (IS_ERR(buf_file)) + return NULL; + + return buf_file; +} + +/* + * file_remove() default callback. Removes relay file in debugfs. + */ +static int remove_buf_file_callback(struct dentry *dentry) +{ + debugfs_remove(dentry); + return 0; +} + +/* relay channel callbacks */ +static struct rchan_callbacks relay_callbacks = { + .subbuf_start = subbuf_start_callback, + .create_buf_file = create_buf_file_callback, + .remove_buf_file = remove_buf_file_callback, +}; + +static void guc_move_to_next_buf(struct intel_guc_log *log) +{ + /* + * Make sure the updates made in the sub buffer are visible when + * Consumer sees the following update to offset inside the sub buffer. + */ + smp_wmb(); + + /* All data has been written, so now move the offset of sub buffer. */ + relay_reserve(log->relay.channel, log->vma->obj->base.size); + + /* Switch to the next sub buffer */ + relay_flush(log->relay.channel); +} + +static void *guc_get_write_buffer(struct intel_guc_log *log) +{ + /* + * Just get the base address of a new sub buffer and copy data into it + * ourselves. NULL will be returned in no-overwrite mode, if all sub + * buffers are full. Could have used the relay_write() to indirectly + * copy the data, but that would have been bit convoluted, as we need to + * write to only certain locations inside a sub buffer which cannot be + * done without using relay_reserve() along with relay_write(). So its + * better to use relay_reserve() alone. + */ + return relay_reserve(log->relay.channel, 0); +} + +static bool guc_check_log_buf_overflow(struct intel_guc_log *log, + enum guc_log_buffer_type type, + unsigned int full_cnt) +{ + unsigned int prev_full_cnt = log->stats[type].sampled_overflow; + bool overflow = false; + + if (full_cnt != prev_full_cnt) { + overflow = true; + + log->stats[type].overflow = full_cnt; + log->stats[type].sampled_overflow += full_cnt - prev_full_cnt; + + if (full_cnt < prev_full_cnt) { + /* buffer_full_cnt is a 4 bit counter */ + log->stats[type].sampled_overflow += 16; + } + + dev_notice_ratelimited(guc_to_gt(log_to_guc(log))->i915->drm.dev, + "GuC log buffer overflow\n"); + } + + return overflow; +} + +static unsigned int guc_get_log_buffer_size(enum guc_log_buffer_type type) +{ + switch (type) { + case GUC_ISR_LOG_BUFFER: + return ISR_BUFFER_SIZE; + case GUC_DPC_LOG_BUFFER: + return DPC_BUFFER_SIZE; + case GUC_CRASH_DUMP_LOG_BUFFER: + return CRASH_BUFFER_SIZE; + default: + MISSING_CASE(type); + } + + return 0; +} + +static void guc_read_update_log_buffer(struct intel_guc_log *log) +{ + unsigned int buffer_size, read_offset, write_offset, bytes_to_copy, full_cnt; + struct guc_log_buffer_state *log_buf_state, *log_buf_snapshot_state; + struct guc_log_buffer_state log_buf_state_local; + enum guc_log_buffer_type type; + void *src_data, *dst_data; + bool new_overflow; + + mutex_lock(&log->relay.lock); + + if (WARN_ON(!intel_guc_log_relay_created(log))) + goto out_unlock; + + /* Get the pointer to shared GuC log buffer */ + log_buf_state = src_data = log->relay.buf_addr; + + /* Get the pointer to local buffer to store the logs */ + log_buf_snapshot_state = dst_data = guc_get_write_buffer(log); + + if (unlikely(!log_buf_snapshot_state)) { + /* + * Used rate limited to avoid deluge of messages, logs might be + * getting consumed by User at a slow rate. + */ + DRM_ERROR_RATELIMITED("no sub-buffer to capture logs\n"); + log->relay.full_count++; + + goto out_unlock; + } + + /* Actual logs are present from the 2nd page */ + src_data += PAGE_SIZE; + dst_data += PAGE_SIZE; + + for (type = GUC_ISR_LOG_BUFFER; type < GUC_MAX_LOG_BUFFER; type++) { + /* + * Make a copy of the state structure, inside GuC log buffer + * (which is uncached mapped), on the stack to avoid reading + * from it multiple times. + */ + memcpy(&log_buf_state_local, log_buf_state, + sizeof(struct guc_log_buffer_state)); + buffer_size = guc_get_log_buffer_size(type); + read_offset = log_buf_state_local.read_ptr; + write_offset = log_buf_state_local.sampled_write_ptr; + full_cnt = log_buf_state_local.buffer_full_cnt; + + /* Bookkeeping stuff */ + log->stats[type].flush += log_buf_state_local.flush_to_file; + new_overflow = guc_check_log_buf_overflow(log, type, full_cnt); + + /* Update the state of shared log buffer */ + log_buf_state->read_ptr = write_offset; + log_buf_state->flush_to_file = 0; + log_buf_state++; + + /* First copy the state structure in snapshot buffer */ + memcpy(log_buf_snapshot_state, &log_buf_state_local, + sizeof(struct guc_log_buffer_state)); + + /* + * The write pointer could have been updated by GuC firmware, + * after sending the flush interrupt to Host, for consistency + * set write pointer value to same value of sampled_write_ptr + * in the snapshot buffer. + */ + log_buf_snapshot_state->write_ptr = write_offset; + log_buf_snapshot_state++; + + /* Now copy the actual logs. */ + if (unlikely(new_overflow)) { + /* copy the whole buffer in case of overflow */ + read_offset = 0; + write_offset = buffer_size; + } else if (unlikely((read_offset > buffer_size) || + (write_offset > buffer_size))) { + DRM_ERROR("invalid log buffer state\n"); + /* copy whole buffer as offsets are unreliable */ + read_offset = 0; + write_offset = buffer_size; + } + + /* Just copy the newly written data */ + if (read_offset > write_offset) { + i915_memcpy_from_wc(dst_data, src_data, write_offset); + bytes_to_copy = buffer_size - read_offset; + } else { + bytes_to_copy = write_offset - read_offset; + } + i915_memcpy_from_wc(dst_data + read_offset, + src_data + read_offset, bytes_to_copy); + + src_data += buffer_size; + dst_data += buffer_size; + } + + guc_move_to_next_buf(log); + +out_unlock: + mutex_unlock(&log->relay.lock); +} + +static void capture_logs_work(struct work_struct *work) +{ + struct intel_guc_log *log = + container_of(work, struct intel_guc_log, relay.flush_work); + + guc_log_capture_logs(log); +} + +static int guc_log_map(struct intel_guc_log *log) +{ + void *vaddr; + + lockdep_assert_held(&log->relay.lock); + + if (!log->vma) + return -ENODEV; + + /* + * Create a WC (Uncached for read) vmalloc mapping of log + * buffer pages, so that we can directly get the data + * (up-to-date) from memory. + */ + vaddr = i915_gem_object_pin_map(log->vma->obj, I915_MAP_WC); + if (IS_ERR(vaddr)) + return PTR_ERR(vaddr); + + log->relay.buf_addr = vaddr; + + return 0; +} + +static void guc_log_unmap(struct intel_guc_log *log) +{ + lockdep_assert_held(&log->relay.lock); + + i915_gem_object_unpin_map(log->vma->obj); + log->relay.buf_addr = NULL; +} + +void intel_guc_log_init_early(struct intel_guc_log *log) +{ + mutex_init(&log->relay.lock); + INIT_WORK(&log->relay.flush_work, capture_logs_work); + log->relay.started = false; +} + +static int guc_log_relay_create(struct intel_guc_log *log) +{ + struct intel_guc *guc = log_to_guc(log); + struct drm_i915_private *dev_priv = guc_to_gt(guc)->i915; + struct rchan *guc_log_relay_chan; + size_t n_subbufs, subbuf_size; + int ret; + + lockdep_assert_held(&log->relay.lock); + GEM_BUG_ON(!log->vma); + + /* Keep the size of sub buffers same as shared log buffer */ + subbuf_size = log->vma->size; + + /* + * Store up to 8 snapshots, which is large enough to buffer sufficient + * boot time logs and provides enough leeway to User, in terms of + * latency, for consuming the logs from relay. Also doesn't take + * up too much memory. + */ + n_subbufs = 8; + + guc_log_relay_chan = relay_open("guc_log", + dev_priv->drm.primary->debugfs_root, + subbuf_size, n_subbufs, + &relay_callbacks, dev_priv); + if (!guc_log_relay_chan) { + DRM_ERROR("Couldn't create relay chan for GuC logging\n"); + + ret = -ENOMEM; + return ret; + } + + GEM_BUG_ON(guc_log_relay_chan->subbuf_size < subbuf_size); + log->relay.channel = guc_log_relay_chan; + + return 0; +} + +static void guc_log_relay_destroy(struct intel_guc_log *log) +{ + lockdep_assert_held(&log->relay.lock); + + relay_close(log->relay.channel); + log->relay.channel = NULL; +} + +static void guc_log_capture_logs(struct intel_guc_log *log) +{ + struct intel_guc *guc = log_to_guc(log); + struct drm_i915_private *dev_priv = guc_to_gt(guc)->i915; + intel_wakeref_t wakeref; + + guc_read_update_log_buffer(log); + + /* + * Generally device is expected to be active only at this + * time, so get/put should be really quick. + */ + with_intel_runtime_pm(&dev_priv->runtime_pm, wakeref) + guc_action_flush_log_complete(guc); +} + +static u32 __get_default_log_level(struct intel_guc_log *log) +{ + struct intel_guc *guc = log_to_guc(log); + struct drm_i915_private *i915 = guc_to_gt(guc)->i915; + + /* A negative value means "use platform/config default" */ + if (i915->params.guc_log_level < 0) { + return (IS_ENABLED(CONFIG_DRM_I915_DEBUG) || + IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) ? + GUC_LOG_LEVEL_MAX : GUC_LOG_LEVEL_NON_VERBOSE; + } + + if (i915->params.guc_log_level > GUC_LOG_LEVEL_MAX) { + DRM_WARN("Incompatible option detected: %s=%d, %s!\n", + "guc_log_level", i915->params.guc_log_level, + "verbosity too high"); + return (IS_ENABLED(CONFIG_DRM_I915_DEBUG) || + IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) ? + GUC_LOG_LEVEL_MAX : GUC_LOG_LEVEL_DISABLED; + } + + GEM_BUG_ON(i915->params.guc_log_level < GUC_LOG_LEVEL_DISABLED); + GEM_BUG_ON(i915->params.guc_log_level > GUC_LOG_LEVEL_MAX); + return i915->params.guc_log_level; +} + +int intel_guc_log_create(struct intel_guc_log *log) +{ + struct intel_guc *guc = log_to_guc(log); + struct i915_vma *vma; + u32 guc_log_size; + int ret; + + GEM_BUG_ON(log->vma); + + /* + * GuC Log buffer Layout + * + * +===============================+ 00B + * | Crash dump state header | + * +-------------------------------+ 32B + * | DPC state header | + * +-------------------------------+ 64B + * | ISR state header | + * +-------------------------------+ 96B + * | | + * +===============================+ PAGE_SIZE (4KB) + * | Crash Dump logs | + * +===============================+ + CRASH_SIZE + * | DPC logs | + * +===============================+ + DPC_SIZE + * | ISR logs | + * +===============================+ + ISR_SIZE + */ + guc_log_size = PAGE_SIZE + CRASH_BUFFER_SIZE + DPC_BUFFER_SIZE + + ISR_BUFFER_SIZE; + + vma = intel_guc_allocate_vma(guc, guc_log_size); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto err; + } + + log->vma = vma; + + log->level = __get_default_log_level(log); + DRM_DEBUG_DRIVER("guc_log_level=%d (%s, verbose:%s, verbosity:%d)\n", + log->level, enableddisabled(log->level), + yesno(GUC_LOG_LEVEL_IS_VERBOSE(log->level)), + GUC_LOG_LEVEL_TO_VERBOSITY(log->level)); + + return 0; + +err: + DRM_ERROR("Failed to allocate GuC log buffer. %d\n", ret); + return ret; +} + +void intel_guc_log_destroy(struct intel_guc_log *log) +{ + i915_vma_unpin_and_release(&log->vma, 0); +} + +int intel_guc_log_set_level(struct intel_guc_log *log, u32 level) +{ + struct intel_guc *guc = log_to_guc(log); + struct drm_i915_private *dev_priv = guc_to_gt(guc)->i915; + intel_wakeref_t wakeref; + int ret = 0; + + BUILD_BUG_ON(GUC_LOG_VERBOSITY_MIN != 0); + GEM_BUG_ON(!log->vma); + + /* + * GuC is recognizing log levels starting from 0 to max, we're using 0 + * as indication that logging should be disabled. + */ + if (level < GUC_LOG_LEVEL_DISABLED || level > GUC_LOG_LEVEL_MAX) + return -EINVAL; + + mutex_lock(&dev_priv->drm.struct_mutex); + + if (log->level == level) + goto out_unlock; + + with_intel_runtime_pm(&dev_priv->runtime_pm, wakeref) + ret = guc_action_control_log(guc, + GUC_LOG_LEVEL_IS_VERBOSE(level), + GUC_LOG_LEVEL_IS_ENABLED(level), + GUC_LOG_LEVEL_TO_VERBOSITY(level)); + if (ret) { + DRM_DEBUG_DRIVER("guc_log_control action failed %d\n", ret); + goto out_unlock; + } + + log->level = level; + +out_unlock: + mutex_unlock(&dev_priv->drm.struct_mutex); + + return ret; +} + +bool intel_guc_log_relay_created(const struct intel_guc_log *log) +{ + return log->relay.buf_addr; +} + +int intel_guc_log_relay_open(struct intel_guc_log *log) +{ + int ret; + + if (!log->vma) + return -ENODEV; + + mutex_lock(&log->relay.lock); + + if (intel_guc_log_relay_created(log)) { + ret = -EEXIST; + goto out_unlock; + } + + /* + * We require SSE 4.1 for fast reads from the GuC log buffer and + * it should be present on the chipsets supporting GuC based + * submisssions. + */ + if (!i915_has_memcpy_from_wc()) { + ret = -ENXIO; + goto out_unlock; + } + + ret = guc_log_relay_create(log); + if (ret) + goto out_unlock; + + ret = guc_log_map(log); + if (ret) + goto out_relay; + + mutex_unlock(&log->relay.lock); + + return 0; + +out_relay: + guc_log_relay_destroy(log); +out_unlock: + mutex_unlock(&log->relay.lock); + + return ret; +} + +int intel_guc_log_relay_start(struct intel_guc_log *log) +{ + if (log->relay.started) + return -EEXIST; + + guc_log_enable_flush_events(log); + + /* + * When GuC is logging without us relaying to userspace, we're ignoring + * the flush notification. This means that we need to unconditionally + * flush on relay enabling, since GuC only notifies us once. + */ + queue_work(system_highpri_wq, &log->relay.flush_work); + + log->relay.started = true; + + return 0; +} + +void intel_guc_log_relay_flush(struct intel_guc_log *log) +{ + struct intel_guc *guc = log_to_guc(log); + intel_wakeref_t wakeref; + + if (!log->relay.started) + return; + + /* + * Before initiating the forceful flush, wait for any pending/ongoing + * flush to complete otherwise forceful flush may not actually happen. + */ + flush_work(&log->relay.flush_work); + + with_intel_runtime_pm(guc_to_gt(guc)->uncore->rpm, wakeref) + guc_action_flush_log(guc); + + /* GuC would have updated log buffer by now, so capture it */ + guc_log_capture_logs(log); +} + +/* + * Stops the relay log. Called from intel_guc_log_relay_close(), so no + * possibility of race with start/flush since relay_write cannot race + * relay_close. + */ +static void guc_log_relay_stop(struct intel_guc_log *log) +{ + struct intel_guc *guc = log_to_guc(log); + struct drm_i915_private *i915 = guc_to_gt(guc)->i915; + + if (!log->relay.started) + return; + + guc_log_disable_flush_events(log); + intel_synchronize_irq(i915); + + flush_work(&log->relay.flush_work); + + log->relay.started = false; +} + +void intel_guc_log_relay_close(struct intel_guc_log *log) +{ + guc_log_relay_stop(log); + + mutex_lock(&log->relay.lock); + GEM_BUG_ON(!intel_guc_log_relay_created(log)); + guc_log_unmap(log); + guc_log_relay_destroy(log); + mutex_unlock(&log->relay.lock); +} + +void intel_guc_log_handle_flush_event(struct intel_guc_log *log) +{ + queue_work(system_highpri_wq, &log->relay.flush_work); +} + +static const char * +stringify_guc_log_type(enum guc_log_buffer_type type) +{ + switch (type) { + case GUC_ISR_LOG_BUFFER: + return "ISR"; + case GUC_DPC_LOG_BUFFER: + return "DPC"; + case GUC_CRASH_DUMP_LOG_BUFFER: + return "CRASH"; + default: + MISSING_CASE(type); + } + + return ""; +} + +/** + * intel_guc_log_info - dump information about GuC log relay + * @log: the GuC log + * @p: the &drm_printer + * + * Pretty printer for GuC log info + */ +void intel_guc_log_info(struct intel_guc_log *log, struct drm_printer *p) +{ + enum guc_log_buffer_type type; + + if (!intel_guc_log_relay_created(log)) { + drm_puts(p, "GuC log relay not created\n"); + return; + } + + drm_puts(p, "GuC logging stats:\n"); + + drm_printf(p, "\tRelay full count: %u\n", log->relay.full_count); + + for (type = GUC_ISR_LOG_BUFFER; type < GUC_MAX_LOG_BUFFER; type++) { + drm_printf(p, "\t%s:\tflush count %10u, overflow count %10u\n", + stringify_guc_log_type(type), + log->stats[type].flush, + log->stats[type].sampled_overflow); + } +} + +/** + * intel_guc_log_dump - dump the contents of the GuC log + * @log: the GuC log + * @p: the &drm_printer + * @dump_load_err: dump the log saved on GuC load error + * + * Pretty printer for the GuC log + */ +int intel_guc_log_dump(struct intel_guc_log *log, struct drm_printer *p, + bool dump_load_err) +{ + struct intel_guc *guc = log_to_guc(log); + struct intel_uc *uc = container_of(guc, struct intel_uc, guc); + struct drm_i915_gem_object *obj = NULL; + u32 *map; + int i = 0; + + if (!intel_guc_is_supported(guc)) + return -ENODEV; + + if (dump_load_err) + obj = uc->load_err_log; + else if (guc->log.vma) + obj = guc->log.vma->obj; + + if (!obj) + return 0; + + map = i915_gem_object_pin_map(obj, I915_MAP_WC); + if (IS_ERR(map)) { + DRM_DEBUG("Failed to pin object\n"); + drm_puts(p, "(log data unaccessible)\n"); + return PTR_ERR(map); + } + + for (i = 0; i < obj->base.size / sizeof(u32); i += 4) + drm_printf(p, "0x%08x 0x%08x 0x%08x 0x%08x\n", + *(map + i), *(map + i + 1), + *(map + i + 2), *(map + i + 3)); + + drm_puts(p, "\n"); + + i915_gem_object_unpin_map(obj); + + return 0; +} diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.h new file mode 100644 index 000000000..11fccd0b2 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.h @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2014-2019 Intel Corporation + */ + +#ifndef _INTEL_GUC_LOG_H_ +#define _INTEL_GUC_LOG_H_ + +#include <linux/mutex.h> +#include <linux/relay.h> +#include <linux/workqueue.h> + +#include "intel_guc_fwif.h" +#include "i915_gem.h" + +struct intel_guc; + +#ifdef CONFIG_DRM_I915_DEBUG_GUC +#define CRASH_BUFFER_SIZE SZ_2M +#define DPC_BUFFER_SIZE SZ_8M +#define ISR_BUFFER_SIZE SZ_8M +#else +#define CRASH_BUFFER_SIZE SZ_8K +#define DPC_BUFFER_SIZE SZ_32K +#define ISR_BUFFER_SIZE SZ_32K +#endif + +/* + * While we're using plain log level in i915, GuC controls are much more... + * "elaborate"? We have a couple of bits for verbosity, separate bit for actual + * log enabling, and separate bit for default logging - which "conveniently" + * ignores the enable bit. + */ +#define GUC_LOG_LEVEL_DISABLED 0 +#define GUC_LOG_LEVEL_NON_VERBOSE 1 +#define GUC_LOG_LEVEL_IS_ENABLED(x) ((x) > GUC_LOG_LEVEL_DISABLED) +#define GUC_LOG_LEVEL_IS_VERBOSE(x) ((x) > GUC_LOG_LEVEL_NON_VERBOSE) +#define GUC_LOG_LEVEL_TO_VERBOSITY(x) ({ \ + typeof(x) _x = (x); \ + GUC_LOG_LEVEL_IS_VERBOSE(_x) ? _x - 2 : 0; \ +}) +#define GUC_VERBOSITY_TO_LOG_LEVEL(x) ((x) + 2) +#define GUC_LOG_LEVEL_MAX GUC_VERBOSITY_TO_LOG_LEVEL(GUC_LOG_VERBOSITY_MAX) + +struct intel_guc_log { + u32 level; + struct i915_vma *vma; + struct { + void *buf_addr; + bool started; + struct work_struct flush_work; + struct rchan *channel; + struct mutex lock; + u32 full_count; + } relay; + /* logging related stats */ + struct { + u32 sampled_overflow; + u32 overflow; + u32 flush; + } stats[GUC_MAX_LOG_BUFFER]; +}; + +void intel_guc_log_init_early(struct intel_guc_log *log); +int intel_guc_log_create(struct intel_guc_log *log); +void intel_guc_log_destroy(struct intel_guc_log *log); + +int intel_guc_log_set_level(struct intel_guc_log *log, u32 level); +bool intel_guc_log_relay_created(const struct intel_guc_log *log); +int intel_guc_log_relay_open(struct intel_guc_log *log); +int intel_guc_log_relay_start(struct intel_guc_log *log); +void intel_guc_log_relay_flush(struct intel_guc_log *log); +void intel_guc_log_relay_close(struct intel_guc_log *log); + +void intel_guc_log_handle_flush_event(struct intel_guc_log *log); + +static inline u32 intel_guc_log_get_level(struct intel_guc_log *log) +{ + return log->level; +} + +void intel_guc_log_info(struct intel_guc_log *log, struct drm_printer *p); +int intel_guc_log_dump(struct intel_guc_log *log, struct drm_printer *p, + bool dump_load_err); + +#endif diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_log_debugfs.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_log_debugfs.c new file mode 100644 index 000000000..129e0cf7d --- /dev/null +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_log_debugfs.c @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2020 Intel Corporation + */ + +#include <linux/fs.h> +#include <drm/drm_print.h> + +#include "gt/debugfs_gt.h" +#include "intel_guc.h" +#include "intel_guc_log.h" +#include "intel_guc_log_debugfs.h" + +static int guc_log_dump_show(struct seq_file *m, void *data) +{ + struct drm_printer p = drm_seq_file_printer(m); + + return intel_guc_log_dump(m->private, &p, false); +} +DEFINE_GT_DEBUGFS_ATTRIBUTE(guc_log_dump); + +static int guc_load_err_log_dump_show(struct seq_file *m, void *data) +{ + struct drm_printer p = drm_seq_file_printer(m); + + return intel_guc_log_dump(m->private, &p, true); +} +DEFINE_GT_DEBUGFS_ATTRIBUTE(guc_load_err_log_dump); + +static int guc_log_level_get(void *data, u64 *val) +{ + struct intel_guc_log *log = data; + + if (!intel_guc_is_used(log_to_guc(log))) + return -ENODEV; + + *val = intel_guc_log_get_level(log); + + return 0; +} + +static int guc_log_level_set(void *data, u64 val) +{ + struct intel_guc_log *log = data; + + if (!intel_guc_is_used(log_to_guc(log))) + return -ENODEV; + + return intel_guc_log_set_level(log, val); +} + +DEFINE_SIMPLE_ATTRIBUTE(guc_log_level_fops, + guc_log_level_get, guc_log_level_set, + "%lld\n"); + +static int guc_log_relay_open(struct inode *inode, struct file *file) +{ + struct intel_guc_log *log = inode->i_private; + + if (!intel_guc_is_ready(log_to_guc(log))) + return -ENODEV; + + file->private_data = log; + + return intel_guc_log_relay_open(log); +} + +static ssize_t +guc_log_relay_write(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) +{ + struct intel_guc_log *log = filp->private_data; + int val; + int ret; + + ret = kstrtoint_from_user(ubuf, cnt, 0, &val); + if (ret < 0) + return ret; + + /* + * Enable and start the guc log relay on value of 1. + * Flush log relay for any other value. + */ + if (val == 1) + ret = intel_guc_log_relay_start(log); + else + intel_guc_log_relay_flush(log); + + return ret ?: cnt; +} + +static int guc_log_relay_release(struct inode *inode, struct file *file) +{ + struct intel_guc_log *log = inode->i_private; + + intel_guc_log_relay_close(log); + return 0; +} + +static const struct file_operations guc_log_relay_fops = { + .owner = THIS_MODULE, + .open = guc_log_relay_open, + .write = guc_log_relay_write, + .release = guc_log_relay_release, +}; + +void intel_guc_log_debugfs_register(struct intel_guc_log *log, + struct dentry *root) +{ + static const struct debugfs_gt_file files[] = { + { "guc_log_dump", &guc_log_dump_fops, NULL }, + { "guc_load_err_log_dump", &guc_load_err_log_dump_fops, NULL }, + { "guc_log_level", &guc_log_level_fops, NULL }, + { "guc_log_relay", &guc_log_relay_fops, NULL }, + }; + + if (!intel_guc_is_supported(log_to_guc(log))) + return; + + intel_gt_debugfs_register_files(root, files, ARRAY_SIZE(files), log); +} + diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_log_debugfs.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_log_debugfs.h new file mode 100644 index 000000000..e8900e3d7 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_log_debugfs.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2020 Intel Corporation + */ + +#ifndef DEBUGFS_GUC_LOG_H +#define DEBUGFS_GUC_LOG_H + +struct intel_guc_log; +struct dentry; + +void intel_guc_log_debugfs_register(struct intel_guc_log *log, + struct dentry *root); + +#endif /* DEBUGFS_GUC_LOG_H */ diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_reg.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_reg.h new file mode 100644 index 000000000..1949346e7 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_reg.h @@ -0,0 +1,146 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2014-2019 Intel Corporation + */ + +#ifndef _INTEL_GUC_REG_H_ +#define _INTEL_GUC_REG_H_ + +#include <linux/compiler.h> +#include <linux/types.h> + +#include "i915_reg.h" + +/* Definitions of GuC H/W registers, bits, etc */ + +#define GUC_STATUS _MMIO(0xc000) +#define GS_RESET_SHIFT 0 +#define GS_MIA_IN_RESET (0x01 << GS_RESET_SHIFT) +#define GS_BOOTROM_SHIFT 1 +#define GS_BOOTROM_MASK (0x7F << GS_BOOTROM_SHIFT) +#define GS_BOOTROM_RSA_FAILED (0x50 << GS_BOOTROM_SHIFT) +#define GS_BOOTROM_JUMP_PASSED (0x76 << GS_BOOTROM_SHIFT) +#define GS_UKERNEL_SHIFT 8 +#define GS_UKERNEL_MASK (0xFF << GS_UKERNEL_SHIFT) +#define GS_UKERNEL_LAPIC_DONE (0x30 << GS_UKERNEL_SHIFT) +#define GS_UKERNEL_DPC_ERROR (0x60 << GS_UKERNEL_SHIFT) +#define GS_UKERNEL_EXCEPTION (0x70 << GS_UKERNEL_SHIFT) +#define GS_UKERNEL_READY (0xF0 << GS_UKERNEL_SHIFT) +#define GS_MIA_SHIFT 16 +#define GS_MIA_MASK (0x07 << GS_MIA_SHIFT) +#define GS_MIA_CORE_STATE (0x01 << GS_MIA_SHIFT) +#define GS_MIA_HALT_REQUESTED (0x02 << GS_MIA_SHIFT) +#define GS_MIA_ISR_ENTRY (0x04 << GS_MIA_SHIFT) +#define GS_AUTH_STATUS_SHIFT 30 +#define GS_AUTH_STATUS_MASK (0x03 << GS_AUTH_STATUS_SHIFT) +#define GS_AUTH_STATUS_BAD (0x01 << GS_AUTH_STATUS_SHIFT) +#define GS_AUTH_STATUS_GOOD (0x02 << GS_AUTH_STATUS_SHIFT) + +#define SOFT_SCRATCH(n) _MMIO(0xc180 + (n) * 4) +#define SOFT_SCRATCH_COUNT 16 + +#define GEN11_SOFT_SCRATCH(n) _MMIO(0x190240 + (n) * 4) +#define GEN11_SOFT_SCRATCH_COUNT 4 + +#define UOS_RSA_SCRATCH(i) _MMIO(0xc200 + (i) * 4) +#define UOS_RSA_SCRATCH_COUNT 64 + +#define DMA_ADDR_0_LOW _MMIO(0xc300) +#define DMA_ADDR_0_HIGH _MMIO(0xc304) +#define DMA_ADDR_1_LOW _MMIO(0xc308) +#define DMA_ADDR_1_HIGH _MMIO(0xc30c) +#define DMA_ADDRESS_SPACE_WOPCM (7 << 16) +#define DMA_ADDRESS_SPACE_GTT (8 << 16) +#define DMA_COPY_SIZE _MMIO(0xc310) +#define DMA_CTRL _MMIO(0xc314) +#define HUC_UKERNEL (1<<9) +#define UOS_MOVE (1<<4) +#define START_DMA (1<<0) +#define DMA_GUC_WOPCM_OFFSET _MMIO(0xc340) +#define GUC_WOPCM_OFFSET_VALID (1<<0) +#define HUC_LOADING_AGENT_VCR (0<<1) +#define HUC_LOADING_AGENT_GUC (1<<1) +#define GUC_WOPCM_OFFSET_SHIFT 14 +#define GUC_WOPCM_OFFSET_MASK (0x3ffff << GUC_WOPCM_OFFSET_SHIFT) +#define GUC_MAX_IDLE_COUNT _MMIO(0xC3E4) + +#define HUC_STATUS2 _MMIO(0xD3B0) +#define HUC_FW_VERIFIED (1<<7) + +#define GEN11_HUC_KERNEL_LOAD_INFO _MMIO(0xC1DC) +#define HUC_LOAD_SUCCESSFUL (1 << 0) + +#define GUC_WOPCM_SIZE _MMIO(0xc050) +#define GUC_WOPCM_SIZE_LOCKED (1<<0) +#define GUC_WOPCM_SIZE_SHIFT 12 +#define GUC_WOPCM_SIZE_MASK (0xfffff << GUC_WOPCM_SIZE_SHIFT) + +#define GEN8_GT_PM_CONFIG _MMIO(0x138140) +#define GEN9LP_GT_PM_CONFIG _MMIO(0x138140) +#define GEN9_GT_PM_CONFIG _MMIO(0x13816c) +#define GT_DOORBELL_ENABLE (1<<0) + +#define GEN8_GTCR _MMIO(0x4274) +#define GEN8_GTCR_INVALIDATE (1<<0) + +#define GEN12_GUC_TLB_INV_CR _MMIO(0xcee8) +#define GEN12_GUC_TLB_INV_CR_INVALIDATE (1 << 0) + +#define GUC_ARAT_C6DIS _MMIO(0xA178) + +#define GUC_SHIM_CONTROL _MMIO(0xc064) +#define GUC_DISABLE_SRAM_INIT_TO_ZEROES (1<<0) +#define GUC_ENABLE_READ_CACHE_LOGIC (1<<1) +#define GUC_ENABLE_MIA_CACHING (1<<2) +#define GUC_GEN10_MSGCH_ENABLE (1<<4) +#define GUC_ENABLE_READ_CACHE_FOR_SRAM_DATA (1<<9) +#define GUC_ENABLE_READ_CACHE_FOR_WOPCM_DATA (1<<10) +#define GUC_ENABLE_MIA_CLOCK_GATING (1<<15) +#define GUC_GEN10_SHIM_WC_ENABLE (1<<21) + +#define GUC_SEND_INTERRUPT _MMIO(0xc4c8) +#define GUC_SEND_TRIGGER (1<<0) +#define GEN11_GUC_HOST_INTERRUPT _MMIO(0x1901f0) + +#define GUC_NUM_DOORBELLS 256 + +/* format of the HW-monitored doorbell cacheline */ +struct guc_doorbell_info { + u32 db_status; +#define GUC_DOORBELL_DISABLED 0 +#define GUC_DOORBELL_ENABLED 1 + + u32 cookie; + u32 reserved[14]; +} __packed; + +#define GEN8_DRBREGL(x) _MMIO(0x1000 + (x) * 8) +#define GEN8_DRB_VALID (1<<0) +#define GEN8_DRBREGU(x) _MMIO(0x1000 + (x) * 8 + 4) + +#define DE_GUCRMR _MMIO(0x44054) + +#define GUC_BCS_RCS_IER _MMIO(0xC550) +#define GUC_VCS2_VCS1_IER _MMIO(0xC554) +#define GUC_WD_VECS_IER _MMIO(0xC558) +#define GUC_PM_P24C_IER _MMIO(0xC55C) + +/* GuC Interrupt Vector */ +#define GUC_INTR_GUC2HOST BIT(15) +#define GUC_INTR_EXEC_ERROR BIT(14) +#define GUC_INTR_DISPLAY_EVENT BIT(13) +#define GUC_INTR_SEM_SIG BIT(12) +#define GUC_INTR_IOMMU2GUC BIT(11) +#define GUC_INTR_DOORBELL_RANG BIT(10) +#define GUC_INTR_DMA_DONE BIT(9) +#define GUC_INTR_FATAL_ERROR BIT(8) +#define GUC_INTR_NOTIF_ERROR BIT(7) +#define GUC_INTR_SW_INT_6 BIT(6) +#define GUC_INTR_SW_INT_5 BIT(5) +#define GUC_INTR_SW_INT_4 BIT(4) +#define GUC_INTR_SW_INT_3 BIT(3) +#define GUC_INTR_SW_INT_2 BIT(2) +#define GUC_INTR_SW_INT_1 BIT(1) +#define GUC_INTR_SW_INT_0 BIT(0) + +#endif diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c new file mode 100644 index 000000000..fdfeb4b9b --- /dev/null +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -0,0 +1,679 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2014 Intel Corporation + */ + +#include <linux/circ_buf.h> + +#include "gem/i915_gem_context.h" +#include "gt/intel_context.h" +#include "gt/intel_engine_pm.h" +#include "gt/intel_gt.h" +#include "gt/intel_gt_pm.h" +#include "gt/intel_lrc_reg.h" +#include "gt/intel_ring.h" + +#include "intel_guc_submission.h" + +#include "i915_drv.h" +#include "i915_trace.h" + +/** + * DOC: GuC-based command submission + * + * IMPORTANT NOTE: GuC submission is currently not supported in i915. The GuC + * firmware is moving to an updated submission interface and we plan to + * turn submission back on when that lands. The below documentation (and related + * code) matches the old submission model and will be updated as part of the + * upgrade to the new flow. + * + * GuC stage descriptor: + * During initialization, the driver allocates a static pool of 1024 such + * descriptors, and shares them with the GuC. Currently, we only use one + * descriptor. This stage descriptor lets the GuC know about the workqueue and + * process descriptor. Theoretically, it also lets the GuC know about our HW + * contexts (context ID, etc...), but we actually employ a kind of submission + * where the GuC uses the LRCA sent via the work item instead. This is called + * a "proxy" submission. + * + * The Scratch registers: + * There are 16 MMIO-based registers start from 0xC180. The kernel driver writes + * a value to the action register (SOFT_SCRATCH_0) along with any data. It then + * triggers an interrupt on the GuC via another register write (0xC4C8). + * Firmware writes a success/fail code back to the action register after + * processes the request. The kernel driver polls waiting for this update and + * then proceeds. + * + * Work Items: + * There are several types of work items that the host may place into a + * workqueue, each with its own requirements and limitations. Currently only + * WQ_TYPE_INORDER is needed to support legacy submission via GuC, which + * represents in-order queue. The kernel driver packs ring tail pointer and an + * ELSP context descriptor dword into Work Item. + * See guc_add_request() + * + */ + +static inline struct i915_priolist *to_priolist(struct rb_node *rb) +{ + return rb_entry(rb, struct i915_priolist, node); +} + +static struct guc_stage_desc *__get_stage_desc(struct intel_guc *guc, u32 id) +{ + struct guc_stage_desc *base = guc->stage_desc_pool_vaddr; + + return &base[id]; +} + +static int guc_workqueue_create(struct intel_guc *guc) +{ + return intel_guc_allocate_and_map_vma(guc, GUC_WQ_SIZE, &guc->workqueue, + &guc->workqueue_vaddr); +} + +static void guc_workqueue_destroy(struct intel_guc *guc) +{ + i915_vma_unpin_and_release(&guc->workqueue, I915_VMA_RELEASE_MAP); +} + +/* + * Initialise the process descriptor shared with the GuC firmware. + */ +static int guc_proc_desc_create(struct intel_guc *guc) +{ + const u32 size = PAGE_ALIGN(sizeof(struct guc_process_desc)); + + return intel_guc_allocate_and_map_vma(guc, size, &guc->proc_desc, + &guc->proc_desc_vaddr); +} + +static void guc_proc_desc_destroy(struct intel_guc *guc) +{ + i915_vma_unpin_and_release(&guc->proc_desc, I915_VMA_RELEASE_MAP); +} + +static void guc_proc_desc_init(struct intel_guc *guc) +{ + struct guc_process_desc *desc; + + desc = memset(guc->proc_desc_vaddr, 0, sizeof(*desc)); + + /* + * XXX: pDoorbell and WQVBaseAddress are pointers in process address + * space for ring3 clients (set them as in mmap_ioctl) or kernel + * space for kernel clients (map on demand instead? May make debug + * easier to have it mapped). + */ + desc->wq_base_addr = 0; + desc->db_base_addr = 0; + + desc->wq_size_bytes = GUC_WQ_SIZE; + desc->wq_status = WQ_STATUS_ACTIVE; + desc->priority = GUC_CLIENT_PRIORITY_KMD_NORMAL; +} + +static void guc_proc_desc_fini(struct intel_guc *guc) +{ + memset(guc->proc_desc_vaddr, 0, sizeof(struct guc_process_desc)); +} + +static int guc_stage_desc_pool_create(struct intel_guc *guc) +{ + u32 size = PAGE_ALIGN(sizeof(struct guc_stage_desc) * + GUC_MAX_STAGE_DESCRIPTORS); + + return intel_guc_allocate_and_map_vma(guc, size, &guc->stage_desc_pool, + &guc->stage_desc_pool_vaddr); +} + +static void guc_stage_desc_pool_destroy(struct intel_guc *guc) +{ + i915_vma_unpin_and_release(&guc->stage_desc_pool, I915_VMA_RELEASE_MAP); +} + +/* + * Initialise/clear the stage descriptor shared with the GuC firmware. + * + * This descriptor tells the GuC where (in GGTT space) to find the important + * data structures related to work submission (process descriptor, write queue, + * etc). + */ +static void guc_stage_desc_init(struct intel_guc *guc) +{ + struct guc_stage_desc *desc; + + /* we only use 1 stage desc, so hardcode it to 0 */ + desc = __get_stage_desc(guc, 0); + memset(desc, 0, sizeof(*desc)); + + desc->attribute = GUC_STAGE_DESC_ATTR_ACTIVE | + GUC_STAGE_DESC_ATTR_KERNEL; + + desc->stage_id = 0; + desc->priority = GUC_CLIENT_PRIORITY_KMD_NORMAL; + + desc->process_desc = intel_guc_ggtt_offset(guc, guc->proc_desc); + desc->wq_addr = intel_guc_ggtt_offset(guc, guc->workqueue); + desc->wq_size = GUC_WQ_SIZE; +} + +static void guc_stage_desc_fini(struct intel_guc *guc) +{ + struct guc_stage_desc *desc; + + desc = __get_stage_desc(guc, 0); + memset(desc, 0, sizeof(*desc)); +} + +/* Construct a Work Item and append it to the GuC's Work Queue */ +static void guc_wq_item_append(struct intel_guc *guc, + u32 target_engine, u32 context_desc, + u32 ring_tail, u32 fence_id) +{ + /* wqi_len is in DWords, and does not include the one-word header */ + const size_t wqi_size = sizeof(struct guc_wq_item); + const u32 wqi_len = wqi_size / sizeof(u32) - 1; + struct guc_process_desc *desc = guc->proc_desc_vaddr; + struct guc_wq_item *wqi; + u32 wq_off; + + lockdep_assert_held(&guc->wq_lock); + + /* For now workqueue item is 4 DWs; workqueue buffer is 2 pages. So we + * should not have the case where structure wqi is across page, neither + * wrapped to the beginning. This simplifies the implementation below. + * + * XXX: if not the case, we need save data to a temp wqi and copy it to + * workqueue buffer dw by dw. + */ + BUILD_BUG_ON(wqi_size != 16); + + /* We expect the WQ to be active if we're appending items to it */ + GEM_BUG_ON(desc->wq_status != WQ_STATUS_ACTIVE); + + /* Free space is guaranteed. */ + wq_off = READ_ONCE(desc->tail); + GEM_BUG_ON(CIRC_SPACE(wq_off, READ_ONCE(desc->head), + GUC_WQ_SIZE) < wqi_size); + GEM_BUG_ON(wq_off & (wqi_size - 1)); + + wqi = guc->workqueue_vaddr + wq_off; + + /* Now fill in the 4-word work queue item */ + wqi->header = WQ_TYPE_INORDER | + (wqi_len << WQ_LEN_SHIFT) | + (target_engine << WQ_TARGET_SHIFT) | + WQ_NO_WCFLUSH_WAIT; + wqi->context_desc = context_desc; + wqi->submit_element_info = ring_tail << WQ_RING_TAIL_SHIFT; + GEM_BUG_ON(ring_tail > WQ_RING_TAIL_MAX); + wqi->fence_id = fence_id; + + /* Make the update visible to GuC */ + WRITE_ONCE(desc->tail, (wq_off + wqi_size) & (GUC_WQ_SIZE - 1)); +} + +static void guc_add_request(struct intel_guc *guc, struct i915_request *rq) +{ + struct intel_engine_cs *engine = rq->engine; + u32 ctx_desc = rq->context->lrc.ccid; + u32 ring_tail = intel_ring_set_tail(rq->ring, rq->tail) / sizeof(u64); + + guc_wq_item_append(guc, engine->guc_id, ctx_desc, + ring_tail, rq->fence.seqno); +} + +/* + * When we're doing submissions using regular execlists backend, writing to + * ELSP from CPU side is enough to make sure that writes to ringbuffer pages + * pinned in mappable aperture portion of GGTT are visible to command streamer. + * Writes done by GuC on our behalf are not guaranteeing such ordering, + * therefore, to ensure the flush, we're issuing a POSTING READ. + */ +static void flush_ggtt_writes(struct i915_vma *vma) +{ + if (i915_vma_is_map_and_fenceable(vma)) + intel_uncore_posting_read_fw(vma->vm->gt->uncore, + GUC_STATUS); +} + +static void guc_submit(struct intel_engine_cs *engine, + struct i915_request **out, + struct i915_request **end) +{ + struct intel_guc *guc = &engine->gt->uc.guc; + + spin_lock(&guc->wq_lock); + + do { + struct i915_request *rq = *out++; + + flush_ggtt_writes(rq->ring->vma); + guc_add_request(guc, rq); + } while (out != end); + + spin_unlock(&guc->wq_lock); +} + +static inline int rq_prio(const struct i915_request *rq) +{ + return rq->sched.attr.priority; +} + +static struct i915_request *schedule_in(struct i915_request *rq, int idx) +{ + trace_i915_request_in(rq, idx); + + /* + * Currently we are not tracking the rq->context being inflight + * (ce->inflight = rq->engine). It is only used by the execlists + * backend at the moment, a similar counting strategy would be + * required if we generalise the inflight tracking. + */ + + __intel_gt_pm_get(rq->engine->gt); + return i915_request_get(rq); +} + +static void schedule_out(struct i915_request *rq) +{ + trace_i915_request_out(rq); + + intel_gt_pm_put_async(rq->engine->gt); + i915_request_put(rq); +} + +static void __guc_dequeue(struct intel_engine_cs *engine) +{ + struct intel_engine_execlists * const execlists = &engine->execlists; + struct i915_request **first = execlists->inflight; + struct i915_request ** const last_port = first + execlists->port_mask; + struct i915_request *last = first[0]; + struct i915_request **port; + bool submit = false; + struct rb_node *rb; + + lockdep_assert_held(&engine->active.lock); + + if (last) { + if (*++first) + return; + + last = NULL; + } + + /* + * We write directly into the execlists->inflight queue and don't use + * the execlists->pending queue, as we don't have a distinct switch + * event. + */ + port = first; + while ((rb = rb_first_cached(&execlists->queue))) { + struct i915_priolist *p = to_priolist(rb); + struct i915_request *rq, *rn; + int i; + + priolist_for_each_request_consume(rq, rn, p, i) { + if (last && rq->context != last->context) { + if (port == last_port) + goto done; + + *port = schedule_in(last, + port - execlists->inflight); + port++; + } + + list_del_init(&rq->sched.link); + __i915_request_submit(rq); + submit = true; + last = rq; + } + + rb_erase_cached(&p->node, &execlists->queue); + i915_priolist_free(p); + } +done: + execlists->queue_priority_hint = + rb ? to_priolist(rb)->priority : INT_MIN; + if (submit) { + *port = schedule_in(last, port - execlists->inflight); + *++port = NULL; + guc_submit(engine, first, port); + } + execlists->active = execlists->inflight; +} + +static void guc_submission_tasklet(unsigned long data) +{ + struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; + struct intel_engine_execlists * const execlists = &engine->execlists; + struct i915_request **port, *rq; + unsigned long flags; + + spin_lock_irqsave(&engine->active.lock, flags); + + for (port = execlists->inflight; (rq = *port); port++) { + if (!i915_request_completed(rq)) + break; + + schedule_out(rq); + } + if (port != execlists->inflight) { + int idx = port - execlists->inflight; + int rem = ARRAY_SIZE(execlists->inflight) - idx; + memmove(execlists->inflight, port, rem * sizeof(*port)); + } + + __guc_dequeue(engine); + + spin_unlock_irqrestore(&engine->active.lock, flags); +} + +static void guc_reset_prepare(struct intel_engine_cs *engine) +{ + struct intel_engine_execlists * const execlists = &engine->execlists; + + ENGINE_TRACE(engine, "\n"); + + /* + * Prevent request submission to the hardware until we have + * completed the reset in i915_gem_reset_finish(). If a request + * is completed by one engine, it may then queue a request + * to a second via its execlists->tasklet *just* as we are + * calling engine->init_hw() and also writing the ELSP. + * Turning off the execlists->tasklet until the reset is over + * prevents the race. + */ + __tasklet_disable_sync_once(&execlists->tasklet); +} + +static void +cancel_port_requests(struct intel_engine_execlists * const execlists) +{ + struct i915_request * const *port, *rq; + + /* Note we are only using the inflight and not the pending queue */ + + for (port = execlists->active; (rq = *port); port++) + schedule_out(rq); + execlists->active = + memset(execlists->inflight, 0, sizeof(execlists->inflight)); +} + +static void guc_reset_rewind(struct intel_engine_cs *engine, bool stalled) +{ + struct intel_engine_execlists * const execlists = &engine->execlists; + struct i915_request *rq; + unsigned long flags; + + spin_lock_irqsave(&engine->active.lock, flags); + + cancel_port_requests(execlists); + + /* Push back any incomplete requests for replay after the reset. */ + rq = execlists_unwind_incomplete_requests(execlists); + if (!rq) + goto out_unlock; + + if (!i915_request_started(rq)) + stalled = false; + + __i915_request_reset(rq, stalled); + intel_lr_context_reset(engine, rq->context, rq->head, stalled); + +out_unlock: + spin_unlock_irqrestore(&engine->active.lock, flags); +} + +static void guc_reset_cancel(struct intel_engine_cs *engine) +{ + struct intel_engine_execlists * const execlists = &engine->execlists; + struct i915_request *rq, *rn; + struct rb_node *rb; + unsigned long flags; + + ENGINE_TRACE(engine, "\n"); + + /* + * Before we call engine->cancel_requests(), we should have exclusive + * access to the submission state. This is arranged for us by the + * caller disabling the interrupt generation, the tasklet and other + * threads that may then access the same state, giving us a free hand + * to reset state. However, we still need to let lockdep be aware that + * we know this state may be accessed in hardirq context, so we + * disable the irq around this manipulation and we want to keep + * the spinlock focused on its duties and not accidentally conflate + * coverage to the submission's irq state. (Similarly, although we + * shouldn't need to disable irq around the manipulation of the + * submission's irq state, we also wish to remind ourselves that + * it is irq state.) + */ + spin_lock_irqsave(&engine->active.lock, flags); + + /* Cancel the requests on the HW and clear the ELSP tracker. */ + cancel_port_requests(execlists); + + /* Mark all executing requests as skipped. */ + list_for_each_entry(rq, &engine->active.requests, sched.link) { + i915_request_set_error_once(rq, -EIO); + i915_request_mark_complete(rq); + } + + /* Flush the queued requests to the timeline list (for retiring). */ + while ((rb = rb_first_cached(&execlists->queue))) { + struct i915_priolist *p = to_priolist(rb); + int i; + + priolist_for_each_request_consume(rq, rn, p, i) { + list_del_init(&rq->sched.link); + __i915_request_submit(rq); + dma_fence_set_error(&rq->fence, -EIO); + i915_request_mark_complete(rq); + } + + rb_erase_cached(&p->node, &execlists->queue); + i915_priolist_free(p); + } + + /* Remaining _unready_ requests will be nop'ed when submitted */ + + execlists->queue_priority_hint = INT_MIN; + execlists->queue = RB_ROOT_CACHED; + + spin_unlock_irqrestore(&engine->active.lock, flags); +} + +static void guc_reset_finish(struct intel_engine_cs *engine) +{ + struct intel_engine_execlists * const execlists = &engine->execlists; + + if (__tasklet_enable(&execlists->tasklet)) + /* And kick in case we missed a new request submission. */ + tasklet_hi_schedule(&execlists->tasklet); + + ENGINE_TRACE(engine, "depth->%d\n", + atomic_read(&execlists->tasklet.count)); +} + +/* + * Everything below here is concerned with setup & teardown, and is + * therefore not part of the somewhat time-critical batch-submission + * path of guc_submit() above. + */ + +/* + * Set up the memory resources to be shared with the GuC (via the GGTT) + * at firmware loading time. + */ +int intel_guc_submission_init(struct intel_guc *guc) +{ + int ret; + + if (guc->stage_desc_pool) + return 0; + + ret = guc_stage_desc_pool_create(guc); + if (ret) + return ret; + /* + * Keep static analysers happy, let them know that we allocated the + * vma after testing that it didn't exist earlier. + */ + GEM_BUG_ON(!guc->stage_desc_pool); + + ret = guc_workqueue_create(guc); + if (ret) + goto err_pool; + + ret = guc_proc_desc_create(guc); + if (ret) + goto err_workqueue; + + spin_lock_init(&guc->wq_lock); + + return 0; + +err_workqueue: + guc_workqueue_destroy(guc); +err_pool: + guc_stage_desc_pool_destroy(guc); + return ret; +} + +void intel_guc_submission_fini(struct intel_guc *guc) +{ + if (guc->stage_desc_pool) { + guc_proc_desc_destroy(guc); + guc_workqueue_destroy(guc); + guc_stage_desc_pool_destroy(guc); + } +} + +static void guc_interrupts_capture(struct intel_gt *gt) +{ + struct intel_uncore *uncore = gt->uncore; + u32 irqs = GT_CONTEXT_SWITCH_INTERRUPT; + u32 dmask = irqs << 16 | irqs; + + GEM_BUG_ON(INTEL_GEN(gt->i915) < 11); + + /* Don't handle the ctx switch interrupt in GuC submission mode */ + intel_uncore_rmw(uncore, GEN11_RENDER_COPY_INTR_ENABLE, dmask, 0); + intel_uncore_rmw(uncore, GEN11_VCS_VECS_INTR_ENABLE, dmask, 0); +} + +static void guc_interrupts_release(struct intel_gt *gt) +{ + struct intel_uncore *uncore = gt->uncore; + u32 irqs = GT_CONTEXT_SWITCH_INTERRUPT; + u32 dmask = irqs << 16 | irqs; + + GEM_BUG_ON(INTEL_GEN(gt->i915) < 11); + + /* Handle ctx switch interrupts again */ + intel_uncore_rmw(uncore, GEN11_RENDER_COPY_INTR_ENABLE, 0, dmask); + intel_uncore_rmw(uncore, GEN11_VCS_VECS_INTR_ENABLE, 0, dmask); +} + +static void guc_set_default_submission(struct intel_engine_cs *engine) +{ + /* + * We inherit a bunch of functions from execlists that we'd like + * to keep using: + * + * engine->submit_request = execlists_submit_request; + * engine->cancel_requests = execlists_cancel_requests; + * engine->schedule = execlists_schedule; + * + * But we need to override the actual submission backend in order + * to talk to the GuC. + */ + intel_execlists_set_default_submission(engine); + + engine->execlists.tasklet.func = guc_submission_tasklet; + + /* do not use execlists park/unpark */ + engine->park = engine->unpark = NULL; + + engine->reset.prepare = guc_reset_prepare; + engine->reset.rewind = guc_reset_rewind; + engine->reset.cancel = guc_reset_cancel; + engine->reset.finish = guc_reset_finish; + + engine->flags &= ~I915_ENGINE_SUPPORTS_STATS; + engine->flags |= I915_ENGINE_NEEDS_BREADCRUMB_TASKLET; + + /* + * For the breadcrumb irq to work we need the interrupts to stay + * enabled. However, on all platforms on which we'll have support for + * GuC submission we don't allow disabling the interrupts at runtime, so + * we're always safe with the current flow. + */ + GEM_BUG_ON(engine->irq_enable || engine->irq_disable); +} + +void intel_guc_submission_enable(struct intel_guc *guc) +{ + struct intel_gt *gt = guc_to_gt(guc); + struct intel_engine_cs *engine; + enum intel_engine_id id; + + /* + * We're using GuC work items for submitting work through GuC. Since + * we're coalescing multiple requests from a single context into a + * single work item prior to assigning it to execlist_port, we can + * never have more work items than the total number of ports (for all + * engines). The GuC firmware is controlling the HEAD of work queue, + * and it is guaranteed that it will remove the work item from the + * queue before our request is completed. + */ + BUILD_BUG_ON(ARRAY_SIZE(engine->execlists.inflight) * + sizeof(struct guc_wq_item) * + I915_NUM_ENGINES > GUC_WQ_SIZE); + + guc_proc_desc_init(guc); + guc_stage_desc_init(guc); + + /* Take over from manual control of ELSP (execlists) */ + guc_interrupts_capture(gt); + + for_each_engine(engine, gt, id) { + engine->set_default_submission = guc_set_default_submission; + engine->set_default_submission(engine); + } +} + +void intel_guc_submission_disable(struct intel_guc *guc) +{ + struct intel_gt *gt = guc_to_gt(guc); + + GEM_BUG_ON(gt->awake); /* GT should be parked first */ + + /* Note: By the time we're here, GuC may have already been reset */ + + guc_interrupts_release(gt); + + guc_stage_desc_fini(guc); + guc_proc_desc_fini(guc); +} + +static bool __guc_submission_selected(struct intel_guc *guc) +{ + struct drm_i915_private *i915 = guc_to_gt(guc)->i915; + + if (!intel_guc_submission_is_supported(guc)) + return false; + + return i915->params.enable_guc & ENABLE_GUC_SUBMISSION; +} + +void intel_guc_submission_init_early(struct intel_guc *guc) +{ + guc->submission_selected = __guc_submission_selected(guc); +} + +bool intel_engine_in_guc_submission_mode(const struct intel_engine_cs *engine) +{ + return engine->set_default_submission == guc_set_default_submission; +} diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h new file mode 100644 index 000000000..4cf9d3e50 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2014-2019 Intel Corporation + */ + +#ifndef _INTEL_GUC_SUBMISSION_H_ +#define _INTEL_GUC_SUBMISSION_H_ + +#include <linux/types.h> + +#include "intel_guc.h" + +struct intel_engine_cs; + +void intel_guc_submission_init_early(struct intel_guc *guc); +int intel_guc_submission_init(struct intel_guc *guc); +void intel_guc_submission_enable(struct intel_guc *guc); +void intel_guc_submission_disable(struct intel_guc *guc); +void intel_guc_submission_fini(struct intel_guc *guc); +int intel_guc_preempt_work_create(struct intel_guc *guc); +void intel_guc_preempt_work_destroy(struct intel_guc *guc); +bool intel_engine_in_guc_submission_mode(const struct intel_engine_cs *engine); + +static inline bool intel_guc_submission_is_supported(struct intel_guc *guc) +{ + /* XXX: GuC submission is unavailable for now */ + return false; +} + +static inline bool intel_guc_submission_is_wanted(struct intel_guc *guc) +{ + return guc->submission_selected; +} + +static inline bool intel_guc_submission_is_used(struct intel_guc *guc) +{ + return intel_guc_is_used(guc) && intel_guc_submission_is_wanted(guc); +} + +#endif diff --git a/drivers/gpu/drm/i915/gt/uc/intel_huc.c b/drivers/gpu/drm/i915/gt/uc/intel_huc.c new file mode 100644 index 000000000..65eeb44b3 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/uc/intel_huc.c @@ -0,0 +1,263 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2016-2019 Intel Corporation + */ + +#include <linux/types.h> + +#include "gt/intel_gt.h" +#include "intel_huc.h" +#include "i915_drv.h" + +/** + * DOC: HuC + * + * The HuC is a dedicated microcontroller for usage in media HEVC (High + * Efficiency Video Coding) operations. Userspace can directly use the firmware + * capabilities by adding HuC specific commands to batch buffers. + * + * The kernel driver is only responsible for loading the HuC firmware and + * triggering its security authentication, which is performed by the GuC. For + * The GuC to correctly perform the authentication, the HuC binary must be + * loaded before the GuC one. Loading the HuC is optional; however, not using + * the HuC might negatively impact power usage and/or performance of media + * workloads, depending on the use-cases. + * + * See https://github.com/intel/media-driver for the latest details on HuC + * functionality. + */ + +/** + * DOC: HuC Memory Management + * + * Similarly to the GuC, the HuC can't do any memory allocations on its own, + * with the difference being that the allocations for HuC usage are handled by + * the userspace driver instead of the kernel one. The HuC accesses the memory + * via the PPGTT belonging to the context loaded on the VCS executing the + * HuC-specific commands. + */ + +void intel_huc_init_early(struct intel_huc *huc) +{ + struct drm_i915_private *i915 = huc_to_gt(huc)->i915; + + intel_uc_fw_init_early(&huc->fw, INTEL_UC_FW_TYPE_HUC); + + if (INTEL_GEN(i915) >= 11) { + huc->status.reg = GEN11_HUC_KERNEL_LOAD_INFO; + huc->status.mask = HUC_LOAD_SUCCESSFUL; + huc->status.value = HUC_LOAD_SUCCESSFUL; + } else { + huc->status.reg = HUC_STATUS2; + huc->status.mask = HUC_FW_VERIFIED; + huc->status.value = HUC_FW_VERIFIED; + } +} + +static int intel_huc_rsa_data_create(struct intel_huc *huc) +{ + struct intel_gt *gt = huc_to_gt(huc); + struct intel_guc *guc = >->uc.guc; + struct i915_vma *vma; + size_t copied; + void *vaddr; + int err; + + err = i915_inject_probe_error(gt->i915, -ENXIO); + if (err) + return err; + + /* + * HuC firmware will sit above GUC_GGTT_TOP and will not map + * through GTT. Unfortunately, this means GuC cannot perform + * the HuC auth. as the rsa offset now falls within the GuC + * inaccessible range. We resort to perma-pinning an additional + * vma within the accessible range that only contains the rsa + * signature. The GuC can use this extra pinning to perform + * the authentication since its GGTT offset will be GuC + * accessible. + */ + GEM_BUG_ON(huc->fw.rsa_size > PAGE_SIZE); + vma = intel_guc_allocate_vma(guc, PAGE_SIZE); + if (IS_ERR(vma)) + return PTR_ERR(vma); + + vaddr = i915_gem_object_pin_map(vma->obj, I915_MAP_WB); + if (IS_ERR(vaddr)) { + i915_vma_unpin_and_release(&vma, 0); + return PTR_ERR(vaddr); + } + + copied = intel_uc_fw_copy_rsa(&huc->fw, vaddr, vma->size); + GEM_BUG_ON(copied < huc->fw.rsa_size); + + i915_gem_object_unpin_map(vma->obj); + + huc->rsa_data = vma; + + return 0; +} + +static void intel_huc_rsa_data_destroy(struct intel_huc *huc) +{ + i915_vma_unpin_and_release(&huc->rsa_data, 0); +} + +int intel_huc_init(struct intel_huc *huc) +{ + struct drm_i915_private *i915 = huc_to_gt(huc)->i915; + int err; + + err = intel_uc_fw_init(&huc->fw); + if (err) + goto out; + + /* + * HuC firmware image is outside GuC accessible range. + * Copy the RSA signature out of the image into + * a perma-pinned region set aside for it + */ + err = intel_huc_rsa_data_create(huc); + if (err) + goto out_fini; + + intel_uc_fw_change_status(&huc->fw, INTEL_UC_FIRMWARE_LOADABLE); + + return 0; + +out_fini: + intel_uc_fw_fini(&huc->fw); +out: + i915_probe_error(i915, "failed with %d\n", err); + return err; +} + +void intel_huc_fini(struct intel_huc *huc) +{ + if (!intel_uc_fw_is_loadable(&huc->fw)) + return; + + intel_huc_rsa_data_destroy(huc); + intel_uc_fw_fini(&huc->fw); +} + +/** + * intel_huc_auth() - Authenticate HuC uCode + * @huc: intel_huc structure + * + * Called after HuC and GuC firmware loading during intel_uc_init_hw(). + * + * This function invokes the GuC action to authenticate the HuC firmware, + * passing the offset of the RSA signature to intel_guc_auth_huc(). It then + * waits for up to 50ms for firmware verification ACK. + */ +int intel_huc_auth(struct intel_huc *huc) +{ + struct intel_gt *gt = huc_to_gt(huc); + struct intel_guc *guc = >->uc.guc; + int ret; + + GEM_BUG_ON(intel_huc_is_authenticated(huc)); + + if (!intel_uc_fw_is_loaded(&huc->fw)) + return -ENOEXEC; + + ret = i915_inject_probe_error(gt->i915, -ENXIO); + if (ret) + goto fail; + + ret = intel_guc_auth_huc(guc, + intel_guc_ggtt_offset(guc, huc->rsa_data)); + if (ret) { + DRM_ERROR("HuC: GuC did not ack Auth request %d\n", ret); + goto fail; + } + + /* Check authentication status, it should be done by now */ + ret = __intel_wait_for_register(gt->uncore, + huc->status.reg, + huc->status.mask, + huc->status.value, + 2, 50, NULL); + if (ret) { + DRM_ERROR("HuC: Firmware not verified %d\n", ret); + goto fail; + } + + intel_uc_fw_change_status(&huc->fw, INTEL_UC_FIRMWARE_RUNNING); + return 0; + +fail: + i915_probe_error(gt->i915, "HuC: Authentication failed %d\n", ret); + intel_uc_fw_change_status(&huc->fw, INTEL_UC_FIRMWARE_FAIL); + return ret; +} + +/** + * intel_huc_check_status() - check HuC status + * @huc: intel_huc structure + * + * This function reads status register to verify if HuC + * firmware was successfully loaded. + * + * Returns: + * * -ENODEV if HuC is not present on this platform, + * * -EOPNOTSUPP if HuC firmware is disabled, + * * -ENOPKG if HuC firmware was not installed, + * * -ENOEXEC if HuC firmware is invalid or mismatched, + * * 0 if HuC firmware is not running, + * * 1 if HuC firmware is authenticated and running. + */ +int intel_huc_check_status(struct intel_huc *huc) +{ + struct intel_gt *gt = huc_to_gt(huc); + intel_wakeref_t wakeref; + u32 status = 0; + + switch (__intel_uc_fw_status(&huc->fw)) { + case INTEL_UC_FIRMWARE_NOT_SUPPORTED: + return -ENODEV; + case INTEL_UC_FIRMWARE_DISABLED: + return -EOPNOTSUPP; + case INTEL_UC_FIRMWARE_MISSING: + return -ENOPKG; + case INTEL_UC_FIRMWARE_ERROR: + return -ENOEXEC; + default: + break; + } + + with_intel_runtime_pm(gt->uncore->rpm, wakeref) + status = intel_uncore_read(gt->uncore, huc->status.reg); + + return (status & huc->status.mask) == huc->status.value; +} + +/** + * intel_huc_load_status - dump information about HuC load status + * @huc: the HuC + * @p: the &drm_printer + * + * Pretty printer for HuC load status. + */ +void intel_huc_load_status(struct intel_huc *huc, struct drm_printer *p) +{ + struct intel_gt *gt = huc_to_gt(huc); + intel_wakeref_t wakeref; + + if (!intel_huc_is_supported(huc)) { + drm_printf(p, "HuC not supported\n"); + return; + } + + if (!intel_huc_is_wanted(huc)) { + drm_printf(p, "HuC disabled\n"); + return; + } + + intel_uc_fw_dump(&huc->fw, p); + + with_intel_runtime_pm(gt->uncore->rpm, wakeref) + drm_printf(p, "HuC status: 0x%08x\n", + intel_uncore_read(gt->uncore, huc->status.reg)); +} diff --git a/drivers/gpu/drm/i915/gt/uc/intel_huc.h b/drivers/gpu/drm/i915/gt/uc/intel_huc.h new file mode 100644 index 000000000..daee43b66 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/uc/intel_huc.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2014-2019 Intel Corporation + */ + +#ifndef _INTEL_HUC_H_ +#define _INTEL_HUC_H_ + +#include "i915_reg.h" +#include "intel_uc_fw.h" +#include "intel_huc_fw.h" + +struct intel_huc { + /* Generic uC firmware management */ + struct intel_uc_fw fw; + + /* HuC-specific additions */ + struct i915_vma *rsa_data; + + struct { + i915_reg_t reg; + u32 mask; + u32 value; + } status; +}; + +void intel_huc_init_early(struct intel_huc *huc); +int intel_huc_init(struct intel_huc *huc); +void intel_huc_fini(struct intel_huc *huc); +int intel_huc_auth(struct intel_huc *huc); +int intel_huc_check_status(struct intel_huc *huc); + +static inline int intel_huc_sanitize(struct intel_huc *huc) +{ + intel_uc_fw_sanitize(&huc->fw); + return 0; +} + +static inline bool intel_huc_is_supported(struct intel_huc *huc) +{ + return intel_uc_fw_is_supported(&huc->fw); +} + +static inline bool intel_huc_is_wanted(struct intel_huc *huc) +{ + return intel_uc_fw_is_enabled(&huc->fw); +} + +static inline bool intel_huc_is_used(struct intel_huc *huc) +{ + GEM_BUG_ON(__intel_uc_fw_status(&huc->fw) == INTEL_UC_FIRMWARE_SELECTED); + return intel_uc_fw_is_available(&huc->fw); +} + +static inline bool intel_huc_is_authenticated(struct intel_huc *huc) +{ + return intel_uc_fw_is_running(&huc->fw); +} + +void intel_huc_load_status(struct intel_huc *huc, struct drm_printer *p); + +#endif diff --git a/drivers/gpu/drm/i915/gt/uc/intel_huc_debugfs.c b/drivers/gpu/drm/i915/gt/uc/intel_huc_debugfs.c new file mode 100644 index 000000000..5733c15fd --- /dev/null +++ b/drivers/gpu/drm/i915/gt/uc/intel_huc_debugfs.c @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2020 Intel Corporation + */ + +#include <drm/drm_print.h> + +#include "gt/debugfs_gt.h" +#include "intel_huc.h" +#include "intel_huc_debugfs.h" + +static int huc_info_show(struct seq_file *m, void *data) +{ + struct intel_huc *huc = m->private; + struct drm_printer p = drm_seq_file_printer(m); + + if (!intel_huc_is_supported(huc)) + return -ENODEV; + + intel_huc_load_status(huc, &p); + + return 0; +} +DEFINE_GT_DEBUGFS_ATTRIBUTE(huc_info); + +void intel_huc_debugfs_register(struct intel_huc *huc, struct dentry *root) +{ + static const struct debugfs_gt_file files[] = { + { "huc_info", &huc_info_fops, NULL }, + }; + + if (!intel_huc_is_supported(huc)) + return; + + intel_gt_debugfs_register_files(root, files, ARRAY_SIZE(files), huc); +} diff --git a/drivers/gpu/drm/i915/gt/uc/intel_huc_debugfs.h b/drivers/gpu/drm/i915/gt/uc/intel_huc_debugfs.h new file mode 100644 index 000000000..be79e992f --- /dev/null +++ b/drivers/gpu/drm/i915/gt/uc/intel_huc_debugfs.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2020 Intel Corporation + */ + +#ifndef DEBUGFS_HUC_H +#define DEBUGFS_HUC_H + +struct intel_huc; +struct dentry; + +void intel_huc_debugfs_register(struct intel_huc *huc, struct dentry *root); + +#endif /* DEBUGFS_HUC_H */ diff --git a/drivers/gpu/drm/i915/gt/uc/intel_huc_fw.c b/drivers/gpu/drm/i915/gt/uc/intel_huc_fw.c new file mode 100644 index 000000000..e5ef509c7 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/uc/intel_huc_fw.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2014-2019 Intel Corporation + */ + +#include "gt/intel_gt.h" +#include "intel_huc_fw.h" +#include "i915_drv.h" + +/** + * intel_huc_fw_upload() - load HuC uCode to device + * @huc: intel_huc structure + * + * Called from intel_uc_init_hw() during driver load, resume from sleep and + * after a GPU reset. Note that HuC must be loaded before GuC. + * + * The firmware image should have already been fetched into memory, so only + * check that fetch succeeded, and then transfer the image to the h/w. + * + * Return: non-zero code on error + */ +int intel_huc_fw_upload(struct intel_huc *huc) +{ + /* HW doesn't look at destination address for HuC, so set it to 0 */ + return intel_uc_fw_upload(&huc->fw, 0, HUC_UKERNEL); +} diff --git a/drivers/gpu/drm/i915/gt/uc/intel_huc_fw.h b/drivers/gpu/drm/i915/gt/uc/intel_huc_fw.h new file mode 100644 index 000000000..12f264ee3 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/uc/intel_huc_fw.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2014-2019 Intel Corporation + */ + +#ifndef _INTEL_HUC_FW_H_ +#define _INTEL_HUC_FW_H_ + +struct intel_huc; + +int intel_huc_fw_upload(struct intel_huc *huc); + +#endif diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c new file mode 100644 index 000000000..d6f55f708 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c @@ -0,0 +1,647 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2016-2019 Intel Corporation + */ + +#include "gt/intel_gt.h" +#include "gt/intel_reset.h" +#include "intel_guc.h" +#include "intel_guc_ads.h" +#include "intel_guc_submission.h" +#include "intel_uc.h" + +#include "i915_drv.h" + +static const struct intel_uc_ops uc_ops_off; +static const struct intel_uc_ops uc_ops_on; + +/* Reset GuC providing us with fresh state for both GuC and HuC. + */ +static int __intel_uc_reset_hw(struct intel_uc *uc) +{ + struct intel_gt *gt = uc_to_gt(uc); + int ret; + u32 guc_status; + + ret = i915_inject_probe_error(gt->i915, -ENXIO); + if (ret) + return ret; + + ret = intel_reset_guc(gt); + if (ret) { + DRM_ERROR("Failed to reset GuC, ret = %d\n", ret); + return ret; + } + + guc_status = intel_uncore_read(gt->uncore, GUC_STATUS); + WARN(!(guc_status & GS_MIA_IN_RESET), + "GuC status: 0x%x, MIA core expected to be in reset\n", + guc_status); + + return ret; +} + +static void __confirm_options(struct intel_uc *uc) +{ + struct drm_i915_private *i915 = uc_to_gt(uc)->i915; + + drm_dbg(&i915->drm, + "enable_guc=%d (guc:%s submission:%s huc:%s)\n", + i915->params.enable_guc, + yesno(intel_uc_wants_guc(uc)), + yesno(intel_uc_wants_guc_submission(uc)), + yesno(intel_uc_wants_huc(uc))); + + if (i915->params.enable_guc == -1) + return; + + if (i915->params.enable_guc == 0) { + GEM_BUG_ON(intel_uc_wants_guc(uc)); + GEM_BUG_ON(intel_uc_wants_guc_submission(uc)); + GEM_BUG_ON(intel_uc_wants_huc(uc)); + return; + } + + if (!intel_uc_supports_guc(uc)) + drm_info(&i915->drm, + "Incompatible option enable_guc=%d - %s\n", + i915->params.enable_guc, "GuC is not supported!"); + + if (i915->params.enable_guc & ENABLE_GUC_LOAD_HUC && + !intel_uc_supports_huc(uc)) + drm_info(&i915->drm, + "Incompatible option enable_guc=%d - %s\n", + i915->params.enable_guc, "HuC is not supported!"); + + if (i915->params.enable_guc & ENABLE_GUC_SUBMISSION && + !intel_uc_supports_guc_submission(uc)) + drm_info(&i915->drm, + "Incompatible option enable_guc=%d - %s\n", + i915->params.enable_guc, "GuC submission is N/A"); + + if (i915->params.enable_guc & ~(ENABLE_GUC_SUBMISSION | + ENABLE_GUC_LOAD_HUC)) + drm_info(&i915->drm, + "Incompatible option enable_guc=%d - %s\n", + i915->params.enable_guc, "undocumented flag"); +} + +void intel_uc_init_early(struct intel_uc *uc) +{ + intel_guc_init_early(&uc->guc); + intel_huc_init_early(&uc->huc); + + __confirm_options(uc); + + if (intel_uc_wants_guc(uc)) + uc->ops = &uc_ops_on; + else + uc->ops = &uc_ops_off; +} + +void intel_uc_driver_late_release(struct intel_uc *uc) +{ +} + +/** + * intel_uc_init_mmio - setup uC MMIO access + * @uc: the intel_uc structure + * + * Setup minimal state necessary for MMIO accesses later in the + * initialization sequence. + */ +void intel_uc_init_mmio(struct intel_uc *uc) +{ + intel_guc_init_send_regs(&uc->guc); +} + +static void __uc_capture_load_err_log(struct intel_uc *uc) +{ + struct intel_guc *guc = &uc->guc; + + if (guc->log.vma && !uc->load_err_log) + uc->load_err_log = i915_gem_object_get(guc->log.vma->obj); +} + +static void __uc_free_load_err_log(struct intel_uc *uc) +{ + struct drm_i915_gem_object *log = fetch_and_zero(&uc->load_err_log); + + if (log) + i915_gem_object_put(log); +} + +void intel_uc_driver_remove(struct intel_uc *uc) +{ + intel_uc_fini_hw(uc); + intel_uc_fini(uc); + __uc_free_load_err_log(uc); +} + +static inline bool guc_communication_enabled(struct intel_guc *guc) +{ + return intel_guc_ct_enabled(&guc->ct); +} + +/* + * Events triggered while CT buffers are disabled are logged in the SCRATCH_15 + * register using the same bits used in the CT message payload. Since our + * communication channel with guc is turned off at this point, we can save the + * message and handle it after we turn it back on. + */ +static void guc_clear_mmio_msg(struct intel_guc *guc) +{ + intel_uncore_write(guc_to_gt(guc)->uncore, SOFT_SCRATCH(15), 0); +} + +static void guc_get_mmio_msg(struct intel_guc *guc) +{ + u32 val; + + spin_lock_irq(&guc->irq_lock); + + val = intel_uncore_read(guc_to_gt(guc)->uncore, SOFT_SCRATCH(15)); + guc->mmio_msg |= val & guc->msg_enabled_mask; + + /* + * clear all events, including the ones we're not currently servicing, + * to make sure we don't try to process a stale message if we enable + * handling of more events later. + */ + guc_clear_mmio_msg(guc); + + spin_unlock_irq(&guc->irq_lock); +} + +static void guc_handle_mmio_msg(struct intel_guc *guc) +{ + struct drm_i915_private *i915 = guc_to_gt(guc)->i915; + + /* we need communication to be enabled to reply to GuC */ + GEM_BUG_ON(!guc_communication_enabled(guc)); + + if (!guc->mmio_msg) + return; + + spin_lock_irq(&i915->irq_lock); + intel_guc_to_host_process_recv_msg(guc, &guc->mmio_msg, 1); + spin_unlock_irq(&i915->irq_lock); + + guc->mmio_msg = 0; +} + +static void guc_reset_interrupts(struct intel_guc *guc) +{ + guc->interrupts.reset(guc); +} + +static void guc_enable_interrupts(struct intel_guc *guc) +{ + guc->interrupts.enable(guc); +} + +static void guc_disable_interrupts(struct intel_guc *guc) +{ + guc->interrupts.disable(guc); +} + +static int guc_enable_communication(struct intel_guc *guc) +{ + struct drm_i915_private *i915 = guc_to_gt(guc)->i915; + int ret; + + GEM_BUG_ON(guc_communication_enabled(guc)); + + ret = i915_inject_probe_error(i915, -ENXIO); + if (ret) + return ret; + + ret = intel_guc_ct_enable(&guc->ct); + if (ret) + return ret; + + /* check for mmio messages received before/during the CT enable */ + guc_get_mmio_msg(guc); + guc_handle_mmio_msg(guc); + + guc_enable_interrupts(guc); + + /* check for CT messages received before we enabled interrupts */ + spin_lock_irq(&i915->irq_lock); + intel_guc_ct_event_handler(&guc->ct); + spin_unlock_irq(&i915->irq_lock); + + DRM_INFO("GuC communication enabled\n"); + + return 0; +} + +static void guc_disable_communication(struct intel_guc *guc) +{ + /* + * Events generated during or after CT disable are logged by guc in + * via mmio. Make sure the register is clear before disabling CT since + * all events we cared about have already been processed via CT. + */ + guc_clear_mmio_msg(guc); + + guc_disable_interrupts(guc); + + intel_guc_ct_disable(&guc->ct); + + /* + * Check for messages received during/after the CT disable. We do not + * expect any messages to have arrived via CT between the interrupt + * disable and the CT disable because GuC should've been idle until we + * triggered the CT disable protocol. + */ + guc_get_mmio_msg(guc); + + DRM_INFO("GuC communication disabled\n"); +} + +static void __uc_fetch_firmwares(struct intel_uc *uc) +{ + int err; + + GEM_BUG_ON(!intel_uc_wants_guc(uc)); + + err = intel_uc_fw_fetch(&uc->guc.fw); + if (err) { + /* Make sure we transition out of transient "SELECTED" state */ + if (intel_uc_wants_huc(uc)) { + drm_dbg(&uc_to_gt(uc)->i915->drm, + "Failed to fetch GuC: %d disabling HuC\n", err); + intel_uc_fw_change_status(&uc->huc.fw, + INTEL_UC_FIRMWARE_ERROR); + } + + return; + } + + if (intel_uc_wants_huc(uc)) + intel_uc_fw_fetch(&uc->huc.fw); +} + +static void __uc_cleanup_firmwares(struct intel_uc *uc) +{ + intel_uc_fw_cleanup_fetch(&uc->huc.fw); + intel_uc_fw_cleanup_fetch(&uc->guc.fw); +} + +static int __uc_init(struct intel_uc *uc) +{ + struct intel_guc *guc = &uc->guc; + struct intel_huc *huc = &uc->huc; + int ret; + + GEM_BUG_ON(!intel_uc_wants_guc(uc)); + + if (!intel_uc_uses_guc(uc)) + return 0; + + if (i915_inject_probe_failure(uc_to_gt(uc)->i915)) + return -ENOMEM; + + /* XXX: GuC submission is unavailable for now */ + GEM_BUG_ON(intel_uc_uses_guc_submission(uc)); + + ret = intel_guc_init(guc); + if (ret) + return ret; + + if (intel_uc_uses_huc(uc)) { + ret = intel_huc_init(huc); + if (ret) + goto out_guc; + } + + return 0; + +out_guc: + intel_guc_fini(guc); + return ret; +} + +static void __uc_fini(struct intel_uc *uc) +{ + intel_huc_fini(&uc->huc); + intel_guc_fini(&uc->guc); +} + +static int __uc_sanitize(struct intel_uc *uc) +{ + struct intel_guc *guc = &uc->guc; + struct intel_huc *huc = &uc->huc; + + GEM_BUG_ON(!intel_uc_supports_guc(uc)); + + intel_huc_sanitize(huc); + intel_guc_sanitize(guc); + + return __intel_uc_reset_hw(uc); +} + +/* Initialize and verify the uC regs related to uC positioning in WOPCM */ +static int uc_init_wopcm(struct intel_uc *uc) +{ + struct intel_gt *gt = uc_to_gt(uc); + struct intel_uncore *uncore = gt->uncore; + u32 base = intel_wopcm_guc_base(>->i915->wopcm); + u32 size = intel_wopcm_guc_size(>->i915->wopcm); + u32 huc_agent = intel_uc_uses_huc(uc) ? HUC_LOADING_AGENT_GUC : 0; + u32 mask; + int err; + + if (unlikely(!base || !size)) { + i915_probe_error(gt->i915, "Unsuccessful WOPCM partitioning\n"); + return -E2BIG; + } + + GEM_BUG_ON(!intel_uc_supports_guc(uc)); + GEM_BUG_ON(!(base & GUC_WOPCM_OFFSET_MASK)); + GEM_BUG_ON(base & ~GUC_WOPCM_OFFSET_MASK); + GEM_BUG_ON(!(size & GUC_WOPCM_SIZE_MASK)); + GEM_BUG_ON(size & ~GUC_WOPCM_SIZE_MASK); + + err = i915_inject_probe_error(gt->i915, -ENXIO); + if (err) + return err; + + mask = GUC_WOPCM_SIZE_MASK | GUC_WOPCM_SIZE_LOCKED; + err = intel_uncore_write_and_verify(uncore, GUC_WOPCM_SIZE, size, mask, + size | GUC_WOPCM_SIZE_LOCKED); + if (err) + goto err_out; + + mask = GUC_WOPCM_OFFSET_MASK | GUC_WOPCM_OFFSET_VALID | huc_agent; + err = intel_uncore_write_and_verify(uncore, DMA_GUC_WOPCM_OFFSET, + base | huc_agent, mask, + base | huc_agent | + GUC_WOPCM_OFFSET_VALID); + if (err) + goto err_out; + + return 0; + +err_out: + i915_probe_error(gt->i915, "Failed to init uC WOPCM registers!\n"); + i915_probe_error(gt->i915, "%s(%#x)=%#x\n", "DMA_GUC_WOPCM_OFFSET", + i915_mmio_reg_offset(DMA_GUC_WOPCM_OFFSET), + intel_uncore_read(uncore, DMA_GUC_WOPCM_OFFSET)); + i915_probe_error(gt->i915, "%s(%#x)=%#x\n", "GUC_WOPCM_SIZE", + i915_mmio_reg_offset(GUC_WOPCM_SIZE), + intel_uncore_read(uncore, GUC_WOPCM_SIZE)); + + return err; +} + +static bool uc_is_wopcm_locked(struct intel_uc *uc) +{ + struct intel_gt *gt = uc_to_gt(uc); + struct intel_uncore *uncore = gt->uncore; + + return (intel_uncore_read(uncore, GUC_WOPCM_SIZE) & GUC_WOPCM_SIZE_LOCKED) || + (intel_uncore_read(uncore, DMA_GUC_WOPCM_OFFSET) & GUC_WOPCM_OFFSET_VALID); +} + +static int __uc_check_hw(struct intel_uc *uc) +{ + if (!intel_uc_supports_guc(uc)) + return 0; + + /* + * We can silently continue without GuC only if it was never enabled + * before on this system after reboot, otherwise we risk GPU hangs. + * To check if GuC was loaded before we look at WOPCM registers. + */ + if (uc_is_wopcm_locked(uc)) + return -EIO; + + return 0; +} + +static int __uc_init_hw(struct intel_uc *uc) +{ + struct drm_i915_private *i915 = uc_to_gt(uc)->i915; + struct intel_guc *guc = &uc->guc; + struct intel_huc *huc = &uc->huc; + int ret, attempts; + + GEM_BUG_ON(!intel_uc_supports_guc(uc)); + GEM_BUG_ON(!intel_uc_wants_guc(uc)); + + if (!intel_uc_fw_is_loadable(&guc->fw)) { + ret = __uc_check_hw(uc) || + intel_uc_fw_is_overridden(&guc->fw) || + intel_uc_wants_guc_submission(uc) ? + intel_uc_fw_status_to_error(guc->fw.status) : 0; + goto err_out; + } + + ret = uc_init_wopcm(uc); + if (ret) + goto err_out; + + guc_reset_interrupts(guc); + + /* WaEnableuKernelHeaderValidFix:skl */ + /* WaEnableGuCBootHashCheckNotSet:skl,bxt,kbl */ + if (IS_GEN(i915, 9)) + attempts = 3; + else + attempts = 1; + + while (attempts--) { + /* + * Always reset the GuC just before (re)loading, so + * that the state and timing are fairly predictable + */ + ret = __uc_sanitize(uc); + if (ret) + goto err_out; + + intel_huc_fw_upload(huc); + intel_guc_ads_reset(guc); + intel_guc_write_params(guc); + ret = intel_guc_fw_upload(guc); + if (ret == 0) + break; + + DRM_DEBUG_DRIVER("GuC fw load failed: %d; will reset and " + "retry %d more time(s)\n", ret, attempts); + } + + /* Did we succeded or run out of retries? */ + if (ret) + goto err_log_capture; + + ret = guc_enable_communication(guc); + if (ret) + goto err_log_capture; + + intel_huc_auth(huc); + + ret = intel_guc_sample_forcewake(guc); + if (ret) + goto err_communication; + + if (intel_uc_uses_guc_submission(uc)) + intel_guc_submission_enable(guc); + + drm_info(&i915->drm, "%s firmware %s version %u.%u %s:%s\n", + intel_uc_fw_type_repr(INTEL_UC_FW_TYPE_GUC), guc->fw.path, + guc->fw.major_ver_found, guc->fw.minor_ver_found, + "submission", + enableddisabled(intel_uc_uses_guc_submission(uc))); + + if (intel_uc_uses_huc(uc)) { + drm_info(&i915->drm, "%s firmware %s version %u.%u %s:%s\n", + intel_uc_fw_type_repr(INTEL_UC_FW_TYPE_HUC), + huc->fw.path, + huc->fw.major_ver_found, huc->fw.minor_ver_found, + "authenticated", + yesno(intel_huc_is_authenticated(huc))); + } + + return 0; + + /* + * We've failed to load the firmware :( + */ +err_communication: + guc_disable_communication(guc); +err_log_capture: + __uc_capture_load_err_log(uc); +err_out: + __uc_sanitize(uc); + + if (!ret) { + drm_notice(&i915->drm, "GuC is uninitialized\n"); + /* We want to run without GuC submission */ + return 0; + } + + i915_probe_error(i915, "GuC initialization failed %d\n", ret); + + /* We want to keep KMS alive */ + return -EIO; +} + +static void __uc_fini_hw(struct intel_uc *uc) +{ + struct intel_guc *guc = &uc->guc; + + if (!intel_guc_is_fw_running(guc)) + return; + + if (intel_uc_uses_guc_submission(uc)) + intel_guc_submission_disable(guc); + + if (guc_communication_enabled(guc)) + guc_disable_communication(guc); + + __uc_sanitize(uc); +} + +/** + * intel_uc_reset_prepare - Prepare for reset + * @uc: the intel_uc structure + * + * Preparing for full gpu reset. + */ +void intel_uc_reset_prepare(struct intel_uc *uc) +{ + struct intel_guc *guc = &uc->guc; + + if (!intel_guc_is_ready(guc)) + return; + + guc_disable_communication(guc); + __uc_sanitize(uc); +} + +void intel_uc_runtime_suspend(struct intel_uc *uc) +{ + struct intel_guc *guc = &uc->guc; + int err; + + if (!intel_guc_is_ready(guc)) + return; + + err = intel_guc_suspend(guc); + if (err) + DRM_DEBUG_DRIVER("Failed to suspend GuC, err=%d", err); + + guc_disable_communication(guc); +} + +void intel_uc_suspend(struct intel_uc *uc) +{ + struct intel_guc *guc = &uc->guc; + intel_wakeref_t wakeref; + + if (!intel_guc_is_ready(guc)) + return; + + with_intel_runtime_pm(uc_to_gt(uc)->uncore->rpm, wakeref) + intel_uc_runtime_suspend(uc); +} + +static int __uc_resume(struct intel_uc *uc, bool enable_communication) +{ + struct intel_guc *guc = &uc->guc; + int err; + + if (!intel_guc_is_fw_running(guc)) + return 0; + + /* Make sure we enable communication if and only if it's disabled */ + GEM_BUG_ON(enable_communication == guc_communication_enabled(guc)); + + if (enable_communication) + guc_enable_communication(guc); + + err = intel_guc_resume(guc); + if (err) { + DRM_DEBUG_DRIVER("Failed to resume GuC, err=%d", err); + return err; + } + + return 0; +} + +int intel_uc_resume(struct intel_uc *uc) +{ + /* + * When coming out of S3/S4 we sanitize and re-init the HW, so + * communication is already re-enabled at this point. + */ + return __uc_resume(uc, false); +} + +int intel_uc_runtime_resume(struct intel_uc *uc) +{ + /* + * During runtime resume we don't sanitize, so we need to re-init + * communication as well. + */ + return __uc_resume(uc, true); +} + +static const struct intel_uc_ops uc_ops_off = { + .init_hw = __uc_check_hw, +}; + +static const struct intel_uc_ops uc_ops_on = { + .sanitize = __uc_sanitize, + + .init_fw = __uc_fetch_firmwares, + .fini_fw = __uc_cleanup_firmwares, + + .init = __uc_init, + .fini = __uc_fini, + + .init_hw = __uc_init_hw, + .fini_hw = __uc_fini_hw, +}; diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.h b/drivers/gpu/drm/i915/gt/uc/intel_uc.h new file mode 100644 index 000000000..9c954c589 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.h @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2014-2019 Intel Corporation + */ + +#ifndef _INTEL_UC_H_ +#define _INTEL_UC_H_ + +#include "intel_guc.h" +#include "intel_guc_submission.h" +#include "intel_huc.h" +#include "i915_params.h" + +struct intel_uc; + +struct intel_uc_ops { + int (*sanitize)(struct intel_uc *uc); + void (*init_fw)(struct intel_uc *uc); + void (*fini_fw)(struct intel_uc *uc); + int (*init)(struct intel_uc *uc); + void (*fini)(struct intel_uc *uc); + int (*init_hw)(struct intel_uc *uc); + void (*fini_hw)(struct intel_uc *uc); +}; + +struct intel_uc { + struct intel_uc_ops const *ops; + struct intel_guc guc; + struct intel_huc huc; + + /* Snapshot of GuC log from last failed load */ + struct drm_i915_gem_object *load_err_log; +}; + +void intel_uc_init_early(struct intel_uc *uc); +void intel_uc_driver_late_release(struct intel_uc *uc); +void intel_uc_driver_remove(struct intel_uc *uc); +void intel_uc_init_mmio(struct intel_uc *uc); +void intel_uc_reset_prepare(struct intel_uc *uc); +void intel_uc_suspend(struct intel_uc *uc); +void intel_uc_runtime_suspend(struct intel_uc *uc); +int intel_uc_resume(struct intel_uc *uc); +int intel_uc_runtime_resume(struct intel_uc *uc); + +/* + * We need to know as early as possible if we're going to use GuC or not to + * take the correct setup paths. Additionally, once we've started loading the + * GuC, it is unsafe to keep executing without it because some parts of the HW, + * a subset of which is not cleaned on GT reset, will start expecting the GuC FW + * to be running. + * To solve both these requirements, we commit to using the microcontrollers if + * the relevant modparam is set and the blobs are found on the system. At this + * stage, the only thing that can stop us from attempting to load the blobs on + * the HW and use them is a fundamental issue (e.g. no memory for our + * structures); if we hit such a problem during driver load we're broken even + * without GuC, so there is no point in trying to fall back. + * + * Given the above, we can be in one of 4 states, with the last one implying + * we're committed to using the microcontroller: + * - Not supported: not available in HW and/or firmware not defined. + * - Supported: available in HW and firmware defined. + * - Wanted: supported + enabled in modparam. + * - In use: wanted + firmware found on the system and successfully fetched. + */ + +#define __uc_state_checker(x, func, state, required) \ +static inline bool intel_uc_##state##_##func(struct intel_uc *uc) \ +{ \ + return intel_##func##_is_##required(&uc->x); \ +} + +#define uc_state_checkers(x, func) \ +__uc_state_checker(x, func, supports, supported) \ +__uc_state_checker(x, func, wants, wanted) \ +__uc_state_checker(x, func, uses, used) + +uc_state_checkers(guc, guc); +uc_state_checkers(huc, huc); +uc_state_checkers(guc, guc_submission); + +#undef uc_state_checkers +#undef __uc_state_checker + +#define intel_uc_ops_function(_NAME, _OPS, _TYPE, _RET) \ +static inline _TYPE intel_uc_##_NAME(struct intel_uc *uc) \ +{ \ + if (uc->ops->_OPS) \ + return uc->ops->_OPS(uc); \ + return _RET; \ +} +intel_uc_ops_function(sanitize, sanitize, int, 0); +intel_uc_ops_function(fetch_firmwares, init_fw, void, ); +intel_uc_ops_function(cleanup_firmwares, fini_fw, void, ); +intel_uc_ops_function(init, init, int, 0); +intel_uc_ops_function(fini, fini, void, ); +intel_uc_ops_function(init_hw, init_hw, int, 0); +intel_uc_ops_function(fini_hw, fini_hw, void, ); +#undef intel_uc_ops_function + +#endif diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc_debugfs.c b/drivers/gpu/drm/i915/gt/uc/intel_uc_debugfs.c new file mode 100644 index 000000000..089d98662 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc_debugfs.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2020 Intel Corporation + */ + +#include <linux/debugfs.h> +#include <drm/drm_print.h> + +#include "gt/debugfs_gt.h" +#include "intel_guc_debugfs.h" +#include "intel_huc_debugfs.h" +#include "intel_uc.h" +#include "intel_uc_debugfs.h" + +static int uc_usage_show(struct seq_file *m, void *data) +{ + struct intel_uc *uc = m->private; + struct drm_printer p = drm_seq_file_printer(m); + + drm_printf(&p, "[guc] supported:%s wanted:%s used:%s\n", + yesno(intel_uc_supports_guc(uc)), + yesno(intel_uc_wants_guc(uc)), + yesno(intel_uc_uses_guc(uc))); + drm_printf(&p, "[huc] supported:%s wanted:%s used:%s\n", + yesno(intel_uc_supports_huc(uc)), + yesno(intel_uc_wants_huc(uc)), + yesno(intel_uc_uses_huc(uc))); + drm_printf(&p, "[submission] supported:%s wanted:%s used:%s\n", + yesno(intel_uc_supports_guc_submission(uc)), + yesno(intel_uc_wants_guc_submission(uc)), + yesno(intel_uc_uses_guc_submission(uc))); + + return 0; +} +DEFINE_GT_DEBUGFS_ATTRIBUTE(uc_usage); + +void intel_uc_debugfs_register(struct intel_uc *uc, struct dentry *gt_root) +{ + static const struct debugfs_gt_file files[] = { + { "usage", &uc_usage_fops, NULL }, + }; + struct dentry *root; + + if (!gt_root) + return; + + /* GuC and HuC go always in pair, no need to check both */ + if (!intel_uc_supports_guc(uc)) + return; + + root = debugfs_create_dir("uc", gt_root); + if (IS_ERR(root)) + return; + + intel_gt_debugfs_register_files(root, files, ARRAY_SIZE(files), uc); + + intel_guc_debugfs_register(&uc->guc, root); + intel_huc_debugfs_register(&uc->huc, root); +} diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc_debugfs.h b/drivers/gpu/drm/i915/gt/uc/intel_uc_debugfs.h new file mode 100644 index 000000000..010ce250d --- /dev/null +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc_debugfs.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2020 Intel Corporation + */ + +#ifndef DEBUGFS_UC_H +#define DEBUGFS_UC_H + +struct intel_uc; +struct dentry; + +void intel_uc_debugfs_register(struct intel_uc *uc, struct dentry *gt_root); + +#endif /* DEBUGFS_UC_H */ diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c new file mode 100644 index 000000000..80e8b6c3b --- /dev/null +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c @@ -0,0 +1,616 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2016-2019 Intel Corporation + */ + +#include <linux/bitfield.h> +#include <linux/firmware.h> +#include <drm/drm_print.h> + +#include "intel_uc_fw.h" +#include "intel_uc_fw_abi.h" +#include "i915_drv.h" + +static inline struct intel_gt * +____uc_fw_to_gt(struct intel_uc_fw *uc_fw, enum intel_uc_fw_type type) +{ + if (type == INTEL_UC_FW_TYPE_GUC) + return container_of(uc_fw, struct intel_gt, uc.guc.fw); + + GEM_BUG_ON(type != INTEL_UC_FW_TYPE_HUC); + return container_of(uc_fw, struct intel_gt, uc.huc.fw); +} + +static inline struct intel_gt *__uc_fw_to_gt(struct intel_uc_fw *uc_fw) +{ + GEM_BUG_ON(uc_fw->status == INTEL_UC_FIRMWARE_UNINITIALIZED); + return ____uc_fw_to_gt(uc_fw, uc_fw->type); +} + +#ifdef CONFIG_DRM_I915_DEBUG_GUC +void intel_uc_fw_change_status(struct intel_uc_fw *uc_fw, + enum intel_uc_fw_status status) +{ + uc_fw->__status = status; + drm_dbg(&__uc_fw_to_gt(uc_fw)->i915->drm, + "%s firmware -> %s\n", + intel_uc_fw_type_repr(uc_fw->type), + status == INTEL_UC_FIRMWARE_SELECTED ? + uc_fw->path : intel_uc_fw_status_repr(status)); +} +#endif + +/* + * List of required GuC and HuC binaries per-platform. + * Must be ordered based on platform + revid, from newer to older. + * + * TGL 35.2 is interface-compatible with 33.0 for previous Gens. The deltas + * between 33.0 and 35.2 are only related to new additions to support new Gen12 + * features. + * + * Note that RKL uses the same firmware as TGL. + */ +#define INTEL_UC_FIRMWARE_DEFS(fw_def, guc_def, huc_def) \ + fw_def(ROCKETLAKE, 0, guc_def(tgl, 35, 2, 0), huc_def(tgl, 7, 5, 0)) \ + fw_def(TIGERLAKE, 0, guc_def(tgl, 35, 2, 0), huc_def(tgl, 7, 5, 0)) \ + fw_def(ELKHARTLAKE, 0, guc_def(ehl, 33, 0, 4), huc_def(ehl, 9, 0, 0)) \ + fw_def(ICELAKE, 0, guc_def(icl, 33, 0, 0), huc_def(icl, 9, 0, 0)) \ + fw_def(COMETLAKE, 5, guc_def(cml, 33, 0, 0), huc_def(cml, 4, 0, 0)) \ + fw_def(COFFEELAKE, 0, guc_def(kbl, 33, 0, 0), huc_def(kbl, 4, 0, 0)) \ + fw_def(GEMINILAKE, 0, guc_def(glk, 33, 0, 0), huc_def(glk, 4, 0, 0)) \ + fw_def(KABYLAKE, 0, guc_def(kbl, 33, 0, 0), huc_def(kbl, 4, 0, 0)) \ + fw_def(BROXTON, 0, guc_def(bxt, 33, 0, 0), huc_def(bxt, 2, 0, 0)) \ + fw_def(SKYLAKE, 0, guc_def(skl, 33, 0, 0), huc_def(skl, 2, 0, 0)) + +#define __MAKE_UC_FW_PATH(prefix_, name_, major_, minor_, patch_) \ + "i915/" \ + __stringify(prefix_) name_ \ + __stringify(major_) "." \ + __stringify(minor_) "." \ + __stringify(patch_) ".bin" + +#define MAKE_GUC_FW_PATH(prefix_, major_, minor_, patch_) \ + __MAKE_UC_FW_PATH(prefix_, "_guc_", major_, minor_, patch_) + +#define MAKE_HUC_FW_PATH(prefix_, major_, minor_, bld_num_) \ + __MAKE_UC_FW_PATH(prefix_, "_huc_", major_, minor_, bld_num_) + +/* All blobs need to be declared via MODULE_FIRMWARE() */ +#define INTEL_UC_MODULE_FW(platform_, revid_, guc_, huc_) \ + MODULE_FIRMWARE(guc_); \ + MODULE_FIRMWARE(huc_); + +INTEL_UC_FIRMWARE_DEFS(INTEL_UC_MODULE_FW, MAKE_GUC_FW_PATH, MAKE_HUC_FW_PATH) + +/* The below structs and macros are used to iterate across the list of blobs */ +struct __packed uc_fw_blob { + u8 major; + u8 minor; + const char *path; +}; + +#define UC_FW_BLOB(major_, minor_, path_) \ + { .major = major_, .minor = minor_, .path = path_ } + +#define GUC_FW_BLOB(prefix_, major_, minor_, patch_) \ + UC_FW_BLOB(major_, minor_, \ + MAKE_GUC_FW_PATH(prefix_, major_, minor_, patch_)) + +#define HUC_FW_BLOB(prefix_, major_, minor_, bld_num_) \ + UC_FW_BLOB(major_, minor_, \ + MAKE_HUC_FW_PATH(prefix_, major_, minor_, bld_num_)) + +struct __packed uc_fw_platform_requirement { + enum intel_platform p; + u8 rev; /* first platform rev using this FW */ + const struct uc_fw_blob blobs[INTEL_UC_FW_NUM_TYPES]; +}; + +#define MAKE_FW_LIST(platform_, revid_, guc_, huc_) \ +{ \ + .p = INTEL_##platform_, \ + .rev = revid_, \ + .blobs[INTEL_UC_FW_TYPE_GUC] = guc_, \ + .blobs[INTEL_UC_FW_TYPE_HUC] = huc_, \ +}, + +static void +__uc_fw_auto_select(struct drm_i915_private *i915, struct intel_uc_fw *uc_fw) +{ + static const struct uc_fw_platform_requirement fw_blobs[] = { + INTEL_UC_FIRMWARE_DEFS(MAKE_FW_LIST, GUC_FW_BLOB, HUC_FW_BLOB) + }; + enum intel_platform p = INTEL_INFO(i915)->platform; + u8 rev = INTEL_REVID(i915); + int i; + + for (i = 0; i < ARRAY_SIZE(fw_blobs) && p <= fw_blobs[i].p; i++) { + if (p == fw_blobs[i].p && rev >= fw_blobs[i].rev) { + const struct uc_fw_blob *blob = + &fw_blobs[i].blobs[uc_fw->type]; + uc_fw->path = blob->path; + uc_fw->major_ver_wanted = blob->major; + uc_fw->minor_ver_wanted = blob->minor; + break; + } + } + + /* make sure the list is ordered as expected */ + if (IS_ENABLED(CONFIG_DRM_I915_SELFTEST)) { + for (i = 1; i < ARRAY_SIZE(fw_blobs); i++) { + if (fw_blobs[i].p < fw_blobs[i - 1].p) + continue; + + if (fw_blobs[i].p == fw_blobs[i - 1].p && + fw_blobs[i].rev < fw_blobs[i - 1].rev) + continue; + + pr_err("invalid FW blob order: %s r%u comes before %s r%u\n", + intel_platform_name(fw_blobs[i - 1].p), + fw_blobs[i - 1].rev, + intel_platform_name(fw_blobs[i].p), + fw_blobs[i].rev); + + uc_fw->path = NULL; + } + } + + /* We don't want to enable GuC/HuC on pre-Gen11 by default */ + if (i915->params.enable_guc == -1 && p < INTEL_ICELAKE) + uc_fw->path = NULL; +} + +static const char *__override_guc_firmware_path(struct drm_i915_private *i915) +{ + if (i915->params.enable_guc & (ENABLE_GUC_SUBMISSION | + ENABLE_GUC_LOAD_HUC)) + return i915->params.guc_firmware_path; + return ""; +} + +static const char *__override_huc_firmware_path(struct drm_i915_private *i915) +{ + if (i915->params.enable_guc & ENABLE_GUC_LOAD_HUC) + return i915->params.huc_firmware_path; + return ""; +} + +static void __uc_fw_user_override(struct drm_i915_private *i915, struct intel_uc_fw *uc_fw) +{ + const char *path = NULL; + + switch (uc_fw->type) { + case INTEL_UC_FW_TYPE_GUC: + path = __override_guc_firmware_path(i915); + break; + case INTEL_UC_FW_TYPE_HUC: + path = __override_huc_firmware_path(i915); + break; + } + + if (unlikely(path)) { + uc_fw->path = path; + uc_fw->user_overridden = true; + } +} + +/** + * intel_uc_fw_init_early - initialize the uC object and select the firmware + * @uc_fw: uC firmware + * @type: type of uC + * + * Initialize the state of our uC object and relevant tracking and select the + * firmware to fetch and load. + */ +void intel_uc_fw_init_early(struct intel_uc_fw *uc_fw, + enum intel_uc_fw_type type) +{ + struct drm_i915_private *i915 = ____uc_fw_to_gt(uc_fw, type)->i915; + + /* + * we use FIRMWARE_UNINITIALIZED to detect checks against uc_fw->status + * before we're looked at the HW caps to see if we have uc support + */ + BUILD_BUG_ON(INTEL_UC_FIRMWARE_UNINITIALIZED); + GEM_BUG_ON(uc_fw->status); + GEM_BUG_ON(uc_fw->path); + + uc_fw->type = type; + + if (HAS_GT_UC(i915)) { + __uc_fw_auto_select(i915, uc_fw); + __uc_fw_user_override(i915, uc_fw); + } + + intel_uc_fw_change_status(uc_fw, uc_fw->path ? *uc_fw->path ? + INTEL_UC_FIRMWARE_SELECTED : + INTEL_UC_FIRMWARE_DISABLED : + INTEL_UC_FIRMWARE_NOT_SUPPORTED); +} + +static void __force_fw_fetch_failures(struct intel_uc_fw *uc_fw, int e) +{ + struct drm_i915_private *i915 = __uc_fw_to_gt(uc_fw)->i915; + bool user = e == -EINVAL; + + if (i915_inject_probe_error(i915, e)) { + /* non-existing blob */ + uc_fw->path = "<invalid>"; + uc_fw->user_overridden = user; + } else if (i915_inject_probe_error(i915, e)) { + /* require next major version */ + uc_fw->major_ver_wanted += 1; + uc_fw->minor_ver_wanted = 0; + uc_fw->user_overridden = user; + } else if (i915_inject_probe_error(i915, e)) { + /* require next minor version */ + uc_fw->minor_ver_wanted += 1; + uc_fw->user_overridden = user; + } else if (uc_fw->major_ver_wanted && + i915_inject_probe_error(i915, e)) { + /* require prev major version */ + uc_fw->major_ver_wanted -= 1; + uc_fw->minor_ver_wanted = 0; + uc_fw->user_overridden = user; + } else if (uc_fw->minor_ver_wanted && + i915_inject_probe_error(i915, e)) { + /* require prev minor version - hey, this should work! */ + uc_fw->minor_ver_wanted -= 1; + uc_fw->user_overridden = user; + } else if (user && i915_inject_probe_error(i915, e)) { + /* officially unsupported platform */ + uc_fw->major_ver_wanted = 0; + uc_fw->minor_ver_wanted = 0; + uc_fw->user_overridden = true; + } +} + +/** + * intel_uc_fw_fetch - fetch uC firmware + * @uc_fw: uC firmware + * + * Fetch uC firmware into GEM obj. + * + * Return: 0 on success, a negative errno code on failure. + */ +int intel_uc_fw_fetch(struct intel_uc_fw *uc_fw) +{ + struct drm_i915_private *i915 = __uc_fw_to_gt(uc_fw)->i915; + struct device *dev = i915->drm.dev; + struct drm_i915_gem_object *obj; + const struct firmware *fw = NULL; + struct uc_css_header *css; + size_t size; + int err; + + GEM_BUG_ON(!i915->wopcm.size); + GEM_BUG_ON(!intel_uc_fw_is_enabled(uc_fw)); + + err = i915_inject_probe_error(i915, -ENXIO); + if (err) + goto fail; + + __force_fw_fetch_failures(uc_fw, -EINVAL); + __force_fw_fetch_failures(uc_fw, -ESTALE); + + err = request_firmware(&fw, uc_fw->path, dev); + if (err) + goto fail; + + /* Check the size of the blob before examining buffer contents */ + if (unlikely(fw->size < sizeof(struct uc_css_header))) { + drm_warn(&i915->drm, "%s firmware %s: invalid size: %zu < %zu\n", + intel_uc_fw_type_repr(uc_fw->type), uc_fw->path, + fw->size, sizeof(struct uc_css_header)); + err = -ENODATA; + goto fail; + } + + css = (struct uc_css_header *)fw->data; + + /* Check integrity of size values inside CSS header */ + size = (css->header_size_dw - css->key_size_dw - css->modulus_size_dw - + css->exponent_size_dw) * sizeof(u32); + if (unlikely(size != sizeof(struct uc_css_header))) { + drm_warn(&i915->drm, + "%s firmware %s: unexpected header size: %zu != %zu\n", + intel_uc_fw_type_repr(uc_fw->type), uc_fw->path, + fw->size, sizeof(struct uc_css_header)); + err = -EPROTO; + goto fail; + } + + /* uCode size must calculated from other sizes */ + uc_fw->ucode_size = (css->size_dw - css->header_size_dw) * sizeof(u32); + + /* now RSA */ + if (unlikely(css->key_size_dw != UOS_RSA_SCRATCH_COUNT)) { + drm_warn(&i915->drm, "%s firmware %s: unexpected key size: %u != %u\n", + intel_uc_fw_type_repr(uc_fw->type), uc_fw->path, + css->key_size_dw, UOS_RSA_SCRATCH_COUNT); + err = -EPROTO; + goto fail; + } + uc_fw->rsa_size = css->key_size_dw * sizeof(u32); + + /* At least, it should have header, uCode and RSA. Size of all three. */ + size = sizeof(struct uc_css_header) + uc_fw->ucode_size + uc_fw->rsa_size; + if (unlikely(fw->size < size)) { + drm_warn(&i915->drm, "%s firmware %s: invalid size: %zu < %zu\n", + intel_uc_fw_type_repr(uc_fw->type), uc_fw->path, + fw->size, size); + err = -ENOEXEC; + goto fail; + } + + /* Sanity check whether this fw is not larger than whole WOPCM memory */ + size = __intel_uc_fw_get_upload_size(uc_fw); + if (unlikely(size >= i915->wopcm.size)) { + drm_warn(&i915->drm, "%s firmware %s: invalid size: %zu > %zu\n", + intel_uc_fw_type_repr(uc_fw->type), uc_fw->path, + size, (size_t)i915->wopcm.size); + err = -E2BIG; + goto fail; + } + + /* Get version numbers from the CSS header */ + uc_fw->major_ver_found = FIELD_GET(CSS_SW_VERSION_UC_MAJOR, + css->sw_version); + uc_fw->minor_ver_found = FIELD_GET(CSS_SW_VERSION_UC_MINOR, + css->sw_version); + + if (uc_fw->major_ver_found != uc_fw->major_ver_wanted || + uc_fw->minor_ver_found < uc_fw->minor_ver_wanted) { + drm_notice(&i915->drm, "%s firmware %s: unexpected version: %u.%u != %u.%u\n", + intel_uc_fw_type_repr(uc_fw->type), uc_fw->path, + uc_fw->major_ver_found, uc_fw->minor_ver_found, + uc_fw->major_ver_wanted, uc_fw->minor_ver_wanted); + if (!intel_uc_fw_is_overridden(uc_fw)) { + err = -ENOEXEC; + goto fail; + } + } + + obj = i915_gem_object_create_shmem_from_data(i915, fw->data, fw->size); + if (IS_ERR(obj)) { + err = PTR_ERR(obj); + goto fail; + } + + uc_fw->obj = obj; + uc_fw->size = fw->size; + intel_uc_fw_change_status(uc_fw, INTEL_UC_FIRMWARE_AVAILABLE); + + release_firmware(fw); + return 0; + +fail: + intel_uc_fw_change_status(uc_fw, err == -ENOENT ? + INTEL_UC_FIRMWARE_MISSING : + INTEL_UC_FIRMWARE_ERROR); + + drm_notice(&i915->drm, "%s firmware %s: fetch failed with error %d\n", + intel_uc_fw_type_repr(uc_fw->type), uc_fw->path, err); + drm_info(&i915->drm, "%s firmware(s) can be downloaded from %s\n", + intel_uc_fw_type_repr(uc_fw->type), INTEL_UC_FIRMWARE_URL); + + release_firmware(fw); /* OK even if fw is NULL */ + return err; +} + +static u32 uc_fw_ggtt_offset(struct intel_uc_fw *uc_fw) +{ + struct i915_ggtt *ggtt = __uc_fw_to_gt(uc_fw)->ggtt; + struct drm_mm_node *node = &ggtt->uc_fw; + + GEM_BUG_ON(!drm_mm_node_allocated(node)); + GEM_BUG_ON(upper_32_bits(node->start)); + GEM_BUG_ON(upper_32_bits(node->start + node->size - 1)); + + return lower_32_bits(node->start); +} + +static void uc_fw_bind_ggtt(struct intel_uc_fw *uc_fw) +{ + struct drm_i915_gem_object *obj = uc_fw->obj; + struct i915_ggtt *ggtt = __uc_fw_to_gt(uc_fw)->ggtt; + struct i915_vma dummy = { + .node.start = uc_fw_ggtt_offset(uc_fw), + .node.size = obj->base.size, + .pages = obj->mm.pages, + .vm = &ggtt->vm, + }; + + GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj)); + GEM_BUG_ON(dummy.node.size > ggtt->uc_fw.size); + + /* uc_fw->obj cache domains were not controlled across suspend */ + drm_clflush_sg(dummy.pages); + + ggtt->vm.insert_entries(&ggtt->vm, &dummy, I915_CACHE_NONE, 0); +} + +static void uc_fw_unbind_ggtt(struct intel_uc_fw *uc_fw) +{ + struct drm_i915_gem_object *obj = uc_fw->obj; + struct i915_ggtt *ggtt = __uc_fw_to_gt(uc_fw)->ggtt; + u64 start = uc_fw_ggtt_offset(uc_fw); + + ggtt->vm.clear_range(&ggtt->vm, start, obj->base.size); +} + +static int uc_fw_xfer(struct intel_uc_fw *uc_fw, u32 dst_offset, u32 dma_flags) +{ + struct intel_gt *gt = __uc_fw_to_gt(uc_fw); + struct intel_uncore *uncore = gt->uncore; + u64 offset; + int ret; + + ret = i915_inject_probe_error(gt->i915, -ETIMEDOUT); + if (ret) + return ret; + + intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL); + + /* Set the source address for the uCode */ + offset = uc_fw_ggtt_offset(uc_fw); + GEM_BUG_ON(upper_32_bits(offset) & 0xFFFF0000); + intel_uncore_write_fw(uncore, DMA_ADDR_0_LOW, lower_32_bits(offset)); + intel_uncore_write_fw(uncore, DMA_ADDR_0_HIGH, upper_32_bits(offset)); + + /* Set the DMA destination */ + intel_uncore_write_fw(uncore, DMA_ADDR_1_LOW, dst_offset); + intel_uncore_write_fw(uncore, DMA_ADDR_1_HIGH, DMA_ADDRESS_SPACE_WOPCM); + + /* + * Set the transfer size. The header plus uCode will be copied to WOPCM + * via DMA, excluding any other components + */ + intel_uncore_write_fw(uncore, DMA_COPY_SIZE, + sizeof(struct uc_css_header) + uc_fw->ucode_size); + + /* Start the DMA */ + intel_uncore_write_fw(uncore, DMA_CTRL, + _MASKED_BIT_ENABLE(dma_flags | START_DMA)); + + /* Wait for DMA to finish */ + ret = intel_wait_for_register_fw(uncore, DMA_CTRL, START_DMA, 0, 100); + if (ret) + drm_err(>->i915->drm, "DMA for %s fw failed, DMA_CTRL=%u\n", + intel_uc_fw_type_repr(uc_fw->type), + intel_uncore_read_fw(uncore, DMA_CTRL)); + + /* Disable the bits once DMA is over */ + intel_uncore_write_fw(uncore, DMA_CTRL, _MASKED_BIT_DISABLE(dma_flags)); + + intel_uncore_forcewake_put(uncore, FORCEWAKE_ALL); + + return ret; +} + +/** + * intel_uc_fw_upload - load uC firmware using custom loader + * @uc_fw: uC firmware + * @dst_offset: destination offset + * @dma_flags: flags for flags for dma ctrl + * + * Loads uC firmware and updates internal flags. + * + * Return: 0 on success, non-zero on failure. + */ +int intel_uc_fw_upload(struct intel_uc_fw *uc_fw, u32 dst_offset, u32 dma_flags) +{ + struct intel_gt *gt = __uc_fw_to_gt(uc_fw); + int err; + + /* make sure the status was cleared the last time we reset the uc */ + GEM_BUG_ON(intel_uc_fw_is_loaded(uc_fw)); + + err = i915_inject_probe_error(gt->i915, -ENOEXEC); + if (err) + return err; + + if (!intel_uc_fw_is_loadable(uc_fw)) + return -ENOEXEC; + + /* Call custom loader */ + uc_fw_bind_ggtt(uc_fw); + err = uc_fw_xfer(uc_fw, dst_offset, dma_flags); + uc_fw_unbind_ggtt(uc_fw); + if (err) + goto fail; + + intel_uc_fw_change_status(uc_fw, INTEL_UC_FIRMWARE_TRANSFERRED); + return 0; + +fail: + i915_probe_error(gt->i915, "Failed to load %s firmware %s (%d)\n", + intel_uc_fw_type_repr(uc_fw->type), uc_fw->path, + err); + intel_uc_fw_change_status(uc_fw, INTEL_UC_FIRMWARE_FAIL); + return err; +} + +int intel_uc_fw_init(struct intel_uc_fw *uc_fw) +{ + int err; + + /* this should happen before the load! */ + GEM_BUG_ON(intel_uc_fw_is_loaded(uc_fw)); + + if (!intel_uc_fw_is_available(uc_fw)) + return -ENOEXEC; + + err = i915_gem_object_pin_pages(uc_fw->obj); + if (err) { + DRM_DEBUG_DRIVER("%s fw pin-pages err=%d\n", + intel_uc_fw_type_repr(uc_fw->type), err); + intel_uc_fw_change_status(uc_fw, INTEL_UC_FIRMWARE_FAIL); + } + + return err; +} + +void intel_uc_fw_fini(struct intel_uc_fw *uc_fw) +{ + if (i915_gem_object_has_pinned_pages(uc_fw->obj)) + i915_gem_object_unpin_pages(uc_fw->obj); + + intel_uc_fw_change_status(uc_fw, INTEL_UC_FIRMWARE_AVAILABLE); +} + +/** + * intel_uc_fw_cleanup_fetch - cleanup uC firmware + * @uc_fw: uC firmware + * + * Cleans up uC firmware by releasing the firmware GEM obj. + */ +void intel_uc_fw_cleanup_fetch(struct intel_uc_fw *uc_fw) +{ + if (!intel_uc_fw_is_available(uc_fw)) + return; + + i915_gem_object_put(fetch_and_zero(&uc_fw->obj)); + + intel_uc_fw_change_status(uc_fw, INTEL_UC_FIRMWARE_SELECTED); +} + +/** + * intel_uc_fw_copy_rsa - copy fw RSA to buffer + * + * @uc_fw: uC firmware + * @dst: dst buffer + * @max_len: max number of bytes to copy + * + * Return: number of copied bytes. + */ +size_t intel_uc_fw_copy_rsa(struct intel_uc_fw *uc_fw, void *dst, u32 max_len) +{ + struct sg_table *pages = uc_fw->obj->mm.pages; + u32 size = min_t(u32, uc_fw->rsa_size, max_len); + u32 offset = sizeof(struct uc_css_header) + uc_fw->ucode_size; + + GEM_BUG_ON(!intel_uc_fw_is_available(uc_fw)); + + return sg_pcopy_to_buffer(pages->sgl, pages->nents, dst, size, offset); +} + +/** + * intel_uc_fw_dump - dump information about uC firmware + * @uc_fw: uC firmware + * @p: the &drm_printer + * + * Pretty printer for uC firmware. + */ +void intel_uc_fw_dump(const struct intel_uc_fw *uc_fw, struct drm_printer *p) +{ + drm_printf(p, "%s firmware: %s\n", + intel_uc_fw_type_repr(uc_fw->type), uc_fw->path); + drm_printf(p, "\tstatus: %s\n", + intel_uc_fw_status_repr(uc_fw->status)); + drm_printf(p, "\tversion: wanted %u.%u, found %u.%u\n", + uc_fw->major_ver_wanted, uc_fw->minor_ver_wanted, + uc_fw->major_ver_found, uc_fw->minor_ver_found); + drm_printf(p, "\tuCode: %u bytes\n", uc_fw->ucode_size); + drm_printf(p, "\tRSA: %u bytes\n", uc_fw->rsa_size); +} diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.h b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.h new file mode 100644 index 000000000..23d3a423a --- /dev/null +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.h @@ -0,0 +1,251 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2014-2019 Intel Corporation + */ + +#ifndef _INTEL_UC_FW_H_ +#define _INTEL_UC_FW_H_ + +#include <linux/types.h> +#include "intel_uc_fw_abi.h" +#include "intel_device_info.h" +#include "i915_gem.h" + +struct drm_printer; +struct drm_i915_private; +struct intel_gt; + +/* Home of GuC, HuC and DMC firmwares */ +#define INTEL_UC_FIRMWARE_URL "https://git.kernel.org/pub/scm/linux/kernel/git/firmware/linux-firmware.git/tree/i915" + +/* + * +------------+---------------------------------------------------+ + * | PHASE | FIRMWARE STATUS TRANSITIONS | + * +============+===================================================+ + * | | UNINITIALIZED | + * +------------+- / | \ -+ + * | | DISABLED <--/ | \--> NOT_SUPPORTED | + * | init_early | V | + * | | SELECTED | + * +------------+- / | \ -+ + * | | MISSING <--/ | \--> ERROR | + * | fetch | V | + * | | AVAILABLE | + * +------------+- | -+ + * | init | V | + * | | /------> LOADABLE <----<-----------\ | + * +------------+- \ / \ \ \ -+ + * | | FAIL <--< \--> TRANSFERRED \ | + * | upload | \ / \ / | + * | | \---------/ \--> RUNNING | + * +------------+---------------------------------------------------+ + */ + +enum intel_uc_fw_status { + INTEL_UC_FIRMWARE_NOT_SUPPORTED = -1, /* no uc HW */ + INTEL_UC_FIRMWARE_UNINITIALIZED = 0, /* used to catch checks done too early */ + INTEL_UC_FIRMWARE_DISABLED, /* disabled */ + INTEL_UC_FIRMWARE_SELECTED, /* selected the blob we want to load */ + INTEL_UC_FIRMWARE_MISSING, /* blob not found on the system */ + INTEL_UC_FIRMWARE_ERROR, /* invalid format or version */ + INTEL_UC_FIRMWARE_AVAILABLE, /* blob found and copied in mem */ + INTEL_UC_FIRMWARE_LOADABLE, /* all fw-required objects are ready */ + INTEL_UC_FIRMWARE_FAIL, /* failed to xfer or init/auth the fw */ + INTEL_UC_FIRMWARE_TRANSFERRED, /* dma xfer done */ + INTEL_UC_FIRMWARE_RUNNING /* init/auth done */ +}; + +enum intel_uc_fw_type { + INTEL_UC_FW_TYPE_GUC = 0, + INTEL_UC_FW_TYPE_HUC +}; +#define INTEL_UC_FW_NUM_TYPES 2 + +/* + * This structure encapsulates all the data needed during the process + * of fetching, caching, and loading the firmware image into the uC. + */ +struct intel_uc_fw { + enum intel_uc_fw_type type; + union { + const enum intel_uc_fw_status status; + enum intel_uc_fw_status __status; /* no accidental overwrites */ + }; + const char *path; + bool user_overridden; + size_t size; + struct drm_i915_gem_object *obj; + + /* + * The firmware build process will generate a version header file with major and + * minor version defined. The versions are built into CSS header of firmware. + * i915 kernel driver set the minimal firmware version required per platform. + */ + u16 major_ver_wanted; + u16 minor_ver_wanted; + u16 major_ver_found; + u16 minor_ver_found; + + u32 rsa_size; + u32 ucode_size; +}; + +#ifdef CONFIG_DRM_I915_DEBUG_GUC +void intel_uc_fw_change_status(struct intel_uc_fw *uc_fw, + enum intel_uc_fw_status status); +#else +static inline void intel_uc_fw_change_status(struct intel_uc_fw *uc_fw, + enum intel_uc_fw_status status) +{ + uc_fw->__status = status; +} +#endif + +static inline +const char *intel_uc_fw_status_repr(enum intel_uc_fw_status status) +{ + switch (status) { + case INTEL_UC_FIRMWARE_NOT_SUPPORTED: + return "N/A"; + case INTEL_UC_FIRMWARE_UNINITIALIZED: + return "UNINITIALIZED"; + case INTEL_UC_FIRMWARE_DISABLED: + return "DISABLED"; + case INTEL_UC_FIRMWARE_SELECTED: + return "SELECTED"; + case INTEL_UC_FIRMWARE_MISSING: + return "MISSING"; + case INTEL_UC_FIRMWARE_ERROR: + return "ERROR"; + case INTEL_UC_FIRMWARE_AVAILABLE: + return "AVAILABLE"; + case INTEL_UC_FIRMWARE_LOADABLE: + return "LOADABLE"; + case INTEL_UC_FIRMWARE_FAIL: + return "FAIL"; + case INTEL_UC_FIRMWARE_TRANSFERRED: + return "TRANSFERRED"; + case INTEL_UC_FIRMWARE_RUNNING: + return "RUNNING"; + } + return "<invalid>"; +} + +static inline int intel_uc_fw_status_to_error(enum intel_uc_fw_status status) +{ + switch (status) { + case INTEL_UC_FIRMWARE_NOT_SUPPORTED: + return -ENODEV; + case INTEL_UC_FIRMWARE_UNINITIALIZED: + return -EACCES; + case INTEL_UC_FIRMWARE_DISABLED: + return -EPERM; + case INTEL_UC_FIRMWARE_MISSING: + return -ENOENT; + case INTEL_UC_FIRMWARE_ERROR: + return -ENOEXEC; + case INTEL_UC_FIRMWARE_FAIL: + return -EIO; + case INTEL_UC_FIRMWARE_SELECTED: + return -ESTALE; + case INTEL_UC_FIRMWARE_AVAILABLE: + case INTEL_UC_FIRMWARE_LOADABLE: + case INTEL_UC_FIRMWARE_TRANSFERRED: + case INTEL_UC_FIRMWARE_RUNNING: + return 0; + } + return -EINVAL; +} + +static inline const char *intel_uc_fw_type_repr(enum intel_uc_fw_type type) +{ + switch (type) { + case INTEL_UC_FW_TYPE_GUC: + return "GuC"; + case INTEL_UC_FW_TYPE_HUC: + return "HuC"; + } + return "uC"; +} + +static inline enum intel_uc_fw_status +__intel_uc_fw_status(struct intel_uc_fw *uc_fw) +{ + /* shouldn't call this before checking hw/blob availability */ + GEM_BUG_ON(uc_fw->status == INTEL_UC_FIRMWARE_UNINITIALIZED); + return uc_fw->status; +} + +static inline bool intel_uc_fw_is_supported(struct intel_uc_fw *uc_fw) +{ + return __intel_uc_fw_status(uc_fw) != INTEL_UC_FIRMWARE_NOT_SUPPORTED; +} + +static inline bool intel_uc_fw_is_enabled(struct intel_uc_fw *uc_fw) +{ + return __intel_uc_fw_status(uc_fw) > INTEL_UC_FIRMWARE_DISABLED; +} + +static inline bool intel_uc_fw_is_available(struct intel_uc_fw *uc_fw) +{ + return __intel_uc_fw_status(uc_fw) >= INTEL_UC_FIRMWARE_AVAILABLE; +} + +static inline bool intel_uc_fw_is_loadable(struct intel_uc_fw *uc_fw) +{ + return __intel_uc_fw_status(uc_fw) >= INTEL_UC_FIRMWARE_LOADABLE; +} + +static inline bool intel_uc_fw_is_loaded(struct intel_uc_fw *uc_fw) +{ + return __intel_uc_fw_status(uc_fw) >= INTEL_UC_FIRMWARE_TRANSFERRED; +} + +static inline bool intel_uc_fw_is_running(struct intel_uc_fw *uc_fw) +{ + return __intel_uc_fw_status(uc_fw) == INTEL_UC_FIRMWARE_RUNNING; +} + +static inline bool intel_uc_fw_is_overridden(const struct intel_uc_fw *uc_fw) +{ + return uc_fw->user_overridden; +} + +static inline void intel_uc_fw_sanitize(struct intel_uc_fw *uc_fw) +{ + if (intel_uc_fw_is_loaded(uc_fw)) + intel_uc_fw_change_status(uc_fw, INTEL_UC_FIRMWARE_LOADABLE); +} + +static inline u32 __intel_uc_fw_get_upload_size(struct intel_uc_fw *uc_fw) +{ + return sizeof(struct uc_css_header) + uc_fw->ucode_size; +} + +/** + * intel_uc_fw_get_upload_size() - Get size of firmware needed to be uploaded. + * @uc_fw: uC firmware. + * + * Get the size of the firmware and header that will be uploaded to WOPCM. + * + * Return: Upload firmware size, or zero on firmware fetch failure. + */ +static inline u32 intel_uc_fw_get_upload_size(struct intel_uc_fw *uc_fw) +{ + if (!intel_uc_fw_is_available(uc_fw)) + return 0; + + return __intel_uc_fw_get_upload_size(uc_fw); +} + +void intel_uc_fw_init_early(struct intel_uc_fw *uc_fw, + enum intel_uc_fw_type type); +int intel_uc_fw_fetch(struct intel_uc_fw *uc_fw); +void intel_uc_fw_cleanup_fetch(struct intel_uc_fw *uc_fw); +int intel_uc_fw_upload(struct intel_uc_fw *uc_fw, u32 offset, u32 dma_flags); +int intel_uc_fw_init(struct intel_uc_fw *uc_fw); +void intel_uc_fw_fini(struct intel_uc_fw *uc_fw); +size_t intel_uc_fw_copy_rsa(struct intel_uc_fw *uc_fw, void *dst, u32 max_len); +void intel_uc_fw_dump(const struct intel_uc_fw *uc_fw, struct drm_printer *p); + +#endif diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw_abi.h b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw_abi.h new file mode 100644 index 000000000..029214cde --- /dev/null +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw_abi.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2019 Intel Corporation + */ + +#ifndef _INTEL_UC_FW_ABI_H +#define _INTEL_UC_FW_ABI_H + +#include <linux/types.h> +#include <linux/build_bug.h> + +/** + * DOC: Firmware Layout + * + * The GuC/HuC firmware layout looks like this:: + * + * +======================================================================+ + * | Firmware blob | + * +===============+===============+============+============+============+ + * | CSS header | uCode | RSA key | modulus | exponent | + * +===============+===============+============+============+============+ + * <-header size-> <---header size continued -----------> + * <--- size -----------------------------------------------------------> + * <-key size-> + * <-mod size-> + * <-exp size-> + * + * The firmware may or may not have modulus key and exponent data. The header, + * uCode and RSA signature are must-have components that will be used by driver. + * Length of each components, which is all in dwords, can be found in header. + * In the case that modulus and exponent are not present in fw, a.k.a truncated + * image, the length value still appears in header. + * + * Driver will do some basic fw size validation based on the following rules: + * + * 1. Header, uCode and RSA are must-have components. + * 2. All firmware components, if they present, are in the sequence illustrated + * in the layout table above. + * 3. Length info of each component can be found in header, in dwords. + * 4. Modulus and exponent key are not required by driver. They may not appear + * in fw. So driver will load a truncated firmware in this case. + */ + +struct uc_css_header { + u32 module_type; + /* + * header_size includes all non-uCode bits, including css_header, rsa + * key, modulus key and exponent data. + */ + u32 header_size_dw; + u32 header_version; + u32 module_id; + u32 module_vendor; + u32 date; +#define CSS_DATE_DAY (0xFF << 0) +#define CSS_DATE_MONTH (0xFF << 8) +#define CSS_DATE_YEAR (0xFFFF << 16) + u32 size_dw; /* uCode plus header_size_dw */ + u32 key_size_dw; + u32 modulus_size_dw; + u32 exponent_size_dw; + u32 time; +#define CSS_TIME_HOUR (0xFF << 0) +#define CSS_DATE_MIN (0xFF << 8) +#define CSS_DATE_SEC (0xFFFF << 16) + char username[8]; + char buildnumber[12]; + u32 sw_version; +#define CSS_SW_VERSION_UC_MAJOR (0xFF << 16) +#define CSS_SW_VERSION_UC_MINOR (0xFF << 8) +#define CSS_SW_VERSION_UC_PATCH (0xFF << 0) + u32 reserved[14]; + u32 header_info; +} __packed; +static_assert(sizeof(struct uc_css_header) == 128); + +#endif /* _INTEL_UC_FW_ABI_H */ |