drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345

// SPDX-License-Identifier: GPL-2.0-only
// Copyright (C) 2019-2020 NVIDIA CORPORATION.  All rights reserved.

#include <linux/bitfield.h>
#include <linux/delay.h>
#include <linux/of.h>
#include <linux/platform_device.h>
#include <linux/slab.h>

#include <soc/tegra/mc.h>

#include "arm-smmu.h"

/*
 * Tegra194 has three ARM MMU-500 Instances.
 * Two of them are used together and must be programmed identically for
 * interleaved IOVA accesses across them and translates accesses from
 * non-isochronous HW devices.
 * Third one is used for translating accesses from isochronous HW devices.
 *
 * In addition, the SMMU driver needs to coordinate with the memory controller
 * driver to ensure that the right SID override is programmed for any given
 * memory client. This is necessary to allow for use-case such as seamlessly
 * handing over the display controller configuration from the firmware to the
 * kernel.
 *
 * This implementation supports programming of the two instances that must
 * be programmed identically and takes care of invoking the memory controller
 * driver for SID override programming after devices have been attached to an
 * SMMU instance.
 */
#define MAX_SMMU_INSTANCES 2

struct nvidia_smmu {
	struct arm_smmu_device smmu;
	void __iomem *bases[MAX_SMMU_INSTANCES];
	unsigned int num_instances;
	struct tegra_mc *mc;
};

static inline struct nvidia_smmu *to_nvidia_smmu(struct arm_smmu_device *smmu)
{
	return container_of(smmu, struct nvidia_smmu, smmu);
}

static inline void __iomem *nvidia_smmu_page(struct arm_smmu_device *smmu,
					     unsigned int inst, int page)
{
	struct nvidia_smmu *nvidia_smmu;

	nvidia_smmu = container_of(smmu, struct nvidia_smmu, smmu);
	return nvidia_smmu->bases[inst] + (page << smmu->pgshift);
}

static u32 nvidia_smmu_read_reg(struct arm_smmu_device *smmu,
				int page, int offset)
{
	void __iomem *reg = nvidia_smmu_page(smmu, 0, page) + offset;

	return readl_relaxed(reg);
}

static void nvidia_smmu_write_reg(struct arm_smmu_device *smmu,
				  int page, int offset, u32 val)
{
	struct nvidia_smmu *nvidia = to_nvidia_smmu(smmu);
	unsigned int i;

	for (i = 0; i < nvidia->num_instances; i++) {
		void __iomem *reg = nvidia_smmu_page(smmu, i, page) + offset;

		writel_relaxed(val, reg);
	}
}

static u64 nvidia_smmu_read_reg64(struct arm_smmu_device *smmu,
				  int page, int offset)
{
	void __iomem *reg = nvidia_smmu_page(smmu, 0, page) + offset;

	return readq_relaxed(reg);
}

static void nvidia_smmu_write_reg64(struct arm_smmu_device *smmu,
				    int page, int offset, u64 val)
{
	struct nvidia_smmu *nvidia = to_nvidia_smmu(smmu);
	unsigned int i;

	for (i = 0; i < nvidia->num_instances; i++) {
		void __iomem *reg = nvidia_smmu_page(smmu, i, page) + offset;

		writeq_relaxed(val, reg);
	}
}

static void nvidia_smmu_tlb_sync(struct arm_smmu_device *smmu, int page,
				 int sync, int status)
{
	struct nvidia_smmu *nvidia = to_nvidia_smmu(smmu);
	unsigned int delay;

	arm_smmu_writel(smmu, page, sync, 0);

	for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) {
		unsigned int spin_cnt;

		for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) {
			u32 val = 0;
			unsigned int i;

			for (i = 0; i < nvidia->num_instances; i++) {
				void __iomem *reg;

				reg = nvidia_smmu_page(smmu, i, page) + status;
				val |= readl_relaxed(reg);
			}

			if (!(val & ARM_SMMU_sTLBGSTATUS_GSACTIVE))
				return;

			cpu_relax();
		}

		udelay(delay);
	}

	dev_err_ratelimited(smmu->dev,
			    "TLB sync timed out -- SMMU may be deadlocked\n");
}

static int nvidia_smmu_reset(struct arm_smmu_device *smmu)
{
	struct nvidia_smmu *nvidia = to_nvidia_smmu(smmu);
	unsigned int i;

	for (i = 0; i < nvidia->num_instances; i++) {
		u32 val;
		void __iomem *reg = nvidia_smmu_page(smmu, i, ARM_SMMU_GR0) +
				    ARM_SMMU_GR0_sGFSR;

		/* clear global FSR */
		val = readl_relaxed(reg);
		writel_relaxed(val, reg);
	}

	return 0;
}

static irqreturn_t nvidia_smmu_global_fault_inst(int irq,
						 struct arm_smmu_device *smmu,
						 int inst)
{
	u32 gfsr, gfsynr0, gfsynr1, gfsynr2;
	void __iomem *gr0_base = nvidia_smmu_page(smmu, inst, 0);

	gfsr = readl_relaxed(gr0_base + ARM_SMMU_GR0_sGFSR);
	if (!gfsr)
		return IRQ_NONE;

	gfsynr0 = readl_relaxed(gr0_base + ARM_SMMU_GR0_sGFSYNR0);
	gfsynr1 = readl_relaxed(gr0_base + ARM_SMMU_GR0_sGFSYNR1);
	gfsynr2 = readl_relaxed(gr0_base + ARM_SMMU_GR0_sGFSYNR2);

	dev_err_ratelimited(smmu->dev,
			    "Unexpected global fault, this could be serious\n");
	dev_err_ratelimited(smmu->dev,
			    "\tGFSR 0x%08x, GFSYNR0 0x%08x, GFSYNR1 0x%08x, GFSYNR2 0x%08x\n",
			    gfsr, gfsynr0, gfsynr1, gfsynr2);

	writel_relaxed(gfsr, gr0_base + ARM_SMMU_GR0_sGFSR);
	return IRQ_HANDLED;
}

static irqreturn_t nvidia_smmu_global_fault(int irq, void *dev)
{
	unsigned int inst;
	irqreturn_t ret = IRQ_NONE;
	struct arm_smmu_device *smmu = dev;
	struct nvidia_smmu *nvidia = to_nvidia_smmu(smmu);

	for (inst = 0; inst < nvidia->num_instances; inst++) {
		irqreturn_t irq_ret;

		irq_ret = nvidia_smmu_global_fault_inst(irq, smmu, inst);
		if (irq_ret == IRQ_HANDLED)
			ret = IRQ_HANDLED;
	}

	return ret;
}

static irqreturn_t nvidia_smmu_context_fault_bank(int irq,
						  struct arm_smmu_device *smmu,
						  int idx, int inst)
{
	u32 fsr, fsynr, cbfrsynra;
	unsigned long iova;
	void __iomem *gr1_base = nvidia_smmu_page(smmu, inst, 1);
	void __iomem *cb_base = nvidia_smmu_page(smmu, inst, smmu->numpage + idx);

	fsr = readl_relaxed(cb_base + ARM_SMMU_CB_FSR);
	if (!(fsr & ARM_SMMU_FSR_FAULT))
		return IRQ_NONE;

	fsynr = readl_relaxed(cb_base + ARM_SMMU_CB_FSYNR0);
	iova = readq_relaxed(cb_base + ARM_SMMU_CB_FAR);
	cbfrsynra = readl_relaxed(gr1_base + ARM_SMMU_GR1_CBFRSYNRA(idx));

	dev_err_ratelimited(smmu->dev,
			    "Unhandled context fault: fsr=0x%x, iova=0x%08lx, fsynr=0x%x, cbfrsynra=0x%x, cb=%d\n",
			    fsr, iova, fsynr, cbfrsynra, idx);

	writel_relaxed(fsr, cb_base + ARM_SMMU_CB_FSR);
	return IRQ_HANDLED;
}

static irqreturn_t nvidia_smmu_context_fault(int irq, void *dev)
{
	int idx;
	unsigned int inst;
	irqreturn_t ret = IRQ_NONE;
	struct arm_smmu_device *smmu;
	struct iommu_domain *domain = dev;
	struct arm_smmu_domain *smmu_domain;
	struct nvidia_smmu *nvidia;

	smmu_domain = container_of(domain, struct arm_smmu_domain, domain);
	smmu = smmu_domain->smmu;
	nvidia = to_nvidia_smmu(smmu);

	for (inst = 0; inst < nvidia->num_instances; inst++) {
		irqreturn_t irq_ret;

		/*
		 * Interrupt line is shared between all contexts.
		 * Check for faults across all contexts.
		 */
		for (idx = 0; idx < smmu->num_context_banks; idx++) {
			irq_ret = nvidia_smmu_context_fault_bank(irq, smmu,
								 idx, inst);
			if (irq_ret == IRQ_HANDLED)
				ret = IRQ_HANDLED;
		}
	}

	return ret;
}

static void nvidia_smmu_probe_finalize(struct arm_smmu_device *smmu, struct device *dev)
{
	struct nvidia_smmu *nvidia = to_nvidia_smmu(smmu);
	int err;

	err = tegra_mc_probe_device(nvidia->mc, dev);
	if (err < 0)
		dev_err(smmu->dev, "memory controller probe failed for %s: %d\n",
			dev_name(dev), err);
}

static int nvidia_smmu_init_context(struct arm_smmu_domain *smmu_domain,
				    struct io_pgtable_cfg *pgtbl_cfg,
				    struct device *dev)
{
	struct arm_smmu_device *smmu = smmu_domain->smmu;
	const struct device_node *np = smmu->dev->of_node;

	/*
	 * Tegra194 and Tegra234 SoCs have the erratum that causes walk cache
	 * entries to not be invalidated correctly. The problem is that the walk
	 * cache index generated for IOVA is not same across translation and
	 * invalidation requests. This is leading to page faults when PMD entry
	 * is released during unmap and populated with new PTE table during
	 * subsequent map request. Disabling large page mappings avoids the
	 * release of PMD entry and avoid translations seeing stale PMD entry in
	 * walk cache.
	 * Fix this by limiting the page mappings to PAGE_SIZE on Tegra194 and
	 * Tegra234.
	 */
	if (of_device_is_compatible(np, "nvidia,tegra234-smmu") ||
	    of_device_is_compatible(np, "nvidia,tegra194-smmu")) {
		smmu->pgsize_bitmap = PAGE_SIZE;
		pgtbl_cfg->pgsize_bitmap = smmu->pgsize_bitmap;
	}

	return 0;
}

static const struct arm_smmu_impl nvidia_smmu_impl = {
	.read_reg = nvidia_smmu_read_reg,
	.write_reg = nvidia_smmu_write_reg,
	.read_reg64 = nvidia_smmu_read_reg64,
	.write_reg64 = nvidia_smmu_write_reg64,
	.reset = nvidia_smmu_reset,
	.tlb_sync = nvidia_smmu_tlb_sync,
	.global_fault = nvidia_smmu_global_fault,
	.context_fault = nvidia_smmu_context_fault,
	.probe_finalize = nvidia_smmu_probe_finalize,
	.init_context = nvidia_smmu_init_context,
};

static const struct arm_smmu_impl nvidia_smmu_single_impl = {
	.probe_finalize = nvidia_smmu_probe_finalize,
	.init_context = nvidia_smmu_init_context,
};

struct arm_smmu_device *nvidia_smmu_impl_init(struct arm_smmu_device *smmu)
{
	struct resource *res;
	struct device *dev = smmu->dev;
	struct nvidia_smmu *nvidia_smmu;
	struct platform_device *pdev = to_platform_device(dev);
	unsigned int i;

	nvidia_smmu = devm_krealloc(dev, smmu, sizeof(*nvidia_smmu), GFP_KERNEL);
	if (!nvidia_smmu)
		return ERR_PTR(-ENOMEM);

	nvidia_smmu->mc = devm_tegra_memory_controller_get(dev);
	if (IS_ERR(nvidia_smmu->mc))
		return ERR_CAST(nvidia_smmu->mc);

	/* Instance 0 is ioremapped by arm-smmu.c. */
	nvidia_smmu->bases[0] = smmu->base;
	nvidia_smmu->num_instances++;

	for (i = 1; i < MAX_SMMU_INSTANCES; i++) {
		res = platform_get_resource(pdev, IORESOURCE_MEM, i);
		if (!res)
			break;

		nvidia_smmu->bases[i] = devm_ioremap_resource(dev, res);
		if (IS_ERR(nvidia_smmu->bases[i]))
			return ERR_CAST(nvidia_smmu->bases[i]);

		nvidia_smmu->num_instances++;
	}

	if (nvidia_smmu->num_instances == 1)
		nvidia_smmu->smmu.impl = &nvidia_smmu_single_impl;
	else
		nvidia_smmu->smmu.impl = &nvidia_smmu_impl;

	return &nvidia_smmu->smmu;
}