Explore Jetson Nano GPU Driver
Analyze the NVIDIA Jetson Nano device driver code to understand how a job is submitted and how it interacts with IRQs.
1. Job Submission
nvgpu/include/nvgpu/channel.h
114 struct priv_cmd_queue {
115 struct nvgpu_mem mem;
116 u32 size; /* num of entries in words */
117 u32 put; /* put for priv cmd queue */
118 u32 get; /* get for priv cmd queue */
119 };
120
121 struct priv_cmd_entry {
122 bool valid;
123 struct nvgpu_mem *mem;
124 u32 off; /* offset in mem, in u32 entries */
125 u64 gva;
126 u32 get; /* start of entry in queue */
127 u32 size; /* in words */
128 };
129
130 struct channel_gk20a_job {
131 struct nvgpu_mapped_buf **mapped_buffers;
132 int num_mapped_buffers;
133 struct gk20a_fence *post_fence;
134 struct priv_cmd_entry *wait_cmd;
135 struct priv_cmd_entry *incr_cmd;
136 struct nvgpu_list_node list;
137 };
- Job and command structures used in kernel space.
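To make the put/get bookkeeping in priv_cmd_queue concrete, here is a minimal user-space sketch of reserving entries from such a circular word queue. The struct, the helper pcq_reserve(), and the modulo-based free-space check are illustrative assumptions, not the actual nvgpu allocator (which additionally requires contiguous space and wraps put back to 0 when the tail is too short).

#include <stdbool.h>
#include <stdint.h>

/* Simplified model of priv_cmd_queue: all indices are in 32-bit words. */
struct pcq {
	uint32_t size;	/* total number of words in the backing nvgpu_mem */
	uint32_t put;	/* producer index (advanced by the driver) */
	uint32_t get;	/* consumer index (advanced as jobs complete) */
};

/* Number of words currently occupied between get and put. */
static uint32_t pcq_used(const struct pcq *q)
{
	return (q->put + q->size - q->get) % q->size;
}

/* Hypothetical reservation helper: one slot is kept free so that
 * full and empty states stay distinguishable. */
static bool pcq_reserve(struct pcq *q, uint32_t words, uint32_t *off)
{
	if (pcq_used(q) + words >= q->size)
		return false;			/* queue full, caller retries later */

	*off = q->put;				/* like priv_cmd_entry::off */
	q->put = (q->put + words) % q->size;
	return true;
}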
nvgpu/common/submit.c
317 static int nvgpu_submit_channel_gpfifo(struct channel_gk20a *c,
318 struct nvgpu_gpfifo_entry *gpfifo,
319 struct nvgpu_gpfifo_userdata userdata,
320 u32 num_entries,
321 u32 flags,
322 struct nvgpu_channel_fence *fence,
323 struct gk20a_fence **fence_out,
324 struct fifo_profile_gk20a *profile)
325 {
...
537 if (wait_cmd) {
538 nvgpu_submit_append_priv_cmdbuf(c, wait_cmd);
539 }
540
541 err = nvgpu_submit_append_gpfifo(c, gpfifo, userdata,
542 num_entries);
543 if (err) {
544 goto clean_up_job;
545 }
546
547 /*
548 * And here's where we add the incr_cmd we generated earlier. It should
549 * always run!
550 */
551 if (incr_cmd) {
552 nvgpu_submit_append_priv_cmdbuf(c, incr_cmd);
553 }
554
555 if (fence_out) {
556 *fence_out = gk20a_fence_get(post_fence);
557 }
558
559 if (need_job_tracking) {
560 /* TODO! Check for errors... */
561 gk20a_channel_add_job(c, job, skip_buffer_refcounting);
562 }
563 gk20a_fifo_profile_snapshot(profile, PROFILE_APPEND);
565 g->ops.fifo.userd_gp_put(g, c);
...
599 }
Add commands to the ring buffer
First, the driver appends gpfifo entries into the shared memory (ring buffer). The entries from user space are copied into the ring buffer. Note that wait_cmd and incr_cmd, when present, are appended before and after the user's entries, respectively. Each time entries are appended, the driver advances the put pointer by the number of entries appended (including any wait_cmd or incr_cmd entries).
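A condensed sketch of that ordering, using hypothetical stand-in types and helpers (struct ring, append_one(), append_user()) rather than the real driver API, and assuming a power-of-two ring size as the driver does:

struct entry { unsigned long long cmd; };	/* stand-in for nvgpu_gpfifo_entry */

struct ring {
	struct entry *e;	/* stand-in for gpfifo.mem.cpu_va */
	unsigned int num;	/* entry count, assumed to be a power of two */
	unsigned int put;
};

static void append_one(struct ring *r, struct entry cmd)
{
	r->e[r->put] = cmd;
	r->put = (r->put + 1) & (r->num - 1);	/* same masking as gpfifo.put */
}

static void append_user(struct ring *r, const struct entry *src, unsigned int n)
{
	for (unsigned int i = 0; i < n; i++)
		append_one(r, src[i]);		/* the real code memcpys in 1-2 chunks */
}

/* Submit = optional pre-sync entry, the user's entries, optional post-sync entry. */
static void submit(struct ring *r, const struct entry *user, unsigned int n,
		   const struct entry *wait, const struct entry *incr)
{
	if (wait)
		append_one(r, *wait);	/* entry pointing at the wait priv cmdbuf */
	append_user(r, user, n);
	if (incr)
		append_one(r, *incr);	/* entry pointing at the incr priv cmdbuf */
	/* the new put is then published to the GPU via userd_gp_put() */
}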
- Copy gpfifo entries from user space into gpfifo.mem in kernel space (through its cpu_va mapping).
203 static int nvgpu_submit_append_gpfifo_user_direct(struct channel_gk20a *c,
204 struct nvgpu_gpfifo_userdata userdata,
205 u32 num_entries)
206 {
207 struct gk20a *g = c->g;
208 struct nvgpu_gpfifo_entry *gpfifo_cpu = c->gpfifo.mem.cpu_va;
209 u32 gpfifo_size = c->gpfifo.entry_num;
210 u32 len = num_entries;
211 u32 start = c->gpfifo.put;
212 u32 end = start + len; /* exclusive */
213 int err;
214
215 if (end > gpfifo_size) {
216 /* wrap-around */
217 int length0 = gpfifo_size - start;
218 int length1 = len - length0;
219
220 err = g->os_channel.copy_user_gpfifo(
221 gpfifo_cpu + start, userdata,
222 0, length0);
223 if (err) {
224 return err;
225 }
226
227 err = g->os_channel.copy_user_gpfifo(
228 gpfifo_cpu, userdata,
229 length0, length1);
230 if (err) {
231 return err;
232 }
233 } else {
234 err = g->os_channel.copy_user_gpfifo(
235 gpfifo_cpu + start, userdata,
236 0, len);
237 if (err) {
238 return err;
239 }
240 }
241
242 return 0;
243 }
- gpfifo_size is the channel's total gpfifo size in entries. If the new entries would run past the end of the gpfifo, the copy wraps around and is split into two copies.
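A user-space analogue of that split copy, with memcpy standing in for g->os_channel.copy_user_gpfifo() and a caller-managed start index:

#include <stdint.h>
#include <string.h>

struct gpfifo_entry { uint32_t entry0, entry1; };	/* 8-byte GPFIFO entry */

/* Copy len entries into a ring of `size` entries starting at `start`,
 * splitting into two memcpys on wrap-around, as the driver does above. */
static void ring_copy(struct gpfifo_entry *ring, uint32_t size, uint32_t start,
		      const struct gpfifo_entry *src, uint32_t len)
{
	uint32_t end = start + len;		/* exclusive */

	if (end > size) {
		uint32_t len0 = size - start;	/* entries up to the end of the ring */
		uint32_t len1 = len - len0;	/* remainder at the beginning */

		memcpy(ring + start, src, len0 * sizeof(*ring));
		memcpy(ring, src + len0, len1 * sizeof(*ring));
	} else {
		memcpy(ring + start, src, len * sizeof(*ring));
	}
	/* the caller then advances put: (start + len) & (size - 1), size a power of two */
}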
270 /*
271 * Copy source gpfifo entries into the gpfifo ring buffer, potentially
272 * splitting into two memcpys to handle wrap-around.
273 */
274 static int nvgpu_submit_append_gpfifo(struct channel_gk20a *c,
275 struct nvgpu_gpfifo_entry *kern_gpfifo,
276 struct nvgpu_gpfifo_userdata userdata,
277 u32 num_entries)
278 {
279 struct gk20a *g = c->g;
280 int err;
281
282 if (!kern_gpfifo && !c->gpfifo.pipe) {
283 /*
284 * This path (from userspace to sysmem) is special in order to
285 * avoid two copies unnecessarily (from user to pipe, then from
286 * pipe to gpu sysmem buffer).
287 */
288 err = nvgpu_submit_append_gpfifo_user_direct(c, userdata,
289 num_entries);
290 if (err) {
291 return err;
292 }
293 } else if (!kern_gpfifo) {
294 /* from userspace to vidmem, use the common path */
295 err = g->os_channel.copy_user_gpfifo(c->gpfifo.pipe, userdata,
296 0, num_entries);
297 if (err) {
298 return err;
299 }
300
301 nvgpu_submit_append_gpfifo_common(c, c->gpfifo.pipe,
302 num_entries);
303 } else {
304 /* from kernel to either sysmem or vidmem, don't need
305 * copy_user_gpfifo so use the common path */
306 nvgpu_submit_append_gpfifo_common(c, kern_gpfifo, num_entries);
307 }
308
309 trace_write_pushbuffers(c, num_entries);
310
311 c->gpfifo.put = (c->gpfifo.put + num_entries) &
312 (c->gpfifo.entry_num - 1U);
313
314 return 0;
315 }
nvgpu/os/linux/channel.c
376 static int nvgpu_channel_copy_user_gpfifo(struct nvgpu_gpfifo_entry *dest,
377 struct nvgpu_gpfifo_userdata userdata, u32 start, u32 length)
378 {
379 struct nvgpu_gpfifo_entry __user *user_gpfifo = userdata.entries;
380 unsigned long n;
381
382 n = copy_from_user(dest, user_gpfifo + start,
383 length * sizeof(struct nvgpu_gpfifo_entry));
384
385 return n == 0 ? 0 : -EFAULT;
386 }
nvgpu/gk20a/fifo_gk20a.c
4416 void gk20a_fifo_userd_gp_put(struct gk20a *g, struct channel_gk20a *c)
4417 {
4418 gk20a_bar1_writel(g,
4419 c->userd_gpu_va + sizeof(u32) * ram_userd_gp_put_w(),
4420 c->gpfifo.put);
4421 }
- Finally, g->ops.fifo.userd_gp_put(g, c) publishes the new put pointer to the GPU by writing it into the channel's USERD area through BAR1 (gk20a_bar1_writel()).
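Conceptually this is a doorbell write: the ring contents must be visible before the device observes the new put. A minimal C11 sketch of that pattern, where userd_gp_put stands in for the BAR1-mapped USERD word (not the real gk20a register layout, which also applies the byte offset shown above):

#include <stdatomic.h>
#include <stdint.h>

/* Publish the new put value after all ring-buffer stores are ordered. */
static void publish_put(volatile uint32_t *userd_gp_put, uint32_t new_put)
{
	/* order the earlier gpfifo/priv-cmd stores before the doorbell write;
	 * in the kernel, writel() on the BAR1 mapping provides this ordering */
	atomic_thread_fence(memory_order_release);

	*userd_gp_put = new_put;	/* the GPU now fetches entries up to new_put */
}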
2. Interrupt
nvidia/drivers/video/tegra/host/host1x/host1x_intr.c
373 static int t20_intr_init(struct nvhost_intr *intr)
374 {
375 struct nvhost_master *dev = intr_to_dev(intr);
376 int err;
377
378 intr_op().disable_all_syncpt_intrs(intr);
379
380 err = request_threaded_irq(intr->syncpt_irq, NULL,
381 syncpt_thresh_cascade_isr,
382 IRQF_ONESHOT, "host_syncpt", dev);
383 if (err)
384 return err;
385
386 /* master disable for general (not syncpt) host interrupts */
387 host1x_sync_writel(dev, host1x_sync_intmask_r(), 0);
388
389 /* clear status & extstatus */
390 host1x_sync_writel(dev, host1x_sync_hintstatus_ext_r(),
391 0xfffffffful);
392 host1x_sync_writel(dev, host1x_sync_hintstatus_r(),
393 0xfffffffful);
394
395 err = request_threaded_irq(intr->general_irq, NULL,
396 t20_intr_host1x_isr,
397 IRQF_ONESHOT, "host_status", intr);
398 if (err) {
399 free_irq(intr->syncpt_irq, dev);
400 return err;
401 }
402
403 return 0;
404 }
39 static irqreturn_t syncpt_thresh_cascade_isr(int irq, void *dev_id)
40 {
41 struct nvhost_master *dev = dev_id;
42 struct nvhost_intr *intr = &dev->intr;
43 unsigned long reg;
44 int i, id;
45 struct nvhost_timespec isr_recv;
46
47 nvhost_ktime_get_ts(&isr_recv);
48
49 for (i = 0; i < DIV_ROUND_UP(nvhost_syncpt_nb_hw_pts(&dev->syncpt), 32);
50 i++) {
51 reg = host1x_sync_readl(dev,
52 host1x_sync_syncpt_thresh_cpu0_int_status_r() +
53 i * REGISTER_STRIDE);
54
55 for_each_set_bit(id, &reg, 32) {
56 struct nvhost_intr_syncpt *sp;
57 int sp_id = i * 32 + id;
58 int graphics_host_sp =
59 nvhost_syncpt_graphics_host_sp(&dev->syncpt);
60
61 if (unlikely(!nvhost_syncpt_is_valid_hw_pt(&dev->syncpt,
62 sp_id))) {
63 dev_err(&dev->dev->dev, "%s(): syncpoint id %d is beyond the number of syncpoints (%d)\n",
64 __func__, sp_id,
65 nvhost_syncpt_nb_hw_pts(&dev->syncpt));
66 goto out;
67 }
68
69 sp = intr->syncpt + sp_id;
70 sp->isr_recv = isr_recv;
71
72 /* handle graphics host syncpoint increments
73 * immediately
74 */
75 if (sp_id == graphics_host_sp) {
76 dev_warn(&dev->dev->dev, "%s(): syncpoint id %d incremented\n",
77 __func__, graphics_host_sp);
78 nvhost_syncpt_patch_check(&dev->syncpt);
79 t20_intr_syncpt_intr_ack(sp, false);
80 } else {
81 t20_intr_syncpt_intr_ack(sp, true);
82 nvhost_syncpt_thresh_fn(sp);
83 }
84 }
85 }
86
87 out:
88 return IRQ_HANDLED;
89 }
Interrupt registration.
- The syncpt interrupt is registered here via request_threaded_irq(). For normal compute work, the ISR ends up calling nvhost_syncpt_thresh_fn(), which handles the syncpt threshold.
- syncpt_thresh_cascade_isr() above is that threaded ISR; the handling is cascaded, i.e. it walks every syncpt status register, acknowledges each pending syncpoint, and dispatches its handler.
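For reference, the general threaded-IRQ registration pattern used here (NULL hard handler plus IRQF_ONESHOT, so all handling runs in the IRQ thread and the line stays masked until it returns) looks roughly like the sketch below; the demo_* names and the handler body are placeholders, not nvhost code.

#include <linux/interrupt.h>

/* Placeholder device context; not an nvhost structure. */
struct demo_dev {
	int syncpt_irq;
};

/* Threaded handler: runs in process context, so it may sleep,
 * read status registers and wake up waiters. */
static irqreturn_t demo_syncpt_thread_fn(int irq, void *dev_id)
{
	struct demo_dev *dev = dev_id;

	(void)dev;	/* read/ack syncpt threshold status, dispatch waiters ... */
	return IRQ_HANDLED;
}

static int demo_intr_init(struct demo_dev *dev)
{
	/* handler == NULL + IRQF_ONESHOT keeps the line masked until the
	 * threaded handler finishes, the same pattern t20_intr_init() uses
	 * for both the syncpt IRQ and the general host1x IRQ */
	return request_threaded_irq(dev->syncpt_irq, NULL,
				    demo_syncpt_thread_fn,
				    IRQF_ONESHOT, "demo_syncpt", dev);
}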
nvidia/drivers/video/tegra/host/nvhost_intr.c
/*** host syncpt interrupt service functions ***/
351 void nvhost_syncpt_thresh_fn(void *dev_id)
352 {
353 struct nvhost_intr_syncpt *syncpt = dev_id;
354 unsigned int id = syncpt->id;
355 struct nvhost_intr *intr = intr_syncpt_to_intr(syncpt);
356 struct nvhost_master *dev = intr_to_dev(intr);
357 int err;
358
359 /* make sure host1x is powered */
360 err = nvhost_module_busy(dev->dev);
361 if (err) {
362 WARN(1, "failed to powerON host1x.");
363 return;
364 }
365
366 if (nvhost_dev_is_virtual(dev->dev))
367 (void)process_wait_list(intr, syncpt,
368 nvhost_syncpt_read_min(&dev->syncpt, id));
369 else
370 (void)process_wait_list(intr, syncpt,
371 nvhost_syncpt_update_min(&dev->syncpt, id));
372
373 nvhost_module_idle(dev->dev);
374 }
- process_wait_list() eventually invokes the callback registered at submit time (here channel_sync_syncpt_update(), which drives the channel update), as shown below.
nvgpu/common/sync/channel_sync.c
163 if (register_irq) {
164 struct channel_gk20a *referenced = gk20a_channel_get(c);
165
166 WARN_ON(!referenced);
167
168 if (referenced) {
169 /* note: channel_put() is in
170 * channel_sync_syncpt_update() */
171
172 err = nvgpu_nvhost_intr_register_notifier(
173 sp->nvhost_dev,
174 sp->id, thresh,
175 channel_sync_syncpt_update, c);
176 if (err != 0) {
177 gk20a_channel_put(referenced);
178 }
179
180 /* Adding interrupt action should
181 * never fail. A proper error handling
182 * here would require us to decrement
183 * the syncpt max back to its original
184 * value. */
185 WARN(err,
186 "failed to set submit complete interrupt");
187 }
188 }
3. SYNCPT (Sync Point)
nvidia/drivers/video/tegra/host/nvhost_syncpt.c
113 /**
114 * Updates the last value read from hardware.
115 */
116 u32 nvhost_syncpt_update_min(struct nvhost_syncpt *sp, u32 id)
117 {
118 u32 val;
119
120 val = syncpt_op().update_min(sp, id);
121 trace_nvhost_syncpt_update_min(id, val);
122
123 return val;
124 }
nvidia/drivers/video/tegra/host/host1x/host1x_syncpt.c
39 /**
40 * Updates the last value read from hardware.
41 * (was nvhost_syncpt_update_min)
42 */
43 static u32 t20_syncpt_update_min(struct nvhost_syncpt *sp, u32 id)
44 {
45 struct nvhost_master *dev = syncpt_to_dev(sp);
46 u32 old, live;
47
48 do {
49 old = nvhost_syncpt_read_min(sp, id);
50 live = host1x_sync_readl(dev,
51 (host1x_sync_syncpt_0_r() + id * 4));
52 } while ((u32)atomic_cmpxchg(&sp->min_val[id], old, live) != old);
53
54 return live;
55 }
nvhost_syncpt_thresh_fn() updates the driver's cached syncpt value (the baseline against which wait thresholds are compared) by reading the live syncpoint counter from the hardware register, i.e. the value the GPU increments as submitted work completes.
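The update_min path is a lock-free "advance a cached value to the latest hardware reading" pattern. Below is a user-space C11 analogue of the cmpxchg loop in t20_syncpt_update_min(), with a simulated counter standing in for the host1x register read:

#include <stdatomic.h>
#include <stdint.h>

/* Simulated hardware counter, standing in for host1x_sync_readl(). */
static _Atomic uint32_t hw_counter;

static uint32_t read_hw_counter(void)
{
	return atomic_load(&hw_counter);
}

/* Advance the cached minimum to the live hardware value without a lock.
 * If another thread updates the cache concurrently, the compare-exchange
 * fails and the loop re-reads both values and retries. */
static uint32_t update_min(_Atomic uint32_t *cached_min)
{
	uint32_t old, live;

	do {
		old = atomic_load(cached_min);
		live = read_hw_counter();
	} while (!atomic_compare_exchange_weak(cached_min, &old, live));

	return live;
}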
nvidia/drivers/video/tegra/host/nvhost_intr.c
260 /**
261 * Remove & handle all waiters that have completed for the given syncpt
262 */
263 static int process_wait_list(struct nvhost_intr *intr,
264 struct nvhost_intr_syncpt *syncpt,
265 u32 threshold)
266 {
267 struct list_head *completed[NVHOST_INTR_ACTION_COUNT] = {NULL};
268 struct list_head high_prio_handlers[NVHOST_INTR_HIGH_PRIO_COUNT];
269 bool run_low_prio_work = false;
270 unsigned int i, j;
271 int empty;
272
273 /* take lock on waiter list */
274 spin_lock(&syncpt->lock);
275
276 /* keep high priority workers in local list */
277 for (i = 0; i < NVHOST_INTR_HIGH_PRIO_COUNT; ++i) {
278 INIT_LIST_HEAD(high_prio_handlers + i);
279 completed[i] = high_prio_handlers + i;
280 }
281
282 /* .. and low priority workers in global list */
283 for (j = 0; i < NVHOST_INTR_ACTION_COUNT; ++i, ++j)
284 completed[i] = syncpt->low_prio_handlers + j;
285
286 /* this functions fills completed data */
287 remove_completed_waiters(&syncpt->wait_head, threshold,
288 syncpt->isr_recv, completed);
289
290 /* check if there are still waiters left */
291 empty = list_empty(&syncpt->wait_head);
292
293 /* if not, disable interrupt. If yes, update the interrupt */
294 if (empty)
295 intr_op().disable_syncpt_intr(intr, syncpt->id);
296 else
297 reset_threshold_interrupt(intr, &syncpt->wait_head,
298 syncpt->id);
299
300 /* remove low priority handlers from this list */
301 for (i = NVHOST_INTR_HIGH_PRIO_COUNT;
302 i < NVHOST_INTR_ACTION_COUNT; ++i) {
303 if (!list_empty(completed[i]))
304 run_low_prio_work = true;
305 completed[i] = NULL;
306 }
307
308 /* release waiter lock */
309 spin_unlock(&syncpt->lock);
310
311 run_handlers(completed);
312
313 /* schedule a separate task to handle low priority handlers */
314 if (run_low_prio_work)
315 queue_work(intr->low_prio_wq, &syncpt->low_prio_work);
316
317 return empty;
318 }
process_wait_list() removes the waiters whose threshold has been reached by the syncpt value read from the hardware (as discussed above) and runs the corresponding handlers; if no waiters remain, the threshold interrupt is disabled, otherwise it is re-armed for the next pending threshold.
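A stripped-down model of that completion handling, ignoring the high/low-priority split, the spinlock, and the interrupt re-arming of the real code; the waiter structure, reached(), and process_waiters() are hypothetical, and the wrap-safe comparison assumes waiters are never more than 2^31 increments ahead:

#include <stdint.h>
#include <stdlib.h>

/* Hypothetical waiter: completes once the syncpt reaches `thresh`. */
struct waiter {
	uint32_t thresh;
	void (*action)(void *data);	/* e.g. the submit-complete callback */
	void *data;
	struct waiter *next;
};

/* Wrap-safe "has the counter reached the threshold" test. */
static int reached(uint32_t value, uint32_t thresh)
{
	return (int32_t)(value - thresh) >= 0;
}

/* Unlink every completed waiter and run its action, like process_wait_list(). */
static void process_waiters(struct waiter **head, uint32_t value)
{
	struct waiter **pp = head;

	while (*pp) {
		struct waiter *w = *pp;

		if (reached(value, w->thresh)) {
			*pp = w->next;		/* remove from the wait list */
			w->action(w->data);	/* run the registered handler */
			free(w);
		} else {
			pp = &w->next;
		}
	}
}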