Explore Jetson Nano GPU Driver

8 minute read

Analyze the Nvidia Jetson Nano GPU device driver (nvgpu/nvhost) code to understand how a job is submitted and how it interacts with the IRQ path.

1. Job Submission

nvgpu/include/nvgpu/channel.h

114 struct priv_cmd_queue {
115     struct nvgpu_mem mem;
116     u32 size;   /* num of entries in words */
117     u32 put;    /* put for priv cmd queue */
118     u32 get;    /* get for priv cmd queue */
119 };
120 
121 struct priv_cmd_entry {
122     bool valid;
123     struct nvgpu_mem *mem;
124     u32 off;    /* offset in mem, in u32 entries */
125     u64 gva;
126     u32 get;    /* start of entry in queue */
127     u32 size;   /* in words */
128 };
129 
130 struct channel_gk20a_job {
131     struct nvgpu_mapped_buf **mapped_buffers;
132     int num_mapped_buffers;
133     struct gk20a_fence *post_fence;
134     struct priv_cmd_entry *wait_cmd;
135     struct priv_cmd_entry *incr_cmd;
136     struct nvgpu_list_node list;
137 };
  • Job and command structures used in kernel space: a channel_gk20a_job ties together the mapped buffers, the post fence, and the wait_cmd/incr_cmd private command entries for one submission.
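To make the relationship between these structs concrete, below is a minimal user-space sketch (not driver code; all toy_* names are made up, and free-space/wrap checks are omitted) of a priv_cmd_entry-style descriptor carving a window out of the queue's backing storage while put advances:

#include <stdint.h>
#include <stdio.h>

struct toy_mem { uint32_t words[256]; };   /* stand-in for nvgpu_mem */

struct toy_cmd_queue {
    struct toy_mem mem;
    uint32_t size;   /* num of entries in words */
    uint32_t put;
    uint32_t get;
};

struct toy_cmd_entry {
    struct toy_mem *mem;
    uint32_t off;    /* offset in mem, in u32 entries */
    uint32_t size;   /* in words */
};

/* Reserve `size` words starting at the current put index. */
static void toy_alloc_entry(struct toy_cmd_queue *q, uint32_t size,
                            struct toy_cmd_entry *e)
{
    e->mem = &q->mem;
    e->off = q->put;
    e->size = size;
    q->put = (q->put + size) % q->size;   /* advance the producer index */
}

int main(void)
{
    struct toy_cmd_queue q = { .size = 256 };
    struct toy_cmd_entry wait_cmd, incr_cmd;

    toy_alloc_entry(&q, 4, &wait_cmd);    /* e.g. "wait for syncpt" methods */
    toy_alloc_entry(&q, 6, &incr_cmd);    /* e.g. "increment syncpt" methods */

    printf("wait off=%u size=%u, incr off=%u size=%u, put=%u\n",
           wait_cmd.off, wait_cmd.size, incr_cmd.off, incr_cmd.size, q.put);
    return 0;
}

In the real driver the queue memory is GPU-accessible (hence the gva field), and the resulting wait_cmd/incr_cmd entries are what channel_gk20a_job records alongside the post fence.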

nvgpu/common/submit.c

317 static int nvgpu_submit_channel_gpfifo(struct channel_gk20a *c,
318                 struct nvgpu_gpfifo_entry *gpfifo,
319                 struct nvgpu_gpfifo_userdata userdata,
320                 u32 num_entries,
321                 u32 flags,
322                 struct nvgpu_channel_fence *fence,
323                 struct gk20a_fence **fence_out,
324                 struct fifo_profile_gk20a *profile)
325 {
	...
537     if (wait_cmd) {
538         nvgpu_submit_append_priv_cmdbuf(c, wait_cmd);
539     }
540 
541     err = nvgpu_submit_append_gpfifo(c, gpfifo, userdata,
542             num_entries);
543     if (err) {
544         goto clean_up_job;
545     }
546 
547     /*
548      * And here's where we add the incr_cmd we generated earlier. It should
549      * always run!
550      */
551     if (incr_cmd) {
552         nvgpu_submit_append_priv_cmdbuf(c, incr_cmd);
553     }
554 
555     if (fence_out) {
556         *fence_out = gk20a_fence_get(post_fence);
557     }
558 
559     if (need_job_tracking) {
560         /* TODO! Check for errors... */
561         gk20a_channel_add_job(c, job, skip_buffer_refcounting);
562     }
563     gk20a_fifo_profile_snapshot(profile, PROFILE_APPEND);
565     g->ops.fifo.userd_gp_put(g, c);
	...
599 }

Add commands to the ring buffer

First, the driver appends gpfifo entries into the shared memory (ring buffer): the entries coming from user space are copied into the ring buffer. Note that wait_cmd and/or incr_cmd are appended before and after the actual commands, respectively. Each time entries are appended, the driver advances the put pointer (by the number of user entries, plus extra entries for wait_cmd and/or incr_cmd when they are appended).
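As a rough illustration of that bookkeeping (plain user-space C; the 512-entry size is made up, and it assumes wait_cmd and incr_cmd each occupy a single gpfifo entry), the put index advances exactly like the masked update at the end of nvgpu_submit_append_gpfifo shown further below:

#include <assert.h>
#include <stdint.h>

#define ENTRY_NUM 512u   /* hypothetical gpfifo size; a power of two */

/* Same arithmetic as the c->gpfifo.put update in nvgpu_submit_append_gpfifo. */
static uint32_t advance_put(uint32_t put, uint32_t num_entries)
{
    return (put + num_entries) & (ENTRY_NUM - 1u);
}

int main(void)
{
    uint32_t put = 510u;          /* near the end, so the update wraps */

    put = advance_put(put, 1u);   /* wait_cmd entry */
    put = advance_put(put, 3u);   /* three user gpfifo entries */
    put = advance_put(put, 1u);   /* incr_cmd entry */

    assert(put == 3u);            /* (510 + 5) mod 512 == 3 */
    return 0;
}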

  • Copy gpfifo entries from user space directly into gpfifo.mem in kernel space (through its cpu_va mapping).
203 static int nvgpu_submit_append_gpfifo_user_direct(struct channel_gk20a *c,
204         struct nvgpu_gpfifo_userdata userdata,
205         u32 num_entries)
206 {
207     struct gk20a *g = c->g;
208     struct nvgpu_gpfifo_entry *gpfifo_cpu = c->gpfifo.mem.cpu_va;
209     u32 gpfifo_size = c->gpfifo.entry_num;
210     u32 len = num_entries;
211     u32 start = c->gpfifo.put;
212     u32 end = start + len; /* exclusive */
213     int err;
214 
215     if (end > gpfifo_size) {
216         /* wrap-around */
217         int length0 = gpfifo_size - start;
218         int length1 = len - length0;
219 
220         err = g->os_channel.copy_user_gpfifo(
221                 gpfifo_cpu + start, userdata,
222                 0, length0);
223         if (err) {
224             return err;
225         }
226 
227         err = g->os_channel.copy_user_gpfifo(
228                 gpfifo_cpu, userdata,
229                 length0, length1);
230         if (err) {
231             return err;
232         }
233     } else {
234         err = g->os_channel.copy_user_gpfifo(
235                 gpfifo_cpu + start, userdata,
236                 0, len);
237         if (err) {
238             return err;
239         }
240     }
241 
242     return 0;
243 }
  • gpfifo_size is the channel's total gpfifo size in entries. If the new entries would run past the end of the ring buffer, the copy wraps around and is split in two.
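A worked example of that split, with made-up numbers (a 512-entry gpfifo, put currently at 508, 8 incoming entries):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint32_t gpfifo_size = 512u;
    uint32_t start = 508u;                  /* current put index */
    uint32_t len = 8u;
    uint32_t end = start + len;             /* exclusive */

    assert(end > gpfifo_size);              /* copy must wrap around */

    uint32_t length0 = gpfifo_size - start; /* 4 entries at indices 508..511 */
    uint32_t length1 = len - length0;       /* 4 entries at indices 0..3 */

    assert(length0 == 4u && length1 == 4u);
    return 0;
}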
270 /*
271  * Copy source gpfifo entries into the gpfifo ring buffer, potentially
272  * splitting into two memcpys to handle wrap-around.
273  */
274 static int nvgpu_submit_append_gpfifo(struct channel_gk20a *c,
275         struct nvgpu_gpfifo_entry *kern_gpfifo,
276         struct nvgpu_gpfifo_userdata userdata,
277         u32 num_entries)
278 {
279     struct gk20a *g = c->g;
280     int err;
281 
282     if (!kern_gpfifo && !c->gpfifo.pipe) {
283         /*
284          * This path (from userspace to sysmem) is special in order to
285          * avoid two copies unnecessarily (from user to pipe, then from
286          * pipe to gpu sysmem buffer).
287          */
288         err = nvgpu_submit_append_gpfifo_user_direct(c, userdata,
289                 num_entries);
290         if (err) {
291             return err;
292         }
293     } else if (!kern_gpfifo) {
294         /* from userspace to vidmem, use the common path */
295         err = g->os_channel.copy_user_gpfifo(c->gpfifo.pipe, userdata,
296                 0, num_entries);
297         if (err) {
298             return err;
299         }
300 
301         nvgpu_submit_append_gpfifo_common(c, c->gpfifo.pipe,
302                 num_entries);
303     } else {                                                                                                                                              
304         /* from kernel to either sysmem or vidmem, don't need
305          * copy_user_gpfifo so use the common path */
306         nvgpu_submit_append_gpfifo_common(c, kern_gpfifo, num_entries);
307     }
308 
309     trace_write_pushbuffers(c, num_entries);
310 
311     c->gpfifo.put = (c->gpfifo.put + num_entries) &
312         (c->gpfifo.entry_num - 1U);
313 
314     return 0;
315 }

nvgpu/os/linux/channel.c

376 static int nvgpu_channel_copy_user_gpfifo(struct nvgpu_gpfifo_entry *dest,
377         struct nvgpu_gpfifo_userdata userdata, u32 start, u32 length)
378 {
379     struct nvgpu_gpfifo_entry __user *user_gpfifo = userdata.entries;
380     unsigned long n;
381 
382     n = copy_from_user(dest, user_gpfifo + start,
383             length * sizeof(struct nvgpu_gpfifo_entry));
384 
385     return n == 0 ? 0 : -EFAULT;
386 }

nvgpu/gk20a/fifo_gk20a.c

4416 void gk20a_fifo_userd_gp_put(struct gk20a *g, struct channel_gk20a *c)
4417 {
4418     gk20a_bar1_writel(g,
4419         c->userd_gpu_va + sizeof(u32) * ram_userd_gp_put_w(),
4420         c->gpfifo.put);
4421 }    
  • Finally, g->ops.fifo.userd_gp_put(g, c) is used to publish the updated put pointer to the GPU, via a BAR1 write into the channel's USERD area.
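Conceptually this is a doorbell: the CPU writes the new put value through BAR1 into the channel's USERD words, where the GPU's host hardware picks it up. The toy below only models the idea, with an ordinary struct standing in for the BAR1 mapping (the field layout is illustrative, not the real USERD layout):

#include <stdint.h>
#include <stdio.h>

struct toy_userd {
    volatile uint32_t gp_get;   /* consumer index, advanced by the GPU */
    volatile uint32_t gp_put;   /* producer index, published by the CPU */
};

/* Mirrors gk20a_fifo_userd_gp_put(): in the driver this is a BAR1 write at
 * userd_gpu_va + sizeof(u32) * ram_userd_gp_put_w(); here it is plain memory. */
static void toy_userd_gp_put(struct toy_userd *userd, uint32_t sw_put)
{
    userd->gp_put = sw_put;
}

int main(void)
{
    struct toy_userd userd = { 0u, 0u };

    toy_userd_gp_put(&userd, 3u);   /* publish the software-side put index */
    printf("gp_put = %u\n", userd.gp_put);
    return 0;
}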

2. Interrupt

nvidia/drivers/video/tegra/host/host1x/host1x_intr.c

373 static int t20_intr_init(struct nvhost_intr *intr)
374 {
375     struct nvhost_master *dev = intr_to_dev(intr);
376     int err;
377 
378     intr_op().disable_all_syncpt_intrs(intr);
379 
380     err = request_threaded_irq(intr->syncpt_irq, NULL,
381                 syncpt_thresh_cascade_isr,
382                 IRQF_ONESHOT, "host_syncpt", dev);
383     if (err)
384         return err;
385 
386     /* master disable for general (not syncpt) host interrupts */
387     host1x_sync_writel(dev, host1x_sync_intmask_r(), 0);
388 
389     /* clear status & extstatus */
390     host1x_sync_writel(dev, host1x_sync_hintstatus_ext_r(),
391             0xfffffffful);
392     host1x_sync_writel(dev, host1x_sync_hintstatus_r(),
393             0xfffffffful);
394 
395     err = request_threaded_irq(intr->general_irq, NULL,
396                 t20_intr_host1x_isr,
397                 IRQF_ONESHOT, "host_status", intr);
398     if (err) {
399         free_irq(intr->syncpt_irq, dev);
400         return err;
401     }
402 
403     return 0;
404 }
 39 static irqreturn_t syncpt_thresh_cascade_isr(int irq, void *dev_id)
 40 {
 41     struct nvhost_master *dev = dev_id;
 42     struct nvhost_intr *intr = &dev->intr;
 43     unsigned long reg;
 44     int i, id;
 45     struct nvhost_timespec isr_recv;
 46 
 47     nvhost_ktime_get_ts(&isr_recv);
 48 
 49     for (i = 0; i < DIV_ROUND_UP(nvhost_syncpt_nb_hw_pts(&dev->syncpt), 32);
 50             i++) {
 51         reg = host1x_sync_readl(dev,
 52                 host1x_sync_syncpt_thresh_cpu0_int_status_r() +
 53                 i * REGISTER_STRIDE);
 54 
 55         for_each_set_bit(id, &reg, 32) {
 56             struct nvhost_intr_syncpt *sp;
 57             int sp_id = i * 32 + id;
 58             int graphics_host_sp =
 59                 nvhost_syncpt_graphics_host_sp(&dev->syncpt);
 60 
 61             if (unlikely(!nvhost_syncpt_is_valid_hw_pt(&dev->syncpt,
 62                     sp_id))) {
 63                 dev_err(&dev->dev->dev, "%s(): syncpoint id %d is beyond the number of syncpoints (%d)\n",
 64                     __func__, sp_id,
 65                     nvhost_syncpt_nb_hw_pts(&dev->syncpt));
 66                 goto out;
 67             }
 68 
 69             sp = intr->syncpt + sp_id;
 70             sp->isr_recv = isr_recv;
 71 
 72             /* handle graphics host syncpoint increments                                                                                                  
 73              * immediately
 74              */
 75             if (sp_id == graphics_host_sp) {
 76                 dev_warn(&dev->dev->dev, "%s(): syncpoint id %d incremented\n",
 77                      __func__, graphics_host_sp);
 78                 nvhost_syncpt_patch_check(&dev->syncpt);
 79                 t20_intr_syncpt_intr_ack(sp, false);
 80             } else {
 81                 t20_intr_syncpt_intr_ack(sp, true);
 82                 nvhost_syncpt_thresh_fn(sp);
 83             }
 84         }
 85     }
 86 
 87 out:
 88     return IRQ_HANDLED;
 89 }

Interrupt registration.

  • The interrupts are registered here: the syncpt interrupt gets syncpt_thresh_cascade_isr as a threaded handler, and the general host1x interrupt gets t20_intr_host1x_isr. For ordinary compute work, the ISR ends up invoking nvhost_syncpt_thresh_fn(), which handles the syncpt.
  • The code above is the syncpt ISR; the handling is cascaded, i.e. one interrupt line fans out to every hardware syncpoint whose status bit is set. A minimal registration sketch follows below.
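For reference, the request_threaded_irq() pattern used above (a NULL hard handler plus IRQF_ONESHOT, so the real work runs in a kernel thread with the line masked) looks roughly like this as a self-contained module sketch; the IRQ number and all demo_* names are hypothetical:

#include <linux/interrupt.h>
#include <linux/module.h>

/* Hypothetical IRQ number; the real driver gets intr->syncpt_irq from the
 * platform device. */
static int demo_irq = 65;

/* Threaded handler: runs in process context, the same way
 * syncpt_thresh_cascade_isr runs for the "host_syncpt" interrupt. */
static irqreturn_t demo_thresh_isr(int irq, void *dev_id)
{
    pr_info("demo: threaded syncpt-style handler for irq %d\n", irq);
    return IRQ_HANDLED;
}

static int __init demo_init(void)
{
    /* NULL hard handler + IRQF_ONESHOT: the default hard handler only wakes
     * the thread, and the interrupt line stays masked until it returns. */
    return request_threaded_irq(demo_irq, NULL, demo_thresh_isr,
                                IRQF_ONESHOT, "demo_syncpt", &demo_irq);
}

static void __exit demo_exit(void)
{
    free_irq(demo_irq, &demo_irq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");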

nvidia/drivers/video/tegra/host/nvhost_intr.c

 /*** host syncpt interrupt service functions ***/
351 void nvhost_syncpt_thresh_fn(void *dev_id)
352 {    
353     struct nvhost_intr_syncpt *syncpt = dev_id;
354     unsigned int id = syncpt->id;
355     struct nvhost_intr *intr = intr_syncpt_to_intr(syncpt);
356     struct nvhost_master *dev = intr_to_dev(intr);
357     int err;                                                                                                                                              
358 
359     /* make sure host1x is powered */
360     err = nvhost_module_busy(dev->dev);
361     if (err) {
362         WARN(1, "failed to powerON host1x.");
363         return;
364     }
365 
366     if (nvhost_dev_is_virtual(dev->dev))
367         (void)process_wait_list(intr, syncpt,
368                 nvhost_syncpt_read_min(&dev->syncpt, id));
369     else
370         (void)process_wait_list(intr, syncpt,
371                 nvhost_syncpt_update_min(&dev->syncpt, id));
372 
373     nvhost_module_idle(dev->dev);
374 }                 
  • process_wait_list() eventually invokes the callback, i.e. the work (e.g. the channel update function) that was registered when the gpfifo was submitted, as shown below; a toy sketch of the pattern follows the excerpt.

nvgpu/common/sync/channel_sync.c

163     if (register_irq) {
164         struct channel_gk20a *referenced = gk20a_channel_get(c);
165 
166         WARN_ON(!referenced);
167 
168         if (referenced) {
169             /* note: channel_put() is in
170              * channel_sync_syncpt_update() */
171 
172             err = nvgpu_nvhost_intr_register_notifier(
173                 sp->nvhost_dev,                                 
174                 sp->id, thresh,
175                 channel_sync_syncpt_update, c);
176             if (err != 0) {
177                 gk20a_channel_put(referenced);
178             }
179 
180             /* Adding interrupt action should
181              * never fail. A proper error handling
182              * here would require us to decrement
183              * the syncpt max back to its original
184              * value. */
185             WARN(err,
186                  "failed to set submit complete interrupt");
187         }
188     }
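Putting the two halves together, here is a toy user-space model (all names made up; the real path goes through nvgpu_nvhost_intr_register_notifier() and the nvhost waiter list) of the register-then-fire pattern: a callback is stored against a syncpt threshold at submit time and invoked later when the interrupt path sees that the threshold has been reached:

#include <stdint.h>
#include <stdio.h>

typedef void (*notifier_fn)(void *priv);

struct toy_waiter {
    uint32_t thresh;     /* syncpt value the job's incr_cmd will produce */
    notifier_fn fn;      /* e.g. channel_sync_syncpt_update in the driver */
    void *priv;
    int pending;
};

/* Called from the (toy) interrupt path once the syncpt value is known. */
static void toy_process_wait_list(struct toy_waiter *w, uint32_t syncpt_value)
{
    if (w->pending && (int32_t)(syncpt_value - w->thresh) >= 0) {
        w->pending = 0;
        w->fn(w->priv);
    }
}

static void toy_channel_update(void *priv)
{
    printf("job on %s completed\n", (const char *)priv);
}

int main(void)
{
    struct toy_waiter w = {
        .thresh = 42u, .fn = toy_channel_update,
        .priv = "gpu-channel-0", .pending = 1,
    };

    toy_process_wait_list(&w, 41u);   /* threshold not reached: no callback */
    toy_process_wait_list(&w, 42u);   /* threshold reached: callback fires */
    return 0;
}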

3. SYNCPT (Sync Point)

nvidia/drivers/video/tegra/host/nvhost_syncpt.c

 113 /**
 114  * Updates the last value read from hardware.
 115  */
 116 u32 nvhost_syncpt_update_min(struct nvhost_syncpt *sp, u32 id)
 117 {
 118     u32 val;
 119 
 120     val = syncpt_op().update_min(sp, id);
 121     trace_nvhost_syncpt_update_min(id, val);
 122 
 123     return val;
 124 }

nvidia/drivers/video/tegra/host/host1x/host1x_syncpt.c

39 /**
 40  * Updates the last value read from hardware.
 41  * (was nvhost_syncpt_update_min)
 42  */
 43 static u32 t20_syncpt_update_min(struct nvhost_syncpt *sp, u32 id)
 44 {
 45     struct nvhost_master *dev = syncpt_to_dev(sp);
 46     u32 old, live;
 47 
 48     do {
 49         old = nvhost_syncpt_read_min(sp, id);
 50         live = host1x_sync_readl(dev,
 51                 (host1x_sync_syncpt_0_r() + id * 4));
 52     } while ((u32)atomic_cmpxchg(&sp->min_val[id], old, live) != old);
 53 
 54     return live;
 55 }

nvhost_syncpt_thresh_fn() updates the cached syncpt value (the threshold used for synchronization) by reading the min value from the hardware; the min value is presumably the latest count the GPU side has reached, playing a role similar to a get pointer.
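The update itself is a lock-free compare-and-swap loop. A user-space sketch of the same idea using C11 atomics (hw_read() and its value are stand-ins, not the real register access):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint32_t min_val;   /* cached last value read from "hardware" */
static uint32_t hw_counter = 7u;   /* pretend MMIO syncpt register */

static uint32_t hw_read(void)
{
    return hw_counter;
}

/* Same shape as t20_syncpt_update_min(): re-read and retry if another
 * thread updated the cached value in the meantime. */
static uint32_t toy_update_min(void)
{
    uint32_t old, live;

    do {
        old = atomic_load(&min_val);
        live = hw_read();
    } while (!atomic_compare_exchange_strong(&min_val, &old, live));

    return live;
}

int main(void)
{
    printf("min updated to %u\n", toy_update_min());
    return 0;
}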

nvidia/drivers/video/tegra/host/nvhost_intr.c

260 /**
261  * Remove & handle all waiters that have completed for the given syncpt
262  */
263 static int process_wait_list(struct nvhost_intr *intr,
264                  struct nvhost_intr_syncpt *syncpt,
265                  u32 threshold)
266 {
267     struct list_head *completed[NVHOST_INTR_ACTION_COUNT] = {NULL};
268     struct list_head high_prio_handlers[NVHOST_INTR_HIGH_PRIO_COUNT];
269     bool run_low_prio_work = false;
270     unsigned int i, j;
271     int empty;
272 
273     /* take lock on waiter list */
274     spin_lock(&syncpt->lock);
275 
276     /* keep high priority workers in local list */
277     for (i = 0; i < NVHOST_INTR_HIGH_PRIO_COUNT; ++i) {
278         INIT_LIST_HEAD(high_prio_handlers + i);
279         completed[i] = high_prio_handlers + i;
280     }
281 
282     /* .. and low priority workers in global list */
283     for (j = 0; i < NVHOST_INTR_ACTION_COUNT; ++i, ++j)
284         completed[i] = syncpt->low_prio_handlers + j;
285 
286     /* this functions fills completed data */
287     remove_completed_waiters(&syncpt->wait_head, threshold,
288         syncpt->isr_recv, completed);
289 
290     /* check if there are still waiters left */
291     empty = list_empty(&syncpt->wait_head);
292 
293     /* if not, disable interrupt. If yes, update the inetrrupt */
294     if (empty)
295         intr_op().disable_syncpt_intr(intr, syncpt->id);
296     else
297         reset_threshold_interrupt(intr, &syncpt->wait_head,
298                       syncpt->id);
299 
300     /* remove low priority handlers from this list */
301     for (i = NVHOST_INTR_HIGH_PRIO_COUNT;
302          i < NVHOST_INTR_ACTION_COUNT; ++i) {
303         if (!list_empty(completed[i]))
304             run_low_prio_work = true;
305         completed[i] = NULL;
306     }
307 
308     /* release waiter lock */
309     spin_unlock(&syncpt->lock);
310 
311     run_handlers(completed);
312 
313     /* schedule a separate task to handle low priority handlers */
314     if (run_low_prio_work)
315         queue_work(intr->low_prio_wq, &syncpt->low_prio_work);
316 
317     return empty;
318 }

process_wait_list() picks the waiters whose thresholds are at or below the syncpt value read from the GPU (passed in as threshold here, as discussed above), runs the corresponding handlers, and either disables the threshold interrupt or re-arms it depending on whether any waiters remain.
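As a final toy illustration (an array instead of the driver's waiter list; all names made up), the sweep boils down to comparing each waiter's threshold against the value just read from the hardware and deciding whether anything is left to wait for:

#include <stdint.h>
#include <stdio.h>

struct toy_waiter {
    uint32_t thresh;
    const char *name;
};

/* Wrap-safe "syncpt value has reached thresh" comparison. */
static int waiter_completed(uint32_t value, uint32_t thresh)
{
    return (int32_t)(value - thresh) >= 0;
}

int main(void)
{
    struct toy_waiter waiters[] = {
        { .thresh = 10u, .name = "job A" },
        { .thresh = 12u, .name = "job B" },
        { .thresh = 15u, .name = "job C" },
    };
    uint32_t syncpt_value = 12u;   /* what the hardware read returned */
    unsigned int i, remaining = 0;

    for (i = 0; i < sizeof(waiters) / sizeof(waiters[0]); i++) {
        if (waiter_completed(syncpt_value, waiters[i].thresh))
            printf("%s completed, run its handler\n", waiters[i].name);
        else
            remaining++;           /* keep waiting: re-arm the interrupt */
    }

    printf("%u waiter(s) still pending\n", remaining);
    return 0;
}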