Explore Jetson Nano GPU Driver

8 minute read

Analyze the Nvidia Jetson Nano GPU device driver (nvgpu/nvhost) code to understand how a job is submitted and how it interacts with the IRQ path.

1. Job Submission

nvgpu/include/nvgpu/channel.h

114 struct priv_cmd_queue {
115     struct nvgpu_mem mem;
116     u32 size;   /* num of entries in words */
117     u32 put;    /* put for priv cmd queue */
118     u32 get;    /* get for priv cmd queue */
119 };
120 
121 struct priv_cmd_entry {
122     bool valid;
123     struct nvgpu_mem *mem;
124     u32 off;    /* offset in mem, in u32 entries */
125     u64 gva;
126     u32 get;    /* start of entry in queue */
127     u32 size;   /* in words */
128 };
129 
130 struct channel_gk20a_job {
131     struct nvgpu_mapped_buf **mapped_buffers;
132     int num_mapped_buffers;
133     struct gk20a_fence *post_fence;
134     struct priv_cmd_entry *wait_cmd;
135     struct priv_cmd_entry *incr_cmd;
136     struct nvgpu_list_node list;
137 };
  • Job and command structures used in kernel space: a channel_gk20a_job ties together the mapped buffers, the post fence, and the wait_cmd/incr_cmd private command entries for one submission.
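To make the relationship between these structs concrete, below is a minimal user-space sketch (not driver code; all toy_* names are made up, and free-space/wrap checks are omitted) of a priv_cmd_entry-style descriptor carving a window out of the queue's backing storage while put advances:

#include <stdint.h>
#include <stdio.h>

struct toy_mem { uint32_t words[256]; };   /* stand-in for nvgpu_mem */

struct toy_cmd_queue {
    struct toy_mem mem;
    uint32_t size;   /* num of entries in words */
    uint32_t put;
    uint32_t get;
};

struct toy_cmd_entry {
    struct toy_mem *mem;
    uint32_t off;    /* offset in mem, in u32 entries */
    uint32_t size;   /* in words */
};

/* Reserve `size` words starting at the current put index. */
static void toy_alloc_entry(struct toy_cmd_queue *q, uint32_t size,
                            struct toy_cmd_entry *e)
{
    e->mem = &q->mem;
    e->off = q->put;
    e->size = size;
    q->put = (q->put + size) % q->size;   /* advance the producer index */
}

int main(void)
{
    struct toy_cmd_queue q = { .size = 256 };
    struct toy_cmd_entry wait_cmd, incr_cmd;

    toy_alloc_entry(&q, 4, &wait_cmd);    /* e.g. "wait for syncpt" methods */
    toy_alloc_entry(&q, 6, &incr_cmd);    /* e.g. "increment syncpt" methods */

    printf("wait off=%u size=%u, incr off=%u size=%u, put=%u\n",
           wait_cmd.off, wait_cmd.size, incr_cmd.off, incr_cmd.size, q.put);
    return 0;
}

In the real driver the queue memory is GPU-accessible (hence the gva field), and the resulting wait_cmd/incr_cmd entries are what channel_gk20a_job records alongside the post fence.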

nvgpu/common/submit.c

317 static int nvgpu_submit_channel_gpfifo(struct channel_gk20a *c,
318                 struct nvgpu_gpfifo_entry *gpfifo,
319                 struct nvgpu_gpfifo_userdata userdata,
320                 u32 num_entries,
321                 u32 flags,
322                 struct nvgpu_channel_fence *fence,
323                 struct gk20a_fence **fence_out,
324                 struct fifo_profile_gk20a *profile)
325 {
	...
537     if (wait_cmd) {
538         nvgpu_submit_append_priv_cmdbuf(c, wait_cmd);
539     }
540 
541     err = nvgpu_submit_append_gpfifo(c, gpfifo, userdata,
542             num_entries);
543     if (err) {
544         goto clean_up_job;
545     }
546 
547     /*
548      * And here's where we add the incr_cmd we generated earlier. It should
549      * always run!
550      */
551     if (incr_cmd) {
552         nvgpu_submit_append_priv_cmdbuf(c, incr_cmd);
553     }
554 
555     if (fence_out) {
556         *fence_out = gk20a_fence_get(post_fence);
557     }
558 
559     if (need_job_tracking) {
560         /* TODO! Check for errors... */
561         gk20a_channel_add_job(c, job, skip_buffer_refcounting);
562     }
563     gk20a_fifo_profile_snapshot(profile, PROFILE_APPEND);
565     g->ops.fifo.userd_gp_put(g, c);
	...
599 }

Add commands to the ring buffer

First, the driver appends gpfifo entries into the shared memory (ring buffer): the entries coming from user space are copied into the ring buffer. Note that wait_cmd and/or incr_cmd are appended before and after the actual commands, respectively. Each time entries are appended, the driver advances the put pointer (by the number of user entries, plus extra entries for wait_cmd and/or incr_cmd when they are appended).
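As a rough illustration of that bookkeeping (plain user-space C; the 512-entry size is made up, and it assumes wait_cmd and incr_cmd each occupy a single gpfifo entry), the put index advances exactly like the masked update at the end of nvgpu_submit_append_gpfifo shown further below:

#include <assert.h>
#include <stdint.h>

#define ENTRY_NUM 512u   /* hypothetical gpfifo size; a power of two */

/* Same arithmetic as the c->gpfifo.put update in nvgpu_submit_append_gpfifo. */
static uint32_t advance_put(uint32_t put, uint32_t num_entries)
{
    return (put + num_entries) & (ENTRY_NUM - 1u);
}

int main(void)
{
    uint32_t put = 510u;          /* near the end, so the update wraps */

    put = advance_put(put, 1u);   /* wait_cmd entry */
    put = advance_put(put, 3u);   /* three user gpfifo entries */
    put = advance_put(put, 1u);   /* incr_cmd entry */

    assert(put == 3u);            /* (510 + 5) mod 512 == 3 */
    return 0;
}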

  • Copy gpfifo entries from user space directly into gpfifo.mem in kernel space (through its cpu_va mapping).
203 static int nvgpu_submit_append_gpfifo_user_direct(struct channel_gk20a *c,
204         struct nvgpu_gpfifo_userdata userdata,
205         u32 num_entries)
206 {
207     struct gk20a *g = c->g;
208     struct nvgpu_gpfifo_entry *gpfifo_cpu = c->gpfifo.mem.cpu_va;
209     u32 gpfifo_size = c->gpfifo.entry_num;
210     u32 len = num_entries;
211     u32 start = c->gpfifo.put;
212     u32 end = start + len; /* exclusive */
213     int err;
214 
215     if (end > gpfifo_size) {
216         /* wrap-around */
217         int length0 = gpfifo_size - start;
218         int length1 = len - length0;
219 
220         err = g->os_channel.copy_user_gpfifo(
221                 gpfifo_cpu + start, userdata,
222                 0, length0);
223         if (err) {
224             return err;
225         }
226 
227         err = g->os_channel.copy_user_gpfifo(
228                 gpfifo_cpu, userdata,
229                 length0, length1);
230         if (err) {
231             return err;
232         }
233     } else {
234         err = g->os_channel.copy_user_gpfifo(
235                 gpfifo_cpu + start, userdata,
236                 0, len);
237         if (err) {
238             return err;
239         }
240     }
241 
242     return 0;
243 }
  • gpfifo_size is the channel's total gpfifo size in entries. If the new entries would run past the end of the ring buffer, the copy wraps around and is split in two.
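A worked example of that split, with made-up numbers (a 512-entry gpfifo, put currently at 508, 8 incoming entries):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint32_t gpfifo_size = 512u;
    uint32_t start = 508u;                  /* current put index */
    uint32_t len = 8u;
    uint32_t end = start + len;             /* exclusive */

    assert(end > gpfifo_size);              /* copy must wrap around */

    uint32_t length0 = gpfifo_size - start; /* 4 entries at indices 508..511 */
    uint32_t length1 = len - length0;       /* 4 entries at indices 0..3 */

    assert(length0 == 4u && length1 == 4u);
    return 0;
}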
270 /*
271  * Copy source gpfifo entries into the gpfifo ring buffer, potentially
272  * splitting into two memcpys to handle wrap-around.
273  */
274 static int nvgpu_submit_append_gpfifo(struct channel_gk20a *c,
275         struct nvgpu_gpfifo_entry *kern_gpfifo,
276         struct nvgpu_gpfifo_userdata userdata,
277         u32 num_entries)
278 {
279     struct gk20a *g = c->g;
280     int err;
281 
282     if (!kern_gpfifo && !c->gpfifo.pipe) {
283         /*
284          * This path (from userspace to sysmem) is special in order to
285          * avoid two copies unnecessarily (from user to pipe, then from
286          * pipe to gpu sysmem buffer).
287          */
288         err = nvgpu_submit_append_gpfifo_user_direct(c, userdata,
289                 num_entries);
290         if (err) {
291             return err;
292         }
293     } else if (!kern_gpfifo) {
294         /* from userspace to vidmem, use the common path */
295         err = g->os_channel.copy_user_gpfifo(c->gpfifo.pipe, userdata,
296                 0, num_entries);
297         if (err) {
298             return err;
299         }
300 
301         nvgpu_submit_append_gpfifo_common(c, c->gpfifo.pipe,
302                 num_entries);
303     } else {                                                                                                                                              
304         /* from kernel to either sysmem or vidmem, don't need
305          * copy_user_gpfifo so use the common path */
306         nvgpu_submit_append_gpfifo_common(c, kern_gpfifo, num_entries);
307     }
308 
309     trace_write_pushbuffers(c, num_entries);
310 
311     c->gpfifo.put = (c->gpfifo.put + num_entries) &
312         (c->gpfifo.entry_num - 1U);
313 
314     return 0;
315 }

nvgpu/os/linux/channel.c

376 static int nvgpu_channel_copy_user_gpfifo(struct nvgpu_gpfifo_entry *dest,
377         struct nvgpu_gpfifo_userdata userdata, u32 start, u32 length)
378 {
379     struct nvgpu_gpfifo_entry __user *user_gpfifo = userdata.entries;
380     unsigned long n;
381 
382     n = copy_from_user(dest, user_gpfifo + start,
383             length * sizeof(struct nvgpu_gpfifo_entry));
384 
385     return n == 0 ? 0 : -EFAULT;
386 }

nvgpu/gk20a/fifo_gk20a.c

4416 void gk20a_fifo_userd_gp_put(struct gk20a *g, struct channel_gk20a *c)
4417 {
4418     gk20a_bar1_writel(g,
4419         c->userd_gpu_va + sizeof(u32) * ram_userd_gp_put_w(),
4420         c->gpfifo.put);
4421 }    
  • Finally, g->ops.fifo.userd_gp_put(g, c) is used to publish the updated put pointer to the GPU, via a BAR1 write into the channel's USERD area.
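Conceptually this is a doorbell: the CPU writes the new put value through BAR1 into the channel's USERD words, where the GPU's host hardware picks it up. The toy below only models the idea, with an ordinary struct standing in for the BAR1 mapping (the field layout is illustrative, not the real USERD layout):

#include <stdint.h>
#include <stdio.h>

struct toy_userd {
    volatile uint32_t gp_get;   /* consumer index, advanced by the GPU */
    volatile uint32_t gp_put;   /* producer index, published by the CPU */
};

/* Mirrors gk20a_fifo_userd_gp_put(): in the driver this is a BAR1 write at
 * userd_gpu_va + sizeof(u32) * ram_userd_gp_put_w(); here it is plain memory. */
static void toy_userd_gp_put(struct toy_userd *userd, uint32_t sw_put)
{
    userd->gp_put = sw_put;
}

int main(void)
{
    struct toy_userd userd = { 0u, 0u };

    toy_userd_gp_put(&userd, 3u);   /* publish the software-side put index */
    printf("gp_put = %u\n", userd.gp_put);
    return 0;
}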

2. Interrupt

nvidia/drivers/video/tegra/host/host1x/host1x_intr.c

373 static int t20_intr_init(struct nvhost_intr *intr)
374 {
375     struct nvhost_master *dev = intr_to_dev(intr);
376     int err;
377 
378     intr_op().disable_all_syncpt_intrs(intr);
379 
380     err = request_threaded_irq(intr->syncpt_irq, NULL,
381                 syncpt_thresh_cascade_isr,
382                 IRQF_ONESHOT, "host_syncpt", dev);
383     if (err)
384         return err;
385 
386     /* master disable for general (not syncpt) host interrupts */
387     host1x_sync_writel(dev, host1x_sync_intmask_r(), 0);
388 
389     /* clear status & extstatus */
390     host1x_sync_writel(dev, host1x_sync_hintstatus_ext_r(),
391             0xfffffffful);
392     host1x_sync_writel(dev, host1x_sync_hintstatus_r(),
393             0xfffffffful);
394 
395     err = request_threaded_irq(intr->general_irq, NULL,
396                 t20_intr_host1x_isr,
397                 IRQF_ONESHOT, "host_status", intr);
398     if (err) {
399         free_irq(intr->syncpt_irq, dev);
400         return err;
401     }
402 
403     return 0;
404 }
 39 static irqreturn_t syncpt_thresh_cascade_isr(int irq, void *dev_id)
 40 {
 41     struct nvhost_master *dev = dev_id;
 42     struct nvhost_intr *intr = &dev->intr;
 43     unsigned long reg;
 44     int i, id;
 45     struct nvhost_timespec isr_recv;
 46 
 47     nvhost_ktime_get_ts(&isr_recv);
 48 
 49     for (i = 0; i < DIV_ROUND_UP(nvhost_syncpt_nb_hw_pts(&dev->syncpt), 32);
 50             i++) {
 51         reg = host1x_sync_readl(dev,
 52                 host1x_sync_syncpt_thresh_cpu0_int_status_r() +
 53                 i * REGISTER_STRIDE);
 54 
 55         for_each_set_bit(id, &reg, 32) {
 56             struct nvhost_intr_syncpt *sp;
 57             int sp_id = i * 32 + id;
 58             int graphics_host_sp =
 59                 nvhost_syncpt_graphics_host_sp(&dev->syncpt);
 60 
 61             if (unlikely(!nvhost_syncpt_is_valid_hw_pt(&dev->syncpt,
 62                     sp_id))) {
 63                 dev_err(&dev->dev->dev, "%s(): syncpoint id %d is beyond the number of syncpoints (%d)\n",
 64                     __func__, sp_id,
 65                     nvhost_syncpt_nb_hw_pts(&dev->syncpt));
 66                 goto out;
 67             }
 68 
 69             sp = intr->syncpt + sp_id;
 70             sp->isr_recv = isr_recv;
 71 
 72             /* handle graphics host syncpoint increments                                                                                                  
 73              * immediately
 74              */
 75             if (sp_id == graphics_host_sp) {
 76                 dev_warn(&dev->dev->dev, "%s(): syncpoint id %d incremented\n",
 77                      __func__, graphics_host_sp);
 78                 nvhost_syncpt_patch_check(&dev->syncpt);
 79                 t20_intr_syncpt_intr_ack(sp, false);
 80             } else {
 81                 t20_intr_syncpt_intr_ack(sp, true);
 82                 nvhost_syncpt_thresh_fn(sp);
 83             }
 84         }
 85     }
 86 
 87 out:
 88     return IRQ_HANDLED;
 89 }

Interrupt registration.

  • The interrupts are registered here: the syncpt interrupt gets syncpt_thresh_cascade_isr as a threaded handler, and the general host1x interrupt gets t20_intr_host1x_isr. For ordinary compute work, the ISR ends up invoking nvhost_syncpt_thresh_fn(), which handles the syncpt.
  • The code above is the syncpt ISR; the handling is cascaded, i.e. one interrupt line fans out to every hardware syncpoint whose status bit is set. A minimal registration sketch follows below.
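For reference, the request_threaded_irq() pattern used above (a NULL hard handler plus IRQF_ONESHOT, so the real work runs in a kernel thread with the line masked) looks roughly like this as a self-contained module sketch; the IRQ number and all demo_* names are hypothetical:

#include <linux/interrupt.h>
#include <linux/module.h>

/* Hypothetical IRQ number; the real driver gets intr->syncpt_irq from the
 * platform device. */
static int demo_irq = 65;

/* Threaded handler: runs in process context, the same way
 * syncpt_thresh_cascade_isr runs for the "host_syncpt" interrupt. */
static irqreturn_t demo_thresh_isr(int irq, void *dev_id)
{
    pr_info("demo: threaded syncpt-style handler for irq %d\n", irq);
    return IRQ_HANDLED;
}

static int __init demo_init(void)
{
    /* NULL hard handler + IRQF_ONESHOT: the default hard handler only wakes
     * the thread, and the interrupt line stays masked until it returns. */
    return request_threaded_irq(demo_irq, NULL, demo_thresh_isr,
                                IRQF_ONESHOT, "demo_syncpt", &demo_irq);
}

static void __exit demo_exit(void)
{
    free_irq(demo_irq, &demo_irq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");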

nvidia/drivers/video/tegra/host/nvhost_intr.c

 /*** host syncpt interrupt service functions ***/
351 void nvhost_syncpt_thresh_fn(void *dev_id)
352 {    
353     struct nvhost_intr_syncpt *syncpt = dev_id;
354     unsigned int id = syncpt->id;
355     struct nvhost_intr *intr = intr_syncpt_to_intr(syncpt);
356     struct nvhost_master *dev = intr_to_dev(intr);
357     int err;                                                                                                                                              
358 
359     /* make sure host1x is powered */
360     err = nvhost_module_busy(dev->dev);
361     if (err) {
362         WARN(1, "failed to powerON host1x.");
363         return;
364     }
365 
366     if (nvhost_dev_is_virtual(dev->dev))
367         (void)process_wait_list(intr, syncpt,
368                 nvhost_syncpt_read_min(&dev->syncpt, id));
369     else
370         (void)process_wait_list(intr, syncpt,
371                 nvhost_syncpt_update_min(&dev->syncpt, id));
372 
373     nvhost_module_idle(dev->dev);
374 }                 
  • process_wait_list() eventually invokes the callback, i.e. the work (e.g. the channel update function) that was registered when the gpfifo was submitted, as shown below; a toy sketch of the pattern follows the excerpt.

nvgpu/common/sync/channel_sync.c

163     if (register_irq) {
164         struct channel_gk20a *referenced = gk20a_channel_get(c);
165 
166         WARN_ON(!referenced);
167 
168         if (referenced) {
169             /* note: channel_put() is in
170              * channel_sync_syncpt_update() */
171 
172             err = nvgpu_nvhost_intr_register_notifier(
173                 sp->nvhost_dev,                                 
174                 sp->id, thresh,
175                 channel_sync_syncpt_update, c);
176             if (err != 0) {
177                 gk20a_channel_put(referenced);
178             }
179 
180             /* Adding interrupt action should
181              * never fail. A proper error handling
182              * here would require us to decrement
183              * the syncpt max back to its original
184              * value. */
185             WARN(err,
186                  "failed to set submit complete interrupt");
187         }
188     }
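Putting the two halves together, here is a toy user-space model (all names made up; the real path goes through nvgpu_nvhost_intr_register_notifier() and the nvhost waiter list) of the register-then-fire pattern: a callback is stored against a syncpt threshold at submit time and invoked later when the interrupt path sees that the threshold has been reached:

#include <stdint.h>
#include <stdio.h>

typedef void (*notifier_fn)(void *priv);

struct toy_waiter {
    uint32_t thresh;     /* syncpt value the job's incr_cmd will produce */
    notifier_fn fn;      /* e.g. channel_sync_syncpt_update in the driver */
    void *priv;
    int pending;
};

/* Called from the (toy) interrupt path once the syncpt value is known. */
static void toy_process_wait_list(struct toy_waiter *w, uint32_t syncpt_value)
{
    if (w->pending && (int32_t)(syncpt_value - w->thresh) >= 0) {
        w->pending = 0;
        w->fn(w->priv);
    }
}

static void toy_channel_update(void *priv)
{
    printf("job on %s completed\n", (const char *)priv);
}

int main(void)
{
    struct toy_waiter w = {
        .thresh = 42u, .fn = toy_channel_update,
        .priv = "gpu-channel-0", .pending = 1,
    };

    toy_process_wait_list(&w, 41u);   /* threshold not reached: no callback */
    toy_process_wait_list(&w, 42u);   /* threshold reached: callback fires */
    return 0;
}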

3. SYNCPT (Sync Point)

nvidia/drivers/video/tegra/host/nvhost_syncpt.c

 113 /**
 114  * Updates the last value read from hardware.
 115  */
 116 u32 nvhost_syncpt_update_min(struct nvhost_syncpt *sp, u32 id)
 117 {
 118     u32 val;
 119 
 120     val = syncpt_op().update_min(sp, id);
 121     trace_nvhost_syncpt_update_min(id, val);
 122 
 123     return val;
 124 }

nvidia/drivers/video/tegra/host/host1x/host1x_syncpt.c

39 /**
 40  * Updates the last value read from hardware.
 41  * (was nvhost_syncpt_update_min)
 42  */
 43 static u32 t20_syncpt_update_min(struct nvhost_syncpt *sp, u32 id)
 44 {
 45     struct nvhost_master *dev = syncpt_to_dev(sp);
 46     u32 old, live;
 47 
 48     do {
 49         old = nvhost_syncpt_read_min(sp, id);
 50         live = host1x_sync_readl(dev,
 51                 (host1x_sync_syncpt_0_r() + id * 4));
 52     } while ((u32)atomic_cmpxchg(&sp->min_val[id], old, live) != old);
 53 
 54     return live;
 55 }

nvhost_syncpt_thresh_fn() updates the cached syncpt value (the threshold used for synchronization) by reading the min value from the hardware; the min value is presumably the latest count the GPU side has reached, playing a role similar to a get pointer.
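The update itself is a lock-free compare-and-swap loop. A user-space sketch of the same idea using C11 atomics (hw_read() and its value are stand-ins, not the real register access):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint32_t min_val;   /* cached last value read from "hardware" */
static uint32_t hw_counter = 7u;   /* pretend MMIO syncpt register */

static uint32_t hw_read(void)
{
    return hw_counter;
}

/* Same shape as t20_syncpt_update_min(): re-read and retry if another
 * thread updated the cached value in the meantime. */
static uint32_t toy_update_min(void)
{
    uint32_t old, live;

    do {
        old = atomic_load(&min_val);
        live = hw_read();
    } while (!atomic_compare_exchange_strong(&min_val, &old, live));

    return live;
}

int main(void)
{
    printf("min updated to %u\n", toy_update_min());
    return 0;
}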

nvidia/drivers/video/tegra/host/nvhost_intr.c

260 /**
261  * Remove & handle all waiters that have completed for the given syncpt
262  */
263 static int process_wait_list(struct nvhost_intr *intr,
264                  struct nvhost_intr_syncpt *syncpt,
265                  u32 threshold)
266 {
267     struct list_head *completed[NVHOST_INTR_ACTION_COUNT] = {NULL};
268     struct list_head high_prio_handlers[NVHOST_INTR_HIGH_PRIO_COUNT];
269     bool run_low_prio_work = false;
270     unsigned int i, j;
271     int empty;
272 
273     /* take lock on waiter list */
274     spin_lock(&syncpt->lock);
275 
276     /* keep high priority workers in local list */
277     for (i = 0; i < NVHOST_INTR_HIGH_PRIO_COUNT; ++i) {
278         INIT_LIST_HEAD(high_prio_handlers + i);
279         completed[i] = high_prio_handlers + i;
280     }
281 
282     /* .. and low priority workers in global list */
283     for (j = 0; i < NVHOST_INTR_ACTION_COUNT; ++i, ++j)
284         completed[i] = syncpt->low_prio_handlers + j;
285 
286     /* this functions fills completed data */
287     remove_completed_waiters(&syncpt->wait_head, threshold,
288         syncpt->isr_recv, completed);
289 
290     /* check if there are still waiters left */
291     empty = list_empty(&syncpt->wait_head);
292 
293     /* if not, disable interrupt. If yes, update the inetrrupt */
294     if (empty)
295         intr_op().disable_syncpt_intr(intr, syncpt->id);
296     else
297         reset_threshold_interrupt(intr, &syncpt->wait_head,
298                       syncpt->id);
299 
300     /* remove low priority handlers from this list */
301     for (i = NVHOST_INTR_HIGH_PRIO_COUNT;
302          i < NVHOST_INTR_ACTION_COUNT; ++i) {
303         if (!list_empty(completed[i]))
304             run_low_prio_work = true;
305         completed[i] = NULL;
306     }
307 
308     /* release waiter lock */
309     spin_unlock(&syncpt->lock);
310 
311     run_handlers(completed);
312 
313     /* schedule a separate task to handle low priority handlers */
314     if (run_low_prio_work)
315         queue_work(intr->low_prio_wq, &syncpt->low_prio_work);
316 
317     return empty;
318 }

process_wait_list() picks the waiters whose thresholds are at or below the syncpt value read from the GPU (passed in as threshold here, as discussed above), runs the corresponding handlers, and either disables the threshold interrupt or re-arms it depending on whether any waiters remain.
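As a final toy illustration (an array instead of the driver's waiter list; all names made up), the sweep boils down to comparing each waiter's threshold against the value just read from the hardware and deciding whether anything is left to wait for:

#include <stdint.h>
#include <stdio.h>

struct toy_waiter {
    uint32_t thresh;
    const char *name;
};

/* Wrap-safe "syncpt value has reached thresh" comparison. */
static int waiter_completed(uint32_t value, uint32_t thresh)
{
    return (int32_t)(value - thresh) >= 0;
}

int main(void)
{
    struct toy_waiter waiters[] = {
        { .thresh = 10u, .name = "job A" },
        { .thresh = 12u, .name = "job B" },
        { .thresh = 15u, .name = "job C" },
    };
    uint32_t syncpt_value = 12u;   /* what the hardware read returned */
    unsigned int i, remaining = 0;

    for (i = 0; i < sizeof(waiters) / sizeof(waiters[0]); i++) {
        if (waiter_completed(syncpt_value, waiters[i].thresh))
            printf("%s completed, run its handler\n", waiters[i].name);
        else
            remaining++;           /* keep waiting: re-arm the interrupt */
    }

    printf("%u waiter(s) still pending\n", remaining);
    return 0;
}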