
Adreno

Device Drivers

Freedreno

Freedreno is a reverse-engineered, open-source driver stack for Qualcomm's Adreno GPUs. It consists of the MSM DRM kernel driver, the xf86-video-freedreno DDX, and the Freedreno Gallium3D driver inside Mesa.

KGSL

The interface to Qualcomm's vendor KGSL kernel driver. It is no longer updated, since the upstream MSM kernel driver works well. See the article.

Job Submission

The driver spins until the ring buffer has enough free space for the commands to be written.

// src: "adreno_gpu.h"
static inline void
OUT_PKT7(struct msm_ringbuffer *ring, uint8_t opcode, uint16_t cnt)
{
    adreno_wait_ring(ring, cnt + 1);
    OUT_RING(ring, CP_TYPE7_PKT | (cnt << 0) | (PM4_PARITY(cnt) << 15) |
        ((opcode & 0x7F) << 16) | (PM4_PARITY(opcode) << 23));
}
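
The type-7 packet header written above packs the payload dword count into bits [14:0], a parity bit for the count into bit 15, the opcode into bits [22:16], a parity bit for the opcode into bit 23, and the type-7 marker into the top bits. A minimal user-space sketch of that encoding, assuming odd parity over each field (pm4_parity() below is an illustrative helper, not the kernel's PM4_PARITY macro):

// src: illustrative sketch, not kernel code
#include <stdint.h>
#include <stdio.h>

#define CP_TYPE7_PKT (7u << 28)   /* type-7 marker in bits [31:28] */

/* Parity bit chosen so that the field plus its parity bit contains an
 * odd number of 1 bits (assumed odd-parity scheme). */
static uint32_t pm4_parity(uint32_t val)
{
    return __builtin_parity(val) ^ 1;
}

static uint32_t pkt7_header(uint8_t opcode, uint16_t cnt)
{
    return CP_TYPE7_PKT | (cnt << 0) | (pm4_parity(cnt) << 15) |
        ((opcode & 0x7F) << 16) | (pm4_parity(opcode) << 23);
}

int main(void)
{
    /* e.g. an event-write style opcode with a one-dword payload */
    printf("header = 0x%08x\n", pkt7_header(0x16, 1));
    return 0;
}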

// src: "adreno_gpu.c"
void adreno_wait_ring(struct msm_ringbuffer *ring, uint32_t ndwords)
{
    if (spin_until(ring_freewords(ring) >= ndwords))
        DRM_DEV_ERROR(ring->gpu->dev->dev,
            "timeout waiting for space in ringbuffer %d\n",
            ring->id);
}
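
ring_freewords() computes how many dwords can still be written before the write pointer would catch up with the GPU's read pointer. A rough user-space sketch of that circular-buffer arithmetic (the type, field, and size names are illustrative, not the actual msm_ringbuffer layout):

// src: illustrative sketch, not kernel code
#include <stdint.h>

#define TOY_RING_SIZE_DW 1024   /* ring size in dwords (illustrative) */

struct toy_ring {
    uint32_t wptr;   /* next dword offset the CPU will write */
    uint32_t rptr;   /* dword offset the GPU has consumed up to */
};

/* Free dwords in a circular buffer: the distance from the write pointer
 * back around to the read pointer, leaving one slot unused so that a
 * full ring can be distinguished from an empty one. */
static uint32_t toy_ring_freewords(const struct toy_ring *ring)
{
    return (ring->rptr + TOY_RING_SIZE_DW - ring->wptr - 1) % TOY_RING_SIZE_DW;
}

spin_until() in the snippet above then polls this free-space value until it is large enough, returning an error indication on timeout, which triggers the DRM_DEV_ERROR message.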

As shown below, the driver submits the command stream by

1) writing the command stream into the shared ring buffer, and

2) updating the ring buffer write pointer via register I/O (to inform the GPU that the pointer has changed)

NB: see the inline comments

// src: a6xx_submit() at "a6xx_gpu.c"
/* Submit the commands */
static void a6xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
    struct msm_file_private *ctx)
{
    unsigned int index = submit->seqno % MSM_GPU_SUBMIT_STATS_COUNT;
    struct msm_drm_private *priv = gpu->dev->dev_private;
    struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
    struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
    struct msm_ringbuffer *ring = submit->ring;
    unsigned int i;

    get_stats_counter(ring, REG_A6XX_RBBM_PERFCTR_CP_0_LO,
        rbmemptr_stats(ring, index, cpcycles_start));

    /*
     * For PM4 the GMU register offsets are calculated from the base of the
     * GPU registers so we need to add 0x1a800 to the register value on A630
     * to get the right value from PM4.
     */
    get_stats_counter(ring, REG_A6XX_GMU_ALWAYS_ON_COUNTER_L + 0x1a800,
        rbmemptr_stats(ring, index, alwayson_start));

    /* Invalidate CCU depth and color */
    OUT_PKT7(ring, CP_EVENT_WRITE, 1);
    OUT_RING(ring, PC_CCU_INVALIDATE_DEPTH);

    OUT_PKT7(ring, CP_EVENT_WRITE, 1);
    OUT_RING(ring, PC_CCU_INVALIDATE_COLOR);

    /* Submit the commands */
    for (i = 0; i < submit->nr_cmds; i++) {
        switch (submit->cmd[i].type) {
        case MSM_SUBMIT_CMD_IB_TARGET_BUF:
            break;
        case MSM_SUBMIT_CMD_CTX_RESTORE_BUF:
            if (priv->lastctx == ctx)
                break;
            /* fall-thru */
        case MSM_SUBMIT_CMD_BUF:
            OUT_PKT7(ring, CP_INDIRECT_BUFFER_PFE, 3);
            OUT_RING(ring, lower_32_bits(submit->cmd[i].iova));
            OUT_RING(ring, upper_32_bits(submit->cmd[i].iova));
            OUT_RING(ring, submit->cmd[i].size);
            break;
        }
    }

    get_stats_counter(ring, REG_A6XX_RBBM_PERFCTR_CP_0_LO,
        rbmemptr_stats(ring, index, cpcycles_end));
    get_stats_counter(ring, REG_A6XX_GMU_ALWAYS_ON_COUNTER_L + 0x1a800,
        rbmemptr_stats(ring, index, alwayson_end));

    /* Write the fence to the scratch register */
    OUT_PKT4(ring, REG_A6XX_CP_SCRATCH_REG(2), 1);
    OUT_RING(ring, submit->seqno);

    /*
     * Execute a CACHE_FLUSH_TS event. This will ensure that the
     * timestamp is written to the memory and then triggers the interrupt
     */
    OUT_PKT7(ring, CP_EVENT_WRITE, 4);
    OUT_RING(ring, CACHE_FLUSH_TS | (1 << 31));
    OUT_RING(ring, lower_32_bits(rbmemptr(ring, fence)));
    OUT_RING(ring, upper_32_bits(rbmemptr(ring, fence)));
    OUT_RING(ring, submit->seqno);

    trace_msm_gpu_submit_flush(submit,
        gmu_read64(&a6xx_gpu->gmu, REG_A6XX_GMU_ALWAYS_ON_COUNTER_L,
            REG_A6XX_GMU_ALWAYS_ON_COUNTER_H));

    /* jin: This point can act as a synchronization point, since the ring
     * buffer write pointer in the GPU has not been updated yet; the GPU does
     * not know that a new command stream was written until the flush below. */
    a6xx_flush(gpu, ring);
}
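
The seqno written by the CACHE_FLUSH_TS event lands at rbmemptr(ring, fence), i.e. in memory shared with the CPU, so the driver can later decide whether a given submit has retired by comparing that value against the submit's seqno. A hedged sketch of such a check (type and field names are illustrative, not the driver's actual retire path):

// src: illustrative sketch, not kernel code
#include <stdbool.h>
#include <stdint.h>

struct toy_memptrs {
    volatile uint32_t fence;   /* written back by the GPU via CACHE_FLUSH_TS */
};

struct toy_ring {
    struct toy_memptrs *memptrs;
};

/* A submit is retired once the fence value written back by the GPU has
 * reached (or passed) the submit's sequence number; the signed
 * difference keeps the comparison correct across 32-bit wraparound. */
static bool toy_submit_retired(const struct toy_ring *ring, uint32_t seqno)
{
    return (int32_t)(ring->memptrs->fence - seqno) >= 0;
}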

static void a6xx_flush(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
{
    uint32_t wptr;
    unsigned long flags;

    spin_lock_irqsave(&ring->lock, flags);

    /* Copy the shadow to the actual register */
    ring->cur = ring->next;

    /* Make sure to wrap wptr if we need to */
    wptr = get_wptr(ring);

    spin_unlock_irqrestore(&ring->lock, flags);

    /* Make sure everything is posted before making a decision */
    mb();

    /* jin: Here, we could check whether all the previous jobs are done.
     * If not, we could delay the flush (i.e., only kick the GPU once it is
     * idle), and hence achieve job synchronization. */
    gpu_write(gpu, REG_A6XX_CP_RB_WPTR, wptr);
}
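
Per the "wrap wptr" comment above, get_wptr() presumably converts ring->cur into a dword offset from the start of the ring and wraps it to the ring size before it is written to REG_A6XX_CP_RB_WPTR. A sketch under that assumption:

// src: illustrative sketch, not kernel code
#include <stdint.h>

#define TOY_RINGBUFFER_SZ (32 * 1024)   /* ring size in bytes (illustrative) */

struct toy_ring {
    uint32_t *start;   /* first dword of the ring buffer */
    uint32_t *cur;     /* committed CPU write position */
};

/* Dword offset of the write position from the start of the ring,
 * wrapped to the ring size in dwords. */
static uint32_t toy_get_wptr(const struct toy_ring *ring)
{
    return (uint32_t)(ring->cur - ring->start) % (TOY_RINGBUFFER_SZ >> 2);
}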

Note

  • Adreno uses a command stream like other GPUs (e.g. NVIDIA's), but the commands are placed in indirect buffers referenced from the ring buffer, and the GPU is kicked via register I/O.

  • Currently Adreno does not support OpenCL on Debian; the focus is more on AOSP.