Re: [PATCH 01/14] staging: vc04_services: Add new vc-sm-cma driver

From: Laurent Pinchart
Date: Mon Nov 21 2022 - 18:05:14 EST


Hi Umang (and Dave),

Thank you for the patch.

On Tue, Nov 22, 2022 at 03:17:09AM +0530, Umang Jain wrote:
> From: Dave Stevenson <dave.stevenson@xxxxxxxxxxxxxxx>
>
> Add Broadcom VideoCore Shared Memory support.
>
> This new driver allows contiguous memory blocks to be imported
> into the VideoCore VPU memory map, and manages the lifetime of
> those objects, only releasing the source dmabuf once the VPU has
> confirmed it has finished with it.
>
> Signed-off-by: Dave Stevenson <dave.stevenson@xxxxxxxxxxxxxxx>
> Signed-off-by: Umang Jain <umang.jain@xxxxxxxxxxxxxxxx>
> ---
> drivers/staging/vc04_services/Kconfig | 2 +
> drivers/staging/vc04_services/Makefile | 1 +
> .../staging/vc04_services/vc-sm-cma/Kconfig | 10 +
> .../staging/vc04_services/vc-sm-cma/Makefile | 12 +
> .../staging/vc04_services/vc-sm-cma/vc_sm.c | 801 ++++++++++++++++++
> .../staging/vc04_services/vc-sm-cma/vc_sm.h | 54 ++
> .../vc04_services/vc-sm-cma/vc_sm_cma_vchi.c | 507 +++++++++++
> .../vc04_services/vc-sm-cma/vc_sm_cma_vchi.h | 63 ++
> .../vc04_services/vc-sm-cma/vc_sm_defs.h | 187 ++++
> .../vc04_services/vc-sm-cma/vc_sm_knl.h | 28 +
> 10 files changed, 1665 insertions(+)
> create mode 100644 drivers/staging/vc04_services/vc-sm-cma/Kconfig
> create mode 100644 drivers/staging/vc04_services/vc-sm-cma/Makefile
> create mode 100644 drivers/staging/vc04_services/vc-sm-cma/vc_sm.c
> create mode 100644 drivers/staging/vc04_services/vc-sm-cma/vc_sm.h
> create mode 100644 drivers/staging/vc04_services/vc-sm-cma/vc_sm_cma_vchi.c
> create mode 100644 drivers/staging/vc04_services/vc-sm-cma/vc_sm_cma_vchi.h
> create mode 100644 drivers/staging/vc04_services/vc-sm-cma/vc_sm_defs.h
> create mode 100644 drivers/staging/vc04_services/vc-sm-cma/vc_sm_knl.h
>
> diff --git a/drivers/staging/vc04_services/Kconfig b/drivers/staging/vc04_services/Kconfig
> index 31e58c9d1a11..6c0e77d64376 100644
> --- a/drivers/staging/vc04_services/Kconfig
> +++ b/drivers/staging/vc04_services/Kconfig
> @@ -46,5 +46,7 @@ source "drivers/staging/vc04_services/bcm2835-camera/Kconfig"
>
> source "drivers/staging/vc04_services/vchiq-mmal/Kconfig"
>
> +source "drivers/staging/vc04_services/vc-sm-cma/Kconfig"
> +
> endif
>
> diff --git a/drivers/staging/vc04_services/Makefile b/drivers/staging/vc04_services/Makefile
> index 1fd191e2e2a5..01089f369fb4 100644
> --- a/drivers/staging/vc04_services/Makefile
> +++ b/drivers/staging/vc04_services/Makefile
> @@ -14,6 +14,7 @@ endif
> obj-$(CONFIG_SND_BCM2835) += bcm2835-audio/
> obj-$(CONFIG_VIDEO_BCM2835) += bcm2835-camera/
> obj-$(CONFIG_BCM2835_VCHIQ_MMAL) += vchiq-mmal/
> +obj-$(CONFIG_BCM_VC_SM_CMA) += vc-sm-cma/
>
> ccflags-y += -I $(srctree)/$(src)/include
>
> diff --git a/drivers/staging/vc04_services/vc-sm-cma/Kconfig b/drivers/staging/vc04_services/vc-sm-cma/Kconfig
> new file mode 100644
> index 000000000000..bbd296f5826b
> --- /dev/null
> +++ b/drivers/staging/vc04_services/vc-sm-cma/Kconfig
> @@ -0,0 +1,10 @@
> +config BCM_VC_SM_CMA
> + tristate "VideoCore Shared Memory (CMA) driver"
> + depends on BCM2835_VCHIQ
> + select RBTREE

I don't see an RBTREE config option in mainline.

> + select DMA_SHARED_BUFFER
> + help
> + Say Y here to enable the shared memory interface that
> + supports sharing dmabufs with VideoCore.
> + This operates over the VCHIQ interface to a service
> + running on VideoCore.
> diff --git a/drivers/staging/vc04_services/vc-sm-cma/Makefile b/drivers/staging/vc04_services/vc-sm-cma/Makefile
> new file mode 100644
> index 000000000000..c92a5775c62e
> --- /dev/null
> +++ b/drivers/staging/vc04_services/vc-sm-cma/Makefile
> @@ -0,0 +1,12 @@
> +ccflags-y += \
> + -I$(srctree)/$(src)/../ \
> + -I$(srctree)/$(src)/../interface/vchiq_arm\

Missing space before \

> + -I$(srctree)/$(src)/../include
> +
> +ccflags-y += \
> + -D__VCCOREVER__=0

Is this needed ?

> +
> +vc-sm-cma-$(CONFIG_BCM_VC_SM_CMA) := \
> + vc_sm.o vc_sm_cma_vchi.o
> +
> +obj-$(CONFIG_BCM_VC_SM_CMA) += vc-sm-cma.o
> diff --git a/drivers/staging/vc04_services/vc-sm-cma/vc_sm.c b/drivers/staging/vc04_services/vc-sm-cma/vc_sm.c
> new file mode 100644
> index 000000000000..7fe81d259c7b
> --- /dev/null
> +++ b/drivers/staging/vc04_services/vc-sm-cma/vc_sm.c
> @@ -0,0 +1,801 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * VideoCore Shared Memory driver using CMA.
> + *
> + * Copyright: 2018, Raspberry Pi (Trading) Ltd
> + * Dave Stevenson <dave.stevenson@xxxxxxxxxxxxxxx>
> + *
> + * Based on vmcs_sm driver from Broadcom Corporation for some API,
> + * and taking some code for buffer allocation and dmabuf handling from
> + * videobuf2.
> + *
> + * This driver handles taking a dmabuf, mapping it into the VPU address space,
> + * and returning a handle that the VPU can use.
> + */
> +
> +/* ---- Include Files ----------------------------------------------------- */

I'd drop this comment (same below), it feels a bit alien.

> +#include <linux/debugfs.h>
> +#include <linux/dma-buf.h>
> +#include <linux/errno.h>
> +#include <linux/miscdevice.h>
> +#include <linux/platform_device.h>
> +#include <linux/syscalls.h>
> +
> +#include "vchiq_connected.h"
> +#include "vc_sm_cma_vchi.h"
> +
> +#include "vc_sm.h"
> +#include "vc_sm_knl.h"
> +
> +MODULE_IMPORT_NS(DMA_BUF);
> +
> +/* ---- Private Constants and Types --------------------------------------- */
> +
> +#define VC_SM_RESOURCE_NAME_DEFAULT "sm-host-resource"
> +
> +#define VC_SM_DIR_ROOT_NAME "vcsm-cma"
> +#define VC_SM_STATE "state"
> +
> +/* Private file data associated with each opened device. */
> +struct vc_sm_privdata_t {
> +};
> +
> +typedef int (*VC_SM_SHOW) (struct seq_file *s, void *v);
> +struct sm_pde_t {

Should this structure (and the other ones below) have a vc_ prefix for
consistency ? Also, drop the _t suffix.

> + VC_SM_SHOW show; /* Debug fs function hookup. */
> + struct dentry *dir_entry; /* Debug fs directory entry. */
> + void *priv_data; /* Private data */

Never used.

> +};
> +
> +/* Global state information. */
> +struct sm_state_t {
> + struct platform_device *pdev;
> +
> + struct miscdevice misc_dev;
> +
> + struct sm_instance *sm_handle; /* Handle for videocore service. */
> +
> + spinlock_t kernelid_map_lock; /* Spinlock protecting kernelid_map */
> + struct idr kernelid_map;
> +
> + struct mutex map_lock; /* Global map lock. */
> + struct list_head buffer_list; /* List of buffer. */
> +
> + struct dentry *dir_root; /* Debug fs entries root. */
> + struct sm_pde_t dir_state; /* Debug fs entries state sub-tree. */
> +
> + bool require_released_callback; /* VPU will send a released msg when it
> + * has finished with a resource.
> + */
> + /* State for transactions */
> + int restart_sys; /* Tracks restart on interrupt. */
> + enum vc_sm_msg_type int_action; /* Interrupted action. */

Both of these are set but never used. Either there's dead code below, or
something is wrong.

> +
> + u32 int_trans_id; /* Interrupted transaction. */
> + struct vchiq_instance *vchiq_instance;
> +};
> +
> +struct vc_sm_dma_buf_attachment {

Never used.

> + struct device *dev;
> + struct sg_table sg_table;
> + struct list_head list;
> + enum dma_data_direction dma_dir;
> +};
> +
> +/* ---- Private Variables ----------------------------------------------- */
> +
> +static struct sm_state_t *sm_state;
> +static int sm_inited;

Can we avoid global variables please ?

> +
> +/* ---- Private Function Prototypes -------------------------------------- */
> +
> +/* ---- Private Functions ------------------------------------------------ */

If you want to split the driver in sections (which I personally like for
large files), I would group functions by purpose and add a section
header for each group.

> +
> +static int get_kernel_id(struct vc_sm_buffer *buffer)
> +{
> + int handle;
> +
> + spin_lock(&sm_state->kernelid_map_lock);
> + handle = idr_alloc(&sm_state->kernelid_map, buffer, 0, 0, GFP_KERNEL);
> + spin_unlock(&sm_state->kernelid_map_lock);
> +
> + return handle;
> +}
> +
> +static struct vc_sm_buffer *lookup_kernel_id(int handle)
> +{
> + return idr_find(&sm_state->kernelid_map, handle);
> +}
> +
> +static void free_kernel_id(int handle)
> +{
> + spin_lock(&sm_state->kernelid_map_lock);
> + idr_remove(&sm_state->kernelid_map, handle);
> + spin_unlock(&sm_state->kernelid_map_lock);
> +}

For instance those three functions could be in the IDR section. I would
probably inline them in their respective caller though, but if you wait
to keep them, you should give them better names. vc_sm_cma_idr_ sounds
like a good prefix, the functions could be vc_sm_cma_idr_alloc(),
vc_sm_cma_idr_find() and vc_sm_cma_idr_remove(). I would then move the
IDR init and cleanup code to two separate functions too.

> +
> +static int vc_sm_cma_seq_file_show(struct seq_file *s, void *v)
> +{
> + struct sm_pde_t *sm_pde;
> +
> + sm_pde = (struct sm_pde_t *)(s->private);
> +
> + if (sm_pde && sm_pde->show)
> + sm_pde->show(s, v);

The only show handler is vc_sm_cma_global_state_show(), you can simplify
all this by dropping this function and passing
vc_sm_cma_global_state_show to single_open(). The show field in sm_pde_t
can also be dropped. You can actually drop the sm_pde_t structure and
embed the dir_entry field directly in sm_state_t.

> +
> + return 0;
> +}
> +
> +static int vc_sm_cma_single_open(struct inode *inode, struct file *file)
> +{
> + return single_open(file, vc_sm_cma_seq_file_show, inode->i_private);
> +}
> +
> +static const struct file_operations vc_sm_cma_debug_fs_fops = {
> + .open = vc_sm_cma_single_open,
> + .read = seq_read,
> + .llseek = seq_lseek,
> + .release = single_release,
> +};
> +
> +static int vc_sm_cma_global_state_show(struct seq_file *s, void *v)
> +{
> + struct vc_sm_buffer *resource = NULL;
> + int resource_count = 0;
> +
> + if (!sm_state)
> + return 0;
> +
> + seq_printf(s, "\nVC-ServiceHandle %p\n", sm_state->sm_handle);
> +
> + /* Log all applicable mapping(s). */
> +
> + mutex_lock(&sm_state->map_lock);
> + seq_puts(s, "\nResources\n");
> + if (!list_empty(&sm_state->buffer_list)) {
> + list_for_each_entry(resource, &sm_state->buffer_list,
> + global_buffer_list) {
> + resource_count++;
> +
> + seq_printf(s, "\nResource %p\n",
> + resource);
> + seq_printf(s, " NAME %s\n",
> + resource->name);
> + seq_printf(s, " SIZE %zu\n",
> + resource->size);
> + seq_printf(s, " DMABUF %p\n",
> + resource->dma_buf);
> + seq_printf(s, " IMPORTED_DMABUF %p\n",
> + resource->imported_dma_buf);
> + seq_printf(s, " ATTACH %p\n",
> + resource->attach);
> + seq_printf(s, " SGT %p\n",
> + resource->sgt);
> + seq_printf(s, " DMA_ADDR %pad\n",
> + &resource->dma_addr);
> + seq_printf(s, " VC_HANDLE %08x\n",
> + resource->vc_handle);
> + seq_printf(s, " VC_MAPPING %d\n",
> + resource->vpu_state);
> + }
> + }
> + seq_printf(s, "\n\nTotal resource count: %d\n\n", resource_count);
> +
> + mutex_unlock(&sm_state->map_lock);
> +
> + return 0;
> +}
> +
> +/*
> + * Adds a buffer to the private data list which tracks all the allocated
> + * data.
> + */
> +static void vc_sm_add_resource(struct vc_sm_buffer *buffer)
> +{
> + mutex_lock(&sm_state->map_lock);
> + list_add(&buffer->global_buffer_list, &sm_state->buffer_list);
> + mutex_unlock(&sm_state->map_lock);
> +
> + pr_debug("[%s]: added buffer %p (name %s, size %zu)\n",
> + __func__, buffer, buffer->name, buffer->size);

Let's use dev_* instead of pr_* where possible.

> +}
> +
> +/*
> + * Cleans up imported dmabuf.
> + * Should be called with mutex held.
> + */
> +static void vc_sm_clean_up_dmabuf(struct vc_sm_buffer *buffer)
> +{
> + /* Handle cleaning up imported dmabufs */
> + if (buffer->sgt) {
> + dma_buf_unmap_attachment(buffer->attach,
> + buffer->sgt,
> + DMA_BIDIRECTIONAL);
> + buffer->sgt = NULL;
> + }
> + if (buffer->attach) {
> + dma_buf_detach(buffer->dma_buf, buffer->attach);
> + buffer->attach = NULL;
> + }
> +}
> +
> +/*
> + * Instructs VPU to decrement the refcount on a buffer.
> + */
> +static void vc_sm_vpu_free(struct vc_sm_buffer *buffer)
> +{
> + if (buffer->vc_handle && buffer->vpu_state == VPU_MAPPED) {
> + struct vc_sm_free_t free = { buffer->vc_handle, 0 };
> + int status = vc_sm_cma_vchi_free(sm_state->sm_handle, &free,
> + &sm_state->int_trans_id);
> + if (status != 0 && status != -EINTR) {
> + pr_err("[%s]: failed to free memory on videocore (status: %u, trans_id: %u)\n",
> + __func__, status, sm_state->int_trans_id);
> + }
> +
> + if (sm_state->require_released_callback) {
> + /* Need to wait for the VPU to confirm the free. */
> +
> + /* Retain a reference on this until the VPU has
> + * released it
> + */
> + buffer->vpu_state = VPU_UNMAPPING;
> + } else {
> + buffer->vpu_state = VPU_NOT_MAPPED;
> + buffer->vc_handle = 0;
> + }
> + }
> +}
> +
> +/*
> + * Release an allocation.
> + * All refcounting is done via the dma buf object.
> + *
> + * Must be called with the mutex held. The function will either release the
> + * mutex (if defering the release) or destroy it. The caller must therefore not
> + * reuse the buffer on return.
> + */
> +static void vc_sm_release_resource(struct vc_sm_buffer *buffer)
> +{
> + pr_debug("[%s]: buffer %p (name %s, size %zu)\n",
> + __func__, buffer, buffer->name, buffer->size);
> +
> + if (buffer->vc_handle) {
> + /* We've sent the unmap request but not had the response. */
> + pr_debug("[%s]: Waiting for VPU unmap response on %p\n",
> + __func__, buffer);
> + goto defer;
> + }
> + if (buffer->in_use) {
> + /* dmabuf still in use - we await the release */
> + pr_debug("[%s]: buffer %p is still in use\n", __func__, buffer);
> + goto defer;
> + }
> +
> + /* Release the allocation */
> + if (buffer->imported_dma_buf)
> + dma_buf_put(buffer->imported_dma_buf);
> + else
> + pr_err("%s: Imported dmabuf already been put for buf %p\n",
> + __func__, buffer);
> + buffer->imported_dma_buf = NULL;
> +
> + /* Free our buffer. Start by removing it from the list */
> + mutex_lock(&sm_state->map_lock);
> + list_del(&buffer->global_buffer_list);
> + mutex_unlock(&sm_state->map_lock);
> +
> + pr_debug("%s: Release our allocation - done\n", __func__);
> + mutex_unlock(&buffer->lock);
> +
> + mutex_destroy(&buffer->lock);
> +
> + kfree(buffer);
> + return;
> +
> +defer:
> + mutex_unlock(&buffer->lock);
> +}
> +
> +/* Dma_buf operations for chaining through to an imported dma_buf */
> +
> +static void vc_sm_dma_buf_release(struct dma_buf *dmabuf)
> +{
> + struct vc_sm_buffer *buffer;
> +
> + if (!dmabuf)
> + return;
> +
> + buffer = (struct vc_sm_buffer *)dmabuf->priv;
> +
> + mutex_lock(&buffer->lock);
> +
> + pr_debug("%s dmabuf %p, buffer %p\n", __func__, dmabuf, buffer);
> +
> + buffer->in_use = 0;
> +
> + /* Unmap on the VPU */
> + vc_sm_vpu_free(buffer);
> + pr_debug("%s vpu_free done\n", __func__);
> +
> + /* Unmap our dma_buf object (the vc_sm_buffer remains until released
> + * on the VPU).
> + */
> + vc_sm_clean_up_dmabuf(buffer);
> + pr_debug("%s clean_up dmabuf done\n", __func__);
> +
> + /* buffer->lock will be destroyed by vc_sm_release_resource if finished
> + * with, otherwise unlocked. Do NOT unlock here.
> + */
> + vc_sm_release_resource(buffer);
> + pr_debug("%s done\n", __func__);
> +}
> +
> +static
> +int vc_sm_import_dma_buf_attach(struct dma_buf *dmabuf,
> + struct dma_buf_attachment *attachment)
> +{
> + struct vc_sm_buffer *buf = dmabuf->priv;
> +
> + return buf->imported_dma_buf->ops->attach(buf->imported_dma_buf,
> + attachment);
> +}
> +
> +static
> +void vc_sm_import_dma_buf_detatch(struct dma_buf *dmabuf,
> + struct dma_buf_attachment *attachment)
> +{
> + struct vc_sm_buffer *buf = dmabuf->priv;
> +
> + buf->imported_dma_buf->ops->detach(buf->imported_dma_buf, attachment);
> +}
> +
> +static
> +struct sg_table *vc_sm_import_map_dma_buf(struct dma_buf_attachment *attachment,
> + enum dma_data_direction direction)
> +{
> + struct vc_sm_buffer *buf = attachment->dmabuf->priv;
> +
> + return buf->imported_dma_buf->ops->map_dma_buf(attachment,
> + direction);
> +}
> +
> +static
> +void vc_sm_import_unmap_dma_buf(struct dma_buf_attachment *attachment,
> + struct sg_table *table,
> + enum dma_data_direction direction)
> +{
> + struct vc_sm_buffer *buf = attachment->dmabuf->priv;
> +
> + buf->imported_dma_buf->ops->unmap_dma_buf(attachment, table, direction);
> +}
> +
> +static
> +int vc_sm_import_dmabuf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma)
> +{
> + struct vc_sm_buffer *buf = dmabuf->priv;
> +
> + pr_debug("%s: mmap dma_buf %p, buf %p, imported db %p\n", __func__,
> + dmabuf, buf, buf->imported_dma_buf);
> +
> + return buf->imported_dma_buf->ops->mmap(buf->imported_dma_buf, vma);
> +}
> +
> +static
> +int vc_sm_import_dma_buf_begin_cpu_access(struct dma_buf *dmabuf,
> + enum dma_data_direction direction)
> +{
> + struct vc_sm_buffer *buf = dmabuf->priv;
> +
> + return buf->imported_dma_buf->ops->begin_cpu_access(buf->imported_dma_buf,
> + direction);
> +}
> +
> +static
> +int vc_sm_import_dma_buf_end_cpu_access(struct dma_buf *dmabuf,
> + enum dma_data_direction direction)
> +{
> + struct vc_sm_buffer *buf = dmabuf->priv;
> +
> + return buf->imported_dma_buf->ops->end_cpu_access(buf->imported_dma_buf,
> + direction);
> +}
> +
> +static const struct dma_buf_ops dma_buf_import_ops = {
> + .map_dma_buf = vc_sm_import_map_dma_buf,
> + .unmap_dma_buf = vc_sm_import_unmap_dma_buf,
> + .mmap = vc_sm_import_dmabuf_mmap,
> + .release = vc_sm_dma_buf_release,
> + .attach = vc_sm_import_dma_buf_attach,
> + .detach = vc_sm_import_dma_buf_detatch,
> + .begin_cpu_access = vc_sm_import_dma_buf_begin_cpu_access,
> + .end_cpu_access = vc_sm_import_dma_buf_end_cpu_access,
> +};
> +
> +/* Import a dma_buf to be shared with VC. */
> +int
> +vc_sm_cma_import_dmabuf_internal(struct sm_state_t *state,
> + struct dma_buf *dma_buf,
> + int fd,
> + struct dma_buf **imported_buf)

This can be static.

> +{
> + DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
> + struct vc_sm_buffer *buffer = NULL;
> + struct vc_sm_import import = { };
> + struct vc_sm_import_result result = { };
> + struct dma_buf_attachment *attach = NULL;
> + struct sg_table *sgt = NULL;
> + dma_addr_t dma_addr;
> + int ret = 0;
> + int status;
> +
> + /* Setup our allocation parameters */
> + pr_debug("%s: importing dma_buf %p/fd %d\n", __func__, dma_buf, fd);
> +
> + if (fd < 0)
> + get_dma_buf(dma_buf);
> + else
> + dma_buf = dma_buf_get(fd);

This function is always called with fd == -1. There seems to be quite a
bit of dead code in this driver, could you double-check everything and
trim it down ?

> +
> + if (!dma_buf)
> + return -EINVAL;
> +
> + attach = dma_buf_attach(dma_buf, &sm_state->pdev->dev);
> + if (IS_ERR(attach)) {
> + ret = PTR_ERR(attach);
> + goto error;
> + }
> +
> + sgt = dma_buf_map_attachment(attach, DMA_BIDIRECTIONAL);
> + if (IS_ERR(sgt)) {
> + ret = PTR_ERR(sgt);
> + goto error;
> + }
> +
> + /* Verify that the address block is contiguous */
> + if (sgt->nents != 1) {
> + ret = -ENOMEM;
> + goto error;
> + }
> +
> + /* Allocate local buffer to track this allocation. */
> + buffer = kzalloc(sizeof(*buffer), GFP_KERNEL);
> + if (!buffer) {
> + ret = -ENOMEM;
> + goto error;
> + }
> +
> + import.type = VC_SM_ALLOC_NON_CACHED;
> + dma_addr = sg_dma_address(sgt->sgl);
> + import.addr = (u32)dma_addr;
> + if ((import.addr & 0xC0000000) != 0xC0000000) {
> + pr_err("%s: Expecting an uncached alias for dma_addr %pad\n",
> + __func__, &dma_addr);
> + import.addr |= 0xC0000000;
> + }
> + import.size = sg_dma_len(sgt->sgl);
> + import.allocator = current->tgid;
> + import.kernel_id = get_kernel_id(buffer);
> +
> + memcpy(import.name, VC_SM_RESOURCE_NAME_DEFAULT,
> + sizeof(VC_SM_RESOURCE_NAME_DEFAULT));
> +
> + pr_debug("[%s]: attempt to import \"%s\" data - type %u, addr %pad, size %u.\n",
> + __func__, import.name, import.type, &dma_addr, import.size);
> +
> + /* Allocate the videocore buffer. */
> + status = vc_sm_cma_vchi_import(sm_state->sm_handle, &import, &result,
> + &sm_state->int_trans_id);
> + if (status == -EINTR) {
> + pr_debug("[%s]: requesting import memory action restart (trans_id: %u)\n",
> + __func__, sm_state->int_trans_id);
> + ret = -ERESTARTSYS;
> + sm_state->restart_sys = -EINTR;
> + sm_state->int_action = VC_SM_MSG_TYPE_IMPORT;
> + goto error;
> + } else if (status || !result.res_handle) {
> + pr_debug("[%s]: failed to import memory on videocore (status: %u, trans_id: %u)\n",
> + __func__, status, sm_state->int_trans_id);
> + ret = -ENOMEM;
> + goto error;
> + }
> +
> + mutex_init(&buffer->lock);
> + INIT_LIST_HEAD(&buffer->attachments);
> + memcpy(buffer->name, import.name,
> + min(sizeof(buffer->name), sizeof(import.name) - 1));
> +
> + /* Keep track of the buffer we created. */
> + buffer->vc_handle = result.res_handle;
> + buffer->size = import.size;
> + buffer->vpu_state = VPU_MAPPED;
> +
> + buffer->imported_dma_buf = dma_buf;
> +
> + buffer->attach = attach;
> + buffer->sgt = sgt;
> + buffer->dma_addr = dma_addr;
> + buffer->in_use = 1;
> + buffer->kernel_id = import.kernel_id;
> +
> + /*
> + * We're done - we need to export a new dmabuf chaining through most
> + * functions, but enabling us to release our own internal references
> + * here.
> + */

Why do we need to export a new dmabuf ?

> + exp_info.ops = &dma_buf_import_ops;
> + exp_info.size = import.size;
> + exp_info.flags = O_RDWR;
> + exp_info.priv = buffer;
> +
> + buffer->dma_buf = dma_buf_export(&exp_info);
> + if (IS_ERR(buffer->dma_buf)) {
> + ret = PTR_ERR(buffer->dma_buf);
> + goto error;
> + }
> +
> + vc_sm_add_resource(buffer);
> +
> + *imported_buf = buffer->dma_buf;
> +
> + return 0;
> +
> +error:
> + if (result.res_handle) {
> + struct vc_sm_free_t free = { result.res_handle, 0 };
> +
> + vc_sm_cma_vchi_free(sm_state->sm_handle, &free,
> + &sm_state->int_trans_id);
> + }
> + free_kernel_id(import.kernel_id);
> + kfree(buffer);
> + if (sgt)
> + dma_buf_unmap_attachment(attach, sgt, DMA_BIDIRECTIONAL);
> + if (attach)
> + dma_buf_detach(dma_buf, attach);
> + dma_buf_put(dma_buf);
> + return ret;
> +}
> +
> +static void
> +vc_sm_vpu_event(struct sm_instance *instance, struct vc_sm_result_t *reply,
> + int reply_len)
> +{
> + switch (reply->trans_id & ~0x80000000) {
> + case VC_SM_MSG_TYPE_CLIENT_VERSION:
> + {
> + /* Acknowledge that the firmware supports the version command */
> + pr_debug("%s: firmware acked version msg. Require release cb\n",
> + __func__);
> + sm_state->require_released_callback = true;
> + }
> + break;

No need for braces.

> + case VC_SM_MSG_TYPE_RELEASED:
> + {
> + struct vc_sm_released *release = (struct vc_sm_released *)reply;
> + struct vc_sm_buffer *buffer =
> + lookup_kernel_id(release->kernel_id);
> + if (!buffer) {
> + pr_err("%s: VC released a buffer that is already released, kernel_id %d\n",
> + __func__, release->kernel_id);
> + break;
> + }
> + mutex_lock(&buffer->lock);
> +
> + pr_debug("%s: Released addr %08x, size %u, id %08x, mem_handle %08x\n",
> + __func__, release->addr, release->size,
> + release->kernel_id, release->vc_handle);
> +
> + buffer->vc_handle = 0;
> + buffer->vpu_state = VPU_NOT_MAPPED;
> + free_kernel_id(release->kernel_id);
> +
> + vc_sm_release_resource(buffer);
> + }
> + break;
> + default:
> + pr_err("%s: Unknown vpu cmd %x\n", __func__, reply->trans_id);
> + break;
> + }
> +}
> +
> +/* Driver load/unload functions */
> +/* Videocore connected. */
> +static void vc_sm_connected_init(void)
> +{
> + int ret;
> + struct vc_sm_version version;
> + struct vc_sm_result_t version_result;
> +
> + pr_info("[%s]: start\n", __func__);
> +
> + /*
> + * Initialize and create a VCHI connection for the shared memory service
> + * running on videocore.
> + */
> + ret = vchiq_initialise(&sm_state->vchiq_instance);
> + if (ret) {
> + pr_err("[%s]: failed to initialise VCHI instance (ret=%d)\n",
> + __func__, ret);
> +
> + return;
> + }
> +
> + ret = vchiq_connect(sm_state->vchiq_instance);
> + if (ret) {
> + pr_err("[%s]: failed to connect VCHI instance (ret=%d)\n",
> + __func__, ret);
> +
> + return;
> + }
> +
> + /* Initialize an instance of the shared memory service. */
> + sm_state->sm_handle = vc_sm_cma_vchi_init(sm_state->vchiq_instance, 1,
> + vc_sm_vpu_event);
> + if (!sm_state->sm_handle) {
> + pr_err("[%s]: failed to initialize shared memory service\n",
> + __func__);
> +
> + return;
> + }
> +
> + /* Create a debug fs directory entry (root). */
> + sm_state->dir_root = debugfs_create_dir(VC_SM_DIR_ROOT_NAME, NULL);
> +
> + sm_state->dir_state.show = &vc_sm_cma_global_state_show;
> + sm_state->dir_state.dir_entry =
> + debugfs_create_file(VC_SM_STATE, 0444, sm_state->dir_root,
> + &sm_state->dir_state,
> + &vc_sm_cma_debug_fs_fops);

Move this to a separate debugfs init function, grouped with the rest of
the debugfs code. debugfs is conditioned by CONFIG_DEBUG_FS, so you'll
need conditional compilation. You will also need to handle cleanup, you
never destroy the file and directory.

> +
> + INIT_LIST_HEAD(&sm_state->buffer_list);
> +
> + version.version = 2;
> + ret = vc_sm_cma_vchi_client_version(sm_state->sm_handle, &version,
> + &version_result,
> + &sm_state->int_trans_id);
> + if (ret) {
> + pr_err("[%s]: Failed to send version request %d\n", __func__,
> + ret);
> + }
> +
> + /* Done! */
> + sm_inited = 1;
> + pr_info("[%s]: installed successfully\n", __func__);
> +}
> +
> +/* Driver loading. */
> +static int bcm2835_vc_sm_cma_probe(struct platform_device *pdev)
> +{
> + pr_info("%s: Videocore shared memory driver\n", __func__);

Drop this, it only clutters the kernel log.

> +
> + sm_state = devm_kzalloc(&pdev->dev, sizeof(*sm_state), GFP_KERNEL);
> + if (!sm_state)
> + return -ENOMEM;
> + sm_state->pdev = pdev;
> + mutex_init(&sm_state->map_lock);
> +
> + spin_lock_init(&sm_state->kernelid_map_lock);
> + idr_init_base(&sm_state->kernelid_map, 1);
> +
> + pdev->dev.dma_parms = devm_kzalloc(&pdev->dev,
> + sizeof(*pdev->dev.dma_parms),
> + GFP_KERNEL);
> + /* dma_set_max_seg_size checks if dma_parms is NULL. */
> + dma_set_max_seg_size(&pdev->dev, 0x3FFFFFFF);
> +
> + vchiq_add_connected_callback(vc_sm_connected_init);
> + return 0;
> +}
> +
> +/* Driver unloading. */
> +static int bcm2835_vc_sm_cma_remove(struct platform_device *pdev)
> +{
> + pr_debug("[%s]: start\n", __func__);

You can drop this message and the one at the end of the function.

> + if (sm_inited) {
> + misc_deregister(&sm_state->misc_dev);

There's no misc_register() call, something seems wrong. Probably
leftover code ?

> +
> + /* Remove all proc entries. */
> + debugfs_remove_recursive(sm_state->dir_root);
> +
> + /* Stop the videocore shared memory service. */
> + vc_sm_cma_vchi_stop(sm_state->vchiq_instance, &sm_state->sm_handle);
> + }
> +
> + if (sm_state) {

Drop the condition, sm_state can never be NULL if probe() succeeded, and
if it hasn't succeeded, then remove() will not be called.

> + idr_destroy(&sm_state->kernelid_map);
> +
> + /* Free the memory for the state structure. */
> + mutex_destroy(&sm_state->map_lock);
> + }
> +
> + pr_debug("[%s]: end\n", __func__);
> + return 0;
> +}
> +
> +/* Kernel API calls */
> +/* Get an internal resource handle mapped from the external one. */
> +int vc_sm_cma_int_handle(void *handle)
> +{
> + struct dma_buf *dma_buf = (struct dma_buf *)handle;
> + struct vc_sm_buffer *buf;
> +
> + /* Validate we can work with this device. */
> + if (!sm_state || !handle) {
> + pr_err("[%s]: invalid input\n", __func__);
> + return 0;
> + }
> +
> + buf = (struct vc_sm_buffer *)dma_buf->priv;
> + return buf->vc_handle;
> +}
> +EXPORT_SYMBOL_GPL(vc_sm_cma_int_handle);
> +
> +/* Free a previously allocated shared memory handle and block. */
> +int vc_sm_cma_free(void *handle)
> +{
> + struct dma_buf *dma_buf = (struct dma_buf *)handle;
> +
> + /* Validate we can work with this device. */
> + if (!sm_state || !handle) {
> + pr_err("[%s]: invalid input\n", __func__);
> + return -EPERM;

That's a weird error code for an invalid input.

> + }
> +
> + pr_debug("%s: handle %p/dmabuf %p\n", __func__, handle, dma_buf);
> +
> + dma_buf_put(dma_buf);
> +
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(vc_sm_cma_free);
> +
> +/* Import a dmabuf to be shared with VC. */
> +int vc_sm_cma_import_dmabuf(struct dma_buf *src_dmabuf, void **handle)
> +{
> + struct dma_buf *new_dma_buf;
> + int ret;
> +
> + /* Validate we can work with this device. */
> + if (!sm_state || !src_dmabuf || !handle) {
> + pr_err("[%s]: invalid input\n", __func__);
> + return -EPERM;

Same here.

> + }
> +
> + ret = vc_sm_cma_import_dmabuf_internal(sm_state, src_dmabuf,
> + -1, &new_dma_buf);
> +
> + if (!ret) {
> + pr_debug("%s: imported to ptr %p\n", __func__, new_dma_buf);
> +
> + /* Assign valid handle at this time.*/
> + *handle = new_dma_buf;
> + } else {
> + /*
> + * succeeded in importing the dma_buf, but then
> + * failed to look it up again. How?
> + * Release the fd again.
> + */
> + pr_err("%s: imported vc_sm_cma_get_buffer failed %d\n",
> + __func__, ret);
> + }
> +
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(vc_sm_cma_import_dmabuf);

Those three functions are a bit ill-placed between the probe/remove
functions and the platform_driver. I'd move them just above
vc_sm_vpu_event(), and order them as mentioned below in the header file.

> +
> +static struct platform_driver bcm2835_vcsm_cma_driver = {
> + .probe = bcm2835_vc_sm_cma_probe,
> + .remove = bcm2835_vc_sm_cma_remove,
> + .driver = {
> + .name = "vcsm-cma",
> + .owner = THIS_MODULE,
> + },

Wrong indentation, should be

.driver = {
.name = "vcsm-cma",
.owner = THIS_MODULE,
},

> +};
> +
> +module_platform_driver(bcm2835_vcsm_cma_driver);
> +
> +MODULE_AUTHOR("Dave Stevenson");
> +MODULE_DESCRIPTION("VideoCore CMA Shared Memory Driver");
> +MODULE_LICENSE("GPL");
> +MODULE_ALIAS("platform:vcsm-cma");
> diff --git a/drivers/staging/vc04_services/vc-sm-cma/vc_sm.h b/drivers/staging/vc04_services/vc-sm-cma/vc_sm.h
> new file mode 100644
> index 000000000000..61e110ec414d
> --- /dev/null
> +++ b/drivers/staging/vc04_services/vc-sm-cma/vc_sm.h
> @@ -0,0 +1,54 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +/*
> + * VideoCore Shared Memory driver using CMA.
> + *
> + * Copyright: 2018, Raspberry Pi (Trading) Ltd
> + *
> + */
> +
> +#ifndef VC_SM_H
> +#define VC_SM_H
> +
> +#define VC_SM_MAX_NAME_LEN 32
> +
> +enum vc_sm_vpu_mapping_state {
> + VPU_NOT_MAPPED,
> + VPU_MAPPED,
> + VPU_UNMAPPING
> +};
> +
> +struct vc_sm_buffer {
> + struct list_head global_buffer_list; /* Global list of buffers. */
> +
> + /* Index in the kernel_id idr so that we can find the
> + * mmal_msg_context again when servicing the VCHI reply.
> + */
> + int kernel_id;
> +
> + size_t size;
> +
> + /* Lock over all the following state for this buffer */
> + struct mutex lock;
> + struct list_head attachments;
> +
> + char name[VC_SM_MAX_NAME_LEN];
> +
> + int in_use:1; /* Kernel is still using this resource */
> +
> + enum vc_sm_vpu_mapping_state vpu_state;
> + u32 vc_handle; /* VideoCore handle for this buffer */
> +
> + /* DMABUF related fields */
> + struct dma_buf *dma_buf;
> + dma_addr_t dma_addr;
> + void *cookie;
> +
> + struct vc_sm_privdata_t *private;
> +
> + struct dma_buf *imported_dma_buf;
> + struct dma_buf_attachment *attach;
> + struct sg_table *sgt;
> +};
> +
> +#endif
> diff --git a/drivers/staging/vc04_services/vc-sm-cma/vc_sm_cma_vchi.c b/drivers/staging/vc04_services/vc-sm-cma/vc_sm_cma_vchi.c
> new file mode 100644
> index 000000000000..c77ef0998a31
> --- /dev/null
> +++ b/drivers/staging/vc04_services/vc-sm-cma/vc_sm_cma_vchi.c
> @@ -0,0 +1,507 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * VideoCore Shared Memory CMA allocator
> + *
> + * Copyright: 2018, Raspberry Pi (Trading) Ltd
> + * Copyright 2011-2012 Broadcom Corporation. All rights reserved.
> + *
> + * Based on vmcs_sm driver from Broadcom Corporation.
> + *
> + */
> +
> +/* ---- Include Files ----------------------------------------------------- */
> +#include <linux/kthread.h>
> +#include <linux/list.h>
> +#include <linux/semaphore.h>
> +#include <linux/slab.h>
> +#include <linux/types.h>
> +
> +#include "vc_sm_cma_vchi.h"
> +
> +#define VC_SM_VER 1
> +#define VC_SM_MIN_VER 0
> +
> +/* ---- Private Constants and Types -------------------------------------- */
> +
> +/* Command blocks come from a pool */
> +#define SM_MAX_NUM_CMD_RSP_BLKS 32
> +
> +/* The number of supported connections */
> +#define SM_MAX_NUM_CONNECTIONS 3
> +
> +struct sm_cmd_rsp_blk {
> + struct list_head head; /* To create lists */
> + /* To be signaled when the response is there */
> + struct completion cmplt;
> +
> + u32 id;
> + u16 length;
> +
> + u8 msg[VC_SM_MAX_MSG_LEN];
> +
> + uint32_t wait:1;
> + uint32_t sent:1;
> + uint32_t alloc:1;
> +
> +};
> +
> +struct sm_instance {
> + u32 num_connections;
> + unsigned int service_handle[SM_MAX_NUM_CONNECTIONS];
> + struct task_struct *io_thread;
> + struct completion io_cmplt;
> +
> + vpu_event_cb vpu_event;
> +
> + /* Mutex over the following lists */
> + struct mutex lock;
> + u32 trans_id;
> + struct list_head cmd_list;
> + struct list_head rsp_list;
> + struct list_head dead_list;
> +
> + struct sm_cmd_rsp_blk free_blk[SM_MAX_NUM_CMD_RSP_BLKS];
> +
> + /* Mutex over the free_list */
> + struct mutex free_lock;
> + struct list_head free_list;
> +
> + struct semaphore free_sema;
> + struct vchiq_instance *vchiq_instance;
> +};
> +
> +/* ---- Private Variables ------------------------------------------------ */
> +
> +/* ---- Private Function Prototypes -------------------------------------- */
> +
> +/* ---- Private Functions ------------------------------------------------ */
> +static int
> +bcm2835_vchi_msg_queue(struct vchiq_instance *vchiq_instance, unsigned int handle,
> + void *data,
> + unsigned int size)
> +{
> + return vchiq_queue_kernel_message(vchiq_instance, handle, data, size);
> +}
> +
> +static struct
> +sm_cmd_rsp_blk *vc_vchi_cmd_create(struct sm_instance *instance,
> + enum vc_sm_msg_type id, void *msg,
> + u32 size, int wait)
> +{
> + struct sm_cmd_rsp_blk *blk;
> + struct vc_sm_msg_hdr_t *hdr;
> +
> + if (down_interruptible(&instance->free_sema)) {
> + blk = kmalloc(sizeof(*blk), GFP_KERNEL);
> + if (!blk)
> + return NULL;
> +
> + blk->alloc = 1;
> + init_completion(&blk->cmplt);
> + } else {
> + mutex_lock(&instance->free_lock);
> + blk =
> + list_first_entry(&instance->free_list,
> + struct sm_cmd_rsp_blk, head);
> + list_del(&blk->head);
> + mutex_unlock(&instance->free_lock);
> + }
> +
> + blk->sent = 0;
> + blk->wait = wait;
> + blk->length = sizeof(*hdr) + size;
> +
> + hdr = (struct vc_sm_msg_hdr_t *)blk->msg;
> + hdr->type = id;
> + mutex_lock(&instance->lock);
> + instance->trans_id++;
> + /*
> + * Retain the top bit for identifying asynchronous events, or VPU cmds.
> + */
> + instance->trans_id &= ~0x80000000;
> + hdr->trans_id = instance->trans_id;
> + blk->id = instance->trans_id;
> + mutex_unlock(&instance->lock);
> +
> + if (size)
> + memcpy(hdr->body, msg, size);
> +
> + return blk;
> +}
> +
> +static void
> +vc_vchi_cmd_delete(struct sm_instance *instance, struct sm_cmd_rsp_blk *blk)
> +{
> + if (blk->alloc) {
> + kfree(blk);
> + return;
> + }
> +
> + mutex_lock(&instance->free_lock);
> + list_add(&blk->head, &instance->free_list);
> + mutex_unlock(&instance->free_lock);
> + up(&instance->free_sema);
> +}
> +
> +static void vc_sm_cma_vchi_rx_ack(struct sm_instance *instance,
> + struct sm_cmd_rsp_blk *cmd,
> + struct vc_sm_result_t *reply,
> + u32 reply_len)
> +{
> + mutex_lock(&instance->lock);
> + list_for_each_entry(cmd,
> + &instance->rsp_list,
> + head) {
> + if (cmd->id == reply->trans_id)
> + break;
> + }
> + mutex_unlock(&instance->lock);
> +
> + if (&cmd->head == &instance->rsp_list) {
> + //pr_debug("%s: received response %u, throw away...",
> + pr_err("%s: received response %u, throw away...",
> + __func__,
> + reply->trans_id);
> + } else if (reply_len > sizeof(cmd->msg)) {
> + pr_err("%s: reply too big (%u) %u, throw away...",
> + __func__, reply_len,
> + reply->trans_id);
> + } else {
> + memcpy(cmd->msg, reply,
> + reply_len);
> + complete(&cmd->cmplt);
> + }
> +}
> +
> +static int vc_sm_cma_vchi_videocore_io(void *arg)
> +{
> + struct sm_instance *instance = arg;
> + struct sm_cmd_rsp_blk *cmd = NULL, *cmd_tmp;
> + struct vc_sm_result_t *reply;
> + struct vchiq_header *header;
> + s32 status;
> + int svc_use = 1;
> +
> + while (1) {
> + if (svc_use)
> + vchiq_release_service(instance->vchiq_instance,
> + instance->service_handle[0]);
> + svc_use = 0;
> +
> + if (wait_for_completion_interruptible(&instance->io_cmplt))
> + continue;

Isn't it a bit overkill to use a thread? Wouldn't a workqueue do?

> + vchiq_use_service(instance->vchiq_instance, instance->service_handle[0]);
> + svc_use = 1;
> +
> + do {
> + /*
> + * Get new command and move it to response list
> + */
> + mutex_lock(&instance->lock);
> + if (list_empty(&instance->cmd_list)) {
> + /* no more commands to process */
> + mutex_unlock(&instance->lock);
> + break;
> + }
> + cmd = list_first_entry(&instance->cmd_list,
> + struct sm_cmd_rsp_blk, head);
> + list_move(&cmd->head, &instance->rsp_list);
> + cmd->sent = 1;
> + mutex_unlock(&instance->lock);
> + /* Send the command */
> + status =
> + bcm2835_vchi_msg_queue(instance->vchiq_instance,
> + instance->service_handle[0],
> + cmd->msg, cmd->length);
> + if (status) {
> + pr_err("%s: failed to queue message (%d)",
> + __func__, status);
> + }
> +
> + /* If no reply is needed then we're done */
> + if (!cmd->wait) {
> + mutex_lock(&instance->lock);
> + list_del(&cmd->head);
> + mutex_unlock(&instance->lock);
> + vc_vchi_cmd_delete(instance, cmd);
> + continue;
> + }
> +
> + if (status) {
> + complete(&cmd->cmplt);
> + continue;
> + }
> +
> + } while (1);
> +
> + while ((header = vchiq_msg_hold(instance->vchiq_instance,
> + instance->service_handle[0]))) {
> + reply = (struct vc_sm_result_t *)header->data;
> + if (reply->trans_id & 0x80000000) {
> + /* Async event or cmd from the VPU */
> + if (instance->vpu_event)
> + instance->vpu_event(instance, reply,
> + header->size);
> + } else {
> + vc_sm_cma_vchi_rx_ack(instance, cmd, reply,
> + header->size);
> + }
> +
> + vchiq_release_message(instance->vchiq_instance,
> + instance->service_handle[0],
> + header);
> + }
> +
> + /* Go through the dead list and free them */
> + mutex_lock(&instance->lock);
> + list_for_each_entry_safe(cmd, cmd_tmp, &instance->dead_list,
> + head) {
> + list_del(&cmd->head);
> + vc_vchi_cmd_delete(instance, cmd);
> + }
> + mutex_unlock(&instance->lock);
> + }
> +
> + return 0;
> +}
> +
> +static enum vchiq_status vc_sm_cma_vchi_callback(struct vchiq_instance *vchiq_instance,
> + enum vchiq_reason reason,
> + struct vchiq_header *header,
> + unsigned int handle, void *userdata)
> +{
> + struct sm_instance *instance = vchiq_get_service_userdata(vchiq_instance, handle);
> +
> + switch (reason) {
> + case VCHIQ_MESSAGE_AVAILABLE:
> + vchiq_msg_queue_push(vchiq_instance, handle, header);
> + complete(&instance->io_cmplt);
> + break;
> +
> + case VCHIQ_SERVICE_CLOSED:
> + pr_info("%s: service CLOSED!!", __func__);
> + default:
> + break;
> + }
> +
> + return VCHIQ_SUCCESS;
> +}
> +
> +struct sm_instance *vc_sm_cma_vchi_init(struct vchiq_instance *vchiq_instance,
> + unsigned int num_connections,
> + vpu_event_cb vpu_event)
> +{
> + u32 i;
> + struct sm_instance *instance;
> + int status;
> +
> + pr_debug("%s: start", __func__);
> +
> + if (num_connections > SM_MAX_NUM_CONNECTIONS) {
> + pr_err("%s: unsupported number of connections %u (max=%u)",
> + __func__, num_connections, SM_MAX_NUM_CONNECTIONS);
> +
> + goto err_null;
> + }
> + /* Allocate memory for this instance */
> + instance = kzalloc(sizeof(*instance), GFP_KERNEL);
> +
> + /* Misc initialisations */
> + mutex_init(&instance->lock);
> + init_completion(&instance->io_cmplt);
> + INIT_LIST_HEAD(&instance->cmd_list);
> + INIT_LIST_HEAD(&instance->rsp_list);
> + INIT_LIST_HEAD(&instance->dead_list);
> + INIT_LIST_HEAD(&instance->free_list);
> + sema_init(&instance->free_sema, SM_MAX_NUM_CMD_RSP_BLKS);
> + mutex_init(&instance->free_lock);
> + for (i = 0; i < SM_MAX_NUM_CMD_RSP_BLKS; i++) {
> + init_completion(&instance->free_blk[i].cmplt);
> + list_add(&instance->free_blk[i].head, &instance->free_list);
> + }
> +
> + instance->vchiq_instance = vchiq_instance;
> +
> + /* Open the VCHI service connections */
> + instance->num_connections = num_connections;
> + for (i = 0; i < num_connections; i++) {
> + struct vchiq_service_params_kernel params = {
> + .version = VC_SM_VER,
> + .version_min = VC_SM_MIN_VER,
> + .fourcc = VCHIQ_MAKE_FOURCC('S', 'M', 'E', 'M'),
> + .callback = vc_sm_cma_vchi_callback,
> + .userdata = instance,
> + };
> +
> + status = vchiq_open_service(vchiq_instance, &params,
> + &instance->service_handle[i]);
> + if (status) {
> + pr_err("%s: failed to open VCHI service (%d)",
> + __func__, status);
> +
> + goto err_close_services;
> + }
> + }
> + /* Create the thread which takes care of all io to/from videoocore. */
> + instance->io_thread = kthread_create(&vc_sm_cma_vchi_videocore_io,
> + (void *)instance, "SMIO");
> + if (!instance->io_thread) {
> + pr_err("%s: failed to create SMIO thread", __func__);
> +
> + goto err_close_services;
> + }
> + instance->vpu_event = vpu_event;
> + set_user_nice(instance->io_thread, -10);
> + wake_up_process(instance->io_thread);
> +
> + pr_debug("%s: success - instance %p", __func__, instance);
> + return instance;
> +
> +err_close_services:
> + for (i = 0; i < instance->num_connections; i++) {
> + if (instance->service_handle[i])
> + vchiq_close_service(vchiq_instance, instance->service_handle[i]);
> + }
> + kfree(instance);
> +err_null:
> + pr_debug("%s: FAILED", __func__);
> + return NULL;
> +}
> +
> +int vc_sm_cma_vchi_stop(struct vchiq_instance *vchiq_instance, struct sm_instance **handle)
> +{
> + struct sm_instance *instance;
> + u32 i;
> +
> + if (!handle) {
> + pr_err("%s: invalid pointer to handle %p", __func__, handle);
> + goto lock;
> + }
> +
> + if (!*handle) {
> + pr_err("%s: invalid handle %p", __func__, *handle);
> + goto lock;
> + }
> +
> + instance = *handle;
> +
> + /* Close all VCHI service connections */
> + for (i = 0; i < instance->num_connections; i++) {
> + vchiq_use_service(vchiq_instance, instance->service_handle[i]);
> + vchiq_close_service(vchiq_instance, instance->service_handle[i]);
> + }
> +
> + kfree(instance);
> +
> + *handle = NULL;
> + return 0;
> +
> +lock:
> + return -EINVAL;
> +}
> +
> +static int vc_sm_cma_vchi_send_msg(struct sm_instance *handle,
> + enum vc_sm_msg_type msg_id, void *msg,
> + u32 msg_size, void *result, u32 result_size,
> + u32 *cur_trans_id, u8 wait_reply)
> +{
> + int status = 0;
> + struct sm_instance *instance = handle;
> + struct sm_cmd_rsp_blk *cmd_blk;
> +
> + if (!handle) {
> + pr_err("%s: invalid handle", __func__);
> + return -EINVAL;
> + }
> + if (!msg) {
> + pr_err("%s: invalid msg pointer", __func__);
> + return -EINVAL;
> + }
> +
> + cmd_blk =
> + vc_vchi_cmd_create(instance, msg_id, msg, msg_size, wait_reply);
> + if (!cmd_blk) {
> + pr_err("[%s]: failed to allocate global tracking resource",
> + __func__);
> + return -ENOMEM;
> + }
> +
> + if (cur_trans_id)
> + *cur_trans_id = cmd_blk->id;
> +
> + mutex_lock(&instance->lock);
> + list_add_tail(&cmd_blk->head, &instance->cmd_list);
> + mutex_unlock(&instance->lock);
> + complete(&instance->io_cmplt);
> +
> + if (!wait_reply)
> + /* We're done */
> + return 0;
> +
> + /* Wait for the response */
> + if (wait_for_completion_interruptible(&cmd_blk->cmplt)) {
> + mutex_lock(&instance->lock);
> + if (!cmd_blk->sent) {
> + list_del(&cmd_blk->head);
> + mutex_unlock(&instance->lock);
> + vc_vchi_cmd_delete(instance, cmd_blk);
> + return -ENXIO;
> + }
> +
> + list_move(&cmd_blk->head, &instance->dead_list);
> + mutex_unlock(&instance->lock);
> + complete(&instance->io_cmplt);
> + return -EINTR; /* We're done */
> + }
> +
> + if (result && result_size) {
> + memcpy(result, cmd_blk->msg, result_size);
> + } else {
> + struct vc_sm_result_t *res =
> + (struct vc_sm_result_t *)cmd_blk->msg;
> + status = (res->success == 0) ? 0 : -ENXIO;
> + }
> +
> + mutex_lock(&instance->lock);
> + list_del(&cmd_blk->head);
> + mutex_unlock(&instance->lock);
> + vc_vchi_cmd_delete(instance, cmd_blk);
> + return status;
> +}
> +
> +int vc_sm_cma_vchi_free(struct sm_instance *handle, struct vc_sm_free_t *msg,
> + u32 *cur_trans_id)
> +{
> + return vc_sm_cma_vchi_send_msg(handle, VC_SM_MSG_TYPE_FREE,
> + msg, sizeof(*msg), 0, 0, cur_trans_id, 0);
> +}
> +
> +int vc_sm_cma_vchi_import(struct sm_instance *handle, struct vc_sm_import *msg,
> + struct vc_sm_import_result *result, u32 *cur_trans_id)
> +{
> + return vc_sm_cma_vchi_send_msg(handle, VC_SM_MSG_TYPE_IMPORT,
> + msg, sizeof(*msg), result, sizeof(*result),
> + cur_trans_id, 1);
> +}
> +
> +int vc_sm_cma_vchi_client_version(struct sm_instance *handle,
> + struct vc_sm_version *msg,
> + struct vc_sm_result_t *result,
> + u32 *cur_trans_id)
> +{
> + return vc_sm_cma_vchi_send_msg(handle, VC_SM_MSG_TYPE_CLIENT_VERSION,
> + //msg, sizeof(*msg), result, sizeof(*result),
> + //cur_trans_id, 1);
> + msg, sizeof(*msg), NULL, 0,
> + cur_trans_id, 0);
> +}
> +
> +int vc_sm_vchi_client_vc_mem_req_reply(struct sm_instance *handle,
> + struct vc_sm_vc_mem_request_result *msg,
> + uint32_t *cur_trans_id)
> +{
> + return vc_sm_cma_vchi_send_msg(handle,
> + VC_SM_MSG_TYPE_VC_MEM_REQUEST_REPLY,
> + msg, sizeof(*msg), 0, 0, cur_trans_id,
> + 0);
> +}
> diff --git a/drivers/staging/vc04_services/vc-sm-cma/vc_sm_cma_vchi.h b/drivers/staging/vc04_services/vc-sm-cma/vc_sm_cma_vchi.h
> new file mode 100644
> index 000000000000..a4f40d4cef05
> --- /dev/null
> +++ b/drivers/staging/vc04_services/vc-sm-cma/vc_sm_cma_vchi.h
> @@ -0,0 +1,63 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +/*
> + * VideoCore Shared Memory CMA allocator
> + *
> + * Copyright: 2018, Raspberry Pi (Trading) Ltd
> + * Copyright 2011-2012 Broadcom Corporation. All rights reserved.
> + *
> + * Based on vmcs_sm driver from Broadcom Corporation.
> + *
> + */
> +
> +#ifndef __VC_SM_CMA_VCHI_H__INCLUDED__


This is quite inconsistent with the existing coding style of
vc04_services and of the kernel as a whole. I'd go for VC_SM_CMA_VCHI_H,
or if preferred, __VC_SM_CMA_VCHI_H or __VC_SM_CMA_VCHI_H__.

> +#define __VC_SM_CMA_VCHI_H__INCLUDED__
> +
> +#include <linux/raspberrypi/vchiq.h>
> +
> +#include "vc_sm_defs.h"
> +
> +/*
> + * Forward declare.
> + */

Drop this.

> +struct sm_instance;
> +
> +typedef void (*vpu_event_cb)(struct sm_instance *instance,
> + struct vc_sm_result_t *reply, int reply_len);
> +
> +/*
> + * Initialize the shared memory service, opens up vchi connection to talk to it.
> + */
> +struct sm_instance *vc_sm_cma_vchi_init(struct vchiq_instance *vchi_instance,
> + unsigned int num_connections,
> + vpu_event_cb vpu_event);
> +
> +/*
> + * Terminates the shared memory service.
> + */
> +int vc_sm_cma_vchi_stop(struct vchiq_instance *vchi_instance, struct sm_instance **handle);
> +
> +/*
> + * Ask the shared memory service to free up some memory that was previously
> + * allocated by the vc_sm_cma_vchi_alloc function call.
> + */
> +int vc_sm_cma_vchi_free(struct sm_instance *handle, struct vc_sm_free_t *msg,
> + u32 *cur_trans_id);
> +
> +/*
> + * Import a contiguous block of memory and wrap it in a GPU MEM_HANDLE_T.
> + */
> +int vc_sm_cma_vchi_import(struct sm_instance *handle, struct vc_sm_import *msg,
> + struct vc_sm_import_result *result,
> + u32 *cur_trans_id);
> +
> +int vc_sm_cma_vchi_client_version(struct sm_instance *handle,
> + struct vc_sm_version *msg,
> + struct vc_sm_result_t *result,
> + u32 *cur_trans_id);
> +
> +int vc_sm_vchi_client_vc_mem_req_reply(struct sm_instance *handle,
> + struct vc_sm_vc_mem_request_result *msg,
> + uint32_t *cur_trans_id);
> +
> +#endif /* __VC_SM_CMA_VCHI_H__INCLUDED__ */
> diff --git a/drivers/staging/vc04_services/vc-sm-cma/vc_sm_defs.h b/drivers/staging/vc04_services/vc-sm-cma/vc_sm_defs.h
> new file mode 100644
> index 000000000000..ad4a3ec194d3
> --- /dev/null
> +++ b/drivers/staging/vc04_services/vc-sm-cma/vc_sm_defs.h
> @@ -0,0 +1,187 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +/*
> + * VideoCore Shared Memory CMA allocator
> + *
> + * Copyright: 2018, Raspberry Pi (Trading) Ltd
> + *
> + * Based on vc_sm_defs.h from the vmcs_sm driver Copyright Broadcom Corporation.
> + * All IPC messages are copied across to this file, even if the vc-sm-cma
> + * driver is not currently using them.
> + *
> + ****************************************************************************
> + */
> +
> +#ifndef __VC_SM_DEFS_H__INCLUDED__
> +#define __VC_SM_DEFS_H__INCLUDED__
> +
> +/* Maximum message length */
> +#define VC_SM_MAX_MSG_LEN (sizeof(union vc_sm_msg_union_t) + \
> + sizeof(struct vc_sm_msg_hdr_t))
> +#define VC_SM_MAX_RSP_LEN (sizeof(union vc_sm_msg_union_t))
> +
> +/* Resource name maximum size */
> +#define VC_SM_RESOURCE_NAME 32
> +
> +/*
> + * Version to be reported to the VPU
> + * VPU assumes 0 (aka 1) which does not require the released callback, nor
> + * expect the client to handle VC_MEM_REQUESTS.
> + * Version 2 requires the released callback, and must support VC_MEM_REQUESTS.
> + */
> +#define VC_SM_PROTOCOL_VERSION 2
> +
> +enum vc_sm_msg_type {
> + /* Message types supported for HOST->VC direction */
> +
> + /* Allocate shared memory block */
> + VC_SM_MSG_TYPE_ALLOC,
> + /* Lock allocated shared memory block */
> + VC_SM_MSG_TYPE_LOCK,
> + /* Unlock allocated shared memory block */
> + VC_SM_MSG_TYPE_UNLOCK,
> + /* Unlock allocated shared memory block, do not answer command */
> + VC_SM_MSG_TYPE_UNLOCK_NOANS,
> + /* Free shared memory block */
> + VC_SM_MSG_TYPE_FREE,
> + /* Resize a shared memory block */
> + VC_SM_MSG_TYPE_RESIZE,
> + /* Walk the allocated shared memory block(s) */
> + VC_SM_MSG_TYPE_WALK_ALLOC,
> +
> + /* A previously applied action will need to be reverted */
> + VC_SM_MSG_TYPE_ACTION_CLEAN,
> +
> + /*
> + * Import a physical address and wrap into a MEM_HANDLE_T.
> + * Release with VC_SM_MSG_TYPE_FREE.
> + */
> + VC_SM_MSG_TYPE_IMPORT,
> + /*
> + *Tells VC the protocol version supported by this client.
> + * 2 supports the async/cmd messages from the VPU for final release
> + * of memory, and for VC allocations.
> + */
> + VC_SM_MSG_TYPE_CLIENT_VERSION,
> + /* Response to VC request for memory */
> + VC_SM_MSG_TYPE_VC_MEM_REQUEST_REPLY,
> +
> + /*
> + * Asynchronous/cmd messages supported for VC->HOST direction.
> + * Signalled by setting the top bit in vc_sm_result_t trans_id.
> + */
> +
> + /*
> + * VC has finished with an imported memory allocation.
> + * Release any Linux reference counts on the underlying block.
> + */
> + VC_SM_MSG_TYPE_RELEASED,
> + /* VC request for memory */
> + VC_SM_MSG_TYPE_VC_MEM_REQUEST,
> +
> + VC_SM_MSG_TYPE_MAX
> +};
> +
> +/* Type of memory to be allocated */
> +enum vc_sm_alloc_type_t {
> + VC_SM_ALLOC_CACHED,
> + VC_SM_ALLOC_NON_CACHED,
> +};
> +
> +/* Message header for all messages in HOST->VC direction */
> +struct vc_sm_msg_hdr_t {
> + u32 type;
> + u32 trans_id;
> + u8 body[0];
> +
> +};
> +
> +/* Request to free a previously allocated memory (HOST->VC) */
> +struct vc_sm_free_t {
> + /* Resource handle (returned from alloc) */
> + u32 res_handle;
> + /* Resource buffer (returned from alloc) */
> + u32 res_mem;
> +
> +};
> +
> +/* Generic result for a request (VC->HOST) */
> +struct vc_sm_result_t {
> + /* Transaction identifier */
> + u32 trans_id;
> +
> + s32 success;
> +
> +};
> +
> +/* Request to import memory (HOST->VC) */
> +struct vc_sm_import {
> + /* type of memory to allocate */
> + enum vc_sm_alloc_type_t type;
> + /* pointer to the VC (ie physical) address of the allocated memory */
> + u32 addr;
> + /* size of buffer */
> + u32 size;
> + /* opaque handle returned in RELEASED messages */
> + u32 kernel_id;
> + /* Allocator identifier */
> + u32 allocator;
> + /* resource name (for easier tracking on vc side) */
> + char name[VC_SM_RESOURCE_NAME];
> +};
> +
> +/* Result of a requested memory import (VC->HOST) */
> +struct vc_sm_import_result {
> + /* Transaction identifier */
> + u32 trans_id;
> +
> + /* Resource handle */
> + u32 res_handle;
> +};
> +
> +/* Notification that VC has finished with an allocation (VC->HOST) */
> +struct vc_sm_released {
> + /* cmd type / trans_id */
> + u32 cmd;
> +
> + /* pointer to the VC (ie physical) address of the allocated memory */
> + u32 addr;
> + /* size of buffer */
> + u32 size;
> + /* opaque handle returned in RELEASED messages */
> + u32 kernel_id;
> + u32 vc_handle;
> +};
> +
> +/*
> + * Client informing VC as to the protocol version it supports.
> + * >=2 requires the released callback, and supports VC asking for memory.
> + * Failure means that the firmware doesn't support this call, and therefore the
> + * client should either fail, or NOT rely on getting the released callback.
> + */
> +struct vc_sm_version {
> + u32 version;
> +};
> +
> +/* Response from the kernel to provide the VPU with some memory */
> +struct vc_sm_vc_mem_request_result {
> + /* Transaction identifier for the VPU */
> + u32 trans_id;
> + /* pointer to the physical address of the allocated memory */
> + u32 addr;
> + /* opaque handle returned in RELEASED messages */
> + u32 kernel_id;
> +};
> +
> +/* Union of ALL messages */
> +union vc_sm_msg_union_t {
> + struct vc_sm_free_t free;
> + struct vc_sm_result_t result;
> + struct vc_sm_import import;
> + struct vc_sm_import_result import_result;
> + struct vc_sm_version version;
> + struct vc_sm_released released;
> + struct vc_sm_vc_mem_request_result vc_request_result;
> +};
> +
> +#endif /* __VC_SM_DEFS_H__INCLUDED__ */
> diff --git a/drivers/staging/vc04_services/vc-sm-cma/vc_sm_knl.h b/drivers/staging/vc04_services/vc-sm-cma/vc_sm_knl.h
> new file mode 100644
> index 000000000000..988fdd967922
> --- /dev/null
> +++ b/drivers/staging/vc04_services/vc-sm-cma/vc_sm_knl.h
> @@ -0,0 +1,28 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +/*
> + * VideoCore Shared Memory CMA allocator
> + *
> + * Copyright: 2018, Raspberry Pi (Trading) Ltd
> + *
> + * Based on vc_sm_defs.h from the vmcs_sm driver Copyright Broadcom Corporation.
> + *
> + */
> +
> +#ifndef __VC_SM_KNL_H__INCLUDED__
> +#define __VC_SM_KNL_H__INCLUDED__
> +
> +#if !defined(__KERNEL__)
> +#error "This interface is for kernel use only..."
> +#endif

As this header isn't available to userspace, I think you can drop this.

> +
> +/* Free a previously allocated or imported shared memory handle and block. */

kerneldoc would be nice for such an inter-driver API.

> +int vc_sm_cma_free(void *handle);
> +
> +/* Get an internal resource handle mapped from the external one. */
> +int vc_sm_cma_int_handle(void *handle);
> +
> +/* Import a block of memory into the GPU space. */
> +int vc_sm_cma_import_dmabuf(struct dma_buf *dmabuf, void **handle);

I would order these functions in the expected call order: import,
int_handle and free.

> +
> +#endif /* __VC_SM_KNL_H__INCLUDED__ */

--
Regards,

Laurent Pinchart