基于 Linux Kernel v5.4.123 | 源码:
drivers/staging/android/ion/ion.c
ION 分配的每块内存都通过 Linux dma-buf 框架导出为文件描述符(fd),使得多个设备驱动可以零拷贝共享同一块物理内存。本文聚焦 ION 如何实现 dma_buf_ops 的每一个回调,以及背后的设计考量。
Camera 驱动分配 buffer A(自有内存管理器) │ │ memcpy(拷贝 1) ▼ GPU 驱动分配 buffer B(另一套内存管理器) │ │ memcpy(拷贝 2) ▼ Display 驱动分配 buffer C(又一套) │ └── 输出到屏幕 问题: - 每次跨设备传递都需要拷贝,浪费内存带宽 - 每个驱动有自己的分配 API,用户态需要适配多套接口 - 无法跨进程共享(每个驱动的 handle 不通用)
ION 分配一块物理内存 │ └── dma_buf_export() → fd │ ┌─────┼─────────┐ ▼ ▼ ▼ Camera GPU Display attach attach attach map map map │ │ │ └─────┴─────────┘ 共享同一块物理内存,零拷贝 fd 可以通过 Unix socket / Binder 跨进程传递
dma-buf 是 Linux 内核的标准内存共享框架,定义了导出者(exporter)和导入者(importer)之间的契约。ION 作为导出者,必须实现 dma_buf_ops 来告诉框架"如何操作我分配的内存"。
c// ion.c:343
/*
 * dma_buf_ops implemented by ION as the dma-buf exporter (ion.c:343).
 * The dma-buf core invokes these on behalf of importing device drivers;
 * every callback recovers the ion_buffer via dmabuf->priv.
 */
static const struct dma_buf_ops dma_buf_ops = {
.attach = ion_dma_buf_attach,   /* importer registers; dups the sg_table */
.detach = ion_dma_buf_detatch,  /* NOTE: "detatch" misspelling exists in upstream */
.map_dma_buf = ion_map_dma_buf,     /* dma_map_sg on the per-device table */
.unmap_dma_buf = ion_unmap_dma_buf, /* dma_unmap_sg counterpart */
.mmap = ion_mmap,               /* user-space mapping via heap->ops->map_user */
.release = ion_dma_buf_release, /* last file reference gone -> destroy buffer */
.begin_cpu_access = ion_dma_buf_begin_cpu_access, /* kmap + cache sync for CPU */
.end_cpu_access = ion_dma_buf_end_cpu_access,     /* cache sync for device + kunmap */
.map = ion_dma_buf_kmap,     /* per-page kernel vaddr (needs begin_cpu_access first) */
.unmap = ion_dma_buf_kunmap, /* no-op; real unmap happens in end_cpu_access */
};
每个回调在 buffer 生命周期中的位置:
ion_alloc() └── dma_buf_export(&dma_buf_ops) ──── 创建 dma-buf,绑定 ops └── dma_buf_fd() → fd ──────── 返回用户态 设备驱动使用 fd: dma_buf_get(fd) └── .attach ──────────────────── 设备注册,dup sg_table dma_buf_map_attachment() └── .map_dma_buf ────────────────── DMA 地址映射 (设备通过 DMA 访问 buffer) dma_buf_unmap_attachment() └── .unmap_dma_buf ──────────────── 解除 DMA 映射 dma_buf_detach() └── .detach ─────────────────────── 设备注销,释放 dup 的 sg_table CPU 访问: dma_buf_begin_cpu_access() └── .begin_cpu_access ───────────── kmap + cache invalidate (CPU 读写 buffer) dma_buf_end_cpu_access() └── .end_cpu_access ─────────────── cache flush + kunmap dma_buf_mmap() └── .mmap ───────────────────────── 用户态 mmap dma_buf_kmap() └── .map ────────────────────────── 内核按页映射 释放: close(fd) → refcount=0 └── .release ────────────────────── 销毁 ion_buffer
c// ion.c:356 ion_alloc() 中的关键代码
/*
 * Excerpt from ion_alloc() (ion.c:356): after the heap has filled in the
 * buffer, wrap it in a dma-buf and return a file descriptor to user space.
 * Declarations of `buffer`, `dmabuf` and `fd`, plus error handling, are
 * elided in this excerpt.
 */
static int ion_alloc(size_t len, unsigned int heap_id_mask, unsigned int flags)
{
// ... buffer allocation finished ...
DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
exp_info.ops = &dma_buf_ops; // bind ION's ops implementation
exp_info.size = buffer->size;
exp_info.flags = O_RDWR;
exp_info.priv = buffer; // dma-buf priv points back at the ion_buffer
dmabuf = dma_buf_export(&exp_info); // create the dma-buf object
fd = dma_buf_fd(dmabuf, O_CLOEXEC); // create the fd
return fd;
}
exp_info.priv = buffer 是关键 — 后续所有 ops 回调通过 dmabuf->priv 回溯到 ion_buffer:
c// 每个回调的第一行几乎都是:
struct ion_buffer *buffer = dmabuf->priv;
dma-buf 对象 ├── file (内核文件对象,引用计数管理生命周期) ├── ops → dma_buf_ops (ION 实现的回调) ├── size └── priv → ion_buffer ├── sg_table (物理页面描述) ├── heap (来自哪个 heap) └── attachments (设备列表) fd (用户态文件描述符) └── file → dma-buf → ion_buffer → 物理页面 引用计数链: 用户 close(fd) → file refcount-- → 降到 0 时调 dma_buf_ops.release → ion_dma_buf_release() → _ion_buffer_destroy()
ION buffer 的物理页面只有一份,但每个 attach 的设备可能通过不同的 IOMMU 映射,得到不同的 DMA 地址。dma_map_sg() 会将映射结果写入 sg entry 的 dma_address 字段。如果多个设备共用同一个 sg_table,后 map 的设备会覆盖前一个设备的 DMA 地址。
物理页面: page@0x8000_0000 共用 sg_table 的问题: GPU dma_map_sg → sg.dma_address = 0x0010_0000 ✓ Display dma_map_sg → sg.dma_address = 0xFF00_0000 → GPU 之前的 0x0010_0000 被覆盖!GPU DMA 到错误地址! dup sg_table 的方案: GPU sg_table_A → sg.dma_address = 0x0010_0000 ✓ 独立 Display sg_table_B → sg.dma_address = 0xFF00_0000 ✓ 独立 物理页面指针相同(零拷贝),DMA 地址各自维护
c// ion.c:172
/* Per-importer bookkeeping, one instance per dma_buf_attach() call. */
struct ion_dma_buf_attachment {
struct device *dev; // which device attached
struct sg_table *table; // this device's private sg_table copy
struct list_head list; // linked into buffer->attachments
};
// ion.c:178
static int ion_dma_buf_attach(struct dma_buf *dmabuf,
struct dma_buf_attachment *attachment)
{
struct ion_dma_buf_attachment *a;
struct sg_table *table;
struct ion_buffer *buffer = dmabuf->priv;
// 1. 分配 attachment 结构体
a = kzalloc(sizeof(*a), GFP_KERNEL);
// 2. 复制 sg_table(核心)
table = dup_sg_table(buffer->sg_table);
// 3. 初始化并挂到 buffer 的 attachments 链表
a->table = table;
a->dev = attachment->dev;
INIT_LIST_HEAD(&a->list);
attachment->priv = a; // dma-buf 框架的 attachment->priv 指向我们的结构
mutex_lock(&buffer->lock);
list_add(&a->list, &buffer->attachments);
mutex_unlock(&buffer->lock);
}
c// ion.c:140
static struct sg_table *dup_sg_table(struct sg_table *table)
{
struct sg_table *new_table;
struct scatterlist *sg, *new_sg;
new_table = kzalloc(sizeof(*new_table), GFP_KERNEL);
sg_alloc_table(new_table, table->nents, GFP_KERNEL);
new_sg = new_table->sgl;
for_each_sg(table->sgl, sg, table->nents, i) {
memcpy(new_sg, sg, sizeof(*sg)); // 复制 page 指针、length、offset
new_sg->dma_address = 0; // 清零 DMA 地址(等设备自己 map)
new_sg = sg_next(new_sg);
}
return new_table;
}
复制后的状态:
原始 buffer->sg_table dup 出的副本 (GPU 用) ┌───────────────────┐ ┌───────────────────┐ │ sg[0] │ │ sg[0] │ │ page ───────────┼──共享──────▶│ page │ 同一个物理页 │ length = 1MB │ │ length = 1MB │ 值复制 │ dma_addr = ??? │ │ dma_addr = 0 │ 清零 ├───────────────────┤ ├───────────────────┤ │ sg[1] │ │ sg[1] │ │ page ───────────┼──共享──────▶│ page │ │ length = 64KB │ │ length = 64KB │ │ dma_addr = ??? │ │ dma_addr = 0 │ └───────────────────┘ └───────────────────┘
c// ion.c:208
/*
 * .detach callback, counterpart of ion_dma_buf_attach(): unlink this
 * device's attachment and free its private duplicated sg_table.
 * (Function name keeps the "detatch" misspelling present upstream.)
 */
static void ion_dma_buf_detatch(struct dma_buf *dmabuf,
struct dma_buf_attachment *attachment)
{
struct ion_dma_buf_attachment *a = attachment->priv;
struct ion_buffer *buffer = dmabuf->priv;
// remove from the attachments list
mutex_lock(&buffer->lock);
list_del(&a->list);
mutex_unlock(&buffer->lock);
// free the duplicated sg_table
free_duped_table(a->table); // sg_free_table + kfree
kfree(a);
}
c// ion.c:222
/*
 * .map_dma_buf callback: DMA-map the importer's private sg_table.
 * dma_map_sg() fills sg->dma_address for this specific device (an IOVA
 * if the device sits behind an IOMMU, otherwise the physical address).
 */
static struct sg_table *ion_map_dma_buf(struct dma_buf_attachment *attachment,
enum dma_data_direction direction)
{
struct ion_dma_buf_attachment *a = attachment->priv;
struct sg_table *table = a->table; // this device's private duplicated sg_table
if (!dma_map_sg(attachment->dev, table->sgl, table->nents, direction))
return ERR_PTR(-ENOMEM);
return table;
}
dma_map_sg() 做了什么:
对每个 sg entry: 1. 如果设备有 IOMMU: - 在 IOMMU 页表中建立映射:IOVA → 物理地址 - sg->dma_address = IOVA(IOMMU 虚拟地址) 2. 如果设备无 IOMMU(直通): - sg->dma_address = 物理地址 3. 如果需要 bounce buffer(DMA 地址范围受限): - 分配 bounce buffer,sg->dma_address 指向 bounce buffer 设备后续通过 sg->dma_address 发起 DMA 传输
c// ion.c:237
/*
 * .unmap_dma_buf callback: undo ion_map_dma_buf() — tears down the
 * device's DMA mapping (IOMMU entries, bounce buffers if any).
 */
static void ion_unmap_dma_buf(struct dma_buf_attachment *attachment,
struct sg_table *table,
enum dma_data_direction direction)
{
dma_unmap_sg(attachment->dev, table->sgl, table->nents, direction);
}
解除 IOMMU 映射,释放 bounce buffer(如果有)。
enum dma_data_direction: DMA_TO_DEVICE → 设备读(CPU 写完,设备来读) DMA_FROM_DEVICE → 设备写(设备写完,CPU 来读) DMA_BIDIRECTIONAL → 双向 DMA_NONE → 无 DMA(纯 CPU 访问)
direction 影响 cache 同步策略:
- DMA_TO_DEVICE:map 时 flush CPU cache → 设备能读到最新数据
- DMA_FROM_DEVICE:unmap 时 invalidate CPU cache → CPU 能读到设备写入的数据
- DMA_BIDIRECTIONAL:两者都做

// ion.c:244
/*
 * .mmap callback (ion.c:244): map the buffer into a user process.
 * Uncached buffers are mapped write-combine so CPU stores bypass the
 * cache and no cache maintenance is needed around device DMA; the
 * actual page-table setup is delegated to the heap's map_user op.
 *
 * Returns 0 on success or the heap's negative error code.
 */
static int ion_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma)
{
	struct ion_buffer *buffer = dmabuf->priv;
	int ret;

	/* 1. Non-cached buffers get a write-combine mapping. */
	if (!(buffer->flags & ION_FLAG_CACHED))
		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);

	/* 2. Delegate to the heap's map_user implementation. */
	mutex_lock(&buffer->lock);
	ret = buffer->heap->ops->map_user(buffer->heap, buffer, vma);
	mutex_unlock(&buffer->lock);

	if (ret)
		pr_err("%s: failure mapping buffer to userspace\n", __func__);

	return ret;
}
| buffer flags | 页保护 | 含义 | 适用场景 |
|---|---|---|---|
ION_FLAG_CACHED | PAGE_SHARED(默认) | CPU cache 正常工作 | CPU 频繁读写 |
| 无 CACHED | pgprot_writecombine | 跳过 cache,写合并 | 设备 DMA 为主 |
c// ion_heap.c:64
/*
 * Generic map_user implementation (ion_heap.c:64): walk the sg_table and
 * remap each entry's physically-contiguous page run into the vma, so the
 * user sees one contiguous virtual range over scattered physical pages.
 * Honours vma->vm_pgoff by skipping (or partially skipping) leading
 * entries. Returns 0 on success or remap_pfn_range()'s error code.
 */
int ion_heap_map_user(struct ion_heap *heap, struct ion_buffer *buffer,
		      struct vm_area_struct *vma)
{
	struct sg_table *table = buffer->sg_table;
	unsigned long addr = vma->vm_start;
	unsigned long offset = vma->vm_pgoff * PAGE_SIZE; /* mmap offset support */
	struct scatterlist *sg;
	int i;
	int ret;

	for_each_sg(table->sgl, sg, table->nents, i) {
		struct page *page = sg_page(sg);
		unsigned long remainder = vma->vm_end - addr;
		unsigned long len = sg->length;

		/* Handle the offset: skip sg entries entirely before it,
		 * or start part-way into the first overlapping entry.
		 */
		if (offset >= sg->length) {
			offset -= sg->length;
			continue;
		} else if (offset) {
			page += offset / PAGE_SIZE;
			len = sg->length - offset;
			offset = 0;
		}
		len = min(len, remainder);
		/* Establish PTEs: user virtual addresses -> physical frames. */
		ret = remap_pfn_range(vma, addr, page_to_pfn(page), len,
				      vma->vm_page_prot);
		if (ret)
			return ret;
		addr += len;
		if (addr >= vma->vm_end)
			return 0;
	}
	return 0;
}
逐 sg entry 映射,将散布的物理页面映射到用户进程的连续虚拟地址空间:
用户虚拟地址空间 物理内存 ┌──────────────────┐ │ vma->vm_start │───── remap_pfn_range ────→ sg[0].page (1MB) │ + 1MB │───── remap_pfn_range ────→ sg[1].page (64KB) │ + 1MB + 64KB │───── remap_pfn_range ────→ sg[2].page (4KB) │ ... │ (可能不连续) │ vma->vm_end │ └──────────────────┘ 用户看到连续地址,底层物理页面可能散布各处
这是 dma-buf 集成中最精妙也最容易出错的部分。
ARM SoC 典型架构: CPU ←──── L1/L2 Cache ────→ 物理内存 ←──── DMA ────→ 设备 │ 两条路径访问同一块内存 cache 中的数据可能与内存不同步
三种不一致场景:
场景 1: CPU 写入后设备读取 CPU 写 0xAA → 数据在 cache 中(dirty) 设备 DMA 读 → 读到物理内存中的旧数据 0x00 解决: flush cache → 将 0xAA 写回物理内存 场景 2: 设备写入后 CPU 读取 设备 DMA 写 0xBB → 数据在物理内存中 CPU 读 → 读到 cache 中的旧数据 0x00 解决: invalidate cache → 丢弃 cache,强制从物理内存重新加载 场景 3: 双向 需要 flush + invalidate
c// ion.c:289
/*
 * .begin_cpu_access callback (ion.c:289): prepare the buffer for CPU
 * access. Takes a refcounted kernel mapping (if the heap supports one)
 * and then cache-syncs for the CPU against every attached device's DMA
 * mapping. Note the error path: `goto unlock` releases buffer->lock
 * taken on the map_kernel branch.
 */
static int ion_dma_buf_begin_cpu_access(struct dma_buf *dmabuf,
enum dma_data_direction direction)
{
struct ion_buffer *buffer = dmabuf->priv;
void *vaddr;
struct ion_dma_buf_attachment *a;
int ret = 0;
// 1. map the buffer into kernel virtual address space (refcounted)
if (buffer->heap->ops->map_kernel) {
mutex_lock(&buffer->lock);
vaddr = ion_buffer_kmap_get(buffer);
if (IS_ERR(vaddr)) {
ret = PTR_ERR(vaddr);
goto unlock;
}
mutex_unlock(&buffer->lock);
}
// 2. cache-sync for the CPU against every attached device
mutex_lock(&buffer->lock);
list_for_each_entry(a, &buffer->attachments, list) {
dma_sync_sg_for_cpu(a->dev, a->table->sgl,
a->table->nents, direction);
}
unlock:
mutex_unlock(&buffer->lock);
return ret;
}
c// ion.c:321
/*
 * .end_cpu_access callback (ion.c:321): CPU access finished. Drops the
 * refcounted kernel mapping and cache-syncs for the device side of every
 * attachment so subsequent DMA observes the CPU's writes.
 */
static int ion_dma_buf_end_cpu_access(struct dma_buf *dmabuf,
enum dma_data_direction direction)
{
struct ion_buffer *buffer = dmabuf->priv;
struct ion_dma_buf_attachment *a;
// 1. release the kernel mapping (refcount decrement)
if (buffer->heap->ops->map_kernel) {
mutex_lock(&buffer->lock);
ion_buffer_kmap_put(buffer); // kmap_cnt--, unmaps when it reaches 0
mutex_unlock(&buffer->lock);
}
// 2. cache-sync for the device on every attachment
mutex_lock(&buffer->lock);
list_for_each_entry(a, &buffer->attachments, list) {
dma_sync_sg_for_device(a->dev, a->table->sgl,
a->table->nents, direction);
}
mutex_unlock(&buffer->lock);
return 0;
}
c// ion.c:112
static void *ion_buffer_kmap_get(struct ion_buffer *buffer)
{
if (buffer->kmap_cnt) {
buffer->kmap_cnt++; // 已有映射,直接增加引用
return buffer->vaddr;
}
// 首次映射
vaddr = buffer->heap->ops->map_kernel(buffer->heap, buffer);
buffer->vaddr = vaddr;
buffer->kmap_cnt++;
return vaddr;
}
// ion.c:131
/*
 * Refcounted kernel-mapping release, counterpart of ion_buffer_kmap_get:
 * drops one reference and tears down the mapping when the last one goes.
 */
static void ion_buffer_kmap_put(struct ion_buffer *buffer)
{
buffer->kmap_cnt--;
if (!buffer->kmap_cnt) {
buffer->heap->ops->unmap_kernel(buffer->heap, buffer);
buffer->vaddr = NULL; // last reference released, mapping torn down
}
}
为什么用引用计数? 多个内核路径可能同时需要访问 buffer(如同时有两个设备驱动调用 begin_cpu_access),引用计数确保只在第一个 begin 时建立映射,最后一个 end 时解除映射。
list_for_each_entry(a, &buffer->attachments, list) {
dma_sync_sg_for_cpu(a->dev, a->table->sgl, ...);
}
每个 attach 的设备可能有独立的 DMA 映射和 cache 域。dma_sync_sg_for_cpu 需要知道是哪个设备的 DMA 映射需要同步,因为:同步操作读取的是该设备私有 sg_table 中的 dma_address(可能指向 bounce buffer),而且不同设备的 cache 一致性属性(是否硬件一致、是否经过 IOMMU)各不相同。
所以必须对每个设备独立调用 sync。
用例:CPU 写入数据 → GPU 读取处理 → CPU 读回结果 1. begin_cpu_access(DMA_TO_DEVICE) dma_sync_sg_for_cpu(gpu, ..., TO_DEVICE) → (TO_DEVICE begin 通常无操作或 invalidate) 2. CPU 写入数据到 mmap 的地址 数据进入 CPU cache(dirty lines) 3. end_cpu_access(DMA_TO_DEVICE) dma_sync_sg_for_device(gpu, ..., TO_DEVICE) → flush CPU cache → 数据写回物理内存 GPU 现在可以通过 DMA 读到最新数据 4. GPU 通过 DMA 读取并处理,结果写回同一 buffer 5. begin_cpu_access(DMA_FROM_DEVICE) dma_sync_sg_for_cpu(gpu, ..., FROM_DEVICE) → invalidate CPU cache → 丢弃旧 cache line CPU 现在读取时会从物理内存加载 GPU 写入的新数据 6. CPU 读取 mmap 的地址 → 得到 GPU 处理后的结果 7. end_cpu_access(DMA_FROM_DEVICE) dma_sync_sg_for_device(gpu, ..., FROM_DEVICE) → (FROM_DEVICE end 通常无操作)
c// ion.c:277
/*
 * .map callback: return a kernel virtual address for page `offset` of
 * the buffer. Pure pointer arithmetic — it relies on buffer->vaddr
 * having been established by begin_cpu_access (ion_buffer_kmap_get ->
 * heap map_kernel); no new page-table mapping is created here.
 */
static void *ion_dma_buf_kmap(struct dma_buf *dmabuf, unsigned long offset)
{
struct ion_buffer *buffer = dmabuf->priv;
return buffer->vaddr + offset * PAGE_SIZE;
}
// ion.c:284
static void ion_dma_buf_kunmap(struct dma_buf *dmabuf, unsigned long offset,
void *ptr)
{
// intentionally empty — the real unmap is done by end_cpu_access -> kmap_put
}
kmap 基于 buffer->vaddr(由 begin_cpu_access 中的 kmap_get 设置)做简单的地址偏移计算。这是一个轻量级操作,不涉及新的页表映射。
前提:buffer->vaddr 必须已经通过 ion_buffer_kmap_get → heap->ops->map_kernel → vmap() 建立好整个 buffer 的连续内核映射。kmap 只是在这个已建立的映射上做偏移。
c// ion.c:270
/*
 * .release callback: invoked by the dma-buf core once the dma-buf file
 * refcount drops to zero (all fds closed and all kernel dma_buf_get()
 * references dropped). Hands the buffer to ION for destruction.
 */
static void ion_dma_buf_release(struct dma_buf *dmabuf)
{
struct ion_buffer *buffer = dmabuf->priv;
_ion_buffer_destroy(buffer);
}
c// ion.c:102
/*
 * Route buffer teardown (ion.c:102): heaps flagged with
 * ION_HEAP_FLAG_DEFER_FREE queue the buffer on a freelist for later
 * release; other heaps free it immediately.
 */
static void _ion_buffer_destroy(struct ion_buffer *buffer)
{
struct ion_heap *heap = buffer->heap;
if (heap->flags & ION_HEAP_FLAG_DEFER_FREE)
ion_heap_freelist_add(heap, buffer); // deferred free
else
ion_buffer_destroy(buffer); // immediate free
}
触发条件: dma-buf 的文件引用计数降到 0 时自动调用。这发生在所有持有该 fd 的进程都 close 了,且所有内核引用(dma_buf_get)都释放了之后。
完整引用链:
用户进程 A: close(fd) → file refcount-- 用户进程 B: close(fd) → file refcount-- GPU 驱动: dma_buf_put() → file refcount-- Display: dma_buf_put() → file refcount-- → refcount = 0 → fput() → dma_buf_release() [dma-buf 框架] → ops->release() [调 ION] → ion_dma_buf_release() → _ion_buffer_destroy()
注意: release 被调用时,所有 attachment 应该已经 detach 完毕。如果还有未 detach 的 attachment,是使用者的 bug。
用户态 内核态 ────── ────── open("/dev/ion") ──────────────────→ ion_fops 注册 ioctl(ION_IOC_ALLOC) ─────────────→ ion_alloc() ├── ion_buffer_create() │ └── heap->ops->allocate() │ → 填充 buffer->sg_table ├── dma_buf_export(ops, priv=buffer) │ → 创建 struct dma_buf └── dma_buf_fd(dmabuf) → 创建 fd fd ←──────────────────────────────── return fd // fd 通过 Binder/socket 传给其他进程或驱动 GPU 驱动侧: dma_buf_get(fd) ──────────────────→ 根据 fd 找到 dma_buf 对象 dma_buf_attach(dev) ──────────────→ ion_dma_buf_attach() ├── dup_sg_table() └── list_add(attachments) dma_buf_map_attachment(dir) ──────→ ion_map_dma_buf() └── dma_map_sg(dev, sg) → sg.dma_address 填入设备 DMA 地址 // GPU 通过 sg.dma_address 发起 DMA 读写 dma_buf_unmap_attachment() ───────→ ion_unmap_dma_buf() └── dma_unmap_sg() dma_buf_detach() ─────────────────→ ion_dma_buf_detatch() ├── list_del(attachments) └── free_duped_table() dma_buf_put() ────────────────────→ refcount-- CPU 侧: mmap(fd) ─────────────────────────→ ion_mmap() ├── pgprot_writecombine (if !CACHED) └── remap_pfn_range() 逐 sg entry ptr ←────────────────────────────── 用户态虚拟地址 ioctl(SYNC_START) ────────────────→ ion_dma_buf_begin_cpu_access() ├── kmap_get() → vmap └── dma_sync_sg_for_cpu() × 每个设备 memcpy(ptr, src, size) CPU 读写 buffer ioctl(SYNC_END) ──────────────────→ ion_dma_buf_end_cpu_access() ├── dma_sync_sg_for_device() × 每个设备 └── kmap_put() munmap(ptr) close(fd) ────────────────────────→ refcount = 0 → ion_dma_buf_release() → _ion_buffer_destroy() → deferred free 或 立即释放
以下 C 程序在用户态模拟 dma-buf 的 attach/map/sync 机制,展示多设备共享同一 buffer 时 sg_table 的独立性和 cache 同步流程。
保存为 dma_buf_sim.c:
c/*
* ION dma-buf 集成机制用户态模拟
*
* 模拟内容:
* 1. dma_buf_export: 将 buffer 包装为 dma-buf
* 2. attach / detach: 设备注册,dup sg_table
* 3. map / unmap: DMA 地址映射(模拟 IOMMU)
* 4. begin/end_cpu_access: cache 同步
* 5. release: 引用计数归零后销毁
*
* 编译: gcc -o dma_buf_sim dma_buf_sim.c -Wall
* 运行: ./dma_buf_sim
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define PAGE_SIZE 4096
#define MAX_SG_ENTRIES 16
#define MAX_ATTACHMENTS 4
/* --- simulated scatterlist entry --- */
typedef struct {
unsigned long phys_addr; /* physical address */
unsigned long dma_address; /* DMA address (the device's view) */
unsigned long length;
} SimSgEntry;
/* --- simulated sg_table --- */
typedef struct {
SimSgEntry entries[MAX_SG_ENTRIES];
int nents;
} SimSgTable;
/* --- simulated device --- */
typedef struct {
const char *name;
unsigned long iommu_offset; /* simulated IOMMU base offset */
int has_iommu;
} SimDevice;
/* --- simulated dma_buf_attachment --- */
typedef struct {
SimDevice *dev;
SimSgTable *table; /* independent duplicated sg_table */
int mapped; /* whether currently DMA-mapped */
} SimAttachment;
/* --- simulated ion_buffer --- */
typedef struct {
SimSgTable *sg_table; /* the original sg_table */
unsigned long size;
int flags;
int kmap_cnt;
void *vaddr;
SimAttachment attachments[MAX_ATTACHMENTS];
int attach_count;
} SimBuffer;
/* --- simulated dma_buf --- */
typedef struct {
SimBuffer *priv; /* points at the ion_buffer */
int refcount;
} SimDmaBuf;
/* --- cache state --- */
typedef struct {
int dirty; /* cache holds data not yet written back */
int valid; /* cache line valid */
unsigned char data;
} CacheState;
/* One simulated CPU cache line and one byte of "physical memory". */
static CacheState cpu_cache = {0, 0, 0};
static unsigned char phys_memory_data = 0;
/* ======================== dup_sg_table ======================== */
/*
 * Duplicate an sg_table: phys_addr/length are copied (the physical pages
 * are shared, zero-copy), while dma_address is zeroed so the attaching
 * device performs its own mapping. Returns NULL on allocation failure.
 */
static SimSgTable *dup_sg_table(SimSgTable *orig)
{
	SimSgTable *dup = malloc(sizeof *dup);

	if (!dup)	/* original left malloc() unchecked */
		return NULL;
	dup->nents = orig->nents;
	for (int i = 0; i < orig->nents; i++) {
		dup->entries[i].phys_addr = orig->entries[i].phys_addr; /* shared */
		dup->entries[i].length = orig->entries[i].length;       /* copied */
		dup->entries[i].dma_address = 0;                        /* zeroed */
	}
	printf(" dup_sg_table: copied %d entries, dma_address all zeroed\n",
	       dup->nents);
	return dup;
}
/* ======================== attach / detach ======================== */
/*
 * Simulated dma_buf_attach: record the device in the next free slot and
 * hand it a private duplicate of the buffer's sg_table. Returns the
 * attachment index, or -1 when the attachment table is full.
 */
static int sim_attach(SimDmaBuf *dmabuf, SimDevice *dev)
{
	SimBuffer *buf = dmabuf->priv;

	if (buf->attach_count >= MAX_ATTACHMENTS)
		return -1;

	int slot = buf->attach_count;
	SimAttachment *att = &buf->attachments[slot];

	att->dev = dev;
	att->table = dup_sg_table(buf->sg_table);
	att->mapped = 0;
	buf->attach_count = slot + 1;
	printf(" [attach] device '%s' attached (total: %d)\n",
	       dev->name, buf->attach_count);
	return slot;
}
/*
 * Simulated dma_buf_detach: release the device's private duplicated
 * sg_table and clear the attachment slot.
 */
static void sim_detach(SimDmaBuf *dmabuf, int att_idx)
{
	SimAttachment *att = &dmabuf->priv->attachments[att_idx];

	printf(" [detach] device '%s'\n", att->dev->name);
	printf(" free_duped_table: released %d sg entries\n",
	       att->table->nents);
	free(att->table);
	att->table = NULL;
	att->dev = NULL;
}
/* ======================== map / unmap DMA ======================== */
/*
 * Simulated map_dma_buf: fill in dma_address for every entry of this
 * attachment's private sg_table — through a fake IOMMU remap when the
 * device has one, otherwise as an identity (passthrough) mapping.
 */
static SimSgTable *sim_map_dma_buf(SimBuffer *buf, int att_idx)
{
	SimAttachment *att = &buf->attachments[att_idx];
	SimSgTable *tbl = att->table;
	int use_iommu = att->dev->has_iommu;

	printf(" [map_dma_buf] device '%s':\n", att->dev->name);
	for (int i = 0; i < tbl->nents; i++) {
		SimSgEntry *e = &tbl->entries[i];

		if (use_iommu) {
			/* IOMMU mapping: DMA address differs from physical */
			e->dma_address = att->dev->iommu_offset + (i * PAGE_SIZE);
			printf(" sg[%d]: phys=0x%08lx → dma=0x%08lx (IOMMU)\n",
			       i, e->phys_addr, e->dma_address);
		} else {
			/* passthrough: DMA address equals physical address */
			e->dma_address = e->phys_addr;
			printf(" sg[%d]: phys=0x%08lx → dma=0x%08lx (direct)\n",
			       i, e->phys_addr, e->dma_address);
		}
	}
	att->mapped = 1;
	return tbl;
}
/*
 * Simulated unmap_dma_buf: zero every dma_address in the attachment's
 * private table, mirroring dma_unmap_sg() tearing down the mapping.
 */
static void sim_unmap_dma_buf(SimBuffer *buf, int att_idx)
{
	SimAttachment *att = &buf->attachments[att_idx];
	SimSgTable *tbl = att->table;

	printf(" [unmap_dma_buf] device '%s': cleared %d DMA mappings\n",
	       att->dev->name, tbl->nents);
	for (int i = 0; i < tbl->nents; i++)
		tbl->entries[i].dma_address = 0;
	att->mapped = 0;
}
/* ======================== cache sync ======================== */
/*
 * Simulated begin_cpu_access: take a kmap reference (first taker gets a
 * fake kernel vaddr), announce a per-attachment dma_sync_sg_for_cpu, and
 * invalidate the simulated CPU cache for FROM_DEVICE / BIDIRECTIONAL so
 * the CPU re-reads device-written data from memory.
 */
static void sim_begin_cpu_access(SimDmaBuf *dmabuf, const char *direction)
{
	SimBuffer *buf = dmabuf->priv;

	/* kmap_get: first reference establishes the mapping */
	if (++buf->kmap_cnt == 1)
		buf->vaddr = (void *)0xFFFF880000001000UL; /* fake vmap address */
	printf(" [begin_cpu_access] direction=%s, kmap_cnt=%d\n",
	       direction, buf->kmap_cnt);

	/* dma_sync_sg_for_cpu, once per attached device */
	for (int i = 0; i < buf->attach_count; i++) {
		if (buf->attachments[i].dev)
			printf(" dma_sync_sg_for_cpu(dev='%s')\n",
			       buf->attachments[i].dev->name);
	}

	/* simulated cache invalidate */
	int invalidate = !strcmp(direction, "FROM_DEVICE") ||
			 !strcmp(direction, "BIDIRECTIONAL");
	if (invalidate) {
		printf(" cache INVALIDATE → discard stale cache lines\n");
		cpu_cache.valid = 0; /* next read reloads from memory */
	}
}
/*
 * Simulated end_cpu_access: drop the kmap reference (last one clears the
 * fake vaddr), announce a per-attachment dma_sync_sg_for_device, and for
 * TO_DEVICE / BIDIRECTIONAL flush a dirty simulated cache line back to
 * "physical memory" so the device can observe the CPU's writes.
 */
static void sim_end_cpu_access(SimDmaBuf *dmabuf, const char *direction)
{
	SimBuffer *buf = dmabuf->priv;

	/* kmap_put: last reference tears down the mapping */
	if (--buf->kmap_cnt == 0)
		buf->vaddr = NULL;
	printf(" [end_cpu_access] direction=%s, kmap_cnt=%d\n",
	       direction, buf->kmap_cnt);

	/* dma_sync_sg_for_device, once per attached device */
	for (int i = 0; i < buf->attach_count; i++) {
		if (buf->attachments[i].dev)
			printf(" dma_sync_sg_for_device(dev='%s')\n",
			       buf->attachments[i].dev->name);
	}

	/* simulated cache flush */
	int flush = !strcmp(direction, "TO_DEVICE") ||
		    !strcmp(direction, "BIDIRECTIONAL");
	if (flush && cpu_cache.dirty) {
		printf(" cache FLUSH → write 0x%02X back to phys memory\n",
		       cpu_cache.data);
		phys_memory_data = cpu_cache.data;
		cpu_cache.dirty = 0;
	}
}
/* ======================== release ======================== */
/*
 * Simulated dma-buf release: called when the refcount reaches zero.
 * Warns about a still-held kernel mapping, then frees the sg_table and
 * the buffer itself (mirroring heap->ops->free()).
 */
static void sim_release(SimDmaBuf *dmabuf)
{
	SimBuffer *buf = dmabuf->priv;

	printf(" [release] dma-buf refcount=0, destroying buffer\n");
	if (buf->kmap_cnt > 0)
		printf(" WARNING: buffer still kernel-mapped (kmap_cnt=%d)\n",
		       buf->kmap_cnt);
	printf(" ion_buffer_destroy → heap->ops->free()\n");
	free(buf->sg_table);
	free(buf);
	dmabuf->priv = NULL;
}
/* ======================== main ======================== */
/*
 * Walk through the whole dma-buf lifecycle: alloc/export, two devices
 * attaching and mapping (independent dup'd sg_tables), CPU write with
 * flush, device DMA write with invalidate, then unmap/detach/release.
 * Output is meant to match the "expected output" section of the article.
 */
int main(void)
{
printf("============================================\n");
printf("ION dma-buf Integration Mechanism Simulator\n");
printf("============================================\n");
/* --- create simulated devices --- */
SimDevice gpu = {"GPU", 0x00100000, 1}; /* has IOMMU */
SimDevice display = {"Display", 0, 0}; /* no IOMMU (passthrough) */
SimDevice camera = {"Camera", 0xFF000000, 1}; /* has IOMMU (unused below) */
/* --- 1. create the buffer and its sg_table (simulating ion_alloc) --- */
printf("\n[Step 1] ion_alloc: create buffer\n");
SimSgTable *sg = malloc(sizeof(*sg));
sg->nents = 3;
sg->entries[0] = (SimSgEntry){0x80000000, 0, 1048576}; /* 1MB */
sg->entries[1] = (SimSgEntry){0x92000000, 0, 65536}; /* 64KB */
sg->entries[2] = (SimSgEntry){0x85001000, 0, 4096}; /* 4KB */
SimBuffer *buf = calloc(1, sizeof(*buf));
buf->sg_table = sg;
buf->size = 1048576 + 65536 + 4096;
buf->flags = 1; /* ION_FLAG_CACHED */
SimDmaBuf dmabuf = {.priv = buf, .refcount = 1};
printf(" buffer created: %lu bytes, %d sg entries\n",
buf->size, sg->nents);
printf(" dma_buf_export(ops=ion_dma_buf_ops, priv=buffer)\n");
printf(" dma_buf_fd → fd=7\n");
/* --- 2. GPU attach --- */
printf("\n[Step 2] GPU: attach + map\n");
int gpu_att = sim_attach(&dmabuf, &gpu);
dmabuf.refcount++;
SimSgTable *gpu_sg = sim_map_dma_buf(buf, gpu_att);
/* --- 3. Display attach --- */
printf("\n[Step 3] Display: attach + map\n");
int disp_att = sim_attach(&dmabuf, &display);
dmabuf.refcount++;
SimSgTable *disp_sg = sim_map_dma_buf(buf, disp_att);
/* --- 4. verify sg_table independence --- */
printf("\n[Step 4] Verify sg_table independence\n");
printf(" GPU sg[0].dma_address = 0x%08lx\n",
gpu_sg->entries[0].dma_address);
printf(" Display sg[0].dma_address = 0x%08lx\n",
disp_sg->entries[0].dma_address);
printf(" Same phys page? %s (GPU phys=0x%08lx, Disp phys=0x%08lx)\n",
gpu_sg->entries[0].phys_addr == disp_sg->entries[0].phys_addr
? "YES (zero-copy)" : "NO",
gpu_sg->entries[0].phys_addr,
disp_sg->entries[0].phys_addr);
/* --- 5. CPU write + cache sync --- */
printf("\n[Step 5] CPU write with cache sync\n");
sim_begin_cpu_access(&dmabuf, "TO_DEVICE");
printf(" CPU writes 0xAA to buffer\n");
cpu_cache.data = 0xAA;
cpu_cache.dirty = 1;
cpu_cache.valid = 1;
printf(" cache state: data=0x%02X dirty=%d\n",
cpu_cache.data, cpu_cache.dirty);
printf(" phys memory: data=0x%02X (stale!)\n", phys_memory_data);
sim_end_cpu_access(&dmabuf, "TO_DEVICE");
printf(" phys memory after flush: data=0x%02X (updated)\n",
phys_memory_data);
/* --- 6. simulated device DMA write + CPU read-back --- */
printf("\n[Step 6] Device DMA write → CPU read back\n");
phys_memory_data = 0xBB; /* device writes via DMA */
printf(" Device DMA writes 0xBB to phys memory\n");
printf(" phys memory: 0x%02X\n", phys_memory_data);
printf(" cpu cache: 0x%02X (stale!)\n", cpu_cache.data);
sim_begin_cpu_access(&dmabuf, "FROM_DEVICE");
printf(" CPU reads buffer after invalidate\n");
if (!cpu_cache.valid) {
cpu_cache.data = phys_memory_data; /* reload from memory */
cpu_cache.valid = 1;
printf(" cache miss → loaded 0x%02X from phys memory\n",
cpu_cache.data);
}
sim_end_cpu_access(&dmabuf, "FROM_DEVICE");
/* --- 7. cleanup --- */
printf("\n[Step 7] Cleanup\n");
sim_unmap_dma_buf(buf, gpu_att);
sim_detach(&dmabuf, gpu_att);
dmabuf.refcount--;
printf(" refcount=%d\n", dmabuf.refcount);
sim_unmap_dma_buf(buf, disp_att);
sim_detach(&dmabuf, disp_att);
dmabuf.refcount--;
printf(" refcount=%d\n", dmabuf.refcount);
/* the user closes the fd */
printf(" close(fd=7)\n");
dmabuf.refcount--;
printf(" refcount=%d\n", dmabuf.refcount);
if (dmabuf.refcount == 0)
sim_release(&dmabuf);
printf("\n============================================\n");
printf("Simulation Complete\n");
printf("============================================\n");
return 0;
}
编译和运行:
bash$ gcc -o dma_buf_sim dma_buf_sim.c -Wall $ ./dma_buf_sim
预期输出:
============================================ ION dma-buf Integration Mechanism Simulator ============================================ [Step 1] ion_alloc: create buffer buffer created: 1117184 bytes, 3 sg entries dma_buf_export(ops=ion_dma_buf_ops, priv=buffer) dma_buf_fd → fd=7 [Step 2] GPU: attach + map dup_sg_table: copied 3 entries, dma_address all zeroed [attach] device 'GPU' attached (total: 1) [map_dma_buf] device 'GPU': sg[0]: phys=0x80000000 → dma=0x00100000 (IOMMU) sg[1]: phys=0x92000000 → dma=0x00101000 (IOMMU) sg[2]: phys=0x85001000 → dma=0x00102000 (IOMMU) [Step 3] Display: attach + map dup_sg_table: copied 3 entries, dma_address all zeroed [attach] device 'Display' attached (total: 2) [map_dma_buf] device 'Display': sg[0]: phys=0x80000000 → dma=0x80000000 (direct) sg[1]: phys=0x92000000 → dma=0x92000000 (direct) sg[2]: phys=0x85001000 → dma=0x85001000 (direct) [Step 4] Verify sg_table independence GPU sg[0].dma_address = 0x00100000 Display sg[0].dma_address = 0x80000000 Same phys page? YES (zero-copy) (GPU phys=0x80000000, Disp phys=0x80000000) [Step 5] CPU write with cache sync [begin_cpu_access] direction=TO_DEVICE, kmap_cnt=1 dma_sync_sg_for_cpu(dev='GPU') dma_sync_sg_for_cpu(dev='Display') CPU writes 0xAA to buffer cache state: data=0xAA dirty=1 phys memory: data=0x00 (stale!) [end_cpu_access] direction=TO_DEVICE, kmap_cnt=0 dma_sync_sg_for_device(dev='GPU') dma_sync_sg_for_device(dev='Display') cache FLUSH → write 0xAA back to phys memory phys memory after flush: data=0xAA (updated) [Step 6] Device DMA write → CPU read back Device DMA writes 0xBB to phys memory phys memory: 0xBB cpu cache: 0xAA (stale!) 
[begin_cpu_access] direction=FROM_DEVICE, kmap_cnt=1 dma_sync_sg_for_cpu(dev='GPU') dma_sync_sg_for_cpu(dev='Display') cache INVALIDATE → discard stale cache lines CPU reads buffer after invalidate cache miss → loaded 0xBB from phys memory [end_cpu_access] direction=FROM_DEVICE, kmap_cnt=0 dma_sync_sg_for_device(dev='GPU') dma_sync_sg_for_device(dev='Display') [Step 7] Cleanup [unmap_dma_buf] device 'GPU': cleared 3 DMA mappings [detach] device 'GPU' free_duped_table: released 3 sg entries refcount=2 [unmap_dma_buf] device 'Display': cleared 3 DMA mappings [detach] device 'Display' free_duped_table: released 3 sg entries refcount=1 close(fd=7) refcount=0 [release] dma-buf refcount=0, destroying buffer ion_buffer_destroy → heap->ops->free() ============================================ Simulation Complete ============================================
关键观察点:
- GPU 经 IOMMU 得到 0x00100000,Display 直通得到 0x80000000 — 同一物理页面在不同设备眼中的 DMA 地址完全不同
- 各 attachment 的 dma_address 独立,但 phys_addr 相同 — 零拷贝共享
- end_cpu_access(TO_DEVICE) flush 后 phys=0xAA,设备才能读到
- begin_cpu_access(FROM_DEVICE) invalidate 后 CPU 才读到 0xBB

总结:
- dma_buf_export() 将 ion_buffer 包装为 dma-buf 对象,dmabuf->priv 指向 ion_buffer,所有 ops 回调通过此指针访问底层数据
- dma_map_sg 建立设备的 DMA 映射,有 IOMMU 的设备获得 IOVA,无 IOMMU 的设备获得物理地址直通
- remap_pfn_range 逐 sg entry 映射,将散布的物理页面映射到用户态连续虚拟地址;非 cached buffer 使用 write-combine 页保护避免 cache 一致性问题
- begin_cpu_access 做 dma_sync_sg_for_cpu(invalidate),end_cpu_access 做 dma_sync_sg_for_device(flush)+ kunmap,对每个 attachment 独立同步
- release 时 _ion_buffer_destroy 进入 deferred free 或立即释放路径
- ION 通过实现 dma_buf_ops 履行导出者契约

参考文件:
- drivers/staging/android/ion/ion.c(dma_buf_ops 实现,第 140-411 行)
- drivers/staging/android/ion/ion_heap.c(map_kernel / map_user 通用实现)
- drivers/staging/android/ion/ion.h(ion_buffer / ion_dma_buf_attachment 定义)
- include/linux/dma-buf.h(struct dma_buf_ops 定义)
- drivers/dma-buf/dma-buf.c(dma_buf_export / dma_buf_attach / dma_buf_map_attachment 实现)
- Documentation/DMA-API.txt(dma_map_sg / dma_sync_sg 说明)
- include/linux/dma-mapping.h