系统调用—Read


read源码

搜索sys_read:https://elixir.bootlin.com/linux/v6.8.1/source/fs/read_write.c#L627

上一篇已经对read系统调用的源码进行了一定程度的初步解读,那么我又产生了新的疑问:read的具体实现是有多种的,系统应该是会根据不同的文件采取对应的读取方式的,也就是说会调用不同的read具体实现,那么系统具体是如何实现的?于是便有了这篇博客。

// fs/read_write.c
// Entry point of the read system call; it only forwards to ksys_read().
// Parameters:
//   fd:    file descriptor number supplied by the calling process
//   buf:   user-space buffer the data is read into
//   count: number of bytes requested
// Returns: whatever ksys_read() returns (bytes read, or a negative error code)

SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
return ksys_read(fd, buf, count);
}

/*
 * ksys_read - body of the read system call.
 * @fd:    file descriptor number to read from
 * @buf:   user-space destination buffer
 * @count: number of bytes requested
 *
 * Resolves @fd with fdget_pos(), reads through vfs_read() and releases the
 * descriptor with fdput_pos().
 *
 * Returns the value produced by vfs_read(), or -EBADF when fdget_pos()
 * yields no file.
 */
ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
{
	struct fd f = fdget_pos(fd);
	loff_t offset;
	loff_t *offp;
	ssize_t nread;

	/* No file behind this descriptor: nothing to release either. */
	if (!f.file)
		return -EBADF;

	/*
	 * file_ppos() may return NULL. Otherwise read through a local copy
	 * of the offset so that the file's position is only touched once
	 * the outcome of the read is known.
	 */
	offp = file_ppos(f.file);
	if (offp) {
		offset = *offp;
		offp = &offset;
	}

	nread = vfs_read(f.file, buf, count, offp);

	/* Commit the new offset only for a non-negative result. */
	if (offp && nread >= 0)
		f.file->f_pos = offset;

	/* Pairs with fdget_pos() above. */
	fdput_pos(f);
	return nread;
}

我们已经知道 read 会调用 vfs_read,而 vfs_read 会优先调用 file->f_op->read;如果文件没有实现 read 但实现了 read_iter,则调用 new_sync_read。如今采用 read_iter(也就是走 new_sync_read 这条路径)的实现更多一些,所以在回顾 vfs_read 之后,重点看看 new_sync_read。

// fs/read_write.c
/*
 * vfs_read - validate a read request and dispatch it to the file's handler.
 * @file:  kernel file object to read from
 * @buf:   user-space destination buffer
 * @count: number of bytes requested (clamped to MAX_RW_COUNT)
 * @pos:   pointer to the offset to read at, forwarded unchanged to the
 *         handler; ksys_read() passes NULL here when file_ppos() gives NULL
 *
 * Returns the handler's result, or:
 *   -EBADF   if FMODE_READ is not set in file->f_mode
 *   -EINVAL  if FMODE_CAN_READ is not set, or the file has neither a
 *            ->read nor a ->read_iter operation
 *   -EFAULT  if access_ok() rejects the user buffer
 *   the non-zero value returned by rw_verify_area()
 */
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
	ssize_t nread;

	/* Mode and buffer sanity checks, each with its own error code. */
	if (!(file->f_mode & FMODE_READ))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_READ))
		return -EINVAL;
	if (unlikely(!access_ok(buf, count)))
		return -EFAULT;

	/* Any non-zero result from the area check aborts the read. */
	nread = rw_verify_area(READ, file, pos, count);
	if (nread != 0)
		return nread;

	/* Cap the size of a single read. */
	count = (count > MAX_RW_COUNT) ? MAX_RW_COUNT : count;

	/* ->read takes precedence; ->read_iter goes through new_sync_read(). */
	if (file->f_op->read) {
		nread = file->f_op->read(file, buf, count, pos);
	} else if (file->f_op->read_iter) {
		nread = new_sync_read(file, buf, count, pos);
	} else {
		nread = -EINVAL;
	}

	/* Notification and byte accounting only when data was actually read. */
	if (nread > 0) {
		fsnotify_access(file);
		add_rchar(current, nread);
	}

	/* The read-syscall counter is bumped regardless of the outcome. */
	inc_syscr(current);
	return nread;
}

new_sync_read 是 VFS 层的通用函数,并不是由管道文件自己实现的:它把用户缓冲区包装成 kiocb 和 iov_iter,再去调用具体文件的 read_iter。下面先看 new_sync_read,再以读取管道文件为例继续往下追。

/*
 * new_sync_read - perform a read through the file's ->read_iter operation.
 * @filp: file to read from
 * @buf:  user-space destination buffer
 * @len:  number of bytes requested
 * @ppos: offset to start at and to update afterwards; when NULL the request
 *        starts at offset 0 and no offset is written back
 *
 * Wraps @buf in an iov_iter and the request in a kiocb, then hands both to
 * call_read_iter(). Returns the value produced by call_read_iter().
 */
static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
	struct kiocb req;
	struct iov_iter dest;
	ssize_t nread;

	init_sync_kiocb(&req, filp);
	if (ppos)
		req.ki_pos = *ppos;
	else
		req.ki_pos = 0;

	/* Describe the user buffer as the destination of the transfer. */
	iov_iter_ubuf(&dest, ITER_DEST, buf, len);

	nread = call_read_iter(filp, &req, &dest);

	/* This path must never see -EIOCBQUEUED from the handler. */
	BUG_ON(nread == -EIOCBQUEUED);

	/* Hand the offset recorded in the kiocb back to the caller. */
	if (ppos)
		*ppos = req.ki_pos;
	return nread;
}

再往下追,可以看到,new_sync_read 最后调用的依然是 f_op 中定义的函数(read_iter)。

// include/linux/fs.h
/*
 * call_read_iter - invoke the ->read_iter operation of @file.
 * @file: file whose f_op table supplies the handler
 * @kio:  kiocb describing the request
 * @iter: iov_iter describing the destination
 *
 * Returns the handler's result unchanged. There is no NULL check on
 * ->read_iter here; vfs_read() tests it before reaching this path.
 */
static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio,
				     struct iov_iter *iter)
{
	ssize_t nread = file->f_op->read_iter(kio, iter);

	return nread;
}

f_op的定义

/* Member reached as file->f_op: points to the file's operations table (->read, ->read_iter, ...). */
const struct file_operations	*f_op;

管道文件的 file_operations(pipefifo_fops)以及其中 read_iter 对应的 pipe_read 的实现

// fs/pipe.c  https://elixir.bootlin.com/linux/v6.8.1/source/fs/pipe.c
/*
 * Operations table used for pipe files. It sets no .read member, only
 * .read_iter, so vfs_read() falls through to new_sync_read(), which ends up
 * calling pipe_read() via call_read_iter().
 */
const struct file_operations pipefifo_fops = {
	/* open / close */
	.open		= fifo_open,
	.release	= pipe_release,

	/* data path */
	.read_iter	= pipe_read,	/* the handler this article follows */
	.write_iter	= pipe_write,
	.splice_write	= iter_file_splice_write,

	/* positioning, polling, control, async notification */
	.llseek		= no_llseek,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.fasync		= pipe_fasync,
};

/*
 * pipe_read - ->read_iter handler installed in pipefifo_fops.
 * @iocb: request descriptor; supplies the file (ki_filp) and ki_flags
 * @to:   destination iterator the pipe data is copied into
 *
 * Copies data from the pipe's ring of buffers into @to, one buffer at a time,
 * while holding the pipe lock. If nothing has been read yet, writers still
 * exist and the caller allows blocking, it drops the lock, sleeps on
 * pipe->rd_wait and retries.
 *
 * Returns:
 *   > 0            number of bytes copied
 *   0              zero-length request, or nothing read and no writers left
 *   -EAGAIN        nothing to read and O_NONBLOCK / IOCB_NOWAIT is set
 *   -ERESTARTSYS   the wait for data was interrupted
 *   -ENOBUFS       the request is too small for a PIPE_BUF_FLAG_WHOLE buffer
 *                  or for the loss notification
 *   -EFAULT        copying to @to came up short
 *   or the error returned by pipe_buf_confirm()
 * Error codes are only returned when no bytes were copied before the failure.
 */
static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
// ** setup **
size_t total_len = iov_iter_count(to); // bytes the caller still wants
struct file *filp = iocb->ki_filp; // file this request is for
struct pipe_inode_info *pipe = filp->private_data; // per-pipe state kept in the file's private_data
bool was_full, wake_next_reader = false;
ssize_t ret;

/* Null read succeeds. */
if (unlikely(total_len == 0))
return 0;

ret = 0;
__pipe_lock(pipe); // take the pipe lock

/*
* We only wake up writers if the pipe was full when we started
* reading in order to avoid unnecessary wakeups.
*
* But when we do wake up writers, we do so using a sync wakeup
* (WF_SYNC), because we want them to get going and generate more
* data for us.
*/
was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); // was the pipe full when we started?

// ** main loop: copy data out **
for (;;) {
/* Read ->head with a barrier vs post_one_notification() */
// head is loaded with acquire ordering; tail and mask are plain reads
unsigned int head = smp_load_acquire(&pipe->head);
unsigned int tail = pipe->tail;
unsigned int mask = pipe->ring_size - 1;

// watch-queue builds only: if notifications were lost, emit a loss record first
#ifdef CONFIG_WATCH_QUEUE
if (pipe->note_loss) {
struct watch_notification n;

if (total_len < 8) {
if (ret == 0)
ret = -ENOBUFS;
break;
}

n.type = WATCH_TYPE_META;
n.subtype = WATCH_META_LOSS_NOTIFICATION;
n.info = watch_sizeof(n);
if (copy_to_iter(&n, sizeof(n), to) != sizeof(n)) {
if (ret == 0)
ret = -EFAULT;
break;
}
ret += sizeof(n);
total_len -= sizeof(n);
pipe->note_loss = false;
}
#endif

// pipe is not empty: consume from the buffer at the tail
if (!pipe_empty(head, tail)) {
struct pipe_buffer *buf = &pipe->bufs[tail & mask]; // ring slot at the tail
size_t chars = buf->len; // bytes available in this buffer
size_t written;
int error;

// clamp to what the caller still wants
if (chars > total_len) {
if (buf->flags & PIPE_BUF_FLAG_WHOLE) { // this buffer may not be read partially
if (ret == 0)
ret = -ENOBUFS;
break;
}
chars = total_len;
}

// a non-zero result from pipe_buf_confirm() ends the read
error = pipe_buf_confirm(pipe, buf);
if (error) {
if (!ret)
ret = error;
break;
}

// copy chars bytes from the buffer's page into the destination iterator
written = copy_page_to_iter(buf->page, buf->offset, chars, to);
if (unlikely(written < chars)) {
if (!ret)
ret = -EFAULT;
break;
}
// account for the bytes just consumed
ret += chars;
buf->offset += chars; // advance within the page
buf->len -= chars; // fewer bytes left in this buffer

/* Was it a packet buffer? Clean up and exit */
// packet buffer: drop its remainder and make total_len hit 0 below
if (buf->flags & PIPE_BUF_FLAG_PACKET) {
total_len = chars;
buf->len = 0;
}

// buffer drained: advance the tail
if (!buf->len)
tail = pipe_update_tail(pipe, buf, tail);
total_len -= chars;
if (!total_len)
break; /* common path: read succeeded */
if (!pipe_empty(head, tail)) /* More to do? */
continue;
}

// ** blocking and wakeups **
// no writers left: stop
if (!pipe->writers)
break;
// something was already read (or an error is pending): return it instead of sleeping
if (ret)
break;
// non-blocking request: fail with -EAGAIN
if ((filp->f_flags & O_NONBLOCK) ||
(iocb->ki_flags & IOCB_NOWAIT)) {
ret = -EAGAIN;
break;
}
// drop the lock before the wakeups and the sleep below
__pipe_unlock(pipe);

/*
* We only get here if we didn't actually read anything.
*
* However, we could have seen (and removed) a zero-sized
* pipe buffer, and might have made space in the buffers
* that way.
*
* You can't make zero-sized pipe buffers by doing an empty
* write (not even in packet mode), but they can happen if
* the writer gets an EFAULT when trying to fill a buffer
* that already got allocated and inserted in the buffer
* array.
*
* So we still need to wake up any pending writers in the
* _very_ unlikely case that the pipe was full, but we got
* no data.
*/
// wake writers if the pipe was full; fasync writers are signalled unconditionally
if (unlikely(was_full))
wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);

/*
* But because we didn't read anything, at this point we can
* just return directly with -ERESTARTSYS if we're interrupted,
* since we've done any required wakeups and there's no need
* to mark anything accessed. And we've dropped the lock.
*/
// sleep on rd_wait until pipe_readable(pipe) holds
if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
return -ERESTARTSYS; // the wait was interrupted

__pipe_lock(pipe); // retake the lock before looping
was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
wake_next_reader = true; // after sleeping, plan to wake the next reader on exit
}
// ** wrap-up **
// pipe is empty: no point waking another reader
if (pipe_empty(pipe->head, pipe->tail))
wake_next_reader = false;
__pipe_unlock(pipe); // release the pipe lock

if (was_full) // wake writers waiting on wr_wait
wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
if (wake_next_reader) // wake other readers waiting on rd_wait
wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); // SIGIO to fasync writers
if (ret > 0)
file_accessed(filp); // mark the file accessed, only when data was read
return ret; // byte count or negative error code
}

pipe_read是怎么读取数据的大概知道了。