系统调用—Read

read源码

搜索sys_read:https://elixir.bootlin.com/linux/v6.8.1/source/fs/read_write.c#L627

上一篇已经对read系统调用的源码进行了一定程度的初步解读，那么我又产生了新的疑问：read的具体实现是有多种的，系统应该是会根据不同的文件采取对应的读取方式的，也就是说会调用不同的read具体实现，那么系统具体是如何实现的？于是便有了这篇博客。

// fs/read_write.c
/*
 * read(2) system call entry point.
 *
 * fd:    file descriptor (handle of a file opened by the calling process)
 * buf:   user-space buffer that the data is read into
 * count: number of bytes requested
 *
 * Simply forwards to ksys_read() and returns its result: the number of
 * bytes read, or a negative errno value.
 */

SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
	return ksys_read(fd, buf, count);
}

/*
 * ksys_read - kernel-side body of the read system call.
 *
 * Looks up the file behind @fd, reads through vfs_read() using a private
 * copy of the file position, and stores the new position back into the
 * file only when the read did not fail.
 *
 * Returns the value produced by vfs_read(), or -EBADF when @fd does not
 * resolve to a file.
 */
ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
{
	/* Resolve fd; the matching release is fdput_pos() below. */
	struct fd f = fdget_pos(fd);
	loff_t offset;
	loff_t *offp;
	ssize_t nread;

	/* No file behind this descriptor: nothing was acquired, just fail. */
	if (!f.file)
		return -EBADF;

	/*
	 * file_ppos() may return NULL.  Otherwise work on a local copy of
	 * the position so the file's own f_pos is only touched on success.
	 */
	offp = file_ppos(f.file);
	if (offp) {
		offset = *offp;
		offp = &offset;
	}

	nread = vfs_read(f.file, buf, count, offp);

	/* Commit the advanced position unless the read returned an error. */
	if (offp && nread >= 0)
		f.file->f_pos = offset;

	fdput_pos(f);
	return nread;
}

我们已经知道 read 会调用 vfs_read，而 vfs_read 会优先调用 file->f_op->read；如果该方法没有实现、但实现了 read_iter，则改走 new_sync_read。如今采用 read_iter 这种较新实现方式的文件类型更多一些，所以下面先回顾 vfs_read，再重点看 new_sync_read。

// fs/read_write.c
/*
 * vfs_read - VFS-level read: validate the request, then dispatch to the
 * read method supplied by the file's f_op table.
 *
 * file:  kernel file object; provides f_mode and the f_op method table
 * buf:   user-space buffer that receives the data
 * count: number of bytes requested (capped at MAX_RW_COUNT)
 * pos:   pointer to the offset to read at; may be NULL, in which case
 *        new_sync_read() starts its kiocb at offset 0
 *
 * Returns the number of bytes read, or a negative errno value.
 */
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
	ssize_t nread;

	/* The file must have been opened for reading ... */
	if (!(file->f_mode & FMODE_READ))
		return -EBADF;
	/* ... and must be flagged as able to service reads at all. */
	if (!(file->f_mode & FMODE_CAN_READ))
		return -EINVAL;
	/* Reject a user buffer range that does not pass access_ok(). */
	if (unlikely(!access_ok(buf, count)))
		return -EFAULT;

	/* Let rw_verify_area() vet the (pos, count) range; non-zero is an error. */
	nread = rw_verify_area(READ, file, pos, count);
	if (nread)
		return nread;

	/* Bound the size of a single read. */
	if (count > MAX_RW_COUNT)
		count = MAX_RW_COUNT;

	/*
	 * Dispatch: the classic ->read method wins when present; otherwise
	 * ->read_iter is driven through new_sync_read(); a file offering
	 * neither cannot be read.
	 */
	if (file->f_op->read)
		nread = file->f_op->read(file, buf, count, pos);
	else if (!file->f_op->read_iter)
		nread = -EINVAL;
	else
		nread = new_sync_read(file, buf, count, pos);

	/* Only a read that actually returned data is notified and accounted. */
	if (nread > 0) {
		fsnotify_access(file);
		add_rchar(current, nread);
	}
	/* The read-syscall counter is bumped regardless of the outcome. */
	inc_syscr(current);
	return nread;
}

以读取管道文件为例。需要注意的是，new_sync_read 本身并不是管道专有的实现，而是 vfs_read 调用的通用辅助函数：它把一次同步读请求包装成 kiocb 和 iov_iter，再交给具体文件的 read_iter 方法去处理。

/*
 * new_sync_read - run a synchronous read through the file's ->read_iter.
 *
 * Wraps the plain (buf, len, ppos) request in a kiocb plus an iov_iter
 * that targets the user buffer, hands both to call_read_iter(), and
 * copies the resulting position back to the caller.
 *
 * ppos may be NULL; the request then starts at offset 0 and no position
 * is written back.
 *
 * Returns whatever the ->read_iter method returned.
 */
static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
	struct kiocb req;
	struct iov_iter dest;
	ssize_t nread;

	init_sync_kiocb(&req, filp);
	if (ppos)
		req.ki_pos = *ppos;
	else
		req.ki_pos = 0;
	iov_iter_ubuf(&dest, ITER_DEST, buf, len);

	nread = call_read_iter(filp, &req, &dest);
	/* This path is synchronous, so a queued result is treated as a bug. */
	BUG_ON(nread == -EIOCBQUEUED);

	if (ppos)
		*ppos = req.ki_pos;
	return nread;
}

再往下追，可以看到，new_sync_read最后依然调用的是f_op中定义的函数。

// /include/linux/fs.h
static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio,
				     struct iov_iter *iter)
{
	return file->f_op->read_iter(kio, iter);
}

f_op的定义（它是 struct file 的成员，指向该文件的操作函数表）

/* Member of struct file: the table of methods (read, read_iter, ...) used to operate on this file. */
const struct file_operations	*f_op;

pipe中file_operations的实现

// /fs/pipe.c  https://elixir.bootlin.com/linux/v6.8.1/source/fs/pipe.c
/*
 * File-operations table for pipes/FIFOs.  It provides no .read method,
 * only .read_iter, so vfs_read() reaches pipe_read() through
 * new_sync_read() -> call_read_iter().
 */
const struct file_operations pipefifo_fops = {
	.open		= fifo_open,
	.llseek		= no_llseek,
	.read_iter	= pipe_read, // the read path examined below
	.write_iter	= pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.release	= pipe_release,
	.fasync		= pipe_fasync,
	.splice_write	= iter_file_splice_write,
};

/*
 * pipe_read - ->read_iter handler for pipes/FIFOs (see pipefifo_fops).
 *
 * iocb: request descriptor; ki_filp is the pipe's file and ki_flags may
 *       carry IOCB_NOWAIT
 * to:   destination iterator that the pipe data is copied into
 *
 * Returns the number of bytes copied.  Returns 0 for a zero-length
 * request, or when the pipe is empty and has no writers.  Otherwise a
 * negative errno: -EAGAIN (nothing to read and the request is
 * non-blocking), -ERESTARTSYS (interrupted while waiting), -EFAULT
 * (short copy to the destination), -ENOBUFS (destination too small for a
 * loss notification or a WHOLE buffer), or the error reported by
 * pipe_buf_confirm().  An error is only reported when nothing has been
 * copied yet; otherwise the byte count so far is returned.
 */
static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
	// ** Setup **
	size_t total_len = iov_iter_count(to); // bytes still wanted, from iov_iter_count(to)
	struct file *filp = iocb->ki_filp; // file this request was issued on
	struct pipe_inode_info *pipe = filp->private_data; // per-pipe state kept in private_data
	bool was_full, wake_next_reader = false;
	ssize_t ret;

	/* Null read succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	ret = 0;
	__pipe_lock(pipe); // take the pipe lock

	/*
	 * We only wake up writers if the pipe was full when we started
	 * reading in order to avoid unnecessary wakeups.
	 *
	 * But when we do wake up writers, we do so using a sync wakeup
	 * (WF_SYNC), because we want them to get going and generate more
	 * data for us.
	 */
	was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); // was the pipe full on entry?

	// ** Main loop: consume data **
	for (;;) {
		/* Read ->head with a barrier vs post_one_notification() */
		// head is loaded with acquire semantics; tail and mask are plain loads
		unsigned int head = smp_load_acquire(&pipe->head);
		unsigned int tail = pipe->tail;
		unsigned int mask = pipe->ring_size - 1;

		// Watch-queue support: if a loss is pending (note_loss), first
		// deliver a loss-notification record to the reader.
#ifdef CONFIG_WATCH_QUEUE
		if (pipe->note_loss) {
			struct watch_notification n;

			if (total_len < 8) {
				if (ret == 0)
					ret = -ENOBUFS;
				break;
			}

			n.type = WATCH_TYPE_META;
			n.subtype = WATCH_META_LOSS_NOTIFICATION;
			n.info = watch_sizeof(n);
			if (copy_to_iter(&n, sizeof(n), to) != sizeof(n)) {
				if (ret == 0)
					ret = -EFAULT;
				break;
			}
			ret += sizeof(n);
			total_len -= sizeof(n);
			pipe->note_loss = false;
		}
#endif

		// Pipe is not empty: read from the buffer at the tail
		if (!pipe_empty(head, tail)) {
			struct pipe_buffer *buf = &pipe->bufs[tail & mask]; // buffer slot at the tail
			size_t chars = buf->len; // bytes available in this buffer
			size_t written;
			int error;

			// Never copy more than the caller still wants
			if (chars > total_len) {
				if (buf->flags & PIPE_BUF_FLAG_WHOLE) { // WHOLE buffers are not split
					if (ret == 0)
						ret = -ENOBUFS;
					break;
				}
				chars = total_len;
			}

			// Have the buffer confirmed before using it; a non-zero result is an error
			error = pipe_buf_confirm(pipe, buf);
			if (error) {
				if (!ret)
					ret = error;
				break;
			}

			// Copy from the buffer's page into the destination iterator
			written = copy_page_to_iter(buf->page, buf->offset, chars, to);
			if (unlikely(written < chars)) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			// Account for the bytes just consumed
			ret += chars;
			buf->offset += chars; // advance within the buffer
			buf->len -= chars; // fewer bytes remain

			/* Was it a packet buffer? Clean up and exit */
			// Packet buffer: drop whatever is left in it, and set total_len so
			// the subtraction below reaches zero and the loop ends
			if (buf->flags & PIPE_BUF_FLAG_PACKET) {
				total_len = chars;
				buf->len = 0;
			}

			// Buffer fully drained: advance the tail
			if (!buf->len)
				tail = pipe_update_tail(pipe, buf, tail);
			total_len -= chars;
			if (!total_len)
				break;	/* common path: read succeeded */
			if (!pipe_empty(head, tail))	/* More to do? */
				continue;
		}

		// ** Blocking and wakeups **
		// No writers left: stop (ret stays 0 if nothing was read)
		if (!pipe->writers)
			break;
		// Something was already read, or an error is pending: return it
		if (ret)
			break;
		// Non-blocking request: fail with -EAGAIN instead of sleeping
		if ((filp->f_flags & O_NONBLOCK) ||
		    (iocb->ki_flags & IOCB_NOWAIT)) {
			ret = -EAGAIN;
			break;
		}
		// Drop the lock before waiting
		__pipe_unlock(pipe);

		/*
		 * We only get here if we didn't actually read anything.
		 *
		 * However, we could have seen (and removed) a zero-sized
		 * pipe buffer, and might have made space in the buffers
		 * that way.
		 *
		 * You can't make zero-sized pipe buffers by doing an empty
		 * write (not even in packet mode), but they can happen if
		 * the writer gets an EFAULT when trying to fill a buffer
		 * that already got allocated and inserted in the buffer
		 * array.
		 *
		 * So we still need to wake up any pending writers in the
		 * _very_ unlikely case that the pipe was full, but we got
		 * no data.
		 */
		// If the pipe had been full, wake the writers
		if (unlikely(was_full))
			wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);

		/*
		 * But because we didn't read anything, at this point we can
		 * just return directly with -ERESTARTSYS if we're interrupted,
		 * since we've done any required wakeups and there's no need
		 * to mark anything accessed. And we've dropped the lock.
		 */
		// Sleep until pipe_readable(pipe) holds
		if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
			return -ERESTARTSYS; // the wait was interrupted

		__pipe_lock(pipe); // retake the lock
		was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
		wake_next_reader = true; // we waited, so plan to wake the next reader on exit
	}
	// ** Wrap-up **
	// Nothing left in the pipe: no point waking another reader
	if (pipe_empty(pipe->head, pipe->tail))
		wake_next_reader = false;
	__pipe_unlock(pipe); // release the lock

	if (was_full) // wake writers
		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
	if (wake_next_reader) // wake the next reader
		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
	kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); // SIGIO/POLL_OUT to the fasync writers list
	if (ret > 0)
		file_accessed(filp); // mark the file accessed (presumably updates its access time)
	return ret; // byte count or negative errno
}

pipe_read是怎么读取数据的大概知道了。