page_pool: devmem support
author: Mina Almasry <almasrymina@google.com>
Tue, 10 Sep 2024 17:14:49 +0000 (17:14 +0000)
committer: Jakub Kicinski <kuba@kernel.org>
Thu, 12 Sep 2024 03:44:31 +0000 (20:44 -0700)
Convert netmem to be a union of struct page and struct net_iov. Overload
the LSB of netmem_ref to indicate that it's a net_iov; otherwise
it's a page.

Currently these entries in struct page are rented by the page_pool and
used exclusively by the net stack:

    struct {
        unsigned long pp_magic;
        struct page_pool *pp;
        unsigned long _pp_mapping_pad;
        unsigned long dma_addr;
        atomic_long_t pp_ref_count;
    };

Mirror these (and only these) entries into struct net_iov and implement
netmem helpers that can access these common fields regardless of
whether the underlying type is page or net_iov.

Implement checks for net_iov in netmem helpers which delegate to mm
APIs, to ensure net_iov are never passed to the mm stack.

Signed-off-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Link: https://patch.msgid.link/20240910171458.219195-6-almasrymina@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
include/net/netmem.h
include/net/page_pool/helpers.h
include/trace/events/page_pool.h
net/core/devmem.c
net/core/netmem_priv.h [new file with mode: 0644]
net/core/page_pool.c
net/core/page_pool_priv.h
net/core/skbuff.c

index c23e224dd6a0997295dcc2e3deef792a5daf787a..8a6e20be4b9d3094fb6d7314c1e447b43bc8e77e 100644 (file)
@@ -8,12 +8,52 @@
 #ifndef _NET_NETMEM_H
 #define _NET_NETMEM_H
 
+#include <linux/mm.h>
+#include <net/net_debug.h>
+
 /* net_iov */
 
+DECLARE_STATIC_KEY_FALSE(page_pool_mem_providers);
+
+/* We overload the LSB of a netmem_ref to indicate whether it refers to a
+ * net_iov (bit set) or a struct page (bit clear).
+ */
+#define NET_IOV 0x01UL
+
 struct net_iov {
+       unsigned long __unused_padding;
+       unsigned long pp_magic;
+       struct page_pool *pp;
        struct dmabuf_genpool_chunk_owner *owner;
+       unsigned long dma_addr;
+       atomic_long_t pp_ref_count;
 };
 
+/* These fields in struct page are used by the page_pool and net stack:
+ *
+ *        struct {
+ *                unsigned long pp_magic;
+ *                struct page_pool *pp;
+ *                unsigned long _pp_mapping_pad;
+ *                unsigned long dma_addr;
+ *                atomic_long_t pp_ref_count;
+ *        };
+ *
+ * We mirror the page_pool fields here so the page_pool can access these fields
+ * without worrying whether the underlying fields belong to a page or net_iov.
+ *
+ * The non-net stack fields of struct page are private to the mm stack and must
+ * never be mirrored to net_iov.
+ */
+#define NET_IOV_ASSERT_OFFSET(pg, iov)             \
+       static_assert(offsetof(struct page, pg) == \
+                     offsetof(struct net_iov, iov))
+NET_IOV_ASSERT_OFFSET(pp_magic, pp_magic);
+NET_IOV_ASSERT_OFFSET(pp, pp);
+NET_IOV_ASSERT_OFFSET(dma_addr, dma_addr);
+NET_IOV_ASSERT_OFFSET(pp_ref_count, pp_ref_count);
+#undef NET_IOV_ASSERT_OFFSET
+
 /* netmem */
 
 /**
@@ -27,20 +67,37 @@ struct net_iov {
  */
 typedef unsigned long __bitwise netmem_ref;
 
+static inline bool netmem_is_net_iov(const netmem_ref netmem)
+{
+       return (__force unsigned long)netmem & NET_IOV;
+}
+
 /* This conversion fails (returns NULL) if the netmem_ref is not struct page
  * backed.
- *
- * Currently struct page is the only possible netmem, and this helper never
- * fails.
  */
 static inline struct page *netmem_to_page(netmem_ref netmem)
 {
+       if (WARN_ON_ONCE(netmem_is_net_iov(netmem)))
+               return NULL;
+
        return (__force struct page *)netmem;
 }
 
-/* Converting from page to netmem is always safe, because a page can always be
- * a netmem.
- */
+static inline struct net_iov *netmem_to_net_iov(netmem_ref netmem)
+{
+       if (netmem_is_net_iov(netmem))
+               return (struct net_iov *)((__force unsigned long)netmem &
+                                         ~NET_IOV);
+
+       DEBUG_NET_WARN_ON_ONCE(true);
+       return NULL;
+}
+
+static inline netmem_ref net_iov_to_netmem(struct net_iov *niov)
+{
+       return (__force netmem_ref)((unsigned long)niov | NET_IOV);
+}
+
 static inline netmem_ref page_to_netmem(struct page *page)
 {
        return (__force netmem_ref)page;
@@ -48,17 +105,70 @@ static inline netmem_ref page_to_netmem(struct page *page)
 
 static inline int netmem_ref_count(netmem_ref netmem)
 {
+       /* The non-pp refcount of net_iov is always 1. On net_iov, we only
+        * support pp refcounting which uses the pp_ref_count field.
+        */
+       if (netmem_is_net_iov(netmem))
+               return 1;
+
        return page_ref_count(netmem_to_page(netmem));
 }
 
-static inline unsigned long netmem_to_pfn(netmem_ref netmem)
+static inline unsigned long netmem_pfn_trace(netmem_ref netmem)
 {
+       if (netmem_is_net_iov(netmem))
+               return 0;
+
        return page_to_pfn(netmem_to_page(netmem));
 }
 
+static inline struct net_iov *__netmem_clear_lsb(netmem_ref netmem)
+{
+       return (struct net_iov *)((__force unsigned long)netmem & ~NET_IOV);
+}
+
+static inline struct page_pool *netmem_get_pp(netmem_ref netmem)
+{
+       return __netmem_clear_lsb(netmem)->pp;
+}
+
+static inline atomic_long_t *netmem_get_pp_ref_count_ref(netmem_ref netmem)
+{
+       return &__netmem_clear_lsb(netmem)->pp_ref_count;
+}
+
+static inline bool netmem_is_pref_nid(netmem_ref netmem, int pref_nid)
+{
+       /* NUMA node preference only makes sense if we're allocating
+        * system memory. Memory providers (which give us net_iovs)
+        * choose for us.
+        */
+       if (netmem_is_net_iov(netmem))
+               return true;
+
+       return page_to_nid(netmem_to_page(netmem)) == pref_nid;
+}
+
 static inline netmem_ref netmem_compound_head(netmem_ref netmem)
 {
+       /* niov are never compounded */
+       if (netmem_is_net_iov(netmem))
+               return netmem;
+
        return page_to_netmem(compound_head(netmem_to_page(netmem)));
 }
 
+static inline void *netmem_address(netmem_ref netmem)
+{
+       if (netmem_is_net_iov(netmem))
+               return NULL;
+
+       return page_address(netmem_to_page(netmem));
+}
+
+static inline unsigned long netmem_get_dma_addr(netmem_ref netmem)
+{
+       return __netmem_clear_lsb(netmem)->dma_addr;
+}
+
 #endif /* _NET_NETMEM_H */
index 2b43a893c619d9c5c66194a757d682670dad3c92..793e6fd78bc5c0890bc875a0ddfcc896e89896dd 100644 (file)
@@ -216,7 +216,7 @@ page_pool_get_dma_dir(const struct page_pool *pool)
 
 static inline void page_pool_fragment_netmem(netmem_ref netmem, long nr)
 {
-       atomic_long_set(&netmem_to_page(netmem)->pp_ref_count, nr);
+       atomic_long_set(netmem_get_pp_ref_count_ref(netmem), nr);
 }
 
 /**
@@ -244,7 +244,7 @@ static inline void page_pool_fragment_page(struct page *page, long nr)
 
 static inline long page_pool_unref_netmem(netmem_ref netmem, long nr)
 {
-       struct page *page = netmem_to_page(netmem);
+       atomic_long_t *pp_ref_count = netmem_get_pp_ref_count_ref(netmem);
        long ret;
 
        /* If nr == pp_ref_count then we have cleared all remaining
@@ -261,19 +261,19 @@ static inline long page_pool_unref_netmem(netmem_ref netmem, long nr)
         * initially, and only overwrite it when the page is partitioned into
         * more than one piece.
         */
-       if (atomic_long_read(&page->pp_ref_count) == nr) {
+       if (atomic_long_read(pp_ref_count) == nr) {
                /* As we have ensured nr is always one for constant case using
                 * the BUILD_BUG_ON(), only need to handle the non-constant case
                 * here for pp_ref_count draining, which is a rare case.
                 */
                BUILD_BUG_ON(__builtin_constant_p(nr) && nr != 1);
                if (!__builtin_constant_p(nr))
-                       atomic_long_set(&page->pp_ref_count, 1);
+                       atomic_long_set(pp_ref_count, 1);
 
                return 0;
        }
 
-       ret = atomic_long_sub_return(nr, &page->pp_ref_count);
+       ret = atomic_long_sub_return(nr, pp_ref_count);
        WARN_ON(ret < 0);
 
        /* We are the last user here too, reset pp_ref_count back to 1 to
@@ -282,7 +282,7 @@ static inline long page_pool_unref_netmem(netmem_ref netmem, long nr)
         * page_pool_unref_page() currently.
         */
        if (unlikely(!ret))
-               atomic_long_set(&page->pp_ref_count, 1);
+               atomic_long_set(pp_ref_count, 1);
 
        return ret;
 }
@@ -401,9 +401,7 @@ static inline void page_pool_free_va(struct page_pool *pool, void *va,
 
 static inline dma_addr_t page_pool_get_dma_addr_netmem(netmem_ref netmem)
 {
-       struct page *page = netmem_to_page(netmem);
-
-       dma_addr_t ret = page->dma_addr;
+       dma_addr_t ret = netmem_get_dma_addr(netmem);
 
        if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA)
                ret <<= PAGE_SHIFT;
@@ -423,24 +421,6 @@ static inline dma_addr_t page_pool_get_dma_addr(const struct page *page)
        return page_pool_get_dma_addr_netmem(page_to_netmem((struct page *)page));
 }
 
-static inline bool page_pool_set_dma_addr_netmem(netmem_ref netmem,
-                                                dma_addr_t addr)
-{
-       struct page *page = netmem_to_page(netmem);
-
-       if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) {
-               page->dma_addr = addr >> PAGE_SHIFT;
-
-               /* We assume page alignment to shave off bottom bits,
-                * if this "compression" doesn't work we need to drop.
-                */
-               return addr != (dma_addr_t)page->dma_addr << PAGE_SHIFT;
-       }
-
-       page->dma_addr = addr;
-       return false;
-}
-
 /**
  * page_pool_dma_sync_for_cpu - sync Rx page for CPU after it's written by HW
  * @pool: &page_pool the @page belongs to
@@ -463,11 +443,6 @@ static inline void page_pool_dma_sync_for_cpu(const struct page_pool *pool,
                                      page_pool_get_dma_dir(pool));
 }
 
-static inline bool page_pool_set_dma_addr(struct page *page, dma_addr_t addr)
-{
-       return page_pool_set_dma_addr_netmem(page_to_netmem(page), addr);
-}
-
 static inline bool page_pool_put(struct page_pool *pool)
 {
        return refcount_dec_and_test(&pool->user_cnt);
index 543e54e432a1822ceb34c55b39c59169b289b111..31825ed30032418d8258f336899fe516a5638c83 100644 (file)
@@ -57,12 +57,12 @@ TRACE_EVENT(page_pool_state_release,
                __entry->pool           = pool;
                __entry->netmem         = (__force unsigned long)netmem;
                __entry->release        = release;
-               __entry->pfn            = netmem_to_pfn(netmem);
+               __entry->pfn            = netmem_pfn_trace(netmem);
        ),
 
-       TP_printk("page_pool=%p netmem=%p pfn=0x%lx release=%u",
+       TP_printk("page_pool=%p netmem=%p is_net_iov=%lu pfn=0x%lx release=%u",
                  __entry->pool, (void *)__entry->netmem,
-                 __entry->pfn, __entry->release)
+                 __entry->netmem & NET_IOV, __entry->pfn, __entry->release)
 );
 
 TRACE_EVENT(page_pool_state_hold,
@@ -83,12 +83,12 @@ TRACE_EVENT(page_pool_state_hold,
                __entry->pool   = pool;
                __entry->netmem = (__force unsigned long)netmem;
                __entry->hold   = hold;
-               __entry->pfn    = netmem_to_pfn(netmem);
+               __entry->pfn    = netmem_pfn_trace(netmem);
        ),
 
-       TP_printk("page_pool=%p netmem=%p pfn=0x%lx hold=%u",
+       TP_printk("page_pool=%p netmem=%p is_net_iov=%lu, pfn=0x%lx hold=%u",
                  __entry->pool, (void *)__entry->netmem,
-                 __entry->pfn, __entry->hold)
+                 __entry->netmem & NET_IOV, __entry->pfn, __entry->hold)
 );
 
 TRACE_EVENT(page_pool_update_nid,
index 9beb03763dc93ae9d3be2ad456a2d9fa4856513a..7efeb602cf4547373ce78dac500d718d7a807988 100644 (file)
@@ -18,6 +18,7 @@
 #include <trace/events/page_pool.h>
 
 #include "devmem.h"
+#include "page_pool_priv.h"
 
 /* Device memory support */
 
@@ -82,6 +83,10 @@ net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding)
        index = offset / PAGE_SIZE;
        niov = &owner->niovs[index];
 
+       niov->pp_magic = 0;
+       niov->pp = NULL;
+       atomic_long_set(&niov->pp_ref_count, 0);
+
        return niov;
 }
 
@@ -269,6 +274,8 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
                for (i = 0; i < owner->num_niovs; i++) {
                        niov = &owner->niovs[i];
                        niov->owner = owner;
+                       page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov),
+                                                     net_devmem_get_dma_addr(niov));
                }
 
                virtual += len;
diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h
new file mode 100644 (file)
index 0000000..7eadb83
--- /dev/null
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __NETMEM_PRIV_H
+#define __NETMEM_PRIV_H
+
+static inline unsigned long netmem_get_pp_magic(netmem_ref netmem)
+{
+       return __netmem_clear_lsb(netmem)->pp_magic;
+}
+
+static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic)
+{
+       __netmem_clear_lsb(netmem)->pp_magic |= pp_magic;
+}
+
+static inline void netmem_clear_pp_magic(netmem_ref netmem)
+{
+       __netmem_clear_lsb(netmem)->pp_magic = 0;
+}
+
+static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool)
+{
+       __netmem_clear_lsb(netmem)->pp = pool;
+}
+
+static inline void netmem_set_dma_addr(netmem_ref netmem,
+                                      unsigned long dma_addr)
+{
+       __netmem_clear_lsb(netmem)->dma_addr = dma_addr;
+}
+#endif
index 2abe6e919224d98ba72a645c699e8de5ba6e414a..52659db2d7651a392401a5561b4ea42f59d51e9e 100644 (file)
 
 #include <trace/events/page_pool.h>
 
+#include "netmem_priv.h"
 #include "page_pool_priv.h"
 
+DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers);
+
 #define DEFER_TIME (msecs_to_jiffies(1000))
 #define DEFER_WARN_INTERVAL (60 * HZ)
 
@@ -358,7 +361,7 @@ static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool)
                if (unlikely(!netmem))
                        break;
 
-               if (likely(page_to_nid(netmem_to_page(netmem)) == pref_nid)) {
+               if (likely(netmem_is_pref_nid(netmem, pref_nid))) {
                        pool->alloc.cache[pool->alloc.count++] = netmem;
                } else {
                        /* NUMA mismatch;
@@ -454,10 +457,8 @@ unmap_failed:
 
 static void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem)
 {
-       struct page *page = netmem_to_page(netmem);
-
-       page->pp = pool;
-       page->pp_magic |= PP_SIGNATURE;
+       netmem_set_pp(netmem, pool);
+       netmem_or_pp_magic(netmem, PP_SIGNATURE);
 
        /* Ensuring all pages have been split into one fragment initially:
         * page_pool_set_pp_info() is only called once for every page when it
@@ -472,10 +473,8 @@ static void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem)
 
 static void page_pool_clear_pp_info(netmem_ref netmem)
 {
-       struct page *page = netmem_to_page(netmem);
-
-       page->pp_magic = 0;
-       page->pp = NULL;
+       netmem_clear_pp_magic(netmem);
+       netmem_set_pp(netmem, NULL);
 }
 
 static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
@@ -692,8 +691,9 @@ static bool page_pool_recycle_in_cache(netmem_ref netmem,
 
 static bool __page_pool_page_can_be_recycled(netmem_ref netmem)
 {
-       return page_ref_count(netmem_to_page(netmem)) == 1 &&
-              !page_is_pfmemalloc(netmem_to_page(netmem));
+       return netmem_is_net_iov(netmem) ||
+              (page_ref_count(netmem_to_page(netmem)) == 1 &&
+               !page_is_pfmemalloc(netmem_to_page(netmem)));
 }
 
 /* If the page refcnt == 1, this will try to recycle the page.
@@ -728,6 +728,7 @@ __page_pool_put_page(struct page_pool *pool, netmem_ref netmem,
                /* Page found as candidate for recycling */
                return netmem;
        }
+
        /* Fallback/non-XDP mode: API user have elevated refcnt.
         *
         * Many drivers split up the page into fragments, and some
@@ -949,7 +950,7 @@ static void page_pool_empty_ring(struct page_pool *pool)
        /* Empty recycle ring */
        while ((netmem = (__force netmem_ref)ptr_ring_consume_bh(&pool->ring))) {
                /* Verify the refcnt invariant of cached pages */
-               if (!(page_ref_count(netmem_to_page(netmem)) == 1))
+               if (!(netmem_ref_count(netmem) == 1))
                        pr_crit("%s() page_pool refcnt %d violation\n",
                                __func__, netmem_ref_count(netmem));
 
index 90665d40f1eb73fe6d41daf099da1e49d9ab56ed..d602c1e728c2181676555356a8e819ddb0a64d7c 100644 (file)
@@ -3,10 +3,36 @@
 #ifndef __PAGE_POOL_PRIV_H
 #define __PAGE_POOL_PRIV_H
 
+#include <net/page_pool/helpers.h>
+
+#include "netmem_priv.h"
+
 s32 page_pool_inflight(const struct page_pool *pool, bool strict);
 
 int page_pool_list(struct page_pool *pool);
 void page_pool_detached(struct page_pool *pool);
 void page_pool_unlist(struct page_pool *pool);
 
+static inline bool
+page_pool_set_dma_addr_netmem(netmem_ref netmem, dma_addr_t addr)
+{
+       if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) {
+               netmem_set_dma_addr(netmem, addr >> PAGE_SHIFT);
+
+               /* We assume page alignment to shave off bottom bits,
+                * if this "compression" doesn't work we need to drop.
+                */
+               return addr != (dma_addr_t)netmem_get_dma_addr(netmem)
+                                      << PAGE_SHIFT;
+       }
+
+       netmem_set_dma_addr(netmem, addr);
+       return false;
+}
+
+static inline bool page_pool_set_dma_addr(struct page *page, dma_addr_t addr)
+{
+       return page_pool_set_dma_addr_netmem(page_to_netmem(page), addr);
+}
+
 #endif
index 038a059b5924558651fe358e8563603be2bea23f..8abcc28bb1898aac26d3c8115c69eee411581d81 100644 (file)
@@ -88,6 +88,7 @@
 #include <linux/textsearch.h>
 
 #include "dev.h"
+#include "netmem_priv.h"
 #include "sock_destructor.h"
 
 #ifdef CONFIG_SKB_EXTENSIONS
@@ -920,9 +921,9 @@ static void skb_clone_fraglist(struct sk_buff *skb)
                skb_get(list);
 }
 
-static bool is_pp_page(struct page *page)
+static bool is_pp_netmem(netmem_ref netmem)
 {
-       return (page->pp_magic & ~0x3UL) == PP_SIGNATURE;
+       return (netmem_get_pp_magic(netmem) & ~0x3UL) == PP_SIGNATURE;
 }
 
 int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
@@ -1020,9 +1021,7 @@ EXPORT_SYMBOL(skb_cow_data_for_xdp);
 #if IS_ENABLED(CONFIG_PAGE_POOL)
 bool napi_pp_put_page(netmem_ref netmem)
 {
-       struct page *page = netmem_to_page(netmem);
-
-       page = compound_head(page);
+       netmem = netmem_compound_head(netmem);
 
        /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
         * in order to preserve any existing bits, such as bit 0 for the
@@ -1031,10 +1030,10 @@ bool napi_pp_put_page(netmem_ref netmem)
         * and page_is_pfmemalloc() is checked in __page_pool_put_page()
         * to avoid recycling the pfmemalloc page.
         */
-       if (unlikely(!is_pp_page(page)))
+       if (unlikely(!is_pp_netmem(netmem)))
                return false;
 
-       page_pool_put_full_netmem(page->pp, page_to_netmem(page), false);
+       page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false);
 
        return true;
 }
@@ -1061,7 +1060,7 @@ static bool skb_pp_recycle(struct sk_buff *skb, void *data)
 static int skb_pp_frag_ref(struct sk_buff *skb)
 {
        struct skb_shared_info *shinfo;
-       struct page *head_page;
+       netmem_ref head_netmem;
        int i;
 
        if (!skb->pp_recycle)
@@ -1070,11 +1069,11 @@ static int skb_pp_frag_ref(struct sk_buff *skb)
        shinfo = skb_shinfo(skb);
 
        for (i = 0; i < shinfo->nr_frags; i++) {
-               head_page = compound_head(skb_frag_page(&shinfo->frags[i]));
-               if (likely(is_pp_page(head_page)))
-                       page_pool_ref_page(head_page);
+               head_netmem = netmem_compound_head(shinfo->frags[i].netmem);
+               if (likely(is_pp_netmem(head_netmem)))
+                       page_pool_ref_netmem(head_netmem);
                else
-                       page_ref_inc(head_page);
+                       page_ref_inc(netmem_to_page(head_netmem));
        }
        return 0;
 }