commit 8a75e30e6d473f6f63bc0ca9bdde6caa1563b6d0
tree 20453ce2f7741682aabd80e1c21c2db8c8562680
parent 047304d0bfa5be2ace106974f87eec51e0832cd0
parent f1a456f8f3fc5828d8abcad941860380ae147b1d
author Jakub Kicinski <kuba@kernel.org> 2021-11-01 16:33:29 -0700
committer Jakub Kicinski <kuba@kernel.org> 2021-11-01 16:33:29 -0700

Merge branch 'accurate-memory-charging-for-msg_zerocopy'
Talal Ahmad says:
====================
Accurate Memory Charging For MSG_ZEROCOPY
This series improves the accuracy of MSG_ZEROCOPY memory accounting.
At present, when MSG_ZEROCOPY is used, memory is charged twice for the
data - once when user space allocates it, and then again within
__zerocopy_sg_from_iter. The memory charging in the kernel is excessive
because data is held in user pages and is never actually copied to skb
fragments. This leads to incorrectly inflated memory statistics for
programs passing MSG_ZEROCOPY.
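For reference, the userspace side of this path looks roughly like the
sketch below. It is not part of this series; send_zerocopy() is an
illustrative name, and a real sender would poll and loop until the
completion notification arrives:

  #include <errno.h>
  #include <stddef.h>
  #include <sys/socket.h>

  #ifndef SO_ZEROCOPY
  #define SO_ZEROCOPY 60
  #endif
  #ifndef MSG_ZEROCOPY
  #define MSG_ZEROCOPY 0x4000000
  #endif

  static int send_zerocopy(int fd, const void *buf, size_t len)
  {
  	char control[128];
  	struct msghdr msg = { .msg_control = control,
  			      .msg_controllen = sizeof(control) };
  	int one = 1;

  	/* One-time opt-in, required before MSG_ZEROCOPY sends. */
  	if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one)) < 0)
  		return -errno;

  	/* Pages backing buf are pinned and referenced from skb frags,
  	 * not copied: this is the memory the series stops charging a
  	 * second time.
  	 */
  	if (send(fd, buf, len, MSG_ZEROCOPY) < 0)
  		return -errno;

  	/* buf may be reused only after the completion notification
  	 * arrives on the socket error queue; this read never blocks
  	 * (EAGAIN if the notification has not arrived yet).
  	 */
  	if (recvmsg(fd, &msg, MSG_ERRQUEUE) < 0)
  		return -errno;

  	return 0;
  }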
We reduce this inaccuracy by introducing the notion of "pure" zerocopy
SKBs - where all the frags in the SKB are backed by pinned userspace
pages, and none are backed by copied pages. For such SKBs, tracked via
the new SKBFL_PURE_ZEROCOPY flag, we elide sk_mem_charge/uncharge
calls, leading to more accurate accounting.
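The charge side mirrors the new tcp_wmem_free_skb() in the diff below.
As a conceptual sketch only (the actual net/ipv4/tcp.c change lies
outside this include/-limited diffstat, and tcp_wmem_charge_skb is an
illustrative name):

  static inline void tcp_wmem_charge_skb(struct sock *sk,
  					struct sk_buff *skb)
  {
  	sk_wmem_queued_add(sk, skb->truesize);
  	if (!skb_zcopy_pure(skb))
  		/* data was copied into the skb: charge all of it */
  		sk_mem_charge(sk, skb->truesize);
  	else
  		/* pure zerocopy: only headers consume kernel memory */
  		sk_mem_charge(sk, SKB_TRUESIZE(MAX_TCP_HEADER));
  }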
However, SKBs can also be coalesced by the stack at present,
potentially leading to "impure" SKBs. We restrict this coalescing so
it can only happen within the sendmsg() system call itself, for the
most recently allocated SKB. While this can lead to a small degree of
double-charging of memory, this case does not arise often in practice
for workloads that set MSG_ZEROCOPY.
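Concretely, coalescing is now gated on both SKBs having the same
"purity"; the skb_pure_zcopy_same() helper added in the diff below
reduces to the following check (a sketch restating the new predicate):

  /* Never merge a pure zerocopy skb (only pinned user pages) with a
   * copied-data skb: the result would be "impure" and could no longer
   * skip sk_mem_charge/uncharge safely.
   */
  if (skb_zcopy_pure(to) != skb_zcopy_pure(from))
  	return false;	/* refuse to collapse */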
Testing verified that memory usage in the kernel is lowered.
Instrumentation with counters also showed that the amounts charged and
uncharged remain balanced.
====================
Link: https://lore.kernel.org/r/20211030020542.3870542-1-mailtalalahmad@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'include'):

 include/linux/skbuff.h | 19 ++++++++++++++++++-
 include/net/sock.h     |  7 -------
 include/net/tcp.h      | 15 +++++++++++++--
 3 files changed, 31 insertions(+), 10 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 0bd6520329f6..10869906cc57 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -454,9 +454,15 @@ enum {
 	 * all frags to avoid possible bad checksum */
 	SKBFL_SHARED_FRAG = BIT(1),
+
+	/* segment contains only zerocopy data and should not be
+	 * charged to the kernel memory.
+	 */
+	SKBFL_PURE_ZEROCOPY = BIT(2),
 };
 
 #define SKBFL_ZEROCOPY_FRAG	(SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG)
+#define SKBFL_ALL_ZEROCOPY	(SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY)
 
 /*
  * The callback notifies userspace to release buffers when skb DMA is done in
@@ -1464,6 +1470,17 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
 	return is_zcopy ? skb_uarg(skb) : NULL;
 }
 
+static inline bool skb_zcopy_pure(const struct sk_buff *skb)
+{
+	return skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY;
+}
+
+static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1,
+				       const struct sk_buff *skb2)
+{
+	return skb_zcopy_pure(skb1) == skb_zcopy_pure(skb2);
+}
+
 static inline void net_zcopy_get(struct ubuf_info *uarg)
 {
 	refcount_inc(&uarg->refcnt);
@@ -1528,7 +1545,7 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success)
 		if (!skb_zcopy_is_nouarg(skb))
 			uarg->callback(skb, uarg, zerocopy_success);
 
-		skb_shinfo(skb)->flags &= ~SKBFL_ZEROCOPY_FRAG;
+		skb_shinfo(skb)->flags &= ~SKBFL_ALL_ZEROCOPY;
 	}
 }
 
diff --git a/include/net/sock.h b/include/net/sock.h
index 620de053002d..b32906e1ab55 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1603,13 +1603,6 @@ static inline void sk_mem_uncharge(struct sock *sk, int size)
 		__sk_mem_reclaim(sk, SK_RECLAIM_CHUNK);
 }
 
-static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
-{
-	sk_wmem_queued_add(sk, -skb->truesize);
-	sk_mem_uncharge(sk, skb->truesize);
-	__kfree_skb(skb);
-}
-
 static inline void sock_release_ownership(struct sock *sk)
 {
 	if (sk->sk_lock.owned) {
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 8e8c5922a7b0..af91f370432e 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -290,6 +290,16 @@ static inline bool tcp_out_of_memory(struct sock *sk)
 	return false;
 }
 
+static inline void tcp_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
+{
+	sk_wmem_queued_add(sk, -skb->truesize);
+	if (!skb_zcopy_pure(skb))
+		sk_mem_uncharge(sk, skb->truesize);
+	else
+		sk_mem_uncharge(sk, SKB_TRUESIZE(MAX_TCP_HEADER));
+	__kfree_skb(skb);
+}
+
 void sk_forced_mem_schedule(struct sock *sk, int size);
 
 bool tcp_check_oom(struct sock *sk, int shift);
@@ -967,7 +977,8 @@ static inline bool tcp_skb_can_collapse(const struct sk_buff *to,
 					const struct sk_buff *from)
 {
 	return likely(tcp_skb_can_collapse_to(to) &&
-		      mptcp_skb_can_collapse(to, from));
+		      mptcp_skb_can_collapse(to, from) &&
+		      skb_pure_zcopy_same(to, from));
 }
 
 /* Events passed to congestion control interface */
@@ -1875,7 +1886,7 @@ static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct sock *sk)
 {
 	list_del(&skb->tcp_tsorted_anchor);
 	tcp_rtx_queue_unlink(skb, sk);
-	sk_wmem_free_skb(sk, skb);
+	tcp_wmem_free_skb(sk, skb);
 }
 
 static inline void tcp_push_pending_frames(struct sock *sk)
