cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

tcp_rate.c (8436B)


// SPDX-License-Identifier: GPL-2.0-only
#include <net/tcp.h>

/* The bandwidth estimator estimates the rate at which the network
 * can currently deliver outbound data packets for this flow. At a high
 * level, it operates by taking a delivery rate sample for each ACK.
 *
 * A rate sample records the rate at which the network delivered packets
 * for this flow, calculated over the time interval between the transmission
 * of a data packet and the acknowledgment of that packet.
 *
 * Specifically, over the interval between each transmit and corresponding ACK,
 * the estimator generates a delivery rate sample. Typically it uses the rate
 * at which packets were acknowledged. However, the approach of using only the
 * acknowledgment rate faces a challenge under the prevalent ACK decimation or
 * compression: packets can temporarily appear to be delivered much quicker
 * than the bottleneck rate. Since it is physically impossible to do that in a
 * sustained fashion, when the estimator notices that the ACK rate is faster
 * than the transmit rate, it uses the latter:
 *
 *    send_rate = #pkts_delivered/(last_snd_time - first_snd_time)
 *    ack_rate  = #pkts_delivered/(last_ack_time - first_ack_time)
 *    bw = min(send_rate, ack_rate)
 *
 * Notice the estimator essentially estimates the goodput, not always the
 * network bottleneck link rate when the sending or receiving is limited by
 * other factors like applications or receiver window limits.  The estimator
 * deliberately avoids using the inter-packet spacing approach because that
 * approach requires a large number of samples and sophisticated filtering.
 *
 * TCP flows can often be application-limited in request/response workloads.
 * The estimator marks a bandwidth sample as application-limited if there
 * was some moment during the sampled window of packets when there was no data
 * ready to send in the write queue.
 */
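
/* Worked example with illustrative, made-up numbers: suppose 10 packets are
 * delivered, their transmissions spanned 8 ms, but the corresponding ACKs
 * arrived within 2 ms because of ACK compression:
 *
 *    send_rate = 10 pkts / 8 ms = 1250 pkts/sec
 *    ack_rate  = 10 pkts / 2 ms = 5000 pkts/sec
 *    bw        = min(1250, 5000) = 1250 pkts/sec
 *
 * so the transient burst of ACKs does not inflate the estimate.
 */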

/* Snapshot the current delivery information in the skb, to generate
 * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered().
 */
void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	 /* In general we need to start delivery rate samples from the
	  * time we received the most recent ACK, to ensure we include
	  * the full time the network needs to deliver all in-flight
	  * packets. If there are no packets in flight yet, then we
	  * know that any ACKs after now indicate that the network was
	  * able to deliver those packets completely in the sampling
	  * interval between now and the next ACK.
	  *
	  * Note that we use packets_out instead of tcp_packets_in_flight(tp)
	  * because the latter is a guess based on RTO and loss-marking
	  * heuristics. We don't want spurious RTOs or loss markings to cause
	  * a spuriously small time interval, causing a spuriously high
	  * bandwidth estimate.
	  */
	if (!tp->packets_out) {
		u64 tstamp_us = tcp_skb_timestamp_us(skb);

		tp->first_tx_mstamp  = tstamp_us;
		tp->delivered_mstamp = tstamp_us;
	}

	TCP_SKB_CB(skb)->tx.first_tx_mstamp	= tp->first_tx_mstamp;
	TCP_SKB_CB(skb)->tx.delivered_mstamp	= tp->delivered_mstamp;
	TCP_SKB_CB(skb)->tx.delivered		= tp->delivered;
	TCP_SKB_CB(skb)->tx.delivered_ce	= tp->delivered_ce;
	TCP_SKB_CB(skb)->tx.is_app_limited	= tp->app_limited ? 1 : 0;
}

/* When an skb is sacked or acked, we fill in the rate sample with the (prior)
 * delivery information when the skb was last transmitted.
 *
 * If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is
 * called multiple times. We favor the information from the most recently
 * sent skb, i.e., the skb with the most recently sent time and the highest
 * sequence.
 */
void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
			    struct rate_sample *rs)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
	u64 tx_tstamp;

	if (!scb->tx.delivered_mstamp)
		return;

	tx_tstamp = tcp_skb_timestamp_us(skb);
	if (!rs->prior_delivered ||
	    tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp,
			       scb->end_seq, rs->last_end_seq)) {
		rs->prior_delivered_ce  = scb->tx.delivered_ce;
		rs->prior_delivered  = scb->tx.delivered;
		rs->prior_mstamp     = scb->tx.delivered_mstamp;
		rs->is_app_limited   = scb->tx.is_app_limited;
		rs->is_retrans	     = scb->sacked & TCPCB_RETRANS;
		rs->last_end_seq     = scb->end_seq;

		/* Record send time of most recently ACKed packet: */
		tp->first_tx_mstamp  = tx_tstamp;
		/* Find the duration of the "send phase" of this window: */
		rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp,
						     scb->tx.first_tx_mstamp);

	}
	/* Mark off the skb delivered once it's sacked to avoid being
	 * used again when it's cumulatively acked. For acked packets
	 * we don't need to reset since it'll be freed soon.
	 */
	if (scb->sacked & TCPCB_SACKED_ACKED)
		scb->tx.delivered_mstamp = 0;
}

/* Update the connection delivery information and generate a rate sample. */
void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
		  bool is_sack_reneg, struct rate_sample *rs)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 snd_us, ack_us;

	/* Clear app limited if bubble is acked and gone. */
	if (tp->app_limited && after(tp->delivered, tp->app_limited))
		tp->app_limited = 0;

	/* TODO: there are multiple places throughout tcp_ack() to get
	 * current time. Refactor the code using a new "tcp_acktag_state"
	 * to carry current time, flags, stats like "tcp_sacktag_state".
	 */
	if (delivered)
		tp->delivered_mstamp = tp->tcp_mstamp;

	rs->acked_sacked = delivered;	/* freshly ACKed or SACKed */
	rs->losses = lost;		/* freshly marked lost */
	/* Return an invalid sample if no timing information is available or
	 * in recovery from loss with SACK reneging. Rate samples taken during
	 * a SACK reneging event may overestimate bw by including packets that
	 * were SACKed before the reneg.
	 */
	if (!rs->prior_mstamp || is_sack_reneg) {
		rs->delivered = -1;
		rs->interval_us = -1;
		return;
	}
	rs->delivered   = tp->delivered - rs->prior_delivered;

	rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce;
	/* delivered_ce occupies less than 32 bits in the skb control block */
	rs->delivered_ce &= TCPCB_DELIVERED_CE_MASK;

	/* Model sending data and receiving ACKs as separate pipeline phases
	 * for a window. Usually the ACK phase is longer, but with ACK
	 * compression the send phase can be longer. To be safe we use the
	 * longer phase.
	 */
	snd_us = rs->interval_us;				/* send phase */
	ack_us = tcp_stamp_us_delta(tp->tcp_mstamp,
				    rs->prior_mstamp); /* ack phase */
	rs->interval_us = max(snd_us, ack_us);

	/* Record both segment send and ack receive intervals */
	rs->snd_interval_us = snd_us;
	rs->rcv_interval_us = ack_us;

	/* Normally we expect interval_us >= min-rtt.
	 * Note that rate may still be over-estimated when a spuriously
	 * retransmitted skb was first (s)acked because "interval_us"
	 * is under-estimated (up to an RTT). However, continuously
	 * measuring the delivery rate during loss recovery is crucial
	 * for connections that suffer heavy or prolonged losses.
	 */
	if (unlikely(rs->interval_us < tcp_min_rtt(tp))) {
		if (!rs->is_retrans)
			pr_debug("tcp rate: %ld %d %u %u %u\n",
				 rs->interval_us, rs->delivered,
				 inet_csk(sk)->icsk_ca_state,
				 tp->rx_opt.sack_ok, tcp_min_rtt(tp));
		rs->interval_us = -1;
		return;
	}

	/* Record the last non-app-limited or the highest app-limited bw */
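	/* The check below is the rate comparison
	 *   rs->delivered / rs->interval_us >=
	 *   tp->rate_delivered / tp->rate_interval_us
	 * rewritten as a 64-bit cross-multiplication to avoid division.
	 */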
	if (!rs->is_app_limited ||
	    ((u64)rs->delivered * tp->rate_interval_us >=
	     (u64)tp->rate_delivered * rs->interval_us)) {
		tp->rate_delivered = rs->delivered;
		tp->rate_interval_us = rs->interval_us;
		tp->rate_app_limited = rs->is_app_limited;
	}
}

/* If a gap is detected between sends, mark the socket application-limited. */
void tcp_rate_check_app_limited(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

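	/* tp->app_limited records the delivered count at which the current
	 * application-limited period ends (delivered + packets in flight).
	 * The "?: 1" below keeps the marker nonzero, since app_limited == 0
	 * means "not application-limited".
	 */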
	if (/* We have less than one packet to send. */
	    tp->write_seq - tp->snd_nxt < tp->mss_cache &&
	    /* Nothing in sending host's qdisc queues or NIC tx queue. */
	    sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) &&
	    /* We are not limited by CWND. */
	    tcp_packets_in_flight(tp) < tcp_snd_cwnd(tp) &&
	    /* All lost packets have been retransmitted. */
	    tp->lost_out <= tp->retrans_out)
		tp->app_limited =
			(tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
}
EXPORT_SYMBOL_GPL(tcp_rate_check_app_limited);
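
/* Illustrative sketch (not part of the original file): how a consumer of
 * struct rate_sample, such as a congestion control module, could turn a
 * valid sample into a bandwidth estimate in bytes per second. The helper
 * name and the use of tp->mss_cache to approximate delivered bytes are
 * assumptions made for this example.
 */
static inline u64 example_sample_bw_bps(const struct tcp_sock *tp,
					const struct rate_sample *rs)
{
	u64 bytes;

	/* tcp_rate_gen() flags unusable samples with interval_us == -1. */
	if (rs->interval_us <= 0 || rs->delivered <= 0)
		return 0;

	/* Approximate delivered bytes from delivered segments. */
	bytes = (u64)rs->delivered * tp->mss_cache;

	/* Bytes over a microsecond interval -> bytes per second. */
	return div64_u64(bytes * USEC_PER_SEC, (u64)rs->interval_us);
}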