Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net

[linux-2.6-block.git] / net / ipv4 / tcp_output.c
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c

index 3f510cad0b3ec884aeb23f58aaa597ec98c82c88..c31badfee806a70d5af6dc033cf9155adb1f9375 100644 (file)
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -233,16 +233,14 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
         if (init_rcv_wnd)
                 *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
  
-       (*rcv_wscale) = 0;
+       *rcv_wscale = 0;
         if (wscale_ok) {
                 /* Set window scaling on max possible window */
                 space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
                 space = max_t(u32, space, sysctl_rmem_max);
                 space = min_t(u32, space, *window_clamp);
-               while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) {
-                       space >>= 1;
-                       (*rcv_wscale)++;
-               }
+               *rcv_wscale = clamp_t(int, ilog2(space) - 15,
+                                     0, TCP_MAX_WSCALE);
         }
         /* Set the clamp no higher than max representable value */
         (*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
@@ -596,7 +594,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
  
         *md5 = NULL;
  #ifdef CONFIG_TCP_MD5SIG
-       if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
+       if (static_key_false(&tcp_md5_needed) &&
+           rcu_access_pointer(tp->md5sig_info)) {
                 *md5 = tp->af_specific->md5_lookup(sk, sk);
                 if (*md5) {
                         opts->options |= OPTION_MD5;
@@ -732,7 +731,8 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
  
         *md5 = NULL;
  #ifdef CONFIG_TCP_MD5SIG
-       if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
+       if (static_key_false(&tcp_md5_needed) &&
+           rcu_access_pointer(tp->md5sig_info)) {
                 *md5 = tp->af_specific->md5_lookup(sk, sk);
                 if (*md5) {
                         opts->options |= OPTION_MD5;
@@ -1904,24 +1904,27 @@ static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue,
   * This algorithm is from John Heffner.
   */
  static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
-                                bool *is_cwnd_limited, u32 max_segs)
+                                bool *is_cwnd_limited,
+                                bool *is_rwnd_limited,
+                                u32 max_segs)
  {
         const struct inet_connection_sock *icsk = inet_csk(sk);
-       u32 age, send_win, cong_win, limit, in_flight;
+       u32 send_win, cong_win, limit, in_flight;
         struct tcp_sock *tp = tcp_sk(sk);
         struct sk_buff *head;
         int win_divisor;
-
-       if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
-               goto send_now;
+       s64 delta;
  
         if (icsk->icsk_ca_state >= TCP_CA_Recovery)
                 goto send_now;
  
         /* Avoid bursty behavior by allowing defer
-        * only if the last write was recent.
+        * only if the last write was recent (1 ms).
+        * Note that tp->tcp_wstamp_ns can be in the future if we have
+        * packets waiting in a qdisc or device for EDT delivery.
          */
-       if ((s32)(tcp_jiffies32 - tp->lsndtime) > 0)
+       delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC;
+       if (delta > 0)
                 goto send_now;
  
         in_flight = tcp_packets_in_flight(tp);
@@ -1944,6 +1947,10 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
         if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
                 goto send_now;
  
+       /* If this packet won't get more data, do not wait. */
+       if (TCP_SKB_CB(skb)->eor)
+               goto send_now;
+
         win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor);
         if (win_divisor) {
                 u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
@@ -1968,15 +1975,32 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
         head = tcp_rtx_queue_head(sk);
         if (!head)
                 goto send_now;
-       age = tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(head));
+       delta = tp->tcp_clock_cache - head->tstamp;
         /* If next ACK is likely to come too late (half srtt), do not defer */
-       if (age < (tp->srtt_us >> 4))
+       if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0)
                 goto send_now;
  
-       /* Ok, it looks like it is advisable to defer. */
+       /* Ok, it looks like it is advisable to defer.
+        * Three cases are tracked:
+        * 1) We are cwnd-limited
+        * 2) We are rwnd-limited
+        * 3) We are application-limited.
+        */
+       if (cong_win < send_win) {
+               if (cong_win <= skb->len) {
+                       *is_cwnd_limited = true;
+                       return true;
+               }
+       } else {
+               if (send_win <= skb->len) {
+                       *is_rwnd_limited = true;
+                       return true;
+               }
+       }
  
-       if (cong_win < send_win && cong_win <= skb->len)
-               *is_cwnd_limited = true;
+       /* If this packet won't get more data, do not wait. */
+       if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+               goto send_now;
  
         return true;
  
@@ -2212,8 +2236,9 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
         limit = max_t(unsigned long,
                       2 * skb->truesize,
                       sk->sk_pacing_rate >> sk->sk_pacing_shift);
-       limit = min_t(unsigned long, limit,
-                     sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
+       if (sk->sk_pacing_status == SK_PACING_NONE)
+               limit = min_t(unsigned long, limit,
+                             sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
         limit <<= factor;
  
         if (refcount_read(&sk->sk_wmem_alloc) > limit) {
@@ -2356,7 +2381,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                 } else {
                         if (!push_one &&
                             tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
-                                                max_segs))
+                                                &is_rwnd_limited, max_segs))
                                 break;
                 }
  
@@ -2494,15 +2519,18 @@ void tcp_send_loss_probe(struct sock *sk)
                 goto rearm_timer;
         }
         skb = skb_rb_last(&sk->tcp_rtx_queue);
+       if (unlikely(!skb)) {
+               WARN_ONCE(tp->packets_out,
+                         "invalid inflight: %u state %u cwnd %u mss %d\n",
+                         tp->packets_out, sk->sk_state, tp->snd_cwnd, mss);
+               inet_csk(sk)->icsk_pending = 0;
+               return;
+       }
  
         /* At most one outstanding TLP retransmission. */
         if (tp->tlp_high_seq)
                 goto rearm_timer;
  
-       /* Retransmit last segment. */
-       if (WARN_ON(!skb))
-               goto rearm_timer;
-
         if (skb_still_in_host_queue(sk, skb))
                 goto rearm_timer;
  
@@ -2920,7 +2948,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
                 TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
                 trace_tcp_retransmit_skb(sk, skb);
         } else if (err != -EBUSY) {
-               NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
+               NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
         }
         return err;
  }