drivers/net/hyperv/netvsc_drv.c
/*
 * Copyright (c) 2009, Microsoft Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, see <http://www.gnu.org/licenses/>.
 *
 * Authors:
 *   Haiyang Zhang <haiyangz@microsoft.com>
 *   Hank Janssen  <hjanssen@microsoft.com>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/init.h>
#include <linux/atomic.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/device.h>
#include <linux/io.h>
#include <linux/delay.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>
#include <linux/if_vlan.h>
#include <linux/in.h>
#include <linux/slab.h>
#include <linux/rtnetlink.h>
#include <linux/netpoll.h>
#include <linux/reciprocal_div.h>

#include <net/arp.h>
#include <net/route.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/ip6_checksum.h>

#include "hyperv_net.h"

#define RING_SIZE_MIN           64

#define LINKCHANGE_INT (2 * HZ)
#define VF_TAKEOVER_INT (HZ / 10)

static unsigned int ring_size __ro_after_init = 128;
module_param(ring_size, uint, S_IRUGO);
MODULE_PARM_DESC(ring_size, "Ring buffer size (# of pages)");
unsigned int netvsc_ring_bytes __ro_after_init;
struct reciprocal_value netvsc_ring_reciprocal __ro_after_init;

static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE |
                                NETIF_MSG_LINK | NETIF_MSG_IFUP |
                                NETIF_MSG_IFDOWN | NETIF_MSG_RX_ERR |
                                NETIF_MSG_TX_ERR;

static int debug = -1;
module_param(debug, int, S_IRUGO);
MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");

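/*
 * Example usage (values are illustrative, not recommendations):
 *
 *   modprobe hv_netvsc ring_size=256 debug=16
 *
 * ring_size is given in pages per VMBus ring buffer. debug feeds the
 * NETIF_MSG_* log mask; -1 falls back to default_msg above, following
 * the usual netif_msg_init() convention.
 */
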
static void netvsc_change_rx_flags(struct net_device *net, int change)
{
        struct net_device_context *ndev_ctx = netdev_priv(net);
        struct net_device *vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
        int inc;

        if (!vf_netdev)
                return;

        if (change & IFF_PROMISC) {
                inc = (net->flags & IFF_PROMISC) ? 1 : -1;
                dev_set_promiscuity(vf_netdev, inc);
        }

        if (change & IFF_ALLMULTI) {
                inc = (net->flags & IFF_ALLMULTI) ? 1 : -1;
                dev_set_allmulti(vf_netdev, inc);
        }
}

static void netvsc_set_rx_mode(struct net_device *net)
{
        struct net_device_context *ndev_ctx = netdev_priv(net);
        struct net_device *vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
        struct netvsc_device *nvdev = rtnl_dereference(ndev_ctx->nvdev);

        if (vf_netdev) {
                dev_uc_sync(vf_netdev, net);
                dev_mc_sync(vf_netdev, net);
        }

        rndis_filter_update(nvdev);
}

static int netvsc_open(struct net_device *net)
{
        struct net_device_context *ndev_ctx = netdev_priv(net);
        struct net_device *vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
        struct netvsc_device *nvdev = rtnl_dereference(ndev_ctx->nvdev);
        struct rndis_device *rdev;
        int ret = 0;

        netif_carrier_off(net);

        /* Open up the device */
        ret = rndis_filter_open(nvdev);
        if (ret != 0) {
                netdev_err(net, "unable to open device (ret %d).\n", ret);
                return ret;
        }

        rdev = nvdev->extension;
        if (!rdev->link_state) {
                netif_carrier_on(net);
                netif_tx_wake_all_queues(net);
        }

        if (vf_netdev) {
                /* Setting the synthetic device up transparently sets the
                 * slave as up. If open fails, then the slave will still
                 * be offline (and not used).
                 */
                ret = dev_open(vf_netdev);
                if (ret)
                        netdev_warn(net,
                                    "unable to open slave: %s: %d\n",
                                    vf_netdev->name, ret);
        }
        return 0;
}

static int netvsc_close(struct net_device *net)
{
        struct net_device_context *net_device_ctx = netdev_priv(net);
        struct net_device *vf_netdev
                = rtnl_dereference(net_device_ctx->vf_netdev);
        struct netvsc_device *nvdev = rtnl_dereference(net_device_ctx->nvdev);
        int ret = 0;
        u32 aread, i, msec = 10, retry = 0, retry_max = 20;
        struct vmbus_channel *chn;

        netif_tx_disable(net);

        /* No need to close rndis filter if it is removed already */
        if (!nvdev)
                goto out;

        ret = rndis_filter_close(nvdev);
        if (ret != 0) {
                netdev_err(net, "unable to close device (ret %d).\n", ret);
                return ret;
        }

        /* Ensure pending bytes in ring are read */
        while (true) {
                aread = 0;
                for (i = 0; i < nvdev->num_chn; i++) {
                        chn = nvdev->chan_table[i].channel;
                        if (!chn)
                                continue;

                        aread = hv_get_bytes_to_read(&chn->inbound);
                        if (aread)
                                break;

                        aread = hv_get_bytes_to_read(&chn->outbound);
                        if (aread)
                                break;
                }

                retry++;
                if (retry > retry_max || aread == 0)
                        break;

                /* Poll with exponential backoff, capped at one second */
                msleep(msec);

                if (msec < 1000)
                        msec *= 2;
        }

        if (aread) {
                netdev_err(net, "Ring buffer not empty after closing rndis\n");
                ret = -ETIMEDOUT;
        }

out:
        if (vf_netdev)
                dev_close(vf_netdev);

        return ret;
}

static inline void *init_ppi_data(struct rndis_message *msg,
                                  u32 ppi_size, u32 pkt_type)
{
        struct rndis_packet *rndis_pkt = &msg->msg.pkt;
        struct rndis_per_packet_info *ppi;

        rndis_pkt->data_offset += ppi_size;
        ppi = (void *)rndis_pkt + rndis_pkt->per_pkt_info_offset
                + rndis_pkt->per_pkt_info_len;

        ppi->size = ppi_size;
        ppi->type = pkt_type;
        ppi->ppi_offset = sizeof(struct rndis_per_packet_info);

        rndis_pkt->per_pkt_info_len += ppi_size;

        return ppi + 1;
}

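/*
 * Resulting RNDIS message layout after each init_ppi_data() call (sketch;
 * offsets are kept relative to the embedded struct rndis_packet):
 *
 *   +---------------------+  <- msg
 *   | rndis_message hdr   |
 *   +---------------------+  <- rndis_pkt (msg.pkt)
 *   | rndis_packet        |
 *   +---------------------+  <- per_pkt_info_offset
 *   | PPI 0 | PPI 1 | ... |     per_pkt_info_len grows with each call
 *   +---------------------+  <- data_offset (pushed back by ppi_size)
 *   | packet payload      |
 *   +---------------------+
 *
 * The returned pointer (ppi + 1) is where the caller writes the PPI
 * payload, e.g. the 32-bit hash value or the 802.1Q info.
 */
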
/* Azure hosts don't support non-TCP port numbers in hashing for fragmented
 * packets. We can use ethtool to change UDP hash level when necessary.
 */
static inline u32 netvsc_get_hash(
        struct sk_buff *skb,
        const struct net_device_context *ndc)
{
        struct flow_keys flow;
        u32 hash, pkt_proto = 0;
        static u32 hashrnd __read_mostly;

        net_get_random_once(&hashrnd, sizeof(hashrnd));

        if (!skb_flow_dissect_flow_keys(skb, &flow, 0))
                return 0;

        switch (flow.basic.ip_proto) {
        case IPPROTO_TCP:
                if (flow.basic.n_proto == htons(ETH_P_IP))
                        pkt_proto = HV_TCP4_L4HASH;
                else if (flow.basic.n_proto == htons(ETH_P_IPV6))
                        pkt_proto = HV_TCP6_L4HASH;

                break;

        case IPPROTO_UDP:
                if (flow.basic.n_proto == htons(ETH_P_IP))
                        pkt_proto = HV_UDP4_L4HASH;
                else if (flow.basic.n_proto == htons(ETH_P_IPV6))
                        pkt_proto = HV_UDP6_L4HASH;

                break;
        }

        if (pkt_proto & ndc->l4_hash) {
                return skb_get_hash(skb);
        } else {
                if (flow.basic.n_proto == htons(ETH_P_IP))
                        hash = jhash2((u32 *)&flow.addrs.v4addrs, 2, hashrnd);
                else if (flow.basic.n_proto == htons(ETH_P_IPV6))
                        hash = jhash2((u32 *)&flow.addrs.v6addrs, 8, hashrnd);
                else
                        hash = 0;

                skb_set_hash(skb, hash, PKT_HASH_TYPE_L3);
        }

        return hash;
}

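/*
 * For example (standard ethtool flow-hash interface, handled by
 * netvsc_set_rss_hash_opts() below):
 *
 *   ethtool -N eth0 rx-flow-hash udp4 sdfn   # hash UDP/IPv4 on addrs+ports
 *   ethtool -N eth0 rx-flow-hash udp4 sd     # fall back to L3-only hashing
 */
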
static inline int netvsc_get_tx_queue(struct net_device *ndev,
                                      struct sk_buff *skb, int old_idx)
{
        const struct net_device_context *ndc = netdev_priv(ndev);
        struct sock *sk = skb->sk;
        int q_idx;

        q_idx = ndc->tx_table[netvsc_get_hash(skb, ndc) &
                              (VRSS_SEND_TAB_SIZE - 1)];

        /* If queue index changed record the new value */
        if (q_idx != old_idx &&
            sk && sk_fullsock(sk) && rcu_access_pointer(sk->sk_dst_cache))
                sk_tx_queue_set(sk, q_idx);

        return q_idx;
}

/*
 * Select queue for transmit.
 *
 * If a valid queue has already been assigned, then use that.
 * Otherwise compute tx queue based on hash and the send table.
 *
 * This is basically similar to default (__netdev_pick_tx) with the added step
 * of using the host send_table when no other queue has been assigned.
 *
 * TODO support XPS - but get_xps_queue not exported
 */
static u16 netvsc_pick_tx(struct net_device *ndev, struct sk_buff *skb)
{
        int q_idx = sk_tx_queue_get(skb->sk);

        if (q_idx < 0 || skb->ooo_okay || q_idx >= ndev->real_num_tx_queues) {
                /* If forwarding a packet, we use the recorded queue when
                 * available for better cache locality.
                 */
                if (skb_rx_queue_recorded(skb))
                        q_idx = skb_get_rx_queue(skb);
                else
                        q_idx = netvsc_get_tx_queue(ndev, skb, q_idx);
        }

        return q_idx;
}

static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb,
                               void *accel_priv,
                               select_queue_fallback_t fallback)
{
        struct net_device_context *ndc = netdev_priv(ndev);
        struct net_device *vf_netdev;
        u16 txq;

        rcu_read_lock();
        vf_netdev = rcu_dereference(ndc->vf_netdev);
        if (vf_netdev) {
                const struct net_device_ops *vf_ops = vf_netdev->netdev_ops;

                if (vf_ops->ndo_select_queue)
                        txq = vf_ops->ndo_select_queue(vf_netdev, skb,
                                                       accel_priv, fallback);
                else
                        txq = fallback(vf_netdev, skb);

                /* Record the queue selected by the VF so that it can be
                 * used for the common case where the VF has more queues
                 * than the synthetic device.
                 */
                qdisc_skb_cb(skb)->slave_dev_queue_mapping = txq;
        } else {
                txq = netvsc_pick_tx(ndev, skb);
        }
        rcu_read_unlock();

        while (unlikely(txq >= ndev->real_num_tx_queues))
                txq -= ndev->real_num_tx_queues;

        return txq;
}

static u32 fill_pg_buf(struct page *page, u32 offset, u32 len,
                       struct hv_page_buffer *pb)
{
        int j = 0;

        /* Deal with compound pages by ignoring the unused part
         * of the page.
         */
        page += (offset >> PAGE_SHIFT);
        offset &= ~PAGE_MASK;

        while (len > 0) {
                unsigned long bytes;

                bytes = PAGE_SIZE - offset;
                if (bytes > len)
                        bytes = len;
                pb[j].pfn = page_to_pfn(page);
                pb[j].offset = offset;
                pb[j].len = bytes;

                offset += bytes;
                len -= bytes;

                if (offset == PAGE_SIZE && len) {
                        page++;
                        offset = 0;
                        j++;
                }
        }

        return j + 1;
}

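/*
 * Worked example (assuming PAGE_SIZE == 4096): fill_pg_buf(page, 4000,
 * 6000, pb) produces three page-buffer entries and returns 3:
 *
 *   pb[0] = { .pfn = pfn(page),     .offset = 4000, .len =   96 }
 *   pb[1] = { .pfn = pfn(page + 1), .offset =    0, .len = 4096 }
 *   pb[2] = { .pfn = pfn(page + 2), .offset =    0, .len = 1808 }
 */
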
static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb,
                           struct hv_netvsc_packet *packet,
                           struct hv_page_buffer *pb)
{
        u32 slots_used = 0;
        char *data = skb->data;
        int frags = skb_shinfo(skb)->nr_frags;
        int i;

        /* The packet is laid out thus:
         * 1. hdr: RNDIS header and PPI
         * 2. skb linear data
         * 3. skb fragment data
         */
        slots_used += fill_pg_buf(virt_to_page(hdr),
                                  offset_in_page(hdr),
                                  len, &pb[slots_used]);

        packet->rmsg_size = len;
        packet->rmsg_pgcnt = slots_used;

        slots_used += fill_pg_buf(virt_to_page(data),
                                offset_in_page(data),
                                skb_headlen(skb), &pb[slots_used]);

        for (i = 0; i < frags; i++) {
                skb_frag_t *frag = skb_shinfo(skb)->frags + i;

                slots_used += fill_pg_buf(skb_frag_page(frag),
                                        frag->page_offset,
                                        skb_frag_size(frag), &pb[slots_used]);
        }
        return slots_used;
}

static int count_skb_frag_slots(struct sk_buff *skb)
{
        int i, frags = skb_shinfo(skb)->nr_frags;
        int pages = 0;

        for (i = 0; i < frags; i++) {
                skb_frag_t *frag = skb_shinfo(skb)->frags + i;
                unsigned long size = skb_frag_size(frag);
                unsigned long offset = frag->page_offset;

                /* Skip unused whole pages at the start of the fragment */
                offset &= ~PAGE_MASK;
                pages += PFN_UP(offset + size);
        }
        return pages;
}

static int netvsc_get_slots(struct sk_buff *skb)
{
        char *data = skb->data;
        unsigned int offset = offset_in_page(data);
        unsigned int len = skb_headlen(skb);
        int slots;
        int frag_slots;

        slots = DIV_ROUND_UP(offset + len, PAGE_SIZE);
        frag_slots = count_skb_frag_slots(skb);
        return slots + frag_slots;
}

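/*
 * Example: a 2000-byte linear area starting 3000 bytes into a page spans
 * DIV_ROUND_UP(3000 + 2000, 4096) = 2 slots; each fragment is counted the
 * same way, so the total is the number of page buffers the skb needs.
 */
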
static u32 net_checksum_info(struct sk_buff *skb)
{
        if (skb->protocol == htons(ETH_P_IP)) {
                struct iphdr *ip = ip_hdr(skb);

                if (ip->protocol == IPPROTO_TCP)
                        return TRANSPORT_INFO_IPV4_TCP;
                else if (ip->protocol == IPPROTO_UDP)
                        return TRANSPORT_INFO_IPV4_UDP;
        } else {
                struct ipv6hdr *ip6 = ipv6_hdr(skb);

                if (ip6->nexthdr == IPPROTO_TCP)
                        return TRANSPORT_INFO_IPV6_TCP;
                else if (ip6->nexthdr == IPPROTO_UDP)
                        return TRANSPORT_INFO_IPV6_UDP;
        }

        return TRANSPORT_INFO_NOT_IP;
}

/* Send skb on the slave VF device. */
static int netvsc_vf_xmit(struct net_device *net, struct net_device *vf_netdev,
                          struct sk_buff *skb)
{
        struct net_device_context *ndev_ctx = netdev_priv(net);
        unsigned int len = skb->len;
        int rc;

        skb->dev = vf_netdev;
        skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping;

        rc = dev_queue_xmit(skb);
        if (likely(rc == NET_XMIT_SUCCESS || rc == NET_XMIT_CN)) {
                struct netvsc_vf_pcpu_stats *pcpu_stats
                        = this_cpu_ptr(ndev_ctx->vf_stats);

                u64_stats_update_begin(&pcpu_stats->syncp);
                pcpu_stats->tx_packets++;
                pcpu_stats->tx_bytes += len;
                u64_stats_update_end(&pcpu_stats->syncp);
        } else {
                this_cpu_inc(ndev_ctx->vf_stats->tx_dropped);
        }

        return rc;
}

static int netvsc_start_xmit(struct sk_buff *skb, struct net_device *net)
{
        struct net_device_context *net_device_ctx = netdev_priv(net);
        struct hv_netvsc_packet *packet = NULL;
        int ret;
        unsigned int num_data_pgs;
        struct rndis_message *rndis_msg;
        struct net_device *vf_netdev;
        u32 rndis_msg_size;
        u32 hash;
        struct hv_page_buffer pb[MAX_PAGE_BUFFER_COUNT];

        /* If a VF is present and up, redirect packets to it.
         * This function is already called with rcu_read_lock_bh held.
         */
        vf_netdev = rcu_dereference_bh(net_device_ctx->vf_netdev);
        if (vf_netdev && netif_running(vf_netdev) &&
            !netpoll_tx_running(net))
                return netvsc_vf_xmit(net, vf_netdev, skb);

        /* We will need at most two pages to describe the rndis
         * header. We can only transmit MAX_PAGE_BUFFER_COUNT number
         * of pages in a single packet. If the skb is scattered over
         * more pages, we try linearizing it.
         */

        num_data_pgs = netvsc_get_slots(skb) + 2;

        if (unlikely(num_data_pgs > MAX_PAGE_BUFFER_COUNT)) {
                ++net_device_ctx->eth_stats.tx_scattered;

                if (skb_linearize(skb))
                        goto no_memory;

                num_data_pgs = netvsc_get_slots(skb) + 2;
                if (num_data_pgs > MAX_PAGE_BUFFER_COUNT) {
                        ++net_device_ctx->eth_stats.tx_too_big;
                        goto drop;
                }
        }

        /*
         * Place the rndis header in the skb headroom; skb->cb will be
         * used for the hv_netvsc_packet structure.
         */
        ret = skb_cow_head(skb, RNDIS_AND_PPI_SIZE);
        if (ret)
                goto no_memory;

        /* Use the skb control buffer for building up the packet */
        BUILD_BUG_ON(sizeof(struct hv_netvsc_packet) >
                        FIELD_SIZEOF(struct sk_buff, cb));
        packet = (struct hv_netvsc_packet *)skb->cb;

        packet->q_idx = skb_get_queue_mapping(skb);

        packet->total_data_buflen = skb->len;
        packet->total_bytes = skb->len;
        packet->total_packets = 1;

        rndis_msg = (struct rndis_message *)skb->head;

        /* Add the rndis header */
        rndis_msg->ndis_msg_type = RNDIS_MSG_PACKET;
        rndis_msg->msg_len = packet->total_data_buflen;

        rndis_msg->msg.pkt = (struct rndis_packet) {
                .data_offset = sizeof(struct rndis_packet),
                .data_len = packet->total_data_buflen,
                .per_pkt_info_offset = sizeof(struct rndis_packet),
        };

        rndis_msg_size = RNDIS_MESSAGE_SIZE(struct rndis_packet);

        hash = skb_get_hash_raw(skb);
        if (hash != 0 && net->real_num_tx_queues > 1) {
                u32 *hash_info;

                rndis_msg_size += NDIS_HASH_PPI_SIZE;
                hash_info = init_ppi_data(rndis_msg, NDIS_HASH_PPI_SIZE,
                                          NBL_HASH_VALUE);
                *hash_info = hash;
        }

        if (skb_vlan_tag_present(skb)) {
                struct ndis_pkt_8021q_info *vlan;

                rndis_msg_size += NDIS_VLAN_PPI_SIZE;
                vlan = init_ppi_data(rndis_msg, NDIS_VLAN_PPI_SIZE,
                                     IEEE_8021Q_INFO);

                vlan->value = 0;
                vlan->vlanid = skb->vlan_tci & VLAN_VID_MASK;
                vlan->pri = (skb->vlan_tci & VLAN_PRIO_MASK) >>
                                VLAN_PRIO_SHIFT;
        }

        if (skb_is_gso(skb)) {
                struct ndis_tcp_lso_info *lso_info;

                rndis_msg_size += NDIS_LSO_PPI_SIZE;
                lso_info = init_ppi_data(rndis_msg, NDIS_LSO_PPI_SIZE,
                                         TCP_LARGESEND_PKTINFO);

                lso_info->value = 0;
                lso_info->lso_v2_transmit.type = NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE;
                if (skb->protocol == htons(ETH_P_IP)) {
                        lso_info->lso_v2_transmit.ip_version =
                                NDIS_TCP_LARGE_SEND_OFFLOAD_IPV4;
                        ip_hdr(skb)->tot_len = 0;
                        ip_hdr(skb)->check = 0;
                        tcp_hdr(skb)->check =
                                ~csum_tcpudp_magic(ip_hdr(skb)->saddr,
                                                   ip_hdr(skb)->daddr, 0, IPPROTO_TCP, 0);
                } else {
                        lso_info->lso_v2_transmit.ip_version =
                                NDIS_TCP_LARGE_SEND_OFFLOAD_IPV6;
                        ipv6_hdr(skb)->payload_len = 0;
                        tcp_hdr(skb)->check =
                                ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
                                                 &ipv6_hdr(skb)->daddr, 0, IPPROTO_TCP, 0);
                }
                lso_info->lso_v2_transmit.tcp_header_offset = skb_transport_offset(skb);
                lso_info->lso_v2_transmit.mss = skb_shinfo(skb)->gso_size;
        } else if (skb->ip_summed == CHECKSUM_PARTIAL) {
                if (net_checksum_info(skb) & net_device_ctx->tx_checksum_mask) {
                        struct ndis_tcp_ip_checksum_info *csum_info;

                        rndis_msg_size += NDIS_CSUM_PPI_SIZE;
                        csum_info = init_ppi_data(rndis_msg, NDIS_CSUM_PPI_SIZE,
                                                  TCPIP_CHKSUM_PKTINFO);

                        csum_info->value = 0;
                        csum_info->transmit.tcp_header_offset = skb_transport_offset(skb);

                        if (skb->protocol == htons(ETH_P_IP)) {
                                csum_info->transmit.is_ipv4 = 1;

                                if (ip_hdr(skb)->protocol == IPPROTO_TCP)
                                        csum_info->transmit.tcp_checksum = 1;
                                else
                                        csum_info->transmit.udp_checksum = 1;
                        } else {
                                csum_info->transmit.is_ipv6 = 1;

                                if (ipv6_hdr(skb)->nexthdr == IPPROTO_TCP)
                                        csum_info->transmit.tcp_checksum = 1;
                                else
                                        csum_info->transmit.udp_checksum = 1;
                        }
                } else {
                        /* Can't do offload of this type of checksum */
                        if (skb_checksum_help(skb))
                                goto drop;
                }
        }

        /* Start filling in the page buffers with the rndis hdr */
        rndis_msg->msg_len += rndis_msg_size;
        packet->total_data_buflen = rndis_msg->msg_len;
        packet->page_buf_cnt = init_page_array(rndis_msg, rndis_msg_size,
                                               skb, packet, pb);

        /* timestamp packet in software */
        skb_tx_timestamp(skb);

        ret = netvsc_send(net, packet, rndis_msg, pb, skb);
        if (likely(ret == 0))
                return NETDEV_TX_OK;

        if (ret == -EAGAIN) {
                ++net_device_ctx->eth_stats.tx_busy;
                return NETDEV_TX_BUSY;
        }

        if (ret == -ENOSPC)
                ++net_device_ctx->eth_stats.tx_no_space;

drop:
        dev_kfree_skb_any(skb);
        net->stats.tx_dropped++;

        return NETDEV_TX_OK;

no_memory:
        ++net_device_ctx->eth_stats.tx_no_memory;
        goto drop;
}

/*
 * netvsc_linkstatus_callback - Link up/down notification
 */
void netvsc_linkstatus_callback(struct net_device *net,
                                struct rndis_message *resp)
{
        struct rndis_indicate_status *indicate = &resp->msg.indicate_status;
        struct net_device_context *ndev_ctx = netdev_priv(net);
        struct netvsc_reconfig *event;
        unsigned long flags;

        /* Update the physical link speed when changing to another vSwitch */
        if (indicate->status == RNDIS_STATUS_LINK_SPEED_CHANGE) {
                u32 speed;

                speed = *(u32 *)((void *)indicate
                                 + indicate->status_buf_offset) / 10000;
                ndev_ctx->speed = speed;
                return;
        }

        /* Handle these link change statuses below */
        if (indicate->status != RNDIS_STATUS_NETWORK_CHANGE &&
            indicate->status != RNDIS_STATUS_MEDIA_CONNECT &&
            indicate->status != RNDIS_STATUS_MEDIA_DISCONNECT)
                return;

        if (net->reg_state != NETREG_REGISTERED)
                return;

        event = kzalloc(sizeof(*event), GFP_ATOMIC);
        if (!event)
                return;
        event->event = indicate->status;

        spin_lock_irqsave(&ndev_ctx->lock, flags);
        list_add_tail(&event->list, &ndev_ctx->reconfig_events);
        spin_unlock_irqrestore(&ndev_ctx->lock, flags);

        schedule_delayed_work(&ndev_ctx->dwork, 0);
}

static struct sk_buff *netvsc_alloc_recv_skb(struct net_device *net,
                                             struct napi_struct *napi,
                                             const struct ndis_tcp_ip_checksum_info *csum_info,
                                             const struct ndis_pkt_8021q_info *vlan,
                                             void *data, u32 buflen)
{
        struct sk_buff *skb;

        skb = napi_alloc_skb(napi, buflen);
        if (!skb)
                return skb;

        /*
         * Copy to skb. This copy is needed here since the memory pointed
         * to by hv_netvsc_packet cannot be deallocated.
         */
        skb_put_data(skb, data, buflen);

        skb->protocol = eth_type_trans(skb, net);

        /* skb is already created with CHECKSUM_NONE */
        skb_checksum_none_assert(skb);

        /*
         * In Linux, the IP checksum is always checked.
         * Do L4 checksum offload if enabled and present.
         */
        if (csum_info && (net->features & NETIF_F_RXCSUM)) {
                if (csum_info->receive.tcp_checksum_succeeded ||
                    csum_info->receive.udp_checksum_succeeded)
                        skb->ip_summed = CHECKSUM_UNNECESSARY;
        }

        if (vlan) {
                u16 vlan_tci = vlan->vlanid | (vlan->pri << VLAN_PRIO_SHIFT);

                __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
                                       vlan_tci);
        }

        return skb;
}

/*
 * netvsc_recv_callback -  Callback when we receive a packet from the
 * "wire" on the specified device.
 */
int netvsc_recv_callback(struct net_device *net,
                         struct netvsc_device *net_device,
                         struct vmbus_channel *channel,
                         void  *data, u32 len,
                         const struct ndis_tcp_ip_checksum_info *csum_info,
                         const struct ndis_pkt_8021q_info *vlan)
{
        struct net_device_context *net_device_ctx = netdev_priv(net);
        u16 q_idx = channel->offermsg.offer.sub_channel_index;
        struct netvsc_channel *nvchan = &net_device->chan_table[q_idx];
        struct sk_buff *skb;
        struct netvsc_stats *rx_stats;

        if (net->reg_state != NETREG_REGISTERED)
                return NVSP_STAT_FAIL;

        /* Allocate a skb - TODO direct I/O to pages? */
        skb = netvsc_alloc_recv_skb(net, &nvchan->napi,
                                    csum_info, vlan, data, len);
        if (unlikely(!skb)) {
                ++net_device_ctx->eth_stats.rx_no_memory;
                return NVSP_STAT_FAIL;
        }

        skb_record_rx_queue(skb, q_idx);

        /*
         * Even if injecting the packet, record the statistics
         * on the synthetic device because modifying the VF device
         * statistics will not work correctly.
         */
        rx_stats = &nvchan->rx_stats;
        u64_stats_update_begin(&rx_stats->syncp);
        rx_stats->packets++;
        rx_stats->bytes += len;

        if (skb->pkt_type == PACKET_BROADCAST)
                ++rx_stats->broadcast;
        else if (skb->pkt_type == PACKET_MULTICAST)
                ++rx_stats->multicast;
        u64_stats_update_end(&rx_stats->syncp);

        napi_gro_receive(&nvchan->napi, skb);
        return 0;
}

static void netvsc_get_drvinfo(struct net_device *net,
                               struct ethtool_drvinfo *info)
{
        strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
        strlcpy(info->fw_version, "N/A", sizeof(info->fw_version));
}

static void netvsc_get_channels(struct net_device *net,
                                struct ethtool_channels *channel)
{
        struct net_device_context *net_device_ctx = netdev_priv(net);
        struct netvsc_device *nvdev = rtnl_dereference(net_device_ctx->nvdev);

        if (nvdev) {
                channel->max_combined   = nvdev->max_chn;
                channel->combined_count = nvdev->num_chn;
        }
}

static int netvsc_set_channels(struct net_device *net,
                               struct ethtool_channels *channels)
{
        struct net_device_context *net_device_ctx = netdev_priv(net);
        struct hv_device *dev = net_device_ctx->device_ctx;
        struct netvsc_device *nvdev = rtnl_dereference(net_device_ctx->nvdev);
        unsigned int orig, count = channels->combined_count;
        struct netvsc_device_info device_info;
        bool was_opened;
        int ret = 0;

        /* We do not support separate count for rx, tx, or other */
        if (count == 0 ||
            channels->rx_count || channels->tx_count || channels->other_count)
                return -EINVAL;

        if (!nvdev || nvdev->destroy)
                return -ENODEV;

        if (nvdev->nvsp_version < NVSP_PROTOCOL_VERSION_5)
                return -EINVAL;

        if (count > nvdev->max_chn)
                return -EINVAL;

        orig = nvdev->num_chn;
        was_opened = rndis_filter_opened(nvdev);
        if (was_opened)
                rndis_filter_close(nvdev);

        memset(&device_info, 0, sizeof(device_info));
        device_info.num_chn = count;
        device_info.send_sections = nvdev->send_section_cnt;
        device_info.send_section_size = nvdev->send_section_size;
        device_info.recv_sections = nvdev->recv_section_cnt;
        device_info.recv_section_size = nvdev->recv_section_size;

        rndis_filter_device_remove(dev, nvdev);

        nvdev = rndis_filter_device_add(dev, &device_info);
        if (IS_ERR(nvdev)) {
                ret = PTR_ERR(nvdev);
                device_info.num_chn = orig;
                nvdev = rndis_filter_device_add(dev, &device_info);

                if (IS_ERR(nvdev)) {
                        netdev_err(net, "restoring channel setting failed: %ld\n",
                                   PTR_ERR(nvdev));
                        return ret;
                }
        }

        if (was_opened)
                rndis_filter_open(nvdev);

        /* We may have missed link change notifications */
        net_device_ctx->last_reconfig = 0;
        schedule_delayed_work(&net_device_ctx->dwork, 0);

        return ret;
}

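/*
 * Example (standard ethtool channel interface; only "combined" counts are
 * accepted by the checks above):
 *
 *   ethtool -l eth0              # query max/current channel counts
 *   ethtool -L eth0 combined 4   # recreate the device with 4 channels
 */
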
static bool
netvsc_validate_ethtool_ss_cmd(const struct ethtool_link_ksettings *cmd)
{
        struct ethtool_link_ksettings diff1 = *cmd;
        struct ethtool_link_ksettings diff2 = {};

        diff1.base.speed = 0;
        diff1.base.duplex = 0;
        /* advertising and cmd are usually set */
        ethtool_link_ksettings_zero_link_mode(&diff1, advertising);
        diff1.base.cmd = 0;
        /* We set port to PORT_OTHER */
        diff2.base.port = PORT_OTHER;

        return !memcmp(&diff1, &diff2, sizeof(diff1));
}

static void netvsc_init_settings(struct net_device *dev)
{
        struct net_device_context *ndc = netdev_priv(dev);

        ndc->l4_hash = HV_DEFAULT_L4HASH;

        ndc->speed = SPEED_UNKNOWN;
        ndc->duplex = DUPLEX_FULL;
}

static int netvsc_get_link_ksettings(struct net_device *dev,
                                     struct ethtool_link_ksettings *cmd)
{
        struct net_device_context *ndc = netdev_priv(dev);

        cmd->base.speed = ndc->speed;
        cmd->base.duplex = ndc->duplex;
        cmd->base.port = PORT_OTHER;

        return 0;
}

static int netvsc_set_link_ksettings(struct net_device *dev,
                                     const struct ethtool_link_ksettings *cmd)
{
        struct net_device_context *ndc = netdev_priv(dev);
        u32 speed;

        speed = cmd->base.speed;
        if (!ethtool_validate_speed(speed) ||
            !ethtool_validate_duplex(cmd->base.duplex) ||
            !netvsc_validate_ethtool_ss_cmd(cmd))
                return -EINVAL;

        ndc->speed = speed;
        ndc->duplex = cmd->base.duplex;

        return 0;
}

static int netvsc_change_mtu(struct net_device *ndev, int mtu)
{
        struct net_device_context *ndevctx = netdev_priv(ndev);
        struct net_device *vf_netdev = rtnl_dereference(ndevctx->vf_netdev);
        struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);
        struct hv_device *hdev = ndevctx->device_ctx;
        int orig_mtu = ndev->mtu;
        struct netvsc_device_info device_info;
        bool was_opened;
        int ret = 0;

        if (!nvdev || nvdev->destroy)
                return -ENODEV;

        /* Change MTU of underlying VF netdev first. */
        if (vf_netdev) {
                ret = dev_set_mtu(vf_netdev, mtu);
                if (ret)
                        return ret;
        }

        netif_device_detach(ndev);
        was_opened = rndis_filter_opened(nvdev);
        if (was_opened)
                rndis_filter_close(nvdev);

        memset(&device_info, 0, sizeof(device_info));
        device_info.num_chn = nvdev->num_chn;
        device_info.send_sections = nvdev->send_section_cnt;
        device_info.send_section_size = nvdev->send_section_size;
        device_info.recv_sections = nvdev->recv_section_cnt;
        device_info.recv_section_size = nvdev->recv_section_size;

        rndis_filter_device_remove(hdev, nvdev);

        ndev->mtu = mtu;

        nvdev = rndis_filter_device_add(hdev, &device_info);
        if (IS_ERR(nvdev)) {
                ret = PTR_ERR(nvdev);

                /* Attempt rollback to original MTU */
                ndev->mtu = orig_mtu;
                nvdev = rndis_filter_device_add(hdev, &device_info);

                if (vf_netdev)
                        dev_set_mtu(vf_netdev, orig_mtu);

                if (IS_ERR(nvdev)) {
                        netdev_err(ndev, "restoring mtu failed: %ld\n",
                                   PTR_ERR(nvdev));
                        return ret;
                }
        }

        if (was_opened)
                rndis_filter_open(nvdev);

        netif_device_attach(ndev);

        /* We may have missed link change notifications */
        schedule_delayed_work(&ndevctx->dwork, 0);

        return ret;
}

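/*
 * Example: "ip link set dev eth0 mtu 1400" lands here via ndo_change_mtu;
 * the VF's MTU is updated first, then the synthetic device is torn down
 * and re-added with the same channel/section configuration.
 */
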
static void netvsc_get_vf_stats(struct net_device *net,
                                struct netvsc_vf_pcpu_stats *tot)
{
        struct net_device_context *ndev_ctx = netdev_priv(net);
        int i;

        memset(tot, 0, sizeof(*tot));

        for_each_possible_cpu(i) {
                const struct netvsc_vf_pcpu_stats *stats
                        = per_cpu_ptr(ndev_ctx->vf_stats, i);
                u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
                unsigned int start;

                do {
                        start = u64_stats_fetch_begin_irq(&stats->syncp);
                        rx_packets = stats->rx_packets;
                        tx_packets = stats->tx_packets;
                        rx_bytes = stats->rx_bytes;
                        tx_bytes = stats->tx_bytes;
                } while (u64_stats_fetch_retry_irq(&stats->syncp, start));

                tot->rx_packets += rx_packets;
                tot->tx_packets += tx_packets;
                tot->rx_bytes   += rx_bytes;
                tot->tx_bytes   += tx_bytes;
                tot->tx_dropped += stats->tx_dropped;
        }
}

static void netvsc_get_stats64(struct net_device *net,
                               struct rtnl_link_stats64 *t)
{
        struct net_device_context *ndev_ctx = netdev_priv(net);
        struct netvsc_device *nvdev = rcu_dereference_rtnl(ndev_ctx->nvdev);
        struct netvsc_vf_pcpu_stats vf_tot;
        int i;

        if (!nvdev)
                return;

        netdev_stats_to_stats64(t, &net->stats);

        netvsc_get_vf_stats(net, &vf_tot);
        t->rx_packets += vf_tot.rx_packets;
        t->tx_packets += vf_tot.tx_packets;
        t->rx_bytes   += vf_tot.rx_bytes;
        t->tx_bytes   += vf_tot.tx_bytes;
        t->tx_dropped += vf_tot.tx_dropped;

        for (i = 0; i < nvdev->num_chn; i++) {
                const struct netvsc_channel *nvchan = &nvdev->chan_table[i];
                const struct netvsc_stats *stats;
                u64 packets, bytes, multicast;
                unsigned int start;

                stats = &nvchan->tx_stats;
                do {
                        start = u64_stats_fetch_begin_irq(&stats->syncp);
                        packets = stats->packets;
                        bytes = stats->bytes;
                } while (u64_stats_fetch_retry_irq(&stats->syncp, start));

                t->tx_bytes     += bytes;
                t->tx_packets   += packets;

                stats = &nvchan->rx_stats;
                do {
                        start = u64_stats_fetch_begin_irq(&stats->syncp);
                        packets = stats->packets;
                        bytes = stats->bytes;
                        multicast = stats->multicast + stats->broadcast;
                } while (u64_stats_fetch_retry_irq(&stats->syncp, start));

                t->rx_bytes     += bytes;
                t->rx_packets   += packets;
                t->multicast    += multicast;
        }
}

static int netvsc_set_mac_addr(struct net_device *ndev, void *p)
{
        struct net_device_context *ndc = netdev_priv(ndev);
        struct net_device *vf_netdev = rtnl_dereference(ndc->vf_netdev);
        struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);
        struct sockaddr *addr = p;
        int err;

        err = eth_prepare_mac_addr_change(ndev, p);
        if (err)
                return err;

        if (!nvdev)
                return -ENODEV;

        if (vf_netdev) {
                err = dev_set_mac_address(vf_netdev, addr);
                if (err)
                        return err;
        }

        err = rndis_filter_set_device_mac(nvdev, addr->sa_data);
        if (!err) {
                eth_commit_mac_addr_change(ndev, p);
        } else if (vf_netdev) {
                /* rollback change on VF */
                memcpy(addr->sa_data, ndev->dev_addr, ETH_ALEN);
                dev_set_mac_address(vf_netdev, addr);
        }

        return err;
}

static const struct {
        char name[ETH_GSTRING_LEN];
        u16 offset;
} netvsc_stats[] = {
        { "tx_scattered", offsetof(struct netvsc_ethtool_stats, tx_scattered) },
        { "tx_no_memory", offsetof(struct netvsc_ethtool_stats, tx_no_memory) },
        { "tx_no_space",  offsetof(struct netvsc_ethtool_stats, tx_no_space) },
        { "tx_too_big",   offsetof(struct netvsc_ethtool_stats, tx_too_big) },
        { "tx_busy",      offsetof(struct netvsc_ethtool_stats, tx_busy) },
        { "tx_send_full", offsetof(struct netvsc_ethtool_stats, tx_send_full) },
        { "rx_comp_busy", offsetof(struct netvsc_ethtool_stats, rx_comp_busy) },
        { "rx_no_memory", offsetof(struct netvsc_ethtool_stats, rx_no_memory) },
        { "stop_queue", offsetof(struct netvsc_ethtool_stats, stop_queue) },
        { "wake_queue", offsetof(struct netvsc_ethtool_stats, wake_queue) },
}, vf_stats[] = {
        { "vf_rx_packets", offsetof(struct netvsc_vf_pcpu_stats, rx_packets) },
        { "vf_rx_bytes",   offsetof(struct netvsc_vf_pcpu_stats, rx_bytes) },
        { "vf_tx_packets", offsetof(struct netvsc_vf_pcpu_stats, tx_packets) },
        { "vf_tx_bytes",   offsetof(struct netvsc_vf_pcpu_stats, tx_bytes) },
        { "vf_tx_dropped", offsetof(struct netvsc_vf_pcpu_stats, tx_dropped) },
};

#define NETVSC_GLOBAL_STATS_LEN ARRAY_SIZE(netvsc_stats)
#define NETVSC_VF_STATS_LEN     ARRAY_SIZE(vf_stats)

/* 4 statistics per queue (rx/tx packets/bytes) */
#define NETVSC_QUEUE_STATS_LEN(dev) ((dev)->num_chn * 4)

static int netvsc_get_sset_count(struct net_device *dev, int string_set)
{
        struct net_device_context *ndc = netdev_priv(dev);
        struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);

        if (!nvdev)
                return -ENODEV;

        switch (string_set) {
        case ETH_SS_STATS:
                return NETVSC_GLOBAL_STATS_LEN
                        + NETVSC_VF_STATS_LEN
                        + NETVSC_QUEUE_STATS_LEN(nvdev);
        default:
                return -EINVAL;
        }
}

static void netvsc_get_ethtool_stats(struct net_device *dev,
                                     struct ethtool_stats *stats, u64 *data)
{
        struct net_device_context *ndc = netdev_priv(dev);
        struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);
        const void *nds = &ndc->eth_stats;
        const struct netvsc_stats *qstats;
        struct netvsc_vf_pcpu_stats sum;
        unsigned int start;
        u64 packets, bytes;
        int i, j;

        if (!nvdev)
                return;

        for (i = 0; i < NETVSC_GLOBAL_STATS_LEN; i++)
                data[i] = *(unsigned long *)(nds + netvsc_stats[i].offset);

        netvsc_get_vf_stats(dev, &sum);
        for (j = 0; j < NETVSC_VF_STATS_LEN; j++)
                data[i++] = *(u64 *)((void *)&sum + vf_stats[j].offset);

        for (j = 0; j < nvdev->num_chn; j++) {
                qstats = &nvdev->chan_table[j].tx_stats;

                do {
                        start = u64_stats_fetch_begin_irq(&qstats->syncp);
                        packets = qstats->packets;
                        bytes = qstats->bytes;
                } while (u64_stats_fetch_retry_irq(&qstats->syncp, start));
                data[i++] = packets;
                data[i++] = bytes;

                qstats = &nvdev->chan_table[j].rx_stats;
                do {
                        start = u64_stats_fetch_begin_irq(&qstats->syncp);
                        packets = qstats->packets;
                        bytes = qstats->bytes;
                } while (u64_stats_fetch_retry_irq(&qstats->syncp, start));
                data[i++] = packets;
                data[i++] = bytes;
        }
}

static void netvsc_get_strings(struct net_device *dev, u32 stringset, u8 *data)
{
        struct net_device_context *ndc = netdev_priv(dev);
        struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);
        u8 *p = data;
        int i;

        if (!nvdev)
                return;

        switch (stringset) {
        case ETH_SS_STATS:
                for (i = 0; i < ARRAY_SIZE(netvsc_stats); i++) {
                        memcpy(p, netvsc_stats[i].name, ETH_GSTRING_LEN);
                        p += ETH_GSTRING_LEN;
                }

                for (i = 0; i < ARRAY_SIZE(vf_stats); i++) {
                        memcpy(p, vf_stats[i].name, ETH_GSTRING_LEN);
                        p += ETH_GSTRING_LEN;
                }

                for (i = 0; i < nvdev->num_chn; i++) {
                        sprintf(p, "tx_queue_%u_packets", i);
                        p += ETH_GSTRING_LEN;
                        sprintf(p, "tx_queue_%u_bytes", i);
                        p += ETH_GSTRING_LEN;
                        sprintf(p, "rx_queue_%u_packets", i);
                        p += ETH_GSTRING_LEN;
                        sprintf(p, "rx_queue_%u_bytes", i);
                        p += ETH_GSTRING_LEN;
                }

                break;
        }
}

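/*
 * "ethtool -S eth0" pairs the strings above with the values from
 * netvsc_get_ethtool_stats() in the same order: global counters first,
 * then the vf_* totals, then per-queue tx_queue_N_* and rx_queue_N_*
 * counters for each channel.
 */
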
static int
netvsc_get_rss_hash_opts(struct net_device_context *ndc,
                         struct ethtool_rxnfc *info)
{
        const u32 l4_flag = RXH_L4_B_0_1 | RXH_L4_B_2_3;

        info->data = RXH_IP_SRC | RXH_IP_DST;

        switch (info->flow_type) {
        case TCP_V4_FLOW:
                if (ndc->l4_hash & HV_TCP4_L4HASH)
                        info->data |= l4_flag;

                break;

        case TCP_V6_FLOW:
                if (ndc->l4_hash & HV_TCP6_L4HASH)
                        info->data |= l4_flag;

                break;

        case UDP_V4_FLOW:
                if (ndc->l4_hash & HV_UDP4_L4HASH)
                        info->data |= l4_flag;

                break;

        case UDP_V6_FLOW:
                if (ndc->l4_hash & HV_UDP6_L4HASH)
                        info->data |= l4_flag;

                break;

        case IPV4_FLOW:
        case IPV6_FLOW:
                break;
        default:
                info->data = 0;
                break;
        }

        return 0;
}

static int
netvsc_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info,
                 u32 *rules)
{
        struct net_device_context *ndc = netdev_priv(dev);
        struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);

        if (!nvdev)
                return -ENODEV;

        switch (info->cmd) {
        case ETHTOOL_GRXRINGS:
                info->data = nvdev->num_chn;
                return 0;

        case ETHTOOL_GRXFH:
                return netvsc_get_rss_hash_opts(ndc, info);
        }
        return -EOPNOTSUPP;
}

static int netvsc_set_rss_hash_opts(struct net_device_context *ndc,
                                    struct ethtool_rxnfc *info)
{
        if (info->data == (RXH_IP_SRC | RXH_IP_DST |
                           RXH_L4_B_0_1 | RXH_L4_B_2_3)) {
                switch (info->flow_type) {
                case TCP_V4_FLOW:
                        ndc->l4_hash |= HV_TCP4_L4HASH;
                        break;

                case TCP_V6_FLOW:
                        ndc->l4_hash |= HV_TCP6_L4HASH;
                        break;

                case UDP_V4_FLOW:
                        ndc->l4_hash |= HV_UDP4_L4HASH;
                        break;

                case UDP_V6_FLOW:
                        ndc->l4_hash |= HV_UDP6_L4HASH;
                        break;

                default:
                        return -EOPNOTSUPP;
                }

                return 0;
        }

        if (info->data == (RXH_IP_SRC | RXH_IP_DST)) {
                switch (info->flow_type) {
                case TCP_V4_FLOW:
                        ndc->l4_hash &= ~HV_TCP4_L4HASH;
                        break;

                case TCP_V6_FLOW:
                        ndc->l4_hash &= ~HV_TCP6_L4HASH;
                        break;

                case UDP_V4_FLOW:
                        ndc->l4_hash &= ~HV_UDP4_L4HASH;
                        break;

                case UDP_V6_FLOW:
                        ndc->l4_hash &= ~HV_UDP6_L4HASH;
                        break;

                default:
                        return -EOPNOTSUPP;
                }

                return 0;
        }

        return -EOPNOTSUPP;
}

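/*
 * ethtool flag mapping for the two accepted info->data values (standard
 * ethtool convention): "s" = RXH_IP_SRC, "d" = RXH_IP_DST,
 * "f" = RXH_L4_B_0_1, "n" = RXH_L4_B_2_3. So "ethtool -N eth0
 * rx-flow-hash tcp4 sdfn" selects 4-tuple hashing and "... sd" selects
 * L3-only hashing for that flow type.
 */
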
static int
netvsc_set_rxnfc(struct net_device *ndev, struct ethtool_rxnfc *info)
{
        struct net_device_context *ndc = netdev_priv(ndev);

        if (info->cmd == ETHTOOL_SRXFH)
                return netvsc_set_rss_hash_opts(ndc, info);

        return -EOPNOTSUPP;
}

#ifdef CONFIG_NET_POLL_CONTROLLER
static void netvsc_poll_controller(struct net_device *dev)
{
        struct net_device_context *ndc = netdev_priv(dev);
        struct netvsc_device *ndev;
        int i;

        rcu_read_lock();
        ndev = rcu_dereference(ndc->nvdev);
        if (ndev) {
                for (i = 0; i < ndev->num_chn; i++) {
                        struct netvsc_channel *nvchan = &ndev->chan_table[i];

                        napi_schedule(&nvchan->napi);
                }
        }
        rcu_read_unlock();
}
#endif

static u32 netvsc_get_rxfh_key_size(struct net_device *dev)
{
        return NETVSC_HASH_KEYLEN;
}

static u32 netvsc_rss_indir_size(struct net_device *dev)
{
        return ITAB_NUM;
}

static int netvsc_get_rxfh(struct net_device *dev, u32 *indir, u8 *key,
                           u8 *hfunc)
{
        struct net_device_context *ndc = netdev_priv(dev);
        struct netvsc_device *ndev = rtnl_dereference(ndc->nvdev);
        struct rndis_device *rndis_dev;
        int i;

        if (!ndev)
                return -ENODEV;

        if (hfunc)
                *hfunc = ETH_RSS_HASH_TOP;      /* Toeplitz */

        rndis_dev = ndev->extension;
        if (indir) {
                for (i = 0; i < ITAB_NUM; i++)
                        indir[i] = rndis_dev->rx_table[i];
        }

        if (key)
                memcpy(key, rndis_dev->rss_key, NETVSC_HASH_KEYLEN);

        return 0;
}

static int netvsc_set_rxfh(struct net_device *dev, const u32 *indir,
                           const u8 *key, const u8 hfunc)
{
        struct net_device_context *ndc = netdev_priv(dev);
        struct netvsc_device *ndev = rtnl_dereference(ndc->nvdev);
        struct rndis_device *rndis_dev;
        int i;

        if (!ndev)
                return -ENODEV;

        if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
                return -EOPNOTSUPP;

        rndis_dev = ndev->extension;
        if (indir) {
                for (i = 0; i < ITAB_NUM; i++)
                        if (indir[i] >= ndev->num_chn)
                                return -EINVAL;

                for (i = 0; i < ITAB_NUM; i++)
                        rndis_dev->rx_table[i] = indir[i];
        }

        if (!key) {
                if (!indir)
                        return 0;

                key = rndis_dev->rss_key;
        }

        return rndis_filter_set_rss_param(rndis_dev, key);
}

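/*
 * Example (standard ethtool RSS interface): "ethtool -x eth0" dumps the
 * Toeplitz key and indirection table; "ethtool -X eth0 equal 2" spreads
 * the table across the first two channels. Each table entry must be
 * below num_chn, per the validation above.
 */
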
1491 /* Hyper-V RNDIS protocol does not have ring in the HW sense.
1492  * It does have pre-allocated receive area which is divided into sections.
1493  */
1494 static void __netvsc_get_ringparam(struct netvsc_device *nvdev,
1495                                    struct ethtool_ringparam *ring)
1496 {
1497         u32 max_buf_size;
1498
1499         ring->rx_pending = nvdev->recv_section_cnt;
1500         ring->tx_pending = nvdev->send_section_cnt;
1501
1502         if (nvdev->nvsp_version <= NVSP_PROTOCOL_VERSION_2)
1503                 max_buf_size = NETVSC_RECEIVE_BUFFER_SIZE_LEGACY;
1504         else
1505                 max_buf_size = NETVSC_RECEIVE_BUFFER_SIZE;
1506
1507         ring->rx_max_pending = max_buf_size / nvdev->recv_section_size;
1508         ring->tx_max_pending = NETVSC_SEND_BUFFER_SIZE
1509                 / nvdev->send_section_size;
1510 }
1511
1512 static void netvsc_get_ringparam(struct net_device *ndev,
1513                                  struct ethtool_ringparam *ring)
1514 {
1515         struct net_device_context *ndevctx = netdev_priv(ndev);
1516         struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);
1517
1518         if (!nvdev)
1519                 return;
1520
1521         __netvsc_get_ringparam(nvdev, ring);
1522 }
1523
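     /* Servicing "ethtool -G" (e.g. "ethtool -G eth0 rx 1024 tx 1024",
      * values illustrative): the requested counts are clamped to the
      * section limits reported by __netvsc_get_ringparam() above.
      */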
1524 static int netvsc_set_ringparam(struct net_device *ndev,
1525                                 struct ethtool_ringparam *ring)
1526 {
1527         struct net_device_context *ndevctx = netdev_priv(ndev);
1528         struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);
1529         struct hv_device *hdev = ndevctx->device_ctx;
1530         struct netvsc_device_info device_info;
1531         struct ethtool_ringparam orig;
1532         u32 new_tx, new_rx;
1533         bool was_opened;
1534         int ret = 0;
1535
1536         if (!nvdev || nvdev->destroy)
1537                 return -ENODEV;
1538
1539         memset(&orig, 0, sizeof(orig));
1540         __netvsc_get_ringparam(nvdev, &orig);
1541
1542         new_tx = clamp_t(u32, ring->tx_pending,
1543                          NETVSC_MIN_TX_SECTIONS, orig.tx_max_pending);
1544         new_rx = clamp_t(u32, ring->rx_pending,
1545                          NETVSC_MIN_RX_SECTIONS, orig.rx_max_pending);
1546
1547         if (new_tx == orig.tx_pending &&
1548             new_rx == orig.rx_pending)
1549                 return 0;        /* no change */
1550
1551         memset(&device_info, 0, sizeof(device_info));
1552         device_info.num_chn = nvdev->num_chn;
1553         device_info.send_sections = new_tx;
1554         device_info.send_section_size = nvdev->send_section_size;
1555         device_info.recv_sections = new_rx;
1556         device_info.recv_section_size = nvdev->recv_section_size;
1557
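             /* Resizing means tearing the RNDIS device down and re-adding
              * it with the new section counts; if that fails, retry with
              * the original sizes so the device stays usable.
              */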
1558         netif_device_detach(ndev);
1559         was_opened = rndis_filter_opened(nvdev);
1560         if (was_opened)
1561                 rndis_filter_close(nvdev);
1562
1563         rndis_filter_device_remove(hdev, nvdev);
1564
1565         nvdev = rndis_filter_device_add(hdev, &device_info);
1566         if (IS_ERR(nvdev)) {
1567                 ret = PTR_ERR(nvdev);
1568
1569                 device_info.send_sections = orig.tx_pending;
1570                 device_info.recv_sections = orig.rx_pending;
1571                 nvdev = rndis_filter_device_add(hdev, &device_info);
1572                 if (IS_ERR(nvdev)) {
1573                         netdev_err(ndev, "restoring ringparam failed: %ld\n",
1574                                    PTR_ERR(nvdev));
1575                         return ret;
1576                 }
1577         }
1578
1579         if (was_opened)
1580                 rndis_filter_open(nvdev);
1581         netif_device_attach(ndev);
1582
1583         /* We may have missed link change notifications */
1584         ndevctx->last_reconfig = 0;
1585         schedule_delayed_work(&ndevctx->dwork, 0);
1586
1587         return ret;
1588 }
1589
1590 static const struct ethtool_ops ethtool_ops = {
1591         .get_drvinfo    = netvsc_get_drvinfo,
1592         .get_link       = ethtool_op_get_link,
1593         .get_ethtool_stats = netvsc_get_ethtool_stats,
1594         .get_sset_count = netvsc_get_sset_count,
1595         .get_strings    = netvsc_get_strings,
1596         .get_channels   = netvsc_get_channels,
1597         .set_channels   = netvsc_set_channels,
1598         .get_ts_info    = ethtool_op_get_ts_info,
1599         .get_rxnfc      = netvsc_get_rxnfc,
1600         .set_rxnfc      = netvsc_set_rxnfc,
1601         .get_rxfh_key_size = netvsc_get_rxfh_key_size,
1602         .get_rxfh_indir_size = netvsc_rss_indir_size,
1603         .get_rxfh       = netvsc_get_rxfh,
1604         .set_rxfh       = netvsc_set_rxfh,
1605         .get_link_ksettings = netvsc_get_link_ksettings,
1606         .set_link_ksettings = netvsc_set_link_ksettings,
1607         .get_ringparam  = netvsc_get_ringparam,
1608         .set_ringparam  = netvsc_set_ringparam,
1609 };
1610
1611 static const struct net_device_ops device_ops = {
1612         .ndo_open =                     netvsc_open,
1613         .ndo_stop =                     netvsc_close,
1614         .ndo_start_xmit =               netvsc_start_xmit,
1615         .ndo_change_rx_flags =          netvsc_change_rx_flags,
1616         .ndo_set_rx_mode =              netvsc_set_rx_mode,
1617         .ndo_change_mtu =               netvsc_change_mtu,
1618         .ndo_validate_addr =            eth_validate_addr,
1619         .ndo_set_mac_address =          netvsc_set_mac_addr,
1620         .ndo_select_queue =             netvsc_select_queue,
1621         .ndo_get_stats64 =              netvsc_get_stats64,
1622 #ifdef CONFIG_NET_POLL_CONTROLLER
1623         .ndo_poll_controller =          netvsc_poll_controller,
1624 #endif
1625 };
1626
1627 /*
1628  * Handle link status changes. For RNDIS_STATUS_NETWORK_CHANGE emulate a
1629  * link down/up sequence. For RNDIS_STATUS_MEDIA_CONNECT, when the carrier
1630  * is already present, send a GARP packet to peers via netdev_notify_peers().
1631  */
1632 static void netvsc_link_change(struct work_struct *w)
1633 {
1634         struct net_device_context *ndev_ctx =
1635                 container_of(w, struct net_device_context, dwork.work);
1636         struct hv_device *device_obj = ndev_ctx->device_ctx;
1637         struct net_device *net = hv_get_drvdata(device_obj);
1638         struct netvsc_device *net_device;
1639         struct rndis_device *rdev;
1640         struct netvsc_reconfig *event = NULL;
1641         bool notify = false, reschedule = false;
1642         unsigned long flags, next_reconfig, delay;
1643
1644         /* if changes are happening, come back later */
1645         if (!rtnl_trylock()) {
1646                 schedule_delayed_work(&ndev_ctx->dwork, LINKCHANGE_INT);
1647                 return;
1648         }
1649
1650         net_device = rtnl_dereference(ndev_ctx->nvdev);
1651         if (!net_device)
1652                 goto out_unlock;
1653
1654         rdev = net_device->extension;
1655
1656         next_reconfig = ndev_ctx->last_reconfig + LINKCHANGE_INT;
1657         if (time_is_after_jiffies(next_reconfig)) {
1658                 /* link_watch only sends one notification with current state
1659                  * per second; avoid reconfiguring more frequently. Handle
1660                  * jiffies wrap-around.
1661                  */
1662                 delay = next_reconfig - jiffies;
1663                 delay = delay < LINKCHANGE_INT ? delay : LINKCHANGE_INT;
1664                 schedule_delayed_work(&ndev_ctx->dwork, delay);
1665                 goto out_unlock;
1666         }
1667         ndev_ctx->last_reconfig = jiffies;
1668
1669         spin_lock_irqsave(&ndev_ctx->lock, flags);
1670         if (!list_empty(&ndev_ctx->reconfig_events)) {
1671                 event = list_first_entry(&ndev_ctx->reconfig_events,
1672                                          struct netvsc_reconfig, list);
1673                 list_del(&event->list);
1674                 reschedule = !list_empty(&ndev_ctx->reconfig_events);
1675         }
1676         spin_unlock_irqrestore(&ndev_ctx->lock, flags);
1677
1678         if (!event)
1679                 goto out_unlock;
1680
1681         switch (event->event) {
1682                 /* Only the following events are possible due to the check in
1683                  * netvsc_linkstatus_callback()
1684                  */
1685         case RNDIS_STATUS_MEDIA_CONNECT:
1686                 if (rdev->link_state) {
1687                         rdev->link_state = false;
1688                         netif_carrier_on(net);
1689                         netif_tx_wake_all_queues(net);
1690                 } else {
1691                         notify = true;
1692                 }
1693                 kfree(event);
1694                 break;
1695         case RNDIS_STATUS_MEDIA_DISCONNECT:
1696                 if (!rdev->link_state) {
1697                         rdev->link_state = true;
1698                         netif_carrier_off(net);
1699                         netif_tx_stop_all_queues(net);
1700                 }
1701                 kfree(event);
1702                 break;
1703         case RNDIS_STATUS_NETWORK_CHANGE:
1704                 /* Only makes sense if carrier is present */
1705                 if (!rdev->link_state) {
1706                         rdev->link_state = true;
1707                         netif_carrier_off(net);
1708                         netif_tx_stop_all_queues(net);
1709                         event->event = RNDIS_STATUS_MEDIA_CONNECT;
1710                         spin_lock_irqsave(&ndev_ctx->lock, flags);
1711                         list_add(&event->list, &ndev_ctx->reconfig_events);
1712                         spin_unlock_irqrestore(&ndev_ctx->lock, flags);
1713                         reschedule = true;
1714                 }
1715                 break;
1716         }
1717
1718         rtnl_unlock();
1719
1720         if (notify)
1721                 netdev_notify_peers(net);
1722
1723         /* link_watch only sends one notification with current state per
1724          * second; handle the next reconfig event in 2 seconds.
1725          */
1726         if (reschedule)
1727                 schedule_delayed_work(&ndev_ctx->dwork, LINKCHANGE_INT);
1728
1729         return;
1730
1731 out_unlock:
1732         rtnl_unlock();
1733 }
1734
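     /* Walk every netdev in init_net (RTNL held) and match a netvsc
      * device by its permanent MAC address; used to pair a newly
      * registered VF with its synthetic counterpart.
      */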
1735 static struct net_device *get_netvsc_bymac(const u8 *mac)
1736 {
1737         struct net_device *dev;
1738
1739         ASSERT_RTNL();
1740
1741         for_each_netdev(&init_net, dev) {
1742                 if (dev->netdev_ops != &device_ops)
1743                         continue;       /* not a netvsc device */
1744
1745                 if (ether_addr_equal(mac, dev->perm_addr))
1746                         return dev;
1747         }
1748
1749         return NULL;
1750 }
1751
1752 static struct net_device *get_netvsc_byref(struct net_device *vf_netdev)
1753 {
1754         struct net_device *dev;
1755
1756         ASSERT_RTNL();
1757
1758         for_each_netdev(&init_net, dev) {
1759                 struct net_device_context *net_device_ctx;
1760
1761                 if (dev->netdev_ops != &device_ops)
1762                         continue;       /* not a netvsc device */
1763
1764                 net_device_ctx = netdev_priv(dev);
1765                 if (!rtnl_dereference(net_device_ctx->nvdev))
1766                         continue;       /* device is removed */
1767
1768                 if (rtnl_dereference(net_device_ctx->vf_netdev) == vf_netdev)
1769                         return dev;     /* a match */
1770         }
1771
1772         return NULL;
1773 }
1774
1775 /* Called when the VF is injecting data into the network stack.
1776  * Change the associated network device from the VF to netvsc.
1777  * Note: already called with rcu_read_lock held.
1778  */
1779 static rx_handler_result_t netvsc_vf_handle_frame(struct sk_buff **pskb)
1780 {
1781         struct sk_buff *skb = *pskb;
1782         struct net_device *ndev = rcu_dereference(skb->dev->rx_handler_data);
1783         struct net_device_context *ndev_ctx = netdev_priv(ndev);
1784         struct netvsc_vf_pcpu_stats *pcpu_stats
1785                  = this_cpu_ptr(ndev_ctx->vf_stats);
1786
1787         skb->dev = ndev;
1788
1789         u64_stats_update_begin(&pcpu_stats->syncp);
1790         pcpu_stats->rx_packets++;
1791         pcpu_stats->rx_bytes += skb->len;
1792         u64_stats_update_end(&pcpu_stats->syncp);
1793
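             /* RX_HANDLER_ANOTHER makes the core re-run receive handling
              * with skb->dev now pointing at the synthetic device, so the
              * packet is delivered as if it had arrived there.
              */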
1794         return RX_HANDLER_ANOTHER;
1795 }
1796
1797 static int netvsc_vf_join(struct net_device *vf_netdev,
1798                           struct net_device *ndev)
1799 {
1800         struct net_device_context *ndev_ctx = netdev_priv(ndev);
1801         int ret;
1802
1803         ret = netdev_rx_handler_register(vf_netdev,
1804                                          netvsc_vf_handle_frame, ndev);
1805         if (ret != 0) {
1806                 netdev_err(vf_netdev,
1807                            "cannot register netvsc VF receive handler (err = %d)\n",
1808                            ret);
1809                 goto rx_handler_failed;
1810         }
1811
1812         ret = netdev_upper_dev_link(vf_netdev, ndev, NULL);
1813         if (ret != 0) {
1814                 netdev_err(vf_netdev,
1815                            "cannot set master device %s (err = %d)\n",
1816                            ndev->name, ret);
1817                 goto upper_link_failed;
1818         }
1819
1820         /* set slave flag before open to prevent IPv6 addrconf */
1821         vf_netdev->flags |= IFF_SLAVE;
1822
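             /* Complete the takeover (MTU/flag sync, open) from a workqueue
              * to avoid recursion in netlink callbacks (see netvsc_vf_setup());
              * the VF_TAKEOVER_INT delay also leaves the VF driver time to
              * finish its own bring-up first.
              */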
1823         schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT);
1824
1825         call_netdevice_notifiers(NETDEV_JOIN, vf_netdev);
1826
1827         netdev_info(vf_netdev, "joined to %s\n", ndev->name);
1828         return 0;
1829
1830 upper_link_failed:
1831         netdev_rx_handler_unregister(vf_netdev);
1832 rx_handler_failed:
1833         return ret;
1834 }
1835
1836 static void __netvsc_vf_setup(struct net_device *ndev,
1837                               struct net_device *vf_netdev)
1838 {
1839         int ret;
1840
1841         /* Align MTU of VF with master */
1842         ret = dev_set_mtu(vf_netdev, ndev->mtu);
1843         if (ret)
1844                 netdev_warn(vf_netdev,
1845                             "unable to change mtu to %u\n", ndev->mtu);
1846
1847         /* set multicast and other flags on VF */
1848         dev_change_flags(vf_netdev, ndev->flags | IFF_SLAVE);
1849         dev_uc_sync(vf_netdev, ndev);
1850         dev_mc_sync(vf_netdev, ndev);
1851
1852         if (netif_running(ndev)) {
1853                 ret = dev_open(vf_netdev);
1854                 if (ret)
1855                         netdev_warn(vf_netdev,
1856                                     "unable to open: %d\n", ret);
1857         }
1858 }
1859
1860 /* Setup VF as slave of the synthetic device.
1861  * Runs in workqueue to avoid recursion in netlink callbacks.
1862  */
1863 static void netvsc_vf_setup(struct work_struct *w)
1864 {
1865         struct net_device_context *ndev_ctx
1866                 = container_of(w, struct net_device_context, vf_takeover.work);
1867         struct net_device *ndev = hv_get_drvdata(ndev_ctx->device_ctx);
1868         struct net_device *vf_netdev;
1869
1870         if (!rtnl_trylock()) {
1871                 schedule_delayed_work(&ndev_ctx->vf_takeover, 0);
1872                 return;
1873         }
1874
1875         vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
1876         if (vf_netdev)
1877                 __netvsc_vf_setup(ndev, vf_netdev);
1878
1879         rtnl_unlock();
1880 }
1881
1882 static int netvsc_register_vf(struct net_device *vf_netdev)
1883 {
1884         struct net_device *ndev;
1885         struct net_device_context *net_device_ctx;
1886         struct netvsc_device *netvsc_dev;
1887
1888         if (vf_netdev->addr_len != ETH_ALEN)
1889                 return NOTIFY_DONE;
1890
1891         /*
1892          * We will use the MAC address to locate the synthetic interface to
1893          * associate with the VF interface. If we don't find a matching
1894          * synthetic interface, move on.
1895          */
1896         ndev = get_netvsc_bymac(vf_netdev->perm_addr);
1897         if (!ndev)
1898                 return NOTIFY_DONE;
1899
1900         net_device_ctx = netdev_priv(ndev);
1901         netvsc_dev = rtnl_dereference(net_device_ctx->nvdev);
1902         if (!netvsc_dev || rtnl_dereference(net_device_ctx->vf_netdev))
1903                 return NOTIFY_DONE;
1904
1905         if (netvsc_vf_join(vf_netdev, ndev) != 0)
1906                 return NOTIFY_DONE;
1907
1908         netdev_info(ndev, "VF registering: %s\n", vf_netdev->name);
1909
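             /* Hold a reference to the VF for the lifetime of the
              * association; it is dropped in netvsc_unregister_vf().
              */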
1910         dev_hold(vf_netdev);
1911         rcu_assign_pointer(net_device_ctx->vf_netdev, vf_netdev);
1912         return NOTIFY_OK;
1913 }
1914
1915 /* VF up/down change detected, switch the data path accordingly */
1916 static int netvsc_vf_changed(struct net_device *vf_netdev)
1917 {
1918         struct net_device_context *net_device_ctx;
1919         struct netvsc_device *netvsc_dev;
1920         struct net_device *ndev;
1921         bool vf_is_up = netif_running(vf_netdev);
1922
1923         ndev = get_netvsc_byref(vf_netdev);
1924         if (!ndev)
1925                 return NOTIFY_DONE;
1926
1927         net_device_ctx = netdev_priv(ndev);
1928         netvsc_dev = rtnl_dereference(net_device_ctx->nvdev);
1929         if (!netvsc_dev)
1930                 return NOTIFY_DONE;
1931
1932         netvsc_switch_datapath(ndev, vf_is_up);
1933         netdev_info(ndev, "Data path switched %s VF: %s\n",
1934                     vf_is_up ? "to" : "from", vf_netdev->name);
1935
1936         return NOTIFY_OK;
1937 }
1938
1939 static int netvsc_unregister_vf(struct net_device *vf_netdev)
1940 {
1941         struct net_device *ndev;
1942         struct net_device_context *net_device_ctx;
1943
1944         ndev = get_netvsc_byref(vf_netdev);
1945         if (!ndev)
1946                 return NOTIFY_DONE;
1947
1948         net_device_ctx = netdev_priv(ndev);
1949         cancel_delayed_work_sync(&net_device_ctx->vf_takeover);
1950
1951         netdev_info(ndev, "VF unregistering: %s\n", vf_netdev->name);
1952
1953         netdev_rx_handler_unregister(vf_netdev);
1954         netdev_upper_dev_unlink(vf_netdev, ndev);
1955         RCU_INIT_POINTER(net_device_ctx->vf_netdev, NULL);
1956         dev_put(vf_netdev);
1957
1958         return NOTIFY_OK;
1959 }
1960
1961 static int netvsc_probe(struct hv_device *dev,
1962                         const struct hv_vmbus_device_id *dev_id)
1963 {
1964         struct net_device *net = NULL;
1965         struct net_device_context *net_device_ctx;
1966         struct netvsc_device_info device_info;
1967         struct netvsc_device *nvdev;
1968         int ret = -ENOMEM;
1969
1970         net = alloc_etherdev_mq(sizeof(struct net_device_context),
1971                                 VRSS_CHANNEL_MAX);
1972         if (!net)
1973                 goto no_net;
1974
1975         netif_carrier_off(net);
1976
1977         netvsc_init_settings(net);
1978
1979         net_device_ctx = netdev_priv(net);
1980         net_device_ctx->device_ctx = dev;
1981         net_device_ctx->msg_enable = netif_msg_init(debug, default_msg);
1982         if (netif_msg_probe(net_device_ctx))
1983                 netdev_dbg(net, "netvsc msg_enable: %d\n",
1984                            net_device_ctx->msg_enable);
1985
1986         hv_set_drvdata(dev, net);
1987
1988         INIT_DELAYED_WORK(&net_device_ctx->dwork, netvsc_link_change);
1989
1990         spin_lock_init(&net_device_ctx->lock);
1991         INIT_LIST_HEAD(&net_device_ctx->reconfig_events);
1992         INIT_DELAYED_WORK(&net_device_ctx->vf_takeover, netvsc_vf_setup);
1993
1994         net_device_ctx->vf_stats
1995                 = netdev_alloc_pcpu_stats(struct netvsc_vf_pcpu_stats);
1996         if (!net_device_ctx->vf_stats)
1997                 goto no_stats;
1998
1999         net->netdev_ops = &device_ops;
2000         net->ethtool_ops = &ethtool_ops;
2001         SET_NETDEV_DEV(net, &dev->device);
2002
2003         /* We always need headroom for the RNDIS header */
2004         net->needed_headroom = RNDIS_AND_PPI_SIZE;
2005
2006         /* Initialize the number of queues to 1; it may be increased if
2007          * more channels are offered later.
2008          */
2009         netif_set_real_num_tx_queues(net, 1);
2010         netif_set_real_num_rx_queues(net, 1);
2011
2012         /* Notify the netvsc driver of the new device */
2013         memset(&device_info, 0, sizeof(device_info));
2014         device_info.num_chn = VRSS_CHANNEL_DEFAULT;
2015         device_info.send_sections = NETVSC_DEFAULT_TX;
2016         device_info.send_section_size = NETVSC_SEND_SECTION_SIZE;
2017         device_info.recv_sections = NETVSC_DEFAULT_RX;
2018         device_info.recv_section_size = NETVSC_RECV_SECTION_SIZE;
2019
2020         nvdev = rndis_filter_device_add(dev, &device_info);
2021         if (IS_ERR(nvdev)) {
2022                 ret = PTR_ERR(nvdev);
2023                 netdev_err(net, "unable to add netvsc device (ret %d)\n", ret);
2024                 goto rndis_failed;
2025         }
2026
2027         memcpy(net->dev_addr, device_info.mac_adr, ETH_ALEN);
2028
2029         /* hw_features computed in rndis_netdev_set_hwcaps() */
2030         net->features = net->hw_features |
2031                 NETIF_F_HIGHDMA | NETIF_F_SG |
2032                 NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX;
2033         net->vlan_features = net->features;
2034
2035         netdev_lockdep_set_classes(net);
2036
2037         /* MTU range: 68 - 1500 or 65521 */
2038         net->min_mtu = NETVSC_MTU_MIN;
2039         if (nvdev->nvsp_version >= NVSP_PROTOCOL_VERSION_2)
2040                 net->max_mtu = NETVSC_MTU - ETH_HLEN;
2041         else
2042                 net->max_mtu = ETH_DATA_LEN;
2043
2044         ret = register_netdev(net);
2045         if (ret != 0) {
2046                 pr_err("Unable to register netdev.\n");
2047                 goto register_failed;
2048         }
2049
2050         return ret;
2051
2052 register_failed:
2053         rndis_filter_device_remove(dev, nvdev);
2054 rndis_failed:
2055         free_percpu(net_device_ctx->vf_stats);
2056 no_stats:
2057         hv_set_drvdata(dev, NULL);
2058         free_netdev(net);
2059 no_net:
2060         return ret;
2061 }
2062
2063 static int netvsc_remove(struct hv_device *dev)
2064 {
2065         struct net_device_context *ndev_ctx;
2066         struct net_device *vf_netdev;
2067         struct net_device *net;
2068
2069         net = hv_get_drvdata(dev);
2070         if (!net) {
2071                 dev_err(&dev->device, "No net device to remove\n");
2072                 return 0;
2073         }
2074
2075         ndev_ctx = netdev_priv(net);
2076
2077         netif_device_detach(net);
2078
2079         cancel_delayed_work_sync(&ndev_ctx->dwork);
2080
2081         /*
2082          * Call the vsc driver to let it know that the device is being
2083          * removed. Holding the rtnl lock also blocks MTU and channel changes.
2084          */
2085         rtnl_lock();
2086         vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
2087         if (vf_netdev)
2088                 netvsc_unregister_vf(vf_netdev);
2089
2090         unregister_netdevice(net);
2091
2092         rndis_filter_device_remove(dev,
2093                                    rtnl_dereference(ndev_ctx->nvdev));
2094         rtnl_unlock();
2095
2096         hv_set_drvdata(dev, NULL);
2097
2098         free_percpu(ndev_ctx->vf_stats);
2099         free_netdev(net);
2100         return 0;
2101 }
2102
2103 static const struct hv_vmbus_device_id id_table[] = {
2104         /* Network guid */
2105         { HV_NIC_GUID, },
2106         { },
2107 };
2108
2109 MODULE_DEVICE_TABLE(vmbus, id_table);
2110
2111 /* The one and only netvsc driver instance */
2112 static struct hv_driver netvsc_drv = {
2113         .name = KBUILD_MODNAME,
2114         .id_table = id_table,
2115         .probe = netvsc_probe,
2116         .remove = netvsc_remove,
2117 };
2118
2119 /*
2120  * On Hyper-V, every VF interface is matched with a corresponding
2121  * synthetic interface. The synthetic interface is presented first
2122  * to the guest. When the corresponding VF instance is registered,
2123  * we will take care of switching the data path.
2124  */
2125 static int netvsc_netdev_event(struct notifier_block *this,
2126                                unsigned long event, void *ptr)
2127 {
2128         struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
2129
2130         /* Skip our own events */
2131         if (event_dev->netdev_ops == &device_ops)
2132                 return NOTIFY_DONE;
2133
2134         /* Avoid non-Ethernet type devices */
2135         if (event_dev->type != ARPHRD_ETHER)
2136                 return NOTIFY_DONE;
2137
2138         /* Avoid a VLAN dev with the same MAC registering as VF */
2139         if (is_vlan_dev(event_dev))
2140                 return NOTIFY_DONE;
2141
2142         /* Avoid a bonding master dev with the same MAC registering as VF */
2143         if ((event_dev->priv_flags & IFF_BONDING) &&
2144             (event_dev->flags & IFF_MASTER))
2145                 return NOTIFY_DONE;
2146
2147         switch (event) {
2148         case NETDEV_REGISTER:
2149                 return netvsc_register_vf(event_dev);
2150         case NETDEV_UNREGISTER:
2151                 return netvsc_unregister_vf(event_dev);
2152         case NETDEV_UP:
2153         case NETDEV_DOWN:
2154                 return netvsc_vf_changed(event_dev);
2155         default:
2156                 return NOTIFY_DONE;
2157         }
2158 }
2159
2160 static struct notifier_block netvsc_netdev_notifier = {
2161         .notifier_call = netvsc_netdev_event,
2162 };
2163
2164 static void __exit netvsc_drv_exit(void)
2165 {
2166         unregister_netdevice_notifier(&netvsc_netdev_notifier);
2167         vmbus_driver_unregister(&netvsc_drv);
2168 }
2169
2170 static int __init netvsc_drv_init(void)
2171 {
2172         int ret;
2173
2174         if (ring_size < RING_SIZE_MIN) {
2175                 ring_size = RING_SIZE_MIN;
2176                 pr_info("Increased ring_size to %u (min allowed)\n",
2177                         ring_size);
2178         }
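             /* e.g. with 4 KiB pages the minimum ring_size of 64 yields a
              * 256 KiB VMBus ring buffer; the reciprocal is precomputed so
              * later ring arithmetic can avoid a runtime division.
              */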
2179         netvsc_ring_bytes = ring_size * PAGE_SIZE;
2180         netvsc_ring_reciprocal = reciprocal_value(netvsc_ring_bytes);
2181
2182         ret = vmbus_driver_register(&netvsc_drv);
2183         if (ret)
2184                 return ret;
2185
2186         register_netdevice_notifier(&netvsc_netdev_notifier);
2187         return 0;
2188 }
2189
2190 MODULE_LICENSE("GPL");
2191 MODULE_DESCRIPTION("Microsoft Hyper-V network driver");
2192
2193 module_init(netvsc_drv_init);
2194 module_exit(netvsc_drv_exit);