drivers/net/ethernet/google/gve/gve_rx.c
// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/* Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2021 Google, Inc.
 */

#include "gve.h"
#include "gve_adminq.h"
#include "gve_utils.h"
#include <linux/etherdevice.h>
#include <linux/filter.h>
#include <net/xdp.h>
#include <net/xdp_sock_drv.h>

static void gve_rx_free_buffer(struct device *dev,
                               struct gve_rx_slot_page_info *page_info,
                               union gve_rx_data_slot *data_slot)
{
        dma_addr_t dma = (dma_addr_t)(be64_to_cpu(data_slot->addr) &
                                      GVE_DATA_SLOT_ADDR_PAGE_MASK);

        page_ref_sub(page_info->page, page_info->pagecnt_bias - 1);
        gve_free_page(dev, page_info->page, dma, DMA_FROM_DEVICE);
}

static void gve_rx_unfill_pages(struct gve_priv *priv, struct gve_rx_ring *rx)
{
        u32 slots = rx->mask + 1;
        int i;

        if (rx->data.raw_addressing) {
                for (i = 0; i < slots; i++)
                        gve_rx_free_buffer(&priv->pdev->dev, &rx->data.page_info[i],
                                           &rx->data.data_ring[i]);
        } else {
                for (i = 0; i < slots; i++)
                        page_ref_sub(rx->data.page_info[i].page,
                                     rx->data.page_info[i].pagecnt_bias - 1);
                gve_unassign_qpl(priv, rx->data.qpl->id);
                rx->data.qpl = NULL;

                for (i = 0; i < rx->qpl_copy_pool_mask + 1; i++) {
                        page_ref_sub(rx->qpl_copy_pool[i].page,
                                     rx->qpl_copy_pool[i].pagecnt_bias - 1);
                        put_page(rx->qpl_copy_pool[i].page);
                }
        }
        kvfree(rx->data.page_info);
        rx->data.page_info = NULL;
}

static void gve_rx_free_ring(struct gve_priv *priv, int idx)
{
        struct gve_rx_ring *rx = &priv->rx[idx];
        struct device *dev = &priv->pdev->dev;
        u32 slots = rx->mask + 1;
        size_t bytes;

        gve_rx_remove_from_block(priv, idx);

        bytes = sizeof(struct gve_rx_desc) * priv->rx_desc_cnt;
        dma_free_coherent(dev, bytes, rx->desc.desc_ring, rx->desc.bus);
        rx->desc.desc_ring = NULL;

        dma_free_coherent(dev, sizeof(*rx->q_resources),
                          rx->q_resources, rx->q_resources_bus);
        rx->q_resources = NULL;

        gve_rx_unfill_pages(priv, rx);

        bytes = sizeof(*rx->data.data_ring) * slots;
        dma_free_coherent(dev, bytes, rx->data.data_ring,
                          rx->data.data_bus);
        rx->data.data_ring = NULL;

        kvfree(rx->qpl_copy_pool);
        rx->qpl_copy_pool = NULL;

        netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx);
}

static void gve_setup_rx_buffer(struct gve_rx_slot_page_info *page_info,
                             dma_addr_t addr, struct page *page, __be64 *slot_addr)
{
        page_info->page = page;
        page_info->page_offset = 0;
        page_info->page_address = page_address(page);
        *slot_addr = cpu_to_be64(addr);
        /* The page already has 1 ref */
        page_ref_add(page, INT_MAX - 1);
        page_info->pagecnt_bias = INT_MAX;
}

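/*
 * Illustration (a sketch added for clarity, not upstream commentary): how
 * the pagecnt_bias bookkeeping above plays out with concrete numbers.
 *
 *   alloc_page()                       page_count(page) == 1
 *   page_ref_add(page, INT_MAX - 1)    page_count(page) == INT_MAX
 *   page_info->pagecnt_bias = INT_MAX  count == bias -> buffer is idle
 *
 * Handing a fragment to the stack decrements only the driver-private bias
 * (count == INT_MAX, bias == INT_MAX - 1), so count > bias means "an SKB
 * still owns part of this page".  Once the stack's put_page() runs, the
 * count drops back to the bias and gve_rx_can_recycle_buffer() reports the
 * buffer as reusable.  Teardown undoes the bulk grant with
 * page_ref_sub(page, pagecnt_bias - 1) before the final free.
 */
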
static int gve_rx_alloc_buffer(struct gve_priv *priv, struct device *dev,
                               struct gve_rx_slot_page_info *page_info,
                               union gve_rx_data_slot *data_slot)
{
        struct page *page;
        dma_addr_t dma;
        int err;

        err = gve_alloc_page(priv, dev, &page, &dma, DMA_FROM_DEVICE,
                             GFP_ATOMIC);
        if (err)
                return err;

        gve_setup_rx_buffer(page_info, dma, page, &data_slot->addr);
        return 0;
}

static int gve_prefill_rx_pages(struct gve_rx_ring *rx)
{
        struct gve_priv *priv = rx->gve;
        u32 slots;
        int err;
        int i;
        int j;

        /* Allocate one page per Rx queue slot. Each page is split into two
         * packet buffers; when possible we "page flip" between the two.
         */
        slots = rx->mask + 1;

        rx->data.page_info = kvzalloc(slots *
                                      sizeof(*rx->data.page_info), GFP_KERNEL);
        if (!rx->data.page_info)
                return -ENOMEM;

        if (!rx->data.raw_addressing) {
                rx->data.qpl = gve_assign_rx_qpl(priv, rx->q_num);
                if (!rx->data.qpl) {
                        kvfree(rx->data.page_info);
                        rx->data.page_info = NULL;
                        return -ENOMEM;
                }
        }
        for (i = 0; i < slots; i++) {
                if (!rx->data.raw_addressing) {
                        struct page *page = rx->data.qpl->pages[i];
                        dma_addr_t addr = i * PAGE_SIZE;

                        gve_setup_rx_buffer(&rx->data.page_info[i], addr, page,
                                            &rx->data.data_ring[i].qpl_offset);
                        continue;
                }
                err = gve_rx_alloc_buffer(priv, &priv->pdev->dev, &rx->data.page_info[i],
                                          &rx->data.data_ring[i]);
                if (err)
                        goto alloc_err;
        }

        if (!rx->data.raw_addressing) {
                for (j = 0; j < rx->qpl_copy_pool_mask + 1; j++) {
                        struct page *page = alloc_page(GFP_KERNEL);

                        if (!page) {
                                err = -ENOMEM;
                                goto alloc_err_qpl;
                        }

                        rx->qpl_copy_pool[j].page = page;
                        rx->qpl_copy_pool[j].page_offset = 0;
                        rx->qpl_copy_pool[j].page_address = page_address(page);

                        /* The page already has 1 ref. */
                        page_ref_add(page, INT_MAX - 1);
                        rx->qpl_copy_pool[j].pagecnt_bias = INT_MAX;
                }
        }

        return slots;

alloc_err_qpl:
        while (j--) {
                page_ref_sub(rx->qpl_copy_pool[j].page,
                             rx->qpl_copy_pool[j].pagecnt_bias - 1);
                put_page(rx->qpl_copy_pool[j].page);
        }
alloc_err:
        while (i--)
                gve_rx_free_buffer(&priv->pdev->dev,
                                   &rx->data.page_info[i],
                                   &rx->data.data_ring[i]);
        return err;
}

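/*
 * Illustration (sketch only) of what the prefill above leaves behind for a
 * ring with, say, slots == 1024:
 *
 *   QPL mode:  data_ring[i].qpl_offset holds offset i * PAGE_SIZE into the
 *              queue page list already registered with the device, so every
 *              slot uses one of the 1024 pre-registered pages.  A separate
 *              qpl_copy_pool (sized in gve_rx_alloc_ring()) holds plain
 *              pages used when a buffer cannot be flipped and must be
 *              copied instead.
 *   RDA mode:  gve_rx_alloc_buffer() allocates and DMA-maps a fresh page
 *              per slot and writes its bus address into data_ring[i].addr.
 *
 * In both modes the function returns "slots", which the caller stores as
 * the initial fill_cnt.
 */
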
static void gve_rx_ctx_clear(struct gve_rx_ctx *ctx)
{
        ctx->skb_head = NULL;
        ctx->skb_tail = NULL;
        ctx->total_size = 0;
        ctx->frag_cnt = 0;
        ctx->drop_pkt = false;
}

static int gve_rx_alloc_ring(struct gve_priv *priv, int idx)
{
        struct gve_rx_ring *rx = &priv->rx[idx];
        struct device *hdev = &priv->pdev->dev;
        u32 slots, npages;
        int filled_pages;
        size_t bytes;
        int err;

        netif_dbg(priv, drv, priv->dev, "allocating rx ring\n");
        /* Make sure everything is zeroed to start with */
        memset(rx, 0, sizeof(*rx));

        rx->gve = priv;
        rx->q_num = idx;

        slots = priv->rx_data_slot_cnt;
        rx->mask = slots - 1;
        rx->data.raw_addressing = priv->queue_format == GVE_GQI_RDA_FORMAT;

        /* alloc rx data ring */
        bytes = sizeof(*rx->data.data_ring) * slots;
        rx->data.data_ring = dma_alloc_coherent(hdev, bytes,
                                                &rx->data.data_bus,
                                                GFP_KERNEL);
        if (!rx->data.data_ring)
                return -ENOMEM;

        rx->qpl_copy_pool_mask = min_t(u32, U32_MAX, slots * 2) - 1;
        rx->qpl_copy_pool_head = 0;
        rx->qpl_copy_pool = kvcalloc(rx->qpl_copy_pool_mask + 1,
                                     sizeof(rx->qpl_copy_pool[0]),
                                     GFP_KERNEL);

        if (!rx->qpl_copy_pool) {
                err = -ENOMEM;
                goto abort_with_slots;
        }

        filled_pages = gve_prefill_rx_pages(rx);
        if (filled_pages < 0) {
                err = -ENOMEM;
                goto abort_with_copy_pool;
        }
        rx->fill_cnt = filled_pages;
        /* Ensure data ring slots (packet buffers) are visible. */
        dma_wmb();

        /* Alloc gve_queue_resources */
        rx->q_resources =
                dma_alloc_coherent(hdev,
                                   sizeof(*rx->q_resources),
                                   &rx->q_resources_bus,
                                   GFP_KERNEL);
        if (!rx->q_resources) {
                err = -ENOMEM;
                goto abort_filled;
        }
        netif_dbg(priv, drv, priv->dev, "rx[%d]->data.data_bus=%lx\n", idx,
                  (unsigned long)rx->data.data_bus);

        /* alloc rx desc ring */
        bytes = sizeof(struct gve_rx_desc) * priv->rx_desc_cnt;
        npages = bytes / PAGE_SIZE;
        if (npages * PAGE_SIZE != bytes) {
                err = -EIO;
                goto abort_with_q_resources;
        }

        rx->desc.desc_ring = dma_alloc_coherent(hdev, bytes, &rx->desc.bus,
                                                GFP_KERNEL);
        if (!rx->desc.desc_ring) {
                err = -ENOMEM;
                goto abort_with_q_resources;
        }
        rx->cnt = 0;
        rx->db_threshold = priv->rx_desc_cnt / 2;
        rx->desc.seqno = 1;

        /* Allocating half-page buffers allows page-flipping which is faster
         * than copying or allocating new pages.
         */
        rx->packet_buffer_size = PAGE_SIZE / 2;
        gve_rx_ctx_clear(&rx->ctx);
        gve_rx_add_to_block(priv, idx);

        return 0;

abort_with_q_resources:
        dma_free_coherent(hdev, sizeof(*rx->q_resources),
                          rx->q_resources, rx->q_resources_bus);
        rx->q_resources = NULL;
abort_filled:
        gve_rx_unfill_pages(priv, rx);
abort_with_copy_pool:
        kvfree(rx->qpl_copy_pool);
        rx->qpl_copy_pool = NULL;
abort_with_slots:
        bytes = sizeof(*rx->data.data_ring) * slots;
        dma_free_coherent(hdev, bytes, rx->data.data_ring, rx->data.data_bus);
        rx->data.data_ring = NULL;

        return err;
}

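/*
 * Worked sizing example for the allocation above (illustrative numbers,
 * assuming PAGE_SIZE == 4096 and rx_data_slot_cnt == 1024):
 *
 *   packet_buffer_size  = PAGE_SIZE / 2 = 2048 bytes, i.e. two packet
 *                         buffers per page so the ring can "page flip".
 *   data ring           = 1024 slots * sizeof(union gve_rx_data_slot).
 *   qpl_copy_pool       = 2 * 1024 entries (mask 0x7ff), used only in QPL
 *                         mode as copy targets.
 *   desc ring           = rx_desc_cnt * sizeof(struct gve_rx_desc); the
 *                         npages check rejects (-EIO) any configuration
 *                         where this is not a whole number of pages.
 *   db_threshold        = rx_desc_cnt / 2: in raw-addressing mode,
 *                         gve_clean_rx_done() tops the ring back up once
 *                         the number of posted buffers (fill_cnt - cnt)
 *                         drops to this value.
 */
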
int gve_rx_alloc_rings(struct gve_priv *priv)
{
        int err = 0;
        int i;

        for (i = 0; i < priv->rx_cfg.num_queues; i++) {
                err = gve_rx_alloc_ring(priv, i);
                if (err) {
                        netif_err(priv, drv, priv->dev,
                                  "Failed to alloc rx ring=%d: err=%d\n",
                                  i, err);
                        break;
                }
        }
        /* Free any rings allocated before the error */
        if (err) {
                int j;

                for (j = 0; j < i; j++)
                        gve_rx_free_ring(priv, j);
        }
        return err;
}

void gve_rx_free_rings_gqi(struct gve_priv *priv)
{
        int i;

        for (i = 0; i < priv->rx_cfg.num_queues; i++)
                gve_rx_free_ring(priv, i);
}

void gve_rx_write_doorbell(struct gve_priv *priv, struct gve_rx_ring *rx)
{
        u32 db_idx = be32_to_cpu(rx->q_resources->db_index);

        iowrite32be(rx->fill_cnt, &priv->db_bar2[db_idx]);
}

static enum pkt_hash_types gve_rss_type(__be16 pkt_flags)
{
        if (likely(pkt_flags & (GVE_RXF_TCP | GVE_RXF_UDP)))
                return PKT_HASH_TYPE_L4;
        if (pkt_flags & (GVE_RXF_IPV4 | GVE_RXF_IPV6))
                return PKT_HASH_TYPE_L3;
        return PKT_HASH_TYPE_L2;
}

static struct sk_buff *gve_rx_add_frags(struct napi_struct *napi,
                                        struct gve_rx_slot_page_info *page_info,
                                        u16 packet_buffer_size, u16 len,
                                        struct gve_rx_ctx *ctx)
{
        u32 offset = page_info->page_offset + page_info->pad;
        struct sk_buff *skb = ctx->skb_tail;
        int num_frags = 0;

        if (!skb) {
                skb = napi_get_frags(napi);
                if (unlikely(!skb))
                        return NULL;

                ctx->skb_head = skb;
                ctx->skb_tail = skb;
        } else {
                num_frags = skb_shinfo(ctx->skb_tail)->nr_frags;
                if (num_frags == MAX_SKB_FRAGS) {
                        skb = napi_alloc_skb(napi, 0);
                        if (!skb)
                                return NULL;

                        // We will never chain more than two SKBs: 2 * 16 * 2k > 64k
                        // which is why we do not need to chain by using skb->next
                        skb_shinfo(ctx->skb_tail)->frag_list = skb;

                        ctx->skb_tail = skb;
                        num_frags = 0;
                }
        }

        if (skb != ctx->skb_head) {
                ctx->skb_head->len += len;
                ctx->skb_head->data_len += len;
                ctx->skb_head->truesize += packet_buffer_size;
        }
        skb_add_rx_frag(skb, num_frags, page_info->page,
                        offset, len, packet_buffer_size);

        return ctx->skb_head;
}

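/*
 * Note on the chaining arithmetic above (a sketch of the reasoning, not
 * additional upstream commentary): with 2048-byte half-page buffers and
 * MAX_SKB_FRAGS fragments per SKB, one head SKB plus a single frag_list SKB
 * already cover the 64KB bound the comment relies on, so one level of
 * frag_list is enough and a general skb->next chain is never needed.  Frags
 * appended to the second SKB still account their len/data_len/truesize
 * against ctx->skb_head, the SKB that is eventually passed up the stack.
 */
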
static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, __be64 *slot_addr)
{
        const __be64 offset = cpu_to_be64(PAGE_SIZE / 2);

        /* "flip" to other packet buffer on this page */
        page_info->page_offset ^= PAGE_SIZE / 2;
        *(slot_addr) ^= offset;
}

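/*
 * Illustration of the flip above, assuming PAGE_SIZE == 4096 so the two
 * packet buffers sit at page offsets 0x000 and 0x800.  XOR-ing with the
 * half-page constant toggles between them, and because XOR is a bitwise
 * operation it can be applied directly to the big-endian slot address
 * (byte swapping commutes with XOR).  The sketch below (never compiled)
 * models the same toggle with plain integers.
 */
#if 0
static void demo_flip(unsigned int *page_offset, unsigned long long *dma_addr)
{
        *page_offset ^= 0x800;  /* 0x000 <-> 0x800 */
        *dma_addr ^= 0x800;     /* device now sees the other half of the page */
}
#endif
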
static int gve_rx_can_recycle_buffer(struct gve_rx_slot_page_info *page_info)
{
        int pagecount = page_count(page_info->page);

        /* This page is not being used by any SKBs - reuse */
        if (pagecount == page_info->pagecnt_bias)
                return 1;
        /* This page is still being used by an SKB - we can't reuse */
        else if (pagecount > page_info->pagecnt_bias)
                return 0;
        WARN(pagecount < page_info->pagecnt_bias,
             "Pagecount should never be less than the bias.");
        return -1;
}

static struct sk_buff *
gve_rx_raw_addressing(struct device *dev, struct net_device *netdev,
                      struct gve_rx_slot_page_info *page_info, u16 len,
                      struct napi_struct *napi,
                      union gve_rx_data_slot *data_slot,
                      u16 packet_buffer_size, struct gve_rx_ctx *ctx)
{
        struct sk_buff *skb = gve_rx_add_frags(napi, page_info, packet_buffer_size, len, ctx);

        if (!skb)
                return NULL;

        /* Optimistically stop the kernel from freeing the page.
         * We will check again in refill to determine if we need to alloc a
         * new page.
         */
        gve_dec_pagecnt_bias(page_info);

        return skb;
}

static struct sk_buff *gve_rx_copy_to_pool(struct gve_rx_ring *rx,
                                           struct gve_rx_slot_page_info *page_info,
                                           u16 len, struct napi_struct *napi)
{
        u32 pool_idx = rx->qpl_copy_pool_head & rx->qpl_copy_pool_mask;
        void *src = page_info->page_address + page_info->page_offset;
        struct gve_rx_slot_page_info *copy_page_info;
        struct gve_rx_ctx *ctx = &rx->ctx;
        bool alloc_page = false;
        struct sk_buff *skb;
        void *dst;

        copy_page_info = &rx->qpl_copy_pool[pool_idx];
        if (!copy_page_info->can_flip) {
                int recycle = gve_rx_can_recycle_buffer(copy_page_info);

                if (unlikely(recycle < 0)) {
                        gve_schedule_reset(rx->gve);
                        return NULL;
                }
                alloc_page = !recycle;
        }

        if (alloc_page) {
                struct gve_rx_slot_page_info alloc_page_info;
                struct page *page;

                /* The least recently used page turned out to be
                 * still in use by the kernel. Ignoring it and moving
                 * on alleviates head-of-line blocking.
                 */
                rx->qpl_copy_pool_head++;

                page = alloc_page(GFP_ATOMIC);
                if (!page)
                        return NULL;

                alloc_page_info.page = page;
                alloc_page_info.page_offset = 0;
                alloc_page_info.page_address = page_address(page);
                alloc_page_info.pad = page_info->pad;

                memcpy(alloc_page_info.page_address, src, page_info->pad + len);
                skb = gve_rx_add_frags(napi, &alloc_page_info,
                                       rx->packet_buffer_size,
                                       len, ctx);

                u64_stats_update_begin(&rx->statss);
                rx->rx_frag_copy_cnt++;
                rx->rx_frag_alloc_cnt++;
                u64_stats_update_end(&rx->statss);

                return skb;
        }

        dst = copy_page_info->page_address + copy_page_info->page_offset;
        memcpy(dst, src, page_info->pad + len);
        copy_page_info->pad = page_info->pad;

        skb = gve_rx_add_frags(napi, copy_page_info,
                               rx->packet_buffer_size, len, ctx);
        if (unlikely(!skb))
                return NULL;

        gve_dec_pagecnt_bias(copy_page_info);
        copy_page_info->page_offset += rx->packet_buffer_size;
        copy_page_info->page_offset &= (PAGE_SIZE - 1);

        if (copy_page_info->can_flip) {
                /* We have used both halves of this copy page, it
                 * is time for it to go to the back of the queue.
                 */
                copy_page_info->can_flip = false;
                rx->qpl_copy_pool_head++;
                prefetch(rx->qpl_copy_pool[rx->qpl_copy_pool_head & rx->qpl_copy_pool_mask].page);
        } else {
                copy_page_info->can_flip = true;
        }

        u64_stats_update_begin(&rx->statss);
        rx->rx_frag_copy_cnt++;
        u64_stats_update_end(&rx->statss);

        return skb;
}

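/*
 * Illustration of the copy-pool rotation above (sketch only, assuming
 * 2048-byte packet buffers): qpl_copy_pool_head & qpl_copy_pool_mask always
 * names the least recently used copy page.
 *
 *   1st copy into a page:  data lands at offset 0, can_flip is set and head
 *                          stays put so the second half gets used next.
 *   2nd copy into a page:  data lands at offset 2048, can_flip is cleared
 *                          and head advances - the page goes to the back of
 *                          the queue until the stack releases it.
 *   LRU page still held:   the recycle check fails, head advances anyway and
 *                          a one-off page is allocated with GFP_ATOMIC so a
 *                          slow consumer cannot head-of-line block the ring.
 */
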
static struct sk_buff *
gve_rx_qpl(struct device *dev, struct net_device *netdev,
           struct gve_rx_ring *rx, struct gve_rx_slot_page_info *page_info,
           u16 len, struct napi_struct *napi,
           union gve_rx_data_slot *data_slot)
{
        struct gve_rx_ctx *ctx = &rx->ctx;
        struct sk_buff *skb;

        /* if raw_addressing mode is not enabled gvnic can only receive into
         * registered segments. If the buffer can't be recycled, our only
         * choice is to copy the data out of it so that we can return it to the
         * device.
         */
        if (page_info->can_flip) {
                skb = gve_rx_add_frags(napi, page_info, rx->packet_buffer_size, len, ctx);
                /* No point in recycling if we didn't get the skb */
                if (skb) {
                        /* Make sure that the page isn't freed. */
                        gve_dec_pagecnt_bias(page_info);
                        gve_rx_flip_buff(page_info, &data_slot->qpl_offset);
                }
        } else {
                skb = gve_rx_copy_to_pool(rx, page_info, len, napi);
        }
        return skb;
}

static struct sk_buff *gve_rx_skb(struct gve_priv *priv, struct gve_rx_ring *rx,
                                  struct gve_rx_slot_page_info *page_info, struct napi_struct *napi,
                                  u16 len, union gve_rx_data_slot *data_slot,
                                  bool is_only_frag)
{
        struct net_device *netdev = priv->dev;
        struct gve_rx_ctx *ctx = &rx->ctx;
        struct sk_buff *skb = NULL;

        if (len <= priv->rx_copybreak && is_only_frag)  {
                /* Just copy small packets */
                skb = gve_rx_copy(netdev, napi, page_info, len);
                if (skb) {
                        u64_stats_update_begin(&rx->statss);
                        rx->rx_copied_pkt++;
                        rx->rx_frag_copy_cnt++;
                        rx->rx_copybreak_pkt++;
                        u64_stats_update_end(&rx->statss);
                }
        } else {
                int recycle = gve_rx_can_recycle_buffer(page_info);

                if (unlikely(recycle < 0)) {
                        gve_schedule_reset(priv);
                        return NULL;
                }
                page_info->can_flip = recycle;
                if (page_info->can_flip) {
                        u64_stats_update_begin(&rx->statss);
                        rx->rx_frag_flip_cnt++;
                        u64_stats_update_end(&rx->statss);
                }

                if (rx->data.raw_addressing) {
                        skb = gve_rx_raw_addressing(&priv->pdev->dev, netdev,
                                                    page_info, len, napi,
                                                    data_slot,
                                                    rx->packet_buffer_size, ctx);
                } else {
                        skb = gve_rx_qpl(&priv->pdev->dev, netdev, rx,
                                         page_info, len, napi, data_slot);
                }
        }
        return skb;
}

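/*
 * Summary of the three delivery strategies chosen above (illustrative; the
 * copybreak value is configuration, not restated here):
 *
 *   len <= rx_copybreak, single fragment:  gve_rx_copy() copies the frame
 *       into a small freshly allocated SKB, so the ring buffer is
 *       immediately free to be posted again.
 *   raw addressing (RDA):  the page is attached as an SKB frag and the bias
 *       is dropped; refill later decides whether to flip, reuse or replace
 *       the page.
 *   QPL:  flip to the other half of the registered page when the recycle
 *       check allows it, otherwise fall back to the copy pool.
 */
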
static int gve_xsk_pool_redirect(struct net_device *dev,
                                 struct gve_rx_ring *rx,
                                 void *data, int len,
                                 struct bpf_prog *xdp_prog)
{
        struct xdp_buff *xdp;
        int err;

        if (rx->xsk_pool->frame_len < len)
                return -E2BIG;
        xdp = xsk_buff_alloc(rx->xsk_pool);
        if (!xdp) {
                u64_stats_update_begin(&rx->statss);
                rx->xdp_alloc_fails++;
                u64_stats_update_end(&rx->statss);
                return -ENOMEM;
        }
        xdp->data_end = xdp->data + len;
        memcpy(xdp->data, data, len);
        err = xdp_do_redirect(dev, xdp, xdp_prog);
        if (err)
                xsk_buff_free(xdp);
        return err;
}

static int gve_xdp_redirect(struct net_device *dev, struct gve_rx_ring *rx,
                            struct xdp_buff *orig, struct bpf_prog *xdp_prog)
{
        int total_len, len = orig->data_end - orig->data;
        int headroom = XDP_PACKET_HEADROOM;
        struct xdp_buff new;
        void *frame;
        int err;

        if (rx->xsk_pool)
                return gve_xsk_pool_redirect(dev, rx, orig->data,
                                             len, xdp_prog);

        total_len = headroom + SKB_DATA_ALIGN(len) +
                SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
        frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
        if (!frame) {
                u64_stats_update_begin(&rx->statss);
                rx->xdp_alloc_fails++;
                u64_stats_update_end(&rx->statss);
                return -ENOMEM;
        }
        xdp_init_buff(&new, total_len, &rx->xdp_rxq);
        xdp_prepare_buff(&new, frame, headroom, len, false);
        memcpy(new.data, orig->data, len);

        err = xdp_do_redirect(dev, &new, xdp_prog);
        if (err)
                page_frag_free(frame);

        return err;
}

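/*
 * Worked example for the copy-based redirect above (numbers are
 * illustrative and depend on the architecture/config):
 *
 *   len = 1500, XDP_PACKET_HEADROOM = 256, SMP_CACHE_BYTES = 64
 *   total_len = 256 + SKB_DATA_ALIGN(1500) + SKB_DATA_ALIGN(shinfo)
 *             = 256 + 1536 + (sizeof(struct skb_shared_info) rounded up)
 *
 * Headroom and shared-info space are reserved because the redirect target
 * may later build an SKB around the frame.  The copy is needed because the
 * original buffer is half of a driver-owned (and, in QPL mode,
 * device-registered) page that cannot be handed to another device or
 * socket.
 */
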
static void gve_xdp_done(struct gve_priv *priv, struct gve_rx_ring *rx,
                         struct xdp_buff *xdp, struct bpf_prog *xprog,
                         int xdp_act)
{
        struct gve_tx_ring *tx;
        int tx_qid;
        int err;

        switch (xdp_act) {
        case XDP_ABORTED:
        case XDP_DROP:
        default:
                break;
        case XDP_TX:
                tx_qid = gve_xdp_tx_queue_id(priv, rx->q_num);
                tx = &priv->tx[tx_qid];
                spin_lock(&tx->xdp_lock);
                err = gve_xdp_xmit_one(priv, tx, xdp->data,
                                       xdp->data_end - xdp->data, NULL);
                spin_unlock(&tx->xdp_lock);

                if (unlikely(err)) {
                        u64_stats_update_begin(&rx->statss);
                        rx->xdp_tx_errors++;
                        u64_stats_update_end(&rx->statss);
                }
                break;
        case XDP_REDIRECT:
                err = gve_xdp_redirect(priv->dev, rx, xdp, xprog);

                if (unlikely(err)) {
                        u64_stats_update_begin(&rx->statss);
                        rx->xdp_redirect_errors++;
                        u64_stats_update_end(&rx->statss);
                }
                break;
        }
        u64_stats_update_begin(&rx->statss);
        if ((u32)xdp_act < GVE_XDP_ACTIONS)
                rx->xdp_actions[xdp_act]++;
        u64_stats_update_end(&rx->statss);
}

#define GVE_PKTCONT_BIT_IS_SET(x) (GVE_RXF_PKT_CONT & (x))
static void gve_rx(struct gve_rx_ring *rx, netdev_features_t feat,
                   struct gve_rx_desc *desc, u32 idx,
                   struct gve_rx_cnts *cnts)
{
        bool is_last_frag = !GVE_PKTCONT_BIT_IS_SET(desc->flags_seq);
        struct gve_rx_slot_page_info *page_info;
        u16 frag_size = be16_to_cpu(desc->len);
        struct gve_rx_ctx *ctx = &rx->ctx;
        union gve_rx_data_slot *data_slot;
        struct gve_priv *priv = rx->gve;
        struct sk_buff *skb = NULL;
        struct bpf_prog *xprog;
        struct xdp_buff xdp;
        dma_addr_t page_bus;
        void *va;

        u16 len = frag_size;
        struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi;
        bool is_first_frag = ctx->frag_cnt == 0;

        bool is_only_frag = is_first_frag && is_last_frag;

        if (unlikely(ctx->drop_pkt))
                goto finish_frag;

        if (desc->flags_seq & GVE_RXF_ERR) {
                ctx->drop_pkt = true;
                cnts->desc_err_pkt_cnt++;
                napi_free_frags(napi);
                goto finish_frag;
        }

        if (unlikely(frag_size > rx->packet_buffer_size)) {
                netdev_warn(priv->dev, "Unexpected frag size %d, can't exceed %d, scheduling reset",
                            frag_size, rx->packet_buffer_size);
                ctx->drop_pkt = true;
                napi_free_frags(napi);
                gve_schedule_reset(rx->gve);
                goto finish_frag;
        }

        /* Prefetch two packet buffers ahead; we will need them soon. */
        page_info = &rx->data.page_info[(idx + 2) & rx->mask];
        va = page_info->page_address + page_info->page_offset;
        prefetch(page_info->page); /* Kernel page struct. */
        prefetch(va);              /* Packet header. */
        prefetch(va + 64);         /* Next cacheline too. */

        page_info = &rx->data.page_info[idx];
        data_slot = &rx->data.data_ring[idx];
        page_bus = (rx->data.raw_addressing) ?
                be64_to_cpu(data_slot->addr) - page_info->page_offset :
                rx->data.qpl->page_buses[idx];
        dma_sync_single_for_cpu(&priv->pdev->dev, page_bus,
                                PAGE_SIZE, DMA_FROM_DEVICE);
        page_info->pad = is_first_frag ? GVE_RX_PAD : 0;
        len -= page_info->pad;
        frag_size -= page_info->pad;

        xprog = READ_ONCE(priv->xdp_prog);
        if (xprog && is_only_frag) {
                void *old_data;
                int xdp_act;

                xdp_init_buff(&xdp, rx->packet_buffer_size, &rx->xdp_rxq);
                xdp_prepare_buff(&xdp, page_info->page_address +
                                 page_info->page_offset, GVE_RX_PAD,
                                 len, false);
                old_data = xdp.data;
                xdp_act = bpf_prog_run_xdp(xprog, &xdp);
                if (xdp_act != XDP_PASS) {
                        gve_xdp_done(priv, rx, &xdp, xprog, xdp_act);
                        ctx->total_size += frag_size;
                        goto finish_ok_pkt;
                }

                page_info->pad += xdp.data - old_data;
                len = xdp.data_end - xdp.data;

                u64_stats_update_begin(&rx->statss);
                rx->xdp_actions[XDP_PASS]++;
                u64_stats_update_end(&rx->statss);
        }

        skb = gve_rx_skb(priv, rx, page_info, napi, len,
                         data_slot, is_only_frag);
        if (!skb) {
                u64_stats_update_begin(&rx->statss);
                rx->rx_skb_alloc_fail++;
                u64_stats_update_end(&rx->statss);

                napi_free_frags(napi);
                ctx->drop_pkt = true;
                goto finish_frag;
        }
        ctx->total_size += frag_size;

        if (is_first_frag) {
                if (likely(feat & NETIF_F_RXCSUM)) {
                        /* NIC passes up the partial sum */
                        if (desc->csum)
                                skb->ip_summed = CHECKSUM_COMPLETE;
                        else
                                skb->ip_summed = CHECKSUM_NONE;
                        skb->csum = csum_unfold(desc->csum);
                }

                /* parse flags & pass relevant info up */
                if (likely(feat & NETIF_F_RXHASH) &&
                    gve_needs_rss(desc->flags_seq))
                        skb_set_hash(skb, be32_to_cpu(desc->rss_hash),
                                     gve_rss_type(desc->flags_seq));
        }

        if (is_last_frag) {
                skb_record_rx_queue(skb, rx->q_num);
                if (skb_is_nonlinear(skb))
                        napi_gro_frags(napi);
                else
                        napi_gro_receive(napi, skb);
                goto finish_ok_pkt;
        }

        goto finish_frag;

finish_ok_pkt:
        cnts->ok_pkt_bytes += ctx->total_size;
        cnts->ok_pkt_cnt++;
finish_frag:
        ctx->frag_cnt++;
        if (is_last_frag) {
                cnts->total_pkt_cnt++;
                cnts->cont_pkt_cnt += (ctx->frag_cnt > 1);
                gve_rx_ctx_clear(ctx);
        }
}

bool gve_rx_work_pending(struct gve_rx_ring *rx)
{
        struct gve_rx_desc *desc;
        __be16 flags_seq;
        u32 next_idx;

        next_idx = rx->cnt & rx->mask;
        desc = rx->desc.desc_ring + next_idx;

        flags_seq = desc->flags_seq;

        return (GVE_SEQNO(flags_seq) == rx->desc.seqno);
}

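/*
 * Illustration of the sequence-number handshake used above and in
 * gve_clean_rx_done() (a sketch; the exact width and wrap of the counter
 * live in gve.h and are assumed, not restated, here):
 *
 *   The device stamps each completed descriptor with a small sequence value
 *   carried in flags_seq.  The driver keeps its own expected value
 *   (rx->desc.seqno, initialised to 1 in gve_rx_alloc_ring()) and only
 *   treats desc_ring[cnt & mask] as ready when GVE_SEQNO(flags_seq) matches
 *   it.  After consuming a descriptor the driver advances its copy with
 *   gve_next_seqno(), so stale descriptors left over from a previous trip
 *   around the ring never compare equal and no explicit ownership bit is
 *   needed.
 */
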
static bool gve_rx_refill_buffers(struct gve_priv *priv, struct gve_rx_ring *rx)
{
        int refill_target = rx->mask + 1;
        u32 fill_cnt = rx->fill_cnt;

        while (fill_cnt - rx->cnt < refill_target) {
                struct gve_rx_slot_page_info *page_info;
                u32 idx = fill_cnt & rx->mask;

                page_info = &rx->data.page_info[idx];
                if (page_info->can_flip) {
                        /* The other half of the page is free because it was
                         * free when we processed the descriptor. Flip to it.
                         */
                        union gve_rx_data_slot *data_slot =
                                                &rx->data.data_ring[idx];

                        gve_rx_flip_buff(page_info, &data_slot->addr);
                        page_info->can_flip = 0;
                } else {
                        /* It is possible that the networking stack has already
                         * finished processing all outstanding packets in the buffer
                         * and it can be reused.
                         * Flipping is unnecessary here - if the networking stack still
                         * owns half the page it is impossible to tell which half. Either
                         * the whole page is free or it needs to be replaced.
                         */
                        int recycle = gve_rx_can_recycle_buffer(page_info);

                        if (recycle < 0) {
                                if (!rx->data.raw_addressing)
                                        gve_schedule_reset(priv);
                                return false;
                        }
                        if (!recycle) {
                                /* We can't reuse the buffer - alloc a new one */
                                union gve_rx_data_slot *data_slot =
                                                &rx->data.data_ring[idx];
                                struct device *dev = &priv->pdev->dev;
                                gve_rx_free_buffer(dev, page_info, data_slot);
                                page_info->page = NULL;
                                if (gve_rx_alloc_buffer(priv, dev, page_info,
                                                        data_slot)) {
                                        u64_stats_update_begin(&rx->statss);
                                        rx->rx_buf_alloc_fail++;
                                        u64_stats_update_end(&rx->statss);
                                        break;
                                }
                        }
                }
                fill_cnt++;
        }
        rx->fill_cnt = fill_cnt;
        return true;
}

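/*
 * Per-slot refill decision above, spelled out (illustrative summary):
 *
 *   can_flip set        -> the unused half of the page is known to be free;
 *                          just XOR the slot address to point at it.
 *   else, recycle == 1  -> the stack has released every fragment, so the
 *                          same half can be posted again unchanged.
 *   else, recycle == 0  -> free the page and DMA-map a replacement; on
 *                          allocation failure stop early and retry on a
 *                          later pass.
 *   recycle < 0         -> refcount fell below the bias, which should never
 *                          happen; report failure (and schedule a reset when
 *                          not in raw-addressing mode).
 */
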
static int gve_clean_rx_done(struct gve_rx_ring *rx, int budget,
                             netdev_features_t feat)
{
        u64 xdp_redirects = rx->xdp_actions[XDP_REDIRECT];
        u64 xdp_txs = rx->xdp_actions[XDP_TX];
        struct gve_rx_ctx *ctx = &rx->ctx;
        struct gve_priv *priv = rx->gve;
        struct gve_rx_cnts cnts = {0};
        struct gve_rx_desc *next_desc;
        u32 idx = rx->cnt & rx->mask;
        u32 work_done = 0;

        struct gve_rx_desc *desc = &rx->desc.desc_ring[idx];

        // Exceed budget only if (and until) the in-flight packet is consumed.
        while ((GVE_SEQNO(desc->flags_seq) == rx->desc.seqno) &&
               (work_done < budget || ctx->frag_cnt)) {
                next_desc = &rx->desc.desc_ring[(idx + 1) & rx->mask];
                prefetch(next_desc);

                gve_rx(rx, feat, desc, idx, &cnts);

                rx->cnt++;
                idx = rx->cnt & rx->mask;
                desc = &rx->desc.desc_ring[idx];
                rx->desc.seqno = gve_next_seqno(rx->desc.seqno);
                work_done++;
        }

        // The device will only send whole packets.
        if (unlikely(ctx->frag_cnt)) {
                struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi;

                napi_free_frags(napi);
                gve_rx_ctx_clear(&rx->ctx);
                netdev_warn(priv->dev, "Unexpected seq number %d with incomplete packet, expected %d, scheduling reset",
                            GVE_SEQNO(desc->flags_seq), rx->desc.seqno);
                gve_schedule_reset(rx->gve);
        }

        if (!work_done && rx->fill_cnt - rx->cnt > rx->db_threshold)
                return 0;

        if (work_done) {
                u64_stats_update_begin(&rx->statss);
                rx->rpackets += cnts.ok_pkt_cnt;
                rx->rbytes += cnts.ok_pkt_bytes;
                rx->rx_cont_packet_cnt += cnts.cont_pkt_cnt;
                rx->rx_desc_err_dropped_pkt += cnts.desc_err_pkt_cnt;
                u64_stats_update_end(&rx->statss);
        }

        if (xdp_txs != rx->xdp_actions[XDP_TX])
                gve_xdp_tx_flush(priv, rx->q_num);

        if (xdp_redirects != rx->xdp_actions[XDP_REDIRECT])
                xdp_do_flush();

        /* restock ring slots */
        if (!rx->data.raw_addressing) {
                /* In QPL mode buffs are refilled as the desc are processed */
                rx->fill_cnt += work_done;
        } else if (rx->fill_cnt - rx->cnt <= rx->db_threshold) {
                /* In raw addressing mode buffs are only refilled if the avail
                 * falls below a threshold.
                 */
                if (!gve_rx_refill_buffers(priv, rx))
                        return 0;

                /* If we were not able to completely refill buffers, we'll want
                 * to schedule this queue for work again to refill buffers.
                 */
                if (rx->fill_cnt - rx->cnt <= rx->db_threshold) {
                        gve_rx_write_doorbell(priv, rx);
                        return budget;
                }
        }

        gve_rx_write_doorbell(priv, rx);
        return cnts.total_pkt_cnt;
}

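/*
 * Worked example of the restock/doorbell policy above (illustrative,
 * assuming rx_desc_cnt == 1024 so db_threshold == 512):
 *
 *   QPL mode:  every processed descriptor flips or copies its buffer in
 *              place, so fill_cnt simply advances by work_done and the
 *              doorbell publishes the new value.
 *   RDA mode:  nothing is restocked until fewer than 512 buffers remain
 *              posted (fill_cnt - cnt <= 512).  If the refill cannot get
 *              back above that watermark (e.g. allocation failures), the
 *              doorbell is rung with whatever was posted and the full
 *              budget is returned so NAPI keeps the queue scheduled and
 *              retries soon.
 */
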
int gve_rx_poll(struct gve_notify_block *block, int budget)
{
        struct gve_rx_ring *rx = block->rx;
        netdev_features_t feat;
        int work_done = 0;

        feat = block->napi.dev->features;

        /* If budget is 0, do all the work */
        if (budget == 0)
                budget = INT_MAX;

        if (budget > 0)
                work_done = gve_clean_rx_done(rx, budget, feat);

        return work_done;
}