Commit | Line | Data |
---|---|---|
cdedef59 AV |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* Copyright (c) 2018, Intel Corporation. */ | |
3 | ||
4 | /* The driver transmit and receive code */ | |
5 | ||
6 | #include <linux/prefetch.h> | |
7 | #include <linux/mm.h> | |
efc2214b MF |
8 | #include <linux/bpf_trace.h> |
9 | #include <net/xdp.h> | |
0891d6d4 | 10 | #include "ice_txrx_lib.h" |
efc2214b | 11 | #include "ice_lib.h" |
cdedef59 | 12 | #include "ice.h" |
5f6aa50e | 13 | #include "ice_dcb_lib.h" |
2d4238f5 | 14 | #include "ice_xsk.h" |
cdedef59 | 15 | |
2b245cb2 AV |
16 | #define ICE_RX_HDR_SIZE 256 |
17 | ||
cdedef59 AV |
18 | /** |
19 | * ice_unmap_and_free_tx_buf - Release a Tx buffer | |
20 | * @ring: the ring that owns the buffer | |
21 | * @tx_buf: the buffer to free | |
22 | */ | |
23 | static void | |
24 | ice_unmap_and_free_tx_buf(struct ice_ring *ring, struct ice_tx_buf *tx_buf) | |
25 | { | |
26 | if (tx_buf->skb) { | |
efc2214b MF |
27 | if (ice_ring_is_xdp(ring)) |
28 | page_frag_free(tx_buf->raw_buf); | |
29 | else | |
30 | dev_kfree_skb_any(tx_buf->skb); | |
cdedef59 AV |
31 | if (dma_unmap_len(tx_buf, len)) |
32 | dma_unmap_single(ring->dev, | |
33 | dma_unmap_addr(tx_buf, dma), | |
34 | dma_unmap_len(tx_buf, len), | |
35 | DMA_TO_DEVICE); | |
36 | } else if (dma_unmap_len(tx_buf, len)) { | |
37 | dma_unmap_page(ring->dev, | |
38 | dma_unmap_addr(tx_buf, dma), | |
39 | dma_unmap_len(tx_buf, len), | |
40 | DMA_TO_DEVICE); | |
41 | } | |
42 | ||
43 | tx_buf->next_to_watch = NULL; | |
44 | tx_buf->skb = NULL; | |
45 | dma_unmap_len_set(tx_buf, len, 0); | |
46 | /* tx_buf must be completely set up in the transmit path */ | |
47 | } | |
48 | ||
49 | static struct netdev_queue *txring_txq(const struct ice_ring *ring) | |
50 | { | |
51 | return netdev_get_tx_queue(ring->netdev, ring->q_index); | |
52 | } | |
53 | ||
54 | /** | |
55 | * ice_clean_tx_ring - Free any empty Tx buffers | |
56 | * @tx_ring: ring to be cleaned | |
57 | */ | |
58 | void ice_clean_tx_ring(struct ice_ring *tx_ring) | |
59 | { | |
cdedef59 AV |
60 | u16 i; |
61 | ||
2d4238f5 KK |
62 | if (ice_ring_is_xdp(tx_ring) && tx_ring->xsk_umem) { |
63 | ice_xsk_clean_xdp_ring(tx_ring); | |
64 | goto tx_skip_free; | |
65 | } | |
66 | ||
cdedef59 AV |
67 | /* ring already cleared, nothing to do */ |
68 | if (!tx_ring->tx_buf) | |
69 | return; | |
70 | ||
2f2da36e | 71 | /* Free all the Tx ring sk_buffs */ |
cdedef59 AV |
72 | for (i = 0; i < tx_ring->count; i++) |
73 | ice_unmap_and_free_tx_buf(tx_ring, &tx_ring->tx_buf[i]); | |
74 | ||
2d4238f5 | 75 | tx_skip_free: |
c6dfd690 | 76 | memset(tx_ring->tx_buf, 0, sizeof(*tx_ring->tx_buf) * tx_ring->count); |
cdedef59 AV |
77 | |
78 | /* Zero out the descriptor ring */ | |
79 | memset(tx_ring->desc, 0, tx_ring->size); | |
80 | ||
81 | tx_ring->next_to_use = 0; | |
82 | tx_ring->next_to_clean = 0; | |
83 | ||
84 | if (!tx_ring->netdev) | |
85 | return; | |
86 | ||
87 | /* cleanup Tx queue statistics */ | |
88 | netdev_tx_reset_queue(txring_txq(tx_ring)); | |
89 | } | |
90 | ||
91 | /** | |
92 | * ice_free_tx_ring - Free Tx resources per queue | |
93 | * @tx_ring: Tx descriptor ring for a specific queue | |
94 | * | |
95 | * Free all transmit software resources | |
96 | */ | |
97 | void ice_free_tx_ring(struct ice_ring *tx_ring) | |
98 | { | |
99 | ice_clean_tx_ring(tx_ring); | |
100 | devm_kfree(tx_ring->dev, tx_ring->tx_buf); | |
101 | tx_ring->tx_buf = NULL; | |
102 | ||
103 | if (tx_ring->desc) { | |
104 | dmam_free_coherent(tx_ring->dev, tx_ring->size, | |
105 | tx_ring->desc, tx_ring->dma); | |
106 | tx_ring->desc = NULL; | |
107 | } | |
108 | } | |
109 | ||
2b245cb2 AV |
110 | /** |
111 | * ice_clean_tx_irq - Reclaim resources after transmit completes | |
2b245cb2 AV |
112 | * @tx_ring: Tx ring to clean |
113 | * @napi_budget: Used to determine if we are in netpoll | |
114 | * | |
115 | * Returns true if there's any budget left (i.e. the clean is finished) | |
116 | */ | |
2fb0821f | 117 | static bool ice_clean_tx_irq(struct ice_ring *tx_ring, int napi_budget) |
2b245cb2 AV |
118 | { |
119 | unsigned int total_bytes = 0, total_pkts = 0; | |
2fb0821f JB |
120 | unsigned int budget = ICE_DFLT_IRQ_WORK; |
121 | struct ice_vsi *vsi = tx_ring->vsi; | |
2b245cb2 AV |
122 | s16 i = tx_ring->next_to_clean; |
123 | struct ice_tx_desc *tx_desc; | |
124 | struct ice_tx_buf *tx_buf; | |
125 | ||
126 | tx_buf = &tx_ring->tx_buf[i]; | |
127 | tx_desc = ICE_TX_DESC(tx_ring, i); | |
128 | i -= tx_ring->count; | |
129 | ||
2fb0821f JB |
130 | prefetch(&vsi->state); |
131 | ||
2b245cb2 AV |
132 | do { |
133 | struct ice_tx_desc *eop_desc = tx_buf->next_to_watch; | |
134 | ||
135 | /* if next_to_watch is not set then there is no work pending */ | |
136 | if (!eop_desc) | |
137 | break; | |
138 | ||
139 | smp_rmb(); /* prevent any other reads prior to eop_desc */ | |
140 | ||
141 | /* if the descriptor isn't done, no work yet to do */ | |
142 | if (!(eop_desc->cmd_type_offset_bsz & | |
143 | cpu_to_le64(ICE_TX_DESC_DTYPE_DESC_DONE))) | |
144 | break; | |
145 | ||
146 | /* clear next_to_watch to prevent false hangs */ | |
147 | tx_buf->next_to_watch = NULL; | |
148 | ||
149 | /* update the statistics for this packet */ | |
150 | total_bytes += tx_buf->bytecount; | |
151 | total_pkts += tx_buf->gso_segs; | |
152 | ||
efc2214b MF |
153 | if (ice_ring_is_xdp(tx_ring)) |
154 | page_frag_free(tx_buf->raw_buf); | |
155 | else | |
156 | /* free the skb */ | |
157 | napi_consume_skb(tx_buf->skb, napi_budget); | |
2b245cb2 AV |
158 | |
159 | /* unmap skb header data */ | |
160 | dma_unmap_single(tx_ring->dev, | |
161 | dma_unmap_addr(tx_buf, dma), | |
162 | dma_unmap_len(tx_buf, len), | |
163 | DMA_TO_DEVICE); | |
164 | ||
165 | /* clear tx_buf data */ | |
166 | tx_buf->skb = NULL; | |
167 | dma_unmap_len_set(tx_buf, len, 0); | |
168 | ||
169 | /* unmap remaining buffers */ | |
170 | while (tx_desc != eop_desc) { | |
171 | tx_buf++; | |
172 | tx_desc++; | |
173 | i++; | |
174 | if (unlikely(!i)) { | |
175 | i -= tx_ring->count; | |
176 | tx_buf = tx_ring->tx_buf; | |
177 | tx_desc = ICE_TX_DESC(tx_ring, 0); | |
178 | } | |
179 | ||
180 | /* unmap any remaining paged data */ | |
181 | if (dma_unmap_len(tx_buf, len)) { | |
182 | dma_unmap_page(tx_ring->dev, | |
183 | dma_unmap_addr(tx_buf, dma), | |
184 | dma_unmap_len(tx_buf, len), | |
185 | DMA_TO_DEVICE); | |
186 | dma_unmap_len_set(tx_buf, len, 0); | |
187 | } | |
188 | } | |
189 | ||
190 | /* move us one more past the eop_desc for start of next pkt */ | |
191 | tx_buf++; | |
192 | tx_desc++; | |
193 | i++; | |
194 | if (unlikely(!i)) { | |
195 | i -= tx_ring->count; | |
196 | tx_buf = tx_ring->tx_buf; | |
197 | tx_desc = ICE_TX_DESC(tx_ring, 0); | |
198 | } | |
199 | ||
200 | prefetch(tx_desc); | |
201 | ||
202 | /* update budget accounting */ | |
203 | budget--; | |
204 | } while (likely(budget)); | |
205 | ||
206 | i += tx_ring->count; | |
207 | tx_ring->next_to_clean = i; | |
2d4238f5 KK |
208 | |
209 | ice_update_tx_ring_stats(tx_ring, total_pkts, total_bytes); | |
2b245cb2 | 210 | |
efc2214b MF |
211 | if (ice_ring_is_xdp(tx_ring)) |
212 | return !!budget; | |
213 | ||
2b245cb2 AV |
214 | netdev_tx_completed_queue(txring_txq(tx_ring), total_pkts, |
215 | total_bytes); | |
216 | ||
217 | #define TX_WAKE_THRESHOLD ((s16)(DESC_NEEDED * 2)) | |
218 | if (unlikely(total_pkts && netif_carrier_ok(tx_ring->netdev) && | |
219 | (ICE_DESC_UNUSED(tx_ring) >= TX_WAKE_THRESHOLD))) { | |
220 | /* Make sure that anybody stopping the queue after this | |
221 | * sees the new next_to_clean. | |
222 | */ | |
223 | smp_mb(); | |
224 | if (__netif_subqueue_stopped(tx_ring->netdev, | |
225 | tx_ring->q_index) && | |
2fb0821f | 226 | !test_bit(__ICE_DOWN, vsi->state)) { |
2b245cb2 AV |
227 | netif_wake_subqueue(tx_ring->netdev, |
228 | tx_ring->q_index); | |
229 | ++tx_ring->tx_stats.restart_q; | |
230 | } | |
231 | } | |
232 | ||
233 | return !!budget; | |
234 | } | |
235 | ||
cdedef59 AV |
236 | /** |
237 | * ice_setup_tx_ring - Allocate the Tx descriptors | |
d337f2af | 238 | * @tx_ring: the Tx ring to set up |
cdedef59 AV |
239 | * |
240 | * Return 0 on success, negative on error | |
241 | */ | |
242 | int ice_setup_tx_ring(struct ice_ring *tx_ring) | |
243 | { | |
244 | struct device *dev = tx_ring->dev; | |
cdedef59 AV |
245 | |
246 | if (!dev) | |
247 | return -ENOMEM; | |
248 | ||
249 | /* warn if we are about to overwrite the pointer */ | |
250 | WARN_ON(tx_ring->tx_buf); | |
c6dfd690 BA |
251 | tx_ring->tx_buf = |
252 | devm_kzalloc(dev, sizeof(*tx_ring->tx_buf) * tx_ring->count, | |
253 | GFP_KERNEL); | |
cdedef59 AV |
254 | if (!tx_ring->tx_buf) |
255 | return -ENOMEM; | |
256 | ||
ad71b256 | 257 | /* round up to nearest page */ |
c6dfd690 | 258 | tx_ring->size = ALIGN(tx_ring->count * sizeof(struct ice_tx_desc), |
ad71b256 | 259 | PAGE_SIZE); |
cdedef59 AV |
260 | tx_ring->desc = dmam_alloc_coherent(dev, tx_ring->size, &tx_ring->dma, |
261 | GFP_KERNEL); | |
262 | if (!tx_ring->desc) { | |
263 | dev_err(dev, "Unable to allocate memory for the Tx descriptor ring, size=%d\n", | |
264 | tx_ring->size); | |
265 | goto err; | |
266 | } | |
267 | ||
268 | tx_ring->next_to_use = 0; | |
269 | tx_ring->next_to_clean = 0; | |
b3969fd7 | 270 | tx_ring->tx_stats.prev_pkt = -1; |
cdedef59 AV |
271 | return 0; |
272 | ||
273 | err: | |
274 | devm_kfree(dev, tx_ring->tx_buf); | |
275 | tx_ring->tx_buf = NULL; | |
276 | return -ENOMEM; | |
277 | } | |
278 | ||
279 | /** | |
280 | * ice_clean_rx_ring - Free Rx buffers | |
281 | * @rx_ring: ring to be cleaned | |
282 | */ | |
283 | void ice_clean_rx_ring(struct ice_ring *rx_ring) | |
284 | { | |
285 | struct device *dev = rx_ring->dev; | |
cdedef59 AV |
286 | u16 i; |
287 | ||
288 | /* ring already cleared, nothing to do */ | |
289 | if (!rx_ring->rx_buf) | |
290 | return; | |
291 | ||
2d4238f5 KK |
292 | if (rx_ring->xsk_umem) { |
293 | ice_xsk_clean_rx_ring(rx_ring); | |
294 | goto rx_skip_free; | |
295 | } | |
296 | ||
cdedef59 AV |
297 | /* Free all the Rx ring sk_buffs */ |
298 | for (i = 0; i < rx_ring->count; i++) { | |
299 | struct ice_rx_buf *rx_buf = &rx_ring->rx_buf[i]; | |
300 | ||
301 | if (rx_buf->skb) { | |
302 | dev_kfree_skb(rx_buf->skb); | |
303 | rx_buf->skb = NULL; | |
304 | } | |
305 | if (!rx_buf->page) | |
306 | continue; | |
307 | ||
a65f71fe MF |
308 | /* Invalidate cache lines that may have been written to by |
309 | * device so that we avoid corrupting memory. | |
310 | */ | |
311 | dma_sync_single_range_for_cpu(dev, rx_buf->dma, | |
312 | rx_buf->page_offset, | |
7237f5b0 MF |
313 | rx_ring->rx_buf_len, |
314 | DMA_FROM_DEVICE); | |
a65f71fe MF |
315 | |
316 | /* free resources associated with mapping */ | |
7237f5b0 | 317 | dma_unmap_page_attrs(dev, rx_buf->dma, ice_rx_pg_size(rx_ring), |
a65f71fe | 318 | DMA_FROM_DEVICE, ICE_RX_DMA_ATTR); |
03c66a13 | 319 | __page_frag_cache_drain(rx_buf->page, rx_buf->pagecnt_bias); |
cdedef59 AV |
320 | |
321 | rx_buf->page = NULL; | |
322 | rx_buf->page_offset = 0; | |
323 | } | |
324 | ||
2d4238f5 | 325 | rx_skip_free: |
c6dfd690 | 326 | memset(rx_ring->rx_buf, 0, sizeof(*rx_ring->rx_buf) * rx_ring->count); |
cdedef59 AV |
327 | |
328 | /* Zero out the descriptor ring */ | |
329 | memset(rx_ring->desc, 0, rx_ring->size); | |
330 | ||
331 | rx_ring->next_to_alloc = 0; | |
332 | rx_ring->next_to_clean = 0; | |
333 | rx_ring->next_to_use = 0; | |
334 | } | |
335 | ||
336 | /** | |
337 | * ice_free_rx_ring - Free Rx resources | |
338 | * @rx_ring: ring to clean the resources from | |
339 | * | |
340 | * Free all receive software resources | |
341 | */ | |
342 | void ice_free_rx_ring(struct ice_ring *rx_ring) | |
343 | { | |
344 | ice_clean_rx_ring(rx_ring); | |
efc2214b MF |
345 | if (rx_ring->vsi->type == ICE_VSI_PF) |
346 | if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) | |
347 | xdp_rxq_info_unreg(&rx_ring->xdp_rxq); | |
348 | rx_ring->xdp_prog = NULL; | |
cdedef59 AV |
349 | devm_kfree(rx_ring->dev, rx_ring->rx_buf); |
350 | rx_ring->rx_buf = NULL; | |
351 | ||
352 | if (rx_ring->desc) { | |
353 | dmam_free_coherent(rx_ring->dev, rx_ring->size, | |
354 | rx_ring->desc, rx_ring->dma); | |
355 | rx_ring->desc = NULL; | |
356 | } | |
357 | } | |
358 | ||
359 | /** | |
360 | * ice_setup_rx_ring - Allocate the Rx descriptors | |
d337f2af | 361 | * @rx_ring: the Rx ring to set up |
cdedef59 AV |
362 | * |
363 | * Return 0 on success, negative on error | |
364 | */ | |
365 | int ice_setup_rx_ring(struct ice_ring *rx_ring) | |
366 | { | |
367 | struct device *dev = rx_ring->dev; | |
cdedef59 AV |
368 | |
369 | if (!dev) | |
370 | return -ENOMEM; | |
371 | ||
372 | /* warn if we are about to overwrite the pointer */ | |
373 | WARN_ON(rx_ring->rx_buf); | |
c6dfd690 BA |
374 | rx_ring->rx_buf = |
375 | devm_kzalloc(dev, sizeof(*rx_ring->rx_buf) * rx_ring->count, | |
376 | GFP_KERNEL); | |
cdedef59 AV |
377 | if (!rx_ring->rx_buf) |
378 | return -ENOMEM; | |
379 | ||
ad71b256 BC |
380 | /* round up to nearest page */ |
381 | rx_ring->size = ALIGN(rx_ring->count * sizeof(union ice_32byte_rx_desc), | |
382 | PAGE_SIZE); | |
cdedef59 AV |
383 | rx_ring->desc = dmam_alloc_coherent(dev, rx_ring->size, &rx_ring->dma, |
384 | GFP_KERNEL); | |
385 | if (!rx_ring->desc) { | |
386 | dev_err(dev, "Unable to allocate memory for the Rx descriptor ring, size=%d\n", | |
387 | rx_ring->size); | |
388 | goto err; | |
389 | } | |
390 | ||
391 | rx_ring->next_to_use = 0; | |
392 | rx_ring->next_to_clean = 0; | |
efc2214b MF |
393 | |
394 | if (ice_is_xdp_ena_vsi(rx_ring->vsi)) | |
395 | WRITE_ONCE(rx_ring->xdp_prog, rx_ring->vsi->xdp_prog); | |
396 | ||
397 | if (rx_ring->vsi->type == ICE_VSI_PF && | |
398 | !xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) | |
399 | if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, | |
400 | rx_ring->q_index)) | |
401 | goto err; | |
cdedef59 AV |
402 | return 0; |
403 | ||
404 | err: | |
405 | devm_kfree(dev, rx_ring->rx_buf); | |
406 | rx_ring->rx_buf = NULL; | |
407 | return -ENOMEM; | |
408 | } | |
409 | ||
efc2214b MF |
410 | /** |
411 | * ice_rx_offset - Return expected offset into page to access data | |
412 | * @rx_ring: Ring we are requesting offset of | |
413 | * | |
414 | * Returns the offset value for ring into the data buffer. | |
415 | */ | |
416 | static unsigned int ice_rx_offset(struct ice_ring *rx_ring) | |
417 | { | |
59bb0808 MF |
418 | if (ice_ring_uses_build_skb(rx_ring)) |
419 | return ICE_SKB_PAD; | |
420 | else if (ice_is_xdp_ena_vsi(rx_ring->vsi)) | |
421 | return XDP_PACKET_HEADROOM; | |
422 | ||
423 | return 0; | |
efc2214b MF |
424 | } |
425 | ||
efc2214b MF |
426 | /** |
427 | * ice_run_xdp - Executes an XDP program on initialized xdp_buff | |
428 | * @rx_ring: Rx ring | |
429 | * @xdp: xdp_buff used as input to the XDP program | |
430 | * @xdp_prog: XDP program to run | |
431 | * | |
432 | * Returns any of ICE_XDP_{PASS, CONSUMED, TX, REDIR} | |
433 | */ | |
434 | static int | |
435 | ice_run_xdp(struct ice_ring *rx_ring, struct xdp_buff *xdp, | |
436 | struct bpf_prog *xdp_prog) | |
437 | { | |
438 | int err, result = ICE_XDP_PASS; | |
439 | struct ice_ring *xdp_ring; | |
440 | u32 act; | |
441 | ||
442 | act = bpf_prog_run_xdp(xdp_prog, xdp); | |
443 | switch (act) { | |
444 | case XDP_PASS: | |
445 | break; | |
446 | case XDP_TX: | |
447 | xdp_ring = rx_ring->vsi->xdp_rings[smp_processor_id()]; | |
448 | result = ice_xmit_xdp_buff(xdp, xdp_ring); | |
449 | break; | |
450 | case XDP_REDIRECT: | |
451 | err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog); | |
452 | result = !err ? ICE_XDP_REDIR : ICE_XDP_CONSUMED; | |
453 | break; | |
454 | default: | |
455 | bpf_warn_invalid_xdp_action(act); | |
456 | /* fallthrough -- not supported action */ | |
457 | case XDP_ABORTED: | |
458 | trace_xdp_exception(rx_ring->netdev, xdp_prog, act); | |
459 | /* fallthrough -- handle aborts by dropping frame */ | |
460 | case XDP_DROP: | |
461 | result = ICE_XDP_CONSUMED; | |
462 | break; | |
463 | } | |
464 | ||
465 | return result; | |
466 | } | |
467 | ||
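The switch in ice_run_xdp() above acts on whatever verdict the attached BPF program returns. As a point of reference only, a minimal XDP program producing those verdicts could look like the hedged sketch below; it is not part of this driver (struct xdp_md and the XDP_* return codes are the standard BPF UAPI, the program name and logic are made up for the example).

/* Illustrative only: a tiny XDP program whose return value is what
 * ice_run_xdp() switches on. Built separately with clang -target bpf.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("xdp")
int xdp_drop_runts(struct xdp_md *ctx)
{
	void *data = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;

	/* drop frames shorter than a minimum Ethernet frame, pass the rest */
	if (data + 60 > data_end)
		return XDP_DROP;
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";
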
468 | /** | |
469 | * ice_xdp_xmit - submit packets to XDP ring for transmission | |
470 | * @dev: netdev | |
471 | * @n: number of XDP frames to be transmitted | |
472 | * @frames: XDP frames to be transmitted | |
473 | * @flags: transmit flags | |
474 | * | |
475 | * Returns number of frames successfully sent. Frames that fail are | |
476 | * freed via the XDP return API. | |
477 | * For error cases, a negative errno code is returned and no frames | |
478 | * are transmitted (caller must handle freeing frames). | |
479 | */ | |
480 | int | |
481 | ice_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, | |
482 | u32 flags) | |
483 | { | |
484 | struct ice_netdev_priv *np = netdev_priv(dev); | |
485 | unsigned int queue_index = smp_processor_id(); | |
486 | struct ice_vsi *vsi = np->vsi; | |
487 | struct ice_ring *xdp_ring; | |
488 | int drops = 0, i; | |
489 | ||
490 | if (test_bit(__ICE_DOWN, vsi->state)) | |
491 | return -ENETDOWN; | |
492 | ||
493 | if (!ice_is_xdp_ena_vsi(vsi) || queue_index >= vsi->num_xdp_txq) | |
494 | return -ENXIO; | |
495 | ||
496 | if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) | |
497 | return -EINVAL; | |
498 | ||
499 | xdp_ring = vsi->xdp_rings[queue_index]; | |
500 | for (i = 0; i < n; i++) { | |
501 | struct xdp_frame *xdpf = frames[i]; | |
502 | int err; | |
503 | ||
504 | err = ice_xmit_xdp_ring(xdpf->data, xdpf->len, xdp_ring); | |
505 | if (err != ICE_XDP_TX) { | |
506 | xdp_return_frame_rx_napi(xdpf); | |
507 | drops++; | |
508 | } | |
509 | } | |
510 | ||
511 | if (unlikely(flags & XDP_XMIT_FLUSH)) | |
512 | ice_xdp_ring_update_tail(xdp_ring); | |
513 | ||
514 | return n - drops; | |
515 | } | |
516 | ||
cdedef59 AV |
517 | /** |
518 | * ice_alloc_mapped_page - recycle or make a new page | |
519 | * @rx_ring: ring to use | |
520 | * @bi: rx_buf struct to modify | |
521 | * | |
522 | * Returns true if the page was successfully allocated or | |
523 | * reused. | |
524 | */ | |
c8b7abdd BA |
525 | static bool |
526 | ice_alloc_mapped_page(struct ice_ring *rx_ring, struct ice_rx_buf *bi) | |
cdedef59 AV |
527 | { |
528 | struct page *page = bi->page; | |
529 | dma_addr_t dma; | |
530 | ||
531 | /* since we are recycling buffers we should seldom need to alloc */ | |
2b245cb2 AV |
532 | if (likely(page)) { |
533 | rx_ring->rx_stats.page_reuse_count++; | |
cdedef59 | 534 | return true; |
2b245cb2 | 535 | } |
cdedef59 AV |
536 | |
537 | /* alloc new page for storage */ | |
7237f5b0 | 538 | page = dev_alloc_pages(ice_rx_pg_order(rx_ring)); |
2b245cb2 AV |
539 | if (unlikely(!page)) { |
540 | rx_ring->rx_stats.alloc_page_failed++; | |
cdedef59 | 541 | return false; |
2b245cb2 | 542 | } |
cdedef59 AV |
543 | |
544 | /* map page for use */ | |
7237f5b0 | 545 | dma = dma_map_page_attrs(rx_ring->dev, page, 0, ice_rx_pg_size(rx_ring), |
a65f71fe | 546 | DMA_FROM_DEVICE, ICE_RX_DMA_ATTR); |
cdedef59 AV |
547 | |
548 | /* if mapping failed free memory back to system since | |
549 | * there isn't much point in holding memory we can't use | |
550 | */ | |
551 | if (dma_mapping_error(rx_ring->dev, dma)) { | |
7237f5b0 | 552 | __free_pages(page, ice_rx_pg_order(rx_ring)); |
2b245cb2 | 553 | rx_ring->rx_stats.alloc_page_failed++; |
cdedef59 AV |
554 | return false; |
555 | } | |
556 | ||
557 | bi->dma = dma; | |
558 | bi->page = page; | |
efc2214b | 559 | bi->page_offset = ice_rx_offset(rx_ring); |
03c66a13 MF |
560 | page_ref_add(page, USHRT_MAX - 1); |
561 | bi->pagecnt_bias = USHRT_MAX; | |
cdedef59 AV |
562 | |
563 | return true; | |
564 | } | |
565 | ||
566 | /** | |
567 | * ice_alloc_rx_bufs - Replace used receive buffers | |
568 | * @rx_ring: ring to place buffers on | |
569 | * @cleaned_count: number of buffers to replace | |
570 | * | |
cb7db356 BC |
571 | * Returns false if all allocations were successful, true if any fail. Returning |
572 | * true signals to the caller that we didn't replace cleaned_count buffers and | |
573 | * there is more work to do. | |
574 | * | |
575 | * First, try to clean "cleaned_count" Rx buffers. Then refill the cleaned Rx | |
576 | * buffers. Then bump tail at most one time. Grouping like this lets us avoid | |
577 | * multiple tail writes per call. | |
cdedef59 AV |
578 | */ |
579 | bool ice_alloc_rx_bufs(struct ice_ring *rx_ring, u16 cleaned_count) | |
580 | { | |
581 | union ice_32b_rx_flex_desc *rx_desc; | |
582 | u16 ntu = rx_ring->next_to_use; | |
583 | struct ice_rx_buf *bi; | |
584 | ||
585 | /* do nothing if no valid netdev defined */ | |
586 | if (!rx_ring->netdev || !cleaned_count) | |
587 | return false; | |
588 | ||
f9867df6 | 589 | /* get the Rx descriptor and buffer based on next_to_use */ |
cdedef59 AV |
590 | rx_desc = ICE_RX_DESC(rx_ring, ntu); |
591 | bi = &rx_ring->rx_buf[ntu]; | |
592 | ||
593 | do { | |
a1e99685 | 594 | /* if we fail here, we have work remaining */ |
cdedef59 | 595 | if (!ice_alloc_mapped_page(rx_ring, bi)) |
a1e99685 | 596 | break; |
cdedef59 | 597 | |
a65f71fe MF |
598 | /* sync the buffer for use by the device */ |
599 | dma_sync_single_range_for_device(rx_ring->dev, bi->dma, | |
600 | bi->page_offset, | |
7237f5b0 | 601 | rx_ring->rx_buf_len, |
a65f71fe MF |
602 | DMA_FROM_DEVICE); |
603 | ||
cdedef59 AV |
604 | /* Refresh the desc even if buffer_addrs didn't change |
605 | * because each write-back erases this info. | |
606 | */ | |
607 | rx_desc->read.pkt_addr = cpu_to_le64(bi->dma + bi->page_offset); | |
608 | ||
609 | rx_desc++; | |
610 | bi++; | |
611 | ntu++; | |
612 | if (unlikely(ntu == rx_ring->count)) { | |
613 | rx_desc = ICE_RX_DESC(rx_ring, 0); | |
614 | bi = rx_ring->rx_buf; | |
615 | ntu = 0; | |
616 | } | |
617 | ||
618 | /* clear the status bits for the next_to_use descriptor */ | |
619 | rx_desc->wb.status_error0 = 0; | |
620 | ||
621 | cleaned_count--; | |
622 | } while (cleaned_count); | |
623 | ||
624 | if (rx_ring->next_to_use != ntu) | |
625 | ice_release_rx_desc(rx_ring, ntu); | |
626 | ||
a1e99685 | 627 | return !!cleaned_count; |
cdedef59 | 628 | } |
2b245cb2 AV |
629 | |
630 | /** | |
631 | * ice_page_is_reserved - check if reuse is possible | |
632 | * @page: page struct to check | |
633 | */ | |
634 | static bool ice_page_is_reserved(struct page *page) | |
635 | { | |
636 | return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page); | |
637 | } | |
638 | ||
639 | /** | |
1d032bc7 MF |
640 | * ice_rx_buf_adjust_pg_offset - Prepare Rx buffer for reuse |
641 | * @rx_buf: Rx buffer to adjust | |
642 | * @size: Size of adjustment | |
2b245cb2 | 643 | * |
1d032bc7 MF |
644 | * Update the offset within page so that Rx buf will be ready to be reused. |
645 | * For systems with PAGE_SIZE < 8192 this function will flip the page offset | |
646 | * so the second half of the page assigned to the Rx buffer will be used; | |
4ee656bb | 647 | * otherwise the offset is moved by "size" bytes. |
2b245cb2 | 648 | */ |
1d032bc7 MF |
649 | static void |
650 | ice_rx_buf_adjust_pg_offset(struct ice_rx_buf *rx_buf, unsigned int size) | |
2b245cb2 AV |
651 | { |
652 | #if (PAGE_SIZE < 8192) | |
1d032bc7 MF |
653 | /* flip page offset to other buffer */ |
654 | rx_buf->page_offset ^= size; | |
2b245cb2 | 655 | #else |
1d032bc7 MF |
656 | /* move offset up to the next cache line */ |
657 | rx_buf->page_offset += size; | |
658 | #endif | |
659 | } | |
2b245cb2 | 660 | |
bbb97808 MF |
661 | /** |
662 | * ice_can_reuse_rx_page - Determine if page can be reused for another Rx | |
663 | * @rx_buf: buffer containing the page | |
bbb97808 MF |
664 | * |
665 | * If page is reusable, we have a green light for calling ice_reuse_rx_page, | |
666 | * which will assign the current buffer to the buffer that next_to_alloc is | |
667 | * pointing to; otherwise, the DMA mapping needs to be destroyed and | |
668 | * page freed | |
669 | */ | |
1d032bc7 | 670 | static bool ice_can_reuse_rx_page(struct ice_rx_buf *rx_buf) |
bbb97808 | 671 | { |
03c66a13 | 672 | unsigned int pagecnt_bias = rx_buf->pagecnt_bias; |
bbb97808 | 673 | struct page *page = rx_buf->page; |
2b245cb2 AV |
674 | |
675 | /* avoid re-using remote pages */ | |
676 | if (unlikely(ice_page_is_reserved(page))) | |
677 | return false; | |
678 | ||
679 | #if (PAGE_SIZE < 8192) | |
680 | /* if we are only owner of page we can reuse it */ | |
03c66a13 | 681 | if (unlikely((page_count(page) - pagecnt_bias) > 1)) |
2b245cb2 | 682 | return false; |
2b245cb2 | 683 | #else |
7237f5b0 MF |
684 | #define ICE_LAST_OFFSET \ |
685 | (SKB_WITH_OVERHEAD(PAGE_SIZE) - ICE_RXBUF_2048) | |
686 | if (rx_buf->page_offset > ICE_LAST_OFFSET) | |
2b245cb2 AV |
687 | return false; |
688 | #endif /* PAGE_SIZE < 8192) */ | |
689 | ||
03c66a13 MF |
690 | /* If we have drained the page fragment pool we need to update |
691 | * the pagecnt_bias and page count so that we fully restock the | |
692 | * number of references the driver holds. | |
2b245cb2 | 693 | */ |
03c66a13 MF |
694 | if (unlikely(pagecnt_bias == 1)) { |
695 | page_ref_add(page, USHRT_MAX - 1); | |
696 | rx_buf->pagecnt_bias = USHRT_MAX; | |
697 | } | |
2b245cb2 AV |
698 | |
699 | return true; | |
700 | } | |
701 | ||
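A rough user-space sketch of the pagecnt_bias scheme used by ice_alloc_mapped_page() and ice_can_reuse_rx_page(): take a large page refcount once, pay references out of a local bias on the hot path, and restock when the bias is nearly drained. The struct fields and plain integers standing in for page_count()/page_ref_add() are assumptions for the demo, not driver API.

/* Hedged illustration of the pagecnt_bias bookkeeping; not driver code. */
#include <stdbool.h>
#include <stdio.h>

struct demo_buf {
	unsigned int page_refs;      /* stands in for page_count(page) */
	unsigned short pagecnt_bias; /* references still "owned" by the driver */
};

static void demo_charge(struct demo_buf *b)
{
	b->pagecnt_bias--;           /* hand one reference to the stack */
}

static bool demo_can_reuse(struct demo_buf *b)
{
	/* only reusable if nobody but the driver still holds the page */
	if (b->page_refs - b->pagecnt_bias > 1)
		return false;
	/* restock the bias once it is nearly drained */
	if (b->pagecnt_bias == 1) {
		b->page_refs += 0xFFFF - 1;
		b->pagecnt_bias = 0xFFFF;
	}
	return true;
}

int main(void)
{
	struct demo_buf b = { .page_refs = 0xFFFF, .pagecnt_bias = 0xFFFF };

	demo_charge(&b);
	printf("reusable: %d\n", demo_can_reuse(&b)); /* 1: only driver refs remain */
	return 0;
}
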
2b245cb2 | 702 | /** |
712edbbb | 703 | * ice_add_rx_frag - Add contents of Rx buffer to sk_buff as a frag |
7237f5b0 | 704 | * @rx_ring: Rx descriptor ring to transact packets on |
2b245cb2 | 705 | * @rx_buf: buffer containing page to add |
712edbbb MF |
706 | * @skb: sk_buff to place the data into |
707 | * @size: packet length from rx_desc | |
2b245cb2 AV |
708 | * |
709 | * This function will add the data contained in rx_buf->page to the skb. | |
712edbbb MF |
710 | * It will just attach the page as a frag to the skb. |
711 | * The function will then update the page offset. | |
2b245cb2 | 712 | */ |
1d032bc7 | 713 | static void |
7237f5b0 MF |
714 | ice_add_rx_frag(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf, |
715 | struct sk_buff *skb, unsigned int size) | |
2b245cb2 | 716 | { |
712edbbb | 717 | #if (PAGE_SIZE >= 8192) |
59bb0808 | 718 | unsigned int truesize = SKB_DATA_ALIGN(size + ice_rx_offset(rx_ring)); |
2b245cb2 | 719 | #else |
7237f5b0 | 720 | unsigned int truesize = ice_rx_pg_size(rx_ring) / 2; |
712edbbb | 721 | #endif |
1857ca42 | 722 | |
ac6f733a MW |
723 | if (!size) |
724 | return; | |
712edbbb MF |
725 | skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_buf->page, |
726 | rx_buf->page_offset, size, truesize); | |
1857ca42 | 727 | |
712edbbb | 728 | /* page is being used so we must update the page offset */ |
1d032bc7 | 729 | ice_rx_buf_adjust_pg_offset(rx_buf, truesize); |
2b245cb2 AV |
730 | } |
731 | ||
732 | /** | |
733 | * ice_reuse_rx_page - page flip buffer and store it back on the ring | |
d337f2af | 734 | * @rx_ring: Rx descriptor ring to store buffers on |
2b245cb2 AV |
735 | * @old_buf: donor buffer to have page reused |
736 | * | |
737 | * Synchronizes page for reuse by the adapter | |
738 | */ | |
c8b7abdd BA |
739 | static void |
740 | ice_reuse_rx_page(struct ice_ring *rx_ring, struct ice_rx_buf *old_buf) | |
2b245cb2 AV |
741 | { |
742 | u16 nta = rx_ring->next_to_alloc; | |
743 | struct ice_rx_buf *new_buf; | |
744 | ||
745 | new_buf = &rx_ring->rx_buf[nta]; | |
746 | ||
747 | /* update, and store next to alloc */ | |
748 | nta++; | |
749 | rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0; | |
750 | ||
712edbbb MF |
751 | /* Transfer page from old buffer to new buffer. |
752 | * Move each member individually to avoid possible store | |
753 | * forwarding stalls and unnecessary copy of skb. | |
754 | */ | |
755 | new_buf->dma = old_buf->dma; | |
756 | new_buf->page = old_buf->page; | |
757 | new_buf->page_offset = old_buf->page_offset; | |
758 | new_buf->pagecnt_bias = old_buf->pagecnt_bias; | |
2b245cb2 AV |
759 | } |
760 | ||
761 | /** | |
6c869cb7 | 762 | * ice_get_rx_buf - Fetch Rx buffer and synchronize data for use |
d337f2af | 763 | * @rx_ring: Rx descriptor ring to transact packets on |
712edbbb | 764 | * @skb: skb to be used |
6c869cb7 | 765 | * @size: size of buffer to add to skb |
2b245cb2 | 766 | * |
6c869cb7 MF |
767 | * This function will pull an Rx buffer from the ring and synchronize it |
768 | * for use by the CPU. | |
2b245cb2 | 769 | */ |
6c869cb7 | 770 | static struct ice_rx_buf * |
712edbbb MF |
771 | ice_get_rx_buf(struct ice_ring *rx_ring, struct sk_buff **skb, |
772 | const unsigned int size) | |
2b245cb2 AV |
773 | { |
774 | struct ice_rx_buf *rx_buf; | |
2b245cb2 AV |
775 | |
776 | rx_buf = &rx_ring->rx_buf[rx_ring->next_to_clean]; | |
6c869cb7 | 777 | prefetchw(rx_buf->page); |
712edbbb | 778 | *skb = rx_buf->skb; |
6c869cb7 | 779 | |
ac6f733a MW |
780 | if (!size) |
781 | return rx_buf; | |
6c869cb7 MF |
782 | /* we are reusing so sync this buffer for CPU use */ |
783 | dma_sync_single_range_for_cpu(rx_ring->dev, rx_buf->dma, | |
784 | rx_buf->page_offset, size, | |
785 | DMA_FROM_DEVICE); | |
2b245cb2 | 786 | |
03c66a13 MF |
787 | /* We have pulled a buffer for use, so decrement pagecnt_bias */ |
788 | rx_buf->pagecnt_bias--; | |
2b245cb2 | 789 | |
6c869cb7 MF |
790 | return rx_buf; |
791 | } | |
2b245cb2 | 792 | |
aaf27254 MF |
793 | /** |
794 | * ice_build_skb - Build skb around an existing buffer | |
795 | * @rx_ring: Rx descriptor ring to transact packets on | |
796 | * @rx_buf: Rx buffer to pull data from | |
797 | * @xdp: xdp_buff pointing to the data | |
798 | * | |
799 | * This function builds an skb around an existing Rx buffer, taking care | |
800 | * to set up the skb correctly and avoid any memcpy overhead. | |
801 | */ | |
802 | static struct sk_buff * | |
803 | ice_build_skb(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf, | |
804 | struct xdp_buff *xdp) | |
805 | { | |
806 | unsigned int metasize = xdp->data - xdp->data_meta; | |
807 | #if (PAGE_SIZE < 8192) | |
808 | unsigned int truesize = ice_rx_pg_size(rx_ring) / 2; | |
809 | #else | |
810 | unsigned int truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) + | |
811 | SKB_DATA_ALIGN(xdp->data_end - | |
812 | xdp->data_hard_start); | |
813 | #endif | |
814 | struct sk_buff *skb; | |
815 | ||
816 | /* Prefetch first cache line of first page. If xdp->data_meta | |
817 | * is unused, this points exactly as xdp->data, otherwise we | |
818 | * likely have a consumer accessing first few bytes of meta | |
819 | * data, and then actual data. | |
820 | */ | |
821 | prefetch(xdp->data_meta); | |
822 | #if L1_CACHE_BYTES < 128 | |
823 | prefetch((void *)(xdp->data + L1_CACHE_BYTES)); | |
824 | #endif | |
825 | /* build an skb around the page buffer */ | |
826 | skb = build_skb(xdp->data_hard_start, truesize); | |
827 | if (unlikely(!skb)) | |
828 | return NULL; | |
829 | ||
830 | /* must record the Rx queue, otherwise OS features such as | |
831 | * symmetric queues won't work | |
832 | */ | |
833 | skb_record_rx_queue(skb, rx_ring->q_index); | |
834 | ||
835 | /* update pointers within the skb to store the data */ | |
836 | skb_reserve(skb, xdp->data - xdp->data_hard_start); | |
837 | __skb_put(skb, xdp->data_end - xdp->data); | |
838 | if (metasize) | |
839 | skb_metadata_set(skb, metasize); | |
840 | ||
841 | /* buffer is used by skb, update page_offset */ | |
842 | ice_rx_buf_adjust_pg_offset(rx_buf, truesize); | |
843 | ||
844 | return skb; | |
845 | } | |
846 | ||
2b245cb2 | 847 | /** |
712edbbb | 848 | * ice_construct_skb - Allocate skb and populate it |
d337f2af | 849 | * @rx_ring: Rx descriptor ring to transact packets on |
6c869cb7 | 850 | * @rx_buf: Rx buffer to pull data from |
efc2214b | 851 | * @xdp: xdp_buff pointing to the data |
2b245cb2 | 852 | * |
712edbbb MF |
853 | * This function allocates an skb. It then populates it with the page |
854 | * data from the current receive descriptor, taking care to set up the | |
855 | * skb correctly. | |
2b245cb2 | 856 | */ |
c8b7abdd | 857 | static struct sk_buff * |
712edbbb | 858 | ice_construct_skb(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf, |
efc2214b | 859 | struct xdp_buff *xdp) |
2b245cb2 | 860 | { |
efc2214b | 861 | unsigned int size = xdp->data_end - xdp->data; |
712edbbb MF |
862 | unsigned int headlen; |
863 | struct sk_buff *skb; | |
2b245cb2 | 864 | |
712edbbb | 865 | /* prefetch first cache line of first page */ |
efc2214b | 866 | prefetch(xdp->data); |
2b245cb2 | 867 | #if L1_CACHE_BYTES < 128 |
efc2214b | 868 | prefetch((void *)(xdp->data + L1_CACHE_BYTES)); |
2b245cb2 AV |
869 | #endif /* L1_CACHE_BYTES */ |
870 | ||
712edbbb MF |
871 | /* allocate a skb to store the frags */ |
872 | skb = __napi_alloc_skb(&rx_ring->q_vector->napi, ICE_RX_HDR_SIZE, | |
873 | GFP_ATOMIC | __GFP_NOWARN); | |
874 | if (unlikely(!skb)) | |
875 | return NULL; | |
2b245cb2 | 876 | |
712edbbb MF |
877 | skb_record_rx_queue(skb, rx_ring->q_index); |
878 | /* Determine available headroom for copy */ | |
879 | headlen = size; | |
880 | if (headlen > ICE_RX_HDR_SIZE) | |
efc2214b | 881 | headlen = eth_get_headlen(skb->dev, xdp->data, ICE_RX_HDR_SIZE); |
2b245cb2 | 882 | |
712edbbb | 883 | /* align pull length to size of long to optimize memcpy performance */ |
efc2214b MF |
884 | memcpy(__skb_put(skb, headlen), xdp->data, ALIGN(headlen, |
885 | sizeof(long))); | |
2b245cb2 | 886 | |
712edbbb MF |
887 | /* if we exhaust the linear part then add what is left as a frag */ |
888 | size -= headlen; | |
889 | if (size) { | |
890 | #if (PAGE_SIZE >= 8192) | |
891 | unsigned int truesize = SKB_DATA_ALIGN(size); | |
892 | #else | |
7237f5b0 | 893 | unsigned int truesize = ice_rx_pg_size(rx_ring) / 2; |
712edbbb MF |
894 | #endif |
895 | skb_add_rx_frag(skb, 0, rx_buf->page, | |
896 | rx_buf->page_offset + headlen, size, truesize); | |
897 | /* buffer is used by skb, update page_offset */ | |
898 | ice_rx_buf_adjust_pg_offset(rx_buf, truesize); | |
2b245cb2 | 899 | } else { |
712edbbb MF |
900 | /* buffer is unused, reset bias back to rx_buf; data was copied |
901 | * onto skb's linear part so there's no need for adjusting | |
902 | * page offset and we can reuse this buffer as-is | |
903 | */ | |
904 | rx_buf->pagecnt_bias++; | |
2b245cb2 AV |
905 | } |
906 | ||
2b245cb2 AV |
907 | return skb; |
908 | } | |
909 | ||
910 | /** | |
1d032bc7 MF |
911 | * ice_put_rx_buf - Clean up used buffer and either recycle or free |
912 | * @rx_ring: Rx descriptor ring to transact packets on | |
913 | * @rx_buf: Rx buffer to pull data from | |
2b245cb2 | 914 | * |
efc2214b MF |
915 | * This function will update next_to_clean and then clean up the contents |
916 | * of the rx_buf. It will either recycle the buffer or unmap it and free | |
917 | * the associated resources. | |
2b245cb2 | 918 | */ |
1d032bc7 | 919 | static void ice_put_rx_buf(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf) |
2b245cb2 | 920 | { |
efc2214b MF |
921 | u32 ntc = rx_ring->next_to_clean + 1; |
922 | ||
923 | /* fetch, update, and store next to clean */ | |
924 | ntc = (ntc < rx_ring->count) ? ntc : 0; | |
925 | rx_ring->next_to_clean = ntc; | |
926 | ||
ac6f733a MW |
927 | if (!rx_buf) |
928 | return; | |
929 | ||
1d032bc7 | 930 | if (ice_can_reuse_rx_page(rx_buf)) { |
ac6f733a | 931 | /* hand second half of page back to the ring */ |
2b245cb2 AV |
932 | ice_reuse_rx_page(rx_ring, rx_buf); |
933 | rx_ring->rx_stats.page_reuse_count++; | |
934 | } else { | |
935 | /* we are not reusing the buffer so unmap it */ | |
7237f5b0 MF |
936 | dma_unmap_page_attrs(rx_ring->dev, rx_buf->dma, |
937 | ice_rx_pg_size(rx_ring), DMA_FROM_DEVICE, | |
938 | ICE_RX_DMA_ATTR); | |
03c66a13 | 939 | __page_frag_cache_drain(rx_buf->page, rx_buf->pagecnt_bias); |
2b245cb2 | 940 | } |
2b245cb2 | 941 | |
2b245cb2 AV |
942 | /* clear contents of buffer_info */ |
943 | rx_buf->page = NULL; | |
712edbbb | 944 | rx_buf->skb = NULL; |
2b245cb2 AV |
945 | } |
946 | ||
2b245cb2 AV |
947 | /** |
948 | * ice_is_non_eop - process handling of non-EOP buffers | |
949 | * @rx_ring: Rx ring being processed | |
950 | * @rx_desc: Rx descriptor for current buffer | |
951 | * @skb: Current socket buffer containing buffer in progress | |
952 | * | |
efc2214b MF |
953 | * If the buffer is an EOP buffer, this function exits returning false, |
954 | * otherwise return true indicating that this is in fact a non-EOP buffer. | |
2b245cb2 | 955 | */ |
c8b7abdd BA |
956 | static bool |
957 | ice_is_non_eop(struct ice_ring *rx_ring, union ice_32b_rx_flex_desc *rx_desc, | |
958 | struct sk_buff *skb) | |
2b245cb2 | 959 | { |
2b245cb2 AV |
960 | /* if we are the last buffer then there is nothing else to do */ |
961 | #define ICE_RXD_EOF BIT(ICE_RX_FLEX_DESC_STATUS0_EOF_S) | |
962 | if (likely(ice_test_staterr(rx_desc, ICE_RXD_EOF))) | |
963 | return false; | |
964 | ||
965 | /* place skb in next buffer to be received */ | |
efc2214b | 966 | rx_ring->rx_buf[rx_ring->next_to_clean].skb = skb; |
2b245cb2 AV |
967 | rx_ring->rx_stats.non_eop_descs++; |
968 | ||
969 | return true; | |
970 | } | |
971 | ||
2b245cb2 AV |
972 | /** |
973 | * ice_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf | |
d337f2af | 974 | * @rx_ring: Rx descriptor ring to transact packets on |
2b245cb2 AV |
975 | * @budget: Total limit on number of packets to process |
976 | * | |
977 | * This function provides a "bounce buffer" approach to Rx interrupt | |
df17b7e0 | 978 | * processing. The advantage to this is that on systems that have |
2b245cb2 AV |
979 | * expensive overhead for IOMMU access this provides a means of avoiding |
980 | * it by maintaining the mapping of the page to the system. | |
981 | * | |
982 | * Returns amount of work completed | |
983 | */ | |
984 | static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) | |
985 | { | |
986 | unsigned int total_rx_bytes = 0, total_rx_pkts = 0; | |
987 | u16 cleaned_count = ICE_DESC_UNUSED(rx_ring); | |
efc2214b MF |
988 | unsigned int xdp_res, xdp_xmit = 0; |
989 | struct bpf_prog *xdp_prog = NULL; | |
990 | struct xdp_buff xdp; | |
cb7db356 | 991 | bool failure; |
2b245cb2 | 992 | |
efc2214b MF |
993 | xdp.rxq = &rx_ring->xdp_rxq; |
994 | ||
f9867df6 | 995 | /* start the loop to process Rx packets bounded by 'budget' */ |
2b245cb2 AV |
996 | while (likely(total_rx_pkts < (unsigned int)budget)) { |
997 | union ice_32b_rx_flex_desc *rx_desc; | |
6c869cb7 | 998 | struct ice_rx_buf *rx_buf; |
2b245cb2 | 999 | struct sk_buff *skb; |
6c869cb7 | 1000 | unsigned int size; |
2b245cb2 AV |
1001 | u16 stat_err_bits; |
1002 | u16 vlan_tag = 0; | |
d76a60ba | 1003 | u8 rx_ptype; |
2b245cb2 | 1004 | |
f9867df6 | 1005 | /* get the Rx desc from Rx ring based on 'next_to_clean' */ |
2b245cb2 AV |
1006 | rx_desc = ICE_RX_DESC(rx_ring, rx_ring->next_to_clean); |
1007 | ||
1008 | /* status_error_len will always be zero for unused descriptors | |
1009 | * because it's cleared in cleanup, and overlaps with hdr_addr | |
1010 | * which is always zero because packet split isn't used, if the | |
1011 | * hardware wrote DD then it will be non-zero | |
1012 | */ | |
1013 | stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_DD_S); | |
1014 | if (!ice_test_staterr(rx_desc, stat_err_bits)) | |
1015 | break; | |
1016 | ||
1017 | /* This memory barrier is needed to keep us from reading | |
1018 | * any other fields out of the rx_desc until we know the | |
1019 | * DD bit is set. | |
1020 | */ | |
1021 | dma_rmb(); | |
1022 | ||
6c869cb7 MF |
1023 | size = le16_to_cpu(rx_desc->wb.pkt_len) & |
1024 | ICE_RX_FLX_DESC_PKT_LEN_M; | |
1025 | ||
ac6f733a | 1026 | /* retrieve a buffer from the ring */ |
712edbbb | 1027 | rx_buf = ice_get_rx_buf(rx_ring, &skb, size); |
ac6f733a | 1028 | |
efc2214b MF |
1029 | if (!size) { |
1030 | xdp.data = NULL; | |
1031 | xdp.data_end = NULL; | |
aaf27254 MF |
1032 | xdp.data_hard_start = NULL; |
1033 | xdp.data_meta = NULL; | |
efc2214b MF |
1034 | goto construct_skb; |
1035 | } | |
1036 | ||
1037 | xdp.data = page_address(rx_buf->page) + rx_buf->page_offset; | |
1038 | xdp.data_hard_start = xdp.data - ice_rx_offset(rx_ring); | |
aaf27254 | 1039 | xdp.data_meta = xdp.data; |
efc2214b MF |
1040 | xdp.data_end = xdp.data + size; |
1041 | ||
1042 | rcu_read_lock(); | |
1043 | xdp_prog = READ_ONCE(rx_ring->xdp_prog); | |
1044 | if (!xdp_prog) { | |
1045 | rcu_read_unlock(); | |
1046 | goto construct_skb; | |
1047 | } | |
1048 | ||
1049 | xdp_res = ice_run_xdp(rx_ring, &xdp, xdp_prog); | |
1050 | rcu_read_unlock(); | |
59bb0808 MF |
1051 | if (!xdp_res) |
1052 | goto construct_skb; | |
1053 | if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR)) { | |
1054 | unsigned int truesize; | |
7237f5b0 MF |
1055 | |
1056 | #if (PAGE_SIZE < 8192) | |
59bb0808 | 1057 | truesize = ice_rx_pg_size(rx_ring) / 2; |
7237f5b0 | 1058 | #else |
59bb0808 MF |
1059 | truesize = SKB_DATA_ALIGN(ice_rx_offset(rx_ring) + |
1060 | size); | |
7237f5b0 | 1061 | #endif |
59bb0808 MF |
1062 | xdp_xmit |= xdp_res; |
1063 | ice_rx_buf_adjust_pg_offset(rx_buf, truesize); | |
1064 | } else { | |
1065 | rx_buf->pagecnt_bias++; | |
efc2214b | 1066 | } |
59bb0808 MF |
1067 | total_rx_bytes += size; |
1068 | total_rx_pkts++; | |
1069 | ||
1070 | cleaned_count++; | |
1071 | ice_put_rx_buf(rx_ring, rx_buf); | |
1072 | continue; | |
efc2214b | 1073 | construct_skb: |
1f45ebe0 | 1074 | if (skb) { |
7237f5b0 | 1075 | ice_add_rx_frag(rx_ring, rx_buf, skb, size); |
1f45ebe0 MW |
1076 | } else if (likely(xdp.data)) { |
1077 | if (ice_ring_uses_build_skb(rx_ring)) | |
1078 | skb = ice_build_skb(rx_ring, rx_buf, &xdp); | |
1079 | else | |
1080 | skb = ice_construct_skb(rx_ring, rx_buf, &xdp); | |
1f45ebe0 | 1081 | } |
712edbbb MF |
1082 | /* exit if we failed to retrieve a buffer */ |
1083 | if (!skb) { | |
1084 | rx_ring->rx_stats.alloc_buf_failed++; | |
ac6f733a MW |
1085 | if (rx_buf) |
1086 | rx_buf->pagecnt_bias++; | |
2b245cb2 | 1087 | break; |
712edbbb | 1088 | } |
2b245cb2 | 1089 | |
1d032bc7 | 1090 | ice_put_rx_buf(rx_ring, rx_buf); |
2b245cb2 AV |
1091 | cleaned_count++; |
1092 | ||
1093 | /* skip if it is NOP desc */ | |
1094 | if (ice_is_non_eop(rx_ring, rx_desc, skb)) | |
1095 | continue; | |
1096 | ||
1097 | stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_RXE_S); | |
1098 | if (unlikely(ice_test_staterr(rx_desc, stat_err_bits))) { | |
1099 | dev_kfree_skb_any(skb); | |
1100 | continue; | |
1101 | } | |
1102 | ||
1103 | stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_L2TAG1P_S); | |
1104 | if (ice_test_staterr(rx_desc, stat_err_bits)) | |
1105 | vlan_tag = le16_to_cpu(rx_desc->wb.l2tag1); | |
1106 | ||
133f4883 KK |
1107 | /* pad the skb if needed, to make a valid ethernet frame */ |
1108 | if (eth_skb_pad(skb)) { | |
2b245cb2 AV |
1109 | skb = NULL; |
1110 | continue; | |
1111 | } | |
1112 | ||
1113 | /* probably a little skewed due to removing CRC */ | |
1114 | total_rx_bytes += skb->len; | |
1115 | ||
d76a60ba | 1116 | /* populate checksum, VLAN, and protocol */ |
6503b659 JB |
1117 | rx_ptype = le16_to_cpu(rx_desc->wb.ptype_flex_flags0) & |
1118 | ICE_RX_FLEX_DESC_PTYPE_M; | |
1119 | ||
d76a60ba AV |
1120 | ice_process_skb_fields(rx_ring, rx_desc, skb, rx_ptype); |
1121 | ||
2b245cb2 AV |
1122 | /* send completed skb up the stack */ |
1123 | ice_receive_skb(rx_ring, skb, vlan_tag); | |
1124 | ||
1125 | /* update budget accounting */ | |
1126 | total_rx_pkts++; | |
1127 | } | |
1128 | ||
cb7db356 BC |
1129 | /* return up to cleaned_count buffers to hardware */ |
1130 | failure = ice_alloc_rx_bufs(rx_ring, cleaned_count); | |
1131 | ||
efc2214b MF |
1132 | if (xdp_prog) |
1133 | ice_finalize_xdp_rx(rx_ring, xdp_xmit); | |
1134 | ||
2d4238f5 | 1135 | ice_update_rx_ring_stats(rx_ring, total_rx_pkts, total_rx_bytes); |
2b245cb2 AV |
1136 | |
1137 | /* guarantee a trip back through this routine if there was a failure */ | |
1138 | return failure ? budget : (int)total_rx_pkts; | |
1139 | } | |
1140 | ||
711987bb BC |
1141 | /** |
1142 | * ice_adjust_itr_by_size_and_speed - Adjust ITR based on current traffic | |
1143 | * @port_info: port_info structure containing the current link speed | |
1144 | * @avg_pkt_size: average size of Tx or Rx packets based on clean routine | |
2f2da36e | 1145 | * @itr: ITR value to update |
711987bb BC |
1146 | * |
1147 | * Calculate how big of an increment should be applied to the ITR value passed | |
1148 | * in based on wmem_default, SKB overhead, Ethernet overhead, and the current | |
1149 | * link speed. | |
1150 | * | |
1151 | * The following is a calculation derived from: | |
1152 | * wmem_default / (size + overhead) = desired_pkts_per_int | |
1153 | * rate / bits_per_byte / (size + Ethernet overhead) = pkt_rate | |
1154 | * (desired_pkt_rate / pkt_rate) * usecs_per_sec = ITR value | |
1155 | * | |
1156 | * Assuming wmem_default is 212992 and overhead is 640 bytes per | |
1157 | * packet, (256 skb, 64 headroom, 320 shared info), we can reduce the | |
1158 | * formula down to: | |
1159 | * | |
1160 | *       wmem_default * bits_per_byte * usecs_per_sec     pkt_size + 24 | |
1161 | * ITR = -------------------------------------------- * -------------- | |
1162 | *                         rate                           pkt_size + 640 | |
1163 | */ | |
1164 | static unsigned int | |
1165 | ice_adjust_itr_by_size_and_speed(struct ice_port_info *port_info, | |
1166 | unsigned int avg_pkt_size, | |
1167 | unsigned int itr) | |
64a59d05 | 1168 | { |
711987bb BC |
1169 | switch (port_info->phy.link_info.link_speed) { |
1170 | case ICE_AQ_LINK_SPEED_100GB: | |
1171 | itr += DIV_ROUND_UP(17 * (avg_pkt_size + 24), | |
1172 | avg_pkt_size + 640); | |
1173 | break; | |
1174 | case ICE_AQ_LINK_SPEED_50GB: | |
1175 | itr += DIV_ROUND_UP(34 * (avg_pkt_size + 24), | |
1176 | avg_pkt_size + 640); | |
1177 | break; | |
64a59d05 | 1178 | case ICE_AQ_LINK_SPEED_40GB: |
711987bb BC |
1179 | itr += DIV_ROUND_UP(43 * (avg_pkt_size + 24), |
1180 | avg_pkt_size + 640); | |
1181 | break; | |
64a59d05 | 1182 | case ICE_AQ_LINK_SPEED_25GB: |
711987bb BC |
1183 | itr += DIV_ROUND_UP(68 * (avg_pkt_size + 24), |
1184 | avg_pkt_size + 640); | |
1185 | break; | |
64a59d05 | 1186 | case ICE_AQ_LINK_SPEED_20GB: |
711987bb BC |
1187 | itr += DIV_ROUND_UP(85 * (avg_pkt_size + 24), |
1188 | avg_pkt_size + 640); | |
1189 | break; | |
1190 | case ICE_AQ_LINK_SPEED_10GB: | |
1191 | /* fall through */ | |
64a59d05 | 1192 | default: |
711987bb BC |
1193 | itr += DIV_ROUND_UP(170 * (avg_pkt_size + 24), |
1194 | avg_pkt_size + 640); | |
1195 | break; | |
64a59d05 | 1196 | } |
711987bb BC |
1197 | |
1198 | if ((itr & ICE_ITR_MASK) > ICE_ITR_ADAPTIVE_MAX_USECS) { | |
1199 | itr &= ICE_ITR_ADAPTIVE_LATENCY; | |
1200 | itr += ICE_ITR_ADAPTIVE_MAX_USECS; | |
1201 | } | |
1202 | ||
1203 | return itr; | |
64a59d05 AV |
1204 | } |
1205 | ||
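A worked example of the adjustment above (arithmetic only, outside the driver): at 25G with an average packet of 1500 bytes, the increment is DIV_ROUND_UP(68 * (1500 + 24), 1500 + 640) = 49 usecs. The small program below reproduces that, reusing the 25G factor from the switch statement; everything else is illustrative.

/* Hedged standalone check of the ITR increment arithmetic. */
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned int avg_pkt_size = 1500;
	unsigned int factor_25g = 68;   /* from the ICE_AQ_LINK_SPEED_25GB case */
	unsigned int inc;

	inc = DIV_ROUND_UP(factor_25g * (avg_pkt_size + 24), avg_pkt_size + 640);
	printf("ITR increment at 25G, 1500B packets: %u usecs\n", inc);
	return 0;
}
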
1206 | /** | |
1207 | * ice_update_itr - update the adaptive ITR value based on statistics | |
1208 | * @q_vector: structure containing interrupt and ring information | |
1209 | * @rc: structure containing ring performance data | |
1210 | * | |
1211 | * Stores a new ITR value based on packets and byte | |
1212 | * counts during the last interrupt. The advantage of per interrupt | |
1213 | * computation is faster updates and more accurate ITR for the current | |
1214 | * traffic pattern. Constants in this function were computed | |
1215 | * based on theoretical maximum wire speed and thresholds were set based | |
1216 | * on testing data as well as attempting to minimize response time | |
1217 | * while increasing bulk throughput. | |
1218 | */ | |
1219 | static void | |
1220 | ice_update_itr(struct ice_q_vector *q_vector, struct ice_ring_container *rc) | |
1221 | { | |
64a59d05 | 1222 | unsigned long next_update = jiffies; |
711987bb | 1223 | unsigned int packets, bytes, itr; |
64a59d05 AV |
1224 | bool container_is_rx; |
1225 | ||
1226 | if (!rc->ring || !ITR_IS_DYNAMIC(rc->itr_setting)) | |
1227 | return; | |
1228 | ||
1229 | /* If itr_countdown is set it means we programmed an ITR within | |
1230 | * the last 4 interrupt cycles. This has a side effect of us | |
1231 | * potentially firing an early interrupt. In order to work around | |
1232 | * this we need to throw out any data received for a few | |
1233 | * interrupts following the update. | |
1234 | */ | |
1235 | if (q_vector->itr_countdown) { | |
1236 | itr = rc->target_itr; | |
1237 | goto clear_counts; | |
1238 | } | |
1239 | ||
1240 | container_is_rx = (&q_vector->rx == rc); | |
1241 | /* For Rx we want to push the delay up and default to low latency. | |
1242 | * for Tx we want to pull the delay down and default to high latency. | |
1243 | */ | |
1244 | itr = container_is_rx ? | |
1245 | ICE_ITR_ADAPTIVE_MIN_USECS | ICE_ITR_ADAPTIVE_LATENCY : | |
1246 | ICE_ITR_ADAPTIVE_MAX_USECS | ICE_ITR_ADAPTIVE_LATENCY; | |
1247 | ||
1248 | /* If we didn't update within up to 1 - 2 jiffies we can assume | |
1249 | * that either packets are coming in so slow there hasn't been | |
1250 | * any work, or that there is so much work that NAPI is dealing | |
1251 | * with interrupt moderation and we don't need to do anything. | |
1252 | */ | |
1253 | if (time_after(next_update, rc->next_update)) | |
1254 | goto clear_counts; | |
1255 | ||
d27525ec JB |
1256 | prefetch(q_vector->vsi->port_info); |
1257 | ||
64a59d05 AV |
1258 | packets = rc->total_pkts; |
1259 | bytes = rc->total_bytes; | |
1260 | ||
1261 | if (container_is_rx) { | |
1262 | /* If Rx there are 1 to 4 packets and bytes are less than | |
1263 | * 9000 assume insufficient data to use bulk rate limiting | |
1264 | * approach unless Tx is already in bulk rate limiting. We | |
1265 | * are likely latency driven. | |
1266 | */ | |
1267 | if (packets && packets < 4 && bytes < 9000 && | |
1268 | (q_vector->tx.target_itr & ICE_ITR_ADAPTIVE_LATENCY)) { | |
1269 | itr = ICE_ITR_ADAPTIVE_LATENCY; | |
711987bb | 1270 | goto adjust_by_size_and_speed; |
64a59d05 AV |
1271 | } |
1272 | } else if (packets < 4) { | |
1273 | /* If we have Tx and Rx ITR maxed and Tx ITR is running in | |
1274 | * bulk mode and we are receiving 4 or fewer packets just | |
1275 | * reset the ITR_ADAPTIVE_LATENCY bit for latency mode so | |
1276 | * that the Rx can relax. | |
1277 | */ | |
1278 | if (rc->target_itr == ICE_ITR_ADAPTIVE_MAX_USECS && | |
1279 | (q_vector->rx.target_itr & ICE_ITR_MASK) == | |
1280 | ICE_ITR_ADAPTIVE_MAX_USECS) | |
1281 | goto clear_counts; | |
1282 | } else if (packets > 32) { | |
1283 | /* If we have processed over 32 packets in a single interrupt | |
1284 | * for Tx assume we need to switch over to "bulk" mode. | |
1285 | */ | |
1286 | rc->target_itr &= ~ICE_ITR_ADAPTIVE_LATENCY; | |
1287 | } | |
1288 | ||
1289 | /* We have no packets to actually measure against. This means | |
1290 | * either one of the other queues on this vector is active or | |
1291 | * we are a Tx queue doing TSO with too high of an interrupt rate. | |
1292 | * | |
1293 | * Between 4 and 56 we can assume that our current interrupt delay | |
1294 | * is only slightly too low. As such we should increase it by a small | |
1295 | * fixed amount. | |
1296 | */ | |
1297 | if (packets < 56) { | |
1298 | itr = rc->target_itr + ICE_ITR_ADAPTIVE_MIN_INC; | |
1299 | if ((itr & ICE_ITR_MASK) > ICE_ITR_ADAPTIVE_MAX_USECS) { | |
1300 | itr &= ICE_ITR_ADAPTIVE_LATENCY; | |
1301 | itr += ICE_ITR_ADAPTIVE_MAX_USECS; | |
1302 | } | |
1303 | goto clear_counts; | |
1304 | } | |
1305 | ||
1306 | if (packets <= 256) { | |
1307 | itr = min(q_vector->tx.current_itr, q_vector->rx.current_itr); | |
1308 | itr &= ICE_ITR_MASK; | |
1309 | ||
1310 | /* Between 56 and 112 is our "goldilocks" zone where we are | |
1311 | * working out "just right". Just report that our current | |
1312 | * ITR is good for us. | |
1313 | */ | |
1314 | if (packets <= 112) | |
1315 | goto clear_counts; | |
1316 | ||
1317 | /* If packet count is 128 or greater we are likely looking | |
1318 | * at a slight overrun of the delay we want. Try halving | |
1319 | * our delay to see if that will cut the number of packets | |
1320 | * in half per interrupt. | |
1321 | */ | |
1322 | itr >>= 1; | |
1323 | itr &= ICE_ITR_MASK; | |
1324 | if (itr < ICE_ITR_ADAPTIVE_MIN_USECS) | |
1325 | itr = ICE_ITR_ADAPTIVE_MIN_USECS; | |
1326 | ||
1327 | goto clear_counts; | |
1328 | } | |
1329 | ||
1330 | /* The paths below assume we are dealing with a bulk ITR since | |
1331 | * number of packets is greater than 256. We are just going to have | |
1332 | * to compute a value and try to bring the count under control, | |
1333 | * though for smaller packet sizes there isn't much we can do as | |
1334 | * NAPI polling will likely be kicking in sooner rather than later. | |
1335 | */ | |
1336 | itr = ICE_ITR_ADAPTIVE_BULK; | |
1337 | ||
711987bb | 1338 | adjust_by_size_and_speed: |
64a59d05 | 1339 | |
711987bb BC |
1340 | /* based on checks above packets cannot be 0 so division is safe */ |
1341 | itr = ice_adjust_itr_by_size_and_speed(q_vector->vsi->port_info, | |
1342 | bytes / packets, itr); | |
64a59d05 AV |
1343 | |
1344 | clear_counts: | |
1345 | /* write back value */ | |
1346 | rc->target_itr = itr; | |
1347 | ||
1348 | /* next update should occur within next jiffy */ | |
1349 | rc->next_update = next_update + 1; | |
1350 | ||
1351 | rc->total_bytes = 0; | |
1352 | rc->total_pkts = 0; | |
1353 | } | |
1354 | ||
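The adaptive logic above works in packet-count bands (roughly fewer than 56, 56-112, 113-256, and more than 256 packets per interrupt). As a hedged summary only, the sketch below restates those bands as a standalone classifier; the band edges come from the comments in ice_update_itr(), and the action strings are paraphrases rather than driver behavior.

/* Illustrative classifier for the packet-count bands described above. */
#include <stdio.h>

static const char *itr_band(unsigned int packets)
{
	if (packets < 56)
		return "delay slightly low: bump ITR by a small fixed amount";
	if (packets <= 112)
		return "goldilocks zone: keep the current ITR";
	if (packets <= 256)
		return "slight overrun: halve the current ITR";
	return "bulk traffic: recompute ITR from size and link speed";
}

int main(void)
{
	unsigned int samples[] = { 10, 80, 200, 400 };

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("%u pkts/interrupt -> %s\n", samples[i], itr_band(samples[i]));
	return 0;
}
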
63f545ed BC |
1355 | /** |
1356 | * ice_buildreg_itr - build value for writing to the GLINT_DYN_CTL register | |
1357 | * @itr_idx: interrupt throttling index | |
64a59d05 | 1358 | * @itr: interrupt throttling value in usecs |
63f545ed | 1359 | */ |
8244dd2d | 1360 | static u32 ice_buildreg_itr(u16 itr_idx, u16 itr) |
63f545ed | 1361 | { |
2f2da36e | 1362 | /* The ITR value is reported in microseconds, and the register value is |
64a59d05 AV |
1363 | * recorded in 2 microsecond units. For this reason we only need to |
1364 | * shift by the GLINT_DYN_CTL_INTERVAL_S - ICE_ITR_GRAN_S to apply this | |
1365 | * granularity as a shift instead of division. The mask makes sure the | |
1366 | * ITR value is never odd so we don't accidentally write into the field | |
1367 | * prior to the ITR field. | |
1368 | */ | |
1369 | itr &= ICE_ITR_MASK; | |
1370 | ||
63f545ed BC |
1371 | return GLINT_DYN_CTL_INTENA_M | GLINT_DYN_CTL_CLEARPBA_M | |
1372 | (itr_idx << GLINT_DYN_CTL_ITR_INDX_S) | | |
64a59d05 | 1373 | (itr << (GLINT_DYN_CTL_INTERVAL_S - ICE_ITR_GRAN_S)); |
63f545ed BC |
1374 | } |
1375 | ||
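A small standalone check of the comment in ice_buildreg_itr(): because the interval field counts 2-usec units, shifting the usec value by one bit less than the field offset is equivalent to dividing by two and then shifting into place, as long as the value is even. The field offset below is an assumed demo value, not the real GLINT_DYN_CTL layout.

/* Hedged illustration of the shift-instead-of-divide trick. */
#include <stdio.h>

#define DEMO_INTERVAL_S 5   /* assumed start bit of the interval field */
#define DEMO_GRAN_S     1   /* log2 of the 2-usec ITR granularity */

int main(void)
{
	unsigned int itr = 50;  /* usecs, already masked to an even value */
	unsigned int a = (itr / 2) << DEMO_INTERVAL_S;
	unsigned int b = itr << (DEMO_INTERVAL_S - DEMO_GRAN_S);

	printf("divide-then-shift: %#x, single shift: %#x\n", a, b);
	return 0;
}
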
64a59d05 AV |
1376 | /* The act of updating the ITR will cause it to immediately trigger. In order |
1377 | * to prevent this from throwing off adaptive update statistics we defer the | |
1378 | * update so that it can only happen so often. So after either Tx or Rx are | |
1379 | * updated we make the adaptive scheme wait until either the ITR completely | |
1380 | * expires via the next_update expiration or we have been through at least | |
1381 | * 3 interrupts. | |
1382 | */ | |
1383 | #define ITR_COUNTDOWN_START 3 | |
1384 | ||
63f545ed BC |
1385 | /** |
1386 | * ice_update_ena_itr - Update ITR and re-enable MSIX interrupt | |
63f545ed BC |
1387 | * @q_vector: q_vector for which ITR is being updated and interrupt enabled |
1388 | */ | |
2fb0821f | 1389 | static void ice_update_ena_itr(struct ice_q_vector *q_vector) |
63f545ed | 1390 | { |
64a59d05 AV |
1391 | struct ice_ring_container *tx = &q_vector->tx; |
1392 | struct ice_ring_container *rx = &q_vector->rx; | |
2fb0821f | 1393 | struct ice_vsi *vsi = q_vector->vsi; |
63f545ed BC |
1394 | u32 itr_val; |
1395 | ||
2ab28bb0 BC |
1396 | /* when exiting WB_ON_ITR let's set a low ITR value and trigger |
1397 | * interrupts to expire right away in case we have more work ready to go | |
1398 | * already | |
1399 | */ | |
1400 | if (q_vector->itr_countdown == ICE_IN_WB_ON_ITR_MODE) { | |
1401 | itr_val = ice_buildreg_itr(rx->itr_idx, ICE_WB_ON_ITR_USECS); | |
1402 | wr32(&vsi->back->hw, GLINT_DYN_CTL(q_vector->reg_idx), itr_val); | |
1403 | /* set target back to last user set value */ | |
1404 | rx->target_itr = rx->itr_setting; | |
1405 | /* set current to what we just wrote and dynamic if needed */ | |
1406 | rx->current_itr = ICE_WB_ON_ITR_USECS | | |
1407 | (rx->itr_setting & ICE_ITR_DYNAMIC); | |
1408 | /* allow normal interrupt flow to start */ | |
1409 | q_vector->itr_countdown = 0; | |
1410 | return; | |
1411 | } | |
1412 | ||
64a59d05 AV |
1413 | /* This will do nothing if dynamic updates are not enabled */ |
1414 | ice_update_itr(q_vector, tx); | |
1415 | ice_update_itr(q_vector, rx); | |
1416 | ||
63f545ed BC |
1417 | /* This block of logic allows us to get away with only updating |
1418 | * one ITR value with each interrupt. The idea is to perform a | |
1419 | * pseudo-lazy update with the following criteria. | |
1420 | * | |
1421 | * 1. Rx is given higher priority than Tx if both are in same state | |
1422 | * 2. If we must reduce an ITR that is given highest priority. | |
1423 | * 3. We then give priority to increasing ITR based on amount. | |
1424 | */ | |
64a59d05 | 1425 | if (rx->target_itr < rx->current_itr) { |
63f545ed | 1426 | /* Rx ITR needs to be reduced, this is highest priority */ |
64a59d05 AV |
1427 | itr_val = ice_buildreg_itr(rx->itr_idx, rx->target_itr); |
1428 | rx->current_itr = rx->target_itr; | |
1429 | q_vector->itr_countdown = ITR_COUNTDOWN_START; | |
1430 | } else if ((tx->target_itr < tx->current_itr) || | |
1431 | ((rx->target_itr - rx->current_itr) < | |
1432 | (tx->target_itr - tx->current_itr))) { | |
63f545ed BC |
1433 | /* Tx ITR needs to be reduced, this is second priority |
1434 | * Tx ITR needs to be increased more than Rx, fourth priority | |
1435 | */ | |
64a59d05 AV |
1436 | itr_val = ice_buildreg_itr(tx->itr_idx, tx->target_itr); |
1437 | tx->current_itr = tx->target_itr; | |
1438 | q_vector->itr_countdown = ITR_COUNTDOWN_START; | |
1439 | } else if (rx->current_itr != rx->target_itr) { | |
63f545ed | 1440 | /* Rx ITR needs to be increased, third priority */ |
64a59d05 AV |
1441 | itr_val = ice_buildreg_itr(rx->itr_idx, rx->target_itr); |
1442 | rx->current_itr = rx->target_itr; | |
1443 | q_vector->itr_countdown = ITR_COUNTDOWN_START; | |
63f545ed BC |
1444 | } else { |
1445 | /* Still have to re-enable the interrupts */ | |
1446 | itr_val = ice_buildreg_itr(ICE_ITR_NONE, 0); | |
64a59d05 AV |
1447 | if (q_vector->itr_countdown) |
1448 | q_vector->itr_countdown--; | |
63f545ed BC |
1449 | } |
1450 | ||
2fb0821f JB |
1451 | if (!test_bit(__ICE_DOWN, q_vector->vsi->state)) |
1452 | wr32(&q_vector->vsi->back->hw, | |
b07833a0 | 1453 | GLINT_DYN_CTL(q_vector->reg_idx), |
64a59d05 | 1454 | itr_val); |
63f545ed BC |
1455 | } |
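/* Example walk-through of the priority logic above (hypothetical numbers):
 * with rx->current_itr = 50 / rx->target_itr = 30 and tx->current_itr = 20 /
 * tx->target_itr = 40, the Rx reduction wins and 30 is written this
 * interrupt; the Tx increase has to wait for a later pass once the
 * itr_countdown / next_update window allows another update.
 */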
1456 | ||
2ab28bb0 BC |
1457 | /** |
1458 | * ice_set_wb_on_itr - set WB_ON_ITR for this q_vector | |
2ab28bb0 BC |
1459 | * @q_vector: q_vector to set WB_ON_ITR on |
1460 | * | |
1461 | * We need to tell hardware to write-back completed descriptors even when | |
1462 | * interrupts are disabled. Descriptors will be written back on cache line | |
1463 | * boundaries without WB_ON_ITR enabled, but if we don't enable WB_ON_ITR | |
1464 | * descriptors may not be written back if they don't fill a cache line until the | |
1465 | * next interrupt. | |
1466 | * | |
1467 | * This sets the write-back frequency to 2 microseconds as that is the minimum | |
1468 | * value that's not 0 due to ITR granularity. Also, set the INTENA_MSK bit to | |
1469 | * make sure hardware knows we aren't meddling with the INTENA_M bit. | |
1470 | */ | |
2fb0821f | 1471 | static void ice_set_wb_on_itr(struct ice_q_vector *q_vector) |
2ab28bb0 | 1472 | { |
2fb0821f JB |
1473 | struct ice_vsi *vsi = q_vector->vsi; |
1474 | ||
2ab28bb0 BC |
1475 | /* already in WB_ON_ITR mode no need to change it */ |
1476 | if (q_vector->itr_countdown == ICE_IN_WB_ON_ITR_MODE) | |
1477 | return; | |
1478 | ||
1479 | if (q_vector->num_ring_rx) | |
1480 | wr32(&vsi->back->hw, GLINT_DYN_CTL(q_vector->reg_idx), | |
1481 | ICE_GLINT_DYN_CTL_WB_ON_ITR(ICE_WB_ON_ITR_USECS, | |
1482 | ICE_RX_ITR)); | |
1483 | ||
1484 | if (q_vector->num_ring_tx) | |
1485 | wr32(&vsi->back->hw, GLINT_DYN_CTL(q_vector->reg_idx), | |
1486 | ICE_GLINT_DYN_CTL_WB_ON_ITR(ICE_WB_ON_ITR_USECS, | |
1487 | ICE_TX_ITR)); | |
1488 | ||
1489 | q_vector->itr_countdown = ICE_IN_WB_ON_ITR_MODE; | |
1490 | } | |
1491 | ||
2b245cb2 AV |
1492 | /** |
1493 | * ice_napi_poll - NAPI polling Rx/Tx cleanup routine | |
1494 | * @napi: napi struct with our device's info in it
1495 | * @budget: amount of work driver is allowed to do this pass, in packets | |
1496 | * | |
1497 | * This function will clean all queues associated with a q_vector. | |
1498 | * | |
1499 | * Returns the amount of work done | |
1500 | */ | |
1501 | int ice_napi_poll(struct napi_struct *napi, int budget) | |
1502 | { | |
1503 | struct ice_q_vector *q_vector = | |
1504 | container_of(napi, struct ice_q_vector, napi); | |
2b245cb2 | 1505 | bool clean_complete = true; |
2b245cb2 | 1506 | struct ice_ring *ring; |
9118fcd5 | 1507 | int budget_per_ring; |
2b245cb2 AV |
1508 | int work_done = 0; |
1509 | ||
1510 | /* Since the actual Tx work is minimal, we can give the Tx a larger | |
1511 | * budget and be more aggressive about cleaning up the Tx descriptors. | |
1512 | */ | |
2d4238f5 KK |
1513 | ice_for_each_ring(ring, q_vector->tx) { |
1514 | bool wd = ring->xsk_umem ? | |
1515 | ice_clean_tx_irq_zc(ring, budget) : | |
1516 | ice_clean_tx_irq(ring, budget); | |
1517 | ||
1518 | if (!wd) | |
2b245cb2 | 1519 | clean_complete = false; |
2d4238f5 | 1520 | } |
2b245cb2 AV |
1521 | |
1522 | /* Handle case where we are called by netpoll with a budget of 0 */ | |
d27525ec | 1523 | if (unlikely(budget <= 0)) |
2b245cb2 AV |
1524 | return budget; |
1525 | ||
9118fcd5 BC |
1526 | /* normally we have 1 Rx ring per q_vector */ |
1527 | if (unlikely(q_vector->num_ring_rx > 1)) | |
1528 | /* We attempt to distribute budget to each Rx queue fairly, but | |
1529 | * don't allow the budget to go below 1 because that would exit | |
1530 | * polling early. | |
1531 | */ | |
2b245cb2 | 1532 | budget_per_ring = max(budget / q_vector->num_ring_rx, 1); |
9118fcd5 BC |
1533 | else |
1534 | /* Max of 1 Rx ring in this q_vector so give it the budget */ | |
1535 | budget_per_ring = budget; | |
2b245cb2 AV |
1536 | |
1537 | ice_for_each_ring(ring, q_vector->rx) { | |
1538 | int cleaned; | |
1539 | ||
2d4238f5 KK |
1540 | /* A dedicated path for zero-copy allows making a single |
1541 | * comparison in the irq context instead of many inside the | |
1542 | * ice_clean_rx_irq function and makes the codebase cleaner. | |
1543 | */ | |
1544 | cleaned = ring->xsk_umem ? | |
1545 | ice_clean_rx_irq_zc(ring, budget_per_ring) : | |
1546 | ice_clean_rx_irq(ring, budget_per_ring); | |
2b245cb2 AV |
1547 | work_done += cleaned; |
1548 | /* if we clean as many as budgeted, we must not be done */ | |
1549 | if (cleaned >= budget_per_ring) | |
1550 | clean_complete = false; | |
1551 | } | |
1552 | ||
1553 | /* If work not completed, return budget and polling will return */ | |
1554 | if (!clean_complete) | |
1555 | return budget; | |
1556 | ||
0bcd952f JB |
1557 | /* Exit the polling mode, but don't re-enable interrupts if stack might |
1558 | * poll us due to busy-polling | |
1559 | */ | |
1560 | if (likely(napi_complete_done(napi, work_done))) | |
2fb0821f | 1561 | ice_update_ena_itr(q_vector); |
2ab28bb0 | 1562 | else |
2fb0821f | 1563 | ice_set_wb_on_itr(q_vector); |
e0c9fd9b | 1564 | |
32a64994 | 1565 | return min_t(int, work_done, budget - 1); |
2b245cb2 AV |
1566 | } |
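/* Budget split example (illustrative): with the typical NAPI budget of 64
 * and a q_vector that owns 4 Rx rings, budget_per_ring is max(64 / 4, 1) =
 * 16, so each ring may clean at most 16 packets this poll; if any ring uses
 * its full share, clean_complete stays false and the whole budget is
 * returned so the NAPI core schedules another poll.
 */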
1567 | ||
2b245cb2 | 1568 | /** |
d337f2af | 1569 | * __ice_maybe_stop_tx - 2nd level check for Tx stop conditions |
2b245cb2 AV |
1570 | * @tx_ring: the ring to be checked |
1571 | * @size: the number of Tx descriptors we want to ensure are available
1572 | * | |
1573 | * Returns -EBUSY if a stop is needed, else 0 | |
1574 | */ | |
1575 | static int __ice_maybe_stop_tx(struct ice_ring *tx_ring, unsigned int size) | |
1576 | { | |
1577 | netif_stop_subqueue(tx_ring->netdev, tx_ring->q_index); | |
1578 | /* Memory barrier before checking head and tail */ | |
1579 | smp_mb(); | |
1580 | ||
1581 | /* Check again in a case another CPU has just made room available. */ | |
1582 | if (likely(ICE_DESC_UNUSED(tx_ring) < size)) | |
1583 | return -EBUSY; | |
1584 | ||
1585 | /* A reprieve! - use start_subqueue because it doesn't call schedule */ | |
1586 | netif_start_subqueue(tx_ring->netdev, tx_ring->q_index); | |
1587 | ++tx_ring->tx_stats.restart_q; | |
1588 | return 0; | |
1589 | } | |
1590 | ||
1591 | /** | |
d337f2af | 1592 | * ice_maybe_stop_tx - 1st level check for Tx stop conditions |
2b245cb2 AV |
1593 | * @tx_ring: the ring to be checked |
1594 | * @size: the number of Tx descriptors we want to ensure are available
1595 | * | |
1596 | * Returns 0 if stop is not needed | |
1597 | */ | |
1598 | static int ice_maybe_stop_tx(struct ice_ring *tx_ring, unsigned int size) | |
1599 | { | |
1600 | if (likely(ICE_DESC_UNUSED(tx_ring) >= size)) | |
1601 | return 0; | |
d337f2af | 1602 | |
2b245cb2 AV |
1603 | return __ice_maybe_stop_tx(tx_ring, size); |
1604 | } | |
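/* Usage sketch: the hot path calls ice_maybe_stop_tx(tx_ring, DESC_NEEDED)
 * and normally returns 0 without touching the netdev queue state. Only when
 * the ring is nearly full does __ice_maybe_stop_tx() stop the subqueue,
 * issue smp_mb() so the stopped state is visible before head/tail are
 * re-read, and then re-check in case the cleanup path freed descriptors in
 * the meantime and the queue can be restarted immediately.
 */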
1605 | ||
1606 | /** | |
1607 | * ice_tx_map - Build the Tx descriptor | |
1608 | * @tx_ring: ring to send buffer on | |
1609 | * @first: first buffer info buffer to use | |
d76a60ba | 1610 | * @off: pointer to struct that holds offload parameters |
2b245cb2 AV |
1611 | * |
1612 | * This function loops over the skb data pointed to by *first,
1613 | * gets a DMA-mapped physical address for each memory location, and
1614 | * programs it and the length into the transmit descriptor.
1615 | */ | |
d76a60ba AV |
1616 | static void |
1617 | ice_tx_map(struct ice_ring *tx_ring, struct ice_tx_buf *first, | |
1618 | struct ice_tx_offload_params *off) | |
2b245cb2 | 1619 | { |
d76a60ba | 1620 | u64 td_offset, td_tag, td_cmd; |
2b245cb2 | 1621 | u16 i = tx_ring->next_to_use; |
2b245cb2 AV |
1622 | unsigned int data_len, size; |
1623 | struct ice_tx_desc *tx_desc; | |
1624 | struct ice_tx_buf *tx_buf; | |
1625 | struct sk_buff *skb; | |
4ee656bb | 1626 | skb_frag_t *frag; |
2b245cb2 AV |
1627 | dma_addr_t dma; |
1628 | ||
d76a60ba AV |
1629 | td_tag = off->td_l2tag1; |
1630 | td_cmd = off->td_cmd; | |
1631 | td_offset = off->td_offset; | |
2b245cb2 AV |
1632 | skb = first->skb; |
1633 | ||
1634 | data_len = skb->data_len; | |
1635 | size = skb_headlen(skb); | |
1636 | ||
1637 | tx_desc = ICE_TX_DESC(tx_ring, i); | |
1638 | ||
d76a60ba AV |
1639 | if (first->tx_flags & ICE_TX_FLAGS_HW_VLAN) { |
1640 | td_cmd |= (u64)ICE_TX_DESC_CMD_IL2TAG1; | |
1641 | td_tag = (first->tx_flags & ICE_TX_FLAGS_VLAN_M) >> | |
1642 | ICE_TX_FLAGS_VLAN_S; | |
1643 | } | |
1644 | ||
2b245cb2 AV |
1645 | dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE); |
1646 | ||
1647 | tx_buf = first; | |
1648 | ||
1649 | for (frag = &skb_shinfo(skb)->frags[0];; frag++) { | |
1650 | unsigned int max_data = ICE_MAX_DATA_PER_TXD_ALIGNED; | |
1651 | ||
1652 | if (dma_mapping_error(tx_ring->dev, dma)) | |
1653 | goto dma_error; | |
1654 | ||
1655 | /* record length, and DMA address */ | |
1656 | dma_unmap_len_set(tx_buf, len, size); | |
1657 | dma_unmap_addr_set(tx_buf, dma, dma); | |
1658 | ||
1659 | /* align size to end of page */ | |
1660 | max_data += -dma & (ICE_MAX_READ_REQ_SIZE - 1); | |
1661 | tx_desc->buf_addr = cpu_to_le64(dma); | |
1662 | ||
1663 | /* account for data chunks larger than the hardware | |
1664 | * can handle | |
1665 | */ | |
1666 | while (unlikely(size > ICE_MAX_DATA_PER_TXD)) { | |
1667 | tx_desc->cmd_type_offset_bsz = | |
1668 | build_ctob(td_cmd, td_offset, max_data, td_tag); | |
1669 | ||
1670 | tx_desc++; | |
1671 | i++; | |
1672 | ||
1673 | if (i == tx_ring->count) { | |
1674 | tx_desc = ICE_TX_DESC(tx_ring, 0); | |
1675 | i = 0; | |
1676 | } | |
1677 | ||
1678 | dma += max_data; | |
1679 | size -= max_data; | |
1680 | ||
1681 | max_data = ICE_MAX_DATA_PER_TXD_ALIGNED; | |
1682 | tx_desc->buf_addr = cpu_to_le64(dma); | |
1683 | } | |
1684 | ||
1685 | if (likely(!data_len)) | |
1686 | break; | |
1687 | ||
1688 | tx_desc->cmd_type_offset_bsz = build_ctob(td_cmd, td_offset, | |
1689 | size, td_tag); | |
1690 | ||
1691 | tx_desc++; | |
1692 | i++; | |
1693 | ||
1694 | if (i == tx_ring->count) { | |
1695 | tx_desc = ICE_TX_DESC(tx_ring, 0); | |
1696 | i = 0; | |
1697 | } | |
1698 | ||
1699 | size = skb_frag_size(frag); | |
1700 | data_len -= size; | |
1701 | ||
1702 | dma = skb_frag_dma_map(tx_ring->dev, frag, 0, size, | |
1703 | DMA_TO_DEVICE); | |
1704 | ||
1705 | tx_buf = &tx_ring->tx_buf[i]; | |
1706 | } | |
1707 | ||
1708 | /* record bytecount for BQL */ | |
1709 | netdev_tx_sent_queue(txring_txq(tx_ring), first->bytecount); | |
1710 | ||
1711 | /* record SW timestamp if HW timestamp is not available */ | |
1712 | skb_tx_timestamp(first->skb); | |
1713 | ||
1714 | i++; | |
1715 | if (i == tx_ring->count) | |
1716 | i = 0; | |
1717 | ||
1718 | /* write last descriptor with RS and EOP bits */ | |
efc2214b MF |
1719 | td_cmd |= (u64)ICE_TXD_LAST_DESC_CMD; |
1720 | tx_desc->cmd_type_offset_bsz = build_ctob(td_cmd, td_offset, size, | |
1721 | td_tag); | |
2b245cb2 AV |
1722 | |
1723 | /* Force memory writes to complete before letting h/w know there | |
1724 | * are new descriptors to fetch. | |
1725 | * | |
1726 | * We also use this memory barrier to make certain all of the | |
1727 | * status bits have been updated before next_to_watch is written. | |
1728 | */ | |
1729 | wmb(); | |
1730 | ||
1731 | /* set next_to_watch value indicating a packet is present */ | |
1732 | first->next_to_watch = tx_desc; | |
1733 | ||
1734 | tx_ring->next_to_use = i; | |
1735 | ||
1736 | ice_maybe_stop_tx(tx_ring, DESC_NEEDED); | |
1737 | ||
1738 | /* notify HW of packet */ | |
4ee656bb | 1739 | if (netif_xmit_stopped(txring_txq(tx_ring)) || !netdev_xmit_more()) |
2b245cb2 | 1740 | writel(i, tx_ring->tail); |
2b245cb2 AV |
1741 | |
1742 | return; | |
1743 | ||
1744 | dma_error: | |
2f2da36e | 1745 | /* clear DMA mappings for failed tx_buf map */ |
2b245cb2 AV |
1746 | for (;;) { |
1747 | tx_buf = &tx_ring->tx_buf[i]; | |
1748 | ice_unmap_and_free_tx_buf(tx_ring, tx_buf); | |
1749 | if (tx_buf == first) | |
1750 | break; | |
1751 | if (i == 0) | |
1752 | i = tx_ring->count; | |
1753 | i--; | |
1754 | } | |
1755 | ||
1756 | tx_ring->next_to_use = i; | |
1757 | } | |
1758 | ||
d76a60ba AV |
1759 | /** |
1760 | * ice_tx_csum - Enable Tx checksum offloads | |
1761 | * @first: pointer to the first descriptor | |
1762 | * @off: pointer to struct that holds offload parameters | |
1763 | * | |
1764 | * Returns 0 or error (negative) if checksum offload can't happen, 1 otherwise. | |
1765 | */ | |
1766 | static | |
1767 | int ice_tx_csum(struct ice_tx_buf *first, struct ice_tx_offload_params *off) | |
1768 | { | |
1769 | u32 l4_len = 0, l3_len = 0, l2_len = 0; | |
1770 | struct sk_buff *skb = first->skb; | |
1771 | union { | |
1772 | struct iphdr *v4; | |
1773 | struct ipv6hdr *v6; | |
1774 | unsigned char *hdr; | |
1775 | } ip; | |
1776 | union { | |
1777 | struct tcphdr *tcp; | |
1778 | unsigned char *hdr; | |
1779 | } l4; | |
1780 | __be16 frag_off, protocol; | |
1781 | unsigned char *exthdr; | |
1782 | u32 offset, cmd = 0; | |
1783 | u8 l4_proto = 0; | |
1784 | ||
1785 | if (skb->ip_summed != CHECKSUM_PARTIAL) | |
1786 | return 0; | |
1787 | ||
1788 | ip.hdr = skb_network_header(skb); | |
1789 | l4.hdr = skb_transport_header(skb); | |
1790 | ||
1791 | /* compute outer L2 header size */ | |
1792 | l2_len = ip.hdr - skb->data; | |
1793 | offset = (l2_len / 2) << ICE_TX_DESC_LEN_MACLEN_S; | |
1794 | ||
1795 | if (skb->encapsulation) | |
1796 | return -1; | |
1797 | ||
1798 | /* Enable IP checksum offloads */ | |
1799 | protocol = vlan_get_protocol(skb); | |
1800 | if (protocol == htons(ETH_P_IP)) { | |
1801 | l4_proto = ip.v4->protocol; | |
1802 | /* the stack computes the IP header already, the only time we | |
1803 | * need the hardware to recompute it is in the case of TSO. | |
1804 | */ | |
1805 | if (first->tx_flags & ICE_TX_FLAGS_TSO) | |
1806 | cmd |= ICE_TX_DESC_CMD_IIPT_IPV4_CSUM; | |
1807 | else | |
1808 | cmd |= ICE_TX_DESC_CMD_IIPT_IPV4; | |
1809 | ||
1810 | } else if (protocol == htons(ETH_P_IPV6)) { | |
1811 | cmd |= ICE_TX_DESC_CMD_IIPT_IPV6; | |
1812 | exthdr = ip.hdr + sizeof(*ip.v6); | |
1813 | l4_proto = ip.v6->nexthdr; | |
1814 | if (l4.hdr != exthdr) | |
1815 | ipv6_skip_exthdr(skb, exthdr - skb->data, &l4_proto, | |
1816 | &frag_off); | |
1817 | } else { | |
1818 | return -1; | |
1819 | } | |
1820 | ||
1821 | /* compute inner L3 header size */ | |
1822 | l3_len = l4.hdr - ip.hdr; | |
1823 | offset |= (l3_len / 4) << ICE_TX_DESC_LEN_IPLEN_S; | |
1824 | ||
1825 | /* Enable L4 checksum offloads */ | |
1826 | switch (l4_proto) { | |
1827 | case IPPROTO_TCP: | |
1828 | /* enable checksum offloads */ | |
1829 | cmd |= ICE_TX_DESC_CMD_L4T_EOFT_TCP; | |
1830 | l4_len = l4.tcp->doff; | |
1831 | offset |= l4_len << ICE_TX_DESC_LEN_L4_LEN_S; | |
1832 | break; | |
1833 | case IPPROTO_UDP: | |
1834 | /* enable UDP checksum offload */ | |
1835 | cmd |= ICE_TX_DESC_CMD_L4T_EOFT_UDP; | |
1836 | l4_len = (sizeof(struct udphdr) >> 2); | |
1837 | offset |= l4_len << ICE_TX_DESC_LEN_L4_LEN_S; | |
1838 | break; | |
1839 | case IPPROTO_SCTP: | |
cf909e19 AV |
1840 | /* enable SCTP checksum offload */ |
1841 | cmd |= ICE_TX_DESC_CMD_L4T_EOFT_SCTP; | |
1842 | l4_len = sizeof(struct sctphdr) >> 2; | |
1843 | offset |= l4_len << ICE_TX_DESC_LEN_L4_LEN_S; | |
1844 | break; | |
1845 | ||
d76a60ba AV |
1846 | default: |
1847 | if (first->tx_flags & ICE_TX_FLAGS_TSO) | |
1848 | return -1; | |
1849 | skb_checksum_help(skb); | |
1850 | return 0; | |
1851 | } | |
1852 | ||
1853 | off->td_cmd |= cmd; | |
1854 | off->td_offset |= offset; | |
1855 | return 1; | |
1856 | } | |
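/* Worked example (illustrative values): for an untagged IPv4/TCP frame with
 * a 14 byte Ethernet header, 20 byte IP header and 20 byte TCP header
 * (doff = 5), the length fields computed above pack as
 *
 *	MACLEN = 14 / 2 = 7	(2 byte words)
 *	IPLEN  = 20 / 4 = 5	(4 byte dwords)
 *	L4LEN  = doff   = 5	(4 byte dwords)
 *
 * each shifted into place by ICE_TX_DESC_LEN_MACLEN_S / _IPLEN_S /
 * _L4_LEN_S, while cmd carries the IIPT and L4T_EOFT enable bits.
 */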
1857 | ||
1858 | /** | |
f9867df6 | 1859 | * ice_tx_prepare_vlan_flags - prepare generic Tx VLAN tagging flags for HW |
d76a60ba AV |
1860 | * @tx_ring: ring to send buffer on |
1861 | * @first: pointer to struct ice_tx_buf | |
1862 | * | |
1863 | * Checks the skb and sets up the corresponding generic transmit flags
1864 | * related to VLAN tagging for the HW, such as VLAN, DCB, etc.
1865 | *
1866 | * Returns an error code to indicate the frame should be dropped upon error,
1867 | * otherwise returns 0 to indicate the flags have been set properly.
1868 | */ | |
1869 | static int | |
1870 | ice_tx_prepare_vlan_flags(struct ice_ring *tx_ring, struct ice_tx_buf *first) | |
1871 | { | |
1872 | struct sk_buff *skb = first->skb; | |
1873 | __be16 protocol = skb->protocol; | |
1874 | ||
1875 | if (protocol == htons(ETH_P_8021Q) && | |
1876 | !(tx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_TX)) { | |
1877 | /* when HW VLAN acceleration is turned off by the user the | |
1878 | * stack sets the protocol to 8021q so that the driver | |
1879 | * can take any steps required to support the SW only | |
1880 | * VLAN handling. In our case the driver doesn't need | |
1881 | * to take any further steps so just set the protocol | |
1882 | * to the encapsulated ethertype. | |
1883 | */ | |
1884 | skb->protocol = vlan_get_protocol(skb); | |
5f6aa50e | 1885 | return 0; |
d76a60ba AV |
1886 | } |
1887 | ||
1888 | /* if we have a HW VLAN tag being added, default to the HW one */ | |
1889 | if (skb_vlan_tag_present(skb)) { | |
1890 | first->tx_flags |= skb_vlan_tag_get(skb) << ICE_TX_FLAGS_VLAN_S; | |
1891 | first->tx_flags |= ICE_TX_FLAGS_HW_VLAN; | |
1892 | } else if (protocol == htons(ETH_P_8021Q)) { | |
1893 | struct vlan_hdr *vhdr, _vhdr; | |
1894 | ||
1895 | /* for SW VLAN, check the next protocol and store the tag */ | |
1896 | vhdr = (struct vlan_hdr *)skb_header_pointer(skb, ETH_HLEN, | |
1897 | sizeof(_vhdr), | |
1898 | &_vhdr); | |
1899 | if (!vhdr) | |
1900 | return -EINVAL; | |
1901 | ||
1902 | first->tx_flags |= ntohs(vhdr->h_vlan_TCI) << | |
1903 | ICE_TX_FLAGS_VLAN_S; | |
1904 | first->tx_flags |= ICE_TX_FLAGS_SW_VLAN; | |
1905 | } | |
1906 | ||
5f6aa50e | 1907 | return ice_tx_prepare_vlan_flags_dcb(tx_ring, first); |
d76a60ba AV |
1908 | } |
1909 | ||
1910 | /** | |
1911 | * ice_tso - computes mss and TSO length to prepare for TSO | |
1912 | * @first: pointer to struct ice_tx_buf | |
1913 | * @off: pointer to struct that holds offload parameters | |
1914 | * | |
1915 | * Returns 0 or error (negative) if TSO can't happen, 1 otherwise. | |
1916 | */ | |
1917 | static | |
1918 | int ice_tso(struct ice_tx_buf *first, struct ice_tx_offload_params *off) | |
1919 | { | |
1920 | struct sk_buff *skb = first->skb; | |
1921 | union { | |
1922 | struct iphdr *v4; | |
1923 | struct ipv6hdr *v6; | |
1924 | unsigned char *hdr; | |
1925 | } ip; | |
1926 | union { | |
1927 | struct tcphdr *tcp; | |
a54e3b8c | 1928 | struct udphdr *udp; |
d76a60ba AV |
1929 | unsigned char *hdr; |
1930 | } l4; | |
1931 | u64 cd_mss, cd_tso_len; | |
1932 | u32 paylen, l4_start; | |
1933 | int err; | |
1934 | ||
1935 | if (skb->ip_summed != CHECKSUM_PARTIAL) | |
1936 | return 0; | |
1937 | ||
1938 | if (!skb_is_gso(skb)) | |
1939 | return 0; | |
1940 | ||
1941 | err = skb_cow_head(skb, 0); | |
1942 | if (err < 0) | |
1943 | return err; | |
1944 | ||
c3a6825e | 1945 | /* cppcheck-suppress unreadVariable */ |
d76a60ba AV |
1946 | ip.hdr = skb_network_header(skb); |
1947 | l4.hdr = skb_transport_header(skb); | |
1948 | ||
1949 | /* initialize outer IP header fields */ | |
1950 | if (ip.v4->version == 4) { | |
1951 | ip.v4->tot_len = 0; | |
1952 | ip.v4->check = 0; | |
1953 | } else { | |
1954 | ip.v6->payload_len = 0; | |
1955 | } | |
1956 | ||
1957 | /* determine offset of transport header */ | |
1958 | l4_start = l4.hdr - skb->data; | |
1959 | ||
1960 | /* remove payload length from checksum */ | |
1961 | paylen = skb->len - l4_start; | |
d76a60ba | 1962 | |
a54e3b8c BC |
1963 | if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) { |
1964 | csum_replace_by_diff(&l4.udp->check, | |
1965 | (__force __wsum)htonl(paylen)); | |
1966 | /* compute length of UDP segmentation header */ | |
1967 | off->header_len = sizeof(struct udphdr) + l4_start;
1968 | } else { | |
1969 | csum_replace_by_diff(&l4.tcp->check, | |
1970 | (__force __wsum)htonl(paylen)); | |
1971 | /* compute length of TCP segmentation header */ | |
1972 | off->header_len = (l4.tcp->doff * 4) + l4_start; | |
1973 | } | |
d76a60ba AV |
1974 | |
1975 | /* update gso_segs and bytecount */ | |
1976 | first->gso_segs = skb_shinfo(skb)->gso_segs; | |
d944b469 | 1977 | first->bytecount += (first->gso_segs - 1) * off->header_len; |
d76a60ba AV |
1978 | |
1979 | cd_tso_len = skb->len - off->header_len; | |
1980 | cd_mss = skb_shinfo(skb)->gso_size; | |
1981 | ||
1982 | /* record cdesc_qw1 with TSO parameters */ | |
e65e9e15 BA |
1983 | off->cd_qw1 |= (u64)(ICE_TX_DESC_DTYPE_CTX | |
1984 | (ICE_TX_CTX_DESC_TSO << ICE_TXD_CTX_QW1_CMD_S) | | |
1985 | (cd_tso_len << ICE_TXD_CTX_QW1_TSO_LEN_S) | | |
1986 | (cd_mss << ICE_TXD_CTX_QW1_MSS_S)); | |
d76a60ba AV |
1987 | first->tx_flags |= ICE_TX_FLAGS_TSO; |
1988 | return 1; | |
1989 | } | |
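/* Worked example (illustrative values): for a TSO skb of 64014 bytes with a
 * 14 byte L2 header, 20 byte IPv4 header, 20 byte TCP header and
 * gso_size = 1448, l4_start is 34, off->header_len is 20 + 34 = 54,
 * cd_tso_len is 64014 - 54 = 63960 and cd_mss is 1448; those values are
 * what end up packed into off->cd_qw1 above.
 */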
1990 | ||
2b245cb2 AV |
1991 | /** |
1992 | * ice_txd_use_count - estimate the number of descriptors needed for Tx | |
1993 | * @size: transmit request size in bytes | |
1994 | * | |
1995 | * Due to hardware alignment restrictions (4K alignment), we need to | |
1996 | * assume that we can have no more than 12K of data per descriptor, even | |
1997 | * though each descriptor can take up to 16K - 1 bytes of aligned memory. | |
1998 | * Thus, we need to divide by 12K. But division is slow! Instead, | |
1999 | * we decompose the operation into shifts and one relatively cheap | |
2000 | * multiply operation. | |
2001 | * | |
2002 | * To divide by 12K, we first divide by 4K, then divide by 3: | |
2003 | * To divide by 4K, shift right by 12 bits | |
2004 | * To divide by 3, multiply by 85, then divide by 256 | |
2005 | * (Divide by 256 is done by shifting right by 8 bits) | |
2006 | * Finally, we add one to round up. Because 256 isn't an exact multiple of | |
2007 | * 3, we'll underestimate near each multiple of 12K. This is actually more | |
2008 | * accurate as we have 4K - 1 of wiggle room that we can fit into the last | |
df17b7e0 | 2009 | * segment. For our purposes this is accurate out to 1M which is orders of |
2b245cb2 AV |
2010 | * magnitude greater than our largest possible GSO size. |
2011 | * | |
2012 | * This would then be implemented as: | |
c585ea42 | 2013 | * return (((size >> 12) * 85) >> 8) + ICE_DESCS_FOR_SKB_DATA_PTR; |
2b245cb2 AV |
2014 | * |
2015 | * Since multiplication and division are commutative, we can reorder | |
2016 | * operations into: | |
c585ea42 | 2017 | * return ((size * 85) >> 20) + ICE_DESCS_FOR_SKB_DATA_PTR; |
2b245cb2 AV |
2018 | */ |
2019 | static unsigned int ice_txd_use_count(unsigned int size) | |
2020 | { | |
c585ea42 | 2021 | return ((size * 85) >> 20) + ICE_DESCS_FOR_SKB_DATA_PTR; |
2b245cb2 AV |
2022 | } |
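/* Quick sanity check of the approximation (illustrative): for size = 30000,
 * (30000 * 85) >> 20 = 2, and adding ICE_DESCS_FOR_SKB_DATA_PTR (1) gives 3
 * descriptors, matching ceil(30000 / 12K). For size = 4096 the result is
 * 0 + 1 = 1, i.e. a single descriptor.
 */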
2023 | ||
2024 | /** | |
d337f2af | 2025 | * ice_xmit_desc_count - calculate number of Tx descriptors needed |
2b245cb2 AV |
2026 | * @skb: send buffer |
2027 | * | |
2028 | * Returns number of data descriptors needed for this skb. | |
2029 | */ | |
2030 | static unsigned int ice_xmit_desc_count(struct sk_buff *skb) | |
2031 | { | |
d7840976 | 2032 | const skb_frag_t *frag = &skb_shinfo(skb)->frags[0]; |
2b245cb2 AV |
2033 | unsigned int nr_frags = skb_shinfo(skb)->nr_frags; |
2034 | unsigned int count = 0, size = skb_headlen(skb); | |
2035 | ||
2036 | for (;;) { | |
2037 | count += ice_txd_use_count(size); | |
2038 | ||
2039 | if (!nr_frags--) | |
2040 | break; | |
2041 | ||
2042 | size = skb_frag_size(frag++); | |
2043 | } | |
2044 | ||
2045 | return count; | |
2046 | } | |
2047 | ||
2048 | /** | |
2049 | * __ice_chk_linearize - Check if there are more than 8 buffers per packet | |
2050 | * @skb: send buffer | |
2051 | * | |
2052 | * Note: This HW can't DMA more than 8 buffers to build a packet on the wire | |
2053 | * and so we need to figure out the cases where we need to linearize the skb. | |
2054 | * | |
2055 | * For TSO we need to count the TSO header and segment payload separately. | |
2056 | * As such we need to check cases where we have 7 fragments or more as we | |
2057 | * can potentially require 9 DMA transactions, 1 for the TSO header, 1 for | |
2058 | * the segment payload in the first descriptor, and another 7 for the | |
2059 | * fragments. | |
2060 | */ | |
2061 | static bool __ice_chk_linearize(struct sk_buff *skb) | |
2062 | { | |
d7840976 | 2063 | const skb_frag_t *frag, *stale; |
2b245cb2 AV |
2064 | int nr_frags, sum; |
2065 | ||
2066 | /* no need to check if number of frags is less than 7 */ | |
2067 | nr_frags = skb_shinfo(skb)->nr_frags; | |
2068 | if (nr_frags < (ICE_MAX_BUF_TXD - 1)) | |
2069 | return false; | |
2070 | ||
2071 | /* We need to walk through the list and validate that each group | |
2072 | * of 6 fragments totals at least gso_size. | |
2073 | */ | |
2074 | nr_frags -= ICE_MAX_BUF_TXD - 2; | |
2075 | frag = &skb_shinfo(skb)->frags[0]; | |
2076 | ||
df17b7e0 | 2077 | /* Initialize sum to the negative value of gso_size minus 1. We
4ee656bb | 2078 | * use this as the worst case scenario in which the frag ahead |
2b245cb2 AV |
2079 | * of us only provides one byte which is why we are limited to 6 |
2080 | * descriptors for a single transmit as the header and previous | |
2081 | * fragment are already consuming 2 descriptors. | |
2082 | */ | |
2083 | sum = 1 - skb_shinfo(skb)->gso_size; | |
2084 | ||
2085 | /* Add size of frags 0 through 4 to create our initial sum */ | |
2086 | sum += skb_frag_size(frag++); | |
2087 | sum += skb_frag_size(frag++); | |
2088 | sum += skb_frag_size(frag++); | |
2089 | sum += skb_frag_size(frag++); | |
2090 | sum += skb_frag_size(frag++); | |
2091 | ||
2092 | /* Walk through fragments adding latest fragment, testing it, and | |
2093 | * then removing stale fragments from the sum. | |
2094 | */ | |
2095 | stale = &skb_shinfo(skb)->frags[0]; | |
2096 | for (;;) { | |
2097 | sum += skb_frag_size(frag++); | |
2098 | ||
2099 | /* if sum is negative we failed to make sufficient progress */ | |
2100 | if (sum < 0) | |
2101 | return true; | |
2102 | ||
2103 | if (!nr_frags--) | |
2104 | break; | |
2105 | ||
2106 | sum -= skb_frag_size(stale++); | |
2107 | } | |
2108 | ||
2109 | return false; | |
2110 | } | |
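/* Walk-through (illustrative numbers): with gso_size = 2000 and eight frags
 * of 300 bytes each, sum starts at 1 - 2000 = -1999, the first five frags
 * bring it to -499 and adding the sixth still leaves it at -199, so no
 * window of six consecutive frags covers a full segment and the skb must be
 * linearized. Larger frags that keep the running sum non-negative pass the
 * check without linearizing.
 */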
2111 | ||
2112 | /** | |
2113 | * ice_chk_linearize - Check if there are more than 8 fragments per packet | |
2114 | * @skb: send buffer | |
2115 | * @count: number of buffers used | |
2116 | * | |
2117 | * Note: Our HW can't scatter-gather more than 8 fragments to build | |
2118 | * a packet on the wire and so we need to figure out the cases where we | |
2119 | * need to linearize the skb. | |
2120 | */ | |
2121 | static bool ice_chk_linearize(struct sk_buff *skb, unsigned int count) | |
2122 | { | |
2123 | /* Both TSO and single send will work if count is less than 8 */ | |
2124 | if (likely(count < ICE_MAX_BUF_TXD)) | |
2125 | return false; | |
2126 | ||
2127 | if (skb_is_gso(skb)) | |
2128 | return __ice_chk_linearize(skb); | |
2129 | ||
2130 | /* we can support up to 8 data buffers for a single send */ | |
2131 | return count != ICE_MAX_BUF_TXD; | |
2132 | } | |
2133 | ||
2134 | /** | |
2135 | * ice_xmit_frame_ring - Sends buffer on Tx ring | |
2136 | * @skb: send buffer | |
2137 | * @tx_ring: ring to send buffer on | |
2138 | * | |
2139 | * Returns NETDEV_TX_OK if sent, else an error code | |
2140 | */ | |
2141 | static netdev_tx_t | |
2142 | ice_xmit_frame_ring(struct sk_buff *skb, struct ice_ring *tx_ring) | |
2143 | { | |
d76a60ba | 2144 | struct ice_tx_offload_params offload = { 0 }; |
0c3a6101 | 2145 | struct ice_vsi *vsi = tx_ring->vsi; |
2b245cb2 AV |
2146 | struct ice_tx_buf *first; |
2147 | unsigned int count; | |
d76a60ba | 2148 | int tso, csum; |
2b245cb2 AV |
2149 | |
2150 | count = ice_xmit_desc_count(skb); | |
2151 | if (ice_chk_linearize(skb, count)) { | |
2152 | if (__skb_linearize(skb)) | |
2153 | goto out_drop; | |
2154 | count = ice_txd_use_count(skb->len); | |
2155 | tx_ring->tx_stats.tx_linearize++; | |
2156 | } | |
2157 | ||
2158 | /* need: 1 descriptor per page * PAGE_SIZE/ICE_MAX_DATA_PER_TXD, | |
2159 | * + 1 desc for skb_head_len/ICE_MAX_DATA_PER_TXD, | |
2160 | * + 4 desc gap to avoid the cache line where head is, | |
2161 | * + 1 desc for context descriptor, | |
2162 | * otherwise try next time | |
2163 | */ | |
c585ea42 BC |
2164 | if (ice_maybe_stop_tx(tx_ring, count + ICE_DESCS_PER_CACHE_LINE + |
2165 | ICE_DESCS_FOR_CTX_DESC)) { | |
2b245cb2 AV |
2166 | tx_ring->tx_stats.tx_busy++; |
2167 | return NETDEV_TX_BUSY; | |
2168 | } | |
2169 | ||
d76a60ba AV |
2170 | offload.tx_ring = tx_ring; |
2171 | ||
2b245cb2 AV |
2172 | /* record the location of the first descriptor for this packet */ |
2173 | first = &tx_ring->tx_buf[tx_ring->next_to_use]; | |
2174 | first->skb = skb; | |
2175 | first->bytecount = max_t(unsigned int, skb->len, ETH_ZLEN); | |
2176 | first->gso_segs = 1; | |
d76a60ba AV |
2177 | first->tx_flags = 0; |
2178 | ||
2179 | /* prepare the VLAN tagging flags for Tx */ | |
2180 | if (ice_tx_prepare_vlan_flags(tx_ring, first)) | |
2181 | goto out_drop; | |
2182 | ||
2183 | /* set up TSO offload */ | |
2184 | tso = ice_tso(first, &offload); | |
2185 | if (tso < 0) | |
2186 | goto out_drop; | |
2187 | ||
2188 | /* always set up Tx checksum offload */ | |
2189 | csum = ice_tx_csum(first, &offload); | |
2190 | if (csum < 0) | |
2191 | goto out_drop; | |
2192 | ||
0c3a6101 DE |
2193 | /* allow CONTROL frames egress from main VSI if FW LLDP disabled */ |
2194 | if (unlikely(skb->priority == TC_PRIO_CONTROL && | |
2195 | vsi->type == ICE_VSI_PF && | |
2196 | vsi->port_info->is_sw_lldp)) | |
2197 | offload.cd_qw1 |= (u64)(ICE_TX_DESC_DTYPE_CTX | | |
2198 | ICE_TX_CTX_DESC_SWTCH_UPLINK << | |
2199 | ICE_TXD_CTX_QW1_CMD_S); | |
2200 | ||
2201 | if (offload.cd_qw1 & ICE_TX_DESC_DTYPE_CTX) { | |
d76a60ba AV |
2202 | struct ice_tx_ctx_desc *cdesc; |
2203 | int i = tx_ring->next_to_use; | |
2204 | ||
2205 | /* grab the next descriptor */ | |
2206 | cdesc = ICE_TX_CTX_DESC(tx_ring, i); | |
2207 | i++; | |
2208 | tx_ring->next_to_use = (i < tx_ring->count) ? i : 0; | |
2209 | ||
2210 | /* setup context descriptor */ | |
2211 | cdesc->tunneling_params = cpu_to_le32(offload.cd_tunnel_params); | |
2212 | cdesc->l2tag2 = cpu_to_le16(offload.cd_l2tag2); | |
2213 | cdesc->rsvd = cpu_to_le16(0); | |
2214 | cdesc->qw1 = cpu_to_le64(offload.cd_qw1); | |
2215 | } | |
2b245cb2 | 2216 | |
d76a60ba | 2217 | ice_tx_map(tx_ring, first, &offload); |
2b245cb2 AV |
2218 | return NETDEV_TX_OK; |
2219 | ||
2220 | out_drop: | |
2221 | dev_kfree_skb_any(skb); | |
2222 | return NETDEV_TX_OK; | |
2223 | } | |
2224 | ||
2225 | /** | |
2226 | * ice_start_xmit - Selects the correct VSI and Tx queue to send buffer | |
2227 | * @skb: send buffer | |
2228 | * @netdev: network interface device structure | |
2229 | * | |
2230 | * Returns NETDEV_TX_OK if sent, else an error code | |
2231 | */ | |
2232 | netdev_tx_t ice_start_xmit(struct sk_buff *skb, struct net_device *netdev) | |
2233 | { | |
2234 | struct ice_netdev_priv *np = netdev_priv(netdev); | |
2235 | struct ice_vsi *vsi = np->vsi; | |
2236 | struct ice_ring *tx_ring; | |
2237 | ||
2238 | tx_ring = vsi->tx_rings[skb->queue_mapping]; | |
2239 | ||
2240 | /* hardware can't handle really short frames, hardware padding works | |
2241 | * beyond this point | |
2242 | */ | |
2243 | if (skb_put_padto(skb, ICE_MIN_TX_LEN)) | |
2244 | return NETDEV_TX_OK; | |
2245 | ||
2246 | return ice_xmit_frame_ring(skb, tx_ring); | |
2247 | } |