The server side of qperf panics as follows:
[242446.336860] IP: report_bug+0x64/0x100
[242446.341031] PGD
1c0c067
[242446.341032] P4D
1c0c067
[242446.343951] PUD
1c0d063
[242446.346870] PMD
8587ea067
[242446.349788] PTE
800000083e14016
[242446.352901]
[242446.358352] Oops: 0003 [#1] SM
[242446.437919] CPU: 1 PID: 7442 Comm: irq/92-hfi1_0 k Not tainted 4.12.0-mam-asm #1
[242446.446365] Hardware name: Intel Corporation S2600WT2/S2600WT2, BIOS SE5C610.86B.01.01.0018.C4.
072020161249 07/20/201
[242446.458397] task:
ffff8808392d2b80 task.stack:
ffffc9000664000
[242446.465097] RIP: 0010:report_bug+0x64/0x100
[242446.469859] RSP: 0018:
ffffc900066439c0 EFLAGS:
0001000
[242446.475784] RAX:
ffffffffa06647e4 RBX:
ffffffffa06461e1 RCX:
000000000000000
[242446.483840] RDX:
0000000000000907 RSI:
ffffffffa0675040 RDI:
ffffffffffff740
[242446.491897] RBP:
ffffc900066439e0 R08:
0000000000000001 R09:
000000000000025
[242446.499953] R10:
ffffffff81a253df R11:
0000000000000133 R12:
ffffc90006643b3
[242446.508010] R13:
ffffffffa065bbf0 R14:
00000000000001e5 R15:
000000000000000
[242446.516067] FS:
0000000000000000(0000) GS:
ffff88085f640000(0000) knlGS:
000000000000000
[242446.525191] CS: 0010 DS: 0000 ES: 0000 CR0:
000000008005003
[242446.531698] CR2:
ffffffffa06647ee CR3:
0000000001c09000 CR4:
00000000001406e
[242446.539756] Call Trace
[242446.542582] fixup_bug+0x2c/0x5
[242446.546277] do_trap+0x12b/0x18
[242446.549972] do_error_trap+0x89/0x11
[242446.554171] ? hfi1_copy_sge+0x271/0x2b0 [hfi1
[242446.559324] ? ttwu_do_wakeup+0x1e/0x14
[242446.563795] ? ttwu_do_activate+0x77/0x8
[242446.568363] do_invalid_op+0x20/0x3
[242446.572448] invalid_op+0x1e/0x3
[242446.576247] RIP: 0010:hfi1_copy_sge+0x271/0x2b0 [hfi1
[242446.582075] RSP: 0018:
ffffc90006643be8 EFLAGS:
0001004
[242446.587999] RAX:
0000000000000000 RBX:
ffff88083e0fa240 RCX:
000000000000000
[242446.596058] RDX:
0000000000000000 RSI:
ffff880842508000 RDI:
ffff88083e0fa24
[242446.604116] RBP:
ffffc90006643c28 R08:
0000000000000000 R09:
000000000000000
[242446.612172] R10:
ffffc90009473640 R11:
0000000000000133 R12:
000000000000000
[242446.620228] R13:
0000000000000000 R14:
0000000000002000 R15:
ffff88084250800
[242446.628293] ? hfi1_copy_sge+0x1a1/0x2b0 [hfi1
[242446.633449] hfi1_rc_rcv+0x3da/0x1270 [hfi1
[242446.638312] ? sc_buffer_alloc+0x113/0x150 [hfi1
[242446.643662] hfi1_ib_rcv+0x1c9/0x2e0 [hfi1
[242446.648428] process_receive_ib+0x19a/0x270 [hfi1
[242446.653866] ? process_rcv_qp_work+0xd2/0x160 [hfi1
[242446.659505] handle_receive_interrupt_nodma_rtail+0x184/0x2e0 [hfi1
[242446.666693] ? irq_finalize_oneshot+0x100/0x10
[242446.671846] receive_context_thread+0x1b/0x140 [hfi1
[242446.677576] irq_thread_fn+0x1e/0x4
[242446.681659] irq_thread+0x13c/0x1b
[242446.685646] ? irq_forced_thread_fn+0x60/0x6
[242446.690604] kthread+0x112/0x15
[242446.694298] ? irq_thread_check_affinity+0xe0/0xe
[242446.699738] ? kthread_park+0x60/0x6
[242446.703919] ? do_syscall_64+0x67/0x15
[242446.708292] ret_from_fork+0x25/0x3
[242446.712374] Code: 63 78 04 44 0f b7 70 08 41 89 d0 4c 8d 2c 38 41 83 e0 01 f6 c2 02 74 17 66 45 85 c0 74 11 f6 c2 04 b9 01 00 00 00 75 bb 83 ca 04 <66> 89 50 0a 66 45 85 c0 74 52 0f b6 48 0b 41 0f b7 f6 4d 89 e0
[242446.733527] RIP: report_bug+0x64/0x100 RSP: ffffc900066439c0
[242446.739935] CR2: ffffffffa06647ee
[242446.743763] ---[ end trace 0e90a20d0aa494f7 ]---
The root cause is that the qib/hfi1 post receive paths do not interpret
the new return value from rvt_lkey_ok() properly: the old
"!rvt_lkey_ok()" test treats a negative error return as success,
leading to an mr reference count underrun.
Additionally, remove an unused argument in rvt_sge_adjacent()
as well as an unneeded incr local in rvt_post_one_wr().
Fixes: 14fe13fcd3af ("IB/rdmavt: Compress adjacent SGEs in rvt_lkey_ok()")
Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
if (wqe->sg_list[i].length == 0)
continue;
/* Check LKEY */
- if (!rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge,
- NULL, &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE))
+ ret = rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge,
+ NULL, &wqe->sg_list[i],
+ IB_ACCESS_LOCAL_WRITE);
+ if (unlikely(ret <= 0))
goto bad_lkey;
qp->r_len += wqe->sg_list[i].length;
j++;
if (wqe->sg_list[i].length == 0)
continue;
/* Check LKEY */
- if (!rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge,
- NULL, &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE))
+ ret = rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge,
+ NULL, &wqe->sg_list[i],
+ IB_ACCESS_LOCAL_WRITE);
+ if (unlikely(ret <= 0))
goto bad_lkey;
qp->r_len += wqe->sg_list[i].length;
j++;
/**
* rvt_sge_adjacent - is isge compressible
- * @isge: outgoing internal SGE
* @last_sge: last outgoing SGE written
* @sge: SGE to check
*
*
* Return: true if isge is adjacent to last sge
*/
-static inline bool rvt_sge_adjacent(struct rvt_sge *isge,
- struct rvt_sge *last_sge,
+static inline bool rvt_sge_adjacent(struct rvt_sge *last_sge,
struct ib_sge *sge)
{
if (last_sge && sge->lkey == last_sge->mr->lkey &&
if (pd->user)
return -EINVAL;
- if (rvt_sge_adjacent(isge, last_sge, sge))
+ if (rvt_sge_adjacent(last_sge, sge))
return 0;
rcu_read_lock();
mr = rcu_dereference(dev->dma_mr);
isge->n = 0;
goto ok;
}
- if (rvt_sge_adjacent(isge, last_sge, sge))
+ if (rvt_sge_adjacent(last_sge, sge))
return 0;
rcu_read_lock();
mr = rcu_dereference(rkt->table[sge->lkey >> rkt->shift]);
struct rvt_pd *pd;
struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
u8 log_pmtu;
- int ret, incr;
+ int ret;
size_t cplen;
bool reserved_op;
int local_ops_delayed = 0;
if (length == 0)
continue;
- incr = rvt_lkey_ok(rkt, pd, &wqe->sg_list[j], last_sge,
- &wr->sg_list[i], acc);
- if (unlikely(incr < 0))
- goto bail_lkey_error;
+ ret = rvt_lkey_ok(rkt, pd, &wqe->sg_list[j], last_sge,
+ &wr->sg_list[i], acc);
+ if (unlikely(ret < 0))
+ goto bail_inval_free;
wqe->length += length;
- if (incr)
+ if (ret)
last_sge = &wqe->sg_list[j];
- j += incr;
+ j += ret;
}
wqe->wr.num_sge = j;
}
return 0;
-bail_lkey_error:
- ret = incr;
bail_inval_free:
/* release mr holds */
while (j) {