From f70575805708cabdedea7498aaa3f710fde4d920 Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Wed, 31 Jan 2024 03:29:01 +0000 Subject: [PATCH] add lvds1024*800 --- kernel/drivers/infiniband/hw/hfi1/verbs.c | 546 +++++++++++++++++++++++++----------------------------- 1 files changed, 253 insertions(+), 293 deletions(-) diff --git a/kernel/drivers/infiniband/hw/hfi1/verbs.c b/kernel/drivers/infiniband/hw/hfi1/verbs.c index 1cf1dfb..693922d 100644 --- a/kernel/drivers/infiniband/hw/hfi1/verbs.c +++ b/kernel/drivers/infiniband/hw/hfi1/verbs.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2018 Intel Corporation. + * Copyright(c) 2015 - 2020 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -66,6 +66,7 @@ #include "vnic.h" #include "fault.h" #include "affinity.h" +#include "ipoib.h" static unsigned int hfi1_lkey_table_size = 16; module_param_named(lkey_table_size, hfi1_lkey_table_size, uint, @@ -130,8 +131,6 @@ module_param(piothreshold, ushort, S_IRUGO); MODULE_PARM_DESC(piothreshold, "size used to determine sdma vs. pio"); -#define COPY_CACHELESS 1 -#define COPY_ADAPTIVE 2 static unsigned int sge_copy_mode; module_param(sge_copy_mode, uint, S_IRUGO); MODULE_PARM_DESC(sge_copy_mode, @@ -149,168 +148,24 @@ /* Length of buffer to create verbs txreq cache name */ #define TXREQ_NAME_LEN 24 -static uint wss_threshold; +static uint wss_threshold = 80; module_param(wss_threshold, uint, S_IRUGO); MODULE_PARM_DESC(wss_threshold, "Percentage (1-100) of LLC to use as a threshold for a cacheless copy"); static uint wss_clean_period = 256; module_param(wss_clean_period, uint, S_IRUGO); MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the page copy table is cleaned"); -/* memory working set size */ -struct hfi1_wss { - unsigned long *entries; - atomic_t total_count; - atomic_t clean_counter; - atomic_t clean_entry; - - int threshold; - int num_entries; - long pages_mask; -}; - -static struct hfi1_wss wss; - -int hfi1_wss_init(void) -{ - long llc_size; - long llc_bits; - long table_size; - long table_bits; - - /* check for a valid percent range - default to 80 if none or invalid */ - if (wss_threshold < 1 || wss_threshold > 100) - wss_threshold = 80; - /* reject a wildly large period */ - if (wss_clean_period > 1000000) - wss_clean_period = 256; - /* reject a zero period */ - if (wss_clean_period == 0) - wss_clean_period = 1; - - /* - * Calculate the table size - the next power of 2 larger than the - * LLC size. LLC size is in KiB. - */ - llc_size = wss_llc_size() * 1024; - table_size = roundup_pow_of_two(llc_size); - - /* one bit per page in rounded up table */ - llc_bits = llc_size / PAGE_SIZE; - table_bits = table_size / PAGE_SIZE; - wss.pages_mask = table_bits - 1; - wss.num_entries = table_bits / BITS_PER_LONG; - - wss.threshold = (llc_bits * wss_threshold) / 100; - if (wss.threshold == 0) - wss.threshold = 1; - - atomic_set(&wss.clean_counter, wss_clean_period); - - wss.entries = kcalloc(wss.num_entries, sizeof(*wss.entries), - GFP_KERNEL); - if (!wss.entries) { - hfi1_wss_exit(); - return -ENOMEM; - } - - return 0; -} - -void hfi1_wss_exit(void) -{ - /* coded to handle partially initialized and repeat callers */ - kfree(wss.entries); - wss.entries = NULL; -} - -/* - * Advance the clean counter. When the clean period has expired, - * clean an entry. - * - * This is implemented in atomics to avoid locking. Because multiple - * variables are involved, it can be racy which can lead to slightly - * inaccurate information. Since this is only a heuristic, this is - * OK. Any innaccuracies will clean themselves out as the counter - * advances. That said, it is unlikely the entry clean operation will - * race - the next possible racer will not start until the next clean - * period. - * - * The clean counter is implemented as a decrement to zero. When zero - * is reached an entry is cleaned. - */ -static void wss_advance_clean_counter(void) -{ - int entry; - int weight; - unsigned long bits; - - /* become the cleaner if we decrement the counter to zero */ - if (atomic_dec_and_test(&wss.clean_counter)) { - /* - * Set, not add, the clean period. This avoids an issue - * where the counter could decrement below the clean period. - * Doing a set can result in lost decrements, slowing the - * clean advance. Since this a heuristic, this possible - * slowdown is OK. - * - * An alternative is to loop, advancing the counter by a - * clean period until the result is > 0. However, this could - * lead to several threads keeping another in the clean loop. - * This could be mitigated by limiting the number of times - * we stay in the loop. - */ - atomic_set(&wss.clean_counter, wss_clean_period); - - /* - * Uniquely grab the entry to clean and move to next. - * The current entry is always the lower bits of - * wss.clean_entry. The table size, wss.num_entries, - * is always a power-of-2. - */ - entry = (atomic_inc_return(&wss.clean_entry) - 1) - & (wss.num_entries - 1); - - /* clear the entry and count the bits */ - bits = xchg(&wss.entries[entry], 0); - weight = hweight64((u64)bits); - /* only adjust the contended total count if needed */ - if (weight) - atomic_sub(weight, &wss.total_count); - } -} - -/* - * Insert the given address into the working set array. - */ -static void wss_insert(void *address) -{ - u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss.pages_mask; - u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */ - u32 nr = page & (BITS_PER_LONG - 1); - - if (!test_and_set_bit(nr, &wss.entries[entry])) - atomic_inc(&wss.total_count); - - wss_advance_clean_counter(); -} - -/* - * Is the working set larger than the threshold? - */ -static inline bool wss_exceeds_threshold(void) -{ - return atomic_read(&wss.total_count) >= wss.threshold; -} - /* * Translate ib_wr_opcode into ib_wc_opcode. */ const enum ib_wc_opcode ib_hfi1_wc_opcode[] = { [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE, + [IB_WR_TID_RDMA_WRITE] = IB_WC_RDMA_WRITE, [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE, [IB_WR_SEND] = IB_WC_SEND, [IB_WR_SEND_WITH_IMM] = IB_WC_SEND, [IB_WR_RDMA_READ] = IB_WC_RDMA_READ, + [IB_WR_TID_RDMA_READ] = IB_WC_RDMA_READ, [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP, [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD, [IB_WR_SEND_WITH_INV] = IB_WC_SEND, @@ -346,6 +201,14 @@ [IB_OPCODE_RC_FETCH_ADD] = 12 + 8 + 28, [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE] = 12 + 8 + 4, [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = 12 + 8 + 4, + [IB_OPCODE_TID_RDMA_READ_REQ] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_READ_RESP] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_WRITE_REQ] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_WRITE_RESP] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_WRITE_DATA] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_WRITE_DATA_LAST] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_ACK] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_RESYNC] = 12 + 8 + 36, /* UC */ [IB_OPCODE_UC_SEND_FIRST] = 12 + 8, [IB_OPCODE_UC_SEND_MIDDLE] = 12 + 8, @@ -389,6 +252,17 @@ [IB_OPCODE_RC_FETCH_ADD] = &hfi1_rc_rcv, [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE] = &hfi1_rc_rcv, [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = &hfi1_rc_rcv, + + /* TID RDMA has separate handlers for different opcodes.*/ + [IB_OPCODE_TID_RDMA_WRITE_REQ] = &hfi1_rc_rcv_tid_rdma_write_req, + [IB_OPCODE_TID_RDMA_WRITE_RESP] = &hfi1_rc_rcv_tid_rdma_write_resp, + [IB_OPCODE_TID_RDMA_WRITE_DATA] = &hfi1_rc_rcv_tid_rdma_write_data, + [IB_OPCODE_TID_RDMA_WRITE_DATA_LAST] = &hfi1_rc_rcv_tid_rdma_write_data, + [IB_OPCODE_TID_RDMA_READ_REQ] = &hfi1_rc_rcv_tid_rdma_read_req, + [IB_OPCODE_TID_RDMA_READ_RESP] = &hfi1_rc_rcv_tid_rdma_read_resp, + [IB_OPCODE_TID_RDMA_RESYNC] = &hfi1_rc_rcv_tid_rdma_resync, + [IB_OPCODE_TID_RDMA_ACK] = &hfi1_rc_rcv_tid_rdma_ack, + /* UC */ [IB_OPCODE_UC_SEND_FIRST] = &hfi1_uc_rcv, [IB_OPCODE_UC_SEND_MIDDLE] = &hfi1_uc_rcv, @@ -436,79 +310,6 @@ */ __be64 ib_hfi1_sys_image_guid; -/** - * hfi1_copy_sge - copy data to SGE memory - * @ss: the SGE state - * @data: the data to copy - * @length: the length of the data - * @release: boolean to release MR - * @copy_last: do a separate copy of the last 8 bytes - */ -void hfi1_copy_sge( - struct rvt_sge_state *ss, - void *data, u32 length, - bool release, - bool copy_last) -{ - struct rvt_sge *sge = &ss->sge; - int i; - bool in_last = false; - bool cacheless_copy = false; - - if (sge_copy_mode == COPY_CACHELESS) { - cacheless_copy = length >= PAGE_SIZE; - } else if (sge_copy_mode == COPY_ADAPTIVE) { - if (length >= PAGE_SIZE) { - /* - * NOTE: this *assumes*: - * o The first vaddr is the dest. - * o If multiple pages, then vaddr is sequential. - */ - wss_insert(sge->vaddr); - if (length >= (2 * PAGE_SIZE)) - wss_insert(sge->vaddr + PAGE_SIZE); - - cacheless_copy = wss_exceeds_threshold(); - } else { - wss_advance_clean_counter(); - } - } - if (copy_last) { - if (length > 8) { - length -= 8; - } else { - copy_last = false; - in_last = true; - } - } - -again: - while (length) { - u32 len = rvt_get_sge_length(sge, length); - - WARN_ON_ONCE(len == 0); - if (unlikely(in_last)) { - /* enforce byte transfer ordering */ - for (i = 0; i < len; i++) - ((u8 *)sge->vaddr)[i] = ((u8 *)data)[i]; - } else if (cacheless_copy) { - cacheless_memcpy(sge->vaddr, data, len); - } else { - memcpy(sge->vaddr, data, len); - } - rvt_update_sge(ss, len, release); - data += len; - length -= len; - } - - if (copy_last) { - copy_last = false; - in_last = true; - length = 8; - goto again; - } -} - /* * Make sure the QP is ready and able to accept the given opcode. */ @@ -527,7 +328,7 @@ static u64 hfi1_fault_tx(struct rvt_qp *qp, u8 opcode, u64 pbc) { #ifdef CONFIG_FAULT_INJECTION - if ((opcode & IB_OPCODE_MSP) == IB_OPCODE_MSP) + if ((opcode & IB_OPCODE_MSP) == IB_OPCODE_MSP) { /* * In order to drop non-IB traffic we * set PbcInsertHrc to NONE (0x2). @@ -538,8 +339,9 @@ * packet will not be delivered to the * correct context. */ + pbc &= ~PBC_INSERT_HCRC_SMASK; pbc |= (u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT; - else + } else { /* * In order to drop regular verbs * traffic we set the PbcTestEbp @@ -549,8 +351,127 @@ * triggered and will be dropped. */ pbc |= PBC_TEST_EBP; + } #endif return pbc; +} + +static opcode_handler tid_qp_ok(int opcode, struct hfi1_packet *packet) +{ + if (packet->qp->ibqp.qp_type != IB_QPT_RC || + !(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK)) + return NULL; + if ((opcode & RVT_OPCODE_QP_MASK) == IB_OPCODE_TID_RDMA) + return opcode_handler_tbl[opcode]; + return NULL; +} + +void hfi1_kdeth_eager_rcv(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + struct ib_header *hdr = packet->hdr; + u32 tlen = packet->tlen; + struct hfi1_pportdata *ppd = rcd->ppd; + struct hfi1_ibport *ibp = &ppd->ibport_data; + struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi; + opcode_handler opcode_handler; + unsigned long flags; + u32 qp_num; + int lnh; + u8 opcode; + + /* DW == LRH (2) + BTH (3) + KDETH (9) + CRC (1) */ + if (unlikely(tlen < 15 * sizeof(u32))) + goto drop; + + lnh = be16_to_cpu(hdr->lrh[0]) & 3; + if (lnh != HFI1_LRH_BTH) + goto drop; + + packet->ohdr = &hdr->u.oth; + trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf))); + + opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24); + inc_opstats(tlen, &rcd->opstats->stats[opcode]); + + /* verbs_qp can be picked up from any tid_rdma header struct */ + qp_num = be32_to_cpu(packet->ohdr->u.tid_rdma.r_req.verbs_qp) & + RVT_QPN_MASK; + + rcu_read_lock(); + packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num); + if (!packet->qp) + goto drop_rcu; + spin_lock_irqsave(&packet->qp->r_lock, flags); + opcode_handler = tid_qp_ok(opcode, packet); + if (likely(opcode_handler)) + opcode_handler(packet); + else + goto drop_unlock; + spin_unlock_irqrestore(&packet->qp->r_lock, flags); + rcu_read_unlock(); + + return; +drop_unlock: + spin_unlock_irqrestore(&packet->qp->r_lock, flags); +drop_rcu: + rcu_read_unlock(); +drop: + ibp->rvp.n_pkt_drops++; +} + +void hfi1_kdeth_expected_rcv(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + struct ib_header *hdr = packet->hdr; + u32 tlen = packet->tlen; + struct hfi1_pportdata *ppd = rcd->ppd; + struct hfi1_ibport *ibp = &ppd->ibport_data; + struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi; + opcode_handler opcode_handler; + unsigned long flags; + u32 qp_num; + int lnh; + u8 opcode; + + /* DW == LRH (2) + BTH (3) + KDETH (9) + CRC (1) */ + if (unlikely(tlen < 15 * sizeof(u32))) + goto drop; + + lnh = be16_to_cpu(hdr->lrh[0]) & 3; + if (lnh != HFI1_LRH_BTH) + goto drop; + + packet->ohdr = &hdr->u.oth; + trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf))); + + opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24); + inc_opstats(tlen, &rcd->opstats->stats[opcode]); + + /* verbs_qp can be picked up from any tid_rdma header struct */ + qp_num = be32_to_cpu(packet->ohdr->u.tid_rdma.r_rsp.verbs_qp) & + RVT_QPN_MASK; + + rcu_read_lock(); + packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num); + if (!packet->qp) + goto drop_rcu; + spin_lock_irqsave(&packet->qp->r_lock, flags); + opcode_handler = tid_qp_ok(opcode, packet); + if (likely(opcode_handler)) + opcode_handler(packet); + else + goto drop_unlock; + spin_unlock_irqrestore(&packet->qp->r_lock, flags); + rcu_read_unlock(); + + return; +drop_unlock: + spin_unlock_irqrestore(&packet->qp->r_lock, flags); +drop_rcu: + rcu_read_unlock(); +drop: + ibp->rvp.n_pkt_drops++; } static int hfi1_do_pkey_check(struct hfi1_packet *packet) @@ -713,11 +634,13 @@ spin_lock(&qp->s_lock); if (tx->wqe) { - hfi1_send_complete(qp, tx->wqe, IB_WC_SUCCESS); + rvt_send_complete(qp, tx->wqe, IB_WC_SUCCESS); } else if (qp->ibqp.qp_type == IB_QPT_RC) { struct hfi1_opa_header *hdr; hdr = &tx->phdr.hdr; + if (unlikely(status == SDMA_TXREQ_S_ABORTED)) + hfi1_rc_verbs_aborted(qp, hdr); hfi1_rc_send_complete(qp, hdr); } spin_unlock(&qp->s_lock); @@ -725,11 +648,28 @@ hfi1_put_txreq(tx); } +void hfi1_wait_kmem(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct ib_qp *ibqp = &qp->ibqp; + struct ib_device *ibdev = ibqp->device; + struct hfi1_ibdev *dev = to_idev(ibdev); + + if (list_empty(&priv->s_iowait.list)) { + if (list_empty(&dev->memwait)) + mod_timer(&dev->mem_timer, jiffies + 1); + qp->s_flags |= RVT_S_WAIT_KMEM; + list_add_tail(&priv->s_iowait.list, &dev->memwait); + priv->s_iowait.lock = &dev->iowait_lock; + trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM); + rvt_get_qp(qp); + } +} + static int wait_kmem(struct hfi1_ibdev *dev, struct rvt_qp *qp, struct hfi1_pkt_state *ps) { - struct hfi1_qp_priv *priv = qp->priv; unsigned long flags; int ret = 0; @@ -737,18 +677,10 @@ if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { write_seqlock(&dev->iowait_lock); list_add_tail(&ps->s_txreq->txreq.list, - &priv->s_iowait.tx_head); - if (list_empty(&priv->s_iowait.list)) { - if (list_empty(&dev->memwait)) - mod_timer(&dev->mem_timer, jiffies + 1); - qp->s_flags |= RVT_S_WAIT_KMEM; - list_add_tail(&priv->s_iowait.list, &dev->memwait); - priv->s_iowait.lock = &dev->iowait_lock; - trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM); - rvt_get_qp(qp); - } + &ps->wait->tx_head); + hfi1_wait_kmem(qp); write_sequnlock(&dev->iowait_lock); - qp->s_flags &= ~RVT_S_BUSY; + hfi1_qp_unbusy(qp, ps->wait); ret = -EBUSY; } spin_unlock_irqrestore(&qp->s_lock, flags); @@ -774,11 +706,7 @@ int ret = 0; while (length) { - len = ss->sge.length; - if (len > length) - len = length; - if (len > ss->sge.sge_length) - len = ss->sge.sge_length; + len = rvt_get_sge_length(&ss->sge, length); WARN_ON_ONCE(len == 0); ret = sdma_txadd_kvaddr( sde->dd, @@ -892,11 +820,20 @@ /* add icrc, lt byte, and padding to flit */ if (extra_bytes) - ret = sdma_txadd_daddr(sde->dd, &tx->txreq, - sde->dd->sdma_pad_phys, extra_bytes); + ret = sdma_txadd_daddr(sde->dd, &tx->txreq, sde->dd->sdma_pad_phys, + extra_bytes); bail_txadd: return ret; +} + +static u64 update_hcrc(u8 opcode, u64 pbc) +{ + if ((opcode & IB_OPCODE_TID_RDMA) == IB_OPCODE_TID_RDMA) { + pbc &= ~PBC_INSERT_HCRC_SMASK; + pbc |= (u64)PBC_IHCRC_LKDETH << PBC_INSERT_HCRC_SHIFT; + } + return pbc; } int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps, @@ -937,21 +874,24 @@ else pbc |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT); - if (unlikely(hfi1_dbg_should_fault_tx(qp, ps->opcode))) - pbc = hfi1_fault_tx(qp, ps->opcode, pbc); pbc = create_pbc(ppd, pbc, qp->srate_mbps, vl, plen); + + if (unlikely(hfi1_dbg_should_fault_tx(qp, ps->opcode))) + pbc = hfi1_fault_tx(qp, ps->opcode, pbc); + else + /* Update HCRC based on packet opcode */ + pbc = update_hcrc(ps->opcode, pbc); } tx->wqe = qp->s_wqe; ret = build_verbs_tx_desc(tx->sde, len, tx, ahg_info, pbc); if (unlikely(ret)) goto bail_build; } - ret = sdma_send_txreq(tx->sde, &priv->s_iowait, &tx->txreq, - ps->pkts_sent); + ret = sdma_send_txreq(tx->sde, ps->wait, &tx->txreq, ps->pkts_sent); if (unlikely(ret < 0)) { if (ret == -ECOMM) goto bail_ecomm; @@ -987,7 +927,6 @@ { struct hfi1_qp_priv *priv = qp->priv; struct hfi1_devdata *dd = sc->dd; - struct hfi1_ibdev *dev = &dd->verbs_dev; unsigned long flags; int ret = 0; @@ -999,9 +938,9 @@ */ spin_lock_irqsave(&qp->s_lock, flags); if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { - write_seqlock(&dev->iowait_lock); + write_seqlock(&sc->waitlock); list_add_tail(&ps->s_txreq->txreq.list, - &priv->s_iowait.tx_head); + &ps->wait->tx_head); if (list_empty(&priv->s_iowait.list)) { struct hfi1_ibdev *dev = &dd->verbs_dev; int was_empty; @@ -1010,17 +949,18 @@ dev->n_piodrain += !!(flag & HFI1_S_WAIT_PIO_DRAIN); qp->s_flags |= flag; was_empty = list_empty(&sc->piowait); + iowait_get_priority(&priv->s_iowait); iowait_queue(ps->pkts_sent, &priv->s_iowait, &sc->piowait); - priv->s_iowait.lock = &dev->iowait_lock; + priv->s_iowait.lock = &sc->waitlock; trace_hfi1_qpsleep(qp, RVT_S_WAIT_PIO); rvt_get_qp(qp); /* counting: only call wantpiobuf_intr if first user */ if (was_empty) hfi1_sc_wantpiobuf_intr(sc, 1); } - write_sequnlock(&dev->iowait_lock); - qp->s_flags &= ~RVT_S_BUSY; + write_sequnlock(&sc->waitlock); + hfi1_qp_unbusy(qp, ps->wait); ret = -EBUSY; } spin_unlock_irqrestore(&qp->s_lock, flags); @@ -1091,14 +1031,17 @@ else pbc |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT); + pbc = create_pbc(ppd, pbc, qp->srate_mbps, vl, plen); if (unlikely(hfi1_dbg_should_fault_tx(qp, ps->opcode))) pbc = hfi1_fault_tx(qp, ps->opcode, pbc); - pbc = create_pbc(ppd, pbc, qp->srate_mbps, vl, plen); + else + /* Update HCRC based on packet opcode */ + pbc = update_hcrc(ps->opcode, pbc); } if (cb) iowait_pio_inc(&priv->s_iowait); pbuf = sc_buffer_alloc(sc, plen, cb, qp); - if (unlikely(IS_ERR_OR_NULL(pbuf))) { + if (IS_ERR_OR_NULL(pbuf)) { if (cb) verbs_pio_complete(qp, 0); if (IS_ERR(pbuf)) { @@ -1137,12 +1080,8 @@ if (ss) { while (len) { void *addr = ss->sge.vaddr; - u32 slen = ss->sge.length; + u32 slen = rvt_get_sge_length(&ss->sge, len); - if (slen > len) - slen = len; - if (slen > ss->sge.sge_length) - slen = ss->sge.sge_length; rvt_update_sge(ss, slen, false); seg_pio_copy_mid(pbuf, addr, slen); len -= slen; @@ -1161,15 +1100,15 @@ &ps->s_txreq->phdr.hdr, ib_is_sc5(sc5)); pio_bail: + spin_lock_irqsave(&qp->s_lock, flags); if (qp->s_wqe) { - spin_lock_irqsave(&qp->s_lock, flags); - hfi1_send_complete(qp, qp->s_wqe, wc_status); - spin_unlock_irqrestore(&qp->s_lock, flags); + rvt_send_complete(qp, qp->s_wqe, wc_status); } else if (qp->ibqp.qp_type == IB_QPT_RC) { - spin_lock_irqsave(&qp->s_lock, flags); + if (unlikely(wc_status == IB_WC_GENERAL_ERR)) + hfi1_rc_verbs_aborted(qp, &ps->s_txreq->phdr.hdr); hfi1_rc_send_complete(qp, &ps->s_txreq->phdr.hdr); - spin_unlock_irqrestore(&qp->s_lock, flags); } + spin_unlock_irqrestore(&qp->s_lock, flags); ret = 0; @@ -1289,15 +1228,16 @@ case IB_QPT_UD: break; case IB_QPT_UC: - case IB_QPT_RC: { + case IB_QPT_RC: + priv->s_running_pkt_size = + (tx->s_cur_size + priv->s_running_pkt_size) / 2; if (piothreshold && - tx->s_cur_size <= min(piothreshold, qp->pmtu) && + priv->s_running_pkt_size <= min(piothreshold, qp->pmtu) && (BIT(ps->opcode & OPMASK) & pio_opmask[ps->opcode >> 5]) && iowait_sdma_pending(&priv->s_iowait) == 0 && !sdma_txreq_built(&tx->txreq)) return dd->process_pio_send; break; - } default: break; } @@ -1370,7 +1310,7 @@ hfi1_cdbg(PIO, "%s() Failed. Completing with err", __func__); spin_lock_irqsave(&qp->s_lock, flags); - hfi1_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR); + rvt_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR); spin_unlock_irqrestore(&qp->s_lock, flags); } return -EINVAL; @@ -1403,7 +1343,7 @@ IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN | IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE | IB_DEVICE_MEM_MGT_EXTENSIONS | - IB_DEVICE_RDMA_NETDEV_OPA_VNIC; + IB_DEVICE_RDMA_NETDEV_OPA; rdi->dparms.props.page_size_cap = PAGE_SIZE; rdi->dparms.props.vendor_id = dd->oui1 << 16 | dd->oui2 << 8 | dd->oui3; rdi->dparms.props.vendor_part_id = dd->pcidev->device; @@ -1412,14 +1352,15 @@ rdi->dparms.props.max_mr_size = U64_MAX; rdi->dparms.props.max_fast_reg_page_list_len = UINT_MAX; rdi->dparms.props.max_qp = hfi1_max_qps; - rdi->dparms.props.max_qp_wr = hfi1_max_qp_wrs; + rdi->dparms.props.max_qp_wr = + (hfi1_max_qp_wrs >= HFI1_QP_WQE_INVALID ? + HFI1_QP_WQE_INVALID - 1 : hfi1_max_qp_wrs); rdi->dparms.props.max_send_sge = hfi1_max_sges; rdi->dparms.props.max_recv_sge = hfi1_max_sges; rdi->dparms.props.max_sge_rd = hfi1_max_sges; rdi->dparms.props.max_cq = hfi1_max_cqs; rdi->dparms.props.max_ah = hfi1_max_ahs; rdi->dparms.props.max_cqe = hfi1_max_cqes; - rdi->dparms.props.max_map_per_fmr = 32767; rdi->dparms.props.max_pd = hfi1_max_pds; rdi->dparms.props.max_qp_rd_atom = HFI1_MAX_RDMA_ATOMIC; rdi->dparms.props.max_qp_init_rd_atom = 255; @@ -1483,7 +1424,7 @@ props->gid_tbl_len = HFI1_GUIDS_PER_PORT; props->active_width = (u8)opa_width_to_ib(ppd->link_width_active); /* see rate_show() in ib core/sysfs.c */ - props->active_speed = (u8)opa_speed_to_ib(ppd->link_speed_active); + props->active_speed = opa_speed_to_ib(ppd->link_speed_active); props->max_vl_num = ppd->vls_supported; /* Once we are a "first class" citizen and have added the OPA MTUs to @@ -1498,6 +1439,7 @@ 4096 : hfi1_max_mtu), IB_MTU_4096); props->active_mtu = !valid_ib_mtu(ppd->ibmtu) ? props->max_mtu : mtu_to_enum(ppd->ibmtu, IB_MTU_4096); + props->phys_mtu = hfi1_max_mtu; return 0; } @@ -1802,15 +1744,15 @@ static u64 hfi1_sps_ints(void) { - unsigned long flags; + unsigned long index, flags; struct hfi1_devdata *dd; u64 sps_ints = 0; - spin_lock_irqsave(&hfi1_devs_lock, flags); - list_for_each_entry(dd, &hfi1_dev_list, list) { + xa_lock_irqsave(&hfi1_dev_table, flags); + xa_for_each(&hfi1_dev_table, index, dd) { sps_ints += get_all_cpu_total(dd->int_counter); } - spin_unlock_irqrestore(&hfi1_devs_lock, flags); + xa_unlock_irqrestore(&hfi1_dev_table, flags); return sps_ints; } @@ -1839,6 +1781,21 @@ memcpy(stats->value, values, count * sizeof(u64)); return count; } + +static const struct ib_device_ops hfi1_dev_ops = { + .owner = THIS_MODULE, + .driver_id = RDMA_DRIVER_HFI1, + + .alloc_hw_stats = alloc_hw_stats, + .alloc_rdma_netdev = hfi1_vnic_alloc_rn, + .get_dev_fw_str = hfi1_get_dev_fw_str, + .get_hw_stats = get_hw_stats, + .init_port = hfi1_create_port_files, + .modify_device = modify_device, + /* keep process mad in the driver */ + .process_mad = hfi1_process_mad, + .rdma_netdev_get_params = hfi1_ipoib_rn_get_params, +}; /** * hfi1_register_ib_device - register our device with the infiniband core @@ -1880,17 +1837,10 @@ */ if (!ib_hfi1_sys_image_guid) ib_hfi1_sys_image_guid = ibdev->node_guid; - ibdev->owner = THIS_MODULE; ibdev->phys_port_cnt = dd->num_pports; ibdev->dev.parent = &dd->pcidev->dev; - ibdev->modify_device = modify_device; - ibdev->alloc_hw_stats = alloc_hw_stats; - ibdev->get_hw_stats = get_hw_stats; - ibdev->alloc_rdma_netdev = hfi1_vnic_alloc_rn; - /* keep process mad in the driver */ - ibdev->process_mad = hfi1_process_mad; - ibdev->get_dev_fw_str = hfi1_get_dev_fw_str; + ib_set_device_ops(ibdev, &hfi1_dev_ops); strlcpy(ibdev->node_desc, init_utsname()->nodename, sizeof(ibdev->node_desc)); @@ -1898,7 +1848,6 @@ /* * Fill in rvt info object. */ - dd->verbs_dev.rdi.driver_f.port_callback = hfi1_create_port_files; dd->verbs_dev.rdi.driver_f.get_pci_dev = get_pci_dev; dd->verbs_dev.rdi.driver_f.check_ah = hfi1_check_ah; dd->verbs_dev.rdi.driver_f.notify_new_ah = hfi1_notify_new_ah; @@ -1916,9 +1865,8 @@ dd->verbs_dev.rdi.dparms.qpn_start = 0; dd->verbs_dev.rdi.dparms.qpn_inc = 1; dd->verbs_dev.rdi.dparms.qos_shift = dd->qos_shift; - dd->verbs_dev.rdi.dparms.qpn_res_start = kdeth_qp << 16; - dd->verbs_dev.rdi.dparms.qpn_res_end = - dd->verbs_dev.rdi.dparms.qpn_res_start + 65535; + dd->verbs_dev.rdi.dparms.qpn_res_start = RVT_KDETH_QP_BASE; + dd->verbs_dev.rdi.dparms.qpn_res_end = RVT_AIP_QP_MAX; dd->verbs_dev.rdi.dparms.max_rdma_atomic = HFI1_MAX_RDMA_ATOMIC; dd->verbs_dev.rdi.dparms.psn_mask = PSN_MASK; dd->verbs_dev.rdi.dparms.psn_shift = PSN_SHIFT; @@ -1928,6 +1876,7 @@ dd->verbs_dev.rdi.dparms.max_mad_size = OPA_MGMT_MAD_SIZE; dd->verbs_dev.rdi.driver_f.qp_priv_alloc = qp_priv_alloc; + dd->verbs_dev.rdi.driver_f.qp_priv_init = hfi1_qp_priv_init; dd->verbs_dev.rdi.driver_f.qp_priv_free = qp_priv_free; dd->verbs_dev.rdi.driver_f.free_all_qps = free_all_qps; dd->verbs_dev.rdi.driver_f.notify_qp_reset = notify_qp_reset; @@ -1945,7 +1894,7 @@ dd->verbs_dev.rdi.driver_f.check_modify_qp = hfi1_check_modify_qp; dd->verbs_dev.rdi.driver_f.modify_qp = hfi1_modify_qp; dd->verbs_dev.rdi.driver_f.notify_restart_rc = hfi1_restart_rc; - dd->verbs_dev.rdi.driver_f.check_send_wqe = hfi1_check_send_wqe; + dd->verbs_dev.rdi.driver_f.setup_wqe = hfi1_setup_wqe; dd->verbs_dev.rdi.driver_f.comp_vect_cpu_lookup = hfi1_comp_vect_mappings_lookup; @@ -1958,9 +1907,17 @@ dd->verbs_dev.rdi.dparms.lkey_table_size = hfi1_lkey_table_size; dd->verbs_dev.rdi.dparms.nports = dd->num_pports; dd->verbs_dev.rdi.dparms.npkeys = hfi1_get_npkeys(dd); + dd->verbs_dev.rdi.dparms.sge_copy_mode = sge_copy_mode; + dd->verbs_dev.rdi.dparms.wss_threshold = wss_threshold; + dd->verbs_dev.rdi.dparms.wss_clean_period = wss_clean_period; + dd->verbs_dev.rdi.dparms.reserved_operations = 1; + dd->verbs_dev.rdi.dparms.extra_rdma_atomic = HFI1_TID_RDMA_WRITE_CNT; /* post send table */ dd->verbs_dev.rdi.post_parms = hfi1_post_parms; + + /* opcode translation table */ + dd->verbs_dev.rdi.wc_opcode = ib_hfi1_wc_opcode; ppd = dd->pport; for (i = 0; i < dd->num_pports; i++, ppd++) @@ -1969,7 +1926,10 @@ i, ppd->pkeys); - ret = rvt_register_device(&dd->verbs_dev.rdi, RDMA_DRIVER_HFI1); + rdma_set_device_sysfs_group(&dd->verbs_dev.rdi.ibdev, + &ib_hfi1_attr_group); + + ret = rvt_register_device(&dd->verbs_dev.rdi); if (ret) goto err_verbs_txreq; -- Gitblit v1.6.2