/*
 * Copyright(c) 2015 - 2017 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *  - Neither the name of Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <rdma/ib_mad.h>
#include <rdma/ib_user_verbs.h>
#include <linux/io.h>
#include <linux/module.h>
#include <linux/utsname.h>
#include <linux/rculist.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <rdma/opa_addr.h>

#include "hfi.h"
#include "common.h"
#include "device.h"
#include "trace.h"
#include "qp.h"
#include "verbs_txreq.h"
#include "debugfs.h"
#include "vnic.h"

static unsigned int hfi1_lkey_table_size = 16;
module_param_named(lkey_table_size, hfi1_lkey_table_size, uint,
		   S_IRUGO);
MODULE_PARM_DESC(lkey_table_size,
		 "LKEY table size in bits (2^n, 1 <= n <= 23)");

static unsigned int hfi1_max_pds = 0xFFFF;
module_param_named(max_pds, hfi1_max_pds, uint, S_IRUGO);
MODULE_PARM_DESC(max_pds,
		 "Maximum number of protection domains to support");

static unsigned int hfi1_max_ahs = 0xFFFF;
module_param_named(max_ahs, hfi1_max_ahs, uint, S_IRUGO);
MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support");

unsigned int hfi1_max_cqes = 0x2FFFFF;
module_param_named(max_cqes, hfi1_max_cqes, uint, S_IRUGO);
MODULE_PARM_DESC(max_cqes,
		 "Maximum number of completion queue entries to support");

unsigned int hfi1_max_cqs = 0x1FFFF;
module_param_named(max_cqs, hfi1_max_cqs, uint, S_IRUGO);
MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support");

unsigned int hfi1_max_qp_wrs = 0x3FFF;
module_param_named(max_qp_wrs, hfi1_max_qp_wrs, uint, S_IRUGO);
MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support");

unsigned int hfi1_max_qps = 32768;
module_param_named(max_qps, hfi1_max_qps, uint, S_IRUGO);
MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support");

unsigned int hfi1_max_sges = 0x60;
module_param_named(max_sges, hfi1_max_sges, uint, S_IRUGO);
MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support");

unsigned int hfi1_max_mcast_grps = 16384;
module_param_named(max_mcast_grps, hfi1_max_mcast_grps, uint, S_IRUGO);
MODULE_PARM_DESC(max_mcast_grps,
		 "Maximum number of multicast groups to support");

unsigned int hfi1_max_mcast_qp_attached = 16;
module_param_named(max_mcast_qp_attached, hfi1_max_mcast_qp_attached,
		   uint, S_IRUGO);
MODULE_PARM_DESC(max_mcast_qp_attached,
		 "Maximum number of attached QPs to support");

unsigned int hfi1_max_srqs = 1024;
module_param_named(max_srqs, hfi1_max_srqs, uint, S_IRUGO);
MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support");

unsigned int hfi1_max_srq_sges = 128;
module_param_named(max_srq_sges, hfi1_max_srq_sges, uint, S_IRUGO);
MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support");

unsigned int hfi1_max_srq_wrs = 0x1FFFF;
module_param_named(max_srq_wrs, hfi1_max_srq_wrs, uint, S_IRUGO);
MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs to support");

unsigned short piothreshold = 256;
module_param(piothreshold, ushort, S_IRUGO);
MODULE_PARM_DESC(piothreshold, "size used to determine sdma vs. pio");

#define COPY_CACHELESS 1
#define COPY_ADAPTIVE  2
static unsigned int sge_copy_mode;
module_param(sge_copy_mode, uint, S_IRUGO);
MODULE_PARM_DESC(sge_copy_mode,
		 "Verbs copy mode: 0 use memcpy, 1 use cacheless copy, 2 adapt based on WSS");

static void verbs_sdma_complete(
	struct sdma_txreq *cookie,
	int status);

static int pio_wait(struct rvt_qp *qp,
		    struct send_context *sc,
		    struct hfi1_pkt_state *ps,
		    u32 flag);

/* Length of buffer to create verbs txreq cache name */
#define TXREQ_NAME_LEN 24

static uint wss_threshold;
module_param(wss_threshold, uint, S_IRUGO);
MODULE_PARM_DESC(wss_threshold, "Percentage (1-100) of LLC to use as a threshold for a cacheless copy");
static uint wss_clean_period = 256;
module_param(wss_clean_period, uint, S_IRUGO);
MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the page copy table is cleaned");

/* memory working set size */
struct hfi1_wss {
	unsigned long *entries;
	atomic_t total_count;
	atomic_t clean_counter;
	atomic_t clean_entry;

	int threshold;
	int num_entries;
	long pages_mask;
};

static struct hfi1_wss wss;

int hfi1_wss_init(void)
{
	long llc_size;
	long llc_bits;
	long table_size;
	long table_bits;

	/* check for a valid percent range - default to 80 if none or invalid */
	if (wss_threshold < 1 || wss_threshold > 100)
		wss_threshold = 80;
	/* reject a wildly large period */
	if (wss_clean_period > 1000000)
		wss_clean_period = 256;
	/* reject a zero period */
	if (wss_clean_period == 0)
		wss_clean_period = 1;

	/*
	 * Calculate the table size - the next power of 2 larger than the
	 * LLC size.  LLC size is in KiB.
	 */
	llc_size = wss_llc_size() * 1024;
	table_size = roundup_pow_of_two(llc_size);

	/* one bit per page in rounded up table */
	llc_bits = llc_size / PAGE_SIZE;
	table_bits = table_size / PAGE_SIZE;
	wss.pages_mask = table_bits - 1;
	wss.num_entries = table_bits / BITS_PER_LONG;

	wss.threshold = (llc_bits * wss_threshold) / 100;
	if (wss.threshold == 0)
		wss.threshold = 1;

	atomic_set(&wss.clean_counter, wss_clean_period);

	wss.entries = kcalloc(wss.num_entries, sizeof(*wss.entries),
			      GFP_KERNEL);
	if (!wss.entries) {
		hfi1_wss_exit();
		return -ENOMEM;
	}

	return 0;
}
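
/*
 * Sizing example (illustrative, assumes 4 KiB pages): a 30 MiB LLC is
 * rounded up to a 32 MiB table, i.e. 8192 page-sized bits, so
 * wss.num_entries is 128 longs on a 64-bit build and wss.pages_mask is
 * 8191.  The threshold is computed from the true LLC size (llc_bits),
 * not the rounded table size.
 */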

void hfi1_wss_exit(void)
{
	/* coded to handle partially initialized and repeat callers */
	kfree(wss.entries);
	wss.entries = NULL;
}

/*
 * Advance the clean counter.  When the clean period has expired,
 * clean an entry.
 *
 * This is implemented in atomics to avoid locking.  Because multiple
 * variables are involved, it can be racy which can lead to slightly
 * inaccurate information.  Since this is only a heuristic, this is
 * OK.  Any inaccuracies will clean themselves out as the counter
 * advances.  That said, it is unlikely the entry clean operation will
 * race - the next possible racer will not start until the next clean
 * period.
 *
 * The clean counter is implemented as a decrement to zero.  When zero
 * is reached an entry is cleaned.
 */
static void wss_advance_clean_counter(void)
{
	int entry;
	int weight;
	unsigned long bits;

	/* become the cleaner if we decrement the counter to zero */
	if (atomic_dec_and_test(&wss.clean_counter)) {
		/*
		 * Set, not add, the clean period.  This avoids an issue
		 * where the counter could decrement below the clean period.
		 * Doing a set can result in lost decrements, slowing the
		 * clean advance.  Since this a heuristic, this possible
		 * slowdown is OK.
		 *
		 * An alternative is to loop, advancing the counter by a
		 * clean period until the result is > 0. However, this could
		 * lead to several threads keeping another in the clean loop.
		 * This could be mitigated by limiting the number of times
		 * we stay in the loop.
		 */
		atomic_set(&wss.clean_counter, wss_clean_period);

		/*
		 * Uniquely grab the entry to clean and move to next.
		 * The current entry is always the lower bits of
		 * wss.clean_entry.  The table size, wss.num_entries,
		 * is always a power-of-2.
		 */
		entry = (atomic_inc_return(&wss.clean_entry) - 1)
			& (wss.num_entries - 1);

		/* clear the entry and count the bits */
		bits = xchg(&wss.entries[entry], 0);
		weight = hweight64((u64)bits);
		/* only adjust the contended total count if needed */
		if (weight)
			atomic_sub(weight, &wss.total_count);
	}
}

/*
 * Insert the given address into the working set array.
 */
static void wss_insert(void *address)
{
	u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss.pages_mask;
	u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */
	u32 nr = page & (BITS_PER_LONG - 1);

	if (!test_and_set_bit(nr, &wss.entries[entry]))
		atomic_inc(&wss.total_count);

	wss_advance_clean_counter();
}

/*
 * Is the working set larger than the threshold?
 */
static inline bool wss_exceeds_threshold(void)
{
	return atomic_read(&wss.total_count) >= wss.threshold;
}

/*
 * Translate ib_wr_opcode into ib_wc_opcode.
 */
const enum ib_wc_opcode ib_hfi1_wc_opcode[] = {
	[IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
	[IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
	[IB_WR_SEND] = IB_WC_SEND,
	[IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
	[IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
	[IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
	[IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD,
	[IB_WR_SEND_WITH_INV] = IB_WC_SEND,
	[IB_WR_LOCAL_INV] = IB_WC_LOCAL_INV,
	[IB_WR_REG_MR] = IB_WC_REG_MR
};

/*
 * Length of header by opcode, 0 --> not supported
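 *
 * Each length is LRH (8 bytes) + BTH (12 bytes) plus any extended
 * transport headers: RETH (16), AETH (4), immediate data (4),
 * AtomicETH (28), AtomicAckETH (8), DETH (8) or IETH (4).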
 */
const u8 hdr_len_by_opcode[256] = {
	/* RC */
	[IB_OPCODE_RC_SEND_FIRST]                     = 12 + 8,
	[IB_OPCODE_RC_SEND_MIDDLE]                    = 12 + 8,
	[IB_OPCODE_RC_SEND_LAST]                      = 12 + 8,
	[IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE]       = 12 + 8 + 4,
	[IB_OPCODE_RC_SEND_ONLY]                      = 12 + 8,
	[IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 4,
	[IB_OPCODE_RC_RDMA_WRITE_FIRST]               = 12 + 8 + 16,
	[IB_OPCODE_RC_RDMA_WRITE_MIDDLE]              = 12 + 8,
	[IB_OPCODE_RC_RDMA_WRITE_LAST]                = 12 + 8,
	[IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
	[IB_OPCODE_RC_RDMA_WRITE_ONLY]                = 12 + 8 + 16,
	[IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20,
	[IB_OPCODE_RC_RDMA_READ_REQUEST]              = 12 + 8 + 16,
	[IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST]       = 12 + 8 + 4,
	[IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE]      = 12 + 8,
	[IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST]        = 12 + 8 + 4,
	[IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY]        = 12 + 8 + 4,
	[IB_OPCODE_RC_ACKNOWLEDGE]                    = 12 + 8 + 4,
	[IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE]             = 12 + 8 + 4 + 8,
	[IB_OPCODE_RC_COMPARE_SWAP]                   = 12 + 8 + 28,
	[IB_OPCODE_RC_FETCH_ADD]                      = 12 + 8 + 28,
	[IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE]      = 12 + 8 + 4,
	[IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE]      = 12 + 8 + 4,
	/* UC */
	[IB_OPCODE_UC_SEND_FIRST]                     = 12 + 8,
	[IB_OPCODE_UC_SEND_MIDDLE]                    = 12 + 8,
	[IB_OPCODE_UC_SEND_LAST]                      = 12 + 8,
	[IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE]       = 12 + 8 + 4,
	[IB_OPCODE_UC_SEND_ONLY]                      = 12 + 8,
	[IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 4,
	[IB_OPCODE_UC_RDMA_WRITE_FIRST]               = 12 + 8 + 16,
	[IB_OPCODE_UC_RDMA_WRITE_MIDDLE]              = 12 + 8,
	[IB_OPCODE_UC_RDMA_WRITE_LAST]                = 12 + 8,
	[IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
	[IB_OPCODE_UC_RDMA_WRITE_ONLY]                = 12 + 8 + 16,
	[IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20,
	/* UD */
	[IB_OPCODE_UD_SEND_ONLY]                      = 12 + 8 + 8,
	[IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 12
};

static const opcode_handler opcode_handler_tbl[256] = {
	/* RC */
	[IB_OPCODE_RC_SEND_FIRST]                     = &hfi1_rc_rcv,
	[IB_OPCODE_RC_SEND_MIDDLE]                    = &hfi1_rc_rcv,
	[IB_OPCODE_RC_SEND_LAST]                      = &hfi1_rc_rcv,
	[IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE]       = &hfi1_rc_rcv,
	[IB_OPCODE_RC_SEND_ONLY]                      = &hfi1_rc_rcv,
	[IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_rc_rcv,
	[IB_OPCODE_RC_RDMA_WRITE_FIRST]               = &hfi1_rc_rcv,
	[IB_OPCODE_RC_RDMA_WRITE_MIDDLE]              = &hfi1_rc_rcv,
	[IB_OPCODE_RC_RDMA_WRITE_LAST]                = &hfi1_rc_rcv,
	[IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_rc_rcv,
	[IB_OPCODE_RC_RDMA_WRITE_ONLY]                = &hfi1_rc_rcv,
	[IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_rc_rcv,
	[IB_OPCODE_RC_RDMA_READ_REQUEST]              = &hfi1_rc_rcv,
	[IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST]       = &hfi1_rc_rcv,
	[IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE]      = &hfi1_rc_rcv,
	[IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST]        = &hfi1_rc_rcv,
	[IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY]        = &hfi1_rc_rcv,
	[IB_OPCODE_RC_ACKNOWLEDGE]                    = &hfi1_rc_rcv,
	[IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE]             = &hfi1_rc_rcv,
	[IB_OPCODE_RC_COMPARE_SWAP]                   = &hfi1_rc_rcv,
	[IB_OPCODE_RC_FETCH_ADD]                      = &hfi1_rc_rcv,
	[IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE]      = &hfi1_rc_rcv,
	[IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE]      = &hfi1_rc_rcv,
	/* UC */
	[IB_OPCODE_UC_SEND_FIRST]                     = &hfi1_uc_rcv,
	[IB_OPCODE_UC_SEND_MIDDLE]                    = &hfi1_uc_rcv,
	[IB_OPCODE_UC_SEND_LAST]                      = &hfi1_uc_rcv,
	[IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE]       = &hfi1_uc_rcv,
	[IB_OPCODE_UC_SEND_ONLY]                      = &hfi1_uc_rcv,
	[IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_uc_rcv,
	[IB_OPCODE_UC_RDMA_WRITE_FIRST]               = &hfi1_uc_rcv,
	[IB_OPCODE_UC_RDMA_WRITE_MIDDLE]              = &hfi1_uc_rcv,
	[IB_OPCODE_UC_RDMA_WRITE_LAST]                = &hfi1_uc_rcv,
	[IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_uc_rcv,
	[IB_OPCODE_UC_RDMA_WRITE_ONLY]                = &hfi1_uc_rcv,
	[IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_uc_rcv,
	/* UD */
	[IB_OPCODE_UD_SEND_ONLY]                      = &hfi1_ud_rcv,
	[IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_ud_rcv,
	/* CNP */
	[IB_OPCODE_CNP]				      = &hfi1_cnp_rcv
};

#define OPMASK 0x1f
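
/*
 * Per-transport bitmask (indexed by opcode bits 7:5) of the single-packet
 * opcodes that get_send_routine() will consider for PIO send.
 */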

static const u32 pio_opmask[BIT(3)] = {
	/* RC */
	[IB_OPCODE_RC >> 5] =
		BIT(RC_OP(SEND_ONLY) & OPMASK) |
		BIT(RC_OP(SEND_ONLY_WITH_IMMEDIATE) & OPMASK) |
		BIT(RC_OP(RDMA_WRITE_ONLY) & OPMASK) |
		BIT(RC_OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE) & OPMASK) |
		BIT(RC_OP(RDMA_READ_REQUEST) & OPMASK) |
		BIT(RC_OP(ACKNOWLEDGE) & OPMASK) |
		BIT(RC_OP(ATOMIC_ACKNOWLEDGE) & OPMASK) |
		BIT(RC_OP(COMPARE_SWAP) & OPMASK) |
		BIT(RC_OP(FETCH_ADD) & OPMASK),
	/* UC */
	[IB_OPCODE_UC >> 5] =
		BIT(UC_OP(SEND_ONLY) & OPMASK) |
		BIT(UC_OP(SEND_ONLY_WITH_IMMEDIATE) & OPMASK) |
		BIT(UC_OP(RDMA_WRITE_ONLY) & OPMASK) |
		BIT(UC_OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE) & OPMASK),
};

/*
 * System image GUID.
 */
__be64 ib_hfi1_sys_image_guid;

/**
 * hfi1_copy_sge - copy data to SGE memory
 * @ss: the SGE state
 * @data: the data to copy
 * @length: the length of the data
 * @release: boolean to release MR
 * @copy_last: do a separate copy of the last 8 bytes
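 *   (the tail is copied last, byte by byte, so a ULP polling the end of
 *   an RDMA WRITE buffer does not see it before the preceding data)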
 */
void hfi1_copy_sge(
	struct rvt_sge_state *ss,
	void *data, u32 length,
	bool release,
	bool copy_last)
{
	struct rvt_sge *sge = &ss->sge;
	int i;
	bool in_last = false;
	bool cacheless_copy = false;

	if (sge_copy_mode == COPY_CACHELESS) {
		cacheless_copy = length >= PAGE_SIZE;
	} else if (sge_copy_mode == COPY_ADAPTIVE) {
		if (length >= PAGE_SIZE) {
			/*
			 * NOTE: this *assumes*:
			 * o The first vaddr is the dest.
			 * o If multiple pages, then vaddr is sequential.
			 */
			wss_insert(sge->vaddr);
			if (length >= (2 * PAGE_SIZE))
				wss_insert(sge->vaddr + PAGE_SIZE);

			cacheless_copy = wss_exceeds_threshold();
		} else {
			wss_advance_clean_counter();
		}
	}
	if (copy_last) {
		if (length > 8) {
			length -= 8;
		} else {
			copy_last = false;
			in_last = true;
		}
	}

again:
	while (length) {
		u32 len = rvt_get_sge_length(sge, length);

		WARN_ON_ONCE(len == 0);
		if (unlikely(in_last)) {
			/* enforce byte transfer ordering */
			for (i = 0; i < len; i++)
				((u8 *)sge->vaddr)[i] = ((u8 *)data)[i];
		} else if (cacheless_copy) {
			cacheless_memcpy(sge->vaddr, data, len);
		} else {
			memcpy(sge->vaddr, data, len);
		}
		rvt_update_sge(ss, len, release);
		data += len;
		length -= len;
	}

	if (copy_last) {
		copy_last = false;
		in_last = true;
		length = 8;
		goto again;
	}
}

/*
 * Make sure the QP is ready and able to accept the given opcode.
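 * The opcode's transport bits (RVT_OPCODE_QP_MASK) must match the QP's
 * allowed_ops; CNPs are accepted regardless of QP type.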
 */
static inline opcode_handler qp_ok(struct hfi1_packet *packet)
{
	if (!(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK))
		return NULL;
	if (((packet->opcode & RVT_OPCODE_QP_MASK) ==
	     packet->qp->allowed_ops) ||
	    (packet->opcode == IB_OPCODE_CNP))
		return opcode_handler_tbl[packet->opcode];

	return NULL;
}

static u64 hfi1_fault_tx(struct rvt_qp *qp, u8 opcode, u64 pbc)
{
#ifdef CONFIG_FAULT_INJECTION
	if ((opcode & IB_OPCODE_MSP) == IB_OPCODE_MSP)
		/*
		 * In order to drop non-IB traffic we
		 * set PbcInsertHrc to NONE (0x2).
		 * The packet will still be delivered
		 * to the receiving node but a
		 * KHdrHCRCErr (KDETH packet with a bad
		 * HCRC) will be triggered and the
		 * packet will not be delivered to the
		 * correct context.
		 */
		pbc |= (u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT;
	else
		/*
		 * In order to drop regular verbs
		 * traffic we set the PbcTestEbp
		 * flag. The packet will still be
		 * delivered to the receiving node but
		 * a 'late ebp error' will be
		 * triggered and will be dropped.
		 */
		pbc |= PBC_TEST_EBP;
#endif
	return pbc;
}

static int hfi1_do_pkey_check(struct hfi1_packet *packet)
{
	struct hfi1_ctxtdata *rcd = packet->rcd;
	struct hfi1_pportdata *ppd = rcd->ppd;
	struct hfi1_16b_header *hdr = packet->hdr;
	u16 pkey;

	/* Pkey check needed only for bypass packets */
	if (packet->etype != RHF_RCV_TYPE_BYPASS)
		return 0;

	/* Perform pkey check */
	pkey = hfi1_16B_get_pkey(hdr);
	return ingress_pkey_check(ppd, pkey, packet->sc,
				  packet->qp->s_pkey_index,
				  packet->slid, true);
}

static inline void hfi1_handle_packet(struct hfi1_packet *packet,
				      bool is_mcast)
{
	u32 qp_num;
	struct hfi1_ctxtdata *rcd = packet->rcd;
	struct hfi1_pportdata *ppd = rcd->ppd;
	struct hfi1_ibport *ibp = rcd_to_iport(rcd);
	struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi;
	opcode_handler packet_handler;
	unsigned long flags;

	inc_opstats(packet->tlen, &rcd->opstats->stats[packet->opcode]);

	if (unlikely(is_mcast)) {
		struct rvt_mcast *mcast;
		struct rvt_mcast_qp *p;

		if (!packet->grh)
			goto drop;
		mcast = rvt_mcast_find(&ibp->rvp,
				       &packet->grh->dgid,
				       opa_get_lid(packet->dlid, 9B));
		if (!mcast)
			goto drop;
		list_for_each_entry_rcu(p, &mcast->qp_list, list) {
			packet->qp = p->qp;
			if (hfi1_do_pkey_check(packet))
				goto drop;
			spin_lock_irqsave(&packet->qp->r_lock, flags);
			packet_handler = qp_ok(packet);
			if (likely(packet_handler))
				packet_handler(packet);
			else
				ibp->rvp.n_pkt_drops++;
			spin_unlock_irqrestore(&packet->qp->r_lock, flags);
		}
		/*
		 * Notify rvt_multicast_detach() if it is waiting for us
		 * to finish.
		 */
		if (atomic_dec_return(&mcast->refcount) <= 1)
			wake_up(&mcast->wait);
	} else {
		/* Get the destination QP number. */
		qp_num = ib_bth_get_qpn(packet->ohdr);
		rcu_read_lock();
		packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
		if (!packet->qp)
			goto unlock_drop;

		if (hfi1_do_pkey_check(packet))
			goto unlock_drop;

		if (unlikely(hfi1_dbg_fault_opcode(packet->qp, packet->opcode,
						   true)))
			goto unlock_drop;

		spin_lock_irqsave(&packet->qp->r_lock, flags);
		packet_handler = qp_ok(packet);
		if (likely(packet_handler))
			packet_handler(packet);
		else
			ibp->rvp.n_pkt_drops++;
		spin_unlock_irqrestore(&packet->qp->r_lock, flags);
		rcu_read_unlock();
	}
	return;
unlock_drop:
	rcu_read_unlock();
drop:
	ibp->rvp.n_pkt_drops++;
}

/**
 * hfi1_ib_rcv - process an incoming packet
 * @packet: data packet information
 *
 * This is called to process an incoming packet at interrupt level.
 */
void hfi1_ib_rcv(struct hfi1_packet *packet)
{
	struct hfi1_ctxtdata *rcd = packet->rcd;

	trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf)));
	hfi1_handle_packet(packet, hfi1_check_mcast(packet->dlid));
}

void hfi1_16B_rcv(struct hfi1_packet *packet)
{
	struct hfi1_ctxtdata *rcd = packet->rcd;

	trace_input_ibhdr(rcd->dd, packet, false);
	hfi1_handle_packet(packet, hfi1_check_mcast(packet->dlid));
}

/*
 * This is called from a timer to check for QPs
 * which need kernel memory in order to send a packet.
 */
static void mem_timer(unsigned long data)
{
	struct hfi1_ibdev *dev = (struct hfi1_ibdev *)data;
	struct list_head *list = &dev->memwait;
	struct rvt_qp *qp = NULL;
	struct iowait *wait;
	unsigned long flags;
	struct hfi1_qp_priv *priv;

	write_seqlock_irqsave(&dev->iowait_lock, flags);
	if (!list_empty(list)) {
		wait = list_first_entry(list, struct iowait, list);
		qp = iowait_to_qp(wait);
		priv = qp->priv;
		list_del_init(&priv->s_iowait.list);
		priv->s_iowait.lock = NULL;
		/* refcount held until actual wake up */
		if (!list_empty(list))
			mod_timer(&dev->mem_timer, jiffies + 1);
	}
	write_sequnlock_irqrestore(&dev->iowait_lock, flags);

	if (qp)
		hfi1_qp_wakeup(qp, RVT_S_WAIT_KMEM);
}

/*
 * This is called with progress side lock held.
 */
/* New API */
static void verbs_sdma_complete(
	struct sdma_txreq *cookie,
	int status)
{
	struct verbs_txreq *tx =
		container_of(cookie, struct verbs_txreq, txreq);
	struct rvt_qp *qp = tx->qp;

	spin_lock(&qp->s_lock);
	if (tx->wqe) {
		hfi1_send_complete(qp, tx->wqe, IB_WC_SUCCESS);
	} else if (qp->ibqp.qp_type == IB_QPT_RC) {
		struct hfi1_opa_header *hdr;

		hdr = &tx->phdr.hdr;
		hfi1_rc_send_complete(qp, hdr);
	}
	spin_unlock(&qp->s_lock);

	hfi1_put_txreq(tx);
}

static int wait_kmem(struct hfi1_ibdev *dev,
		     struct rvt_qp *qp,
		     struct hfi1_pkt_state *ps)
{
	struct hfi1_qp_priv *priv = qp->priv;
	unsigned long flags;
	int ret = 0;

	spin_lock_irqsave(&qp->s_lock, flags);
	if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
		write_seqlock(&dev->iowait_lock);
		list_add_tail(&ps->s_txreq->txreq.list,
			      &priv->s_iowait.tx_head);
		if (list_empty(&priv->s_iowait.list)) {
			if (list_empty(&dev->memwait))
				mod_timer(&dev->mem_timer, jiffies + 1);
			qp->s_flags |= RVT_S_WAIT_KMEM;
			list_add_tail(&priv->s_iowait.list, &dev->memwait);
			priv->s_iowait.lock = &dev->iowait_lock;
			trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM);
			rvt_get_qp(qp);
		}
		write_sequnlock(&dev->iowait_lock);
		qp->s_flags &= ~RVT_S_BUSY;
		ret = -EBUSY;
	}
	spin_unlock_irqrestore(&qp->s_lock, flags);

	return ret;
}

/*
 * This routine calls txadds for each sg entry.
 *
 * Add failures will revert the sge cursor
 */
static noinline int build_verbs_ulp_payload(
	struct sdma_engine *sde,
	u32 length,
	struct verbs_txreq *tx)
{
	struct rvt_sge_state *ss = tx->ss;
	struct rvt_sge *sg_list = ss->sg_list;
	struct rvt_sge sge = ss->sge;
	u8 num_sge = ss->num_sge;
	u32 len;
	int ret = 0;

	while (length) {
		len = ss->sge.length;
		if (len > length)
			len = length;
		if (len > ss->sge.sge_length)
			len = ss->sge.sge_length;
		WARN_ON_ONCE(len == 0);
		ret = sdma_txadd_kvaddr(
			sde->dd,
			&tx->txreq,
			ss->sge.vaddr,
			len);
		if (ret)
			goto bail_txadd;
		rvt_update_sge(ss, len, false);
		length -= len;
	}
	return ret;
bail_txadd:
	/* unwind cursor */
	ss->sge = sge;
	ss->num_sge = num_sge;
	ss->sg_list = sg_list;
	return ret;
}

/*
 * Build the number of DMA descriptors needed to send length bytes of data.
 *
 * NOTE: DMA mapping is held in the tx until completed in the ring or
 *       the tx desc is freed without having been submitted to the ring
 *
 * This routine ensures all the helper routine calls succeed.
 */
/* New API */
static int build_verbs_tx_desc(
	struct sdma_engine *sde,
	u32 length,
	struct verbs_txreq *tx,
	struct hfi1_ahg_info *ahg_info,
	u64 pbc)
{
	int ret = 0;
	struct hfi1_sdma_header *phdr = &tx->phdr;
	u16 hdrbytes = tx->hdr_dwords << 2;
	u32 *hdr;
	u8 extra_bytes = 0;
	static char trail_buf[12]; /* CRC = 4, LT = 1, Pad = 0 to 7 bytes */

	if (tx->phdr.hdr.hdr_type) {
		/*
		 * hdrbytes accounts for PBC. Need to subtract 8 bytes
		 * before calculating padding.
		 */
		extra_bytes = hfi1_get_16b_padding(hdrbytes - 8, length) +
			      (SIZE_OF_CRC << 2) + SIZE_OF_LT;
		hdr = (u32 *)&phdr->hdr.opah;
	} else {
		hdr = (u32 *)&phdr->hdr.ibh;
	}
	if (!ahg_info->ahgcount) {
		ret = sdma_txinit_ahg(
			&tx->txreq,
			ahg_info->tx_flags,
			hdrbytes + length +
			extra_bytes,
			ahg_info->ahgidx,
			0,
			NULL,
			0,
			verbs_sdma_complete);
		if (ret)
			goto bail_txadd;
		phdr->pbc = cpu_to_le64(pbc);
		ret = sdma_txadd_kvaddr(
			sde->dd,
			&tx->txreq,
			phdr,
			hdrbytes);
		if (ret)
			goto bail_txadd;
	} else {
		ret = sdma_txinit_ahg(
			&tx->txreq,
			ahg_info->tx_flags,
			length,
			ahg_info->ahgidx,
			ahg_info->ahgcount,
			ahg_info->ahgdesc,
			hdrbytes,
			verbs_sdma_complete);
		if (ret)
			goto bail_txadd;
	}
	/* add the ulp payload - if any. tx->ss can be NULL for acks */
	if (tx->ss) {
		ret = build_verbs_ulp_payload(sde, length, tx);
		if (ret)
			goto bail_txadd;
	}

	/* add icrc, lt byte, and padding to flit */
	if (extra_bytes != 0)
		ret = sdma_txadd_kvaddr(sde->dd, &tx->txreq,
					trail_buf, extra_bytes);

bail_txadd:
	return ret;
}

int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
			u64 pbc)
{
	struct hfi1_qp_priv *priv = qp->priv;
	struct hfi1_ahg_info *ahg_info = priv->s_ahg;
	u32 hdrwords = qp->s_hdrwords;
	u32 len = ps->s_txreq->s_cur_size;
	u32 plen;
	struct hfi1_ibdev *dev = ps->dev;
	struct hfi1_pportdata *ppd = ps->ppd;
	struct verbs_txreq *tx;
	u8 sc5 = priv->s_sc;
	int ret;
	u32 dwords;
	bool bypass = false;

	if (ps->s_txreq->phdr.hdr.hdr_type) {
		u8 extra_bytes = hfi1_get_16b_padding((hdrwords << 2), len);

		dwords = (len + extra_bytes + (SIZE_OF_CRC << 2) +
			  SIZE_OF_LT) >> 2;
		bypass = true;
	} else {
		dwords = (len + 3) >> 2;
	}
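	/*
	 * Total length in dwords: header + payload + 2 dwords for the
	 * 8-byte PBC.
	 */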
	plen = hdrwords + dwords + 2;

	tx = ps->s_txreq;
	if (!sdma_txreq_built(&tx->txreq)) {
		if (likely(pbc == 0)) {
			u32 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5);

			/* No vl15 here */
			/* set PBC_DC_INFO bit (aka SC[4]) in pbc */
			if (ps->s_txreq->phdr.hdr.hdr_type)
				pbc |= PBC_PACKET_BYPASS |
				       PBC_INSERT_BYPASS_ICRC;
			else
				pbc |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT);
			if (unlikely(hfi1_dbg_fault_opcode(qp, ps->opcode,
							   false)))
				pbc = hfi1_fault_tx(qp, ps->opcode, pbc);
			pbc = create_pbc(ppd,
					 pbc,
					 qp->srate_mbps,
					 vl,
					 plen);
		}
		tx->wqe = qp->s_wqe;
		ret = build_verbs_tx_desc(tx->sde, len, tx, ahg_info, pbc);
		if (unlikely(ret))
			goto bail_build;
	}
	ret =  sdma_send_txreq(tx->sde, &priv->s_iowait, &tx->txreq,
			       ps->pkts_sent);
	if (unlikely(ret < 0)) {
		if (ret == -ECOMM)
			goto bail_ecomm;
		return ret;
	}
	trace_sdma_output_ibhdr(dd_from_ibdev(qp->ibqp.device),
				&ps->s_txreq->phdr.hdr, ib_is_sc5(sc5));
	return ret;

bail_ecomm:
	/* The current one got "sent" */
	return 0;
bail_build:
	ret = wait_kmem(dev, qp, ps);
	if (!ret) {
		/* free txreq - bad state */
		hfi1_put_txreq(ps->s_txreq);
		ps->s_txreq = NULL;
	}
	return ret;
}

/*
 * If we are now in the error state, return zero to flush the
 * send work request.
 */
static int pio_wait(struct rvt_qp *qp,
		    struct send_context *sc,
		    struct hfi1_pkt_state *ps,
		    u32 flag)
{
	struct hfi1_qp_priv *priv = qp->priv;
	struct hfi1_devdata *dd = sc->dd;
	struct hfi1_ibdev *dev = &dd->verbs_dev;
	unsigned long flags;
	int ret = 0;

	/*
	 * Note that as soon as want_buffer() is called and
	 * possibly before it returns, sc_piobufavail()
	 * could be called. Therefore, put QP on the I/O wait list before
	 * enabling the PIO avail interrupt.
	 */
	spin_lock_irqsave(&qp->s_lock, flags);
	if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
		write_seqlock(&dev->iowait_lock);
		list_add_tail(&ps->s_txreq->txreq.list,
			      &priv->s_iowait.tx_head);
		if (list_empty(&priv->s_iowait.list)) {
			struct hfi1_ibdev *dev = &dd->verbs_dev;
			int was_empty;

			dev->n_piowait += !!(flag & RVT_S_WAIT_PIO);
			dev->n_piodrain += !!(flag & RVT_S_WAIT_PIO_DRAIN);
			qp->s_flags |= flag;
			was_empty = list_empty(&sc->piowait);
			iowait_queue(ps->pkts_sent, &priv->s_iowait,
				     &sc->piowait);
			priv->s_iowait.lock = &dev->iowait_lock;
			trace_hfi1_qpsleep(qp, RVT_S_WAIT_PIO);
			rvt_get_qp(qp);
			/* counting: only call wantpiobuf_intr if first user */
			if (was_empty)
				hfi1_sc_wantpiobuf_intr(sc, 1);
		}
		write_sequnlock(&dev->iowait_lock);
		qp->s_flags &= ~RVT_S_BUSY;
		ret = -EBUSY;
	}
	spin_unlock_irqrestore(&qp->s_lock, flags);
	return ret;
}

static void verbs_pio_complete(void *arg, int code)
{
	struct rvt_qp *qp = (struct rvt_qp *)arg;
	struct hfi1_qp_priv *priv = qp->priv;

	if (iowait_pio_dec(&priv->s_iowait))
		iowait_drain_wakeup(&priv->s_iowait);
}

int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
			u64 pbc)
{
	struct hfi1_qp_priv *priv = qp->priv;
	u32 hdrwords = qp->s_hdrwords;
	struct rvt_sge_state *ss = ps->s_txreq->ss;
	u32 len = ps->s_txreq->s_cur_size;
	u32 dwords;
	u32 plen;
	struct hfi1_pportdata *ppd = ps->ppd;
	u32 *hdr;
	u8 sc5;
	unsigned long flags = 0;
	struct send_context *sc;
	struct pio_buf *pbuf;
	int wc_status = IB_WC_SUCCESS;
	int ret = 0;
	pio_release_cb cb = NULL;
	u32 lrh0_16b;
	bool bypass = false;
	u8 extra_bytes = 0;

	if (ps->s_txreq->phdr.hdr.hdr_type) {
		u8 pad_size = hfi1_get_16b_padding((hdrwords << 2), len);

		extra_bytes = pad_size + (SIZE_OF_CRC << 2) + SIZE_OF_LT;
		dwords = (len + extra_bytes) >> 2;
		hdr = (u32 *)&ps->s_txreq->phdr.hdr.opah;
		lrh0_16b = ps->s_txreq->phdr.hdr.opah.lrh[0];
		bypass = true;
	} else {
		dwords = (len + 3) >> 2;
		hdr = (u32 *)&ps->s_txreq->phdr.hdr.ibh;
	}
	plen = hdrwords + dwords + 2;

	/* only RC/UC use complete */
	switch (qp->ibqp.qp_type) {
	case IB_QPT_RC:
	case IB_QPT_UC:
		cb = verbs_pio_complete;
		break;
	default:
		break;
	}

	/* vl15 special case taken care of in ud.c */
	sc5 = priv->s_sc;
	sc = ps->s_txreq->psc;

	if (likely(pbc == 0)) {
		u8 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5);

		/* set PBC_DC_INFO bit (aka SC[4]) in pbc */
		if (ps->s_txreq->phdr.hdr.hdr_type)
			pbc |= PBC_PACKET_BYPASS | PBC_INSERT_BYPASS_ICRC;
		else
			pbc |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT);
		if (unlikely(hfi1_dbg_fault_opcode(qp, ps->opcode, false)))
			pbc = hfi1_fault_tx(qp, ps->opcode, pbc);
		pbc = create_pbc(ppd, pbc, qp->srate_mbps, vl, plen);
	}
	if (cb)
		iowait_pio_inc(&priv->s_iowait);
	pbuf = sc_buffer_alloc(sc, plen, cb, qp);
	if (unlikely(!pbuf)) {
		if (cb)
			verbs_pio_complete(qp, 0);
		if (ppd->host_link_state != HLS_UP_ACTIVE) {
			/*
			 * If we have filled the PIO buffers to capacity and are
			 * not in an active state this request is not going to
			 * go out, so just complete it with an error or else a
			 * ULP or the core may be stuck waiting.
			 */
			hfi1_cdbg(
				PIO,
				"alloc failed. state not active, completing");
			wc_status = IB_WC_GENERAL_ERR;
			goto pio_bail;
		} else {
			/*
			 * This is a normal occurrence. The PIO buffs are full
			 * up but we are still happily sending, so let's
			 * continue to queue the request.
			 */
			hfi1_cdbg(PIO, "alloc failed. state active, queuing");
			ret = pio_wait(qp, sc, ps, RVT_S_WAIT_PIO);
			if (!ret)
				/* txreq not queued - free */
				goto bail;
			/* tx consumed in wait */
			return ret;
		}
	}
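
	/*
	 * Header-only packets fit in a single pio_copy(); payloads are
	 * streamed with the seg_pio_copy_start/mid/end sequence below.
	 */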

	if (dwords == 0) {
		pio_copy(ppd->dd, pbuf, pbc, hdr, hdrwords);
	} else {
		seg_pio_copy_start(pbuf, pbc,
				   hdr, hdrwords * 4);
		if (ss) {
			while (len) {
				void *addr = ss->sge.vaddr;
				u32 slen = ss->sge.length;

				if (slen > len)
					slen = len;
				rvt_update_sge(ss, slen, false);
				seg_pio_copy_mid(pbuf, addr, slen);
				len -= slen;
			}
		}
		/*
		 * Bypass packet will need to copy additional
		 * bytes to accommodate for CRC and LT bytes
		 */
		if (extra_bytes) {
			u8 *empty_buf;

			empty_buf = kcalloc(extra_bytes, sizeof(u8),
					    GFP_KERNEL);
			seg_pio_copy_mid(pbuf, empty_buf, extra_bytes);
			kfree(empty_buf);
		}
		seg_pio_copy_end(pbuf);
	}

	trace_pio_output_ibhdr(dd_from_ibdev(qp->ibqp.device),
			       &ps->s_txreq->phdr.hdr, ib_is_sc5(sc5));

pio_bail:
	if (qp->s_wqe) {
		spin_lock_irqsave(&qp->s_lock, flags);
		hfi1_send_complete(qp, qp->s_wqe, wc_status);
		spin_unlock_irqrestore(&qp->s_lock, flags);
	} else if (qp->ibqp.qp_type == IB_QPT_RC) {
		spin_lock_irqsave(&qp->s_lock, flags);
		hfi1_rc_send_complete(qp, &ps->s_txreq->phdr.hdr);
		spin_unlock_irqrestore(&qp->s_lock, flags);
	}

	ret = 0;

bail:
	hfi1_put_txreq(ps->s_txreq);
	return ret;
}

/*
 * egress_pkey_matches_entry - return 1 if the pkey matches ent (ent
 * being an entry from the partition key table), return 0
 * otherwise. Use the matching criteria for egress partition keys
 * specified in the OPAv1 spec., section 9.11.7.
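 * For example, a full-member pkey (bit 15 set) only matches a
 * full-member table entry, while a limited-member pkey matches either
 * form of the entry.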
 */
static inline int egress_pkey_matches_entry(u16 pkey, u16 ent)
{
	u16 mkey = pkey & PKEY_LOW_15_MASK;
	u16 mentry = ent & PKEY_LOW_15_MASK;

	if (mkey == mentry) {
		/*
		 * If pkey[15] is set (full partition member),
		 * is bit 15 in the corresponding table element
		 * clear (limited member)?
		 */
		if (pkey & PKEY_MEMBER_MASK)
			return !!(ent & PKEY_MEMBER_MASK);
		return 1;
	}
	return 0;
}

/**
 * egress_pkey_check - check P_KEY of a packet
 * @ppd:  Physical IB port data
 * @slid: SLID for packet
 * @pkey: PKEY for header
 * @sc5:  SC for packet
 * @s_pkey_index: used as a lookup optimization for kernel contexts only;
 * a negative value means a user context is calling this
 * function.
 *
 * It checks if hdr's pkey is valid.
 *
 * Return: 0 on success, otherwise 1
 */
int egress_pkey_check(struct hfi1_pportdata *ppd, u32 slid, u16 pkey,
		      u8 sc5, int8_t s_pkey_index)
{
	struct hfi1_devdata *dd;
	int i;
	int is_user_ctxt_mechanism = (s_pkey_index < 0);

	if (!(ppd->part_enforce & HFI1_PART_ENFORCE_OUT))
		return 0;

	/* If SC15, pkey[0:14] must be 0x7fff */
	if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
		goto bad;

	/* Is the pkey = 0x0, or 0x8000? */
	if ((pkey & PKEY_LOW_15_MASK) == 0)
		goto bad;

	/*
	 * For the kernel contexts only, if a qp is passed into the function,
	 * the most likely matching pkey has index qp->s_pkey_index
	 */
	if (!is_user_ctxt_mechanism &&
	    egress_pkey_matches_entry(pkey, ppd->pkeys[s_pkey_index])) {
		return 0;
	}

	for (i = 0; i < MAX_PKEY_VALUES; i++) {
		if (egress_pkey_matches_entry(pkey, ppd->pkeys[i]))
			return 0;
	}
bad:
	/*
	 * For the user-context mechanism, the P_KEY check would only happen
	 * once per SDMA request, not once per packet.  Therefore, there's no
	 * need to increment the counter for the user-context mechanism.
	 */
	if (!is_user_ctxt_mechanism) {
		incr_cntr64(&ppd->port_xmit_constraint_errors);
		dd = ppd->dd;
		if (!(dd->err_info_xmit_constraint.status &
		      OPA_EI_STATUS_SMASK)) {
			dd->err_info_xmit_constraint.status |=
				OPA_EI_STATUS_SMASK;
			dd->err_info_xmit_constraint.slid = slid;
			dd->err_info_xmit_constraint.pkey = pkey;
		}
	}
	return 1;
}

/**
 * get_send_routine - choose an egress routine
 *
 * Choose an egress routine based on QP type
 * and size
 */
static inline send_routine get_send_routine(struct rvt_qp *qp,
					    struct hfi1_pkt_state *ps)
{
	struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
	struct hfi1_qp_priv *priv = qp->priv;
	struct verbs_txreq *tx = ps->s_txreq;

	if (unlikely(!(dd->flags & HFI1_HAS_SEND_DMA)))
		return dd->process_pio_send;
	switch (qp->ibqp.qp_type) {
	case IB_QPT_SMI:
		return dd->process_pio_send;
	case IB_QPT_GSI:
	case IB_QPT_UD:
		break;
	case IB_QPT_UC:
	case IB_QPT_RC: {
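		/*
		 * Prefer PIO for small single-packet RC/UC sends: the payload
		 * must fit under both piothreshold and the path MTU, the
		 * opcode must be one of the single-packet opcodes in
		 * pio_opmask, and no SDMA work may be pending or already
		 * built for this txreq.
		 */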
		if (piothreshold &&
		    tx->s_cur_size <= min(piothreshold, qp->pmtu) &&
		    (BIT(ps->opcode & OPMASK) & pio_opmask[ps->opcode >> 5]) &&
		    iowait_sdma_pending(&priv->s_iowait) == 0 &&
		    !sdma_txreq_built(&tx->txreq))
			return dd->process_pio_send;
		break;
	}
	default:
		break;
	}
	return dd->process_dma_send;
}

/**
 * hfi1_verbs_send - send a packet
 * @qp: the QP to send on
 * @ps: the state of the packet to send
 *
 * Return zero if packet is sent or queued OK.
 * Return non-zero and clear qp->s_flags RVT_S_BUSY otherwise.
 */
int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
{
	struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
	struct hfi1_qp_priv *priv = qp->priv;
	struct ib_other_headers *ohdr;
	send_routine sr;
	int ret;
	u16 pkey;
	u32 slid;

	/* locate the pkey within the headers */
	if (ps->s_txreq->phdr.hdr.hdr_type) {
		struct hfi1_16b_header *hdr = &ps->s_txreq->phdr.hdr.opah;
		u8 l4 = hfi1_16B_get_l4(hdr);

		if (l4 == OPA_16B_L4_IB_GLOBAL)
			ohdr = &hdr->u.l.oth;
		else
			ohdr = &hdr->u.oth;
		slid = hfi1_16B_get_slid(hdr);
		pkey = hfi1_16B_get_pkey(hdr);
	} else {
		struct ib_header *hdr = &ps->s_txreq->phdr.hdr.ibh;
		u8 lnh = ib_get_lnh(hdr);

		if (lnh == HFI1_LRH_GRH)
			ohdr = &hdr->u.l.oth;
		else
			ohdr = &hdr->u.oth;
		slid = ib_get_slid(hdr);
		pkey = ib_bth_get_pkey(ohdr);
	}

	ps->opcode = ib_bth_get_opcode(ohdr);
	sr = get_send_routine(qp, ps);
	ret = egress_pkey_check(dd->pport, slid, pkey,
				priv->s_sc, qp->s_pkey_index);
	if (unlikely(ret)) {
		/*
		 * The value we are returning here does not get propagated to
		 * the verbs caller. Thus we need to complete the request with
		 * error otherwise the caller could be sitting waiting on the
		 * completion event. Only do this for PIO. SDMA has its own
		 * mechanism for handling the errors. So for SDMA we can just
		 * return.
		 */
		if (sr == dd->process_pio_send) {
			unsigned long flags;

			hfi1_cdbg(PIO, "%s() Failed. Completing with err",
				  __func__);
			spin_lock_irqsave(&qp->s_lock, flags);
			hfi1_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR);
			spin_unlock_irqrestore(&qp->s_lock, flags);
		}
		return -EINVAL;
	}
	if (sr == dd->process_dma_send && iowait_pio_pending(&priv->s_iowait))
		return pio_wait(qp,
				ps->s_txreq->psc,
				ps,
				RVT_S_WAIT_PIO_DRAIN);
	return sr(qp, ps, 0);
}

/**
 * hfi1_fill_device_attr - Fill in rvt dev info device attributes.
 * @dd: the device data structure
 */
static void hfi1_fill_device_attr(struct hfi1_devdata *dd)
{
	struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
	u32 ver = dd->dc8051_ver;

	memset(&rdi->dparms.props, 0, sizeof(rdi->dparms.props));

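	/*
	 * 8051 firmware version packed as major (63:32), minor (31:16),
	 * patch (15:0).
	 */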
	rdi->dparms.props.fw_ver = ((u64)(dc8051_ver_maj(ver)) << 32) |
		((u64)(dc8051_ver_min(ver)) << 16) |
		(u64)dc8051_ver_patch(ver);

	rdi->dparms.props.device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
			IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
			IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |