/*
 *  IBM eServer eHCA Infiniband device driver for Linux on POWER
 *
 *  post_send/recv, poll_cq, req_notify
 *
 *  Authors: Hoang-Nam Nguyen <hnguyen@de.ibm.com>
 *           Waleri Fomin <fomin@de.ibm.com>
 *           Joachim Fenkes <fenkes@de.ibm.com>
 *           Reinhard Ernst <rernst@de.ibm.com>
 *
 *  Copyright (c) 2005 IBM Corporation
 *
 *  All rights reserved.
 *
 *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
 *  BSD.
 *
 * OpenIB BSD License
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials
 * provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
 * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */


#include <asm/system.h>
#include "ehca_classes.h"
#include "ehca_tools.h"
#include "ehca_qes.h"
#include "ehca_iverbs.h"
#include "hcp_if.h"
#include "hipz_fns.h"

/* in RC traffic, insert an empty RDMA READ every this many packets */
#define ACK_CIRC_THRESHOLD 2000000

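/*
 * The driver borrows the low-order bits of each work request ID (the bits
 * covered by QMAP_IDX_MASK, defined elsewhere in the driver) to carry the
 * WQE's index into the send/receive queue map.  On post, replace_wr_id()
 * overwrites those bits with the qmap index and get_app_wr_id() saves the
 * caller's original low bits in the qmap entry; on completion,
 * replace_wr_id() is used again to hand the caller's bits back in the wr_id.
 */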
static u64 replace_wr_id(u64 wr_id, u16 idx)
{
	u64 ret;

	ret = wr_id & ~QMAP_IDX_MASK;
	ret |= idx & QMAP_IDX_MASK;

	return ret;
}

static u16 get_app_wr_id(u64 wr_id)
{
	return wr_id & QMAP_IDX_MASK;
}

static inline int ehca_write_rwqe(struct ipz_queue *ipz_rqueue,
				  struct ehca_wqe *wqe_p,
				  struct ib_recv_wr *recv_wr,
				  u32 rq_map_idx)
{
	u8 cnt_ds;
	if (unlikely((recv_wr->num_sge < 0) ||
		     (recv_wr->num_sge > ipz_rqueue->act_nr_of_sg))) {
		ehca_gen_err("Invalid number of WQE SGE. "
			 "num_sqe=%x max_nr_of_sg=%x",
			 recv_wr->num_sge, ipz_rqueue->act_nr_of_sg);
		return -EINVAL; /* invalid SG list length */
	}

	/* clear wqe header until sglist */
	memset(wqe_p, 0, offsetof(struct ehca_wqe, u.ud_av.sg_list));

	wqe_p->work_request_id = replace_wr_id(recv_wr->wr_id, rq_map_idx);
	wqe_p->nr_of_data_seg = recv_wr->num_sge;

	for (cnt_ds = 0; cnt_ds < recv_wr->num_sge; cnt_ds++) {
		wqe_p->u.all_rcv.sg_list[cnt_ds].vaddr =
			recv_wr->sg_list[cnt_ds].addr;
		wqe_p->u.all_rcv.sg_list[cnt_ds].lkey =
			recv_wr->sg_list[cnt_ds].lkey;
		wqe_p->u.all_rcv.sg_list[cnt_ds].length =
			recv_wr->sg_list[cnt_ds].length;
	}

	if (ehca_debug_level >= 3) {
		ehca_gen_dbg("RECEIVE WQE written into ipz_rqueue=%p",
			     ipz_rqueue);
		ehca_dmp(wqe_p, 16*(6 + wqe_p->nr_of_data_seg), "recv wqe");
	}

	return 0;
}

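/*
 * When DEBUG_GSI_SEND_WR is defined at build time, trace_send_wr_ud() below
 * dumps every UD send WR posted on the GSI (QP1) send queue, including the
 * MAD header and the contents of each scatter/gather element.
 */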
#if defined(DEBUG_GSI_SEND_WR)

/* need ib_mad struct */
#include <rdma/ib_mad.h>

static void trace_send_wr_ud(const struct ib_send_wr *send_wr)
{
	int idx = 0;
	int j;
	while (send_wr) {
		struct ib_mad_hdr *mad_hdr = send_wr->wr.ud.mad_hdr;
		struct ib_sge *sge = send_wr->sg_list;
		ehca_gen_dbg("send_wr#%x wr_id=%lx num_sge=%x "
			     "send_flags=%x opcode=%x", idx, send_wr->wr_id,
			     send_wr->num_sge, send_wr->send_flags,
			     send_wr->opcode);
		if (mad_hdr) {
			ehca_gen_dbg("send_wr#%x mad_hdr base_version=%x "
				     "mgmt_class=%x class_version=%x method=%x "
				     "status=%x class_specific=%x tid=%lx "
				     "attr_id=%x resv=%x attr_mod=%x",
				     idx, mad_hdr->base_version,
				     mad_hdr->mgmt_class,
				     mad_hdr->class_version, mad_hdr->method,
				     mad_hdr->status, mad_hdr->class_specific,
				     mad_hdr->tid, mad_hdr->attr_id,
				     mad_hdr->resv,
				     mad_hdr->attr_mod);
		}
		for (j = 0; j < send_wr->num_sge; j++) {
			u8 *data = (u8 *)abs_to_virt(sge->addr);
			ehca_gen_dbg("send_wr#%x sge#%x addr=%p length=%x "
				     "lkey=%x",
				     idx, j, data, sge->length, sge->lkey);
			/* assume length is n*16 */
			ehca_dmp(data, sge->length, "send_wr#%x sge#%x",
				 idx, j);
			sge++;
		} /* eof for j */
		idx++;
		send_wr = send_wr->next;
	} /* eof while send_wr */
}

#endif /* DEBUG_GSI_SEND_WR */

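/*
 * Write one send WQE into the send queue.  The 'hidden' flag marks WQEs the
 * driver posts internally (such as the empty RDMA READ used for
 * unsolicited-ack circumvention in ehca_post_send()); for those, the
 * signaled-completion flag is suppressed even on IB_SIGNAL_ALL_WR queues.
 */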
static inline int ehca_write_swqe(struct ehca_qp *qp,
				  struct ehca_wqe *wqe_p,
				  const struct ib_send_wr *send_wr,
				  u32 sq_map_idx,
				  int hidden)
{
	u32 idx;
	u64 dma_length;
	struct ehca_av *my_av;
	u32 remote_qkey = send_wr->wr.ud.remote_qkey;
	struct ehca_qmap_entry *qmap_entry = &qp->sq_map.map[sq_map_idx];

	if (unlikely((send_wr->num_sge < 0) ||
		     (send_wr->num_sge > qp->ipz_squeue.act_nr_of_sg))) {
		ehca_gen_err("Invalid number of WQE SGE. "
			 "num_sqe=%x max_nr_of_sg=%x",
			 send_wr->num_sge, qp->ipz_squeue.act_nr_of_sg);
		return -EINVAL; /* invalid SG list length */
	}

	/* clear wqe header until sglist */
	memset(wqe_p, 0, offsetof(struct ehca_wqe, u.ud_av.sg_list));

	wqe_p->work_request_id = replace_wr_id(send_wr->wr_id, sq_map_idx);

	qmap_entry->app_wr_id = get_app_wr_id(send_wr->wr_id);
	qmap_entry->reported = 0;

	switch (send_wr->opcode) {
	case IB_WR_SEND:
	case IB_WR_SEND_WITH_IMM:
		wqe_p->optype = WQE_OPTYPE_SEND;
		break;
	case IB_WR_RDMA_WRITE:
	case IB_WR_RDMA_WRITE_WITH_IMM:
		wqe_p->optype = WQE_OPTYPE_RDMAWRITE;
		break;
	case IB_WR_RDMA_READ:
		wqe_p->optype = WQE_OPTYPE_RDMAREAD;
		break;
	default:
		ehca_gen_err("Invalid opcode=%x", send_wr->opcode);
		return -EINVAL; /* invalid opcode */
	}

	wqe_p->wqef = (send_wr->opcode) & WQEF_HIGH_NIBBLE;

	wqe_p->wr_flag = 0;

	if ((send_wr->send_flags & IB_SEND_SIGNALED ||
	    qp->init_attr.sq_sig_type == IB_SIGNAL_ALL_WR)
	    && !hidden)
		wqe_p->wr_flag |= WQE_WRFLAG_REQ_SIGNAL_COM;

	if (send_wr->opcode == IB_WR_SEND_WITH_IMM ||
	    send_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) {
		/* this might not work as long as HW does not support it */
		wqe_p->immediate_data = be32_to_cpu(send_wr->ex.imm_data);
		wqe_p->wr_flag |= WQE_WRFLAG_IMM_DATA_PRESENT;
	}

	wqe_p->nr_of_data_seg = send_wr->num_sge;

	switch (qp->qp_type) {
	case IB_QPT_SMI:
	case IB_QPT_GSI:
		/* no break is intentional here */
	case IB_QPT_UD:
		/* IB 1.2 spec C10-15 compliance */
		if (send_wr->wr.ud.remote_qkey & 0x80000000)
			remote_qkey = qp->qkey;

		wqe_p->destination_qp_number = send_wr->wr.ud.remote_qpn << 8;
		wqe_p->local_ee_context_qkey = remote_qkey;
		if (unlikely(!send_wr->wr.ud.ah)) {
			ehca_gen_err("wr.ud.ah is NULL. qp=%p", qp);
			return -EINVAL;
		}
		if (unlikely(send_wr->wr.ud.remote_qpn == 0)) {
			ehca_gen_err("dest QP# is 0. qp=%x", qp->real_qp_num);
			return -EINVAL;
		}
		my_av = container_of(send_wr->wr.ud.ah, struct ehca_av, ib_ah);
		wqe_p->u.ud_av.ud_av = my_av->av;

		/*
		 * omitted check of IB_SEND_INLINE
		 * since HW does not support it
		 */
		for (idx = 0; idx < send_wr->num_sge; idx++) {
			wqe_p->u.ud_av.sg_list[idx].vaddr =
				send_wr->sg_list[idx].addr;
			wqe_p->u.ud_av.sg_list[idx].lkey =
				send_wr->sg_list[idx].lkey;
			wqe_p->u.ud_av.sg_list[idx].length =
				send_wr->sg_list[idx].length;
		} /* eof for idx */
		if (qp->qp_type == IB_QPT_SMI ||
		    qp->qp_type == IB_QPT_GSI)
			wqe_p->u.ud_av.ud_av.pmtu = 1;
		if (qp->qp_type == IB_QPT_GSI) {
			wqe_p->pkeyi = send_wr->wr.ud.pkey_index;
#ifdef DEBUG_GSI_SEND_WR
			trace_send_wr_ud(send_wr);
#endif /* DEBUG_GSI_SEND_WR */
		}
		break;

	case IB_QPT_UC:
		if (send_wr->send_flags & IB_SEND_FENCE)
			wqe_p->wr_flag |= WQE_WRFLAG_FENCE;
		/* no break is intentional here */
	case IB_QPT_RC:
		/* TODO: atomic not implemented */
		wqe_p->u.nud.remote_virtual_adress =
			send_wr->wr.rdma.remote_addr;
		wqe_p->u.nud.rkey = send_wr->wr.rdma.rkey;

		/*
		 * omitted checking of IB_SEND_INLINE
		 * since HW does not support it
		 */
		dma_length = 0;
		for (idx = 0; idx < send_wr->num_sge; idx++) {
			wqe_p->u.nud.sg_list[idx].vaddr =
				send_wr->sg_list[idx].addr;
			wqe_p->u.nud.sg_list[idx].lkey =
				send_wr->sg_list[idx].lkey;
			wqe_p->u.nud.sg_list[idx].length =
				send_wr->sg_list[idx].length;
			dma_length += send_wr->sg_list[idx].length;
		} /* eof idx */
		wqe_p->u.nud.atomic_1st_op_dma_len = dma_length;

		/* unsolicited ack circumvention */
		if (send_wr->opcode == IB_WR_RDMA_READ) {
			/* on RDMA read, switch on and reset counters */
			qp->message_count = qp->packet_count = 0;
			qp->unsol_ack_circ = 1;
		} else
			/* else estimate #packets */
			qp->packet_count += (dma_length >> qp->mtu_shift) + 1;

		break;

	default:
		ehca_gen_err("Invalid qptype=%x", qp->qp_type);
		return -EINVAL;
	}

	if (ehca_debug_level >= 3) {
		ehca_gen_dbg("SEND WQE written into queue qp=%p ", qp);
		ehca_dmp(wqe_p, 16*(6 + wqe_p->nr_of_data_seg), "send wqe");
	}
	return 0;
}

/* map_ib_wc_status converts raw cqe_status to ib_wc_status */
static inline void map_ib_wc_status(u32 cqe_status,
				    enum ib_wc_status *wc_status)
{
	if (unlikely(cqe_status & WC_STATUS_ERROR_BIT)) {
		switch (cqe_status & 0x3F) {
		case 0x01:
		case 0x21:
			*wc_status = IB_WC_LOC_LEN_ERR;
			break;
		case 0x02:
		case 0x22:
			*wc_status = IB_WC_LOC_QP_OP_ERR;
			break;
		case 0x03:
		case 0x23:
			*wc_status = IB_WC_LOC_EEC_OP_ERR;
			break;
		case 0x04:
		case 0x24:
			*wc_status = IB_WC_LOC_PROT_ERR;
			break;
		case 0x05:
		case 0x25:
			*wc_status = IB_WC_WR_FLUSH_ERR;
			break;
		case 0x06:
			*wc_status = IB_WC_MW_BIND_ERR;
			break;
		case 0x07: /* remote error - look into bits 20:24 */
			switch ((cqe_status
				 & WC_STATUS_REMOTE_ERROR_FLAGS) >> 11) {
			case 0x0:
				/*
				 * PSN Sequence Error!
				 * couldn't find a matching status!
				 */
				*wc_status = IB_WC_GENERAL_ERR;
				break;
			case 0x1:
				*wc_status = IB_WC_REM_INV_REQ_ERR;
				break;
			case 0x2:
				*wc_status = IB_WC_REM_ACCESS_ERR;
				break;
			case 0x3:
				*wc_status = IB_WC_REM_OP_ERR;
				break;
			case 0x4:
				*wc_status = IB_WC_REM_INV_RD_REQ_ERR;
				break;
			}
			break;
		case 0x08:
			*wc_status = IB_WC_RETRY_EXC_ERR;
			break;
		case 0x09:
			*wc_status = IB_WC_RNR_RETRY_EXC_ERR;
			break;
		case 0x0A:
		case 0x2D:
			*wc_status = IB_WC_REM_ABORT_ERR;
			break;
		case 0x0B:
		case 0x2E:
			*wc_status = IB_WC_INV_EECN_ERR;
			break;
		case 0x0C:
		case 0x2F:
			*wc_status = IB_WC_INV_EEC_STATE_ERR;
			break;
		case 0x0D:
			*wc_status = IB_WC_BAD_RESP_ERR;
			break;
		case 0x10:
			/* WQE purged */
			*wc_status = IB_WC_WR_FLUSH_ERR;
			break;
		default:
			*wc_status = IB_WC_FATAL_ERR;

		}
	} else
		*wc_status = IB_WC_SUCCESS;
}

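/*
 * Post a single send WR.  The current queue offset is remembered before the
 * WQE slot is claimed so the queue can be rolled back if ehca_write_swqe()
 * fails; the same offset, divided by the WQE size, yields the sq_map index
 * used to track this work request until its completion is reported.
 */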
static inline int post_one_send(struct ehca_qp *my_qp,
			 struct ib_send_wr *cur_send_wr,
			 struct ib_send_wr **bad_send_wr,
			 int hidden)
{
	struct ehca_wqe *wqe_p;
	int ret;
	u32 sq_map_idx;
	u64 start_offset = my_qp->ipz_squeue.current_q_offset;

	/* get pointer next to free WQE */
	wqe_p = ipz_qeit_get_inc(&my_qp->ipz_squeue);
	if (unlikely(!wqe_p)) {
		/* too many posted work requests: queue overflow */
		if (bad_send_wr)
			*bad_send_wr = cur_send_wr;
		ehca_err(my_qp->ib_qp.device, "Too many posted WQEs "
			 "qp_num=%x", my_qp->ib_qp.qp_num);
		return -ENOMEM;
	}

	/*
	 * Get the index of the WQE in the send queue. The same index is used
	 * for writing into the sq_map.
	 */
	sq_map_idx = start_offset / my_qp->ipz_squeue.qe_size;

	/* write a SEND WQE into the QUEUE */
	ret = ehca_write_swqe(my_qp, wqe_p, cur_send_wr, sq_map_idx, hidden);
	/*
	 * if something failed,
	 * reset the free entry pointer to the start value
	 */
	if (unlikely(ret)) {
		my_qp->ipz_squeue.current_q_offset = start_offset;
		if (bad_send_wr)
			*bad_send_wr = cur_send_wr;
		ehca_err(my_qp->ib_qp.device, "Could not write WQE "
			 "qp_num=%x", my_qp->ib_qp.qp_num);
		return -EINVAL;
	}

	return 0;
}

int ehca_post_send(struct ib_qp *qp,
		   struct ib_send_wr *send_wr,
		   struct ib_send_wr **bad_send_wr)
{
	struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp);
	struct ib_send_wr *cur_send_wr;
	int wqe_cnt = 0;
	int ret = 0;
	unsigned long flags;

	/* Reject WR if QP is in RESET, INIT or RTR state */
	if (unlikely(my_qp->state < IB_QPS_RTS)) {
		ehca_err(qp->device, "Invalid QP state  qp_state=%d qpn=%x",
			 my_qp->state, qp->qp_num);
		return -EINVAL;
	}

	/* LOCK the QUEUE */
	spin_lock_irqsave(&my_qp->spinlock_s, flags);

	/* Send an empty extra RDMA read if:
	 *  1) there has been an RDMA read on this connection before
	 *  2) no RDMA read occurred for ACK_CIRC_THRESHOLD link packets
	 *  3) we can be sure that any previous extra RDMA read has been
	 *     processed so we don't overflow the SQ
	 */
	if (unlikely(my_qp->unsol_ack_circ &&
		     my_qp->packet_count > ACK_CIRC_THRESHOLD &&
		     my_qp->message_count > my_qp->init_attr.cap.max_send_wr)) {
		/* insert an empty RDMA READ to fix up the remote QP state */
		struct ib_send_wr circ_wr;
		memset(&circ_wr, 0, sizeof(circ_wr));
		circ_wr.opcode = IB_WR_RDMA_READ;
		post_one_send(my_qp, &circ_wr, NULL, 1); /* ignore retcode */
		wqe_cnt++;
		ehca_dbg(qp->device, "posted circ wr  qp_num=%x", qp->qp_num);
		my_qp->message_count = my_qp->packet_count = 0;
	}

	/* loop processes list of send reqs */
	for (cur_send_wr = send_wr; cur_send_wr != NULL;
	     cur_send_wr = cur_send_wr->next) {
		ret = post_one_send(my_qp, cur_send_wr, bad_send_wr, 0);
		if (unlikely(ret)) {
			/* if one or more WQEs were successful, don't fail */
			if (wqe_cnt)
				ret = 0;
			goto post_send_exit0;
		}
		wqe_cnt++;
	} /* eof for cur_send_wr */

post_send_exit0:
	iosync(); /* serialize GAL register access */
	hipz_update_sqa(my_qp, wqe_cnt);
	if (unlikely(ret || ehca_debug_level >= 2))
		ehca_dbg(qp->device, "ehca_qp=%p qp_num=%x wqe_cnt=%d ret=%i",
			 my_qp, qp->qp_num, wqe_cnt, ret);
	my_qp->message_count += wqe_cnt;
	spin_unlock_irqrestore(&my_qp->spinlock_s, flags);
	return ret;
}

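/*
 * Common receive-posting path, shared by ehca_post_recv() (ordinary QP
 * receive queues) and ehca_post_srq_recv() (shared receive queues).
 */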
static int internal_post_recv(struct ehca_qp *my_qp,
			      struct ib_device *dev,
			      struct ib_recv_wr *recv_wr,
			      struct ib_recv_wr **bad_recv_wr)
{
	struct ib_recv_wr *cur_recv_wr;
	struct ehca_wqe *wqe_p;
	int wqe_cnt = 0;
	int ret = 0;
	u32 rq_map_idx;
	unsigned long flags;
	struct ehca_qmap_entry *qmap_entry;

	if (unlikely(!HAS_RQ(my_qp))) {
		ehca_err(dev, "QP has no RQ  ehca_qp=%p qp_num=%x ext_type=%d",
			 my_qp, my_qp->real_qp_num, my_qp->ext_type);
		return -ENODEV;
	}

	/* LOCK the QUEUE */
	spin_lock_irqsave(&my_qp->spinlock_r, flags);

	/* loop processes list of recv reqs */
	for (cur_recv_wr = recv_wr; cur_recv_wr != NULL;
	     cur_recv_wr = cur_recv_wr->next) {
		u64 start_offset = my_qp->ipz_rqueue.current_q_offset;
		/* get pointer next to free WQE */
		wqe_p = ipz_qeit_get_inc(&my_qp->ipz_rqueue);
		if (unlikely(!wqe_p)) {
			/* too many posted work requests: queue overflow */
			if (bad_recv_wr)
				*bad_recv_wr = cur_recv_wr;
			if (wqe_cnt == 0) {
				ret = -ENOMEM;
				ehca_err(dev, "Too many posted WQEs "
					 "qp_num=%x", my_qp->real_qp_num);
			}
			goto post_recv_exit0;
		}
		/*
		 * Get the index of the WQE in the recv queue. The same index
		 * is used for writing into the rq_map.
		 */
		rq_map_idx = start_offset / my_qp->ipz_rqueue.qe_size;

		/* write a RECV WQE into the QUEUE */
		ret = ehca_write_rwqe(&my_qp->ipz_rqueue, wqe_p, cur_recv_wr,
				rq_map_idx);
		/*
		 * if something failed,
		 * reset the free entry pointer to the start value
		 */
		if (unlikely(ret)) {
			my_qp->ipz_rqueue.current_q_offset = start_offset;
			if (bad_recv_wr)
				*bad_recv_wr = cur_recv_wr;
			if (wqe_cnt == 0) {
				ret = -EINVAL;
				ehca_err(dev, "Could not write WQE "
					 "qp_num=%x", my_qp->real_qp_num);
			}
			goto post_recv_exit0;
		}

		qmap_entry = &my_qp->rq_map.map[rq_map_idx];
		qmap_entry->app_wr_id = get_app_wr_id(cur_recv_wr->wr_id);
		qmap_entry->reported = 0;

		wqe_cnt++;
	} /* eof for cur_recv_wr */

post_recv_exit0:
	iosync(); /* serialize GAL register access */
	hipz_update_rqa(my_qp, wqe_cnt);
	if (unlikely(ret || ehca_debug_level >= 2))
		ehca_dbg(dev, "ehca_qp=%p qp_num=%x wqe_cnt=%d ret=%i",
			 my_qp, my_qp->real_qp_num, wqe_cnt, ret);
	spin_unlock_irqrestore(&my_qp->spinlock_r, flags);
	return ret;
}

int ehca_post_recv(struct ib_qp *qp,
		   struct ib_recv_wr *recv_wr,
		   struct ib_recv_wr **bad_recv_wr)
{
	struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp);

	/* Reject WR if QP is in RESET state */
	if (unlikely(my_qp->state == IB_QPS_RESET)) {
		ehca_err(qp->device, "Invalid QP state  qp_state=%d qpn=%x",
			 my_qp->state, qp->qp_num);
		return -EINVAL;
	}

	return internal_post_recv(my_qp, qp->device, recv_wr, bad_recv_wr);
}

int ehca_post_srq_recv(struct ib_srq *srq,
		       struct ib_recv_wr *recv_wr,
		       struct ib_recv_wr **bad_recv_wr)
{
	return internal_post_recv(container_of(srq, struct ehca_qp, ib_srq),
				  srq->device, recv_wr, bad_recv_wr);
}

/*
 * ib_wc_opcode table converts ehca wc opcode to ib
 * Since we use zero to indicate invalid opcode, the actual ib opcode must
 * be decremented!!!
 */
static const u8 ib_wc_opcode[255] = {
	[0x01] = IB_WC_RECV+1,
	[0x02] = IB_WC_RECV_RDMA_WITH_IMM+1,
	[0x04] = IB_WC_BIND_MW+1,
	[0x08] = IB_WC_FETCH_ADD+1,
	[0x10] = IB_WC_COMP_SWAP+1,
	[0x20] = IB_WC_RDMA_WRITE+1,
	[0x40] = IB_WC_RDMA_READ+1,
	[0x80] = IB_WC_SEND+1
};

/* internal function to poll one entry of cq */
static inline int ehca_poll_cq_one(struct ib_cq *cq, struct ib_wc *wc)
{
	int ret = 0, qmap_tail_idx;
	struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
	struct ehca_cqe *cqe;
	struct ehca_qp *my_qp;
	struct ehca_qmap_entry *qmap_entry;
	struct ehca_queue_map *qmap;
	int cqe_count = 0, is_error;

repoll:
	cqe = (struct ehca_cqe *)
		ipz_qeit_get_inc_valid(&my_cq->ipz_queue);
	if (!cqe) {
		ret = -EAGAIN;
		if (ehca_debug_level >= 3)
			ehca_dbg(cq->device, "Completion queue is empty  "
				 "my_cq=%p cq_num=%x", my_cq, my_cq->cq_number);
		goto poll_cq_one_exit0;
	}

	/* prevents loads being reordered across this point */
	rmb();

	cqe_count++;
	if (unlikely(cqe->status & WC_STATUS_PURGE_BIT)) {
		struct ehca_qp *qp;
		int purgeflag;
		unsigned long flags;

		qp = ehca_cq_get_qp(my_cq, cqe->local_qp_number);
		if (!qp) {
			ehca_err(cq->device, "cq_num=%x qp_num=%x "
				 "could not find qp -> ignore cqe",
				 my_cq->cq_number, cqe->local_qp_number);
			ehca_dmp(cqe, 64, "cq_num=%x qp_num=%x",
				 my_cq->cq_number, cqe->local_qp_number);
			/* ignore this purged cqe */
			goto repoll;
		}
		spin_lock_irqsave(&qp->spinlock_s, flags);
		purgeflag = qp->sqerr_purgeflag;
		spin_unlock_irqrestore(&qp->spinlock_s, flags);

		if (purgeflag) {
			ehca_dbg(cq->device,
				 "Got CQE with purged bit qp_num=%x src_qp=%x",
				 cqe->local_qp_number, cqe->remote_qp_number);
			if (ehca_debug_level >= 2)
				ehca_dmp(cqe, 64, "qp_num=%x src_qp=%x",
					 cqe->local_qp_number,
					 cqe->remote_qp_number);
			/*
			 * ignore this CQE to avoid duplicate CQEs for the
			 * bad WQE that caused the SQ error, and turn off
			 * the purge flag
			 */
			qp->sqerr_purgeflag = 0;
			goto repoll;
		}
	}

	is_error = cqe->status & WC_STATUS_ERROR_BIT;

	/* trace error CQEs if debug_level >= 1, trace all CQEs if >= 3 */
	if (unlikely(ehca_debug_level >= 3 || (ehca_debug_level && is_error))) {
		ehca_dbg(cq->device,
			 "Received %sCOMPLETION ehca_cq=%p cq_num=%x -----",
			 is_error ? "ERROR " : "", my_cq, my_cq->cq_number);
		ehca_dmp(cqe, 64, "ehca_cq=%p cq_num=%x",
			 my_cq, my_cq->cq_number);
		ehca_dbg(cq->device,
			 "ehca_cq=%p cq_num=%x -------------------------",
			 my_cq, my_cq->cq_number);
	}

	read_lock(&ehca_qp_idr_lock);
	my_qp = idr_find(&ehca_qp_idr, cqe->qp_token);
	read_unlock(&ehca_qp_idr_lock);
	if (!my_qp)
		goto repoll;
	wc->qp = &my_qp->ib_qp;

	if (is_error) {
		/*
		 * set left_to_poll to 0 because in error state, we will not
		 * get any additional CQEs
		 */
		ehca_add_to_err_list(my_qp, 1);
		my_qp->sq_map.left_to_poll = 0;

		if (HAS_RQ(my_qp))
			ehca_add_to_err_list(my_qp, 0);
		my_qp->rq_map.left_to_poll = 0;
	}

	qmap_tail_idx = get_app_wr_id(cqe->work_request_id);
	if (!(cqe->w_completion_flags & WC_SEND_RECEIVE_BIT))
		/* We got a send completion. */
		qmap = &my_qp->sq_map;
	else
		/* We got a receive completion. */
		qmap = &my_qp->rq_map;

	qmap_entry = &qmap->map[qmap_tail_idx];
	if (qmap_entry->reported) {
		ehca_warn(cq->device, "Double cqe on qp_num=%#x",
				my_qp->real_qp_num);
		/* found a double cqe, discard it and read next one */
		goto repoll;
	}

	wc->wr_id = replace_wr_id(cqe->work_request_id, qmap_entry->app_wr_id);
	qmap_entry->reported = 1;

	/* this is a proper completion, we need to advance the tail pointer */
	if (++qmap->tail == qmap->entries)
		qmap->tail = 0;

	/* if left_to_poll is decremented to 0, add the QP to the error list */
	if (qmap->left_to_poll > 0) {
		qmap->left_to_poll--;
		if ((my_qp->sq_map.left_to_poll == 0) &&
				(my_qp->rq_map.left_to_poll == 0)) {
			ehca_add_to_err_list(my_qp, 1);
			if (HAS_RQ(my_qp))
				ehca_add_to_err_list(my_qp, 0);
		}
	}

	/* eval ib_wc_opcode */
	wc->opcode = ib_wc_opcode[cqe->optype]-1;
	if (unlikely(wc->opcode == -1)) {
		ehca_err(cq->device, "Invalid cqe->OPType=%x cqe->status=%x "
			 "ehca_cq=%p cq_num=%x",
			 cqe->optype, cqe->status, my_cq, my_cq->cq_number);
		/* dump cqe for other infos */
		ehca_dmp(cqe, 64, "ehca_cq=%p cq_num=%x",
			 my_cq, my_cq->cq_number);
		/* update also queue adder to throw away this entry!!! */
		goto repoll;
	}

	/* eval ib_wc_status */
	if (unlikely(is_error)) {
		/* complete with errors */
		map_ib_wc_status(cqe->status, &wc->status);
		wc->vendor_err = wc->status;
	} else
		wc->status = IB_WC_SUCCESS;

	wc->byte_len = cqe->nr_bytes_transferred;
	wc->pkey_index = cqe->pkey_index;
	wc->slid = cqe->rlid;
	wc->dlid_path_bits = cqe->dlid;
	wc->src_qp = cqe->remote_qp_number;
	wc->wc_flags = cqe->w_completion_flags;
	wc->ex.imm_data = cpu_to_be32(cqe->immediate_data);
	wc->sl = cqe->service_level;

poll_cq_one_exit0:
	if (cqe_count > 0)
		hipz_update_feca(my_cq, cqe_count);

	return ret;
}

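/*
 * Generate flush completions for a QP that has entered the error state:
 * starting at the queue map tail, every not-yet-reported WQE of the given
 * queue is turned into an IB_WC_WR_FLUSH_ERR work completion, up to
 * num_entries of them.  Returns the number of WCs written.
 */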
static int generate_flush_cqes(struct ehca_qp *my_qp, struct ib_cq *cq,
			       struct ib_wc *wc, int num_entries,
			       struct ipz_queue *ipz_queue, int on_sq)
{
	int nr = 0;
	struct ehca_wqe *wqe;
	u64 offset;
	struct ehca_queue_map *qmap;
	struct ehca_qmap_entry *qmap_entry;

	if (on_sq)
		qmap = &my_qp->sq_map;
	else
		qmap = &my_qp->rq_map;

	qmap_entry = &qmap->map[qmap->tail];

	while ((nr < num_entries) && (qmap_entry->reported == 0)) {
		/* generate flush CQE */
		memset(wc, 0, sizeof(*wc));

		offset = qmap->tail * ipz_queue->qe_size;
		wqe = (struct ehca_wqe *)ipz_qeit_calc(ipz_queue, offset);
		if (!wqe) {
			ehca_err(cq->device, "Invalid wqe offset=%#lx on "
				 "qp_num=%#x", offset, my_qp->real_qp_num);
			return nr;
		}

		wc->wr_id = replace_wr_id(wqe->work_request_id,
					  qmap_entry->app_wr_id);

		if (on_sq) {
			switch (wqe->optype) {
			case WQE_OPTYPE_SEND:
				wc->opcode = IB_WC_SEND;
				break;
			case WQE_OPTYPE_RDMAWRITE:
				wc->opcode = IB_WC_RDMA_WRITE;
				break;
			case WQE_OPTYPE_RDMAREAD:
				wc->opcode = IB_WC_RDMA_READ;
				break;
			default:
				ehca_err(cq->device, "Invalid optype=%x",
						wqe->optype);
				return nr;
			}
		} else
			wc->opcode = IB_WC_RECV;

		if (wqe->wr_flag & WQE_WRFLAG_IMM_DATA_PRESENT) {
			wc->ex.imm_data = wqe->immediate_data;
			wc->wc_flags |= IB_WC_WITH_IMM;
		}

		wc->status = IB_WC_WR_FLUSH_ERR;

		wc->qp = &my_qp->ib_qp;

		/* mark as reported and advance tail pointer */
		qmap_entry->reported = 1;
		if (++qmap->tail == qmap->entries)
			qmap->tail = 0;
		qmap_entry = &qmap->map[qmap->tail];

		wc++; nr++;
	}

	return nr;
}

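/*
 * Poll the CQ.  Flush completions for QPs on this CQ's send/receive error
 * lists are delivered first; only the remaining WC slots are filled with
 * real CQEs via ehca_poll_cq_one().
 */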
int ehca_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc)
{
	struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
	int nr;
	struct ehca_qp *err_qp;
	struct ib_wc *current_wc = wc;
	int ret = 0;
	unsigned long flags;
	int entries_left = num_entries;

	if (num_entries < 1) {
		ehca_err(cq->device, "Invalid num_entries=%d ehca_cq=%p "
			 "cq_num=%x", num_entries, my_cq, my_cq->cq_number);
		ret = -EINVAL;
		goto poll_cq_exit0;
	}

	spin_lock_irqsave(&my_cq->spinlock, flags);

	/* generate flush cqes for send queues */
	list_for_each_entry(err_qp, &my_cq->sqp_err_list, sq_err_node) {
		nr = generate_flush_cqes(err_qp, cq, current_wc, entries_left,
				&err_qp->ipz_squeue, 1);
		entries_left -= nr;
		current_wc += nr;

		if (entries_left == 0)
			break;
	}

	/* generate flush cqes for receive queues */
	list_for_each_entry(err_qp, &my_cq->rqp_err_list, rq_err_node) {
		nr = generate_flush_cqes(err_qp, cq, current_wc, entries_left,
				&err_qp->ipz_rqueue, 0);
		entries_left -= nr;
		current_wc += nr;

		if (entries_left == 0)
			break;
	}

	for (nr = 0; nr < entries_left; nr++) {
		ret = ehca_poll_cq_one(cq, current_wc);
		if (ret)
			break;
		current_wc++;
	} /* eof for nr */
	entries_left -= nr;

	spin_unlock_irqrestore(&my_cq->spinlock, flags);
	if (ret == -EAGAIN  || !ret)
		ret = num_entries - entries_left;

poll_cq_exit0:
	return ret;
}

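/*
 * Arm the CQ for the requested notification type.  With
 * IB_CQ_REPORT_MISSED_EVENTS set, a nonzero value is returned when a valid
 * CQE is already waiting in the queue, telling the caller to poll again.
 */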
int ehca_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags notify_flags)
{
	struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
	int ret = 0;

	switch (notify_flags & IB_CQ_SOLICITED_MASK) {
	case IB_CQ_SOLICITED:
		hipz_set_cqx_n0(my_cq, 1);
		break;
	case IB_CQ_NEXT_COMP:
		hipz_set_cqx_n1(my_cq, 1);
		break;
	default:
		return -EINVAL;
	}

	if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) {
		unsigned long spl_flags;
		spin_lock_irqsave(&my_cq->spinlock, spl_flags);
		ret = ipz_qeit_is_valid(&my_cq->ipz_queue);
		spin_unlock_irqrestore(&my_cq->spinlock, spl_flags);
	}

	return ret;
}