/* sched.c - SPU scheduler.
 *
 * Copyright (C) IBM 2005
 * Author: Mark Nutter <mnutter@us.ibm.com>
 *
 * 2006-03-31	NUMA domains added.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#undef DEBUG

#include <linux/module.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/completion.h>
#include <linux/vmalloc.h>
#include <linux/smp.h>
#include <linux/stddef.h>
#include <linux/unistd.h>
#include <linux/numa.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/kthread.h>
#include <linux/pid_namespace.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/marker.h>

#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/spu.h>
#include <asm/spu_csa.h>
#include <asm/spu_priv1.h>
#include "spufs.h"

struct spu_prio_array {
	DECLARE_BITMAP(bitmap, MAX_PRIO);
	struct list_head runq[MAX_PRIO];
	spinlock_t runq_lock;
	int nr_waiting;
};

static unsigned long spu_avenrun[3];
static struct spu_prio_array *spu_prio;
static struct task_struct *spusched_task;
static struct timer_list spusched_timer;
static struct timer_list spuloadavg_timer;

/*
 * Priority of a normal, non-rt, non-niced process (aka nice level 0).
 */
#define NORMAL_PRIO		120

/*
 * Frequency of the spu scheduler tick.  By default we do one SPU scheduler
 * tick for every 10 CPU scheduler ticks.
 */
#define SPUSCHED_TICK		(10)

/*
 * These are the 'tuning knobs' of the scheduler:
 *
 * Minimum timeslice is 5 msecs (or 1 spu scheduler tick, whichever is
 * larger), default timeslice is 100 msecs, maximum timeslice is 800 msecs.
 */
#define MIN_SPU_TIMESLICE	max(5 * HZ / (1000 * SPUSCHED_TICK), 1)
#define DEF_SPU_TIMESLICE	(100 * HZ / (1000 * SPUSCHED_TICK))

#define MAX_USER_PRIO		(MAX_PRIO - MAX_RT_PRIO)
#define SCALE_PRIO(x, prio) \
	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_SPU_TIMESLICE)

/*
 * scale user-nice values [ -20 ... 0 ... 19 ] to time slice values:
 * [800ms ... 100ms ... 5ms]
 *
 * The higher a thread's priority, the bigger timeslices
 * it gets during one round of execution. But even the lowest
 * priority thread gets MIN_SPU_TIMESLICE worth of execution time.
 */
void spu_set_timeslice(struct spu_context *ctx)
{
	if (ctx->prio < NORMAL_PRIO)
		ctx->time_slice = SCALE_PRIO(DEF_SPU_TIMESLICE * 4, ctx->prio);
	else
		ctx->time_slice = SCALE_PRIO(DEF_SPU_TIMESLICE, ctx->prio);
}

/*
 * Update scheduling information from the owning thread.
 */
void __spu_update_sched_info(struct spu_context *ctx)
{
	/*
	 * assert that the context is not on the runqueue, so it is safe
	 * to change its scheduling parameters.
	 */
	BUG_ON(!list_empty(&ctx->rq));

	/*
	 * 32-Bit assignments are atomic on powerpc, and we don't care about
	 * memory ordering here because retrieving the controlling thread is
	 * by definition racy.
	 */
	ctx->tid = current->pid;

	/*
	 * We do our own priority calculations, so we normally want
	 * ->static_prio to start with. Unfortunately this field
	 * contains junk for threads with a realtime scheduling
	 * policy so we have to look at ->prio in this case.
	 */
	if (rt_prio(current->prio))
		ctx->prio = current->prio;
	else
		ctx->prio = current->static_prio;
	ctx->policy = current->policy;

	/*
	 * TO DO: the context may be loaded, so we may need to activate
	 * it again on a different node. But it shouldn't hurt anything
	 * to update its parameters, because we know that the scheduler
	 * is not actively looking at this field, since it is not on the
	 * runqueue. The context will be rescheduled on the proper node
	 * if it is timesliced or preempted.
	 */
	ctx->cpus_allowed = current->cpus_allowed;

	/* Save the current cpu id for spu interrupt routing. */
	ctx->last_ran = raw_smp_processor_id();
}

void spu_update_sched_info(struct spu_context *ctx)
{
	int node;

	if (ctx->state == SPU_STATE_RUNNABLE) {
		node = ctx->spu->node;

		/*
		 * Take list_mutex to sync with find_victim().
		 */
		mutex_lock(&cbe_spu_info[node].list_mutex);
		__spu_update_sched_info(ctx);
		mutex_unlock(&cbe_spu_info[node].list_mutex);
	} else {
		__spu_update_sched_info(ctx);
	}
}

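/*
 * Check whether a context is allowed to run on the given NUMA node, i.e.
 * whether its cpus_allowed mask intersects the CPUs of that node.
 * node_allowed() below is the wrapper that takes spu_prio->runq_lock.
 */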
static int __node_allowed(struct spu_context *ctx, int node)
{
	if (nr_cpus_node(node)) {
		cpumask_t mask = node_to_cpumask(node);

		if (cpus_intersects(mask, ctx->cpus_allowed))
			return 1;
	}

	return 0;
}

static int node_allowed(struct spu_context *ctx, int node)
{
	int rval;

	spin_lock(&spu_prio->runq_lock);
	rval = __node_allowed(ctx, node);
	spin_unlock(&spu_prio->runq_lock);

	return rval;
}

void do_notify_spus_active(void)
{
	int node;

	/*
	 * Wake up the active spu_contexts.
	 *
	 * When the awakened processes see their "notify_active" flag is set,
	 * they will call spu_switch_notify().
	 */
	for_each_online_node(node) {
		struct spu *spu;

		mutex_lock(&cbe_spu_info[node].list_mutex);
		list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
			if (spu->alloc_state != SPU_FREE) {
				struct spu_context *ctx = spu->ctx;
				set_bit(SPU_SCHED_NOTIFY_ACTIVE,
					&ctx->sched_flags);
				mb();
				wake_up_all(&ctx->stop_wq);
			}
		}
		mutex_unlock(&cbe_spu_info[node].list_mutex);
	}
}

/**
 * spu_bind_context - bind spu context to physical spu
 * @spu:	physical spu to bind to
 * @ctx:	context to bind
 */
static void spu_bind_context(struct spu *spu, struct spu_context *ctx)
{
	spu_context_trace(spu_bind_context__enter, ctx, spu);

	spuctx_switch_state(ctx, SPU_UTIL_SYSTEM);

	if (ctx->flags & SPU_CREATE_NOSCHED)
		atomic_inc(&cbe_spu_info[spu->node].reserved_spus);

	ctx->stats.slb_flt_base = spu->stats.slb_flt;
	ctx->stats.class2_intr_base = spu->stats.class2_intr;

	spu_associate_mm(spu, ctx->owner);

	spin_lock_irq(&spu->register_lock);
	spu->ctx = ctx;
	spu->flags = 0;
	ctx->spu = spu;
	ctx->ops = &spu_hw_ops;
	spu->pid = current->pid;
	spu->tgid = current->tgid;
	spu->ibox_callback = spufs_ibox_callback;
	spu->wbox_callback = spufs_wbox_callback;
	spu->stop_callback = spufs_stop_callback;
	spu->mfc_callback = spufs_mfc_callback;
	spin_unlock_irq(&spu->register_lock);

	spu_unmap_mappings(ctx);

	spu_switch_log_notify(spu, ctx, SWITCH_LOG_START, 0);
	spu_restore(&ctx->csa, spu);
	spu->timestamp = jiffies;
	spu_switch_notify(spu, ctx);
	ctx->state = SPU_STATE_RUNNABLE;

	spuctx_switch_state(ctx, SPU_UTIL_USER);
}

/*
 * Must be used with the list_mutex held.
 */
static inline int sched_spu(struct spu *spu)
{
	BUG_ON(!mutex_is_locked(&cbe_spu_info[spu->node].list_mutex));

	return (!spu->ctx || !(spu->ctx->flags & SPU_CREATE_NOSCHED));
}

static void aff_merge_remaining_ctxs(struct spu_gang *gang)
{
	struct spu_context *ctx;

	list_for_each_entry(ctx, &gang->aff_list_head, aff_list) {
		if (list_empty(&ctx->aff_list))
			list_add(&ctx->aff_list, &gang->aff_list_head);
	}
	gang->aff_flags |= AFF_MERGED;
}

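/*
 * Assign each context in the gang an offset relative to the affinity
 * reference context: the reference context gets offset 0, contexts
 * following it get positive offsets and contexts preceding it get
 * negative ones.
 */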
static void aff_set_offsets(struct spu_gang *gang)
{
	struct spu_context *ctx;
	int offset;

	offset = -1;
	list_for_each_entry_reverse(ctx, &gang->aff_ref_ctx->aff_list,
								aff_list) {
		if (&ctx->aff_list == &gang->aff_list_head)
			break;
		ctx->aff_offset = offset--;
	}

	offset = 0;
	list_for_each_entry(ctx, gang->aff_ref_ctx->aff_list.prev, aff_list) {
		if (&ctx->aff_list == &gang->aff_list_head)
			break;
		ctx->aff_offset = offset++;
	}

	gang->aff_flags |= AFF_OFFSETS_SET;
}

static struct spu *aff_ref_location(struct spu_context *ctx, int mem_aff,
		 int group_size, int lowest_offset)
{
	struct spu *spu;
	int node, n;

	/*
	 * TODO: A better algorithm could be used to find a good spu to be
	 *       used as reference location for the ctxs chain.
	 */
	node = cpu_to_node(raw_smp_processor_id());
	for (n = 0; n < MAX_NUMNODES; n++, node++) {
		/*
		 * "available_spus" counts how many spus are not potentially
		 * going to be used by other affinity gangs whose reference
		 * context is already in place. Although this code seeks to
		 * avoid having affinity gangs with a summed amount of
		 * contexts bigger than the amount of spus in the node,
		 * this may happen sporadically. In this case, available_spus
		 * becomes negative, which is harmless.
		 */
		int available_spus;

		node = (node < MAX_NUMNODES) ? node : 0;
		if (!node_allowed(ctx, node))
			continue;

		available_spus = 0;
		mutex_lock(&cbe_spu_info[node].list_mutex);
		list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
			if (spu->ctx && spu->ctx->gang && !spu->ctx->aff_offset
					&& spu->ctx->gang->aff_ref_spu)
				available_spus -= spu->ctx->gang->contexts;
			available_spus++;
		}
		if (available_spus < ctx->gang->contexts) {
			mutex_unlock(&cbe_spu_info[node].list_mutex);
			continue;
		}

		list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
			if ((!mem_aff || spu->has_mem_affinity) &&
							sched_spu(spu)) {
				mutex_unlock(&cbe_spu_info[node].list_mutex);
				return spu;
			}
		}
		mutex_unlock(&cbe_spu_info[node].list_mutex);
	}
	return NULL;
}

static void aff_set_ref_point_location(struct spu_gang *gang)
{
	int mem_aff, gs, lowest_offset;
	struct spu_context *ctx;
	struct spu *tmp;

	mem_aff = gang->aff_ref_ctx->flags & SPU_CREATE_AFFINITY_MEM;
	lowest_offset = 0;
	gs = 0;

	list_for_each_entry(tmp, &gang->aff_list_head, aff_list)
		gs++;

	list_for_each_entry_reverse(ctx, &gang->aff_ref_ctx->aff_list,
								aff_list) {
		if (&ctx->aff_list == &gang->aff_list_head)
			break;
		lowest_offset = ctx->aff_offset;
	}

	gang->aff_ref_spu = aff_ref_location(gang->aff_ref_ctx, mem_aff, gs,
							lowest_offset);
}

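/*
 * Starting from the gang's reference spu, walk the affinity list and
 * return the spu that is @offset schedulable spus away from it, going
 * forward for positive offsets and backward for negative ones.
 */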
static struct spu *ctx_location(struct spu *ref, int offset, int node)
{
	struct spu *spu;

	spu = NULL;
	if (offset >= 0) {
		list_for_each_entry(spu, ref->aff_list.prev, aff_list) {
			BUG_ON(spu->node != node);
			if (offset == 0)
				break;
			if (sched_spu(spu))
				offset--;
		}
	} else {
		list_for_each_entry_reverse(spu, ref->aff_list.next, aff_list) {
			BUG_ON(spu->node != node);
			if (offset == 0)
				break;
			if (sched_spu(spu))
				offset++;
		}
	}

	return spu;
}

/*
 * has_affinity is called each time a context is going to be scheduled.
 * It returns whether the context's gang has an spu affinity reference in
 * place, computing the reference location first if necessary.
 */
static int has_affinity(struct spu_context *ctx)
{
	struct spu_gang *gang = ctx->gang;

	if (list_empty(&ctx->aff_list))
		return 0;

	if (atomic_read(&ctx->gang->aff_sched_count) == 0)
		ctx->gang->aff_ref_spu = NULL;

	if (!gang->aff_ref_spu) {
		if (!(gang->aff_flags & AFF_MERGED))
			aff_merge_remaining_ctxs(gang);
		if (!(gang->aff_flags & AFF_OFFSETS_SET))
			aff_set_offsets(gang);
		aff_set_ref_point_location(gang);
	}

	return gang->aff_ref_spu != NULL;
}

/**
 * spu_unbind_context - unbind spu context from physical spu
 * @spu:	physical spu to unbind from
 * @ctx:	context to unbind
 */
static void spu_unbind_context(struct spu *spu, struct spu_context *ctx)
{
	u32 status;

	spu_context_trace(spu_unbind_context__enter, ctx, spu);

	spuctx_switch_state(ctx, SPU_UTIL_SYSTEM);

	if (spu->ctx->flags & SPU_CREATE_NOSCHED)
		atomic_dec(&cbe_spu_info[spu->node].reserved_spus);

	if (ctx->gang)
		/*
		 * If ctx->gang->aff_sched_count is positive, SPU affinity is
		 * being considered in this gang. Using atomic_dec_if_positive
		 * allows us to skip an explicit check for affinity in this gang.
		 */
		atomic_dec_if_positive(&ctx->gang->aff_sched_count);

	spu_switch_notify(spu, NULL);
	spu_unmap_mappings(ctx);
	spu_save(&ctx->csa, spu);
	spu_switch_log_notify(spu, ctx, SWITCH_LOG_STOP, 0);

	spin_lock_irq(&spu->register_lock);
	spu->timestamp = jiffies;
	ctx->state = SPU_STATE_SAVED;
	spu->ibox_callback = NULL;
	spu->wbox_callback = NULL;
	spu->stop_callback = NULL;
	spu->mfc_callback = NULL;
	spu->pid = 0;
	spu->tgid = 0;
	ctx->ops = &spu_backing_ops;
	spu->flags = 0;
	spu->ctx = NULL;
	spin_unlock_irq(&spu->register_lock);

	spu_associate_mm(spu, NULL);

	ctx->stats.slb_flt +=
		(spu->stats.slb_flt - ctx->stats.slb_flt_base);
	ctx->stats.class2_intr +=
		(spu->stats.class2_intr - ctx->stats.class2_intr_base);

	/* This maps the underlying spu state to idle */
	spuctx_switch_state(ctx, SPU_UTIL_IDLE_LOADED);
	ctx->spu = NULL;

	if (spu_stopped(ctx, &status))
		wake_up_all(&ctx->stop_wq);
}

/**
 * spu_add_to_rq - add a context to the runqueue
 * @ctx:       context to add
 */
static void __spu_add_to_rq(struct spu_context *ctx)
{
	/*
	 * Unfortunately this code path can be called from multiple threads
	 * on behalf of a single context due to the way the problem state
	 * mmap support works.
	 *
	 * Fortunately we need to wake up all these threads at the same time
	 * and can simply skip the runqueue addition for all but the first
	 * thread getting into this codepath.
	 *
	 * It's still quite hacky, and long-term we should proxy all other
	 * threads through the owner thread so that spu_run is in control
	 * of all the scheduling activity for a given context.
	 */
	if (list_empty(&ctx->rq)) {
		list_add_tail(&ctx->rq, &spu_prio->runq[ctx->prio]);
		set_bit(ctx->prio, spu_prio->bitmap);
		if (!spu_prio->nr_waiting++)
			__mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK);
	}
}

static void spu_add_to_rq(struct spu_context *ctx)
{
	spin_lock(&spu_prio->runq_lock);
	__spu_add_to_rq(ctx);
	spin_unlock(&spu_prio->runq_lock);
}

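/*
 * Remove a context from the runqueue.  Stops the scheduler tick timer
 * when the last waiting context goes away and clears the priority bit
 * once that runqueue level is empty.  Must be called with
 * spu_prio->runq_lock held.
 */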
static void __spu_del_from_rq(struct spu_context *ctx)
{
	int prio = ctx->prio;

	if (!list_empty(&ctx->rq)) {
		if (!--spu_prio->nr_waiting)
			del_timer(&spusched_timer);
		list_del_init(&ctx->rq);

		if (list_empty(&spu_prio->runq[prio]))
			clear_bit(prio, spu_prio->bitmap);
	}
}

void spu_del_from_rq(struct spu_context *ctx)
{
	spin_lock(&spu_prio->runq_lock);
	__spu_del_from_rq(ctx);
	spin_unlock(&spu_prio->runq_lock);
}

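/*
 * Put a SPU_CREATE_NOSCHED context on the runqueue and sleep until it is
 * woken up, either because an spu has been freed for it or because a
 * signal is pending.  Drops ctx->state_mutex across the sleep.
 */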
static void spu_prio_wait(struct spu_context *ctx)
{
	DEFINE_WAIT(wait);

	/*
	 * The caller must explicitly wait for a context to be loaded
	 * if the nosched flag is set.  If NOSCHED is not set, the caller
	 * queues the context and waits for an spu event or error.
	 */
	BUG_ON(!(ctx->flags & SPU_CREATE_NOSCHED));

	spin_lock(&spu_prio->runq_lock);
	prepare_to_wait_exclusive(&ctx->stop_wq, &wait, TASK_INTERRUPTIBLE);
	if (!signal_pending(current)) {
		__spu_add_to_rq(ctx);
		spin_unlock(&spu_prio->runq_lock);
		mutex_unlock(&ctx->state_mutex);
		schedule();
		mutex_lock(&ctx->state_mutex);
		spin_lock(&spu_prio->runq_lock);
		__spu_del_from_rq(ctx);
	}
	spin_unlock(&spu_prio->runq_lock);
	__set_current_state(TASK_RUNNING);
	remove_wait_queue(&ctx->stop_wq, &wait);
}

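/**
 * spu_get_idle - find an idle spu for a context
 * @ctx:	spu context to schedule
 *
 * Returns a free physical spu to run @ctx on, or NULL if none is
 * available, taking gang affinity and NUMA node restrictions into
 * account.
 */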
static struct spu *spu_get_idle(struct spu_context *ctx)
{
	struct spu *spu, *aff_ref_spu;
	int node, n;

	spu_context_nospu_trace(spu_get_idle__enter, ctx);

	if (ctx->gang) {
		mutex_lock(&ctx->gang->aff_mutex);
		if (has_affinity(ctx)) {
			aff_ref_spu = ctx->gang->aff_ref_spu;
			atomic_inc(&ctx->gang->aff_sched_count);
			mutex_unlock(&ctx->gang->aff_mutex);
			node = aff_ref_spu->node;

			mutex_lock(&cbe_spu_info[node].list_mutex);
			spu = ctx_location(aff_ref_spu, ctx->aff_offset, node);
			if (spu && spu->alloc_state == SPU_FREE)
				goto found;
			mutex_unlock(&cbe_spu_info[node].list_mutex);

			atomic_dec(&ctx->gang->aff_sched_count);
			goto not_found;
		}
		mutex_unlock(&ctx->gang->aff_mutex);
	}
	node = cpu_to_node(raw_smp_processor_id());
	for (n = 0; n < MAX_NUMNODES; n++, node++) {
		node = (node < MAX_NUMNODES) ? node : 0;
		if (!node_allowed(ctx, node))
			continue;

		mutex_lock(&cbe_spu_info[node].list_mutex);
		list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
			if (spu->alloc_state == SPU_FREE)
				goto found;
		}
		mutex_unlock(&cbe_spu_info[node].list_mutex);
	}

 not_found:
	spu_context_nospu_trace(spu_get_idle__not_found, ctx);
	return NULL;

 found:
	spu->alloc_state = SPU_USED;
	mutex_unlock(&cbe_spu_info[node].list_mutex);
	spu_context_trace(spu_get_idle__found, ctx, spu);
	spu_init_channels(spu);
	return spu;
}

/**
 * find_victim - find a lower priority context to preempt
 * @ctx:	candidate context for running
 *
 * Returns the freed physical spu to run the new context on.
 */
static struct spu *find_victim(struct spu_context *ctx)
{
	struct spu_context *victim = NULL;
	struct spu *spu;
	int node, n;

	spu_context_nospu_trace(spu_find_victim__enter, ctx);

	/*
	 * Look for a possible preemption candidate on the local node first.
	 * If there is no candidate look at the other nodes.  This isn't
	 * exactly fair, but so far the whole spu scheduler tries to keep
	 * a strong node affinity.  We might want to fine-tune this in
	 * the future.
	 */
 restart:
	node = cpu_to_node(raw_smp_processor_id());
	for (n = 0; n < MAX_NUMNODES; n++, node++) {
		node = (node < MAX_NUMNODES) ? node : 0;
		if (!node_allowed(ctx, node))
			continue;

		mutex_lock(&cbe_spu_info[node].list_mutex);
		list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
			struct spu_context *tmp = spu->ctx;

			if (tmp && tmp->prio > ctx->prio &&
			    !(tmp->flags & SPU_CREATE_NOSCHED) &&
			    (!victim || tmp->prio > victim->prio)) {
				victim = spu->ctx;
			}
		}
		if (victim)
			get_spu_context(victim);
		mutex_unlock(&cbe_spu_info[node].list_mutex);

		if (victim) {
			/*
			 * This nests ctx->state_mutex, but we always lock
			 * higher priority contexts before lower priority
			 * ones, so this is safe until we introduce
			 * priority inheritance schemes.
			 *
			 * XXX if the highest priority context is locked,
			 * this can loop a long time.  Might be better to
			 * look at another context or give up after X retries.
			 */
			if (!mutex_trylock(&victim->state_mutex)) {
				put_spu_context(victim);
				victim = NULL;
				goto restart;
			}

			spu = victim->spu;
			if (!spu || victim->prio <= ctx->prio) {
				/*
				 * This race can happen because we've dropped
				 * the active list mutex.  Not a problem, just
				 * restart the search.
				 */
				mutex_unlock(&victim->state_mutex);
				put_spu_context(victim);
				victim = NULL;
				goto restart;
			}

			spu_context_trace(__spu_deactivate__unload, ctx, spu);

			mutex_lock(&cbe_spu_info[node].list_mutex);
			cbe_spu_info[node].nr_active--;
			spu_unbind_context(spu, victim);
			mutex_unlock(&cbe_spu_info[node].list_mutex);

			victim->stats.invol_ctx_switch++;
			spu->stats.invol_ctx_switch++;
			if (test_bit(SPU_SCHED_SPU_RUN, &victim->sched_flags))
				spu_add_to_rq(victim);

			mutex_unlock(&victim->state_mutex);
			put_spu_context(victim);

			return spu;
		}
	}

	return NULL;
}

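/*
 * Bind @ctx to @spu if the spu is still free and wake up any threads
 * waiting in spu_run; otherwise put the context back on the runqueue.
 */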
static void __spu_schedule(struct spu *spu, struct spu_context *ctx)
{
	int node = spu->node;
	int success = 0;

	spu_set_timeslice(ctx);

	mutex_lock(&cbe_spu_info[node].list_mutex);
	if (spu->ctx == NULL) {
		spu_bind_context(spu, ctx);
		cbe_spu_info[node].nr_active++;
		spu->alloc_state = SPU_USED;
		success = 1;
	}
	mutex_unlock(&cbe_spu_info[node].list_mutex);

	if (success)
		wake_up_all(&ctx->run_wq);
	else
		spu_add_to_rq(ctx);
}

static void spu_schedule(struct spu *spu, struct spu_context *ctx)
{
	/* not a candidate for interruptible because it's called either
	   from the scheduler thread or from spu_deactivate */
	mutex_lock(&ctx->state_mutex);
	if (ctx->state == SPU_STATE_SAVED)
		__spu_schedule(spu, ctx);
	spu_release(ctx);
}

/**
 * spu_unschedule - remove a context from a spu, and possibly release it.
 * @spu:	The SPU to unschedule from
 * @ctx:	The context currently scheduled on the SPU
 * @free_spu:	Whether to free the SPU for other contexts
 *
 * Unbinds the context @ctx from the SPU @spu. If @free_spu is non-zero, the
 * SPU is made available for other contexts (ie, may be returned by
 * spu_get_idle). If this is zero, the caller is expected to schedule another
 * context to this spu.
 *
 * Should be called with ctx->state_mutex held.
 */
static void spu_unschedule(struct spu *spu, struct spu_context *ctx,
		int free_spu)
{
	int node = spu->node;

	mutex_lock(&cbe_spu_info[node].list_mutex);
	cbe_spu_info[node].nr_active--;
	if (free_spu)
		spu->alloc_state = SPU_FREE;
	spu_unbind_context(spu, ctx);
	ctx->stats.invol_ctx_switch++;
	spu->stats.invol_ctx_switch++;
	mutex_unlock(&cbe_spu_info[node].list_mutex);
}

/**
 * spu_activate - find a free spu for a context and execute it
 * @ctx:	spu context to schedule
 * @flags:	flags (currently ignored)
 *
 * Tries to find a free spu to run @ctx.  If no free spu is available
 * add the context to the runqueue so it gets woken up once an spu
 * is available.
 */
int spu_activate(struct spu_context *ctx, unsigned long flags)
{
	struct spu *spu;

	/*
	 * If there are multiple threads waiting for a single context
	 * only one actually binds the context while the others will
	 * only be able to acquire the state_mutex once the context
	 * already is in runnable state.
	 */
	if (ctx->spu)
		return 0;

spu_activate_top:
	if (signal_pending(current))
		return -ERESTARTSYS;

	spu = spu_get_idle(ctx);
	/*
	 * If this is a realtime thread we try to get it running by
	 * preempting a lower priority thread.
	 */
	if (!spu && rt_prio(ctx->prio))
		spu = find_victim(ctx);
	if (spu) {
		unsigned long runcntl;

		runcntl = ctx->ops->runcntl_read(ctx);
		__spu_schedule(spu, ctx);
		if (runcntl & SPU_RUNCNTL_RUNNABLE)
			spuctx_switch_state(ctx, SPU_UTIL_USER);

		return 0;
	}

	if (ctx->flags & SPU_CREATE_NOSCHED) {
		spu_prio_wait(ctx);
		goto spu_activate_top;
	}

	spu_add_to_rq(ctx);

	return 0;
}

/**
 * grab_runnable_context - try to find a runnable context
 *
 * Remove the highest priority context on the runqueue and return it
 * to the caller.  Returns %NULL if no runnable context was found.
 */
static struct spu_context *grab_runnable_context(int prio, int node)
{
	struct spu_context *ctx;
	int best;

	spin_lock(&spu_prio->runq_lock);
	best = find_first_bit(spu_prio->bitmap, prio);
	while (best < prio) {
		struct list_head *rq = &spu_prio->runq[best];

		list_for_each_entry(ctx, rq, rq) {
			/* XXX(hch): check for affinity here as well */
			if (__node_allowed(ctx, node)) {
				__spu_del_from_rq(ctx);
				goto found;
			}
		}
		best++;
	}
	ctx = NULL;
 found:
	spin_unlock(&spu_prio->runq_lock);
	return ctx;
}

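/*
 * Unbind @ctx from its spu and hand the spu to the highest priority
 * runnable context whose priority value is below @max_prio, if there is
 * one.  With @force set the context is unbound even when no replacement
 * is found.  Returns non-zero if another context was scheduled onto the
 * spu.
 */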
static int __spu_deactivate(struct spu_context *ctx, int force, int max_prio)
{
	struct spu *spu = ctx->spu;
	struct spu_context *new = NULL;

	if (spu) {
		new = grab_runnable_context(max_prio, spu->node);
		if (new || force) {
			spu_unschedule(spu, ctx, new == NULL);
			if (new) {
				if (new->flags & SPU_CREATE_NOSCHED)
					wake_up(&new->stop_wq);
				else {
					spu_release(ctx);
					spu_schedule(spu, new);
					/* this one can't easily be made
					   interruptible */
					mutex_lock(&ctx->state_mutex);
				}
			}
		}
	}

	return new != NULL;
}

/**
 * spu_deactivate - unbind a context from its physical spu
 * @ctx:	spu context to unbind
 *
 * Unbind @ctx from the physical spu it is running on and schedule
 * the highest priority context to run on the freed physical spu.
 */
void spu_deactivate(struct spu_context *ctx)
{
	spu_context_nospu_trace(spu_deactivate__enter, ctx);
	__spu_deactivate(ctx, 1, MAX_PRIO);
}

/**
 * spu_yield -	yield a physical spu if others are waiting
 * @ctx:	spu context to yield
 *
 * Check if there is a higher priority context waiting and if yes
 * unbind @ctx from the physical spu and schedule the highest
 * priority context to run on the freed physical spu instead.
 */
void spu_yield(struct spu_context *ctx)
{
	spu_context_nospu_trace(spu_yield__enter, ctx);
	if (!(ctx->flags & SPU_CREATE_NOSCHED)) {
		mutex_lock(&ctx->state_mutex);
		__spu_deactivate(ctx, 0, MAX_PRIO);
		mutex_unlock(&ctx->state_mutex);
	}
}

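/*
 * spusched_tick - timeslice accounting for one loaded context.
 *
 * Called from the spusched kernel thread.  When the context's time slice
 * has expired and another runnable context of equal or higher priority
 * is waiting, the current context is preempted in favour of the waiting
 * one.  SCHED_FIFO and NOSCHED contexts are never preempted here.
 */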
static noinline void spusched_tick(struct spu_context *ctx)
{
	struct spu_context *new = NULL;
	struct spu *spu = NULL;

	if (spu_acquire(ctx))
		BUG();	/* a kernel thread never has signals pending */

	if (ctx->state != SPU_STATE_RUNNABLE)
		goto out;
	if (ctx->flags & SPU_CREATE_NOSCHED)
		goto out;
	if (ctx->policy == SCHED_FIFO)
		goto out;

	if (--ctx->time_slice && test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags))
		goto out;

	spu = ctx->spu;

	spu_context_trace(spusched_tick__preempt, ctx, spu);

	new = grab_runnable_context(ctx->prio + 1, spu->node);
	if (new) {
		spu_unschedule(spu, ctx, 0);
		if (test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags))
			spu_add_to_rq(ctx);
	} else {
		spu_context_nospu_trace(spusched_tick__newslice, ctx);
		if (!ctx->time_slice)
			ctx->time_slice++;
	}
out:
	spu_release(ctx);

	if (new)
		spu_schedule(spu, new);
}

/**
 * count_active_contexts - count nr of active tasks
 *
 * Return the number of tasks currently running or waiting to run.
 *
 * Note that we don't take runq_lock / list_mutex here.  Reading
 * a single 32bit value is atomic on powerpc, and we don't care
 * about memory ordering issues here.
 */
static unsigned long count_active_contexts(void)
{
	int nr_active = 0, node;

	for (node = 0; node < MAX_NUMNODES; node++)
		nr_active += cbe_spu_info[node].nr_active;
	nr_active += spu_prio->nr_waiting;

	return nr_active;
}

/**
 * spu_calc_load - update the avenrun load estimates.
 *
 * No locking against reading these values from userspace, as for
 * the CPU loadavg code.
 */
static void spu_calc_load(void)
{
	unsigned long active_tasks; /* fixed-point */

	active_tasks = count_active_contexts() * FIXED_1;
	CALC_LOAD(spu_avenrun[0], EXP_1, active_tasks);
	CALC_LOAD(spu_avenrun[1], EXP_5, active_tasks);
	CALC_LOAD(spu_avenrun[2], EXP_15, active_tasks);
}

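/*
 * Timer callback: re-arm the scheduler tick and wake the spusched thread.
 */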
static void spusched_wake(unsigned long data)
{
	mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK);
	wake_up_process(spusched_task);
}

static void spuloadavg_wake(unsigned long data)
{