/*
 * Copyright (C) 2005 Philippe Gerum <rpm@xenomai.org>
 * Copyright (C) 2005 Gilles Chanteperdrix <gilles.chanteperdrix@xenomai.org>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */
#include <linux/types.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/kconfig.h>
#include <linux/unistd.h>
#include <cobalt/uapi/corectl.h>
#include <cobalt/kernel/tree.h>
#include <cobalt/kernel/vdso.h>
#include <cobalt/kernel/init.h>
#include <pipeline/kevents.h>
#include <asm/syscall.h>
#include "internal.h"
#include "thread.h"
#include "sched.h"
#include "mutex.h"
#include "cond.h"
#include "mqueue.h"
#include "sem.h"
#include "signal.h"
#include "timer.h"
#include "monitor.h"
#include "clock.h"
#include "event.h"
#include "timerfd.h"
#include "io.h"
#include "corectl.h"
#include "../debug.h"
#include <trace/events/cobalt-posix.h>

/* Syscall must run in the Linux domain. */
#define __xn_exec_lostage    0x1
/* Syscall must run in the Xenomai domain. */
#define __xn_exec_histage    0x2
/* Shadow syscall: caller must be mapped. */
#define __xn_exec_shadow     0x4
/* Switch back toggle; caller must return to its original mode. */
#define __xn_exec_switchback 0x8
/* Exec in current domain. */
#define __xn_exec_current    0x10
/* Exec in conforming domain, Xenomai or Linux. */
#define __xn_exec_conforming 0x20
/* Attempt syscall restart in the opposite domain upon -ENOSYS. */
#define __xn_exec_adaptive   0x40
/* Do not restart syscall upon signal receipt. */
#define __xn_exec_norestart  0x80
/* Shorthand for shadow init syscall. */
#define __xn_exec_init       __xn_exec_lostage
/* Shorthand for shadow syscall in Xenomai space. */
#define __xn_exec_primary   (__xn_exec_shadow|__xn_exec_histage)
/* Shorthand for shadow syscall in Linux space. */
#define __xn_exec_secondary (__xn_exec_shadow|__xn_exec_lostage)
/* Shorthand for syscall in Linux space with switchback if shadow. */
#define __xn_exec_downup    (__xn_exec_lostage|__xn_exec_switchback)
/* Shorthand for non-restartable primary syscall. */
#define __xn_exec_nonrestartable (__xn_exec_primary|__xn_exec_norestart)
/* Domain probing syscall starting in conforming mode. */
#define __xn_exec_probing   (__xn_exec_conforming|__xn_exec_adaptive)
/* Hand over mode selection to syscall.  */
#define __xn_exec_handover  (__xn_exec_current|__xn_exec_adaptive)
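
/*
 * For illustration, a minimal sketch of how the dispatchers below
 * interpret these bits (hypothetical helper names; see
 * handle_head_syscall() for the actual logic):
 *
 *	if (sysflags & __xn_exec_conforming)
 *		sysflags |= thread ? __xn_exec_histage : __xn_exec_lostage;
 *	if (sysflags & __xn_exec_lostage)
 *		relax_caller_if_primary();	// run from Linux context
 *	else if (sysflags & __xn_exec_histage)
 *		harden_caller_if_secondary();	// run from Xenomai context
 */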

typedef long (*cobalt_syshand)(unsigned long arg1, unsigned long arg2,
			       unsigned long arg3, unsigned long arg4,
			       unsigned long arg5);

static void prepare_for_signal(struct task_struct *p,
			       struct xnthread *thread,
			       struct pt_regs *regs,
			       int sysflags)
{
	int notify = 0;
	spl_t s;

	xnlock_get_irqsave(&nklock, s);

	if (xnthread_test_info(thread, XNKICKED)) {
		if (signal_pending(p)) {
			__xn_error_return(regs,
					  (sysflags & __xn_exec_norestart) ?
					  -EINTR : -ERESTARTSYS);
			notify = !xnthread_test_state(thread, XNSSTEP);
			xnthread_clear_info(thread, XNBREAK);
		}
		xnthread_clear_info(thread, XNKICKED);
	}

	xnlock_put_irqrestore(&nklock, s);

	xnthread_test_cancel();

	xnthread_relax(notify, SIGDEBUG_MIGRATE_SIGNAL);
}
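
/*
 * Note: when a signal is pending, the failing syscall returns
 * -ERESTARTSYS so that the regular kernel signal machinery can
 * restart it once the handler has run, unless the call mode carries
 * __xn_exec_norestart, in which case -EINTR is propagated to the
 * application instead.
 */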

static COBALT_SYSCALL(migrate, current, (int domain))
{
	struct xnthread *thread = xnthread_current();

	if (is_secondary_domain()) {
		if (domain == COBALT_PRIMARY) {
			if (thread == NULL)
				return -EPERM;
			/*
			 * Paranoid: a corner case where userland
			 * fiddles with SIGSHADOW while the target
			 * thread is still waiting to be started.
			 */
			if (xnthread_test_state(thread, XNDORMANT))
				return 0;

			return xnthread_harden() ? : 1;
		}
		return 0;
	}

	/* We are running on the head stage, apply relax request. */
	if (domain == COBALT_SECONDARY) {
		xnthread_relax(0, 0);
		return 1;
	}

	return 0;
}
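
/*
 * From userland, libcobalt typically issues this service through its
 * syscall wrappers; a minimal sketch, assuming lib/cobalt's usual
 * XENOMAI_SYSCALL1() wrapper and a caller already bound to the core:
 *
 *	ret = XENOMAI_SYSCALL1(sc_cobalt_migrate, COBALT_PRIMARY);
 *
 * A return of 1 means an actual domain switch took place, zero that
 * no migration was needed, and a negative value an error.
 */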

static COBALT_SYSCALL(trace, current,
		      (int op, unsigned long a1,
		       unsigned long a2, unsigned long a3))
{
	int ret = -EINVAL;

	switch (op) {
	case __xntrace_op_max_begin:
		ret = xntrace_max_begin(a1);
		break;

	case __xntrace_op_max_end:
		ret = xntrace_max_end(a1);
		break;

	case __xntrace_op_max_reset:
		ret = xntrace_max_reset();
		break;

	case __xntrace_op_user_start:
		ret = xntrace_user_start();
		break;

	case __xntrace_op_user_stop:
		ret = xntrace_user_stop(a1);
		break;

	case __xntrace_op_user_freeze:
		ret = xntrace_user_freeze(a1, a2);
		break;

	case __xntrace_op_special:
		ret = xntrace_special(a1 & 0xFF, a2);
		break;

	case __xntrace_op_special_u64:
		ret = xntrace_special_u64(a1 & 0xFF,
					  (((u64) a2) << 32) | a3);
		break;
	}
	return ret;
}

static COBALT_SYSCALL(ftrace_puts, current,
		      (const char __user *str))
{
	char buf[256];
	ssize_t len;	/* signed: cobalt_strncpy_from_user() may return -EFAULT */

	len = cobalt_strncpy_from_user(buf, str, sizeof(buf));
	if (len < 0)
		return -EFAULT;

#ifdef CONFIG_TRACING
	__trace_puts(_THIS_IP_, buf, len);
#endif

	return 0;
}

static COBALT_SYSCALL(archcall, current,
		      (unsigned long a1, unsigned long a2,
		       unsigned long a3, unsigned long a4,
		       unsigned long a5))
{
	return xnarch_local_syscall(a1, a2, a3, a4, a5);
}

static COBALT_SYSCALL(get_current, current,
		      (xnhandle_t __user *u_handle))
{
	struct xnthread *cur = xnthread_current();

	if (cur == NULL)
		return -EPERM;

	return cobalt_copy_to_user(u_handle, &cur->handle,
				   sizeof(*u_handle));
}

static COBALT_SYSCALL(backtrace, lostage,
		      (int nr, unsigned long __user *u_backtrace, int reason))
{
	unsigned long backtrace[SIGSHADOW_BACKTRACE_DEPTH];
	int ret;

	/*
	 * backtrace() in userland may be broken or may fail, in which
	 * case there is nothing to log; we may still want to know
	 * about this in kernel space for future use.
	 */
	if (nr <= 0)
		return 0;
	/*
	 * We may omit the older frames if we can't store the full
	 * backtrace.
	 */
	if (nr > SIGSHADOW_BACKTRACE_DEPTH)
		nr = SIGSHADOW_BACKTRACE_DEPTH;
	/*
	 * Fetch the backtrace array, filled with PC values as seen
	 * from the relaxing thread in user-space. Propagate any copy
	 * error back to the caller.
	 */
	ret = cobalt_copy_from_user(backtrace, u_backtrace, nr * sizeof(long));
	if (ret)
		return ret;

	xndebug_trace_relax(nr, backtrace, reason);

	return 0;
}

static COBALT_SYSCALL(serialdbg, current,
		      (const char __user *u_msg, int len))
{
	char buf[128];
	int n;

	while (len > 0) {
		n = len;
		if (n > sizeof(buf))
			n = sizeof(buf);
		if (cobalt_copy_from_user(buf, u_msg, n))
			return -EFAULT;
		raw_printk("%.*s", n, buf);
		u_msg += n;
		len -= n;
	}

	return 0;
}

static void stringify_feature_set(unsigned long fset, char *buf, int size)
{
	unsigned long feature;
	int nc, nfeat;

	*buf = '\0';

	for (feature = 1, nc = nfeat = 0; fset != 0 && size > 0; feature <<= 1) {
		if (fset & feature) {
			nc = ksformat(buf, size, "%s%s",
				      nfeat > 0 ? " " : "",
				      get_feature_label(feature));
			nfeat++;
			size -= nc;
			buf += nc;
			fset &= ~feature;
		}
	}
}
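
/*
 * For instance, assuming get_feature_label() maps two requested
 * feature bits to the labels "smp" and "allocator", the resulting
 * buffer would read "smp allocator". If the output buffer fills up,
 * the loop simply stops, truncating the remainder of the list.
 */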

static COBALT_SYSCALL(bind, lostage,
		      (struct cobalt_bindreq __user *u_breq))
{
	unsigned long featreq, featmis;
	struct cobalt_bindreq breq;
	struct cobalt_featinfo *f;
	int abirev;

	if (cobalt_copy_from_user(&breq, u_breq, sizeof(breq)))
		return -EFAULT;

	f = &breq.feat_ret;
	featreq = breq.feat_req;
	if (!realtime_core_running() && (featreq & __xn_feat_control) == 0)
		return -EAGAIN;

	/*
	 * Calculate the missing feature set:
	 * kernel_unavailable_set & user_mandatory_set.
	 */
	featmis = (~XENOMAI_FEAT_DEP & (featreq & XENOMAI_FEAT_MAN));
	abirev = breq.abi_rev;

	/*
	 * Pass back the supported feature set and the ABI revision
	 * level to user-space.
	 */
	f->feat_all = XENOMAI_FEAT_DEP;
	stringify_feature_set(XENOMAI_FEAT_DEP, f->feat_all_s,
			      sizeof(f->feat_all_s));
	f->feat_man = featreq & XENOMAI_FEAT_MAN;
	stringify_feature_set(f->feat_man, f->feat_man_s,
			      sizeof(f->feat_man_s));
	f->feat_mis = featmis;
	stringify_feature_set(featmis, f->feat_mis_s,
			      sizeof(f->feat_mis_s));
	f->feat_req = featreq;
	stringify_feature_set(featreq, f->feat_req_s,
			      sizeof(f->feat_req_s));
	f->feat_abirev = XENOMAI_ABI_REV;
	collect_arch_features(f);

	pipeline_collect_features(f);
	f->vdso_offset = cobalt_umm_offset(&cobalt_ppd_get(1)->umm, nkvdso);

	if (cobalt_copy_to_user(u_breq, &breq, sizeof(breq)))
		return -EFAULT;

	/*
	 * If some mandatory features the user-space code relies on
	 * are missing at kernel level, we cannot go further.
	 */
	if (featmis)
		return -EINVAL;

	if (!check_abi_revision(abirev))
		return -ENOEXEC;

	return cobalt_bind_core(featreq);
}
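
/*
 * sc_cobalt_bind is the first Cobalt syscall an application issues;
 * a minimal sketch of the handshake as seen from userland (the
 * wrapper name stands for whichever binding call libcobalt uses):
 *
 *	struct cobalt_bindreq breq = {
 *		.feat_req = <features userland was built with>,
 *		.abi_rev = <userland ABI revision>,
 *	};
 *	ret = cobalt_bind(&breq);
 *	// on success, breq.feat_ret describes the kernel capabilities
 *
 * -EAGAIN denotes a stopped real-time core, -EINVAL missing mandatory
 * features, and -ENOEXEC an ABI revision mismatch.
 */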

static COBALT_SYSCALL(extend, lostage, (unsigned int magic))
{
	return cobalt_bind_personality(magic);
}

static int CoBaLt_ni(void)
{
	return -ENOSYS;
}

/*
 * We have a single syscall table for all ABI models, i.e. 64bit
 * native + 32bit, or plain 32bit. In the former case, we may want to
 * support several models with a single build (e.g. ia32 and x32 for
 * x86_64).
 *
 * The syscall table is set up in a single step, based on three
 * subsequent sources of initializers:
 *
 * - first, all syscall entries are defaulted to a placeholder
 * returning -ENOSYS, as the table may be sparse.
 *
 * - then, __COBALT_CALL_ENTRY() produces a native call entry
 * (e.g. a pure 64bit call handler for a 64bit architecture),
 * optionally followed by a set of 32bit syscall entries offset by an
 * arch-specific base index, which default to the native calls. These
 * nitty-gritty details are defined by <asm/xenomai/syscall32.h>.
 * 32bit architectures - or 64bit ones for which we don't support any
 * 32bit ABI model - will simply define __COBALT_CALL32_ENTRY() as an
 * empty macro.
 *
 * - finally, 32bit thunk entries are generated per-architecture, by
 * including <asm/xenomai/syscall32-table.h>, overriding the default
 * handlers installed during the previous step.
 *
 * For instance, with CONFIG_X86_X32 support enabled in an x86_64
 * kernel, sc_cobalt_mq_timedreceive would appear twice in the table,
 * as:
 *
 * [sc_cobalt_mq_timedreceive] = cobalt_mq_timedreceive,
 * ...
 * [sc_cobalt_mq_timedreceive + __COBALT_X32_BASE] = cobalt32x_mq_timedreceive,
 *
 * cobalt32x_mq_timedreceive() would do the required thunking for
 * dealing with the 32<->64bit conversion of arguments. On the other
 * hand, sc_cobalt_sched_yield - which does not require any thunk -
 * would also appear twice, but both entries would point at the native
 * syscall implementation:
 *
 * [sc_cobalt_sched_yield] = cobalt_sched_yield,
 * ...
 * [sc_cobalt_sched_yield + __COBALT_X32_BASE] = cobalt_sched_yield,
 *
 * Accordingly, applications targeting the x32 model (-mx32) issue
 * syscalls in the range [__COBALT_X32_BASE..__COBALT_X32_BASE +
 * __NR_COBALT_SYSCALLS-1], whilst native (32/64bit) ones issue
 * syscalls in the range [0..__NR_COBALT_SYSCALLS-1].
 *
 * In short, this is an incremental process where the arch-specific
 * code can override the 32bit syscall entries, pointing at the thunk
 * routines it may need for handing 32bit calls over to their
 * respective 64bit implementation.
 *
 * By convention, there is NO pure 32bit syscall, which means that
 * each 32bit syscall defined by a compat ABI interface MUST match a
 * native (64bit) syscall. This is important as we share the call
 * modes (i.e. __xn_exec_ bits) between all ABI models.
 *
 * --rpm
 */
#define __syshand__(__name)	((cobalt_syshand)(CoBaLt_ ## __name))

#define __COBALT_NI	__syshand__(ni)

#define __COBALT_CALL_NI				\
	[0 ... __NR_COBALT_SYSCALLS-1] = __COBALT_NI,	\
	__COBALT_CALL32_INITHAND(__COBALT_NI)

#define __COBALT_CALL_NFLAGS				\
	[0 ... __NR_COBALT_SYSCALLS-1] = 0,		\
	__COBALT_CALL32_INITMODE(0)

#define __COBALT_CALL_ENTRY(__name)				\
	[sc_cobalt_ ## __name] = __syshand__(__name),		\
	__COBALT_CALL32_ENTRY(__name, __syshand__(__name))

#define __COBALT_MODE(__name, __mode)	\
	[sc_cobalt_ ## __name] = __xn_exec_##__mode,
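
/*
 * For instance, with the migrate service declared above via
 * COBALT_SYSCALL(migrate, current, ...), the generated initializers
 * expand as follows:
 *
 *	__COBALT_CALL_ENTRY(migrate)
 *		=> [sc_cobalt_migrate] = __syshand__(migrate),
 *	__COBALT_MODE(migrate, current)
 *		=> [sc_cobalt_migrate] = __xn_exec_current,
 *
 * filling cobalt_syscalls[] and cobalt_sysmodes[] respectively, plus
 * a 32bit companion entry wherever __COBALT_CALL32_ENTRY() is not
 * empty.
 */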

#ifdef CONFIG_XENO_ARCH_SYS3264
#include "syscall32.h"
#endif

#include "syscall_entries.h"

static const cobalt_syshand cobalt_syscalls[] = {
	__COBALT_CALL_NI
	__COBALT_CALL_ENTRIES
#ifdef CONFIG_XENO_ARCH_SYS3264
#include <asm/xenomai/syscall32-table.h>
#endif
};

static const int cobalt_sysmodes[] = {
	__COBALT_CALL_NFLAGS
	__COBALT_CALL_MODES
};

static inline int allowed_syscall(struct cobalt_process *process,
				  struct xnthread *thread,
				  int sysflags, int nr)
{
	/* sc_cobalt_bind does its own access checks. */
	if (nr == sc_cobalt_bind)
		return 1;

	if (process == NULL)
		return 0;

	/* A shadow syscall requires a mapped Cobalt thread. */
	if (thread == NULL && (sysflags & __xn_exec_shadow))
		return 0;

	return cap_raised(current_cap(), CAP_SYS_NICE);
}

int handle_head_syscall(bool caller_is_relaxed, struct pt_regs *regs)
{
	struct cobalt_process *process;
	int switched, sigs, sysflags;
	struct xnthread *thread;
	cobalt_syshand handler;
	struct task_struct *p;
	unsigned int nr, code;
	long ret;

	if (!__xn_syscall_p(regs))
		goto linux_syscall;

	thread = xnthread_current();
	code = __xn_syscall(regs);
	if (code >= ARRAY_SIZE(cobalt_syscalls))
		goto bad_syscall;

	nr = code & (__NR_COBALT_SYSCALLS - 1);

	trace_cobalt_head_sysentry(code);

	process = cobalt_current_process();
	if (process == NULL) {
		process = cobalt_search_process(current->mm);
		cobalt_set_process(process);
	}

	handler = cobalt_syscalls[code];
	sysflags = cobalt_sysmodes[nr];

	/*
	 * Executing Cobalt services requires CAP_SYS_NICE, except for
	 * sc_cobalt_bind which does its own checks.
	 */
	if (unlikely(!allowed_syscall(process, thread, sysflags, nr))) {
		/*
		 * Exclude get_current from reporting, it is used to probe the
		 * execution context.
		 */
		if (XENO_DEBUG(COBALT) && nr != sc_cobalt_get_current)
			printk(XENO_WARNING
			       "syscall <%d> denied to %s[%d]\n",
			       nr, current->comm, task_pid_nr(current));
		__xn_error_return(regs, -EPERM);
		goto ret_handled;
	}

	if (sysflags & __xn_exec_conforming)
		/*
		 * If the conforming exec bit is set, turn the exec
		 * bitmask for the syscall into the most appropriate
		 * setup for the caller, i.e. Xenomai domain for
		 * shadow threads, Linux otherwise.
		 */
		sysflags |= (thread ? __xn_exec_histage : __xn_exec_lostage);

	/*
	 * Here we have to dispatch the syscall execution properly,
	 * depending on:
	 *
	 * o Whether the syscall must run in the Linux or Xenomai
	 * domain, or indifferently in the caller's current domain.
	 *
	 * o Whether the caller currently runs in the Linux or Xenomai
	 * domain.
	 */
restart:
	/*
	 * Process adaptive syscalls by restarting them in the
	 * opposite domain upon receiving -ENOSYS from the syscall
	 * handler.
	 */
	switched = 0;
	if (sysflags & __xn_exec_lostage) {
		/*
		 * The syscall must run from the Linux domain.
		 */
		if (!caller_is_relaxed) {
			/*
			 * Request originates from the Xenomai domain:
			 * relax the caller then invoke the syscall
			 * handler right after.
			 */
			xnthread_relax(1, SIGDEBUG_MIGRATE_SYSCALL);
			switched = 1;
		} else
			/*
			 * Request originates from the Linux domain:
			 * propagate the event to our Linux-based
			 * handler, so that the syscall is executed
			 * from there.
			 */
			return KEVENT_PROPAGATE;
	} else if (sysflags & (__xn_exec_histage | __xn_exec_current)) {
		/*
		 * Syscall must run either from the Xenomai domain, or
		 * from the calling domain.
		 *
		 * If the request originates from the Linux domain,
		 * hand it over to our secondary-mode dispatcher.
		 * Otherwise, invoke the syscall handler immediately.
		 */
		if (caller_is_relaxed)
			return KEVENT_PROPAGATE;
	}

	/*
	 * 'thread' has to be valid from that point: all syscalls
	 * regular threads may call have been pipelined to the root
	 * handler (lostage ones), or rejected by allowed_syscall().
	 */

	ret = handler(__xn_reg_arglist(regs));
	if (ret == -ENOSYS && (sysflags & __xn_exec_adaptive)) {
		if (switched) {
			ret = xnthread_harden();
			if (ret) {
				switched = 0;
				goto done;
			}
		} else /* Mark the primary -> secondary transition. */
			xnthread_set_localinfo(thread, XNDESCENT);
		sysflags ^=
		    (__xn_exec_lostage | __xn_exec_histage |
		     __xn_exec_adaptive);
		goto restart;
	}
done:
	__xn_status_return(regs, ret);
	sigs = 0;
	if (!xnsched_root_p()) {
		p = current;
		if (signal_pending(p) ||
		    xnthread_test_info(thread, XNKICKED)) {
			sigs = 1;
			prepare_for_signal(p, thread, regs, sysflags);
		} else if (xnthread_test_state(thread, XNWEAK) &&
			   thread->res_count == 0) {
			if (switched)
				switched = 0;
			else
				xnthread_relax(0, 0);
		}
	}
	if (!sigs && (sysflags & __xn_exec_switchback) && switched)
		/* -EPERM will be trapped later if needed. */
		xnthread_harden();

ret_handled:
	/* Update the stats and userland-visible state. */
	if (thread) {
		xnthread_clear_localinfo(thread, XNDESCENT);
		xnstat_counter_inc(&thread->stat.xsc);
		xnthread_sync_window(thread);
	}

	trace_cobalt_head_sysexit(__xn_reg_rval(regs));

	return KEVENT_STOP;

linux_syscall:
	if (xnsched_root_p())
		/*
		 * The call originates from the Linux domain, either
		 * from a relaxed shadow or from a regular Linux task;
		 * just propagate the event so that we will fall back
		 * to handle_root_syscall().
		 */
		return KEVENT_PROPAGATE;

	if (!__xn_rootcall_p(regs, &code))
		goto bad_syscall;

	/*
	 * We know this is a Cobalt thread since it runs over the head
	 * domain, however the current syscall should be handled by
	 * the host kernel instead.  Before this happens, we have to
	 * re-enter the root domain.
	 */
	xnthread_relax(1, SIGDEBUG_MIGRATE_SYSCALL);

	return KEVENT_PROPAGATE;

bad_syscall:
	printk(XENO_WARNING "bad syscall <%#x>\n", code);

	__xn_error_return(regs, -ENOSYS);

	return KEVENT_STOP;
}
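
/*
 * To illustrate the adaptive path above: a probing syscall
 * (__xn_exec_conforming|__xn_exec_adaptive) issued from primary mode
 * first runs with __xn_exec_histage set. Should the handler return
 * -ENOSYS, the XOR flips the lostage/histage bits and clears
 * __xn_exec_adaptive, the caller is relaxed or hardened accordingly,
 * and the handler is restarted exactly once in the opposite domain.
 */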

int handle_root_syscall(struct pt_regs *regs)
{
	int sysflags, switched, sigs;
	struct xnthread *thread;
	cobalt_syshand handler;
	struct task_struct *p;
	unsigned int nr, code;
	long ret;

	/*
	 * Catch cancellation requests pending for user shadows
	 * running mostly in secondary mode, i.e. XNWEAK. In that
	 * case, we won't run prepare_for_signal() that frequently, so
	 * check for cancellation here.
	 */
	xnthread_test_cancel();

	if (!__xn_syscall_p(regs))
		/* Fall back to Linux syscall handling. */
		return KEVENT_PROPAGATE;

	thread = xnthread_current();
	/* code has already been checked in the head domain handler. */
	code = __xn_syscall(regs);
	nr = code & (__NR_COBALT_SYSCALLS - 1);

	trace_cobalt_root_sysentry(code);

	/* Processing a Xenomai syscall. */

	handler = cobalt_syscalls[code];
	sysflags = cobalt_sysmodes[nr];

	if (thread && (sysflags & __xn_exec_conforming))
		sysflags |= __xn_exec_histage;

restart:
	/*
	 * Process adaptive syscalls by restarting them in the
	 * opposite domain upon receiving -ENOSYS from the syscall
	 * handler.
	 */
	switched = 0;
	if (sysflags & __xn_exec_histage) {
		/*
		 * This request originates from the Linux domain but
		 * should run in the Xenomai domain: harden the
		 * caller before invoking the syscall handler.
		 */
		ret = xnthread_harden();
		if (ret) {
			__xn_error_return(regs, ret);
			goto ret_handled;
		}
		switched = 1;
	} else {
		/*
		 * We want to run the syscall in the current Linux
		 * domain. This is a slow path, so proceed with any
		 * pending schedparam update on the fly.
		 */
		if (thread)
			xnthread_propagate_schedparam(thread);
	}

	ret = handler(__xn_reg_arglist(regs));
	if (ret == -ENOSYS && (sysflags & __xn_exec_adaptive)) {
		sysflags ^= __xn_exec_histage;
		if (switched) {
			xnthread_relax(1, SIGDEBUG_MIGRATE_SYSCALL);
			sysflags &= ~__xn_exec_adaptive;
			/* Mark the primary -> secondary transition. */
			xnthread_set_localinfo(thread, XNDESCENT);
		}
		goto restart;
	}

	__xn_status_return(regs, ret);

	sigs = 0;
	if (!xnsched_root_p()) {
		/*
		 * We may have gained a shadow TCB from the syscall we
		 * just invoked, so make sure to fetch it.
		 */
		thread = xnthread_current();
		p = current;
		if (signal_pending(p)) {
			sigs = 1;
			prepare_for_signal(p, thread, regs, sysflags);
		} else if (xnthread_test_state(thread, XNWEAK) &&
			   thread->res_count == 0)
			sysflags |= __xn_exec_switchback;
	}
	if (!sigs && (sysflags & __xn_exec_switchback)
	    && (switched || xnsched_primary_p()))
		xnthread_relax(0, 0);

ret_handled:
	/* Update the stats and userland-visible state. */
	if (thread) {
		xnthread_clear_localinfo(thread, XNDESCENT|XNHICCUP);
		xnstat_counter_inc(&thread->stat.xsc);
		xnthread_sync_window(thread);
	}

	trace_cobalt_root_sysexit(__xn_reg_rval(regs));

	return KEVENT_STOP;
}

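/*
 * Cobalt does not resume interrupted syscalls through the
 * restart_block mechanism (-ERESTARTSYS replays the trapping
 * instruction instead), so this placeholder merely fails with
 * -EINVAL in case it is ever invoked.
 */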
long cobalt_restart_syscall_placeholder(struct restart_block *param)
{
	return -EINVAL;
}