vsyscall_64.c 8.59 KB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
/*
 *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
 *  Copyright 2003 Andi Kleen, SuSE Labs.
 *
 *  Thanks to hpa@transmeta.com for some useful hint.
 *  Special thanks to Ingo Molnar for his early experience with
 *  a different vsyscall implementation for Linux/IA32 and for the name.
 *
 *  vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
 *  at virtual address -10Mbyte+1024bytes etc... There are at max 4
 *  vsyscalls. One vsyscall can reserve more than 1 slot to avoid
 *  jumping out of line if necessary. We cannot add more with this
 *  mechanism because older kernels won't return -ENOSYS.
 *  If we want more than four we need a vDSO.
 *
 *  Note: the concept clashes with user mode linux. If you use UML and
 *  want per guest time just set the kernel.vsyscall64 sysctl to 0.
 */

20
/* Disable profiling for userspace code: */
21
#define DISABLE_BRANCH_PROFILING
22

Linus Torvalds's avatar
Linus Torvalds committed
23
24
25
26
27
28
29
#include <linux/time.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/seqlock.h>
#include <linux/jiffies.h>
#include <linux/sysctl.h>
30
#include <linux/clocksource.h>
31
#include <linux/getcpu.h>
32
33
34
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/notifier.h>
Linus Torvalds's avatar
Linus Torvalds committed
35
36
37
38

#include <asm/vsyscall.h>
#include <asm/pgtable.h>
#include <asm/page.h>
39
#include <asm/unistd.h>
Linus Torvalds's avatar
Linus Torvalds committed
40
41
42
#include <asm/fixmap.h>
#include <asm/errno.h>
#include <asm/io.h>
43
44
45
#include <asm/segment.h>
#include <asm/desc.h>
#include <asm/topology.h>
46
#include <asm/vgtod.h>
Linus Torvalds's avatar
Linus Torvalds committed
47

48
49
/*
 * Attribute set for a vsyscall entry point: the function is emitted into
 * the section ".vsyscall_<nr>" and is marked unused and notrace.
 * NOTE(review): the mapping from section name to the fixed vsyscall
 * address is not visible in this file -- presumably done by the linker
 * script; confirm there.
 */
#define __vsyscall(nr) \
		__attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
/*
 * Clobber list shared by the inline "syscall" asm statements below
 * (the SYSCALL instruction overwrites rcx and r11).
 */
#define __syscall_clobber "r11","cx","memory"
Linus Torvalds's avatar
Linus Torvalds committed
51

52
53
54
/*
 * vsyscall_gtod_data contains data that is:
 * - readonly from vsyscalls
 * - written by timer interrupt or sysctl (/proc/sys/kernel/vsyscall64)
 * Try to keep this structure as small as possible to avoid cache line ping pongs
 */
58
/*
 * Selects how vgetcpu() obtains the packed cpu/node value: when it equals
 * VGETCPU_RDTSCP the value is read with RDTSCP, otherwise it is loaded from
 * the per-CPU GDT segment limit. It is not assigned in the code visible in
 * this file.
 */
int __vgetcpu_mode __section_vgetcpu_mode;
Linus Torvalds's avatar
Linus Torvalds committed
59

60
/*
 * Timekeeping snapshot consumed by the vsyscall functions in this file.
 * Starts with an unlocked seqlock and with the vsyscall fast path enabled
 * (sysctl_enabled = 1); the remaining fields are left zero-initialized and
 * are filled in by update_vsyscall() / update_vsyscall_tz().
 */
struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
{
	.lock = SEQLOCK_UNLOCKED,
	.sysctl_enabled = 1,
};
Linus Torvalds's avatar
Linus Torvalds committed
65

66
67
68
69
70
71
72
73
74
75
/*
 * update_vsyscall_tz - refresh the timezone copy kept in vsyscall_gtod_data.
 *
 * Copies the global sys_tz into vsyscall_gtod_data.sys_tz while holding the
 * write side of vsyscall_gtod_data.lock (irqsave variant).
 */
void update_vsyscall_tz(void)
{
	unsigned long irq_state;

	write_seqlock_irqsave(&vsyscall_gtod_data.lock, irq_state);
	vsyscall_gtod_data.sys_tz = sys_tz;	/* pick up the changed sys_tz */
	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, irq_state);
}

76
void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
Linus Torvalds's avatar
Linus Torvalds committed
77
{
78
	unsigned long flags;
Linus Torvalds's avatar
Linus Torvalds committed
79

80
81
	write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
	/* copy vsyscall data */
82
83
84
85
86
87
88
	vsyscall_gtod_data.clock.vread = clock->vread;
	vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
	vsyscall_gtod_data.clock.mask = clock->mask;
	vsyscall_gtod_data.clock.mult = clock->mult;
	vsyscall_gtod_data.clock.shift = clock->shift;
	vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
	vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
89
	vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
90
	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
Linus Torvalds's avatar
Linus Torvalds committed
91
92
}

93
94
95
/* RED-PEN may want to readd seq locking, but then the variable should be
 * write-once.
 */
96
static __always_inline void do_get_tz(struct timezone * tz)
Linus Torvalds's avatar
Linus Torvalds committed
97
{
98
	*tz = __vsyscall_gtod_data.sys_tz;
Linus Torvalds's avatar
Linus Torvalds committed
99
100
}

101
static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
Linus Torvalds's avatar
Linus Torvalds committed
102
103
{
	int ret;
Thomas Gleixner's avatar
Thomas Gleixner committed
104
	asm volatile("syscall"
Linus Torvalds's avatar
Linus Torvalds committed
105
		: "=a" (ret)
106
107
		: "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
		: __syscall_clobber );
Linus Torvalds's avatar
Linus Torvalds committed
108
109
110
	return ret;
}

111
static __always_inline long time_syscall(long *t)
Linus Torvalds's avatar
Linus Torvalds committed
112
113
{
	long secs;
Thomas Gleixner's avatar
Thomas Gleixner committed
114
	asm volatile("syscall"
Linus Torvalds's avatar
Linus Torvalds committed
115
116
117
118
119
		: "=a" (secs)
		: "0" (__NR_time),"D" (t) : __syscall_clobber);
	return secs;
}

120
121
122
static __always_inline void do_vgettimeofday(struct timeval * tv)
{
	cycle_t now, base, mask, cycle_delta;
123
124
	unsigned seq;
	unsigned long mult, shift, nsec;
125
126
127
128
129
130
	cycle_t (*vread)(void);
	do {
		seq = read_seqbegin(&__vsyscall_gtod_data.lock);

		vread = __vsyscall_gtod_data.clock.vread;
		if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
Al Viro's avatar
Al Viro committed
131
			gettimeofday(tv,NULL);
132
133
			return;
		}
134

135
136
137
138
139
140
		now = vread();
		base = __vsyscall_gtod_data.clock.cycle_last;
		mask = __vsyscall_gtod_data.clock.mask;
		mult = __vsyscall_gtod_data.clock.mult;
		shift = __vsyscall_gtod_data.clock.shift;

141
142
		tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
		nsec = __vsyscall_gtod_data.wall_time_nsec;
143
144
145
146
147
	} while (read_seqretry(&__vsyscall_gtod_data.lock, seq));

	/* calculate interval: */
	cycle_delta = (now - base) & mask;
	/* convert to nsecs: */
148
	nsec += (cycle_delta * mult) >> shift;
149

150
	while (nsec >= NSEC_PER_SEC) {
151
		tv->tv_sec += 1;
152
		nsec -= NSEC_PER_SEC;
153
	}
154
	tv->tv_usec = nsec / NSEC_PER_USEC;
155
156
}

157
/*
 * vgettimeofday - vsyscall slot 0: gettimeofday.
 * @tv: if non-NULL, filled in by do_vgettimeofday().
 * @tz: if non-NULL, filled in by do_get_tz().
 *
 * Always returns 0.
 */
int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
{
	if (tv)
		do_vgettimeofday(tv);
	if (tz)
		do_get_tz(tz);
	return 0;
}

/* This will break when the xtime seconds get inaccurate, but that is
 * unlikely */
168
time_t __vsyscall(1) vtime(time_t *t)
Linus Torvalds's avatar
Linus Torvalds committed
169
{
john stultz's avatar
john stultz committed
170
	struct timeval tv;
171
	time_t result;
172
	if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
Linus Torvalds's avatar
Linus Torvalds committed
173
		return time_syscall(t);
john stultz's avatar
john stultz committed
174

175
	vgettimeofday(&tv, NULL);
john stultz's avatar
john stultz committed
176
	result = tv.tv_sec;
177
178
179
	if (t)
		*t = result;
	return result;
Linus Torvalds's avatar
Linus Torvalds committed
180
181
}

182
183
184
185
186
187
188
189
190
191
/* Fast way to get current CPU and node.
   This helps to do per node and per CPU caches in user space.
   The result is not guaranteed without CPU affinity, but usually
   works out because the scheduler tries to keep a thread on the same
   CPU.

   tcache must point to a two element sized long array.
   All arguments can be NULL.

   The packed value p holds the CPU number in its low 12 bits and the
   node number in the bits above. When tcache is given, blob[0] stores
   the jiffies value of the last lookup and blob[1] the packed value.
   Always returns 0. */
long __vsyscall(2)
vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
{
	unsigned int p;
	unsigned long j = 0;

	/* Fast cache - only recompute value once per jiffies and avoid
	   relatively costly rdtscp/cpuid otherwise.
	   This works because the scheduler usually keeps the process
	   on the same CPU and this syscall doesn't guarantee its
	   results anyways.
	   We do this here because otherwise user space would do it on
	   its own in a likely inferior way (no access to jiffies).
	   If you don't like it pass NULL. */
	if (tcache && tcache->blob[0] == (j = __jiffies)) {
		p = tcache->blob[1];
	} else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
		/* Load per CPU data from RDTSCP */
		native_read_tscp(&p);
	} else {
		/* Load per CPU data from GDT */
		asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
	}
	if (tcache) {
		/* j was loaded from __jiffies above whenever tcache != NULL */
		tcache->blob[0] = j;
		tcache->blob[1] = p;
	}
	if (cpu)
		*cpu = p & 0xfff;
	if (node)
		*node = p >> 12;
	return 0;
}

Ingo Molnar's avatar
Ingo Molnar committed
224
/* vsyscall slot 3: not implemented, always returns -ENOSYS. */
static long __vsyscall(3) venosys_1(void)
{
	return -ENOSYS;
}

#ifdef CONFIG_SYSCTL
230
231
232
233
234
235
236
237

/*
 * proc handler for the vsyscall64 sysctl entry: hands every argument
 * through to proc_dointvec() unchanged and returns its result.
 */
static int
vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
		       void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int rc;

	rc = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
	return rc;
}

Linus Torvalds's avatar
Linus Torvalds committed
238
static ctl_table kernel_table2[] = {
239
	{ .procname = "vsyscall64",
240
	  .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
241
242
	  .mode = 0644,
	  .proc_handler = vsyscall_sysctl_change },
243
	{}
Linus Torvalds's avatar
Linus Torvalds committed
244
245
246
247
248
};

/*
 * Root sysctl table registered by vsyscall_init(): the "kernel" directory
 * (CTL_KERN, mode 0555) whose children are kernel_table2. The empty entry
 * ends the table.
 */
static ctl_table kernel_root_table2[] = {
	{ .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
	  .child = kernel_table2 },
	{}
};
#endif

253
254
255
/* Assume __initcall executes before all user space. Hopefully kmod
   doesn't violate that. We'll find out if it does. */
/*
 * vsyscall_set_cpu - publish the cpu/node numbers where vgetcpu() can read
 * them.
 * @cpu: CPU number to encode.
 *
 * The node is 0 unless CONFIG_NUMA is set, in which case it comes from
 * cpu_to_node(). The packed value (node << 12) | cpu is written to the
 * RDTSCP aux register when the CPU has X86_FEATURE_RDTSCP, and the same
 * information is always stored in the limit field of the GDT_ENTRY_PER_CPU
 * descriptor.
 */
static void __cpuinit vsyscall_set_cpu(int cpu)
{
	unsigned long d;
	unsigned long node = 0;
#ifdef CONFIG_NUMA
	node = cpu_to_node(cpu);
#endif
	if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
		write_rdtscp_aux((node << 12) | cpu);

	/* Store cpu number in limit so that it can be loaded quickly
	   in user space in vgetcpu.
	   12 bits for the CPU and 8 bits for the node. */
	d = 0x0f40000000000ULL;
	d |= cpu;
	/* node bits 0-3 go to descriptor bits 12-15, node bits 4+ to bit 48 up */
	d |= (node & 0xf) << 12;
	d |= (node >> 4) << 48;
	write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
}

275
276
277
278
279
280
281
282
283
284
/*
 * Per-CPU callback: set up the vsyscall cpu/node data for the CPU this
 * function is running on. @arg is unused.
 */
static void __cpuinit cpu_vsyscall_init(void *arg)
{
	int this_cpu;

	/* preemption should be already off */
	this_cpu = raw_smp_processor_id();
	vsyscall_set_cpu(this_cpu);
}

/*
 * cpu_vsyscall_notifier - CPU hotplug callback.
 * @n:      notifier block (unused).
 * @action: hotplug event.
 * @arg:    CPU number, passed as a pointer-sized integer.
 *
 * On CPU_ONLINE or CPU_ONLINE_FROZEN, runs cpu_vsyscall_init() on the
 * affected CPU via smp_call_function_single(). All other events are
 * ignored. Always returns NOTIFY_DONE.
 */
static int __cpuinit
cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
{
	long cpu = (long)arg;
	if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
		smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
	return NOTIFY_DONE;
}

Ingo Molnar's avatar
Ingo Molnar committed
290
/*
 * map_vsyscall - install the vsyscall page in the fixmap.
 *
 * Takes the physical address of the __vsyscall_0 symbol and maps it at the
 * VSYSCALL_FIRST_PAGE fixmap slot with PAGE_KERNEL_VSYSCALL protection.
 */
void __init map_vsyscall(void)
{
	extern char __vsyscall_0;
	unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);

	/* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
	__set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
}

/*
 * vsyscall_init - boot-time setup for the vsyscall page.
 *
 * Checks that vgettimeofday, vtime and vgetcpu ended up at the addresses
 * VSYSCALL_ADDR() gives for their slot numbers, and that slot 0 matches the
 * fixmap address of VSYSCALL_FIRST_PAGE (BUG otherwise). Then registers the
 * vsyscall64 sysctl when CONFIG_SYSCTL is set, runs cpu_vsyscall_init() on
 * every CPU, and installs cpu_vsyscall_notifier() as a hotplug notifier.
 * Always returns 0.
 */
static int __init vsyscall_init(void)
{
	BUG_ON(((unsigned long) &vgettimeofday !=
			VSYSCALL_ADDR(__NR_vgettimeofday)));
	BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
	BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
	BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
#ifdef CONFIG_SYSCTL
	register_sysctl_table(kernel_root_table2);
#endif
	on_each_cpu(cpu_vsyscall_init, NULL, 1);
	hotcpu_notifier(cpu_vsyscall_notifier, 0);
	return 0;
}

__initcall(vsyscall_init);