#include <trace/syscall.h>
#include <trace/events/syscalls.h>
#include <linux/kernel.h>
#include <linux/ftrace.h>
#include <linux/perf_event.h>
#include <asm/syscall.h>

#include "trace_output.h"
#include "trace.h"

static DEFINE_MUTEX(syscall_trace_lock);
static int sys_refcount_enter;
static int sys_refcount_exit;
static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);

extern unsigned long __start_syscalls_metadata[];
extern unsigned long __stop_syscalls_metadata[];

static struct syscall_metadata **syscalls_metadata;

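/*
 * Look up the syscall_metadata entry for the syscall handler at
 * @syscall.  The handler's symbol name is resolved via kallsyms and
 * compared past the "sys"/"SyS" prefix (see the comment in the loop).
 */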
static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
{
	struct syscall_metadata *start;
	struct syscall_metadata *stop;
	char str[KSYM_SYMBOL_LEN];


	start = (struct syscall_metadata *)__start_syscalls_metadata;
	stop = (struct syscall_metadata *)__stop_syscalls_metadata;
	kallsyms_lookup(syscall, NULL, NULL, NULL, str);

	for ( ; start < stop; start++) {
		/*
		 * Only compare after the "sys" prefix. Archs that use
		 * syscall wrappers may have syscalls symbols aliases prefixed
		 * with "SyS" instead of "sys", leading to an unwanted
		 * mismatch.
		 */
		if (start->name && !strcmp(start->name + 3, str + 3))
			return start;
	}
	return NULL;
}

static struct syscall_metadata *syscall_nr_to_meta(int nr)
{
	if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
		return NULL;

	return syscalls_metadata[nr];
}

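/*
 * Format a syscall entry event for the trace output as
 * "name(arg0: val, arg1: val, ...)".  With the verbose trace option
 * set, each value is prefixed with its parameter type.
 */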
enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags)
{
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_enter *trace;
	struct syscall_metadata *entry;
	int i, ret, syscall;

	trace = (typeof(trace))ent;
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);

	if (!entry)
		goto end;

	if (entry->enter_event->id != ent->type) {
		WARN_ON_ONCE(1);
		goto end;
	}

	ret = trace_seq_printf(s, "%s(", entry->name);
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	for (i = 0; i < entry->nb_args; i++) {
		/* parameter types */
		if (trace_flags & TRACE_ITER_VERBOSE) {
			ret = trace_seq_printf(s, "%s ", entry->types[i]);
			if (!ret)
				return TRACE_TYPE_PARTIAL_LINE;
		}
		/* parameter values */
		ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
				       trace->args[i],
				       i == entry->nb_args - 1 ? "" : ", ");
		if (!ret)
			return TRACE_TYPE_PARTIAL_LINE;
	}

	ret = trace_seq_putc(s, ')');
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

end:
	ret = trace_seq_putc(s, '\n');
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	return TRACE_TYPE_HANDLED;
}

enum print_line_t
print_syscall_exit(struct trace_iterator *iter, int flags)
{
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_exit *trace;
	int syscall;
	struct syscall_metadata *entry;
	int ret;

	trace = (typeof(trace))ent;
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);

	if (!entry) {
		trace_seq_printf(s, "\n");
		return TRACE_TYPE_HANDLED;
	}

	if (entry->exit_event->id != ent->type) {
		WARN_ON_ONCE(1);
		return TRACE_TYPE_UNHANDLED;
	}

	ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
				trace->ret);
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	return TRACE_TYPE_HANDLED;
}

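/*
 * SYSCALL_FIELD() expands to the type/name/offset/size/is_signed
 * argument list expected by trace_define_field().  __bad_type_size()
 * is never defined, so a size mismatch between the field and its
 * declared type shows up as a link error.
 */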
extern char *__bad_type_size(void);

#define SYSCALL_FIELD(type, name)					\
	sizeof(type) != sizeof(trace.name) ?				\
		__bad_type_size() :					\
		#type, #name, offsetof(typeof(trace), name),		\
		sizeof(trace.name), is_signed_type(type)

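/*
 * Build the "print fmt" string exposed for syscall entry events: a
 * quoted format string with one hex conversion per argument, followed
 * by the matching "REC->arg" accessors.  Called once with a NULL
 * buffer to size the allocation, then again to fill it in.
 */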
static
int  __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
{
	int i;
	int pos = 0;

	/* When len=0, we just calculate the needed length */
#define LEN_OR_ZERO (len ? len - pos : 0)

	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
	for (i = 0; i < entry->nb_args; i++) {
		pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
				entry->args[i], sizeof(unsigned long),
				i == entry->nb_args - 1 ? "" : ", ");
	}
	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");

	for (i = 0; i < entry->nb_args; i++) {
		pos += snprintf(buf + pos, LEN_OR_ZERO,
				", ((unsigned long)(REC->%s))", entry->args[i]);
	}

#undef LEN_OR_ZERO

	/* return the length of print_fmt */
	return pos;
}

static int set_syscall_print_fmt(struct ftrace_event_call *call)
{
	char *print_fmt;
	int len;
	struct syscall_metadata *entry = call->data;

	if (entry->enter_event != call) {
		call->print_fmt = "\"0x%lx\", REC->ret";
		return 0;
	}

	/* First: called with 0 length to calculate the needed length */
	len = __set_enter_print_fmt(entry, NULL, 0);

	print_fmt = kmalloc(len + 1, GFP_KERNEL);
	if (!print_fmt)
		return -ENOMEM;

	/* Second: actually write the @print_fmt */
	__set_enter_print_fmt(entry, print_fmt, len + 1);
	call->print_fmt = print_fmt;

	return 0;
}

static void free_syscall_print_fmt(struct ftrace_event_call *call)
{
	struct syscall_metadata *entry = call->data;

	if (entry->enter_event == call)
		kfree(call->print_fmt);
}

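/*
 * Describe the fields of a syscall entry event to the event/filter
 * core: the syscall number followed by one unsigned long per argument.
 */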
int syscall_enter_define_fields(struct ftrace_event_call *call)
{
	struct syscall_trace_enter trace;
	struct syscall_metadata *meta = call->data;
	int ret;
	int i;
	int offset = offsetof(typeof(trace), args);

	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
	if (ret)
		return ret;

	for (i = 0; i < meta->nb_args; i++) {
		ret = trace_define_field(call, meta->types[i],
					 meta->args[i], offset,
					 sizeof(unsigned long), 0,
					 FILTER_OTHER);
		offset += sizeof(unsigned long);
	}

	return ret;
}

int syscall_exit_define_fields(struct ftrace_event_call *call)
{
	struct syscall_trace_exit trace;
	int ret;

	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
	if (ret)
		return ret;

	ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
				 FILTER_OTHER);

	return ret;
}

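/*
 * Probe attached to the sys_enter tracepoint: record the syscall
 * number and its arguments into the ftrace ring buffer, unless the
 * event is discarded by the filter.
 */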
void ftrace_syscall_enter(struct pt_regs *regs, long id)
{
	struct syscall_trace_enter *entry;
	struct syscall_metadata *sys_data;
	struct ring_buffer_event *event;
	struct ring_buffer *buffer;
	int size;
	int syscall_nr;

	syscall_nr = syscall_get_nr(current, regs);
	if (syscall_nr < 0)
		return;
	if (!test_bit(syscall_nr, enabled_enter_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;

	event = trace_current_buffer_lock_reserve(&buffer,
			sys_data->enter_event->id, size, 0, 0);
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	entry->nr = syscall_nr;
	syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);

	if (!filter_current_check_discard(buffer, sys_data->enter_event,
					  entry, event))
		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}

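/*
 * Probe attached to the sys_exit tracepoint: record the syscall
 * number and its return value into the ftrace ring buffer.
 */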
void ftrace_syscall_exit(struct pt_regs *regs, long ret)
{
	struct syscall_trace_exit *entry;
	struct syscall_metadata *sys_data;
	struct ring_buffer_event *event;
	struct ring_buffer *buffer;
	int syscall_nr;

	syscall_nr = syscall_get_nr(current, regs);
	if (syscall_nr < 0)
		return;
	if (!test_bit(syscall_nr, enabled_exit_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	event = trace_current_buffer_lock_reserve(&buffer,
			sys_data->exit_event->id, sizeof(*entry), 0, 0);
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	entry->nr = syscall_nr;
	entry->ret = syscall_get_return_value(current, regs);

	if (!filter_current_check_discard(buffer, sys_data->exit_event,
					  entry, event))
		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}

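/*
 * The reg/unreg helpers below refcount how many syscall events are
 * enabled so the sys_enter/sys_exit tracepoint probes are registered
 * only while at least one event is active; per-syscall state lives in
 * the enabled_*_syscalls bitmaps.
 */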
int reg_event_syscall_enter(struct ftrace_event_call *call)
{
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (num < 0 || num >= NR_syscalls)
		return -ENOSYS;
	mutex_lock(&syscall_trace_lock);
	if (!sys_refcount_enter)
		ret = register_trace_sys_enter(ftrace_syscall_enter);
	if (!ret) {
		set_bit(num, enabled_enter_syscalls);
		sys_refcount_enter++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

void unreg_event_syscall_enter(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (num < 0 || num >= NR_syscalls)
		return;
	mutex_lock(&syscall_trace_lock);
	sys_refcount_enter--;
	clear_bit(num, enabled_enter_syscalls);
	if (!sys_refcount_enter)
		unregister_trace_sys_enter(ftrace_syscall_enter);
	mutex_unlock(&syscall_trace_lock);
}

int reg_event_syscall_exit(struct ftrace_event_call *call)
{
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (num < 0 || num >= NR_syscalls)
		return -ENOSYS;
	mutex_lock(&syscall_trace_lock);
	if (!sys_refcount_exit)
		ret = register_trace_sys_exit(ftrace_syscall_exit);
	if (!ret) {
		set_bit(num, enabled_exit_syscalls);
		sys_refcount_exit++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

void unreg_event_syscall_exit(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (num < 0 || num >= NR_syscalls)
		return;
	mutex_lock(&syscall_trace_lock);
	sys_refcount_exit--;
	clear_bit(num, enabled_exit_syscalls);
	if (!sys_refcount_exit)
		unregister_trace_sys_exit(ftrace_syscall_exit);
	mutex_unlock(&syscall_trace_lock);
}

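/*
 * Initialize one syscall event: build its print format string and
 * register it via trace_event_raw_init(), freeing the format string
 * again if registration fails.
 */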
int init_syscall_trace(struct ftrace_event_call *call)
{
	int id;

	if (set_syscall_print_fmt(call) < 0)
		return -ENOMEM;

	id = trace_event_raw_init(call);

	if (id < 0) {
		free_syscall_print_fmt(call);
		return id;
	}

	return id;
}

unsigned long __init arch_syscall_addr(int nr)
{
	return (unsigned long)sys_call_table[nr];
}

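/*
 * Boot-time setup: allocate the NR_syscalls-sized lookup table and
 * fill in the metadata entry (if any) for each syscall number.
 */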
int __init init_ftrace_syscalls(void)
{
	struct syscall_metadata *meta;
	unsigned long addr;
	int i;

	syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
					NR_syscalls, GFP_KERNEL);
	if (!syscalls_metadata) {
		WARN_ON(1);
		return -ENOMEM;
	}

	for (i = 0; i < NR_syscalls; i++) {
		addr = arch_syscall_addr(i);
		meta = find_syscall_meta(addr);
		if (!meta)
			continue;

		meta->syscall_nr = i;
		syscalls_metadata[i] = meta;
	}

	return 0;
}
core_initcall(init_ftrace_syscalls);

#ifdef CONFIG_PERF_EVENTS

static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;

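/*
 * perf flavour of the sys_enter probe: copy the syscall number and
 * arguments into a perf trace buffer, padded so that the record plus
 * its u32 size field stays u64-aligned.
 */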
static void perf_syscall_enter(struct pt_regs *regs, long id)
{
	struct syscall_metadata *sys_data;
	struct syscall_trace_enter *rec;
	unsigned long flags;
	int syscall_nr;
	int rctx;
	int size;

	syscall_nr = syscall_get_nr(current, regs);
	if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	/* get the size after alignment with the u32 buffer size field */
	size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
	size = ALIGN(size + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);

	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
		      "perf buffer not large enough"))
		return;

	rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
				sys_data->enter_event->id, &rctx, &flags);
	if (!rec)
		return;

	rec->nr = syscall_nr;
	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
			       (unsigned long *)&rec->args);
	perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
}

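/*
 * Enable perf tracing of syscall entry for one event, refcounting the
 * underlying sys_enter tracepoint registration under
 * syscall_trace_lock (perf_sysenter_disable() undoes this).
 */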
int perf_sysenter_enable(struct ftrace_event_call *call)
{
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	mutex_lock(&syscall_trace_lock);
	if (!sys_perf_refcount_enter)
		ret = register_trace_sys_enter(perf_syscall_enter);
	if (ret) {
		pr_info("event trace: Could not activate "
				"syscall entry trace point");
	} else {
		set_bit(num, enabled_perf_enter_syscalls);
		sys_perf_refcount_enter++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

void perf_sysenter_disable(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	mutex_lock(&syscall_trace_lock);
	sys_perf_refcount_enter--;
	clear_bit(num, enabled_perf_enter_syscalls);
	if (!sys_perf_refcount_enter)
		unregister_trace_sys_enter(perf_syscall_enter);
	mutex_unlock(&syscall_trace_lock);
}

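/*
 * perf flavour of the sys_exit probe: record the syscall number and
 * its return value into a perf trace buffer.
 */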
static void perf_syscall_exit(struct pt_regs *regs, long ret)
{
	struct syscall_metadata *sys_data;
	struct syscall_trace_exit *rec;
	unsigned long flags;
	int syscall_nr;
	int rctx;
	int size;

	syscall_nr = syscall_get_nr(current, regs);
	if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	/* We can probably do that at build time */
	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);

	/*
	 * Impossible at the moment, but stay paranoid about future
	 * changes.  Can this check be moved out of the runtime path?
	 */
	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
		"exit event has grown above perf buffer size"))
		return;

	rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
				sys_data->exit_event->id, &rctx, &flags);
	if (!rec)
		return;

	rec->nr = syscall_nr;
	rec->ret = syscall_get_return_value(current, regs);

	perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
}

int perf_sysexit_enable(struct ftrace_event_call *call)
{
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	mutex_lock(&syscall_trace_lock);
	if (!sys_perf_refcount_exit)
		ret = register_trace_sys_exit(perf_syscall_exit);
	if (ret) {
		pr_info("event trace: Could not activate "
				"syscall exit trace point");
	} else {
		set_bit(num, enabled_perf_exit_syscalls);
		sys_perf_refcount_exit++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

void perf_sysexit_disable(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	mutex_lock(&syscall_trace_lock);
	sys_perf_refcount_exit--;
	clear_bit(num, enabled_perf_exit_syscalls);
	if (!sys_perf_refcount_exit)
		unregister_trace_sys_exit(perf_syscall_exit);
	mutex_unlock(&syscall_trace_lock);
}

#endif /* CONFIG_PERF_EVENTS */