trace_syscalls.c 16.5 KB
Newer Older
1
#include <trace/syscall.h>
2
#include <trace/events/syscalls.h>
3
#include <linux/slab.h>
4
#include <linux/kernel.h>
5
#include <linux/ftrace.h>
6
#include <linux/perf_event.h>
7
8
9
10
11
#include <asm/syscall.h>

#include "trace_output.h"
#include "trace.h"

12
static DEFINE_MUTEX(syscall_trace_lock);
13
14
static int sys_refcount_enter;
static int sys_refcount_exit;
15
16
static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
17

18
19
20
21
22
static int syscall_enter_register(struct ftrace_event_call *event,
				 enum trace_reg type);
static int syscall_exit_register(struct ftrace_event_call *event,
				 enum trace_reg type);

23
24
25
26
27
28
29
30
31
32
33
static int syscall_enter_define_fields(struct ftrace_event_call *call);
static int syscall_exit_define_fields(struct ftrace_event_call *call);

/*
 * Fetch the per-syscall field list used for a sys_enter event.  Enter
 * events carry per-syscall argument fields, so each syscall's metadata
 * owns its own list rather than sharing the event class one.
 */
static struct list_head *
syscall_get_enter_fields(struct ftrace_event_call *call)
{
	struct syscall_metadata *meta = call->data;

	return &meta->enter_fields;
}

34
/* Output callbacks used to render sys_enter events in the trace output. */
struct trace_event_functions enter_syscall_print_funcs = {
	.trace		= print_syscall_enter,
};

/* Output callbacks used to render sys_exit events in the trace output. */
struct trace_event_functions exit_syscall_print_funcs = {
	.trace		= print_syscall_exit,
};

42
/*
 * Event class shared by every sys_enter event.  Enter events have
 * per-syscall argument fields, so ->get_fields is used to return the
 * field list from each syscall's metadata instead of a shared list.
 */
struct ftrace_event_class event_class_syscall_enter = {
	.system		= "syscalls",
	.reg		= syscall_enter_register,
	.define_fields	= syscall_enter_define_fields,
	.get_fields	= syscall_get_enter_fields,
	.raw_init	= init_syscall_trace,
};

/*
 * Event class shared by every sys_exit event.  All exit events have the
 * same fields (nr, ret), so a single static field list on the class is
 * enough — no ->get_fields hook needed.
 */
struct ftrace_event_class event_class_syscall_exit = {
	.system		= "syscalls",
	.reg		= syscall_exit_register,
	.define_fields	= syscall_exit_define_fields,
	.fields		= LIST_HEAD_INIT(event_class_syscall_exit.fields),
	.raw_init	= init_syscall_trace,
};

58
59
extern struct syscall_metadata *__start_syscalls_metadata[];
extern struct syscall_metadata *__stop_syscalls_metadata[];
60
61
62

static struct syscall_metadata **syscalls_metadata;

63
64
65
66
67
68
69
70
71
72
73
74
75
#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
/*
 * Compare a kallsyms symbol against a syscall metadata name, skipping
 * the three-character prefix.  Archs that use syscall wrappers alias
 * "sys_foo" as "SyS_foo"; comparing past the prefix keeps those
 * aliases from spuriously mismatching.
 */
static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
{
	return strcmp(sym + 3, name + 3) == 0;
}
#endif

76
77
static __init struct syscall_metadata *
find_syscall_meta(unsigned long syscall)
78
{
79
80
	struct syscall_metadata **start;
	struct syscall_metadata **stop;
81
82
83
	char str[KSYM_SYMBOL_LEN];


84
85
	start = __start_syscalls_metadata;
	stop = __stop_syscalls_metadata;
86
87
	kallsyms_lookup(syscall, NULL, NULL, NULL, str);

88
89
90
	if (arch_syscall_match_sym_name(str, "sys_ni_syscall"))
		return NULL;

91
	for ( ; start < stop; start++) {
92
		if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name))
93
			return *start;
94
95
96
97
98
99
100
101
102
103
104
105
	}
	return NULL;
}

/*
 * Map a syscall number to its metadata entry.  Returns NULL when the
 * number is out of range or the lookup table has not been built yet
 * (init_ftrace_syscalls() not run, or its allocation failed).
 */
static struct syscall_metadata *syscall_nr_to_meta(int nr)
{
	if (nr < 0 || nr >= NR_syscalls || !syscalls_metadata)
		return NULL;

	return syscalls_metadata[nr];
}

106
/*
 * Render a sys_enter trace entry as "name(arg: val, ...)".  With the
 * verbose trace flag set, each value is prefixed by its C type.
 * Returns TRACE_TYPE_PARTIAL_LINE as soon as the trace_seq buffer
 * fills, TRACE_TYPE_HANDLED otherwise.
 */
enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags,
		    struct trace_event *event)
{
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_enter *trace;
	struct syscall_metadata *entry;
	int i, ret, syscall;

	trace = (typeof(trace))ent;
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);

	/* No metadata: still emit the newline so output stays line-based. */
	if (!entry)
		goto end;

	/* Metadata and ring-buffer entry disagreeing is a bug upstream. */
	if (entry->enter_event->event.type != ent->type) {
		WARN_ON_ONCE(1);
		goto end;
	}

	ret = trace_seq_printf(s, "%s(", entry->name);
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	for (i = 0; i < entry->nb_args; i++) {
		/* parameter types */
		if (trace_flags & TRACE_ITER_VERBOSE) {
			ret = trace_seq_printf(s, "%s ", entry->types[i]);
			if (!ret)
				return TRACE_TYPE_PARTIAL_LINE;
		}
		/* parameter values */
		ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
				       trace->args[i],
				       i == entry->nb_args - 1 ? "" : ", ");
		if (!ret)
			return TRACE_TYPE_PARTIAL_LINE;
	}

	ret = trace_seq_putc(s, ')');
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

end:
	ret =  trace_seq_putc(s, '\n');
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	return TRACE_TYPE_HANDLED;
}

/*
 * Render a sys_exit trace entry as "name -> 0xret".  Entries without
 * metadata produce a bare newline; a type mismatch between metadata
 * and the buffered entry is warned about and left unhandled.
 */
enum print_line_t
print_syscall_exit(struct trace_iterator *iter, int flags,
		   struct trace_event *event)
{
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_exit *trace;
	int syscall;
	struct syscall_metadata *entry;
	int ret;

	trace = (typeof(trace))ent;
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);

	if (!entry) {
		trace_seq_printf(s, "\n");
		return TRACE_TYPE_HANDLED;
	}

	if (entry->exit_event->event.type != ent->type) {
		WARN_ON_ONCE(1);
		return TRACE_TYPE_UNHANDLED;
	}

	ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
				trace->ret);
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	return TRACE_TYPE_HANDLED;
}

192
193
194
195
196
/*
 * Deliberately never defined: referencing it at runtime would be a link
 * error, which turns a type-size mismatch in SYSCALL_FIELD() into a
 * build failure instead of a silently wrong field definition.
 */
extern char *__bad_type_size(void);

/*
 * Expand to the (name, offset, size, signedness) argument list that
 * trace_define_field() expects for member 'name' of the local 'trace'
 * struct, after statically checking that sizeof(type) matches.
 */
#define SYSCALL_FIELD(type, name)					\
	sizeof(type) != sizeof(trace.name) ?				\
		__bad_type_size() :					\
		#type, #name, offsetof(typeof(trace), name),		\
		sizeof(trace.name), is_signed_type(type)
199

200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
/*
 * Build the print_fmt string for a sys_enter event: a quoted format of
 * "arg: 0x%08lx, ..." followed by the REC-> accessor list.  Called once
 * with len == 0 to size the buffer, then again to fill it.  Returns the
 * length of the generated string, excluding the trailing NUL.
 */
static
int  __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
{
	int pos = 0;
	int i;

	/* With len == 0 nothing is written; pos still counts the length. */
#define LEN_OR_ZERO (len ? len - pos : 0)

	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
	for (i = 0; i < entry->nb_args; i++) {
		const char *sep = (i == entry->nb_args - 1) ? "" : ", ";

		pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
				entry->args[i], sizeof(unsigned long), sep);
	}
	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");

	for (i = 0; i < entry->nb_args; i++) {
		pos += snprintf(buf + pos, LEN_OR_ZERO,
				", ((unsigned long)(REC->%s))", entry->args[i]);
	}

#undef LEN_OR_ZERO

	return pos;
}

/*
 * Install call->print_fmt.  Exit events all share one static literal;
 * enter events get a kmalloc'd per-syscall format built by
 * __set_enter_print_fmt().  Returns 0 or -ENOMEM.
 */
static int set_syscall_print_fmt(struct ftrace_event_call *call)
{
	struct syscall_metadata *entry = call->data;
	char *print_fmt;
	int len;

	/* Exit event: one literal fits all, nothing to allocate. */
	if (entry->enter_event != call) {
		call->print_fmt = "\"0x%lx\", REC->ret";
		return 0;
	}

	/* Dry run to size the buffer, then render for real. */
	len = __set_enter_print_fmt(entry, NULL, 0);
	print_fmt = kmalloc(len + 1, GFP_KERNEL);
	if (!print_fmt)
		return -ENOMEM;

	__set_enter_print_fmt(entry, print_fmt, len + 1);
	call->print_fmt = print_fmt;

	return 0;
}

/*
 * Release call->print_fmt.  Only enter events own a kmalloc'd format
 * (see set_syscall_print_fmt()); exit events point at a string literal
 * that must not be freed.
 */
static void free_syscall_print_fmt(struct ftrace_event_call *call)
{
	struct syscall_metadata *entry = call->data;

	if (entry->enter_event != call)
		return;

	kfree(call->print_fmt);
}

261
/*
 * Register the filterable fields of a sys_enter event: the syscall nr
 * followed by one unsigned-long slot per argument, laid out exactly as
 * in struct syscall_trace_enter.
 */
static int syscall_enter_define_fields(struct ftrace_event_call *call)
{
	struct syscall_trace_enter trace;
	struct syscall_metadata *meta = call->data;
	int ret;
	int i;
	/* Arguments start right after the common header + nr field. */
	int offset = offsetof(typeof(trace), args);

	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
	if (ret)
		return ret;

	for (i = 0; i < meta->nb_args; i++) {
		/*
		 * NOTE(review): ret is not checked inside the loop, so a
		 * mid-loop failure keeps defining later fields and only the
		 * last status is returned — confirm this is intentional.
		 */
		ret = trace_define_field(call, meta->types[i],
					 meta->args[i], offset,
					 sizeof(unsigned long), 0,
					 FILTER_OTHER);
		offset += sizeof(unsigned long);
	}

	return ret;
}

284
/*
 * Register the filterable fields of a sys_exit event: the syscall nr
 * and the long return value, matching struct syscall_trace_exit.
 */
static int syscall_exit_define_fields(struct ftrace_event_call *call)
{
	struct syscall_trace_exit trace;
	int ret;

	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
	if (ret)
		return ret;

	ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
				 FILTER_OTHER);

	return ret;
}

299
/*
 * Probe attached to the raw sys_enter tracepoint.  Records a
 * syscall_trace_enter entry (nr + arguments) into the current trace
 * buffer for syscalls enabled in the enter bitmap.
 */
void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
	struct syscall_trace_enter *entry;
	struct syscall_metadata *sys_data;
	struct ring_buffer_event *event;
	struct ring_buffer *buffer;
	int size;
	int syscall_nr;

	syscall_nr = syscall_get_nr(current, regs);
	/* Negative nr: not a real syscall entry; nothing to record. */
	if (syscall_nr < 0)
		return;
	if (!test_bit(syscall_nr, enabled_enter_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	/* Entry is variable-sized: header plus one slot per argument. */
	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;

	event = trace_current_buffer_lock_reserve(&buffer,
			sys_data->enter_event->event.type, size, 0, 0);
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	entry->nr = syscall_nr;
	syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);

	/* Commit unless the event filter discards this entry. */
	if (!filter_current_check_discard(buffer, sys_data->enter_event,
					  entry, event))
		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}

334
/*
 * Probe attached to the raw sys_exit tracepoint.  Records a fixed-size
 * syscall_trace_exit entry (nr + return value) into the current trace
 * buffer for syscalls enabled in the exit bitmap.
 */
void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
	struct syscall_trace_exit *entry;
	struct syscall_metadata *sys_data;
	struct ring_buffer_event *event;
	struct ring_buffer *buffer;
	int syscall_nr;

	syscall_nr = syscall_get_nr(current, regs);
	/* Negative nr: not a real syscall exit; nothing to record. */
	if (syscall_nr < 0)
		return;
	if (!test_bit(syscall_nr, enabled_exit_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	event = trace_current_buffer_lock_reserve(&buffer,
			sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	entry->nr = syscall_nr;
	entry->ret = syscall_get_return_value(current, regs);

	/* Commit unless the event filter discards this entry. */
	if (!filter_current_check_discard(buffer, sys_data->exit_event,
					  entry, event))
		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}

366
/*
 * Enable ftrace tracing of one syscall's entry event.  The first
 * enabled syscall registers the shared sys_enter tracepoint probe;
 * the bitmap then selects which syscalls the probe records.
 * Returns 0 on success, -ENOSYS for unmapped syscalls, or the
 * tracepoint registration error.
 */
int reg_event_syscall_enter(struct ftrace_event_call *call)
{
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return -ENOSYS;
	mutex_lock(&syscall_trace_lock);
	/* Register the probe only on the 0 -> 1 refcount transition. */
	if (!sys_refcount_enter)
		ret = register_trace_sys_enter(ftrace_syscall_enter, NULL);
	if (!ret) {
		set_bit(num, enabled_enter_syscalls);
		sys_refcount_enter++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

385
/*
 * Disable ftrace tracing of one syscall's entry event; the shared
 * sys_enter probe is unregistered when the last user goes away.
 */
void unreg_event_syscall_enter(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return;
	mutex_lock(&syscall_trace_lock);
	sys_refcount_enter--;
	clear_bit(num, enabled_enter_syscalls);
	/* Last user gone: drop the tracepoint probe entirely. */
	if (!sys_refcount_enter)
		unregister_trace_sys_enter(ftrace_syscall_enter, NULL);
	mutex_unlock(&syscall_trace_lock);
}
399

400
/*
 * Enable ftrace tracing of one syscall's exit event.  Mirrors
 * reg_event_syscall_enter(): first user registers the shared sys_exit
 * probe, the bitmap selects which syscalls are recorded.
 */
int reg_event_syscall_exit(struct ftrace_event_call *call)
{
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return -ENOSYS;
	mutex_lock(&syscall_trace_lock);
	/* Register the probe only on the 0 -> 1 refcount transition. */
	if (!sys_refcount_exit)
		ret = register_trace_sys_exit(ftrace_syscall_exit, NULL);
	if (!ret) {
		set_bit(num, enabled_exit_syscalls);
		sys_refcount_exit++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}
418

419
/*
 * Disable ftrace tracing of one syscall's exit event; the shared
 * sys_exit probe is unregistered when the last user goes away.
 */
void unreg_event_syscall_exit(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return;
	mutex_lock(&syscall_trace_lock);
	sys_refcount_exit--;
	clear_bit(num, enabled_exit_syscalls);
	/* Last user gone: drop the tracepoint probe entirely. */
	if (!sys_refcount_exit)
		unregister_trace_sys_exit(ftrace_syscall_exit, NULL);
	mutex_unlock(&syscall_trace_lock);
}
433

434
435
436
/*
 * ->raw_init callback for both syscall event classes: build the event's
 * print_fmt and register it with the trace event core.  Returns the
 * event id (>= 0) on success or a negative errno; on registration
 * failure the allocated print_fmt is released.
 */
int init_syscall_trace(struct ftrace_event_call *call)
{
	int id;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (num < 0 || num >= NR_syscalls) {
		pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
				((struct syscall_metadata *)call->data)->name);
		return -ENOSYS;
	}

	if (set_syscall_print_fmt(call) < 0)
		return -ENOMEM;

	id = trace_event_raw_init(call);

	if (id < 0) {
		/* Don't leak the print_fmt built above on failure. */
		free_syscall_print_fmt(call);
		return id;
	}

	return id;
}

459
/*
 * Default mapping from syscall nr to handler address via the arch's
 * sys_call_table.  Weak so architectures with a different table layout
 * can override it.
 */
unsigned long __init __weak arch_syscall_addr(int nr)
{
	return (unsigned long)sys_call_table[nr];
}

464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
/*
 * Boot-time setup: build the nr -> metadata lookup table used by
 * syscall_nr_to_meta(), resolving each table slot through the arch's
 * syscall address.  Slots without metadata (unimplemented syscalls)
 * stay NULL.  Returns 0 on success, -ENOMEM on allocation failure.
 */
int __init init_ftrace_syscalls(void)
{
	struct syscall_metadata *meta;
	unsigned long addr;
	int i;

	/*
	 * kcalloc() zeroes the table and checks the NR_syscalls *
	 * sizeof() multiplication for overflow, unlike the open-coded
	 * kzalloc(sizeof(*p) * NR_syscalls, ...) it replaces.
	 */
	syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata),
				    GFP_KERNEL);
	if (!syscalls_metadata) {
		WARN_ON(1);
		return -ENOMEM;
	}

	for (i = 0; i < NR_syscalls; i++) {
		addr = arch_syscall_addr(i);
		meta = find_syscall_meta(addr);
		if (!meta)
			continue;

		meta->syscall_nr = i;
		syscalls_metadata[i] = meta;
	}

	return 0;
}
core_initcall(init_ftrace_syscalls);

491
#ifdef CONFIG_PERF_EVENTS
492

493
494
495
496
static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;
497

498
static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
499
500
{
	struct syscall_metadata *sys_data;
501
	struct syscall_trace_enter *rec;
502
	struct hlist_head *head;
503
	int syscall_nr;
504
	int rctx;
505
	int size;
506
507

	syscall_nr = syscall_get_nr(current, regs);
508
	if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
509
510
511
512
513
514
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

515
516
517
518
519
	/* get the size after alignment with the u32 buffer size field */
	size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
	size = ALIGN(size + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);

520
521
	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
		      "perf buffer not large enough"))
522
523
		return;

524
	rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
525
				sys_data->enter_event->event.type, regs, &rctx);
526
527
	if (!rec)
		return;
528
529
530
531

	rec->nr = syscall_nr;
	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
			       (unsigned long *)&rec->args);
532

533
	head = this_cpu_ptr(sys_data->enter_event->perf_events);
534
	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
535
536
}

537
int perf_sysenter_enable(struct ftrace_event_call *call)
538
539
540
541
{
	int ret = 0;
	int num;

542
	num = ((struct syscall_metadata *)call->data)->syscall_nr;
543
544

	mutex_lock(&syscall_trace_lock);
545
	if (!sys_perf_refcount_enter)
546
		ret = register_trace_sys_enter(perf_syscall_enter, NULL);
547
548
549
550
	if (ret) {
		pr_info("event trace: Could not activate"
				"syscall entry trace point");
	} else {
551
552
		set_bit(num, enabled_perf_enter_syscalls);
		sys_perf_refcount_enter++;
553
554
555
556
557
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

558
/*
 * Disable perf tracing of one syscall's entry event; the shared perf
 * sys_enter probe is unregistered when the last user goes away.
 */
void perf_sysenter_disable(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	mutex_lock(&syscall_trace_lock);
	sys_perf_refcount_enter--;
	clear_bit(num, enabled_perf_enter_syscalls);
	/* Last user gone: drop the tracepoint probe entirely. */
	if (!sys_perf_refcount_enter)
		unregister_trace_sys_enter(perf_syscall_enter, NULL);
	mutex_unlock(&syscall_trace_lock);
}

572
static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
573
574
{
	struct syscall_metadata *sys_data;
575
	struct syscall_trace_exit *rec;
576
	struct hlist_head *head;
577
	int syscall_nr;
578
	int rctx;
579
	int size;
580
581

	syscall_nr = syscall_get_nr(current, regs);
582
	if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
583
584
585
586
587
588
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

589
590
591
	/* We can probably do that at build time */
	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);
592

593
594
595
596
	/*
	 * Impossible, but be paranoid with the future
	 * How to put this check outside runtime?
	 */
597
598
	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
		"exit event has grown above perf buffer size"))
599
600
		return;

601
	rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
602
				sys_data->exit_event->event.type, regs, &rctx);
603
604
	if (!rec)
		return;
605
606
607
608

	rec->nr = syscall_nr;
	rec->ret = syscall_get_return_value(current, regs);

609
	head = this_cpu_ptr(sys_data->exit_event->perf_events);
610
	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
611
612
}

613
int perf_sysexit_enable(struct ftrace_event_call *call)
614
615
616
617
{
	int ret = 0;
	int num;

618
	num = ((struct syscall_metadata *)call->data)->syscall_nr;
619
620

	mutex_lock(&syscall_trace_lock);
621
	if (!sys_perf_refcount_exit)
622
		ret = register_trace_sys_exit(perf_syscall_exit, NULL);
623
624
	if (ret) {
		pr_info("event trace: Could not activate"
625
				"syscall exit trace point");
626
	} else {
627
628
		set_bit(num, enabled_perf_exit_syscalls);
		sys_perf_refcount_exit++;
629
630
631
632
633
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

634
/*
 * Disable perf tracing of one syscall's exit event; the shared perf
 * sys_exit probe is unregistered when the last user goes away.
 */
void perf_sysexit_disable(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	mutex_lock(&syscall_trace_lock);
	sys_perf_refcount_exit--;
	clear_bit(num, enabled_perf_exit_syscalls);
	/* Last user gone: drop the tracepoint probe entirely. */
	if (!sys_perf_refcount_exit)
		unregister_trace_sys_exit(perf_syscall_exit, NULL);
	mutex_unlock(&syscall_trace_lock);
}

648
#endif /* CONFIG_PERF_EVENTS */
649

650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
/*
 * ->reg callback for the sys_enter event class: dispatch ftrace and
 * perf (un)registration requests to the matching helper.  Unknown
 * request types are ignored and report success.
 */
static int syscall_enter_register(struct ftrace_event_call *event,
				 enum trace_reg type)
{
	if (type == TRACE_REG_REGISTER)
		return reg_event_syscall_enter(event);
	if (type == TRACE_REG_UNREGISTER) {
		unreg_event_syscall_enter(event);
		return 0;
	}
#ifdef CONFIG_PERF_EVENTS
	if (type == TRACE_REG_PERF_REGISTER)
		return perf_sysenter_enable(event);
	if (type == TRACE_REG_PERF_UNREGISTER) {
		perf_sysenter_disable(event);
		return 0;
	}
#endif
	return 0;
}

/*
 * ->reg callback for the sys_exit event class: dispatch ftrace and
 * perf (un)registration requests to the matching helper.  Unknown
 * request types are ignored and report success.
 */
static int syscall_exit_register(struct ftrace_event_call *event,
				 enum trace_reg type)
{
	if (type == TRACE_REG_REGISTER)
		return reg_event_syscall_exit(event);
	if (type == TRACE_REG_UNREGISTER) {
		unreg_event_syscall_exit(event);
		return 0;
	}
#ifdef CONFIG_PERF_EVENTS
	if (type == TRACE_REG_PERF_REGISTER)
		return perf_sysexit_enable(event);
	if (type == TRACE_REG_PERF_UNREGISTER) {
		perf_sysexit_disable(event);
		return 0;
	}
#endif
	return 0;
}