/* trace_syscalls.c - ftrace/perf infrastructure for the raw syscall tracepoints */
#include <trace/syscall.h>
2
#include <trace/events/syscalls.h>
3
#include <linux/slab.h>
4
#include <linux/kernel.h>
5
#include <linux/ftrace.h>
6
#include <linux/perf_event.h>
7
8
9
10
11
#include <asm/syscall.h>

#include "trace_output.h"
#include "trace.h"

12
/* Serializes probe (un)registration and the refcounts/bitmaps below. */
static DEFINE_MUTEX(syscall_trace_lock);

/* Number of syscalls currently enabled for the ftrace enter/exit probes. */
static int sys_refcount_enter;
static int sys_refcount_exit;

/* Per-syscall-number enable bitmaps for the ftrace path. */
static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
17

18
19
20
21
22
static int syscall_enter_register(struct ftrace_event_call *event,
				 enum trace_reg type);
static int syscall_exit_register(struct ftrace_event_call *event,
				 enum trace_reg type);

23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
static int syscall_enter_define_fields(struct ftrace_event_call *call);
static int syscall_exit_define_fields(struct ftrace_event_call *call);

/* Return the trace-field list attached to a syscall-entry event. */
static struct list_head *
syscall_get_enter_fields(struct ftrace_event_call *call)
{
	struct syscall_metadata *meta = call->data;

	return &meta->enter_fields;
}

/* Return the trace-field list attached to a syscall-exit event. */
static struct list_head *
syscall_get_exit_fields(struct ftrace_event_call *call)
{
	struct syscall_metadata *meta = call->data;

	return &meta->exit_fields;
}

42
43
/*
 * Event class shared by all syscall-entry trace events: wires the generic
 * event core to the syscall-specific register/define-fields callbacks.
 */
struct ftrace_event_class event_class_syscall_enter = {
	.system			= "syscalls",
	.reg			= syscall_enter_register,
	.define_fields		= syscall_enter_define_fields,
	.get_fields		= syscall_get_enter_fields,
	.raw_init		= init_syscall_trace,
};

/*
 * Event class shared by all syscall-exit trace events; mirrors
 * event_class_syscall_enter with the exit-side callbacks.
 */
struct ftrace_event_class event_class_syscall_exit = {
	.system			= "syscalls",
	.reg			= syscall_exit_register,
	.define_fields		= syscall_exit_define_fields,
	.get_fields		= syscall_get_exit_fields,
	.raw_init		= init_syscall_trace,
};

/* Bounds of the syscall metadata records emitted into their own section. */
extern unsigned long __start_syscalls_metadata[];
extern unsigned long __stop_syscalls_metadata[];

/* Table indexed by syscall number; populated by init_ftrace_syscalls(). */
static struct syscall_metadata **syscalls_metadata;

/*
 * Find the build-time metadata record whose recorded name matches the
 * symbol at @syscall's address, or NULL if there is none.
 */
static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
{
	struct syscall_metadata *meta;
	struct syscall_metadata *stop;
	char str[KSYM_SYMBOL_LEN];

	meta = (struct syscall_metadata *)__start_syscalls_metadata;
	stop = (struct syscall_metadata *)__stop_syscalls_metadata;
	kallsyms_lookup(syscall, NULL, NULL, NULL, str);

	while (meta < stop) {
		/*
		 * Only compare after the "sys" prefix. Archs that use
		 * syscall wrappers may have syscalls symbols aliases prefixed
		 * with "SyS" instead of "sys", leading to an unwanted
		 * mismatch.
		 */
		if (meta->name && !strcmp(meta->name + 3, str + 3))
			return meta;
		meta++;
	}
	return NULL;
}

/* Map a syscall number to its metadata record; NULL when out of range
 * or when the lookup table has not been set up. */
static struct syscall_metadata *syscall_nr_to_meta(int nr)
{
	if (nr < 0 || nr >= NR_syscalls || !syscalls_metadata)
		return NULL;

	return syscalls_metadata[nr];
}

/*
 * Format one syscall-entry event as "name(arg: val, ...)".  With
 * TRACE_ITER_VERBOSE set, each argument value is prefixed by its C type.
 * Returns TRACE_TYPE_PARTIAL_LINE when the seq buffer fills up mid-line.
 */
enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags)
{
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_enter *trace;
	struct syscall_metadata *entry;
	int i, ret, syscall;

	trace = (typeof(trace))ent;
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);

	/* Unknown syscall: emit just the trailing newline. */
	if (!entry)
		goto end;

	if (entry->enter_event->id != ent->type) {
		/* Metadata/event-id mismatch should be impossible. */
		WARN_ON_ONCE(1);
		goto end;
	}

	ret = trace_seq_printf(s, "%s(", entry->name);
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	for (i = 0; i < entry->nb_args; i++) {
		/* parameter types */
		if (trace_flags & TRACE_ITER_VERBOSE) {
			ret = trace_seq_printf(s, "%s ", entry->types[i]);
			if (!ret)
				return TRACE_TYPE_PARTIAL_LINE;
		}
		/* parameter values */
		ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
				       trace->args[i],
				       i == entry->nb_args - 1 ? "" : ", ");
		if (!ret)
			return TRACE_TYPE_PARTIAL_LINE;
	}

	ret = trace_seq_putc(s, ')');
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

end:
	ret =  trace_seq_putc(s, '\n');
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	return TRACE_TYPE_HANDLED;
}

/*
 * Format one syscall-exit event as "name -> 0x<ret>".  Unknown syscalls
 * print a bare newline; an event-id mismatch is reported as unhandled.
 */
enum print_line_t
print_syscall_exit(struct trace_iterator *iter, int flags)
{
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_exit *trace;
	int syscall;
	struct syscall_metadata *entry;
	int ret;

	trace = (typeof(trace))ent;
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);

	if (!entry) {
		trace_seq_printf(s, "\n");
		return TRACE_TYPE_HANDLED;
	}

	if (entry->exit_event->id != ent->type) {
		/* Metadata/event-id mismatch should be impossible. */
		WARN_ON_ONCE(1);
		return TRACE_TYPE_UNHANDLED;
	}

	ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
				trace->ret);
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	return TRACE_TYPE_HANDLED;
}

/* Never defined: referencing it turns a size mismatch into a link error. */
extern char *__bad_type_size(void);

/*
 * Expand to the argument list trace_define_field() expects for a member
 * of the local `trace` struct, with a compile/link-time check that
 * sizeof(type) matches the actual member size.
 */
#define SYSCALL_FIELD(type, name)					\
	sizeof(type) != sizeof(trace.name) ?				\
		__bad_type_size() :					\
		#type, #name, offsetof(typeof(trace), name),		\
		sizeof(trace.name), is_signed_type(type)
/*
 * Build the print_fmt string for a syscall-entry event into @buf.
 * Called once with len == 0 to size the buffer, then again to fill it.
 * Returns the number of characters the full string needs.
 */
static int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf,
				 int len)
{
	int idx;
	int used = 0;

	/* When len=0, we just calculate the needed length */
#define LEN_OR_ZERO (len ? len - used : 0)

	/* Quoted format half: "arg1: 0x%08lx, arg2: 0x%08lx, ..." */
	used += snprintf(buf + used, LEN_OR_ZERO, "\"");
	for (idx = 0; idx < entry->nb_args; idx++) {
		used += snprintf(buf + used, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
				entry->args[idx], sizeof(unsigned long),
				idx == entry->nb_args - 1 ? "" : ", ");
	}
	used += snprintf(buf + used, LEN_OR_ZERO, "\"");

	/* Value half: one REC-> accessor per syscall argument. */
	for (idx = 0; idx < entry->nb_args; idx++) {
		used += snprintf(buf + used, LEN_OR_ZERO,
				", ((unsigned long)(REC->%s))",
				entry->args[idx]);
	}

#undef LEN_OR_ZERO

	/* return the length of print_fmt */
	return used;
}

/*
 * Install @call->print_fmt.  Exit events share one static format string;
 * entry events get a kmalloc'd string built by __set_enter_print_fmt()
 * (freed later by free_syscall_print_fmt()).  Returns 0 or -ENOMEM.
 */
static int set_syscall_print_fmt(struct ftrace_event_call *call)
{
	char *print_fmt;
	int len;
	struct syscall_metadata *entry = call->data;

	if (entry->enter_event != call) {
		/* Exit event: static string, nothing to allocate. */
		call->print_fmt = "\"0x%lx\", REC->ret";
		return 0;
	}

	/* First: called with 0 length to calculate the needed length */
	len = __set_enter_print_fmt(entry, NULL, 0);

	print_fmt = kmalloc(len + 1, GFP_KERNEL);
	if (!print_fmt)
		return -ENOMEM;

	/* Second: actually write the @print_fmt */
	__set_enter_print_fmt(entry, print_fmt, len + 1);
	call->print_fmt = print_fmt;

	return 0;
}

/*
 * Undo set_syscall_print_fmt().  Only entry events own an allocated
 * print_fmt; exit events point at a static string that must not be freed.
 */
static void free_syscall_print_fmt(struct ftrace_event_call *call)
{
	struct syscall_metadata *meta = call->data;

	if (meta->enter_event != call)
		return;

	kfree(call->print_fmt);
}

248
/*
 * Define the trace fields of a syscall-entry event: the syscall number
 * followed by one unsigned-long slot per syscall argument.
 */
static int syscall_enter_define_fields(struct ftrace_event_call *call)
{
	struct syscall_trace_enter trace;
	struct syscall_metadata *meta = call->data;
	int ret;
	int i;
	int offset = offsetof(typeof(trace), args);

	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
	if (ret)
		return ret;

	for (i = 0; i < meta->nb_args; i++) {
		ret = trace_define_field(call, meta->types[i],
					 meta->args[i], offset,
					 sizeof(unsigned long), 0,
					 FILTER_OTHER);
		/*
		 * NOTE(review): ret is overwritten each iteration, so only a
		 * failure on the last argument is reported to the caller —
		 * presumably best-effort by design; confirm before changing.
		 */
		offset += sizeof(unsigned long);
	}

	return ret;
}

271
/*
 * Define the trace fields of a syscall-exit event: the syscall number
 * and the (long) return value.
 */
static int syscall_exit_define_fields(struct ftrace_event_call *call)
{
	struct syscall_trace_exit trace;
	int ret;

	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
	if (ret)
		return ret;

	ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
				 FILTER_OTHER);

	return ret;
}

286
/*
 * Tracepoint probe for raw syscall entry (ftrace path): records the
 * syscall number and its arguments into the current ring buffer, unless
 * this syscall is not enabled or the event filter discards it.
 */
void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
	struct syscall_trace_enter *entry;
	struct syscall_metadata *sys_data;
	struct ring_buffer_event *event;
	struct ring_buffer *buffer;
	int size;
	int syscall_nr;

	syscall_nr = syscall_get_nr(current, regs);
	/* Negative means "no syscall" (e.g. restart); never index the bitmap. */
	if (syscall_nr < 0)
		return;
	if (!test_bit(syscall_nr, enabled_enter_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	/* Payload: fixed header plus one unsigned long per argument. */
	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;

	event = trace_current_buffer_lock_reserve(&buffer,
			sys_data->enter_event->id, size, 0, 0);
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	entry->nr = syscall_nr;
	syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);

	/* Commit unless the event filter drops this record. */
	if (!filter_current_check_discard(buffer, sys_data->enter_event,
					  entry, event))
		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}

321
/*
 * Tracepoint probe for raw syscall exit (ftrace path): records the
 * syscall number and return value into the current ring buffer.
 */
void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
	struct syscall_trace_exit *entry;
	struct syscall_metadata *sys_data;
	struct ring_buffer_event *event;
	struct ring_buffer *buffer;
	int syscall_nr;

	syscall_nr = syscall_get_nr(current, regs);
	/* Negative means "no syscall"; never index the bitmap with it. */
	if (syscall_nr < 0)
		return;
	if (!test_bit(syscall_nr, enabled_exit_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	event = trace_current_buffer_lock_reserve(&buffer,
			sys_data->exit_event->id, sizeof(*entry), 0, 0);
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	entry->nr = syscall_nr;
	entry->ret = syscall_get_return_value(current, regs);

	/* Commit unless the event filter drops this record. */
	if (!filter_current_check_discard(buffer, sys_data->exit_event,
					  entry, event))
		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}

353
/*
 * Enable ftrace recording of one syscall's entry event.  The shared
 * tracepoint probe is registered when the first syscall is enabled.
 * Returns 0, -ENOSYS for an unknown syscall, or the registration error.
 */
int reg_event_syscall_enter(struct ftrace_event_call *call)
{
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (num < 0 || num >= NR_syscalls)
		return -ENOSYS;
	mutex_lock(&syscall_trace_lock);
	if (!sys_refcount_enter)
		ret = register_trace_sys_enter(ftrace_syscall_enter, NULL);
	if (!ret) {
		set_bit(num, enabled_enter_syscalls);
		sys_refcount_enter++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

372
/*
 * Disable ftrace recording of one syscall's entry event; unregisters the
 * shared probe when the last enabled syscall goes away.
 */
void unreg_event_syscall_enter(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (num < 0 || num >= NR_syscalls)
		return;
	mutex_lock(&syscall_trace_lock);
	sys_refcount_enter--;
	clear_bit(num, enabled_enter_syscalls);
	if (!sys_refcount_enter)
		unregister_trace_sys_enter(ftrace_syscall_enter, NULL);
	mutex_unlock(&syscall_trace_lock);
}
386

387
/*
 * Enable ftrace recording of one syscall's exit event.  Mirrors
 * reg_event_syscall_enter() for the exit tracepoint.
 */
int reg_event_syscall_exit(struct ftrace_event_call *call)
{
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (num < 0 || num >= NR_syscalls)
		return -ENOSYS;
	mutex_lock(&syscall_trace_lock);
	if (!sys_refcount_exit)
		ret = register_trace_sys_exit(ftrace_syscall_exit, NULL);
	if (!ret) {
		set_bit(num, enabled_exit_syscalls);
		sys_refcount_exit++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}
405

406
/*
 * Disable ftrace recording of one syscall's exit event; unregisters the
 * shared probe when the last enabled syscall goes away.
 */
void unreg_event_syscall_exit(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (num < 0 || num >= NR_syscalls)
		return;
	mutex_lock(&syscall_trace_lock);
	sys_refcount_exit--;
	clear_bit(num, enabled_exit_syscalls);
	if (!sys_refcount_exit)
		unregister_trace_sys_exit(ftrace_syscall_exit, NULL);
	mutex_unlock(&syscall_trace_lock);
}
420

int init_syscall_trace(struct ftrace_event_call *call)
{
	int id;

425
426
427
	if (set_syscall_print_fmt(call) < 0)
		return -ENOMEM;

428
429
430
	id = trace_event_raw_init(call);

	if (id < 0) {
431
		free_syscall_print_fmt(call);
432
		return id;
433
	}
434
435

	return id;
436
437
}

438
439
440
441
442
/* Return the handler address for syscall @nr from the arch syscall table. */
unsigned long __init arch_syscall_addr(int nr)
{
	return (unsigned long)sys_call_table[nr];
}

/*
 * Boot-time initialization: allocate the syscall-number -> metadata
 * lookup table and populate it by matching each syscall table entry
 * against the metadata records emitted into the __syscalls_metadata
 * section.  Syscalls without metadata simply stay NULL in the table.
 */
int __init init_ftrace_syscalls(void)
{
	struct syscall_metadata *meta;
	unsigned long addr;
	int i;

	/* kcalloc zeroes the table and checks the n * size multiplication
	 * for overflow, unlike the open-coded kzalloc(sizeof * n) form. */
	syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata),
				    GFP_KERNEL);
	if (!syscalls_metadata) {
		WARN_ON(1);
		return -ENOMEM;
	}

	for (i = 0; i < NR_syscalls; i++) {
		addr = arch_syscall_addr(i);
		meta = find_syscall_meta(addr);
		if (!meta)
			continue;

		meta->syscall_nr = i;
		syscalls_metadata[i] = meta;
	}

	return 0;
}
core_initcall(init_ftrace_syscalls);

470
#ifdef CONFIG_PERF_EVENTS
471

472
473
474
475
/* Per-syscall enable bitmaps and probe refcounts for the perf path. */
static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;
476

477
static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
478
479
{
	struct syscall_metadata *sys_data;
480
481
	struct syscall_trace_enter *rec;
	unsigned long flags;
482
	int syscall_nr;
483
	int rctx;
484
	int size;
485
486

	syscall_nr = syscall_get_nr(current, regs);
487
	if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
488
489
490
491
492
493
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

494
495
496
497
498
	/* get the size after alignment with the u32 buffer size field */
	size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
	size = ALIGN(size + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);

499
500
	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
		      "perf buffer not large enough"))
501
502
		return;

503
	rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
504
505
506
				sys_data->enter_event->id, &rctx, &flags);
	if (!rec)
		return;
507
508
509
510

	rec->nr = syscall_nr;
	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
			       (unsigned long *)&rec->args);
511
	perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
512
513
}

514
int perf_sysenter_enable(struct ftrace_event_call *call)
515
516
517
518
{
	int ret = 0;
	int num;

519
	num = ((struct syscall_metadata *)call->data)->syscall_nr;
520
521

	mutex_lock(&syscall_trace_lock);
522
	if (!sys_perf_refcount_enter)
523
		ret = register_trace_sys_enter(perf_syscall_enter, NULL);
524
525
526
527
	if (ret) {
		pr_info("event trace: Could not activate"
				"syscall entry trace point");
	} else {
528
529
		set_bit(num, enabled_perf_enter_syscalls);
		sys_perf_refcount_enter++;
530
531
532
533
534
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

535
/*
 * Disable perf recording of one syscall's entry event; unregisters the
 * shared probe when the last enabled syscall goes away.
 */
void perf_sysenter_disable(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	mutex_lock(&syscall_trace_lock);
	sys_perf_refcount_enter--;
	clear_bit(num, enabled_perf_enter_syscalls);
	if (!sys_perf_refcount_enter)
		unregister_trace_sys_enter(perf_syscall_enter, NULL);
	mutex_unlock(&syscall_trace_lock);
}

549
static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
550
551
{
	struct syscall_metadata *sys_data;
552
553
	struct syscall_trace_exit *rec;
	unsigned long flags;
554
	int syscall_nr;
555
	int rctx;
556
	int size;
557
558

	syscall_nr = syscall_get_nr(current, regs);
559
	if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
560
561
562
563
564
565
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

566
567
568
	/* We can probably do that at build time */
	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);
569

570
571
572
573
	/*
	 * Impossible, but be paranoid with the future
	 * How to put this check outside runtime?
	 */
574
575
	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
		"exit event has grown above perf buffer size"))
576
577
		return;

578
	rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
579
580
581
				sys_data->exit_event->id, &rctx, &flags);
	if (!rec)
		return;
582
583
584
585

	rec->nr = syscall_nr;
	rec->ret = syscall_get_return_value(current, regs);

586
	perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
587
588
}

589
int perf_sysexit_enable(struct ftrace_event_call *call)
590
591
592
593
{
	int ret = 0;
	int num;

594
	num = ((struct syscall_metadata *)call->data)->syscall_nr;
595
596

	mutex_lock(&syscall_trace_lock);
597
	if (!sys_perf_refcount_exit)
598
		ret = register_trace_sys_exit(perf_syscall_exit, NULL);
599
600
	if (ret) {
		pr_info("event trace: Could not activate"
601
				"syscall exit trace point");
602
	} else {
603
604
		set_bit(num, enabled_perf_exit_syscalls);
		sys_perf_refcount_exit++;
605
606
607
608
609
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

610
/*
 * Disable perf recording of one syscall's exit event; unregisters the
 * shared probe when the last enabled syscall goes away.
 */
void perf_sysexit_disable(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	mutex_lock(&syscall_trace_lock);
	sys_perf_refcount_exit--;
	clear_bit(num, enabled_perf_exit_syscalls);
	if (!sys_perf_refcount_exit)
		unregister_trace_sys_exit(perf_syscall_exit, NULL);
	mutex_unlock(&syscall_trace_lock);
}

624
#endif /* CONFIG_PERF_EVENTS */
/*
 * Event-class reg hook: dispatch enable/disable requests for a
 * syscall-entry event to the ftrace or perf backend.
 */
static int syscall_enter_register(struct ftrace_event_call *event,
				 enum trace_reg type)
{
	switch (type) {
	case TRACE_REG_REGISTER:
		return reg_event_syscall_enter(event);
	case TRACE_REG_UNREGISTER:
		unreg_event_syscall_enter(event);
		return 0;

#ifdef CONFIG_PERF_EVENTS
	case TRACE_REG_PERF_REGISTER:
		return perf_sysenter_enable(event);
	case TRACE_REG_PERF_UNREGISTER:
		perf_sysenter_disable(event);
		return 0;
#endif
	}
	return 0;
}

/*
 * Event-class reg hook: dispatch enable/disable requests for a
 * syscall-exit event to the ftrace or perf backend.
 */
static int syscall_exit_register(struct ftrace_event_call *event,
				 enum trace_reg type)
{
	switch (type) {
	case TRACE_REG_REGISTER:
		return reg_event_syscall_exit(event);
	case TRACE_REG_UNREGISTER:
		unreg_event_syscall_exit(event);
		return 0;

#ifdef CONFIG_PERF_EVENTS
	case TRACE_REG_PERF_REGISTER:
		return perf_sysexit_enable(event);
	case TRACE_REG_PERF_UNREGISTER:
		perf_sysexit_disable(event);
		return 0;
#endif
	}
	return 0;
}