/* trace_syscalls.c - ftrace/perf support for syscall enter/exit trace events */
1
#include <trace/syscall.h>
2
#include <trace/events/syscalls.h>
3
#include <linux/slab.h>
4
#include <linux/kernel.h>
5
#include <linux/ftrace.h>
6
#include <linux/perf_event.h>
7
8
9
10
11
#include <asm/syscall.h>

#include "trace_output.h"
#include "trace.h"

12
/* Serializes updates to the refcounts and enable-bitmaps below. */
static DEFINE_MUTEX(syscall_trace_lock);

/* Number of events currently registered on the enter/exit tracepoints. */
static int sys_refcount_enter;
static int sys_refcount_exit;

/* One bit per syscall nr: set when ftrace tracing of it is enabled. */
static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);

/* Common event class shared by all syscall trace events. */
struct ftrace_event_class event_class_syscalls = {
	.system			= "syscalls"
};

22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
/* Boundaries of the syscall metadata table emitted into its own section
 * at build time. */
extern unsigned long __start_syscalls_metadata[];
extern unsigned long __stop_syscalls_metadata[];

/* Lookup table indexed by syscall nr, built at boot; entries may be NULL. */
static struct syscall_metadata **syscalls_metadata;

/*
 * Find the metadata entry whose recorded syscall name matches the kernel
 * symbol at address @syscall.  Returns NULL when no entry matches.
 */
static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
{
	struct syscall_metadata *start;
	struct syscall_metadata *stop;
	char str[KSYM_SYMBOL_LEN];


	start = (struct syscall_metadata *)__start_syscalls_metadata;
	stop = (struct syscall_metadata *)__stop_syscalls_metadata;
	/* Resolve the symbol name for the syscall's address. */
	kallsyms_lookup(syscall, NULL, NULL, NULL, str);

	for ( ; start < stop; start++) {
		/*
		 * Only compare after the "sys" prefix. Archs that use
		 * syscall wrappers may have syscalls symbols aliases prefixed
		 * with "SyS" instead of "sys", leading to an unwanted
		 * mismatch.
		 */
		if (start->name && !strcmp(start->name + 3, str + 3))
			return start;
	}
	return NULL;
}

/*
 * Map a syscall number to its metadata entry.
 * Returns NULL for out-of-range numbers or before the lookup table
 * has been initialized.
 */
static struct syscall_metadata *syscall_nr_to_meta(int nr)
{
	if (nr < 0 || nr >= NR_syscalls)
		return NULL;
	if (!syscalls_metadata)
		return NULL;

	return syscalls_metadata[nr];
}

59
60
61
62
63
64
65
66
67
/*
 * Pretty-print a syscall-enter trace entry as "name(arg: val, ...)".
 * With TRACE_ITER_VERBOSE the argument C types are printed as well.
 * Returns TRACE_TYPE_PARTIAL_LINE when the seq buffer fills up,
 * TRACE_TYPE_HANDLED otherwise (unknown syscalls just get a newline).
 */
enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags)
{
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_enter *trace;
	struct syscall_metadata *entry;
	int i, ret, syscall;

	trace = (typeof(trace))ent;
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);

	/* Unknown syscall nr: nothing to decode, just terminate the line. */
	if (!entry)
		goto end;

	/* The recorded event type must match this syscall's enter event. */
	if (entry->enter_event->id != ent->type) {
		WARN_ON_ONCE(1);
		goto end;
	}

	ret = trace_seq_printf(s, "%s(", entry->name);
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	for (i = 0; i < entry->nb_args; i++) {
		/* parameter types */
		if (trace_flags & TRACE_ITER_VERBOSE) {
			ret = trace_seq_printf(s, "%s ", entry->types[i]);
			if (!ret)
				return TRACE_TYPE_PARTIAL_LINE;
		}
		/* parameter values */
		ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
				       trace->args[i],
				       i == entry->nb_args - 1 ? "" : ", ");
		if (!ret)
			return TRACE_TYPE_PARTIAL_LINE;
	}

	ret = trace_seq_putc(s, ')');
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

end:
	ret =  trace_seq_putc(s, '\n');
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	return TRACE_TYPE_HANDLED;
}

/*
 * Pretty-print a syscall-exit trace entry as "name -> 0xret".
 * Returns TRACE_TYPE_PARTIAL_LINE when the seq buffer fills up,
 * TRACE_TYPE_UNHANDLED on an event-type mismatch, TRACE_TYPE_HANDLED
 * otherwise.
 */
enum print_line_t
print_syscall_exit(struct trace_iterator *iter, int flags)
{
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_exit *trace;
	int syscall;
	struct syscall_metadata *entry;
	int ret;

	trace = (typeof(trace))ent;
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);

	/* Unknown syscall nr: emit an empty line and carry on. */
	if (!entry) {
		trace_seq_printf(s, "\n");
		return TRACE_TYPE_HANDLED;
	}

	/* The recorded event type must match this syscall's exit event. */
	if (entry->exit_event->id != ent->type) {
		WARN_ON_ONCE(1);
		return TRACE_TYPE_UNHANDLED;
	}

	ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
				trace->ret);
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	return TRACE_TYPE_HANDLED;
}

143
144
145
146
147
/* Never defined anywhere: referencing it fails the link, turning the
 * sizeof mismatch below into a build-time error. */
extern char *__bad_type_size(void);

/*
 * Expand to the argument list trace_define_field() expects for a field
 * of the on-stack 'trace' struct: type string, name string, offset,
 * size and signedness.  The sizeof comparison is a compile/link-time
 * sanity check that the declared type matches the struct field.
 */
#define SYSCALL_FIELD(type, name)					\
	sizeof(type) != sizeof(trace.name) ?				\
		__bad_type_size() :					\
		#type, #name, offsetof(typeof(trace), name),		\
		sizeof(trace.name), is_signed_type(type)

151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
/*
 * Build (or just size) the print_fmt string for a syscall-enter event:
 * a quoted format with one "arg: 0x%08lx" slot per argument, followed
 * by one ", ((unsigned long)(REC->arg))" accessor per argument.
 * With len == 0 nothing is written and only the required length is
 * computed.  Returns the length of print_fmt, excluding the NUL.
 */
static
int  __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
{
	int used = 0;
	int idx;

	/* When len=0, we just calculate the needed length */
#define LEN_OR_ZERO (len ? len - used : 0)

	used += snprintf(buf + used, LEN_OR_ZERO, "\"");
	for (idx = 0; idx < entry->nb_args; idx++)
		used += snprintf(buf + used, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
				 entry->args[idx], sizeof(unsigned long),
				 idx == entry->nb_args - 1 ? "" : ", ");
	used += snprintf(buf + used, LEN_OR_ZERO, "\"");

	for (idx = 0; idx < entry->nb_args; idx++)
		used += snprintf(buf + used, LEN_OR_ZERO,
				 ", ((unsigned long)(REC->%s))",
				 entry->args[idx]);

#undef LEN_OR_ZERO

	/* return the length of print_fmt */
	return used;
}

/*
 * Install call->print_fmt.  Exit events all share one static format
 * string; enter events get a kmalloc'ed one built per-syscall by
 * __set_enter_print_fmt().  Returns 0 on success or -ENOMEM.
 */
static int set_syscall_print_fmt(struct ftrace_event_call *call)
{
	struct syscall_metadata *entry = call->data;
	char *fmt;
	int needed;

	/* Exit events only ever print the return value. */
	if (entry->enter_event != call) {
		call->print_fmt = "\"0x%lx\", REC->ret";
		return 0;
	}

	/* First pass sizes the buffer, second pass fills it. */
	needed = __set_enter_print_fmt(entry, NULL, 0);

	fmt = kmalloc(needed + 1, GFP_KERNEL);
	if (!fmt)
		return -ENOMEM;

	__set_enter_print_fmt(entry, fmt, needed + 1);
	call->print_fmt = fmt;

	return 0;
}

/*
 * Free the print_fmt allocated for an enter event.  Exit events point
 * at a static string and must not be freed.
 */
static void free_syscall_print_fmt(struct ftrace_event_call *call)
{
	struct syscall_metadata *entry = call->data;

	if (entry->enter_event != call)
		return;

	kfree(call->print_fmt);
}

212
213
214
int syscall_enter_define_fields(struct ftrace_event_call *call)
{
	struct syscall_trace_enter trace;
215
	struct syscall_metadata *meta = call->data;
216
217
218
219
	int ret;
	int i;
	int offset = offsetof(typeof(trace), args);

220
221
222
223
	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
	if (ret)
		return ret;

224
	for (i = 0; i < meta->nb_args; i++) {
225
226
		ret = trace_define_field(call, meta->types[i],
					 meta->args[i], offset,
227
228
					 sizeof(unsigned long), 0,
					 FILTER_OTHER);
229
230
231
232
233
234
235
236
237
238
239
		offset += sizeof(unsigned long);
	}

	return ret;
}

int syscall_exit_define_fields(struct ftrace_event_call *call)
{
	struct syscall_trace_exit trace;
	int ret;

240
241
242
243
	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
	if (ret)
		return ret;

244
	ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
245
				 FILTER_OTHER);
246
247
248
249

	return ret;
}

250
/*
 * Tracepoint probe for syscall entry: record the syscall nr and its
 * arguments into the current trace ring buffer when tracing is enabled
 * for this syscall.
 */
void ftrace_syscall_enter(struct pt_regs *regs, long id)
{
	struct syscall_trace_enter *entry;
	struct syscall_metadata *sys_data;
	struct ring_buffer_event *event;
	struct ring_buffer *buffer;
	int size;
	int syscall_nr;

	syscall_nr = syscall_get_nr(current, regs);
	/* Negative means the arch could not map this to a syscall nr. */
	if (syscall_nr < 0)
		return;
	if (!test_bit(syscall_nr, enabled_enter_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	/* Variable-size record: header plus one long per argument. */
	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;

	event = trace_current_buffer_lock_reserve(&buffer,
			sys_data->enter_event->id, size, 0, 0);
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	entry->nr = syscall_nr;
	syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);

	/* Commit the reserved slot unless the event filter discards it. */
	if (!filter_current_check_discard(buffer, sys_data->enter_event,
					  entry, event))
		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}

285
/*
 * Tracepoint probe for syscall exit: record the syscall nr and its
 * return value into the current trace ring buffer when tracing is
 * enabled for this syscall.
 */
void ftrace_syscall_exit(struct pt_regs *regs, long ret)
{
	struct syscall_trace_exit *entry;
	struct syscall_metadata *sys_data;
	struct ring_buffer_event *event;
	struct ring_buffer *buffer;
	int syscall_nr;

	syscall_nr = syscall_get_nr(current, regs);
	/* Negative means the arch could not map this to a syscall nr. */
	if (syscall_nr < 0)
		return;
	if (!test_bit(syscall_nr, enabled_exit_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	event = trace_current_buffer_lock_reserve(&buffer,
			sys_data->exit_event->id, sizeof(*entry), 0, 0);
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	entry->nr = syscall_nr;
	entry->ret = syscall_get_return_value(current, regs);

	/* Commit the reserved slot unless the event filter discards it. */
	if (!filter_current_check_discard(buffer, sys_data->exit_event,
					  entry, event))
		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}

317
/*
 * Enable ftrace tracing of syscall entry for @call's syscall.  The
 * shared sys_enter tracepoint probe is registered on the first user.
 * Returns 0 on success, -ENOSYS for an unknown syscall nr, or the
 * tracepoint registration error.
 */
int reg_event_syscall_enter(struct ftrace_event_call *call)
{
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (num < 0 || num >= NR_syscalls)
		return -ENOSYS;
	mutex_lock(&syscall_trace_lock);
	if (!sys_refcount_enter)
		ret = register_trace_sys_enter(ftrace_syscall_enter);
	/* Only bump the refcount/bitmap if the probe is in place. */
	if (!ret) {
		set_bit(num, enabled_enter_syscalls);
		sys_refcount_enter++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

336
/*
 * Disable ftrace tracing of syscall entry for @call's syscall and
 * unregister the shared probe when the last user goes away.
 */
void unreg_event_syscall_enter(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (num < 0 || num >= NR_syscalls)
		return;
	mutex_lock(&syscall_trace_lock);
	sys_refcount_enter--;
	clear_bit(num, enabled_enter_syscalls);
	if (!sys_refcount_enter)
		unregister_trace_sys_enter(ftrace_syscall_enter);
	mutex_unlock(&syscall_trace_lock);
}
350

351
/*
 * Enable ftrace tracing of syscall exit for @call's syscall.  The
 * shared sys_exit tracepoint probe is registered on the first user.
 * Returns 0 on success, -ENOSYS for an unknown syscall nr, or the
 * tracepoint registration error.
 */
int reg_event_syscall_exit(struct ftrace_event_call *call)
{
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (num < 0 || num >= NR_syscalls)
		return -ENOSYS;
	mutex_lock(&syscall_trace_lock);
	if (!sys_refcount_exit)
		ret = register_trace_sys_exit(ftrace_syscall_exit);
	/* Only bump the refcount/bitmap if the probe is in place. */
	if (!ret) {
		set_bit(num, enabled_exit_syscalls);
		sys_refcount_exit++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}
369

370
/*
 * Disable ftrace tracing of syscall exit for @call's syscall and
 * unregister the shared probe when the last user goes away.
 */
void unreg_event_syscall_exit(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (num < 0 || num >= NR_syscalls)
		return;
	mutex_lock(&syscall_trace_lock);
	sys_refcount_exit--;
	clear_bit(num, enabled_exit_syscalls);
	if (!sys_refcount_exit)
		unregister_trace_sys_exit(ftrace_syscall_exit);
	mutex_unlock(&syscall_trace_lock);
}
384

385
386
387
388
int init_syscall_trace(struct ftrace_event_call *call)
{
	int id;

389
390
391
	if (set_syscall_print_fmt(call) < 0)
		return -ENOMEM;

392
393
394
	id = trace_event_raw_init(call);

	if (id < 0) {
395
		free_syscall_print_fmt(call);
396
		return id;
397
	}
398
399

	return id;
400
401
}

402
403
404
405
406
/*
 * Default implementation: the address of syscall @nr is its entry in
 * the architecture's sys_call_table.
 */
unsigned long __init arch_syscall_addr(int nr)
{
	return (unsigned long)sys_call_table[nr];
}

407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
/*
 * Build the syscall-nr -> metadata lookup table by resolving every
 * sys_call_table entry against the compiled-in metadata section.
 * Runs once at boot (core_initcall).  Returns 0 or -ENOMEM.
 */
int __init init_ftrace_syscalls(void)
{
	struct syscall_metadata *meta;
	unsigned long addr;
	int i;

	/*
	 * kcalloc zeroes the table and checks the count * size
	 * multiplication for overflow, unlike open-coded kzalloc(a * b).
	 */
	syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata),
				    GFP_KERNEL);
	if (!syscalls_metadata) {
		WARN_ON(1);
		return -ENOMEM;
	}

	for (i = 0; i < NR_syscalls; i++) {
		addr = arch_syscall_addr(i);
		meta = find_syscall_meta(addr);
		/* Syscalls without metadata simply stay NULL in the table. */
		if (!meta)
			continue;

		meta->syscall_nr = i;
		syscalls_metadata[i] = meta;
	}

	return 0;
}
core_initcall(init_ftrace_syscalls);

434
#ifdef CONFIG_PERF_EVENTS
435

436
437
438
439
/* Per-syscall enable bitmaps and user counts for the perf probes below;
 * guarded by syscall_trace_lock like their ftrace counterparts. */
static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;
440

441
static void perf_syscall_enter(struct pt_regs *regs, long id)
442
443
{
	struct syscall_metadata *sys_data;
444
445
	struct syscall_trace_enter *rec;
	unsigned long flags;
446
	int syscall_nr;
447
	int rctx;
448
	int size;
449
450

	syscall_nr = syscall_get_nr(current, regs);
451
	if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
452
453
454
455
456
457
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

458
459
460
461
462
	/* get the size after alignment with the u32 buffer size field */
	size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
	size = ALIGN(size + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);

463
464
	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
		      "perf buffer not large enough"))
465
466
		return;

467
	rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
468
469
470
				sys_data->enter_event->id, &rctx, &flags);
	if (!rec)
		return;
471
472
473
474

	rec->nr = syscall_nr;
	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
			       (unsigned long *)&rec->args);
475
	perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
476
477
}

478
int perf_sysenter_enable(struct ftrace_event_call *call)
479
480
481
482
{
	int ret = 0;
	int num;

483
	num = ((struct syscall_metadata *)call->data)->syscall_nr;
484
485

	mutex_lock(&syscall_trace_lock);
486
487
	if (!sys_perf_refcount_enter)
		ret = register_trace_sys_enter(perf_syscall_enter);
488
489
490
491
	if (ret) {
		pr_info("event trace: Could not activate"
				"syscall entry trace point");
	} else {
492
493
		set_bit(num, enabled_perf_enter_syscalls);
		sys_perf_refcount_enter++;
494
495
496
497
498
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

499
/*
 * Stop perf profiling of syscall entry for @call's syscall; the shared
 * probe is unregistered when the last perf user goes away.
 */
void perf_sysenter_disable(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	mutex_lock(&syscall_trace_lock);
	sys_perf_refcount_enter--;
	clear_bit(num, enabled_perf_enter_syscalls);
	if (!sys_perf_refcount_enter)
		unregister_trace_sys_enter(perf_syscall_enter);
	mutex_unlock(&syscall_trace_lock);
}

513
static void perf_syscall_exit(struct pt_regs *regs, long ret)
514
515
{
	struct syscall_metadata *sys_data;
516
517
	struct syscall_trace_exit *rec;
	unsigned long flags;
518
	int syscall_nr;
519
	int rctx;
520
	int size;
521
522

	syscall_nr = syscall_get_nr(current, regs);
523
	if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
524
525
526
527
528
529
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

530
531
532
	/* We can probably do that at build time */
	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);
533

534
535
536
537
	/*
	 * Impossible, but be paranoid with the future
	 * How to put this check outside runtime?
	 */
538
539
	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
		"exit event has grown above perf buffer size"))
540
541
		return;

542
	rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
543
544
545
				sys_data->exit_event->id, &rctx, &flags);
	if (!rec)
		return;
546
547
548
549

	rec->nr = syscall_nr;
	rec->ret = syscall_get_return_value(current, regs);

550
	perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
551
552
}

553
int perf_sysexit_enable(struct ftrace_event_call *call)
554
555
556
557
{
	int ret = 0;
	int num;

558
	num = ((struct syscall_metadata *)call->data)->syscall_nr;
559
560

	mutex_lock(&syscall_trace_lock);
561
562
	if (!sys_perf_refcount_exit)
		ret = register_trace_sys_exit(perf_syscall_exit);
563
564
	if (ret) {
		pr_info("event trace: Could not activate"
565
				"syscall exit trace point");
566
	} else {
567
568
		set_bit(num, enabled_perf_exit_syscalls);
		sys_perf_refcount_exit++;
569
570
571
572
573
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

574
/*
 * Stop perf profiling of syscall exit for @call's syscall; the shared
 * probe is unregistered when the last perf user goes away.
 */
void perf_sysexit_disable(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	mutex_lock(&syscall_trace_lock);
	sys_perf_refcount_exit--;
	clear_bit(num, enabled_perf_exit_syscalls);
	if (!sys_perf_refcount_exit)
		unregister_trace_sys_exit(perf_syscall_exit);
	mutex_unlock(&syscall_trace_lock);
}

588
#endif /* CONFIG_PERF_EVENTS */
589