#include <trace/syscall.h>
#include <trace/events/syscalls.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/ftrace.h>
#include <linux/perf_event.h>
#include <asm/syscall.h>

#include "trace_output.h"
#include "trace.h"

/* Serializes tracepoint (un)registration and the enable bitmaps below. */
static DEFINE_MUTEX(syscall_trace_lock);

/* Number of events currently using the sys_enter/sys_exit tracepoints. */
static int sys_refcount_enter;
static int sys_refcount_exit;

/* One bit per syscall nr: set while ftrace tracing of it is enabled. */
static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);

18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
/* Linker-provided bounds of the syscalls_metadata section. */
extern unsigned long __start_syscalls_metadata[];
extern unsigned long __stop_syscalls_metadata[];

/* Table indexed by syscall nr; populated by init_ftrace_syscalls(). */
static struct syscall_metadata **syscalls_metadata;

/*
 * Resolve @syscall (a handler address) to its syscall_metadata record by
 * looking up the symbol name and scanning the metadata section.
 * Returns NULL when no entry matches.
 */
static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
{
	struct syscall_metadata *meta;
	struct syscall_metadata *limit;
	char sym[KSYM_SYMBOL_LEN];

	meta = (struct syscall_metadata *)__start_syscalls_metadata;
	limit = (struct syscall_metadata *)__stop_syscalls_metadata;
	kallsyms_lookup(syscall, NULL, NULL, NULL, sym);

	while (meta < limit) {
		/*
		 * Only compare after the "sys" prefix. Archs that use
		 * syscall wrappers may have syscalls symbols aliases prefixed
		 * with "SyS" instead of "sys", leading to an unwanted
		 * mismatch.
		 */
		if (meta->name && !strcmp(meta->name + 3, sym + 3))
			return meta;
		meta++;
	}
	return NULL;
}

/* Map a syscall number to its metadata; NULL if out of range or unknown. */
static struct syscall_metadata *syscall_nr_to_meta(int nr)
{
	if (nr < 0 || nr >= NR_syscalls || !syscalls_metadata)
		return NULL;

	return syscalls_metadata[nr];
}

55
56
57
58
59
60
61
62
63
enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags)
{
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_enter *trace;
	struct syscall_metadata *entry;
	int i, ret, syscall;

64
	trace = (typeof(trace))ent;
65
66
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);
67

68
69
70
	if (!entry)
		goto end;

71
	if (entry->enter_event->id != ent->type) {
72
73
74
75
		WARN_ON_ONCE(1);
		goto end;
	}

76
77
78
79
80
81
	ret = trace_seq_printf(s, "%s(", entry->name);
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	for (i = 0; i < entry->nb_args; i++) {
		/* parameter types */
82
		if (trace_flags & TRACE_ITER_VERBOSE) {
83
84
85
86
87
			ret = trace_seq_printf(s, "%s ", entry->types[i]);
			if (!ret)
				return TRACE_TYPE_PARTIAL_LINE;
		}
		/* parameter values */
88
		ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
89
				       trace->args[i],
90
				       i == entry->nb_args - 1 ? "" : ", ");
91
92
93
94
		if (!ret)
			return TRACE_TYPE_PARTIAL_LINE;
	}

95
96
97
98
	ret = trace_seq_putc(s, ')');
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

99
end:
100
101
102
103
	ret =  trace_seq_putc(s, '\n');
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

104
105
106
107
108
109
110
111
112
113
114
115
116
	return TRACE_TYPE_HANDLED;
}

enum print_line_t
print_syscall_exit(struct trace_iterator *iter, int flags)
{
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_exit *trace;
	int syscall;
	struct syscall_metadata *entry;
	int ret;

117
	trace = (typeof(trace))ent;
118
119
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);
120

121
122
123
124
125
	if (!entry) {
		trace_seq_printf(s, "\n");
		return TRACE_TYPE_HANDLED;
	}

126
	if (entry->exit_event->id != ent->type) {
127
128
129
130
		WARN_ON_ONCE(1);
		return TRACE_TYPE_UNHANDLED;
	}

131
132
133
134
135
136
137
138
	ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
				trace->ret);
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	return TRACE_TYPE_HANDLED;
}

/*
 * Deliberately never defined: a reference to it turns a size mismatch
 * between the declared field type and the struct member into a
 * link-time error.
 */
extern char *__bad_type_size(void);

/*
 * Expand to the (type-string, name-string, offset, size, signedness)
 * argument list trace_define_field() expects for a member of the local
 * variable "trace", with the size sanity check above.
 */
#define SYSCALL_FIELD(type, name)					\
	sizeof(type) != sizeof(trace.name) ?				\
		__bad_type_size() :					\
		#type, #name, offsetof(typeof(trace), name),		\
		sizeof(trace.name), is_signed_type(type)

147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
static
int  __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
{
	int i;
	int pos = 0;

	/* When len=0, we just calculate the needed length */
#define LEN_OR_ZERO (len ? len - pos : 0)

	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
	for (i = 0; i < entry->nb_args; i++) {
		pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
				entry->args[i], sizeof(unsigned long),
				i == entry->nb_args - 1 ? "" : ", ");
	}
	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");

	for (i = 0; i < entry->nb_args; i++) {
		pos += snprintf(buf + pos, LEN_OR_ZERO,
				", ((unsigned long)(REC->%s))", entry->args[i]);
	}

#undef LEN_OR_ZERO

	/* return the length of print_fmt */
	return pos;
}

/*
 * Allocate and install call->print_fmt. Exit events share one static
 * format string; enter events get a per-syscall string built from
 * their metadata. Returns 0 or -ENOMEM.
 */
static int set_syscall_print_fmt(struct ftrace_event_call *call)
{
	struct syscall_metadata *entry = call->data;
	char *print_fmt;
	int len;

	if (entry->enter_event != call) {
		call->print_fmt = "\"0x%lx\", REC->ret";
		return 0;
	}

	/* First pass sizes the string, second pass fills it in. */
	len = __set_enter_print_fmt(entry, NULL, 0);

	print_fmt = kmalloc(len + 1, GFP_KERNEL);
	if (!print_fmt)
		return -ENOMEM;

	__set_enter_print_fmt(entry, print_fmt, len + 1);
	call->print_fmt = print_fmt;

	return 0;
}

/* Release a print_fmt installed by set_syscall_print_fmt(). */
static void free_syscall_print_fmt(struct ftrace_event_call *call)
{
	struct syscall_metadata *entry = call->data;

	/* Exit events point at a static string; only enter events kmalloc. */
	if (entry->enter_event == call)
		kfree(call->print_fmt);
}

208
209
210
int syscall_enter_define_fields(struct ftrace_event_call *call)
{
	struct syscall_trace_enter trace;
211
	struct syscall_metadata *meta = call->data;
212
213
214
215
	int ret;
	int i;
	int offset = offsetof(typeof(trace), args);

216
217
218
219
	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
	if (ret)
		return ret;

220
	for (i = 0; i < meta->nb_args; i++) {
221
222
		ret = trace_define_field(call, meta->types[i],
					 meta->args[i], offset,
223
224
					 sizeof(unsigned long), 0,
					 FILTER_OTHER);
225
226
227
228
229
230
231
232
233
234
235
		offset += sizeof(unsigned long);
	}

	return ret;
}

int syscall_exit_define_fields(struct ftrace_event_call *call)
{
	struct syscall_trace_exit trace;
	int ret;

236
237
238
239
	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
	if (ret)
		return ret;

240
	ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
241
				 FILTER_OTHER);
242
243
244
245

	return ret;
}

/*
 * sys_enter tracepoint probe: records one syscall_trace_enter event
 * (syscall nr plus raw arguments) into the current trace ring buffer.
 */
void ftrace_syscall_enter(struct pt_regs *regs, long id)
{
	struct syscall_trace_enter *entry;
	struct syscall_metadata *sys_data;
	struct ring_buffer_event *event;
	struct ring_buffer *buffer;
	int size;
	int syscall_nr;

	syscall_nr = syscall_get_nr(current, regs);
	if (syscall_nr < 0)
		return;
	/* Only record syscalls whose enter event is enabled. */
	if (!test_bit(syscall_nr, enabled_enter_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	/* Variable-size record: header plus one long per argument. */
	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;

	event = trace_current_buffer_lock_reserve(&buffer,
			sys_data->enter_event->id, size, 0, 0);
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	entry->nr = syscall_nr;
	syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);

	/* Commit the reserved event unless the event filter discards it. */
	if (!filter_current_check_discard(buffer, sys_data->enter_event,
					  entry, event))
		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}

/*
 * sys_exit tracepoint probe: records one syscall_trace_exit event
 * (syscall nr plus return value) into the current trace ring buffer.
 */
void ftrace_syscall_exit(struct pt_regs *regs, long ret)
{
	struct syscall_trace_exit *entry;
	struct syscall_metadata *sys_data;
	struct ring_buffer_event *event;
	struct ring_buffer *buffer;
	int syscall_nr;

	syscall_nr = syscall_get_nr(current, regs);
	if (syscall_nr < 0)
		return;
	/* Only record syscalls whose exit event is enabled. */
	if (!test_bit(syscall_nr, enabled_exit_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	event = trace_current_buffer_lock_reserve(&buffer,
			sys_data->exit_event->id, sizeof(*entry), 0, 0);
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	entry->nr = syscall_nr;
	entry->ret = syscall_get_return_value(current, regs);

	/* Commit the reserved event unless the event filter discards it. */
	if (!filter_current_check_discard(buffer, sys_data->exit_event,
					  entry, event))
		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}

313
int reg_event_syscall_enter(struct ftrace_event_call *call)
314
{
315
316
317
	int ret = 0;
	int num;

318
	num = ((struct syscall_metadata *)call->data)->syscall_nr;
319
	if (num < 0 || num >= NR_syscalls)
320
321
322
		return -ENOSYS;
	mutex_lock(&syscall_trace_lock);
	if (!sys_refcount_enter)
323
		ret = register_trace_sys_enter(ftrace_syscall_enter);
324
	if (!ret) {
325
326
327
328
329
		set_bit(num, enabled_enter_syscalls);
		sys_refcount_enter++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
330
331
}

332
void unreg_event_syscall_enter(struct ftrace_event_call *call)
333
{
334
	int num;
335

336
	num = ((struct syscall_metadata *)call->data)->syscall_nr;
337
	if (num < 0 || num >= NR_syscalls)
338
339
340
341
342
		return;
	mutex_lock(&syscall_trace_lock);
	sys_refcount_enter--;
	clear_bit(num, enabled_enter_syscalls);
	if (!sys_refcount_enter)
343
		unregister_trace_sys_enter(ftrace_syscall_enter);
344
345
	mutex_unlock(&syscall_trace_lock);
}
346

347
int reg_event_syscall_exit(struct ftrace_event_call *call)
348
{
349
350
351
	int ret = 0;
	int num;

352
	num = ((struct syscall_metadata *)call->data)->syscall_nr;
353
	if (num < 0 || num >= NR_syscalls)
354
355
356
		return -ENOSYS;
	mutex_lock(&syscall_trace_lock);
	if (!sys_refcount_exit)
357
		ret = register_trace_sys_exit(ftrace_syscall_exit);
358
	if (!ret) {
359
360
		set_bit(num, enabled_exit_syscalls);
		sys_refcount_exit++;
361
	}
362
363
364
	mutex_unlock(&syscall_trace_lock);
	return ret;
}
365

366
void unreg_event_syscall_exit(struct ftrace_event_call *call)
367
368
{
	int num;
369

370
	num = ((struct syscall_metadata *)call->data)->syscall_nr;
371
	if (num < 0 || num >= NR_syscalls)
372
373
374
375
376
		return;
	mutex_lock(&syscall_trace_lock);
	sys_refcount_exit--;
	clear_bit(num, enabled_exit_syscalls);
	if (!sys_refcount_exit)
377
		unregister_trace_sys_exit(ftrace_syscall_exit);
378
	mutex_unlock(&syscall_trace_lock);
379
}
380

381
382
383
384
int init_syscall_trace(struct ftrace_event_call *call)
{
	int id;

385
386
387
	if (set_syscall_print_fmt(call) < 0)
		return -ENOMEM;

388
389
390
	id = trace_event_raw_init(call);

	if (id < 0) {
391
		free_syscall_print_fmt(call);
392
		return id;
393
	}
394
395

	return id;
396
397
}

/* Map a syscall number to its handler address via the arch syscall table. */
unsigned long __init arch_syscall_addr(int nr)
{
	return (unsigned long)sys_call_table[nr];
}

403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
int __init init_ftrace_syscalls(void)
{
	struct syscall_metadata *meta;
	unsigned long addr;
	int i;

	syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
					NR_syscalls, GFP_KERNEL);
	if (!syscalls_metadata) {
		WARN_ON(1);
		return -ENOMEM;
	}

	for (i = 0; i < NR_syscalls; i++) {
		addr = arch_syscall_addr(i);
		meta = find_syscall_meta(addr);
419
420
421
422
		if (!meta)
			continue;

		meta->syscall_nr = i;
423
424
425
426
427
428
429
		syscalls_metadata[i] = meta;
	}

	return 0;
}
core_initcall(init_ftrace_syscalls);

#ifdef CONFIG_PERF_EVENTS

/* Per-syscall enable bitmaps and probe refcounts for the perf path. */
static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;

437
static void perf_syscall_enter(struct pt_regs *regs, long id)
438
439
{
	struct syscall_metadata *sys_data;
440
441
	struct syscall_trace_enter *rec;
	unsigned long flags;
442
	int syscall_nr;
443
	int rctx;
444
	int size;
445
446

	syscall_nr = syscall_get_nr(current, regs);
447
	if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
448
449
450
451
452
453
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

454
455
456
457
458
	/* get the size after alignment with the u32 buffer size field */
	size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
	size = ALIGN(size + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);

459
460
	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
		      "perf buffer not large enough"))
461
462
		return;

463
	rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
464
465
466
				sys_data->enter_event->id, &rctx, &flags);
	if (!rec)
		return;
467
468
469
470

	rec->nr = syscall_nr;
	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
			       (unsigned long *)&rec->args);
471
	perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
472
473
}

474
int perf_sysenter_enable(struct ftrace_event_call *call)
475
476
477
478
{
	int ret = 0;
	int num;

479
	num = ((struct syscall_metadata *)call->data)->syscall_nr;
480
481

	mutex_lock(&syscall_trace_lock);
482
483
	if (!sys_perf_refcount_enter)
		ret = register_trace_sys_enter(perf_syscall_enter);
484
485
486
487
	if (ret) {
		pr_info("event trace: Could not activate"
				"syscall entry trace point");
	} else {
488
489
		set_bit(num, enabled_perf_enter_syscalls);
		sys_perf_refcount_enter++;
490
491
492
493
494
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

495
void perf_sysenter_disable(struct ftrace_event_call *call)
496
497
498
{
	int num;

499
	num = ((struct syscall_metadata *)call->data)->syscall_nr;
500
501

	mutex_lock(&syscall_trace_lock);
502
503
504
505
	sys_perf_refcount_enter--;
	clear_bit(num, enabled_perf_enter_syscalls);
	if (!sys_perf_refcount_enter)
		unregister_trace_sys_enter(perf_syscall_enter);
506
507
508
	mutex_unlock(&syscall_trace_lock);
}

509
static void perf_syscall_exit(struct pt_regs *regs, long ret)
510
511
{
	struct syscall_metadata *sys_data;
512
513
	struct syscall_trace_exit *rec;
	unsigned long flags;
514
	int syscall_nr;
515
	int rctx;
516
	int size;
517
518

	syscall_nr = syscall_get_nr(current, regs);
519
	if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
520
521
522
523
524
525
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

526
527
528
	/* We can probably do that at build time */
	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);
529

530
531
532
533
	/*
	 * Impossible, but be paranoid with the future
	 * How to put this check outside runtime?
	 */
534
535
	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
		"exit event has grown above perf buffer size"))
536
537
		return;

538
	rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
539
540
541
				sys_data->exit_event->id, &rctx, &flags);
	if (!rec)
		return;
542
543
544
545

	rec->nr = syscall_nr;
	rec->ret = syscall_get_return_value(current, regs);

546
	perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
547
548
}

549
int perf_sysexit_enable(struct ftrace_event_call *call)
550
551
552
553
{
	int ret = 0;
	int num;

554
	num = ((struct syscall_metadata *)call->data)->syscall_nr;
555
556

	mutex_lock(&syscall_trace_lock);
557
558
	if (!sys_perf_refcount_exit)
		ret = register_trace_sys_exit(perf_syscall_exit);
559
560
	if (ret) {
		pr_info("event trace: Could not activate"
561
				"syscall exit trace point");
562
	} else {
563
564
		set_bit(num, enabled_perf_exit_syscalls);
		sys_perf_refcount_exit++;
565
566
567
568
569
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

570
void perf_sysexit_disable(struct ftrace_event_call *call)
571
572
573
{
	int num;

574
	num = ((struct syscall_metadata *)call->data)->syscall_nr;
575
576

	mutex_lock(&syscall_trace_lock);
577
578
579
580
	sys_perf_refcount_exit--;
	clear_bit(num, enabled_perf_exit_syscalls);
	if (!sys_perf_refcount_exit)
		unregister_trace_sys_exit(perf_syscall_exit);
581
582
583
	mutex_unlock(&syscall_trace_lock);
}

#endif /* CONFIG_PERF_EVENTS */