#include <trace/syscall.h>
#include <trace/events/syscalls.h>
#include <linux/kernel.h>
#include <linux/slab.h>		/* kmalloc()/kzalloc()/kfree() */
#include <linux/ftrace.h>
#include <linux/perf_event.h>

#include <asm/syscall.h>

#include "trace_output.h"
#include "trace.h"

static DEFINE_MUTEX(syscall_trace_lock);
static int sys_refcount_enter;
static int sys_refcount_exit;
static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);

extern unsigned long __start_syscalls_metadata[];
extern unsigned long __stop_syscalls_metadata[];

static struct syscall_metadata **syscalls_metadata;

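/*
 * Look up the syscall metadata entry for the syscall whose entry point is
 * at @syscall: resolve the symbol name via kallsyms and scan the metadata
 * section, comparing names past the "sys"/"SyS" prefix.
 */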
static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
{
	struct syscall_metadata *start;
	struct syscall_metadata *stop;
	char str[KSYM_SYMBOL_LEN];


	start = (struct syscall_metadata *)__start_syscalls_metadata;
	stop = (struct syscall_metadata *)__stop_syscalls_metadata;
	kallsyms_lookup(syscall, NULL, NULL, NULL, str);

	for ( ; start < stop; start++) {
		/*
		 * Only compare after the "sys" prefix. Archs that use
		 * syscall wrappers may have syscall symbol aliases prefixed
		 * with "SyS" instead of "sys", leading to an unwanted
		 * mismatch.
		 */
		if (start->name && !strcmp(start->name + 3, str + 3))
			return start;
	}
	return NULL;
}

static struct syscall_metadata *syscall_nr_to_meta(int nr)
{
	if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
		return NULL;

	return syscalls_metadata[nr];
}

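
/*
 * Text output for a sys_enter record: "name(arg: value, ...)", with the
 * argument types prepended when the verbose trace flag is set.
 * Illustrative output: sys_read(fd: 3, buf: 7fffc9a0, count: 400)
 */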
enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags)
{
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_enter *trace;
	struct syscall_metadata *entry;
	int i, ret, syscall;

	trace = (typeof(trace))ent;
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);

	if (!entry)
		goto end;

	if (entry->enter_event->id != ent->type) {
		WARN_ON_ONCE(1);
		goto end;
	}

	ret = trace_seq_printf(s, "%s(", entry->name);
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	for (i = 0; i < entry->nb_args; i++) {
		/* parameter types */
		if (trace_flags & TRACE_ITER_VERBOSE) {
			ret = trace_seq_printf(s, "%s ", entry->types[i]);
			if (!ret)
				return TRACE_TYPE_PARTIAL_LINE;
		}
		/* parameter values */
		ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
				       trace->args[i],
				       i == entry->nb_args - 1 ? "" : ", ");
		if (!ret)
			return TRACE_TYPE_PARTIAL_LINE;
	}

	ret = trace_seq_putc(s, ')');
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

end:
	ret = trace_seq_putc(s, '\n');
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	return TRACE_TYPE_HANDLED;
}

enum print_line_t
print_syscall_exit(struct trace_iterator *iter, int flags)
{
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_exit *trace;
	int syscall;
	struct syscall_metadata *entry;
	int ret;

	trace = (typeof(trace))ent;
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);

	if (!entry) {
		trace_seq_printf(s, "\n");
		return TRACE_TYPE_HANDLED;
	}

	if (entry->exit_event->id != ent->type) {
		WARN_ON_ONCE(1);
		return TRACE_TYPE_UNHANDLED;
	}

	ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
				trace->ret);
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	return TRACE_TYPE_HANDLED;
}

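/*
 * __bad_type_size() is never defined; SYSCALL_FIELD() references it only
 * when sizeof(type) does not match the field in the trace struct, turning
 * a type mismatch into a link-time error. Otherwise the macro expands to
 * the type/name strings, offset, size and signedness expected by
 * trace_define_field().
 */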
extern char *__bad_type_size(void);

#define SYSCALL_FIELD(type, name)					\
	sizeof(type) != sizeof(trace.name) ?				\
		__bad_type_size() :					\
		#type, #name, offsetof(typeof(trace), name),		\
		sizeof(trace.name), is_signed_type(type)

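/*
 * Build the print_fmt string for a sys_enter event: one "name: 0x%0Nlx"
 * conversion per argument (N = sizeof(unsigned long)), followed by the
 * matching REC->name references. Illustrative result for sys_read on a
 * 64-bit kernel:
 *   "fd: 0x%08lx, buf: 0x%08lx, count: 0x%08lx",
 *   ((unsigned long)(REC->fd)), ((unsigned long)(REC->buf)),
 *   ((unsigned long)(REC->count))
 */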
static
int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
{
	int i;
	int pos = 0;

	/* When len=0, we just calculate the needed length */
#define LEN_OR_ZERO (len ? len - pos : 0)

	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
	for (i = 0; i < entry->nb_args; i++) {
		pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
				entry->args[i], sizeof(unsigned long),
				i == entry->nb_args - 1 ? "" : ", ");
	}
	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");

	for (i = 0; i < entry->nb_args; i++) {
		pos += snprintf(buf + pos, LEN_OR_ZERO,
				", ((unsigned long)(REC->%s))", entry->args[i]);
	}

#undef LEN_OR_ZERO

	/* return the length of print_fmt */
	return pos;
}

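/*
 * Set call->print_fmt. Exit events share a fixed format; for enter events
 * a first __set_enter_print_fmt() pass with len == 0 sizes the buffer,
 * which is then allocated and filled for real.
 */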
static int set_syscall_print_fmt(struct ftrace_event_call *call)
{
	char *print_fmt;
	int len;
	struct syscall_metadata *entry = call->data;

	if (entry->enter_event != call) {
		call->print_fmt = "\"0x%lx\", REC->ret";
		return 0;
	}

	/* First: called with 0 length to calculate the needed length */
	len = __set_enter_print_fmt(entry, NULL, 0);

	print_fmt = kmalloc(len + 1, GFP_KERNEL);
	if (!print_fmt)
		return -ENOMEM;

	/* Second: actually write the @print_fmt */
	__set_enter_print_fmt(entry, print_fmt, len + 1);
	call->print_fmt = print_fmt;

	return 0;
}

static void free_syscall_print_fmt(struct ftrace_event_call *call)
{
	struct syscall_metadata *entry = call->data;

	if (entry->enter_event == call)
		kfree(call->print_fmt);
}

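/*
 * Describe the fields of a sys_enter record (the syscall nr plus one
 * unsigned long per argument) so the event's format file and filters
 * can use them.
 */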
int syscall_enter_define_fields(struct ftrace_event_call *call)
{
	struct syscall_trace_enter trace;
	struct syscall_metadata *meta = call->data;
	int ret;
	int i;
	int offset = offsetof(typeof(trace), args);

	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
	if (ret)
		return ret;

	for (i = 0; i < meta->nb_args; i++) {
		ret = trace_define_field(call, meta->types[i],
					 meta->args[i], offset,
					 sizeof(unsigned long), 0,
					 FILTER_OTHER);
		offset += sizeof(unsigned long);
	}

	return ret;
}

int syscall_exit_define_fields(struct ftrace_event_call *call)
{
	struct syscall_trace_exit trace;
	int ret;

	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
	if (ret)
		return ret;

	ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
				 FILTER_OTHER);

	return ret;
}

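/*
 * sys_enter tracepoint probe: if tracing is enabled for this syscall,
 * record its number and arguments into the ftrace ring buffer.
 */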
void ftrace_syscall_enter(struct pt_regs *regs, long id)
{
	struct syscall_trace_enter *entry;
	struct syscall_metadata *sys_data;
	struct ring_buffer_event *event;
	struct ring_buffer *buffer;
	int size;
	int syscall_nr;

	syscall_nr = syscall_get_nr(current, regs);
	if (syscall_nr < 0)
		return;
	if (!test_bit(syscall_nr, enabled_enter_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;

	event = trace_current_buffer_lock_reserve(&buffer,
			sys_data->enter_event->id, size, 0, 0);
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	entry->nr = syscall_nr;
	syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);

	if (!filter_current_check_discard(buffer, sys_data->enter_event,
					  entry, event))
		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}

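/*
 * sys_exit tracepoint probe: if tracing is enabled for this syscall,
 * record its number and return value into the ftrace ring buffer.
 */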
void ftrace_syscall_exit(struct pt_regs *regs, long ret)
{
	struct syscall_trace_exit *entry;
	struct syscall_metadata *sys_data;
	struct ring_buffer_event *event;
	struct ring_buffer *buffer;
	int syscall_nr;

	syscall_nr = syscall_get_nr(current, regs);
	if (syscall_nr < 0)
		return;
	if (!test_bit(syscall_nr, enabled_exit_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	event = trace_current_buffer_lock_reserve(&buffer,
			sys_data->exit_event->id, sizeof(*entry), 0, 0);
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	entry->nr = syscall_nr;
	entry->ret = syscall_get_return_value(current, regs);

	if (!filter_current_check_discard(buffer, sys_data->exit_event,
					  entry, event))
		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}

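/*
 * Enable entry tracing for one syscall: register the sys_enter probe on
 * first use and set the syscall's bit in enabled_enter_syscalls, under
 * syscall_trace_lock.
 */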
int reg_event_syscall_enter(struct ftrace_event_call *call)
{
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (num < 0 || num >= NR_syscalls)
		return -ENOSYS;
	mutex_lock(&syscall_trace_lock);
	if (!sys_refcount_enter)
		ret = register_trace_sys_enter(ftrace_syscall_enter);
	if (!ret) {
		set_bit(num, enabled_enter_syscalls);
		sys_refcount_enter++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

void unreg_event_syscall_enter(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (num < 0 || num >= NR_syscalls)
		return;
	mutex_lock(&syscall_trace_lock);
	sys_refcount_enter--;
	clear_bit(num, enabled_enter_syscalls);
	if (!sys_refcount_enter)
		unregister_trace_sys_enter(ftrace_syscall_enter);
	mutex_unlock(&syscall_trace_lock);
}

int reg_event_syscall_exit(struct ftrace_event_call *call)
{
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (num < 0 || num >= NR_syscalls)
		return -ENOSYS;
	mutex_lock(&syscall_trace_lock);
	if (!sys_refcount_exit)
		ret = register_trace_sys_exit(ftrace_syscall_exit);
	if (!ret) {
		set_bit(num, enabled_exit_syscalls);
		sys_refcount_exit++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

void unreg_event_syscall_exit(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (num < 0 || num >= NR_syscalls)
		return;
	mutex_lock(&syscall_trace_lock);
	sys_refcount_exit--;
	clear_bit(num, enabled_exit_syscalls);
	if (!sys_refcount_exit)
		unregister_trace_sys_exit(ftrace_syscall_exit);
	mutex_unlock(&syscall_trace_lock);
}

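/*
 * Per-event init callback: build the print_fmt string, then let the
 * generic trace event code register the event.
 */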
int init_syscall_trace(struct ftrace_event_call *call)
{
	int id;

	if (set_syscall_print_fmt(call) < 0)
		return -ENOMEM;

	id = trace_event_raw_init(call);

	if (id < 0) {
		free_syscall_print_fmt(call);
		return id;
	}

	return id;
}

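/*
 * Boot-time setup: allocate the syscall number -> metadata table and fill
 * it by resolving each syscall's entry address to its metadata.
 */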
int __init init_ftrace_syscalls(void)
{
	struct syscall_metadata *meta;
	unsigned long addr;
	int i;

	syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
					NR_syscalls, GFP_KERNEL);
	if (!syscalls_metadata) {
		WARN_ON(1);
		return -ENOMEM;
	}

	for (i = 0; i < NR_syscalls; i++) {
		addr = arch_syscall_addr(i);
		meta = find_syscall_meta(addr);
		if (!meta)
			continue;

		meta->syscall_nr = i;
		syscalls_metadata[i] = meta;
	}

	return 0;
}
core_initcall(init_ftrace_syscalls);

#ifdef CONFIG_EVENT_PROFILE

static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls);
static int sys_prof_refcount_enter;
static int sys_prof_refcount_exit;

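/*
 * perf/profile sys_enter probe: copy the record into the per-cpu perf
 * trace buffer (under a recursion-safe, irqs-off section) and submit it
 * with perf_tp_event().
 */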
static void prof_syscall_enter(struct pt_regs *regs, long id)
{
	struct syscall_metadata *sys_data;
	struct syscall_trace_enter *rec;
	unsigned long flags;
	char *trace_buf;
	char *raw_data;
	int syscall_nr;
	int rctx;
	int size;
	int cpu;

	syscall_nr = syscall_get_nr(current, regs);
	/* Mirror ftrace_syscall_enter(): ignore an invalid syscall nr */
	if (syscall_nr < 0)
		return;
	if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	/* get the size after alignment with the u32 buffer size field */
	size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
	size = ALIGN(size + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);

	if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
		      "profile buffer not large enough"))
		return;

	/* Protect the per cpu buffer, begin the rcu read side */
	local_irq_save(flags);

	rctx = perf_swevent_get_recursion_context();
	if (rctx < 0)
		goto end_recursion;

	cpu = smp_processor_id();

	trace_buf = rcu_dereference(perf_trace_buf);

	if (!trace_buf)
		goto end;

	raw_data = per_cpu_ptr(trace_buf, cpu);

	/* zero the dead bytes from align to not leak stack to user */
	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;

	rec = (struct syscall_trace_enter *) raw_data;
	tracing_generic_entry_update(&rec->ent, 0, 0);
	rec->ent.type = sys_data->enter_event->id;
	rec->nr = syscall_nr;
	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
			       (unsigned long *)&rec->args);
	perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size);

end:
	perf_swevent_put_recursion_context(rctx);
end_recursion:
	local_irq_restore(flags);
}

int prof_sysenter_enable(struct ftrace_event_call *call)
{
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	mutex_lock(&syscall_trace_lock);
	if (!sys_prof_refcount_enter)
		ret = register_trace_sys_enter(prof_syscall_enter);
	if (ret) {
		pr_info("event trace: Could not activate"
				"syscall entry trace point");
	} else {
		set_bit(num, enabled_prof_enter_syscalls);
		sys_prof_refcount_enter++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

void prof_sysenter_disable(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	mutex_lock(&syscall_trace_lock);
	sys_prof_refcount_enter--;
	clear_bit(num, enabled_prof_enter_syscalls);
	if (!sys_prof_refcount_enter)
		unregister_trace_sys_enter(prof_syscall_enter);
	mutex_unlock(&syscall_trace_lock);
}

static void prof_syscall_exit(struct pt_regs *regs, long ret)
{
	struct syscall_metadata *sys_data;
	struct syscall_trace_exit *rec;
	unsigned long flags;
	int syscall_nr;
	char *trace_buf;
	char *raw_data;
	int rctx;
	int size;
	int cpu;

	syscall_nr = syscall_get_nr(current, regs);
	/* Mirror ftrace_syscall_exit(): ignore an invalid syscall nr */
	if (syscall_nr < 0)
		return;
	if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	/* We can probably do that at build time */
	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);

	/*
	 * Impossible, but be paranoid about the future.
	 * How could this check be moved out of the runtime path?
	 */
	if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
		"exit event has grown above profile buffer size"))
		return;

	/* Protect the per cpu buffer, begin the rcu read side */
	local_irq_save(flags);

	rctx = perf_swevent_get_recursion_context();
	if (rctx < 0)
		goto end_recursion;

	cpu = smp_processor_id();

	trace_buf = rcu_dereference(perf_trace_buf);

	if (!trace_buf)
		goto end;

	raw_data = per_cpu_ptr(trace_buf, cpu);

	/* zero the dead bytes from align to not leak stack to user */
	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;

	rec = (struct syscall_trace_exit *)raw_data;

	tracing_generic_entry_update(&rec->ent, 0, 0);
	rec->ent.type = sys_data->exit_event->id;
	rec->nr = syscall_nr;
	rec->ret = syscall_get_return_value(current, regs);

	perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size);

end:
	perf_swevent_put_recursion_context(rctx);
end_recursion:
	local_irq_restore(flags);
}

int prof_sysexit_enable(struct ftrace_event_call *call)
{
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	mutex_lock(&syscall_trace_lock);
	if (!sys_prof_refcount_exit)
		ret = register_trace_sys_exit(prof_syscall_exit);
	if (ret) {
		pr_info("event trace: Could not activate"
				"syscall entry trace point");
	} else {
		set_bit(num, enabled_prof_exit_syscalls);
		sys_prof_refcount_exit++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

void prof_sysexit_disable(struct ftrace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	mutex_lock(&syscall_trace_lock);
	sys_prof_refcount_exit--;
	clear_bit(num, enabled_prof_exit_syscalls);
	if (!sys_prof_refcount_exit)
		unregister_trace_sys_exit(prof_syscall_exit);
	mutex_unlock(&syscall_trace_lock);
}

#endif