1#include <traceevent/event-parse.h>
2#include "builtin.h"
3#include "util/color.h"
4#include "util/debug.h"
5#include "util/evlist.h"
6#include "util/machine.h"
7#include "util/session.h"
8#include "util/thread.h"
9#include "util/parse-options.h"
10#include "util/strlist.h"
11#include "util/intlist.h"
12#include "util/thread_map.h"
13#include "util/stat.h"
14#include "trace-event.h"
15#include "util/parse-events.h"
16
17#include <libaudit.h>
18#include <stdlib.h>
19#include <sys/eventfd.h>
20#include <sys/mman.h>
21#include <linux/futex.h>
22
/*
 * Fallbacks for older distros: each constant below is only defined here
 * when the system headers included above did not already provide it.
 */
#ifndef MAP_STACK
#define MAP_STACK		0x20000
#endif

#ifndef MADV_HWPOISON
#define MADV_HWPOISON		100
#endif

#ifndef MADV_MERGEABLE
#define MADV_MERGEABLE		12
#endif

#ifndef MADV_UNMERGEABLE
#define MADV_UNMERGEABLE	13
#endif

#ifndef EFD_SEMAPHORE
#define EFD_SEMAPHORE		1
#endif
43
/*
 * Accessor for one field of a tracepoint sample's raw payload.
 *
 * @offset is the byte offset of the field inside sample->raw_data.  The
 * anonymous union holds the reader installed for the field: ->integer for
 * fields fetched as a u64 value, ->pointer for fields handed back as an
 * address.  Only one of the two is meaningful for a given field, as they
 * share storage.
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
51
52#define TP_UINT_FIELD(bits) \
53static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
54{ \
55	u##bits value; \
56	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
57	return value;  \
58}
59
60TP_UINT_FIELD(8);
61TP_UINT_FIELD(16);
62TP_UINT_FIELD(32);
63TP_UINT_FIELD(64);
64
65#define TP_UINT_FIELD__SWAPPED(bits) \
66static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
67{ \
68	u##bits value; \
69	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
70	return bswap_##bits(value);\
71}
72
73TP_UINT_FIELD__SWAPPED(16);
74TP_UINT_FIELD__SWAPPED(32);
75TP_UINT_FIELD__SWAPPED(64);
76
77static int tp_field__init_uint(struct tp_field *field,
78			       struct format_field *format_field,
79			       bool needs_swap)
80{
81	field->offset = format_field->offset;
82
83	switch (format_field->size) {
84	case 1:
85		field->integer = tp_field__u8;
86		break;
87	case 2:
88		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
89		break;
90	case 4:
91		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
92		break;
93	case 8:
94		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
95		break;
96	default:
97		return -1;
98	}
99
100	return 0;
101}
102
103static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
104{
105	return sample->raw_data + field->offset;
106}
107
108static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
109{
110	field->offset = format_field->offset;
111	field->pointer = tp_field__ptr;
112	return 0;
113}
114
115struct syscall_tp {
116	struct tp_field id;
117	union {
118		struct tp_field args, ret;
119	};
120};
121
122static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
123					  struct tp_field *field,
124					  const char *name)
125{
126	struct format_field *format_field = perf_evsel__field(evsel, name);
127
128	if (format_field == NULL)
129		return -1;
130
131	return tp_field__init_uint(field, format_field, evsel->needs_swap);
132}
133
134#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
135	({ struct syscall_tp *sc = evsel->priv;\
136	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
137
/*
 * Look up the tracepoint field called @name in @evsel and initialize
 * @field as a pointer accessor for it.
 *
 * Returns -1 when the field does not exist, otherwise whatever
 * tp_field__init_ptr() returns.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *ff = perf_evsel__field(evsel, name);

	return ff ? tp_field__init_ptr(field, ff) : -1;
}

/*
 * Convenience wrapper: initialize the struct syscall_tp member @name kept
 * in evsel->priv from the tracepoint field of the same name.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
153
/*
 * Release @evsel together with its private data: evsel->priv is released
 * through zfree() first, then the evsel itself is deleted.
 */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
159
160static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
161{
162	evsel->priv = malloc(sizeof(struct syscall_tp));
163	if (evsel->priv != NULL) {
164		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
165			goto out_delete;
166
167		evsel->handler = handler;
168		return 0;
169	}
170
171	return -ENOMEM;
172
173out_delete:
174	zfree(&evsel->priv);
175	return -ENOENT;
176}
177
/*
 * Create the evsel for the syscall tracepoint @direction ("sys_enter" or
 * "sys_exit" at the call sites in this file) and prepare it with
 * perf_evsel__init_syscall_tp().
 *
 * Returns the new evsel, or NULL when the tracepoint cannot be created or
 * its initialization fails (the evsel is deleted in the latter case).
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
	if (evsel == NULL)
		evsel = perf_evsel__newtp("syscalls", direction);

	if (evsel == NULL)
		return NULL;

	if (perf_evsel__init_syscall_tp(evsel, handler)) {
		perf_evsel__delete_priv(evsel);
		return NULL;
	}

	return evsel;
}
197
/*
 * Read the struct syscall_tp member @name of @evsel (kept in evsel->priv)
 * from @sample through its ->integer reader; evaluates to the u64 value.
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/*
 * Same as above for members set up as pointers: goes through the ->pointer
 * reader and evaluates to the address it returns.
 */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
205
206static int perf_evlist__add_syscall_newtp(struct perf_evlist *evlist,
207					  void *sys_enter_handler,
208					  void *sys_exit_handler)
209{
210	int ret = -1;
211	struct perf_evsel *sys_enter, *sys_exit;
212
213	sys_enter = perf_evsel__syscall_newtp("sys_enter", sys_enter_handler);
214	if (sys_enter == NULL)
215		goto out;
216
217	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
218		goto out_delete_sys_enter;
219
220	sys_exit = perf_evsel__syscall_newtp("sys_exit", sys_exit_handler);
221	if (sys_exit == NULL)
222		goto out_delete_sys_enter;
223
224	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
225		goto out_delete_sys_exit;
226
227	perf_evlist__add(evlist, sys_enter);
228	perf_evlist__add(evlist, sys_exit);
229
230	ret = 0;
231out:
232	return ret;
233
234out_delete_sys_exit:
235	perf_evsel__delete_priv(sys_exit);
236out_delete_sys_enter:
237	perf_evsel__delete_priv(sys_enter);
238	goto out;
239}
240
241
/*
 * Context handed to the per-argument formatters (the
 * syscall_arg__scnprintf_*() functions) below.
 */
struct syscall_arg {
	unsigned long val;	/* raw value of the argument being formatted */
	struct thread *thread;
	struct trace  *trace;
	void	      *parm;	/* formatter specific data, e.g. a struct strarray */
	u8	      idx;	/* position of the argument being formatted */
	u8	      mask;	/* one bit per argument, set by formatters (see open_flags, futex_op) */
};
250
/*
 * Table of names for an integer argument: a value v maps to
 * entries[v - offset] as long as that index is within nr_entries.
 */
struct strarray {
	int	    offset;
	int	    nr_entries;
	const char **entries;
};
256
/*
 * Define strarray__<array>, a struct strarray describing the string table
 * <array>; .offset is left at its zero default.
 */
#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}

/*
 * Same as DEFINE_STRARRAY() for tables whose first entry corresponds to
 * the value @off instead of 0.
 */
#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
	.offset	    = off, \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
267
268static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
269						const char *intfmt,
270					        struct syscall_arg *arg)
271{
272	struct strarray *sa = arg->parm;
273	int idx = arg->val - sa->offset;
274
275	if (idx < 0 || idx >= sa->nr_entries)
276		return scnprintf(bf, size, intfmt, arg->val);
277
278	return scnprintf(bf, size, "%s", sa->entries[idx]);
279}
280
/*
 * Format an argument via its strarray, printing values that are not in the
 * table with "%d".
 */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
288
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 * 	  gets rewritten to support all arches.
 */
/*
 * Format an argument via its strarray, printing values that are not in the
 * table with "%#x".
 */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
302
303static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
304					struct syscall_arg *arg);
305
306#define SCA_FD syscall_arg__scnprintf_fd
307
308static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
309					   struct syscall_arg *arg)
310{
311	int fd = arg->val;
312
313	if (fd == AT_FDCWD)
314		return scnprintf(bf, size, "CWD");
315
316	return syscall_arg__scnprintf_fd(bf, size, arg);
317}
318
319#define SCA_FDAT syscall_arg__scnprintf_fd_at
320
/* Forward declaration: the definition is not in this part of the file. */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
325
326static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
327					 struct syscall_arg *arg)
328{
329	return scnprintf(bf, size, "%#lx", arg->val);
330}
331
332#define SCA_HEX syscall_arg__scnprintf_hex
333
334static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size,
335					       struct syscall_arg *arg)
336{
337	int printed = 0, prot = arg->val;
338
339	if (prot == PROT_NONE)
340		return scnprintf(bf, size, "NONE");
341#define	P_MMAP_PROT(n) \
342	if (prot & PROT_##n) { \
343		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
344		prot &= ~PROT_##n; \
345	}
346
347	P_MMAP_PROT(EXEC);
348	P_MMAP_PROT(READ);
349	P_MMAP_PROT(WRITE);
350#ifdef PROT_SEM
351	P_MMAP_PROT(SEM);
352#endif
353	P_MMAP_PROT(GROWSDOWN);
354	P_MMAP_PROT(GROWSUP);
355#undef P_MMAP_PROT
356
357	if (prot)
358		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot);
359
360	return printed;
361}
362
363#define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot
364
365static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size,
366						struct syscall_arg *arg)
367{
368	int printed = 0, flags = arg->val;
369
370#define	P_MMAP_FLAG(n) \
371	if (flags & MAP_##n) { \
372		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
373		flags &= ~MAP_##n; \
374	}
375
376	P_MMAP_FLAG(SHARED);
377	P_MMAP_FLAG(PRIVATE);
378#ifdef MAP_32BIT
379	P_MMAP_FLAG(32BIT);
380#endif
381	P_MMAP_FLAG(ANONYMOUS);
382	P_MMAP_FLAG(DENYWRITE);
383	P_MMAP_FLAG(EXECUTABLE);
384	P_MMAP_FLAG(FILE);
385	P_MMAP_FLAG(FIXED);
386	P_MMAP_FLAG(GROWSDOWN);
387#ifdef MAP_HUGETLB
388	P_MMAP_FLAG(HUGETLB);
389#endif
390	P_MMAP_FLAG(LOCKED);
391	P_MMAP_FLAG(NONBLOCK);
392	P_MMAP_FLAG(NORESERVE);
393	P_MMAP_FLAG(POPULATE);
394	P_MMAP_FLAG(STACK);
395#ifdef MAP_UNINITIALIZED
396	P_MMAP_FLAG(UNINITIALIZED);
397#endif
398#undef P_MMAP_FLAG
399
400	if (flags)
401		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
402
403	return printed;
404}
405
406#define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags
407
408static size_t syscall_arg__scnprintf_mremap_flags(char *bf, size_t size,
409						  struct syscall_arg *arg)
410{
411	int printed = 0, flags = arg->val;
412
413#define P_MREMAP_FLAG(n) \
414	if (flags & MREMAP_##n) { \
415		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
416		flags &= ~MREMAP_##n; \
417	}
418
419	P_MREMAP_FLAG(MAYMOVE);
420#ifdef MREMAP_FIXED
421	P_MREMAP_FLAG(FIXED);
422#endif
423#undef P_MREMAP_FLAG
424
425	if (flags)
426		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
427
428	return printed;
429}
430
431#define SCA_MREMAP_FLAGS syscall_arg__scnprintf_mremap_flags
432
433static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size,
434						      struct syscall_arg *arg)
435{
436	int behavior = arg->val;
437
438	switch (behavior) {
439#define	P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n)
440	P_MADV_BHV(NORMAL);
441	P_MADV_BHV(RANDOM);
442	P_MADV_BHV(SEQUENTIAL);
443	P_MADV_BHV(WILLNEED);
444	P_MADV_BHV(DONTNEED);
445	P_MADV_BHV(REMOVE);
446	P_MADV_BHV(DONTFORK);
447	P_MADV_BHV(DOFORK);
448	P_MADV_BHV(HWPOISON);
449#ifdef MADV_SOFT_OFFLINE
450	P_MADV_BHV(SOFT_OFFLINE);
451#endif
452	P_MADV_BHV(MERGEABLE);
453	P_MADV_BHV(UNMERGEABLE);
454#ifdef MADV_HUGEPAGE
455	P_MADV_BHV(HUGEPAGE);
456#endif
457#ifdef MADV_NOHUGEPAGE
458	P_MADV_BHV(NOHUGEPAGE);
459#endif
460#ifdef MADV_DONTDUMP
461	P_MADV_BHV(DONTDUMP);
462#endif
463#ifdef MADV_DODUMP
464	P_MADV_BHV(DODUMP);
465#endif
466#undef P_MADV_PHV
467	default: break;
468	}
469
470	return scnprintf(bf, size, "%#x", behavior);
471}
472
473#define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior
474
475static size_t syscall_arg__scnprintf_flock(char *bf, size_t size,
476					   struct syscall_arg *arg)
477{
478	int printed = 0, op = arg->val;
479
480	if (op == 0)
481		return scnprintf(bf, size, "NONE");
482#define	P_CMD(cmd) \
483	if ((op & LOCK_##cmd) == LOCK_##cmd) { \
484		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #cmd); \
485		op &= ~LOCK_##cmd; \
486	}
487
488	P_CMD(SH);
489	P_CMD(EX);
490	P_CMD(NB);
491	P_CMD(UN);
492	P_CMD(MAND);
493	P_CMD(RW);
494	P_CMD(READ);
495	P_CMD(WRITE);
496#undef P_OP
497
498	if (op)
499		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", op);
500
501	return printed;
502}
503
504#define SCA_FLOCK syscall_arg__scnprintf_flock
505
506static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, struct syscall_arg *arg)
507{
508	enum syscall_futex_args {
509		SCF_UADDR   = (1 << 0),
510		SCF_OP	    = (1 << 1),
511		SCF_VAL	    = (1 << 2),
512		SCF_TIMEOUT = (1 << 3),
513		SCF_UADDR2  = (1 << 4),
514		SCF_VAL3    = (1 << 5),
515	};
516	int op = arg->val;
517	int cmd = op & FUTEX_CMD_MASK;
518	size_t printed = 0;
519
520	switch (cmd) {
521#define	P_FUTEX_OP(n) case FUTEX_##n: printed = scnprintf(bf, size, #n);
522	P_FUTEX_OP(WAIT);	    arg->mask |= SCF_VAL3|SCF_UADDR2;		  break;
523	P_FUTEX_OP(WAKE);	    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
524	P_FUTEX_OP(FD);		    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
525	P_FUTEX_OP(REQUEUE);	    arg->mask |= SCF_VAL3|SCF_TIMEOUT;	          break;
526	P_FUTEX_OP(CMP_REQUEUE);    arg->mask |= SCF_TIMEOUT;			  break;
527	P_FUTEX_OP(CMP_REQUEUE_PI); arg->mask |= SCF_TIMEOUT;			  break;
528	P_FUTEX_OP(WAKE_OP);							  break;
529	P_FUTEX_OP(LOCK_PI);	    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
530	P_FUTEX_OP(UNLOCK_PI);	    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
531	P_FUTEX_OP(TRYLOCK_PI);	    arg->mask |= SCF_VAL3|SCF_UADDR2;		  break;
532	P_FUTEX_OP(WAIT_BITSET);    arg->mask |= SCF_UADDR2;			  break;
533	P_FUTEX_OP(WAKE_BITSET);    arg->mask |= SCF_UADDR2;			  break;
534	P_FUTEX_OP(WAIT_REQUEUE_PI);						  break;
535	default: printed = scnprintf(bf, size, "%#x", cmd);			  break;
536	}
537
538	if (op & FUTEX_PRIVATE_FLAG)
539		printed += scnprintf(bf + printed, size - printed, "|PRIV");
540
541	if (op & FUTEX_CLOCK_REALTIME)
542		printed += scnprintf(bf + printed, size - printed, "|CLKRT");
543
544	return printed;
545}
546
547#define SCA_FUTEX_OP  syscall_arg__scnprintf_futex_op
548
/*
 * Name tables for integer arguments.  Each table is wrapped in a
 * struct strarray (strarray__<table>) and looked up by
 * (value - offset), so the position of a string is the value it names.
 */

/* epoll_ctl 'op'; the table starts at value 1. */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

/* getitimer 'which' */
static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

/* lseek 'whence'; DATA and HOLE only when the headers know about them. */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

/* fcntl 'cmd' */
static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
	"F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
	"F_GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/* getrlimit/prlimit64 'resource' */
static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

/* rt_sigprocmask 'how' */
static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

/* clock_gettime 'clk_id' */
static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE",
};
static DEFINE_STRARRAY(clockid);
588
/*
 * Socket address family names, indexed by the AF_* value.
 *
 * The table must not skip any number: AF_IB is 27, AF_MPLS is 28 and
 * AF_CAN is 29, so "MPLS" has to sit between "IB" and "CAN" for every
 * later family to land on its own value.
 */
static const char *socket_families[] = {
	/*  0 */ "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	/*  7 */ "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	/* 14 */ "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	/* 21 */ "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "MPLS",
	/* 29 */ "CAN", "TIPC", "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET",
	/* 36 */ "IEEE802154", "CAIF", "ALG", "NFC", "VSOCK",
};
/* strarray wrapper for socket_families, looked up with a zero offset. */
static DEFINE_STRARRAY(socket_families);
598
599#ifndef SOCK_TYPE_MASK
600#define SOCK_TYPE_MASK 0xf
601#endif
602
603static size_t syscall_arg__scnprintf_socket_type(char *bf, size_t size,
604						      struct syscall_arg *arg)
605{
606	size_t printed;
607	int type = arg->val,
608	    flags = type & ~SOCK_TYPE_MASK;
609
610	type &= SOCK_TYPE_MASK;
611	/*
612 	 * Can't use a strarray, MIPS may override for ABI reasons.
613 	 */
614	switch (type) {
615#define	P_SK_TYPE(n) case SOCK_##n: printed = scnprintf(bf, size, #n); break;
616	P_SK_TYPE(STREAM);
617	P_SK_TYPE(DGRAM);
618	P_SK_TYPE(RAW);
619	P_SK_TYPE(RDM);
620	P_SK_TYPE(SEQPACKET);
621	P_SK_TYPE(DCCP);
622	P_SK_TYPE(PACKET);
623#undef P_SK_TYPE
624	default:
625		printed = scnprintf(bf, size, "%#x", type);
626	}
627
628#define	P_SK_FLAG(n) \
629	if (flags & SOCK_##n) { \
630		printed += scnprintf(bf + printed, size - printed, "|%s", #n); \
631		flags &= ~SOCK_##n; \
632	}
633
634	P_SK_FLAG(CLOEXEC);
635	P_SK_FLAG(NONBLOCK);
636#undef P_SK_FLAG
637
638	if (flags)
639		printed += scnprintf(bf + printed, size - printed, "|%#x", flags);
640
641	return printed;
642}
643
644#define SCA_SK_TYPE syscall_arg__scnprintf_socket_type
645
646#ifndef MSG_PROBE
647#define MSG_PROBE	     0x10
648#endif
649#ifndef MSG_WAITFORONE
650#define MSG_WAITFORONE	0x10000
651#endif
652#ifndef MSG_SENDPAGE_NOTLAST
653#define MSG_SENDPAGE_NOTLAST 0x20000
654#endif
655#ifndef MSG_FASTOPEN
656#define MSG_FASTOPEN	     0x20000000
657#endif
658
659static size_t syscall_arg__scnprintf_msg_flags(char *bf, size_t size,
660					       struct syscall_arg *arg)
661{
662	int printed = 0, flags = arg->val;
663
664	if (flags == 0)
665		return scnprintf(bf, size, "NONE");
666#define	P_MSG_FLAG(n) \
667	if (flags & MSG_##n) { \
668		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
669		flags &= ~MSG_##n; \
670	}
671
672	P_MSG_FLAG(OOB);
673	P_MSG_FLAG(PEEK);
674	P_MSG_FLAG(DONTROUTE);
675	P_MSG_FLAG(TRYHARD);
676	P_MSG_FLAG(CTRUNC);
677	P_MSG_FLAG(PROBE);
678	P_MSG_FLAG(TRUNC);
679	P_MSG_FLAG(DONTWAIT);
680	P_MSG_FLAG(EOR);
681	P_MSG_FLAG(WAITALL);
682	P_MSG_FLAG(FIN);
683	P_MSG_FLAG(SYN);
684	P_MSG_FLAG(CONFIRM);
685	P_MSG_FLAG(RST);
686	P_MSG_FLAG(ERRQUEUE);
687	P_MSG_FLAG(NOSIGNAL);
688	P_MSG_FLAG(MORE);
689	P_MSG_FLAG(WAITFORONE);
690	P_MSG_FLAG(SENDPAGE_NOTLAST);
691	P_MSG_FLAG(FASTOPEN);
692	P_MSG_FLAG(CMSG_CLOEXEC);
693#undef P_MSG_FLAG
694
695	if (flags)
696		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
697
698	return printed;
699}
700
701#define SCA_MSG_FLAGS syscall_arg__scnprintf_msg_flags
702
703static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
704						 struct syscall_arg *arg)
705{
706	size_t printed = 0;
707	int mode = arg->val;
708
709	if (mode == F_OK) /* 0 */
710		return scnprintf(bf, size, "F");
711#define	P_MODE(n) \
712	if (mode & n##_OK) { \
713		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
714		mode &= ~n##_OK; \
715	}
716
717	P_MODE(R);
718	P_MODE(W);
719	P_MODE(X);
720#undef P_MODE
721
722	if (mode)
723		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
724
725	return printed;
726}
727
728#define SCA_ACCMODE syscall_arg__scnprintf_access_mode
729
730static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
731					       struct syscall_arg *arg)
732{
733	int printed = 0, flags = arg->val;
734
735	if (!(flags & O_CREAT))
736		arg->mask |= 1 << (arg->idx + 1); /* Mask the mode parm */
737
738	if (flags == 0)
739		return scnprintf(bf, size, "RDONLY");
740#define	P_FLAG(n) \
741	if (flags & O_##n) { \
742		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
743		flags &= ~O_##n; \
744	}
745
746	P_FLAG(APPEND);
747	P_FLAG(ASYNC);
748	P_FLAG(CLOEXEC);
749	P_FLAG(CREAT);
750	P_FLAG(DIRECT);
751	P_FLAG(DIRECTORY);
752	P_FLAG(EXCL);
753	P_FLAG(LARGEFILE);
754	P_FLAG(NOATIME);
755	P_FLAG(NOCTTY);
756#ifdef O_NONBLOCK
757	P_FLAG(NONBLOCK);
758#elif O_NDELAY
759	P_FLAG(NDELAY);
760#endif
761#ifdef O_PATH
762	P_FLAG(PATH);
763#endif
764	P_FLAG(RDWR);
765#ifdef O_DSYNC
766	if ((flags & O_SYNC) == O_SYNC)
767		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", "SYNC");
768	else {
769		P_FLAG(DSYNC);
770	}
771#else
772	P_FLAG(SYNC);
773#endif
774	P_FLAG(TRUNC);
775	P_FLAG(WRONLY);
776#undef P_FLAG
777
778	if (flags)
779		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
780
781	return printed;
782}
783
784#define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags
785
786static size_t syscall_arg__scnprintf_eventfd_flags(char *bf, size_t size,
787						   struct syscall_arg *arg)
788{
789	int printed = 0, flags = arg->val;
790
791	if (flags == 0)
792		return scnprintf(bf, size, "NONE");
793#define	P_FLAG(n) \
794	if (flags & EFD_##n) { \
795		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
796		flags &= ~EFD_##n; \
797	}
798
799	P_FLAG(SEMAPHORE);
800	P_FLAG(CLOEXEC);
801	P_FLAG(NONBLOCK);
802#undef P_FLAG
803
804	if (flags)
805		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
806
807	return printed;
808}
809
810#define SCA_EFD_FLAGS syscall_arg__scnprintf_eventfd_flags
811
812static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
813						struct syscall_arg *arg)
814{
815	int printed = 0, flags = arg->val;
816
817#define	P_FLAG(n) \
818	if (flags & O_##n) { \
819		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
820		flags &= ~O_##n; \
821	}
822
823	P_FLAG(CLOEXEC);
824	P_FLAG(NONBLOCK);
825#undef P_FLAG
826
827	if (flags)
828		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
829
830	return printed;
831}
832
833#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
834
835static size_t syscall_arg__scnprintf_signum(char *bf, size_t size, struct syscall_arg *arg)
836{
837	int sig = arg->val;
838
839	switch (sig) {
840#define	P_SIGNUM(n) case SIG##n: return scnprintf(bf, size, #n)
841	P_SIGNUM(HUP);
842	P_SIGNUM(INT);
843	P_SIGNUM(QUIT);
844	P_SIGNUM(ILL);
845	P_SIGNUM(TRAP);
846	P_SIGNUM(ABRT);
847	P_SIGNUM(BUS);
848	P_SIGNUM(FPE);
849	P_SIGNUM(KILL);
850	P_SIGNUM(USR1);
851	P_SIGNUM(SEGV);
852	P_SIGNUM(USR2);
853	P_SIGNUM(PIPE);
854	P_SIGNUM(ALRM);
855	P_SIGNUM(TERM);
856	P_SIGNUM(CHLD);
857	P_SIGNUM(CONT);
858	P_SIGNUM(STOP);
859	P_SIGNUM(TSTP);
860	P_SIGNUM(TTIN);
861	P_SIGNUM(TTOU);
862	P_SIGNUM(URG);
863	P_SIGNUM(XCPU);
864	P_SIGNUM(XFSZ);
865	P_SIGNUM(VTALRM);
866	P_SIGNUM(PROF);
867	P_SIGNUM(WINCH);
868	P_SIGNUM(IO);
869	P_SIGNUM(PWR);
870	P_SIGNUM(SYS);
871#ifdef SIGEMT
872	P_SIGNUM(EMT);
873#endif
874#ifdef SIGSTKFLT
875	P_SIGNUM(STKFLT);
876#endif
877#ifdef SIGSWI
878	P_SIGNUM(SWI);
879#endif
880	default: break;
881	}
882
883	return scnprintf(bf, size, "%#x", sig);
884}
885
886#define SCA_SIGNUM syscall_arg__scnprintf_signum
887
888#if defined(__i386__) || defined(__x86_64__)
889/*
890 * FIXME: Make this available to all arches.
891 */
892#define TCGETS		0x5401
893
894static const char *tioctls[] = {
895	"TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
896	"TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
897	"TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
898	"TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
899	"TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
900	"TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
901	"TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
902	"TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
903	"TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
904	"TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
905	"TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
906	[0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
907	"TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
908	"TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
909	"TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
910};
911
912static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
913#endif /* defined(__i386__) || defined(__x86_64__) */
914
/*
 * Initializer fragment for a struct syscall_fmt entry: format argument
 * number @arg through SCA_STRARRAY using strarray__<array>.  @name is not
 * used by the expansion; it only labels the argument at the use site.
 */
#define STRARRAY(arg, name, array) \
	  .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
	  .arg_parm	 = { [arg] = &strarray__##array, }
918
919static struct syscall_fmt {
920	const char *name;
921	const char *alias;
922	size_t	   (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
923	void	   *arg_parm[6];
924	bool	   errmsg;
925	bool	   timeout;
926	bool	   hexret;
927} syscall_fmts[] = {
928	{ .name	    = "access",	    .errmsg = true,
929	  .arg_scnprintf = { [1] = SCA_ACCMODE, /* mode */ }, },
930	{ .name	    = "arch_prctl", .errmsg = true, .alias = "prctl", },
931	{ .name	    = "brk",	    .hexret = true,
932	  .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
933	{ .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
934	{ .name	    = "close",	    .errmsg = true,
935	  .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
936	{ .name	    = "connect",    .errmsg = true, },
937	{ .name	    = "dup",	    .errmsg = true,
938	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
939	{ .name	    = "dup2",	    .errmsg = true,
940	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
941	{ .name	    = "dup3",	    .errmsg = true,
942	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
943	{ .name	    = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
944	{ .name	    = "eventfd2",   .errmsg = true,
945	  .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
946	{ .name	    = "faccessat",  .errmsg = true,
947	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
948	{ .name	    = "fadvise64",  .errmsg = true,
949	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
950	{ .name	    = "fallocate",  .errmsg = true,
951	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
952	{ .name	    = "fchdir",	    .errmsg = true,
953	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
954	{ .name	    = "fchmod",	    .errmsg = true,
955	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
956	{ .name	    = "fchmodat",   .errmsg = true,
957	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
958	{ .name	    = "fchown",	    .errmsg = true,
959	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
960	{ .name	    = "fchownat",   .errmsg = true,
961	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
962	{ .name	    = "fcntl",	    .errmsg = true,
963	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
964			     [1] = SCA_STRARRAY, /* cmd */ },
965	  .arg_parm	 = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
966	{ .name	    = "fdatasync",  .errmsg = true,
967	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
968	{ .name	    = "flock",	    .errmsg = true,
969	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
970			     [1] = SCA_FLOCK, /* cmd */ }, },
971	{ .name	    = "fsetxattr",  .errmsg = true,
972	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
973	{ .name	    = "fstat",	    .errmsg = true, .alias = "newfstat",
974	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
975	{ .name	    = "fstatat",    .errmsg = true, .alias = "newfstatat",
976	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
977	{ .name	    = "fstatfs",    .errmsg = true,
978	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
979	{ .name	    = "fsync",    .errmsg = true,
980	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
981	{ .name	    = "ftruncate", .errmsg = true,
982	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
983	{ .name	    = "futex",	    .errmsg = true,
984	  .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
985	{ .name	    = "futimesat", .errmsg = true,
986	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
987	{ .name	    = "getdents",   .errmsg = true,
988	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
989	{ .name	    = "getdents64", .errmsg = true,
990	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
991	{ .name	    = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
992	{ .name	    = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
993	{ .name	    = "ioctl",	    .errmsg = true,
994	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
995#if defined(__i386__) || defined(__x86_64__)
996/*
997 * FIXME: Make this available to all arches.
998 */
999			     [1] = SCA_STRHEXARRAY, /* cmd */
1000			     [2] = SCA_HEX, /* arg */ },
1001	  .arg_parm	 = { [1] = &strarray__tioctls, /* cmd */ }, },
1002#else
1003			     [2] = SCA_HEX, /* arg */ }, },
1004#endif
1005	{ .name	    = "kill",	    .errmsg = true,
1006	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1007	{ .name	    = "linkat",	    .errmsg = true,
1008	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1009	{ .name	    = "lseek",	    .errmsg = true,
1010	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1011			     [2] = SCA_STRARRAY, /* whence */ },
1012	  .arg_parm	 = { [2] = &strarray__whences, /* whence */ }, },
1013	{ .name	    = "lstat",	    .errmsg = true, .alias = "newlstat", },
1014	{ .name     = "madvise",    .errmsg = true,
1015	  .arg_scnprintf = { [0] = SCA_HEX,	 /* start */
1016			     [2] = SCA_MADV_BHV, /* behavior */ }, },
1017	{ .name	    = "mkdirat",    .errmsg = true,
1018	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1019	{ .name	    = "mknodat",    .errmsg = true,
1020	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1021	{ .name	    = "mlock",	    .errmsg = true,
1022	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1023	{ .name	    = "mlockall",   .errmsg = true,
1024	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1025	{ .name	    = "mmap",	    .hexret = true,
1026	  .arg_scnprintf = { [0] = SCA_HEX,	  /* addr */
1027			     [2] = SCA_MMAP_PROT, /* prot */
1028			     [3] = SCA_MMAP_FLAGS, /* flags */
1029			     [4] = SCA_FD, 	  /* fd */ }, },
1030	{ .name	    = "mprotect",   .errmsg = true,
1031	  .arg_scnprintf = { [0] = SCA_HEX, /* start */
1032			     [2] = SCA_MMAP_PROT, /* prot */ }, },
1033	{ .name	    = "mremap",	    .hexret = true,
1034	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */
1035			     [3] = SCA_MREMAP_FLAGS, /* flags */
1036			     [4] = SCA_HEX, /* new_addr */ }, },
1037	{ .name	    = "munlock",    .errmsg = true,
1038	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1039	{ .name	    = "munmap",	    .errmsg = true,
1040	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1041	{ .name	    = "name_to_handle_at", .errmsg = true,
1042	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1043	{ .name	    = "newfstatat", .errmsg = true,
1044	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1045	{ .name	    = "open",	    .errmsg = true,
1046	  .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
1047	{ .name	    = "open_by_handle_at", .errmsg = true,
1048	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1049			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1050	{ .name	    = "openat",	    .errmsg = true,
1051	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1052			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1053	{ .name	    = "pipe2",	    .errmsg = true,
1054	  .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
1055	{ .name	    = "poll",	    .errmsg = true, .timeout = true, },
1056	{ .name	    = "ppoll",	    .errmsg = true, .timeout = true, },
1057	{ .name	    = "pread",	    .errmsg = true, .alias = "pread64",
1058	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1059	{ .name	    = "preadv",	    .errmsg = true, .alias = "pread",
1060	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1061	{ .name	    = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
1062	{ .name	    = "pwrite",	    .errmsg = true, .alias = "pwrite64",
1063	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1064	{ .name	    = "pwritev",    .errmsg = true,
1065	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1066	{ .name	    = "read",	    .errmsg = true,
1067	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1068	{ .name	    = "readlinkat", .errmsg = true,
1069	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1070	{ .name	    = "readv",	    .errmsg = true,
1071	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1072	{ .name	    = "recvfrom",   .errmsg = true,
1073	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1074	{ .name	    = "recvmmsg",   .errmsg = true,
1075	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1076	{ .name	    = "recvmsg",    .errmsg = true,
1077	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
1078	{ .name	    = "renameat",   .errmsg = true,
1079	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1080	{ .name	    = "rt_sigaction", .errmsg = true,
1081	  .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
1082	{ .name	    = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
1083	{ .name	    = "rt_sigqueueinfo", .errmsg = true,
1084	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1085	{ .name	    = "rt_tgsigqueueinfo", .errmsg = true,
1086	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1087	{ .name	    = "select",	    .errmsg = true, .timeout = true, },
1088	{ .name	    = "sendmmsg",    .errmsg = true,
1089	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1090	{ .name	    = "sendmsg",    .errmsg = true,
1091	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
1092	{ .name	    = "sendto",	    .errmsg = true,
1093	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1094	{ .name	    = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
1095	{ .name	    = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
1096	{ .name	    = "shutdown",   .errmsg = true,
1097	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1098	{ .name	    = "socket",	    .errmsg = true,
1099	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1100			     [1] = SCA_SK_TYPE, /* type */ },
1101	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
1102	{ .name	    = "socketpair", .errmsg = true,
1103	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1104			     [1] = SCA_SK_TYPE, /* type */ },
1105	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
1106	{ .name	    = "stat",	    .errmsg = true, .alias = "newstat", },
1107	{ .name	    = "symlinkat",  .errmsg = true,
1108	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1109	{ .name	    = "tgkill",	    .errmsg = true,
1110	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1111	{ .name	    = "tkill",	    .errmsg = true,
1112	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1113	{ .name	    = "uname",	    .errmsg = true, .alias = "newuname", },
1114	{ .name	    = "unlinkat",   .errmsg = true,
1115	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1116	{ .name	    = "utimensat",  .errmsg = true,
1117	  .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
1118	{ .name	    = "write",	    .errmsg = true,
1119	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1120	{ .name	    = "writev",	    .errmsg = true,
1121	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1122};
1123
1124static int syscall_fmt__cmp(const void *name, const void *fmtp)
1125{
1126	const struct syscall_fmt *fmt = fmtp;
1127	return strcmp(name, fmt->name);
1128}
1129
1130static struct syscall_fmt *syscall_fmt__find(const char *name)
1131{
1132	const int nmemb = ARRAY_SIZE(syscall_fmts);
1133	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
1134}
1135
/*
 * Per-syscall-id descriptor, stored in trace->syscalls.table[] and filled in
 * lazily by trace__read_syscall_info().
 */
struct syscall {
	/* format of the syscalls:sys_enter_<name> tracepoint, NULL if not found */
	struct event_format *tp_format;
	/* number of entries in @args (the leading "nr" field is not counted) */
	int		    nr_args;
	/* linked list of argument fields taken from @tp_format */
	struct format_field *args;
	/* syscall name as returned by audit_syscall_to_name() */
	const char	    *name;
	/* true when the event qualifier excludes this syscall */
	bool		    filtered;
	/* true for "exit" and "exit_group" */
	bool		    is_exit;
	/* optional formatting hints from syscall_fmts[], NULL when there are none */
	struct syscall_fmt  *fmt;
	/* per-argument formatter, one slot per argument; a NULL slot prints "%ld" */
	size_t		    (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	/* per-argument formatter parameter, borrowed from fmt->arg_parm */
	void		    **arg_parm;
};
1147
1148static size_t fprintf_duration(unsigned long t, FILE *fp)
1149{
1150	double duration = (double)t / NSEC_PER_MSEC;
1151	size_t printed = fprintf(fp, "(");
1152
1153	if (duration >= 1.0)
1154		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
1155	else if (duration >= 0.01)
1156		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
1157	else
1158		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
1159	return printed + fprintf(fp, "): ");
1160}
1161
/*
 * Per-thread tracing state, attached to a struct thread through
 * thread__set_priv() (see thread__trace()).
 */
struct thread_trace {
	/* sample time of the most recent sys_enter seen for this thread */
	u64		  entry_time;
	/* sample time of the most recent sys_exit seen for this thread */
	u64		  exit_time;
	/* true while a formatted sys_enter in @entry_str still awaits its exit */
	bool		  entry_pending;
	/* bumped on every thread__trace() lookup for this thread */
	unsigned long	  nr_events;
	/* major / minor page fault counters */
	unsigned long	  pfmaj, pfmin;
	/* 1024-byte buffer holding the formatted "name(args" of the last sys_enter */
	char		  *entry_str;
	/* accumulated runtime reported by sched_stat_runtime events, in ms */
	double		  runtime_ms;
	struct {
		/* highest fd index valid in @table, -1 while the table is empty */
		int	  max;
		/* fd -> strdup()ed pathname, NULL for unknown fds */
		char	  **table;
	} paths;

	/* syscall id -> struct stats (in int_node->priv), fed at sys_exit */
	struct intlist *syscall_stats;
};
1177
1178static struct thread_trace *thread_trace__new(void)
1179{
1180	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
1181
1182	if (ttrace)
1183		ttrace->paths.max = -1;
1184
1185	ttrace->syscall_stats = intlist__new(NULL);
1186
1187	return ttrace;
1188}
1189
1190static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
1191{
1192	struct thread_trace *ttrace;
1193
1194	if (thread == NULL)
1195		goto fail;
1196
1197	if (thread__priv(thread) == NULL)
1198		thread__set_priv(thread, thread_trace__new());
1199
1200	if (thread__priv(thread) == NULL)
1201		goto fail;
1202
1203	ttrace = thread__priv(thread);
1204	++ttrace->nr_events;
1205
1206	return ttrace;
1207fail:
1208	color_fprintf(fp, PERF_COLOR_RED,
1209		      "WARNING: not enough memory, dropping samples!\n");
1210	return NULL;
1211}
1212
/* Bit flags for trace->trace_pgfaults: which page fault kinds to trace. */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)
1215
/* Global state of one 'perf trace' session. */
struct trace {
	/* embedded tool; callbacks recover the struct trace via container_of() */
	struct perf_tool	tool;
	struct {
		/* machine type handed to audit_syscall_to_name() */
		int		machine;
		/* syscall id whose non-negative return value is recorded as an fd path */
		int		open_id;
	}			audit;
	struct {
		/* highest syscall id valid in @table, -1 while the table is empty */
		int		max;
		/* syscall id -> descriptor, grown on demand */
		struct syscall  *table;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	/* machine used to look up / create threads for incoming samples */
	struct machine		*host;
	/* thread of the last processed sys_enter (a reference is held) */
	struct thread		*current;
	/* time of the first sample, subtracted from printed timestamps */
	u64			base_time;
	/* stream all trace output is written to */
	FILE			*output;
	/* number of samples dispatched to a handler */
	unsigned long		nr_events;
	/* syscall names selected by the user, NULL for "all" */
	struct strlist		*ev_qualifier;
	/* pathname captured by the last vfs_getname probe hit, if any */
	const char 		*last_vfs_getname;
	/* when set, only samples matching one of these lists are processed */
	struct intlist		*tid_list;
	struct intlist		*pid_list;
	struct {
		size_t		nr;
		pid_t		*entries;
	}			filter_pids;
	/* in ms; syscalls shorter than this are not printed (0 disables) */
	double			duration_filter;
	/* total runtime accumulated from sched_stat_runtime events, in ms */
	double			runtime_ms;
	struct {
		/* fd paths learned from vfs_getname / by reading /proc */
		u64		vfs_getname,
				proc_getname;
	} stats;
	/* true when @ev_qualifier lists syscalls to exclude rather than include */
	bool			not_ev_qualifier;
	/* when false, fd paths are never resolved by reading /proc */
	bool			live;
	/* print absolute timestamps instead of offsets from @base_time */
	bool			full_time;
	bool			sched;
	/* prefix each line with the tid (and comm, see @show_comm) */
	bool			multiple_threads;
	/* collect per-thread syscall statistics at sys_exit */
	bool			summary;
	/* suppress the per-event output */
	bool			summary_only;
	bool			show_comm;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			force;
	/* bitmask of TRACE_PFMAJ / TRACE_PFMIN */
	int			trace_pgfaults;
};
1260
1261static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
1262{
1263	struct thread_trace *ttrace = thread__priv(thread);
1264
1265	if (fd > ttrace->paths.max) {
1266		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
1267
1268		if (npath == NULL)
1269			return -1;
1270
1271		if (ttrace->paths.max != -1) {
1272			memset(npath + ttrace->paths.max + 1, 0,
1273			       (fd - ttrace->paths.max) * sizeof(char *));
1274		} else {
1275			memset(npath, 0, (fd + 1) * sizeof(char *));
1276		}
1277
1278		ttrace->paths.table = npath;
1279		ttrace->paths.max   = fd;
1280	}
1281
1282	ttrace->paths.table[fd] = strdup(pathname);
1283
1284	return ttrace->paths.table[fd] != NULL ? 0 : -1;
1285}
1286
1287static int thread__read_fd_path(struct thread *thread, int fd)
1288{
1289	char linkname[PATH_MAX], pathname[PATH_MAX];
1290	struct stat st;
1291	int ret;
1292
1293	if (thread->pid_ == thread->tid) {
1294		scnprintf(linkname, sizeof(linkname),
1295			  "/proc/%d/fd/%d", thread->pid_, fd);
1296	} else {
1297		scnprintf(linkname, sizeof(linkname),
1298			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
1299	}
1300
1301	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1302		return -1;
1303
1304	ret = readlink(linkname, pathname, sizeof(pathname));
1305
1306	if (ret < 0 || ret > st.st_size)
1307		return -1;
1308
1309	pathname[ret] = '\0';
1310	return trace__set_fd_pathname(thread, fd, pathname);
1311}
1312
1313static const char *thread__fd_path(struct thread *thread, int fd,
1314				   struct trace *trace)
1315{
1316	struct thread_trace *ttrace = thread__priv(thread);
1317
1318	if (ttrace == NULL)
1319		return NULL;
1320
1321	if (fd < 0)
1322		return NULL;
1323
1324	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
1325		if (!trace->live)
1326			return NULL;
1327		++trace->stats.proc_getname;
1328		if (thread__read_fd_path(thread, fd))
1329			return NULL;
1330	}
1331
1332	return ttrace->paths.table[fd];
1333}
1334
1335static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
1336					struct syscall_arg *arg)
1337{
1338	int fd = arg->val;
1339	size_t printed = scnprintf(bf, size, "%d", fd);
1340	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1341
1342	if (path)
1343		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1344
1345	return printed;
1346}
1347
1348static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1349					      struct syscall_arg *arg)
1350{
1351	int fd = arg->val;
1352	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1353	struct thread_trace *ttrace = thread__priv(arg->thread);
1354
1355	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1356		zfree(&ttrace->paths.table[fd]);
1357
1358	return printed;
1359}
1360
1361static bool trace__filter_duration(struct trace *trace, double t)
1362{
1363	return t < (trace->duration_filter * NSEC_PER_MSEC);
1364}
1365
1366static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1367{
1368	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1369
1370	return fprintf(fp, "%10.3f ", ts);
1371}
1372
/*
 * Flags written by sig_handler().
 *
 * NOTE(review): these are plain bools written from a signal handler;
 * 'volatile sig_atomic_t' would be the strictly conforming type. How they
 * are polled is not visible in this part of the file.
 */
static bool done = false;
static bool interrupted = false;

/* Signal handler: request termination and remember whether it was SIGINT. */
static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
1381
1382static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1383					u64 duration, u64 tstamp, FILE *fp)
1384{
1385	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1386	printed += fprintf_duration(duration, fp);
1387
1388	if (trace->multiple_threads) {
1389		if (trace->show_comm)
1390			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1391		printed += fprintf(fp, "%d ", thread->tid);
1392	}
1393
1394	return printed;
1395}
1396
1397static int trace__process_event(struct trace *trace, struct machine *machine,
1398				union perf_event *event, struct perf_sample *sample)
1399{
1400	int ret = 0;
1401
1402	switch (event->header.type) {
1403	case PERF_RECORD_LOST:
1404		color_fprintf(trace->output, PERF_COLOR_RED,
1405			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1406		ret = machine__process_lost_event(machine, event, sample);
1407	default:
1408		ret = machine__process_event(machine, event, sample);
1409		break;
1410	}
1411
1412	return ret;
1413}
1414
1415static int trace__tool_process(struct perf_tool *tool,
1416			       union perf_event *event,
1417			       struct perf_sample *sample,
1418			       struct machine *machine)
1419{
1420	struct trace *trace = container_of(tool, struct trace, tool);
1421	return trace__process_event(trace, machine, event, sample);
1422}
1423
1424static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1425{
1426	int err = symbol__init(NULL);
1427
1428	if (err)
1429		return err;
1430
1431	trace->host = machine__new_host();
1432	if (trace->host == NULL)
1433		return -ENOMEM;
1434
1435	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1436					    evlist->threads, trace__tool_process, false);
1437	if (err)
1438		symbol__exit();
1439
1440	return err;
1441}
1442
1443static int syscall__set_arg_fmts(struct syscall *sc)
1444{
1445	struct format_field *field;
1446	int idx = 0;
1447
1448	sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1449	if (sc->arg_scnprintf == NULL)
1450		return -1;
1451
1452	if (sc->fmt)
1453		sc->arg_parm = sc->fmt->arg_parm;
1454
1455	for (field = sc->args; field; field = field->next) {
1456		if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1457			sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1458		else if (field->flags & FIELD_IS_POINTER)
1459			sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1460		++idx;
1461	}
1462
1463	return 0;
1464}
1465
/*
 * Fill in trace->syscalls.table[id] for syscall number @id.
 *
 * Resolves the syscall name, grows the table if needed, applies the event
 * qualifier and, for syscalls that are not filtered out, looks up the
 * sys_enter_<name> tracepoint format and sets up the argument formatters.
 *
 * Returns 0 on success (including the "filtered" case) and -1 when the
 * name is unknown, memory runs out or no tracepoint format is found.
 * Note that on the "no tracepoint format" failure the entry keeps its
 * ->name, with ->tp_format and ->args left NULL.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = audit_syscall_to_name(id, trace->audit.machine);

	if (name == NULL)
		return -1;

	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		/* zero only the slots that realloc() just added */
		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			/* first allocation: the whole table is new */
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	if (trace->ev_qualifier) {
		bool in = strlist__find(trace->ev_qualifier, name) != NULL;

		/* filtered when (listed) == (list is an exclusion list) */
		if (!(in ^ trace->not_ev_qualifier)) {
			sc->filtered = true;
			/*
			 * No need to read the tracepoint information since
			 * this syscall will be filtered out.
			 */
			return 0;
		}
	}

	sc->fmt  = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* retry with the alias from the format hints, if there is one */
	if (sc->tp_format == NULL && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	if (sc->tp_format == NULL)
		return -1;

	sc->args = sc->tp_format->format.fields;
	sc->nr_args = sc->tp_format->format.nr_fields;
	/* drop nr field - not relevant here; does not exist on older kernels */
	if (sc->args && strcmp(sc->args->name, "nr") == 0) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
}
1533
1534/*
1535 * args is to be interpreted as a series of longs but we need to handle
1536 * 8-byte unaligned accesses. args points to raw_data within the event
1537 * and raw_data is guaranteed to be 8-byte unaligned because it is
1538 * preceded by raw_size which is a u32. So we need to copy args to a temp
1539 * variable to read it. Most notably this avoids extended load instructions
1540 * on unaligned addresses
1541 */
1542
/*
 * Format the arguments of one syscall entry into @bf as a comma separated
 * list.
 *
 * With tracepoint field information (sc->args) each argument is printed as
 * "name: value" using its per-argument formatter, or "%ld" when it has
 * none. Without it the six raw argument slots are printed as "argN: value".
 *
 * @args points to the raw argument area of the sample; it is read one
 * unsigned long at a time through memcpy().
 *
 * Returns the number of characters written to @bf.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned char *p;
	unsigned long val;

	if (sc->args != NULL) {
		struct format_field *field;
		/* bit N of arg.mask corresponds to argument index N */
		u8 bit = 1;
		struct syscall_arg arg = {
			.idx	= 0,
			.mask	= 0,
			.trace  = trace,
			.thread = thread,
		};

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			/*
			 * Skip arguments whose mask bit is set; presumably an
			 * earlier formatter sets it to hide a later argument.
			 */
			if (arg.mask & bit)
				continue;

			/* special care for unaligned accesses */
			p = args + sizeof(unsigned long) * arg.idx;
			memcpy(&val, p, sizeof(val));

			/*
			 * Suppress this argument if its value is zero and
			 * we don't have a string associated in a
			 * strarray for it.
			 */
			if (val == 0 &&
			    !(sc->arg_scnprintf &&
			      sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
			      sc->arg_parm[arg.idx]))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
				arg.val = val;
				if (sc->arg_parm)
					arg.parm = sc->arg_parm[arg.idx];
				printed += sc->arg_scnprintf[arg.idx](bf + printed,
								      size - printed, &arg);
			} else {
				printed += scnprintf(bf + printed, size - printed,
						     "%ld", val);
			}
		}
	} else {
		int i = 0;

		/* no field information: dump all six argument slots */
		while (i < 6) {
			/* special care for unaligned accesses */
			p = args + sizeof(unsigned long) * i;
			memcpy(&val, p, sizeof(val));
			printed += scnprintf(bf + printed, size - printed,
					     "%sarg%d: %ld",
					     printed ? ", " : "", i, val);
			++i;
		}
	}

	return printed;
}
1610
/*
 * Signature of the per-event handlers stored in evsel->handler and invoked
 * by trace__process_sample().
 */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1614
1615static struct syscall *trace__syscall_info(struct trace *trace,
1616					   struct perf_evsel *evsel, int id)
1617{
1618
1619	if (id < 0) {
1620
1621		/*
1622		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1623		 * before that, leaving at a higher verbosity level till that is
1624		 * explained. Reproduced with plain ftrace with:
1625		 *
1626		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1627		 * grep "NR -1 " /t/trace_pipe
1628		 *
1629		 * After generating some load on the machine.
1630 		 */
1631		if (verbose > 1) {
1632			static u64 n;
1633			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1634				id, perf_evsel__name(evsel), ++n);
1635		}
1636		return NULL;
1637	}
1638
1639	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1640	    trace__read_syscall_info(trace, id))
1641		goto out_cant_read;
1642
1643	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1644		goto out_cant_read;
1645
1646	return &trace->syscalls.table[id];
1647
1648out_cant_read:
1649	if (verbose) {
1650		fprintf(trace->output, "Problems reading syscall %d", id);
1651		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1652			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1653		fputs(" information\n", trace->output);
1654	}
1655	return NULL;
1656}
1657
1658static void thread__update_stats(struct thread_trace *ttrace,
1659				 int id, struct perf_sample *sample)
1660{
1661	struct int_node *inode;
1662	struct stats *stats;
1663	u64 duration = 0;
1664
1665	inode = intlist__findnew(ttrace->syscall_stats, id);
1666	if (inode == NULL)
1667		return;
1668
1669	stats = inode->priv;
1670	if (stats == NULL) {
1671		stats = malloc(sizeof(struct stats));
1672		if (stats == NULL)
1673			return;
1674		init_stats(stats);
1675		inode->priv = stats;
1676	}
1677
1678	if (ttrace->entry_time && sample->time > ttrace->entry_time)
1679		duration = sample->time - ttrace->entry_time;
1680
1681	update_stats(stats, duration);
1682}
1683
1684static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1685{
1686	struct thread_trace *ttrace;
1687	u64 duration;
1688	size_t printed;
1689
1690	if (trace->current == NULL)
1691		return 0;
1692
1693	ttrace = thread__priv(trace->current);
1694
1695	if (!ttrace->entry_pending)
1696		return 0;
1697
1698	duration = sample->time - ttrace->entry_time;
1699
1700	printed  = trace__fprintf_entry_head(trace, trace->current, duration, sample->time, trace->output);
1701	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1702	ttrace->entry_pending = false;
1703
1704	return printed;
1705}
1706
/*
 * Handler for the sys_enter tracepoint.
 *
 * Formats "name(args" into the thread's entry_str buffer and marks the
 * entry as pending, so that trace__sys_exit() can complete the line.
 * Syscalls that never return (exit, exit_group) are printed right away.
 *
 * Returns 0 on success or when the syscall is filtered, -1 when the
 * syscall or thread state is unavailable or memory runs out.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample);
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	if (sc->filtered)
		return 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		return -1;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* the entry buffer is allocated once per thread and then reused */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(1024);
		if (!ttrace->entry_str)
			return -1;
	}

	/* flush a still pending entry of the current thread first */
	if (!trace->summary_only)
		trace__printf_interrupted_entry(trace, sample);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, 1024 - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, 1024 - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		/* there will be no sys_exit event to complete this line */
		if (!trace->duration_filter && !trace->summary_only) {
			trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
			fprintf(trace->output, "%-70s\n", ttrace->entry_str);
		}
	} else
		ttrace->entry_pending = true;

	/* remember this thread as the current one, swapping references */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}

	return 0;
}
1763
/*
 * Handler for the sys_exit tracepoint.
 *
 * Updates the per-thread statistics (in summary mode), records the path
 * of a freshly opened fd when a vfs_getname pathname is available, and
 * prints the complete syscall line with its return value unless it is
 * suppressed by the duration filter or summary-only mode.
 *
 * Returns 0 on success or when the syscall is filtered, -1 when the
 * syscall or thread state is unavailable.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample);
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	if (sc->filtered)
		return 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		return -1;

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/* a successful open: bind the last seen pathname to the returned fd */
	if (id == trace->audit.open_id && ret >= 0 && trace->last_vfs_getname) {
		trace__set_fd_pathname(thread, ret, trace->last_vfs_getname);
		trace->last_vfs_getname = NULL;
		++trace->stats.vfs_getname;
	}

	ttrace->exit_time = sample->time;

	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
	} else if (trace->duration_filter)
		/* no entry time, so the duration cannot be checked against the filter */
		goto out;

	if (trace->summary_only)
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* the entry was already printed (or never seen): mark as continued */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	if (sc->fmt == NULL) {
signed_print:
		/* default: plain signed return value */
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0 && sc->fmt->errmsg) {
		char bf[STRERR_BUFSIZE];
		const char *emsg = strerror_r(-ret, bf, sizeof(bf)),
			   *e = audit_errno_to_name(-ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else
		goto signed_print;

	fputc('\n', trace->output);
out:
	ttrace->entry_pending = false;

	return 0;
}
1841
1842static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1843			      union perf_event *event __maybe_unused,
1844			      struct perf_sample *sample)
1845{
1846	trace->last_vfs_getname = perf_evsel__rawptr(evsel, sample, "pathname");
1847	return 0;
1848}
1849
1850static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1851				     union perf_event *event __maybe_unused,
1852				     struct perf_sample *sample)
1853{
1854        u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1855	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1856	struct thread *thread = machine__findnew_thread(trace->host,
1857							sample->pid,
1858							sample->tid);
1859	struct thread_trace *ttrace = thread__trace(thread, trace->output);
1860
1861	if (ttrace == NULL)
1862		goto out_dump;
1863
1864	ttrace->runtime_ms += runtime_ms;
1865	trace->runtime_ms += runtime_ms;
1866	return 0;
1867
1868out_dump:
1869	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1870	       evsel->name,
1871	       perf_evsel__strval(evsel, sample, "comm"),
1872	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1873	       runtime,
1874	       perf_evsel__intval(evsel, sample, "vruntime"));
1875	return 0;
1876}
1877
1878static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1879				union perf_event *event __maybe_unused,
1880				struct perf_sample *sample)
1881{
1882	trace__printf_interrupted_entry(trace, sample);
1883	trace__fprintf_tstamp(trace, sample->time, trace->output);
1884
1885	if (trace->trace_syscalls)
1886		fprintf(trace->output, "(         ): ");
1887
1888	fprintf(trace->output, "%s:", evsel->name);
1889
1890	if (evsel->tp_format) {
1891		event_format__fprintf(evsel->tp_format, sample->cpu,
1892				      sample->raw_data, sample->raw_size,
1893				      trace->output);
1894	}
1895
1896	fprintf(trace->output, ")\n");
1897	return 0;
1898}
1899
1900static void print_location(FILE *f, struct perf_sample *sample,
1901			   struct addr_location *al,
1902			   bool print_dso, bool print_sym)
1903{
1904
1905	if ((verbose || print_dso) && al->map)
1906		fprintf(f, "%s@", al->map->dso->long_name);
1907
1908	if ((verbose || print_sym) && al->sym)
1909		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1910			al->addr - al->sym->start);
1911	else if (al->map)
1912		fprintf(f, "0x%" PRIx64, al->addr);
1913	else
1914		fprintf(f, "0x%" PRIx64, sample->addr);
1915}
1916
/*
 * Handler for page fault software events.
 *
 * Counts the fault as major or minor for the thread and, unless in
 * summary-only mode, prints a line of the form
 *   <head>majfault|minfault [<code location>] => <data location> (<type><level>)
 *
 * Returns 0 on success, -1 when the thread state is unavailable.
 */
static int trace__pgfault(struct trace *trace,
			  struct perf_evsel *evsel,
			  union perf_event *event,
			  struct perf_sample *sample)
{
	struct thread *thread;
	u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
	struct addr_location al;
	/* 'd': data map, 'x': executable map, '?': no map found */
	char map_type = 'd';
	struct thread_trace *ttrace;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		return -1;

	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	if (trace->summary_only)
		return 0;

	/* first location: the instruction that faulted */
	thread__find_addr_location(thread, cpumode, MAP__FUNCTION,
			      sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	/* second location: the faulting address, tried as data first */
	thread__find_addr_location(thread, cpumode, MAP__VARIABLE,
				   sample->addr, &al);

	if (!al.map) {
		/* not in a data map: retry against the function maps */
		thread__find_addr_location(thread, cpumode,
					   MAP__FUNCTION, sample->addr, &al);

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	return 0;
}
1973
1974static bool skip_sample(struct trace *trace, struct perf_sample *sample)
1975{
1976	if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
1977	    (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
1978		return false;
1979
1980	if (trace->pid_list || trace->tid_list)
1981		return true;
1982
1983	return false;
1984}
1985
1986static int trace__process_sample(struct perf_tool *tool,
1987				 union perf_event *event,
1988				 struct perf_sample *sample,
1989				 struct perf_evsel *evsel,
1990				 struct machine *machine __maybe_unused)
1991{
1992	struct trace *trace = container_of(tool, struct trace, tool);
1993	int err = 0;
1994
1995	tracepoint_handler handler = evsel->handler;
1996
1997	if (skip_sample(trace, sample))
1998		return 0;
1999
2000	if (!trace->full_time && trace->base_time == 0)
2001		trace->base_time = sample->time;
2002
2003	if (handler) {
2004		++trace->nr_events;
2005		handler(trace, evsel, event, sample);
2006	}
2007
2008	return err;
2009}
2010
2011static int parse_target_str(struct trace *trace)
2012{
2013	if (trace->opts.target.pid) {
2014		trace->pid_list = intlist__new(trace->opts.target.pid);
2015		if (trace->pid_list == NULL) {
2016			pr_err("Error parsing process id string\n");
2017			return -EINVAL;
2018		}
2019	}
2020
2021	if (trace->opts.target.tid) {
2022		trace->tid_list = intlist__new(trace->opts.target.tid);
2023		if (trace->tid_list == NULL) {
2024			pr_err("Error parsing thread id string\n");
2025			return -EINVAL;
2026		}
2027	}
2028
2029	return 0;
2030}
2031
2032static int trace__record(struct trace *trace, int argc, const char **argv)
2033{
2034	unsigned int rec_argc, i, j;
2035	const char **rec_argv;
2036	const char * const record_args[] = {
2037		"record",
2038		"-R",
2039		"-m", "1024",
2040		"-c", "1",
2041	};
2042
2043	const char * const sc_args[] = { "-e", };
2044	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2045	const char * const majpf_args[] = { "-e", "major-faults" };
2046	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2047	const char * const minpf_args[] = { "-e", "minor-faults" };
2048	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2049
2050	/* +1 is for the event string below */
2051	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2052		majpf_args_nr + minpf_args_nr + argc;
2053	rec_argv = calloc(rec_argc + 1, sizeof(char *));
2054
2055	if (rec_argv == NULL)
2056		return -ENOMEM;
2057
2058	j = 0;
2059	for (i = 0; i < ARRAY_SIZE(record_args); i++)
2060		rec_argv[j++] = record_args[i];
2061
2062	if (trace->trace_syscalls) {
2063		for (i = 0; i < sc_args_nr; i++)
2064			rec_argv[j++] = sc_args[i];
2065
2066		/* event string may be different for older kernels - e.g., RHEL6 */
2067		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2068			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2069		else if (is_valid_tracepoint("syscalls:sys_enter"))
2070			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2071		else {
2072			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2073			return -1;
2074		}
2075	}
2076
2077	if (trace->trace_pgfaults & TRACE_PFMAJ)
2078		for (i = 0; i < majpf_args_nr; i++)
2079			rec_argv[j++] = majpf_args[i];
2080
2081	if (trace->trace_pgfaults & TRACE_PFMIN)
2082		for (i = 0; i < minpf_args_nr; i++)
2083			rec_argv[j++] = minpf_args[i];
2084
2085	for (i = 0; i < (unsigned int)argc; i++)
2086		rec_argv[j++] = argv[i];
2087
2088	return cmd_record(j, rec_argv, NULL);
2089}
2090
2091static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2092
2093static void perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2094{
2095	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2096	if (evsel == NULL)
2097		return;
2098
2099	if (perf_evsel__field(evsel, "pathname") == NULL) {
2100		perf_evsel__delete(evsel);
2101		return;
2102	}
2103
2104	evsel->handler = trace__vfs_getname;
2105	perf_evlist__add(evlist, evsel);
2106}
2107
2108static int perf_evlist__add_pgfault(struct perf_evlist *evlist,
2109				    u64 config)
2110{
2111	struct perf_evsel *evsel;
2112	struct perf_event_attr attr = {
2113		.type = PERF_TYPE_SOFTWARE,
2114		.mmap_data = 1,
2115	};
2116
2117	attr.config = config;
2118	attr.sample_period = 1;
2119
2120	event_attr_init(&attr);
2121
2122	evsel = perf_evsel__new(&attr);
2123	if (!evsel)
2124		return -ENOMEM;
2125
2126	evsel->handler = trace__pgfault;
2127	perf_evlist__add(evlist, evsel);
2128
2129	return 0;
2130}
2131
2132static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2133{
2134	const u32 type = event->header.type;
2135	struct perf_evsel *evsel;
2136
2137	if (!trace->full_time && trace->base_time == 0)
2138		trace->base_time = sample->time;
2139
2140	if (type != PERF_RECORD_SAMPLE) {
2141		trace__process_event(trace, trace->host, event, sample);
2142		return;
2143	}
2144
2145	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2146	if (evsel == NULL) {
2147		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2148		return;
2149	}
2150
2151	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2152	    sample->raw_data == NULL) {
2153		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2154		       perf_evsel__name(evsel), sample->tid,
2155		       sample->cpu, sample->raw_size);
2156	} else {
2157		tracepoint_handler handler = evsel->handler;
2158		handler(trace, evsel, event, sample);
2159	}
2160}
2161
2162static int trace__run(struct trace *trace, int argc, const char **argv)
2163{
2164	struct perf_evlist *evlist = trace->evlist;
2165	int err = -1, i;
2166	unsigned long before;
2167	const bool forks = argc > 0;
2168	bool draining = false;
2169
2170	trace->live = true;
2171
2172	if (trace->trace_syscalls &&
2173	    perf_evlist__add_syscall_newtp(evlist, trace__sys_enter,
2174					   trace__sys_exit))
2175		goto out_error_raw_syscalls;
2176
2177	if (trace->trace_syscalls)
2178		perf_evlist__add_vfs_getname(evlist);
2179
2180	if ((trace->trace_pgfaults & TRACE_PFMAJ) &&
2181	    perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MAJ)) {
2182		goto out_error_mem;
2183	}
2184
2185	if ((trace->trace_pgfaults & TRACE_PFMIN) &&
2186	    perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MIN))
2187		goto out_error_mem;
2188
2189	if (trace->sched &&
2190	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2191				   trace__sched_stat_runtime))
2192		goto out_error_sched_stat_runtime;
2193
2194	err = perf_evlist__create_maps(evlist, &trace->opts.target);
2195	if (err < 0) {
2196		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2197		goto out_delete_evlist;
2198	}
2199
2200	err = trace__symbols_init(trace, evlist);
2201	if (err < 0) {
2202		fprintf(trace->output, "Problems initializing symbol libraries!\n");
2203		goto out_delete_evlist;
2204	}
2205
2206	perf_evlist__config(evlist, &trace->opts);
2207
2208	signal(SIGCHLD, sig_handler);
2209	signal(SIGINT, sig_handler);
2210
2211	if (forks) {
2212		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2213						    argv, false, NULL);
2214		if (err < 0) {
2215			fprintf(trace->output, "Couldn't run the workload!\n");
2216			goto out_delete_evlist;
2217		}
2218	}
2219
2220	err = perf_evlist__open(evlist);
2221	if (err < 0)
2222		goto out_error_open;
2223
2224	/*
2225	 * Better not use !target__has_task() here because we need to cover the
2226	 * case where no threads were specified in the command line, but a
2227	 * workload was, and in that case we will fill in the thread_map when
2228	 * we fork the workload in perf_evlist__prepare_workload.
2229	 */
2230	if (trace->filter_pids.nr > 0)
2231		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2232	else if (evlist->threads->map[0] == -1)
2233		err = perf_evlist__set_filter_pid(evlist, getpid());
2234
2235	if (err < 0) {
2236		printf("err=%d,%s\n", -err, strerror(-err));
2237		exit(1);
2238	}
2239
2240	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2241	if (err < 0)
2242		goto out_error_mmap;
2243
2244	if (!target__none(&trace->opts.target))
2245		perf_evlist__enable(evlist);
2246
2247	if (forks)
2248		perf_evlist__start_workload(evlist);
2249
2250	trace->multiple_threads = evlist->threads->map[0] == -1 ||
2251				  evlist->threads->nr > 1 ||
2252				  perf_evlist__first(evlist)->attr.inherit;
2253again:
2254	before = trace->nr_events;
2255
2256	for (i = 0; i < evlist->nr_mmaps; i++) {
2257		union perf_event *event;
2258
2259		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2260			struct perf_sample sample;
2261
2262			++trace->nr_events;
2263
2264			err = perf_evlist__parse_sample(evlist, event, &sample);
2265			if (err) {
2266				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2267				goto next_event;
2268			}
2269
2270			trace__handle_event(trace, event, &sample);
2271next_event:
2272			perf_evlist__mmap_consume(evlist, i);
2273
2274			if (interrupted)
2275				goto out_disable;
2276
2277			if (done && !draining) {
2278				perf_evlist__disable(evlist);
2279				draining = true;
2280			}
2281		}
2282	}
2283
2284	if (trace->nr_events == before) {
2285		int timeout = done ? 100 : -1;
2286
2287		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2288			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2289				draining = true;
2290
2291			goto again;
2292		}
2293	} else {
2294		goto again;
2295	}
2296
2297out_disable:
2298	thread__zput(trace->current);
2299
2300	perf_evlist__disable(evlist);
2301
2302	if (!err) {
2303		if (trace->summary)
2304			trace__fprintf_thread_summary(trace, trace->output);
2305
2306		if (trace->show_tool_stats) {
2307			fprintf(trace->output, "Stats:\n "
2308					       " vfs_getname : %" PRIu64 "\n"
2309					       " proc_getname: %" PRIu64 "\n",
2310				trace->stats.vfs_getname,
2311				trace->stats.proc_getname);
2312		}
2313	}
2314
2315out_delete_evlist:
2316	perf_evlist__delete(evlist);
2317	trace->evlist = NULL;
2318	trace->live = false;
2319	return err;
2320{
2321	char errbuf[BUFSIZ];
2322
2323out_error_sched_stat_runtime:
2324	debugfs__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2325	goto out_error;
2326
2327out_error_raw_syscalls:
2328	debugfs__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2329	goto out_error;
2330
2331out_error_mmap:
2332	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2333	goto out_error;
2334
2335out_error_open:
2336	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2337
2338out_error:
2339	fprintf(trace->output, "%s\n", errbuf);
2340	goto out_delete_evlist;
2341}
2342out_error_mem:
2343	fprintf(trace->output, "Not enough memory to run!\n");
2344	goto out_delete_evlist;
2345}
2346
2347static int trace__replay(struct trace *trace)
2348{
2349	const struct perf_evsel_str_handler handlers[] = {
2350		{ "probe:vfs_getname",	     trace__vfs_getname, },
2351	};
2352	struct perf_data_file file = {
2353		.path  = input_name,
2354		.mode  = PERF_DATA_MODE_READ,
2355		.force = trace->force,
2356	};
2357	struct perf_session *session;
2358	struct perf_evsel *evsel;
2359	int err = -1;
2360
2361	trace->tool.sample	  = trace__process_sample;
2362	trace->tool.mmap	  = perf_event__process_mmap;
2363	trace->tool.mmap2	  = perf_event__process_mmap2;
2364	trace->tool.comm	  = perf_event__process_comm;
2365	trace->tool.exit	  = perf_event__process_exit;
2366	trace->tool.fork	  = perf_event__process_fork;
2367	trace->tool.attr	  = perf_event__process_attr;
2368	trace->tool.tracing_data = perf_event__process_tracing_data;
2369	trace->tool.build_id	  = perf_event__process_build_id;
2370
2371	trace->tool.ordered_events = true;
2372	trace->tool.ordering_requires_timestamps = true;
2373
2374	/* add tid to output */
2375	trace->multiple_threads = true;
2376
2377	session = perf_session__new(&file, false, &trace->tool);
2378	if (session == NULL)
2379		return -1;
2380
2381	if (symbol__init(&session->header.env) < 0)
2382		goto out;
2383
2384	trace->host = &session->machines.host;
2385
2386	err = perf_session__set_tracepoints_handlers(session, handlers);
2387	if (err)
2388		goto out;
2389
2390	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2391						     "raw_syscalls:sys_enter");
2392	/* older kernels have syscalls tp versus raw_syscalls */
2393	if (evsel == NULL)
2394		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2395							     "syscalls:sys_enter");
2396
2397	if (evsel &&
2398	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2399	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2400		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2401		goto out;
2402	}
2403
2404	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2405						     "raw_syscalls:sys_exit");
2406	if (evsel == NULL)
2407		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2408							     "syscalls:sys_exit");
2409	if (evsel &&
2410	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2411	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2412		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2413		goto out;
2414	}
2415
2416	evlist__for_each(session->evlist, evsel) {
2417		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2418		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2419		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2420		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2421			evsel->handler = trace__pgfault;
2422	}
2423
2424	err = parse_target_str(trace);
2425	if (err != 0)
2426		goto out;
2427
2428	setup_pager();
2429
2430	err = perf_session__process_events(session);
2431	if (err)
2432		pr_err("Failed to process events, error %d", err);
2433
2434	else if (trace->summary)
2435		trace__fprintf_thread_summary(trace, trace->output);
2436
2437out:
2438	perf_session__delete(session);
2439
2440	return err;
2441}
2442
/*
 * Write the heading of the per-thread summary to @fp.
 *
 * Returns the fprintf() result converted to size_t.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2451
2452static size_t thread__dump_stats(struct thread_trace *ttrace,
2453				 struct trace *trace, FILE *fp)
2454{
2455	struct stats *stats;
2456	size_t printed = 0;
2457	struct syscall *sc;
2458	struct int_node *inode = intlist__first(ttrace->syscall_stats);
2459
2460	if (inode == NULL)
2461		return 0;
2462
2463	printed += fprintf(fp, "\n");
2464
2465	printed += fprintf(fp, "   syscall            calls      min       avg       max      stddev\n");
2466	printed += fprintf(fp, "                               (msec)    (msec)    (msec)        (%%)\n");
2467	printed += fprintf(fp, "   --------------- -------- --------- --------- ---------     ------\n");
2468
2469	/* each int_node is a syscall */
2470	while (inode) {
2471		stats = inode->priv;
2472		if (stats) {
2473			double min = (double)(stats->min) / NSEC_PER_MSEC;
2474			double max = (double)(stats->max) / NSEC_PER_MSEC;
2475			double avg = avg_stats(stats);
2476			double pct;
2477			u64 n = (u64) stats->n;
2478
2479			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2480			avg /= NSEC_PER_MSEC;
2481
2482			sc = &trace->syscalls.table[inode->i];
2483			printed += fprintf(fp, "   %-15s", sc->name);
2484			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f",
2485					   n, min, avg);
2486			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2487		}
2488
2489		inode = intlist__next(inode);
2490	}
2491
2492	printed += fprintf(fp, "\n\n");
2493
2494	return printed;
2495}
2496
/*
 * Context handed to the per-thread summary callback through its void *priv
 * argument.
 */
struct summary_data {
	FILE *fp;		/* stream the summary is written to */
	struct trace *trace;	/* tracing state the summary is about */
	size_t printed;		/* running total of fprintf() results */
};
2503
2504static int trace__fprintf_one_thread(struct thread *thread, void *priv)
2505{
2506	struct summary_data *data = priv;
2507	FILE *fp = data->fp;
2508	size_t printed = data->printed;
2509	struct trace *trace = data->trace;
2510	struct thread_trace *ttrace = thread__priv(thread);
2511	double ratio;
2512
2513	if (ttrace == NULL)
2514		return 0;
2515
2516	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2517
2518	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2519	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2520	printed += fprintf(fp, "%.1f%%", ratio);
2521	if (ttrace->pfmaj)
2522		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2523	if (ttrace->pfmin)
2524		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2525	printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2526	printed += thread__dump_stats(ttrace, trace, fp);
2527
2528	data->printed += printed;
2529
2530	return 0;
2531}
2532
2533static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2534{
2535	struct summary_data data = {
2536		.fp = fp,
2537		.trace = trace
2538	};
2539	data.printed = trace__fprintf_threads_header(fp);
2540
2541	machine__for_each_thread(trace->host, trace__fprintf_one_thread, &data);
2542
2543	return data.printed;
2544}
2545
2546static int trace__set_duration(const struct option *opt, const char *str,
2547			       int unset __maybe_unused)
2548{
2549	struct trace *trace = opt->value;
2550
2551	trace->duration_filter = atof(str);
2552	return 0;
2553}
2554
2555static int trace__set_filter_pids(const struct option *opt, const char *str,
2556				  int unset __maybe_unused)
2557{
2558	int ret = -1;
2559	size_t i;
2560	struct trace *trace = opt->value;
2561	/*
2562	 * FIXME: introduce a intarray class, plain parse csv and create a
2563	 * { int nr, int entries[] } struct...
2564	 */
2565	struct intlist *list = intlist__new(str);
2566
2567	if (list == NULL)
2568		return -1;
2569
2570	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2571	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2572
2573	if (trace->filter_pids.entries == NULL)
2574		goto out;
2575
2576	trace->filter_pids.entries[0] = getpid();
2577
2578	for (i = 1; i < trace->filter_pids.nr; ++i)
2579		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2580
2581	intlist__delete(list);
2582	ret = 0;
2583out:
2584	return ret;
2585}
2586
2587static int trace__open_output(struct trace *trace, const char *filename)
2588{
2589	struct stat st;
2590
2591	if (!stat(filename, &st) && st.st_size) {
2592		char oldname[PATH_MAX];
2593
2594		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2595		unlink(oldname);
2596		rename(filename, oldname);
2597	}
2598
2599	trace->output = fopen(filename, "w");
2600
2601	return trace->output == NULL ? -errno : 0;
2602}
2603
2604static int parse_pagefaults(const struct option *opt, const char *str,
2605			    int unset __maybe_unused)
2606{
2607	int *trace_pgfaults = opt->value;
2608
2609	if (strcmp(str, "all") == 0)
2610		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2611	else if (strcmp(str, "maj") == 0)
2612		*trace_pgfaults |= TRACE_PFMAJ;
2613	else if (strcmp(str, "min") == 0)
2614		*trace_pgfaults |= TRACE_PFMIN;
2615	else
2616		return -1;
2617
2618	return 0;
2619}
2620
2621static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2622{
2623	struct perf_evsel *evsel;
2624
2625	evlist__for_each(evlist, evsel)
2626		evsel->handler = handler;
2627}
2628
2629int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
2630{
2631	const char *trace_usage[] = {
2632		"perf trace [<options>] [<command>]",
2633		"perf trace [<options>] -- <command> [<options>]",
2634		"perf trace record [<options>] [<command>]",
2635		"perf trace record [<options>] -- <command> [<options>]",
2636		NULL
2637	};
2638	struct trace trace = {
2639		.audit = {
2640			.machine = audit_detect_machine(),
2641			.open_id = audit_name_to_syscall("open", trace.audit.machine),
2642		},
2643		.syscalls = {
2644			. max = -1,
2645		},
2646		.opts = {
2647			.target = {
2648				.uid	   = UINT_MAX,
2649				.uses_mmap = true,
2650			},
2651			.user_freq     = UINT_MAX,
2652			.user_interval = ULLONG_MAX,
2653			.no_buffering  = true,
2654			.mmap_pages    = UINT_MAX,
2655		},
2656		.output = stdout,
2657		.show_comm = true,
2658		.trace_syscalls = true,
2659	};
2660	const char *output_name = NULL;
2661	const char *ev_qualifier_str = NULL;
2662	const struct option trace_options[] = {
2663	OPT_CALLBACK(0, "event", &trace.evlist, "event",
2664		     "event selector. use 'perf list' to list available events",
2665		     parse_events_option),
2666	OPT_BOOLEAN(0, "comm", &trace.show_comm,
2667		    "show the thread COMM next to its id"),
2668	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2669	OPT_STRING('e', "expr", &ev_qualifier_str, "expr",
2670		    "list of events to trace"),
2671	OPT_STRING('o', "output", &output_name, "file", "output file name"),
2672	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2673	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2674		    "trace events on existing process id"),
2675	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2676		    "trace events on existing thread id"),
2677	OPT_CALLBACK(0, "filter-pids", &trace, "float",
2678		     "show only events with duration > N.M ms", trace__set_filter_pids),
2679	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2680		    "system-wide collection from all CPUs"),
2681	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2682		    "list of cpus to monitor"),
2683	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2684		    "child tasks do not inherit counters"),
2685	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2686		     "number of mmap data pages",
2687		     perf_evlist__parse_mmap_pages),
2688	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2689		   "user to profile"),
2690	OPT_CALLBACK(0, "duration", &trace, "float",
2691		     "show only events with duration > N.M ms",
2692		     trace__set_duration),
2693	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2694	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2695	OPT_BOOLEAN('T', "time", &trace.full_time,
2696		    "Show full timestamp, not time relative to first start"),
2697	OPT_BOOLEAN('s', "summary", &trace.summary_only,
2698		    "Show only syscall summary with statistics"),
2699	OPT_BOOLEAN('S', "with-summary", &trace.summary,
2700		    "Show all syscalls and summary with statistics"),
2701	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
2702		     "Trace pagefaults", parse_pagefaults, "maj"),
2703	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
2704	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
2705	OPT_END()
2706	};
2707	const char * const trace_subcommands[] = { "record", NULL };
2708	int err;
2709	char bf[BUFSIZ];
2710
2711	signal(SIGSEGV, sighandler_dump_stack);
2712	signal(SIGFPE, sighandler_dump_stack);
2713
2714	trace.evlist = perf_evlist__new();
2715	if (trace.evlist == NULL)
2716		return -ENOMEM;
2717
2718	if (trace.evlist == NULL) {
2719		pr_err("Not enough memory to run!\n");
2720		goto out;
2721	}
2722
2723	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
2724				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
2725
2726	if (trace.trace_pgfaults) {
2727		trace.opts.sample_address = true;
2728		trace.opts.sample_time = true;
2729	}
2730
2731	if (trace.evlist->nr_entries > 0)
2732		evlist__set_evsel_handler(trace.evlist, trace__event_handler);
2733
2734	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
2735		return trace__record(&trace, argc-1, &argv[1]);
2736
2737	/* summary_only implies summary option, but don't overwrite summary if set */
2738	if (trace.summary_only)
2739		trace.summary = trace.summary_only;
2740
2741	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
2742	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
2743		pr_err("Please specify something to trace.\n");
2744		return -1;
2745	}
2746
2747	if (output_name != NULL) {
2748		err = trace__open_output(&trace, output_name);
2749		if (err < 0) {
2750			perror("failed to create output file");
2751			goto out;
2752		}
2753	}
2754
2755	if (ev_qualifier_str != NULL) {
2756		const char *s = ev_qualifier_str;
2757
2758		trace.not_ev_qualifier = *s == '!';
2759		if (trace.not_ev_qualifier)
2760			++s;
2761		trace.ev_qualifier = strlist__new(true, s);
2762		if (trace.ev_qualifier == NULL) {
2763			fputs("Not enough memory to parse event qualifier",
2764			      trace.output);
2765			err = -ENOMEM;
2766			goto out_close;
2767		}
2768	}
2769
2770	err = target__validate(&trace.opts.target);
2771	if (err) {
2772		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2773		fprintf(trace.output, "%s", bf);
2774		goto out_close;
2775	}
2776
2777	err = target__parse_uid(&trace.opts.target);
2778	if (err) {
2779		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2780		fprintf(trace.output, "%s", bf);
2781		goto out_close;
2782	}
2783
2784	if (!argc && target__none(&trace.opts.target))
2785		trace.opts.target.system_wide = true;
2786
2787	if (input_name)
2788		err = trace__replay(&trace);
2789	else
2790		err = trace__run(&trace, argc, argv);
2791
2792out_close:
2793	if (output_name != NULL)
2794		fclose(trace.output);
2795out:
2796	return err;
2797}
2798