From fc5a40a2301aa241eedb16caf9169ca5763707c1 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 9 Oct 2012 09:49:02 +0100
Subject: UAPI: (Scripted) Disintegrate include/linux/raid

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Michael Kerrisk <mtk.manpages@gmail.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Dave Jones <davej@redhat.com>
---
 include/uapi/linux/raid/Kbuild |   2 +
 include/uapi/linux/raid/md_p.h | 301 +++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/raid/md_u.h | 155 +++++++++++++++++++++
 3 files changed, 458 insertions(+)
 create mode 100644 include/uapi/linux/raid/md_p.h
 create mode 100644 include/uapi/linux/raid/md_u.h

(limited to 'include/uapi')

diff --git a/include/uapi/linux/raid/Kbuild b/include/uapi/linux/raid/Kbuild
index aafaa5aa54d4..e2c3d25405d7 100644
--- a/include/uapi/linux/raid/Kbuild
+++ b/include/uapi/linux/raid/Kbuild
@@ -1 +1,3 @@
 # UAPI Header export list
+header-y += md_p.h
+header-y += md_u.h
diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
new file mode 100644
index 000000000000..ee753536ab70
--- /dev/null
+++ b/include/uapi/linux/raid/md_p.h
@@ -0,0 +1,301 @@
+/*
+   md_p.h : physical layout of Linux RAID devices
+          Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
+	  
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+   
+   You should have received a copy of the GNU General Public License
+   (for example /usr/src/linux/COPYING); if not, write to the Free
+   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  
+*/
+
+#ifndef _MD_P_H
+#define _MD_P_H
+
+#include <linux/types.h>
+
+/*
+ * RAID superblock.
+ *
+ * The RAID superblock maintains some statistics on each RAID configuration.
+ * Each real device in the RAID set contains it near the end of the device.
+ * Some of the ideas are copied from the ext2fs implementation.
+ *
+ * We currently use 4096 bytes as follows:
+ *
+ *	word offset	function
+ *
+ *	   0  -    31	Constant generic RAID device information.
+ *        32  -    63   Generic state information.
+ *	  64  -   127	Personality specific information.
+ *	 128  -   511	12 32-words descriptors of the disks in the raid set.
+ *	 512  -   911	Reserved.
+ *	 912  -  1023	Disk specific descriptor.
+ */
+
+/*
+ * If x is the real device size in bytes, we return an apparent size of:
+ *
+ *	y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES
+ *
+ * and place the 4kB superblock at offset y.
+ */
+#define MD_RESERVED_BYTES		(64 * 1024)
+#define MD_RESERVED_SECTORS		(MD_RESERVED_BYTES / 512)
+
+#define MD_NEW_SIZE_SECTORS(x)		((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)
+
+#define MD_SB_BYTES			4096
+#define MD_SB_WORDS			(MD_SB_BYTES / 4)
+#define MD_SB_SECTORS			(MD_SB_BYTES / 512)
+
+/*
+ * The following are counted in 32-bit words
+ */
+#define	MD_SB_GENERIC_OFFSET		0
+#define MD_SB_PERSONALITY_OFFSET	64
+#define MD_SB_DISKS_OFFSET		128
+#define MD_SB_DESCRIPTOR_OFFSET		992
+
+#define MD_SB_GENERIC_CONSTANT_WORDS	32
+#define MD_SB_GENERIC_STATE_WORDS	32
+#define MD_SB_GENERIC_WORDS		(MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS)
+#define MD_SB_PERSONALITY_WORDS		64
+#define MD_SB_DESCRIPTOR_WORDS		32
+#define MD_SB_DISKS			27
+#define MD_SB_DISKS_WORDS		(MD_SB_DISKS*MD_SB_DESCRIPTOR_WORDS)
+#define MD_SB_RESERVED_WORDS		(1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS)
+#define MD_SB_EQUAL_WORDS		(MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS)
+
+/*
+ * Device "operational" state bits
+ */
+#define MD_DISK_FAULTY		0 /* disk is faulty / operational */
+#define MD_DISK_ACTIVE		1 /* disk is running or spare disk */
+#define MD_DISK_SYNC		2 /* disk is in sync with the raid set */
+#define MD_DISK_REMOVED		3 /* disk is in sync with the raid set */
+
+#define	MD_DISK_WRITEMOSTLY	9 /* disk is "write-mostly" is RAID1 config.
+				   * read requests will only be sent here in
+				   * dire need
+				   */
+
+typedef struct mdp_device_descriptor_s {
+	__u32 number;		/* 0 Device number in the entire set	      */
+	__u32 major;		/* 1 Device major number		      */
+	__u32 minor;		/* 2 Device minor number		      */
+	__u32 raid_disk;	/* 3 The role of the device in the raid set   */
+	__u32 state;		/* 4 Operational state			      */
+	__u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5];
+} mdp_disk_t;
+
+#define MD_SB_MAGIC		0xa92b4efc
+
+/*
+ * Superblock state bits
+ */
+#define MD_SB_CLEAN		0
+#define MD_SB_ERRORS		1
+
+#define	MD_SB_BITMAP_PRESENT	8 /* bitmap may be present nearby */
+
+/*
+ * Notes:
+ * - if an array is being reshaped (restriped) in order to change the
+ *   the number of active devices in the array, 'raid_disks' will be
+ *   the larger of the old and new numbers.  'delta_disks' will
+ *   be the "new - old".  So if +ve, raid_disks is the new value, and
+ *   "raid_disks-delta_disks" is the old.  If -ve, raid_disks is the
+ *   old value and "raid_disks+delta_disks" is the new (smaller) value.
+ */
+
+
+typedef struct mdp_superblock_s {
+	/*
+	 * Constant generic information
+	 */
+	__u32 md_magic;		/*  0 MD identifier 			      */
+	__u32 major_version;	/*  1 major version to which the set conforms */
+	__u32 minor_version;	/*  2 minor version ...			      */
+	__u32 patch_version;	/*  3 patchlevel version ...		      */
+	__u32 gvalid_words;	/*  4 Number of used words in this section    */
+	__u32 set_uuid0;	/*  5 Raid set identifier		      */
+	__u32 ctime;		/*  6 Creation time			      */
+	__u32 level;		/*  7 Raid personality			      */
+	__u32 size;		/*  8 Apparent size of each individual disk   */
+	__u32 nr_disks;		/*  9 total disks in the raid set	      */
+	__u32 raid_disks;	/* 10 disks in a fully functional raid set    */
+	__u32 md_minor;		/* 11 preferred MD minor device number	      */
+	__u32 not_persistent;	/* 12 does it have a persistent superblock    */
+	__u32 set_uuid1;	/* 13 Raid set identifier #2		      */
+	__u32 set_uuid2;	/* 14 Raid set identifier #3		      */
+	__u32 set_uuid3;	/* 15 Raid set identifier #4		      */
+	__u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 16];
+
+	/*
+	 * Generic state information
+	 */
+	__u32 utime;		/*  0 Superblock update time		      */
+	__u32 state;		/*  1 State bits (clean, ...)		      */
+	__u32 active_disks;	/*  2 Number of currently active disks	      */
+	__u32 working_disks;	/*  3 Number of working disks		      */
+	__u32 failed_disks;	/*  4 Number of failed disks		      */
+	__u32 spare_disks;	/*  5 Number of spare disks		      */
+	__u32 sb_csum;		/*  6 checksum of the whole superblock        */
+#ifdef __BIG_ENDIAN
+	__u32 events_hi;	/*  7 high-order of superblock update count   */
+	__u32 events_lo;	/*  8 low-order of superblock update count    */
+	__u32 cp_events_hi;	/*  9 high-order of checkpoint update count   */
+	__u32 cp_events_lo;	/* 10 low-order of checkpoint update count    */
+#else
+	__u32 events_lo;	/*  7 low-order of superblock update count    */
+	__u32 events_hi;	/*  8 high-order of superblock update count   */
+	__u32 cp_events_lo;	/*  9 low-order of checkpoint update count    */
+	__u32 cp_events_hi;	/* 10 high-order of checkpoint update count   */
+#endif
+	__u32 recovery_cp;	/* 11 recovery checkpoint sector count	      */
+	/* There are only valid for minor_version > 90 */
+	__u64 reshape_position;	/* 12,13 next address in array-space for reshape */
+	__u32 new_level;	/* 14 new level we are reshaping to	      */
+	__u32 delta_disks;	/* 15 change in number of raid_disks	      */
+	__u32 new_layout;	/* 16 new layout			      */
+	__u32 new_chunk;	/* 17 new chunk size (bytes)		      */
+	__u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 18];
+
+	/*
+	 * Personality information
+	 */
+	__u32 layout;		/*  0 the array's physical layout	      */
+	__u32 chunk_size;	/*  1 chunk size in bytes		      */
+	__u32 root_pv;		/*  2 LV root PV */
+	__u32 root_block;	/*  3 LV root block */
+	__u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 4];
+
+	/*
+	 * Disks information
+	 */
+	mdp_disk_t disks[MD_SB_DISKS];
+
+	/*
+	 * Reserved
+	 */
+	__u32 reserved[MD_SB_RESERVED_WORDS];
+
+	/*
+	 * Active descriptor
+	 */
+	mdp_disk_t this_disk;
+
+} mdp_super_t;
+
+static inline __u64 md_event(mdp_super_t *sb) {
+	__u64 ev = sb->events_hi;
+	return (ev<<32)| sb->events_lo;
+}
+
+#define MD_SUPERBLOCK_1_TIME_SEC_MASK ((1ULL<<40) - 1)
+
+/*
+ * The version-1 superblock :
+ * All numeric fields are little-endian.
+ *
+ * total size: 256 bytes plus 2 per device.
+ *  1K allows 384 devices.
+ */
+struct mdp_superblock_1 {
+	/* constant array information - 128 bytes */
+	__le32	magic;		/* MD_SB_MAGIC: 0xa92b4efc - little endian */
+	__le32	major_version;	/* 1 */
+	__le32	feature_map;	/* bit 0 set if 'bitmap_offset' is meaningful */
+	__le32	pad0;		/* always set to 0 when writing */
+
+	__u8	set_uuid[16];	/* user-space generated. */
+	char	set_name[32];	/* set and interpreted by user-space */
+
+	__le64	ctime;		/* lo 40 bits are seconds, top 24 are microseconds or 0*/
+	__le32	level;		/* -4 (multipath), -1 (linear), 0,1,4,5 */
+	__le32	layout;		/* only for raid5 and raid10 currently */
+	__le64	size;		/* used size of component devices, in 512byte sectors */
+
+	__le32	chunksize;	/* in 512byte sectors */
+	__le32	raid_disks;
+	__le32	bitmap_offset;	/* sectors after start of superblock that bitmap starts
+				 * NOTE: signed, so bitmap can be before superblock
+				 * only meaningful of feature_map[0] is set.
+				 */
+
+	/* These are only valid with feature bit '4' */
+	__le32	new_level;	/* new level we are reshaping to		*/
+	__le64	reshape_position;	/* next address in array-space for reshape */
+	__le32	delta_disks;	/* change in number of raid_disks		*/
+	__le32	new_layout;	/* new layout					*/
+	__le32	new_chunk;	/* new chunk size (512byte sectors)		*/
+	__le32  new_offset;	/* signed number to add to data_offset in new
+				 * layout.  0 == no-change.  This can be
+				 * different on each device in the array.
+				 */
+
+	/* constant this-device information - 64 bytes */
+	__le64	data_offset;	/* sector start of data, often 0 */
+	__le64	data_size;	/* sectors in this device that can be used for data */
+	__le64	super_offset;	/* sector start of this superblock */
+	__le64	recovery_offset;/* sectors before this offset (from data_offset) have been recovered */
+	__le32	dev_number;	/* permanent identifier of this  device - not role in raid */
+	__le32	cnt_corrected_read; /* number of read errors that were corrected by re-writing */
+	__u8	device_uuid[16]; /* user-space setable, ignored by kernel */
+	__u8	devflags;	/* per-device flags.  Only one defined...*/
+#define	WriteMostly1	1	/* mask for writemostly flag in above */
+	/* Bad block log.  If there are any bad blocks the feature flag is set.
+	 * If offset and size are non-zero, that space is reserved and available
+	 */
+	__u8	bblog_shift;	/* shift from sectors to block size */
+	__le16	bblog_size;	/* number of sectors reserved for list */
+	__le32	bblog_offset;	/* sector offset from superblock to bblog,
+				 * signed - not unsigned */
+
+	/* array state information - 64 bytes */
+	__le64	utime;		/* 40 bits second, 24 bits microseconds */
+	__le64	events;		/* incremented when superblock updated */
+	__le64	resync_offset;	/* data before this offset (from data_offset) known to be in sync */
+	__le32	sb_csum;	/* checksum up to devs[max_dev] */
+	__le32	max_dev;	/* size of devs[] array to consider */
+	__u8	pad3[64-32];	/* set to 0 when writing */
+
+	/* device state information. Indexed by dev_number.
+	 * 2 bytes per device
+	 * Note there are no per-device state flags. State information is rolled
+	 * into the 'roles' value.  If a device is spare or faulty, then it doesn't
+	 * have a meaningful role.
+	 */
+	__le16	dev_roles[0];	/* role in array, or 0xffff for a spare, or 0xfffe for faulty */
+};
+
+/* feature_map bits */
+#define MD_FEATURE_BITMAP_OFFSET	1
+#define	MD_FEATURE_RECOVERY_OFFSET	2 /* recovery_offset is present and
+					   * must be honoured
+					   */
+#define	MD_FEATURE_RESHAPE_ACTIVE	4
+#define	MD_FEATURE_BAD_BLOCKS		8 /* badblock list is not empty */
+#define	MD_FEATURE_REPLACEMENT		16 /* This device is replacing an
+					    * active device with same 'role'.
+					    * 'recovery_offset' is also set.
+					    */
+#define	MD_FEATURE_RESHAPE_BACKWARDS	32 /* Reshape doesn't change number
+					    * of devices, but is going
+					    * backwards anyway.
+					    */
+#define	MD_FEATURE_NEW_OFFSET		64 /* new_offset must be honoured */
+#define	MD_FEATURE_ALL			(MD_FEATURE_BITMAP_OFFSET	\
+					|MD_FEATURE_RECOVERY_OFFSET	\
+					|MD_FEATURE_RESHAPE_ACTIVE	\
+					|MD_FEATURE_BAD_BLOCKS		\
+					|MD_FEATURE_REPLACEMENT		\
+					|MD_FEATURE_RESHAPE_BACKWARDS	\
+					|MD_FEATURE_NEW_OFFSET		\
+					)
+
+#endif 
diff --git a/include/uapi/linux/raid/md_u.h b/include/uapi/linux/raid/md_u.h
new file mode 100644
index 000000000000..4133e744e4e6
--- /dev/null
+++ b/include/uapi/linux/raid/md_u.h
@@ -0,0 +1,155 @@
+/*
+   md_u.h : user <=> kernel API between Linux raidtools and RAID drivers
+          Copyright (C) 1998 Ingo Molnar
+	  
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+   
+   You should have received a copy of the GNU General Public License
+   (for example /usr/src/linux/COPYING); if not, write to the Free
+   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  
+*/
+
+#ifndef _UAPI_MD_U_H
+#define _UAPI_MD_U_H
+
+/*
+ * Different major versions are not compatible.
+ * Different minor versions are only downward compatible.
+ * Different patchlevel versions are downward and upward compatible.
+ */
+#define MD_MAJOR_VERSION                0
+#define MD_MINOR_VERSION                90
+/*
+ * MD_PATCHLEVEL_VERSION indicates kernel functionality.
+ * >=1 means different superblock formats are selectable using SET_ARRAY_INFO
+ *     and major_version/minor_version accordingly
+ * >=2 means that Internal bitmaps are supported by setting MD_SB_BITMAP_PRESENT
+ *     in the super status byte
+ * >=3 means that bitmap superblock version 4 is supported, which uses
+ *     little-ending representation rather than host-endian
+ */
+#define MD_PATCHLEVEL_VERSION           3
+
+/* ioctls */
+
+/* status */
+#define RAID_VERSION		_IOR (MD_MAJOR, 0x10, mdu_version_t)
+#define GET_ARRAY_INFO		_IOR (MD_MAJOR, 0x11, mdu_array_info_t)
+#define GET_DISK_INFO		_IOR (MD_MAJOR, 0x12, mdu_disk_info_t)
+#define PRINT_RAID_DEBUG	_IO (MD_MAJOR, 0x13)
+#define RAID_AUTORUN		_IO (MD_MAJOR, 0x14)
+#define GET_BITMAP_FILE		_IOR (MD_MAJOR, 0x15, mdu_bitmap_file_t)
+
+/* configuration */
+#define CLEAR_ARRAY		_IO (MD_MAJOR, 0x20)
+#define ADD_NEW_DISK		_IOW (MD_MAJOR, 0x21, mdu_disk_info_t)
+#define HOT_REMOVE_DISK		_IO (MD_MAJOR, 0x22)
+#define SET_ARRAY_INFO		_IOW (MD_MAJOR, 0x23, mdu_array_info_t)
+#define SET_DISK_INFO		_IO (MD_MAJOR, 0x24)
+#define WRITE_RAID_INFO		_IO (MD_MAJOR, 0x25)
+#define UNPROTECT_ARRAY		_IO (MD_MAJOR, 0x26)
+#define PROTECT_ARRAY		_IO (MD_MAJOR, 0x27)
+#define HOT_ADD_DISK		_IO (MD_MAJOR, 0x28)
+#define SET_DISK_FAULTY		_IO (MD_MAJOR, 0x29)
+#define HOT_GENERATE_ERROR	_IO (MD_MAJOR, 0x2a)
+#define SET_BITMAP_FILE		_IOW (MD_MAJOR, 0x2b, int)
+
+/* usage */
+#define RUN_ARRAY		_IOW (MD_MAJOR, 0x30, mdu_param_t)
+/*  0x31 was START_ARRAY  */
+#define STOP_ARRAY		_IO (MD_MAJOR, 0x32)
+#define STOP_ARRAY_RO		_IO (MD_MAJOR, 0x33)
+#define RESTART_ARRAY_RW	_IO (MD_MAJOR, 0x34)
+
+/* 63 partitions with the alternate major number (mdp) */
+#define MdpMinorShift 6
+
+typedef struct mdu_version_s {
+	int major;
+	int minor;
+	int patchlevel;
+} mdu_version_t;
+
+typedef struct mdu_array_info_s {
+	/*
+	 * Generic constant information
+	 */
+	int major_version;
+	int minor_version;
+	int patch_version;
+	int ctime;
+	int level;
+	int size;
+	int nr_disks;
+	int raid_disks;
+	int md_minor;
+	int not_persistent;
+
+	/*
+	 * Generic state information
+	 */
+	int utime;		/*  0 Superblock update time		      */
+	int state;		/*  1 State bits (clean, ...)		      */
+	int active_disks;	/*  2 Number of currently active disks	      */
+	int working_disks;	/*  3 Number of working disks		      */
+	int failed_disks;	/*  4 Number of failed disks		      */
+	int spare_disks;	/*  5 Number of spare disks		      */
+
+	/*
+	 * Personality information
+	 */
+	int layout;		/*  0 the array's physical layout	      */
+	int chunk_size;	/*  1 chunk size in bytes		      */
+
+} mdu_array_info_t;
+
+/* non-obvious values for 'level' */
+#define	LEVEL_MULTIPATH		(-4)
+#define	LEVEL_LINEAR		(-1)
+#define	LEVEL_FAULTY		(-5)
+
+/* we need a value for 'no level specified' and 0
+ * means 'raid0', so we need something else.  This is
+ * for internal use only
+ */
+#define	LEVEL_NONE		(-1000000)
+
+typedef struct mdu_disk_info_s {
+	/*
+	 * configuration/status of one particular disk
+	 */
+	int number;
+	int major;
+	int minor;
+	int raid_disk;
+	int state;
+
+} mdu_disk_info_t;
+
+typedef struct mdu_start_info_s {
+	/*
+	 * configuration/status of one particular disk
+	 */
+	int major;
+	int minor;
+	int raid_disk;
+	int state;
+
+} mdu_start_info_t;
+
+typedef struct mdu_bitmap_file_s
+{
+	char pathname[4096];
+} mdu_bitmap_file_t;
+
+typedef struct mdu_param_s
+{
+	int			personality;	/* 1,2,3,4 */
+	int			chunk_size;	/* in bytes */
+	int			max_fault;	/* unused for now */
+} mdu_param_t;
+
+#endif /* _UAPI_MD_U_H */
-- 
cgit v1.2.3-71-gd317


From 6f73601efb35c7003f5c58c2bc6fd08f3652169c Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Fri, 19 Oct 2012 15:14:44 +0000
Subject: tcp: add SYN/data info to TCP_INFO

Add a bit TCPI_OPT_SYN_DATA (32) to the socket option TCP_INFO:tcpi_options.
It's set if the data in SYN (sent or received) is acked by SYN-ACK. Server or
client application can use this information to check Fast Open success rate.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      | 3 ++-
 include/uapi/linux/tcp.h | 1 +
 net/ipv4/tcp.c           | 2 ++
 net/ipv4/tcp_input.c     | 1 +
 net/ipv4/tcp_ipv4.c      | 1 +
 net/ipv4/tcp_minisocks.c | 1 +
 6 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 8a7fc4be2d75..60b7aac15e0e 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -191,7 +191,8 @@ struct tcp_sock {
 	u8	do_early_retrans:1,/* Enable RFC5827 early-retransmit  */
 		early_retrans_delayed:1, /* Delayed ER timer installed */
 		syn_data:1,	/* SYN includes data */
-		syn_fastopen:1;	/* SYN includes Fast Open option */
+		syn_fastopen:1,	/* SYN includes Fast Open option */
+		syn_data_acked:1;/* data in SYN is acked by SYN-ACK */
 
 /* RTT measurement */
 	u32	srtt;		/* smoothed round trip time << 3	*/
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index c4b89a5cb7df..e962faa5ab0d 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -130,6 +130,7 @@ enum {
 #define TCPI_OPT_WSCALE		4
 #define TCPI_OPT_ECN		8 /* ECN was negociated at TCP session init */
 #define TCPI_OPT_ECN_SEEN	16 /* we received at least one packet with ECT */
+#define TCPI_OPT_SYN_DATA	32 /* SYN-ACK acked data in SYN sent or rcvd */
 
 enum tcp_ca_state {
 	TCP_CA_Open = 0,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b7c2f439b54f..197c0008503c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2764,6 +2764,8 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info)
 		info->tcpi_options |= TCPI_OPT_ECN;
 	if (tp->ecn_flags & TCP_ECN_SEEN)
 		info->tcpi_options |= TCPI_OPT_ECN_SEEN;
+	if (tp->syn_data_acked)
+		info->tcpi_options |= TCPI_OPT_SYN_DATA;
 
 	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
 	info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 432c36649db3..036f85738141 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5646,6 +5646,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 		tcp_rearm_rto(sk);
 		return true;
 	}
+	tp->syn_data_acked = tp->syn_data;
 	return false;
 }
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ef998b008a57..0c4a64355603 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1461,6 +1461,7 @@ static int tcp_v4_conn_req_fastopen(struct sock *sk,
 		skb_set_owner_r(skb, child);
 		__skb_queue_tail(&child->sk_receive_queue, skb);
 		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+		tp->syn_data_acked = 1;
 	}
 	sk->sk_data_ready(sk, 0);
 	bh_unlock_sock(child);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 27536ba16c9d..a7302d974f32 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -510,6 +510,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		newtp->rx_opt.mss_clamp = req->mss;
 		TCP_ECN_openreq_child(newtp, req);
 		newtp->fastopen_rsk = NULL;
+		newtp->syn_data_acked = 0;
 
 		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
 	}
-- 
cgit v1.2.3-71-gd317


From a80a6b85b428e6ce12a8363bb1f08d44c50f3252 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 8 Nov 2012 15:53:35 -0800
Subject: revert "epoll: support for disabling items, and a self-test app"

Revert commit 03a7beb55b9f ("epoll: support for disabling items, and a
self-test app") pending resolution of the issues identified by Michael
Kerrisk, copied below.

We'll revisit this for 3.8.

: I've taken a look at this patch as it currently stands in 3.7-rc1, and
: done a bit of testing. (By the way, the test program
: tools/testing/selftests/epoll/test_epoll.c does not compile...)
:
: There are one or two places where the behavior seems a little strange,
: so I have a question or two at the end of this mail. But other than
: that, I want to check my understanding so that the interface can be
: correctly documented.
:
: Just to go though my understanding, the problem is the following
: scenario in a multithreaded application:
:
: 1. Multiple threads are performing epoll_wait() operations,
:    and maintaining a user-space cache that contains information
:    corresponding to each file descriptor being monitored by
:    epoll_wait().
:
: 2. At some point, a thread wants to delete (EPOLL_CTL_DEL)
:    a file descriptor from the epoll interest list, and
:    delete the corresponding record from the user-space cache.
:
: 3. The problem with (2) is that some other thread may have
:    previously done an epoll_wait() that retrieved information
:    about the fd in question, and may be in the middle of using
:    information in the cache that relates to that fd. Thus,
:    there is a potential race.
:
: 4. The race can't solved purely in user space, because doing
:    so would require applying a mutex across the epoll_wait()
:    call, which would of course blow thread concurrency.
:
: Right?
:
: Your solution is the EPOLL_CTL_DISABLE operation. I want to
: confirm my understanding about how to use this flag, since
: the description that has accompanied the patches so far
: has been a bit sparse
:
: 0. In the scenario you're concerned about, deleting a file
:    descriptor means (safely) doing the following:
:    (a) Deleting the file descriptor from the epoll interest list
:        using EPOLL_CTL_DEL
:    (b) Deleting the corresponding record in the user-space cache
:
: 1. It's only meaningful to use this EPOLL_CTL_DISABLE in
:    conjunction with EPOLLONESHOT.
:
: 2. Using EPOLL_CTL_DISABLE without using EPOLLONESHOT in
:    conjunction is a logical error.
:
: 3. The correct way to code multithreaded applications using
:    EPOLL_CTL_DISABLE and EPOLLONESHOT is as follows:
:
:    a. All EPOLL_CTL_ADD and EPOLL_CTL_MOD operations should
:       should EPOLLONESHOT.
:
:    b. When a thread wants to delete a file descriptor, it
:       should do the following:
:
:       [1] Call epoll_ctl(EPOLL_CTL_DISABLE)
:       [2] If the return status from epoll_ctl(EPOLL_CTL_DISABLE)
:           was zero, then the file descriptor can be safely
:           deleted by the thread that made this call.
:       [3] If the epoll_ctl(EPOLL_CTL_DISABLE) fails with EBUSY,
:           then the descriptor is in use. In this case, the calling
:           thread should set a flag in the user-space cache to
:           indicate that the thread that is using the descriptor
:           should perform the deletion operation.
:
: Is all of the above correct?
:
: The implementation depends on checking on whether
: (events & ~EP_PRIVATE_BITS) == 0
: This replies on the fact that EPOLL_CTL_AD and EPOLL_CTL_MOD always
: set EPOLLHUP and EPOLLERR in the 'events' mask, and EPOLLONESHOT
: causes those flags (as well as all others in ~EP_PRIVATE_BITS) to be
: cleared.
:
: A corollary to the previous paragraph is that using EPOLL_CTL_DISABLE
: is only useful in conjunction with EPOLLONESHOT. However, as things
: stand, one can use EPOLL_CTL_DISABLE on a file descriptor that does
: not have EPOLLONESHOT set in 'events' This results in the following
: (slightly surprising) behavior:
:
: (a) The first call to epoll_ctl(EPOLL_CTL_DISABLE) returns 0
:     (the indicator that the file descriptor can be safely deleted).
: (b) The next call to epoll_ctl(EPOLL_CTL_DISABLE) fails with EBUSY.
:
: This doesn't seem particularly useful, and in fact is probably an
: indication that the user made a logic error: they should only be using
: epoll_ctl(EPOLL_CTL_DISABLE) on a file descriptor for which
: EPOLLONESHOT was set in 'events'. If that is correct, then would it
: not make sense to return an error to user space for this case?

Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: "Paton J. Lewis" <palewis@adobe.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c                             |  38 +---
 include/uapi/linux/eventpoll.h             |   1 -
 tools/testing/selftests/Makefile           |   2 +-
 tools/testing/selftests/epoll/Makefile     |  11 -
 tools/testing/selftests/epoll/test_epoll.c | 344 -----------------------------
 5 files changed, 4 insertions(+), 392 deletions(-)
 delete mode 100644 tools/testing/selftests/epoll/Makefile
 delete mode 100644 tools/testing/selftests/epoll/test_epoll.c

(limited to 'include/uapi')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index da72250ddc1c..cd96649bfe62 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -346,7 +346,7 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p)
 /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
 static inline int ep_op_has_event(int op)
 {
-	return op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD;
+	return op != EPOLL_CTL_DEL;
 }
 
 /* Initialize the poll safe wake up structure */
@@ -676,34 +676,6 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 	return 0;
 }
 
-/*
- * Disables a "struct epitem" in the eventpoll set. Returns -EBUSY if the item
- * had no event flags set, indicating that another thread may be currently
- * handling that item's events (in the case that EPOLLONESHOT was being
- * used). Otherwise a zero result indicates that the item has been disabled
- * from receiving events. A disabled item may be re-enabled via
- * EPOLL_CTL_MOD. Must be called with "mtx" held.
- */
-static int ep_disable(struct eventpoll *ep, struct epitem *epi)
-{
-	int result = 0;
-	unsigned long flags;
-
-	spin_lock_irqsave(&ep->lock, flags);
-	if (epi->event.events & ~EP_PRIVATE_BITS) {
-		if (ep_is_linked(&epi->rdllink))
-			list_del_init(&epi->rdllink);
-		/* Ensure ep_poll_callback will not add epi back onto ready
-		   list: */
-		epi->event.events &= EP_PRIVATE_BITS;
-		}
-	else
-		result = -EBUSY;
-	spin_unlock_irqrestore(&ep->lock, flags);
-
-	return result;
-}
-
 static void ep_free(struct eventpoll *ep)
 {
 	struct rb_node *rbp;
@@ -1048,6 +1020,8 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
 	rb_insert_color(&epi->rbn, &ep->rbr);
 }
 
+
+
 #define PATH_ARR_SIZE 5
 /*
  * These are the number paths of length 1 to 5, that we are allowing to emanate
@@ -1813,12 +1787,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 		} else
 			error = -ENOENT;
 		break;
-	case EPOLL_CTL_DISABLE:
-		if (epi)
-			error = ep_disable(ep, epi);
-		else
-			error = -ENOENT;
-		break;
 	}
 	mutex_unlock(&ep->mtx);
 
diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
index 8c99ce7202c5..2c267bcbb85c 100644
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -25,7 +25,6 @@
 #define EPOLL_CTL_ADD 1
 #define EPOLL_CTL_DEL 2
 #define EPOLL_CTL_MOD 3
-#define EPOLL_CTL_DISABLE 4
 
 /*
  * Request the handling of system wakeup events so as to prevent system suspends
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 43480149119e..85baf11e2acd 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -1,4 +1,4 @@
-TARGETS = breakpoints kcmp mqueue vm cpu-hotplug memory-hotplug epoll
+TARGETS = breakpoints kcmp mqueue vm cpu-hotplug memory-hotplug
 
 all:
 	for TARGET in $(TARGETS); do \
diff --git a/tools/testing/selftests/epoll/Makefile b/tools/testing/selftests/epoll/Makefile
deleted file mode 100644
index 19806ed62f50..000000000000
--- a/tools/testing/selftests/epoll/Makefile
+++ /dev/null
@@ -1,11 +0,0 @@
-# Makefile for epoll selftests
-
-all: test_epoll
-%: %.c
-	gcc -pthread -g -o $@ $^
-
-run_tests: all
-	./test_epoll
-
-clean:
-	$(RM) test_epoll
diff --git a/tools/testing/selftests/epoll/test_epoll.c b/tools/testing/selftests/epoll/test_epoll.c
deleted file mode 100644
index f7525392ce84..000000000000
--- a/tools/testing/selftests/epoll/test_epoll.c
+++ /dev/null
@@ -1,344 +0,0 @@
-/*
- *  tools/testing/selftests/epoll/test_epoll.c
- *
- *  Copyright 2012 Adobe Systems Incorporated
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  Paton J. Lewis <palewis@adobe.com>
- *
- */
-
-#include <errno.h>
-#include <fcntl.h>
-#include <pthread.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <sys/epoll.h>
-#include <sys/socket.h>
-
-/*
- * A pointer to an epoll_item_private structure will be stored in the epoll
- * item's event structure so that we can get access to the epoll_item_private
- * data after calling epoll_wait:
- */
-struct epoll_item_private {
-	int index;  /* Position of this struct within the epoll_items array. */
-	int fd;
-	uint32_t events;
-	pthread_mutex_t mutex;  /* Guards the following variables... */
-	int stop;
-	int status;  /* Stores any error encountered while handling item. */
-	/* The following variable allows us to test whether we have encountered
-	   a problem while attempting to cancel and delete the associated
-	   event. When the test program exits, 'deleted' should be exactly
-	   one. If it is greater than one, then the failed test reflects a real
-	   world situation where we would have tried to access the epoll item's
-	   private data after deleting it: */
-	int deleted;
-};
-
-struct epoll_item_private *epoll_items;
-
-/*
- * Delete the specified item from the epoll set. In a real-world secneario this
- * is where we would free the associated data structure, but in this testing
- * environment we retain the structure so that we can test for double-deletion:
- */
-void delete_item(int index)
-{
-	__sync_fetch_and_add(&epoll_items[index].deleted, 1);
-}
-
-/*
- * A pointer to a read_thread_data structure will be passed as the argument to
- * each read thread:
- */
-struct read_thread_data {
-	int stop;
-	int status;  /* Indicates any error encountered by the read thread. */
-	int epoll_set;
-};
-
-/*
- * The function executed by the read threads:
- */
-void *read_thread_function(void *function_data)
-{
-	struct read_thread_data *thread_data =
-		(struct read_thread_data *)function_data;
-	struct epoll_event event_data;
-	struct epoll_item_private *item_data;
-	char socket_data;
-
-	/* Handle events until we encounter an error or this thread's 'stop'
-	   condition is set: */
-	while (1) {
-		int result = epoll_wait(thread_data->epoll_set,
-					&event_data,
-					1,	/* Number of desired events */
-					1000);  /* Timeout in ms */
-		if (result < 0) {
-			/* Breakpoints signal all threads. Ignore that while
-			   debugging: */
-			if (errno == EINTR)
-				continue;
-			thread_data->status = errno;
-			return 0;
-		} else if (thread_data->stop)
-			return 0;
-		else if (result == 0)  /* Timeout */
-			continue;
-
-		/* We need the mutex here because checking for the stop
-		   condition and re-enabling the epoll item need to be done
-		   together as one atomic operation when EPOLL_CTL_DISABLE is
-		   available: */
-		item_data = (struct epoll_item_private *)event_data.data.ptr;
-		pthread_mutex_lock(&item_data->mutex);
-
-		/* Remove the item from the epoll set if we want to stop
-		   handling that event: */
-		if (item_data->stop)
-			delete_item(item_data->index);
-		else {
-			/* Clear the data that was written to the other end of
-			   our non-blocking socket: */
-			do {
-				if (read(item_data->fd, &socket_data, 1) < 1) {
-					if ((errno == EAGAIN) ||
-					    (errno == EWOULDBLOCK))
-						break;
-					else
-						goto error_unlock;
-				}
-			} while (item_data->events & EPOLLET);
-
-			/* The item was one-shot, so re-enable it: */
-			event_data.events = item_data->events;
-			if (epoll_ctl(thread_data->epoll_set,
-						  EPOLL_CTL_MOD,
-						  item_data->fd,
-						  &event_data) < 0)
-				goto error_unlock;
-		}
-
-		pthread_mutex_unlock(&item_data->mutex);
-	}
-
-error_unlock:
-	thread_data->status = item_data->status = errno;
-	pthread_mutex_unlock(&item_data->mutex);
-	return 0;
-}
-
-/*
- * A pointer to a write_thread_data structure will be passed as the argument to
- * the write thread:
- */
-struct write_thread_data {
-	int stop;
-	int status;  /* Indicates any error encountered by the write thread. */
-	int n_fds;
-	int *fds;
-};
-
-/*
- * The function executed by the write thread. It writes a single byte to each
- * socket in turn until the stop condition for this thread is set. If writing to
- * a socket would block (i.e. errno was EAGAIN), we leave that socket alone for
- * the moment and just move on to the next socket in the list. We don't care
- * about the order in which we deliver events to the epoll set. In fact we don't
- * care about the data we're writing to the pipes at all; we just want to
- * trigger epoll events:
- */
-void *write_thread_function(void *function_data)
-{
-	const char data = 'X';
-	int index;
-	struct write_thread_data *thread_data =
-		(struct write_thread_data *)function_data;
-	while (!thread_data->stop)
-		for (index = 0;
-		     !thread_data->stop && (index < thread_data->n_fds);
-		     ++index)
-			if ((write(thread_data->fds[index], &data, 1) < 1) &&
-				(errno != EAGAIN) &&
-				(errno != EWOULDBLOCK)) {
-				thread_data->status = errno;
-				return;
-			}
-}
-
-/*
- * Arguments are currently ignored:
- */
-int main(int argc, char **argv)
-{
-	const int n_read_threads = 100;
-	const int n_epoll_items = 500;
-	int index;
-	int epoll_set = epoll_create1(0);
-	struct write_thread_data write_thread_data = {
-		0, 0, n_epoll_items, malloc(n_epoll_items * sizeof(int))
-	};
-	struct read_thread_data *read_thread_data =
-		malloc(n_read_threads * sizeof(struct read_thread_data));
-	pthread_t *read_threads = malloc(n_read_threads * sizeof(pthread_t));
-	pthread_t write_thread;
-
-	printf("-----------------\n");
-	printf("Runing test_epoll\n");
-	printf("-----------------\n");
-
-	epoll_items = malloc(n_epoll_items * sizeof(struct epoll_item_private));
-
-	if (epoll_set < 0 || epoll_items == 0 || write_thread_data.fds == 0 ||
-		read_thread_data == 0 || read_threads == 0)
-		goto error;
-
-	if (sysconf(_SC_NPROCESSORS_ONLN) < 2) {
-		printf("Error: please run this test on a multi-core system.\n");
-		goto error;
-	}
-
-	/* Create the socket pairs and epoll items: */
-	for (index = 0; index < n_epoll_items; ++index) {
-		int socket_pair[2];
-		struct epoll_event event_data;
-		if (socketpair(AF_UNIX,
-			       SOCK_STREAM | SOCK_NONBLOCK,
-			       0,
-			       socket_pair) < 0)
-			goto error;
-		write_thread_data.fds[index] = socket_pair[0];
-		epoll_items[index].index = index;
-		epoll_items[index].fd = socket_pair[1];
-		if (pthread_mutex_init(&epoll_items[index].mutex, NULL) != 0)
-			goto error;
-		/* We always use EPOLLONESHOT because this test is currently
-		   structured to demonstrate the need for EPOLL_CTL_DISABLE,
-		   which only produces useful information in the EPOLLONESHOT
-		   case (without EPOLLONESHOT, calling epoll_ctl with
-		   EPOLL_CTL_DISABLE will never return EBUSY). If support for
-		   testing events without EPOLLONESHOT is desired, it should
-		   probably be implemented in a separate unit test. */
-		epoll_items[index].events = EPOLLIN | EPOLLONESHOT;
-		if (index < n_epoll_items / 2)
-			epoll_items[index].events |= EPOLLET;
-		epoll_items[index].stop = 0;
-		epoll_items[index].status = 0;
-		epoll_items[index].deleted = 0;
-		event_data.events = epoll_items[index].events;
-		event_data.data.ptr = &epoll_items[index];
-		if (epoll_ctl(epoll_set,
-			      EPOLL_CTL_ADD,
-			      epoll_items[index].fd,
-			      &event_data) < 0)
-			goto error;
-	}
-
-	/* Create and start the read threads: */
-	for (index = 0; index < n_read_threads; ++index) {
-		read_thread_data[index].stop = 0;
-		read_thread_data[index].status = 0;
-		read_thread_data[index].epoll_set = epoll_set;
-		if (pthread_create(&read_threads[index],
-				   NULL,
-				   read_thread_function,
-				   &read_thread_data[index]) != 0)
-			goto error;
-	}
-
-	if (pthread_create(&write_thread,
-			   NULL,
-			   write_thread_function,
-			   &write_thread_data) != 0)
-		goto error;
-
-	/* Cancel all event pollers: */
-#ifdef EPOLL_CTL_DISABLE
-	for (index = 0; index < n_epoll_items; ++index) {
-		pthread_mutex_lock(&epoll_items[index].mutex);
-		++epoll_items[index].stop;
-		if (epoll_ctl(epoll_set,
-			      EPOLL_CTL_DISABLE,
-			      epoll_items[index].fd,
-			      NULL) == 0)
-			delete_item(index);
-		else if (errno != EBUSY) {
-			pthread_mutex_unlock(&epoll_items[index].mutex);
-			goto error;
-		}
-		/* EBUSY means events were being handled; allow the other thread
-		   to delete the item. */
-		pthread_mutex_unlock(&epoll_items[index].mutex);
-	}
-#else
-	for (index = 0; index < n_epoll_items; ++index) {
-		pthread_mutex_lock(&epoll_items[index].mutex);
-		++epoll_items[index].stop;
-		pthread_mutex_unlock(&epoll_items[index].mutex);
-		/* Wait in case a thread running read_thread_function is
-		   currently executing code between epoll_wait and
-		   pthread_mutex_lock with this item. Note that a longer delay
-		   would make double-deletion less likely (at the expense of
-		   performance), but there is no guarantee that any delay would
-		   ever be sufficient. Note also that we delete all event
-		   pollers at once for testing purposes, but in a real-world
-		   environment we are likely to want to be able to cancel event
-		   pollers at arbitrary times. Therefore we can't improve this
-		   situation by just splitting this loop into two loops
-		   (i.e. signal 'stop' for all items, sleep, and then delete all
-		   items). We also can't fix the problem via EPOLL_CTL_DEL
-		   because that command can't prevent the case where some other
-		   thread is executing read_thread_function within the region
-		   mentioned above: */
-		usleep(1);
-		pthread_mutex_lock(&epoll_items[index].mutex);
-		if (!epoll_items[index].deleted)
-			delete_item(index);
-		pthread_mutex_unlock(&epoll_items[index].mutex);
-	}
-#endif
-
-	/* Shut down the read threads: */
-	for (index = 0; index < n_read_threads; ++index)
-		__sync_fetch_and_add(&read_thread_data[index].stop, 1);
-	for (index = 0; index < n_read_threads; ++index) {
-		if (pthread_join(read_threads[index], NULL) != 0)
-			goto error;
-		if (read_thread_data[index].status)
-			goto error;
-	}
-
-	/* Shut down the write thread: */
-	__sync_fetch_and_add(&write_thread_data.stop, 1);
-	if ((pthread_join(write_thread, NULL) != 0) || write_thread_data.status)
-		goto error;
-
-	/* Check for final error conditions: */
-	for (index = 0; index < n_epoll_items; ++index) {
-		if (epoll_items[index].status != 0)
-			goto error;
-		if (pthread_mutex_destroy(&epoll_items[index].mutex) < 0)
-			goto error;
-	}
-	for (index = 0; index < n_epoll_items; ++index)
-		if (epoll_items[index].deleted != 1) {
-			printf("Error: item data deleted %1d times.\n",
-				   epoll_items[index].deleted);
-			goto error;
-		}
-
-	printf("[PASS]\n");
-	return 0;
-
- error:
-	printf("[FAIL]\n");
-	return errno;
-}
-- 
cgit v1.2.3-71-gd317


From fa0cbbf145aabbf29c6f28f8a11935c0b0fd86fc Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Mon, 12 Nov 2012 17:53:04 -0800
Subject: mm, oom: reintroduce /proc/pid/oom_adj

This is mostly a revert of 01dc52ebdf47 ("oom: remove deprecated oom_adj")
from Davidlohr Bueso.

It reintroduces /proc/pid/oom_adj for backwards compatibility with earlier
kernels.  It simply scales the value linearly when /proc/pid/oom_score_adj
is written.

The major difference is that its scheduled removal is no longer included
in Documentation/feature-removal-schedule.txt.  We do warn users with a
single printk, though, to suggest the more powerful and supported
/proc/pid/oom_score_adj interface.

Reported-by: Artem S. Tashkinov <t.artem@lycos.com>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/proc.txt |  16 ++++--
 fs/proc/base.c                     | 109 +++++++++++++++++++++++++++++++++++++
 include/uapi/linux/oom.h           |   9 +++
 3 files changed, 130 insertions(+), 4 deletions(-)

(limited to 'include/uapi')

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index a1793d670cd0..3844d21d6ca3 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -33,7 +33,7 @@ Table of Contents
   2	Modifying System Parameters
 
   3	Per-Process Parameters
-  3.1	/proc/<pid>/oom_score_adj - Adjust the oom-killer
+  3.1	/proc/<pid>/oom_adj & /proc/<pid>/oom_score_adj - Adjust the oom-killer
 								score
   3.2	/proc/<pid>/oom_score - Display current oom-killer score
   3.3	/proc/<pid>/io - Display the IO accounting fields
@@ -1320,10 +1320,10 @@ of the kernel.
 CHAPTER 3: PER-PROCESS PARAMETERS
 ------------------------------------------------------------------------------
 
-3.1 /proc/<pid>/oom_score_adj- Adjust the oom-killer score
+3.1 /proc/<pid>/oom_adj & /proc/<pid>/oom_score_adj- Adjust the oom-killer score
 --------------------------------------------------------------------------------
 
-This file can be used to adjust the badness heuristic used to select which
+These file can be used to adjust the badness heuristic used to select which
 process gets killed in out of memory conditions.
 
 The badness heuristic assigns a value to each candidate task ranging from 0
@@ -1361,6 +1361,12 @@ same system, cpuset, mempolicy, or memory controller resources to use at least
 equivalent to discounting 50% of the task's allowed memory from being considered
 as scoring against the task.
 
+For backwards compatibility with previous kernels, /proc/<pid>/oom_adj may also
+be used to tune the badness score.  Its acceptable values range from -16
+(OOM_ADJUST_MIN) to +15 (OOM_ADJUST_MAX) and a special value of -17
+(OOM_DISABLE) to disable oom killing entirely for that task.  Its value is
+scaled linearly with /proc/<pid>/oom_score_adj.
+
 The value of /proc/<pid>/oom_score_adj may be reduced no lower than the last
 value set by a CAP_SYS_RESOURCE process. To reduce the value any lower
 requires CAP_SYS_RESOURCE.
@@ -1375,7 +1381,9 @@ minimal amount of work.
 -------------------------------------------------------------
 
 This file can be used to check the current score used by the oom-killer is for
-any given <pid>.
+any given <pid>. Use it together with /proc/<pid>/oom_score_adj to tune which
+process should be killed in an out-of-memory situation.
+
 
 3.3  /proc/<pid>/io - Display the IO accounting fields
 -------------------------------------------------------
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 144a96732dd7..3c231adf8450 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -873,6 +873,113 @@ static const struct file_operations proc_environ_operations = {
 	.release	= mem_release,
 };
 
+static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
+			    loff_t *ppos)
+{
+	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
+	char buffer[PROC_NUMBUF];
+	int oom_adj = OOM_ADJUST_MIN;
+	size_t len;
+	unsigned long flags;
+
+	if (!task)
+		return -ESRCH;
+	if (lock_task_sighand(task, &flags)) {
+		if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX)
+			oom_adj = OOM_ADJUST_MAX;
+		else
+			oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
+				  OOM_SCORE_ADJ_MAX;
+		unlock_task_sighand(task, &flags);
+	}
+	put_task_struct(task);
+	len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
+	return simple_read_from_buffer(buf, count, ppos, buffer, len);
+}
+
+static ssize_t oom_adj_write(struct file *file, const char __user *buf,
+			     size_t count, loff_t *ppos)
+{
+	struct task_struct *task;
+	char buffer[PROC_NUMBUF];
+	int oom_adj;
+	unsigned long flags;
+	int err;
+
+	memset(buffer, 0, sizeof(buffer));
+	if (count > sizeof(buffer) - 1)
+		count = sizeof(buffer) - 1;
+	if (copy_from_user(buffer, buf, count)) {
+		err = -EFAULT;
+		goto out;
+	}
+
+	err = kstrtoint(strstrip(buffer), 0, &oom_adj);
+	if (err)
+		goto out;
+	if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) &&
+	     oom_adj != OOM_DISABLE) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	task = get_proc_task(file->f_path.dentry->d_inode);
+	if (!task) {
+		err = -ESRCH;
+		goto out;
+	}
+
+	task_lock(task);
+	if (!task->mm) {
+		err = -EINVAL;
+		goto err_task_lock;
+	}
+
+	if (!lock_task_sighand(task, &flags)) {
+		err = -ESRCH;
+		goto err_task_lock;
+	}
+
+	/*
+	 * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
+	 * value is always attainable.
+	 */
+	if (oom_adj == OOM_ADJUST_MAX)
+		oom_adj = OOM_SCORE_ADJ_MAX;
+	else
+		oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
+
+	if (oom_adj < task->signal->oom_score_adj &&
+	    !capable(CAP_SYS_RESOURCE)) {
+		err = -EACCES;
+		goto err_sighand;
+	}
+
+	/*
+	 * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
+	 * /proc/pid/oom_score_adj instead.
+	 */
+	printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
+		  current->comm, task_pid_nr(current), task_pid_nr(task),
+		  task_pid_nr(task));
+
+	task->signal->oom_score_adj = oom_adj;
+	trace_oom_score_adj_update(task);
+err_sighand:
+	unlock_task_sighand(task, &flags);
+err_task_lock:
+	task_unlock(task);
+	put_task_struct(task);
+out:
+	return err < 0 ? err : count;
+}
+
+static const struct file_operations proc_oom_adj_operations = {
+	.read		= oom_adj_read,
+	.write		= oom_adj_write,
+	.llseek		= generic_file_llseek,
+};
+
 static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
 					size_t count, loff_t *ppos)
 {
@@ -2598,6 +2705,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 	REG("cgroup",  S_IRUGO, proc_cgroup_operations),
 #endif
 	INF("oom_score",  S_IRUGO, proc_oom_score),
+	REG("oom_adj",    S_IRUGO|S_IWUSR, proc_oom_adj_operations),
 	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
 #ifdef CONFIG_AUDITSYSCALL
 	REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
@@ -2964,6 +3072,7 @@ static const struct pid_entry tid_base_stuff[] = {
 	REG("cgroup",  S_IRUGO, proc_cgroup_operations),
 #endif
 	INF("oom_score", S_IRUGO, proc_oom_score),
+	REG("oom_adj",   S_IRUGO|S_IWUSR, proc_oom_adj_operations),
 	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
 #ifdef CONFIG_AUDITSYSCALL
 	REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
diff --git a/include/uapi/linux/oom.h b/include/uapi/linux/oom.h
index a49c4afc7060..b29272d621ce 100644
--- a/include/uapi/linux/oom.h
+++ b/include/uapi/linux/oom.h
@@ -8,4 +8,13 @@
 #define OOM_SCORE_ADJ_MIN	(-1000)
 #define OOM_SCORE_ADJ_MAX	1000
 
+/*
+ * /proc/<pid>/oom_adj set to -17 protects from the oom killer for legacy
+ * purposes.
+ */
+#define OOM_DISABLE (-17)
+/* inclusive */
+#define OOM_ADJUST_MIN (-16)
+#define OOM_ADJUST_MAX 15
+
 #endif /* _UAPI__INCLUDE_LINUX_OOM_H */
-- 
cgit v1.2.3-71-gd317


From d2709c7ce4c513ab7f4ca9a106a930621811f2d3 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 19 Nov 2012 22:21:03 +0000
Subject: perf: Make perf build for x86 with UAPI disintegration applied

Make perf build for x86 once the UAPI disintegration patches for that arch
have been applied by adding the appropriate -I flags - in the right order -
and then converting some #includes that use ../.. notation to find main kernel
headerfiles to use <asm/foo.h> and <linux/foo.h> instead.

Note that -Iarch/foo/include/uapi is present _before_ -Iarch/foo/include.
This makes sure we get the userspace version of the pt_regs struct.  Ideally,
we wouldn't have the latter -I flag at all, but unfortunately we want
asm/svm.h and asm/vmx.h in builtin-kvm.c and these aren't part of the UAPI -
at least not for x86.  I wonder if the bits outside of the __KERNEL__ guards
*should* be transferred there.

I note also that perf seems to do its dependency handling manually by listing
all the header files it might want to use in LIB_H in the Makefile.  Can this
be changed to use -MD?

Note that to do make this work, we need to export and UAPI disintegrate
linux/hw_breakpoint.h, which I think should've been exported previously so that
perf can access the bits.  We have to do this in the same patch to maintain
bisectability.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 include/linux/hw_breakpoint.h           | 31 +------------------------------
 include/uapi/linux/Kbuild               |  1 +
 include/uapi/linux/hw_breakpoint.h      | 30 ++++++++++++++++++++++++++++++
 tools/perf/Makefile                     | 29 ++++++++++++++++++++++++++++-
 tools/perf/arch/x86/include/perf_regs.h |  2 +-
 tools/perf/builtin-kvm.c                |  6 +++---
 tools/perf/builtin-test.c               |  2 +-
 tools/perf/perf.h                       | 16 +++-------------
 tools/perf/util/evsel.c                 |  4 ++--
 tools/perf/util/evsel.h                 |  3 ++-
 tools/perf/util/header.h                |  2 +-
 tools/perf/util/parse-events-test.c     |  2 +-
 tools/perf/util/parse-events.c          |  2 +-
 tools/perf/util/parse-events.h          |  2 +-
 tools/perf/util/pmu.h                   |  2 +-
 tools/perf/util/session.h               |  2 +-
 16 files changed, 78 insertions(+), 58 deletions(-)
 create mode 100644 include/uapi/linux/hw_breakpoint.h

(limited to 'include/uapi')

diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h
index 6ae9c631a1be..0464c85e63fd 100644
--- a/include/linux/hw_breakpoint.h
+++ b/include/linux/hw_breakpoint.h
@@ -1,35 +1,8 @@
 #ifndef _LINUX_HW_BREAKPOINT_H
 #define _LINUX_HW_BREAKPOINT_H
 
-enum {
-	HW_BREAKPOINT_LEN_1 = 1,
-	HW_BREAKPOINT_LEN_2 = 2,
-	HW_BREAKPOINT_LEN_4 = 4,
-	HW_BREAKPOINT_LEN_8 = 8,
-};
-
-enum {
-	HW_BREAKPOINT_EMPTY	= 0,
-	HW_BREAKPOINT_R		= 1,
-	HW_BREAKPOINT_W		= 2,
-	HW_BREAKPOINT_RW	= HW_BREAKPOINT_R | HW_BREAKPOINT_W,
-	HW_BREAKPOINT_X		= 4,
-	HW_BREAKPOINT_INVALID   = HW_BREAKPOINT_RW | HW_BREAKPOINT_X,
-};
-
-enum bp_type_idx {
-	TYPE_INST 	= 0,
-#ifdef CONFIG_HAVE_MIXED_BREAKPOINTS_REGS
-	TYPE_DATA	= 0,
-#else
-	TYPE_DATA	= 1,
-#endif
-	TYPE_MAX
-};
-
-#ifdef __KERNEL__
-
 #include <linux/perf_event.h>
+#include <uapi/linux/hw_breakpoint.h>
 
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 
@@ -151,6 +124,4 @@ static inline struct arch_hw_breakpoint *counter_arch_bp(struct perf_event *bp)
 }
 
 #endif /* CONFIG_HAVE_HW_BREAKPOINT */
-#endif /* __KERNEL__ */
-
 #endif /* _LINUX_HW_BREAKPOINT_H */
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index e194387ef784..19e765fbfef7 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -415,3 +415,4 @@ header-y += wireless.h
 header-y += x25.h
 header-y += xattr.h
 header-y += xfrm.h
+header-y += hw_breakpoint.h
diff --git a/include/uapi/linux/hw_breakpoint.h b/include/uapi/linux/hw_breakpoint.h
new file mode 100644
index 000000000000..b04000a2296a
--- /dev/null
+++ b/include/uapi/linux/hw_breakpoint.h
@@ -0,0 +1,30 @@
+#ifndef _UAPI_LINUX_HW_BREAKPOINT_H
+#define _UAPI_LINUX_HW_BREAKPOINT_H
+
+enum {
+	HW_BREAKPOINT_LEN_1 = 1,
+	HW_BREAKPOINT_LEN_2 = 2,
+	HW_BREAKPOINT_LEN_4 = 4,
+	HW_BREAKPOINT_LEN_8 = 8,
+};
+
+enum {
+	HW_BREAKPOINT_EMPTY	= 0,
+	HW_BREAKPOINT_R		= 1,
+	HW_BREAKPOINT_W		= 2,
+	HW_BREAKPOINT_RW	= HW_BREAKPOINT_R | HW_BREAKPOINT_W,
+	HW_BREAKPOINT_X		= 4,
+	HW_BREAKPOINT_INVALID   = HW_BREAKPOINT_RW | HW_BREAKPOINT_X,
+};
+
+enum bp_type_idx {
+	TYPE_INST 	= 0,
+#ifdef CONFIG_HAVE_MIXED_BREAKPOINTS_REGS
+	TYPE_DATA	= 0,
+#else
+	TYPE_DATA	= 1,
+#endif
+	TYPE_MAX
+};
+
+#endif /* _UAPI_LINUX_HW_BREAKPOINT_H */
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 00deed4d6159..0a619af5be43 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -169,7 +169,34 @@ endif
 
 ### --- END CONFIGURATION SECTION ---
 
-BASIC_CFLAGS = -Iutil/include -Iarch/$(ARCH)/include -I$(OUTPUT)util -I$(TRACE_EVENT_DIR) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE
+ifeq ($(srctree),)
+srctree := $(patsubst %/,%,$(dir $(shell pwd)))
+srctree := $(patsubst %/,%,$(dir $(srctree)))
+#$(info Determined 'srctree' to be $(srctree))
+endif
+
+ifneq ($(objtree),)
+#$(info Determined 'objtree' to be $(objtree))
+endif
+
+ifneq ($(OUTPUT),)
+#$(info Determined 'OUTPUT' to be $(OUTPUT))
+endif
+
+BASIC_CFLAGS = \
+	-Iutil/include \
+	-Iarch/$(ARCH)/include \
+	$(if $(objtree),-I$(objtree)/arch/$(ARCH)/include/generated/uapi) \
+	-I$(srctree)/arch/$(ARCH)/include/uapi \
+	-I$(srctree)/arch/$(ARCH)/include \
+	$(if $(objtree),-I$(objtree)/include/generated/uapi) \
+	-I$(srctree)/include/uapi \
+	-I$(srctree)/include \
+	-I$(OUTPUT)util \
+	-Iutil \
+	-I. \
+	-I$(TRACE_EVENT_DIR) \
+	-D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE
 BASIC_LDFLAGS =
 
 # Guard against environment variables
diff --git a/tools/perf/arch/x86/include/perf_regs.h b/tools/perf/arch/x86/include/perf_regs.h
index 46fc9f15c6b3..7fcdcdbee917 100644
--- a/tools/perf/arch/x86/include/perf_regs.h
+++ b/tools/perf/arch/x86/include/perf_regs.h
@@ -3,7 +3,7 @@
 
 #include <stdlib.h>
 #include "../../util/types.h"
-#include "../../../../../arch/x86/include/asm/perf_regs.h"
+#include <asm/perf_regs.h>
 
 #ifndef ARCH_X86_64
 #define PERF_REGS_MASK ((1ULL << PERF_REG_X86_32_MAX) - 1)
diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
index 260abc535b5b..e013bdb5e24a 100644
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@@ -22,9 +22,9 @@
 #include <pthread.h>
 #include <math.h>
 
-#include "../../arch/x86/include/asm/svm.h"
-#include "../../arch/x86/include/asm/vmx.h"
-#include "../../arch/x86/include/asm/kvm.h"
+#include <asm/svm.h>
+#include <asm/vmx.h>
+#include <asm/kvm.h>
 
 struct event_key {
 	#define INVALID_KEY     (~0ULL)
diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c
index 484f26cc0c00..5acd6e8e658b 100644
--- a/tools/perf/builtin-test.c
+++ b/tools/perf/builtin-test.c
@@ -15,7 +15,7 @@
 #include "util/thread_map.h"
 #include "util/pmu.h"
 #include "event-parse.h"
-#include "../../include/linux/hw_breakpoint.h"
+#include <linux/hw_breakpoint.h>
 
 #include <sys/mman.h>
 
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index e2ba8f004d32..238f923f2218 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -5,8 +5,9 @@ struct winsize;
 
 void get_term_dimensions(struct winsize *ws);
 
+#include <asm/unistd.h>
+
 #if defined(__i386__)
-#include "../../arch/x86/include/asm/unistd.h"
 #define rmb()		asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
 #define cpu_relax()	asm volatile("rep; nop" ::: "memory");
 #define CPUINFO_PROC	"model name"
@@ -16,7 +17,6 @@ void get_term_dimensions(struct winsize *ws);
 #endif
 
 #if defined(__x86_64__)
-#include "../../arch/x86/include/asm/unistd.h"
 #define rmb()		asm volatile("lfence" ::: "memory")
 #define cpu_relax()	asm volatile("rep; nop" ::: "memory");
 #define CPUINFO_PROC	"model name"
@@ -26,20 +26,17 @@ void get_term_dimensions(struct winsize *ws);
 #endif
 
 #ifdef __powerpc__
-#include "../../arch/powerpc/include/uapi/asm/unistd.h"
 #define rmb()		asm volatile ("sync" ::: "memory")
 #define cpu_relax()	asm volatile ("" ::: "memory");
 #define CPUINFO_PROC	"cpu"
 #endif
 
 #ifdef __s390__
-#include "../../arch/s390/include/asm/unistd.h"
 #define rmb()		asm volatile("bcr 15,0" ::: "memory")
 #define cpu_relax()	asm volatile("" ::: "memory");
 #endif
 
 #ifdef __sh__
-#include "../../arch/sh/include/asm/unistd.h"
 #if defined(__SH4A__) || defined(__SH5__)
 # define rmb()		asm volatile("synco" ::: "memory")
 #else
@@ -50,35 +47,30 @@ void get_term_dimensions(struct winsize *ws);
 #endif
 
 #ifdef __hppa__
-#include "../../arch/parisc/include/asm/unistd.h"
 #define rmb()		asm volatile("" ::: "memory")
 #define cpu_relax()	asm volatile("" ::: "memory");
 #define CPUINFO_PROC	"cpu"
 #endif
 
 #ifdef __sparc__
-#include "../../arch/sparc/include/uapi/asm/unistd.h"
 #define rmb()		asm volatile("":::"memory")
 #define cpu_relax()	asm volatile("":::"memory")
 #define CPUINFO_PROC	"cpu"
 #endif
 
 #ifdef __alpha__
-#include "../../arch/alpha/include/asm/unistd.h"
 #define rmb()		asm volatile("mb" ::: "memory")
 #define cpu_relax()	asm volatile("" ::: "memory")
 #define CPUINFO_PROC	"cpu model"
 #endif
 
 #ifdef __ia64__
-#include "../../arch/ia64/include/asm/unistd.h"
 #define rmb()		asm volatile ("mf" ::: "memory")
 #define cpu_relax()	asm volatile ("hint @pause" ::: "memory")
 #define CPUINFO_PROC	"model name"
 #endif
 
 #ifdef __arm__
-#include "../../arch/arm/include/asm/unistd.h"
 /*
  * Use the __kuser_memory_barrier helper in the CPU helper page. See
  * arch/arm/kernel/entry-armv.S in the kernel source for details.
@@ -89,13 +81,11 @@ void get_term_dimensions(struct winsize *ws);
 #endif
 
 #ifdef __aarch64__
-#include "../../arch/arm64/include/asm/unistd.h"
 #define rmb()		asm volatile("dmb ld" ::: "memory")
 #define cpu_relax()	asm volatile("yield" ::: "memory")
 #endif
 
 #ifdef __mips__
-#include "../../arch/mips/include/asm/unistd.h"
 #define rmb()		asm volatile(					\
 				".set	mips2\n\t"			\
 				"sync\n\t"				\
@@ -112,7 +102,7 @@ void get_term_dimensions(struct winsize *ws);
 #include <sys/types.h>
 #include <sys/syscall.h>
 
-#include "../../include/uapi/linux/perf_event.h"
+#include <linux/perf_event.h>
 #include "util/types.h"
 #include <stdbool.h>
 
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 618d41140abd..d144d464ce39 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -18,8 +18,8 @@
 #include "cpumap.h"
 #include "thread_map.h"
 #include "target.h"
-#include "../../../include/linux/hw_breakpoint.h"
-#include "../../../include/uapi/linux/perf_event.h"
+#include <linux/hw_breakpoint.h>
+#include <linux/perf_event.h>
 #include "perf_regs.h"
 
 #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 6f94d6dea00f..d99b476ef37c 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -3,7 +3,8 @@
 
 #include <linux/list.h>
 #include <stdbool.h>
-#include "../../../include/uapi/linux/perf_event.h"
+#include <stddef.h>
+#include <linux/perf_event.h>
 #include "types.h"
 #include "xyarray.h"
 #include "cgroup.h"
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index 879d215cdac9..9bc00783f24f 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -1,7 +1,7 @@
 #ifndef __PERF_HEADER_H
 #define __PERF_HEADER_H
 
-#include "../../../include/uapi/linux/perf_event.h"
+#include <linux/perf_event.h>
 #include <sys/types.h>
 #include <stdbool.h>
 #include "types.h"
diff --git a/tools/perf/util/parse-events-test.c b/tools/perf/util/parse-events-test.c
index 516ecd9ddd6e..6ef213b35ecd 100644
--- a/tools/perf/util/parse-events-test.c
+++ b/tools/perf/util/parse-events-test.c
@@ -3,7 +3,7 @@
 #include "evsel.h"
 #include "evlist.h"
 #include "sysfs.h"
-#include "../../../include/linux/hw_breakpoint.h"
+#include <linux/hw_breakpoint.h>
 
 #define TEST_ASSERT_VAL(text, cond) \
 do { \
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 75c7b0fca6d9..6b6d03e93c3d 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -1,4 +1,4 @@
-#include "../../../include/linux/hw_breakpoint.h"
+#include <linux/hw_breakpoint.h>
 #include "util.h"
 #include "../perf.h"
 #include "evlist.h"
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index 839230ceb18b..2820c407adb2 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -7,7 +7,7 @@
 #include <linux/list.h>
 #include <stdbool.h>
 #include "types.h"
-#include "../../../include/uapi/linux/perf_event.h"
+#include <linux/perf_event.h>
 #include "types.h"
 
 struct list_head;
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h
index 39f3abac7744..fdeb8ac7c5d2 100644
--- a/tools/perf/util/pmu.h
+++ b/tools/perf/util/pmu.h
@@ -2,7 +2,7 @@
 #define __PMU_H
 
 #include <linux/bitops.h>
-#include "../../../include/uapi/linux/perf_event.h"
+#include <linux/perf_event.h>
 
 enum {
 	PERF_PMU_FORMAT_VALUE_CONFIG,
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index dd6426163ba6..0eae00ad5fe7 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -7,7 +7,7 @@
 #include "symbol.h"
 #include "thread.h"
 #include <linux/rbtree.h>
-#include "../../../include/uapi/linux/perf_event.h"
+#include <linux/perf_event.h>
 
 struct sample_queue;
 struct ip_callchain;
-- 
cgit v1.2.3-71-gd317