From fc5a40a2301aa241eedb16caf9169ca5763707c1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 9 Oct 2012 09:49:02 +0100 Subject: UAPI: (Scripted) Disintegrate include/linux/raid Signed-off-by: David Howells Acked-by: Arnd Bergmann Acked-by: Thomas Gleixner Acked-by: Michael Kerrisk Acked-by: Paul E. McKenney Acked-by: Dave Jones --- include/uapi/linux/raid/Kbuild | 2 + include/uapi/linux/raid/md_p.h | 301 +++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/raid/md_u.h | 155 +++++++++++++++++++++ 3 files changed, 458 insertions(+) create mode 100644 include/uapi/linux/raid/md_p.h create mode 100644 include/uapi/linux/raid/md_u.h (limited to 'include/uapi') diff --git a/include/uapi/linux/raid/Kbuild b/include/uapi/linux/raid/Kbuild index aafaa5aa54d4..e2c3d25405d7 100644 --- a/include/uapi/linux/raid/Kbuild +++ b/include/uapi/linux/raid/Kbuild @@ -1 +1,3 @@ # UAPI Header export list +header-y += md_p.h +header-y += md_u.h diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h new file mode 100644 index 000000000000..ee753536ab70 --- /dev/null +++ b/include/uapi/linux/raid/md_p.h @@ -0,0 +1,301 @@ +/* + md_p.h : physical layout of Linux RAID devices + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#ifndef _MD_P_H +#define _MD_P_H + +#include + +/* + * RAID superblock. + * + * The RAID superblock maintains some statistics on each RAID configuration. + * Each real device in the RAID set contains it near the end of the device. + * Some of the ideas are copied from the ext2fs implementation. + * + * We currently use 4096 bytes as follows: + * + * word offset function + * + * 0 - 31 Constant generic RAID device information. + * 32 - 63 Generic state information. + * 64 - 127 Personality specific information. + * 128 - 511 12 32-words descriptors of the disks in the raid set. + * 512 - 911 Reserved. + * 912 - 1023 Disk specific descriptor. + */ + +/* + * If x is the real device size in bytes, we return an apparent size of: + * + * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES + * + * and place the 4kB superblock at offset y. + */ +#define MD_RESERVED_BYTES (64 * 1024) +#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512) + +#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS) + +#define MD_SB_BYTES 4096 +#define MD_SB_WORDS (MD_SB_BYTES / 4) +#define MD_SB_SECTORS (MD_SB_BYTES / 512) + +/* + * The following are counted in 32-bit words + */ +#define MD_SB_GENERIC_OFFSET 0 +#define MD_SB_PERSONALITY_OFFSET 64 +#define MD_SB_DISKS_OFFSET 128 +#define MD_SB_DESCRIPTOR_OFFSET 992 + +#define MD_SB_GENERIC_CONSTANT_WORDS 32 +#define MD_SB_GENERIC_STATE_WORDS 32 +#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS) +#define MD_SB_PERSONALITY_WORDS 64 +#define MD_SB_DESCRIPTOR_WORDS 32 +#define MD_SB_DISKS 27 +#define MD_SB_DISKS_WORDS (MD_SB_DISKS*MD_SB_DESCRIPTOR_WORDS) +#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS) +#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS) + +/* + * Device "operational" state bits + */ +#define MD_DISK_FAULTY 0 /* disk is faulty / operational */ +#define MD_DISK_ACTIVE 1 /* disk is running or spare disk */ +#define MD_DISK_SYNC 2 /* disk is in sync with the raid set */ +#define MD_DISK_REMOVED 3 /* disk is in sync with the raid set */ + +#define MD_DISK_WRITEMOSTLY 9 /* disk is "write-mostly" is RAID1 config. + * read requests will only be sent here in + * dire need + */ + +typedef struct mdp_device_descriptor_s { + __u32 number; /* 0 Device number in the entire set */ + __u32 major; /* 1 Device major number */ + __u32 minor; /* 2 Device minor number */ + __u32 raid_disk; /* 3 The role of the device in the raid set */ + __u32 state; /* 4 Operational state */ + __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5]; +} mdp_disk_t; + +#define MD_SB_MAGIC 0xa92b4efc + +/* + * Superblock state bits + */ +#define MD_SB_CLEAN 0 +#define MD_SB_ERRORS 1 + +#define MD_SB_BITMAP_PRESENT 8 /* bitmap may be present nearby */ + +/* + * Notes: + * - if an array is being reshaped (restriped) in order to change the + * the number of active devices in the array, 'raid_disks' will be + * the larger of the old and new numbers. 'delta_disks' will + * be the "new - old". So if +ve, raid_disks is the new value, and + * "raid_disks-delta_disks" is the old. If -ve, raid_disks is the + * old value and "raid_disks+delta_disks" is the new (smaller) value. + */ + + +typedef struct mdp_superblock_s { + /* + * Constant generic information + */ + __u32 md_magic; /* 0 MD identifier */ + __u32 major_version; /* 1 major version to which the set conforms */ + __u32 minor_version; /* 2 minor version ... */ + __u32 patch_version; /* 3 patchlevel version ... */ + __u32 gvalid_words; /* 4 Number of used words in this section */ + __u32 set_uuid0; /* 5 Raid set identifier */ + __u32 ctime; /* 6 Creation time */ + __u32 level; /* 7 Raid personality */ + __u32 size; /* 8 Apparent size of each individual disk */ + __u32 nr_disks; /* 9 total disks in the raid set */ + __u32 raid_disks; /* 10 disks in a fully functional raid set */ + __u32 md_minor; /* 11 preferred MD minor device number */ + __u32 not_persistent; /* 12 does it have a persistent superblock */ + __u32 set_uuid1; /* 13 Raid set identifier #2 */ + __u32 set_uuid2; /* 14 Raid set identifier #3 */ + __u32 set_uuid3; /* 15 Raid set identifier #4 */ + __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 16]; + + /* + * Generic state information + */ + __u32 utime; /* 0 Superblock update time */ + __u32 state; /* 1 State bits (clean, ...) */ + __u32 active_disks; /* 2 Number of currently active disks */ + __u32 working_disks; /* 3 Number of working disks */ + __u32 failed_disks; /* 4 Number of failed disks */ + __u32 spare_disks; /* 5 Number of spare disks */ + __u32 sb_csum; /* 6 checksum of the whole superblock */ +#ifdef __BIG_ENDIAN + __u32 events_hi; /* 7 high-order of superblock update count */ + __u32 events_lo; /* 8 low-order of superblock update count */ + __u32 cp_events_hi; /* 9 high-order of checkpoint update count */ + __u32 cp_events_lo; /* 10 low-order of checkpoint update count */ +#else + __u32 events_lo; /* 7 low-order of superblock update count */ + __u32 events_hi; /* 8 high-order of superblock update count */ + __u32 cp_events_lo; /* 9 low-order of checkpoint update count */ + __u32 cp_events_hi; /* 10 high-order of checkpoint update count */ +#endif + __u32 recovery_cp; /* 11 recovery checkpoint sector count */ + /* There are only valid for minor_version > 90 */ + __u64 reshape_position; /* 12,13 next address in array-space for reshape */ + __u32 new_level; /* 14 new level we are reshaping to */ + __u32 delta_disks; /* 15 change in number of raid_disks */ + __u32 new_layout; /* 16 new layout */ + __u32 new_chunk; /* 17 new chunk size (bytes) */ + __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 18]; + + /* + * Personality information + */ + __u32 layout; /* 0 the array's physical layout */ + __u32 chunk_size; /* 1 chunk size in bytes */ + __u32 root_pv; /* 2 LV root PV */ + __u32 root_block; /* 3 LV root block */ + __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 4]; + + /* + * Disks information + */ + mdp_disk_t disks[MD_SB_DISKS]; + + /* + * Reserved + */ + __u32 reserved[MD_SB_RESERVED_WORDS]; + + /* + * Active descriptor + */ + mdp_disk_t this_disk; + +} mdp_super_t; + +static inline __u64 md_event(mdp_super_t *sb) { + __u64 ev = sb->events_hi; + return (ev<<32)| sb->events_lo; +} + +#define MD_SUPERBLOCK_1_TIME_SEC_MASK ((1ULL<<40) - 1) + +/* + * The version-1 superblock : + * All numeric fields are little-endian. + * + * total size: 256 bytes plus 2 per device. + * 1K allows 384 devices. + */ +struct mdp_superblock_1 { + /* constant array information - 128 bytes */ + __le32 magic; /* MD_SB_MAGIC: 0xa92b4efc - little endian */ + __le32 major_version; /* 1 */ + __le32 feature_map; /* bit 0 set if 'bitmap_offset' is meaningful */ + __le32 pad0; /* always set to 0 when writing */ + + __u8 set_uuid[16]; /* user-space generated. */ + char set_name[32]; /* set and interpreted by user-space */ + + __le64 ctime; /* lo 40 bits are seconds, top 24 are microseconds or 0*/ + __le32 level; /* -4 (multipath), -1 (linear), 0,1,4,5 */ + __le32 layout; /* only for raid5 and raid10 currently */ + __le64 size; /* used size of component devices, in 512byte sectors */ + + __le32 chunksize; /* in 512byte sectors */ + __le32 raid_disks; + __le32 bitmap_offset; /* sectors after start of superblock that bitmap starts + * NOTE: signed, so bitmap can be before superblock + * only meaningful of feature_map[0] is set. + */ + + /* These are only valid with feature bit '4' */ + __le32 new_level; /* new level we are reshaping to */ + __le64 reshape_position; /* next address in array-space for reshape */ + __le32 delta_disks; /* change in number of raid_disks */ + __le32 new_layout; /* new layout */ + __le32 new_chunk; /* new chunk size (512byte sectors) */ + __le32 new_offset; /* signed number to add to data_offset in new + * layout. 0 == no-change. This can be + * different on each device in the array. + */ + + /* constant this-device information - 64 bytes */ + __le64 data_offset; /* sector start of data, often 0 */ + __le64 data_size; /* sectors in this device that can be used for data */ + __le64 super_offset; /* sector start of this superblock */ + __le64 recovery_offset;/* sectors before this offset (from data_offset) have been recovered */ + __le32 dev_number; /* permanent identifier of this device - not role in raid */ + __le32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */ + __u8 device_uuid[16]; /* user-space setable, ignored by kernel */ + __u8 devflags; /* per-device flags. Only one defined...*/ +#define WriteMostly1 1 /* mask for writemostly flag in above */ + /* Bad block log. If there are any bad blocks the feature flag is set. + * If offset and size are non-zero, that space is reserved and available + */ + __u8 bblog_shift; /* shift from sectors to block size */ + __le16 bblog_size; /* number of sectors reserved for list */ + __le32 bblog_offset; /* sector offset from superblock to bblog, + * signed - not unsigned */ + + /* array state information - 64 bytes */ + __le64 utime; /* 40 bits second, 24 bits microseconds */ + __le64 events; /* incremented when superblock updated */ + __le64 resync_offset; /* data before this offset (from data_offset) known to be in sync */ + __le32 sb_csum; /* checksum up to devs[max_dev] */ + __le32 max_dev; /* size of devs[] array to consider */ + __u8 pad3[64-32]; /* set to 0 when writing */ + + /* device state information. Indexed by dev_number. + * 2 bytes per device + * Note there are no per-device state flags. State information is rolled + * into the 'roles' value. If a device is spare or faulty, then it doesn't + * have a meaningful role. + */ + __le16 dev_roles[0]; /* role in array, or 0xffff for a spare, or 0xfffe for faulty */ +}; + +/* feature_map bits */ +#define MD_FEATURE_BITMAP_OFFSET 1 +#define MD_FEATURE_RECOVERY_OFFSET 2 /* recovery_offset is present and + * must be honoured + */ +#define MD_FEATURE_RESHAPE_ACTIVE 4 +#define MD_FEATURE_BAD_BLOCKS 8 /* badblock list is not empty */ +#define MD_FEATURE_REPLACEMENT 16 /* This device is replacing an + * active device with same 'role'. + * 'recovery_offset' is also set. + */ +#define MD_FEATURE_RESHAPE_BACKWARDS 32 /* Reshape doesn't change number + * of devices, but is going + * backwards anyway. + */ +#define MD_FEATURE_NEW_OFFSET 64 /* new_offset must be honoured */ +#define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \ + |MD_FEATURE_RECOVERY_OFFSET \ + |MD_FEATURE_RESHAPE_ACTIVE \ + |MD_FEATURE_BAD_BLOCKS \ + |MD_FEATURE_REPLACEMENT \ + |MD_FEATURE_RESHAPE_BACKWARDS \ + |MD_FEATURE_NEW_OFFSET \ + ) + +#endif diff --git a/include/uapi/linux/raid/md_u.h b/include/uapi/linux/raid/md_u.h new file mode 100644 index 000000000000..4133e744e4e6 --- /dev/null +++ b/include/uapi/linux/raid/md_u.h @@ -0,0 +1,155 @@ +/* + md_u.h : user <=> kernel API between Linux raidtools and RAID drivers + Copyright (C) 1998 Ingo Molnar + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#ifndef _UAPI_MD_U_H +#define _UAPI_MD_U_H + +/* + * Different major versions are not compatible. + * Different minor versions are only downward compatible. + * Different patchlevel versions are downward and upward compatible. + */ +#define MD_MAJOR_VERSION 0 +#define MD_MINOR_VERSION 90 +/* + * MD_PATCHLEVEL_VERSION indicates kernel functionality. + * >=1 means different superblock formats are selectable using SET_ARRAY_INFO + * and major_version/minor_version accordingly + * >=2 means that Internal bitmaps are supported by setting MD_SB_BITMAP_PRESENT + * in the super status byte + * >=3 means that bitmap superblock version 4 is supported, which uses + * little-ending representation rather than host-endian + */ +#define MD_PATCHLEVEL_VERSION 3 + +/* ioctls */ + +/* status */ +#define RAID_VERSION _IOR (MD_MAJOR, 0x10, mdu_version_t) +#define GET_ARRAY_INFO _IOR (MD_MAJOR, 0x11, mdu_array_info_t) +#define GET_DISK_INFO _IOR (MD_MAJOR, 0x12, mdu_disk_info_t) +#define PRINT_RAID_DEBUG _IO (MD_MAJOR, 0x13) +#define RAID_AUTORUN _IO (MD_MAJOR, 0x14) +#define GET_BITMAP_FILE _IOR (MD_MAJOR, 0x15, mdu_bitmap_file_t) + +/* configuration */ +#define CLEAR_ARRAY _IO (MD_MAJOR, 0x20) +#define ADD_NEW_DISK _IOW (MD_MAJOR, 0x21, mdu_disk_info_t) +#define HOT_REMOVE_DISK _IO (MD_MAJOR, 0x22) +#define SET_ARRAY_INFO _IOW (MD_MAJOR, 0x23, mdu_array_info_t) +#define SET_DISK_INFO _IO (MD_MAJOR, 0x24) +#define WRITE_RAID_INFO _IO (MD_MAJOR, 0x25) +#define UNPROTECT_ARRAY _IO (MD_MAJOR, 0x26) +#define PROTECT_ARRAY _IO (MD_MAJOR, 0x27) +#define HOT_ADD_DISK _IO (MD_MAJOR, 0x28) +#define SET_DISK_FAULTY _IO (MD_MAJOR, 0x29) +#define HOT_GENERATE_ERROR _IO (MD_MAJOR, 0x2a) +#define SET_BITMAP_FILE _IOW (MD_MAJOR, 0x2b, int) + +/* usage */ +#define RUN_ARRAY _IOW (MD_MAJOR, 0x30, mdu_param_t) +/* 0x31 was START_ARRAY */ +#define STOP_ARRAY _IO (MD_MAJOR, 0x32) +#define STOP_ARRAY_RO _IO (MD_MAJOR, 0x33) +#define RESTART_ARRAY_RW _IO (MD_MAJOR, 0x34) + +/* 63 partitions with the alternate major number (mdp) */ +#define MdpMinorShift 6 + +typedef struct mdu_version_s { + int major; + int minor; + int patchlevel; +} mdu_version_t; + +typedef struct mdu_array_info_s { + /* + * Generic constant information + */ + int major_version; + int minor_version; + int patch_version; + int ctime; + int level; + int size; + int nr_disks; + int raid_disks; + int md_minor; + int not_persistent; + + /* + * Generic state information + */ + int utime; /* 0 Superblock update time */ + int state; /* 1 State bits (clean, ...) */ + int active_disks; /* 2 Number of currently active disks */ + int working_disks; /* 3 Number of working disks */ + int failed_disks; /* 4 Number of failed disks */ + int spare_disks; /* 5 Number of spare disks */ + + /* + * Personality information + */ + int layout; /* 0 the array's physical layout */ + int chunk_size; /* 1 chunk size in bytes */ + +} mdu_array_info_t; + +/* non-obvious values for 'level' */ +#define LEVEL_MULTIPATH (-4) +#define LEVEL_LINEAR (-1) +#define LEVEL_FAULTY (-5) + +/* we need a value for 'no level specified' and 0 + * means 'raid0', so we need something else. This is + * for internal use only + */ +#define LEVEL_NONE (-1000000) + +typedef struct mdu_disk_info_s { + /* + * configuration/status of one particular disk + */ + int number; + int major; + int minor; + int raid_disk; + int state; + +} mdu_disk_info_t; + +typedef struct mdu_start_info_s { + /* + * configuration/status of one particular disk + */ + int major; + int minor; + int raid_disk; + int state; + +} mdu_start_info_t; + +typedef struct mdu_bitmap_file_s +{ + char pathname[4096]; +} mdu_bitmap_file_t; + +typedef struct mdu_param_s +{ + int personality; /* 1,2,3,4 */ + int chunk_size; /* in bytes */ + int max_fault; /* unused for now */ +} mdu_param_t; + +#endif /* _UAPI_MD_U_H */ -- cgit v1.2.3-71-gd317 From 6f73601efb35c7003f5c58c2bc6fd08f3652169c Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Fri, 19 Oct 2012 15:14:44 +0000 Subject: tcp: add SYN/data info to TCP_INFO Add a bit TCPI_OPT_SYN_DATA (32) to the socket option TCP_INFO:tcpi_options. It's set if the data in SYN (sent or received) is acked by SYN-ACK. Server or client application can use this information to check Fast Open success rate. Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 3 ++- include/uapi/linux/tcp.h | 1 + net/ipv4/tcp.c | 2 ++ net/ipv4/tcp_input.c | 1 + net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_minisocks.c | 1 + 6 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 8a7fc4be2d75..60b7aac15e0e 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -191,7 +191,8 @@ struct tcp_sock { u8 do_early_retrans:1,/* Enable RFC5827 early-retransmit */ early_retrans_delayed:1, /* Delayed ER timer installed */ syn_data:1, /* SYN includes data */ - syn_fastopen:1; /* SYN includes Fast Open option */ + syn_fastopen:1, /* SYN includes Fast Open option */ + syn_data_acked:1;/* data in SYN is acked by SYN-ACK */ /* RTT measurement */ u32 srtt; /* smoothed round trip time << 3 */ diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index c4b89a5cb7df..e962faa5ab0d 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -130,6 +130,7 @@ enum { #define TCPI_OPT_WSCALE 4 #define TCPI_OPT_ECN 8 /* ECN was negociated at TCP session init */ #define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ +#define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ enum tcp_ca_state { TCP_CA_Open = 0, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b7c2f439b54f..197c0008503c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2764,6 +2764,8 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info) info->tcpi_options |= TCPI_OPT_ECN; if (tp->ecn_flags & TCP_ECN_SEEN) info->tcpi_options |= TCPI_OPT_ECN_SEEN; + if (tp->syn_data_acked) + info->tcpi_options |= TCPI_OPT_SYN_DATA; info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 432c36649db3..036f85738141 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5646,6 +5646,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, tcp_rearm_rto(sk); return true; } + tp->syn_data_acked = tp->syn_data; return false; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ef998b008a57..0c4a64355603 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1461,6 +1461,7 @@ static int tcp_v4_conn_req_fastopen(struct sock *sk, skb_set_owner_r(skb, child); __skb_queue_tail(&child->sk_receive_queue, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + tp->syn_data_acked = 1; } sk->sk_data_ready(sk, 0); bh_unlock_sock(child); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 27536ba16c9d..a7302d974f32 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -510,6 +510,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->rx_opt.mss_clamp = req->mss; TCP_ECN_openreq_child(newtp, req); newtp->fastopen_rsk = NULL; + newtp->syn_data_acked = 0; TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); } -- cgit v1.2.3-71-gd317 From a80a6b85b428e6ce12a8363bb1f08d44c50f3252 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 8 Nov 2012 15:53:35 -0800 Subject: revert "epoll: support for disabling items, and a self-test app" Revert commit 03a7beb55b9f ("epoll: support for disabling items, and a self-test app") pending resolution of the issues identified by Michael Kerrisk, copied below. We'll revisit this for 3.8. : I've taken a look at this patch as it currently stands in 3.7-rc1, and : done a bit of testing. (By the way, the test program : tools/testing/selftests/epoll/test_epoll.c does not compile...) : : There are one or two places where the behavior seems a little strange, : so I have a question or two at the end of this mail. But other than : that, I want to check my understanding so that the interface can be : correctly documented. : : Just to go though my understanding, the problem is the following : scenario in a multithreaded application: : : 1. Multiple threads are performing epoll_wait() operations, : and maintaining a user-space cache that contains information : corresponding to each file descriptor being monitored by : epoll_wait(). : : 2. At some point, a thread wants to delete (EPOLL_CTL_DEL) : a file descriptor from the epoll interest list, and : delete the corresponding record from the user-space cache. : : 3. The problem with (2) is that some other thread may have : previously done an epoll_wait() that retrieved information : about the fd in question, and may be in the middle of using : information in the cache that relates to that fd. Thus, : there is a potential race. : : 4. The race can't solved purely in user space, because doing : so would require applying a mutex across the epoll_wait() : call, which would of course blow thread concurrency. : : Right? : : Your solution is the EPOLL_CTL_DISABLE operation. I want to : confirm my understanding about how to use this flag, since : the description that has accompanied the patches so far : has been a bit sparse : : 0. In the scenario you're concerned about, deleting a file : descriptor means (safely) doing the following: : (a) Deleting the file descriptor from the epoll interest list : using EPOLL_CTL_DEL : (b) Deleting the corresponding record in the user-space cache : : 1. It's only meaningful to use this EPOLL_CTL_DISABLE in : conjunction with EPOLLONESHOT. : : 2. Using EPOLL_CTL_DISABLE without using EPOLLONESHOT in : conjunction is a logical error. : : 3. The correct way to code multithreaded applications using : EPOLL_CTL_DISABLE and EPOLLONESHOT is as follows: : : a. All EPOLL_CTL_ADD and EPOLL_CTL_MOD operations should : should EPOLLONESHOT. : : b. When a thread wants to delete a file descriptor, it : should do the following: : : [1] Call epoll_ctl(EPOLL_CTL_DISABLE) : [2] If the return status from epoll_ctl(EPOLL_CTL_DISABLE) : was zero, then the file descriptor can be safely : deleted by the thread that made this call. : [3] If the epoll_ctl(EPOLL_CTL_DISABLE) fails with EBUSY, : then the descriptor is in use. In this case, the calling : thread should set a flag in the user-space cache to : indicate that the thread that is using the descriptor : should perform the deletion operation. : : Is all of the above correct? : : The implementation depends on checking on whether : (events & ~EP_PRIVATE_BITS) == 0 : This replies on the fact that EPOLL_CTL_AD and EPOLL_CTL_MOD always : set EPOLLHUP and EPOLLERR in the 'events' mask, and EPOLLONESHOT : causes those flags (as well as all others in ~EP_PRIVATE_BITS) to be : cleared. : : A corollary to the previous paragraph is that using EPOLL_CTL_DISABLE : is only useful in conjunction with EPOLLONESHOT. However, as things : stand, one can use EPOLL_CTL_DISABLE on a file descriptor that does : not have EPOLLONESHOT set in 'events' This results in the following : (slightly surprising) behavior: : : (a) The first call to epoll_ctl(EPOLL_CTL_DISABLE) returns 0 : (the indicator that the file descriptor can be safely deleted). : (b) The next call to epoll_ctl(EPOLL_CTL_DISABLE) fails with EBUSY. : : This doesn't seem particularly useful, and in fact is probably an : indication that the user made a logic error: they should only be using : epoll_ctl(EPOLL_CTL_DISABLE) on a file descriptor for which : EPOLLONESHOT was set in 'events'. If that is correct, then would it : not make sense to return an error to user space for this case? Cc: Michael Kerrisk Cc: "Paton J. Lewis" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 38 +--- include/uapi/linux/eventpoll.h | 1 - tools/testing/selftests/Makefile | 2 +- tools/testing/selftests/epoll/Makefile | 11 - tools/testing/selftests/epoll/test_epoll.c | 344 ----------------------------- 5 files changed, 4 insertions(+), 392 deletions(-) delete mode 100644 tools/testing/selftests/epoll/Makefile delete mode 100644 tools/testing/selftests/epoll/test_epoll.c (limited to 'include/uapi') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index da72250ddc1c..cd96649bfe62 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -346,7 +346,7 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p) /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ static inline int ep_op_has_event(int op) { - return op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD; + return op != EPOLL_CTL_DEL; } /* Initialize the poll safe wake up structure */ @@ -676,34 +676,6 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) return 0; } -/* - * Disables a "struct epitem" in the eventpoll set. Returns -EBUSY if the item - * had no event flags set, indicating that another thread may be currently - * handling that item's events (in the case that EPOLLONESHOT was being - * used). Otherwise a zero result indicates that the item has been disabled - * from receiving events. A disabled item may be re-enabled via - * EPOLL_CTL_MOD. Must be called with "mtx" held. - */ -static int ep_disable(struct eventpoll *ep, struct epitem *epi) -{ - int result = 0; - unsigned long flags; - - spin_lock_irqsave(&ep->lock, flags); - if (epi->event.events & ~EP_PRIVATE_BITS) { - if (ep_is_linked(&epi->rdllink)) - list_del_init(&epi->rdllink); - /* Ensure ep_poll_callback will not add epi back onto ready - list: */ - epi->event.events &= EP_PRIVATE_BITS; - } - else - result = -EBUSY; - spin_unlock_irqrestore(&ep->lock, flags); - - return result; -} - static void ep_free(struct eventpoll *ep) { struct rb_node *rbp; @@ -1048,6 +1020,8 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) rb_insert_color(&epi->rbn, &ep->rbr); } + + #define PATH_ARR_SIZE 5 /* * These are the number paths of length 1 to 5, that we are allowing to emanate @@ -1813,12 +1787,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, } else error = -ENOENT; break; - case EPOLL_CTL_DISABLE: - if (epi) - error = ep_disable(ep, epi); - else - error = -ENOENT; - break; } mutex_unlock(&ep->mtx); diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h index 8c99ce7202c5..2c267bcbb85c 100644 --- a/include/uapi/linux/eventpoll.h +++ b/include/uapi/linux/eventpoll.h @@ -25,7 +25,6 @@ #define EPOLL_CTL_ADD 1 #define EPOLL_CTL_DEL 2 #define EPOLL_CTL_MOD 3 -#define EPOLL_CTL_DISABLE 4 /* * Request the handling of system wakeup events so as to prevent system suspends diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 43480149119e..85baf11e2acd 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -1,4 +1,4 @@ -TARGETS = breakpoints kcmp mqueue vm cpu-hotplug memory-hotplug epoll +TARGETS = breakpoints kcmp mqueue vm cpu-hotplug memory-hotplug all: for TARGET in $(TARGETS); do \ diff --git a/tools/testing/selftests/epoll/Makefile b/tools/testing/selftests/epoll/Makefile deleted file mode 100644 index 19806ed62f50..000000000000 --- a/tools/testing/selftests/epoll/Makefile +++ /dev/null @@ -1,11 +0,0 @@ -# Makefile for epoll selftests - -all: test_epoll -%: %.c - gcc -pthread -g -o $@ $^ - -run_tests: all - ./test_epoll - -clean: - $(RM) test_epoll diff --git a/tools/testing/selftests/epoll/test_epoll.c b/tools/testing/selftests/epoll/test_epoll.c deleted file mode 100644 index f7525392ce84..000000000000 --- a/tools/testing/selftests/epoll/test_epoll.c +++ /dev/null @@ -1,344 +0,0 @@ -/* - * tools/testing/selftests/epoll/test_epoll.c - * - * Copyright 2012 Adobe Systems Incorporated - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * Paton J. Lewis - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * A pointer to an epoll_item_private structure will be stored in the epoll - * item's event structure so that we can get access to the epoll_item_private - * data after calling epoll_wait: - */ -struct epoll_item_private { - int index; /* Position of this struct within the epoll_items array. */ - int fd; - uint32_t events; - pthread_mutex_t mutex; /* Guards the following variables... */ - int stop; - int status; /* Stores any error encountered while handling item. */ - /* The following variable allows us to test whether we have encountered - a problem while attempting to cancel and delete the associated - event. When the test program exits, 'deleted' should be exactly - one. If it is greater than one, then the failed test reflects a real - world situation where we would have tried to access the epoll item's - private data after deleting it: */ - int deleted; -}; - -struct epoll_item_private *epoll_items; - -/* - * Delete the specified item from the epoll set. In a real-world secneario this - * is where we would free the associated data structure, but in this testing - * environment we retain the structure so that we can test for double-deletion: - */ -void delete_item(int index) -{ - __sync_fetch_and_add(&epoll_items[index].deleted, 1); -} - -/* - * A pointer to a read_thread_data structure will be passed as the argument to - * each read thread: - */ -struct read_thread_data { - int stop; - int status; /* Indicates any error encountered by the read thread. */ - int epoll_set; -}; - -/* - * The function executed by the read threads: - */ -void *read_thread_function(void *function_data) -{ - struct read_thread_data *thread_data = - (struct read_thread_data *)function_data; - struct epoll_event event_data; - struct epoll_item_private *item_data; - char socket_data; - - /* Handle events until we encounter an error or this thread's 'stop' - condition is set: */ - while (1) { - int result = epoll_wait(thread_data->epoll_set, - &event_data, - 1, /* Number of desired events */ - 1000); /* Timeout in ms */ - if (result < 0) { - /* Breakpoints signal all threads. Ignore that while - debugging: */ - if (errno == EINTR) - continue; - thread_data->status = errno; - return 0; - } else if (thread_data->stop) - return 0; - else if (result == 0) /* Timeout */ - continue; - - /* We need the mutex here because checking for the stop - condition and re-enabling the epoll item need to be done - together as one atomic operation when EPOLL_CTL_DISABLE is - available: */ - item_data = (struct epoll_item_private *)event_data.data.ptr; - pthread_mutex_lock(&item_data->mutex); - - /* Remove the item from the epoll set if we want to stop - handling that event: */ - if (item_data->stop) - delete_item(item_data->index); - else { - /* Clear the data that was written to the other end of - our non-blocking socket: */ - do { - if (read(item_data->fd, &socket_data, 1) < 1) { - if ((errno == EAGAIN) || - (errno == EWOULDBLOCK)) - break; - else - goto error_unlock; - } - } while (item_data->events & EPOLLET); - - /* The item was one-shot, so re-enable it: */ - event_data.events = item_data->events; - if (epoll_ctl(thread_data->epoll_set, - EPOLL_CTL_MOD, - item_data->fd, - &event_data) < 0) - goto error_unlock; - } - - pthread_mutex_unlock(&item_data->mutex); - } - -error_unlock: - thread_data->status = item_data->status = errno; - pthread_mutex_unlock(&item_data->mutex); - return 0; -} - -/* - * A pointer to a write_thread_data structure will be passed as the argument to - * the write thread: - */ -struct write_thread_data { - int stop; - int status; /* Indicates any error encountered by the write thread. */ - int n_fds; - int *fds; -}; - -/* - * The function executed by the write thread. It writes a single byte to each - * socket in turn until the stop condition for this thread is set. If writing to - * a socket would block (i.e. errno was EAGAIN), we leave that socket alone for - * the moment and just move on to the next socket in the list. We don't care - * about the order in which we deliver events to the epoll set. In fact we don't - * care about the data we're writing to the pipes at all; we just want to - * trigger epoll events: - */ -void *write_thread_function(void *function_data) -{ - const char data = 'X'; - int index; - struct write_thread_data *thread_data = - (struct write_thread_data *)function_data; - while (!thread_data->stop) - for (index = 0; - !thread_data->stop && (index < thread_data->n_fds); - ++index) - if ((write(thread_data->fds[index], &data, 1) < 1) && - (errno != EAGAIN) && - (errno != EWOULDBLOCK)) { - thread_data->status = errno; - return; - } -} - -/* - * Arguments are currently ignored: - */ -int main(int argc, char **argv) -{ - const int n_read_threads = 100; - const int n_epoll_items = 500; - int index; - int epoll_set = epoll_create1(0); - struct write_thread_data write_thread_data = { - 0, 0, n_epoll_items, malloc(n_epoll_items * sizeof(int)) - }; - struct read_thread_data *read_thread_data = - malloc(n_read_threads * sizeof(struct read_thread_data)); - pthread_t *read_threads = malloc(n_read_threads * sizeof(pthread_t)); - pthread_t write_thread; - - printf("-----------------\n"); - printf("Runing test_epoll\n"); - printf("-----------------\n"); - - epoll_items = malloc(n_epoll_items * sizeof(struct epoll_item_private)); - - if (epoll_set < 0 || epoll_items == 0 || write_thread_data.fds == 0 || - read_thread_data == 0 || read_threads == 0) - goto error; - - if (sysconf(_SC_NPROCESSORS_ONLN) < 2) { - printf("Error: please run this test on a multi-core system.\n"); - goto error; - } - - /* Create the socket pairs and epoll items: */ - for (index = 0; index < n_epoll_items; ++index) { - int socket_pair[2]; - struct epoll_event event_data; - if (socketpair(AF_UNIX, - SOCK_STREAM | SOCK_NONBLOCK, - 0, - socket_pair) < 0) - goto error; - write_thread_data.fds[index] = socket_pair[0]; - epoll_items[index].index = index; - epoll_items[index].fd = socket_pair[1]; - if (pthread_mutex_init(&epoll_items[index].mutex, NULL) != 0) - goto error; - /* We always use EPOLLONESHOT because this test is currently - structured to demonstrate the need for EPOLL_CTL_DISABLE, - which only produces useful information in the EPOLLONESHOT - case (without EPOLLONESHOT, calling epoll_ctl with - EPOLL_CTL_DISABLE will never return EBUSY). If support for - testing events without EPOLLONESHOT is desired, it should - probably be implemented in a separate unit test. */ - epoll_items[index].events = EPOLLIN | EPOLLONESHOT; - if (index < n_epoll_items / 2) - epoll_items[index].events |= EPOLLET; - epoll_items[index].stop = 0; - epoll_items[index].status = 0; - epoll_items[index].deleted = 0; - event_data.events = epoll_items[index].events; - event_data.data.ptr = &epoll_items[index]; - if (epoll_ctl(epoll_set, - EPOLL_CTL_ADD, - epoll_items[index].fd, - &event_data) < 0) - goto error; - } - - /* Create and start the read threads: */ - for (index = 0; index < n_read_threads; ++index) { - read_thread_data[index].stop = 0; - read_thread_data[index].status = 0; - read_thread_data[index].epoll_set = epoll_set; - if (pthread_create(&read_threads[index], - NULL, - read_thread_function, - &read_thread_data[index]) != 0) - goto error; - } - - if (pthread_create(&write_thread, - NULL, - write_thread_function, - &write_thread_data) != 0) - goto error; - - /* Cancel all event pollers: */ -#ifdef EPOLL_CTL_DISABLE - for (index = 0; index < n_epoll_items; ++index) { - pthread_mutex_lock(&epoll_items[index].mutex); - ++epoll_items[index].stop; - if (epoll_ctl(epoll_set, - EPOLL_CTL_DISABLE, - epoll_items[index].fd, - NULL) == 0) - delete_item(index); - else if (errno != EBUSY) { - pthread_mutex_unlock(&epoll_items[index].mutex); - goto error; - } - /* EBUSY means events were being handled; allow the other thread - to delete the item. */ - pthread_mutex_unlock(&epoll_items[index].mutex); - } -#else - for (index = 0; index < n_epoll_items; ++index) { - pthread_mutex_lock(&epoll_items[index].mutex); - ++epoll_items[index].stop; - pthread_mutex_unlock(&epoll_items[index].mutex); - /* Wait in case a thread running read_thread_function is - currently executing code between epoll_wait and - pthread_mutex_lock with this item. Note that a longer delay - would make double-deletion less likely (at the expense of - performance), but there is no guarantee that any delay would - ever be sufficient. Note also that we delete all event - pollers at once for testing purposes, but in a real-world - environment we are likely to want to be able to cancel event - pollers at arbitrary times. Therefore we can't improve this - situation by just splitting this loop into two loops - (i.e. signal 'stop' for all items, sleep, and then delete all - items). We also can't fix the problem via EPOLL_CTL_DEL - because that command can't prevent the case where some other - thread is executing read_thread_function within the region - mentioned above: */ - usleep(1); - pthread_mutex_lock(&epoll_items[index].mutex); - if (!epoll_items[index].deleted) - delete_item(index); - pthread_mutex_unlock(&epoll_items[index].mutex); - } -#endif - - /* Shut down the read threads: */ - for (index = 0; index < n_read_threads; ++index) - __sync_fetch_and_add(&read_thread_data[index].stop, 1); - for (index = 0; index < n_read_threads; ++index) { - if (pthread_join(read_threads[index], NULL) != 0) - goto error; - if (read_thread_data[index].status) - goto error; - } - - /* Shut down the write thread: */ - __sync_fetch_and_add(&write_thread_data.stop, 1); - if ((pthread_join(write_thread, NULL) != 0) || write_thread_data.status) - goto error; - - /* Check for final error conditions: */ - for (index = 0; index < n_epoll_items; ++index) { - if (epoll_items[index].status != 0) - goto error; - if (pthread_mutex_destroy(&epoll_items[index].mutex) < 0) - goto error; - } - for (index = 0; index < n_epoll_items; ++index) - if (epoll_items[index].deleted != 1) { - printf("Error: item data deleted %1d times.\n", - epoll_items[index].deleted); - goto error; - } - - printf("[PASS]\n"); - return 0; - - error: - printf("[FAIL]\n"); - return errno; -} -- cgit v1.2.3-71-gd317 From fa0cbbf145aabbf29c6f28f8a11935c0b0fd86fc Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 12 Nov 2012 17:53:04 -0800 Subject: mm, oom: reintroduce /proc/pid/oom_adj This is mostly a revert of 01dc52ebdf47 ("oom: remove deprecated oom_adj") from Davidlohr Bueso. It reintroduces /proc/pid/oom_adj for backwards compatibility with earlier kernels. It simply scales the value linearly when /proc/pid/oom_score_adj is written. The major difference is that its scheduled removal is no longer included in Documentation/feature-removal-schedule.txt. We do warn users with a single printk, though, to suggest the more powerful and supported /proc/pid/oom_score_adj interface. Reported-by: Artem S. Tashkinov Signed-off-by: David Rientjes Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 16 ++++-- fs/proc/base.c | 109 +++++++++++++++++++++++++++++++++++++ include/uapi/linux/oom.h | 9 +++ 3 files changed, 130 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index a1793d670cd0..3844d21d6ca3 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -33,7 +33,7 @@ Table of Contents 2 Modifying System Parameters 3 Per-Process Parameters - 3.1 /proc//oom_score_adj - Adjust the oom-killer + 3.1 /proc//oom_adj & /proc//oom_score_adj - Adjust the oom-killer score 3.2 /proc//oom_score - Display current oom-killer score 3.3 /proc//io - Display the IO accounting fields @@ -1320,10 +1320,10 @@ of the kernel. CHAPTER 3: PER-PROCESS PARAMETERS ------------------------------------------------------------------------------ -3.1 /proc//oom_score_adj- Adjust the oom-killer score +3.1 /proc//oom_adj & /proc//oom_score_adj- Adjust the oom-killer score -------------------------------------------------------------------------------- -This file can be used to adjust the badness heuristic used to select which +These file can be used to adjust the badness heuristic used to select which process gets killed in out of memory conditions. The badness heuristic assigns a value to each candidate task ranging from 0 @@ -1361,6 +1361,12 @@ same system, cpuset, mempolicy, or memory controller resources to use at least equivalent to discounting 50% of the task's allowed memory from being considered as scoring against the task. +For backwards compatibility with previous kernels, /proc//oom_adj may also +be used to tune the badness score. Its acceptable values range from -16 +(OOM_ADJUST_MIN) to +15 (OOM_ADJUST_MAX) and a special value of -17 +(OOM_DISABLE) to disable oom killing entirely for that task. Its value is +scaled linearly with /proc//oom_score_adj. + The value of /proc//oom_score_adj may be reduced no lower than the last value set by a CAP_SYS_RESOURCE process. To reduce the value any lower requires CAP_SYS_RESOURCE. @@ -1375,7 +1381,9 @@ minimal amount of work. ------------------------------------------------------------- This file can be used to check the current score used by the oom-killer is for -any given . +any given . Use it together with /proc//oom_score_adj to tune which +process should be killed in an out-of-memory situation. + 3.3 /proc//io - Display the IO accounting fields ------------------------------------------------------- diff --git a/fs/proc/base.c b/fs/proc/base.c index 144a96732dd7..3c231adf8450 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -873,6 +873,113 @@ static const struct file_operations proc_environ_operations = { .release = mem_release, }; +static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); + char buffer[PROC_NUMBUF]; + int oom_adj = OOM_ADJUST_MIN; + size_t len; + unsigned long flags; + + if (!task) + return -ESRCH; + if (lock_task_sighand(task, &flags)) { + if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX) + oom_adj = OOM_ADJUST_MAX; + else + oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) / + OOM_SCORE_ADJ_MAX; + unlock_task_sighand(task, &flags); + } + put_task_struct(task); + len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj); + return simple_read_from_buffer(buf, count, ppos, buffer, len); +} + +static ssize_t oom_adj_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + char buffer[PROC_NUMBUF]; + int oom_adj; + unsigned long flags; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &oom_adj); + if (err) + goto out; + if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) && + oom_adj != OOM_DISABLE) { + err = -EINVAL; + goto out; + } + + task = get_proc_task(file->f_path.dentry->d_inode); + if (!task) { + err = -ESRCH; + goto out; + } + + task_lock(task); + if (!task->mm) { + err = -EINVAL; + goto err_task_lock; + } + + if (!lock_task_sighand(task, &flags)) { + err = -ESRCH; + goto err_task_lock; + } + + /* + * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum + * value is always attainable. + */ + if (oom_adj == OOM_ADJUST_MAX) + oom_adj = OOM_SCORE_ADJ_MAX; + else + oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; + + if (oom_adj < task->signal->oom_score_adj && + !capable(CAP_SYS_RESOURCE)) { + err = -EACCES; + goto err_sighand; + } + + /* + * /proc/pid/oom_adj is provided for legacy purposes, ask users to use + * /proc/pid/oom_score_adj instead. + */ + printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", + current->comm, task_pid_nr(current), task_pid_nr(task), + task_pid_nr(task)); + + task->signal->oom_score_adj = oom_adj; + trace_oom_score_adj_update(task); +err_sighand: + unlock_task_sighand(task, &flags); +err_task_lock: + task_unlock(task); + put_task_struct(task); +out: + return err < 0 ? err : count; +} + +static const struct file_operations proc_oom_adj_operations = { + .read = oom_adj_read, + .write = oom_adj_write, + .llseek = generic_file_llseek, +}; + static ssize_t oom_score_adj_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -2598,6 +2705,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("cgroup", S_IRUGO, proc_cgroup_operations), #endif INF("oom_score", S_IRUGO, proc_oom_score), + REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), @@ -2964,6 +3072,7 @@ static const struct pid_entry tid_base_stuff[] = { REG("cgroup", S_IRUGO, proc_cgroup_operations), #endif INF("oom_score", S_IRUGO, proc_oom_score), + REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), diff --git a/include/uapi/linux/oom.h b/include/uapi/linux/oom.h index a49c4afc7060..b29272d621ce 100644 --- a/include/uapi/linux/oom.h +++ b/include/uapi/linux/oom.h @@ -8,4 +8,13 @@ #define OOM_SCORE_ADJ_MIN (-1000) #define OOM_SCORE_ADJ_MAX 1000 +/* + * /proc//oom_adj set to -17 protects from the oom killer for legacy + * purposes. + */ +#define OOM_DISABLE (-17) +/* inclusive */ +#define OOM_ADJUST_MIN (-16) +#define OOM_ADJUST_MAX 15 + #endif /* _UAPI__INCLUDE_LINUX_OOM_H */ -- cgit v1.2.3-71-gd317 From d2709c7ce4c513ab7f4ca9a106a930621811f2d3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 19 Nov 2012 22:21:03 +0000 Subject: perf: Make perf build for x86 with UAPI disintegration applied Make perf build for x86 once the UAPI disintegration patches for that arch have been applied by adding the appropriate -I flags - in the right order - and then converting some #includes that use ../.. notation to find main kernel headerfiles to use and instead. Note that -Iarch/foo/include/uapi is present _before_ -Iarch/foo/include. This makes sure we get the userspace version of the pt_regs struct. Ideally, we wouldn't have the latter -I flag at all, but unfortunately we want asm/svm.h and asm/vmx.h in builtin-kvm.c and these aren't part of the UAPI - at least not for x86. I wonder if the bits outside of the __KERNEL__ guards *should* be transferred there. I note also that perf seems to do its dependency handling manually by listing all the header files it might want to use in LIB_H in the Makefile. Can this be changed to use -MD? Note that to do make this work, we need to export and UAPI disintegrate linux/hw_breakpoint.h, which I think should've been exported previously so that perf can access the bits. We have to do this in the same patch to maintain bisectability. Signed-off-by: David Howells --- include/linux/hw_breakpoint.h | 31 +------------------------------ include/uapi/linux/Kbuild | 1 + include/uapi/linux/hw_breakpoint.h | 30 ++++++++++++++++++++++++++++++ tools/perf/Makefile | 29 ++++++++++++++++++++++++++++- tools/perf/arch/x86/include/perf_regs.h | 2 +- tools/perf/builtin-kvm.c | 6 +++--- tools/perf/builtin-test.c | 2 +- tools/perf/perf.h | 16 +++------------- tools/perf/util/evsel.c | 4 ++-- tools/perf/util/evsel.h | 3 ++- tools/perf/util/header.h | 2 +- tools/perf/util/parse-events-test.c | 2 +- tools/perf/util/parse-events.c | 2 +- tools/perf/util/parse-events.h | 2 +- tools/perf/util/pmu.h | 2 +- tools/perf/util/session.h | 2 +- 16 files changed, 78 insertions(+), 58 deletions(-) create mode 100644 include/uapi/linux/hw_breakpoint.h (limited to 'include/uapi') diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index 6ae9c631a1be..0464c85e63fd 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -1,35 +1,8 @@ #ifndef _LINUX_HW_BREAKPOINT_H #define _LINUX_HW_BREAKPOINT_H -enum { - HW_BREAKPOINT_LEN_1 = 1, - HW_BREAKPOINT_LEN_2 = 2, - HW_BREAKPOINT_LEN_4 = 4, - HW_BREAKPOINT_LEN_8 = 8, -}; - -enum { - HW_BREAKPOINT_EMPTY = 0, - HW_BREAKPOINT_R = 1, - HW_BREAKPOINT_W = 2, - HW_BREAKPOINT_RW = HW_BREAKPOINT_R | HW_BREAKPOINT_W, - HW_BREAKPOINT_X = 4, - HW_BREAKPOINT_INVALID = HW_BREAKPOINT_RW | HW_BREAKPOINT_X, -}; - -enum bp_type_idx { - TYPE_INST = 0, -#ifdef CONFIG_HAVE_MIXED_BREAKPOINTS_REGS - TYPE_DATA = 0, -#else - TYPE_DATA = 1, -#endif - TYPE_MAX -}; - -#ifdef __KERNEL__ - #include +#include #ifdef CONFIG_HAVE_HW_BREAKPOINT @@ -151,6 +124,4 @@ static inline struct arch_hw_breakpoint *counter_arch_bp(struct perf_event *bp) } #endif /* CONFIG_HAVE_HW_BREAKPOINT */ -#endif /* __KERNEL__ */ - #endif /* _LINUX_HW_BREAKPOINT_H */ diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index e194387ef784..19e765fbfef7 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -415,3 +415,4 @@ header-y += wireless.h header-y += x25.h header-y += xattr.h header-y += xfrm.h +header-y += hw_breakpoint.h diff --git a/include/uapi/linux/hw_breakpoint.h b/include/uapi/linux/hw_breakpoint.h new file mode 100644 index 000000000000..b04000a2296a --- /dev/null +++ b/include/uapi/linux/hw_breakpoint.h @@ -0,0 +1,30 @@ +#ifndef _UAPI_LINUX_HW_BREAKPOINT_H +#define _UAPI_LINUX_HW_BREAKPOINT_H + +enum { + HW_BREAKPOINT_LEN_1 = 1, + HW_BREAKPOINT_LEN_2 = 2, + HW_BREAKPOINT_LEN_4 = 4, + HW_BREAKPOINT_LEN_8 = 8, +}; + +enum { + HW_BREAKPOINT_EMPTY = 0, + HW_BREAKPOINT_R = 1, + HW_BREAKPOINT_W = 2, + HW_BREAKPOINT_RW = HW_BREAKPOINT_R | HW_BREAKPOINT_W, + HW_BREAKPOINT_X = 4, + HW_BREAKPOINT_INVALID = HW_BREAKPOINT_RW | HW_BREAKPOINT_X, +}; + +enum bp_type_idx { + TYPE_INST = 0, +#ifdef CONFIG_HAVE_MIXED_BREAKPOINTS_REGS + TYPE_DATA = 0, +#else + TYPE_DATA = 1, +#endif + TYPE_MAX +}; + +#endif /* _UAPI_LINUX_HW_BREAKPOINT_H */ diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 00deed4d6159..0a619af5be43 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -169,7 +169,34 @@ endif ### --- END CONFIGURATION SECTION --- -BASIC_CFLAGS = -Iutil/include -Iarch/$(ARCH)/include -I$(OUTPUT)util -I$(TRACE_EVENT_DIR) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE +ifeq ($(srctree),) +srctree := $(patsubst %/,%,$(dir $(shell pwd))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +#$(info Determined 'srctree' to be $(srctree)) +endif + +ifneq ($(objtree),) +#$(info Determined 'objtree' to be $(objtree)) +endif + +ifneq ($(OUTPUT),) +#$(info Determined 'OUTPUT' to be $(OUTPUT)) +endif + +BASIC_CFLAGS = \ + -Iutil/include \ + -Iarch/$(ARCH)/include \ + $(if $(objtree),-I$(objtree)/arch/$(ARCH)/include/generated/uapi) \ + -I$(srctree)/arch/$(ARCH)/include/uapi \ + -I$(srctree)/arch/$(ARCH)/include \ + $(if $(objtree),-I$(objtree)/include/generated/uapi) \ + -I$(srctree)/include/uapi \ + -I$(srctree)/include \ + -I$(OUTPUT)util \ + -Iutil \ + -I. \ + -I$(TRACE_EVENT_DIR) \ + -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE BASIC_LDFLAGS = # Guard against environment variables diff --git a/tools/perf/arch/x86/include/perf_regs.h b/tools/perf/arch/x86/include/perf_regs.h index 46fc9f15c6b3..7fcdcdbee917 100644 --- a/tools/perf/arch/x86/include/perf_regs.h +++ b/tools/perf/arch/x86/include/perf_regs.h @@ -3,7 +3,7 @@ #include #include "../../util/types.h" -#include "../../../../../arch/x86/include/asm/perf_regs.h" +#include #ifndef ARCH_X86_64 #define PERF_REGS_MASK ((1ULL << PERF_REG_X86_32_MAX) - 1) diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index 260abc535b5b..e013bdb5e24a 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -22,9 +22,9 @@ #include #include -#include "../../arch/x86/include/asm/svm.h" -#include "../../arch/x86/include/asm/vmx.h" -#include "../../arch/x86/include/asm/kvm.h" +#include +#include +#include struct event_key { #define INVALID_KEY (~0ULL) diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c index 484f26cc0c00..5acd6e8e658b 100644 --- a/tools/perf/builtin-test.c +++ b/tools/perf/builtin-test.c @@ -15,7 +15,7 @@ #include "util/thread_map.h" #include "util/pmu.h" #include "event-parse.h" -#include "../../include/linux/hw_breakpoint.h" +#include #include diff --git a/tools/perf/perf.h b/tools/perf/perf.h index e2ba8f004d32..238f923f2218 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -5,8 +5,9 @@ struct winsize; void get_term_dimensions(struct winsize *ws); +#include + #if defined(__i386__) -#include "../../arch/x86/include/asm/unistd.h" #define rmb() asm volatile("lock; addl $0,0(%%esp)" ::: "memory") #define cpu_relax() asm volatile("rep; nop" ::: "memory"); #define CPUINFO_PROC "model name" @@ -16,7 +17,6 @@ void get_term_dimensions(struct winsize *ws); #endif #if defined(__x86_64__) -#include "../../arch/x86/include/asm/unistd.h" #define rmb() asm volatile("lfence" ::: "memory") #define cpu_relax() asm volatile("rep; nop" ::: "memory"); #define CPUINFO_PROC "model name" @@ -26,20 +26,17 @@ void get_term_dimensions(struct winsize *ws); #endif #ifdef __powerpc__ -#include "../../arch/powerpc/include/uapi/asm/unistd.h" #define rmb() asm volatile ("sync" ::: "memory") #define cpu_relax() asm volatile ("" ::: "memory"); #define CPUINFO_PROC "cpu" #endif #ifdef __s390__ -#include "../../arch/s390/include/asm/unistd.h" #define rmb() asm volatile("bcr 15,0" ::: "memory") #define cpu_relax() asm volatile("" ::: "memory"); #endif #ifdef __sh__ -#include "../../arch/sh/include/asm/unistd.h" #if defined(__SH4A__) || defined(__SH5__) # define rmb() asm volatile("synco" ::: "memory") #else @@ -50,35 +47,30 @@ void get_term_dimensions(struct winsize *ws); #endif #ifdef __hppa__ -#include "../../arch/parisc/include/asm/unistd.h" #define rmb() asm volatile("" ::: "memory") #define cpu_relax() asm volatile("" ::: "memory"); #define CPUINFO_PROC "cpu" #endif #ifdef __sparc__ -#include "../../arch/sparc/include/uapi/asm/unistd.h" #define rmb() asm volatile("":::"memory") #define cpu_relax() asm volatile("":::"memory") #define CPUINFO_PROC "cpu" #endif #ifdef __alpha__ -#include "../../arch/alpha/include/asm/unistd.h" #define rmb() asm volatile("mb" ::: "memory") #define cpu_relax() asm volatile("" ::: "memory") #define CPUINFO_PROC "cpu model" #endif #ifdef __ia64__ -#include "../../arch/ia64/include/asm/unistd.h" #define rmb() asm volatile ("mf" ::: "memory") #define cpu_relax() asm volatile ("hint @pause" ::: "memory") #define CPUINFO_PROC "model name" #endif #ifdef __arm__ -#include "../../arch/arm/include/asm/unistd.h" /* * Use the __kuser_memory_barrier helper in the CPU helper page. See * arch/arm/kernel/entry-armv.S in the kernel source for details. @@ -89,13 +81,11 @@ void get_term_dimensions(struct winsize *ws); #endif #ifdef __aarch64__ -#include "../../arch/arm64/include/asm/unistd.h" #define rmb() asm volatile("dmb ld" ::: "memory") #define cpu_relax() asm volatile("yield" ::: "memory") #endif #ifdef __mips__ -#include "../../arch/mips/include/asm/unistd.h" #define rmb() asm volatile( \ ".set mips2\n\t" \ "sync\n\t" \ @@ -112,7 +102,7 @@ void get_term_dimensions(struct winsize *ws); #include #include -#include "../../include/uapi/linux/perf_event.h" +#include #include "util/types.h" #include diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 618d41140abd..d144d464ce39 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -18,8 +18,8 @@ #include "cpumap.h" #include "thread_map.h" #include "target.h" -#include "../../../include/linux/hw_breakpoint.h" -#include "../../../include/uapi/linux/perf_event.h" +#include +#include #include "perf_regs.h" #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y)) diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 6f94d6dea00f..d99b476ef37c 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -3,7 +3,8 @@ #include #include -#include "../../../include/uapi/linux/perf_event.h" +#include +#include #include "types.h" #include "xyarray.h" #include "cgroup.h" diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 879d215cdac9..9bc00783f24f 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -1,7 +1,7 @@ #ifndef __PERF_HEADER_H #define __PERF_HEADER_H -#include "../../../include/uapi/linux/perf_event.h" +#include #include #include #include "types.h" diff --git a/tools/perf/util/parse-events-test.c b/tools/perf/util/parse-events-test.c index 516ecd9ddd6e..6ef213b35ecd 100644 --- a/tools/perf/util/parse-events-test.c +++ b/tools/perf/util/parse-events-test.c @@ -3,7 +3,7 @@ #include "evsel.h" #include "evlist.h" #include "sysfs.h" -#include "../../../include/linux/hw_breakpoint.h" +#include #define TEST_ASSERT_VAL(text, cond) \ do { \ diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 75c7b0fca6d9..6b6d03e93c3d 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1,4 +1,4 @@ -#include "../../../include/linux/hw_breakpoint.h" +#include #include "util.h" #include "../perf.h" #include "evlist.h" diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 839230ceb18b..2820c407adb2 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -7,7 +7,7 @@ #include #include #include "types.h" -#include "../../../include/uapi/linux/perf_event.h" +#include #include "types.h" struct list_head; diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 39f3abac7744..fdeb8ac7c5d2 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -2,7 +2,7 @@ #define __PMU_H #include -#include "../../../include/uapi/linux/perf_event.h" +#include enum { PERF_PMU_FORMAT_VALUE_CONFIG, diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index dd6426163ba6..0eae00ad5fe7 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -7,7 +7,7 @@ #include "symbol.h" #include "thread.h" #include -#include "../../../include/uapi/linux/perf_event.h" +#include struct sample_queue; struct ip_callchain; -- cgit v1.2.3-71-gd317