cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

uprobes.c (58362B)


      1// SPDX-License-Identifier: GPL-2.0+
      2/*
      3 * User-space Probes (UProbes)
      4 *
      5 * Copyright (C) IBM Corporation, 2008-2012
      6 * Authors:
      7 *	Srikar Dronamraju
      8 *	Jim Keniston
      9 * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra
     10 */
     11
     12#include <linux/kernel.h>
     13#include <linux/highmem.h>
     14#include <linux/pagemap.h>	/* read_mapping_page */
     15#include <linux/slab.h>
     16#include <linux/sched.h>
     17#include <linux/sched/mm.h>
     18#include <linux/sched/coredump.h>
     19#include <linux/export.h>
     20#include <linux/rmap.h>		/* anon_vma_prepare */
     21#include <linux/mmu_notifier.h>	/* set_pte_at_notify */
     22#include <linux/swap.h>		/* try_to_free_swap */
     23#include <linux/ptrace.h>	/* user_enable_single_step */
     24#include <linux/kdebug.h>	/* notifier mechanism */
     25#include "../../mm/internal.h"	/* munlock_vma_page */
     26#include <linux/percpu-rwsem.h>
     27#include <linux/task_work.h>
     28#include <linux/shmem_fs.h>
     29#include <linux/khugepaged.h>
     30
     31#include <linux/uprobes.h>
     32
     33#define UINSNS_PER_PAGE			(PAGE_SIZE/UPROBE_XOL_SLOT_BYTES)
     34#define MAX_UPROBE_XOL_SLOTS		UINSNS_PER_PAGE
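/*
 * Worked example (illustrative, using the x86 arch values rather than
 * anything defined in this file): with PAGE_SIZE = 4096 and
 * UPROBE_XOL_SLOT_BYTES = 128, UINSNS_PER_PAGE = 4096 / 128 = 32, i.e.
 * each XOL page holds 32 out-of-line instruction slots.
 */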
     35
     36static struct rb_root uprobes_tree = RB_ROOT;
     37/*
     38 * allows us to skip the uprobe_mmap if there are no uprobe events active
     39 * at this time.  Probably a fine grained per inode count is better?
     40 */
     41#define no_uprobe_events()	RB_EMPTY_ROOT(&uprobes_tree)
     42
     43static DEFINE_SPINLOCK(uprobes_treelock);	/* serialize rbtree access */
     44
     45#define UPROBES_HASH_SZ	13
     46/* serialize uprobe->pending_list */
     47static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
     48#define uprobes_mmap_hash(v)	(&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
     49
     50DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
     51
     52/* Have a copy of original instruction */
     53#define UPROBE_COPY_INSN	0
     54
     55struct uprobe {
     56	struct rb_node		rb_node;	/* node in the rb tree */
     57	refcount_t		ref;
     58	struct rw_semaphore	register_rwsem;
     59	struct rw_semaphore	consumer_rwsem;
     60	struct list_head	pending_list;
     61	struct uprobe_consumer	*consumers;
     62	struct inode		*inode;		/* Also hold a ref to inode */
     63	loff_t			offset;
     64	loff_t			ref_ctr_offset;
     65	unsigned long		flags;
     66
     67	/*
     68	 * The generic code assumes that it has two members of unknown type
     69	 * owned by the arch-specific code:
     70	 *
     71	 * 	insn -	copy_insn() saves the original instruction here for
     72	 *		arch_uprobe_analyze_insn().
     73	 *
     74	 *	ixol -	potentially modified instruction to execute out of
     75	 *		line, copied to xol_area by xol_get_insn_slot().
     76	 */
     77	struct arch_uprobe	arch;
     78};
     79
     80struct delayed_uprobe {
     81	struct list_head list;
     82	struct uprobe *uprobe;
     83	struct mm_struct *mm;
     84};
     85
     86static DEFINE_MUTEX(delayed_uprobe_lock);
     87static LIST_HEAD(delayed_uprobe_list);
     88
     89/*
     90 * Execute out of line area: anonymous executable mapping installed
     91 * by the probed task to execute the copy of the original instruction
     92 * mangled by set_swbp().
     93 *
     94 * On a breakpoint hit, a thread contends for a slot.  It frees the
     95 * slot after singlestep. Currently a fixed number of slots are
     96 * allocated.
     97 */
     98struct xol_area {
     99	wait_queue_head_t 		wq;		/* if all slots are busy */
    100	atomic_t 			slot_count;	/* number of in-use slots */
    101	unsigned long 			*bitmap;	/* 0 = free slot */
    102
    103	struct vm_special_mapping	xol_mapping;
    104	struct page 			*pages[2];
    105	/*
    106	 * We keep the vma's vm_start rather than a pointer to the vma
    107	 * itself.  The probed process or a naughty kernel module could make
    108	 * the vma go away, and we must handle that reasonably gracefully.
    109	 */
    110	unsigned long 			vaddr;		/* Page(s) of instruction slots */
    111};
    112
    113/*
    114 * valid_vma: Verify if the specified vma is an executable vma
    115 * Relax restrictions while unregistering: vm_flags might have
    116 * changed after breakpoint was inserted.
    117 *	- is_register: indicates if we are in register context.
    118 *	- Return 1 if the specified virtual address is in an
    119 *	  executable vma.
    120 */
    121static bool valid_vma(struct vm_area_struct *vma, bool is_register)
    122{
    123	vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
    124
    125	if (is_register)
    126		flags |= VM_WRITE;
    127
    128	return vma->vm_file && (vma->vm_flags & flags) == VM_MAYEXEC;
    129}
    130
    131static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
    132{
    133	return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
    134}
    135
    136static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
    137{
    138	return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
    139}
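/*
 * Illustrative arithmetic for the two helpers above, using hypothetical
 * values: for a vma with vm_start = 0x400000 and vm_pgoff = 2 on a
 * 4 KiB-page system, file offset 0x3010 maps to
 * offset_to_vaddr() = 0x400000 + 0x3010 - (2 << 12) = 0x401010, and
 * vaddr_to_offset(0x401010) = (2 << 12) + (0x401010 - 0x400000) = 0x3010.
 */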
    140
    141/**
    142 * __replace_page - replace page in vma by new page.
    143 * based on replace_page in mm/ksm.c
    144 *
    145 * @vma:      vma that holds the pte pointing to page
    146 * @addr:     address the old @page is mapped at
    147 * @old_page: the page we are replacing by new_page
    148 * @new_page: the modified page we replace page by
    149 *
    150 * If @new_page is NULL, only unmap @old_page.
    151 *
    152 * Returns 0 on success, negative error code otherwise.
    153 */
    154static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
    155				struct page *old_page, struct page *new_page)
    156{
    157	struct mm_struct *mm = vma->vm_mm;
    158	DEFINE_FOLIO_VMA_WALK(pvmw, page_folio(old_page), vma, addr, 0);
    159	int err;
    160	struct mmu_notifier_range range;
    161
    162	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
    163				addr + PAGE_SIZE);
    164
    165	if (new_page) {
    166		err = mem_cgroup_charge(page_folio(new_page), vma->vm_mm,
    167					GFP_KERNEL);
    168		if (err)
    169			return err;
    170	}
    171
    172	/* For try_to_free_swap() below */
    173	lock_page(old_page);
    174
    175	mmu_notifier_invalidate_range_start(&range);
    176	err = -EAGAIN;
    177	if (!page_vma_mapped_walk(&pvmw))
    178		goto unlock;
    179	VM_BUG_ON_PAGE(addr != pvmw.address, old_page);
    180
    181	if (new_page) {
    182		get_page(new_page);
    183		page_add_new_anon_rmap(new_page, vma, addr);
    184		lru_cache_add_inactive_or_unevictable(new_page, vma);
    185	} else
    186		/* no new page, just dec_mm_counter for old_page */
    187		dec_mm_counter(mm, MM_ANONPAGES);
    188
    189	if (!PageAnon(old_page)) {
    190		dec_mm_counter(mm, mm_counter_file(old_page));
    191		inc_mm_counter(mm, MM_ANONPAGES);
    192	}
    193
    194	flush_cache_page(vma, addr, pte_pfn(*pvmw.pte));
    195	ptep_clear_flush_notify(vma, addr, pvmw.pte);
    196	if (new_page)
    197		set_pte_at_notify(mm, addr, pvmw.pte,
    198				  mk_pte(new_page, vma->vm_page_prot));
    199
    200	page_remove_rmap(old_page, vma, false);
    201	if (!page_mapped(old_page))
    202		try_to_free_swap(old_page);
    203	page_vma_mapped_walk_done(&pvmw);
    204	put_page(old_page);
    205
    206	err = 0;
    207 unlock:
    208	mmu_notifier_invalidate_range_end(&range);
    209	unlock_page(old_page);
    210	return err;
    211}
    212
    213/**
    214 * is_swbp_insn - check if instruction is breakpoint instruction.
    215 * @insn: instruction to be checked.
    216 * Default implementation of is_swbp_insn
    217 * Returns true if @insn is a breakpoint instruction.
    218 */
    219bool __weak is_swbp_insn(uprobe_opcode_t *insn)
    220{
    221	return *insn == UPROBE_SWBP_INSN;
    222}
    223
    224/**
    225 * is_trap_insn - check if instruction is breakpoint instruction.
    226 * @insn: instruction to be checked.
    227 * Default implementation of is_trap_insn
    228 * Returns true if @insn is a breakpoint instruction.
    229 *
    230 * This function is needed for the case where an architecture has multiple
    231 * trap instructions (like powerpc).
    232 */
    233bool __weak is_trap_insn(uprobe_opcode_t *insn)
    234{
    235	return is_swbp_insn(insn);
    236}
    237
    238static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
    239{
    240	void *kaddr = kmap_atomic(page);
    241	memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
    242	kunmap_atomic(kaddr);
    243}
    244
    245static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len)
    246{
    247	void *kaddr = kmap_atomic(page);
    248	memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len);
    249	kunmap_atomic(kaddr);
    250}
    251
    252static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
    253{
    254	uprobe_opcode_t old_opcode;
    255	bool is_swbp;
    256
    257	/*
    258	 * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here.
    259	 * We do not check if it is any other 'trap variant' which could
    260	 * be conditional trap instruction such as the one powerpc supports.
    261	 *
    262	 * The logic is that we do not care if the underlying instruction
    263	 * is a trap variant; uprobes always wins over any other (gdb)
    264	 * breakpoint.
    265	 */
    266	copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
    267	is_swbp = is_swbp_insn(&old_opcode);
    268
    269	if (is_swbp_insn(new_opcode)) {
    270		if (is_swbp)		/* register: already installed? */
    271			return 0;
    272	} else {
    273		if (!is_swbp)		/* unregister: was it changed by us? */
    274			return 0;
    275	}
    276
    277	return 1;
    278}
    279
    280static struct delayed_uprobe *
    281delayed_uprobe_check(struct uprobe *uprobe, struct mm_struct *mm)
    282{
    283	struct delayed_uprobe *du;
    284
    285	list_for_each_entry(du, &delayed_uprobe_list, list)
    286		if (du->uprobe == uprobe && du->mm == mm)
    287			return du;
    288	return NULL;
    289}
    290
    291static int delayed_uprobe_add(struct uprobe *uprobe, struct mm_struct *mm)
    292{
    293	struct delayed_uprobe *du;
    294
    295	if (delayed_uprobe_check(uprobe, mm))
    296		return 0;
    297
    298	du  = kzalloc(sizeof(*du), GFP_KERNEL);
    299	if (!du)
    300		return -ENOMEM;
    301
    302	du->uprobe = uprobe;
    303	du->mm = mm;
    304	list_add(&du->list, &delayed_uprobe_list);
    305	return 0;
    306}
    307
    308static void delayed_uprobe_delete(struct delayed_uprobe *du)
    309{
    310	if (WARN_ON(!du))
    311		return;
    312	list_del(&du->list);
    313	kfree(du);
    314}
    315
    316static void delayed_uprobe_remove(struct uprobe *uprobe, struct mm_struct *mm)
    317{
    318	struct list_head *pos, *q;
    319	struct delayed_uprobe *du;
    320
    321	if (!uprobe && !mm)
    322		return;
    323
    324	list_for_each_safe(pos, q, &delayed_uprobe_list) {
    325		du = list_entry(pos, struct delayed_uprobe, list);
    326
    327		if (uprobe && du->uprobe != uprobe)
    328			continue;
    329		if (mm && du->mm != mm)
    330			continue;
    331
    332		delayed_uprobe_delete(du);
    333	}
    334}
    335
    336static bool valid_ref_ctr_vma(struct uprobe *uprobe,
    337			      struct vm_area_struct *vma)
    338{
    339	unsigned long vaddr = offset_to_vaddr(vma, uprobe->ref_ctr_offset);
    340
    341	return uprobe->ref_ctr_offset &&
    342		vma->vm_file &&
    343		file_inode(vma->vm_file) == uprobe->inode &&
    344		(vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE &&
    345		vma->vm_start <= vaddr &&
    346		vma->vm_end > vaddr;
    347}
    348
    349static struct vm_area_struct *
    350find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm)
    351{
    352	struct vm_area_struct *tmp;
    353
    354	for (tmp = mm->mmap; tmp; tmp = tmp->vm_next)
    355		if (valid_ref_ctr_vma(uprobe, tmp))
    356			return tmp;
    357
    358	return NULL;
    359}
    360
    361static int
    362__update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
    363{
    364	void *kaddr;
    365	struct page *page;
    366	struct vm_area_struct *vma;
    367	int ret;
    368	short *ptr;
    369
    370	if (!vaddr || !d)
    371		return -EINVAL;
    372
    373	ret = get_user_pages_remote(mm, vaddr, 1,
    374			FOLL_WRITE, &page, &vma, NULL);
    375	if (unlikely(ret <= 0)) {
    376		/*
    377		 * We are asking for 1 page. If get_user_pages_remote() fails,
    378		 * it may return 0, in that case we have to return error.
    379		 */
    380		return ret == 0 ? -EBUSY : ret;
    381	}
    382
    383	kaddr = kmap_atomic(page);
    384	ptr = kaddr + (vaddr & ~PAGE_MASK);
    385
    386	if (unlikely(*ptr + d < 0)) {
    387		pr_warn("ref_ctr going negative. vaddr: 0x%lx, "
    388			"curr val: %d, delta: %d\n", vaddr, *ptr, d);
    389		ret = -EINVAL;
    390		goto out;
    391	}
    392
    393	*ptr += d;
    394	ret = 0;
    395out:
    396	kunmap_atomic(kaddr);
    397	put_page(page);
    398	return ret;
    399}
    400
    401static void update_ref_ctr_warn(struct uprobe *uprobe,
    402				struct mm_struct *mm, short d)
    403{
    404	pr_warn("ref_ctr %s failed for inode: 0x%lx offset: "
    405		"0x%llx ref_ctr_offset: 0x%llx of mm: 0x%pK\n",
    406		d > 0 ? "increment" : "decrement", uprobe->inode->i_ino,
    407		(unsigned long long) uprobe->offset,
    408		(unsigned long long) uprobe->ref_ctr_offset, mm);
    409}
    410
    411static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
    412			  short d)
    413{
    414	struct vm_area_struct *rc_vma;
    415	unsigned long rc_vaddr;
    416	int ret = 0;
    417
    418	rc_vma = find_ref_ctr_vma(uprobe, mm);
    419
    420	if (rc_vma) {
    421		rc_vaddr = offset_to_vaddr(rc_vma, uprobe->ref_ctr_offset);
    422		ret = __update_ref_ctr(mm, rc_vaddr, d);
    423		if (ret)
    424			update_ref_ctr_warn(uprobe, mm, d);
    425
    426		if (d > 0)
    427			return ret;
    428	}
    429
    430	mutex_lock(&delayed_uprobe_lock);
    431	if (d > 0)
    432		ret = delayed_uprobe_add(uprobe, mm);
    433	else
    434		delayed_uprobe_remove(uprobe, mm);
    435	mutex_unlock(&delayed_uprobe_lock);
    436
    437	return ret;
    438}
    439
    440/*
    441 * NOTE:
    442 * Expect the breakpoint instruction to be the smallest size instruction for
    443 * the architecture. If an arch has variable length instruction and the
    444 * breakpoint instruction is not of the smallest length instruction
    445 * supported by that architecture then we need to modify is_trap_at_addr and
    446 * uprobe_write_opcode accordingly. This would never be a problem for archs
    447 * that have fixed length instructions.
    448 *
    449 * uprobe_write_opcode - write the opcode at a given virtual address.
    450 * @auprobe: arch specific probepoint information.
    451 * @mm: the probed process address space.
    452 * @vaddr: the virtual address to store the opcode.
    453 * @opcode: opcode to be written at @vaddr.
    454 *
    455 * Called with mm->mmap_lock held for write.
    456 * Return 0 (success) or a negative errno.
    457 */
    458int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
    459			unsigned long vaddr, uprobe_opcode_t opcode)
    460{
    461	struct uprobe *uprobe;
    462	struct page *old_page, *new_page;
    463	struct vm_area_struct *vma;
    464	int ret, is_register, ref_ctr_updated = 0;
    465	bool orig_page_huge = false;
    466	unsigned int gup_flags = FOLL_FORCE;
    467
    468	is_register = is_swbp_insn(&opcode);
    469	uprobe = container_of(auprobe, struct uprobe, arch);
    470
    471retry:
    472	if (is_register)
    473		gup_flags |= FOLL_SPLIT_PMD;
    474	/* Read the page with vaddr into memory */
    475	ret = get_user_pages_remote(mm, vaddr, 1, gup_flags,
    476				    &old_page, &vma, NULL);
    477	if (ret <= 0)
    478		return ret;
    479
    480	ret = verify_opcode(old_page, vaddr, &opcode);
    481	if (ret <= 0)
    482		goto put_old;
    483
    484	if (WARN(!is_register && PageCompound(old_page),
    485		 "uprobe unregister should never work on compound page\n")) {
    486		ret = -EINVAL;
    487		goto put_old;
    488	}
    489
    490	/* We are going to replace instruction, update ref_ctr. */
    491	if (!ref_ctr_updated && uprobe->ref_ctr_offset) {
    492		ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1);
    493		if (ret)
    494			goto put_old;
    495
    496		ref_ctr_updated = 1;
    497	}
    498
    499	ret = 0;
    500	if (!is_register && !PageAnon(old_page))
    501		goto put_old;
    502
    503	ret = anon_vma_prepare(vma);
    504	if (ret)
    505		goto put_old;
    506
    507	ret = -ENOMEM;
    508	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
    509	if (!new_page)
    510		goto put_old;
    511
    512	__SetPageUptodate(new_page);
    513	copy_highpage(new_page, old_page);
    514	copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
    515
    516	if (!is_register) {
    517		struct page *orig_page;
    518		pgoff_t index;
    519
    520		VM_BUG_ON_PAGE(!PageAnon(old_page), old_page);
    521
    522		index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT;
    523		orig_page = find_get_page(vma->vm_file->f_inode->i_mapping,
    524					  index);
    525
    526		if (orig_page) {
    527			if (PageUptodate(orig_page) &&
    528			    pages_identical(new_page, orig_page)) {
    529				/* let go new_page */
    530				put_page(new_page);
    531				new_page = NULL;
    532
    533				if (PageCompound(orig_page))
    534					orig_page_huge = true;
    535			}
    536			put_page(orig_page);
    537		}
    538	}
    539
    540	ret = __replace_page(vma, vaddr, old_page, new_page);
    541	if (new_page)
    542		put_page(new_page);
    543put_old:
    544	put_page(old_page);
    545
    546	if (unlikely(ret == -EAGAIN))
    547		goto retry;
    548
    549	/* Revert the reference counter if the instruction update failed. */
    550	if (ret && is_register && ref_ctr_updated)
    551		update_ref_ctr(uprobe, mm, -1);
    552
    553	/* try collapse pmd for compound page */
    554	if (!ret && orig_page_huge)
    555		collapse_pte_mapped_thp(mm, vaddr);
    556
    557	return ret;
    558}
    559
    560/**
    561 * set_swbp - store breakpoint at a given address.
    562 * @auprobe: arch specific probepoint information.
    563 * @mm: the probed process address space.
    564 * @vaddr: the virtual address to insert the opcode.
    565 *
    566 * For mm @mm, store the breakpoint instruction at @vaddr.
    567 * Return 0 (success) or a negative errno.
    568 */
    569int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
    570{
    571	return uprobe_write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN);
    572}
    573
    574/**
    575 * set_orig_insn - Restore the original instruction.
    576 * @mm: the probed process address space.
    577 * @auprobe: arch specific probepoint information.
    578 * @vaddr: the virtual address to insert the opcode.
    579 *
    580 * For mm @mm, restore the original opcode (opcode) at @vaddr.
    581 * Return 0 (success) or a negative errno.
    582 */
    583int __weak
    584set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
    585{
    586	return uprobe_write_opcode(auprobe, mm, vaddr,
    587			*(uprobe_opcode_t *)&auprobe->insn);
    588}
    589
    590static struct uprobe *get_uprobe(struct uprobe *uprobe)
    591{
    592	refcount_inc(&uprobe->ref);
    593	return uprobe;
    594}
    595
    596static void put_uprobe(struct uprobe *uprobe)
    597{
    598	if (refcount_dec_and_test(&uprobe->ref)) {
    599		/*
    600		 * If application munmap(exec_vma) before uprobe_unregister()
    601		 * gets called, we don't get a chance to remove uprobe from
    602		 * delayed_uprobe_list from remove_breakpoint(). Do it here.
    603		 */
    604		mutex_lock(&delayed_uprobe_lock);
    605		delayed_uprobe_remove(uprobe, NULL);
    606		mutex_unlock(&delayed_uprobe_lock);
    607		kfree(uprobe);
    608	}
    609}
    610
    611static __always_inline
    612int uprobe_cmp(const struct inode *l_inode, const loff_t l_offset,
    613	       const struct uprobe *r)
    614{
    615	if (l_inode < r->inode)
    616		return -1;
    617
    618	if (l_inode > r->inode)
    619		return 1;
    620
    621	if (l_offset < r->offset)
    622		return -1;
    623
    624	if (l_offset > r->offset)
    625		return 1;
    626
    627	return 0;
    628}
    629
    630#define __node_2_uprobe(node) \
    631	rb_entry((node), struct uprobe, rb_node)
    632
    633struct __uprobe_key {
    634	struct inode *inode;
    635	loff_t offset;
    636};
    637
    638static inline int __uprobe_cmp_key(const void *key, const struct rb_node *b)
    639{
    640	const struct __uprobe_key *a = key;
    641	return uprobe_cmp(a->inode, a->offset, __node_2_uprobe(b));
    642}
    643
    644static inline int __uprobe_cmp(struct rb_node *a, const struct rb_node *b)
    645{
    646	struct uprobe *u = __node_2_uprobe(a);
    647	return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b));
    648}
    649
    650static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
    651{
    652	struct __uprobe_key key = {
    653		.inode = inode,
    654		.offset = offset,
    655	};
    656	struct rb_node *node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key);
    657
    658	if (node)
    659		return get_uprobe(__node_2_uprobe(node));
    660
    661	return NULL;
    662}
    663
    664/*
    665 * Find a uprobe corresponding to a given inode:offset
    666 * Acquires uprobes_treelock
    667 */
    668static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
    669{
    670	struct uprobe *uprobe;
    671
    672	spin_lock(&uprobes_treelock);
    673	uprobe = __find_uprobe(inode, offset);
    674	spin_unlock(&uprobes_treelock);
    675
    676	return uprobe;
    677}
    678
    679static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
    680{
    681	struct rb_node *node;
    682
    683	node = rb_find_add(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp);
    684	if (node)
    685		return get_uprobe(__node_2_uprobe(node));
    686
    687	/* get access + creation ref */
    688	refcount_set(&uprobe->ref, 2);
    689	return NULL;
    690}
    691
    692/*
    693 * Acquire uprobes_treelock.
    694 * Matching uprobe already exists in rbtree;
    695 *	increment (access refcount) and return the matching uprobe.
    696 *
    697 * No matching uprobe; insert the uprobe in rb_tree;
    698 *	get a double refcount (access + creation) and return NULL.
    699 */
    700static struct uprobe *insert_uprobe(struct uprobe *uprobe)
    701{
    702	struct uprobe *u;
    703
    704	spin_lock(&uprobes_treelock);
    705	u = __insert_uprobe(uprobe);
    706	spin_unlock(&uprobes_treelock);
    707
    708	return u;
    709}
    710
    711static void
    712ref_ctr_mismatch_warn(struct uprobe *cur_uprobe, struct uprobe *uprobe)
    713{
    714	pr_warn("ref_ctr_offset mismatch. inode: 0x%lx offset: 0x%llx "
    715		"ref_ctr_offset(old): 0x%llx ref_ctr_offset(new): 0x%llx\n",
    716		uprobe->inode->i_ino, (unsigned long long) uprobe->offset,
    717		(unsigned long long) cur_uprobe->ref_ctr_offset,
    718		(unsigned long long) uprobe->ref_ctr_offset);
    719}
    720
    721static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
    722				   loff_t ref_ctr_offset)
    723{
    724	struct uprobe *uprobe, *cur_uprobe;
    725
    726	uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL);
    727	if (!uprobe)
    728		return NULL;
    729
    730	uprobe->inode = inode;
    731	uprobe->offset = offset;
    732	uprobe->ref_ctr_offset = ref_ctr_offset;
    733	init_rwsem(&uprobe->register_rwsem);
    734	init_rwsem(&uprobe->consumer_rwsem);
    735
    736	/* add to uprobes_tree, sorted on inode:offset */
    737	cur_uprobe = insert_uprobe(uprobe);
    738	/* a uprobe exists for this inode:offset combination */
    739	if (cur_uprobe) {
    740		if (cur_uprobe->ref_ctr_offset != uprobe->ref_ctr_offset) {
    741			ref_ctr_mismatch_warn(cur_uprobe, uprobe);
    742			put_uprobe(cur_uprobe);
    743			kfree(uprobe);
    744			return ERR_PTR(-EINVAL);
    745		}
    746		kfree(uprobe);
    747		uprobe = cur_uprobe;
    748	}
    749
    750	return uprobe;
    751}
    752
    753static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
    754{
    755	down_write(&uprobe->consumer_rwsem);
    756	uc->next = uprobe->consumers;
    757	uprobe->consumers = uc;
    758	up_write(&uprobe->consumer_rwsem);
    759}
    760
    761/*
    762 * For uprobe @uprobe, delete the consumer @uc.
    763 * Return true if the @uc is deleted successfully
    764 * or return false.
    765 */
    766static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
    767{
    768	struct uprobe_consumer **con;
    769	bool ret = false;
    770
    771	down_write(&uprobe->consumer_rwsem);
    772	for (con = &uprobe->consumers; *con; con = &(*con)->next) {
    773		if (*con == uc) {
    774			*con = uc->next;
    775			ret = true;
    776			break;
    777		}
    778	}
    779	up_write(&uprobe->consumer_rwsem);
    780
    781	return ret;
    782}
    783
    784static int __copy_insn(struct address_space *mapping, struct file *filp,
    785			void *insn, int nbytes, loff_t offset)
    786{
    787	struct page *page;
    788	/*
    789	 * Ensure that the page that has the original instruction is populated
    790	 * and in page-cache. If ->read_folio == NULL it must be shmem_mapping(),
    791	 * see uprobe_register().
    792	 */
    793	if (mapping->a_ops->read_folio)
    794		page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp);
    795	else
    796		page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
    797	if (IS_ERR(page))
    798		return PTR_ERR(page);
    799
    800	copy_from_page(page, offset, insn, nbytes);
    801	put_page(page);
    802
    803	return 0;
    804}
    805
    806static int copy_insn(struct uprobe *uprobe, struct file *filp)
    807{
    808	struct address_space *mapping = uprobe->inode->i_mapping;
    809	loff_t offs = uprobe->offset;
    810	void *insn = &uprobe->arch.insn;
    811	int size = sizeof(uprobe->arch.insn);
    812	int len, err = -EIO;
    813
    814	/* Copy only available bytes, -EIO if nothing was read */
    815	do {
    816		if (offs >= i_size_read(uprobe->inode))
    817			break;
    818
    819		len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK));
    820		err = __copy_insn(mapping, filp, insn, len, offs);
    821		if (err)
    822			break;
    823
    824		insn += len;
    825		offs += len;
    826		size -= len;
    827	} while (size);
    828
    829	return err;
    830}
    831
    832static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
    833				struct mm_struct *mm, unsigned long vaddr)
    834{
    835	int ret = 0;
    836
    837	if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
    838		return ret;
    839
    840	/* TODO: move this into _register, until then we abuse this sem. */
    841	down_write(&uprobe->consumer_rwsem);
    842	if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
    843		goto out;
    844
    845	ret = copy_insn(uprobe, file);
    846	if (ret)
    847		goto out;
    848
    849	ret = -ENOTSUPP;
    850	if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn))
    851		goto out;
    852
    853	ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
    854	if (ret)
    855		goto out;
    856
    857	smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */
    858	set_bit(UPROBE_COPY_INSN, &uprobe->flags);
    859
    860 out:
    861	up_write(&uprobe->consumer_rwsem);
    862
    863	return ret;
    864}
    865
    866static inline bool consumer_filter(struct uprobe_consumer *uc,
    867				   enum uprobe_filter_ctx ctx, struct mm_struct *mm)
    868{
    869	return !uc->filter || uc->filter(uc, ctx, mm);
    870}
    871
    872static bool filter_chain(struct uprobe *uprobe,
    873			 enum uprobe_filter_ctx ctx, struct mm_struct *mm)
    874{
    875	struct uprobe_consumer *uc;
    876	bool ret = false;
    877
    878	down_read(&uprobe->consumer_rwsem);
    879	for (uc = uprobe->consumers; uc; uc = uc->next) {
    880		ret = consumer_filter(uc, ctx, mm);
    881		if (ret)
    882			break;
    883	}
    884	up_read(&uprobe->consumer_rwsem);
    885
    886	return ret;
    887}
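/*
 * Illustrative sketch of a consumer filter (hypothetical, assuming the
 * uprobe_consumer->filter callback takes (consumer, ctx, mm) as suggested
 * by consumer_filter() above): a consumer that only wants its breakpoint
 * installed in one particular address space could supply something like
 *
 *	static bool my_filter(struct uprobe_consumer *self,
 *			      enum uprobe_filter_ctx ctx, struct mm_struct *mm)
 *	{
 *		return mm == my_target_mm;
 *	}
 *
 * where my_target_mm is a hypothetical, module-private mm pointer, so that
 * filter_chain() returns true only for that address space.
 */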
    888
    889static int
    890install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
    891			struct vm_area_struct *vma, unsigned long vaddr)
    892{
    893	bool first_uprobe;
    894	int ret;
    895
    896	ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
    897	if (ret)
    898		return ret;
    899
    900	/*
    901	 * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(),
    902	 * the task can hit this breakpoint right after __replace_page().
    903	 */
    904	first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags);
    905	if (first_uprobe)
    906		set_bit(MMF_HAS_UPROBES, &mm->flags);
    907
    908	ret = set_swbp(&uprobe->arch, mm, vaddr);
    909	if (!ret)
    910		clear_bit(MMF_RECALC_UPROBES, &mm->flags);
    911	else if (first_uprobe)
    912		clear_bit(MMF_HAS_UPROBES, &mm->flags);
    913
    914	return ret;
    915}
    916
    917static int
    918remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
    919{
    920	set_bit(MMF_RECALC_UPROBES, &mm->flags);
    921	return set_orig_insn(&uprobe->arch, mm, vaddr);
    922}
    923
    924static inline bool uprobe_is_active(struct uprobe *uprobe)
    925{
    926	return !RB_EMPTY_NODE(&uprobe->rb_node);
    927}
    928/*
    929 * There could be threads that have already hit the breakpoint. They
    930 * will recheck the current insn and restart if find_uprobe() fails.
    931 * See find_active_uprobe().
    932 */
    933static void delete_uprobe(struct uprobe *uprobe)
    934{
    935	if (WARN_ON(!uprobe_is_active(uprobe)))
    936		return;
    937
    938	spin_lock(&uprobes_treelock);
    939	rb_erase(&uprobe->rb_node, &uprobes_tree);
    940	spin_unlock(&uprobes_treelock);
    941	RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
    942	put_uprobe(uprobe);
    943}
    944
    945struct map_info {
    946	struct map_info *next;
    947	struct mm_struct *mm;
    948	unsigned long vaddr;
    949};
    950
    951static inline struct map_info *free_map_info(struct map_info *info)
    952{
    953	struct map_info *next = info->next;
    954	kfree(info);
    955	return next;
    956}
    957
    958static struct map_info *
    959build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
    960{
    961	unsigned long pgoff = offset >> PAGE_SHIFT;
    962	struct vm_area_struct *vma;
    963	struct map_info *curr = NULL;
    964	struct map_info *prev = NULL;
    965	struct map_info *info;
    966	int more = 0;
    967
    968 again:
    969	i_mmap_lock_read(mapping);
    970	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
    971		if (!valid_vma(vma, is_register))
    972			continue;
    973
    974		if (!prev && !more) {
    975			/*
    976			 * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through
    977			 * reclaim. This is optimistic, no harm done if it fails.
    978			 */
    979			prev = kmalloc(sizeof(struct map_info),
    980					GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
    981			if (prev)
    982				prev->next = NULL;
    983		}
    984		if (!prev) {
    985			more++;
    986			continue;
    987		}
    988
    989		if (!mmget_not_zero(vma->vm_mm))
    990			continue;
    991
    992		info = prev;
    993		prev = prev->next;
    994		info->next = curr;
    995		curr = info;
    996
    997		info->mm = vma->vm_mm;
    998		info->vaddr = offset_to_vaddr(vma, offset);
    999	}
   1000	i_mmap_unlock_read(mapping);
   1001
   1002	if (!more)
   1003		goto out;
   1004
   1005	prev = curr;
   1006	while (curr) {
   1007		mmput(curr->mm);
   1008		curr = curr->next;
   1009	}
   1010
   1011	do {
   1012		info = kmalloc(sizeof(struct map_info), GFP_KERNEL);
   1013		if (!info) {
   1014			curr = ERR_PTR(-ENOMEM);
   1015			goto out;
   1016		}
   1017		info->next = prev;
   1018		prev = info;
   1019	} while (--more);
   1020
   1021	goto again;
   1022 out:
   1023	while (prev)
   1024		prev = free_map_info(prev);
   1025	return curr;
   1026}
   1027
   1028static int
   1029register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
   1030{
   1031	bool is_register = !!new;
   1032	struct map_info *info;
   1033	int err = 0;
   1034
   1035	percpu_down_write(&dup_mmap_sem);
   1036	info = build_map_info(uprobe->inode->i_mapping,
   1037					uprobe->offset, is_register);
   1038	if (IS_ERR(info)) {
   1039		err = PTR_ERR(info);
   1040		goto out;
   1041	}
   1042
   1043	while (info) {
   1044		struct mm_struct *mm = info->mm;
   1045		struct vm_area_struct *vma;
   1046
   1047		if (err && is_register)
   1048			goto free;
   1049
   1050		mmap_write_lock(mm);
   1051		vma = find_vma(mm, info->vaddr);
   1052		if (!vma || !valid_vma(vma, is_register) ||
   1053		    file_inode(vma->vm_file) != uprobe->inode)
   1054			goto unlock;
   1055
   1056		if (vma->vm_start > info->vaddr ||
   1057		    vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
   1058			goto unlock;
   1059
   1060		if (is_register) {
   1061			/* consult only the "caller", new consumer. */
   1062			if (consumer_filter(new,
   1063					UPROBE_FILTER_REGISTER, mm))
   1064				err = install_breakpoint(uprobe, mm, vma, info->vaddr);
   1065		} else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
   1066			if (!filter_chain(uprobe,
   1067					UPROBE_FILTER_UNREGISTER, mm))
   1068				err |= remove_breakpoint(uprobe, mm, info->vaddr);
   1069		}
   1070
   1071 unlock:
   1072		mmap_write_unlock(mm);
   1073 free:
   1074		mmput(mm);
   1075		info = free_map_info(info);
   1076	}
   1077 out:
   1078	percpu_up_write(&dup_mmap_sem);
   1079	return err;
   1080}
   1081
   1082static void
   1083__uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
   1084{
   1085	int err;
   1086
   1087	if (WARN_ON(!consumer_del(uprobe, uc)))
   1088		return;
   1089
   1090	err = register_for_each_vma(uprobe, NULL);
   1091	/* TODO: can't unregister? schedule a worker thread */
   1092	if (!uprobe->consumers && !err)
   1093		delete_uprobe(uprobe);
   1094}
   1095
   1096/*
   1097 * uprobe_unregister - unregister an already registered probe.
   1098 * @inode: the file in which the probe has to be removed.
   1099 * @offset: offset from the start of the file.
   1100 * @uc: identify which probe if multiple probes are colocated.
   1101 */
   1102void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
   1103{
   1104	struct uprobe *uprobe;
   1105
   1106	uprobe = find_uprobe(inode, offset);
   1107	if (WARN_ON(!uprobe))
   1108		return;
   1109
   1110	down_write(&uprobe->register_rwsem);
   1111	__uprobe_unregister(uprobe, uc);
   1112	up_write(&uprobe->register_rwsem);
   1113	put_uprobe(uprobe);
   1114}
   1115EXPORT_SYMBOL_GPL(uprobe_unregister);
   1116
   1117/*
   1118 * __uprobe_register - register a probe
   1119 * @inode: the file in which the probe has to be placed.
   1120 * @offset: offset from the start of the file.
   1121 * @uc: information on how to handle the probe.
   1122 *
   1123 * Apart from the access refcount, __uprobe_register() takes a creation
   1124 * refcount (through alloc_uprobe) if and only if this @uprobe is getting
   1125 * inserted into the rbtree (i.e. the first consumer for a @inode:@offset
   1126 * tuple).  Creation refcount stops uprobe_unregister from freeing the
   1127 * @uprobe even before the register operation is complete. Creation
   1128 * refcount is released when the last @uc for the @uprobe
   1129 * unregisters. Caller of __uprobe_register() is required to keep @inode
   1130 * (and the containing mount) referenced.
   1131 *
   1132 * Return errno if it cannot successfully install probes,
   1133 * else return 0 (success).
   1134 */
   1135static int __uprobe_register(struct inode *inode, loff_t offset,
   1136			     loff_t ref_ctr_offset, struct uprobe_consumer *uc)
   1137{
   1138	struct uprobe *uprobe;
   1139	int ret;
   1140
   1141	/* Uprobe must have at least one set consumer */
   1142	if (!uc->handler && !uc->ret_handler)
   1143		return -EINVAL;
   1144
   1145	/* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
   1146	if (!inode->i_mapping->a_ops->read_folio &&
   1147	    !shmem_mapping(inode->i_mapping))
   1148		return -EIO;
   1149	/* Racy, just to catch the obvious mistakes */
   1150	if (offset > i_size_read(inode))
   1151		return -EINVAL;
   1152
   1153	/*
   1154	 * This ensures that copy_from_page(), copy_to_page() and
   1155	 * __update_ref_ctr() can't cross page boundary.
   1156	 */
   1157	if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE))
   1158		return -EINVAL;
   1159	if (!IS_ALIGNED(ref_ctr_offset, sizeof(short)))
   1160		return -EINVAL;
   1161
   1162 retry:
   1163	uprobe = alloc_uprobe(inode, offset, ref_ctr_offset);
   1164	if (!uprobe)
   1165		return -ENOMEM;
   1166	if (IS_ERR(uprobe))
   1167		return PTR_ERR(uprobe);
   1168
   1169	/*
   1170	 * We can race with uprobe_unregister()->delete_uprobe().
   1171	 * Check uprobe_is_active() and retry if it is false.
   1172	 */
   1173	down_write(&uprobe->register_rwsem);
   1174	ret = -EAGAIN;
   1175	if (likely(uprobe_is_active(uprobe))) {
   1176		consumer_add(uprobe, uc);
   1177		ret = register_for_each_vma(uprobe, uc);
   1178		if (ret)
   1179			__uprobe_unregister(uprobe, uc);
   1180	}
   1181	up_write(&uprobe->register_rwsem);
   1182	put_uprobe(uprobe);
   1183
   1184	if (unlikely(ret == -EAGAIN))
   1185		goto retry;
   1186	return ret;
   1187}
   1188
   1189int uprobe_register(struct inode *inode, loff_t offset,
   1190		    struct uprobe_consumer *uc)
   1191{
   1192	return __uprobe_register(inode, offset, 0, uc);
   1193}
   1194EXPORT_SYMBOL_GPL(uprobe_register);
   1195
   1196int uprobe_register_refctr(struct inode *inode, loff_t offset,
   1197			   loff_t ref_ctr_offset, struct uprobe_consumer *uc)
   1198{
   1199	return __uprobe_register(inode, offset, ref_ctr_offset, uc);
   1200}
   1201EXPORT_SYMBOL_GPL(uprobe_register_refctr);
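/*
 * Illustrative usage sketch (hypothetical, not part of this file): a kernel
 * module probing a given inode:offset would typically do something along
 * these lines, assuming the uprobe_consumer handler signature from
 * <linux/uprobes.h>:
 *
 *	static int my_handler(struct uprobe_consumer *self, struct pt_regs *regs)
 *	{
 *		pr_info("uprobe hit, ip=%lx\n", instruction_pointer(regs));
 *		return 0;
 *	}
 *
 *	static struct uprobe_consumer my_consumer = { .handler = my_handler };
 *
 *	err = uprobe_register(inode, offset, &my_consumer);
 *	...
 *	uprobe_unregister(inode, offset, &my_consumer);
 *
 * The caller must keep the inode (and the containing mount) referenced for
 * as long as the probe is registered, as noted above __uprobe_register().
 */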
   1202
   1203/*
   1204 * uprobe_apply - add or remove the breakpoints for an already registered probe.
   1205 * @inode: the file in which the probe resides.
   1206 * @offset: offset from the start of the file.
   1207 * @uc: consumer which wants to add more or remove some breakpoints
   1208 * @add: add or remove the breakpoints
   1209 */
   1210int uprobe_apply(struct inode *inode, loff_t offset,
   1211			struct uprobe_consumer *uc, bool add)
   1212{
   1213	struct uprobe *uprobe;
   1214	struct uprobe_consumer *con;
   1215	int ret = -ENOENT;
   1216
   1217	uprobe = find_uprobe(inode, offset);
   1218	if (WARN_ON(!uprobe))
   1219		return ret;
   1220
   1221	down_write(&uprobe->register_rwsem);
   1222	for (con = uprobe->consumers; con && con != uc ; con = con->next)
   1223		;
   1224	if (con)
   1225		ret = register_for_each_vma(uprobe, add ? uc : NULL);
   1226	up_write(&uprobe->register_rwsem);
   1227	put_uprobe(uprobe);
   1228
   1229	return ret;
   1230}
   1231
   1232static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
   1233{
   1234	struct vm_area_struct *vma;
   1235	int err = 0;
   1236
   1237	mmap_read_lock(mm);
   1238	for (vma = mm->mmap; vma; vma = vma->vm_next) {
   1239		unsigned long vaddr;
   1240		loff_t offset;
   1241
   1242		if (!valid_vma(vma, false) ||
   1243		    file_inode(vma->vm_file) != uprobe->inode)
   1244			continue;
   1245
   1246		offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
   1247		if (uprobe->offset <  offset ||
   1248		    uprobe->offset >= offset + vma->vm_end - vma->vm_start)
   1249			continue;
   1250
   1251		vaddr = offset_to_vaddr(vma, uprobe->offset);
   1252		err |= remove_breakpoint(uprobe, mm, vaddr);
   1253	}
   1254	mmap_read_unlock(mm);
   1255
   1256	return err;
   1257}
   1258
   1259static struct rb_node *
   1260find_node_in_range(struct inode *inode, loff_t min, loff_t max)
   1261{
   1262	struct rb_node *n = uprobes_tree.rb_node;
   1263
   1264	while (n) {
   1265		struct uprobe *u = rb_entry(n, struct uprobe, rb_node);
   1266
   1267		if (inode < u->inode) {
   1268			n = n->rb_left;
   1269		} else if (inode > u->inode) {
   1270			n = n->rb_right;
   1271		} else {
   1272			if (max < u->offset)
   1273				n = n->rb_left;
   1274			else if (min > u->offset)
   1275				n = n->rb_right;
   1276			else
   1277				break;
   1278		}
   1279	}
   1280
   1281	return n;
   1282}
   1283
   1284/*
   1285 * For a given range in vma, build a list of probes that need to be inserted.
   1286 */
   1287static void build_probe_list(struct inode *inode,
   1288				struct vm_area_struct *vma,
   1289				unsigned long start, unsigned long end,
   1290				struct list_head *head)
   1291{
   1292	loff_t min, max;
   1293	struct rb_node *n, *t;
   1294	struct uprobe *u;
   1295
   1296	INIT_LIST_HEAD(head);
   1297	min = vaddr_to_offset(vma, start);
   1298	max = min + (end - start) - 1;
   1299
   1300	spin_lock(&uprobes_treelock);
   1301	n = find_node_in_range(inode, min, max);
   1302	if (n) {
   1303		for (t = n; t; t = rb_prev(t)) {
   1304			u = rb_entry(t, struct uprobe, rb_node);
   1305			if (u->inode != inode || u->offset < min)
   1306				break;
   1307			list_add(&u->pending_list, head);
   1308			get_uprobe(u);
   1309		}
   1310		for (t = n; (t = rb_next(t)); ) {
   1311			u = rb_entry(t, struct uprobe, rb_node);
   1312			if (u->inode != inode || u->offset > max)
   1313				break;
   1314			list_add(&u->pending_list, head);
   1315			get_uprobe(u);
   1316		}
   1317	}
   1318	spin_unlock(&uprobes_treelock);
   1319}
   1320
   1321/* @vma contains reference counter, not the probed instruction. */
   1322static int delayed_ref_ctr_inc(struct vm_area_struct *vma)
   1323{
   1324	struct list_head *pos, *q;
   1325	struct delayed_uprobe *du;
   1326	unsigned long vaddr;
   1327	int ret = 0, err = 0;
   1328
   1329	mutex_lock(&delayed_uprobe_lock);
   1330	list_for_each_safe(pos, q, &delayed_uprobe_list) {
   1331		du = list_entry(pos, struct delayed_uprobe, list);
   1332
   1333		if (du->mm != vma->vm_mm ||
   1334		    !valid_ref_ctr_vma(du->uprobe, vma))
   1335			continue;
   1336
   1337		vaddr = offset_to_vaddr(vma, du->uprobe->ref_ctr_offset);
   1338		ret = __update_ref_ctr(vma->vm_mm, vaddr, 1);
   1339		if (ret) {
   1340			update_ref_ctr_warn(du->uprobe, vma->vm_mm, 1);
   1341			if (!err)
   1342				err = ret;
   1343		}
   1344		delayed_uprobe_delete(du);
   1345	}
   1346	mutex_unlock(&delayed_uprobe_lock);
   1347	return err;
   1348}
   1349
   1350/*
   1351 * Called from mmap_region/vma_adjust with mm->mmap_lock acquired.
   1352 *
   1353 * Currently we ignore all errors and always return 0, the callers
   1354 * can't handle the failure anyway.
   1355 */
   1356int uprobe_mmap(struct vm_area_struct *vma)
   1357{
   1358	struct list_head tmp_list;
   1359	struct uprobe *uprobe, *u;
   1360	struct inode *inode;
   1361
   1362	if (no_uprobe_events())
   1363		return 0;
   1364
   1365	if (vma->vm_file &&
   1366	    (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE &&
   1367	    test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags))
   1368		delayed_ref_ctr_inc(vma);
   1369
   1370	if (!valid_vma(vma, true))
   1371		return 0;
   1372
   1373	inode = file_inode(vma->vm_file);
   1374	if (!inode)
   1375		return 0;
   1376
   1377	mutex_lock(uprobes_mmap_hash(inode));
   1378	build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
   1379	/*
   1380	 * We can race with uprobe_unregister(), this uprobe can be already
   1381	 * removed. But in this case filter_chain() must return false, all
   1382	 * consumers have gone away.
   1383	 */
   1384	list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
   1385		if (!fatal_signal_pending(current) &&
   1386		    filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) {
   1387			unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
   1388			install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
   1389		}
   1390		put_uprobe(uprobe);
   1391	}
   1392	mutex_unlock(uprobes_mmap_hash(inode));
   1393
   1394	return 0;
   1395}
   1396
   1397static bool
   1398vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end)
   1399{
   1400	loff_t min, max;
   1401	struct inode *inode;
   1402	struct rb_node *n;
   1403
   1404	inode = file_inode(vma->vm_file);
   1405
   1406	min = vaddr_to_offset(vma, start);
   1407	max = min + (end - start) - 1;
   1408
   1409	spin_lock(&uprobes_treelock);
   1410	n = find_node_in_range(inode, min, max);
   1411	spin_unlock(&uprobes_treelock);
   1412
   1413	return !!n;
   1414}
   1415
   1416/*
   1417 * Called in context of a munmap of a vma.
   1418 */
   1419void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
   1420{
   1421	if (no_uprobe_events() || !valid_vma(vma, false))
   1422		return;
   1423
   1424	if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
   1425		return;
   1426
   1427	if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) ||
   1428	     test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags))
   1429		return;
   1430
   1431	if (vma_has_uprobes(vma, start, end))
   1432		set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags);
   1433}
   1434
   1435/* Slot allocation for XOL */
   1436static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
   1437{
   1438	struct vm_area_struct *vma;
   1439	int ret;
   1440
   1441	if (mmap_write_lock_killable(mm))
   1442		return -EINTR;
   1443
   1444	if (mm->uprobes_state.xol_area) {
   1445		ret = -EALREADY;
   1446		goto fail;
   1447	}
   1448
   1449	if (!area->vaddr) {
   1450		/* Try to map as high as possible, this is only a hint. */
   1451		area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
   1452						PAGE_SIZE, 0, 0);
   1453		if (IS_ERR_VALUE(area->vaddr)) {
   1454			ret = area->vaddr;
   1455			goto fail;
   1456		}
   1457	}
   1458
   1459	vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE,
   1460				VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO,
   1461				&area->xol_mapping);
   1462	if (IS_ERR(vma)) {
   1463		ret = PTR_ERR(vma);
   1464		goto fail;
   1465	}
   1466
   1467	ret = 0;
   1468	/* pairs with get_xol_area() */
   1469	smp_store_release(&mm->uprobes_state.xol_area, area); /* ^^^ */
   1470 fail:
   1471	mmap_write_unlock(mm);
   1472
   1473	return ret;
   1474}
   1475
   1476static struct xol_area *__create_xol_area(unsigned long vaddr)
   1477{
   1478	struct mm_struct *mm = current->mm;
   1479	uprobe_opcode_t insn = UPROBE_SWBP_INSN;
   1480	struct xol_area *area;
   1481
   1482	area = kmalloc(sizeof(*area), GFP_KERNEL);
   1483	if (unlikely(!area))
   1484		goto out;
   1485
   1486	area->bitmap = kcalloc(BITS_TO_LONGS(UINSNS_PER_PAGE), sizeof(long),
   1487			       GFP_KERNEL);
   1488	if (!area->bitmap)
   1489		goto free_area;
   1490
   1491	area->xol_mapping.name = "[uprobes]";
   1492	area->xol_mapping.fault = NULL;
   1493	area->xol_mapping.pages = area->pages;
   1494	area->pages[0] = alloc_page(GFP_HIGHUSER);
   1495	if (!area->pages[0])
   1496		goto free_bitmap;
   1497	area->pages[1] = NULL;
   1498
   1499	area->vaddr = vaddr;
   1500	init_waitqueue_head(&area->wq);
   1501	/* Reserve the 1st slot for get_trampoline_vaddr() */
   1502	set_bit(0, area->bitmap);
   1503	atomic_set(&area->slot_count, 1);
   1504	arch_uprobe_copy_ixol(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE);
   1505
   1506	if (!xol_add_vma(mm, area))
   1507		return area;
   1508
   1509	__free_page(area->pages[0]);
   1510 free_bitmap:
   1511	kfree(area->bitmap);
   1512 free_area:
   1513	kfree(area);
   1514 out:
   1515	return NULL;
   1516}
   1517
   1518/*
   1519 * get_xol_area - Allocate process's xol_area if necessary.
   1520 * This area will be used for storing instructions for execution out of line.
   1521 *
   1522 * Returns the allocated area or NULL.
   1523 */
   1524static struct xol_area *get_xol_area(void)
   1525{
   1526	struct mm_struct *mm = current->mm;
   1527	struct xol_area *area;
   1528
   1529	if (!mm->uprobes_state.xol_area)
   1530		__create_xol_area(0);
   1531
   1532	/* Pairs with xol_add_vma() smp_store_release() */
   1533	area = READ_ONCE(mm->uprobes_state.xol_area); /* ^^^ */
   1534	return area;
   1535}
   1536
   1537/*
   1538 * uprobe_clear_state - Free the area allocated for slots.
   1539 */
   1540void uprobe_clear_state(struct mm_struct *mm)
   1541{
   1542	struct xol_area *area = mm->uprobes_state.xol_area;
   1543
   1544	mutex_lock(&delayed_uprobe_lock);
   1545	delayed_uprobe_remove(NULL, mm);
   1546	mutex_unlock(&delayed_uprobe_lock);
   1547
   1548	if (!area)
   1549		return;
   1550
   1551	put_page(area->pages[0]);
   1552	kfree(area->bitmap);
   1553	kfree(area);
   1554}
   1555
   1556void uprobe_start_dup_mmap(void)
   1557{
   1558	percpu_down_read(&dup_mmap_sem);
   1559}
   1560
   1561void uprobe_end_dup_mmap(void)
   1562{
   1563	percpu_up_read(&dup_mmap_sem);
   1564}
   1565
   1566void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
   1567{
   1568	if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) {
   1569		set_bit(MMF_HAS_UPROBES, &newmm->flags);
   1570		/* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */
   1571		set_bit(MMF_RECALC_UPROBES, &newmm->flags);
   1572	}
   1573}
   1574
   1575/*
   1576 *  - search for a free slot.
   1577 */
   1578static unsigned long xol_take_insn_slot(struct xol_area *area)
   1579{
   1580	unsigned long slot_addr;
   1581	int slot_nr;
   1582
   1583	do {
   1584		slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE);
   1585		if (slot_nr < UINSNS_PER_PAGE) {
   1586			if (!test_and_set_bit(slot_nr, area->bitmap))
   1587				break;
   1588
   1589			slot_nr = UINSNS_PER_PAGE;
   1590			continue;
   1591		}
   1592		wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE));
   1593	} while (slot_nr >= UINSNS_PER_PAGE);
   1594
   1595	slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES);
   1596	atomic_inc(&area->slot_count);
   1597
   1598	return slot_addr;
   1599}
   1600
   1601/*
   1602 * xol_get_insn_slot - allocate a slot for xol.
   1603 * Returns the allocated slot address or 0.
   1604 */
   1605static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
   1606{
   1607	struct xol_area *area;
   1608	unsigned long xol_vaddr;
   1609
   1610	area = get_xol_area();
   1611	if (!area)
   1612		return 0;
   1613
   1614	xol_vaddr = xol_take_insn_slot(area);
   1615	if (unlikely(!xol_vaddr))
   1616		return 0;
   1617
   1618	arch_uprobe_copy_ixol(area->pages[0], xol_vaddr,
   1619			      &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
   1620
   1621	return xol_vaddr;
   1622}
   1623
   1624/*
   1625 * xol_free_insn_slot - If slot was earlier allocated by
   1626 * @xol_get_insn_slot(), make the slot available for
   1627 * subsequent requests.
   1628 */
   1629static void xol_free_insn_slot(struct task_struct *tsk)
   1630{
   1631	struct xol_area *area;
   1632	unsigned long vma_end;
   1633	unsigned long slot_addr;
   1634
   1635	if (!tsk->mm || !tsk->mm->uprobes_state.xol_area || !tsk->utask)
   1636		return;
   1637
   1638	slot_addr = tsk->utask->xol_vaddr;
   1639	if (unlikely(!slot_addr))
   1640		return;
   1641
   1642	area = tsk->mm->uprobes_state.xol_area;
   1643	vma_end = area->vaddr + PAGE_SIZE;
   1644	if (area->vaddr <= slot_addr && slot_addr < vma_end) {
   1645		unsigned long offset;
   1646		int slot_nr;
   1647
   1648		offset = slot_addr - area->vaddr;
   1649		slot_nr = offset / UPROBE_XOL_SLOT_BYTES;
   1650		if (slot_nr >= UINSNS_PER_PAGE)
   1651			return;
   1652
   1653		clear_bit(slot_nr, area->bitmap);
   1654		atomic_dec(&area->slot_count);
   1655		smp_mb__after_atomic(); /* pairs with prepare_to_wait() */
   1656		if (waitqueue_active(&area->wq))
   1657			wake_up(&area->wq);
   1658
   1659		tsk->utask->xol_vaddr = 0;
   1660	}
   1661}
   1662
   1663void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
   1664				  void *src, unsigned long len)
   1665{
   1666	/* Initialize the slot */
   1667	copy_to_page(page, vaddr, src, len);
   1668
   1669	/*
   1670	 * We probably need flush_icache_user_page() but it needs vma.
   1671	 * This should work on most of architectures by default. If
   1672	 * architecture needs to do something different it can define
   1673	 * its own version of the function.
   1674	 */
   1675	flush_dcache_page(page);
   1676}
   1677
   1678/**
   1679 * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs
   1680 * @regs: Reflects the saved state of the task after it has hit a breakpoint
   1681 * instruction.
   1682 * Return the address of the breakpoint instruction.
   1683 */
   1684unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
   1685{
   1686	return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE;
   1687}
   1688
   1689unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
   1690{
   1691	struct uprobe_task *utask = current->utask;
   1692
   1693	if (unlikely(utask && utask->active_uprobe))
   1694		return utask->vaddr;
   1695
   1696	return instruction_pointer(regs);
   1697}
   1698
   1699static struct return_instance *free_ret_instance(struct return_instance *ri)
   1700{
   1701	struct return_instance *next = ri->next;
   1702	put_uprobe(ri->uprobe);
   1703	kfree(ri);
   1704	return next;
   1705}
   1706
   1707/*
   1708 * Called with no locks held.
   1709 * Called in context of an exiting or an exec-ing thread.
   1710 */
   1711void uprobe_free_utask(struct task_struct *t)
   1712{
   1713	struct uprobe_task *utask = t->utask;
   1714	struct return_instance *ri;
   1715
   1716	if (!utask)
   1717		return;
   1718
   1719	if (utask->active_uprobe)
   1720		put_uprobe(utask->active_uprobe);
   1721
   1722	ri = utask->return_instances;
   1723	while (ri)
   1724		ri = free_ret_instance(ri);
   1725
   1726	xol_free_insn_slot(t);
   1727	kfree(utask);
   1728	t->utask = NULL;
   1729}
   1730
   1731/*
   1732 * Allocate a uprobe_task object for the task if necessary.
   1733 * Called when the thread hits a breakpoint.
   1734 *
   1735 * Returns:
   1736 * - pointer to new uprobe_task on success
   1737 * - NULL otherwise
   1738 */
   1739static struct uprobe_task *get_utask(void)
   1740{
   1741	if (!current->utask)
   1742		current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
   1743	return current->utask;
   1744}
   1745
   1746static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
   1747{
   1748	struct uprobe_task *n_utask;
   1749	struct return_instance **p, *o, *n;
   1750
   1751	n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
   1752	if (!n_utask)
   1753		return -ENOMEM;
   1754	t->utask = n_utask;
   1755
   1756	p = &n_utask->return_instances;
   1757	for (o = o_utask->return_instances; o; o = o->next) {
   1758		n = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
   1759		if (!n)
   1760			return -ENOMEM;
   1761
   1762		*n = *o;
   1763		get_uprobe(n->uprobe);
   1764		n->next = NULL;
   1765
   1766		*p = n;
   1767		p = &n->next;
   1768		n_utask->depth++;
   1769	}
   1770
   1771	return 0;
   1772}
   1773
   1774static void uprobe_warn(struct task_struct *t, const char *msg)
   1775{
   1776	pr_warn("uprobe: %s:%d failed to %s\n",
   1777			current->comm, current->pid, msg);
   1778}
   1779
   1780static void dup_xol_work(struct callback_head *work)
   1781{
   1782	if (current->flags & PF_EXITING)
   1783		return;
   1784
   1785	if (!__create_xol_area(current->utask->dup_xol_addr) &&
   1786			!fatal_signal_pending(current))
   1787		uprobe_warn(current, "dup xol area");
   1788}
   1789
   1790/*
   1791 * Called in context of a new clone/fork from copy_process.
   1792 */
   1793void uprobe_copy_process(struct task_struct *t, unsigned long flags)
   1794{
   1795	struct uprobe_task *utask = current->utask;
   1796	struct mm_struct *mm = current->mm;
   1797	struct xol_area *area;
   1798
   1799	t->utask = NULL;
   1800
   1801	if (!utask || !utask->return_instances)
   1802		return;
   1803
   1804	if (mm == t->mm && !(flags & CLONE_VFORK))
   1805		return;
   1806
   1807	if (dup_utask(t, utask))
   1808		return uprobe_warn(t, "dup ret instances");
   1809
   1810	/* The task can fork() after dup_xol_work() fails */
   1811	area = mm->uprobes_state.xol_area;
   1812	if (!area)
   1813		return uprobe_warn(t, "dup xol area");
   1814
   1815	if (mm == t->mm)
   1816		return;
   1817
   1818	t->utask->dup_xol_addr = area->vaddr;
   1819	init_task_work(&t->utask->dup_xol_work, dup_xol_work);
   1820	task_work_add(t, &t->utask->dup_xol_work, TWA_RESUME);
   1821}
   1822
   1823/*
    1824 * The current area->vaddr notion assumes the trampoline address is always
    1825 * equal to area->vaddr.
   1826 *
   1827 * Returns -1 in case the xol_area is not allocated.
   1828 */
   1829static unsigned long get_trampoline_vaddr(void)
   1830{
   1831	struct xol_area *area;
   1832	unsigned long trampoline_vaddr = -1;
   1833
   1834	/* Pairs with xol_add_vma() smp_store_release() */
   1835	area = READ_ONCE(current->mm->uprobes_state.xol_area); /* ^^^ */
   1836	if (area)
   1837		trampoline_vaddr = area->vaddr;
   1838
   1839	return trampoline_vaddr;
   1840}
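
        /*
         * The pairing referenced above: whoever installs the XOL area must
         * fully initialize it (including area->vaddr) before publishing the
         * pointer, which is what the smp_store_release() in xol_add_vma()
         * provides.  A minimal sketch of that publication step; the helper
         * name here is made up for illustration, the real code lives in
         * xol_add_vma() earlier in this file.
         */
        #if 0	/* illustrative only */
        static void example_publish_xol_area(struct mm_struct *mm, struct xol_area *area)
        {
        	/* All stores initializing *area happen before this release. */
        	smp_store_release(&mm->uprobes_state.xol_area, area);
        }
        #endif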
   1841
   1842static void cleanup_return_instances(struct uprobe_task *utask, bool chained,
   1843					struct pt_regs *regs)
   1844{
   1845	struct return_instance *ri = utask->return_instances;
   1846	enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL;
   1847
   1848	while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) {
   1849		ri = free_ret_instance(ri);
   1850		utask->depth--;
   1851	}
   1852	utask->return_instances = ri;
   1853}
   1854
   1855static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
   1856{
   1857	struct return_instance *ri;
   1858	struct uprobe_task *utask;
   1859	unsigned long orig_ret_vaddr, trampoline_vaddr;
   1860	bool chained;
   1861
   1862	if (!get_xol_area())
   1863		return;
   1864
   1865	utask = get_utask();
   1866	if (!utask)
   1867		return;
   1868
   1869	if (utask->depth >= MAX_URETPROBE_DEPTH) {
   1870		printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to"
   1871				" nestedness limit pid/tgid=%d/%d\n",
   1872				current->pid, current->tgid);
   1873		return;
   1874	}
   1875
   1876	ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
   1877	if (!ri)
   1878		return;
   1879
   1880	trampoline_vaddr = get_trampoline_vaddr();
   1881	orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
   1882	if (orig_ret_vaddr == -1)
   1883		goto fail;
   1884
   1885	/* drop the entries invalidated by longjmp() */
   1886	chained = (orig_ret_vaddr == trampoline_vaddr);
   1887	cleanup_return_instances(utask, chained, regs);
   1888
   1889	/*
    1890	 * We don't want to keep the trampoline address on the stack; rather, keep
    1891	 * the original return address of the first caller through all the chained
    1892	 * instances. This also makes breakpoint unwinding easier.
   1893	 */
   1894	if (chained) {
   1895		if (!utask->return_instances) {
   1896			/*
   1897			 * This situation is not possible. Likely we have an
   1898			 * attack from user-space.
   1899			 */
   1900			uprobe_warn(current, "handle tail call");
   1901			goto fail;
   1902		}
   1903		orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
   1904	}
   1905
   1906	ri->uprobe = get_uprobe(uprobe);
   1907	ri->func = instruction_pointer(regs);
   1908	ri->stack = user_stack_pointer(regs);
   1909	ri->orig_ret_vaddr = orig_ret_vaddr;
   1910	ri->chained = chained;
   1911
   1912	utask->depth++;
   1913	ri->next = utask->return_instances;
   1914	utask->return_instances = ri;
   1915
   1916	return;
   1917 fail:
   1918	kfree(ri);
   1919}
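
        /*
         * A hedged usage sketch of the uretprobe machinery set up above: a
         * consumer that only fills in ->ret_handler gets called when the
         * probed function returns through the trampoline.  All names and the
         * commented-out registration below are placeholders, not real values.
         */
        #if 0	/* illustrative only */
        static int example_ret_handler(struct uprobe_consumer *self,
        			       unsigned long func, struct pt_regs *regs)
        {
        	pr_info("uretprobe: %lx returned, sp=%lx\n",
        		func, user_stack_pointer(regs));
        	return 0;
        }

        static struct uprobe_consumer example_consumer = {
        	.ret_handler	= example_ret_handler,
        };

        /* err = uprobe_register(example_inode, example_offset, &example_consumer); */
        #endif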
   1920
   1921/* Prepare to single-step probed instruction out of line. */
   1922static int
   1923pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
   1924{
   1925	struct uprobe_task *utask;
   1926	unsigned long xol_vaddr;
   1927	int err;
   1928
   1929	utask = get_utask();
   1930	if (!utask)
   1931		return -ENOMEM;
   1932
   1933	xol_vaddr = xol_get_insn_slot(uprobe);
   1934	if (!xol_vaddr)
   1935		return -ENOMEM;
   1936
   1937	utask->xol_vaddr = xol_vaddr;
   1938	utask->vaddr = bp_vaddr;
   1939
   1940	err = arch_uprobe_pre_xol(&uprobe->arch, regs);
   1941	if (unlikely(err)) {
   1942		xol_free_insn_slot(current);
   1943		return err;
   1944	}
   1945
   1946	utask->active_uprobe = uprobe;
   1947	utask->state = UTASK_SSTEP;
   1948	return 0;
   1949}
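
        /*
         * A hedged sketch of what arch_uprobe_pre_xol() is expected to do with
         * the slot prepared above (details differ per port; this is not any
         * specific in-tree implementation): redirect the task to the XOL slot
         * and arrange to trap again right after the single instruction runs.
         */
        #if 0	/* illustrative only */
        int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
        {
        	struct uprobe_task *utask = current->utask;

        	/* Execute the copied instruction from the XOL slot ... */
        	instruction_pointer_set(regs, utask->xol_vaddr);
        	/* ... and single-step it so the kernel regains control after it. */
        	user_enable_single_step(current);
        	return 0;
        }
        #endif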
   1950
   1951/*
    1952 * If we are single-stepping, ensure this thread does not receive any
    1953 * non-fatal signal until the single-step completes.  When the xol insn
    1954 * itself triggers a signal, restart the original insn even if the task is
    1955 * already SIGKILL'ed (since the coredump should report the correct ip).
    1956 * This is even more important if the task has a handler for SIGSEGV/etc:
    1957 * the _same_ instruction would have to be repeated after return from the
    1958 * signal handler, and SSTEP could never finish in that case.
   1959 */
   1960bool uprobe_deny_signal(void)
   1961{
   1962	struct task_struct *t = current;
   1963	struct uprobe_task *utask = t->utask;
   1964
   1965	if (likely(!utask || !utask->active_uprobe))
   1966		return false;
   1967
   1968	WARN_ON_ONCE(utask->state != UTASK_SSTEP);
   1969
   1970	if (task_sigpending(t)) {
   1971		spin_lock_irq(&t->sighand->siglock);
   1972		clear_tsk_thread_flag(t, TIF_SIGPENDING);
   1973		spin_unlock_irq(&t->sighand->siglock);
   1974
   1975		if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) {
   1976			utask->state = UTASK_SSTEP_TRAPPED;
   1977			set_tsk_thread_flag(t, TIF_UPROBE);
   1978		}
   1979	}
   1980
   1981	return true;
   1982}
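
        /*
         * For context, a hedged sketch of the expected caller: the signal
         * delivery path (get_signal()) consults this helper early and bails
         * out while a single-step is in flight; the pending non-fatal signal
         * is re-detected after handle_singlestep() has called
         * recalc_sigpending().
         */
        #if 0	/* illustrative fragment, cf. kernel/signal.c */
        	if (unlikely(uprobe_deny_signal()))
        		return false;	/* deliver the signal after the step */
        #endif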
   1983
   1984static void mmf_recalc_uprobes(struct mm_struct *mm)
   1985{
   1986	struct vm_area_struct *vma;
   1987
   1988	for (vma = mm->mmap; vma; vma = vma->vm_next) {
   1989		if (!valid_vma(vma, false))
   1990			continue;
   1991		/*
    1992		 * This is not strictly accurate: we can race with
    1993		 * uprobe_unregister() and see an already removed
    1994		 * uprobe if delete_uprobe() was not yet called,
    1995		 * or the uprobe can be filtered out.
   1996		 */
   1997		if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end))
   1998			return;
   1999	}
   2000
   2001	clear_bit(MMF_HAS_UPROBES, &mm->flags);
   2002}
   2003
   2004static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
   2005{
   2006	struct page *page;
   2007	uprobe_opcode_t opcode;
   2008	int result;
   2009
   2010	if (WARN_ON_ONCE(!IS_ALIGNED(vaddr, UPROBE_SWBP_INSN_SIZE)))
   2011		return -EINVAL;
   2012
   2013	pagefault_disable();
   2014	result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr);
   2015	pagefault_enable();
   2016
   2017	if (likely(result == 0))
   2018		goto out;
   2019
   2020	/*
    2021	 * Faults that occur here are not accounted to the task.
    2022	 * 'mm' *is* current->mm, but we treat this as a 'remote'
    2023	 * access (get_user_pages_remote()) since it is essentially
    2024	 * a kernel access to the memory.
   2025	 */
   2026	result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page,
   2027			NULL, NULL);
   2028	if (result < 0)
   2029		return result;
   2030
   2031	copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
   2032	put_page(page);
   2033 out:
   2034	/* This needs to return true for any variant of the trap insn */
   2035	return is_trap_insn(&opcode);
   2036}
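
        /*
         * A hedged note on the check above: the __weak default of
         * is_trap_insn(), defined earlier in this file, simply compares the
         * opcode against UPROBE_SWBP_INSN via is_swbp_insn(); architectures
         * with more than one breakpoint encoding override it.  Roughly:
         */
        #if 0	/* illustrative only */
        bool __weak is_trap_insn(uprobe_opcode_t *insn)
        {
        	return is_swbp_insn(insn);
        }
        #endif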
   2037
   2038static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
   2039{
   2040	struct mm_struct *mm = current->mm;
   2041	struct uprobe *uprobe = NULL;
   2042	struct vm_area_struct *vma;
   2043
   2044	mmap_read_lock(mm);
   2045	vma = vma_lookup(mm, bp_vaddr);
   2046	if (vma) {
   2047		if (valid_vma(vma, false)) {
   2048			struct inode *inode = file_inode(vma->vm_file);
   2049			loff_t offset = vaddr_to_offset(vma, bp_vaddr);
   2050
   2051			uprobe = find_uprobe(inode, offset);
   2052		}
   2053
   2054		if (!uprobe)
   2055			*is_swbp = is_trap_at_addr(mm, bp_vaddr);
   2056	} else {
   2057		*is_swbp = -EFAULT;
   2058	}
   2059
   2060	if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags))
   2061		mmf_recalc_uprobes(mm);
   2062	mmap_read_unlock(mm);
   2063
   2064	return uprobe;
   2065}
   2066
   2067static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
   2068{
   2069	struct uprobe_consumer *uc;
   2070	int remove = UPROBE_HANDLER_REMOVE;
   2071	bool need_prep = false; /* prepare return uprobe, when needed */
   2072
   2073	down_read(&uprobe->register_rwsem);
   2074	for (uc = uprobe->consumers; uc; uc = uc->next) {
   2075		int rc = 0;
   2076
   2077		if (uc->handler) {
   2078			rc = uc->handler(uc, regs);
   2079			WARN(rc & ~UPROBE_HANDLER_MASK,
   2080				"bad rc=0x%x from %ps()\n", rc, uc->handler);
   2081		}
   2082
   2083		if (uc->ret_handler)
   2084			need_prep = true;
   2085
   2086		remove &= rc;
   2087	}
   2088
   2089	if (need_prep && !remove)
   2090		prepare_uretprobe(uprobe, regs); /* put bp at return */
   2091
   2092	if (remove && uprobe->consumers) {
   2093		WARN_ON(!uprobe_is_active(uprobe));
   2094		unapply_uprobe(uprobe, current->mm);
   2095	}
   2096	up_read(&uprobe->register_rwsem);
   2097}
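
        /*
         * A hedged sketch of the consumer side of the chain above: an entry
         * handler may return UPROBE_HANDLER_REMOVE to request removal of the
         * breakpoint from this mm, and the loop above only honours that when
         * every consumer agrees (remove &= rc).  The filter helper below is
         * hypothetical, used purely for illustration.
         */
        #if 0	/* illustrative only */
        static int example_handler(struct uprobe_consumer *self, struct pt_regs *regs)
        {
        	if (!example_task_is_interesting(current))	/* hypothetical */
        		return UPROBE_HANDLER_REMOVE;		/* stop probing this mm */

        	pr_info("uprobe hit at %lx\n", instruction_pointer(regs));
        	return 0;
        }
        #endif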
   2098
   2099static void
   2100handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
   2101{
   2102	struct uprobe *uprobe = ri->uprobe;
   2103	struct uprobe_consumer *uc;
   2104
   2105	down_read(&uprobe->register_rwsem);
   2106	for (uc = uprobe->consumers; uc; uc = uc->next) {
   2107		if (uc->ret_handler)
   2108			uc->ret_handler(uc, ri->func, regs);
   2109	}
   2110	up_read(&uprobe->register_rwsem);
   2111}
   2112
   2113static struct return_instance *find_next_ret_chain(struct return_instance *ri)
   2114{
   2115	bool chained;
   2116
   2117	do {
   2118		chained = ri->chained;
   2119		ri = ri->next;	/* can't be NULL if chained */
   2120	} while (chained);
   2121
   2122	return ri;
   2123}
   2124
   2125static void handle_trampoline(struct pt_regs *regs)
   2126{
   2127	struct uprobe_task *utask;
   2128	struct return_instance *ri, *next;
   2129	bool valid;
   2130
   2131	utask = current->utask;
   2132	if (!utask)
   2133		goto sigill;
   2134
   2135	ri = utask->return_instances;
   2136	if (!ri)
   2137		goto sigill;
   2138
   2139	do {
   2140		/*
   2141		 * We should throw out the frames invalidated by longjmp().
   2142		 * If this chain is valid, then the next one should be alive
   2143		 * or NULL; the latter case means that nobody but ri->func
   2144		 * could hit this trampoline on return. TODO: sigaltstack().
   2145		 */
   2146		next = find_next_ret_chain(ri);
   2147		valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs);
   2148
   2149		instruction_pointer_set(regs, ri->orig_ret_vaddr);
   2150		do {
   2151			if (valid)
   2152				handle_uretprobe_chain(ri, regs);
   2153			ri = free_ret_instance(ri);
   2154			utask->depth--;
   2155		} while (ri != next);
   2156	} while (!valid);
   2157
   2158	utask->return_instances = ri;
   2159	return;
   2160
   2161 sigill:
   2162	uprobe_warn(current, "handle uretprobe, sending SIGILL.");
   2163	force_sig(SIGILL);
   2164
   2165}
   2166
   2167bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
   2168{
   2169	return false;
   2170}
   2171
   2172bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
   2173					struct pt_regs *regs)
   2174{
   2175	return true;
   2176}
   2177
   2178/*
   2179 * Run handler and ask thread to singlestep.
   2180 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
   2181 */
   2182static void handle_swbp(struct pt_regs *regs)
   2183{
   2184	struct uprobe *uprobe;
   2185	unsigned long bp_vaddr;
   2186	int is_swbp;
   2187
   2188	bp_vaddr = uprobe_get_swbp_addr(regs);
   2189	if (bp_vaddr == get_trampoline_vaddr())
   2190		return handle_trampoline(regs);
   2191
   2192	uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
   2193	if (!uprobe) {
   2194		if (is_swbp > 0) {
   2195			/* No matching uprobe; signal SIGTRAP. */
   2196			force_sig(SIGTRAP);
   2197		} else {
   2198			/*
   2199			 * Either we raced with uprobe_unregister() or we can't
   2200			 * access this memory. The latter is only possible if
   2201			 * another thread plays with our ->mm. In both cases
   2202			 * we can simply restart. If this vma was unmapped we
   2203			 * can pretend this insn was not executed yet and get
   2204			 * the (correct) SIGSEGV after restart.
   2205			 */
   2206			instruction_pointer_set(regs, bp_vaddr);
   2207		}
   2208		return;
   2209	}
   2210
   2211	/* change it in advance for ->handler() and restart */
   2212	instruction_pointer_set(regs, bp_vaddr);
   2213
   2214	/*
   2215	 * TODO: move copy_insn/etc into _register and remove this hack.
   2216	 * After we hit the bp, _unregister + _register can install the
   2217	 * new and not-yet-analyzed uprobe at the same address, restart.
   2218	 */
   2219	if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
   2220		goto out;
   2221
   2222	/*
   2223	 * Pairs with the smp_wmb() in prepare_uprobe().
   2224	 *
   2225	 * Guarantees that if we see the UPROBE_COPY_INSN bit set, then
   2226	 * we must also see the stores to &uprobe->arch performed by the
   2227	 * prepare_uprobe() call.
   2228	 */
   2229	smp_rmb();
   2230
   2231	/* Tracing handlers use ->utask to communicate with fetch methods */
   2232	if (!get_utask())
   2233		goto out;
   2234
   2235	if (arch_uprobe_ignore(&uprobe->arch, regs))
   2236		goto out;
   2237
   2238	handler_chain(uprobe, regs);
   2239
   2240	if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
   2241		goto out;
   2242
   2243	if (!pre_ssout(uprobe, regs, bp_vaddr))
   2244		return;
   2245
    2246	/* arch_uprobe_skip_sstep() succeeded, or we couldn't single-step: restart */
   2247out:
   2248	put_uprobe(uprobe);
   2249}
   2250
   2251/*
   2252 * Perform required fix-ups and disable singlestep.
   2253 * Allow pending signals to take effect.
   2254 */
   2255static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
   2256{
   2257	struct uprobe *uprobe;
   2258	int err = 0;
   2259
   2260	uprobe = utask->active_uprobe;
   2261	if (utask->state == UTASK_SSTEP_ACK)
   2262		err = arch_uprobe_post_xol(&uprobe->arch, regs);
   2263	else if (utask->state == UTASK_SSTEP_TRAPPED)
   2264		arch_uprobe_abort_xol(&uprobe->arch, regs);
   2265	else
   2266		WARN_ON_ONCE(1);
   2267
   2268	put_uprobe(uprobe);
   2269	utask->active_uprobe = NULL;
   2270	utask->state = UTASK_RUNNING;
   2271	xol_free_insn_slot(current);
   2272
   2273	spin_lock_irq(&current->sighand->siglock);
   2274	recalc_sigpending(); /* see uprobe_deny_signal() */
   2275	spin_unlock_irq(&current->sighand->siglock);
   2276
   2277	if (unlikely(err)) {
   2278		uprobe_warn(current, "execute the probed insn, sending SIGILL.");
   2279		force_sig(SIGILL);
   2280	}
   2281}
   2282
   2283/*
   2284 * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and
   2285 * allows the thread to return from interrupt. After that handle_swbp()
   2286 * sets utask->active_uprobe.
   2287 *
   2288 * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag
   2289 * and allows the thread to return from interrupt.
   2290 *
    2291 * While returning to userspace, the thread notices the TIF_UPROBE flag and
    2292 * calls uprobe_notify_resume().
   2293 */
   2294void uprobe_notify_resume(struct pt_regs *regs)
   2295{
   2296	struct uprobe_task *utask;
   2297
   2298	clear_thread_flag(TIF_UPROBE);
   2299
   2300	utask = current->utask;
   2301	if (utask && utask->active_uprobe)
   2302		handle_singlestep(utask, regs);
   2303	else
   2304		handle_swbp(regs);
   2305}
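
        /*
         * For context, a hedged sketch of the caller described above: the
         * return-to-userspace work loop (see the generic entry code) invokes
         * the notifier once it observes TIF_UPROBE, roughly:
         */
        #if 0	/* illustrative fragment */
        	if (ti_work & _TIF_UPROBE)
        		uprobe_notify_resume(regs);
        #endif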
   2306
   2307/*
    2308 * uprobe_pre_sstep_notifier gets called from interrupt context as part of the
    2309 * notifier mechanism. Set the TIF_UPROBE flag and indicate a breakpoint hit.
   2310 */
   2311int uprobe_pre_sstep_notifier(struct pt_regs *regs)
   2312{
   2313	if (!current->mm)
   2314		return 0;
   2315
   2316	if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) &&
   2317	    (!current->utask || !current->utask->return_instances))
   2318		return 0;
   2319
   2320	set_thread_flag(TIF_UPROBE);
   2321	return 1;
   2322}
   2323
   2324/*
    2325 * uprobe_post_sstep_notifier gets called in interrupt context as part of the
    2326 * notifier mechanism. Set the TIF_UPROBE flag and indicate completion of the singlestep.
   2327 */
   2328int uprobe_post_sstep_notifier(struct pt_regs *regs)
   2329{
   2330	struct uprobe_task *utask = current->utask;
   2331
   2332	if (!current->mm || !utask || !utask->active_uprobe)
   2333		/* task is currently not uprobed */
   2334		return 0;
   2335
   2336	utask->state = UTASK_SSTEP_ACK;
   2337	set_thread_flag(TIF_UPROBE);
   2338	return 1;
   2339}
   2340
   2341static struct notifier_block uprobe_exception_nb = {
   2342	.notifier_call		= arch_uprobe_exception_notify,
   2343	.priority		= INT_MAX-1,	/* notified after kprobes, kgdb */
   2344};
   2345
   2346void __init uprobes_init(void)
   2347{
   2348	int i;
   2349
   2350	for (i = 0; i < UPROBES_HASH_SZ; i++)
   2351		mutex_init(&uprobes_mmap_mutex[i]);
   2352
   2353	BUG_ON(register_die_notifier(&uprobe_exception_nb));
   2354}
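
        /*
         * For completeness, a hedged sketch (loosely modeled on the x86 port,
         * arch/x86/kernel/uprobes.c) of the arch_uprobe_exception_notify()
         * callback registered above: it forwards the breakpoint and
         * single-step die notifications to the two notifier helpers defined
         * in this file.
         */
        #if 0	/* illustrative only */
        int arch_uprobe_exception_notify(struct notifier_block *self,
        				 unsigned long val, void *data)
        {
        	struct die_args *args = data;
        	struct pt_regs *regs = args->regs;

        	/* We are only interested in userspace traps */
        	if (regs && !user_mode(regs))
        		return NOTIFY_DONE;

        	switch (val) {
        	case DIE_INT3:		/* breakpoint hit */
        		if (uprobe_pre_sstep_notifier(regs))
        			return NOTIFY_STOP;
        		break;
        	case DIE_DEBUG:		/* single-step trap */
        		if (uprobe_post_sstep_notifier(regs))
        			return NOTIFY_STOP;
        		break;
        	}
        	return NOTIFY_DONE;
        }
        #endif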