cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

eeh-functions.sh (6022B)


#!/bin/sh
# SPDX-License-Identifier: GPL-2.0-only

export KSELFTESTS_SKIP=4

log() {
	echo "$@" >&2
}

pe_ok() {
	local dev="$1"
	local path="/sys/bus/pci/devices/$dev/eeh_pe_state"

	# If a driver doesn't support the error handling callbacks then the
	# device is recovered by removing and re-probing it. This causes the
	# sysfs directory to disappear, so read the PE state once and squash
	# any potential error messages.
	local eeh_state="$(cat $path 2>/dev/null)"
	if [ -z "$eeh_state" ]; then
		return 1;
	fi

	local fw_state="$(echo $eeh_state | cut -d' ' -f1)"
	local sw_state="$(echo $eeh_state | cut -d' ' -f2)"

	# If EEH_PE_ISOLATED or EEH_PE_RECOVERING are set then the PE is in an
	# error state or being recovered. Either way, not ok.
	if [ "$((sw_state & 0x3))" -ne 0 ] ; then
		return 1
	fi

	# A functioning PE should have the EEH_STATE_MMIO_ACTIVE and
	# EEH_STATE_DMA_ACTIVE flags set. For some goddamn stupid reason
	# the platform backends set these when the PE is in reset. The
	# RECOVERING check above should stop any false positives though.
	if [ "$((fw_state & 0x18))" -ne "$((0x18))" ] ; then
		return 1
	fi

	return 0;
}
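
# Illustrative sketch (not part of the upstream helpers): pe_ok can be used
# stand-alone to poll a device's PE state. "0000:01:00.0" is a placeholder
# address, not a device referenced elsewhere in this file.
#
#	if pe_ok 0000:01:00.0 ; then
#		echo "PE is functional"
#	else
#		echo "PE is frozen, missing, or still recovering"
#	fi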

eeh_supported() {
	test -e /proc/powerpc/eeh && \
	grep -q 'EEH Subsystem is enabled' /proc/powerpc/eeh
}

eeh_test_prep() {
	if ! eeh_supported ; then
		echo "EEH not supported on this system, skipping"
		exit $KSELFTESTS_SKIP;
	fi

	if [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_check" ] && \
	   [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_break" ] ; then
		log "debugfs EEH testing files are missing. Is debugfs mounted?"
		exit $KSELFTESTS_SKIP;
	fi

	# Bump the max freeze count to something absurd so we don't
	# trip over it while breaking things.
	echo 5000 > /sys/kernel/debug/powerpc/eeh_max_freezes
}

eeh_can_break() {
	local dev="$1"

	# Skip bridges since we can't recover them (yet...)
	if [ -e "/sys/bus/pci/devices/$dev/pci_bus" ] ; then
		log "$dev, Skipped: bridge"
		return 1;
	fi

	# The ahci driver doesn't support error recovery. If the ahci device
	# happens to be hosting the root filesystem and we then go and break
	# it, the system will generally go down. We should probably fix that
	# at some point.
	if [ "ahci" = "$(basename $(realpath /sys/bus/pci/devices/$dev/driver))" ] ; then
		log "$dev, Skipped: ahci doesn't support recovery"
		return 1;
	fi

	# Don't inject errors into an already-frozen PE. This happens with
	# PEs that contain multiple PCI devices (e.g. multi-function cards),
	# and injecting new errors during the recovery process will probably
	# result in the recovery failing and the device being marked as
	# failed.
	if ! pe_ok $dev ; then
		log "$dev, Skipped: Bad initial PE state"
		return 1;
	fi

	return 0
}

eeh_one_dev() {
	local dev="$1"

	# Using this function from the command line is sometimes useful for
	# testing, so check that the argument is a well-formed sysfs device
	# name.
	if ! test -e /sys/bus/pci/devices/$dev/ ; then
		log "Error: '$dev' must be a sysfs device name (DDDD:BB:DD.F)"
		return 1;
	fi

	# Break it
	echo $dev >/sys/kernel/debug/powerpc/eeh_dev_break

	# Force an EEH device check. If the kernel has already
	# noticed the EEH (due to a driver poll or whatever), this
	# is a no-op.
	echo $dev >/sys/kernel/debug/powerpc/eeh_dev_check

	# Default to a 60s timeout when waiting for a device to recover. This
	# is an arbitrary default which can be overridden by setting the
	# EEH_MAX_WAIT environment variable when required.

	# The current record holder for longest recovery time is:
	#  "Adaptec Series 8 12G SAS/PCIe 3" at 39 seconds
	max_wait=${EEH_MAX_WAIT:=60}

	for i in `seq 0 ${max_wait}` ; do
		if pe_ok $dev ; then
			break;
		fi
		log "$dev, waited $i/${max_wait}"
		sleep 1
	done

	if ! pe_ok $dev ; then
		log "$dev, Failed to recover!"
		return 1;
	fi

	log "$dev, Recovered after $i seconds"
	return 0;
}
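
# Illustrative sketch: a minimal driver loop for the helpers above, loosely
# in the spirit of the in-tree eeh-basic.sh selftest. The loop and the
# filtering below are assumptions for illustration, not the actual test.
#
#	. ./eeh-functions.sh
#	eeh_test_prep
#	failed=0
#	for dev in $(ls -1 /sys/bus/pci/devices/) ; do
#		eeh_has_driver $dev || continue
#		eeh_can_break $dev || continue
#		eeh_one_dev $dev || failed="$((failed + 1))"
#	done
#	test "$failed" -eq 0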

eeh_has_driver() {
	test -e /sys/bus/pci/devices/$1/driver;
	return $?
}

eeh_can_recover() {
	# We'll get an IO error if the device's current driver doesn't support
	# error recovery.
	echo $1 > '/sys/kernel/debug/powerpc/eeh_dev_can_recover' 2>/dev/null

	return $?
}
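
# Illustrative sketch: inside a device loop, the two helpers above can be
# used to filter candidates before injecting errors. "$dev" is assumed to
# hold a sysfs device name such as those produced by eeh_find_all_pfs below.
#
#	if ! eeh_has_driver $dev || ! eeh_can_recover $dev ; then
#		log "$dev, Skipped: no driver or no error recovery support"
#		continue
#	fi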

eeh_find_all_pfs() {
	devices=""

	# SR-IOV on pseries requires hypervisor support, so check for that
	is_pseries=""
	if grep -q pSeries /proc/cpuinfo ; then
		if [ ! -f /proc/device-tree/rtas/ibm,open-sriov-allow-unfreeze ] ||
		   [ ! -f /proc/device-tree/rtas/ibm,open-sriov-map-pe-number ] ; then
			return 1;
		fi

		is_pseries="true"
	fi

	for dev in `ls -1 /sys/bus/pci/devices/` ; do
		sysfs="/sys/bus/pci/devices/$dev"
		if [ ! -e "$sysfs/sriov_numvfs" ] ; then
			continue
		fi

		# skip unsupported PFs on pseries
		if [ -z "$is_pseries" ] &&
		   [ ! -f "$sysfs/of_node/ibm,is-open-sriov-pf" ] &&
		   [ ! -f "$sysfs/of_node/ibm,open-sriov-vf-bar-info" ] ; then
			continue;
		fi

		# No driver, no VFs
		if ! eeh_has_driver $dev ; then
			continue
		fi

		devices="$devices $dev"
	done

	if [ -z "$devices" ] ; then
		return 1;
	fi

	echo $devices
	return 0;
}

# Attempts to enable one VF on each PF so we can do VF-specific tests.
# stdout: list of enabled VFs, one per line
# return code: 0 if VFs are found, 1 otherwise
eeh_enable_vfs() {
	pf_list="$(eeh_find_all_pfs)"

	vfs=0
	for dev in $pf_list ; do
		pf_sysfs="/sys/bus/pci/devices/$dev"

		# make sure we have a single VF
		echo 0 > "$pf_sysfs/sriov_numvfs"
		echo 1 > "$pf_sysfs/sriov_numvfs"
		if [ "$?" != 0 ] ; then
			log "Unable to enable VFs on $dev, skipping"
			continue;
		fi

		vf="$(basename $(realpath "$pf_sysfs/virtfn0"))"
		if [ $? != 0 ] ; then
			log "Unable to find an enabled VF on $dev"
			echo 0 > "$pf_sysfs/sriov_numvfs"
			continue;
		fi

		if ! eeh_can_break $vf ; then
			log "$vf, Skipped"

			echo 0 > "$pf_sysfs/sriov_numvfs"
			continue;
		fi

		vfs="$((vfs + 1))"
		echo $vf
	done

	test "$vfs" != 0
	return $?
}

eeh_disable_vfs() {
	pf_list="$(eeh_find_all_pfs)"
	if [ -z "$pf_list" ] ; then
		return 1;
	fi

	for dev in $pf_list ; do
		echo 0 > "/sys/bus/pci/devices/$dev/sriov_numvfs"
	done

	return 0;
}
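
# Illustrative sketch: a VF-oriented test could wrap the helpers above
# roughly as follows. This is only a sketch of the intended flow, not one of
# the actual VF selftest scripts; error injection still goes through
# eeh_one_dev.
#
#	. ./eeh-functions.sh
#	eeh_test_prep
#	vf_list="$(eeh_enable_vfs)"
#	if [ $? != 0 ] ; then
#		log "No usable VFs found, skipping"
#		exit $KSELFTESTS_SKIP
#	fi
#	for vf in $vf_list ; do
#		eeh_one_dev $vf
#	done
#	eeh_disable_vfs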