cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

aldebaran.c (12559B)


/*
 * Copyright 2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include "aldebaran.h"
#include "amdgpu_reset.h"
#include "amdgpu_amdkfd.h"
#include "amdgpu_dpm.h"
#include "amdgpu_job.h"
#include "amdgpu_ring.h"
#include "amdgpu_ras.h"
#include "amdgpu_psp.h"
#include "amdgpu_xgmi.h"

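/*
 * Mode2 is the default reset method when the MP1 (SMU) IP is version
 * 13.0.2 and the GPU is XGMI-connected to the CPU.
 */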
static bool aldebaran_is_mode2_default(struct amdgpu_reset_control *reset_ctl)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;

	if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2) &&
	    adev->gmc.xgmi.connected_to_cpu)
		return true;

	return false;
}

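/*
 * Return the registered handler matching the requested reset method, or
 * fall back to the mode2 handler when mode2 is the default for this
 * configuration.
 */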
static struct amdgpu_reset_handler *
aldebaran_get_reset_handler(struct amdgpu_reset_control *reset_ctl,
			    struct amdgpu_reset_context *reset_context)
{
	struct amdgpu_reset_handler *handler;
	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;

	if (reset_context->method != AMD_RESET_METHOD_NONE) {
		dev_dbg(adev->dev, "Getting reset handler for method %d\n",
			reset_context->method);
		list_for_each_entry(handler, &reset_ctl->reset_handlers,
				    handler_list) {
			if (handler->reset_method == reset_context->method)
				return handler;
		}
	}

	if (aldebaran_is_mode2_default(reset_ctl)) {
		list_for_each_entry(handler, &reset_ctl->reset_handlers,
				    handler_list) {
			if (handler->reset_method == AMD_RESET_METHOD_MODE2) {
				reset_context->method = AMD_RESET_METHOD_MODE2;
				return handler;
			}
		}
	}

	dev_dbg(adev->dev, "Reset handler not found!\n");

	return NULL;
}

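/*
 * Mode2 reset only touches the GFX and SDMA engines, so ungate power and
 * clock gating and suspend just those IP blocks, in reverse init order.
 */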
static int aldebaran_mode2_suspend_ip(struct amdgpu_device *adev)
{
	int r = 0, i;

	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);

	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (!(adev->ip_blocks[i].version->type ==
			      AMD_IP_BLOCK_TYPE_GFX ||
		      adev->ip_blocks[i].version->type ==
			      AMD_IP_BLOCK_TYPE_SDMA))
			continue;

		r = adev->ip_blocks[i].version->funcs->suspend(adev);
		if (r) {
			dev_err(adev->dev,
				"suspend of IP block <%s> failed %d\n",
				adev->ip_blocks[i].version->funcs->name, r);
			return r;
		}

		adev->ip_blocks[i].status.hw = false;
	}

	return r;
}

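/*
 * Prepare the HW context for a mode2 reset: on bare metal, quiesce the
 * affected IP blocks before the reset is issued.
 */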
static int
aldebaran_mode2_prepare_hwcontext(struct amdgpu_reset_control *reset_ctl,
				  struct amdgpu_reset_context *reset_context)
{
	int r = 0;
	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;

	dev_dbg(adev->dev, "Aldebaran prepare hw context\n");
	/*
	 * Suspend the IP blocks only on bare metal; an SR-IOV VF is not
	 * going to HW reset the ASIC itself, so it skips this step.
	 */
	if (!amdgpu_sriov_vf(adev))
		r = aldebaran_mode2_suspend_ip(adev);

	return r;
}

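/*
 * Work item that runs one device's reset; used to launch resets on all
 * nodes of an XGMI hive in parallel.
 */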
static void aldebaran_async_reset(struct work_struct *work)
{
	struct amdgpu_reset_handler *handler;
	struct amdgpu_reset_control *reset_ctl =
		container_of(work, struct amdgpu_reset_control, reset_work);
	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;

	list_for_each_entry(handler, &reset_ctl->reset_handlers,
			    handler_list) {
		if (handler->reset_method == reset_ctl->active_reset) {
			dev_dbg(adev->dev, "Resetting device\n");
			handler->do_reset(adev);
			break;
		}
	}
}

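/* Issue the actual mode2 reset via DPM, with PCI bus mastering disabled. */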
static int aldebaran_mode2_reset(struct amdgpu_device *adev)
{
	/* Disable bus mastering before the reset */
	pci_clear_master(adev->pdev);
	adev->asic_reset_res = amdgpu_dpm_mode2_reset(adev);
	return adev->asic_reset_res;
}

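/*
 * Execute mode2 reset on the requesting device or on every device in its
 * XGMI hive: take each node's reset lock, launch the resets (in parallel
 * for multi-node hives), then wait for all of them to finish.
 */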
static int
aldebaran_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,
			      struct amdgpu_reset_context *reset_context)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
	struct amdgpu_device *tmp_adev = NULL;
	struct list_head reset_device_list;
	int r = 0;

	dev_dbg(adev->dev, "aldebaran perform hw reset\n");
	if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2) &&
	    reset_context->hive == NULL) {
		/* Wrong context, return error */
		return -EINVAL;
	}

	INIT_LIST_HEAD(&reset_device_list);
	if (reset_context->hive) {
		list_for_each_entry(tmp_adev,
				    &reset_context->hive->device_list,
				    gmc.xgmi.head)
			list_add_tail(&tmp_adev->reset_list,
				      &reset_device_list);
	} else {
		list_add_tail(&reset_context->reset_req_dev->reset_list,
			      &reset_device_list);
	}

	list_for_each_entry(tmp_adev, &reset_device_list, reset_list) {
		mutex_lock(&tmp_adev->reset_cntl->reset_lock);
		tmp_adev->reset_cntl->active_reset = AMD_RESET_METHOD_MODE2;
	}
	/*
	 * Mode2 reset doesn't need any sync between nodes in an XGMI hive;
	 * instead launch them together so that they can complete
	 * asynchronously on multiple nodes.
	 */
	list_for_each_entry(tmp_adev, &reset_device_list, reset_list) {
		/* For XGMI run all resets in parallel to speed up the process */
		if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
			if (!queue_work(system_unbound_wq,
					&tmp_adev->reset_cntl->reset_work))
				r = -EALREADY;
		} else {
			r = aldebaran_mode2_reset(tmp_adev);
		}
		if (r) {
			dev_err(tmp_adev->dev,
				"ASIC reset failed with error %d for drm dev %s",
				r, adev_to_drm(tmp_adev)->unique);
			break;
		}
	}

	/* For XGMI, wait for all resets to complete before proceeding */
	if (!r) {
		list_for_each_entry(tmp_adev, &reset_device_list, reset_list) {
			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
				flush_work(&tmp_adev->reset_cntl->reset_work);
				r = tmp_adev->asic_reset_res;
				if (r)
					break;
			}
		}
	}

	list_for_each_entry(tmp_adev, &reset_device_list, reset_list) {
		mutex_unlock(&tmp_adev->reset_cntl->reset_lock);
		tmp_adev->reset_cntl->active_reset = AMD_RESET_METHOD_NONE;
	}

	return r;
}

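/*
 * Bring the hardware back after a mode2 reset: resume the common (NBIF)
 * block, re-init GFXHUB, reload GFX/SDMA/RLC microcode through PSP, wait
 * for the SMU reset-complete event, then resume and late-init the affected
 * IP blocks and re-enable clock/power gating.
 */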
static int aldebaran_mode2_restore_ip(struct amdgpu_device *adev)
{
	struct amdgpu_firmware_info *ucode_list[AMDGPU_UCODE_ID_MAXIMUM];
	struct amdgpu_firmware_info *ucode;
	struct amdgpu_ip_block *cmn_block;
	int ucode_count = 0;
	int i, r;

	dev_dbg(adev->dev, "Reloading ucodes after reset\n");
	for (i = 0; i < adev->firmware.max_ucodes; i++) {
		ucode = &adev->firmware.ucode[i];
		if (!ucode->fw)
			continue;
		switch (ucode->ucode_id) {
		case AMDGPU_UCODE_ID_SDMA0:
		case AMDGPU_UCODE_ID_SDMA1:
		case AMDGPU_UCODE_ID_SDMA2:
		case AMDGPU_UCODE_ID_SDMA3:
		case AMDGPU_UCODE_ID_SDMA4:
		case AMDGPU_UCODE_ID_SDMA5:
		case AMDGPU_UCODE_ID_SDMA6:
		case AMDGPU_UCODE_ID_SDMA7:
		case AMDGPU_UCODE_ID_CP_MEC1:
		case AMDGPU_UCODE_ID_CP_MEC1_JT:
		case AMDGPU_UCODE_ID_RLC_RESTORE_LIST_CNTL:
		case AMDGPU_UCODE_ID_RLC_RESTORE_LIST_GPM_MEM:
		case AMDGPU_UCODE_ID_RLC_RESTORE_LIST_SRM_MEM:
		case AMDGPU_UCODE_ID_RLC_G:
			ucode_list[ucode_count++] = ucode;
			break;
		default:
			break;
		}
	}

	/* Reinit NBIF block */
	cmn_block =
		amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_COMMON);
	if (unlikely(!cmn_block)) {
		dev_err(adev->dev, "Failed to get BIF handle\n");
		return -EINVAL;
	}
	r = cmn_block->version->funcs->resume(adev);
	if (r)
		return r;

	/* Reinit GFXHUB */
	adev->gfxhub.funcs->init(adev);
	r = adev->gfxhub.funcs->gart_enable(adev);
	if (r) {
		dev_err(adev->dev, "GFXHUB gart reenable failed after reset\n");
		return r;
	}

	/* Reload GFX firmware */
	r = psp_load_fw_list(&adev->psp, ucode_list, ucode_count);
	if (r) {
		dev_err(adev->dev, "GFX ucode load failed after reset\n");
		return r;
	}

	/* Resume RLC; FW needs RLC alive to complete the reset process */
	adev->gfx.rlc.funcs->resume(adev);

	/* Wait for the FW reset event to complete */
	r = amdgpu_dpm_wait_for_event(adev, SMU_EVENT_RESET_COMPLETE, 0);
	if (r) {
		dev_err(adev->dev,
			"Failed to get response from firmware after reset\n");
		return r;
	}

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!(adev->ip_blocks[i].version->type ==
			      AMD_IP_BLOCK_TYPE_GFX ||
		      adev->ip_blocks[i].version->type ==
			      AMD_IP_BLOCK_TYPE_SDMA))
			continue;
		r = adev->ip_blocks[i].version->funcs->resume(adev);
		if (r) {
			dev_err(adev->dev,
				"resume of IP block <%s> failed %d\n",
				adev->ip_blocks[i].version->funcs->name, r);
			return r;
		}

		adev->ip_blocks[i].status.hw = true;
	}

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!(adev->ip_blocks[i].version->type ==
			      AMD_IP_BLOCK_TYPE_GFX ||
		      adev->ip_blocks[i].version->type ==
			      AMD_IP_BLOCK_TYPE_SDMA ||
		      adev->ip_blocks[i].version->type ==
			      AMD_IP_BLOCK_TYPE_COMMON))
			continue;

		if (adev->ip_blocks[i].version->funcs->late_init) {
			r = adev->ip_blocks[i].version->funcs->late_init(
				(void *)adev);
			if (r) {
				dev_err(adev->dev,
					"late_init of IP block <%s> failed %d after reset\n",
					adev->ip_blocks[i].version->funcs->name,
					r);
				return r;
			}
		}
		adev->ip_blocks[i].status.late_initialized = true;
	}

	amdgpu_ras_set_error_query_ready(adev, true);

	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

	return r;
}

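/*
 * Restore the SW state on every reset device: re-init the IP blocks,
 * re-register the GPU instance, resume RAS, refresh the XGMI topology in
 * PSP and run IB ring tests to validate the recovery.
 */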
static int
aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
				  struct amdgpu_reset_context *reset_context)
{
	struct amdgpu_device *tmp_adev = NULL;
	struct list_head reset_device_list;
	int r = 0;

	if (reset_context->reset_req_dev->ip_versions[MP1_HWIP][0] ==
		    IP_VERSION(13, 0, 2) &&
	    reset_context->hive == NULL) {
		/* Wrong context, return error */
		return -EINVAL;
	}

	INIT_LIST_HEAD(&reset_device_list);
	if (reset_context->hive) {
		list_for_each_entry(tmp_adev,
				    &reset_context->hive->device_list,
				    gmc.xgmi.head)
			list_add_tail(&tmp_adev->reset_list,
				      &reset_device_list);
	} else {
		list_add_tail(&reset_context->reset_req_dev->reset_list,
			      &reset_device_list);
	}

	list_for_each_entry(tmp_adev, &reset_device_list, reset_list) {
		dev_info(tmp_adev->dev,
			 "GPU reset succeeded, trying to resume\n");
		r = aldebaran_mode2_restore_ip(tmp_adev);
		if (r)
			goto end;

		/*
		 * Add this ASIC as tracked, since the reset has already
		 * completed successfully.
		 */
		amdgpu_register_gpu_instance(tmp_adev);

		/* Resume RAS */
		amdgpu_ras_resume(tmp_adev);

		/* Update PSP FW topology after reset */
		if (reset_context->hive &&
		    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
			r = amdgpu_xgmi_update_topology(reset_context->hive,
							tmp_adev);

		if (!r) {
			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);

			r = amdgpu_ib_ring_tests(tmp_adev);
			if (r) {
				dev_err(tmp_adev->dev,
					"ib ring test failed (%d).\n", r);
				r = -EAGAIN;
				tmp_adev->asic_reset_res = r;
				goto end;
			}
		}
	}

end:
	return r;
}

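/* Mode2 handler; the environment prepare/restore steps are unused. */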
static struct amdgpu_reset_handler aldebaran_mode2_handler = {
	.reset_method		= AMD_RESET_METHOD_MODE2,
	.prepare_env		= NULL,
	.prepare_hwcontext	= aldebaran_mode2_prepare_hwcontext,
	.perform_reset		= aldebaran_mode2_perform_reset,
	.restore_hwcontext	= aldebaran_mode2_restore_hwcontext,
	.restore_env		= NULL,
	.do_reset		= aldebaran_mode2_reset,
};

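/*
 * Allocate the per-device reset controller and register the mode2 handler
 * with it.
 */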
int aldebaran_reset_init(struct amdgpu_device *adev)
{
	struct amdgpu_reset_control *reset_ctl;

	reset_ctl = kzalloc(sizeof(*reset_ctl), GFP_KERNEL);
	if (!reset_ctl)
		return -ENOMEM;

	reset_ctl->handle = adev;
	reset_ctl->async_reset = aldebaran_async_reset;
	reset_ctl->active_reset = AMD_RESET_METHOD_NONE;
	reset_ctl->get_reset_handler = aldebaran_get_reset_handler;

	INIT_LIST_HEAD(&reset_ctl->reset_handlers);
	INIT_WORK(&reset_ctl->reset_work, reset_ctl->async_reset);
	/* Only mode2 is handled through reset control now */
	amdgpu_reset_add_handler(reset_ctl, &aldebaran_mode2_handler);

	adev->reset_cntl = reset_ctl;

	return 0;
}

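/* Tear down the reset controller allocated in aldebaran_reset_init(). */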
int aldebaran_reset_fini(struct amdgpu_device *adev)
{
	kfree(adev->reset_cntl);
	adev->reset_cntl = NULL;
	return 0;
}