amdgpu_umc.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
amdgpu_umc.c (6313B)
      1/*
      2 * Copyright 2019 Advanced Micro Devices, Inc.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice shall be included in
     12 * all copies or substantial portions of the Software.
     13 *
     14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
     18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
     19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
     20 * OTHER DEALINGS IN THE SOFTWARE.
     21 *
     22 */
     23
     24#include "amdgpu.h"
     25
     26static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
     27		void *ras_error_status,
     28		struct amdgpu_iv_entry *entry,
     29		bool reset)
     30{
     31	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
     32	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
     33	int ret = 0;
     34
     35	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
     36	ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
     37	if (ret == -EOPNOTSUPP) {
     38		if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
     39		    adev->umc.ras->ras_block.hw_ops->query_ras_error_count)
     40		    adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, ras_error_status);
     41
     42		if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
     43		    adev->umc.ras->ras_block.hw_ops->query_ras_error_address &&
     44		    adev->umc.max_ras_err_cnt_per_query) {
     45			err_data->err_addr =
     46				kcalloc(adev->umc.max_ras_err_cnt_per_query,
     47					sizeof(struct eeprom_table_record), GFP_KERNEL);
     48
     49			/* still call query_ras_error_address to clear error status
     50			 * even NOMEM error is encountered
     51			 */
     52			if(!err_data->err_addr)
     53				dev_warn(adev->dev, "Failed to alloc memory for "
     54						"umc error address record!\n");
     55
     56			/* umc query_ras_error_address is also responsible for clearing
     57			 * error status
     58			 */
     59			adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, ras_error_status);
     60		}
     61	} else if (!ret) {
     62		if (adev->umc.ras &&
     63		    adev->umc.ras->ecc_info_query_ras_error_count)
     64		    adev->umc.ras->ecc_info_query_ras_error_count(adev, ras_error_status);
     65
     66		if (adev->umc.ras &&
     67		    adev->umc.ras->ecc_info_query_ras_error_address &&
     68		    adev->umc.max_ras_err_cnt_per_query) {
     69			err_data->err_addr =
     70				kcalloc(adev->umc.max_ras_err_cnt_per_query,
     71					sizeof(struct eeprom_table_record), GFP_KERNEL);
     72
     73			/* still call query_ras_error_address to clear error status
     74			 * even NOMEM error is encountered
     75			 */
     76			if(!err_data->err_addr)
     77				dev_warn(adev->dev, "Failed to alloc memory for "
     78						"umc error address record!\n");
     79
     80			/* umc query_ras_error_address is also responsible for clearing
     81			 * error status
     82			 */
     83			adev->umc.ras->ecc_info_query_ras_error_address(adev, ras_error_status);
     84		}
     85	}
     86
     87	/* only uncorrectable error needs gpu reset */
     88	if (err_data->ue_count) {
     89		dev_info(adev->dev, "%ld uncorrectable hardware errors "
     90				"detected in UMC block\n",
     91				err_data->ue_count);
     92
     93		if ((amdgpu_bad_page_threshold != 0) &&
     94			err_data->err_addr_cnt) {
     95			amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
     96						err_data->err_addr_cnt);
     97			amdgpu_ras_save_bad_pages(adev);
     98
     99			amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
    100
    101			if (con->update_channel_flag == true) {
    102				amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
    103				con->update_channel_flag = false;
    104			}
    105		}
    106
    107		if (reset)
    108			amdgpu_ras_reset_gpu(adev);
    109	}
    110
    111	kfree(err_data->err_addr);
    112	return AMDGPU_RAS_SUCCESS;
    113}
    114
    115int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
    116		void *ras_error_status,
    117		bool reset)
    118{
    119	int ret;
    120	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
    121	struct ras_common_if head = {
    122		.block = AMDGPU_RAS_BLOCK__UMC,
    123	};
    124	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
    125
    126	ret =
    127		amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset);
    128
    129	if (ret == AMDGPU_RAS_SUCCESS && obj) {
    130		obj->err_data.ue_count += err_data->ue_count;
    131		obj->err_data.ce_count += err_data->ce_count;
    132	}
    133
    134	return ret;
    135}
    136
    137int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
    138		void *ras_error_status,
    139		struct amdgpu_iv_entry *entry)
    140{
    141	return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true);
    142}
    143
    144int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
    145{
    146	int r;
    147
    148	r = amdgpu_ras_block_late_init(adev, ras_block);
    149	if (r)
    150		return r;
    151
    152	if (amdgpu_ras_is_supported(adev, ras_block->block)) {
    153		r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
    154		if (r)
    155			goto late_fini;
    156	}
    157
    158	/* ras init of specific umc version */
    159	if (adev->umc.ras &&
    160	    adev->umc.ras->err_cnt_init)
    161		adev->umc.ras->err_cnt_init(adev);
    162
    163	return 0;
    164
    165late_fini:
    166	amdgpu_ras_block_late_fini(adev, ras_block);
    167	return r;
    168}
    169
    170int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
    171		struct amdgpu_irq_src *source,
    172		struct amdgpu_iv_entry *entry)
    173{
    174	struct ras_common_if *ras_if = adev->umc.ras_if;
    175	struct ras_dispatch_if ih_data = {
    176		.entry = entry,
    177	};
    178
    179	if (!ras_if)
    180		return 0;
    181
    182	ih_data.head = *ras_if;
    183
    184	amdgpu_ras_interrupt_dispatch(adev, &ih_data);
    185	return 0;
    186}
    187
    188void amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
    189		uint64_t err_addr,
    190		uint64_t retired_page,
    191		uint32_t channel_index,
    192		uint32_t umc_inst)
    193{
    194	struct eeprom_table_record *err_rec =
    195		&err_data->err_addr[err_data->err_addr_cnt];
    196
    197	err_rec->address = err_addr;
    198	/* page frame address is saved */
    199	err_rec->retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
    200	err_rec->ts = (uint64_t)ktime_get_real_seconds();
    201	err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
    202	err_rec->cu = 0;
    203	err_rec->mem_channel = channel_index;
    204	err_rec->mcumc_id = umc_inst;
    205
    206	err_data->err_addr_cnt++;
    207}