root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. __encode_table_header_to_buff
  2. __decode_table_header_from_buff
  3. __update_table_header
  4. amdgpu_ras_eeprom_init
  5. amdgpu_ras_eeprom_fini
  6. __encode_table_record_to_buff
  7. __decode_table_record_from_buff
  8. __correct_eeprom_dest_address
  9. __calc_hdr_byte_sum
  10. __calc_recs_byte_sum
  11. __calc_tbl_byte_sum
  12. __update_tbl_checksum
  13. __validate_tbl_checksum
  14. amdgpu_ras_eeprom_process_recods
  15. amdgpu_ras_eeprom_test

   1 /*
   2  * Copyright 2019 Advanced Micro Devices, Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice shall be included in
  12  * all copies or substantial portions of the Software.
  13  *
  14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  20  * OTHER DEALINGS IN THE SOFTWARE.
  21  *
  22  */
  23 
  24 #include "amdgpu_ras_eeprom.h"
  25 #include "amdgpu.h"
  26 #include "amdgpu_ras.h"
  27 #include <linux/bits.h>
  28 #include "smu_v11_0_i2c.h"
  29 
/* I2C target address placed in i2c_msg.addr for every EEPROM access here */
#define EEPROM_I2C_TARGET_ADDR 0xA0

/*
 * The 2 macros below represent the actual size in bytes that
 * those entities occupy in the EEPROM memory.
 * EEPROM_TABLE_RECORD_SIZE is different than sizeof(eeprom_table_record) which
 * uses uint64 to store 6-byte fields such as retired_page.
 */
#define EEPROM_TABLE_HEADER_SIZE 20
#define EEPROM_TABLE_RECORD_SIZE 24

/* Bytes reserved at the start of each I2C buffer for the EEPROM address */
#define EEPROM_ADDRESS_SIZE 0x2

/* Table hdr is 'AMDR' */
#define EEPROM_TABLE_HDR_VAL 0x414d4452
#define EEPROM_TABLE_VER 0x00010000

/*
 * Assume 2 Mbit size
 * NOTE(review): 2 Mbit is 262144 bytes, the value used here is 256000 -
 * confirm which one is intended.
 */
#define EEPROM_SIZE_BYTES 256000
#define EEPROM_PAGE__SIZE_BYTES 256
#define EEPROM_HDR_START 0
/* Records start right after the table header */
#define EEPROM_RECORD_START (EEPROM_HDR_START + EEPROM_TABLE_HEADER_SIZE)
#define EEPROM_MAX_RECORD_NUM ((EEPROM_SIZE_BYTES - EEPROM_TABLE_HEADER_SIZE) / EEPROM_TABLE_RECORD_SIZE)
/* Bits 17..8 of an EEPROM byte address, i.e. everything above the page offset */
#define EEPROM_ADDR_MSB_MASK GENMASK(17, 8)

/* Map an eeprom_control embedded in struct amdgpu_ras back to its adev */
#define to_amdgpu_device(x) (container_of(x, struct amdgpu_ras, eeprom_control))->adev
  56 
  57 static void __encode_table_header_to_buff(struct amdgpu_ras_eeprom_table_header *hdr,
  58                                           unsigned char *buff)
  59 {
  60         uint32_t *pp = (uint32_t *) buff;
  61 
  62         pp[0] = cpu_to_le32(hdr->header);
  63         pp[1] = cpu_to_le32(hdr->version);
  64         pp[2] = cpu_to_le32(hdr->first_rec_offset);
  65         pp[3] = cpu_to_le32(hdr->tbl_size);
  66         pp[4] = cpu_to_le32(hdr->checksum);
  67 }
  68 
  69 static void __decode_table_header_from_buff(struct amdgpu_ras_eeprom_table_header *hdr,
  70                                           unsigned char *buff)
  71 {
  72         uint32_t *pp = (uint32_t *)buff;
  73 
  74         hdr->header           = le32_to_cpu(pp[0]);
  75         hdr->version          = le32_to_cpu(pp[1]);
  76         hdr->first_rec_offset = le32_to_cpu(pp[2]);
  77         hdr->tbl_size         = le32_to_cpu(pp[3]);
  78         hdr->checksum         = le32_to_cpu(pp[4]);
  79 }
  80 
  81 static int __update_table_header(struct amdgpu_ras_eeprom_control *control,
  82                                  unsigned char *buff)
  83 {
  84         int ret = 0;
  85         struct i2c_msg msg = {
  86                         .addr   = EEPROM_I2C_TARGET_ADDR,
  87                         .flags  = 0,
  88                         .len    = EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE,
  89                         .buf    = buff,
  90         };
  91 
  92 
  93         *(uint16_t *)buff = EEPROM_HDR_START;
  94         __encode_table_header_to_buff(&control->tbl_hdr, buff + EEPROM_ADDRESS_SIZE);
  95 
  96         ret = i2c_transfer(&control->eeprom_accessor, &msg, 1);
  97         if (ret < 1)
  98                 DRM_ERROR("Failed to write EEPROM table header, ret:%d", ret);
  99 
 100         return ret;
 101 }
 102 
 103 static uint32_t  __calc_hdr_byte_sum(struct amdgpu_ras_eeprom_control *control);
 104 
 105 int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
 106 {
 107         int ret = 0;
 108         struct amdgpu_device *adev = to_amdgpu_device(control);
 109         unsigned char buff[EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE] = { 0 };
 110         struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
 111         struct i2c_msg msg = {
 112                         .addr   = EEPROM_I2C_TARGET_ADDR,
 113                         .flags  = I2C_M_RD,
 114                         .len    = EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE,
 115                         .buf    = buff,
 116         };
 117 
 118         mutex_init(&control->tbl_mutex);
 119 
 120         switch (adev->asic_type) {
 121         case CHIP_VEGA20:
 122                 ret = smu_v11_0_i2c_eeprom_control_init(&control->eeprom_accessor);
 123                 break;
 124 
 125         default:
 126                 return 0;
 127         }
 128 
 129         if (ret) {
 130                 DRM_ERROR("Failed to init I2C controller, ret:%d", ret);
 131                 return ret;
 132         }
 133 
 134         /* Read/Create table header from EEPROM address 0 */
 135         ret = i2c_transfer(&control->eeprom_accessor, &msg, 1);
 136         if (ret < 1) {
 137                 DRM_ERROR("Failed to read EEPROM table header, ret:%d", ret);
 138                 return ret;
 139         }
 140 
 141         __decode_table_header_from_buff(hdr, &buff[2]);
 142 
 143         if (hdr->header == EEPROM_TABLE_HDR_VAL) {
 144                 control->num_recs = (hdr->tbl_size - EEPROM_TABLE_HEADER_SIZE) /
 145                                     EEPROM_TABLE_RECORD_SIZE;
 146                 DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
 147                                  control->num_recs);
 148 
 149         } else {
 150                 DRM_INFO("Creating new EEPROM table");
 151 
 152                 hdr->header = EEPROM_TABLE_HDR_VAL;
 153                 hdr->version = EEPROM_TABLE_VER;
 154                 hdr->first_rec_offset = EEPROM_RECORD_START;
 155                 hdr->tbl_size = EEPROM_TABLE_HEADER_SIZE;
 156 
 157                 adev->psp.ras.ras->eeprom_control.tbl_byte_sum =
 158                                 __calc_hdr_byte_sum(&adev->psp.ras.ras->eeprom_control);
 159                 ret = __update_table_header(control, buff);
 160         }
 161 
 162         /* Start inserting records from here */
 163         adev->psp.ras.ras->eeprom_control.next_addr = EEPROM_RECORD_START;
 164 
 165         return ret == 1 ? 0 : -EIO;
 166 }
 167 
 168 void amdgpu_ras_eeprom_fini(struct amdgpu_ras_eeprom_control *control)
 169 {
 170         struct amdgpu_device *adev = to_amdgpu_device(control);
 171 
 172         switch (adev->asic_type) {
 173         case CHIP_VEGA20:
 174                 smu_v11_0_i2c_eeprom_control_fini(&control->eeprom_accessor);
 175                 break;
 176 
 177         default:
 178                 return;
 179         }
 180 }
 181 
 182 static void __encode_table_record_to_buff(struct amdgpu_ras_eeprom_control *control,
 183                                           struct eeprom_table_record *record,
 184                                           unsigned char *buff)
 185 {
 186         __le64 tmp = 0;
 187         int i = 0;
 188 
 189         /* Next are all record fields according to EEPROM page spec in LE foramt */
 190         buff[i++] = record->err_type;
 191 
 192         buff[i++] = record->bank;
 193 
 194         tmp = cpu_to_le64(record->ts);
 195         memcpy(buff + i, &tmp, 8);
 196         i += 8;
 197 
 198         tmp = cpu_to_le64((record->offset & 0xffffffffffff));
 199         memcpy(buff + i, &tmp, 6);
 200         i += 6;
 201 
 202         buff[i++] = record->mem_channel;
 203         buff[i++] = record->mcumc_id;
 204 
 205         tmp = cpu_to_le64((record->retired_page & 0xffffffffffff));
 206         memcpy(buff + i, &tmp, 6);
 207 }
 208 
 209 static void __decode_table_record_from_buff(struct amdgpu_ras_eeprom_control *control,
 210                                             struct eeprom_table_record *record,
 211                                             unsigned char *buff)
 212 {
 213         __le64 tmp = 0;
 214         int i =  0;
 215 
 216         /* Next are all record fields according to EEPROM page spec in LE foramt */
 217         record->err_type = buff[i++];
 218 
 219         record->bank = buff[i++];
 220 
 221         memcpy(&tmp, buff + i, 8);
 222         record->ts = le64_to_cpu(tmp);
 223         i += 8;
 224 
 225         memcpy(&tmp, buff + i, 6);
 226         record->offset = (le64_to_cpu(tmp) & 0xffffffffffff);
 227         i += 6;
 228 
 229         buff[i++] = record->mem_channel;
 230         buff[i++] = record->mcumc_id;
 231 
 232         memcpy(&tmp, buff + i,  6);
 233         record->retired_page = (le64_to_cpu(tmp) & 0xffffffffffff);
 234 }
 235 
 236 /*
 237  * When reaching end of EEPROM memory jump back to 0 record address
 238  * When next record access will go beyond EEPROM page boundary modify bits A17/A8
 239  * in I2C selector to go to next page
 240  */
 241 static uint32_t __correct_eeprom_dest_address(uint32_t curr_address)
 242 {
 243         uint32_t next_address = curr_address + EEPROM_TABLE_RECORD_SIZE;
 244 
 245         /* When all EEPROM memory used jump back to 0 address */
 246         if (next_address > EEPROM_SIZE_BYTES) {
 247                 DRM_INFO("Reached end of EEPROM memory, jumping to 0 "
 248                          "and overriding old record");
 249                 return EEPROM_RECORD_START;
 250         }
 251 
 252         /*
 253          * To check if we overflow page boundary  compare next address with
 254          * current and see if bits 17/8 of the EEPROM address will change
 255          * If they do start from the next 256b page
 256          *
 257          * https://www.st.com/resource/en/datasheet/m24m02-dr.pdf sec. 5.1.2
 258          */
 259         if ((curr_address & EEPROM_ADDR_MSB_MASK) != (next_address & EEPROM_ADDR_MSB_MASK)) {
 260                 DRM_DEBUG_DRIVER("Reached end of EEPROM memory page, jumping to next: %lx",
 261                                 (next_address & EEPROM_ADDR_MSB_MASK));
 262 
 263                 return  (next_address & EEPROM_ADDR_MSB_MASK);
 264         }
 265 
 266         return curr_address;
 267 }
 268 
 269 
 270 static uint32_t  __calc_hdr_byte_sum(struct amdgpu_ras_eeprom_control *control)
 271 {
 272         int i;
 273         uint32_t tbl_sum = 0;
 274 
 275         /* Header checksum, skip checksum field in the calculation */
 276         for (i = 0; i < sizeof(control->tbl_hdr) - sizeof(control->tbl_hdr.checksum); i++)
 277                 tbl_sum += *(((unsigned char *)&control->tbl_hdr) + i);
 278 
 279         return tbl_sum;
 280 }
 281 
 282 static uint32_t  __calc_recs_byte_sum(struct eeprom_table_record *records,
 283                                       int num)
 284 {
 285         int i, j;
 286         uint32_t tbl_sum = 0;
 287 
 288         /* Records checksum */
 289         for (i = 0; i < num; i++) {
 290                 struct eeprom_table_record *record = &records[i];
 291 
 292                 for (j = 0; j < sizeof(*record); j++) {
 293                         tbl_sum += *(((unsigned char *)record) + j);
 294                 }
 295         }
 296 
 297         return tbl_sum;
 298 }
 299 
 300 static inline uint32_t  __calc_tbl_byte_sum(struct amdgpu_ras_eeprom_control *control,
 301                                   struct eeprom_table_record *records, int num)
 302 {
 303         return __calc_hdr_byte_sum(control) + __calc_recs_byte_sum(records, num);
 304 }
 305 
 306 /* Checksum = 256 -((sum of all table entries) mod 256) */
 307 static void __update_tbl_checksum(struct amdgpu_ras_eeprom_control *control,
 308                                   struct eeprom_table_record *records, int num,
 309                                   uint32_t old_hdr_byte_sum)
 310 {
 311         /*
 312          * This will update the table sum with new records.
 313          *
 314          * TODO: What happens when the EEPROM table is to be wrapped around
 315          * and old records from start will get overridden.
 316          */
 317 
 318         /* need to recalculate updated header byte sum */
 319         control->tbl_byte_sum -= old_hdr_byte_sum;
 320         control->tbl_byte_sum += __calc_tbl_byte_sum(control, records, num);
 321 
 322         control->tbl_hdr.checksum = 256 - (control->tbl_byte_sum % 256);
 323 }
 324 
 325 /* table sum mod 256 + checksum must equals 256 */
 326 static bool __validate_tbl_checksum(struct amdgpu_ras_eeprom_control *control,
 327                             struct eeprom_table_record *records, int num)
 328 {
 329         control->tbl_byte_sum = __calc_tbl_byte_sum(control, records, num);
 330 
 331         if (control->tbl_hdr.checksum + (control->tbl_byte_sum % 256) != 256) {
 332                 DRM_WARN("Checksum mismatch, checksum: %u ", control->tbl_hdr.checksum);
 333                 return false;
 334         }
 335 
 336         return true;
 337 }
 338 
 339 int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
 340                                             struct eeprom_table_record *records,
 341                                             bool write,
 342                                             int num)
 343 {
 344         int i, ret = 0;
 345         struct i2c_msg *msgs;
 346         unsigned char *buffs;
 347         struct amdgpu_device *adev = to_amdgpu_device(control);
 348 
 349         if (adev->asic_type != CHIP_VEGA20)
 350                 return 0;
 351 
 352         buffs = kcalloc(num, EEPROM_ADDRESS_SIZE + EEPROM_TABLE_RECORD_SIZE,
 353                          GFP_KERNEL);
 354         if (!buffs)
 355                 return -ENOMEM;
 356 
 357         mutex_lock(&control->tbl_mutex);
 358 
 359         msgs = kcalloc(num, sizeof(*msgs), GFP_KERNEL);
 360         if (!msgs) {
 361                 ret = -ENOMEM;
 362                 goto free_buff;
 363         }
 364 
 365         /* In case of overflow just start from beginning to not lose newest records */
 366         if (write && (control->next_addr + EEPROM_TABLE_RECORD_SIZE * num > EEPROM_SIZE_BYTES))
 367                 control->next_addr = EEPROM_RECORD_START;
 368 
 369 
 370         /*
 371          * TODO Currently makes EEPROM writes for each record, this creates
 372          * internal fragmentation. Optimized the code to do full page write of
 373          * 256b
 374          */
 375         for (i = 0; i < num; i++) {
 376                 unsigned char *buff = &buffs[i * (EEPROM_ADDRESS_SIZE + EEPROM_TABLE_RECORD_SIZE)];
 377                 struct eeprom_table_record *record = &records[i];
 378                 struct i2c_msg *msg = &msgs[i];
 379 
 380                 control->next_addr = __correct_eeprom_dest_address(control->next_addr);
 381 
 382                 /*
 383                  * Update bits 16,17 of EEPROM address in I2C address by setting them
 384                  * to bits 1,2 of Device address byte
 385                  */
 386                 msg->addr = EEPROM_I2C_TARGET_ADDR |
 387                                ((control->next_addr & EEPROM_ADDR_MSB_MASK) >> 15);
 388                 msg->flags      = write ? 0 : I2C_M_RD;
 389                 msg->len        = EEPROM_ADDRESS_SIZE + EEPROM_TABLE_RECORD_SIZE;
 390                 msg->buf        = buff;
 391 
 392                 /* Insert the EEPROM dest addess, bits 0-15 */
 393                 buff[0] = ((control->next_addr >> 8) & 0xff);
 394                 buff[1] = (control->next_addr & 0xff);
 395 
 396                 /* EEPROM table content is stored in LE format */
 397                 if (write)
 398                         __encode_table_record_to_buff(control, record, buff + EEPROM_ADDRESS_SIZE);
 399 
 400                 /*
 401                  * The destination EEPROM address might need to be corrected to account
 402                  * for page or entire memory wrapping
 403                  */
 404                 control->next_addr += EEPROM_TABLE_RECORD_SIZE;
 405         }
 406 
 407         ret = i2c_transfer(&control->eeprom_accessor, msgs, num);
 408         if (ret < 1) {
 409                 DRM_ERROR("Failed to process EEPROM table records, ret:%d", ret);
 410 
 411                 /* TODO Restore prev next EEPROM address ? */
 412                 goto free_msgs;
 413         }
 414 
 415 
 416         if (!write) {
 417                 for (i = 0; i < num; i++) {
 418                         unsigned char *buff = &buffs[i*(EEPROM_ADDRESS_SIZE + EEPROM_TABLE_RECORD_SIZE)];
 419                         struct eeprom_table_record *record = &records[i];
 420 
 421                         __decode_table_record_from_buff(control, record, buff + EEPROM_ADDRESS_SIZE);
 422                 }
 423         }
 424 
 425         if (write) {
 426                 uint32_t old_hdr_byte_sum = __calc_hdr_byte_sum(control);
 427 
 428                 /*
 429                  * Update table header with size and CRC and account for table
 430                  * wrap around where the assumption is that we treat it as empty
 431                  * table
 432                  *
 433                  * TODO - Check the assumption is correct
 434                  */
 435                 control->num_recs += num;
 436                 control->num_recs %= EEPROM_MAX_RECORD_NUM;
 437                 control->tbl_hdr.tbl_size += EEPROM_TABLE_RECORD_SIZE * num;
 438                 if (control->tbl_hdr.tbl_size > EEPROM_SIZE_BYTES)
 439                         control->tbl_hdr.tbl_size = EEPROM_TABLE_HEADER_SIZE +
 440                         control->num_recs * EEPROM_TABLE_RECORD_SIZE;
 441 
 442                 __update_tbl_checksum(control, records, num, old_hdr_byte_sum);
 443 
 444                 __update_table_header(control, buffs);
 445         } else if (!__validate_tbl_checksum(control, records, num)) {
 446                 DRM_WARN("EEPROM Table checksum mismatch!");
 447                 /* TODO Uncomment when EEPROM read/write is relliable */
 448                 /* ret = -EIO; */
 449         }
 450 
 451 free_msgs:
 452         kfree(msgs);
 453 
 454 free_buff:
 455         kfree(buffs);
 456 
 457         mutex_unlock(&control->tbl_mutex);
 458 
 459         return ret == num ? 0 : -EIO;
 460 }
 461 
 462 /* Used for testing if bugs encountered */
 463 #if 0
 464 void amdgpu_ras_eeprom_test(struct amdgpu_ras_eeprom_control *control)
 465 {
 466         int i;
 467         struct eeprom_table_record *recs = kcalloc(1, sizeof(*recs), GFP_KERNEL);
 468 
 469         if (!recs)
 470                 return;
 471 
 472         for (i = 0; i < 1 ; i++) {
 473                 recs[i].address = 0xdeadbeef;
 474                 recs[i].retired_page = i;
 475         }
 476 
 477         if (!amdgpu_ras_eeprom_process_recods(control, recs, true, 1)) {
 478 
 479                 memset(recs, 0, sizeof(*recs) * 1);
 480 
 481                 control->next_addr = EEPROM_RECORD_START;
 482 
 483                 if (!amdgpu_ras_eeprom_process_recods(control, recs, false, 1)) {
 484                         for (i = 0; i < 1; i++)
 485                                 DRM_INFO("rec.address :0x%llx, rec.retired_page :%llu",
 486                                          recs[i].address, recs[i].retired_page);
 487                 } else
 488                         DRM_ERROR("Failed in reading from table");
 489 
 490         } else
 491                 DRM_ERROR("Failed in writing to table");
 492 }
 493 #endif

/* [<][>][^][v][top][bottom][index][help] */