root/drivers/misc/mic/cosm/cosm_scif_server.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. cosm_update_mic_status
  2. cosm_shutdown_status_int
  3. cosm_scif_recv
  4. cosm_set_crashed
  5. cosm_send_time
  6. cosm_scif_close
  7. cosm_set_online
  8. cosm_scif_work
  9. cosm_scif_server
  10. cosm_scif_listen
  11. cosm_scif_listen_exit
  12. cosm_scif_init
  13. cosm_scif_exit

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Intel MIC Platform Software Stack (MPSS)
   4  *
   5  * Copyright(c) 2015 Intel Corporation.
   6  *
   7  * Intel MIC Coprocessor State Management (COSM) Driver
   8  */
   9 #include <linux/kthread.h>
  10 #include <linux/sched/signal.h>
  11 
  12 #include "cosm_main.h"
  13 
  14 /*
  15  * The COSM driver uses SCIF to communicate between the management node and the
  16  * MIC cards. SCIF is used to (a) Send a shutdown command to the card (b)
  17  * receive a shutdown status back from the card upon completion of shutdown and
  18  * (c) receive periodic heartbeat messages from the card used to deduce if the
  19  * card has crashed.
  20  *
  21  * A COSM server consisting of a SCIF listening endpoint waits for incoming
  22  * connections from the card. Upon acceptance of the connection, a separate
  23  * work-item is scheduled to handle SCIF message processing for that card. The
  24  * life-time of this work-item is therefore the time from which the connection
  25  * from a card is accepted to the time at which the connection is closed. A new
  26  * work-item starts each time the card boots and is alive till the card (a)
  27  * shuts down (b) is reset (c) crashes (d) cosm_client driver on the card is
  28  * unloaded.
  29  *
  30  * The COSM interactions with SCIF during card shutdown, reset and crash are
  31  * as follows:
  32  *
  33  * Card shutdown
  34  * -------------
  35  * 1. COSM client on the card invokes orderly_poweroff() in response to SHUTDOWN
  36  *    message from the host.
  37  * 2. Card driver shutdown callback invokes scif_unregister_device(..) resulting
  38  *    in scif_remove(..) getting called on the card
  39  * 3. scif_remove -> scif_stop -> scif_handle_remove_node ->
  40  *    scif_peer_unregister_device -> device_unregister for the host peer device
  41  * 4. During device_unregister remove(..) method of cosm_client is invoked which
  42  *    closes the COSM SCIF endpoint on the card. This results in a SCIF_DISCNCT
  43  *    message being sent to host SCIF. SCIF_DISCNCT message processing on the
  44  *    host SCIF sets the host COSM SCIF endpoint state to DISCONNECTED and wakes
  45  *    up the host COSM thread blocked in scif_poll(..) resulting in
  46  *    scif_poll(..)  returning EPOLLHUP.
  47  * 5. On the card, scif_peer_release_dev is next called which results in an
  48  *    SCIF_EXIT message being sent to the host and after receiving the
  49  *    SCIF_EXIT_ACK from the host the peer device teardown on the card is
  50  *    complete.
  51  * 6. As part of the SCIF_EXIT message processing on the host, host sends a
  52  *    SCIF_REMOVE_NODE to itself corresponding to the card being removed. This
  53  *    starts a similar SCIF peer device teardown sequence on the host
  54  *    corresponding to the card being shut down.
  55  *
  56  * Card reset
  57  * ----------
  58  * The case of interest here is when the card has not been previously shut down
  59  * since most of the steps below are skipped in that case:
  60  *
  61  * 1. cosm_stop(..) invokes hw_ops->stop(..) method of the base PCIe driver
  62  *    which unregisters the SCIF HW device resulting in scif_remove(..) being
  63  *    called on the host.
  64  * 2. scif_remove(..) calls scif_disconnect_node(..) which results in a
  65  *    SCIF_EXIT message being sent to the card.
  66  * 3. The card executes scif_stop() as part of SCIF_EXIT message
  67  *    processing. This results in the COSM endpoint on the card being closed and
  68  *    the SCIF host peer device on the card getting unregistered similar to
  69  *    steps 3, 4 and 5 for the card shutdown case above. scif_poll(..) on the
  70  *    host returns EPOLLHUP as a result.
  71  * 4. On the host, card peer device unregister and SCIF HW remove(..) also
  72  *    subsequently complete.
  73  *
  74  * Card crash
  75  * ----------
  76  * If a reset is issued after the card has crashed, there is no SCIF_DISCNCT
  77  * message from the card which would result in scif_poll(..) returning
  78  * EPOLLHUP. In this case when the host SCIF driver sends a SCIF_REMOVE_NODE
  79  * message to itself resulting in the card SCIF peer device being unregistered,
  80  * this results in a scif_peer_release_dev -> scif_cleanup_scifdev->
  81  * scif_invalidate_ep call sequence which sets the endpoint state to
  82  * DISCONNECTED and results in scif_poll(..) returning EPOLLHUP.
  83  */
  84 
  85 #define COSM_SCIF_BACKLOG 16
  86 #define COSM_HEARTBEAT_CHECK_DELTA_SEC 10
  87 #define COSM_HEARTBEAT_TIMEOUT_SEC \
  88                 (COSM_HEARTBEAT_SEND_SEC + COSM_HEARTBEAT_CHECK_DELTA_SEC)
  89 #define COSM_HEARTBEAT_TIMEOUT_MSEC (COSM_HEARTBEAT_TIMEOUT_SEC * MSEC_PER_SEC)
  90 
  91 static struct task_struct *server_thread;
  92 static scif_epd_t listen_epd;
  93 
  94 /* Publish MIC card's shutdown status to user space MIC daemon */
  95 static void cosm_update_mic_status(struct cosm_device *cdev)
  96 {
  97         if (cdev->shutdown_status_int != MIC_NOP) {
  98                 cosm_set_shutdown_status(cdev, cdev->shutdown_status_int);
  99                 cdev->shutdown_status_int = MIC_NOP;
 100         }
 101 }
 102 
 103 /* Store MIC card's shutdown status internally when it is received */
 104 static void cosm_shutdown_status_int(struct cosm_device *cdev,
 105                                      enum mic_status shutdown_status)
 106 {
 107         switch (shutdown_status) {
 108         case MIC_HALTED:
 109         case MIC_POWER_OFF:
 110         case MIC_RESTART:
 111         case MIC_CRASHED:
 112                 break;
 113         default:
 114                 dev_err(&cdev->dev, "%s %d Unexpected shutdown_status %d\n",
 115                         __func__, __LINE__, shutdown_status);
 116                 return;
 117         };
 118         cdev->shutdown_status_int = shutdown_status;
 119         cdev->heartbeat_watchdog_enable = false;
 120 
 121         if (cdev->state != MIC_SHUTTING_DOWN)
 122                 cosm_set_state(cdev, MIC_SHUTTING_DOWN);
 123 }
 124 
 125 /* Non-blocking recv. Read and process all available messages */
 126 static void cosm_scif_recv(struct cosm_device *cdev)
 127 {
 128         struct cosm_msg msg;
 129         int rc;
 130 
 131         while (1) {
 132                 rc = scif_recv(cdev->epd, &msg, sizeof(msg), 0);
 133                 if (!rc) {
 134                         break;
 135                 } else if (rc < 0) {
 136                         dev_dbg(&cdev->dev, "%s: %d rc %d\n",
 137                                 __func__, __LINE__, rc);
 138                         break;
 139                 }
 140                 dev_dbg(&cdev->dev, "%s: %d rc %d id 0x%llx\n",
 141                         __func__, __LINE__, rc, msg.id);
 142 
 143                 switch (msg.id) {
 144                 case COSM_MSG_SHUTDOWN_STATUS:
 145                         cosm_shutdown_status_int(cdev, msg.shutdown_status);
 146                         break;
 147                 case COSM_MSG_HEARTBEAT:
 148                         /* Nothing to do, heartbeat only unblocks scif_poll */
 149                         break;
 150                 default:
 151                         dev_err(&cdev->dev, "%s: %d unknown msg.id %lld\n",
 152                                 __func__, __LINE__, msg.id);
 153                         break;
 154                 }
 155         }
 156 }
 157 
 158 /* Publish crashed status for this MIC card */
 159 static void cosm_set_crashed(struct cosm_device *cdev)
 160 {
 161         dev_err(&cdev->dev, "node alive timeout\n");
 162         cosm_shutdown_status_int(cdev, MIC_CRASHED);
 163         cosm_update_mic_status(cdev);
 164 }
 165 
 166 /* Send host time to the MIC card to sync system time between host and MIC */
 167 static void cosm_send_time(struct cosm_device *cdev)
 168 {
 169         struct cosm_msg msg = { .id = COSM_MSG_SYNC_TIME };
 170         struct timespec64 ts;
 171         int rc;
 172 
 173         ktime_get_real_ts64(&ts);
 174         msg.timespec.tv_sec = ts.tv_sec;
 175         msg.timespec.tv_nsec = ts.tv_nsec;
 176 
 177         rc = scif_send(cdev->epd, &msg, sizeof(msg), SCIF_SEND_BLOCK);
 178         if (rc < 0)
 179                 dev_err(&cdev->dev, "%s %d scif_send failed rc %d\n",
 180                         __func__, __LINE__, rc);
 181 }
 182 
 183 /*
 184  * Close this cosm_device's endpoint after its peer endpoint on the card has
 185  * been closed. In all cases except MIC card crash EPOLLHUP on the host is
 186  * triggered by the client's endpoint being closed.
 187  */
 188 static void cosm_scif_close(struct cosm_device *cdev)
 189 {
 190         /*
 191          * Because SHUTDOWN_STATUS message is sent by the MIC cards in the
 192          * reboot notifier when shutdown is still not complete, we notify mpssd
 193          * to reset the card when SCIF endpoint is closed.
 194          */
 195         cosm_update_mic_status(cdev);
 196         scif_close(cdev->epd);
 197         cdev->epd = NULL;
 198         dev_dbg(&cdev->dev, "%s %d\n", __func__, __LINE__);
 199 }
 200 
 201 /*
 202  * Set card state to ONLINE when a new SCIF connection from a MIC card is
 203  * received. Normally the state is BOOTING when the connection comes in, but can
 204  * be ONLINE if cosm_client driver on the card was unloaded and then reloaded.
 205  */
 206 static int cosm_set_online(struct cosm_device *cdev)
 207 {
 208         int rc = 0;
 209 
 210         if (MIC_BOOTING == cdev->state || MIC_ONLINE == cdev->state) {
 211                 cdev->heartbeat_watchdog_enable = cdev->sysfs_heartbeat_enable;
 212                 cdev->epd = cdev->newepd;
 213                 if (cdev->state == MIC_BOOTING)
 214                         cosm_set_state(cdev, MIC_ONLINE);
 215                 cosm_send_time(cdev);
 216                 dev_dbg(&cdev->dev, "%s %d\n", __func__, __LINE__);
 217         } else {
 218                 dev_warn(&cdev->dev, "%s %d not going online in state: %s\n",
 219                          __func__, __LINE__, cosm_state_string[cdev->state]);
 220                 rc = -EINVAL;
 221         }
 222         /* Drop reference acquired by bus_find_device in the server thread */
 223         put_device(&cdev->dev);
 224         return rc;
 225 }
 226 
 227 /*
 228  * Work function for handling work for a SCIF connection from a particular MIC
 229  * card. It first sets the card state to ONLINE and then calls scif_poll to
 230  * block on activity such as incoming messages on the SCIF endpoint. When the
 231  * endpoint is closed, the work function exits, completing its life cycle, from
 232  * MIC card boot to card shutdown/reset/crash.
 233  */
 234 void cosm_scif_work(struct work_struct *work)
 235 {
 236         struct cosm_device *cdev = container_of(work, struct cosm_device,
 237                                                 scif_work);
 238         struct scif_pollepd pollepd;
 239         int rc;
 240 
 241         mutex_lock(&cdev->cosm_mutex);
 242         if (cosm_set_online(cdev))
 243                 goto exit;
 244 
 245         while (1) {
 246                 pollepd.epd = cdev->epd;
 247                 pollepd.events = EPOLLIN;
 248 
 249                 /* Drop the mutex before blocking in scif_poll(..) */
 250                 mutex_unlock(&cdev->cosm_mutex);
 251                 /* poll(..) with timeout on our endpoint */
 252                 rc = scif_poll(&pollepd, 1, COSM_HEARTBEAT_TIMEOUT_MSEC);
 253                 mutex_lock(&cdev->cosm_mutex);
 254                 if (rc < 0) {
 255                         dev_err(&cdev->dev, "%s %d scif_poll rc %d\n",
 256                                 __func__, __LINE__, rc);
 257                         continue;
 258                 }
 259 
 260                 /* There is a message from the card */
 261                 if (pollepd.revents & EPOLLIN)
 262                         cosm_scif_recv(cdev);
 263 
 264                 /* The peer endpoint is closed or this endpoint disconnected */
 265                 if (pollepd.revents & EPOLLHUP) {
 266                         cosm_scif_close(cdev);
 267                         break;
 268                 }
 269 
 270                 /* Did we timeout from poll? */
 271                 if (!rc && cdev->heartbeat_watchdog_enable)
 272                         cosm_set_crashed(cdev);
 273         }
 274 exit:
 275         dev_dbg(&cdev->dev, "%s %d exiting\n", __func__, __LINE__);
 276         mutex_unlock(&cdev->cosm_mutex);
 277 }
 278 
 279 /*
 280  * COSM SCIF server thread function. Accepts incoming SCIF connections from MIC
 281  * cards, finds the correct cosm_device to associate that connection with and
 282  * schedules individual work items for each MIC card.
 283  */
 284 static int cosm_scif_server(void *unused)
 285 {
 286         struct cosm_device *cdev;
 287         scif_epd_t newepd;
 288         struct scif_port_id port_id;
 289         int rc;
 290 
 291         allow_signal(SIGKILL);
 292 
 293         while (!kthread_should_stop()) {
 294                 rc = scif_accept(listen_epd, &port_id, &newepd,
 295                                  SCIF_ACCEPT_SYNC);
 296                 if (rc < 0) {
 297                         if (-ERESTARTSYS != rc)
 298                                 pr_err("%s %d rc %d\n", __func__, __LINE__, rc);
 299                         continue;
 300                 }
 301 
 302                 /*
 303                  * Associate the incoming connection with a particular
 304                  * cosm_device, COSM device ID == SCIF node ID - 1
 305                  */
 306                 cdev = cosm_find_cdev_by_id(port_id.node - 1);
 307                 if (!cdev)
 308                         continue;
 309                 cdev->newepd = newepd;
 310                 schedule_work(&cdev->scif_work);
 311         }
 312 
 313         pr_debug("%s %d Server thread stopped\n", __func__, __LINE__);
 314         return 0;
 315 }
 316 
 317 static int cosm_scif_listen(void)
 318 {
 319         int rc;
 320 
 321         listen_epd = scif_open();
 322         if (!listen_epd) {
 323                 pr_err("%s %d scif_open failed\n", __func__, __LINE__);
 324                 return -ENOMEM;
 325         }
 326 
 327         rc = scif_bind(listen_epd, SCIF_COSM_LISTEN_PORT);
 328         if (rc < 0) {
 329                 pr_err("%s %d scif_bind failed rc %d\n",
 330                        __func__, __LINE__, rc);
 331                 goto err;
 332         }
 333 
 334         rc = scif_listen(listen_epd, COSM_SCIF_BACKLOG);
 335         if (rc < 0) {
 336                 pr_err("%s %d scif_listen rc %d\n", __func__, __LINE__, rc);
 337                 goto err;
 338         }
 339         pr_debug("%s %d listen_epd set up\n", __func__, __LINE__);
 340         return 0;
 341 err:
 342         scif_close(listen_epd);
 343         listen_epd = NULL;
 344         return rc;
 345 }
 346 
 347 static void cosm_scif_listen_exit(void)
 348 {
 349         pr_debug("%s %d closing listen_epd\n", __func__, __LINE__);
 350         if (listen_epd) {
 351                 scif_close(listen_epd);
 352                 listen_epd = NULL;
 353         }
 354 }
 355 
 356 /*
 357  * Create a listening SCIF endpoint and a server kthread which accepts incoming
 358  * SCIF connections from MIC cards
 359  */
 360 int cosm_scif_init(void)
 361 {
 362         int rc = cosm_scif_listen();
 363 
 364         if (rc) {
 365                 pr_err("%s %d cosm_scif_listen rc %d\n",
 366                        __func__, __LINE__, rc);
 367                 goto err;
 368         }
 369 
 370         server_thread = kthread_run(cosm_scif_server, NULL, "cosm_server");
 371         if (IS_ERR(server_thread)) {
 372                 rc = PTR_ERR(server_thread);
 373                 pr_err("%s %d kthread_run rc %d\n", __func__, __LINE__, rc);
 374                 goto listen_exit;
 375         }
 376         return 0;
 377 listen_exit:
 378         cosm_scif_listen_exit();
 379 err:
 380         return rc;
 381 }
 382 
 383 /* Stop the running server thread and close the listening SCIF endpoint */
 384 void cosm_scif_exit(void)
 385 {
 386         int rc;
 387 
 388         if (!IS_ERR_OR_NULL(server_thread)) {
 389                 rc = send_sig(SIGKILL, server_thread, 0);
 390                 if (rc) {
 391                         pr_err("%s %d send_sig rc %d\n",
 392                                __func__, __LINE__, rc);
 393                         return;
 394                 }
 395                 kthread_stop(server_thread);
 396         }
 397 
 398         cosm_scif_listen_exit();
 399 }

/* [<][>][^][v][top][bottom][index][help] */