Skip to content

Commit 9307916

Browse files
bvanasscherolandd
authored andcommitted
scsi_transport_srp: Fix a race condition
The rport timers must be stopped before the SRP initiator destroys the resources associated with the SCSI host. This is necessary because otherwise the callback functions invoked from the SRP transport layer could trigger a use-after-free. Stopping the rport timers before invoking scsi_remove_host() can trigger long delays in the SCSI error handler if a transport layer failure occurs while scsi_remove_host() is in progress. Hence move the code for stopping the rport timers from srp_rport_release() into a new function and invoke that function after scsi_remove_host() has finished. This patch fixes the following sporadic kernel crash: kernel BUG at include/asm-generic/dma-mapping-common.h:64! invalid opcode: 0000 [#1] SMP RIP: 0010:[<ffffffffa03b20b1>] [<ffffffffa03b20b1>] srp_unmap_data+0x121/0x130 [ib_srp] Call Trace: [<ffffffffa03b20fc>] srp_free_req+0x3c/0x80 [ib_srp] [<ffffffffa03b2188>] srp_finish_req+0x48/0x70 [ib_srp] [<ffffffffa03b21fb>] srp_terminate_io+0x4b/0x60 [ib_srp] [<ffffffffa03a6fb5>] __rport_fail_io_fast+0x75/0x80 [scsi_transport_srp] [<ffffffffa03a7438>] rport_fast_io_fail_timedout+0x88/0xc0 [scsi_transport_srp] [<ffffffff8108b370>] worker_thread+0x170/0x2a0 [<ffffffff81090876>] kthread+0x96/0xa0 [<ffffffff8100c0ca>] child_rip+0xa/0x20 Signed-off-by: Bart Van Assche <[email protected]> Signed-off-by: Roland Dreier <[email protected]>
1 parent 18cc4e0 commit 9307916

File tree

3 files changed

+46
-42
lines changed

3 files changed

+46
-42
lines changed

drivers/infiniband/ulp/srp/ib_srp.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -660,6 +660,7 @@ static void srp_remove_target(struct srp_target_port *target)
660660
srp_rport_get(target->rport);
661661
srp_remove_host(target->scsi_host);
662662
scsi_remove_host(target->scsi_host);
663+
srp_stop_rport_timers(target->rport);
663664
srp_disconnect_target(target);
664665
ib_destroy_cm_id(target->cm_id);
665666
srp_free_target_ib(target);

drivers/scsi/scsi_transport_srp.c

Lines changed: 43 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -456,37 +456,29 @@ static void __srp_start_tl_fail_timers(struct srp_rport *rport)
456456

457457
lockdep_assert_held(&rport->mutex);
458458

459-
if (!rport->deleted) {
460-
delay = rport->reconnect_delay;
461-
fast_io_fail_tmo = rport->fast_io_fail_tmo;
462-
dev_loss_tmo = rport->dev_loss_tmo;
463-
pr_debug("%s current state: %d\n",
464-
dev_name(&shost->shost_gendev), rport->state);
459+
delay = rport->reconnect_delay;
460+
fast_io_fail_tmo = rport->fast_io_fail_tmo;
461+
dev_loss_tmo = rport->dev_loss_tmo;
462+
pr_debug("%s current state: %d\n", dev_name(&shost->shost_gendev),
463+
rport->state);
465464

466-
if (delay > 0)
465+
if (rport->state == SRP_RPORT_LOST)
466+
return;
467+
if (delay > 0)
468+
queue_delayed_work(system_long_wq, &rport->reconnect_work,
469+
1UL * delay * HZ);
470+
if (srp_rport_set_state(rport, SRP_RPORT_BLOCKED) == 0) {
471+
pr_debug("%s new state: %d\n", dev_name(&shost->shost_gendev),
472+
rport->state);
473+
scsi_target_block(&shost->shost_gendev);
474+
if (fast_io_fail_tmo >= 0)
467475
queue_delayed_work(system_long_wq,
468-
&rport->reconnect_work,
469-
1UL * delay * HZ);
470-
if (srp_rport_set_state(rport, SRP_RPORT_BLOCKED) == 0) {
471-
pr_debug("%s new state: %d\n",
472-
dev_name(&shost->shost_gendev),
473-
rport->state);
474-
scsi_target_block(&shost->shost_gendev);
475-
if (fast_io_fail_tmo >= 0)
476-
queue_delayed_work(system_long_wq,
477-
&rport->fast_io_fail_work,
478-
1UL * fast_io_fail_tmo * HZ);
479-
if (dev_loss_tmo >= 0)
480-
queue_delayed_work(system_long_wq,
481-
&rport->dev_loss_work,
482-
1UL * dev_loss_tmo * HZ);
483-
}
484-
} else {
485-
pr_debug("%s has already been deleted\n",
486-
dev_name(&shost->shost_gendev));
487-
srp_rport_set_state(rport, SRP_RPORT_FAIL_FAST);
488-
scsi_target_unblock(&shost->shost_gendev,
489-
SDEV_TRANSPORT_OFFLINE);
476+
&rport->fast_io_fail_work,
477+
1UL * fast_io_fail_tmo * HZ);
478+
if (dev_loss_tmo >= 0)
479+
queue_delayed_work(system_long_wq,
480+
&rport->dev_loss_work,
481+
1UL * dev_loss_tmo * HZ);
490482
}
491483
}
492484

@@ -560,7 +552,7 @@ int srp_reconnect_rport(struct srp_rport *rport)
560552
scsi_target_block(&shost->shost_gendev);
561553
while (scsi_request_fn_active(shost))
562554
msleep(20);
563-
res = i->f->reconnect(rport);
555+
res = rport->state != SRP_RPORT_LOST ? i->f->reconnect(rport) : -ENODEV;
564556
pr_debug("%s (state %d): transport.reconnect() returned %d\n",
565557
dev_name(&shost->shost_gendev), rport->state, res);
566558
if (res == 0) {
@@ -626,10 +618,6 @@ static void srp_rport_release(struct device *dev)
626618
{
627619
struct srp_rport *rport = dev_to_rport(dev);
628620

629-
cancel_delayed_work_sync(&rport->reconnect_work);
630-
cancel_delayed_work_sync(&rport->fast_io_fail_work);
631-
cancel_delayed_work_sync(&rport->dev_loss_work);
632-
633621
put_device(dev->parent);
634622
kfree(rport);
635623
}
@@ -784,12 +772,6 @@ void srp_rport_del(struct srp_rport *rport)
784772
device_del(dev);
785773
transport_destroy_device(dev);
786774

787-
mutex_lock(&rport->mutex);
788-
if (rport->state == SRP_RPORT_BLOCKED)
789-
__rport_fail_io_fast(rport);
790-
rport->deleted = true;
791-
mutex_unlock(&rport->mutex);
792-
793775
put_device(dev);
794776
}
795777
EXPORT_SYMBOL_GPL(srp_rport_del);
@@ -814,6 +796,27 @@ void srp_remove_host(struct Scsi_Host *shost)
814796
}
815797
EXPORT_SYMBOL_GPL(srp_remove_host);
816798

799+
/**
800+
* srp_stop_rport_timers - stop the transport layer recovery timers
801+
*
802+
* Must be called after srp_remove_host() and scsi_remove_host(). The caller
803+
* must hold a reference on the rport (rport->dev) and on the SCSI host
804+
* (rport->dev.parent).
805+
*/
806+
void srp_stop_rport_timers(struct srp_rport *rport)
807+
{
808+
mutex_lock(&rport->mutex);
809+
if (rport->state == SRP_RPORT_BLOCKED)
810+
__rport_fail_io_fast(rport);
811+
srp_rport_set_state(rport, SRP_RPORT_LOST);
812+
mutex_unlock(&rport->mutex);
813+
814+
cancel_delayed_work_sync(&rport->reconnect_work);
815+
cancel_delayed_work_sync(&rport->fast_io_fail_work);
816+
cancel_delayed_work_sync(&rport->dev_loss_work);
817+
}
818+
EXPORT_SYMBOL_GPL(srp_stop_rport_timers);
819+
817820
static int srp_tsk_mgmt_response(struct Scsi_Host *shost, u64 nexus, u64 tm_id,
818821
int result)
819822
{

include/scsi/scsi_transport_srp.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ struct srp_rport_identifiers {
1919
* @SRP_RPORT_BLOCKED: Transport layer not operational; fast I/O fail timer
2020
* is running and I/O has been blocked.
2121
* @SRP_RPORT_FAIL_FAST: Fast I/O fail timer has expired; fail I/O fast.
22-
* @SRP_RPORT_LOST: Device loss timer has expired; port is being removed.
22+
* @SRP_RPORT_LOST: Port is being removed.
2323
*/
2424
enum srp_rport_state {
2525
SRP_RPORT_RUNNING,
@@ -48,7 +48,6 @@ struct srp_rport {
4848

4949
struct mutex mutex;
5050
enum srp_rport_state state;
51-
bool deleted;
5251
int reconnect_delay;
5352
int failed_reconnects;
5453
struct delayed_work reconnect_work;
@@ -101,6 +100,7 @@ extern int srp_tmo_valid(int reconnect_delay, int fast_io_fail_tmo,
101100
extern int srp_reconnect_rport(struct srp_rport *rport);
102101
extern void srp_start_tl_fail_timers(struct srp_rport *rport);
103102
extern void srp_remove_host(struct Scsi_Host *);
103+
extern void srp_stop_rport_timers(struct srp_rport *rport);
104104

105105
/**
106106
* srp_chkready() - evaluate the transport layer state before I/O

0 commit comments

Comments
 (0)