patch-2.4.22 linux-2.4.22/drivers/block/ll_rw_blk.c
- Lines: 452
- Date: 2003-08-25 04:44:41.000000000 -0700
- Orig file: linux-2.4.21/drivers/block/ll_rw_blk.c
- Orig date: 2003-06-13 07:51:32.000000000 -0700
diff -urN linux-2.4.21/drivers/block/ll_rw_blk.c linux-2.4.22/drivers/block/ll_rw_blk.c
@@ -176,11 +176,12 @@
{
int count = q->nr_requests;
- count -= __blk_cleanup_queue(&q->rq[READ]);
- count -= __blk_cleanup_queue(&q->rq[WRITE]);
+ count -= __blk_cleanup_queue(&q->rq);
if (count)
printk("blk_cleanup_queue: leaked requests (%d)\n", count);
+ if (atomic_read(&q->nr_sectors))
+ printk("blk_cleanup_queue: leaked sectors (%d)\n", atomic_read(&q->nr_sectors));
memset(q, 0, sizeof(*q));
}
@@ -215,6 +216,24 @@
}
/**
+ * blk_queue_throttle_sectors - indicates you will call sector throttling funcs
+ * @q: The queue which this applies to.
+ * @active: A flag indicating whether you want sector throttling on
+ *
+ * Description:
+ * The sector throttling code allows us to put a limit on the number of
+ * sectors pending io to the disk at a given time.  Passing @active as
+ * nonzero indicates you will call blk_started_sectors and blk_finished_sectors,
+ * in addition to blk_started_io and blk_finished_io, in order to keep
+ * track of the number of sectors in flight.
+ **/
+
+void blk_queue_throttle_sectors(request_queue_t * q, int active)
+{
+ q->can_throttle = active;
+}
+
+/**
* blk_queue_make_request - define an alternate make_request function for a device
* @q: the request queue for the device to be affected
* @mfn: the alternate make_request function
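Usage note: the kernel-doc above implies that a driver opts in to sector throttling once, at queue setup time. A minimal sketch of such a call site follows; the driver name and request function are hypothetical and not part of this patch.

/* hypothetical driver init: opt in to sector throttling on this queue */
static void mydrv_setup_queue(request_queue_t *q)
{
	blk_init_queue(q, mydrv_request_fn);	/* standard 2.4 queue setup */
	blk_queue_throttle_sectors(q, 1);	/* enable nr_sectors accounting */
}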
@@ -389,7 +408,7 @@
*
* Returns the (new) number of requests which the queue has available.
*/
-int blk_grow_request_list(request_queue_t *q, int nr_requests)
+int blk_grow_request_list(request_queue_t *q, int nr_requests, int max_queue_sectors)
{
unsigned long flags;
/* Several broken drivers assume that this function doesn't sleep,
@@ -399,21 +418,34 @@
spin_lock_irqsave(&io_request_lock, flags);
while (q->nr_requests < nr_requests) {
struct request *rq;
- int rw;
rq = kmem_cache_alloc(request_cachep, SLAB_ATOMIC);
if (rq == NULL)
break;
memset(rq, 0, sizeof(*rq));
rq->rq_status = RQ_INACTIVE;
- rw = q->nr_requests & 1;
- list_add(&rq->queue, &q->rq[rw].free);
- q->rq[rw].count++;
+ list_add(&rq->queue, &q->rq.free);
+ q->rq.count++;
+
q->nr_requests++;
}
+
+ /*
+ * Wake up waiters after both one quarter of the
+ * max-in-flight queue and one quarter of the requests
+ * are available again.
+ */
+
q->batch_requests = q->nr_requests / 4;
if (q->batch_requests > 32)
q->batch_requests = 32;
+ q->batch_sectors = max_queue_sectors / 4;
+
+ q->max_queue_sectors = max_queue_sectors;
+
+ BUG_ON(!q->batch_sectors);
+ atomic_set(&q->nr_sectors, 0);
+
spin_unlock_irqrestore(&io_request_lock, flags);
return q->nr_requests;
}
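For a sense of scale, the thresholds computed above mean sleepers are only woken once a quarter of both budgets has drained. The arithmetic below is illustrative only, assuming MAX_QUEUE_SECTORS (defined in include/linux/blkdev.h, not in this file) corresponds to 4MB worth of 512-byte sectors and that nr_requests is 1024; the real constants may differ.

/* illustrative only: the real constants live in include/linux/blkdev.h */
static void example_thresholds(void)
{
	int max_queue_sectors = 4 << (20 - 9);	/* assumed: 8192 sectors (4MB) */
	int nr_requests = 1024;			/* assumed default */
	int batch_requests = nr_requests / 4;
	int batch_sectors = max_queue_sectors / 4;

	if (batch_requests > 32)
		batch_requests = 32;

	/* i.e. waiters are woken once 32 requests and 2048 sectors (1MB) are free */
	printk("batch_requests=%d batch_sectors=%d\n", batch_requests, batch_sectors);
}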
@@ -422,23 +454,28 @@
{
struct sysinfo si;
int megs; /* Total memory, in megabytes */
- int nr_requests;
-
- INIT_LIST_HEAD(&q->rq[READ].free);
- INIT_LIST_HEAD(&q->rq[WRITE].free);
- q->rq[READ].count = 0;
- q->rq[WRITE].count = 0;
+ int nr_requests, max_queue_sectors = MAX_QUEUE_SECTORS;
+
+ INIT_LIST_HEAD(&q->rq.free);
+ q->rq.count = 0;
+ q->rq.pending[READ] = q->rq.pending[WRITE] = 0;
q->nr_requests = 0;
si_meminfo(&si);
megs = si.totalram >> (20 - PAGE_SHIFT);
- nr_requests = 128;
- if (megs < 32)
- nr_requests /= 2;
- blk_grow_request_list(q, nr_requests);
+ nr_requests = MAX_NR_REQUESTS;
+ if (megs < 30) {
+ nr_requests /= 2;
+ max_queue_sectors /= 2;
+ }
+ /* notice early if anybody screwed the defaults */
+ BUG_ON(!nr_requests);
+ BUG_ON(!max_queue_sectors);
+
+ blk_grow_request_list(q, nr_requests, max_queue_sectors);
+
+ init_waitqueue_head(&q->wait_for_requests);
- init_waitqueue_head(&q->wait_for_requests[0]);
- init_waitqueue_head(&q->wait_for_requests[1]);
spin_lock_init(&q->queue_lock);
}
@@ -491,6 +528,8 @@
q->plug_tq.routine = &generic_unplug_device;
q->plug_tq.data = q;
q->plugged = 0;
+ q->can_throttle = 0;
+
/*
* These booleans describe the queue properties. We set the
* default (and most common) values here. Other drivers can
@@ -511,12 +550,29 @@
static struct request *get_request(request_queue_t *q, int rw)
{
struct request *rq = NULL;
- struct request_list *rl = q->rq + rw;
+ struct request_list *rl = &q->rq;
+ if (blk_oversized_queue(q)) {
+ int rlim = q->nr_requests >> 5;
+
+ if (rlim < 4)
+ rlim = 4;
+
+ /*
+ * if it's a write, or we have more than a handful of reads
+ * pending, bail out
+ */
+ if ((rw == WRITE) || (rw == READ && rl->pending[READ] > rlim))
+ return NULL;
+ if (blk_oversized_queue_reads(q))
+ return NULL;
+ }
+
if (!list_empty(&rl->free)) {
rq = blkdev_free_rq(&rl->free);
list_del(&rq->queue);
rl->count--;
+ rl->pending[rw]++;
rq->rq_status = RQ_ACTIVE;
rq->cmd = rw;
rq->special = NULL;
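get_request() relies on blk_oversized_queue() and blk_oversized_queue_reads(), which this patch adds to include/linux/blkdev.h rather than to this file. A sketch of what such helpers look like, inferred from the fields used here; treat the exact form as an assumption, not a quote of the header.

/* sketch, inferred from usage; not a verbatim copy of the blkdev.h hunk */
static inline int blk_oversized_queue(request_queue_t * q)
{
	if (q->can_throttle)
		return atomic_read(&q->nr_sectors) > q->max_queue_sectors;
	return q->rq.count == 0;
}

static inline int blk_oversized_queue_reads(request_queue_t * q)
{
	/* reads are allowed one extra batch of headroom over writes */
	if (q->can_throttle)
		return atomic_read(&q->nr_sectors) > q->max_queue_sectors + q->batch_sectors;
	return q->rq.count == 0;
}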
@@ -527,29 +583,19 @@
}
/*
- * Here's the request allocation design:
+ * Here's the request allocation design, low latency version:
*
* 1: Blocking on request exhaustion is a key part of I/O throttling.
*
* 2: We want to be `fair' to all requesters. We must avoid starvation, and
* attempt to ensure that all requesters sleep for a similar duration. Hence
* no stealing requests when there are other processes waiting.
- *
- * 3: We also wish to support `batching' of requests. So when a process is
- * woken, we want to allow it to allocate a decent number of requests
- * before it blocks again, so they can be nicely merged (this only really
- * matters if the process happens to be adding requests near the head of
- * the queue).
- *
- * 4: We want to avoid scheduling storms. This isn't really important, because
- * the system will be I/O bound anyway. But it's easy.
- *
- * There is tension between requirements 2 and 3. Once a task has woken,
- * we don't want to allow it to sleep as soon as it takes its second request.
- * But we don't want currently-running tasks to steal all the requests
- * from the sleepers. We handle this with wakeup hysteresis around
- * 0 .. batch_requests and with the assumption that request taking is much,
- * much faster than request freeing.
+ *
+ * There used to be more here, attempting to allow a process to send in a
+ * number of requests once it has woken up. But there's no way to
+ * tell if a process has just been woken up, or if it is a new process
+ * coming in to steal requests from the waiters. So we give up and force
+ * everyone to wait fairly.
*
* So here's what we do:
*
@@ -561,28 +607,23 @@
*
* When a process wants a new request:
*
- * b) If free_requests == 0, the requester sleeps in FIFO manner.
- *
- * b) If 0 < free_requests < batch_requests and there are waiters,
- * we still take a request non-blockingly. This provides batching.
- *
- * c) If free_requests >= batch_requests, the caller is immediately
- * granted a new request.
+ * b) If free_requests == 0, the requester sleeps in FIFO manner, and
+ * the queue full condition is set. The full condition is not
+ * cleared until there are no longer any waiters. Once the full
+ * condition is set, all new io must wait, hopefully for a very
+ * short period of time.
*
* When a request is released:
*
- * d) If free_requests < batch_requests, do nothing.
- *
- * f) If free_requests >= batch_requests, wake up a single waiter.
+ * c) If free_requests < batch_requests, do nothing.
*
- * The net effect is that when a process is woken at the batch_requests level,
- * it will be able to take approximately (batch_requests) requests before
- * blocking again (at the tail of the queue).
- *
- * This all assumes that the rate of taking requests is much, much higher
- * than the rate of releasing them. Which is very true.
+ * d) If free_requests >= batch_requests, wake up a single waiter.
*
- * -akpm, Feb 2002.
+ * As each waiter gets a request, he wakes another waiter. We do this
+ * to prevent a race where an unplug might get run before a request makes
+ * its way onto the queue. The result is a cascade of wakeups, so delaying
+ * the initial wakeup until we've got batch_requests available helps avoid
+ * wakeups where there aren't any requests available yet.
*/
static struct request *__get_request_wait(request_queue_t *q, int rw)
@@ -590,21 +631,37 @@
register struct request *rq;
DECLARE_WAITQUEUE(wait, current);
- add_wait_queue(&q->wait_for_requests[rw], &wait);
+ add_wait_queue_exclusive(&q->wait_for_requests, &wait);
+
do {
set_current_state(TASK_UNINTERRUPTIBLE);
- generic_unplug_device(q);
- if (q->rq[rw].count == 0)
- schedule();
spin_lock_irq(&io_request_lock);
+ if (blk_oversized_queue(q) || q->rq.count == 0) {
+ __generic_unplug_device(q);
+ spin_unlock_irq(&io_request_lock);
+ schedule();
+ spin_lock_irq(&io_request_lock);
+ }
rq = get_request(q, rw);
spin_unlock_irq(&io_request_lock);
} while (rq == NULL);
- remove_wait_queue(&q->wait_for_requests[rw], &wait);
+ remove_wait_queue(&q->wait_for_requests, &wait);
current->state = TASK_RUNNING;
+
return rq;
}
+static void get_request_wait_wakeup(request_queue_t *q, int rw)
+{
+ /*
+ * avoid losing an unplug if a second __get_request_wait did the
+ * generic_unplug_device while our __get_request_wait was running
+ * w/o the queue_lock held and w/ our request out of the queue.
+ */
+ if (waitqueue_active(&q->wait_for_requests))
+ wake_up(&q->wait_for_requests);
+}
+
/* RO fail safe mechanism */
static long ro_bits[MAX_BLKDEV][8];
@@ -818,7 +875,6 @@
void blkdev_release_request(struct request *req)
{
request_queue_t *q = req->q;
- int rw = req->cmd;
req->rq_status = RQ_INACTIVE;
req->q = NULL;
@@ -828,9 +884,29 @@
* assume it has free buffers and check waiters
*/
if (q) {
- list_add(&req->queue, &q->rq[rw].free);
- if (++q->rq[rw].count >= q->batch_requests)
- wake_up(&q->wait_for_requests[rw]);
+ struct request_list *rl = &q->rq;
+ int oversized_batch = 0;
+
+ if (q->can_throttle)
+ oversized_batch = blk_oversized_queue_batch(q);
+ rl->count++;
+ /*
+ * paranoia check
+ */
+ if (req->cmd == READ || req->cmd == WRITE)
+ rl->pending[req->cmd]--;
+ if (rl->pending[READ] > q->nr_requests)
+ printk("blk: reads: %u\n", rl->pending[READ]);
+ if (rl->pending[WRITE] > q->nr_requests)
+ printk("blk: writes: %u\n", rl->pending[WRITE]);
+ if (rl->pending[READ] + rl->pending[WRITE] > q->nr_requests)
+ printk("blk: r/w: %u + %u > %u\n", rl->pending[READ], rl->pending[WRITE], q->nr_requests);
+ list_add(&req->queue, &rl->free);
+ if (rl->count >= q->batch_requests && !oversized_batch) {
+ smp_mb();
+ if (waitqueue_active(&q->wait_for_requests))
+ wake_up(&q->wait_for_requests);
+ }
}
}
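blkdev_release_request() also uses blk_oversized_queue_batch() to decide whether a wakeup is worthwhile; it too lives outside this file. A sketch consistent with how it is used here, again an inference rather than a quote:

/* sketch: suppress wakeups until a full batch of sectors has drained
 * below the in-flight limit */
static inline int blk_oversized_queue_batch(request_queue_t * q)
{
	return atomic_read(&q->nr_sectors) > q->max_queue_sectors - q->batch_sectors;
}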
@@ -901,16 +977,18 @@
static int __make_request(request_queue_t * q, int rw,
struct buffer_head * bh)
{
- unsigned int sector, count;
+ unsigned int sector, count, sync;
int max_segments = MAX_SEGMENTS;
struct request * req, *freereq = NULL;
int rw_ahead, max_sectors, el_ret;
struct list_head *head, *insert_here;
int latency;
elevator_t *elevator = &q->elevator;
+ int should_wake = 0;
count = bh->b_size >> 9;
sector = bh->b_rsector;
+ sync = test_and_clear_bit(BH_Sync, &bh->b_state);
rw_ahead = 0; /* normal case; gets changed below for READA */
switch (rw) {
@@ -948,7 +1026,6 @@
*/
max_sectors = get_max_sectors(bh->b_rdev);
-again:
req = NULL;
head = &q->queue_head;
/*
@@ -957,7 +1034,9 @@
*/
spin_lock_irq(&io_request_lock);
+again:
insert_here = head->prev;
+
if (list_empty(head)) {
q->plug_device_fn(q, bh->b_rdev); /* is atomic */
goto get_rq;
@@ -976,6 +1055,7 @@
req->bhtail = bh;
req->nr_sectors = req->hard_nr_sectors += count;
blk_started_io(count);
+ blk_started_sectors(req, count);
drive_stat_acct(req->rq_dev, req->cmd, count, 0);
req_new_io(req, 1, count);
attempt_back_merge(q, req, max_sectors, max_segments);
@@ -998,6 +1078,7 @@
req->sector = req->hard_sector = sector;
req->nr_sectors = req->hard_nr_sectors += count;
blk_started_io(count);
+ blk_started_sectors(req, count);
drive_stat_acct(req->rq_dev, req->cmd, count, 0);
req_new_io(req, 1, count);
attempt_front_merge(q, head, req, max_sectors, max_segments);
@@ -1030,7 +1111,7 @@
* See description above __get_request_wait()
*/
if (rw_ahead) {
- if (q->rq[rw].count < q->batch_requests) {
+ if (q->rq.count < q->batch_requests || blk_oversized_queue_batch(q)) {
spin_unlock_irq(&io_request_lock);
goto end_io;
}
@@ -1042,6 +1123,9 @@
if (req == NULL) {
spin_unlock_irq(&io_request_lock);
freereq = __get_request_wait(q, rw);
+ head = &q->queue_head;
+ spin_lock_irq(&io_request_lock);
+ should_wake = 1;
goto again;
}
}
@@ -1064,10 +1148,15 @@
req->start_time = jiffies;
req_new_io(req, 0, count);
blk_started_io(count);
+ blk_started_sectors(req, count);
add_request(q, req, insert_here);
out:
if (freereq)
blkdev_release_request(freereq);
+ if (should_wake)
+ get_request_wait_wakeup(q, rw);
+ if (sync)
+ __generic_unplug_device(q);
spin_unlock_irq(&io_request_lock);
return 0;
end_io:
@@ -1196,8 +1285,15 @@
bh->b_rdev = bh->b_dev;
bh->b_rsector = bh->b_blocknr * count;
+ get_bh(bh);
generic_make_request(rw, bh);
+ /* fix race condition with wait_on_buffer() */
+ smp_mb(); /* spin_unlock may have inclusive semantics */
+ if (waitqueue_active(&bh->b_wait))
+ wake_up(&bh->b_wait);
+
+ put_bh(bh);
switch (rw) {
case WRITE:
kstat.pgpgout += count;
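For context on the race being fixed: the waiter side (fs/buffer.c) re-checks buffer_locked() after putting itself on bh->b_wait, so the wake_up above cannot be lost as long as the buffer is pinned across generic_make_request(), which is what the get_bh()/put_bh() pair provides. A simplified sketch of that waiter, an approximation rather than a verbatim copy:

/* simplified sketch of the waiter this wake_up pairs with */
void __wait_on_buffer(struct buffer_head * bh)
{
	DECLARE_WAITQUEUE(wait, current);

	get_bh(bh);
	add_wait_queue(&bh->b_wait, &wait);
	do {
		run_task_queue(&tq_disk);
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (!buffer_locked(bh))
			break;
		schedule();
	} while (buffer_locked(bh));
	current->state = TASK_RUNNING;
	remove_wait_queue(&bh->b_wait, &wait);
	put_bh(bh);
}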
@@ -1350,6 +1446,7 @@
if ((bh = req->bh) != NULL) {
nsect = bh->b_size >> 9;
blk_finished_io(nsect);
+ blk_finished_sectors(req, nsect);
req->bh = bh->b_reqnext;
bh->b_reqnext = NULL;
bh->b_end_io(bh, uptodate);
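blk_finished_sectors() above, together with blk_started_sectors() in __make_request(), maintains the per-queue nr_sectors count that the throttling checks read. Both helpers are added outside this file; a minimal sketch of what they must do, given how they are called here (the can_throttle guard is inferred from blk_queue_throttle_sectors()):

/* sketch: in-flight sector accounting, active only when the driver has
 * called blk_queue_throttle_sectors(q, 1) */
static inline void blk_started_sectors(struct request *rq, int count)
{
	request_queue_t *q = rq->q;
	if (q && q->can_throttle)
		atomic_add(count, &q->nr_sectors);
}

static inline void blk_finished_sectors(struct request *rq, int count)
{
	request_queue_t *q = rq->q;
	if (q && q->can_throttle)
		atomic_sub(count, &q->nr_sectors);
}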
@@ -1509,6 +1606,7 @@
EXPORT_SYMBOL(blk_get_queue);
EXPORT_SYMBOL(blk_cleanup_queue);
EXPORT_SYMBOL(blk_queue_headactive);
+EXPORT_SYMBOL(blk_queue_throttle_sectors);
EXPORT_SYMBOL(blk_queue_make_request);
EXPORT_SYMBOL(generic_make_request);
EXPORT_SYMBOL(blkdev_release_request);