The Linux kernel file-read path: a detailed source-code walkthrough of where reads block

This walkthrough uses Linux kernel version 3.13 as an example. A user-space read() call enters the kernel through the system call layer and executes sys_read(), defined in the file linux/fs/read_write.c:

//linux/fs/read_write.c

SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
    struct fd f = fdget(fd);     //look up the struct file for this fd in the process's open-file table
    ssize_t ret = -EBADF;

    if (f.file) {
        loff_t pos = file_pos_read(f.file);       //read the current file position
        ret = vfs_read(f.file, buf, count, &pos); //hand the request to the VFS layer
        if (ret >= 0)
            file_pos_write(f.file, pos);    //write the updated position back to the file object
        fdput(f);
    }
    return ret;
}

The process control block of each process, task_struct, contains a files_struct structure that records all of the process's open files; the file descriptor fd is used as an index into it to find the corresponding file object. The file object also records the current position within the file.
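
As a rough illustration of that lookup, a simplified sketch might look like the following (this is not the real fdget() code: it ignores the RCU protection and reference counting that fdget() performs, and fd_to_file_sketch is a made-up name):

//simplified sketch for illustration only; omits RCU locking and reference counting
struct file *fd_to_file_sketch(unsigned int fd)
{
    struct files_struct *files = current->files;  //per-process table of open files
    struct fdtable *fdt = files_fdtable(files);   //descriptor table holding the fd array

    if (fd >= fdt->max_fds)
        return NULL;
    return fdt->fd[fd];                           //struct file: holds f_pos, f_op, f_mode, ...
}
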
Next, look at the vfs_read function, also in the file linux/fs/read_write.c:

//linux/fs/read_write.c

ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
    ssize_t ret;

    if (!(file->f_mode & FMODE_READ))
        return -EBADF;
    if (!file->f_op->read && !file->f_op->aio_read)
        return -EINVAL;
    if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))   //verify that the user buffer is writable
        return -EFAULT;

    ret = rw_verify_area(READ, file, pos, count); //check mandatory locks and the validity of the read range
    if (ret >= 0) {
        count = ret;
        if (file->f_op->read)
            ret = file->f_op->read(file, buf, count, pos);
        else
            ret = do_sync_read(file, buf, count, pos);
        if (ret > 0) {
            fsnotify_access(file);
            add_rchar(current, ret);
        }
        inc_syscr(current);
    }

    return ret;
}

If the file defines a read method, the file's own read function is called; otherwise do_sync_read() is called. file->f_op comes from the corresponding inode->i_fop, which is set by the file system type when the inode is created. For an ordinary on-disk file system such as ext2, file->f_op->read is typically do_sync_read().
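
For reference, this is roughly how ext2 wires these operations up in 3.13 (abridged from linux/fs/ext2/file.c; only the fields relevant here are shown):

//linux/fs/ext2/file.c (abridged)
const struct file_operations ext2_file_operations = {
    .llseek     = generic_file_llseek,
    .read       = do_sync_read,            //f_op->read used by vfs_read()
    .write      = do_sync_write,
    .aio_read   = generic_file_aio_read,   //f_op->aio_read used by do_sync_read()
    .aio_write  = generic_file_aio_write,
    /* ... mmap, open, fsync, splice, ... */
};
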
Take a look at the do_sync_read() function:

//linux/fs/read_write.c

ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
   //wrap the single user buffer in an iovec (iov_base/iov_len); the kernel works
   //with iovecs internally so that sys_read() and sys_readv() share the same path
    struct iovec iov = { .iov_base = buf, .iov_len = len };
    struct kiocb kiocb;
    ssize_t ret;

   //set up a synchronous kiocb (kernel I/O control block) describing this request
    init_sync_kiocb(&kiocb, filp);
    kiocb.ki_pos = *ppos;
    kiocb.ki_nbytes = len;

   //issue the read through the file's asynchronous read method
    ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
   //-EIOCBQUEUED means the request was queued and has not completed yet
    if (-EIOCBQUEUED == ret)
        ret = wait_on_sync_kiocb(&kiocb); //sleep in TASK_UNINTERRUPTIBLE until the kiocb completes (ki_ctx is set)
    *ppos = kiocb.ki_pos;
    return ret;
}

do_sync_read() continues by calling the file's own f_op->aio_read() to issue the read asynchronously, and then, if necessary, calls wait_on_sync_kiocb() to wait for completion (that is, the data is ready by the time wait_on_sync_kiocb() returns). For the ext2 file system, f_op->aio_read() points to the generic generic_file_aio_read().
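
For reference, the waiting side looks roughly like this in 3.13 (an abridged sketch of linux/fs/aio.c, not a verbatim listing): the caller sleeps in TASK_UNINTERRUPTIBLE until the completion path marks the synchronous kiocb as done by setting ki_ctx:

//linux/fs/aio.c (abridged sketch)
ssize_t wait_on_sync_kiocb(struct kiocb *req)
{
    while (!req->ki_ctx) {                       //completion sets ki_ctx for a sync kiocb
        set_current_state(TASK_UNINTERRUPTIBLE); //sleep without reacting to signals
        if (req->ki_ctx)
            break;
        io_schedule();                           //give up the CPU until woken by I/O completion
    }
    __set_current_state(TASK_RUNNING);
    return req->ki_user_data;                    //number of bytes transferred
}
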
Take a look at the generic_file_aio_read() function:

//linux/mm/filemap.c

ssize_t
generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
        unsigned long nr_segs, loff_t pos)
{
    struct file *filp = iocb->ki_filp;
    ssize_t retval;
    unsigned long seg = 0;
    size_t count;
    loff_t *ppos = &iocb->ki_pos;

    count = 0;
   //validate the iovec array (possibly shrinking nr_segs) and compute the total byte count
    retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
    if (retval)
        return retval;

   /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
   //O_DIRECT: bypass the page cache and read through address_space->direct_IO
    if (filp->f_flags & O_DIRECT) {
        loff_t size;
        struct address_space *mapping;
        struct inode *inode;

        mapping = filp->f_mapping;
        inode = mapping->host;
        if (!count)
            goto out;/* skip atime */
        size = i_size_read(inode);
        if (pos < size) {
           //write back and wait for any dirty pages in the range before the direct read
            retval = filemap_write_and_wait_range(mapping, pos,
                    pos + iov_length(iov, nr_segs) - 1);
            if (!retval) {
               //issue the direct read through address_space->direct_IO
                retval = mapping->a_ops->direct_IO(READ, iocb,
                            iov, pos, nr_segs);
            }
            if (retval > 0) {
                *ppos = pos + retval;
                count -= retval;
            }

           /*
             * Btrfs can have a short DIO read if we encounter
             * compressed extents, so if there was an error, or if
             * we've already read everything we wanted to, or if
             * there was a short read because we hit EOF, go ahead
             * and return.  Otherwise fallthrough to buffered io for
             * the rest of the read.
             */
            if (retval < 0 || !count || *ppos >= size) {
                file_accessed(filp);
                goto out;
            }
        }
    }

    count = retval;
   //for each iovec segment, build a read_descriptor_t and let do_generic_file_read() fill it from the page cache
    for (seg = 0; seg < nr_segs; seg++) {
        read_descriptor_t desc;
        loff_t offset = 0;

       /*
         * If we did a short DIO read we need to skip the section of the
         * iov that we've already read data into.
         */
        if (count) {
            if (count > iov[seg].iov_len) {
                count -= iov[seg].iov_len;
                continue;
            }
            offset = count;
            count = 0;
        }

        desc.written = 0;
        desc.arg.buf = iov[seg].iov_base + offset;
        desc.count = iov[seg].iov_len - offset;
        if (desc.count == 0)
            continue;
        desc.error = 0;
        do_generic_file_read(filp, ppos, &desc);
        retval += desc.written;
        if (desc.error) {
            retval = retval ?: desc.error;
            break;
        }
        if (desc.count > 0)
            break;
    }
out:
    return retval;
}

do_generic_file_read() is a general-purpose read routine provided by the kernel. It completes the read of the contiguous user buffer described by one iovec segment (represented by a read_descriptor_t structure). For each page, do_generic_file_read() checks whether it is already in the page cache: if so, the data is copied straight to user space; if not, the page is first read from disk into the page cache and then handled as in the cached case. The code is as follows:

//linux/mm/filemap.c

static void do_generic_file_read(struct file *filp, loff_t *ppos,
        read_descriptor_t *desc)
{
    struct address_space *mapping = filp->f_mapping;
    struct inode *inode = mapping->host;
    struct file_ra_state *ra = &filp->f_ra;
    pgoff_t index;
    pgoff_t last_index;
    pgoff_t prev_index;
    unsigned long offset;     /* offset into pagecache page */
    unsigned int prev_offset;
    int error;

   //compute the page-cache index range (index..last_index) and the offset within the first page
    index = *ppos >> PAGE_CACHE_SHIFT;
    prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
    prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
    last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
    offset = *ppos & ~PAGE_CACHE_MASK;

    for (;;) {
        struct page *page;
        pgoff_t end_index;
        loff_t isize;
        unsigned long nr, ret;

        cond_resched();  //yield the CPU if a reschedule is pending
find_page:
        page = find_get_page(mapping, index);  //look the page up in the page cache
        if (!page) {     //not cached: start synchronous readahead, then look again
            page_cache_sync_readahead(mapping,
                    ra, filp,
                    index, last_index - index);
            page = find_get_page(mapping, index);
            if (unlikely(page == NULL))  //readahead did not bring it in: allocate a page ourselves
                goto no_cached_page;
        }
       //this page carries a readahead marker: kick off asynchronous readahead
        if (PageReadahead(page)) {
            page_cache_async_readahead(mapping,
                    ra, filp, page,
                    index, last_index - index);
        }
       //PG_uptodate not set: the page does not yet contain valid data (its bio has not completed)
        if (!PageUptodate(page)) {
            if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
                    !mapping->a_ops->is_partially_uptodate)
                goto page_not_up_to_date;
            if (!trylock_page(page))
                goto page_not_up_to_date;
           /* Did it get truncated before we got the lock? */
            if (!page->mapping)
                goto page_not_up_to_date_locked;
            if (!mapping->a_ops->is_partially_uptodate(page,
                                desc, offset))
                goto page_not_up_to_date_locked;
            unlock_page(page);
        }
page_ok:
       /*
         * i_size must be checked after we know the page is Uptodate.
         *
         * Checking i_size after the check allows us to calculate
         * the correct value for "nr", which means the zero-filled
         * part of the page is not copied back to userspace (unless
         * another truncate extends the file - this is desired though).
         */

        isize = i_size_read(inode);
        end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
        if (unlikely(!isize || index > end_index)) {
            page_cache_release(page);
            goto out;
        }

       /* nr is the maximum number of bytes to copy from this page */
        nr = PAGE_CACHE_SIZE;
        if (index == end_index) {
            nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
            if (nr <= offset) {
                page_cache_release(page);
                goto out;
            }
        }
        nr = nr - offset;

       /* If users can be writing to this page using arbitrary
         * virtual addresses, take care about potential aliasing
         * before reading the page on the kernel side.
         */
        if (mapping_writably_mapped(mapping))
            flush_dcache_page(page);

       /*
         * When a sequential read accesses a page several times,
         * only mark it as accessed the first time.
         */
        if (prev_index != index || offset != prev_offset)
            mark_page_accessed(page);
        prev_index = index;

       /*
         * Ok, we have the page, and it's up-to-date, so
         * now we can copy it to user space...
         *
         * The file_read_actor routine returns how many bytes were
         * actually used..
         * NOTE! This may not be the same as how much of a user buffer
         * we filled up (we may be padding etc), so we can only update
         * "pos" here (the actor routine has to update the user buffer
         * pointers and the remaining count).
         */
       //copy the data from this page cache page into the user buffer
        ret = file_read_actor(desc, page, offset, nr);
        offset += ret;
        index += offset >> PAGE_CACHE_SHIFT;
        offset &= ~PAGE_CACHE_MASK;
        prev_offset = offset;

        page_cache_release(page);
        if (ret == nr && desc->count)
            continue;
        goto out;

page_not_up_to_date:
       /* Get exclusive access to the page ... */
       //lock the page; this may sleep until whoever holds the lock releases it
        error = lock_page_killable(page);
        if (unlikely(error))
            goto readpage_error;

page_not_up_to_date_locked:
       /* Did it get truncated before we got the lock? */
        if (!page->mapping) {
            unlock_page(page);
            page_cache_release(page);
            continue;
        }

       /* Did somebody else fill it already? */
       //another process may have read the page in while we waited for the lock
        if (PageUptodate(page)) {
            unlock_page(page);
            goto page_ok;
        }

readpage:
       /*
         * A previous I/O error may have been due to temporary
         * failures, eg. multipath errors.
         * PG_error will be set again if readpage fails.
         */
        ClearPageError(page);
       /* Start the actual read. The read will unlock the page. */
       //ask the file system to read this page from disk (this submits a bio)
        error = mapping->a_ops->readpage(filp, page);

        if (unlikely(error)) {
            if (error == AOP_TRUNCATED_PAGE) {
                page_cache_release(page);
                goto find_page;
            }
            goto readpage_error;
        }

       //if the read has not completed yet, sleep on the page lock until the I/O completion handler unlocks it
        if (!PageUptodate(page)) {
            error = lock_page_killable(page);
            if (unlikely(error))
                goto readpage_error;
            if (!PageUptodate(page)) {
                if (page->mapping == NULL) {
                   /*
                     * invalidate_mapping_pages got it
                     */
                    unlock_page(page);
                    page_cache_release(page);
                    goto find_page;
                }
                unlock_page(page);
                shrink_readahead_size_eio(filp, ra);
                error = -EIO;
                goto readpage_error;
            }
            unlock_page(page);
        }

        goto page_ok;

readpage_error:
       /* UHHUH! A synchronous read error occurred. Report it */
        desc->error = error;
        page_cache_release(page);
        goto out;

no_cached_page:
       /*
         * Ok, it wasn't cached, so we need to create a new
         * page..
         */
       //allocate a new page frame for the page cache
        page = page_cache_alloc_cold(mapping);
        if (!page) {
            desc->error = -ENOMEM;
            goto out;
        }
       //insert the new page into the address_space's page cache and onto the LRU list
        error = add_to_page_cache_lru(page, mapping,
                        index, GFP_KERNEL);
        if (error) {
            page_cache_release(page);
            if (error == -EEXIST)
                goto find_page;
            desc->error = error;
            goto out;
        }
        goto readpage;
    }

out:
    ra->prev_pos = prev_index;
    ra->prev_pos <<= PAGE_CACHE_SHIFT;
    ra->prev_pos |= prev_offset;

    *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
    file_accessed(filp);
}

The main body of do_generic_file_read() is a for loop. It first computes the page-cache index range covered by the request, and then, for each index, one of three situations applies (a condensed sketch of the loop follows the list):

  1. The page is in the page cache and up to date: file_read_actor() copies the data from the page cache page to user space.
  2. The page is in the page cache but not up to date: it must be locked and read in through address_space->a_ops->readpage(); for the ext2 file system this is ext2_readpage(). Between taking the lock and calling readpage(), another process may already have completed the read, in which case control falls back to the first case.
  3. The page is not in the page cache at all: page_cache_alloc_cold() allocates a new page frame, it is added to the address_space, and processing continues as in the previous case.
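
Condensed to its skeleton, the loop behaves roughly as the following pseudocode (not the real code: readahead, truncation checks and error handling are omitted, variables are as in the real function):

//pseudocode sketch of the do_generic_file_read() loop, not the real kernel code
for (;;) {
    page = find_get_page(mapping, index);
    if (!page) {                                    //case 3: not in the page cache
        page = page_cache_alloc_cold(mapping);
        add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
        goto readpage;
    }
    if (!PageUptodate(page)) {                      //case 2: cached but not up to date
        lock_page_killable(page);                   //may sleep on the page lock
        if (PageUptodate(page))                     //someone else completed the read
            goto page_ok;
readpage:
        mapping->a_ops->readpage(filp, page);       //submit a bio for this page
        lock_page_killable(page);                   //sleep until the I/O completion unlocks the page
        unlock_page(page);
    }
page_ok:                                            //case 1: cached and up to date
    ret = file_read_actor(desc, page, offset, nr);  //copy to the user buffer
    page_cache_release(page);
    /* advance index/offset; stop when desc->count reaches 0 or an error occurs */
}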

The read path for ordinary files goes through the page cache: even when data must come from disk, it is normally read into the page cache first and then copied to user space.
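
To tie this back to the starting point, an ordinary buffered read from user space, like the small illustrative program below (the file path is only an example), is what drives the whole path analyzed here; the data lands in the page cache and file_read_actor() copies it into buf:

//illustrative user-space program, not kernel code
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    char buf[4096];
    int fd = open("/etc/hostname", O_RDONLY);   //example file
    if (fd < 0)
        return 1;
    ssize_t n = read(fd, buf, sizeof(buf));     //read() -> sys_read() -> vfs_read() -> ... -> page cache
    if (n > 0)
        fwrite(buf, 1, (size_t)n, stdout);
    close(fd);
    return 0;
}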

If the file data is not in the page cache, the address_space's readpage() function is called to read the corresponding page from disk:

//linux/fs/ext2/inode.c
static int ext2_readpage(struct file *file, struct page *page)
{
    return mpage_readpage(page, ext2_get_block);
}

//linux/fs/mpage.c
int mpage_readpage(struct page *page, get_block_t get_block)
{
    struct bio *bio = NULL;
    sector_t last_block_in_bio = 0;
    struct buffer_head map_bh;
    unsigned long first_logical_block = 0;

    map_bh.b_state = 0;
    map_bh.b_size = 0;
   //do_mpage_readpage() maps the page's blocks and builds (or extends) a bio describing the read
    bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio,
            &map_bh, &first_logical_block, get_block);
    if (bio)
       //submit the bio to the block layer
        mpage_bio_submit(READ, bio);
    return 0;
}

//linux/fs/mpage.c
static struct bio *mpage_bio_submit(int rw, struct bio *bio)
{
   //set the completion callback that runs when the bio's I/O finishes
    bio->bi_end_io = mpage_end_io;
   //hand the bio to the block layer, where it is scheduled and dispatched as an I/O request
    submit_bio(rw, bio);
    return NULL;
}

//linux/fs/mpage.c
static void mpage_end_io(struct bio *bio, int err)
{
    const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
    struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;

    do {
        struct page *page = bvec->bv_page;

        if (--bvec >= bio->bi_io_vec)
            prefetchw(&bvec->bv_page->flags);
        if (bio_data_dir(bio) == READ) {
            if (uptodate) {
                SetPageUptodate(page);
            } else {
                ClearPageUptodate(page);
                SetPageError(page);
            }
            unlock_page(page);
        } else {/* bio_data_dir(bio) == WRITE */
            if (!uptodate) {
                SetPageError(page);
                if (page->mapping)
                    set_bit(AS_EIO, &page->mapping->flags);
            }
            end_page_writeback(page);
        }
    } while (bvec >= bio->bi_io_vec);
    bio_put(bio);
}

The submit_bio() function turns the bio into a disk request (struct request), which involves merging and I/O-scheduler optimization, and hangs the request on the disk's request queue. Each disk device has a request queue that receives read and write requests asynchronously; these requests are later processed in the background by the kblockd_workqueue work queue, while the actual operation is still carried out by the disk driver. So read and write requests issued by the user are scheduled and optimized by the I/O scheduler before being queued on the disk's request queue. The main work inside submit_bio() is to call generic_make_request():

//linux/block/blk-core.c
void submit_bio(int rw, struct bio *bio)
{
    bio->bi_rw |= rw;

   /*
     * If it's a regular read/write or a barrier with data attached,
     * go through the normal accounting stuff before submission.
     */
    if (bio_has_data(bio)) {
        unsigned int count;

        if (unlikely(rw & REQ_WRITE_SAME))
            count = bdev_logical_block_size(bio->bi_bdev) >> 9;
        else
            count = bio_sectors(bio);

        if (rw & WRITE) {
            count_vm_events(PGPGOUT, count);
        } else {
            task_io_account_read(bio->bi_size);
            count_vm_events(PGPGIN, count);
        }

        if (unlikely(block_dump)) {
            char b[BDEVNAME_SIZE];
            printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
            current->comm, task_pid_nr(current),
                (rw & WRITE) ? "WRITE" : "READ",
                (unsigned long long)bio->bi_sector,
                bdevname(bio->bi_bdev, b),
                count);
        }
    }

    generic_make_request(bio);
}

void generic_make_request(struct bio *bio)
{
    struct bio_list bio_list_on_stack;

   //sanity-check the bio (device exists, request fits the device, queue is alive, ...)
    if (!generic_make_request_checks(bio))
        return;

   /*
     * We only want one ->make_request_fn to be active at a time, else
     * stack usage with stacked devices could be a problem.  So use
     * current->bio_list to keep a list of requests submited by a
     * make_request_fn function.  current->bio_list is also used as a
     * flag to say if generic_make_request is currently active in this
     * task or not.  If it is NULL, then no make_request is active.  If
     * it is non-NULL, then a make_request is active, and new requests
     * should be added at the tail
     */
   //if current->bio_list is not NULL, generic_make_request() is already active in this task: just add the bio to current->bio_list and return
    if (current->bio_list) {
        bio_list_add(current->bio_list, bio);
        return;
    }

   /* following loop may be a bit non-obvious, and so deserves some
     * explanation.
     * Before entering the loop, bio->bi_next is NULL (as all callers
     * ensure that) so we have a list with a single bio.
     * We pretend that we have just taken it off a longer list, so
     * we assign bio_list to a pointer to the bio_list_on_stack,
     * thus initialising the bio_list of new bios to be
     * added.  ->make_request() may indeed add some more bios
     * through a recursive call to generic_make_request.  If it
     * did, we find a non-NULL value in bio_list and re-enter the loop
     * from the top.  In this case we really did just take the bio
     * of the top of the list (no pretending) and so remove it from
     * bio_list, and call into ->make_request() again.
     */
    BUG_ON(bio->bi_next);
   //otherwise this call becomes the active one: process bios from an on-stack bio_list
    bio_list_init(&bio_list_on_stack);
    current->bio_list = &bio_list_on_stack;
    do {
       //get the request queue of the block device this bio targets
        struct request_queue *q = bdev_get_queue(bio->bi_bdev);
       //call the queue's make_request_fn; for an ordinary disk this is usually blk_queue_bio()
        q->make_request_fn(q, bio);

        bio = bio_list_pop(current->bio_list);
    } while (bio);
    current->bio_list = NULL;/* deactivate */
}

generic_make_request() converts bios into request objects and hangs them on the appropriate request queue; several bios can usually be merged into one request. For an ordinary disk, the request queue's make_request_fn() points to the generic blk_queue_bio() provided by the kernel. That function checks whether the bio can be merged, applies various optimizations, and looks at the number of outstanding I/O requests to decide whether to create a new request or add the bio to an existing one; it then decides whether to put the new request on the per-task plug list (current->plug) or to call __blk_run_queue() directly to hand the request to the driver. The code is as follows:

//linux/block/blk-core.c

void blk_queue_bio(struct request_queue *q, struct bio *bio)
{
    const bool sync = !!(bio->bi_rw & REQ_SYNC);
    struct blk_plug *plug;
    int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
    struct request *req;
    unsigned int request_count = 0;

   /*
     * low level driver can indicate that it wants pages above a
     * certain limit bounced to low memory (ie for highmem, or even
     * ISA dma in theory)
     */
    blk_queue_bounce(q, &bio);

    if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
        bio_endio(bio, -EIO);
        return;
    }

    if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
        spin_lock_irq(q->queue_lock);
        where = ELEVATOR_INSERT_FLUSH;
        goto get_rq;
    }

   /*
     * Check if we can merge with the plugged list before grabbing
     * any locks.
     */
    if (blk_attempt_plug_merge(q, bio, &request_count))
        return;

    spin_lock_irq(q->queue_lock);

   //try to merge the bio into an existing request via the elevator (I/O scheduler)
    el_ret = elv_merge(q, &req, bio);
    if (el_ret == ELEVATOR_BACK_MERGE) {
        if (bio_attempt_back_merge(q, req, bio)) {
            elv_bio_merged(q, req, bio);
            if (!attempt_back_merge(q, req))
                elv_merged_request(q, req, el_ret);
            goto out_unlock;
        }
    } else if (el_ret == ELEVATOR_FRONT_MERGE) {
        if (bio_attempt_front_merge(q, req, bio)) {
            elv_bio_merged(q, req, bio);
            if (!attempt_front_merge(q, req))
                elv_merged_request(q, req, el_ret);
            goto out_unlock;
        }
    }

get_rq:   //could not merge: allocate a new request for this bio
   /*
     * This sync check and mask will be re-done in init_request_from_bio(),
     * but we need to set it earlier to expose the sync flag to the
     * rq allocator and io schedulers.
     */
    rw_flags = bio_data_dir(bio);
    if (sync)
        rw_flags |= REQ_SYNC;

   /*
     * Grab a free request. This is might sleep but can not fail.
     * Returns with the queue unlocked.
     */
   //grab a free request object; this may sleep but does not fail
    req = get_request(q, rw_flags, bio, GFP_NOIO);
    if (unlikely(!req)) {
        bio_endio(bio, -ENODEV);   /* @q is dead */
        goto out_unlock;
    }

   /*
     * After dropping the lock and possibly sleeping here, our request
     * may now be mergeable after it had proven unmergeable (above).
     * We don't worry about that case for efficiency. It won't happen
     * often, and the elevators are able to handle it.
     */
    init_request_from_bio(req, bio);

    if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags))
        req->cpu = raw_smp_processor_id();

   //task_struct->plug is the per-task plug list used by the block layer's plugging mechanism to batch I/O submission
    plug = current->plug;
    if (plug) {    //plugging is active: put the request on the plug list instead of dispatching it immediately
       /*
         * If this is the first request added after a plug, fire
         * of a plug trace.
         */
        if (!request_count)
            trace_block_plug(q);
        else {
            if (request_count >= BLK_MAX_REQUEST_COUNT) {
               //too many plugged requests: flush them to the request_queue now
                blk_flush_plug_list(plug, false);
                trace_block_plug(q);
            }
        }
        list_add_tail(&req->queuelist, &plug->list);
        blk_account_io_start(req, true);
    } else {    //no plug list: insert the request into the queue and run it right away
        spin_lock_irq(q->queue_lock);
       //add the request to the request_queue (through the elevator)
        add_acct_request(q, req, where);
       //invoke the queue's request_fn() so the driver processes the queue
        __blk_run_queue(q);
out_unlock:
        spin_unlock_irq(q->queue_lock);
    }
}

blk_flush_plug_list() moves the requests on the plug list to the dispatch queue and the disk device's request queue, ready to be handed to the driver. The call chain is blk_flush_plug_list() -> queue_unplugged() -> __blk_run_queue(), so __blk_run_queue() is again called at the end to process the request objects.
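
queue_unplugged() itself is short; roughly (an abridged sketch of this era's linux/block/blk-core.c, not a verbatim listing), it either runs the queue directly or defers the run to the kblockd_workqueue mentioned below:

//linux/block/blk-core.c (abridged sketch)
static void queue_unplugged(struct request_queue *q, unsigned int depth,
                bool from_schedule)
{
    trace_block_unplug(q, depth, !from_schedule);

    if (from_schedule)
        blk_run_queue_async(q);   //defer: kblockd_workqueue will call __blk_run_queue() later
    else
        __blk_run_queue(q);       //run the queue (and the driver's request_fn) right now
    spin_unlock(q->queue_lock);
}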

__blk_run_queue() submits the requests on the request queue to the driver. It can be reached in two ways: blk_queue_bio() calls __blk_run_queue() directly after creating a request, or it is invoked indirectly through the kblockd_workqueue work queue. The code of __blk_run_queue() is as follows:

//linux/block/blk-core.c

void __blk_run_queue(struct request_queue *q)
{
    if (unlikely(blk_queue_stopped(q)))
        return;

    __blk_run_queue_uncond(q);
}

inline void __blk_run_queue_uncond(struct request_queue *q)
{
    if (unlikely(blk_queue_dead(q)))
        return;

   /*
     * Some request_fn implementations, e.g. scsi_request_fn(), unlock
     * the queue lock internally. As a result multiple threads may be
     * running such a request function concurrently. Keep track of the
     * number of active request_fn invocations such that blk_drain_queue()
     * can wait until all these request_fn calls have finished.
     */
    q->request_fn_active++;
   //call the driver's request_fn; for the old IDE hd driver this is do_hd_request()
    q->request_fn(q);
    q->request_fn_active--;
}

do_hd_request() is the request_fn() used by the (old) IDE hard disk driver. It fetches a request from the global hd_queue, sets the interrupt handler, read_intr() or write_intr(), according to whether the request is a read or a write, and then uses hd_out() to send the command to the hard disk controller and returns. At this point the submission side of the file-read path is finished; all that remains is to wait for the disk interrupt and its handler. The code of do_hd_request() is as follows:

//linux/drivers/block/hd.c
static void do_hd_request(struct request_queue *q)
{
    hd_request();
}

static void hd_request(void)
{
    unsigned int block, nsect, sec, track, head, cyl;
    struct hd_i_struct *disk;
    struct request *req;

    if (do_hd)
        return;
repeat:              //retry point after an error or after finishing the current request
    del_timer(&device_timer);

    if (!hd_req) {
        hd_req = blk_fetch_request(hd_queue);   //fetch the next request from hd_queue
        if (!hd_req) {
            do_hd = NULL;
            return;
        }
    }
    req = hd_req;

    if (reset) {
        reset_hd();
        return;
    }
    disk = req->rq_disk->private_data;
    block = blk_rq_pos(req);
    nsect = blk_rq_sectors(req);
    if (block >= get_capacity(req->rq_disk) ||
        ((block+nsect) > get_capacity(req->rq_disk))) {
        printk("%s: bad access: block=%d, count=%d\n",
            req->rq_disk->disk_name, block, nsect);
        hd_end_request_cur(-EIO);
        goto repeat;
    }

    if (disk->special_op) {
        if (do_special_op(disk, req))
            goto repeat;
        return;
    }
    sec   = block % disk->sect + 1;
    track = block/disk->sect;
    head  = track % disk->head;
    cyl   = track/disk->head;
#ifdef DEBUG
    printk("%s: %sing: CHS=%d/%d/%d, sectors=%d, buffer=%p\n",
        req->rq_disk->disk_name,
        req_data_dir(req) == READ ? "read" : "writ",
        cyl, head, sec, nsect, req->buffer);
#endif
    if (req->cmd_type == REQ_TYPE_FS) {
        switch (rq_data_dir(req)) {
        case READ:
           //hd_out() uses SET_HANDLER to point do_hd at read_intr, then issues the read command to the controller
            hd_out(disk, nsect, sec, head, cyl, ATA_CMD_PIO_READ,
                &read_intr);
            if (reset)
                goto repeat;
            break;
        case WRITE:
            hd_out(disk, nsect, sec, head, cyl, ATA_CMD_PIO_WRITE,
                &write_intr);
            if (reset)
                goto repeat;
            if (wait_DRQ()) {
                bad_rw_intr();
                goto repeat;
            }
            outsw(HD_DATA, req->buffer, 256);
            break;
        default:
            printk("unknown hd-command\n");
            hd_end_request_cur(-EIO);
            break;
        }
    }
}

The hard disk interrupt handler is hd_interrupt(). Because the pending request may be either a read or a write, the handler that should run differs: before starting the disk operation, the global variable do_hd must be set to point to read_intr() or write_intr() so that the interrupt can be handled correctly when it arrives. This also means hard disk requests are processed serially, i.e. only one request is in flight at a time. The code of hd_interrupt() is as follows:

//linux/drivers/block/hd.c
static irqreturn_t hd_interrupt(int irq, void *dev_id)
{
    void (*handler)(void) = do_hd;    //do_hd is NULL, read_intr or write_intr depending on the pending request

    spin_lock(hd_queue->queue_lock);

    do_hd = NULL;
    del_timer(&device_timer);      //cancel the request timeout timer
    if (!handler)
        handler = unexpected_hd_interrupt;
    handler();    //run read_intr() or write_intr() (or the unexpected-interrupt handler)

    spin_unlock(hd_queue->queue_lock);

    return IRQ_HANDLED;  //report the interrupt as handled
}

Since a read request sets do_hd to read_intr, take a look at the code of read_intr():

//linux/drivers/block/hd.c
static void read_intr(void)
{
    struct request *req;
    int i, retries = 100000;

    do {
        i = (unsigned) inb_p(HD_STATUS);   //read the controller status register
        if (i & BUSY_STAT)
            continue;
        if (!OK_STATUS(i))
            break;
        if (i & DRQ_STAT)
            goto ok_to_read;               //data is ready to transfer
    } while (--retries > 0);               //poll the status, giving up after retries attempts
    dump_status("read_intr", i);           //log the bad status
    bad_rw_intr();
    hd_request();                          //move on to the next request via hd_request()
    return;

ok_to_read:
    req = hd_req;
    insw(HD_DATA, req->buffer, 256);       //read 256 16-bit words (512 bytes, one sector) from the data port
#ifdef DEBUG
    printk("%s: read: sector %ld, remaining = %u, buffer=%p\n",
           req->rq_disk->disk_name, blk_rq_pos(req) + 1,
           blk_rq_sectors(req) - 1, req->buffer+512);
#endif
    if (hd_end_request(0, 512)) {          //complete 512 bytes; returns true while sectors remain, and when the whole request finishes the bio callback mpage_end_io() (set in mpage_bio_submit()) runs
        SET_HANDLER(&read_intr);
        return;
    }

    (void) inb_p(HD_STATUS);
#if (HD_DELAY > 0)
    last_req = read_timer();
#endif
    hd_request();
}

It is worth noting that hd_end_request() is where the original bio's completion callback, mpage_end_io(), gets executed. As described earlier, mpage_end_io() walks every vector of the bio and, if the corresponding page now holds valid data, marks the page up to date and unlocks it. The unlock wakes up any process blocked on that page. Recall do_generic_file_read(): once the reader is woken, it re-checks whether the page is up to date, and if so it continues on and finally copies the data to the user buffer.
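
The wakeup comes from unlock_page(); in this era it does roughly the following (an abridged sketch of linux/mm/filemap.c, not a verbatim listing): it clears PG_locked and wakes every task sleeping on that page's bit waitqueue, such as a reader blocked in lock_page_killable():

//linux/mm/filemap.c (abridged sketch)
void unlock_page(struct page *page)
{
    clear_bit_unlock(PG_locked, &page->flags);  //drop the page lock
    smp_mb__after_clear_bit();                  //make the clear visible before waking waiters
    wake_up_page(page, PG_locked);              //wake tasks waiting on this page's PG_locked bit
}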

This completes the analysis of the read path, from the system call down to the disk driver, and back from the disk interrupt handler through the kernel until the data is returned to the user. Since this article focuses on the call path itself, the plugging mechanism, disk block number calculation, I/O scheduling algorithms and so on are beyond its scope.