diff -Nurb ./b47/hsfs_diffs_b47__0_ORIG/usr/src/uts/common/fs/hsfs/hsfs_node.c ./b47/hsfs_diffs_b47__1_NEW_Moinaks_hsfs/usr/src/uts/common/fs/hsfs/hsfs_node.c --- ./b47/hsfs_diffs_b47__0_ORIG/usr/src/uts/common/fs/hsfs/hsfs_node.c 2006-09-08 15:02:33.000000000 +0200 +++ ./b47/hsfs_diffs_b47__1_NEW_Moinaks_hsfs/usr/src/uts/common/fs/hsfs/hsfs_node.c 2006-09-08 15:02:45.000000000 +0200 @@ -549,6 +549,9 @@ hp->hs_dir_off = off; hp->hs_nodeid = nodeid; hp->hs_seq = 0; + hp->hs_prev_offset = 0; + hp->hs_num_contig = 0; + hp->hs_ra_bytes = 0; hp->hs_flags = HREF; if (off > HS_SECTOR_SIZE) cmn_err(CE_WARN, "hs_makenode: bad offset"); diff -Nurb ./b47/hsfs_diffs_b47__0_ORIG/usr/src/uts/common/fs/hsfs/hsfs_vfsops.c ./b47/hsfs_diffs_b47__1_NEW_Moinaks_hsfs/usr/src/uts/common/fs/hsfs/hsfs_vfsops.c --- ./b47/hsfs_diffs_b47__0_ORIG/usr/src/uts/common/fs/hsfs/hsfs_vfsops.c 2006-09-08 15:03:26.000000000 +0200 +++ ./b47/hsfs_diffs_b47__1_NEW_Moinaks_hsfs/usr/src/uts/common/fs/hsfs/hsfs_vfsops.c 2006-09-08 15:03:37.000000000 +0200 @@ -91,6 +91,8 @@ #define HOPT_NRR "nrr" #define HOPT_RR "rr" #define HOPT_RO MNTOPT_RO +#define HOPT_SCHEDIO "schedio" +#define HOPT_NSCHEDIO "noschedio" static char *global_cancel[] = { HOPT_NOGLOBAL, NULL }; static char *noglobal_cancel[] = { HOPT_GLOBAL, NULL }; @@ -101,6 +103,8 @@ static char *nrr_cancel[] = { HOPT_RR, NULL }; static char *trail_cancel[] = { HOPT_NOTRAILDOT, NULL }; static char *notrail_cancel[] = { HOPT_TRAILDOT, NULL }; +static char *schedio_cancel[] = { HOPT_NSCHEDIO, NULL }; +static char *noschedio_cancel[] = { HOPT_SCHEDIO, NULL }; static mntopt_t hsfs_options[] = { { HOPT_GLOBAL, global_cancel, NULL, 0, NULL }, @@ -112,6 +116,8 @@ { HOPT_NRR, nrr_cancel, NULL, 0, NULL }, { HOPT_TRAILDOT, trail_cancel, NULL, MO_DEFAULT, NULL }, { HOPT_NOTRAILDOT, notrail_cancel, NULL, 0, NULL }, + { HOPT_SCHEDIO, schedio_cancel, NULL, 0, NULL }, + { HOPT_NSCHEDIO, noschedio_cancel, NULL, MO_DEFAULT, NULL }, }; static mntopts_t hsfs_proto_opttbl 
= { @@ -167,6 +173,9 @@ uid_t hsfs_default_uid = 0; gid_t hsfs_default_gid = 3; +extern void hsched_init(struct hsfs *fsp, int fsid, struct modlinkage *modlinkage); +extern void hsched_fini(struct hsfs_queue *hqueue); + static int hsfs_mount(struct vfs *vfsp, struct vnode *mvp, struct mounta *uap, struct cred *cr); static int hsfs_unmount(struct vfs *vfsp, int, struct cred *cr); @@ -271,6 +280,8 @@ flags |= HSFSMNT_NOTRAILDOT; if (vfs_optionisset(vfsp, HOPT_NRR, NULL)) flags |= HSFSMNT_NORRIP; + if (vfs_optionisset(vfsp, HOPT_SCHEDIO, NULL)) + flags |= HSFSMNT_SCHEDIO; error = pn_get(uap->dir, (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, &dpn); @@ -358,6 +369,7 @@ if (fsp->hsfs_fsmnt != NULL) kmem_free(fsp->hsfs_fsmnt, strlen(fsp->hsfs_fsmnt) + 1); + hsched_fini(fsp->hqueue); mutex_destroy(&fsp->hsfs_free_lock); rw_destroy(&fsp->hsfs_hash_lock); @@ -672,6 +684,17 @@ fsp->hsfs_flags = mount_flags; + /* + * Setup I/O Scheduling structures if requested + */ + if (mount_flags & HSFSMNT_SCHEDIO) { + fsp->hqueue = kmem_alloc(sizeof(struct hsfs_queue), KM_SLEEP); + hsched_init(fsp, fsid, &modlinkage); + + } else { + fsp->hqueue = NULL; + } + /* set the magic word */ fsp->hsfs_magic = HSFS_MAGIC; mutex_exit(&hs_mounttab_lock); diff -Nurb ./b47/hsfs_diffs_b47__0_ORIG/usr/src/uts/common/fs/hsfs/hsfs_vnops.c ./b47/hsfs_diffs_b47__1_NEW_Moinaks_hsfs/usr/src/uts/common/fs/hsfs/hsfs_vnops.c --- ./b47/hsfs_diffs_b47__0_ORIG/usr/src/uts/common/fs/hsfs/hsfs_vnops.c 2006-09-08 15:04:06.000000000 +0200 +++ ./b47/hsfs_diffs_b47__1_NEW_Moinaks_hsfs/usr/src/uts/common/fs/hsfs/hsfs_vnops.c 2006-09-08 15:04:18.000000000 +0200 @@ -61,6 +61,7 @@ #include #include #include +#include #include #include @@ -73,6 +74,13 @@ #include #include #include +#include +#include + +/* + * For struct modlinkage + */ +#include #include #include @@ -82,6 +90,16 @@ #include +/* # of contiguous requests to detect sequential access pattern */ +static int seq_contig_requests = 4; + +static int 
hsfs_taskq_nthreads = 4; /* # of taskq threads per fs */ + +static int hsched_deadline_compare(const void *x1, const void *x2); +static int hsched_offset_compare(const void *x1, const void *x2); +static void hsched_enqueue_io(struct hsfs_queue *hqueue, struct hio *hsio); +static void hsched_invoke_strategy(void *arg); + /* ARGSUSED */ static int hsfs_fsync(vnode_t *cp, int syncflag, cred_t *cred) @@ -615,6 +633,337 @@ return (0); } + /* + * The taskq thread that invokes the scheduling function to ensure + * that all readaheads are complete and cleans up the associated + * memory and releases the page lock. + */ + static void + hsfs_ra_task(void *arg) + { + struct hio_info *info = arg; + uint_t count; + + ASSERT(info->pp != NULL); + + for (count = 0; count < info->bufsused; count++) { + while (sema_tryp(&(info->sema[count])) == 0) + hsched_invoke_strategy(info->hqueue); + sema_destroy(&(info->sema[count])); + sema_destroy(&(info->bufs[count].b_io)); + sema_destroy(&(info->bufs[count].b_sem)); + if (info->vas[count] != NULL) { + ppmapout(info->vas[count]); + } + } + kmem_free(info->vas, info->bufcnt * sizeof (caddr_t)); + kmem_free(info->bufs, info->bufcnt * sizeof (struct buf)); + kmem_free(info->sema, info->bufcnt * sizeof (ksema_t)); + + pvn_read_done(info->pp, 0); + kmem_free(info, sizeof (struct hio_info)); + } + + /* + * Submit asynchronous readahead requests to the I/O scheduler + * depending on the number of pages to read ahead. These requests + * are asynchronous to the calling thread but I/O requests issued + * subsequently by other threads with higher LBNs must wait for + * these readaheads to complete since we have a single ordered + * I/O pipeline. Thus these readaheads are semi-asynchronous. + * A TaskQ handles waiting for the readaheads to complete. + * + * This function is mostly a copy of hsfs_getapage but somewhat + * simpler. A readahead request is aborted if page allocation + * fails. 
+ */ + /*ARGSUSED*/ + static int + hsfs_getpage_ra( + struct vnode *vp, + u_offset_t off, + struct seg *seg, + caddr_t addr, + struct hsnode *hp, + struct hsfs *fsp, + int xarsiz, + offset_t bof, + int chunk_lbn_count, + int chunk_data_bytes) + { + struct buf *bufs; + caddr_t *vas; + caddr_t va; + struct page *pp, *searchp, *lastp; + struct vnode *devvp; + ulong_t byte_offset; + size_t io_len_tmp; + uint_t io_off, io_len; + uint_t xlen; + uint_t filsiz; + uint_t secsize; + uint_t bufcnt; + uint_t bufsused; + uint_t count; + uint_t io_end; + uint_t which_chunk_lbn; + uint_t offset_lbn; + uint_t offset_extra; + offset_t offset_bytes; + uint_t remaining_bytes; + uint_t extension; + int remainder; /* must be signed */ + diskaddr_t driver_block; + u_offset_t io_off_tmp; + ksema_t *fio_done; + struct hio_info *info; + int pgsize = PAGESIZE; + size_t len; + + ASSERT(fsp->hqueue != NULL); + + if (addr >= seg->s_base + seg->s_size) { + return (-1); + } + + devvp = fsp->hsfs_devvp; + secsize = fsp->hsfs_vol.lbn_size; /* bytes per logical block */ + + /* file data size */ + filsiz = hp->hs_dirent.ext_size; + + extension = 0; + pp = NULL; + + /* + * Update readahead counters + */ + if (hp->hs_num_contig < + (seq_contig_requests - 1)) { + hp->hs_num_contig++; + + } else { + /* + * We increase readahead quantum till + * a predefined max. max_readahead_bytes + * is a multiple of PAGESIZE. + */ + if (hp->hs_ra_bytes < + fsp->hqueue->max_ra_bytes) { + hp->hs_ra_bytes += pgsize; + } + extension += hp->hs_ra_bytes; + } + + if (extension != 0 && extension < filsiz - off) { + len = extension; + } else { + len = pgsize; + } + /* + * Some cd writers don't write sectors that aren't used. Also, + * there's no point in reading sectors we'll never look at. So, + * if we're asked to go beyond the end of a file, truncate to the + * length of that file. + * + * Additionally, this behaviour is required by section 6.4.5 of + * ISO 9660:1988(E). 
+ */ + if (len > (filsiz - off)) { + len = filsiz - off; + } + + /* A little paranoia */ + if (len <= 0) + return (-1); + + /* + * After all that, make sure we're asking for things in units + * that bdev_strategy() will understand (see bug 4202551). + */ + len = roundup(len, DEV_BSIZE); + + hp->hs_prev_offset = off + len; + + pp = pvn_read_kluster(vp, off, seg, addr, &io_off_tmp, + &io_len_tmp, off, len, 1); + + if (pp == NULL) { + hp->hs_num_contig = 0; + hp->hs_ra_bytes = 0; + hp->hs_prev_offset = 0; + return (-1); + } + + io_off = (uint_t)io_off_tmp; + io_len = (uint_t)io_len_tmp; + + /* check for truncation */ + /* + * xxx Clean up and return EIO instead? + * xxx Ought to go to u_offset_t for everything, but we + * xxx call lots of things that want uint_t arguments. + */ + ASSERT(io_off == io_off_tmp); + + /* + * get enough buffers for worst-case scenario + * (i.e., no coalescing possible). + */ + bufcnt = (len + secsize - 1) / secsize; + bufs = kmem_zalloc(bufcnt * sizeof (struct buf), KM_SLEEP); + vas = kmem_alloc(bufcnt * sizeof (caddr_t), KM_SLEEP); + + /* + * Allocate a array of semaphores since we are doing I/O + * scheduling. + */ + fio_done = kmem_alloc(bufcnt * sizeof (ksema_t), KM_SLEEP); + + /* + * If our filesize is not an integer multiple of PAGESIZE, + * we zero that part of the last page that's between EOF and + * the PAGESIZE boundary. 
+ */ + xlen = io_len & PAGEOFFSET; + if (xlen != 0) + pagezero(pp->p_prev, xlen, pgsize - xlen); + + va = NULL; + lastp = NULL; + searchp = pp; + io_end = io_off + io_len; + for (count = 0, byte_offset = io_off; + byte_offset < io_end; + count++) { + ASSERT(count < bufcnt); + + bufs[count].b_edev = devvp->v_rdev; + bufs[count].b_dev = cmpdev(devvp->v_rdev); + bufs[count].b_flags = B_NOCACHE|B_BUSY|B_READ; + bufs[count].b_iodone = hsfs_iodone; + bufs[count].b_vp = vp; + bufs[count].b_file = vp; + sema_init(&bufs[count].b_io, 0, NULL, SEMA_DEFAULT, NULL); + sema_init(&bufs[count].b_sem, 0, NULL, SEMA_DEFAULT, NULL); + + /* Compute disk address for interleaving. */ + + /* considered without skips */ + which_chunk_lbn = byte_offset / chunk_data_bytes; + + /* factor in skips */ + offset_lbn = which_chunk_lbn * chunk_lbn_count; + + /* convert to physical byte offset for lbn */ + offset_bytes = LBN_TO_BYTE(offset_lbn, vp->v_vfsp); + + /* don't forget offset into lbn */ + offset_extra = byte_offset % chunk_data_bytes; + + /* get virtual block number for driver */ + driver_block = lbtodb(bof + xarsiz + + offset_bytes + offset_extra); + + if (lastp != searchp) { + /* this branch taken first time through loop */ + va = vas[count] = ppmapin(searchp, PROT_WRITE, + (caddr_t)-1); + /* ppmapin() guarantees not to return NULL */ + } else { + vas[count] = NULL; + } + + bufs[count].b_un.b_addr = va + byte_offset % pgsize; + bufs[count].b_offset = + (offset_t)(byte_offset - io_off + off); + + /* + * We specifically use the b_lblkno member here + * as even in the 32 bit world driver_block can + * get very large in line with the ISO9660 spec. + */ + + bufs[count].b_lblkno = driver_block; + + remaining_bytes = ((which_chunk_lbn + 1) + * chunk_data_bytes) + - byte_offset; + + /* + * remaining_bytes can't be zero, as we derived + * which_chunk_lbn directly from byte_offset. 
+ */ + if ((remaining_bytes + byte_offset) < (off + len)) { + /* coalesce-read the rest of the chunk */ + bufs[count].b_bcount = remaining_bytes; + } else { + /* get the final bits */ + bufs[count].b_bcount = off + len - byte_offset; + } + + remainder = pgsize - (byte_offset % pgsize); + if (bufs[count].b_bcount > remainder) { + bufs[count].b_bcount = remainder; + } + + bufs[count].b_bufsize = bufs[count].b_bcount; + if (((offset_t)byte_offset + bufs[count].b_bcount) > + HS_MAXFILEOFF) { + break; + } + byte_offset += bufs[count].b_bcount; + + /* + * We are scheduling I/O so we need to enqueue + * requests rather than calling bdev_strategy + * here. A later invocation of the scheduling + * function will take care of doing the actual + * I/O as it selects requests from the queue as + * per the scheduling logic. + */ + struct hio *hsio = kmem_alloc(sizeof(struct hio), + KM_SLEEP); + + sema_init(&fio_done[count], 0, NULL, + SEMA_DEFAULT, NULL); + hsio->bp = &bufs[count]; + hsio->sema = &fio_done[count]; + hsio->nblocks = howmany(hsio->bp->b_bcount, + DEV_BSIZE); + + /* used for deadline */ + hsio->io_timestamp = lbolt64; + + /* for I/O coalescing */ + hsio->contig_chain = NULL; + hsched_enqueue_io(fsp->hqueue, hsio); + + lwp_stat_update(LWP_STAT_INBLK, 1); + lastp = searchp; + if ((remainder - bufs[count].b_bcount) < 1) { + searchp = searchp->p_next; + } + } + + bufsused = count; + info = kmem_alloc(sizeof (struct hio_info), KM_SLEEP); + info->bufs = bufs; + info->vas = vas; + info->sema = fio_done; + info->bufsused = bufsused; + info->bufcnt = bufcnt; + info->hqueue = fsp->hqueue; + info->pp = pp; + + /* + * Hand the request set to the readahead taskq; hsfs_ra_task + * waits for the I/O to finish and then frees everything in + * info. taskq_dispatch() takes TQ_* flags, not KM_* flags. + */ + (void) taskq_dispatch(fsp->hqueue->ra_task, + hsfs_ra_task, info, TQ_SLEEP); + /* + * The I/O locked pages are unlocked in our taskq thread. + */ + return (0); + } + /* * Each file may have a different interleaving on disk. This makes * things somewhat interesting. 
The gist is that there are some @@ -677,6 +1026,9 @@ int xarsiz; diskaddr_t driver_block; u_offset_t io_off_tmp; + ksema_t *fio_done; + int calcdone; + int pgsize = PAGESIZE; /* * We don't support asynchronous operation at the moment, so @@ -722,6 +1074,7 @@ reread: err = 0; pagefound = 0; + calcdone = 0; /* * Do some read-ahead. This mostly saves us a bit of @@ -768,14 +1121,94 @@ again: /* search for page in buffer */ if ((pagefound = page_exists(vp, off)) == 0) { + /* * Need to really do disk IO to get the page. */ + if (calcdone) + goto try_kluster; + /* + * Initial synchronous read-ahead if schedio is enabled. + * This attempts to detect sequential access pattern and + * reads extra pages. + * Once the page cache is populated and cache hits occur + * semi-asynchronous readahead is used instead. + */ + if (fsp->hqueue != NULL) { + if (hp->hs_prev_offset == off) { + if (hp->hs_num_contig < + (seq_contig_requests - 1)) { + hp->hs_num_contig++; + + } else { + /* + * We increase readahead quantum till + * a predefined max. max_readahead_bytes + * is a multiple of PAGESIZE. + */ + if (hp->hs_ra_bytes < + fsp->hqueue->max_ra_bytes) { + hp->hs_ra_bytes += + pgsize; + } + extension += hp->hs_ra_bytes; + } + } else { + /* + * Not contiguous so reset read ahead counters. + */ + hp->hs_num_contig = 0; + hp->hs_ra_bytes = 0; + hp->hs_prev_offset = 0; + } + } + + if (extension != 0 && extension < filsiz - off) { + len = extension; + } else { + len = pgsize; + } + /* + * Some cd writers don't write sectors that aren't used. Also, + * there's no point in reading sectors we'll never look at. So, + * if we're asked to go beyond the end of a file, truncate to the + * length of that file. + * + * Additionally, this behaviour is required by section 6.4.5 of + * ISO 9660:1988(E). + */ + if (len > (filsiz - off)) { + len = filsiz - off; + } + + /* A little paranoia. 
*/ + ASSERT(len > 0); + + /* + * After all that, make sure we're asking for things in units + * that bdev_strategy() will understand (see bug 4202551). + */ + len = roundup(len, DEV_BSIZE); + + if (fsp->hqueue != NULL) { + hp->hs_prev_offset = off + len; + } + + calcdone = 1; + + try_kluster: pp = pvn_read_kluster(vp, off, seg, addr, &io_off_tmp, &io_len_tmp, off, len, 0); - if (pp == NULL) + if (pp == NULL) { + /* + * Pressure on memory, roll back readahead + */ + hp->hs_num_contig = 0; + hp->hs_ra_bytes = 0; + hp->hs_prev_offset = 0; goto again; + } io_off = (uint_t)io_off_tmp; io_len = (uint_t)io_len_tmp; @@ -795,6 +1228,15 @@ bufcnt = (len + secsize - 1) / secsize; bufs = kmem_zalloc(bufcnt * sizeof (struct buf), KM_SLEEP); vas = kmem_alloc(bufcnt * sizeof (caddr_t), KM_SLEEP); + + /* + * Allocate a array of semaphores if we are doing I/O + * scheduling. + */ + if (fsp->hqueue != NULL) + fio_done = kmem_alloc(bufcnt * sizeof (ksema_t), + KM_SLEEP); + for (count = 0; count < bufcnt; count++) { bufs[count].b_edev = devvp->v_rdev; bufs[count].b_dev = cmpdev(devvp->v_rdev); @@ -815,7 +1257,12 @@ */ xlen = io_len & PAGEOFFSET; if (xlen != 0) +/* pagezero(pp->p_prev, xlen, PAGESIZE - xlen); +*/ + + pagezero(pp->p_prev, xlen, pgsize - xlen); + va = NULL; lastp = NULL; @@ -854,7 +1301,12 @@ vas[count] = NULL; } +/* bufs[count].b_un.b_addr = va + byte_offset % PAGESIZE; +*/ + + bufs[count].b_un.b_addr = va + byte_offset % pgsize; + bufs[count].b_offset = (offset_t)(byte_offset - io_off + off); @@ -896,7 +1348,12 @@ * Interleaving violates that assumption. 
*/ +/* remainder = PAGESIZE - (byte_offset % PAGESIZE); +*/ + + remainder = pgsize - (byte_offset % PAGESIZE); + if (bufs[count].b_bcount > remainder) { bufs[count].b_bcount = remainder; } @@ -908,7 +1365,41 @@ } byte_offset += bufs[count].b_bcount; +/* (void) bdev_strategy(&bufs[count]); +*/ + + if (fsp->hqueue == NULL) { + (void) bdev_strategy(&bufs[count]); + + } else { + /* + * We are scheduling I/O so we need to enqueue + * requests rather than calling bdev_strategy + * here. A later invocation of the scheduling + * function will take care of doing the actual + * I/O as it selects requests from the queue as + * per the scheduling logic. + */ + struct hio *hsio = kmem_alloc( + sizeof(struct hio), + KM_SLEEP); + + sema_init(&fio_done[count], 0, NULL, + SEMA_DEFAULT, NULL); + hsio->bp = &bufs[count]; + hsio->sema = &fio_done[count]; + hsio->nblocks = howmany(hsio->bp->b_bcount, + DEV_BSIZE); + + /* used for deadline */ + hsio->io_timestamp = lbolt64; + + /* for I/O coalescing */ + hsio->contig_chain = NULL; + hsched_enqueue_io(fsp->hqueue, hsio); + } + lwp_stat_update(LWP_STAT_INBLK, 1); lastp = searchp; @@ -918,6 +1409,7 @@ } bufsused = count; + if (fsp->hqueue == NULL) { /* Now wait for everything to come in */ for (count = 0; count < bufsused; count++) { if (err == 0) { @@ -925,6 +1417,25 @@ } else (void) biowait(&bufs[count]); } + } else { + /* Now wait for everything to come in. + */ + for (count = 0; count < bufsused; count++) { + + /* + * Invoke scheduling function till our buf + * is processed. In doing this it might + * process bufs enqueued by other threads + * which is good. 
+ */ + while (sema_tryp(&fio_done[count]) == 0) + hsched_invoke_strategy(fsp->hqueue); + sema_destroy(&fio_done[count]); + + if (err == 0) + err = geterror(&bufs[count]); + } + } /* Don't leak resources */ for (count = 0; count < bufcnt; count++) { @@ -937,6 +1448,8 @@ kmem_free(vas, bufcnt * sizeof (caddr_t)); kmem_free(bufs, bufcnt * sizeof (struct buf)); + if (fsp->hqueue != NULL) + kmem_free(fio_done, bufcnt * sizeof (ksema_t)); } if (err) { @@ -969,10 +1482,11 @@ * Try to lock the next page, if it exists, without * blocking. */ - plsz -= PAGESIZE; + plsz -= pgsize; + /* LINTED (plsz is unsigned) */ - for (soff = off + PAGESIZE; plsz > 0; - soff += PAGESIZE, plsz -= PAGESIZE) { + for (soff = off + pgsize; plsz > 0; + soff += pgsize, plsz -= pgsize) { pp = page_lookup_nowait(vp, (u_offset_t)soff, SE_SHARED); if (pp == NULL) @@ -980,6 +1494,22 @@ pl[index++] = pp; } pl[index] = NULL; + + /* + * Schedule a semi-asynchronous readahead if we are + * accessing the last cached page for the current + * file. 
+ */ + if (fsp->hqueue != NULL && + hp->hs_prev_offset - off == pgsize && + hp->hs_prev_offset < filsiz && + hp->hs_ra_bytes > 0 && + !page_exists(vp,hp->hs_prev_offset)) { + hsfs_getpage_ra(vp, hp->hs_prev_offset, seg, + addr + pgsize, hp, fsp, xarsiz, bof, + chunk_lbn_count, chunk_data_bytes); + } + return (0); } @@ -1301,6 +1831,318 @@ return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr)); } + int + hsched_deadline_compare(const void *x1, const void *x2) + { + const struct hio *h1 = x1; + const struct hio *h2 = x2; + + if (h1->io_timestamp < h2->io_timestamp) + return (-1); + if (h1->io_timestamp > h2->io_timestamp) + return (1); + + if (h1->bp->b_lblkno < h2->bp->b_lblkno) + return (-1); + if (h1->bp->b_lblkno > h2->bp->b_lblkno) + return (1); + + if (h1 < h2) + return (-1); + if (h1 > h2) + return (1); + + return (0); + } + + int + hsched_offset_compare(const void *x1, const void *x2) + { + const struct hio *h1 = x1; + const struct hio *h2 = x2; + + if (h1->bp->b_lblkno < h2->bp->b_lblkno) + return (-1); + if (h1->bp->b_lblkno > h2->bp->b_lblkno) + return (1); + + if (h1 < h2) + return (-1); + if (h1 > h2) + return (1); + + return (0); + } + + /* + * Initialize I/O scheduling structures. This is called via hsfs_mount + * if the schedio option is enabled. 
+ */ + void + hsched_init(struct hsfs *fsp, int fsid, struct modlinkage *modlinkage) + { + struct hsfs_queue *hqueue = fsp->hqueue; + struct vnode *vp = fsp->hsfs_devvp; + char namebuf[25]; + int error, err; + struct dk_cinfo info; + ldi_handle_t lh; + ldi_ident_t li; + + mutex_init(&(hqueue->hsfs_queue_lock), NULL, MUTEX_DEFAULT, NULL); + mutex_init(&(hqueue->strategy_lock), NULL, MUTEX_DEFAULT, NULL); + avl_create(&(hqueue->read_tree), hsched_offset_compare, + sizeof(struct hio), offsetof(struct hio, io_offset_node)); + avl_create(&(hqueue->deadline_tree), hsched_deadline_compare, + sizeof(struct hio), offsetof(struct hio, io_deadline_node)); + + (void) snprintf(namebuf, sizeof (namebuf), "hsched_task_%d", fsid); + hqueue->ra_task = taskq_create(namebuf, hsfs_taskq_nthreads, + minclsyspri, 1, 104857600 / PAGESIZE, 0); + + /* + * Default maxtransfer = 16k chunk + */ + hqueue->dev_maxtransfer = 16384; + + /* + * Try to fetch the maximum device transfer size. This is used to + * ensure that a coalesced block does not exceed the maxtransfer. + */ + err = ldi_ident_from_mod(modlinkage, &li); + if (err) { + cmn_err(CE_NOTE, "hsched_init: ldi_ident_from_mod err=%d\n", + err); + goto set_ra; + } + + err = ldi_open_by_dev(&(vp->v_rdev), OTYP_CHR, FREAD, CRED(), &lh, li); + ldi_ident_release(li); + if (err) { + cmn_err(CE_NOTE, "hsched_init: ldi_open err=%d\n", err); + goto set_ra; + } + + error = ldi_ioctl(lh, DKIOCINFO, (intptr_t )&info, FKIOCTL, + CRED(), &err); + err = ldi_close(lh ,FREAD ,CRED()); + if (err) { + cmn_err(CE_NOTE, "hsched_init: ldi_close err=%d\n", err); + } + + if (error == 0) { + hqueue->dev_maxtransfer = ldbtob(info.dki_maxtransfer); + cmn_err(CE_NOTE, "hsched_init: dev_maxtransfer = %u\n", info.dki_maxtransfer); + } + + set_ra: + /* + * Max size of data to read ahead for sequential access pattern. + * Conservative to avoid letting the underlying CD drive to spin + * down, in case the application is reading slowly. 
+ */ + if (PAGESIZE < 16384) { + hqueue->max_ra_bytes = PAGESIZE * 4; + } else { + hqueue->max_ra_bytes = PAGESIZE; + } + + hqueue->next = NULL; + } + + /* + * Tear down the I/O scheduling structures set up by hsched_init and + * release the queue itself. A NULL hqueue (schedio not enabled) is + * a no-op. The caller must not use hqueue after this returns. + */ + void + hsched_fini(struct hsfs_queue *hqueue) + { + if (hqueue != NULL) { + /* + * Destroy the taskq first: taskq_destroy() waits for + * pending readahead tasks, and those tasks still take + * the locks and walk the AVL trees destroyed below. + */ + taskq_destroy(hqueue->ra_task); + avl_destroy(&(hqueue->read_tree)); + avl_destroy(&(hqueue->deadline_tree)); + mutex_destroy(&(hqueue->hsfs_queue_lock)); + mutex_destroy(&(hqueue->strategy_lock)); + /* The queue was kmem_alloc'ed at mount time; don't leak it. */ + kmem_free(hqueue, sizeof (struct hsfs_queue)); + } + } + + #define IS_ADJACENT(io, nio) \ + (((io)->bp->b_lblkno + (io)->nblocks == (nio)->bp->b_lblkno) && \ + (io)->bp->b_edev == (nio)->bp->b_edev) + + /* + * This performs the actual I/O scheduling logic + */ + void + hsched_invoke_strategy(void *arg) + { + struct hsfs_queue *hqueue = arg; + struct buf *nbuf; + struct hio *fio, *nio, *tio; + size_t bsize, soffset, offset; + int err; + uint_t maxtran; + struct vnode *fvp; + + maxtran = hqueue->dev_maxtransfer; + + mutex_enter(&hqueue->strategy_lock); + mutex_enter(&hqueue->hsfs_queue_lock); + + /* + * Check for Deadline expiration first + */ + fio = avl_first(&hqueue->deadline_tree); + if (fio == NULL) { + mutex_exit(&hqueue->hsfs_queue_lock); + mutex_exit(&hqueue->strategy_lock); + return; + } + + if (lbolt64 - fio->io_timestamp >= HSFS_READ_DEADLINE && + hqueue->next != fio) { + bsize = fio->bp->b_bcount; + avl_remove(&hqueue->deadline_tree, fio); + avl_remove(&hqueue->read_tree, fio); + + } else { + /* + * Apply standard scheduling logic. This uses the + * C-LOOK approach. Process I/O requests in ascending + * order of logical block address till no subsequent + * higher numbered block request remains. Then start + * again from the lowest numbered block in the queue. + */ + if (hqueue->next == NULL) { + fio = avl_first(&hqueue->read_tree); + } else { + fio = hqueue->next; + } + + /* + * In addition we try to coalesce contiguous + * requests into one bigger request. 
+ */ + bsize = ldbtob(fio->nblocks); + fvp = fio->bp->b_file; + nio = AVL_NEXT(&hqueue->read_tree, fio); + tio = fio; + while (nio != NULL && IS_ADJACENT(tio, nio) && + bsize < maxtran) { + tio->contig_chain = nio; + bsize += ldbtob(nio->nblocks); + tio = nio; + if (fvp && tio->bp->b_file != fvp) + fvp = NULL; + + nio = AVL_NEXT(&hqueue->read_tree, nio); + avl_remove(&hqueue->deadline_tree, tio); + avl_remove(&hqueue->read_tree, tio); + } + + hqueue->next = nio; + avl_remove(&hqueue->deadline_tree, fio); + avl_remove(&hqueue->read_tree, fio); + } + mutex_exit(&hqueue->hsfs_queue_lock); + + if (bsize > fio->bp->b_bcount) { + /* + * We have coalesced blocks. First allocate mem and buf for + * the entire coalesced chunk. + */ + nbuf = kmem_zalloc(sizeof (struct buf), KM_SLEEP); + nbuf->b_offset = -1; + nbuf->b_edev = fio->bp->b_edev; + nbuf->b_dev = fio->bp->b_dev; + nbuf->b_flags = fio->bp->b_flags; + nbuf->b_iodone = fio->bp->b_iodone; + nbuf->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP); + nbuf->b_lblkno = fio->bp->b_lblkno; + nbuf->b_vp = fio->bp->b_vp; + nbuf->b_file = fvp; + nbuf->b_bcount = bsize; + nbuf->b_bufsize = bsize; + + sema_init(&nbuf->b_sem, 0, NULL, SEMA_DEFAULT, NULL); + sema_init(&nbuf->b_io, 0, NULL, SEMA_DEFAULT, NULL); + + /* + * Perform I/O for the coalesced block. + */ + (void) bdev_strategy(nbuf); + err = biowait(nbuf); + + nio = fio; + if (err == 0) { + /* + * Need to copy the data to the real bufs. + */ + soffset = ldbtob(fio->bp->b_lblkno); + while (nio != NULL) { + offset = ldbtob(nio->bp->b_lblkno) - soffset; + bcopy(nbuf->b_un.b_addr + offset, + nio->bp->b_un.b_addr, + nio->bp->b_bcount); + bioerror(nio->bp, err); + biodone(nio->bp); + sema_v(nio->sema); + tio = nio; + nio = nio->contig_chain; + kmem_free(tio, sizeof (struct hio)); + } + } else { + /* + * Error condition requires that we re-issue the + * individual I/O requests without coalescing since + * we do not know which buf will fail. 
Penalising + * all the requests for an individual failure does + * not seem fair. + */ + tio = nio; + while (tio != NULL) { + (void) bdev_strategy(tio->bp); + tio = tio->contig_chain; + } + while (nio != NULL) { + (void) biowait(nio->bp); + sema_v(nio->sema); + tio = nio; + nio = nio->contig_chain; + kmem_free(tio, sizeof (struct hio)); + } + } + + kmem_free(nbuf->b_un.b_addr, bsize); + sema_destroy(&nbuf->b_sem); + sema_destroy(&nbuf->b_io); + kmem_free(nbuf, sizeof (struct buf)); + } else { + + /* + * No coalescing. Just perform the I/O + */ + (void) bdev_strategy(fio->bp); + (void) biowait(fio->bp); + sema_v(fio->sema); + kmem_free(fio, sizeof (struct hio)); + } + mutex_exit(&hqueue->strategy_lock); + } + + /* + * Insert an I/O request in the I/O scheduler's pipeline + * Using AVL tree makes it easy to reorder the I/O request + * based on logical block number. + */ + void + hsched_enqueue_io(struct hsfs_queue *hqueue, struct hio *hsio) + { + + mutex_enter(&hqueue->hsfs_queue_lock); + + avl_add(&hqueue->deadline_tree, hsio); + avl_add(&hqueue->read_tree, hsio); + + mutex_exit(&hqueue->hsfs_queue_lock); + } + const fs_operation_def_t hsfs_vnodeops_template[] = { VOPNAME_OPEN, hsfs_open, VOPNAME_CLOSE, hsfs_close, diff -Nurb ./b47/hsfs_diffs_b47__0_ORIG/usr/src/uts/common/sys/fs/hsfs_node.h ./b47/hsfs_diffs_b47__1_NEW_Moinaks_hsfs/usr/src/uts/common/sys/fs/hsfs_node.h --- ./b47/hsfs_diffs_b47__0_ORIG/usr/src/uts/common/sys/fs/hsfs_node.h 2006-09-08 15:05:59.000000000 +0200 +++ ./b47/hsfs_diffs_b47__1_NEW_Moinaks_hsfs/usr/src/uts/common/sys/fs/hsfs_node.h 2006-09-08 15:06:09.000000000 +0200 @@ -35,6 +35,8 @@ extern "C" { #endif +#include + struct hs_direntry { uint_t ext_lbn; /* LBN of start of extent */ uint_t ext_size; /* no. 
of data bytes in extent */ @@ -98,6 +100,9 @@ long hs_mapcnt; /* mappings to file pages */ uint_t hs_seq; /* sequence number */ uint_t hs_flags; /* (see below) */ + u_offset_t hs_prev_offset; /* Last read end offset (readahead) */ + int hs_num_contig; /* Count of contiguous reads */ + int hs_ra_bytes; /* Bytes to readahead */ kmutex_t hs_contents_lock; /* protects hsnode contents */ /* except hs_offset */ }; @@ -152,6 +157,51 @@ #define HS_HSNODESPACE 16384 /* approx. space used for hsnodes */ /* + * Hsfs I/O Scheduling parameters and data structures + */ + + /* + * Deadline for reads is set at 500ms. This is scaled for + * lbolt64 values, assuming the tick count advances once every 10ms. + */ + #define HSFS_READ_DEADLINE 50 + #define HSFSMNT_SCHEDIO 0x8 + #define HSFS_NORMAL 0x0 + #define HSFS_READAHEAD 0x1 + + /* One queued read request; linked into both scheduler AVL trees. */ + struct hio { + struct buf *bp; /* buf describing this request */ + struct hio *contig_chain; /* next request coalesced with this one */ + u_offset_t nblocks; /* request length in DEV_BSIZE blocks */ + uint64_t io_timestamp; /* lbolt64 at enqueue time (deadline) */ + ksema_t *sema; /* posted once the request has been processed */ + avl_node_t io_offset_node; /* linkage in read_tree */ + avl_node_t io_deadline_node; /* linkage in deadline_tree */ + }; + + /* State handed to hsfs_ra_task for one readahead request set. */ + struct hio_info { + struct buf *bufs; /* array of bufcnt bufs */ + caddr_t *vas; /* array of bufcnt ppmapin() addresses (or NULL) */ + ksema_t *sema; /* array of bufcnt completion semaphores */ + uint_t bufsused; /* number of bufs actually enqueued */ + uint_t bufcnt; /* number of array elements allocated */ + struct page *pp; /* page list passed to pvn_read_done() */ + struct hsfs_queue *hqueue; /* queue the requests were enqueued on */ + }; + + /* Per-mount I/O scheduler state (allocated only with schedio). */ + struct hsfs_queue { + struct hio *next; /* next request in C-LOOK order; NULL = restart */ + struct buf *nbuf; /* NOTE(review): appears unused - confirm */ + kmutex_t hsfs_queue_lock; /* held around AVL tree updates */ + kmutex_t strategy_lock; /* held for all of hsched_invoke_strategy() */ + avl_tree_t read_tree; /* requests ordered by block number */ + avl_tree_t deadline_tree; /* requests ordered by enqueue time */ + taskq_t *ra_task; /* taskq that runs hsfs_ra_task() */ + int max_ra_bytes; /* upper bound for a file's hs_ra_bytes */ + uint_t dev_maxtransfer; /* Device max transfer size in bytes */ + }; + + /* * High Sierra filesystem structure. * There is one of these for each mounted High Sierra filesystem. */ @@ -183,6 +233,7 @@ kmutex_t hsfs_free_lock; /* protects free list */ struct hsnode *hsfs_free_f; /* first entry of free list */ struct hsnode *hsfs_free_b; /* last entry of free list */ + struct hsfs_queue *hqueue; /* I/O Scheduling parameters */ }; /*