diff --git a/kernel.spec b/kernel.spec index 8c00a98..056ef68 100644 --- a/kernel.spec +++ b/kernel.spec @@ -23,7 +23,7 @@ Summary: The Linux kernel # # (Uncomment the '#' and both spaces below to set the buildid.) # -%define buildid .pnfs_all_2010_11_03 +%define buildid .pnfs_38_rc2_0100107 ################################################################### # The buildid can also be specified on the rpmbuild command line @@ -727,15 +727,7 @@ Patch12303: dmar-disable-when-ricoh-multifunction.patch Patch12305: xhci_hcd-suspend-resume.patch -Patch30000: pnfs-all-2.6.36-rc3-2010-08-30.patch -Patch30001: linux-2.6-pnfs-compile.patch -Patch30002: linux-2.6.35-inline.patch - -Patch30000: pnfs-all-latest.v2.6.36-rc6-f15.patch -Patch30001: linux-2.6-pnfs-compile.patch -Patch30002: linux-2.6.35-inline.patch - -Patch30000: pnfs-all-2.6.36-2010-11-03.patch +Patch30000: pnfs-all-2.6.38-rc2-2011-01-27.patch Patch30001: linux-2.6-pnfs-compile.patch Patch30002: linux-2.6.35-inline.patch @@ -1364,19 +1356,7 @@ ApplyPatch dmar-disable-when-ricoh-multifunction.patch ApplyPatch xhci_hcd-suspend-resume.patch -ApplyPatch pnfs-all-2.6.36-2010-11-03.patch -ApplyPatch linux-2.6-pnfs-compile.patch -ApplyPatch linux-2.6.35-inline.patch - -ApplyPatch pnfs-all-latest.v2.6.36-rc6-f15.patch -ApplyPatch linux-2.6-pnfs-compile.patch -ApplyPatch linux-2.6.35-inline.patch - -ApplyPatch pnfs-all-2.6.36-2010-11-03.patch -ApplyPatch linux-2.6-pnfs-compile.patch -ApplyPatch linux-2.6.35-inline.patch - -ApplyPatch pnfs-all-2.6.36-rc3-2010-08-30.patch +ApplyPatch pnfs-all-2.6.38-rc2-2011-01-27.patch ApplyPatch linux-2.6-pnfs-compile.patch ApplyPatch linux-2.6.35-inline.patch @@ -2166,6 +2146,9 @@ fi * Fri Jan 28 2011 Chuck Ebbert 2.6.38-0.rc2.git7.1 - Linux 2.6.38-rc2-git7 +* Fri Jan 28 2011 Steve Dickson 2.6.38-0.rc2.git5.1 +- Updated to the latest pNFS tag: pnfs-all-2.6.38-rc2-2011-01-27 + * Wed Jan 26 2011 Kyle McMartin 2.6.38-0.rc2.git5.1 - Linux 2.6.38-rc2-git5 - [x86] Re-enable 
TRANSPARENT_HUGEPAGE, should be fixed by cacf061c. @@ -2433,6 +2416,9 @@ fi * Mon Nov 08 2010 Kyle McMartin - Cherry-pick utrace-ptrace fixes from mayoung. Thanks! +* Mon Nov 08 2010 Steve Dickson +- Updated to the latest pNFS tag: pnfs-all-2.6.36-2010-11-03 + * Tue Nov 02 2010 Kyle McMartin 2.6.37-0.1.rc1.git0 - Linux 2.6.37-rc1 @@ -2443,9 +2429,6 @@ fi - Switch to tracking git snapshots of what will become 2.6.37. - Fix context rejects in utrace and a few other patches. -* Mon Nov 8 2010 Steve Dickson -- Updated to the latest pNFS tag: pnfs-all-2.6.36-2010-11-03 - * Wed Oct 20 2010 Chuck Ebbert 2.6.36-1 - Linux 2.6.36 diff --git a/pnfs-all-2.6.36-2010-11-03.patch b/pnfs-all-2.6.36-2010-11-03.patch deleted file mode 100644 index 5c46203..0000000 --- a/pnfs-all-2.6.36-2010-11-03.patch +++ /dev/null @@ -1,29329 +0,0 @@ -diff --git a/Documentation/filesystems/nfs/00-INDEX b/Documentation/filesystems/nfs/00-INDEX -index 2f68cd6..e474827 100644 ---- a/Documentation/filesystems/nfs/00-INDEX -+++ b/Documentation/filesystems/nfs/00-INDEX -@@ -12,5 +12,7 @@ nfs-rdma.txt - - how to install and setup the Linux NFS/RDMA client and server software - nfsroot.txt - - short guide on setting up a diskless box with NFS root filesystem. -+pnfs.txt -+ - short explanation of some of the internals of the pnfs client code - rpc-cache.txt - - introduction to the caching mechanisms in the sunrpc layer. -diff --git a/Documentation/filesystems/nfs/pnfs.txt b/Documentation/filesystems/nfs/pnfs.txt -new file mode 100644 -index 0000000..bc0b9cf ---- /dev/null -+++ b/Documentation/filesystems/nfs/pnfs.txt -@@ -0,0 +1,48 @@ -+Reference counting in pnfs: -+========================== -+ -+The are several inter-related caches. We have layouts which can -+reference multiple devices, each of which can reference multiple data servers. -+Each data server can be referenced by multiple devices. Each device -+can be referenced by multiple layouts. 
To keep all of this straight, -+we need to reference count. -+ -+ -+struct pnfs_layout_hdr -+---------------------- -+The on-the-wire command LAYOUTGET corresponds to struct -+pnfs_layout_segment, usually referred to by the variable name lseg. -+Each nfs_inode may hold a pointer to a cache of of these layout -+segments in nfsi->layout, of type struct pnfs_layout_hdr. -+ -+We reference the header for the inode pointing to it, across each -+outstanding RPC call that references it (LAYOUTGET, LAYOUTRETURN, -+LAYOUTCOMMIT), and for each lseg held within. -+ -+Each header is also (when non-empty) put on a list associated with -+struct nfs_client (cl_layouts). Being put on this list does not bump -+the reference count, as the layout is kept around by the lseg that -+keeps it in the list. -+ -+deviceid_cache -+-------------- -+lsegs reference device ids, which are resolved per nfs_client and -+layout driver type. The device ids are held in a RCU cache (struct -+nfs4_deviceid_cache). The cache itself is referenced across each -+mount. The entries (struct nfs4_deviceid) themselves are held across -+the lifetime of each lseg referencing them. -+ -+RCU is used because the deviceid is basically a write once, read many -+data structure. The hlist size of 32 buckets needs better -+justification, but seems reasonable given that we can have multiple -+deviceid's per filesystem, and multiple filesystems per nfs_client. -+ -+The hash code is copied from the nfsd code base. A discussion of -+hashing and variations of this algorithm can be found at: -+http://groups.google.com/group/comp.lang.c/browse_thread/thread/9522965e2b8d3809 -+ -+data server cache -+----------------- -+file driver devices refer to data servers, which are kept in a module -+level cache. Its reference is held over the lifetime of the deviceid -+pointing to it. 
-diff --git a/Documentation/filesystems/spnfs.txt b/Documentation/filesystems/spnfs.txt -new file mode 100644 -index 0000000..e1d2864 ---- /dev/null -+++ b/Documentation/filesystems/spnfs.txt -@@ -0,0 +1,211 @@ -+(c) 2007 Network Appliance Inc. -+ -+spNFS -+----- -+ -+An spNFS system consists of a Meta Data Server (MDS), a number of Client machines (C) and a number of Data Servers (DS). -+ -+A file system is mounted by the clients from the MDS, and all file data -+is striped across the DSs. -+ -+Identify the machines that will be filling each of these roles. -+ -+The spnfs kernel will be installed on all machines: clients, the MDS and DSs. -+ -+ -+Building and installing the spNFS kernel -+---------------------------------------- -+ -+Get the spNFS kernel from: -+ -+ git://linux-nfs.org/~bhalevy/linux-pnfs.git -+ -+Use the pnfs-all-latest branch and add these options to your .config file -+ -+ CONFIG_NETWORK_FILESYSTEMS=y -+ CONFIG_NFS_FS=m -+ CONFIG_NFS_V4=y -+ CONFIG_NFS_V4_1=y -+ CONFIG_PNFS=y -+ CONFIG_NFSD=m -+ CONFIG_PNFSD=y -+ # CONFIG_PNFSD_LOCAL_EXPORT is not set -+ CONFIG_SPNFS=y -+ -+By default, spNFS uses whole-file layouts. Layout segments can be enabled -+by adding: -+ -+ CONFIG_SPNFS_LAYOUTSEGMENTS=y -+ -+to your .config file. -+ -+Building and installation of kernel+modules is as usual. -+This kernel should be installed and booted on the client, MDS and DSs. -+ -+Note that CONFIG_PNFSD_LOCAL_EXPORT must be disabled for spnfs as it -+takes over the pnfs export interface. -+ -+Building nfs-utils -+------------------ -+ -+Get the nfs-utils package containing spnfsd from: -+ -+ git://linux-nfs.org/~bhalevy/pnfs-nfs-utils.git -+ -+Follow the standard instructions for building nfs-utils. -+ -+After building, the spnfsd daemon will be located in utils/spnfsd. The spnfsd -+daemon will only be needed on the MDS. -+ -+ -+Installation -+------------ -+ -+The nfs-utils package contains a default spnfsd.conf file in -+utils/spnfsd/spnfsd.conf. 
Copy this file to /etc/spnfsd.conf. -+ -+By default, the DS-Mount-Directory is set to /spnfs (see spnfsd.conf). Under -+this directory, mount points must be created for each DS to -+be used for pNFS data stripes. These mount points are named by the ip address -+of the corresponding DS. In the sample spnfsd.conf, there are two -+DSs defined (172.16.28.134 and 172.16.28.141). -+ -+Following the sample spnfsd.conf, -+ -+ mkdir /spnfs -+ -+on the MDS (corresponding to DS-Mount-Directory). Then -+ -+ mkdir /spnfs/172.16.28.134 -+ mkdir /spnfs/172.16.28.141 -+ -+to create the mount points for the DSs. -+ -+On the DSs, chose a directory where data stripes will be created by the MDS. -+For the sample file, this directory is /pnfs, so on each DS execute: -+ -+ mkdir /pnfs -+ -+This directory is specified in the spnfsd.conf file by the DS*_ROOT option -+(where * is replaced by the DS number). DS_ROOT is specified relative to -+the directory being exported by the DSs. In our example, our DSs are exporting -+the root directory (/) and therefore our DS_ROOT is /pnfs. On the DSs, we have -+the following entry in /etc/exports: -+ -+ / *(rw,fsid=0,insecure,no_root_squash,sync,no_subtree_check) -+ -+N.B. If we had created a /exports directory and a /pnfs directory under -+/exports, and if we were exporting /exports, then DS_ROOT would still be /pnfs -+(not /exports/pnfs). -+ -+It may be useful to add entries to /etc/fstab on the MDS to automatically -+mount the DS_ROOT file systems. For this example, our MDS fstab would -+contain: -+ -+ 172.17.84.128:/pnfs /spnfs/172.17.84.128 nfs defaults 1 2 -+ 172.17.84.122:/pnfs /spnfs/172.17.84.122 nfs defaults 1 2 -+ -+The DS mounts must be performed manually or via fstab at this time (automatic -+mounting, directory creation, etc. are on the todo list). To perform I/O -+through the MDS, the DS mounts MUST use NFSv3 at this time (this restriction -+will eventually be removed). 
-+ -+ -+On the MDS, choose a file system to use with spNFS and export it, e.g.: -+ -+ / *(rw,fsid=0,insecure,no_root_squash,sync,no_subtree_check,pnfs) -+ -+Make sure nfsd and all supporting processes are running on the MDS and DSs. -+ -+ -+Running -+------- -+ -+If rpc_pipefs is not already mounted (if you're running idmapd it probably is), -+you may want to add the following line to /etc/fstab: -+ -+ rpc_pipefs /var/lib/nfs/rpc_pipefs rpc_pipefs defaults 0 0 -+ -+to automatically mount rpc_pipefs. -+ -+With spnfsd.conf configured for your environment and the mounts mounted as -+described above, spnfsd can now be started. -+ -+On the MDS, execute spnfsd: -+ -+ spnfsd -+ -+The executable is located in the directory where it was built, and -+may also have been installed elsewhere depending on how you built nfs-utils. -+It will run in the foreground by default, and in fact will do so despite -+any options suggesting the contrary (it's still a debugging build). -+ -+On the client, make sure the nfslayoutdriver module is loaded: -+ -+ modprobe nfslayoutdriver -+ -+Then mount the file system from the MDS: -+ -+ mount -t nfs4 -o minorversion=1 mds:/ /mnt -+ -+I/O through the MDS is now supported. To use it, do not load the -+nfslayoutdriver on the client, and mount the MDS using NFSv4 or 4.1 -+(NFSv2 and v3 are not yet supported). -+ -+You may now use spNFS by performing file system activities in /mnt. -+If you create files in /mnt, you should see stripe files corresponding to -+new files being created on the DSs. The current implementation names the -+stripe files based on the inode number of the file on the MDS. For example, -+if you create a file foo in /mnt and do an 'ls -li /mnt/foo': -+ -+ # ls -li foo -+ 1233 -rw-r--r-- 1 root root 0 Nov 29 15:54 foo -+ -+You should see stripe files on each under /pnfs (per the sample) named -+1233. The file /pnfs/1233 on DS1 will contain the first bytes -+of data written to foo, DS2 will contain the next bytes, etc. 
-+Removing /mnt/foo will remove the corresponding stripe files on the DSs. -+Other file system operations should behave (mostly :-) as expected. -+ -+ -+Layout Segments -+--------------- -+ -+If the kernel is compiled to support layout segments, there will -+be two files created under /proc/fs/spnfs for controlling layout -+segment functionality. -+ -+To enable layout segments, write a '1' to /proc/fs/spnfs/layoutseg, e.g.: -+ -+ echo 1 > /proc/fs/spnfs/layoutseg -+ -+Layout segments can be disabled (returning to whole-file layouts) by -+writing a '0' to /proc/fs/spnfs/layoutseg: -+ -+ echo 0 > /proc/fs/spnfs/layoutseg -+ -+When layout segments are enabled, the size of the layouts returned can -+be specified by writing a decimal number (ascii representation) to -+/proc/fs/spnfs/layoutsegsize: -+ -+ echo 1024 > /proc/fs/spnfs/layoutsegsize -+ -+The value'0' has a special meaning--it causes the server to return a -+layout that is exactly the size requested by the client: -+ -+ echo 0 > /proc/fs/spnfs/layoutsegsize -+ -+ -+Troubleshooting -+--------------- -+ -+If you see data being written to the files on the MDS rather than -+the stripe files, make sure the nfslayoutdriver is loaded on the client -+(see above). -+ -+If you get a "permission denied" error, make sure mountd is running on the mds -+(it occasionally fails to start). -+ -+Bugs, enhancements, compliments, complaints to: dmuntz@netapp.com -+ -+ -diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c -index 3e39193..92ce1fb 100644 ---- a/drivers/md/dm-ioctl.c -+++ b/drivers/md/dm-ioctl.c -@@ -663,6 +663,12 @@ static int dev_create(struct dm_ioctl *param, size_t param_size) - return 0; - } - -+int dm_dev_create(struct dm_ioctl *param) -+{ -+ return dev_create(param, sizeof(*param)); -+} -+EXPORT_SYMBOL(dm_dev_create); -+ - /* - * Always use UUID for lookups if it's present, otherwise use name or dev. 
- */ -@@ -758,6 +764,12 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) - return 0; - } - -+int dm_dev_remove(struct dm_ioctl *param) -+{ -+ return dev_remove(param, sizeof(*param)); -+} -+EXPORT_SYMBOL(dm_dev_remove); -+ - /* - * Check a string doesn't overrun the chunk of - * memory we copied from userland. -@@ -937,6 +949,12 @@ static int do_resume(struct dm_ioctl *param) - return r; - } - -+int dm_do_resume(struct dm_ioctl *param) -+{ -+ return do_resume(param); -+} -+EXPORT_SYMBOL(dm_do_resume); -+ - /* - * Set or unset the suspension state of a device. - * If the device already is in the requested state we just return its status. -@@ -1203,6 +1221,12 @@ out: - return r; - } - -+int dm_table_load(struct dm_ioctl *param, size_t param_size) -+{ -+ return table_load(param, param_size); -+} -+EXPORT_SYMBOL(dm_table_load); -+ - static int table_clear(struct dm_ioctl *param, size_t param_size) - { - struct hash_cell *hc; -diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c -index 8a8f803..7958885 100644 ---- a/drivers/scsi/hosts.c -+++ b/drivers/scsi/hosts.c -@@ -50,10 +50,11 @@ static void scsi_host_cls_release(struct device *dev) - put_device(&class_to_shost(dev)->shost_gendev); - } - --static struct class shost_class = { -+struct class shost_class = { - .name = "scsi_host", - .dev_release = scsi_host_cls_release, - }; -+EXPORT_SYMBOL(shost_class); - - /** - * scsi_host_set_state - Take the given host through the host state model. -diff --git a/fs/Kconfig b/fs/Kconfig -index 3d18530..82b6696 100644 ---- a/fs/Kconfig -+++ b/fs/Kconfig -@@ -224,6 +224,31 @@ config LOCKD_V4 - config EXPORTFS - tristate - -+config EXPORTFS_FILE_LAYOUT -+ bool -+ depends on PNFSD && EXPORTFS -+ help -+ Exportfs support for the NFSv4.1 files layout type. -+ Must be automatically selected by supporting filesystems. -+ -+config EXPORTFS_OSD_LAYOUT -+ bool -+ depends on PNFSD && EXPORTFS -+ help -+ Exportfs support for the NFSv4.1 objects layout type. 
-+ Must be automatically selected by supporting osd -+ filesystems. -+ -+ If unsure, say N. -+ -+config EXPORTFS_BLOCK_LAYOUT -+ bool -+ depends on PNFSD && EXPORTFS -+ help -+ Exportfs support for the NFSv4.1 blocks layout type. -+ Must be automatically selected by supporting filesystems. -+ -+ - config NFS_ACL_SUPPORT - tristate - select FS_POSIX_ACL -diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild -index 2d0f757..5458546 100644 ---- a/fs/exofs/Kbuild -+++ b/fs/exofs/Kbuild -@@ -13,4 +13,5 @@ - # - - exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o -+exofs-$(CONFIG_PNFSD) += export.o - obj-$(CONFIG_EXOFS_FS) += exofs.o -diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig -index 86194b2f..77c677f 100644 ---- a/fs/exofs/Kconfig -+++ b/fs/exofs/Kconfig -@@ -1,6 +1,7 @@ - config EXOFS_FS - tristate "exofs: OSD based file system support" - depends on SCSI_OSD_ULD -+ select EXPORTFS_OSD_LAYOUT if PNFSD - help - EXOFS is a file system that uses an OSD storage device, - as its backing storage. -diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h -index 2dc925f..3c03f2d 100644 ---- a/fs/exofs/exofs.h -+++ b/fs/exofs/exofs.h -@@ -36,13 +36,9 @@ - #include - #include - #include -+#include - #include "common.h" - --/* FIXME: Remove once pnfs hits mainline -- * #include -- */ --#include "pnfs.h" -- - #define EXOFS_ERR(fmt, a...) 
printk(KERN_ERR "exofs: " fmt, ##a) - - #ifdef CONFIG_EXOFS_DEBUG -@@ -103,6 +99,7 @@ struct exofs_sb_info { - struct exofs_i_info { - struct inode vfs_inode; /* normal in-memory inode */ - wait_queue_head_t i_wq; /* wait queue for inode */ -+ spinlock_t i_layout_lock; /* lock for layout/return/recall */ - unsigned long i_flags; /* various atomic flags */ - uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ - uint32_t i_dir_start_lookup; /* which page to start lookup */ -@@ -166,6 +163,9 @@ static inline unsigned exofs_io_state_size(unsigned numdevs) - */ - #define OBJ_2BCREATED 0 /* object will be created soon*/ - #define OBJ_CREATED 1 /* object has been created on the osd*/ -+/* Below are not used atomic but reuse the same i_flags */ -+#define OBJ_LAYOUT_IS_GIVEN 2 /* inode has given layouts to clients*/ -+#define OBJ_IN_LAYOUT_RECALL 3 /* inode is in the middle of a layout recall*/ - - static inline int obj_2bcreated(struct exofs_i_info *oi) - { -@@ -303,4 +303,21 @@ extern const struct inode_operations exofs_special_inode_operations; - extern const struct inode_operations exofs_symlink_inode_operations; - extern const struct inode_operations exofs_fast_symlink_inode_operations; - -+/* export.c */ -+typedef int (exofs_recall_fn)(struct inode *inode, u64 data); -+#ifdef CONFIG_PNFSD -+int exofs_inode_recall_layout(struct inode *inode, enum pnfs_iomode iomode, -+ exofs_recall_fn todo, u64 todo_data); -+void exofs_init_export(struct super_block *sb); -+#else -+static inline int -+exofs_inode_recall_layout(struct inode *inode, enum pnfs_iomode iomode, -+exofs_recall_fn todo, u64 todo_data) -+{ -+ return todo(inode, todo_data); -+} -+ -+static inline void exofs_init_export(struct super_block *sb) {} -+#endif -+ - #endif -diff --git a/fs/exofs/export.c b/fs/exofs/export.c -new file mode 100644 -index 0000000..69bce46 ---- /dev/null -+++ b/fs/exofs/export.c -@@ -0,0 +1,396 @@ -+/* -+ * export.c - Implementation of the pnfs_export_operations -+ * -+ * 
Copyright (C) 2009 Panasas Inc. -+ * All rights reserved. -+ * -+ * Boaz Harrosh -+ * -+ * This file is part of exofs. -+ * -+ * exofs is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation. Since it is based on ext2, and the only -+ * valid version of GPL for the Linux kernel is version 2, the only valid -+ * version of GPL for exofs is version 2. -+ * -+ * exofs is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with exofs; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include -+#include "exofs.h" -+ -+static int exofs_layout_type(struct super_block *sb) -+{ -+ return LAYOUT_OSD2_OBJECTS; -+} -+ -+static void set_dev_id(struct nfs4_deviceid *pnfs_devid, u64 sbid, u64 devid) -+{ -+ struct nfsd4_pnfs_deviceid *dev_id = -+ (struct nfsd4_pnfs_deviceid *)pnfs_devid; -+ -+ dev_id->sbid = sbid; -+ dev_id->devid = devid; -+} -+ -+static int cb_layout_recall(struct inode *inode, enum pnfs_iomode iomode, -+ u64 offset, u64 length, void *cookie) -+{ -+ struct nfsd4_pnfs_cb_layout cbl; -+ struct pnfsd_cb_ctl cb_ctl; -+ int status; -+ -+ memset(&cb_ctl, 0, sizeof(cb_ctl)); -+ status = pnfsd_get_cb_op(&cb_ctl); -+ if (unlikely(status)) { -+ EXOFS_ERR("%s: nfsd unloaded!! 
inode (0x%lx) status=%d\n", -+ __func__, inode->i_ino, status); -+ goto err; -+ } -+ -+ memset(&cbl, 0, sizeof(cbl)); -+ cbl.cbl_recall_type = RETURN_FILE; -+ cbl.cbl_seg.layout_type = LAYOUT_OSD2_OBJECTS; -+ cbl.cbl_seg.iomode = iomode; -+ cbl.cbl_seg.offset = offset; -+ cbl.cbl_seg.length = length; -+ cbl.cbl_cookie = cookie; -+ -+ status = cb_ctl.cb_op->cb_layout_recall(inode->i_sb, inode, &cbl); -+ pnfsd_put_cb_op(&cb_ctl); -+ -+err: -+ return status; -+} -+ -+static enum nfsstat4 exofs_layout_get( -+ struct inode *inode, -+ struct exp_xdr_stream *xdr, -+ const struct nfsd4_pnfs_layoutget_arg *args, -+ struct nfsd4_pnfs_layoutget_res *res) -+{ -+ struct exofs_i_info *oi = exofs_i(inode); -+ struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; -+ struct exofs_layout *el = &sbi->layout; -+ struct pnfs_osd_object_cred *creds = NULL; -+ struct pnfs_osd_layout layout; -+ __be32 *start; -+ bool in_recall; -+ int i, err; -+ enum nfsstat4 nfserr; -+ -+ res->lg_seg.offset = 0; -+ res->lg_seg.length = NFS4_MAX_UINT64; -+ res->lg_seg.iomode = IOMODE_RW; -+ res->lg_return_on_close = true; /* TODO: unused but will be soon */ -+ -+ /* skip opaque size, will be filled-in later */ -+ start = exp_xdr_reserve_qwords(xdr, 1); -+ if (!start) { -+ nfserr = NFS4ERR_TOOSMALL; -+ goto out; -+ } -+ -+ creds = kcalloc(el->s_numdevs, sizeof(*creds), GFP_KERNEL); -+ if (!creds) { -+ nfserr = NFS4ERR_LAYOUTTRYLATER; -+ goto out; -+ } -+ -+ /* Fill in a pnfs_osd_layout struct */ -+ layout.olo_map = sbi->data_map; -+ -+ for (i = 0; i < el->s_numdevs; i++) { -+ struct pnfs_osd_object_cred *cred = &creds[i]; -+ osd_id id = exofs_oi_objno(oi); -+ unsigned dev = exofs_layout_od_id(el, id, i); -+ -+ set_dev_id(&cred->oc_object_id.oid_device_id, args->lg_sbid, -+ dev); -+ cred->oc_object_id.oid_partition_id = el->s_pid; -+ cred->oc_object_id.oid_object_id = id; -+ cred->oc_osd_version = osd_dev_is_ver1(el->s_ods[dev]) ? 
-+ PNFS_OSD_VERSION_1 : -+ PNFS_OSD_VERSION_2; -+ cred->oc_cap_key_sec = PNFS_OSD_CAP_KEY_SEC_NONE; -+ -+ cred->oc_cap_key.cred_len = 0; -+ cred->oc_cap_key.cred = NULL; -+ -+ cred->oc_cap.cred_len = OSD_CAP_LEN; -+ cred->oc_cap.cred = oi->i_cred; -+ } -+ -+ layout.olo_comps_index = 0; -+ layout.olo_num_comps = el->s_numdevs; -+ layout.olo_comps = creds; -+ -+ err = pnfs_osd_xdr_encode_layout(xdr, &layout); -+ if (err) { -+ nfserr = NFS4ERR_TOOSMALL; /* FIXME: Change osd_xdr error codes */ -+ goto out; -+ } -+ -+ exp_xdr_encode_opaque_len(start, xdr->p); -+ -+ spin_lock(&oi->i_layout_lock); -+ in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); -+ if (!in_recall) { -+ __set_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); -+ nfserr = NFS4_OK; -+ } else { -+ nfserr = NFS4ERR_RECALLCONFLICT; -+ } -+ spin_unlock(&oi->i_layout_lock); -+ -+out: -+ kfree(creds); -+ EXOFS_DBGMSG("(0x%lx) nfserr=%u xdr_bytes=%zu\n", -+ inode->i_ino, nfserr, exp_xdr_qbytes(xdr->p - start)); -+ return nfserr; -+} -+ -+/* NOTE: inode mutex must NOT be held */ -+static int exofs_layout_commit( -+ struct inode *inode, -+ const struct nfsd4_pnfs_layoutcommit_arg *args, -+ struct nfsd4_pnfs_layoutcommit_res *res) -+{ -+ struct exofs_i_info *oi = exofs_i(inode); -+ struct timespec mtime; -+ loff_t i_size; -+ int in_recall; -+ -+ /* In case of a recall we ignore the new size and mtime since they -+ * are going to be changed again by truncate, and since we cannot take -+ * the inode lock in that case. -+ */ -+ spin_lock(&oi->i_layout_lock); -+ in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); -+ spin_unlock(&oi->i_layout_lock); -+ if (in_recall) { -+ EXOFS_DBGMSG("(0x%lx) commit was called during recall\n", -+ inode->i_ino); -+ return 0; -+ } -+ -+ /* NOTE: I would love to call inode_setattr here -+ * but i cannot since this will cause an eventual vmtruncate, -+ * which will cause a layout_recall. So open code the i_size -+ * and mtime/atime changes under i_mutex. 
-+ */ -+ mutex_lock_nested(&inode->i_mutex, I_MUTEX_NORMAL); -+ -+ if (args->lc_mtime.seconds) { -+ mtime.tv_sec = args->lc_mtime.seconds; -+ mtime.tv_nsec = args->lc_mtime.nseconds; -+ -+ /* layout commit may only make time bigger, since there might -+ * be reordering of the notifications and it might arrive after -+ * A local change. -+ * TODO: if mtime > ctime then we know set_attr did an mtime -+ * in the future. and we can let this update through -+ */ -+ if (0 <= timespec_compare(&mtime, &inode->i_mtime)) -+ mtime = inode->i_mtime; -+ } else { -+ mtime = current_fs_time(inode->i_sb); -+ } -+ -+ /* TODO: Will below work? since mark_inode_dirty has it's own -+ * Time handling -+ */ -+ inode->i_atime = inode->i_mtime = mtime; -+ -+ i_size = i_size_read(inode); -+ if (args->lc_newoffset) { -+ loff_t new_size = args->lc_last_wr + 1; -+ -+ if (i_size < new_size) { -+ i_size_write(inode, i_size = new_size); -+ res->lc_size_chg = 1; -+ res->lc_newsize = new_size; -+ } -+ } -+ /* TODO: else { i_size = osd_get_object_length() } */ -+ -+/* TODO: exofs does not currently use the osd_xdr part of the layout_commit */ -+ -+ mark_inode_dirty_sync(inode); -+ -+ mutex_unlock(&inode->i_mutex); -+ EXOFS_DBGMSG("(0x%lx) i_size=0x%llx lcp->off=0x%llx\n", -+ inode->i_ino, i_size, args->lc_last_wr); -+ return 0; -+} -+ -+static void exofs_handle_error(struct pnfs_osd_ioerr *ioerr) -+{ -+ EXOFS_ERR("exofs_handle_error: errno=%d is_write=%d obj=0x%llx " -+ "offset=0x%llx length=0x%llx\n", -+ ioerr->oer_errno, ioerr->oer_iswrite, -+ _LLU(ioerr->oer_component.oid_object_id), -+ _LLU(ioerr->oer_comp_offset), -+ _LLU(ioerr->oer_comp_length)); -+} -+ -+static int exofs_layout_return( -+ struct inode *inode, -+ const struct nfsd4_pnfs_layoutreturn_arg *args) -+{ -+ __be32 *p = args->lrf_body; -+ unsigned len = exp_xdr_qwords(args->lrf_body_len); -+ -+ EXOFS_DBGMSG("(0x%lx) cookie %p xdr_len %d\n", -+ inode->i_ino, args->lr_cookie, len); -+ -+ while (len >= pnfs_osd_ioerr_xdr_sz()) { -+ 
struct pnfs_osd_ioerr ioerr; -+ -+ p = pnfs_osd_xdr_decode_ioerr(&ioerr, p); -+ len -= pnfs_osd_ioerr_xdr_sz(); -+ exofs_handle_error(&ioerr); -+ } -+ -+ if (args->lr_cookie) { -+ struct exofs_i_info *oi = exofs_i(inode); -+ bool in_recall; -+ -+ spin_lock(&oi->i_layout_lock); -+ in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); -+ __clear_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); -+ spin_unlock(&oi->i_layout_lock); -+ -+ /* TODO: how to communicate cookie with the waiter */ -+ if (in_recall) -+ wake_up(&oi->i_wq); /* wakeup any recalls */ -+ } -+ -+ return 0; -+} -+ -+int exofs_get_device_info(struct super_block *sb, struct exp_xdr_stream *xdr, -+ u32 layout_type, -+ const struct nfsd4_pnfs_deviceid *devid) -+{ -+ struct exofs_sb_info *sbi = sb->s_fs_info; -+ struct pnfs_osd_deviceaddr devaddr; -+ const struct osd_dev_info *odi; -+ u64 devno = devid->devid; -+ __be32 *start; -+ int err; -+ -+ memset(&devaddr, 0, sizeof(devaddr)); -+ -+ if (unlikely(devno >= sbi->layout.s_numdevs)) -+ return -ENODEV; -+ -+ odi = osduld_device_info(sbi->layout.s_ods[devno]); -+ -+ devaddr.oda_systemid.len = odi->systemid_len; -+ devaddr.oda_systemid.data = (void *)odi->systemid; /* !const cast */ -+ -+ devaddr.oda_osdname.len = odi->osdname_len ; -+ devaddr.oda_osdname.data = (void *)odi->osdname;/* !const cast */ -+ -+ /* skip opaque size, will be filled-in later */ -+ start = exp_xdr_reserve_qwords(xdr, 1); -+ if (!start) { -+ err = -E2BIG; -+ goto err; -+ } -+ -+ err = pnfs_osd_xdr_encode_deviceaddr(xdr, &devaddr); -+ if (err) -+ goto err; -+ -+ exp_xdr_encode_opaque_len(start, xdr->p); -+ -+ EXOFS_DBGMSG("xdr_bytes=%Zu devno=%lld osdname-%s\n", -+ exp_xdr_qbytes(xdr->p - start), devno, odi->osdname); -+ return 0; -+ -+err: -+ EXOFS_DBGMSG("Error: err=%d at_byte=%zu\n", -+ err, exp_xdr_qbytes(xdr->p - start)); -+ return err; -+} -+ -+struct pnfs_export_operations exofs_pnfs_ops = { -+ .layout_type = exofs_layout_type, -+ .layout_get = exofs_layout_get, -+ .layout_commit = 
exofs_layout_commit, -+ .layout_return = exofs_layout_return, -+ .get_device_info = exofs_get_device_info, -+}; -+ -+static bool is_layout_returned(struct exofs_i_info *oi) -+{ -+ bool layout_given; -+ -+ spin_lock(&oi->i_layout_lock); -+ layout_given = test_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); -+ spin_unlock(&oi->i_layout_lock); -+ -+ return !layout_given; -+} -+ -+int exofs_inode_recall_layout(struct inode *inode, enum pnfs_iomode iomode, -+ exofs_recall_fn todo, u64 todo_data) -+{ -+ struct exofs_i_info *oi = exofs_i(inode); -+ int layout_given; -+ int error = 0; -+ -+ spin_lock(&oi->i_layout_lock); -+ layout_given = test_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); -+ __set_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); -+ spin_unlock(&oi->i_layout_lock); -+ -+ if (!layout_given) -+ goto exec; -+ -+ for (;;) { -+ EXOFS_DBGMSG("(0x%lx) has_layout issue a recall\n", -+ inode->i_ino); -+ error = cb_layout_recall(inode, iomode, 0, NFS4_MAX_UINT64, -+ &oi->i_wq); -+ switch (error) { -+ case 0: -+ case -EAGAIN: -+ break; -+ case -ENOENT: -+ goto exec; -+ default: -+ goto err; -+ } -+ -+ error = wait_event_interruptible(oi->i_wq, -+ is_layout_returned(oi)); -+ if (error) -+ goto err; -+ } -+ -+exec: -+ error = todo(inode, todo_data); -+ -+err: -+ spin_lock(&oi->i_layout_lock); -+ __clear_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); -+ spin_unlock(&oi->i_layout_lock); -+ EXOFS_DBGMSG("(0x%lx) return=>%d\n", inode->i_ino, error); -+ return error; -+} -+ -+void exofs_init_export(struct super_block *sb) -+{ -+ sb->s_pnfs_op = &exofs_pnfs_ops; -+} -diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c -index 3eadd97..1cf2286 100644 ---- a/fs/exofs/inode.c -+++ b/fs/exofs/inode.c -@@ -826,8 +826,9 @@ static inline int exofs_inode_is_fast_symlink(struct inode *inode) - const struct osd_attr g_attr_logical_length = ATTR_DEF( - OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8); - --static int _do_truncate(struct inode *inode, loff_t newsize) -+static int _do_truncate(struct inode 
*inode, u64 data) - { -+ loff_t newsize = data; - struct exofs_i_info *oi = exofs_i(inode); - int ret; - -@@ -864,7 +865,8 @@ int exofs_setattr(struct dentry *dentry, struct iattr *iattr) - - if ((iattr->ia_valid & ATTR_SIZE) && - iattr->ia_size != i_size_read(inode)) { -- error = _do_truncate(inode, iattr->ia_size); -+ error = exofs_inode_recall_layout(inode, IOMODE_ANY, -+ _do_truncate, iattr->ia_size); - if (unlikely(error)) - return error; - } -@@ -977,6 +979,7 @@ static void __oi_init(struct exofs_i_info *oi) - { - init_waitqueue_head(&oi->i_wq); - oi->i_flags = 0; -+ spin_lock_init(&oi->i_layout_lock); - } - /* - * Fill in an inode read from the OSD and set it up for use -diff --git a/fs/exofs/pnfs.h b/fs/exofs/pnfs.h -deleted file mode 100644 -index c52e988..0000000 ---- a/fs/exofs/pnfs.h -+++ /dev/null -@@ -1,45 +0,0 @@ --/* -- * Copyright (C) 2008, 2009 -- * Boaz Harrosh -- * -- * This file is part of exofs. -- * -- * exofs is free software; you can redistribute it and/or modify it under the -- * terms of the GNU General Public License version 2 as published by the Free -- * Software Foundation. -- * -- */ -- --/* FIXME: Remove this file once pnfs hits mainline */ -- --#ifndef __EXOFS_PNFS_H__ --#define __EXOFS_PNFS_H__ -- --#if ! defined(__PNFS_OSD_XDR_H__) -- --enum pnfs_iomode { -- IOMODE_READ = 1, -- IOMODE_RW = 2, -- IOMODE_ANY = 3, --}; -- --/* Layout Structure */ --enum pnfs_osd_raid_algorithm4 { -- PNFS_OSD_RAID_0 = 1, -- PNFS_OSD_RAID_4 = 2, -- PNFS_OSD_RAID_5 = 3, -- PNFS_OSD_RAID_PQ = 4 /* Reed-Solomon P+Q */ --}; -- --struct pnfs_osd_data_map { -- u32 odm_num_comps; -- u64 odm_stripe_unit; -- u32 odm_group_width; -- u32 odm_group_depth; -- u32 odm_mirror_cnt; -- u32 odm_raid_algorithm; --}; -- --#endif /* ! 
defined(__PNFS_OSD_XDR_H__) */ -- --#endif /* __EXOFS_PNFS_H__ */ -diff --git a/fs/exofs/super.c b/fs/exofs/super.c -index 047e92f..623aa55 100644 ---- a/fs/exofs/super.c -+++ b/fs/exofs/super.c -@@ -620,6 +620,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) - sb->s_fs_info = sbi; - sb->s_op = &exofs_sops; - sb->s_export_op = &exofs_export_ops; -+ exofs_init_export(sb); - root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF); - if (IS_ERR(root)) { - EXOFS_ERR("ERROR: exofs_iget failed\n"); -diff --git a/fs/exportfs/Makefile b/fs/exportfs/Makefile -index d7c5d4d..51e8ee4 100644 ---- a/fs/exportfs/Makefile -+++ b/fs/exportfs/Makefile -@@ -3,4 +3,7 @@ - - obj-$(CONFIG_EXPORTFS) += exportfs.o - --exportfs-objs := expfs.o -+exportfs-y := expfs.o -+exportfs-$(CONFIG_EXPORTFS_FILE_LAYOUT) += nfs4filelayoutxdr.o -+exportfs-$(CONFIG_EXPORTFS_OSD_LAYOUT) += pnfs_osd_xdr_srv.o -+exportfs-$(CONFIG_EXPORTFS_BLOCK_LAYOUT) += nfs4blocklayoutxdr.o -diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c -index e9e1759..a10949a 100644 ---- a/fs/exportfs/expfs.c -+++ b/fs/exportfs/expfs.c -@@ -16,6 +16,13 @@ - #include - #include - -+#if defined(CONFIG_PNFSD) -+struct pnfsd_cb_ctl pnfsd_cb_ctl = { -+ .lock = __SPIN_LOCK_UNLOCKED(pnfsd_cb_ctl.lock) -+}; -+EXPORT_SYMBOL(pnfsd_cb_ctl); -+#endif /* CONFIG_PNFSD */ -+ - #define dprintk(fmt, args...) do{}while(0) - - -diff --git a/fs/exportfs/nfs4blocklayoutxdr.c b/fs/exportfs/nfs4blocklayoutxdr.c -new file mode 100644 -index 0000000..439e647 ---- /dev/null -+++ b/fs/exportfs/nfs4blocklayoutxdr.c -@@ -0,0 +1,158 @@ -+/* -+ * linux/fs/nfsd/nfs4blocklayoutxdr.c -+ * -+ * -+ * Created by Rick McNeal on 3/31/08. -+ * Copyright 2008 __MyCompanyName__. All rights reserved. 
-+ * -+ */ -+#include -+#include -+#include -+#include -+ -+static int -+bl_encode_simple(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) -+{ -+ __be32 *p = exp_xdr_reserve_space(xdr, -+ 12 + 4 + bld->u.simple.bld_sig_len); -+ -+ if (!p) -+ return -ETOOSMALL; -+ -+ p = exp_xdr_encode_u32(p, 1); -+ p = exp_xdr_encode_u64(p, bld->u.simple.bld_offset); -+ exp_xdr_encode_opaque(p, bld->u.simple.bld_sig, -+ bld->u.simple.bld_sig_len); -+ -+ return 0; -+} -+ -+static int -+bl_encode_slice(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) -+{ -+ __be32 *p = exp_xdr_reserve_qwords(xdr, 2 + 2 + 1); -+ -+ if (!p) -+ return -ETOOSMALL; -+ -+ p = exp_xdr_encode_u64(p, bld->u.slice.bld_start); -+ p = exp_xdr_encode_u64(p, bld->u.slice.bld_len); -+ exp_xdr_encode_u32(p, bld->u.slice.bld_index); -+ -+ return 0; -+} -+ -+static int -+bl_encode_concat(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) -+{ -+ return -ENOTSUPP; -+} -+ -+static int -+bl_encode_stripe(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) -+{ -+ int i; -+ __be32 *p = exp_xdr_reserve_space(xdr, -+ 2 + 1 + bld->u.stripe.bld_stripes); -+ -+ p = exp_xdr_encode_u64(p, bld->u.stripe.bld_chunk_size); -+ p = exp_xdr_encode_u32(p, bld->u.stripe.bld_stripes); -+ for (i = 0; i < bld->u.stripe.bld_stripes; i++) -+ p = exp_xdr_encode_u32(p, bld->u.stripe.bld_stripe_indexs[i]); -+ -+ return 0; -+} -+ -+int -+blocklayout_encode_devinfo(struct exp_xdr_stream *xdr, -+ const struct list_head *volumes) -+{ -+ u32 num_vols = 0, -+ *layoutlen_p = xdr->p; -+ pnfs_blocklayout_devinfo_t *bld; -+ int status = 0; -+ __be32 *p; -+ -+ p = exp_xdr_reserve_qwords(xdr, 2); -+ if (!p) -+ return -ETOOSMALL; -+ p += 2; -+ -+ /* -+ * All simple volumes with their signature are required to be listed -+ * first. 
-+ */ -+ list_for_each_entry(bld, volumes, bld_list) { -+ num_vols++; -+ p = exp_xdr_reserve_qwords(xdr, 1); -+ if (!p) -+ return -ETOOSMALL; -+ p = exp_xdr_encode_u32(p, bld->bld_type); -+ switch (bld->bld_type) { -+ case PNFS_BLOCK_VOLUME_SIMPLE: -+ status = bl_encode_simple(xdr, bld); -+ break; -+ case PNFS_BLOCK_VOLUME_SLICE: -+ status = bl_encode_slice(xdr, bld); -+ break; -+ case PNFS_BLOCK_VOLUME_CONCAT: -+ status = bl_encode_concat(xdr, bld); -+ break; -+ case PNFS_BLOCK_VOLUME_STRIPE: -+ status = bl_encode_stripe(xdr, bld); -+ break; -+ default: -+ BUG(); -+ } -+ if (status) -+ goto error; -+ } -+ -+ /* ---- Fill in the overall length and number of volumes ---- */ -+ p = exp_xdr_encode_u32(layoutlen_p, (xdr->p - layoutlen_p - 1) * 4); -+ exp_xdr_encode_u32(p, num_vols); -+ -+error: -+ return status; -+} -+EXPORT_SYMBOL_GPL(blocklayout_encode_devinfo); -+ -+enum nfsstat4 -+blocklayout_encode_layout(struct exp_xdr_stream *xdr, -+ const struct list_head *bl_head) -+{ -+ struct pnfs_blocklayout_layout *b; -+ u32 *layoutlen_p = xdr->p, -+ extents = 0; -+ __be32 *p; -+ -+ /* -+ * Save spot for opaque block layout length and number of extents, -+ * fill-in later. 
-+ */ -+ p = exp_xdr_reserve_qwords(xdr, 2); -+ if (!p) -+ return NFS4ERR_TOOSMALL; -+ p += 2; -+ -+ list_for_each_entry(b, bl_head, bll_list) { -+ extents++; -+ p = exp_xdr_reserve_qwords(xdr, 5 * 2 + 1); -+ if (!p) -+ return NFS4ERR_TOOSMALL; -+ p = exp_xdr_encode_u64(p, b->bll_vol_id.sbid); -+ p = exp_xdr_encode_u64(p, b->bll_vol_id.devid); -+ p = exp_xdr_encode_u64(p, b->bll_foff); -+ p = exp_xdr_encode_u64(p, b->bll_len); -+ p = exp_xdr_encode_u64(p, b->bll_soff); -+ p = exp_xdr_encode_u32(p, b->bll_es); -+ } -+ -+ /* ---- Fill in the overall length and number of extents ---- */ -+ p = exp_xdr_encode_u32(layoutlen_p, (p - layoutlen_p - 1) * 4); -+ exp_xdr_encode_u32(p, extents); -+ -+ return NFS4_OK; -+} -+EXPORT_SYMBOL_GPL(blocklayout_encode_layout); -diff --git a/fs/exportfs/nfs4filelayoutxdr.c b/fs/exportfs/nfs4filelayoutxdr.c -new file mode 100644 -index 0000000..f63c311 ---- /dev/null -+++ b/fs/exportfs/nfs4filelayoutxdr.c -@@ -0,0 +1,218 @@ -+/* -+ * Copyright (c) 2006 The Regents of the University of Michigan. -+ * All rights reserved. -+ * -+ * Andy Adamson -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. Neither the name of the University nor the names of its -+ * contributors may be used to endorse or promote products derived -+ * from this software without specific prior written permission. 
-+ * -+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED -+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ */ -+#include -+#include -+#include -+#include -+#include -+ -+/* We do our-own dprintk so filesystems are not dependent on sunrpc */ -+#ifdef dprintk -+#undef dprintk -+#endif -+#define dprintk(fmt, args, ...) do { } while (0) -+ -+/* Calculate the XDR length of the GETDEVICEINFO4resok structure -+ * excluding the gdir_notification and the gdir_device_addr da_layout_type. -+ */ -+static int fl_devinfo_xdr_words(const struct pnfs_filelayout_device *fdev) -+{ -+ struct pnfs_filelayout_devaddr *fl_addr; -+ struct pnfs_filelayout_multipath *mp; -+ int i, j, nwords; -+ -+ /* da_addr_body length, indice length, indices, -+ * multipath_list4 length */ -+ nwords = 1 + 1 + fdev->fl_stripeindices_length + 1; -+ for (i = 0; i < fdev->fl_device_length; i++) { -+ mp = &fdev->fl_device_list[i]; -+ nwords++; /* multipath list length */ -+ for (j = 0; j < mp->fl_multipath_length; j++) { -+ fl_addr = mp->fl_multipath_list; -+ nwords += 1 + exp_xdr_qwords(fl_addr->r_netid.len); -+ nwords += 1 + exp_xdr_qwords(fl_addr->r_addr.len); -+ } -+ } -+ dprintk("<-- %s nwords %d\n", __func__, nwords); -+ return nwords; -+} -+ -+/* Encodes the nfsv4_1_file_layout_ds_addr4 structure from draft 13 -+ * on the response stream. 
-+ * Use linux error codes (not nfs) since these values are being -+ * returned to the file system. -+ */ -+int -+filelayout_encode_devinfo(struct exp_xdr_stream *xdr, -+ const struct pnfs_filelayout_device *fdev) -+{ -+ unsigned int i, j, len = 0, opaque_words; -+ u32 *p_in; -+ u32 index_count = fdev->fl_stripeindices_length; -+ u32 dev_count = fdev->fl_device_length; -+ int error = 0; -+ __be32 *p; -+ -+ opaque_words = fl_devinfo_xdr_words(fdev); -+ dprintk("%s: Begin indx_cnt: %u dev_cnt: %u total size %u\n", -+ __func__, -+ index_count, -+ dev_count, -+ opaque_words*4); -+ -+ /* check space for opaque length */ -+ p = p_in = exp_xdr_reserve_qwords(xdr, opaque_words); -+ if (!p) { -+ error = -ETOOSMALL; -+ goto out; -+ } -+ -+ /* Fill in length later */ -+ p++; -+ -+ /* encode device list indices */ -+ p = exp_xdr_encode_u32(p, index_count); -+ for (i = 0; i < index_count; i++) -+ p = exp_xdr_encode_u32(p, fdev->fl_stripeindices_list[i]); -+ -+ /* encode device list */ -+ p = exp_xdr_encode_u32(p, dev_count); -+ for (i = 0; i < dev_count; i++) { -+ struct pnfs_filelayout_multipath *mp = &fdev->fl_device_list[i]; -+ -+ p = exp_xdr_encode_u32(p, mp->fl_multipath_length); -+ for (j = 0; j < mp->fl_multipath_length; j++) { -+ struct pnfs_filelayout_devaddr *da = -+ &mp->fl_multipath_list[j]; -+ -+ /* Encode device info */ -+ p = exp_xdr_encode_opaque(p, da->r_netid.data, -+ da->r_netid.len); -+ p = exp_xdr_encode_opaque(p, da->r_addr.data, -+ da->r_addr.len); -+ } -+ } -+ -+ /* backfill in length. Subtract 4 for da_addr_body size */ -+ len = (char *)p - (char *)p_in; -+ exp_xdr_encode_u32(p_in, len - 4); -+ -+ error = 0; -+out: -+ dprintk("%s: End err %d xdrlen %d\n", -+ __func__, error, len); -+ return error; -+} -+EXPORT_SYMBOL(filelayout_encode_devinfo); -+ -+/* Encodes the loc_body structure from draft 13 -+ * on the response stream. -+ * Use linux error codes (not nfs) since these values are being -+ * returned to the file system. 
-+ */ -+enum nfsstat4 -+filelayout_encode_layout(struct exp_xdr_stream *xdr, -+ const struct pnfs_filelayout_layout *flp) -+{ -+ u32 len = 0, nfl_util, fhlen, i; -+ u32 *layoutlen_p; -+ enum nfsstat4 nfserr; -+ __be32 *p; -+ -+ dprintk("%s: device_id %llx:%llx fsi %u, numfh %u\n", -+ __func__, -+ flp->device_id.pnfs_fsid, -+ flp->device_id.pnfs_devid, -+ flp->lg_first_stripe_index, -+ flp->lg_fh_length); -+ -+ /* Ensure file system added at least one file handle */ -+ if (flp->lg_fh_length <= 0) { -+ dprintk("%s: File Layout has no file handles!!\n", __func__); -+ nfserr = NFS4ERR_LAYOUTUNAVAILABLE; -+ goto out; -+ } -+ -+ /* Ensure room for len, devid, util, first_stripe_index, -+ * pattern_offset, number of filehandles */ -+ p = layoutlen_p = exp_xdr_reserve_qwords(xdr, 1+2+2+1+1+2+1); -+ if (!p) { -+ nfserr = NFS4ERR_TOOSMALL; -+ goto out; -+ } -+ -+ /* save spot for opaque file layout length, fill-in later*/ -+ p++; -+ -+ /* encode device id */ -+ p = exp_xdr_encode_u64(p, flp->device_id.sbid); -+ p = exp_xdr_encode_u64(p, flp->device_id.devid); -+ -+ /* set and encode flags */ -+ nfl_util = flp->lg_stripe_unit; -+ if (flp->lg_commit_through_mds) -+ nfl_util |= NFL4_UFLG_COMMIT_THRU_MDS; -+ if (flp->lg_stripe_type == STRIPE_DENSE) -+ nfl_util |= NFL4_UFLG_DENSE; -+ p = exp_xdr_encode_u32(p, nfl_util); -+ -+ /* encode first stripe index */ -+ p = exp_xdr_encode_u32(p, flp->lg_first_stripe_index); -+ -+ /* encode striping pattern start */ -+ p = exp_xdr_encode_u64(p, flp->lg_pattern_offset); -+ -+ /* encode number of file handles */ -+ p = exp_xdr_encode_u32(p, flp->lg_fh_length); -+ -+ /* encode file handles */ -+ for (i = 0; i < flp->lg_fh_length; i++) { -+ fhlen = flp->lg_fh_list[i].fh_size; -+ p = exp_xdr_reserve_space(xdr, 4 + fhlen); -+ if (!p) { -+ nfserr = NFS4ERR_TOOSMALL; -+ goto out; -+ } -+ p = exp_xdr_encode_opaque(p, &flp->lg_fh_list[i].fh_base, fhlen); -+ } -+ -+ /* Set number of bytes encoded = total_bytes_encoded - length var */ -+ len = (char 
*)p - (char *)layoutlen_p; -+ exp_xdr_encode_u32(layoutlen_p, len - 4); -+ -+ nfserr = NFS4_OK; -+out: -+ dprintk("%s: End err %u xdrlen %d\n", -+ __func__, nfserr, len); -+ return nfserr; -+} -+EXPORT_SYMBOL(filelayout_encode_layout); -diff --git a/fs/exportfs/pnfs_osd_xdr_srv.c b/fs/exportfs/pnfs_osd_xdr_srv.c -new file mode 100644 -index 0000000..60df0df ---- /dev/null -+++ b/fs/exportfs/pnfs_osd_xdr_srv.c -@@ -0,0 +1,289 @@ -+/* -+ * pnfs_osd_xdr_enc.c -+ * -+ * Object-Based pNFS Layout XDR layer -+ * -+ * Copyright (C) 2007-2009 Panasas Inc. -+ * All rights reserved. -+ * -+ * Benny Halevy -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 -+ * See the file COPYING included with this distribution for more details. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. Neither the name of the Panasas company nor the names of its -+ * contributors may be used to endorse or promote products derived -+ * from this software without specific prior written permission. -+ * -+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED -+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ */ -+ -+#include -+#include -+ -+/* -+ * struct pnfs_osd_data_map { -+ * u32 odm_num_comps; -+ * u64 odm_stripe_unit; -+ * u32 odm_group_width; -+ * u32 odm_group_depth; -+ * u32 odm_mirror_cnt; -+ * u32 odm_raid_algorithm; -+ * }; -+ */ -+static int pnfs_osd_xdr_encode_data_map( -+ struct exp_xdr_stream *xdr, -+ struct pnfs_osd_data_map *data_map) -+{ -+ __be32 *p = exp_xdr_reserve_qwords(xdr, 1+2+1+1+1+1); -+ -+ if (!p) -+ return -E2BIG; -+ -+ p = exp_xdr_encode_u32(p, data_map->odm_num_comps); -+ p = exp_xdr_encode_u64(p, data_map->odm_stripe_unit); -+ p = exp_xdr_encode_u32(p, data_map->odm_group_width); -+ p = exp_xdr_encode_u32(p, data_map->odm_group_depth); -+ p = exp_xdr_encode_u32(p, data_map->odm_mirror_cnt); -+ p = exp_xdr_encode_u32(p, data_map->odm_raid_algorithm); -+ -+ return 0; -+} -+ -+/* -+ * struct pnfs_osd_objid { -+ * struct pnfs_deviceid oid_device_id; -+ * u64 oid_partition_id; -+ * u64 oid_object_id; -+ * }; -+ */ -+static inline int pnfs_osd_xdr_encode_objid( -+ struct exp_xdr_stream *xdr, -+ struct pnfs_osd_objid *object_id) -+{ -+ __be32 *p = exp_xdr_reserve_qwords(xdr, 2+2+2+2); -+ struct nfsd4_pnfs_deviceid *dev_id = -+ (struct nfsd4_pnfs_deviceid *)&object_id->oid_device_id; -+ -+ if (!p) -+ return -E2BIG; -+ -+ p = exp_xdr_encode_u64(p, dev_id->sbid); -+ p = exp_xdr_encode_u64(p, dev_id->devid); -+ p = exp_xdr_encode_u64(p, object_id->oid_partition_id); -+ p = exp_xdr_encode_u64(p, 
object_id->oid_object_id); -+ -+ return 0; -+} -+ -+/* -+ * enum pnfs_osd_cap_key_sec4 { -+ * PNFS_OSD_CAP_KEY_SEC_NONE = 0, -+ * PNFS_OSD_CAP_KEY_SEC_SSV = 1 -+ * }; -+ * -+ * struct pnfs_osd_object_cred { -+ * struct pnfs_osd_objid oc_object_id; -+ * u32 oc_osd_version; -+ * u32 oc_cap_key_sec; -+ * struct pnfs_osd_opaque_cred oc_cap_key -+ * struct pnfs_osd_opaque_cred oc_cap; -+ * }; -+ */ -+static int pnfs_osd_xdr_encode_object_cred( -+ struct exp_xdr_stream *xdr, -+ struct pnfs_osd_object_cred *olo_comp) -+{ -+ __be32 *p; -+ int err; -+ -+ err = pnfs_osd_xdr_encode_objid(xdr, &olo_comp->oc_object_id); -+ if (err) -+ return err; -+ -+ p = exp_xdr_reserve_space(xdr, 3*4 + 4+olo_comp->oc_cap.cred_len); -+ if (!p) -+ return -E2BIG; -+ -+ p = exp_xdr_encode_u32(p, olo_comp->oc_osd_version); -+ -+ /* No sec for now */ -+ p = exp_xdr_encode_u32(p, PNFS_OSD_CAP_KEY_SEC_NONE); -+ p = exp_xdr_encode_u32(p, 0); /* opaque oc_capability_key<> */ -+ -+ exp_xdr_encode_opaque(p, olo_comp->oc_cap.cred, -+ olo_comp->oc_cap.cred_len); -+ -+ return 0; -+} -+ -+/* -+ * struct pnfs_osd_layout { -+ * struct pnfs_osd_data_map olo_map; -+ * u32 olo_comps_index; -+ * u32 olo_num_comps; -+ * struct pnfs_osd_object_cred *olo_comps; -+ * }; -+ */ -+int pnfs_osd_xdr_encode_layout( -+ struct exp_xdr_stream *xdr, -+ struct pnfs_osd_layout *pol) -+{ -+ __be32 *p; -+ u32 i; -+ int err; -+ -+ err = pnfs_osd_xdr_encode_data_map(xdr, &pol->olo_map); -+ if (err) -+ return err; -+ -+ p = exp_xdr_reserve_qwords(xdr, 2); -+ if (!p) -+ return -E2BIG; -+ -+ p = exp_xdr_encode_u32(p, pol->olo_comps_index); -+ p = exp_xdr_encode_u32(p, pol->olo_num_comps); -+ -+ for (i = 0; i < pol->olo_num_comps; i++) { -+ err = pnfs_osd_xdr_encode_object_cred(xdr, &pol->olo_comps[i]); -+ if (err) -+ return err; -+ } -+ -+ return 0; -+} -+EXPORT_SYMBOL(pnfs_osd_xdr_encode_layout); -+ -+static int _encode_string(struct exp_xdr_stream *xdr, -+ const struct nfs4_string *str) -+{ -+ __be32 *p = exp_xdr_reserve_space(xdr, 4 
+ str->len); -+ -+ if (!p) -+ return -E2BIG; -+ exp_xdr_encode_opaque(p, str->data, str->len); -+ return 0; -+} -+ -+/* struct pnfs_osd_deviceaddr { -+ * struct pnfs_osd_targetid oda_targetid; -+ * struct pnfs_osd_targetaddr oda_targetaddr; -+ * u8 oda_lun[8]; -+ * struct nfs4_string oda_systemid; -+ * struct pnfs_osd_object_cred oda_root_obj_cred; -+ * struct nfs4_string oda_osdname; -+ * }; -+ */ -+int pnfs_osd_xdr_encode_deviceaddr( -+ struct exp_xdr_stream *xdr, struct pnfs_osd_deviceaddr *devaddr) -+{ -+ __be32 *p; -+ int err; -+ -+ p = exp_xdr_reserve_space(xdr, 4 + 4 + sizeof(devaddr->oda_lun)); -+ if (!p) -+ return -E2BIG; -+ -+ /* Empty oda_targetid */ -+ p = exp_xdr_encode_u32(p, OBJ_TARGET_ANON); -+ -+ /* Empty oda_targetaddr for now */ -+ p = exp_xdr_encode_u32(p, 0); -+ -+ /* oda_lun */ -+ exp_xdr_encode_bytes(p, devaddr->oda_lun, sizeof(devaddr->oda_lun)); -+ -+ err = _encode_string(xdr, &devaddr->oda_systemid); -+ if (err) -+ return err; -+ -+ err = pnfs_osd_xdr_encode_object_cred(xdr, -+ &devaddr->oda_root_obj_cred); -+ if (err) -+ return err; -+ -+ err = _encode_string(xdr, &devaddr->oda_osdname); -+ if (err) -+ return err; -+ -+ return 0; -+} -+EXPORT_SYMBOL(pnfs_osd_xdr_encode_deviceaddr); -+ -+/* -+ * struct pnfs_osd_layoutupdate { -+ * u32 dsu_valid; -+ * s64 dsu_delta; -+ * u32 olu_ioerr_flag; -+ * }; -+ */ -+__be32 * -+pnfs_osd_xdr_decode_layoutupdate(struct pnfs_osd_layoutupdate *lou, __be32 *p) -+{ -+ lou->dsu_valid = be32_to_cpu(*p++); -+ if (lou->dsu_valid) -+ p = xdr_decode_hyper(p, &lou->dsu_delta); -+ lou->olu_ioerr_flag = be32_to_cpu(*p++); -+ return p; -+} -+EXPORT_SYMBOL(pnfs_osd_xdr_decode_layoutupdate); -+ -+/* -+ * struct pnfs_osd_objid { -+ * struct pnfs_deviceid oid_device_id; -+ * u64 oid_partition_id; -+ * u64 oid_object_id; -+ * }; -+ */ -+static inline __be32 * -+pnfs_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid) -+{ -+ /* FIXME: p = xdr_decode_fixed(...) 
*/ -+ memcpy(objid->oid_device_id.data, p, sizeof(objid->oid_device_id.data)); -+ p += XDR_QUADLEN(sizeof(objid->oid_device_id.data)); -+ -+ p = xdr_decode_hyper(p, &objid->oid_partition_id); -+ p = xdr_decode_hyper(p, &objid->oid_object_id); -+ return p; -+} -+ -+/* -+ * struct pnfs_osd_ioerr { -+ * struct pnfs_osd_objid oer_component; -+ * u64 oer_comp_offset; -+ * u64 oer_comp_length; -+ * u32 oer_iswrite; -+ * u32 oer_errno; -+ * }; -+ */ -+__be32 * -+pnfs_osd_xdr_decode_ioerr(struct pnfs_osd_ioerr *ioerr, __be32 *p) -+{ -+ p = pnfs_osd_xdr_decode_objid(p, &ioerr->oer_component); -+ p = xdr_decode_hyper(p, &ioerr->oer_comp_offset); -+ p = xdr_decode_hyper(p, &ioerr->oer_comp_length); -+ ioerr->oer_iswrite = be32_to_cpu(*p++); -+ ioerr->oer_errno = be32_to_cpu(*p++); -+ return p; -+} -+EXPORT_SYMBOL(pnfs_osd_xdr_decode_ioerr); -diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c -index 4d4b1e8..efb0a44 100644 ---- a/fs/gfs2/ops_fstype.c -+++ b/fs/gfs2/ops_fstype.c -@@ -18,6 +18,7 @@ - #include - #include - #include -+#include - - #include "gfs2.h" - #include "incore.h" -@@ -1166,6 +1167,9 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent - sb->s_magic = GFS2_MAGIC; - sb->s_op = &gfs2_super_ops; - sb->s_export_op = &gfs2_export_ops; -+#if defined(CONFIG_PNFSD) -+ sb->s_pnfs_op = &pnfs_dlm_export_ops; -+#endif /* CONFIG_PNFSD */ - sb->s_xattr = gfs2_xattr_handlers; - sb->s_qcop = &gfs2_quotactl_ops; - sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; -diff --git a/fs/inode.c b/fs/inode.c -index 8646433..e415be4 100644 ---- a/fs/inode.c -+++ b/fs/inode.c -@@ -172,15 +172,21 @@ int inode_init_always(struct super_block *sb, struct inode *inode) - mapping->writeback_index = 0; - - /* -- * If the block_device provides a backing_dev_info for client -- * inodes then use that. Otherwise the inode share the bdev's -- * backing_dev_info. -+ * If the filesystem provides a backing_dev_info for client inodes -+ * then use that. 
Otherwise inodes share default_backing_dev_info. - */ -- if (sb->s_bdev) { -- struct backing_dev_info *bdi; -- -- bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; -- mapping->backing_dev_info = bdi; -+ if (sb->s_bdi && sb->s_bdi != &noop_backing_dev_info) { -+ /* -+ * Catch cases where filesystem might be bitten by using s_bdi -+ * instead of sb->s_bdev. Can be removed in 2.6.38. -+ */ -+ if (sb->s_bdev) { -+ struct backing_dev_info *bdi = -+ sb->s_bdev->bd_inode->i_mapping->backing_dev_info; -+ WARN(bdi != sb->s_bdi, "s_bdev bdi %s != s_bdi %s\n", -+ bdi->name, sb->s_bdi->name); -+ } -+ mapping->backing_dev_info = sb->s_bdi; - } - inode->i_private = NULL; - inode->i_mapping = mapping; -diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig -index f7e13db..0da8d28 100644 ---- a/fs/nfs/Kconfig -+++ b/fs/nfs/Kconfig -@@ -76,10 +76,42 @@ config NFS_V4 - - config NFS_V4_1 - bool "NFS client support for NFSv4.1 (EXPERIMENTAL)" -- depends on NFS_V4 && EXPERIMENTAL -+ depends on NFS_FS && NFS_V4 && EXPERIMENTAL -+ select PNFS_FILE_LAYOUT - help - This option enables support for minor version 1 of the NFSv4 protocol -- (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client. -+ (RFC 5661) in the kernel's NFS client. -+ -+ If unsure, say N. -+ -+config PNFS_FILE_LAYOUT -+ tristate -+ -+config PNFS_OBJLAYOUT -+ tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" -+ depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD -+ help -+ Say M here if you want your pNFS client to support the Objects Layout Driver. -+ Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and -+ upper level driver (SCSI_OSD_ULD). -+ -+ If unsure, say N. -+ -+config PNFS_PANLAYOUT -+ tristate "Provide support for the Panasas OSD Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" -+ depends on PNFS_OBJLAYOUT -+ help -+ Say M or y here if you want your pNFS client to support the Panasas OSD Layout Driver. -+ -+ If unsure, say N. 
-+ -+config PNFS_BLOCK -+ tristate "Provide a pNFS block client (EXPERIMENTAL)" -+ depends on NFS_FS && NFS_V4_1 -+ select MD -+ select BLK_DEV_DM -+ help -+ Say M or y here if you want your pNfs client to support the block protocol - - If unsure, say N. - -diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile -index da7fda6..e68c498 100644 ---- a/fs/nfs/Makefile -+++ b/fs/nfs/Makefile -@@ -15,5 +15,12 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \ - delegation.o idmap.o \ - callback.o callback_xdr.o callback_proc.o \ - nfs4namespace.o -+nfs-$(CONFIG_NFS_V4_1) += pnfs.o - nfs-$(CONFIG_SYSCTL) += sysctl.o - nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o -+ -+obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o -+nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o -+ -+obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ -+obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ -diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile -new file mode 100644 -index 0000000..5a4bf3d ---- /dev/null -+++ b/fs/nfs/blocklayout/Makefile -@@ -0,0 +1,6 @@ -+# -+# Makefile for the pNFS block layout driver kernel module -+# -+obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o -+blocklayoutdriver-objs := blocklayout.o blocklayoutdev.o blocklayoutdm.o \ -+ extents.o block-device-discovery-pipe.o -diff --git a/fs/nfs/blocklayout/block-device-discovery-pipe.c b/fs/nfs/blocklayout/block-device-discovery-pipe.c -new file mode 100644 -index 0000000..e4c199f ---- /dev/null -+++ b/fs/nfs/blocklayout/block-device-discovery-pipe.c -@@ -0,0 +1,66 @@ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "blocklayout.h" -+ -+#define NFSDBG_FACILITY NFSDBG_PNFS_LD -+ -+struct pipefs_list bl_device_list; -+struct dentry *bl_device_pipe; -+ -+ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, size_t len) -+{ -+ int err; -+ struct pipefs_hdr *msg; -+ -+ dprintk("Entering %s...\n", __func__); -+ -+ msg = 
pipefs_readmsg(filp, src, len); -+ if (IS_ERR(msg)) { -+ dprintk("ERROR: unable to read pipefs message.\n"); -+ return PTR_ERR(msg); -+ } -+ -+ /* now assign the result, which wakes the blocked thread */ -+ err = pipefs_assign_upcall_reply(msg, &bl_device_list); -+ if (err) { -+ dprintk("ERROR: failed to assign upcall with id %u\n", -+ msg->msgid); -+ kfree(msg); -+ } -+ return len; -+} -+ -+static const struct rpc_pipe_ops bl_pipe_ops = { -+ .upcall = pipefs_generic_upcall, -+ .downcall = bl_pipe_downcall, -+ .destroy_msg = pipefs_generic_destroy_msg, -+}; -+ -+int bl_pipe_init(void) -+{ -+ dprintk("%s: block_device pipefs registering...\n", __func__); -+ bl_device_pipe = pipefs_mkpipe("bl_device_pipe", &bl_pipe_ops, 1); -+ if (IS_ERR(bl_device_pipe)) -+ dprintk("ERROR, unable to make block_device pipe\n"); -+ -+ if (!bl_device_pipe) -+ dprintk("bl_device_pipe is NULL!\n"); -+ else -+ dprintk("bl_device_pipe created!\n"); -+ pipefs_init_list(&bl_device_list); -+ return 0; -+} -+ -+void bl_pipe_exit(void) -+{ -+ dprintk("%s: block_device pipefs unregistering...\n", __func__); -+ if (IS_ERR(bl_device_pipe)) -+ return ; -+ pipefs_closepipe(bl_device_pipe); -+ return; -+} -diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c -new file mode 100644 -index 0000000..b3ab4cb ---- /dev/null -+++ b/fs/nfs/blocklayout/blocklayout.c -@@ -0,0 +1,1146 @@ -+/* -+ * linux/fs/nfs/blocklayout/blocklayout.c -+ * -+ * Module for the NFSv4.1 pNFS block layout driver. -+ * -+ * Copyright (c) 2006 The Regents of the University of Michigan. -+ * All rights reserved. -+ * -+ * Andy Adamson -+ * Fred Isaman -+ * -+ * permission is granted to use, copy, create derivative works and -+ * redistribute this software and such derivative works for any purpose, -+ * so long as the name of the university of michigan is not used in -+ * any advertising or publicity pertaining to the use or distribution -+ * of this software without specific, written prior authorization. 
if -+ * the above copyright notice or any other identification of the -+ * university of michigan is included in any copy of any portion of -+ * this software, then the disclaimer below must also be included. -+ * -+ * this software is provided as is, without representation from the -+ * university of michigan as to its fitness for any purpose, and without -+ * warranty by the university of michigan of any kind, either express -+ * or implied, including without limitation the implied warranties of -+ * merchantability and fitness for a particular purpose. the regents -+ * of the university of michigan shall not be liable for any damages, -+ * including special, indirect, incidental, or consequential damages, -+ * with respect to any claim arising out or in connection with the use -+ * of the software, even if it has been or is hereafter advised of the -+ * possibility of such damages. -+ */ -+#include -+#include -+ -+#include /* various write calls */ -+#include /* struct bio */ -+#include -+#include "blocklayout.h" -+ -+#define NFSDBG_FACILITY NFSDBG_PNFS_LD -+ -+MODULE_LICENSE("GPL"); -+MODULE_AUTHOR("Andy Adamson "); -+MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver"); -+ -+/* Callback operations to the pNFS client */ -+ -+static void print_page(struct page *page) -+{ -+ dprintk("PRINTPAGE page %p\n", page); -+ dprintk(" PagePrivate %d\n", PagePrivate(page)); -+ dprintk(" PageUptodate %d\n", PageUptodate(page)); -+ dprintk(" PageError %d\n", PageError(page)); -+ dprintk(" PageDirty %d\n", PageDirty(page)); -+ dprintk(" PageReferenced %d\n", PageReferenced(page)); -+ dprintk(" PageLocked %d\n", PageLocked(page)); -+ dprintk(" PageWriteback %d\n", PageWriteback(page)); -+ dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page)); -+ dprintk("\n"); -+} -+ -+/* Given the be associated with isect, determine if page data needs to be -+ * initialized. 
-+ */ -+static int is_hole(struct pnfs_block_extent *be, sector_t isect) -+{ -+ if (be->be_state == PNFS_BLOCK_NONE_DATA) -+ return 1; -+ else if (be->be_state != PNFS_BLOCK_INVALID_DATA) -+ return 0; -+ else -+ return !is_sector_initialized(be->be_inval, isect); -+} -+ -+/* Given the be associated with isect, determine if page data can be -+ * written to disk. -+ */ -+static int is_writable(struct pnfs_block_extent *be, sector_t isect) -+{ -+ if (be->be_state == PNFS_BLOCK_READWRITE_DATA) -+ return 1; -+ else if (be->be_state != PNFS_BLOCK_INVALID_DATA) -+ return 0; -+ else -+ return is_sector_initialized(be->be_inval, isect); -+} -+ -+static int -+dont_like_caller(struct nfs_page *req) -+{ -+ if (atomic_read(&req->wb_complete)) { -+ /* Called by _multi */ -+ return 1; -+ } else { -+ /* Called by _one */ -+ return 0; -+ } -+} -+ -+static enum pnfs_try_status -+bl_commit(struct nfs_write_data *nfs_data, -+ int sync) -+{ -+ dprintk("%s enter\n", __func__); -+ return PNFS_NOT_ATTEMPTED; -+} -+ -+/* The data we are handed might be spread across several bios. We need -+ * to track when the last one is finished. 
-+ */ -+struct parallel_io { -+ struct kref refcnt; -+ struct rpc_call_ops call_ops; -+ void (*pnfs_callback) (void *data); -+ void *data; -+}; -+ -+static inline struct parallel_io *alloc_parallel(void *data) -+{ -+ struct parallel_io *rv; -+ -+ rv = kmalloc(sizeof(*rv), GFP_KERNEL); -+ if (rv) { -+ rv->data = data; -+ kref_init(&rv->refcnt); -+ } -+ return rv; -+} -+ -+static inline void get_parallel(struct parallel_io *p) -+{ -+ kref_get(&p->refcnt); -+} -+ -+static void destroy_parallel(struct kref *kref) -+{ -+ struct parallel_io *p = container_of(kref, struct parallel_io, refcnt); -+ -+ dprintk("%s enter\n", __func__); -+ p->pnfs_callback(p->data); -+ kfree(p); -+} -+ -+static inline void put_parallel(struct parallel_io *p) -+{ -+ kref_put(&p->refcnt, destroy_parallel); -+} -+ -+static struct bio * -+bl_submit_bio(int rw, struct bio *bio) -+{ -+ if (bio) { -+ get_parallel(bio->bi_private); -+ dprintk("%s submitting %s bio %u@%llu\n", __func__, -+ rw == READ ? "read" : "write", -+ bio->bi_size, (u64)bio->bi_sector); -+ submit_bio(rw, bio); -+ } -+ return NULL; -+} -+ -+static inline void -+bl_done_with_rpage(struct page *page, const int ok) -+{ -+ if (ok) { -+ ClearPagePnfsErr(page); -+ SetPageUptodate(page); -+ } else { -+ ClearPageUptodate(page); -+ SetPageError(page); -+ SetPagePnfsErr(page); -+ } -+ /* Page is unlocked via rpc_release. Should really be done here. 
*/ -+} -+ -+/* This is basically copied from mpage_end_io_read */ -+static void bl_end_io_read(struct bio *bio, int err) -+{ -+ void *data = bio->bi_private; -+ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); -+ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; -+ -+ do { -+ struct page *page = bvec->bv_page; -+ -+ if (--bvec >= bio->bi_io_vec) -+ prefetchw(&bvec->bv_page->flags); -+ bl_done_with_rpage(page, uptodate); -+ } while (bvec >= bio->bi_io_vec); -+ bio_put(bio); -+ put_parallel(data); -+} -+ -+static void bl_read_cleanup(struct work_struct *work) -+{ -+ struct rpc_task *task; -+ struct nfs_read_data *rdata; -+ dprintk("%s enter\n", __func__); -+ task = container_of(work, struct rpc_task, u.tk_work); -+ rdata = container_of(task, struct nfs_read_data, task); -+ pnfs_read_done(rdata); -+} -+ -+static void -+bl_end_par_io_read(void *data) -+{ -+ struct nfs_read_data *rdata = data; -+ -+ INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup); -+ schedule_work(&rdata->task.u.tk_work); -+} -+ -+/* We don't want normal .rpc_call_done callback used, so we replace it -+ * with this stub. 
-+ */ -+static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata) -+{ -+ return; -+} -+ -+static enum pnfs_try_status -+bl_read_pagelist(struct nfs_read_data *rdata, -+ unsigned nr_pages) -+{ -+ int i, hole; -+ struct bio *bio = NULL; -+ struct pnfs_block_extent *be = NULL, *cow_read = NULL; -+ sector_t isect, extent_length = 0; -+ struct parallel_io *par; -+ loff_t f_offset = rdata->args.offset; -+ size_t count = rdata->args.count; -+ struct page **pages = rdata->args.pages; -+ int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT; -+ -+ dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__, -+ nr_pages, f_offset, count); -+ -+ if (dont_like_caller(rdata->req)) { -+ dprintk("%s dont_like_caller failed\n", __func__); -+ goto use_mds; -+ } -+ if ((nr_pages == 1) && PagePnfsErr(rdata->req->wb_page)) { -+ /* We want to fall back to mds in case of read_page -+ * after error on read_pages. -+ */ -+ dprintk("%s PG_pnfserr set\n", __func__); -+ goto use_mds; -+ } -+ par = alloc_parallel(rdata); -+ if (!par) -+ goto use_mds; -+ par->call_ops = *rdata->pdata.call_ops; -+ par->call_ops.rpc_call_done = bl_rpc_do_nothing; -+ par->pnfs_callback = bl_end_par_io_read; -+ /* At this point, we can no longer jump to use_mds */ -+ -+ isect = (sector_t) (f_offset >> 9); -+ /* Code assumes extents are page-aligned */ -+ for (i = pg_index; i < nr_pages; i++) { -+ if (!extent_length) { -+ /* We've used up the previous extent */ -+ put_extent(be); -+ put_extent(cow_read); -+ bio = bl_submit_bio(READ, bio); -+ /* Get the next one */ -+ be = find_get_extent(BLK_LSEG2EXT(rdata->pdata.lseg), -+ isect, &cow_read); -+ if (!be) { -+ /* Error out this page */ -+ bl_done_with_rpage(pages[i], 0); -+ break; -+ } -+ extent_length = be->be_length - -+ (isect - be->be_f_offset); -+ if (cow_read) { -+ sector_t cow_length = cow_read->be_length - -+ (isect - cow_read->be_f_offset); -+ extent_length = min(extent_length, cow_length); -+ } -+ } -+ hole = is_hole(be, isect); -+ if (hole 
&& !cow_read) { -+ bio = bl_submit_bio(READ, bio); -+ /* Fill hole w/ zeroes w/o accessing device */ -+ dprintk("%s Zeroing page for hole\n", __func__); -+ zero_user(pages[i], 0, -+ min_t(int, PAGE_CACHE_SIZE, count)); -+ print_page(pages[i]); -+ bl_done_with_rpage(pages[i], 1); -+ } else { -+ struct pnfs_block_extent *be_read; -+ -+ be_read = (hole && cow_read) ? cow_read : be; -+ for (;;) { -+ if (!bio) { -+ bio = bio_alloc(GFP_NOIO, nr_pages - i); -+ if (!bio) { -+ /* Error out this page */ -+ bl_done_with_rpage(pages[i], 0); -+ break; -+ } -+ bio->bi_sector = isect - -+ be_read->be_f_offset + -+ be_read->be_v_offset; -+ bio->bi_bdev = be_read->be_mdev; -+ bio->bi_end_io = bl_end_io_read; -+ bio->bi_private = par; -+ } -+ if (bio_add_page(bio, pages[i], PAGE_SIZE, 0)) -+ break; -+ bio = bl_submit_bio(READ, bio); -+ } -+ } -+ isect += PAGE_CACHE_SIZE >> 9; -+ extent_length -= PAGE_CACHE_SIZE >> 9; -+ } -+ if ((isect << 9) >= rdata->inode->i_size) { -+ rdata->res.eof = 1; -+ rdata->res.count = rdata->inode->i_size - f_offset; -+ } else { -+ rdata->res.count = (isect << 9) - f_offset; -+ } -+ put_extent(be); -+ put_extent(cow_read); -+ bl_submit_bio(READ, bio); -+ put_parallel(par); -+ return PNFS_ATTEMPTED; -+ -+ use_mds: -+ dprintk("Giving up and using normal NFS\n"); -+ return PNFS_NOT_ATTEMPTED; -+} -+ -+static void mark_extents_written(struct pnfs_block_layout *bl, -+ __u64 offset, __u32 count) -+{ -+ sector_t isect, end; -+ struct pnfs_block_extent *be; -+ -+ dprintk("%s(%llu, %u)\n", __func__, offset, count); -+ if (count == 0) -+ return; -+ isect = (offset & (long)(PAGE_CACHE_MASK)) >> 9; -+ end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK); -+ end >>= 9; -+ while (isect < end) { -+ sector_t len; -+ be = find_get_extent(bl, isect, NULL); -+ BUG_ON(!be); /* FIXME */ -+ len = min(end, be->be_f_offset + be->be_length) - isect; -+ if (be->be_state == PNFS_BLOCK_INVALID_DATA) -+ mark_for_commit(be, isect, len); /* What if fails? 
*/ -+ isect += len; -+ put_extent(be); -+ } -+} -+ -+/* STUB - this needs thought */ -+static inline void -+bl_done_with_wpage(struct page *page, const int ok) -+{ -+ if (!ok) { -+ SetPageError(page); -+ SetPagePnfsErr(page); -+ /* This is an inline copy of nfs_zap_mapping */ -+ /* This is oh so fishy, and needs deep thought */ -+ if (page->mapping->nrpages != 0) { -+ struct inode *inode = page->mapping->host; -+ spin_lock(&inode->i_lock); -+ NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; -+ spin_unlock(&inode->i_lock); -+ } -+ } -+ /* end_page_writeback called in rpc_release. Should be done here. */ -+} -+ -+/* This is basically copied from mpage_end_io_read */ -+static void bl_end_io_write(struct bio *bio, int err) -+{ -+ void *data = bio->bi_private; -+ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); -+ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; -+ -+ do { -+ struct page *page = bvec->bv_page; -+ -+ if (--bvec >= bio->bi_io_vec) -+ prefetchw(&bvec->bv_page->flags); -+ bl_done_with_wpage(page, uptodate); -+ } while (bvec >= bio->bi_io_vec); -+ bio_put(bio); -+ put_parallel(data); -+} -+ -+/* Function scheduled for call during bl_end_par_io_write, -+ * it marks sectors as written and extends the commitlist. 
-+ */ -+static void bl_write_cleanup(struct work_struct *work) -+{ -+ struct rpc_task *task; -+ struct nfs_write_data *wdata; -+ dprintk("%s enter\n", __func__); -+ task = container_of(work, struct rpc_task, u.tk_work); -+ wdata = container_of(task, struct nfs_write_data, task); -+ if (!wdata->task.tk_status) { -+ /* Marks for LAYOUTCOMMIT */ -+ /* BUG - this should be called after each bio, not after -+ * all finish, unless have some way of storing success/failure -+ */ -+ mark_extents_written(BLK_LSEG2EXT(wdata->pdata.lseg), -+ wdata->args.offset, wdata->args.count); -+ } -+ pnfs_writeback_done(wdata); -+} -+ -+/* Called when last of bios associated with a bl_write_pagelist call finishes */ -+static void -+bl_end_par_io_write(void *data) -+{ -+ struct nfs_write_data *wdata = data; -+ -+ /* STUB - ignoring error handling */ -+ wdata->task.tk_status = 0; -+ wdata->verf.committed = NFS_FILE_SYNC; -+ INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup); -+ schedule_work(&wdata->task.u.tk_work); -+} -+ -+static enum pnfs_try_status -+bl_write_pagelist(struct nfs_write_data *wdata, -+ unsigned nr_pages, -+ int sync) -+{ -+ int i; -+ struct bio *bio = NULL; -+ struct pnfs_block_extent *be = NULL; -+ sector_t isect, extent_length = 0; -+ struct parallel_io *par; -+ loff_t offset = wdata->args.offset; -+ size_t count = wdata->args.count; -+ struct page **pages = wdata->args.pages; -+ int pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT; -+ -+ dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); -+ if (!wdata->req->wb_lseg) { -+ dprintk("%s no lseg, falling back to MDS\n", __func__); -+ return PNFS_NOT_ATTEMPTED; -+ } -+ if (dont_like_caller(wdata->req)) { -+ dprintk("%s dont_like_caller failed\n", __func__); -+ return PNFS_NOT_ATTEMPTED; -+ } -+ /* At this point, wdata->pages is a (sequential) list of nfs_pages. -+ * We want to write each, and if there is an error remove it from -+ * list and call -+ * nfs_retry_request(req) to have it redone using nfs. -+ * QUEST? 
Do as block or per req? Think have to do per block -+ * as part of end_bio -+ */ -+ par = alloc_parallel(wdata); -+ if (!par) -+ return PNFS_NOT_ATTEMPTED; -+ par->call_ops = *wdata->pdata.call_ops; -+ par->call_ops.rpc_call_done = bl_rpc_do_nothing; -+ par->pnfs_callback = bl_end_par_io_write; -+ /* At this point, have to be more careful with error handling */ -+ -+ isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> 9); -+ for (i = pg_index; i < nr_pages; i++) { -+ if (!extent_length) { -+ /* We've used up the previous extent */ -+ put_extent(be); -+ bio = bl_submit_bio(WRITE, bio); -+ /* Get the next one */ -+ be = find_get_extent(BLK_LSEG2EXT(wdata->pdata.lseg), -+ isect, NULL); -+ if (!be || !is_writable(be, isect)) { -+ /* FIXME */ -+ bl_done_with_wpage(pages[i], 0); -+ break; -+ } -+ extent_length = be->be_length - -+ (isect - be->be_f_offset); -+ } -+ for (;;) { -+ if (!bio) { -+ bio = bio_alloc(GFP_NOIO, nr_pages - i); -+ if (!bio) { -+ /* Error out this page */ -+ /* FIXME */ -+ bl_done_with_wpage(pages[i], 0); -+ break; -+ } -+ bio->bi_sector = isect - be->be_f_offset + -+ be->be_v_offset; -+ bio->bi_bdev = be->be_mdev; -+ bio->bi_end_io = bl_end_io_write; -+ bio->bi_private = par; -+ } -+ if (bio_add_page(bio, pages[i], PAGE_SIZE, 0)) -+ break; -+ bio = bl_submit_bio(WRITE, bio); -+ } -+ isect += PAGE_CACHE_SIZE >> 9; -+ extent_length -= PAGE_CACHE_SIZE >> 9; -+ } -+ wdata->res.count = (isect << 9) - (offset & (long)PAGE_CACHE_MASK); -+ put_extent(be); -+ bl_submit_bio(WRITE, bio); -+ put_parallel(par); -+ return PNFS_ATTEMPTED; -+} -+ -+/* FIXME - range ignored */ -+static void -+release_extents(struct pnfs_block_layout *bl, -+ struct pnfs_layout_range *range) -+{ -+ int i; -+ struct pnfs_block_extent *be; -+ -+ spin_lock(&bl->bl_ext_lock); -+ for (i = 0; i < EXTENT_LISTS; i++) { -+ while (!list_empty(&bl->bl_extents[i])) { -+ be = list_first_entry(&bl->bl_extents[i], -+ struct pnfs_block_extent, -+ be_node); -+ list_del(&be->be_node); -+ 
put_extent(be); -+ } -+ } -+ spin_unlock(&bl->bl_ext_lock); -+} -+ -+static void -+release_inval_marks(struct pnfs_inval_markings *marks) -+{ -+ struct pnfs_inval_tracking *pos, *temp; -+ -+ list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) { -+ list_del(&pos->it_link); -+ kfree(pos); -+ } -+ return; -+} -+ -+/* Note we are relying on caller locking to prevent nasty races. */ -+static void -+bl_free_layout_hdr(struct pnfs_layout_hdr *lo) -+{ -+ struct pnfs_block_layout *bl = BLK_LO2EXT(lo); -+ -+ dprintk("%s enter\n", __func__); -+ release_extents(bl, NULL); -+ release_inval_marks(&bl->bl_inval); -+ kfree(bl); -+} -+ -+static struct pnfs_layout_hdr * -+bl_alloc_layout_hdr(struct inode *inode) -+{ -+ struct pnfs_block_layout *bl; -+ -+ dprintk("%s enter\n", __func__); -+ bl = kzalloc(sizeof(*bl), GFP_KERNEL); -+ if (!bl) -+ return NULL; -+ spin_lock_init(&bl->bl_ext_lock); -+ INIT_LIST_HEAD(&bl->bl_extents[0]); -+ INIT_LIST_HEAD(&bl->bl_extents[1]); -+ INIT_LIST_HEAD(&bl->bl_commit); -+ bl->bl_count = 0; -+ bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> 9; -+ INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize); -+ return &bl->bl_layout; -+} -+ -+static void -+bl_free_lseg(struct pnfs_layout_segment *lseg) -+{ -+ dprintk("%s enter\n", __func__); -+ kfree(lseg); -+} -+ -+/* Because the generic infrastructure does not correctly merge layouts, -+ * we pretty much ignore lseg, and store all data layout wide, so we -+ * can correctly merge. Eventually we should push some correct merge -+ * behavior up to the generic code, as the current behavior tends to -+ * cause lots of unnecessary overlapping LAYOUTGET requests. 
-+ */ -+static struct pnfs_layout_segment * -+bl_alloc_lseg(struct pnfs_layout_hdr *lo, -+ struct nfs4_layoutget_res *lgr) -+{ -+ struct pnfs_layout_segment *lseg; -+ int status; -+ -+ dprintk("%s enter\n", __func__); -+ lseg = kzalloc(sizeof(*lseg) + 0, GFP_KERNEL); -+ if (!lseg) -+ return NULL; -+ status = nfs4_blk_process_layoutget(lo, lgr); -+ if (status) { -+ /* We don't want to call the full-blown bl_free_lseg, -+ * since on error extents were not touched. -+ */ -+ /* STUB - we really want to distinguish between 2 error -+ * conditions here. This lseg failed, but lo data structures -+ * are OK, or we hosed the lo data structures. The calling -+ * code probably needs to distinguish this too. -+ */ -+ kfree(lseg); -+ return ERR_PTR(status); -+ } -+ return lseg; -+} -+ -+static int -+bl_setup_layoutcommit(struct pnfs_layout_hdr *lo, -+ struct nfs4_layoutcommit_args *arg) -+{ -+ struct nfs_server *nfss = NFS_SERVER(lo->inode); -+ struct bl_layoutupdate_data *layoutupdate_data; -+ -+ dprintk("%s enter\n", __func__); -+ /* Need to ensure commit is block-size aligned */ -+ if (nfss->pnfs_blksize) { -+ u64 mask = nfss->pnfs_blksize - 1; -+ u64 offset = arg->range.offset & mask; -+ -+ arg->range.offset -= offset; -+ arg->range.length += offset + mask; -+ arg->range.length &= ~mask; -+ } -+ -+ layoutupdate_data = kmalloc(sizeof(struct bl_layoutupdate_data), -+ GFP_KERNEL); -+ if (unlikely(!layoutupdate_data)) -+ return -ENOMEM; -+ INIT_LIST_HEAD(&layoutupdate_data->ranges); -+ arg->layoutdriver_data = layoutupdate_data; -+ -+ return 0; -+} -+ -+static void -+bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr, -+ const struct nfs4_layoutcommit_args *arg) -+{ -+ dprintk("%s enter\n", __func__); -+ encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg); -+} -+ -+static void -+bl_cleanup_layoutcommit(struct pnfs_layout_hdr *lo, -+ struct nfs4_layoutcommit_args *arg, int status) -+{ -+ dprintk("%s enter\n", __func__); -+ 
clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), arg, status); -+ kfree(arg->layoutdriver_data); -+} -+ -+static void free_blk_mountid(struct block_mount_id *mid) -+{ -+ if (mid) { -+ struct pnfs_block_dev *dev; -+ spin_lock(&mid->bm_lock); -+ while (!list_empty(&mid->bm_devlist)) { -+ dev = list_first_entry(&mid->bm_devlist, -+ struct pnfs_block_dev, -+ bm_node); -+ list_del(&dev->bm_node); -+ free_block_dev(dev); -+ } -+ spin_unlock(&mid->bm_lock); -+ kfree(mid); -+ } -+} -+ -+/* This is mostly copied from the filelayout's get_device_info function. -+ * It seems much of this should be at the generic pnfs level. -+ */ -+static struct pnfs_block_dev * -+nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, -+ struct nfs4_deviceid *d_id, -+ struct list_head *sdlist) -+{ -+ struct pnfs_device *dev; -+ struct pnfs_block_dev *rv = NULL; -+ u32 max_resp_sz; -+ int max_pages; -+ struct page **pages = NULL; -+ int i, rc; -+ -+ /* -+ * Use the session max response size as the basis for setting -+ * GETDEVICEINFO's maxcount -+ */ -+ max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; -+ max_pages = max_resp_sz >> PAGE_SHIFT; -+ dprintk("%s max_resp_sz %u max_pages %d\n", -+ __func__, max_resp_sz, max_pages); -+ -+ dev = kmalloc(sizeof(*dev), GFP_KERNEL); -+ if (!dev) { -+ dprintk("%s kmalloc failed\n", __func__); -+ return NULL; -+ } -+ -+ pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL); -+ if (pages == NULL) { -+ kfree(dev); -+ return NULL; -+ } -+ for (i = 0; i < max_pages; i++) { -+ pages[i] = alloc_page(GFP_KERNEL); -+ if (!pages[i]) -+ goto out_free; -+ } -+ -+ /* set dev->area */ -+ dev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL); -+ if (!dev->area) -+ goto out_free; -+ -+ memcpy(&dev->dev_id, d_id, sizeof(*d_id)); -+ dev->layout_type = LAYOUT_BLOCK_VOLUME; -+ dev->pages = pages; -+ dev->pgbase = 0; -+ dev->pglen = PAGE_SIZE * max_pages; -+ dev->mincount = 0; -+ -+ dprintk("%s: dev_id: %s\n", __func__, 
dev->dev_id.data); -+ rc = nfs4_proc_getdeviceinfo(server, dev); -+ dprintk("%s getdevice info returns %d\n", __func__, rc); -+ if (rc) -+ goto out_free; -+ -+ rv = nfs4_blk_decode_device(server, dev, sdlist); -+ out_free: -+ if (dev->area != NULL) -+ vunmap(dev->area); -+ for (i = 0; i < max_pages; i++) -+ __free_page(pages[i]); -+ kfree(pages); -+ kfree(dev); -+ return rv; -+} -+ -+ -+/* -+ * Retrieve the list of available devices for the mountpoint. -+ */ -+static int -+bl_initialize_mountpoint(struct nfs_server *server, const struct nfs_fh *fh) -+{ -+ struct block_mount_id *b_mt_id = NULL; -+ struct pnfs_mount_type *mtype = NULL; -+ struct pnfs_devicelist *dlist = NULL; -+ struct pnfs_block_dev *bdev; -+ LIST_HEAD(block_disklist); -+ int status = 0, i; -+ -+ dprintk("%s enter\n", __func__); -+ -+ if (server->pnfs_blksize == 0) { -+ dprintk("%s Server did not return blksize\n", __func__); -+ return -EINVAL; -+ } -+ b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_KERNEL); -+ if (!b_mt_id) { -+ status = -ENOMEM; -+ goto out_error; -+ } -+ /* Initialize nfs4 block layout mount id */ -+ spin_lock_init(&b_mt_id->bm_lock); -+ INIT_LIST_HEAD(&b_mt_id->bm_devlist); -+ -+ dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_KERNEL); -+ if (!dlist) -+ goto out_error; -+ dlist->eof = 0; -+ while (!dlist->eof) { -+ status = nfs4_proc_getdevicelist(server, fh, dlist); -+ if (status) -+ goto out_error; -+ dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n", -+ __func__, dlist->num_devs, dlist->eof); -+ /* For each device returned in dlist, call GETDEVICEINFO, and -+ * decode the opaque topology encoding to create a flat -+ * volume topology, matching VOLUME_SIMPLE disk signatures -+ * to disks in the visible block disk list. -+ * Construct an LVM meta device from the flat volume topology. 
-+ */ -+ for (i = 0; i < dlist->num_devs; i++) { -+ bdev = nfs4_blk_get_deviceinfo(server, fh, -+ &dlist->dev_id[i], -+ &block_disklist); -+ if (!bdev) { -+ status = -ENODEV; -+ goto out_error; -+ } -+ spin_lock(&b_mt_id->bm_lock); -+ list_add(&bdev->bm_node, &b_mt_id->bm_devlist); -+ spin_unlock(&b_mt_id->bm_lock); -+ } -+ } -+ dprintk("%s SUCCESS\n", __func__); -+ server->pnfs_ld_data = b_mt_id; -+ -+ out_return: -+ kfree(dlist); -+ return status; -+ -+ out_error: -+ free_blk_mountid(b_mt_id); -+ kfree(mtype); -+ goto out_return; -+} -+ -+static int -+bl_uninitialize_mountpoint(struct nfs_server *server) -+{ -+ struct block_mount_id *b_mt_id = server->pnfs_ld_data; -+ -+ dprintk("%s enter\n", __func__); -+ free_blk_mountid(b_mt_id); -+ dprintk("%s RETURNS\n", __func__); -+ return 0; -+} -+ -+/* STUB - mark intersection of layout and page as bad, so is not -+ * used again. -+ */ -+static void mark_bad_read(void) -+{ -+ return; -+} -+ -+/* Copied from buffer.c */ -+static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) -+{ -+ if (uptodate) { -+ set_buffer_uptodate(bh); -+ } else { -+ /* This happens, due to failed READA attempts. 
*/ -+ clear_buffer_uptodate(bh); -+ } -+ unlock_buffer(bh); -+} -+ -+/* Copied from buffer.c */ -+static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate) -+{ -+ __end_buffer_read_notouch(bh, uptodate); -+} -+ -+/* -+ * map_block: map a requested I/0 block (isect) into an offset in the LVM -+ * meta block_device -+ */ -+static void -+map_block(sector_t isect, struct pnfs_block_extent *be, struct buffer_head *bh) -+{ -+ dprintk("%s enter be=%p\n", __func__, be); -+ -+ set_buffer_mapped(bh); -+ bh->b_bdev = be->be_mdev; -+ bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >> -+ (be->be_mdev->bd_inode->i_blkbits - 9); -+ -+ dprintk("%s isect %ld, bh->b_blocknr %ld, using bsize %Zd\n", -+ __func__, (long)isect, -+ (long)bh->b_blocknr, -+ bh->b_size); -+ return; -+} -+ -+/* Given an unmapped page, zero it (or read in page for COW), -+ * and set appropriate flags/markings, but it is safe to not initialize -+ * the range given in [from, to). -+ */ -+/* This is loosely based on nobh_write_begin */ -+static int -+init_page_for_write(struct pnfs_block_layout *bl, struct page *page, -+ unsigned from, unsigned to, sector_t **pages_to_mark) -+{ -+ struct buffer_head *bh; -+ int inval, ret = -EIO; -+ struct pnfs_block_extent *be = NULL, *cow_read = NULL; -+ sector_t isect; -+ -+ dprintk("%s enter, %p\n", __func__, page); -+ bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0); -+ if (!bh) { -+ ret = -ENOMEM; -+ goto cleanup; -+ } -+ -+ isect = (sector_t)page->index << (PAGE_CACHE_SHIFT - 9); -+ be = find_get_extent(bl, isect, &cow_read); -+ if (!be) -+ goto cleanup; -+ inval = is_hole(be, isect); -+ dprintk("%s inval=%i, from=%u, to=%u\n", __func__, inval, from, to); -+ if (inval) { -+ if (be->be_state == PNFS_BLOCK_NONE_DATA) { -+ dprintk("%s PANIC - got NONE_DATA extent %p\n", -+ __func__, be); -+ goto cleanup; -+ } -+ map_block(isect, be, bh); -+ unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); -+ } -+ if (PageUptodate(page)) { -+ /* Do nothing */ -+ 
} else if (inval & !cow_read) { -+ zero_user_segments(page, 0, from, to, PAGE_CACHE_SIZE); -+ } else if (0 < from || PAGE_CACHE_SIZE > to) { -+ struct pnfs_block_extent *read_extent; -+ -+ read_extent = (inval && cow_read) ? cow_read : be; -+ map_block(isect, read_extent, bh); -+ lock_buffer(bh); -+ bh->b_end_io = end_buffer_read_nobh; -+ submit_bh(READ, bh); -+ dprintk("%s: Waiting for buffer read\n", __func__); -+ /* XXX Don't really want to hold layout lock here */ -+ wait_on_buffer(bh); -+ if (!buffer_uptodate(bh)) -+ goto cleanup; -+ } -+ if (be->be_state == PNFS_BLOCK_INVALID_DATA) { -+ /* There is a BUG here if is a short copy after write_begin, -+ * but I think this is a generic fs bug. The problem is that -+ * we have marked the page as initialized, but it is possible -+ * that the section not copied may never get copied. -+ */ -+ ret = mark_initialized_sectors(be->be_inval, isect, -+ PAGE_CACHE_SECTORS, -+ pages_to_mark); -+ /* Want to preallocate mem so above can't fail */ -+ if (ret) -+ goto cleanup; -+ } -+ SetPageMappedToDisk(page); -+ ret = 0; -+ -+cleanup: -+ free_buffer_head(bh); -+ put_extent(be); -+ put_extent(cow_read); -+ if (ret) { -+ /* Need to mark layout with bad read...should now -+ * just use nfs4 for reads and writes. 
-+ */ -+ mark_bad_read(); -+ } -+ return ret; -+} -+ -+static int -+bl_write_begin(struct pnfs_layout_segment *lseg, struct page *page, loff_t pos, -+ unsigned count, struct pnfs_fsdata *fsdata) -+{ -+ unsigned from, to; -+ int ret; -+ sector_t *pages_to_mark = NULL; -+ struct pnfs_block_layout *bl = BLK_LSEG2EXT(lseg); -+ -+ dprintk("%s enter, %u@%lld\n", __func__, count, pos); -+ print_page(page); -+ /* The following code assumes blocksize >= PAGE_CACHE_SIZE */ -+ if (bl->bl_blocksize < (PAGE_CACHE_SIZE >> 9)) { -+ dprintk("%s Can't handle blocksize %llu\n", __func__, -+ (u64)bl->bl_blocksize); -+ put_lseg(fsdata->lseg); -+ fsdata->lseg = NULL; -+ return 0; -+ } -+ if (PageMappedToDisk(page)) { -+ /* Basically, this is a flag that says we have -+ * successfully called write_begin already on this page. -+ */ -+ /* NOTE - there are cache consistency issues here. -+ * For example, what if the layout is recalled, then regained? -+ * If the file is closed and reopened, will the page flags -+ * be reset? If not, we'll have to use layout info instead of -+ * the page flag. -+ */ -+ return 0; -+ } -+ from = pos & (PAGE_CACHE_SIZE - 1); -+ to = from + count; -+ ret = init_page_for_write(bl, page, from, to, &pages_to_mark); -+ if (ret) { -+ dprintk("%s init page failed with %i", __func__, ret); -+ /* Revert back to plain NFS and just continue on with -+ * write. This assumes there is no request attached, which -+ * should be true if we get here. -+ */ -+ BUG_ON(PagePrivate(page)); -+ put_lseg(fsdata->lseg); -+ fsdata->lseg = NULL; -+ kfree(pages_to_mark); -+ ret = 0; -+ } else { -+ fsdata->private = pages_to_mark; -+ } -+ return ret; -+} -+ -+/* CAREFUL - what happens if copied < count??? 
*/ -+static int -+bl_write_end(struct inode *inode, struct page *page, loff_t pos, -+ unsigned count, unsigned copied, struct pnfs_layout_segment *lseg) -+{ -+ dprintk("%s enter, %u@%lld, lseg=%p\n", __func__, count, pos, lseg); -+ print_page(page); -+ if (lseg) -+ SetPageUptodate(page); -+ return 0; -+} -+ -+/* Return any memory allocated to fsdata->private, and take advantage -+ * of no page locks to mark pages noted in write_begin as needing -+ * initialization. -+ */ -+static void -+bl_write_end_cleanup(struct file *filp, struct pnfs_fsdata *fsdata) -+{ -+ struct page *page; -+ pgoff_t index; -+ sector_t *pos; -+ struct address_space *mapping = filp->f_mapping; -+ struct pnfs_fsdata *fake_data; -+ struct pnfs_layout_segment *lseg; -+ -+ if (!fsdata) -+ return; -+ lseg = fsdata->lseg; -+ if (!lseg) -+ return; -+ pos = fsdata->private; -+ if (!pos) -+ return; -+ dprintk("%s enter with pos=%llu\n", __func__, (u64)(*pos)); -+ for (; *pos != ~0; pos++) { -+ index = *pos >> (PAGE_CACHE_SHIFT - 9); -+ /* XXX How do we properly deal with failures here??? */ -+ page = grab_cache_page_write_begin(mapping, index, 0); -+ if (!page) { -+ printk(KERN_ERR "%s BUG BUG BUG NoMem\n", __func__); -+ continue; -+ } -+ dprintk("%s: Examining block page\n", __func__); -+ print_page(page); -+ if (!PageMappedToDisk(page)) { -+ /* XXX How do we properly deal with failures here??? 
*/ -+ dprintk("%s Marking block page\n", __func__); -+ init_page_for_write(BLK_LSEG2EXT(fsdata->lseg), page, -+ PAGE_CACHE_SIZE, PAGE_CACHE_SIZE, -+ NULL); -+ print_page(page); -+ fake_data = kzalloc(sizeof(*fake_data), GFP_KERNEL); -+ if (!fake_data) { -+ printk(KERN_ERR "%s BUG BUG BUG NoMem\n", -+ __func__); -+ unlock_page(page); -+ continue; -+ } -+ get_lseg(lseg); -+ fake_data->lseg = lseg; -+ fake_data->bypass_eof = 1; -+ mapping->a_ops->write_end(filp, mapping, -+ index << PAGE_CACHE_SHIFT, -+ PAGE_CACHE_SIZE, -+ PAGE_CACHE_SIZE, -+ page, fake_data); -+ /* Note fake_data is freed by nfs_write_end */ -+ } else -+ unlock_page(page); -+ } -+ kfree(fsdata->private); -+ fsdata->private = NULL; -+} -+ -+/* This is called by nfs_can_coalesce_requests via nfs_pageio_do_add_request. -+ * Should return False if there is a reason requests can not be coalesced, -+ * otherwise, should default to returning True. -+ */ -+static int -+bl_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, -+ struct nfs_page *req) -+{ -+ dprintk("%s enter\n", __func__); -+ if (pgio->pg_iswrite) -+ return prev->wb_lseg == req->wb_lseg; -+ else -+ return 1; -+} -+ -+static struct pnfs_layoutdriver_type blocklayout_type = { -+ .id = LAYOUT_BLOCK_VOLUME, -+ .name = "LAYOUT_BLOCK_VOLUME", -+ .commit = bl_commit, -+ .read_pagelist = bl_read_pagelist, -+ .write_pagelist = bl_write_pagelist, -+ .write_begin = bl_write_begin, -+ .write_end = bl_write_end, -+ .write_end_cleanup = bl_write_end_cleanup, -+ .alloc_layout_hdr = bl_alloc_layout_hdr, -+ .free_layout_hdr = bl_free_layout_hdr, -+ .alloc_lseg = bl_alloc_lseg, -+ .free_lseg = bl_free_lseg, -+ .setup_layoutcommit = bl_setup_layoutcommit, -+ .encode_layoutcommit = bl_encode_layoutcommit, -+ .cleanup_layoutcommit = bl_cleanup_layoutcommit, -+ .initialize_mountpoint = bl_initialize_mountpoint, -+ .uninitialize_mountpoint = bl_uninitialize_mountpoint, -+ .pg_test = bl_pg_test, -+}; -+ -+static int __init nfs4blocklayout_init(void) -+{ 
-+ int ret; -+ -+ dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__); -+ -+ ret = pnfs_register_layoutdriver(&blocklayout_type); -+ if (!ret) -+ bl_pipe_init(); -+ return ret; -+} -+ -+static void __exit nfs4blocklayout_exit(void) -+{ -+ dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n", -+ __func__); -+ -+ pnfs_unregister_layoutdriver(&blocklayout_type); -+ bl_pipe_exit(); -+} -+ -+module_init(nfs4blocklayout_init); -+module_exit(nfs4blocklayout_exit); -diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h -new file mode 100644 -index 0000000..9e7bd62 ---- /dev/null -+++ b/fs/nfs/blocklayout/blocklayout.h -@@ -0,0 +1,302 @@ -+/* -+ * linux/fs/nfs/blocklayout/blocklayout.h -+ * -+ * Module for the NFSv4.1 pNFS block layout driver. -+ * -+ * Copyright (c) 2006 The Regents of the University of Michigan. -+ * All rights reserved. -+ * -+ * Andy Adamson -+ * Fred Isaman -+ * -+ * permission is granted to use, copy, create derivative works and -+ * redistribute this software and such derivative works for any purpose, -+ * so long as the name of the university of michigan is not used in -+ * any advertising or publicity pertaining to the use or distribution -+ * of this software without specific, written prior authorization. if -+ * the above copyright notice or any other identification of the -+ * university of michigan is included in any copy of any portion of -+ * this software, then the disclaimer below must also be included. -+ * -+ * this software is provided as is, without representation from the -+ * university of michigan as to its fitness for any purpose, and without -+ * warranty by the university of michigan of any kind, either express -+ * or implied, including without limitation the implied warranties of -+ * merchantability and fitness for a particular purpose. 
the regents -+ * of the university of michigan shall not be liable for any damages, -+ * including special, indirect, incidental, or consequential damages, -+ * with respect to any claim arising out or in connection with the use -+ * of the software, even if it has been or is hereafter advised of the -+ * possibility of such damages. -+ */ -+#ifndef FS_NFS_NFS4BLOCKLAYOUT_H -+#define FS_NFS_NFS4BLOCKLAYOUT_H -+ -+#include -+#include /* Needed for struct dm_ioctl*/ -+#include "../pnfs.h" -+ -+#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> 9) -+ -+#define PG_pnfserr PG_owner_priv_1 -+#define PagePnfsErr(page) test_bit(PG_pnfserr, &(page)->flags) -+#define SetPagePnfsErr(page) set_bit(PG_pnfserr, &(page)->flags) -+#define ClearPagePnfsErr(page) clear_bit(PG_pnfserr, &(page)->flags) -+ -+extern int dm_dev_create(struct dm_ioctl *param); /* from dm-ioctl.c */ -+extern int dm_dev_remove(struct dm_ioctl *param); /* from dm-ioctl.c */ -+extern int dm_do_resume(struct dm_ioctl *param); -+extern int dm_table_load(struct dm_ioctl *param, size_t param_size); -+ -+struct block_mount_id { -+ spinlock_t bm_lock; /* protects list */ -+ struct list_head bm_devlist; /* holds pnfs_block_dev */ -+}; -+ -+struct pnfs_block_dev { -+ struct list_head bm_node; -+ struct nfs4_deviceid bm_mdevid; /* associated devid */ -+ struct block_device *bm_mdev; /* meta device itself */ -+}; -+ -+/* holds visible disks that can be matched against VOLUME_SIMPLE signatures */ -+struct visible_block_device { -+ struct list_head vi_node; -+ struct block_device *vi_bdev; -+ int vi_mapped; -+ int vi_put_done; -+}; -+ -+enum blk_vol_type { -+ PNFS_BLOCK_VOLUME_SIMPLE = 0, /* maps to a single LU */ -+ PNFS_BLOCK_VOLUME_SLICE = 1, /* slice of another volume */ -+ PNFS_BLOCK_VOLUME_CONCAT = 2, /* concatenation of multiple volumes */ -+ PNFS_BLOCK_VOLUME_STRIPE = 3 /* striped across multiple volumes */ -+}; -+ -+/* All disk offset/lengths are stored in 512-byte sectors */ -+struct pnfs_blk_volume { -+ uint32_t 
bv_type; -+ sector_t bv_size; -+ struct pnfs_blk_volume **bv_vols; -+ int bv_vol_n; -+ union { -+ dev_t bv_dev; -+ sector_t bv_stripe_unit; -+ sector_t bv_offset; -+ }; -+}; -+ -+/* Since components need not be aligned, cannot use sector_t */ -+struct pnfs_blk_sig_comp { -+ int64_t bs_offset; /* In bytes */ -+ uint32_t bs_length; /* In bytes */ -+ char *bs_string; -+}; -+ -+/* Maximum number of signatures components in a simple volume */ -+# define PNFS_BLOCK_MAX_SIG_COMP 16 -+ -+struct pnfs_blk_sig { -+ int si_num_comps; -+ struct pnfs_blk_sig_comp si_comps[PNFS_BLOCK_MAX_SIG_COMP]; -+}; -+ -+enum exstate4 { -+ PNFS_BLOCK_READWRITE_DATA = 0, -+ PNFS_BLOCK_READ_DATA = 1, -+ PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */ -+ PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */ -+}; -+ -+#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */ -+ -+struct my_tree_t { -+ sector_t mtt_step_size; /* Internal sector alignment */ -+ struct list_head mtt_stub; /* Should be a radix tree */ -+}; -+ -+struct pnfs_inval_markings { -+ spinlock_t im_lock; -+ struct my_tree_t im_tree; /* Sectors that need LAYOUTCOMMIT */ -+ sector_t im_block_size; /* Server blocksize in sectors */ -+}; -+ -+struct pnfs_inval_tracking { -+ struct list_head it_link; -+ int it_sector; -+ int it_tags; -+}; -+ -+/* sector_t fields are all in 512-byte sectors */ -+struct pnfs_block_extent { -+ struct kref be_refcnt; -+ struct list_head be_node; /* link into lseg list */ -+ struct nfs4_deviceid be_devid; /* STUB - remevable??? 
*/ -+ struct block_device *be_mdev; -+ sector_t be_f_offset; /* the starting offset in the file */ -+ sector_t be_length; /* the size of the extent */ -+ sector_t be_v_offset; /* the starting offset in the volume */ -+ enum exstate4 be_state; /* the state of this extent */ -+ struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */ -+}; -+ -+/* Shortened extent used by LAYOUTCOMMIT */ -+struct pnfs_block_short_extent { -+ struct list_head bse_node; -+ struct nfs4_deviceid bse_devid; /* STUB - removable??? */ -+ struct block_device *bse_mdev; -+ sector_t bse_f_offset; /* the starting offset in the file */ -+ sector_t bse_length; /* the size of the extent */ -+}; -+ -+static inline void -+INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize) -+{ -+ spin_lock_init(&marks->im_lock); -+ INIT_LIST_HEAD(&marks->im_tree.mtt_stub); -+ marks->im_block_size = blocksize; -+ marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS, -+ blocksize); -+} -+ -+enum extentclass4 { -+ RW_EXTENT = 0, /* READWRTE and INVAL */ -+ RO_EXTENT = 1, /* READ and NONE */ -+ EXTENT_LISTS = 2, -+}; -+ -+static inline int choose_list(enum exstate4 state) -+{ -+ if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA) -+ return RO_EXTENT; -+ else -+ return RW_EXTENT; -+} -+ -+struct pnfs_block_layout { -+ struct pnfs_layout_hdr bl_layout; -+ struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */ -+ spinlock_t bl_ext_lock; /* Protects list manipulation */ -+ struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */ -+ struct list_head bl_commit; /* Needs layout commit */ -+ unsigned int bl_count; /* entries in bl_commit */ -+ sector_t bl_blocksize; /* Server blocksize in sectors */ -+}; -+ -+/* this struct is comunicated between: -+ * bl_setup_layoutcommit && bl_encode_layoutcommit && bl_cleanup_layoutcommit -+ */ -+struct bl_layoutupdate_data { -+ struct list_head ranges; -+}; -+ -+#define BLK_ID(lo) ((struct block_mount_id 
*)(NFS_SERVER(lo->inode)->pnfs_ld_data)) -+ -+static inline struct pnfs_block_layout * -+BLK_LO2EXT(struct pnfs_layout_hdr *lo) -+{ -+ return container_of(lo, struct pnfs_block_layout, bl_layout); -+} -+ -+static inline struct pnfs_block_layout * -+BLK_LSEG2EXT(struct pnfs_layout_segment *lseg) -+{ -+ return BLK_LO2EXT(lseg->layout); -+} -+ -+uint32_t *blk_overflow(uint32_t *p, uint32_t *end, size_t nbytes); -+ -+#define BLK_READBUF(p, e, nbytes) do { \ -+ p = blk_overflow(p, e, nbytes); \ -+ if (!p) { \ -+ printk(KERN_WARNING \ -+ "%s: reply buffer overflowed in line %d.\n", \ -+ __func__, __LINE__); \ -+ goto out_err; \ -+ } \ -+} while (0) -+ -+#define READ32(x) (x) = ntohl(*p++) -+#define READ64(x) do { \ -+ (x) = (uint64_t)ntohl(*p++) << 32; \ -+ (x) |= ntohl(*p++); \ -+} while (0) -+#define COPYMEM(x, nbytes) do { \ -+ memcpy((x), p, nbytes); \ -+ p += XDR_QUADLEN(nbytes); \ -+} while (0) -+#define READ_DEVID(x) COPYMEM((x)->data, NFS4_DEVICEID4_SIZE) -+#define READ_SECTOR(x) do { \ -+ READ64(tmp); \ -+ if (tmp & 0x1ff) { \ -+ printk(KERN_WARNING \ -+ "%s Value not 512-byte aligned at line %d\n", \ -+ __func__, __LINE__); \ -+ goto out_err; \ -+ } \ -+ (x) = tmp >> 9; \ -+} while (0) -+ -+#define WRITE32(n) do { \ -+ *p++ = htonl(n); \ -+ } while (0) -+#define WRITE64(n) do { \ -+ *p++ = htonl((uint32_t)((n) >> 32)); \ -+ *p++ = htonl((uint32_t)(n)); \ -+} while (0) -+#define WRITEMEM(ptr, nbytes) do { \ -+ p = xdr_encode_opaque_fixed(p, ptr, nbytes); \ -+} while (0) -+#define WRITE_DEVID(x) WRITEMEM((x)->data, NFS4_DEVICEID4_SIZE) -+ -+/* blocklayoutdev.c */ -+struct block_device *nfs4_blkdev_get(dev_t dev); -+int nfs4_blkdev_put(struct block_device *bdev); -+struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server, -+ struct pnfs_device *dev, -+ struct list_head *sdlist); -+int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, -+ struct nfs4_layoutget_res *lgr); -+int nfs4_blk_create_block_disk_list(struct list_head *); -+void 
nfs4_blk_destroy_disk_list(struct list_head *); -+/* blocklayoutdm.c */ -+int nfs4_blk_flatten(struct pnfs_blk_volume *, int, struct pnfs_block_dev *); -+void free_block_dev(struct pnfs_block_dev *bdev); -+/* extents.c */ -+struct pnfs_block_extent * -+find_get_extent(struct pnfs_block_layout *bl, sector_t isect, -+ struct pnfs_block_extent **cow_read); -+int mark_initialized_sectors(struct pnfs_inval_markings *marks, -+ sector_t offset, sector_t length, -+ sector_t **pages); -+void put_extent(struct pnfs_block_extent *be); -+struct pnfs_block_extent *alloc_extent(void); -+struct pnfs_block_extent *get_extent(struct pnfs_block_extent *be); -+int is_sector_initialized(struct pnfs_inval_markings *marks, sector_t isect); -+int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, -+ struct xdr_stream *xdr, -+ const struct nfs4_layoutcommit_args *arg); -+void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, -+ const struct nfs4_layoutcommit_args *arg, -+ int status); -+int add_and_merge_extent(struct pnfs_block_layout *bl, -+ struct pnfs_block_extent *new); -+int mark_for_commit(struct pnfs_block_extent *be, -+ sector_t offset, sector_t length); -+ -+#include -+ -+extern struct pipefs_list bl_device_list; -+extern struct dentry *bl_device_pipe; -+ -+int bl_pipe_init(void); -+void bl_pipe_exit(void); -+ -+#define BL_DEVICE_UMOUNT 0x0 /* Umount--delete devices */ -+#define BL_DEVICE_MOUNT 0x1 /* Mount--create devices*/ -+#define BL_DEVICE_REQUEST_INIT 0x0 /* Start request */ -+#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */ -+#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ -+ -+#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ -diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c -new file mode 100644 -index 0000000..17bd25a ---- /dev/null -+++ b/fs/nfs/blocklayout/blocklayoutdev.c -@@ -0,0 +1,335 @@ -+/* -+ * linux/fs/nfs/blocklayout/blocklayoutdev.c -+ * -+ * Device operations for the pnfs 
nfs4 file layout driver. -+ * -+ * Copyright (c) 2006 The Regents of the University of Michigan. -+ * All rights reserved. -+ * -+ * Andy Adamson -+ * Fred Isaman -+ * -+ * permission is granted to use, copy, create derivative works and -+ * redistribute this software and such derivative works for any purpose, -+ * so long as the name of the university of michigan is not used in -+ * any advertising or publicity pertaining to the use or distribution -+ * of this software without specific, written prior authorization. if -+ * the above copyright notice or any other identification of the -+ * university of michigan is included in any copy of any portion of -+ * this software, then the disclaimer below must also be included. -+ * -+ * this software is provided as is, without representation from the -+ * university of michigan as to its fitness for any purpose, and without -+ * warranty by the university of michigan of any kind, either express -+ * or implied, including without limitation the implied warranties of -+ * merchantability and fitness for a particular purpose. the regents -+ * of the university of michigan shall not be liable for any damages, -+ * including special, indirect, incidental, or consequential damages, -+ * with respect to any claim arising out or in connection with the use -+ * of the software, even if it has been or is hereafter advised of the -+ * possibility of such damages. -+ */ -+#include -+#include /* __bread */ -+ -+#include -+#include -+#include -+ -+#include "blocklayout.h" -+ -+#define NFSDBG_FACILITY NFSDBG_PNFS_LD -+ -+uint32_t *blk_overflow(uint32_t *p, uint32_t *end, size_t nbytes) -+{ -+ uint32_t *q = p + XDR_QUADLEN(nbytes); -+ if (unlikely(q > end || q < p)) -+ return NULL; -+ return p; -+} -+EXPORT_SYMBOL(blk_overflow); -+ -+/* Open a block_device by device number. 
*/ -+struct block_device *nfs4_blkdev_get(dev_t dev) -+{ -+ struct block_device *bd; -+ -+ dprintk("%s enter\n", __func__); -+ bd = open_by_devnum(dev, FMODE_READ); -+ if (IS_ERR(bd)) -+ goto fail; -+ return bd; -+fail: -+ dprintk("%s failed to open device : %ld\n", -+ __func__, PTR_ERR(bd)); -+ return NULL; -+} -+ -+/* -+ * Release the block device -+ */ -+int nfs4_blkdev_put(struct block_device *bdev) -+{ -+ dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev), -+ MINOR(bdev->bd_dev)); -+ bd_release(bdev); -+ return blkdev_put(bdev, FMODE_READ); -+} -+ -+/* Decodes pnfs_block_deviceaddr4 (draft-8) which is XDR encoded -+ * in dev->dev_addr_buf. -+ */ -+struct pnfs_block_dev * -+nfs4_blk_decode_device(struct nfs_server *server, -+ struct pnfs_device *dev, -+ struct list_head *sdlist) -+{ -+ struct pnfs_block_dev *rv = NULL; -+ struct block_device *bd = NULL; -+ struct pipefs_hdr *msg = NULL, *reply = NULL; -+ uint32_t major, minor; -+ -+ dprintk("%s enter\n", __func__); -+ -+ if (IS_ERR(bl_device_pipe)) -+ return NULL; -+ dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); -+ dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, -+ dev->mincount); -+ msg = pipefs_alloc_init_msg(0, BL_DEVICE_MOUNT, 0, dev->area, -+ dev->mincount); -+ if (IS_ERR(msg)) { -+ dprintk("ERROR: couldn't make pipefs message.\n"); -+ goto out_err; -+ } -+ msg->msgid = hash_ptr(&msg, sizeof(msg->msgid) * 8); -+ msg->status = BL_DEVICE_REQUEST_INIT; -+ -+ dprintk("%s CALLING USERSPACE DAEMON\n", __func__); -+ reply = pipefs_queue_upcall_waitreply(bl_device_pipe, msg, -+ &bl_device_list, 0, 0); -+ -+ if (IS_ERR(reply)) { -+ dprintk("ERROR: upcall_waitreply failed\n"); -+ goto out_err; -+ } -+ if (reply->status != BL_DEVICE_REQUEST_PROC) { -+ dprintk("%s failed to open device: %ld\n", -+ __func__, PTR_ERR(bd)); -+ goto out_err; -+ } -+ memcpy(&major, (uint32_t *)(payload_of(reply)), sizeof(uint32_t)); -+ memcpy(&minor, (uint32_t *)(payload_of(reply) + 
sizeof(uint32_t)), -+ sizeof(uint32_t)); -+ bd = nfs4_blkdev_get(MKDEV(major, minor)); -+ if (IS_ERR(bd)) { -+ dprintk("%s failed to open device : %ld\n", -+ __func__, PTR_ERR(bd)); -+ goto out_err; -+ } -+ -+ rv = kzalloc(sizeof(*rv), GFP_KERNEL); -+ if (!rv) -+ goto out_err; -+ -+ rv->bm_mdev = bd; -+ memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid)); -+ dprintk("%s Created device %s with bd_block_size %u\n", -+ __func__, -+ bd->bd_disk->disk_name, -+ bd->bd_block_size); -+ kfree(reply); -+ kfree(msg); -+ return rv; -+ -+out_err: -+ kfree(rv); -+ if (!IS_ERR(reply)) -+ kfree(reply); -+ if (!IS_ERR(msg)) -+ kfree(msg); -+ return NULL; -+} -+ -+/* Map deviceid returned by the server to constructed block_device */ -+static struct block_device *translate_devid(struct pnfs_layout_hdr *lo, -+ struct nfs4_deviceid *id) -+{ -+ struct block_device *rv = NULL; -+ struct block_mount_id *mid; -+ struct pnfs_block_dev *dev; -+ -+ dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id); -+ mid = BLK_ID(lo); -+ spin_lock(&mid->bm_lock); -+ list_for_each_entry(dev, &mid->bm_devlist, bm_node) { -+ if (memcmp(id->data, dev->bm_mdevid.data, -+ NFS4_DEVICEID4_SIZE) == 0) { -+ rv = dev->bm_mdev; -+ goto out; -+ } -+ } -+ out: -+ spin_unlock(&mid->bm_lock); -+ dprintk("%s returning %p\n", __func__, rv); -+ return rv; -+} -+ -+/* Tracks info needed to ensure extents in layout obey constraints of spec */ -+struct layout_verification { -+ u32 mode; /* R or RW */ -+ u64 start; /* Expected start of next non-COW extent */ -+ u64 inval; /* Start of INVAL coverage */ -+ u64 cowread; /* End of COW read coverage */ -+}; -+ -+/* Verify the extent meets the layout requirements of the pnfs-block draft, -+ * section 2.3.1. 
-+ */ -+static int verify_extent(struct pnfs_block_extent *be, -+ struct layout_verification *lv) -+{ -+ if (lv->mode == IOMODE_READ) { -+ if (be->be_state == PNFS_BLOCK_READWRITE_DATA || -+ be->be_state == PNFS_BLOCK_INVALID_DATA) -+ return -EIO; -+ if (be->be_f_offset != lv->start) -+ return -EIO; -+ lv->start += be->be_length; -+ return 0; -+ } -+ /* lv->mode == IOMODE_RW */ -+ if (be->be_state == PNFS_BLOCK_READWRITE_DATA) { -+ if (be->be_f_offset != lv->start) -+ return -EIO; -+ if (lv->cowread > lv->start) -+ return -EIO; -+ lv->start += be->be_length; -+ lv->inval = lv->start; -+ return 0; -+ } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) { -+ if (be->be_f_offset != lv->start) -+ return -EIO; -+ lv->start += be->be_length; -+ return 0; -+ } else if (be->be_state == PNFS_BLOCK_READ_DATA) { -+ if (be->be_f_offset > lv->start) -+ return -EIO; -+ if (be->be_f_offset < lv->inval) -+ return -EIO; -+ if (be->be_f_offset < lv->cowread) -+ return -EIO; -+ /* It looks like you might want to min this with lv->start, -+ * but you really don't. 
-+ */ -+ lv->inval = lv->inval + be->be_length; -+ lv->cowread = be->be_f_offset + be->be_length; -+ return 0; -+ } else -+ return -EIO; -+} -+ -+/* XDR decode pnfs_block_layout4 structure */ -+int -+nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, -+ struct nfs4_layoutget_res *lgr) -+{ -+ struct pnfs_block_layout *bl = BLK_LO2EXT(lo); -+ uint32_t *p = (uint32_t *)lgr->layout.buf; -+ uint32_t *end = (uint32_t *)((char *)lgr->layout.buf + lgr->layout.len); -+ int i, status = -EIO; -+ uint32_t count; -+ struct pnfs_block_extent *be = NULL, *save; -+ uint64_t tmp; /* Used by READSECTOR */ -+ struct layout_verification lv = { -+ .mode = lgr->range.iomode, -+ .start = lgr->range.offset >> 9, -+ .inval = lgr->range.offset >> 9, -+ .cowread = lgr->range.offset >> 9, -+ }; -+ -+ LIST_HEAD(extents); -+ -+ BLK_READBUF(p, end, 4); -+ READ32(count); -+ -+ dprintk("%s enter, number of extents %i\n", __func__, count); -+ BLK_READBUF(p, end, (28 + NFS4_DEVICEID4_SIZE) * count); -+ -+ /* Decode individual extents, putting them in temporary -+ * staging area until whole layout is decoded to make error -+ * recovery easier. 
-+ */ -+ for (i = 0; i < count; i++) { -+ be = alloc_extent(); -+ if (!be) { -+ status = -ENOMEM; -+ goto out_err; -+ } -+ READ_DEVID(&be->be_devid); -+ be->be_mdev = translate_devid(lo, &be->be_devid); -+ if (!be->be_mdev) -+ goto out_err; -+ /* The next three values are read in as bytes, -+ * but stored as 512-byte sector lengths -+ */ -+ READ_SECTOR(be->be_f_offset); -+ READ_SECTOR(be->be_length); -+ READ_SECTOR(be->be_v_offset); -+ READ32(be->be_state); -+ if (be->be_state == PNFS_BLOCK_INVALID_DATA) -+ be->be_inval = &bl->bl_inval; -+ if (verify_extent(be, &lv)) { -+ dprintk("%s verify failed\n", __func__); -+ goto out_err; -+ } -+ list_add_tail(&be->be_node, &extents); -+ } -+ if (p != end) { -+ dprintk("%s Undecoded cruft at end of opaque\n", __func__); -+ be = NULL; -+ goto out_err; -+ } -+ if (lgr->range.offset + lgr->range.length != lv.start << 9) { -+ dprintk("%s Final length mismatch\n", __func__); -+ be = NULL; -+ goto out_err; -+ } -+ if (lv.start < lv.cowread) { -+ dprintk("%s Final uncovered COW extent\n", __func__); -+ be = NULL; -+ goto out_err; -+ } -+ /* Extents decoded properly, now try to merge them in to -+ * existing layout extents. -+ */ -+ spin_lock(&bl->bl_ext_lock); -+ list_for_each_entry_safe(be, save, &extents, be_node) { -+ list_del(&be->be_node); -+ status = add_and_merge_extent(bl, be); -+ if (status) { -+ spin_unlock(&bl->bl_ext_lock); -+ /* This is a fairly catastrophic error, as the -+ * entire layout extent lists are now corrupted. -+ * We should have some way to distinguish this. 
-+ */ -+ be = NULL; -+ goto out_err; -+ } -+ } -+ spin_unlock(&bl->bl_ext_lock); -+ status = 0; -+ out: -+ dprintk("%s returns %i\n", __func__, status); -+ return status; -+ -+ out_err: -+ put_extent(be); -+ while (!list_empty(&extents)) { -+ be = list_first_entry(&extents, struct pnfs_block_extent, -+ be_node); -+ list_del(&be->be_node); -+ put_extent(be); -+ } -+ goto out; -+} -diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c -new file mode 100644 -index 0000000..097dd05 ---- /dev/null -+++ b/fs/nfs/blocklayout/blocklayoutdm.c -@@ -0,0 +1,120 @@ -+/* -+ * linux/fs/nfs/blocklayout/blocklayoutdm.c -+ * -+ * Module for the NFSv4.1 pNFS block layout driver. -+ * -+ * Copyright (c) 2007 The Regents of the University of Michigan. -+ * All rights reserved. -+ * -+ * Fred Isaman -+ * Andy Adamson -+ * -+ * permission is granted to use, copy, create derivative works and -+ * redistribute this software and such derivative works for any purpose, -+ * so long as the name of the university of michigan is not used in -+ * any advertising or publicity pertaining to the use or distribution -+ * of this software without specific, written prior authorization. if -+ * the above copyright notice or any other identification of the -+ * university of michigan is included in any copy of any portion of -+ * this software, then the disclaimer below must also be included. -+ * -+ * this software is provided as is, without representation from the -+ * university of michigan as to its fitness for any purpose, and without -+ * warranty by the university of michigan of any kind, either express -+ * or implied, including without limitation the implied warranties of -+ * merchantability and fitness for a particular purpose. 
the regents -+ * of the university of michigan shall not be liable for any damages, -+ * including special, indirect, incidental, or consequential damages, -+ * with respect to any claim arising out or in connection with the use -+ * of the software, even if it has been or is hereafter advised of the -+ * possibility of such damages. -+ */ -+ -+#include /* gendisk - used in a dprintk*/ -+#include -+#include -+ -+#include "blocklayout.h" -+ -+#define NFSDBG_FACILITY NFSDBG_PNFS_LD -+ -+/* Defines used for calculating memory usage in nfs4_blk_flatten() */ -+#define ARGSIZE 24 /* Max bytes needed for linear target arg string */ -+#define SPECSIZE (sizeof8(struct dm_target_spec) + ARGSIZE) -+#define SPECS_PER_PAGE (PAGE_SIZE / SPECSIZE) -+#define SPEC_HEADER_ADJUST (SPECS_PER_PAGE - \ -+ (PAGE_SIZE - sizeof8(struct dm_ioctl)) / SPECSIZE) -+#define roundup8(x) (((x)+7) & ~7) -+#define sizeof8(x) roundup8(sizeof(x)) -+ -+static int dev_remove(dev_t dev) -+{ -+ int ret = 1; -+ struct pipefs_hdr *msg = NULL, *reply = NULL; -+ uint64_t bl_dev; -+ uint32_t major = MAJOR(dev), minor = MINOR(dev); -+ -+ dprintk("Entering %s\n", __func__); -+ -+ if (IS_ERR(bl_device_pipe)) -+ return ret; -+ -+ memcpy((void *)&bl_dev, &major, sizeof(uint32_t)); -+ memcpy((void *)&bl_dev + sizeof(uint32_t), &minor, sizeof(uint32_t)); -+ msg = pipefs_alloc_init_msg(0, BL_DEVICE_UMOUNT, 0, (void *)&bl_dev, -+ sizeof(uint64_t)); -+ if (IS_ERR(msg)) { -+ dprintk("ERROR: couldn't make pipefs message.\n"); -+ goto out; -+ } -+ msg->msgid = hash_ptr(&msg, sizeof(msg->msgid) * 8); -+ msg->status = BL_DEVICE_REQUEST_INIT; -+ -+ reply = pipefs_queue_upcall_waitreply(bl_device_pipe, msg, -+ &bl_device_list, 0, 0); -+ if (IS_ERR(reply)) { -+ dprintk("ERROR: upcall_waitreply failed\n"); -+ goto out; -+ } -+ -+ if (reply->status == BL_DEVICE_REQUEST_PROC) -+ ret = 0; /*TODO: what to return*/ -+out: -+ if (!IS_ERR(reply)) -+ kfree(reply); -+ if (!IS_ERR(msg)) -+ kfree(msg); -+ return ret; -+} -+ -+/* -+ * 
Release meta device -+ */ -+static int nfs4_blk_metadev_release(struct pnfs_block_dev *bdev) -+{ -+ int rv; -+ -+ dprintk("%s Releasing\n", __func__); -+ /* XXX Check return? */ -+ rv = nfs4_blkdev_put(bdev->bm_mdev); -+ dprintk("%s nfs4_blkdev_put returns %d\n", __func__, rv); -+ -+ rv = dev_remove(bdev->bm_mdev->bd_dev); -+ dprintk("%s Returns %d\n", __func__, rv); -+ return rv; -+} -+ -+void free_block_dev(struct pnfs_block_dev *bdev) -+{ -+ if (bdev) { -+ if (bdev->bm_mdev) { -+ dprintk("%s Removing DM device: %d:%d\n", -+ __func__, -+ MAJOR(bdev->bm_mdev->bd_dev), -+ MINOR(bdev->bm_mdev->bd_dev)); -+ /* XXX Check status ?? */ -+ nfs4_blk_metadev_release(bdev); -+ } -+ kfree(bdev); -+ } -+} -diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c -new file mode 100644 -index 0000000..40dff82 ---- /dev/null -+++ b/fs/nfs/blocklayout/extents.c -@@ -0,0 +1,948 @@ -+/* -+ * linux/fs/nfs/blocklayout/blocklayout.h -+ * -+ * Module for the NFSv4.1 pNFS block layout driver. -+ * -+ * Copyright (c) 2006 The Regents of the University of Michigan. -+ * All rights reserved. -+ * -+ * Andy Adamson -+ * Fred Isaman -+ * -+ * permission is granted to use, copy, create derivative works and -+ * redistribute this software and such derivative works for any purpose, -+ * so long as the name of the university of michigan is not used in -+ * any advertising or publicity pertaining to the use or distribution -+ * of this software without specific, written prior authorization. if -+ * the above copyright notice or any other identification of the -+ * university of michigan is included in any copy of any portion of -+ * this software, then the disclaimer below must also be included. 
-+ * -+ * this software is provided as is, without representation from the -+ * university of michigan as to its fitness for any purpose, and without -+ * warranty by the university of michigan of any kind, either express -+ * or implied, including without limitation the implied warranties of -+ * merchantability and fitness for a particular purpose. the regents -+ * of the university of michigan shall not be liable for any damages, -+ * including special, indirect, incidental, or consequential damages, -+ * with respect to any claim arising out or in connection with the use -+ * of the software, even if it has been or is hereafter advised of the -+ * possibility of such damages. -+ */ -+ -+#include "blocklayout.h" -+#define NFSDBG_FACILITY NFSDBG_PNFS_LD -+ -+/* Bit numbers */ -+#define EXTENT_INITIALIZED 0 -+#define EXTENT_WRITTEN 1 -+#define EXTENT_IN_COMMIT 2 -+#define INTERNAL_EXISTS MY_MAX_TAGS -+#define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1) -+ -+/* Returns largest t<=s s.t. t%base==0 */ -+static inline sector_t normalize(sector_t s, int base) -+{ -+ sector_t tmp = s; /* Since do_div modifies its argument */ -+ return s - do_div(tmp, base); -+} -+ -+static inline sector_t normalize_up(sector_t s, int base) -+{ -+ return normalize(s + base - 1, base); -+} -+ -+/* Complete stub using list while determine API wanted */ -+ -+/* Returns tags, or negative */ -+static int32_t _find_entry(struct my_tree_t *tree, u64 s) -+{ -+ struct pnfs_inval_tracking *pos; -+ -+ dprintk("%s(%llu) enter\n", __func__, s); -+ list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { -+ if (pos->it_sector > s) -+ continue; -+ else if (pos->it_sector == s) -+ return pos->it_tags & INTERNAL_MASK; -+ else -+ break; -+ } -+ return -ENOENT; -+} -+ -+static inline -+int _has_tag(struct my_tree_t *tree, u64 s, int32_t tag) -+{ -+ int32_t tags; -+ -+ dprintk("%s(%llu, %i) enter\n", __func__, s, tag); -+ s = normalize(s, tree->mtt_step_size); -+ tags = _find_entry(tree, s); -+ if ((tags 
< 0) || !(tags & (1 << tag))) -+ return 0; -+ else -+ return 1; -+} -+ -+/* Creates entry with tag, or if entry already exists, unions tag to it. -+ * If storage is not NULL, newly created entry will use it. -+ * Returns number of entries added, or negative on error. -+ */ -+static int _add_entry(struct my_tree_t *tree, u64 s, int32_t tag, -+ struct pnfs_inval_tracking *storage) -+{ -+ int found = 0; -+ struct pnfs_inval_tracking *pos; -+ -+ dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage); -+ list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { -+ if (pos->it_sector > s) -+ continue; -+ else if (pos->it_sector == s) { -+ found = 1; -+ break; -+ } else -+ break; -+ } -+ if (found) { -+ pos->it_tags |= (1 << tag); -+ return 0; -+ } else { -+ struct pnfs_inval_tracking *new; -+ if (storage) -+ new = storage; -+ else { -+ new = kmalloc(sizeof(*new), GFP_KERNEL); -+ if (!new) -+ return -ENOMEM; -+ } -+ new->it_sector = s; -+ new->it_tags = (1 << tag); -+ list_add(&new->it_link, &pos->it_link); -+ return 1; -+ } -+} -+ -+/* XXXX Really want option to not create */ -+/* Over range, unions tag with existing entries, else creates entry with tag */ -+static int _set_range(struct my_tree_t *tree, int32_t tag, u64 s, u64 length) -+{ -+ u64 i; -+ -+ dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length); -+ for (i = normalize(s, tree->mtt_step_size); i < s + length; -+ i += tree->mtt_step_size) -+ if (_add_entry(tree, i, tag, NULL)) -+ return -ENOMEM; -+ return 0; -+} -+ -+/* Ensure that future operations on given range of tree will not malloc */ -+static int _preload_range(struct my_tree_t *tree, u64 offset, u64 length) -+{ -+ u64 start, end, s; -+ int count, i, used = 0, status = -ENOMEM; -+ struct pnfs_inval_tracking **storage; -+ -+ dprintk("%s(%llu, %llu) enter\n", __func__, offset, length); -+ start = normalize(offset, tree->mtt_step_size); -+ end = normalize_up(offset + length, tree->mtt_step_size); -+ count = (int)(end - start) / 
(int)tree->mtt_step_size; -+ -+ /* Pre-malloc what memory we might need */ -+ storage = kmalloc(sizeof(*storage) * count, GFP_KERNEL); -+ if (!storage) -+ return -ENOMEM; -+ for (i = 0; i < count; i++) { -+ storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking), -+ GFP_KERNEL); -+ if (!storage[i]) -+ goto out_cleanup; -+ } -+ -+ /* Now need lock - HOW??? */ -+ -+ for (s = start; s < end; s += tree->mtt_step_size) -+ used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]); -+ -+ /* Unlock - HOW??? */ -+ status = 0; -+ -+ out_cleanup: -+ for (i = used; i < count; i++) { -+ if (!storage[i]) -+ break; -+ kfree(storage[i]); -+ } -+ kfree(storage); -+ return status; -+} -+ -+static void set_needs_init(sector_t *array, sector_t offset) -+{ -+ sector_t *p = array; -+ -+ dprintk("%s enter\n", __func__); -+ if (!p) -+ return; -+ while (*p < offset) -+ p++; -+ if (*p == offset) -+ return; -+ else if (*p == ~0) { -+ *p++ = offset; -+ *p = ~0; -+ return; -+ } else { -+ sector_t *save = p; -+ dprintk("%s Adding %llu\n", __func__, (u64)offset); -+ while (*p != ~0) -+ p++; -+ p++; -+ memmove(save + 1, save, (char *)p - (char *)save); -+ *save = offset; -+ return; -+ } -+} -+ -+/* We are relying on page lock to serialize this */ -+int is_sector_initialized(struct pnfs_inval_markings *marks, sector_t isect) -+{ -+ int rv; -+ -+ spin_lock(&marks->im_lock); -+ rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED); -+ spin_unlock(&marks->im_lock); -+ return rv; -+} -+ -+/* Assume start, end already sector aligned */ -+static int -+_range_has_tag(struct my_tree_t *tree, u64 start, u64 end, int32_t tag) -+{ -+ struct pnfs_inval_tracking *pos; -+ u64 expect = 0; -+ -+ dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag); -+ list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { -+ if (pos->it_sector >= end) -+ continue; -+ if (!expect) { -+ if ((pos->it_sector == end - tree->mtt_step_size) && -+ (pos->it_tags & (1 << tag))) { -+ expect = pos->it_sector - 
tree->mtt_step_size; -+ if (expect < start) -+ return 1; -+ continue; -+ } else { -+ return 0; -+ } -+ } -+ if (pos->it_sector != expect || !(pos->it_tags & (1 << tag))) -+ return 0; -+ expect -= tree->mtt_step_size; -+ if (expect < start) -+ return 1; -+ } -+ return 0; -+} -+ -+static int is_range_written(struct pnfs_inval_markings *marks, -+ sector_t start, sector_t end) -+{ -+ int rv; -+ -+ spin_lock(&marks->im_lock); -+ rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN); -+ spin_unlock(&marks->im_lock); -+ return rv; -+} -+ -+/* Marks sectors in [offest, offset_length) as having been initialized. -+ * All lengths are step-aligned, where step is min(pagesize, blocksize). -+ * Notes where partial block is initialized, and helps prepare it for -+ * complete initialization later. -+ */ -+/* Currently assumes offset is page-aligned */ -+int mark_initialized_sectors(struct pnfs_inval_markings *marks, -+ sector_t offset, sector_t length, -+ sector_t **pages) -+{ -+ sector_t s, start, end; -+ sector_t *array = NULL; /* Pages to mark */ -+ -+ dprintk("%s(offset=%llu,len=%llu) enter\n", -+ __func__, (u64)offset, (u64)length); -+ s = max((sector_t) 3, -+ 2 * (marks->im_block_size / (PAGE_CACHE_SECTORS))); -+ dprintk("%s set max=%llu\n", __func__, (u64)s); -+ if (pages) { -+ array = kmalloc(s * sizeof(sector_t), GFP_KERNEL); -+ if (!array) -+ goto outerr; -+ array[0] = ~0; -+ } -+ -+ start = normalize(offset, marks->im_block_size); -+ end = normalize_up(offset + length, marks->im_block_size); -+ if (_preload_range(&marks->im_tree, start, end - start)) -+ goto outerr; -+ -+ spin_lock(&marks->im_lock); -+ -+ for (s = normalize_up(start, PAGE_CACHE_SECTORS); -+ s < offset; s += PAGE_CACHE_SECTORS) { -+ dprintk("%s pre-area pages\n", __func__); -+ /* Portion of used block is not initialized */ -+ if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED)) -+ set_needs_init(array, s); -+ } -+ if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length)) -+ goto 
out_unlock; -+ for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS); -+ s < end; s += PAGE_CACHE_SECTORS) { -+ dprintk("%s post-area pages\n", __func__); -+ if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED)) -+ set_needs_init(array, s); -+ } -+ -+ spin_unlock(&marks->im_lock); -+ -+ if (pages) { -+ if (array[0] == ~0) { -+ kfree(array); -+ *pages = NULL; -+ } else -+ *pages = array; -+ } -+ return 0; -+ -+ out_unlock: -+ spin_unlock(&marks->im_lock); -+ outerr: -+ if (pages) { -+ kfree(array); -+ *pages = NULL; -+ } -+ return -ENOMEM; -+} -+ -+/* Marks sectors in [offest, offset+length) as having been written to disk. -+ * All lengths should be block aligned. -+ */ -+int mark_written_sectors(struct pnfs_inval_markings *marks, -+ sector_t offset, sector_t length) -+{ -+ int status; -+ -+ dprintk("%s(offset=%llu,len=%llu) enter\n", __func__, -+ (u64)offset, (u64)length); -+ spin_lock(&marks->im_lock); -+ status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length); -+ spin_unlock(&marks->im_lock); -+ return status; -+} -+ -+static void print_short_extent(struct pnfs_block_short_extent *be) -+{ -+ dprintk("PRINT SHORT EXTENT extent %p\n", be); -+ if (be) { -+ dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset); -+ dprintk(" be_length %llu\n", (u64)be->bse_length); -+ } -+} -+ -+void print_clist(struct list_head *list, unsigned int count) -+{ -+ struct pnfs_block_short_extent *be; -+ unsigned int i = 0; -+ -+ dprintk("****************\n"); -+ dprintk("Extent list looks like:\n"); -+ list_for_each_entry(be, list, bse_node) { -+ i++; -+ print_short_extent(be); -+ } -+ if (i != count) -+ dprintk("\n\nExpected %u entries\n\n\n", count); -+ dprintk("****************\n"); -+} -+ -+/* Note: In theory, we should do more checking that devid's match between -+ * old and new, but if they don't, the lists are too corrupt to salvage anyway. 
-+ */ -+/* Note this is very similar to add_and_merge_extent */ -+static void add_to_commitlist(struct pnfs_block_layout *bl, -+ struct pnfs_block_short_extent *new) -+{ -+ struct list_head *clist = &bl->bl_commit; -+ struct pnfs_block_short_extent *old, *save; -+ sector_t end = new->bse_f_offset + new->bse_length; -+ -+ dprintk("%s enter\n", __func__); -+ print_short_extent(new); -+ print_clist(clist, bl->bl_count); -+ bl->bl_count++; -+ /* Scan for proper place to insert, extending new to the left -+ * as much as possible. -+ */ -+ list_for_each_entry_safe(old, save, clist, bse_node) { -+ if (new->bse_f_offset < old->bse_f_offset) -+ break; -+ if (end <= old->bse_f_offset + old->bse_length) { -+ /* Range is already in list */ -+ bl->bl_count--; -+ kfree(new); -+ return; -+ } else if (new->bse_f_offset <= -+ old->bse_f_offset + old->bse_length) { -+ /* new overlaps or abuts existing be */ -+ if (new->bse_mdev == old->bse_mdev) { -+ /* extend new to fully replace old */ -+ new->bse_length += new->bse_f_offset - -+ old->bse_f_offset; -+ new->bse_f_offset = old->bse_f_offset; -+ list_del(&old->bse_node); -+ bl->bl_count--; -+ kfree(old); -+ } -+ } -+ } -+ /* Note that if we never hit the above break, old will not point to a -+ * valid extent. However, in that case &old->bse_node==list. -+ */ -+ list_add_tail(&new->bse_node, &old->bse_node); -+ /* Scan forward for overlaps. If we find any, extend new and -+ * remove the overlapped extent. 
-+ */ -+ old = list_prepare_entry(new, clist, bse_node); -+ list_for_each_entry_safe_continue(old, save, clist, bse_node) { -+ if (end < old->bse_f_offset) -+ break; -+ /* new overlaps or abuts old */ -+ if (new->bse_mdev == old->bse_mdev) { -+ if (end < old->bse_f_offset + old->bse_length) { -+ /* extend new to fully cover old */ -+ end = old->bse_f_offset + old->bse_length; -+ new->bse_length = end - new->bse_f_offset; -+ } -+ list_del(&old->bse_node); -+ bl->bl_count--; -+ kfree(old); -+ } -+ } -+ dprintk("%s: after merging\n", __func__); -+ print_clist(clist, bl->bl_count); -+} -+ -+/* Note the range described by offset, length is guaranteed to be contained -+ * within be. -+ */ -+int mark_for_commit(struct pnfs_block_extent *be, -+ sector_t offset, sector_t length) -+{ -+ sector_t new_end, end = offset + length; -+ struct pnfs_block_short_extent *new; -+ struct pnfs_block_layout *bl = container_of(be->be_inval, -+ struct pnfs_block_layout, -+ bl_inval); -+ -+ new = kmalloc(sizeof(*new), GFP_KERNEL); -+ if (!new) -+ return -ENOMEM; -+ -+ mark_written_sectors(be->be_inval, offset, length); -+ /* We want to add the range to commit list, but it must be -+ * block-normalized, and verified that the normalized range has -+ * been entirely written to disk. 
-+ */ -+ new->bse_f_offset = offset; -+ offset = normalize(offset, bl->bl_blocksize); -+ if (offset < new->bse_f_offset) { -+ if (is_range_written(be->be_inval, offset, new->bse_f_offset)) -+ new->bse_f_offset = offset; -+ else -+ new->bse_f_offset = offset + bl->bl_blocksize; -+ } -+ new_end = normalize_up(end, bl->bl_blocksize); -+ if (end < new_end) { -+ if (is_range_written(be->be_inval, end, new_end)) -+ end = new_end; -+ else -+ end = new_end - bl->bl_blocksize; -+ } -+ if (end <= new->bse_f_offset) { -+ kfree(new); -+ return 0; -+ } -+ new->bse_length = end - new->bse_f_offset; -+ new->bse_devid = be->be_devid; -+ new->bse_mdev = be->be_mdev; -+ -+ spin_lock(&bl->bl_ext_lock); -+ /* new will be freed, either by add_to_commitlist if it decides not -+ * to use it, or after LAYOUTCOMMIT uses it in the commitlist. -+ */ -+ add_to_commitlist(bl, new); -+ spin_unlock(&bl->bl_ext_lock); -+ return 0; -+} -+ -+static void print_bl_extent(struct pnfs_block_extent *be) -+{ -+ dprintk("PRINT EXTENT extent %p\n", be); -+ if (be) { -+ dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset); -+ dprintk(" be_length %llu\n", (u64)be->be_length); -+ dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset); -+ dprintk(" be_state %d\n", be->be_state); -+ } -+} -+ -+static void -+destroy_extent(struct kref *kref) -+{ -+ struct pnfs_block_extent *be; -+ -+ be = container_of(kref, struct pnfs_block_extent, be_refcnt); -+ dprintk("%s be=%p\n", __func__, be); -+ kfree(be); -+} -+ -+void -+put_extent(struct pnfs_block_extent *be) -+{ -+ if (be) { -+ dprintk("%s enter %p (%i)\n", __func__, be, -+ atomic_read(&be->be_refcnt.refcount)); -+ kref_put(&be->be_refcnt, destroy_extent); -+ } -+} -+ -+struct pnfs_block_extent *alloc_extent(void) -+{ -+ struct pnfs_block_extent *be; -+ -+ be = kmalloc(sizeof(struct pnfs_block_extent), GFP_KERNEL); -+ if (!be) -+ return NULL; -+ INIT_LIST_HEAD(&be->be_node); -+ kref_init(&be->be_refcnt); -+ be->be_inval = NULL; -+ return be; -+} -+ -+struct 
pnfs_block_extent * -+get_extent(struct pnfs_block_extent *be) -+{ -+ if (be) -+ kref_get(&be->be_refcnt); -+ return be; -+} -+ -+void print_elist(struct list_head *list) -+{ -+ struct pnfs_block_extent *be; -+ dprintk("****************\n"); -+ dprintk("Extent list looks like:\n"); -+ list_for_each_entry(be, list, be_node) { -+ print_bl_extent(be); -+ } -+ dprintk("****************\n"); -+} -+ -+static inline int -+extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new) -+{ -+ /* Note this assumes new->be_f_offset >= old->be_f_offset */ -+ return (new->be_state == old->be_state) && -+ ((new->be_state == PNFS_BLOCK_NONE_DATA) || -+ ((new->be_v_offset - old->be_v_offset == -+ new->be_f_offset - old->be_f_offset) && -+ new->be_mdev == old->be_mdev)); -+} -+ -+/* Adds new to appropriate list in bl, modifying new and removing existing -+ * extents as appropriate to deal with overlaps. -+ * -+ * See find_get_extent for list constraints. -+ * -+ * Refcount on new is already set. If end up not using it, or error out, -+ * need to put the reference. -+ * -+ * Lock is held by caller. -+ */ -+int -+add_and_merge_extent(struct pnfs_block_layout *bl, -+ struct pnfs_block_extent *new) -+{ -+ struct pnfs_block_extent *be, *tmp; -+ sector_t end = new->be_f_offset + new->be_length; -+ struct list_head *list; -+ -+ dprintk("%s enter with be=%p\n", __func__, new); -+ print_bl_extent(new); -+ list = &bl->bl_extents[choose_list(new->be_state)]; -+ print_elist(list); -+ -+ /* Scan for proper place to insert, extending new to the left -+ * as much as possible. 
-+ */ -+ list_for_each_entry_safe_reverse(be, tmp, list, be_node) { -+ if (new->be_f_offset >= be->be_f_offset + be->be_length) -+ break; -+ if (new->be_f_offset >= be->be_f_offset) { -+ if (end <= be->be_f_offset + be->be_length) { -+ /* new is a subset of existing be*/ -+ if (extents_consistent(be, new)) { -+ dprintk("%s: new is subset, ignoring\n", -+ __func__); -+ put_extent(new); -+ return 0; -+ } else { -+ goto out_err; -+ } -+ } else { -+ /* |<-- be -->| -+ * |<-- new -->| */ -+ if (extents_consistent(be, new)) { -+ /* extend new to fully replace be */ -+ new->be_length += new->be_f_offset - -+ be->be_f_offset; -+ new->be_f_offset = be->be_f_offset; -+ new->be_v_offset = be->be_v_offset; -+ dprintk("%s: removing %p\n", __func__, be); -+ list_del(&be->be_node); -+ put_extent(be); -+ } else { -+ goto out_err; -+ } -+ } -+ } else if (end >= be->be_f_offset + be->be_length) { -+ /* new extent overlap existing be */ -+ if (extents_consistent(be, new)) { -+ /* extend new to fully replace be */ -+ dprintk("%s: removing %p\n", __func__, be); -+ list_del(&be->be_node); -+ put_extent(be); -+ } else { -+ goto out_err; -+ } -+ } else if (end > be->be_f_offset) { -+ /* |<-- be -->| -+ *|<-- new -->| */ -+ if (extents_consistent(new, be)) { -+ /* extend new to fully replace be */ -+ new->be_length += be->be_f_offset + be->be_length - -+ new->be_f_offset - new->be_length; -+ dprintk("%s: removing %p\n", __func__, be); -+ list_del(&be->be_node); -+ put_extent(be); -+ } else { -+ goto out_err; -+ } -+ } -+ } -+ /* Note that if we never hit the above break, be will not point to a -+ * valid extent. However, in that case &be->be_node==list. -+ */ -+ list_add(&new->be_node, &be->be_node); -+ dprintk("%s: inserting new\n", __func__); -+ print_elist(list); -+ /* STUB - The per-list consistency checks have all been done, -+ * should now check cross-list consistency. -+ */ -+ return 0; -+ -+ out_err: -+ put_extent(new); -+ return -EIO; -+} -+ -+/* Returns extent, or NULL. 
If a second READ extent exists, it is returned -+ * in cow_read, if given. -+ * -+ * The extents are kept in two seperate ordered lists, one for READ and NONE, -+ * one for READWRITE and INVALID. Within each list, we assume: -+ * 1. Extents are ordered by file offset. -+ * 2. For any given isect, there is at most one extents that matches. -+ */ -+struct pnfs_block_extent * -+find_get_extent(struct pnfs_block_layout *bl, sector_t isect, -+ struct pnfs_block_extent **cow_read) -+{ -+ struct pnfs_block_extent *be, *cow, *ret; -+ int i; -+ -+ dprintk("%s enter with isect %llu\n", __func__, (u64)isect); -+ cow = ret = NULL; -+ spin_lock(&bl->bl_ext_lock); -+ for (i = 0; i < EXTENT_LISTS; i++) { -+ if (ret && -+ (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA)) -+ break; -+ list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { -+ if (isect >= be->be_f_offset + be->be_length) -+ break; -+ if (isect >= be->be_f_offset) { -+ /* We have found an extent */ -+ dprintk("%s Get %p (%i)\n", __func__, be, -+ atomic_read(&be->be_refcnt.refcount)); -+ kref_get(&be->be_refcnt); -+ if (!ret) -+ ret = be; -+ else if (be->be_state != PNFS_BLOCK_READ_DATA) -+ put_extent(be); -+ else -+ cow = be; -+ break; -+ } -+ } -+ } -+ spin_unlock(&bl->bl_ext_lock); -+ if (cow_read) -+ *cow_read = cow; -+ print_bl_extent(ret); -+ return ret; -+} -+ -+/* Similar to find_get_extent, but called with lock held, and ignores cow */ -+static struct pnfs_block_extent * -+find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect) -+{ -+ struct pnfs_block_extent *be, *ret = NULL; -+ int i; -+ -+ dprintk("%s enter with isect %llu\n", __func__, (u64)isect); -+ for (i = 0; i < EXTENT_LISTS; i++) { -+ if (ret) -+ break; -+ list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { -+ if (isect >= be->be_f_offset + be->be_length) -+ break; -+ if (isect >= be->be_f_offset) { -+ /* We have found an extent */ -+ dprintk("%s Get %p (%i)\n", __func__, be, -+ 
atomic_read(&be->be_refcnt.refcount)); -+ kref_get(&be->be_refcnt); -+ ret = be; -+ break; -+ } -+ } -+ } -+ print_bl_extent(ret); -+ return ret; -+} -+ -+int -+encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, -+ struct xdr_stream *xdr, -+ const struct nfs4_layoutcommit_args *arg) -+{ -+ sector_t start, end; -+ struct pnfs_block_short_extent *lce, *save; -+ unsigned int count = 0; -+ struct bl_layoutupdate_data *bld = arg->layoutdriver_data; -+ struct list_head *ranges = &bld->ranges; -+ __be32 *p, *xdr_start; -+ -+ dprintk("%s enter\n", __func__); -+ start = arg->range.offset >> 9; -+ end = start + (arg->range.length >> 9); -+ dprintk("%s set start=%llu, end=%llu\n", -+ __func__, (u64)start, (u64)end); -+ -+ /* BUG - creation of bl_commit is buggy - need to wait for -+ * entire block to be marked WRITTEN before it can be added. -+ */ -+ spin_lock(&bl->bl_ext_lock); -+ /* Want to adjust for possible truncate */ -+ /* We now want to adjust argument range */ -+ -+ /* XDR encode the ranges found */ -+ xdr_start = xdr_reserve_space(xdr, 8); -+ if (!xdr_start) -+ goto out; -+ list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) { -+ p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data)); -+ if (!p) -+ break; -+ WRITE_DEVID(&lce->bse_devid); -+ WRITE64(lce->bse_f_offset << 9); -+ WRITE64(lce->bse_length << 9); -+ WRITE64(0LL); -+ WRITE32(PNFS_BLOCK_READWRITE_DATA); -+ list_del(&lce->bse_node); -+ list_add_tail(&lce->bse_node, ranges); -+ bl->bl_count--; -+ count++; -+ } -+ xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4); -+ xdr_start[1] = cpu_to_be32(count); -+out: -+ spin_unlock(&bl->bl_ext_lock); -+ dprintk("%s found %i ranges\n", __func__, count); -+ return 0; -+} -+ -+/* Helper function to set_to_rw that initialize a new extent */ -+static void -+_prep_new_extent(struct pnfs_block_extent *new, -+ struct pnfs_block_extent *orig, -+ sector_t offset, sector_t length, int state) -+{ -+ kref_init(&new->be_refcnt); -+ /* don't need to 
INIT_LIST_HEAD(&new->be_node) */ -+ memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid)); -+ new->be_mdev = orig->be_mdev; -+ new->be_f_offset = offset; -+ new->be_length = length; -+ new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset; -+ new->be_state = state; -+ new->be_inval = orig->be_inval; -+} -+ -+/* Tries to merge be with extent in front of it in list. -+ * Frees storage if not used. -+ */ -+static struct pnfs_block_extent * -+_front_merge(struct pnfs_block_extent *be, struct list_head *head, -+ struct pnfs_block_extent *storage) -+{ -+ struct pnfs_block_extent *prev; -+ -+ if (!storage) -+ goto no_merge; -+ if (&be->be_node == head || be->be_node.prev == head) -+ goto no_merge; -+ prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node); -+ if ((prev->be_f_offset + prev->be_length != be->be_f_offset) || -+ !extents_consistent(prev, be)) -+ goto no_merge; -+ _prep_new_extent(storage, prev, prev->be_f_offset, -+ prev->be_length + be->be_length, prev->be_state); -+ list_replace(&prev->be_node, &storage->be_node); -+ put_extent(prev); -+ list_del(&be->be_node); -+ put_extent(be); -+ return storage; -+ -+ no_merge: -+ kfree(storage); -+ return be; -+} -+ -+static u64 -+set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length) -+{ -+ u64 rv = offset + length; -+ struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old; -+ struct pnfs_block_extent *children[3]; -+ struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL; -+ int i = 0, j; -+ -+ dprintk("%s(%llu, %llu)\n", __func__, offset, length); -+ /* Create storage for up to three new extents e1, e2, e3 */ -+ e1 = kmalloc(sizeof(*e1), GFP_KERNEL); -+ e2 = kmalloc(sizeof(*e2), GFP_KERNEL); -+ e3 = kmalloc(sizeof(*e3), GFP_KERNEL); -+ /* BUG - we are ignoring any failure */ -+ if (!e1 || !e2 || !e3) -+ goto out_nosplit; -+ -+ spin_lock(&bl->bl_ext_lock); -+ be = find_get_extent_locked(bl, offset); -+ rv = be->be_f_offset + be->be_length; -+ if (be->be_state != 
PNFS_BLOCK_INVALID_DATA) { -+ spin_unlock(&bl->bl_ext_lock); -+ goto out_nosplit; -+ } -+ /* Add e* to children, bumping e*'s krefs */ -+ if (be->be_f_offset != offset) { -+ _prep_new_extent(e1, be, be->be_f_offset, -+ offset - be->be_f_offset, -+ PNFS_BLOCK_INVALID_DATA); -+ children[i++] = e1; -+ print_bl_extent(e1); -+ } else -+ merge1 = e1; -+ _prep_new_extent(e2, be, offset, -+ min(length, be->be_f_offset + be->be_length - offset), -+ PNFS_BLOCK_READWRITE_DATA); -+ children[i++] = e2; -+ print_bl_extent(e2); -+ if (offset + length < be->be_f_offset + be->be_length) { -+ _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length, -+ be->be_f_offset + be->be_length - -+ offset - length, -+ PNFS_BLOCK_INVALID_DATA); -+ children[i++] = e3; -+ print_bl_extent(e3); -+ } else -+ merge2 = e3; -+ -+ /* Remove be from list, and insert the e* */ -+ /* We don't get refs on e*, since this list is the base reference -+ * set when init'ed. -+ */ -+ if (i < 3) -+ children[i] = NULL; -+ new = children[0]; -+ list_replace(&be->be_node, &new->be_node); -+ put_extent(be); -+ new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1); -+ for (j = 1; j < i; j++) { -+ old = new; -+ new = children[j]; -+ list_add(&new->be_node, &old->be_node); -+ } -+ if (merge2) { -+ /* This is a HACK, should just create a _back_merge function */ -+ new = list_entry(new->be_node.next, -+ struct pnfs_block_extent, be_node); -+ new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2); -+ } -+ spin_unlock(&bl->bl_ext_lock); -+ -+ /* Since we removed the base reference above, be is now scheduled for -+ * destruction. 
-+ */ -+ put_extent(be); -+ dprintk("%s returns %llu after split\n", __func__, rv); -+ return rv; -+ -+ out_nosplit: -+ kfree(e1); -+ kfree(e2); -+ kfree(e3); -+ dprintk("%s returns %llu without splitting\n", __func__, rv); -+ return rv; -+} -+ -+void -+clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, -+ const struct nfs4_layoutcommit_args *arg, -+ int status) -+{ -+ struct bl_layoutupdate_data *bld = arg->layoutdriver_data; -+ struct pnfs_block_short_extent *lce, *save; -+ -+ dprintk("%s status %d\n", __func__, status); -+ list_for_each_entry_safe_reverse(lce, save, &bld->ranges, bse_node) { -+ if (likely(!status)) { -+ u64 offset = lce->bse_f_offset; -+ u64 end = offset + lce->bse_length; -+ -+ do { -+ offset = set_to_rw(bl, offset, end - offset); -+ } while (offset < end); -+ -+ kfree(lce); -+ } else { -+ spin_lock(&bl->bl_ext_lock); -+ add_to_commitlist(bl, lce); -+ spin_unlock(&bl->bl_ext_lock); -+ } -+ } -+} -diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h -index 85a7cfd..1f92ceb 100644 ---- a/fs/nfs/callback.h -+++ b/fs/nfs/callback.h -@@ -8,6 +8,8 @@ - #ifndef __LINUX_FS_NFS_CALLBACK_H - #define __LINUX_FS_NFS_CALLBACK_H - -+#include "pnfs.h" -+ - #define NFS4_CALLBACK 0x40000000 - #define NFS4_CALLBACK_XDRSIZE 2048 - #define NFS4_CALLBACK_BUFSIZE (1024 + NFS4_CALLBACK_XDRSIZE) -@@ -111,6 +113,13 @@ extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, - - #define RCA4_TYPE_MASK_RDATA_DLG 0 - #define RCA4_TYPE_MASK_WDATA_DLG 1 -+#define RCA4_TYPE_MASK_DIR_DLG 2 -+#define RCA4_TYPE_MASK_FILE_LAYOUT 3 -+#define RCA4_TYPE_MASK_BLK_LAYOUT 4 -+#define RCA4_TYPE_MASK_OBJ_LAYOUT_MIN 8 -+#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX 9 -+#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12 -+#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15 - - struct cb_recallanyargs { - struct sockaddr *craa_addr; -@@ -127,6 +136,39 @@ struct cb_recallslotargs { - extern unsigned nfs4_callback_recallslot(struct cb_recallslotargs *args, - void *dummy); - -+struct 
cb_layoutrecallargs { -+ struct sockaddr *cbl_addr; -+ struct nfs_fh cbl_fh; -+ struct pnfs_layout_range cbl_seg; -+ struct nfs_fsid cbl_fsid; -+ uint32_t cbl_recall_type; -+ uint32_t cbl_layout_type; -+ uint32_t cbl_layoutchanged; -+ nfs4_stateid cbl_stateid; -+}; -+ -+extern unsigned nfs4_callback_layoutrecall( -+ struct cb_layoutrecallargs *args, -+ void *dummy); -+ -+struct cb_devicenotifyitem { -+ uint32_t cbd_notify_type; -+ uint32_t cbd_layout_type; -+ struct nfs4_deviceid cbd_dev_id; -+ uint32_t cbd_immediate; -+}; -+ -+/* XXX: Should be dynamic up to max compound size */ -+#define NFS4_DEV_NOTIFY_MAXENTRIES 10 -+struct cb_devicenotifyargs { -+ struct sockaddr *addr; -+ int ndevs; -+ struct cb_devicenotifyitem devs[NFS4_DEV_NOTIFY_MAXENTRIES]; -+}; -+ -+extern unsigned nfs4_callback_devicenotify( -+ struct cb_devicenotifyargs *args, -+ void *dummy); - #endif /* CONFIG_NFS_V4_1 */ - - extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); -diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c -index 930d10f..28816ab 100644 ---- a/fs/nfs/callback_proc.c -+++ b/fs/nfs/callback_proc.c -@@ -8,10 +8,14 @@ - #include - #include - #include -+#include -+#include -+#include - #include "nfs4_fs.h" - #include "callback.h" - #include "delegation.h" - #include "internal.h" -+#include "pnfs.h" - - #ifdef NFS_DEBUG - #define NFSDBG_FACILITY NFSDBG_CALLBACK -@@ -113,16 +117,338 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf - - #if defined(CONFIG_NFS_V4_1) - -+static bool -+pnfs_is_next_layout_stateid(const struct pnfs_layout_hdr *lo, -+ const nfs4_stateid stateid) -+{ -+ int seqlock; -+ bool res; -+ u32 oldseqid, newseqid; -+ -+ do { -+ seqlock = read_seqbegin(&lo->seqlock); -+ oldseqid = be32_to_cpu(lo->stateid.stateid.seqid); -+ newseqid = be32_to_cpu(stateid.stateid.seqid); -+ res = !memcmp(lo->stateid.stateid.other, -+ stateid.stateid.other, -+ NFS4_STATEID_OTHER_SIZE); -+ if (res) { /* 
comparing layout stateids */ -+ if (oldseqid == ~0) -+ res = (newseqid == 1); -+ else -+ res = (newseqid == oldseqid + 1); -+ } else { /* open stateid */ -+ res = !memcmp(lo->stateid.data, -+ &zero_stateid, -+ NFS4_STATEID_SIZE); -+ if (res) -+ res = (newseqid == 1); -+ } -+ } while (read_seqretry(&lo->seqlock, seqlock)); -+ -+ return res; -+} -+ -+/* -+ * Retrieve an inode based on layout recall parameters -+ * -+ * Note: caller must iput(inode) to dereference the inode. -+ */ -+static struct inode * -+nfs_layoutrecall_find_inode(struct nfs_client *clp, -+ const struct cb_layoutrecallargs *args) -+{ -+ struct nfs_inode *nfsi; -+ struct pnfs_layout_hdr *lo; -+ struct nfs_server *server; -+ struct inode *ino = NULL; -+ -+ dprintk("%s: Begin recall_type=%d clp %p\n", -+ __func__, args->cbl_recall_type, clp); -+ -+ spin_lock(&clp->cl_lock); -+ list_for_each_entry(lo, &clp->cl_layouts, layouts) { -+ nfsi = NFS_I(lo->inode); -+ if (!nfsi) -+ continue; -+ -+ dprintk("%s: Searching inode=%lu\n", -+ __func__, nfsi->vfs_inode.i_ino); -+ -+ if (args->cbl_recall_type == RETURN_FILE) { -+ if (nfs_compare_fh(&args->cbl_fh, &nfsi->fh)) -+ continue; -+ } else if (args->cbl_recall_type == RETURN_FSID) { -+ server = NFS_SERVER(&nfsi->vfs_inode); -+ if (server->fsid.major != args->cbl_fsid.major || -+ server->fsid.minor != args->cbl_fsid.minor) -+ continue; -+ } -+ -+ /* Make sure client didn't clean up layout without -+ * telling the server */ -+ if (!has_layout(nfsi)) -+ continue; -+ -+ ino = igrab(&nfsi->vfs_inode); -+ dprintk("%s: Found inode=%p\n", __func__, ino); -+ break; -+ } -+ spin_unlock(&clp->cl_lock); -+ return ino; -+} -+ -+struct recall_layout_threadargs { -+ struct inode *inode; -+ struct nfs_client *clp; -+ struct completion started; -+ struct cb_layoutrecallargs *rl; -+ int result; -+}; -+ -+static int pnfs_recall_layout(void *data) -+{ -+ struct inode *inode, *ino; -+ struct nfs_client *clp; -+ struct cb_layoutrecallargs rl; -+ struct nfs4_layoutreturn *lrp; -+ 
struct recall_layout_threadargs *args = -+ (struct recall_layout_threadargs *)data; -+ int status = 0; -+ -+ daemonize("nfsv4-layoutreturn"); -+ -+ dprintk("%s: recall_type=%d fsid 0x%llx-0x%llx start\n", -+ __func__, args->rl->cbl_recall_type, -+ args->rl->cbl_fsid.major, args->rl->cbl_fsid.minor); -+ -+ clp = args->clp; -+ inode = args->inode; -+ rl = *args->rl; -+ -+ /* support whole file layouts only */ -+ rl.cbl_seg.offset = 0; -+ rl.cbl_seg.length = NFS4_MAX_UINT64; -+ -+ if (rl.cbl_recall_type == RETURN_FILE) { -+ if (pnfs_is_next_layout_stateid(NFS_I(inode)->layout, -+ rl.cbl_stateid)) -+ status = pnfs_return_layout(inode, &rl.cbl_seg, -+ &rl.cbl_stateid, RETURN_FILE, -+ false); -+ else -+ status = cpu_to_be32(NFS4ERR_DELAY); -+ if (status) -+ dprintk("%s RETURN_FILE error: %d\n", __func__, status); -+ else -+ status = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT); -+ args->result = status; -+ complete(&args->started); -+ goto out; -+ } -+ -+ status = cpu_to_be32(NFS4_OK); -+ args->result = status; -+ complete(&args->started); -+ args = NULL; -+ -+ /* IMPROVEME: This loop is inefficient, running in O(|s_inodes|^2) */ -+ while ((ino = nfs_layoutrecall_find_inode(clp, &rl)) != NULL) { -+ /* FIXME: need to check status on pnfs_return_layout */ -+ pnfs_return_layout(ino, &rl.cbl_seg, NULL, RETURN_FILE, false); -+ iput(ino); -+ } -+ -+ lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); -+ if (!lrp) { -+ dprintk("%s: allocation failed. Cannot send last LAYOUTRETURN\n", -+ __func__); -+ goto out; -+ } -+ -+ /* send final layoutreturn */ -+ lrp->args.reclaim = 0; -+ lrp->args.layout_type = rl.cbl_layout_type; -+ lrp->args.return_type = rl.cbl_recall_type; -+ lrp->args.range = rl.cbl_seg; -+ lrp->args.inode = inode; -+ nfs4_proc_layoutreturn(lrp, true); -+ -+out: -+ clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state); -+ nfs_put_client(clp); -+ module_put_and_exit(0); -+ dprintk("%s: exit status %d\n", __func__, 0); -+ return 0; -+} -+ -+/* -+ * Asynchronous layout recall! 
-+ */ -+static int pnfs_async_return_layout(struct nfs_client *clp, struct inode *inode, -+ struct cb_layoutrecallargs *rl) -+{ -+ struct recall_layout_threadargs data = { -+ .clp = clp, -+ .inode = inode, -+ .rl = rl, -+ }; -+ struct task_struct *t; -+ int status = -EAGAIN; -+ -+ dprintk("%s: -->\n", __func__); -+ -+ /* FIXME: do not allow two concurrent layout recalls */ -+ if (test_and_set_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state)) -+ return status; -+ -+ init_completion(&data.started); -+ __module_get(THIS_MODULE); -+ atomic_inc(&clp->cl_count); -+ -+ t = kthread_run(pnfs_recall_layout, &data, "%s", "pnfs_recall_layout"); -+ if (IS_ERR(t)) { -+ printk(KERN_INFO "NFS: Layout recall callback thread failed " -+ "for client (clientid %08x/%08x)\n", -+ (unsigned)(clp->cl_clientid >> 32), -+ (unsigned)(clp->cl_clientid)); -+ status = PTR_ERR(t); -+ goto out_module_put; -+ } -+ wait_for_completion(&data.started); -+ return data.result; -+out_module_put: -+ nfs_put_client(clp); -+ clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state); -+ module_put(THIS_MODULE); -+ return status; -+} -+ -+static int pnfs_recall_all_layouts(struct nfs_client *clp) -+{ -+ struct cb_layoutrecallargs rl; -+ struct inode *inode; -+ int status = 0; -+ -+ rl.cbl_recall_type = RETURN_ALL; -+ rl.cbl_seg.iomode = IOMODE_ANY; -+ rl.cbl_seg.offset = 0; -+ rl.cbl_seg.length = NFS4_MAX_UINT64; -+ -+ /* we need the inode to get the nfs_server struct */ -+ inode = nfs_layoutrecall_find_inode(clp, &rl); -+ if (!inode) -+ return status; -+ status = pnfs_async_return_layout(clp, inode, &rl); -+ iput(inode); -+ -+ return status; -+} -+ -+__be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args, -+ void *dummy) -+{ -+ struct nfs_client *clp; -+ struct inode *inode = NULL; -+ __be32 res; -+ int status; -+ unsigned int num_client = 0; -+ -+ dprintk("%s: -->\n", __func__); -+ -+ res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); -+ clp = nfs_find_client(args->cbl_addr, 4); -+ if (clp == NULL) -+ goto out; 
-+ -+ res = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT); -+ do { -+ struct nfs_client *prev = clp; -+ num_client++; -+ /* the callback must come from the MDS personality */ -+ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS)) -+ goto loop; -+ /* In the _ALL or _FSID case, we need the inode to get -+ * the nfs_server struct. -+ */ -+ inode = nfs_layoutrecall_find_inode(clp, args); -+ if (!inode) -+ goto loop; -+ status = pnfs_async_return_layout(clp, inode, args); -+ if (status) -+ res = cpu_to_be32(NFS4ERR_DELAY); -+ iput(inode); -+loop: -+ clp = nfs_find_client_next(prev); -+ nfs_put_client(prev); -+ } while (clp != NULL); -+ -+out: -+ dprintk("%s: exit with status = %d numclient %u\n", -+ __func__, ntohl(res), num_client); -+ return res; -+} -+ -+/* Remove the deviceid(s) from the nfs_client deviceid cache */ -+static __be32 pnfs_devicenotify_client(struct nfs_client *clp, -+ struct cb_devicenotifyargs *args) -+{ -+ uint32_t type; -+ int i; -+ -+ dprintk("%s: --> clp %p\n", __func__, clp); -+ -+ for (i = 0; i < args->ndevs; i++) { -+ struct cb_devicenotifyitem *dev = &args->devs[i]; -+ type = dev->cbd_notify_type; -+ if (type == NOTIFY_DEVICEID4_DELETE && clp->cl_devid_cache) -+ pnfs_delete_deviceid(clp->cl_devid_cache, -+ &dev->cbd_dev_id); -+ else if (type == NOTIFY_DEVICEID4_CHANGE) -+ printk(KERN_ERR "%s: NOTIFY_DEVICEID4_CHANGE " -+ "not supported\n", __func__); -+ } -+ return 0; -+} -+ -+__be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args, -+ void *dummy) -+{ -+ struct nfs_client *clp; -+ __be32 res = 0; -+ unsigned int num_client = 0; -+ -+ dprintk("%s: -->\n", __func__); -+ -+ res = __constant_htonl(NFS4ERR_INVAL); -+ clp = nfs_find_client(args->addr, 4); -+ if (clp == NULL) -+ goto out; -+ -+ do { -+ struct nfs_client *prev = clp; -+ num_client++; -+ res = pnfs_devicenotify_client(clp, args); -+ clp = nfs_find_client_next(prev); -+ nfs_put_client(prev); -+ } while (clp != NULL); -+ -+out: -+ dprintk("%s: exit with status = %d 
numclient %u\n", -+ __func__, ntohl(res), num_client); -+ return res; -+} -+ - int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) - { - if (delegation == NULL) - return 0; - -- /* seqid is 4-bytes long */ -- if (((u32 *) &stateid->data)[0] != 0) -+ if (stateid->stateid.seqid != 0) - return 0; -- if (memcmp(&delegation->stateid.data[4], &stateid->data[4], -- sizeof(stateid->data)-4)) -+ if (memcmp(&delegation->stateid.stateid.other, -+ &stateid->stateid.other, -+ NFS4_STATEID_OTHER_SIZE)) - return 0; - - return 1; -@@ -324,13 +650,37 @@ out: - return status; - } - -+static inline bool -+validate_bitmap_values(const unsigned long *mask) -+{ -+ int i; -+ -+ if (*mask == 0) -+ return true; -+ if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, mask) || -+ test_bit(RCA4_TYPE_MASK_WDATA_DLG, mask) || -+ test_bit(RCA4_TYPE_MASK_DIR_DLG, mask) || -+ test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, mask) || -+ test_bit(RCA4_TYPE_MASK_BLK_LAYOUT, mask)) -+ return true; -+ for (i = RCA4_TYPE_MASK_OBJ_LAYOUT_MIN; -+ i <= RCA4_TYPE_MASK_OBJ_LAYOUT_MAX; i++) -+ if (test_bit(i, mask)) -+ return true; -+ for (i = RCA4_TYPE_MASK_OTHER_LAYOUT_MIN; -+ i <= RCA4_TYPE_MASK_OTHER_LAYOUT_MAX; i++) -+ if (test_bit(i, mask)) -+ return true; -+ return false; -+} -+ - __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy) - { - struct nfs_client *clp; - __be32 status; - fmode_t flags = 0; - -- status = htonl(NFS4ERR_OP_NOT_IN_SESSION); -+ status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); - clp = nfs_find_client(args->craa_addr, 4); - if (clp == NULL) - goto out; -@@ -338,16 +688,27 @@ __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy) - dprintk("NFS: RECALL_ANY callback request from %s\n", - rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); - -+ status = cpu_to_be32(NFS4ERR_INVAL); -+ if (!validate_bitmap_values((const unsigned long *) -+ &args->craa_type_mask)) -+ goto out_put; -+ -+ status = cpu_to_be32(NFS4_OK); - 
if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *) - &args->craa_type_mask)) - flags = FMODE_READ; - if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *) - &args->craa_type_mask)) - flags |= FMODE_WRITE; -+ if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *) -+ &args->craa_type_mask)) -+ if (pnfs_recall_all_layouts(clp) == -EAGAIN) -+ status = cpu_to_be32(NFS4ERR_DELAY); - - if (flags) - nfs_expire_all_delegation_types(clp, flags); -- status = htonl(NFS4_OK); -+out_put: -+ nfs_put_client(clp); - out: - dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); - return status; -diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c -index 05af212..fbfa2b9 100644 ---- a/fs/nfs/callback_xdr.c -+++ b/fs/nfs/callback_xdr.c -@@ -22,6 +22,8 @@ - #define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) - - #if defined(CONFIG_NFS_V4_1) -+#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) -+#define CB_OP_DEVICENOTIFY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) - #define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ - 4 + 1 + 3) - #define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) -@@ -220,6 +222,147 @@ out: - - #if defined(CONFIG_NFS_V4_1) - -+static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, -+ struct xdr_stream *xdr, -+ struct cb_layoutrecallargs *args) -+{ -+ __be32 *p; -+ __be32 status = 0; -+ -+ args->cbl_addr = svc_addr(rqstp); -+ p = read_buf(xdr, 4 * sizeof(uint32_t)); -+ if (unlikely(p == NULL)) { -+ status = htonl(NFS4ERR_BADXDR); -+ goto out; -+ } -+ -+ args->cbl_layout_type = ntohl(*p++); -+ args->cbl_seg.iomode = ntohl(*p++); -+ args->cbl_layoutchanged = ntohl(*p++); -+ args->cbl_recall_type = ntohl(*p++); -+ -+ if (likely(args->cbl_recall_type == RETURN_FILE)) { -+ status = decode_fh(xdr, &args->cbl_fh); -+ if (unlikely(status != 0)) -+ goto out; -+ -+ p = read_buf(xdr, 2 * sizeof(uint64_t)); -+ if (unlikely(p == NULL)) { -+ status = htonl(NFS4ERR_BADXDR); -+ goto out; -+ } -+ p = xdr_decode_hyper(p, 
&args->cbl_seg.offset); -+ p = xdr_decode_hyper(p, &args->cbl_seg.length); -+ status = decode_stateid(xdr, &args->cbl_stateid); -+ if (unlikely(status != 0)) -+ goto out; -+ } else if (args->cbl_recall_type == RETURN_FSID) { -+ p = read_buf(xdr, 2 * sizeof(uint64_t)); -+ if (unlikely(p == NULL)) { -+ status = htonl(NFS4ERR_BADXDR); -+ goto out; -+ } -+ p = xdr_decode_hyper(p, &args->cbl_fsid.major); -+ p = xdr_decode_hyper(p, &args->cbl_fsid.minor); -+ } -+ dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d " -+ "fsid %llx-%llx fhsize %d\n", __func__, -+ args->cbl_layout_type, args->cbl_seg.iomode, -+ args->cbl_layoutchanged, args->cbl_recall_type, -+ args->cbl_fsid.major, args->cbl_fsid.minor, -+ args->cbl_fh.size); -+out: -+ dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); -+ return status; -+} -+ -+static -+__be32 decode_devicenotify_args(struct svc_rqst *rqstp, -+ struct xdr_stream *xdr, -+ struct cb_devicenotifyargs *args) -+{ -+ __be32 *p; -+ __be32 status = 0; -+ u32 tmp; -+ int n, i; -+ args->ndevs = 0; -+ -+ args->addr = svc_addr(rqstp); -+ -+ /* Num of device notifications */ -+ p = read_buf(xdr, sizeof(uint32_t)); -+ if (unlikely(p == NULL)) { -+ status = htonl(NFS4ERR_RESOURCE); -+ goto out; -+ } -+ n = ntohl(*p++); -+ if (n <= 0) -+ goto out; -+ -+ /* XXX: need to possibly return error in this case */ -+ if (n > NFS4_DEV_NOTIFY_MAXENTRIES) { -+ dprintk("%s: Processing (%d) notifications out of (%d)\n", -+ __func__, NFS4_DEV_NOTIFY_MAXENTRIES, n); -+ n = NFS4_DEV_NOTIFY_MAXENTRIES; -+ } -+ -+ /* Decode each dev notification */ -+ for (i = 0; i < n; i++) { -+ struct cb_devicenotifyitem *dev = &args->devs[i]; -+ -+ p = read_buf(xdr, (4 * sizeof(uint32_t)) + NFS4_DEVICEID4_SIZE); -+ if (unlikely(p == NULL)) { -+ status = htonl(NFS4ERR_RESOURCE); -+ goto out; -+ } -+ -+ tmp = ntohl(*p++); /* bitmap size */ -+ if (tmp != 1) { -+ status = htonl(NFS4ERR_INVAL); -+ goto out; -+ } -+ dev->cbd_notify_type = ntohl(*p++); -+ if 
(dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE && -+ dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) { -+ status = htonl(NFS4ERR_INVAL); -+ goto out; -+ } -+ -+ tmp = ntohl(*p++); /* opaque size */ -+ if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) && -+ (tmp != NFS4_DEVICEID4_SIZE + 8)) || -+ ((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) && -+ (tmp != NFS4_DEVICEID4_SIZE + 4))) { -+ status = htonl(NFS4ERR_INVAL); -+ goto out; -+ } -+ dev->cbd_layout_type = ntohl(*p++); -+ memcpy(dev->cbd_dev_id.data, p, NFS4_DEVICEID4_SIZE); -+ p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); -+ -+ if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) { -+ p = read_buf(xdr, sizeof(uint32_t)); -+ if (unlikely(p == NULL)) { -+ status = htonl(NFS4ERR_DELAY); -+ goto out; -+ } -+ dev->cbd_immediate = ntohl(*p++); -+ } else { -+ dev->cbd_immediate = 0; -+ } -+ -+ args->ndevs++; -+ -+ dprintk("%s: type %d layout 0x%x immediate %d\n", -+ __func__, dev->cbd_notify_type, dev->cbd_layout_type, -+ dev->cbd_immediate); -+ } -+out: -+ dprintk("%s: status %d ndevs %d\n", -+ __func__, ntohl(status), args->ndevs); -+ return status; -+} -+ - static __be32 decode_sessionid(struct xdr_stream *xdr, - struct nfs4_sessionid *sid) - { -@@ -574,11 +717,11 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) - case OP_CB_SEQUENCE: - case OP_CB_RECALL_ANY: - case OP_CB_RECALL_SLOT: -+ case OP_CB_LAYOUTRECALL: -+ case OP_CB_NOTIFY_DEVICEID: - *op = &callback_ops[op_nr]; - break; - -- case OP_CB_LAYOUTRECALL: -- case OP_CB_NOTIFY_DEVICEID: - case OP_CB_NOTIFY: - case OP_CB_PUSH_DELEG: - case OP_CB_RECALLABLE_OBJ_AVAIL: -@@ -739,6 +882,18 @@ static struct callback_op callback_ops[] = { - .res_maxsize = CB_OP_RECALL_RES_MAXSZ, - }, - #if defined(CONFIG_NFS_V4_1) -+ [OP_CB_LAYOUTRECALL] = { -+ .process_op = (callback_process_op_t)nfs4_callback_layoutrecall, -+ .decode_args = -+ (callback_decode_arg_t)decode_layoutrecall_args, -+ .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ, -+ }, -+ 
[OP_CB_NOTIFY_DEVICEID] = { -+ .process_op = (callback_process_op_t)nfs4_callback_devicenotify, -+ .decode_args = -+ (callback_decode_arg_t)decode_devicenotify_args, -+ .res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ, -+ }, - [OP_CB_SEQUENCE] = { - .process_op = (callback_process_op_t)nfs4_callback_sequence, - .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, -diff --git a/fs/nfs/client.c b/fs/nfs/client.c -index e734072..9e1135e 100644 ---- a/fs/nfs/client.c -+++ b/fs/nfs/client.c -@@ -48,6 +48,7 @@ - #include "iostat.h" - #include "internal.h" - #include "fscache.h" -+#include "pnfs.h" - - #define NFSDBG_FACILITY NFSDBG_CLIENT - -@@ -155,7 +156,9 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ - cred = rpc_lookup_machine_cred(); - if (!IS_ERR(cred)) - clp->cl_machine_cred = cred; -- -+#if defined(CONFIG_NFS_V4_1) -+ INIT_LIST_HEAD(&clp->cl_layouts); -+#endif - nfs_fscache_get_client_cookie(clp); - - return clp; -@@ -252,6 +255,7 @@ void nfs_put_client(struct nfs_client *clp) - nfs_free_client(clp); - } - } -+EXPORT_SYMBOL_GPL(nfs_put_client); - - #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - /* -@@ -344,7 +348,7 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, - * Test if two socket addresses represent the same actual socket, - * by comparing (only) relevant fields, including the port number. 
- */ --static int nfs_sockaddr_cmp(const struct sockaddr *sa1, -+int nfs_sockaddr_cmp(const struct sockaddr *sa1, - const struct sockaddr *sa2) - { - if (sa1->sa_family != sa2->sa_family) -@@ -358,6 +362,7 @@ static int nfs_sockaddr_cmp(const struct sockaddr *sa1, - } - return 0; - } -+EXPORT_SYMBOL(nfs_sockaddr_cmp); - - /* - * Find a client by IP address and protocol version -@@ -549,6 +554,7 @@ int nfs4_check_client_ready(struct nfs_client *clp) - return -EPROTONOSUPPORT; - return 0; - } -+EXPORT_SYMBOL(nfs4_check_client_ready); - - /* - * Initialise the timeout values for a connection -@@ -868,7 +874,7 @@ error: - /* - * Load up the server record from information gained in an fsinfo record - */ --static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo) -+static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fsinfo *fsinfo) - { - unsigned long max_rpc_payload; - -@@ -898,6 +904,10 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo * - if (server->wsize > NFS_MAX_FILE_IO_SIZE) - server->wsize = NFS_MAX_FILE_IO_SIZE; - server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; -+ server->pnfs_blksize = fsinfo->blksize; -+ set_pnfs_layoutdriver(server, mntfh, fsinfo->layouttype); -+ pnfs_set_ds_iosize(server); -+ - server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); - - server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); -@@ -934,12 +944,13 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str - goto out_error; - } - -+ memset(&fsinfo, 0, sizeof(fsinfo)); - fsinfo.fattr = fattr; - error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); - if (error < 0) - goto out_error; - -- nfs_server_set_fsinfo(server, &fsinfo); -+ nfs_server_set_fsinfo(server, mntfh, &fsinfo); - - /* Get some general file system info */ - if (server->namelen == 0) { -@@ -1017,6 +1028,7 @@ void nfs_free_server(struct nfs_server *server) - { - 
dprintk("--> nfs_free_server()\n"); - -+ unset_pnfs_layoutdriver(server); - spin_lock(&nfs_client_lock); - list_del(&server->client_link); - list_del(&server->master_link); -@@ -1221,7 +1233,7 @@ error: - /* - * Set up an NFS4 client - */ --static int nfs4_set_client(struct nfs_server *server, -+int nfs4_set_client(struct nfs_server *server, - const char *hostname, - const struct sockaddr *addr, - const size_t addrlen, -@@ -1264,6 +1276,7 @@ error: - dprintk("<-- nfs4_set_client() = xerror %d\n", error); - return error; - } -+EXPORT_SYMBOL(nfs4_set_client); - - - /* -diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c -index 064a809..43786c2 100644 ---- a/fs/nfs/direct.c -+++ b/fs/nfs/direct.c -@@ -271,6 +271,38 @@ static const struct rpc_call_ops nfs_read_direct_ops = { - .rpc_release = nfs_direct_read_release, - }; - -+static long nfs_direct_read_execute(struct nfs_read_data *data, -+ struct rpc_task_setup *task_setup_data, -+ struct rpc_message *msg) -+{ -+ struct inode *inode = data->inode; -+ struct rpc_task *task; -+ -+ nfs_fattr_init(&data->fattr); -+ msg->rpc_argp = &data->args; -+ msg->rpc_resp = &data->res; -+ -+ task_setup_data->task = &data->task; -+ task_setup_data->callback_data = data; -+ NFS_PROTO(inode)->read_setup(data, msg); -+ -+ task = rpc_run_task(task_setup_data); -+ if (IS_ERR(task)) -+ return PTR_ERR(task); -+ -+ rpc_put_task(task); -+ -+ dprintk("NFS: %5u initiated direct read call " -+ "(req %s/%lld, %u bytes @ offset %llu)\n", -+ data->task.tk_pid, -+ inode->i_sb->s_id, -+ (long long)NFS_FILEID(inode), -+ data->args.count, -+ (unsigned long long)data->args.offset); -+ -+ return 0; -+} -+ - /* - * For each rsize'd chunk of the user's buffer, dispatch an NFS READ - * operation. 
If nfs_readdata_alloc() or get_user_pages() fails, -@@ -287,7 +319,6 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, - unsigned long user_addr = (unsigned long)iov->iov_base; - size_t count = iov->iov_len; - size_t rsize = NFS_SERVER(inode)->rsize; -- struct rpc_task *task; - struct rpc_message msg = { - .rpc_cred = ctx->cred, - }; -@@ -348,26 +379,9 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, - data->res.fattr = &data->fattr; - data->res.eof = 0; - data->res.count = bytes; -- nfs_fattr_init(&data->fattr); -- msg.rpc_argp = &data->args; -- msg.rpc_resp = &data->res; - -- task_setup_data.task = &data->task; -- task_setup_data.callback_data = data; -- NFS_PROTO(inode)->read_setup(data, &msg); -- -- task = rpc_run_task(&task_setup_data); -- if (IS_ERR(task)) -+ if (nfs_direct_read_execute(data, &task_setup_data, &msg)) - break; -- rpc_put_task(task); -- -- dprintk("NFS: %5u initiated direct read call " -- "(req %s/%Ld, %zu bytes @ offset %Lu)\n", -- data->task.tk_pid, -- inode->i_sb->s_id, -- (long long)NFS_FILEID(inode), -- bytes, -- (unsigned long long)data->args.offset); - - started += bytes; - user_addr += bytes; -@@ -457,12 +471,15 @@ static void nfs_direct_free_writedata(struct nfs_direct_req *dreq) - } - - #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) -+static long nfs_direct_write_execute(struct nfs_write_data *data, -+ struct rpc_task_setup *task_setup_data, -+ struct rpc_message *msg); -+ - static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) - { - struct inode *inode = dreq->inode; - struct list_head *p; - struct nfs_write_data *data; -- struct rpc_task *task; - struct rpc_message msg = { - .rpc_cred = dreq->ctx->cred, - }; -@@ -496,25 +513,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) - * Reuse data->task; data->args should not have changed - * since the original request was sent. 
- */ -- task_setup_data.task = &data->task; -- task_setup_data.callback_data = data; -- msg.rpc_argp = &data->args; -- msg.rpc_resp = &data->res; -- NFS_PROTO(inode)->write_setup(data, &msg); -- -- /* -- * We're called via an RPC callback, so BKL is already held. -- */ -- task = rpc_run_task(&task_setup_data); -- if (!IS_ERR(task)) -- rpc_put_task(task); -- -- dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n", -- data->task.tk_pid, -- inode->i_sb->s_id, -- (long long)NFS_FILEID(inode), -- data->args.count, -- (unsigned long long)data->args.offset); -+ nfs_direct_write_execute(data, &task_setup_data, &msg); - } - - if (put_dreq(dreq)) -@@ -557,10 +556,31 @@ static const struct rpc_call_ops nfs_commit_direct_ops = { - .rpc_release = nfs_direct_commit_release, - }; - -+static long nfs_direct_commit_execute(struct nfs_direct_req *dreq, -+ struct nfs_write_data *data, -+ struct rpc_task_setup *task_setup_data, -+ struct rpc_message *msg) -+{ -+ struct rpc_task *task; -+ -+ NFS_PROTO(data->inode)->commit_setup(data, msg); -+ -+ /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ -+ dreq->commit_data = NULL; -+ -+ dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); -+ -+ task = rpc_run_task(task_setup_data); -+ if (IS_ERR(task)) -+ return PTR_ERR(task); -+ -+ rpc_put_task(task); -+ return 0; -+} -+ - static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) - { - struct nfs_write_data *data = dreq->commit_data; -- struct rpc_task *task; - struct rpc_message msg = { - .rpc_argp = &data->args, - .rpc_resp = &data->res, -@@ -589,16 +609,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) - data->res.verf = &data->verf; - nfs_fattr_init(&data->fattr); - -- NFS_PROTO(data->inode)->commit_setup(data, &msg); -- -- /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ -- dreq->commit_data = NULL; -- -- dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); -- -- task = 
rpc_run_task(&task_setup_data); -- if (!IS_ERR(task)) -- rpc_put_task(task); -+ nfs_direct_commit_execute(dreq, data, &task_setup_data, &msg); - } - - static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) -@@ -700,6 +711,36 @@ static const struct rpc_call_ops nfs_write_direct_ops = { - .rpc_release = nfs_direct_write_release, - }; - -+static long nfs_direct_write_execute(struct nfs_write_data *data, -+ struct rpc_task_setup *task_setup_data, -+ struct rpc_message *msg) -+{ -+ struct inode *inode = data->inode; -+ struct rpc_task *task; -+ -+ task_setup_data->task = &data->task; -+ task_setup_data->callback_data = data; -+ msg->rpc_argp = &data->args; -+ msg->rpc_resp = &data->res; -+ NFS_PROTO(inode)->write_setup(data, msg); -+ -+ task = rpc_run_task(task_setup_data); -+ if (IS_ERR(task)) -+ return PTR_ERR(task); -+ -+ rpc_put_task(task); -+ -+ dprintk("NFS: %5u initiated direct write call " -+ "(req %s/%lld, %u bytes @ offset %llu)\n", -+ data->task.tk_pid, -+ inode->i_sb->s_id, -+ (long long)NFS_FILEID(inode), -+ data->args.count, -+ (unsigned long long)data->args.offset); -+ -+ return 0; -+} -+ - /* - * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE - * operation. 
If nfs_writedata_alloc() or get_user_pages() fails, -@@ -715,7 +756,6 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, - struct inode *inode = ctx->path.dentry->d_inode; - unsigned long user_addr = (unsigned long)iov->iov_base; - size_t count = iov->iov_len; -- struct rpc_task *task; - struct rpc_message msg = { - .rpc_cred = ctx->cred, - }; -@@ -782,24 +822,8 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, - data->res.verf = &data->verf; - nfs_fattr_init(&data->fattr); - -- task_setup_data.task = &data->task; -- task_setup_data.callback_data = data; -- msg.rpc_argp = &data->args; -- msg.rpc_resp = &data->res; -- NFS_PROTO(inode)->write_setup(data, &msg); -- -- task = rpc_run_task(&task_setup_data); -- if (IS_ERR(task)) -+ if (nfs_direct_write_execute(data, &task_setup_data, &msg)) - break; -- rpc_put_task(task); -- -- dprintk("NFS: %5u initiated direct write call " -- "(req %s/%Ld, %zu bytes @ offset %Lu)\n", -- data->task.tk_pid, -- inode->i_sb->s_id, -- (long long)NFS_FILEID(inode), -- bytes, -- (unsigned long long)data->args.offset); - - started += bytes; - user_addr += bytes; -diff --git a/fs/nfs/file.c b/fs/nfs/file.c -index 05bf3c0..28d4aa3 100644 ---- a/fs/nfs/file.c -+++ b/fs/nfs/file.c -@@ -36,6 +36,7 @@ - #include "internal.h" - #include "iostat.h" - #include "fscache.h" -+#include "pnfs.h" - - #define NFSDBG_FACILITY NFSDBG_FILE - -@@ -380,12 +381,16 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, - pgoff_t index = pos >> PAGE_CACHE_SHIFT; - struct page *page; - int once_thru = 0; -+ struct pnfs_layout_segment *lseg; - - dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n", - file->f_path.dentry->d_parent->d_name.name, - file->f_path.dentry->d_name.name, - mapping->host->i_ino, len, (long long) pos); - -+ lseg = pnfs_update_layout(mapping->host, -+ nfs_file_open_context(file), -+ pos, len, IOMODE_RW); - start: - /* - * Prevent starvation issues if 
someone is doing a consistency -@@ -394,17 +399,22 @@ start: - ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, - nfs_wait_bit_killable, TASK_KILLABLE); - if (ret) -- return ret; -+ goto out; - - page = grab_cache_page_write_begin(mapping, index, flags); -- if (!page) -- return -ENOMEM; -+ if (!page) { -+ ret = -ENOMEM; -+ goto out; -+ } - *pagep = page; - -- ret = nfs_flush_incompatible(file, page); -+ ret = nfs_flush_incompatible(file, page, lseg); - if (ret) { - unlock_page(page); - page_cache_release(page); -+ *pagep = NULL; -+ *fsdata = NULL; -+ goto out; - } else if (!once_thru && - nfs_want_read_modify_write(file, page, pos, len)) { - once_thru = 1; -@@ -413,6 +423,12 @@ start: - if (!ret) - goto start; - } -+ ret = pnfs_write_begin(file, page, pos, len, lseg, fsdata); -+ out: -+ if (ret) { -+ put_lseg(lseg); -+ *fsdata = NULL; -+ } - return ret; - } - -@@ -422,6 +438,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, - { - unsigned offset = pos & (PAGE_CACHE_SIZE - 1); - int status; -+ struct pnfs_layout_segment *lseg; - - dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n", - file->f_path.dentry->d_parent->d_name.name, -@@ -448,10 +465,17 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, - zero_user_segment(page, pglen, PAGE_CACHE_SIZE); - } - -- status = nfs_updatepage(file, page, offset, copied); -+ lseg = nfs4_pull_lseg_from_fsdata(file, fsdata); -+ status = pnfs_write_end(file, page, pos, len, copied, lseg); -+ if (status) -+ goto out; -+ status = nfs_updatepage(file, page, offset, copied, lseg, fsdata); - -+ out: - unlock_page(page); - page_cache_release(page); -+ pnfs_write_end_cleanup(file, fsdata); -+ put_lseg(lseg); - - if (status < 0) - return status; -@@ -562,6 +586,8 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) - /* make sure the cache has finished storing the page */ - nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), 
page); - -+ /* XXX Do we want to call pnfs_update_layout here? */ -+ - lock_page(page); - mapping = page->mapping; - if (mapping != dentry->d_inode->i_mapping) -@@ -572,11 +598,11 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) - if (pagelen == 0) - goto out_unlock; - -- ret = nfs_flush_incompatible(filp, page); -+ ret = nfs_flush_incompatible(filp, page, NULL); - if (ret != 0) - goto out_unlock; - -- ret = nfs_updatepage(filp, page, 0, pagelen); -+ ret = nfs_updatepage(filp, page, 0, pagelen, NULL, NULL); - out_unlock: - if (!ret) - return VM_FAULT_LOCKED; -diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c -index 7d2d6c7..437d9a6 100644 ---- a/fs/nfs/inode.c -+++ b/fs/nfs/inode.c -@@ -48,6 +48,7 @@ - #include "internal.h" - #include "fscache.h" - #include "dns_resolve.h" -+#include "pnfs.h" - - #define NFSDBG_FACILITY NFSDBG_VFS - -@@ -648,6 +649,7 @@ struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) - atomic_inc(&ctx->lock_context.count); - return ctx; - } -+EXPORT_SYMBOL(get_nfs_open_context); - - static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) - { -@@ -1000,6 +1002,7 @@ void nfs_fattr_init(struct nfs_fattr *fattr) - fattr->time_start = jiffies; - fattr->gencount = nfs_inc_attr_generation_counter(); - } -+EXPORT_SYMBOL(nfs_fattr_init); - - struct nfs_fattr *nfs_alloc_fattr(void) - { -@@ -1209,6 +1212,14 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) - server->fsid = fattr->fsid; - - /* -+ * file needs layout commit, server attributes may be stale -+ */ -+ if (layoutcommit_needed(nfsi) && nfsi->change_attr >= fattr->change_attr) { -+ dprintk("NFS: %s: layoutcommit is needed for file %s/%ld\n", -+ __func__, inode->i_sb->s_id, inode->i_ino); -+ return 0; -+ } -+ /* - * Update the read time so we don't revalidate too often. 
- */ - nfsi->read_cache_jiffies = fattr->time_start; -@@ -1407,11 +1418,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) - */ - void nfs4_evict_inode(struct inode *inode) - { -+ pnfs_return_layout(inode, NULL, NULL, RETURN_FILE, true); - truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); -+ pnfs_destroy_layout(NFS_I(inode)); - /* If we are holding a delegation, return it! */ - nfs_inode_return_delegation_noreclaim(inode); -- /* First call standard NFS clear_inode() code */ - nfs_clear_inode(inode); - } - #endif -@@ -1446,6 +1458,8 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi) - nfsi->delegation = NULL; - nfsi->delegation_state = 0; - init_rwsem(&nfsi->rwsem); -+ rpc_init_wait_queue(&nfsi->lo_rpcwaitq, "pNFS Layout"); -+ nfsi->layout = NULL; - #endif - } - -diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h -index c961bc9..4e7a4c9 100644 ---- a/fs/nfs/internal.h -+++ b/fs/nfs/internal.h -@@ -139,6 +139,16 @@ extern struct nfs_server *nfs_clone_server(struct nfs_server *, - struct nfs_fattr *); - extern void nfs_mark_client_ready(struct nfs_client *clp, int state); - extern int nfs4_check_client_ready(struct nfs_client *clp); -+extern int nfs_sockaddr_cmp(const struct sockaddr *sa1, -+ const struct sockaddr *sa2); -+extern int nfs4_set_client(struct nfs_server *server, -+ const char *hostname, -+ const struct sockaddr *addr, -+ const size_t addrlen, -+ const char *ip_addr, -+ rpc_authflavor_t authflavour, -+ int proto, const struct rpc_timeout *timeparms, -+ u32 minorversion); - #ifdef CONFIG_PROC_FS - extern int __init nfs_fs_proc_init(void); - extern void nfs_fs_proc_exit(void); -@@ -201,6 +211,8 @@ extern const u32 nfs41_maxwrite_overhead; - extern struct rpc_procinfo nfs4_procedures[]; - #endif - -+extern int nfs4_recover_expired_lease(struct nfs_client *clp); -+ - /* proc.c */ - void nfs_close_context(struct nfs_open_context *ctx, int is_sync); - -@@ -249,10 +261,31 @@ extern int nfs4_get_rootfh(struct 
nfs_server *server, struct nfs_fh *mntfh); - #endif - - /* read.c */ -+extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, -+ const struct rpc_call_ops *call_ops); -+extern int pnfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, -+ const struct rpc_call_ops *call_ops); - extern void nfs_read_prepare(struct rpc_task *task, void *calldata); - - /* write.c */ -+extern int nfs_initiate_write(struct nfs_write_data *data, -+ struct rpc_clnt *clnt, -+ const struct rpc_call_ops *call_ops, -+ int how); -+extern int pnfs_initiate_write(struct nfs_write_data *data, -+ struct rpc_clnt *clnt, -+ const struct rpc_call_ops *call_ops, -+ int how); -+extern int nfs_initiate_commit(struct nfs_write_data *data, -+ struct rpc_clnt *clnt, -+ const struct rpc_call_ops *call_ops, -+ int how); -+extern int pnfs_initiate_commit(struct nfs_write_data *data, -+ struct rpc_clnt *clnt, -+ const struct rpc_call_ops *call_ops, -+ int how, int pnfs); - extern void nfs_write_prepare(struct rpc_task *task, void *calldata); -+extern void nfs_mark_list_commit(struct list_head *head); - #ifdef CONFIG_MIGRATION - extern int nfs_migrate_page(struct address_space *, - struct page *, struct page *); -diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h -index 311e15c..cb390fb 100644 ---- a/fs/nfs/nfs4_fs.h -+++ b/fs/nfs/nfs4_fs.h -@@ -46,6 +46,7 @@ enum nfs4_client_state { - NFS4CLNT_DELEGRETURN, - NFS4CLNT_SESSION_RESET, - NFS4CLNT_RECALL_SLOT, -+ NFS4CLNT_LAYOUT_RECALL, - }; - - enum nfs4_session_state { -@@ -256,10 +257,12 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser - } - - extern int nfs4_setup_sequence(const struct nfs_server *server, -+ struct nfs4_session *ds_session, - struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, - int cache_reply, struct rpc_task *task); - extern void nfs4_destroy_session(struct nfs4_session *session); - extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); 
-+extern int nfs4_proc_exchange_id(struct nfs_client *, struct rpc_cred *); - extern int nfs4_proc_create_session(struct nfs_client *); - extern int nfs4_proc_destroy_session(struct nfs4_session *); - extern int nfs4_init_session(struct nfs_server *server); -@@ -272,6 +275,7 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser - } - - static inline int nfs4_setup_sequence(const struct nfs_server *server, -+ struct nfs4_session *ds_session, - struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, - int cache_reply, struct rpc_task *task) - { -@@ -289,7 +293,7 @@ extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; - extern const u32 nfs4_fattr_bitmap[2]; - extern const u32 nfs4_statfs_bitmap[2]; - extern const u32 nfs4_pathconf_bitmap[2]; --extern const u32 nfs4_fsinfo_bitmap[2]; -+extern const u32 nfs4_fsinfo_bitmap[3]; - extern const u32 nfs4_fs_locations_bitmap[2]; - - /* nfs4renewd.c */ -@@ -299,13 +303,24 @@ extern void nfs4_kill_renewd(struct nfs_client *); - extern void nfs4_renew_state(struct work_struct *); - - /* nfs4state.c */ -+struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp); - struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp); - struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp); - #if defined(CONFIG_NFS_V4_1) --struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp); - struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp); - #endif /* CONFIG_NFS_V4_1 */ - -+static inline struct rpc_cred * -+nfs4_get_machine_cred(struct nfs_client *clp) -+{ -+ struct rpc_cred *cred; -+ -+ spin_lock(&clp->cl_lock); -+ cred = nfs4_get_machine_cred_locked(clp); -+ spin_unlock(&clp->cl_lock); -+ return cred; -+} -+ - extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); - extern void nfs4_put_state_owner(struct nfs4_state_owner *); - extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct 
nfs4_state_owner *); -diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c -new file mode 100644 -index 0000000..aaabe2f ---- /dev/null -+++ b/fs/nfs/nfs4filelayout.c -@@ -0,0 +1,679 @@ -+/* -+ * Module for the pnfs nfs4 file layout driver. -+ * Defines all I/O and Policy interface operations, plus code -+ * to register itself with the pNFS client. -+ * -+ * Copyright (c) 2002 -+ * The Regents of the University of Michigan -+ * All Rights Reserved -+ * -+ * Dean Hildebrand -+ * -+ * Permission is granted to use, copy, create derivative works, and -+ * redistribute this software and such derivative works for any purpose, -+ * so long as the name of the University of Michigan is not used in -+ * any advertising or publicity pertaining to the use or distribution -+ * of this software without specific, written prior authorization. If -+ * the above copyright notice or any other identification of the -+ * University of Michigan is included in any copy of any portion of -+ * this software, then the disclaimer below must also be included. -+ * -+ * This software is provided as is, without representation or warranty -+ * of any kind either express or implied, including without limitation -+ * the implied warranties of merchantability, fitness for a particular -+ * purpose, or noninfringement. The Regents of the University of -+ * Michigan shall not be liable for any damages, including special, -+ * indirect, incidental, or consequential damages, with respect to any -+ * claim arising out of or in connection with the use of the software, -+ * even if it has been or is hereafter advised of the possibility of -+ * such damages. 
-+ */ -+ -+#include -+ -+#include "internal.h" -+#include "nfs4filelayout.h" -+ -+#define NFSDBG_FACILITY NFSDBG_PNFS_LD -+ -+MODULE_LICENSE("GPL"); -+MODULE_AUTHOR("Dean Hildebrand "); -+MODULE_DESCRIPTION("The NFSv4 file layout driver"); -+ -+int -+filelayout_initialize_mountpoint(struct nfs_server *nfss, -+ const struct nfs_fh *mntfh) -+{ -+ int status = pnfs_alloc_init_deviceid_cache(nfss->nfs_client, -+ nfs4_fl_free_deviceid_callback); -+ if (status) { -+ printk(KERN_WARNING "%s: deviceid cache could not be " -+ "initialized\n", __func__); -+ return status; -+ } -+ dprintk("%s: deviceid cache has been initialized successfully\n", -+ __func__); -+ return 0; -+} -+ -+/* Uninitialize a mountpoint by destroying its device list */ -+int -+filelayout_uninitialize_mountpoint(struct nfs_server *nfss) -+{ -+ dprintk("--> %s\n", __func__); -+ -+ if (nfss->nfs_client->cl_devid_cache) -+ pnfs_put_deviceid_cache(nfss->nfs_client); -+ return 0; -+} -+ -+/* This function is used by the layout driver to calculate the -+ * offset of the file on the dserver based on whether the -+ * layout type is STRIPE_DENSE or STRIPE_SPARSE -+ */ -+static loff_t -+filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) -+{ -+ struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); -+ -+ switch (flseg->stripe_type) { -+ case STRIPE_SPARSE: -+ return offset; -+ -+ case STRIPE_DENSE: -+ { -+ u32 stripe_width; -+ u64 tmp, off; -+ u32 unit = flseg->stripe_unit; -+ -+ stripe_width = unit * flseg->dsaddr->stripe_count; -+ tmp = off = offset - flseg->pattern_offset; -+ do_div(tmp, stripe_width); -+ return tmp * unit + do_div(off, unit); -+ } -+ default: -+ BUG(); -+ } -+ -+ /* We should never get here... just to stop the gcc warning */ -+ return 0; -+} -+ -+/* -+ * Call ops for the async read/write cases -+ * In the case of dense layouts, the offset needs to be reset to its -+ * original value. 
-+ */ -+static void filelayout_read_call_done(struct rpc_task *task, void *data) -+{ -+ struct nfs_read_data *rdata = (struct nfs_read_data *)data; -+ -+ if (rdata->fldata.orig_offset) { -+ dprintk("%s new off %llu orig offset %llu\n", __func__, -+ rdata->args.offset, rdata->fldata.orig_offset); -+ rdata->args.offset = rdata->fldata.orig_offset; -+ } -+ -+ /* Note this may cause RPC to be resent */ -+ rdata->pdata.call_ops->rpc_call_done(task, data); -+} -+ -+static void filelayout_read_release(void *data) -+{ -+ struct nfs_read_data *rdata = (struct nfs_read_data *)data; -+ -+ put_lseg(rdata->pdata.lseg); -+ rdata->pdata.lseg = NULL; -+ rdata->pdata.call_ops->rpc_release(data); -+} -+ -+static void filelayout_write_call_done(struct rpc_task *task, void *data) -+{ -+ struct nfs_write_data *wdata = (struct nfs_write_data *)data; -+ -+ if (wdata->fldata.orig_offset) { -+ dprintk("%s new off %llu orig offset %llu\n", __func__, -+ wdata->args.offset, wdata->fldata.orig_offset); -+ wdata->args.offset = wdata->fldata.orig_offset; -+ } -+ -+ /* Note this may cause RPC to be resent */ -+ wdata->pdata.call_ops->rpc_call_done(task, data); -+} -+ -+static void filelayout_write_release(void *data) -+{ -+ struct nfs_write_data *wdata = (struct nfs_write_data *)data; -+ -+ put_lseg(wdata->pdata.lseg); -+ wdata->pdata.lseg = NULL; -+ wdata->pdata.call_ops->rpc_release(data); -+} -+ -+struct rpc_call_ops filelayout_read_call_ops = { -+ .rpc_call_prepare = nfs_read_prepare, -+ .rpc_call_done = filelayout_read_call_done, -+ .rpc_release = filelayout_read_release, -+}; -+ -+struct rpc_call_ops filelayout_write_call_ops = { -+ .rpc_call_prepare = nfs_write_prepare, -+ .rpc_call_done = filelayout_write_call_done, -+ .rpc_release = filelayout_write_release, -+}; -+ -+/* Perform sync or async reads. -+ * -+ * An optimization for the NFS file layout driver -+ * allows the original read/write data structs to be passed in the -+ * last argument. -+ * -+ * TODO: join with write_pagelist? 
-+ */ -+static enum pnfs_try_status -+filelayout_read_pagelist(struct nfs_read_data *data, unsigned nr_pages) -+{ -+ struct pnfs_layout_segment *lseg = data->pdata.lseg; -+ struct nfs4_pnfs_ds *ds; -+ loff_t offset = data->args.offset; -+ u32 idx; -+ struct nfs_fh *fh; -+ -+ dprintk("--> %s ino %lu nr_pages %d pgbase %u req %Zu@%llu\n", -+ __func__, data->inode->i_ino, nr_pages, -+ data->args.pgbase, (size_t)data->args.count, offset); -+ -+ /* Retrieve the correct rpc_client for the byte range */ -+ idx = nfs4_fl_calc_ds_index(lseg, offset); -+ ds = nfs4_fl_prepare_ds(lseg, idx); -+ if (!ds) { -+ printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); -+ return PNFS_NOT_ATTEMPTED; -+ } -+ dprintk("%s USE DS:ip %x %hu\n", __func__, -+ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); -+ -+ /* just try the first data server for the index..*/ -+ data->fldata.ds_nfs_client = ds->ds_clp; -+ fh = nfs4_fl_select_ds_fh(lseg, offset); -+ if (fh) -+ data->args.fh = fh; -+ -+ /* -+ * Now get the file offset on the dserver -+ * Set the read offset to this offset, and -+ * save the original offset in orig_offset -+ * In the case of aync reads, the offset will be reset in the -+ * call_ops->rpc_call_done() routine. -+ */ -+ data->args.offset = filelayout_get_dserver_offset(lseg, offset); -+ data->fldata.orig_offset = offset; -+ -+ /* Perform an asynchronous read */ -+ nfs_initiate_read(data, ds->ds_clp->cl_rpcclient, -+ &filelayout_read_call_ops); -+ -+ data->pdata.pnfs_error = 0; -+ -+ return PNFS_ATTEMPTED; -+} -+ -+/* Perform async writes. 
*/ -+static enum pnfs_try_status -+filelayout_write_pagelist(struct nfs_write_data *data, unsigned nr_pages, int sync) -+{ -+ struct pnfs_layout_segment *lseg = data->pdata.lseg; -+ struct nfs4_pnfs_ds *ds; -+ loff_t offset = data->args.offset; -+ u32 idx; -+ struct nfs_fh *fh; -+ -+ /* Retrieve the correct rpc_client for the byte range */ -+ idx = nfs4_fl_calc_ds_index(lseg, offset); -+ ds = nfs4_fl_prepare_ds(lseg, idx); -+ if (!ds) { -+ printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); -+ return PNFS_NOT_ATTEMPTED; -+ } -+ dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu\n", __func__, -+ data->inode->i_ino, sync, (size_t) data->args.count, offset, -+ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); -+ -+ data->fldata.ds_nfs_client = ds->ds_clp; -+ fh = nfs4_fl_select_ds_fh(lseg, offset); -+ if (fh) -+ data->args.fh = fh; -+ /* -+ * Get the file offset on the dserver. Set the write offset to -+ * this offset and save the original offset. -+ */ -+ data->args.offset = filelayout_get_dserver_offset(lseg, offset); -+ data->fldata.orig_offset = offset; -+ -+ /* -+ * Perform an asynchronous write The offset will be reset in the -+ * call_ops->rpc_call_done() routine -+ */ -+ nfs_initiate_write(data, ds->ds_clp->cl_rpcclient, -+ &filelayout_write_call_ops, sync); -+ -+ data->pdata.pnfs_error = 0; -+ return PNFS_ATTEMPTED; -+} -+ -+/* -+ * filelayout_check_layout() -+ * -+ * Make sure layout segment parameters are sane WRT the device. -+ * At this point no generic layer initialization of the lseg has occurred, -+ * and nothing has been added to the layout_hdr cache. 
-+ * -+ */ -+static int -+filelayout_check_layout(struct pnfs_layout_hdr *lo, -+ struct nfs4_filelayout_segment *fl, -+ struct nfs4_layoutget_res *lgr, -+ struct nfs4_deviceid *id) -+{ -+ struct nfs4_file_layout_dsaddr *dsaddr; -+ int status = -EINVAL; -+ struct nfs_server *nfss = NFS_SERVER(lo->inode); -+ -+ dprintk("--> %s\n", __func__); -+ -+ if (fl->pattern_offset > lgr->range.offset) { -+ dprintk("%s pattern_offset %lld to large\n", -+ __func__, fl->pattern_offset); -+ goto out; -+ } -+ -+ if (fl->stripe_unit % PAGE_SIZE) { -+ dprintk("%s Stripe unit (%u) not page aligned\n", -+ __func__, fl->stripe_unit); -+ goto out; -+ } -+ -+ /* find and reference the deviceid */ -+ dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id); -+ if (dsaddr == NULL) { -+ dsaddr = get_device_info(lo->inode, id); -+ if (dsaddr == NULL) -+ goto out; -+ } -+ fl->dsaddr = dsaddr; -+ -+ if (fl->first_stripe_index < 0 || -+ fl->first_stripe_index >= dsaddr->stripe_count) { -+ dprintk("%s Bad first_stripe_index %d\n", -+ __func__, fl->first_stripe_index); -+ goto out_put; -+ } -+ -+ if ((fl->stripe_type == STRIPE_SPARSE && -+ fl->num_fh > 1 && fl->num_fh != dsaddr->ds_num) || -+ (fl->stripe_type == STRIPE_DENSE && -+ fl->num_fh != dsaddr->stripe_count)) { -+ dprintk("%s num_fh %u not valid for given packing\n", -+ __func__, fl->num_fh); -+ goto out_put; -+ } -+ -+ if (fl->stripe_unit % nfss->rsize || fl->stripe_unit % nfss->wsize) { -+ dprintk("%s Stripe unit (%u) not aligned with rsize %u " -+ "wsize %u\n", __func__, fl->stripe_unit, nfss->rsize, -+ nfss->wsize); -+ } -+ -+ status = 0; -+out: -+ dprintk("--> %s returns %d\n", __func__, status); -+ return status; -+out_put: -+ pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, &dsaddr->deviceid); -+ goto out; -+} -+ -+static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl) -+{ -+ int i; -+ -+ for (i = 0; i < fl->num_fh; i++) { -+ if (!fl->fh_array[i]) -+ break; -+ kfree(fl->fh_array[i]); -+ } -+ kfree(fl->fh_array); 
-+ fl->fh_array = NULL; -+} -+ -+static void -+_filelayout_free_lseg(struct nfs4_filelayout_segment *fl) -+{ -+ filelayout_free_fh_array(fl); -+ kfree(fl); -+} -+ -+static int -+filelayout_decode_layout(struct pnfs_layout_hdr *flo, -+ struct nfs4_filelayout_segment *fl, -+ struct nfs4_layoutget_res *lgr, -+ struct nfs4_deviceid *id) -+{ -+ uint32_t *p = (uint32_t *)lgr->layout.buf; -+ uint32_t nfl_util; -+ int i; -+ -+ dprintk("%s: set_layout_map Begin\n", __func__); -+ -+ memcpy(id, p, sizeof(*id)); -+ p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); -+ print_deviceid(id); -+ -+ nfl_util = be32_to_cpup(p++); -+ if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS) -+ fl->commit_through_mds = 1; -+ if (nfl_util & NFL4_UFLG_DENSE) -+ fl->stripe_type = STRIPE_DENSE; -+ else -+ fl->stripe_type = STRIPE_SPARSE; -+ fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK; -+ -+ fl->first_stripe_index = be32_to_cpup(p++); -+ p = xdr_decode_hyper(p, &fl->pattern_offset); -+ fl->num_fh = be32_to_cpup(p++); -+ -+ dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po %llu\n", -+ __func__, nfl_util, fl->num_fh, fl->first_stripe_index, -+ fl->pattern_offset); -+ -+ fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *), -+ GFP_KERNEL); -+ if (!fl->fh_array) -+ return -ENOMEM; -+ -+ for (i = 0; i < fl->num_fh; i++) { -+ /* Do we want to use a mempool here? 
*/ -+ fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL); -+ if (!fl->fh_array[i]) { -+ filelayout_free_fh_array(fl); -+ return -ENOMEM; -+ } -+ fl->fh_array[i]->size = be32_to_cpup(p++); -+ if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) { -+ printk(KERN_ERR "Too big fh %d received %d\n", -+ i, fl->fh_array[i]->size); -+ filelayout_free_fh_array(fl); -+ return -EIO; -+ } -+ memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size); -+ p += XDR_QUADLEN(fl->fh_array[i]->size); -+ dprintk("DEBUG: %s: fh len %d\n", __func__, -+ fl->fh_array[i]->size); -+ } -+ -+ return 0; -+} -+ -+static struct pnfs_layout_segment * -+filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, -+ struct nfs4_layoutget_res *lgr) -+{ -+ struct nfs4_filelayout_segment *fl; -+ int rc; -+ struct nfs4_deviceid id; -+ -+ dprintk("--> %s\n", __func__); -+ fl = kzalloc(sizeof(*fl), GFP_KERNEL); -+ if (!fl) -+ return NULL; -+ -+ rc = filelayout_decode_layout(layoutid, fl, lgr, &id); -+ if (rc != 0 || filelayout_check_layout(layoutid, fl, lgr, &id)) { -+ _filelayout_free_lseg(fl); -+ return NULL; -+ } -+ return &fl->generic_hdr; -+} -+ -+static void -+filelayout_free_lseg(struct pnfs_layout_segment *lseg) -+{ -+ struct nfs_server *nfss = NFS_SERVER(lseg->layout->inode); -+ struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); -+ -+ dprintk("--> %s\n", __func__); -+ pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, -+ &fl->dsaddr->deviceid); -+ _filelayout_free_lseg(fl); -+} -+ -+/* Allocate a new nfs_write_data struct and initialize */ -+static struct nfs_write_data * -+filelayout_clone_write_data(struct nfs_write_data *old) -+{ -+ static struct nfs_write_data *new; -+ -+ new = nfs_commitdata_alloc(); -+ if (!new) -+ goto out; -+ kref_init(&new->refcount); -+ new->parent = old; -+ kref_get(&old->refcount); -+ new->inode = old->inode; -+ new->cred = old->cred; -+ new->args.offset = 0; -+ new->args.count = 0; -+ new->res.count = 0; -+ new->res.fattr = &new->fattr; -+ 
/*
 * Execute a COMMIT op to the MDS or to each data server on which a page
 * in 'pages' exists.
 * Invoke the pnfs_commit_complete callback.
 *
 * @data: commit request; its ->pages list is consumed and redistributed
 * @sync: passed through to nfs_initiate_commit()
 *
 * The pages are first grouped by destination, then one nfs_write_data is
 * used per destination: clones of @data for all groups but the last, and
 * @data itself for the last group.
 *
 * Always returns PNFS_ATTEMPTED, including on allocation failure (in that
 * case the pages are handed to nfs_mark_list_commit() and @data is
 * released through its saved call_ops).
 */
enum pnfs_try_status
filelayout_commit(struct nfs_write_data *data, int sync)
{
	LIST_HEAD(head);
	struct nfs_page *req;
	loff_t file_offset = 0;
	u16 idx, i;
	struct list_head **ds_page_list = NULL;
	u16 *indices_used;
	int num_indices_seen = 0;
	const struct rpc_call_ops *call_ops;
	struct rpc_clnt *clnt;
	struct nfs_write_data **clone_list = NULL;
	struct nfs_write_data *dsdata;
	struct nfs4_pnfs_ds *ds;

	dprintk("%s data %p sync %d\n", __func__, data, sync);

	/*
	 * Alloc room for both in one go: NFS4_PNFS_MAX_MULTI_CNT + 1 list
	 * pointers (one slot per data-server index plus one for the MDS),
	 * followed by the same number of u16 entries for indices_used.
	 */
	ds_page_list = kzalloc((NFS4_PNFS_MAX_MULTI_CNT + 1) *
			       (sizeof(u16) + sizeof(struct list_head *)),
			       GFP_KERNEL);
	if (!ds_page_list)
		goto mem_error;
	indices_used = (u16 *) (ds_page_list + NFS4_PNFS_MAX_MULTI_CNT + 1);
	/*
	 * Sort pages based on which ds to send to.
	 * MDS is given index equal to NFS4_PNFS_MAX_MULTI_CNT.
	 * Note we are assuming there is only a single lseg in play.
	 * When that is not true, we could first sort on lseg, then
	 * sort within each as we do here.
	 *
	 * ds_page_list[idx] always points at the most recently queued page
	 * for idx.  A page for a known idx is inserted right after that
	 * entry and a page for a new idx goes to the tail of 'head', so
	 * pages with the same idx stay contiguous and the groups appear in
	 * first-seen order (the order recorded in indices_used).
	 */
	while (!list_empty(&data->pages)) {
		req = nfs_list_entry(data->pages.next);
		nfs_list_remove_request(req);
		if (!req->wb_lseg ||
		    ((struct nfs4_filelayout_segment *)
		     FILELAYOUT_LSEG(req->wb_lseg))->commit_through_mds)
			idx = NFS4_PNFS_MAX_MULTI_CNT;
		else {
			file_offset = (loff_t)req->wb_index << PAGE_CACHE_SHIFT;
			idx = nfs4_fl_calc_ds_index(req->wb_lseg, file_offset);
		}
		if (ds_page_list[idx]) {
			/* Already seen this idx */
			list_add(&req->wb_list, ds_page_list[idx]);
		} else {
			/* New idx not seen so far */
			list_add_tail(&req->wb_list, &head);
			indices_used[num_indices_seen++] = idx;
		}
		ds_page_list[idx] = &req->wb_list;
	}
	/*
	 * Once created, clone must be released via call_op
	 *
	 * NOTE(review): if data->pages was empty, num_indices_seen is 0,
	 * this is a zero-sized allocation and the clone_list[i] = data
	 * store below still happens with i == 0.  Confirm callers never
	 * pass an empty page list.
	 */
	clone_list = kzalloc(num_indices_seen *
			     sizeof(struct nfs_write_data *), GFP_KERNEL);
	if (!clone_list)
		goto mem_error;
	for (i = 0; i < num_indices_seen - 1; i++) {
		clone_list[i] = filelayout_clone_write_data(data);
		if (!clone_list[i])
			goto mem_error;
	}
	/* The original request carries the last group */
	clone_list[i] = data;
	/*
	 * Now send off the RPCs to each ds.  Note that it is important
	 * that any RPC to the MDS be sent last (or at least after all
	 * clones have been made.)
	 */
	for (i = 0; i < num_indices_seen; i++) {
		dsdata = clone_list[i];
		idx = indices_used[i];
		/* Move this group (up to its last page) off the front of head */
		list_cut_position(&dsdata->pages, &head, ds_page_list[idx]);
		if (idx == NFS4_PNFS_MAX_MULTI_CNT) {
			/* MDS group: use the caller's own call_ops and client */
			call_ops = data->pdata.call_ops;;
			clnt = NFS_CLIENT(dsdata->inode);
			ds = NULL;
		} else {
			struct nfs_fh *fh;

			call_ops = &filelayout_commit_call_ops;
			/* The first page of the group selects lseg and fh */
			req = nfs_list_entry(dsdata->pages.next);
			ds = nfs4_fl_prepare_ds(req->wb_lseg, idx);
			if (!ds) {
				/*
				 * Trigger retry of this chunk through MDS
				 *
				 * NOTE(review): for the last group dsdata
				 * is 'data' itself, which is written again
				 * after the loop (pnfs_error).  Confirm
				 * rpc_release() does not free it.
				 */
				dsdata->task.tk_status = -EIO;
				data->pdata.call_ops->rpc_release(dsdata);
				continue;
			}
			clnt = ds->ds_clp->cl_rpcclient;
			dsdata->fldata.ds_nfs_client = ds->ds_clp;
			file_offset = (loff_t)req->wb_index << PAGE_CACHE_SHIFT;
			fh = nfs4_fl_select_ds_fh(req->wb_lseg, file_offset);
			if (fh)
				dsdata->args.fh = fh;
		}
		dprintk("%s: Initiating commit: %llu USE DS:\n",
			__func__, file_offset);
		ifdebug(FACILITY)
			print_ds(ds);

		/* Send COMMIT to data server */
		nfs_initiate_commit(dsdata, clnt, call_ops, sync);
	}
	kfree(clone_list);
	kfree(ds_page_list);
	data->pdata.pnfs_error = 0;
	return PNFS_ATTEMPTED;

 mem_error:
	if (clone_list) {
		/* Release the clones created before the failing one */
		for (i = 0; i < num_indices_seen - 1; i++) {
			if (!clone_list[i])
				break;
			data->pdata.call_ops->rpc_release(clone_list[i]);
		}
		kfree(clone_list);
	}
	kfree(ds_page_list);
	/* One of these will be empty, but doesn't hurt to do both */
	nfs_mark_list_commit(&head);
	nfs_mark_list_commit(&data->pages);
	data->pdata.call_ops->rpc_release(data);
	return PNFS_ATTEMPTED;
}
Called by nfs_can_coalesce_requests() -+ * -+ * return 1 : coalesce page -+ * return 0 : don't coalesce page -+ * -+ * By the time this is called, we know req->wb_lseg == prev->wb_lseg -+ */ -+int -+filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, -+ struct nfs_page *req) -+{ -+ u64 p_stripe, r_stripe; -+ u32 stripe_unit; -+ -+ if (!req->wb_lseg) -+ return 1; -+ p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT; -+ r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT; -+ stripe_unit = FILELAYOUT_LSEG(req->wb_lseg)->stripe_unit; -+ -+ do_div(p_stripe, stripe_unit); -+ do_div(r_stripe, stripe_unit); -+ -+ return (p_stripe == r_stripe); -+} -+ -+static struct pnfs_layoutdriver_type filelayout_type = { -+ .id = LAYOUT_NFSV4_1_FILES, -+ .name = "LAYOUT_NFSV4_1_FILES", -+ .owner = THIS_MODULE, -+ .flags = PNFS_USE_RPC_CODE, -+ .initialize_mountpoint = filelayout_initialize_mountpoint, -+ .uninitialize_mountpoint = filelayout_uninitialize_mountpoint, -+ .alloc_lseg = filelayout_alloc_lseg, -+ .free_lseg = filelayout_free_lseg, -+ .pg_test = filelayout_pg_test, -+ .read_pagelist = filelayout_read_pagelist, -+ .write_pagelist = filelayout_write_pagelist, -+ .commit = filelayout_commit, -+}; -+ -+static int __init nfs4filelayout_init(void) -+{ -+ printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n", -+ __func__); -+ return pnfs_register_layoutdriver(&filelayout_type); -+} -+ -+static void __exit nfs4filelayout_exit(void) -+{ -+ printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n", -+ __func__); -+ pnfs_unregister_layoutdriver(&filelayout_type); -+} -+ -+module_init(nfs4filelayout_init); -+module_exit(nfs4filelayout_exit); -diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h -new file mode 100644 -index 0000000..f884b0c ---- /dev/null -+++ b/fs/nfs/nfs4filelayout.h -@@ -0,0 +1,100 @@ -+/* -+ * NFSv4 file layout driver data structures. 
-+ * -+ * Copyright (c) 2002 -+ * The Regents of the University of Michigan -+ * All Rights Reserved -+ * -+ * Dean Hildebrand -+ * -+ * Permission is granted to use, copy, create derivative works, and -+ * redistribute this software and such derivative works for any purpose, -+ * so long as the name of the University of Michigan is not used in -+ * any advertising or publicity pertaining to the use or distribution -+ * of this software without specific, written prior authorization. If -+ * the above copyright notice or any other identification of the -+ * University of Michigan is included in any copy of any portion of -+ * this software, then the disclaimer below must also be included. -+ * -+ * This software is provided as is, without representation or warranty -+ * of any kind either express or implied, including without limitation -+ * the implied warranties of merchantability, fitness for a particular -+ * purpose, or noninfringement. The Regents of the University of -+ * Michigan shall not be liable for any damages, including special, -+ * indirect, incidental, or consequential damages, with respect to any -+ * claim arising out of or in connection with the use of the software, -+ * even if it has been or is hereafter advised of the possibility of -+ * such damages. -+ */ -+ -+#ifndef FS_NFS_NFS4FILELAYOUT_H -+#define FS_NFS_NFS4FILELAYOUT_H -+ -+#include "pnfs.h" -+ -+/* -+ * Field testing shows we need to support upto 4096 stripe indices. -+ * We store each index as a u8 (u32 on the wire) to keep the memory footprint -+ * reasonable. This in turn means we support a maximum of 256 -+ * RFC 5661 multipath_list4 structures. 
-+ */ -+#define NFS4_PNFS_MAX_STRIPE_CNT 4096 -+#define NFS4_PNFS_MAX_MULTI_CNT 256 /* 256 fit into a u8 stripe_index */ -+ -+enum stripetype4 { -+ STRIPE_SPARSE = 1, -+ STRIPE_DENSE = 2 -+}; -+ -+/* Individual ip address */ -+struct nfs4_pnfs_ds { -+ struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */ -+ u32 ds_ip_addr; -+ u32 ds_port; -+ struct nfs_client *ds_clp; -+ atomic_t ds_count; -+}; -+ -+struct nfs4_file_layout_dsaddr { -+ struct pnfs_deviceid_node deviceid; -+ u32 stripe_count; -+ u8 *stripe_indices; -+ u32 ds_num; -+ struct nfs4_pnfs_ds *ds_list[1]; -+}; -+ -+struct nfs4_filelayout_segment { -+ struct pnfs_layout_segment generic_hdr; -+ u32 stripe_type; -+ u32 commit_through_mds; -+ u32 stripe_unit; -+ u32 first_stripe_index; -+ u64 pattern_offset; -+ struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */ -+ unsigned int num_fh; -+ struct nfs_fh **fh_array; -+}; -+ -+static inline struct nfs4_filelayout_segment * -+FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg) -+{ -+ return container_of(lseg, -+ struct nfs4_filelayout_segment, -+ generic_hdr); -+} -+ -+extern struct nfs_fh * -+nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, loff_t offset); -+ -+extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *); -+extern void print_ds(struct nfs4_pnfs_ds *ds); -+extern void print_deviceid(struct nfs4_deviceid *dev_id); -+u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, loff_t offset); -+struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, -+ u32 ds_idx); -+extern struct nfs4_file_layout_dsaddr * -+nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id); -+struct nfs4_file_layout_dsaddr * -+get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id); -+ -+#endif /* FS_NFS_NFS4FILELAYOUT_H */ -diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c -new file mode 100644 -index 0000000..1f0ab62 ---- /dev/null -+++ b/fs/nfs/nfs4filelayoutdev.c 
-@@ -0,0 +1,620 @@ -+/* -+ * Device operations for the pnfs nfs4 file layout driver. -+ * -+ * Copyright (c) 2002 -+ * The Regents of the University of Michigan -+ * All Rights Reserved -+ * -+ * Dean Hildebrand -+ * Garth Goodson -+ * -+ * Permission is granted to use, copy, create derivative works, and -+ * redistribute this software and such derivative works for any purpose, -+ * so long as the name of the University of Michigan is not used in -+ * any advertising or publicity pertaining to the use or distribution -+ * of this software without specific, written prior authorization. If -+ * the above copyright notice or any other identification of the -+ * University of Michigan is included in any copy of any portion of -+ * this software, then the disclaimer below must also be included. -+ * -+ * This software is provided as is, without representation or warranty -+ * of any kind either express or implied, including without limitation -+ * the implied warranties of merchantability, fitness for a particular -+ * purpose, or noninfringement. The Regents of the University of -+ * Michigan shall not be liable for any damages, including special, -+ * indirect, incidental, or consequential damages, with respect to any -+ * claim arising out of or in connection with the use of the software, -+ * even if it has been or is hereafter advised of the possibility of -+ * such damages. -+ */ -+ -+#include -+#include -+ -+#include "internal.h" -+#include "nfs4filelayout.h" -+ -+#define NFSDBG_FACILITY NFSDBG_PNFS_LD -+ -+/* -+ * Data server cache -+ * -+ * Data servers can be mapped to different device ids. -+ * nfs4_pnfs_ds reference counting -+ * - set to 1 on allocation -+ * - incremented when a device id maps a data server already in the cache. -+ * - decremented when deviceid is removed from the cache. 
-+ */ -+DEFINE_SPINLOCK(nfs4_ds_cache_lock); -+static LIST_HEAD(nfs4_data_server_cache); -+ -+/* Debug routines */ -+void -+print_ds(struct nfs4_pnfs_ds *ds) -+{ -+ if (ds == NULL) { -+ printk("%s NULL device\n", __func__); -+ return; -+ } -+ printk(" ip_addr %x port %hu\n" -+ " ref count %d\n" -+ " client %p\n" -+ " cl_exchange_flags %x\n", -+ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), -+ atomic_read(&ds->ds_count), ds->ds_clp, -+ ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0); -+} -+ -+void -+print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr) -+{ -+ int i; -+ -+ ifdebug(FACILITY) { -+ printk("%s dsaddr->ds_num %d\n", __func__, -+ dsaddr->ds_num); -+ for (i = 0; i < dsaddr->ds_num; i++) -+ print_ds(dsaddr->ds_list[i]); -+ } -+} -+ -+void print_deviceid(struct nfs4_deviceid *id) -+{ -+ u32 *p = (u32 *)id; -+ -+ dprintk("%s: device id= [%x%x%x%x]\n", __func__, -+ p[0], p[1], p[2], p[3]); -+} -+ -+/* nfs4_ds_cache_lock is held */ -+static struct nfs4_pnfs_ds * -+_data_server_lookup_locked(u32 ip_addr, u32 port) -+{ -+ struct nfs4_pnfs_ds *ds; -+ -+ dprintk("_data_server_lookup: ip_addr=%x port=%hu\n", -+ ntohl(ip_addr), ntohs(port)); -+ -+ list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) { -+ if (ds->ds_ip_addr == ip_addr && -+ ds->ds_port == port) { -+ return ds; -+ } -+ } -+ return NULL; -+} -+ -+/* Create an rpc to the data server defined in 'dev_list' */ -+static int -+nfs4_pnfs_ds_create(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) -+{ -+ struct nfs_server *tmp; -+ struct sockaddr_in sin; -+ struct rpc_clnt *mds_clnt = mds_srv->client; -+ struct nfs_client *clp = mds_srv->nfs_client; -+ struct sockaddr *mds_addr; -+ int err = 0; -+ -+ dprintk("--> %s ip:port %x:%hu au_flavor %d\n", __func__, -+ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), -+ mds_clnt->cl_auth->au_flavor); -+ -+ sin.sin_family = AF_INET; -+ sin.sin_addr.s_addr = ds->ds_ip_addr; -+ sin.sin_port = ds->ds_port; -+ -+ /* -+ * If this DS is also the MDS, use the MDS session only if 
the -+ * MDS exchangeid flags show the EXCHGID4_FLAG_USE_PNFS_DS pNFS role. -+ */ -+ mds_addr = (struct sockaddr *)&clp->cl_addr; -+ if (nfs_sockaddr_cmp((struct sockaddr *)&sin, mds_addr)) { -+ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS)) { -+ printk(KERN_INFO -+ "ip:port %x:%hu is not a pNFS Data Server\n", -+ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); -+ err = -ENODEV; -+ } else { -+ atomic_inc(&clp->cl_count); -+ ds->ds_clp = clp; -+ dprintk("%s Using MDS Session for DS\n", __func__); -+ } -+ goto out; -+ } -+ -+ /* Temporay server for nfs4_set_client */ -+ tmp = kzalloc(sizeof(struct nfs_server), GFP_KERNEL); -+ if (!tmp) -+ goto out; -+ -+ /* -+ * Set a retrans, timeout interval, and authflavor equual to the MDS -+ * values. Use the MDS nfs_client cl_ipaddr field so as to use the -+ * same co_ownerid as the MDS. -+ */ -+ err = nfs4_set_client(tmp, -+ mds_srv->nfs_client->cl_hostname, -+ (struct sockaddr *)&sin, -+ sizeof(struct sockaddr), -+ mds_srv->nfs_client->cl_ipaddr, -+ mds_clnt->cl_auth->au_flavor, -+ IPPROTO_TCP, -+ mds_clnt->cl_xprt->timeout, -+ 1 /* minorversion */); -+ if (err < 0) -+ goto out_free; -+ -+ clp = tmp->nfs_client; -+ -+ /* Ask for only the EXCHGID4_FLAG_USE_PNFS_DS pNFS role */ -+ dprintk("%s EXCHANGE_ID for clp %p\n", __func__, clp); -+ clp->cl_exchange_flags = EXCHGID4_FLAG_USE_PNFS_DS; -+ -+ err = nfs4_recover_expired_lease(clp); -+ if (!err) -+ err = nfs4_check_client_ready(clp); -+ if (err) -+ goto out_put; -+ -+ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS)) { -+ printk(KERN_INFO "ip:port %x:%hu is not a pNFS Data Server\n", -+ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); -+ err = -ENODEV; -+ goto out_put; -+ } -+ /* -+ * Set DS lease equal to the MDS lease, renewal is scheduled in -+ * create_session -+ */ -+ spin_lock(&mds_srv->nfs_client->cl_lock); -+ clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time; -+ spin_unlock(&mds_srv->nfs_client->cl_lock); -+ clp->cl_last_renewal = jiffies; -+ -+ 
clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); -+ ds->ds_clp = clp; -+ -+ dprintk("%s: ip=%x, port=%hu, rpcclient %p\n", __func__, -+ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), -+ clp->cl_rpcclient); -+out_free: -+ kfree(tmp); -+out: -+ dprintk("%s Returns %d\n", __func__, err); -+ return err; -+out_put: -+ nfs_put_client(clp); -+ goto out_free; -+} -+ -+static void -+destroy_ds(struct nfs4_pnfs_ds *ds) -+{ -+ dprintk("--> %s\n", __func__); -+ ifdebug(FACILITY) -+ print_ds(ds); -+ -+ if (ds->ds_clp) -+ nfs_put_client(ds->ds_clp); -+ kfree(ds); -+} -+ -+static void -+nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) -+{ -+ struct nfs4_pnfs_ds *ds; -+ int i; -+ -+ print_deviceid(&dsaddr->deviceid.de_id); -+ -+ for (i = 0; i < dsaddr->ds_num; i++) { -+ ds = dsaddr->ds_list[i]; -+ if (ds != NULL) { -+ if (atomic_dec_and_lock(&ds->ds_count, -+ &nfs4_ds_cache_lock)) { -+ list_del_init(&ds->ds_node); -+ spin_unlock(&nfs4_ds_cache_lock); -+ destroy_ds(ds); -+ } -+ } -+ } -+ kfree(dsaddr->stripe_indices); -+ kfree(dsaddr); -+} -+ -+void -+nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *device) -+{ -+ struct nfs4_file_layout_dsaddr *dsaddr = -+ container_of(device, struct nfs4_file_layout_dsaddr, deviceid); -+ -+ nfs4_fl_free_deviceid(dsaddr); -+} -+ -+static struct nfs4_pnfs_ds * -+nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port) -+{ -+ struct nfs4_pnfs_ds *tmp_ds, *ds; -+ -+ ds = kzalloc(sizeof(*tmp_ds), GFP_KERNEL); -+ if (!ds) -+ goto out; -+ -+ spin_lock(&nfs4_ds_cache_lock); -+ tmp_ds = _data_server_lookup_locked(ip_addr, port); -+ if (tmp_ds == NULL) { -+ ds->ds_ip_addr = ip_addr; -+ ds->ds_port = port; -+ atomic_set(&ds->ds_count, 1); -+ INIT_LIST_HEAD(&ds->ds_node); -+ ds->ds_clp = NULL; -+ list_add(&ds->ds_node, &nfs4_data_server_cache); -+ dprintk("%s add new data server ip 0x%x\n", __func__, -+ ds->ds_ip_addr); -+ } else { -+ kfree(ds); -+ atomic_inc(&tmp_ds->ds_count); -+ dprintk("%s data server found ip 0x%x, inc'ed 
ds_count to %d\n", -+ __func__, tmp_ds->ds_ip_addr, -+ atomic_read(&tmp_ds->ds_count)); -+ ds = tmp_ds; -+ } -+ spin_unlock(&nfs4_ds_cache_lock); -+out: -+ return ds; -+} -+ -+/* -+ * Currently only support ipv4, and one multi-path address. -+ */ -+static struct nfs4_pnfs_ds * -+decode_and_add_ds(__be32 **pp, struct inode *inode) -+{ -+ struct nfs4_pnfs_ds *ds = NULL; -+ char *buf; -+ const char *ipend, *pstr; -+ u32 ip_addr, port; -+ int nlen, rlen, i; -+ int tmp[2]; -+ __be32 *r_netid, *r_addr, *p = *pp; -+ -+ /* r_netid */ -+ nlen = be32_to_cpup(p++); -+ r_netid = p; -+ p += XDR_QUADLEN(nlen); -+ -+ /* r_addr */ -+ rlen = be32_to_cpup(p++); -+ r_addr = p; -+ p += XDR_QUADLEN(rlen); -+ *pp = p; -+ -+ /* Check that netid is "tcp" */ -+ if (nlen != 3 || memcmp((char *)r_netid, "tcp", 3)) { -+ dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__); -+ goto out_err; -+ } -+ -+ /* ipv6 length plus port is legal */ -+ if (rlen > INET6_ADDRSTRLEN + 8) { -+ dprintk("%s Invalid address, length %d\n", __func__, -+ rlen); -+ goto out_err; -+ } -+ buf = kmalloc(rlen + 1, GFP_KERNEL); -+ buf[rlen] = '\0'; -+ memcpy(buf, r_addr, rlen); -+ -+ /* replace the port dots with dashes for the in4_pton() delimiter*/ -+ for (i = 0; i < 2; i++) { -+ char *res = strrchr(buf, '.'); -+ *res = '-'; -+ } -+ -+ /* Currently only support ipv4 address */ -+ if (in4_pton(buf, rlen, (u8 *)&ip_addr, '-', &ipend) == 0) { -+ dprintk("%s: Only ipv4 addresses supported\n", __func__); -+ goto out_free; -+ } -+ -+ /* port */ -+ pstr = ipend; -+ sscanf(pstr, "-%d-%d", &tmp[0], &tmp[1]); -+ port = htons((tmp[0] << 8) | (tmp[1])); -+ -+ ds = nfs4_pnfs_ds_add(inode, ip_addr, port); -+ dprintk("%s Decoded address and port %s\n", __func__, buf); -+out_free: -+ kfree(buf); -+out_err: -+ return ds; -+} -+ -+/* Decode opaque device data and return the result */ -+static struct nfs4_file_layout_dsaddr* -+decode_device(struct inode *ino, struct pnfs_device *pdev) -+{ -+ int i, dummy; -+ u32 cnt, num; -+ u8 
*indexp; -+ __be32 *p = (__be32 *)pdev->area, *indicesp; -+ struct nfs4_file_layout_dsaddr *dsaddr; -+ -+ /* Get the stripe count (number of stripe index) */ -+ cnt = be32_to_cpup(p++); -+ dprintk("%s stripe count %d\n", __func__, cnt); -+ if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) { -+ printk(KERN_WARNING "%s: stripe count %d greater than " -+ "supported maximum %d\n", __func__, -+ cnt, NFS4_PNFS_MAX_STRIPE_CNT); -+ goto out_err; -+ } -+ -+ /* Check the multipath list count */ -+ indicesp = p; -+ p += XDR_QUADLEN(cnt << 2); -+ num = be32_to_cpup(p++); -+ dprintk("%s ds_num %u\n", __func__, num); -+ if (num > NFS4_PNFS_MAX_MULTI_CNT) { -+ printk(KERN_WARNING "%s: multipath count %d greater than " -+ "supported maximum %d\n", __func__, -+ num, NFS4_PNFS_MAX_MULTI_CNT); -+ goto out_err; -+ } -+ dsaddr = kzalloc(sizeof(*dsaddr) + -+ (sizeof(struct nfs4_pnfs_ds *) * (num - 1)), -+ GFP_KERNEL); -+ if (!dsaddr) -+ goto out_err; -+ -+ dsaddr->stripe_indices = kzalloc(sizeof(u8) * cnt, GFP_KERNEL); -+ if (!dsaddr->stripe_indices) -+ goto out_err_free; -+ -+ dsaddr->stripe_count = cnt; -+ dsaddr->ds_num = num; -+ -+ memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, sizeof(pdev->dev_id)); -+ -+ /* Go back an read stripe indices */ -+ p = indicesp; -+ indexp = &dsaddr->stripe_indices[0]; -+ for (i = 0; i < dsaddr->stripe_count; i++) { -+ *indexp = be32_to_cpup(p++); -+ if (*indexp >= num) -+ goto out_err_free; -+ indexp++; -+ } -+ /* Skip already read multipath list count */ -+ p++; -+ -+ for (i = 0; i < dsaddr->ds_num; i++) { -+ int j; -+ -+ dummy = be32_to_cpup(p++); /* multipath count */ -+ if (dummy > 1) { -+ printk(KERN_WARNING -+ "%s: Multipath count %d not supported, " -+ "skipping all greater than 1\n", __func__, -+ dummy); -+ } -+ for (j = 0; j < dummy; j++) { -+ if (j == 0) { -+ dsaddr->ds_list[i] = decode_and_add_ds(&p, ino); -+ if (dsaddr->ds_list[i] == NULL) -+ goto out_err_free; -+ } else { -+ u32 len; -+ /* skip extra multipath */ -+ len = be32_to_cpup(p++); -+ p += 
XDR_QUADLEN(len); -+ len = be32_to_cpup(p++); -+ p += XDR_QUADLEN(len); -+ continue; -+ } -+ } -+ } -+ return dsaddr; -+ -+out_err_free: -+ nfs4_fl_free_deviceid(dsaddr); -+out_err: -+ dprintk("%s ERROR: returning NULL\n", __func__); -+ return NULL; -+} -+ -+/* -+ * Decode the opaque device specified in 'dev' -+ * and add it to the list of available devices. -+ * If the deviceid is already cached, nfs4_add_deviceid will return -+ * a pointer to the cached struct and throw away the new. -+ */ -+static struct nfs4_file_layout_dsaddr* -+decode_and_add_device(struct inode *inode, struct pnfs_device *dev) -+{ -+ struct nfs4_file_layout_dsaddr *dsaddr; -+ struct pnfs_deviceid_node *d; -+ -+ dsaddr = decode_device(inode, dev); -+ if (!dsaddr) { -+ printk(KERN_WARNING "%s: Could not decode or add device\n", -+ __func__); -+ return NULL; -+ } -+ -+ d = pnfs_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache, -+ &dsaddr->deviceid); -+ -+ return container_of(d, struct nfs4_file_layout_dsaddr, deviceid); -+} -+ -+/* -+ * Retrieve the information for dev_id, add it to the list -+ * of available devices, and return it. 
-+ */ -+struct nfs4_file_layout_dsaddr * -+get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id) -+{ -+ struct pnfs_device *pdev = NULL; -+ u32 max_resp_sz; -+ int max_pages; -+ struct page **pages = NULL; -+ struct nfs4_file_layout_dsaddr *dsaddr = NULL; -+ int rc, i; -+ struct nfs_server *server = NFS_SERVER(inode); -+ -+ /* -+ * Use the session max response size as the basis for setting -+ * GETDEVICEINFO's maxcount -+ */ -+ max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; -+ max_pages = max_resp_sz >> PAGE_SHIFT; -+ dprintk("%s inode %p max_resp_sz %u max_pages %d\n", -+ __func__, inode, max_resp_sz, max_pages); -+ -+ pdev = kzalloc(sizeof(struct pnfs_device), GFP_KERNEL); -+ if (pdev == NULL) -+ return NULL; -+ -+ pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL); -+ if (pages == NULL) { -+ kfree(pdev); -+ return NULL; -+ } -+ for (i = 0; i < max_pages; i++) { -+ pages[i] = alloc_page(GFP_KERNEL); -+ if (!pages[i]) -+ goto out_free; -+ } -+ -+ /* set pdev->area */ -+ pdev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL); -+ if (!pdev->area) -+ goto out_free; -+ -+ memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id)); -+ pdev->layout_type = LAYOUT_NFSV4_1_FILES; -+ pdev->pages = pages; -+ pdev->pgbase = 0; -+ pdev->pglen = PAGE_SIZE * max_pages; -+ pdev->mincount = 0; -+ -+ rc = nfs4_proc_getdeviceinfo(server, pdev); -+ dprintk("%s getdevice info returns %d\n", __func__, rc); -+ if (rc) -+ goto out_free; -+ -+ /* -+ * Found new device, need to decode it and then add it to the -+ * list of known devices for this mountpoint. 
-+ */ -+ dsaddr = decode_and_add_device(inode, pdev); -+out_free: -+ if (pdev->area != NULL) -+ vunmap(pdev->area); -+ for (i = 0; i < max_pages; i++) -+ __free_page(pages[i]); -+ kfree(pages); -+ kfree(pdev); -+ dprintk("<-- %s dsaddr %p\n", __func__, dsaddr); -+ return dsaddr; -+} -+ -+struct nfs4_file_layout_dsaddr * -+nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id) -+{ -+ struct pnfs_deviceid_node *d; -+ -+ d = pnfs_find_get_deviceid(clp->cl_devid_cache, id); -+ return (d == NULL) ? NULL : -+ container_of(d, struct nfs4_file_layout_dsaddr, deviceid); -+} -+ -+/* -+ * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit -+ * Then: ((res + fsi) % dsaddr->stripe_count) -+ */ -+static u32 -+_nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset) -+{ -+ struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); -+ u64 tmp; -+ -+ tmp = offset - flseg->pattern_offset; -+ do_div(tmp, flseg->stripe_unit); -+ tmp += flseg->first_stripe_index; -+ return do_div(tmp, flseg->dsaddr->stripe_count); -+} -+ -+u32 -+nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, loff_t offset) -+{ -+ u32 j; -+ -+ j = _nfs4_fl_calc_j_index(lseg, offset); -+ return FILELAYOUT_LSEG(lseg)->dsaddr->stripe_indices[j]; -+} -+ -+struct nfs_fh * -+nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, loff_t offset) -+{ -+ struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); -+ u32 i; -+ -+ if (flseg->stripe_type == STRIPE_SPARSE) { -+ if (flseg->num_fh == 1) -+ i = 0; -+ else if (flseg->num_fh == 0) -+ return NULL; -+ else -+ i = nfs4_fl_calc_ds_index(lseg, offset); -+ } else -+ i = _nfs4_fl_calc_j_index(lseg, offset); -+ return flseg->fh_array[i]; -+} -+ -+struct nfs4_pnfs_ds * -+nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) -+{ -+ struct nfs4_file_layout_dsaddr *dsaddr; -+ -+ dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr; -+ if (dsaddr->ds_list[ds_idx] == NULL) { -+ printk(KERN_ERR "%s: No data server for device 
id!\n", -+ __func__); -+ return NULL; -+ } -+ -+ if (!dsaddr->ds_list[ds_idx]->ds_clp) { -+ int err; -+ -+ err = nfs4_pnfs_ds_create(NFS_SERVER(lseg->layout->inode), -+ dsaddr->ds_list[ds_idx]); -+ if (err) { -+ printk(KERN_ERR "%s nfs4_pnfs_ds_create error %d\n", -+ __func__, err); -+ return NULL; -+ } -+ } -+ return dsaddr->ds_list[ds_idx]; -+} -diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c -index 089da5b..cc642dd 100644 ---- a/fs/nfs/nfs4proc.c -+++ b/fs/nfs/nfs4proc.c -@@ -55,6 +55,7 @@ - #include "internal.h" - #include "iostat.h" - #include "callback.h" -+#include "pnfs.h" - - #define NFSDBG_FACILITY NFSDBG_PROC - -@@ -67,7 +68,7 @@ struct nfs4_opendata; - static int _nfs4_proc_open(struct nfs4_opendata *data); - static int _nfs4_recover_proc_open(struct nfs4_opendata *data); - static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); --static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); -+static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, struct nfs_client *); - static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); - static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); - static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, -@@ -125,11 +126,12 @@ const u32 nfs4_pathconf_bitmap[2] = { - 0 - }; - --const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE -+const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE - | FATTR4_WORD0_MAXREAD - | FATTR4_WORD0_MAXWRITE - | FATTR4_WORD0_LEASE_TIME, -- 0 -+ FATTR4_WORD1_FS_LAYOUT_TYPES, -+ FATTR4_WORD2_LAYOUT_BLKSIZE - }; - - const u32 nfs4_fs_locations_bitmap[2] = { -@@ -562,6 +564,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session, - } - - int nfs4_setup_sequence(const struct nfs_server *server, -+ struct nfs4_session 
*ds_session, - struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, - int cache_reply, -@@ -570,6 +573,8 @@ int nfs4_setup_sequence(const struct nfs_server *server, - struct nfs4_session *session = nfs4_get_session(server); - int ret = 0; - -+ if (ds_session) -+ session = ds_session; - if (session == NULL) { - args->sa_session = NULL; - res->sr_session = NULL; -@@ -599,7 +604,7 @@ static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata) - - dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server); - -- if (nfs4_setup_sequence(data->seq_server, data->seq_args, -+ if (nfs4_setup_sequence(data->seq_server, NULL, data->seq_args, - data->seq_res, data->cache_reply, task)) - return; - rpc_call_start(task); -@@ -1378,7 +1383,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) - nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); - } - data->timestamp = jiffies; -- if (nfs4_setup_sequence(data->o_arg.server, -+ if (nfs4_setup_sequence(data->o_arg.server, NULL, - &data->o_arg.seq_args, - &data->o_res.seq_res, 1, task)) - return; -@@ -1553,9 +1558,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) - return 0; - } - --static int nfs4_recover_expired_lease(struct nfs_server *server) -+int nfs4_recover_expired_lease(struct nfs_client *clp) - { -- struct nfs_client *clp = server->nfs_client; - unsigned int loop; - int ret; - -@@ -1571,6 +1575,7 @@ static int nfs4_recover_expired_lease(struct nfs_server *server) - } - return ret; - } -+EXPORT_SYMBOL(nfs4_recover_expired_lease); - - /* - * OPEN_EXPIRED: -@@ -1660,7 +1665,7 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, in - dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n"); - goto out_err; - } -- status = nfs4_recover_expired_lease(server); -+ status = nfs4_recover_expired_lease(server->nfs_client); - if (status != 0) - goto err_put_state_owner; - if (path->dentry->d_inode != NULL) -@@ -1871,7 +1876,7 @@ static void 
nfs4_close_done(struct rpc_task *task, void *data) - if (calldata->arg.fmode == 0) - break; - default: -- if (nfs4_async_handle_error(task, server, state) == -EAGAIN) -+ if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) - rpc_restart_call_prepare(task); - } - nfs_release_seqid(calldata->arg.seqid); -@@ -1916,7 +1921,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) - - nfs_fattr_init(calldata->res.fattr); - calldata->timestamp = jiffies; -- if (nfs4_setup_sequence(NFS_SERVER(calldata->inode), -+ if (nfs4_setup_sequence(NFS_SERVER(calldata->inode), NULL, - &calldata->arg.seq_args, &calldata->res.seq_res, - 1, task)) - return; -@@ -1979,8 +1984,8 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i - path_get(path); - calldata->path = *path; - -- msg.rpc_argp = &calldata->arg, -- msg.rpc_resp = &calldata->res, -+ msg.rpc_argp = &calldata->arg; -+ msg.rpc_resp = &calldata->res; - task_setup_data.callback_data = calldata; - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) -@@ -2337,6 +2342,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, - struct nfs4_state *state = NULL; - int status; - -+ if (pnfs_ld_layoutret_on_setattr(inode)) -+ pnfs_return_layout(inode, NULL, NULL, RETURN_FILE, true); -+ - nfs_fattr_init(fattr); - - /* Search for an existing open(O_WRITE) file */ -@@ -2664,7 +2672,7 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) - - if (!nfs4_sequence_done(task, &res->seq_res)) - return 0; -- if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) -+ if (nfs4_async_handle_error(task, res->server, NULL, NULL) == -EAGAIN) - return 0; - update_changeattr(dir, &res->cinfo); - nfs_post_op_update_inode(dir, res->dir_attr); -@@ -3105,19 +3113,31 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, - static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) - { - struct nfs_server 
*server = NFS_SERVER(data->inode); -+ struct nfs_client *client = server->nfs_client; - - dprintk("--> %s\n", __func__); - -+#ifdef CONFIG_NFS_V4_1 -+ if (data->pdata.pnfsflags & PNFS_NO_RPC) -+ return 0; -+ -+ /* Is this a DS session */ -+ if (data->fldata.ds_nfs_client) { -+ dprintk("%s DS read\n", __func__); -+ client = data->fldata.ds_nfs_client; -+ } -+#endif /* CONFIG_NFS_V4_1 */ -+ - if (!nfs4_sequence_done(task, &data->res.seq_res)) - return -EAGAIN; - -- if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { -- nfs_restart_rpc(task, server->nfs_client); -+ if (nfs4_async_handle_error(task, server, data->args.context->state, client) == -EAGAIN) { -+ nfs_restart_rpc(task, client); - return -EAGAIN; - } - - nfs_invalidate_atime(data->inode); -- if (task->tk_status > 0) -+ if (task->tk_status > 0 && client == server->nfs_client) - renew_lease(server, data->timestamp); - return 0; - } -@@ -3128,20 +3148,56 @@ static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message - msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; - } - -+static void pnfs4_update_write_done(struct nfs_inode *nfsi, struct nfs_write_data *data) -+{ -+#ifdef CONFIG_NFS_V4_1 -+ pnfs_update_last_write(nfsi, data->args.offset, data->res.count); -+ pnfs_need_layoutcommit(nfsi, data->args.context); -+#endif /* CONFIG_NFS_V4_1 */ -+} -+ - static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) - { - struct inode *inode = data->inode; -- -+ struct nfs_server *server = NFS_SERVER(inode); -+ struct nfs_client *client = server->nfs_client; -+ - if (!nfs4_sequence_done(task, &data->res.seq_res)) - return -EAGAIN; - -- if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { -- nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); -+#ifdef CONFIG_NFS_V4_1 -+ /* restore original count after retry? 
*/ -+ if (data->pdata.orig_count) { -+ dprintk("%s: restoring original count %u\n", __func__, -+ data->pdata.orig_count); -+ data->args.count = data->pdata.orig_count; -+ } -+ -+ if (data->pdata.pnfsflags & PNFS_NO_RPC) -+ return 0; -+ -+ /* Is this a DS session */ -+ if (data->fldata.ds_nfs_client) { -+ dprintk("%s DS write\n", __func__); -+ client = data->fldata.ds_nfs_client; -+ } -+#endif /* CONFIG_NFS_V4_1 */ -+ -+ if (nfs4_async_handle_error(task, server, data->args.context->state, client) == -EAGAIN) { -+ nfs_restart_rpc(task, client); - return -EAGAIN; - } -+ -+ /* -+ * MDS write: renew lease -+ * DS write: update lastbyte written, mark for layout commit -+ */ - if (task->tk_status >= 0) { -- renew_lease(NFS_SERVER(inode), data->timestamp); -- nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); -+ if (client == server->nfs_client) { -+ renew_lease(server, data->timestamp); -+ nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); -+ } else -+ pnfs4_update_write_done(NFS_I(inode), data); - } - return 0; - } -@@ -3154,21 +3210,42 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag - data->res.server = server; - data->timestamp = jiffies; - -+#ifdef CONFIG_NFS_V4_1 -+ /* writes to DS use pnfs vector */ -+ if (data->fldata.ds_nfs_client) { -+ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_WRITE]; -+ return; -+ } -+#endif /* CONFIG_NFS_V4_1 */ - msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE]; - } - - static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data) - { - struct inode *inode = data->inode; -- -+ struct nfs_server *server = NFS_SERVER(data->inode); -+ struct nfs_client *client = server->nfs_client; -+ -+#ifdef CONFIG_NFS_V4_1 -+ if (data->pdata.pnfsflags & PNFS_NO_RPC) -+ return 0; -+ -+ /* Is this a DS session */ -+ if (data->fldata.ds_nfs_client) { -+ dprintk("%s DS commit\n", __func__); -+ client = data->fldata.ds_nfs_client; -+ } -+#endif /* CONFIG_NFS_V4_1 */ -+ - if 
(!nfs4_sequence_done(task, &data->res.seq_res)) - return -EAGAIN; - -- if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { -+ if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL, NULL) == -EAGAIN) { - nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); - return -EAGAIN; - } -- nfs_refresh_inode(inode, data->res.fattr); -+ if (client == server->nfs_client) -+ nfs_refresh_inode(inode, data->res.fattr); - return 0; - } - -@@ -3178,6 +3255,12 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa - - data->args.bitmask = server->cache_consistency_bitmask; - data->res.server = server; -+#if defined(CONFIG_NFS_V4_1) -+ if (data->fldata.ds_nfs_client) { -+ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_COMMIT]; -+ return; -+ } -+#endif /* CONFIG_NFS_V4_1 */ - msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; - } - -@@ -3475,9 +3558,10 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen - } - - static int --nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state) -+nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state, struct nfs_client *clp) - { -- struct nfs_client *clp = server->nfs_client; -+ if (!clp) -+ clp = server->nfs_client; - - if (task->tk_status >= 0) - return 0; -@@ -3504,14 +3588,16 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, - case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: - case -NFS4ERR_SEQ_FALSE_RETRY: - case -NFS4ERR_SEQ_MISORDERED: -- dprintk("%s ERROR %d, Reset session\n", __func__, -- task->tk_status); -+ dprintk("%s ERROR %d, Reset session. 
Exchangeid " -+ "flags 0x%x\n", __func__, task->tk_status, -+ clp->cl_exchange_flags); - nfs4_schedule_state_recovery(clp); - task->tk_status = 0; - return -EAGAIN; - #endif /* CONFIG_NFS_V4_1 */ - case -NFS4ERR_DELAY: -- nfs_inc_server_stats(server, NFSIOS_DELAY); -+ if (server) -+ nfs_inc_server_stats(server, NFSIOS_DELAY); - case -NFS4ERR_GRACE: - case -EKEYEXPIRED: - rpc_delay(task, NFS4_POLL_RETRY_MAX); -@@ -3524,6 +3610,8 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, - task->tk_status = nfs4_map_errors(task->tk_status); - return 0; - do_state_recovery: -+ if (is_ds_only_client(clp)) -+ return 0; - rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); - nfs4_schedule_state_recovery(clp); - if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) -@@ -3657,8 +3745,8 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) - renew_lease(data->res.server, data->timestamp); - break; - default: -- if (nfs4_async_handle_error(task, data->res.server, NULL) == -- -EAGAIN) { -+ if (nfs4_async_handle_error(task, data->res.server, NULL, NULL) -+ == -EAGAIN) { - nfs_restart_rpc(task, data->res.server->nfs_client); - return; - } -@@ -3678,7 +3766,7 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) - - d_data = (struct nfs4_delegreturndata *)data; - -- if (nfs4_setup_sequence(d_data->res.server, -+ if (nfs4_setup_sequence(d_data->res.server, NULL, - &d_data->args.seq_args, - &d_data->res.seq_res, 1, task)) - return; -@@ -3913,7 +4001,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) - case -NFS4ERR_EXPIRED: - break; - default: -- if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) -+ if (nfs4_async_handle_error(task, calldata->server, NULL, NULL) == -EAGAIN) - nfs_restart_rpc(task, - calldata->server->nfs_client); - } -@@ -3931,7 +4019,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) - return; - } - calldata->timestamp = jiffies; -- if 
(nfs4_setup_sequence(calldata->server, -+ if (nfs4_setup_sequence(calldata->server, NULL, - &calldata->arg.seq_args, - &calldata->res.seq_res, 1, task)) - return; -@@ -3973,8 +4061,8 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, - return ERR_PTR(-ENOMEM); - } - -- msg.rpc_argp = &data->arg, -- msg.rpc_resp = &data->res, -+ msg.rpc_argp = &data->arg; -+ msg.rpc_resp = &data->res; - task_setup_data.callback_data = data; - return rpc_run_task(&task_setup_data); - } -@@ -4086,7 +4174,7 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) - } else - data->arg.new_lock_owner = 0; - data->timestamp = jiffies; -- if (nfs4_setup_sequence(data->server, -+ if (nfs4_setup_sequence(data->server, NULL, - &data->arg.seq_args, - &data->res.seq_res, 1, task)) - return; -@@ -4211,8 +4299,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f - data->arg.reclaim = NFS_LOCK_RECLAIM; - task_setup_data.callback_ops = &nfs4_recover_lock_ops; - } -- msg.rpc_argp = &data->arg, -- msg.rpc_resp = &data->res, -+ msg.rpc_argp = &data->arg; -+ msg.rpc_resp = &data->res; - task_setup_data.callback_data = data; - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) -@@ -4557,7 +4645,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) - nfs4_verifier verifier; - struct nfs41_exchange_id_args args = { - .client = clp, -- .flags = clp->cl_exchange_flags, -+ .flags = clp->cl_exchange_flags & ~EXCHGID4_FLAG_CONFIRMED_R, - }; - struct nfs41_exchange_id_res res = { - .client = clp, -@@ -5081,7 +5169,7 @@ int nfs4_init_session(struct nfs_server *server) - session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead; - session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead; - -- ret = nfs4_recover_expired_lease(server); -+ ret = nfs4_recover_expired_lease(server->nfs_client); - if (!ret) - ret = nfs4_check_client_ready(clp); - return ret; -@@ -5333,6 +5421,412 @@ out: - dprintk("<-- %s 
status=%d\n", __func__, status); - return status; - } -+ -+static void -+nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) -+{ -+ struct nfs4_layoutget *lgp = calldata; -+ struct inode *ino = lgp->args.inode; -+ struct nfs_inode *nfsi = NFS_I(ino); -+ struct nfs_server *server = NFS_SERVER(ino); -+ struct pnfs_layout_segment *lseg; -+ -+ dprintk("--> %s\n", __func__); -+ spin_lock(&ino->i_lock); -+ lseg = pnfs_has_layout(nfsi->layout, &lgp->args.range); -+ if (likely(!lseg)) { -+ spin_unlock(&ino->i_lock); -+ dprintk("%s: no lseg found, proceeding\n", __func__); -+ if (!nfs4_setup_sequence(server, NULL, &lgp->args.seq_args, -+ &lgp->res.seq_res, 0, task)) -+ rpc_call_start(task); -+ return; -+ } -+ if (!lseg->valid) { -+ spin_unlock(&ino->i_lock); -+ dprintk("%s: invalid lseg found, waiting\n", __func__); -+ rpc_sleep_on(&nfsi->lo_rpcwaitq, task, NULL); -+ return; -+ } -+ get_lseg(lseg); -+ *lgp->lsegpp = lseg; -+ spin_unlock(&ino->i_lock); -+ dprintk("%s: valid lseg found, no rpc required\n", __func__); -+ rpc_exit(task, NFS4_OK); -+} -+ -+static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) -+{ -+ struct nfs4_layoutget *lgp = calldata; -+ struct nfs_server *server = NFS_SERVER(lgp->args.inode); -+ -+ dprintk("--> %s\n", __func__); -+ -+ if (!nfs4_sequence_done(task, &lgp->res.seq_res)) -+ return; -+ -+ switch (task->tk_status) { -+ case 0: -+ break; -+ case -NFS4ERR_LAYOUTTRYLATER: -+ case -NFS4ERR_RECALLCONFLICT: -+ task->tk_status = -NFS4ERR_DELAY; -+ /* Fall through */ -+ default: -+ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) { -+ rpc_restart_call_prepare(task); -+ return; -+ } -+ } -+ lgp->status = task->tk_status; -+ dprintk("<-- %s\n", __func__); -+} -+ -+static void nfs4_layoutget_release(void *calldata) -+{ -+ struct nfs4_layoutget *lgp = calldata; -+ -+ dprintk("--> %s\n", __func__); -+ put_layout_hdr(lgp->args.inode); -+ if (lgp->res.layout.buf != NULL) -+ free_page((unsigned long) 
lgp->res.layout.buf); -+ put_nfs_open_context(lgp->args.ctx); -+ kfree(calldata); -+ dprintk("<-- %s\n", __func__); -+} -+ -+static const struct rpc_call_ops nfs4_layoutget_call_ops = { -+ .rpc_call_prepare = nfs4_layoutget_prepare, -+ .rpc_call_done = nfs4_layoutget_done, -+ .rpc_release = nfs4_layoutget_release, -+}; -+ -+int nfs4_proc_layoutget(struct nfs4_layoutget *lgp) -+{ -+ struct nfs_server *server = NFS_SERVER(lgp->args.inode); -+ struct rpc_task *task; -+ struct rpc_message msg = { -+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET], -+ .rpc_argp = &lgp->args, -+ .rpc_resp = &lgp->res, -+ }; -+ struct rpc_task_setup task_setup_data = { -+ .rpc_client = server->client, -+ .rpc_message = &msg, -+ .callback_ops = &nfs4_layoutget_call_ops, -+ .callback_data = lgp, -+ .flags = RPC_TASK_ASYNC, -+ }; -+ int status = 0; -+ -+ dprintk("--> %s\n", __func__); -+ -+ lgp->res.layout.buf = (void *)__get_free_page(GFP_NOFS); -+ if (lgp->res.layout.buf == NULL) { -+ nfs4_layoutget_release(lgp); -+ return -ENOMEM; -+ } -+ -+ lgp->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; -+ task = rpc_run_task(&task_setup_data); -+ if (IS_ERR(task)) -+ return PTR_ERR(task); -+ status = nfs4_wait_for_completion_rpc_task(task); -+ if (status != 0) -+ goto out; -+ status = lgp->status; -+ if (status != 0) -+ goto out; -+ status = pnfs_layout_process(lgp); -+out: -+ rpc_put_task(task); -+ dprintk("<-- %s status=%d\n", __func__, status); -+ return status; -+} -+ -+static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *data) -+{ -+ struct nfs4_layoutcommit_data *ldata = -+ (struct nfs4_layoutcommit_data *)data; -+ struct nfs_server *server = NFS_SERVER(ldata->args.inode); -+ -+ if (nfs4_setup_sequence(server, NULL, &ldata->args.seq_args, -+ &ldata->res.seq_res, 1, task)) -+ return; -+ rpc_call_start(task); -+} -+ -+static void -+nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) -+{ -+ struct nfs4_layoutcommit_data *data = -+ (struct nfs4_layoutcommit_data 
*)calldata; -+ struct nfs_server *server = NFS_SERVER(data->args.inode); -+ -+ if (!nfs4_sequence_done(task, &data->res.seq_res)) -+ return; -+ -+ if (RPC_ASSASSINATED(task)) -+ return; -+ -+ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) -+ nfs_restart_rpc(task, server->nfs_client); -+ -+ data->status = task->tk_status; -+} -+ -+static void nfs4_layoutcommit_release(void *lcdata) -+{ -+ struct nfs4_layoutcommit_data *data = -+ (struct nfs4_layoutcommit_data *)lcdata; -+ -+ pnfs_cleanup_layoutcommit(lcdata); -+ /* Matched by get_layout in pnfs_layoutcommit_inode */ -+ put_layout_hdr(data->args.inode); -+ put_rpccred(data->cred); -+ kfree(lcdata); -+} -+ -+static const struct rpc_call_ops nfs4_layoutcommit_ops = { -+ .rpc_call_prepare = nfs4_layoutcommit_prepare, -+ .rpc_call_done = nfs4_layoutcommit_done, -+ .rpc_release = nfs4_layoutcommit_release, -+}; -+ -+/* Execute a layoutcommit to the server */ -+int -+nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, int issync) -+{ -+ struct rpc_message msg = { -+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTCOMMIT], -+ .rpc_argp = &data->args, -+ .rpc_resp = &data->res, -+ .rpc_cred = data->cred, -+ }; -+ struct rpc_task_setup task_setup_data = { -+ .task = &data->task, -+ .rpc_client = NFS_CLIENT(data->args.inode), -+ .rpc_message = &msg, -+ .callback_ops = &nfs4_layoutcommit_ops, -+ .callback_data = data, -+ .flags = RPC_TASK_ASYNC, -+ }; -+ struct rpc_task *task; -+ int status = 0; -+ -+ dprintk("NFS: %4d initiating layoutcommit call. 
%llu@%llu lbw: %llu " -+ "type: %d issync %d\n", -+ data->task.tk_pid, -+ data->args.range.length, -+ data->args.range.offset, -+ data->args.lastbytewritten, -+ data->args.layout_type, issync); -+ -+ data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; -+ task = rpc_run_task(&task_setup_data); -+ if (IS_ERR(task)) -+ return PTR_ERR(task); -+ if (!issync) -+ goto out; -+ status = nfs4_wait_for_completion_rpc_task(task); -+ if (status != 0) -+ goto out; -+ status = data->status; -+out: -+ dprintk("%s: status %d\n", __func__, status); -+ rpc_put_task(task); -+ return 0; -+} -+ -+static void -+nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata) -+{ -+ struct nfs4_layoutreturn *lrp = calldata; -+ struct inode *ino = lrp->args.inode; -+ struct nfs_inode *nfsi = NFS_I(ino); -+ struct nfs_server *server = NFS_SERVER(ino); -+ -+ dprintk("--> %s\n", __func__); -+ if ((lrp->args.return_type == RETURN_FILE) && -+ pnfs_return_layout_barrier(nfsi, &lrp->args.range)) { -+ dprintk("%s: waiting on barrier\n", __func__); -+ rpc_sleep_on(&nfsi->lo_rpcwaitq, task, NULL); -+ return; -+ } -+ if (lrp->stateid) { -+ /* Forget the layout, without sending the return */ -+ rpc_exit(task, 0); -+ return; -+ } -+ if (nfs4_setup_sequence(server, NULL, &lrp->args.seq_args, -+ &lrp->res.seq_res, 0, task)) -+ return; -+ rpc_call_start(task); -+} -+ -+static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) -+{ -+ struct nfs4_layoutreturn *lrp = calldata; -+ struct inode *ino = lrp->args.inode; -+ struct nfs_server *server = NFS_SERVER(ino); -+ -+ dprintk("--> %s\n", __func__); -+ -+ if (!nfs4_sequence_done(task, &lrp->res.seq_res)) -+ return; -+ -+ if (RPC_ASSASSINATED(task)) -+ return; -+ -+ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) -+ nfs_restart_rpc(task, server->nfs_client); -+ -+ dprintk("<-- %s\n", __func__); -+} -+ -+static void nfs4_layoutreturn_release(void *calldata) -+{ -+ struct nfs4_layoutreturn *lrp = calldata; -+ struct 
pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout; -+ -+ dprintk("--> %s return_type %d lo %p\n", __func__, -+ lrp->args.return_type, lo); -+ -+ pnfs_layoutreturn_release(lrp); -+ kfree(calldata); -+ dprintk("<-- %s\n", __func__); -+} -+ -+static const struct rpc_call_ops nfs4_layoutreturn_call_ops = { -+ .rpc_call_prepare = nfs4_layoutreturn_prepare, -+ .rpc_call_done = nfs4_layoutreturn_done, -+ .rpc_release = nfs4_layoutreturn_release, -+}; -+ -+int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool issync) -+{ -+ struct inode *ino = lrp->args.inode; -+ struct nfs_server *server = NFS_SERVER(ino); -+ struct rpc_task *task; -+ struct rpc_message msg = { -+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN], -+ .rpc_argp = &lrp->args, -+ .rpc_resp = &lrp->res, -+ }; -+ struct rpc_task_setup task_setup_data = { -+ .rpc_client = server->client, -+ .rpc_message = &msg, -+ .callback_ops = &nfs4_layoutreturn_call_ops, -+ .callback_data = lrp, -+ .flags = RPC_TASK_ASYNC, -+ }; -+ int status = 0; -+ -+ dprintk("--> %s\n", __func__); -+ lrp->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; -+ task = rpc_run_task(&task_setup_data); -+ if (IS_ERR(task)) -+ return PTR_ERR(task); -+ if (!issync) -+ goto out; -+ status = nfs4_wait_for_completion_rpc_task(task); -+ if (status != 0) -+ goto out; -+ status = task->tk_status; -+out: -+ dprintk("<-- %s\n", __func__); -+ rpc_put_task(task); -+ return status; -+} -+ -+/* -+ * Retrieve the list of Data Server devices from the MDS. 
-+ */ -+static int _nfs4_getdevicelist(struct nfs_server *server, -+ const struct nfs_fh *fh, -+ struct pnfs_devicelist *devlist) -+{ -+ struct nfs4_getdevicelist_args args = { -+ .fh = fh, -+ .layoutclass = server->pnfs_curr_ld->id, -+ }; -+ struct nfs4_getdevicelist_res res = { -+ .devlist = devlist, -+ }; -+ struct rpc_message msg = { -+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST], -+ .rpc_argp = &args, -+ .rpc_resp = &res, -+ .rpc_cred = nfs4_get_machine_cred(server->nfs_client), -+ }; -+ int status; -+ -+ dprintk("--> %s\n", __func__); -+ status = nfs4_call_sync(server, &msg, &args, &res, 0); -+ put_rpccred(msg.rpc_cred); -+ dprintk("<-- %s status=%d\n", __func__, status); -+ return status; -+} -+ -+int nfs4_proc_getdevicelist(struct nfs_server *server, -+ const struct nfs_fh *fh, -+ struct pnfs_devicelist *devlist) -+{ -+ struct nfs4_exception exception = { }; -+ int err; -+ -+ do { -+ err = nfs4_handle_exception(server, -+ _nfs4_getdevicelist(server, fh, devlist), -+ &exception); -+ } while (exception.retry); -+ -+ dprintk("%s: err=%d, num_devs=%u\n", __func__, -+ err, devlist->num_devs); -+ -+ return err; -+} -+EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist); -+ -+static int -+_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) -+{ -+ struct nfs4_getdeviceinfo_args args = { -+ .pdev = pdev, -+ }; -+ struct nfs4_getdeviceinfo_res res = { -+ .pdev = pdev, -+ }; -+ struct rpc_message msg = { -+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO], -+ .rpc_argp = &args, -+ .rpc_resp = &res, -+ .rpc_cred = nfs4_get_machine_cred(server->nfs_client), -+ }; -+ int status; -+ -+ dprintk("--> %s\n", __func__); -+ status = nfs4_call_sync(server, &msg, &args, &res, 0); -+ put_rpccred(msg.rpc_cred); -+ dprintk("<-- %s status=%d\n", __func__, status); -+ -+ return status; -+} -+ -+int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) -+{ -+ struct nfs4_exception exception = { }; -+ int err; -+ -+ do { -+ 
err = nfs4_handle_exception(server, -+ _nfs4_proc_getdeviceinfo(server, pdev), -+ &exception); -+ } while (exception.retry); -+ return err; -+} -+EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo); -+ - #endif /* CONFIG_NFS_V4_1 */ - - struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { -diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c -index 72b6c58..b57f41f 100644 ---- a/fs/nfs/nfs4renewd.c -+++ b/fs/nfs/nfs4renewd.c -@@ -64,7 +64,7 @@ nfs4_renew_state(struct work_struct *work) - ops = clp->cl_mvops->state_renewal_ops; - dprintk("%s: start\n", __func__); - /* Are there any active superblocks? */ -- if (list_empty(&clp->cl_superblocks)) -+ if (list_empty(&clp->cl_superblocks) && !is_ds_only_client(clp)) - goto out; - spin_lock(&clp->cl_lock); - lease = clp->cl_lease_time; -diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c -index 3e2f19b..3168d77 100644 ---- a/fs/nfs/nfs4state.c -+++ b/fs/nfs/nfs4state.c -@@ -53,6 +53,7 @@ - #include "callback.h" - #include "delegation.h" - #include "internal.h" -+#include "pnfs.h" - - #define OPENOWNER_POOL_SIZE 8 - -@@ -126,6 +127,11 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp) - int status; - struct nfs_fsinfo fsinfo; - -+ if (is_ds_only_client(clp)) { -+ nfs4_schedule_state_renewal(clp); -+ return 0; -+ } -+ - status = nfs4_proc_get_lease_time(clp, &fsinfo); - if (status == 0) { - /* Update lease time and schedule renewal */ -@@ -182,6 +188,7 @@ static int nfs4_begin_drain_session(struct nfs_client *clp) - int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) - { - int status; -+ u32 req_exchange_flags = clp->cl_exchange_flags; - - nfs4_begin_drain_session(clp); - status = nfs4_proc_exchange_id(clp, cred); -@@ -190,6 +197,16 @@ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) - status = nfs4_proc_create_session(clp); - if (status != 0) - goto out; -+ if (is_ds_only_session(req_exchange_flags)) { -+ clp->cl_exchange_flags &= -+ ~(EXCHGID4_FLAG_USE_PNFS_MDS | 
EXCHGID4_FLAG_USE_NON_PNFS); -+ if (!is_ds_only_session(clp->cl_exchange_flags)) { -+ nfs4_destroy_session(clp->cl_session); -+ clp->cl_session = NULL; -+ status = -ENOTSUPP; -+ goto out; -+ } -+ } - nfs41_setup_state_renewal(clp); - nfs_mark_client_ready(clp, NFS_CS_READY); - out: -@@ -583,8 +600,24 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state, - if (!call_close) { - nfs4_put_open_state(state); - nfs4_put_state_owner(owner); -- } else -+ } else { -+ u32 roc_iomode; -+ struct nfs_inode *nfsi = NFS_I(state->inode); -+ -+ if (has_layout(nfsi) && -+ (roc_iomode = pnfs_layout_roc_iomode(nfsi)) != 0) { -+ struct pnfs_layout_range range = { -+ .iomode = roc_iomode, -+ .offset = 0, -+ .length = NFS4_MAX_UINT64, -+ }; -+ -+ pnfs_return_layout(state->inode, &range, NULL, -+ RETURN_FILE, wait); -+ } -+ - nfs4_do_close(path, state, gfp_mask, wait); -+ } - } - - void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode) -@@ -1447,6 +1480,7 @@ static void nfs4_state_manager(struct nfs_client *clp) - } - clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); - set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); -+ pnfs_destroy_all_layouts(clp); - } - - if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { -diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c -index 08ef912..30ce2ed 100644 ---- a/fs/nfs/nfs4xdr.c -+++ b/fs/nfs/nfs4xdr.c -@@ -52,6 +52,7 @@ - #include - #include "nfs4_fs.h" - #include "internal.h" -+#include "pnfs.h" - - #define NFSDBG_FACILITY NFSDBG_XDR - -@@ -89,7 +90,7 @@ static int nfs4_stat_to_errno(int); - #define encode_getfh_maxsz (op_encode_hdr_maxsz) - #define decode_getfh_maxsz (op_decode_hdr_maxsz + 1 + \ - ((3+NFS4_FHSIZE) >> 2)) --#define nfs4_fattr_bitmap_maxsz 3 -+#define nfs4_fattr_bitmap_maxsz 4 - #define encode_getattr_maxsz (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz) - #define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2)) - #define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2)) -@@ 
-111,7 +112,11 @@ static int nfs4_stat_to_errno(int); - #define encode_restorefh_maxsz (op_encode_hdr_maxsz) - #define decode_restorefh_maxsz (op_decode_hdr_maxsz) - #define encode_fsinfo_maxsz (encode_getattr_maxsz) --#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 11) -+/* The 5 accounts for the PNFS attributes, and assumes that at most three -+ * layout types will be returned. -+ */ -+#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + \ -+ nfs4_fattr_bitmap_maxsz + 8 + 5) - #define encode_renew_maxsz (op_encode_hdr_maxsz + 3) - #define decode_renew_maxsz (op_decode_hdr_maxsz) - #define encode_setclientid_maxsz \ -@@ -310,6 +315,41 @@ static int nfs4_stat_to_errno(int); - XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) - #define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) - #define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) -+#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \ -+ encode_verifier_maxsz) -+#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \ -+ 2 /* nfs_cookie4 gdlr_cookie */ + \ -+ decode_verifier_maxsz \ -+ /* verifier4 gdlr_verifier */ + \ -+ 1 /* gdlr_deviceid_list count */ + \ -+ XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \ -+ NFS4_DEVICEID4_SIZE) \ -+ /* gdlr_deviceid_list */ + \ -+ 1 /* bool gdlr_eof */) -+#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \ -+ XDR_QUADLEN(NFS4_DEVICEID4_SIZE)) -+#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ -+ 1 /* layout type */ + \ -+ 1 /* opaque devaddr4 length */ + \ -+ /* devaddr4 payload is read into page */ \ -+ 1 /* notification bitmap length */ + \ -+ 1 /* notification bitmap */) -+#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \ -+ encode_stateid_maxsz) -+#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ -+ decode_stateid_maxsz + \ -+ XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE)) -+#define encode_layoutcommit_maxsz (18 + \ -+ XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE) + \ -+ op_encode_hdr_maxsz + \ -+ encode_stateid_maxsz) -+#define 
decode_layoutcommit_maxsz (3 + op_decode_hdr_maxsz) -+#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \ -+ encode_stateid_maxsz + \ -+ 1 /* FIXME: opaque lrf_body always empty at -+ *the moment */) -+#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \ -+ 1 + decode_stateid_maxsz) - #else /* CONFIG_NFS_V4_1 */ - #define encode_sequence_maxsz 0 - #define decode_sequence_maxsz 0 -@@ -699,6 +739,60 @@ static int nfs4_stat_to_errno(int); - #define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ - decode_sequence_maxsz + \ - decode_reclaim_complete_maxsz) -+#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \ -+ encode_sequence_maxsz + \ -+ encode_putfh_maxsz + \ -+ encode_getdevicelist_maxsz) -+#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \ -+ decode_sequence_maxsz + \ -+ decode_putfh_maxsz + \ -+ decode_getdevicelist_maxsz) -+#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \ -+ encode_sequence_maxsz +\ -+ encode_getdeviceinfo_maxsz) -+#define NFS4_dec_getdeviceinfo_sz (compound_decode_hdr_maxsz + \ -+ decode_sequence_maxsz + \ -+ decode_getdeviceinfo_maxsz) -+#define NFS4_enc_layoutget_sz (compound_encode_hdr_maxsz + \ -+ encode_sequence_maxsz + \ -+ encode_putfh_maxsz + \ -+ encode_layoutget_maxsz) -+#define NFS4_dec_layoutget_sz (compound_decode_hdr_maxsz + \ -+ decode_sequence_maxsz + \ -+ decode_putfh_maxsz + \ -+ decode_layoutget_maxsz) -+#define NFS4_enc_layoutcommit_sz (compound_encode_hdr_maxsz + \ -+ encode_sequence_maxsz +\ -+ encode_putfh_maxsz + \ -+ encode_layoutcommit_maxsz + \ -+ encode_getattr_maxsz) -+#define NFS4_dec_layoutcommit_sz (compound_decode_hdr_maxsz + \ -+ decode_sequence_maxsz + \ -+ decode_putfh_maxsz + \ -+ decode_layoutcommit_maxsz + \ -+ decode_getattr_maxsz) -+#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \ -+ encode_sequence_maxsz + \ -+ encode_putfh_maxsz + \ -+ encode_layoutreturn_maxsz) -+#define NFS4_dec_layoutreturn_sz 
(compound_decode_hdr_maxsz + \ -+ decode_sequence_maxsz + \ -+ decode_putfh_maxsz + \ -+ decode_layoutreturn_maxsz) -+#define NFS4_enc_dswrite_sz (compound_encode_hdr_maxsz + \ -+ encode_sequence_maxsz +\ -+ encode_putfh_maxsz + \ -+ encode_write_maxsz) -+#define NFS4_dec_dswrite_sz (compound_decode_hdr_maxsz + \ -+ decode_sequence_maxsz + \ -+ decode_putfh_maxsz + \ -+ decode_write_maxsz) -+#define NFS4_enc_dscommit_sz (compound_encode_hdr_maxsz + \ -+ encode_putfh_maxsz + \ -+ encode_commit_maxsz) -+#define NFS4_dec_dscommit_sz (compound_decode_hdr_maxsz + \ -+ decode_putfh_maxsz + \ -+ decode_commit_maxsz) - - const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + - compound_encode_hdr_maxsz + -@@ -1003,6 +1097,35 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm - hdr->replen += decode_getattr_maxsz; - } - -+static void -+encode_getattr_three(struct xdr_stream *xdr, -+ uint32_t bm0, uint32_t bm1, uint32_t bm2, -+ struct compound_hdr *hdr) -+{ -+ __be32 *p; -+ -+ p = reserve_space(xdr, 4); -+ *p = cpu_to_be32(OP_GETATTR); -+ if (bm2) { -+ p = reserve_space(xdr, 16); -+ *p++ = cpu_to_be32(3); -+ *p++ = cpu_to_be32(bm0); -+ *p++ = cpu_to_be32(bm1); -+ *p = cpu_to_be32(bm2); -+ } else if (bm1) { -+ p = reserve_space(xdr, 12); -+ *p++ = cpu_to_be32(2); -+ *p++ = cpu_to_be32(bm0); -+ *p = cpu_to_be32(bm1); -+ } else { -+ p = reserve_space(xdr, 8); -+ *p++ = cpu_to_be32(1); -+ *p = cpu_to_be32(bm0); -+ } -+ hdr->nops++; -+ hdr->replen += decode_getattr_maxsz; -+} -+ - static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) - { - encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0], -@@ -1011,8 +1134,11 @@ static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct c - - static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) - { -- encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0], -- bitmask[1] & 
nfs4_fsinfo_bitmap[1], hdr); -+ encode_getattr_three(xdr, -+ bitmask[0] & nfs4_fsinfo_bitmap[0], -+ bitmask[1] & nfs4_fsinfo_bitmap[1], -+ bitmask[2] & nfs4_fsinfo_bitmap[2], -+ hdr); - } - - static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) -@@ -1726,6 +1852,155 @@ static void encode_sequence(struct xdr_stream *xdr, - #endif /* CONFIG_NFS_V4_1 */ - } - -+#ifdef CONFIG_NFS_V4_1 -+static void -+encode_getdevicelist(struct xdr_stream *xdr, -+ const struct nfs4_getdevicelist_args *args, -+ struct compound_hdr *hdr) -+{ -+ __be32 *p; -+ nfs4_verifier dummy = { -+ .data = "dummmmmy", -+ }; -+ -+ p = reserve_space(xdr, 20); -+ *p++ = cpu_to_be32(OP_GETDEVICELIST); -+ *p++ = cpu_to_be32(args->layoutclass); -+ *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM); -+ xdr_encode_hyper(p, 0ULL); /* cookie */ -+ encode_nfs4_verifier(xdr, &dummy); -+ hdr->nops++; -+ hdr->replen += decode_getdevicelist_maxsz; -+} -+ -+static void -+encode_getdeviceinfo(struct xdr_stream *xdr, -+ const struct nfs4_getdeviceinfo_args *args, -+ struct compound_hdr *hdr) -+{ -+ __be32 *p; -+ -+ p = reserve_space(xdr, 16 + NFS4_DEVICEID4_SIZE); -+ *p++ = cpu_to_be32(OP_GETDEVICEINFO); -+ p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data, -+ NFS4_DEVICEID4_SIZE); -+ *p++ = cpu_to_be32(args->pdev->layout_type); -+ *p++ = cpu_to_be32(args->pdev->pglen); /* gdia_maxcount */ -+ *p++ = cpu_to_be32(0); /* bitmap length 0 */ -+ hdr->nops++; -+ hdr->replen += decode_getdeviceinfo_maxsz; -+} -+ -+static void -+encode_layoutget(struct xdr_stream *xdr, -+ const struct nfs4_layoutget_args *args, -+ struct compound_hdr *hdr) -+{ -+ nfs4_stateid stateid; -+ __be32 *p; -+ -+ p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE); -+ *p++ = cpu_to_be32(OP_LAYOUTGET); -+ *p++ = cpu_to_be32(0); /* Signal layout available */ -+ *p++ = cpu_to_be32(args->type); -+ *p++ = cpu_to_be32(args->range.iomode); -+ p = xdr_encode_hyper(p, args->range.offset); -+ p = xdr_encode_hyper(p, 
args->range.length); -+ p = xdr_encode_hyper(p, args->minlength); -+ pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout, -+ args->ctx->state); -+ p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE); -+ *p = cpu_to_be32(args->maxcount); -+ -+ dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n", -+ __func__, -+ args->type, -+ args->range.iomode, -+ (unsigned long)args->range.offset, -+ (unsigned long)args->range.length, -+ args->maxcount); -+ hdr->nops++; -+ hdr->replen += decode_layoutget_maxsz; -+} -+ -+static int -+encode_layoutcommit(struct xdr_stream *xdr, -+ const struct nfs4_layoutcommit_args *args, -+ struct compound_hdr *hdr) -+{ -+ __be32 *p; -+ -+ dprintk("%s: %llu@%llu lbw: %llu type: %d\n", __func__, -+ args->range.length, args->range.offset, args->lastbytewritten, -+ args->layout_type); -+ -+ p = reserve_space(xdr, 40 + NFS4_STATEID_SIZE); -+ *p++ = cpu_to_be32(OP_LAYOUTCOMMIT); -+ p = xdr_encode_hyper(p, args->range.offset); -+ p = xdr_encode_hyper(p, args->range.length); -+ *p++ = cpu_to_be32(0); /* reclaim */ -+ p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE); -+ *p++ = cpu_to_be32(1); /* newoffset = TRUE */ -+ p = xdr_encode_hyper(p, args->lastbytewritten); -+ *p = cpu_to_be32(args->time_modify_changed != 0); -+ if (args->time_modify_changed) { -+ p = reserve_space(xdr, 12); -+ *p++ = cpu_to_be32(0); -+ *p++ = cpu_to_be32(args->time_modify.tv_sec); -+ *p = cpu_to_be32(args->time_modify.tv_nsec); -+ } -+ -+ p = reserve_space(xdr, 4); -+ *p = cpu_to_be32(args->layout_type); -+ -+ if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutcommit) { -+ NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutcommit( -+ NFS_I(args->inode)->layout, xdr, args); -+ } else { -+ p = reserve_space(xdr, 4); -+ xdr_encode_opaque(p, NULL, 0); -+ } -+ -+ hdr->nops++; -+ hdr->replen += decode_layoutcommit_maxsz; -+ return 0; -+} -+ -+static void -+encode_layoutreturn(struct xdr_stream *xdr, -+ const struct 
nfs4_layoutreturn_args *args, -+ struct compound_hdr *hdr) -+{ -+ nfs4_stateid stateid; -+ __be32 *p; -+ -+ p = reserve_space(xdr, 20); -+ *p++ = cpu_to_be32(OP_LAYOUTRETURN); -+ *p++ = cpu_to_be32(args->reclaim); -+ *p++ = cpu_to_be32(args->layout_type); -+ *p++ = cpu_to_be32(args->range.iomode); -+ *p = cpu_to_be32(args->return_type); -+ if (args->return_type == RETURN_FILE) { -+ p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE); -+ p = xdr_encode_hyper(p, args->range.offset); -+ p = xdr_encode_hyper(p, args->range.length); -+ pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout, -+ NULL); -+ p = xdr_encode_opaque_fixed(p, &stateid.data, -+ NFS4_STATEID_SIZE); -+ if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) { -+ NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn( -+ NFS_I(args->inode)->layout, xdr, args); -+ } else { -+ p = reserve_space(xdr, 4); -+ *p = cpu_to_be32(0); -+ } -+ } -+ hdr->nops++; -+ hdr->replen += decode_layoutreturn_maxsz; -+} -+#endif /* CONFIG_NFS_V4_1 */ -+ - /* - * END OF "GENERIC" ENCODE ROUTINES. 
- */ -@@ -2374,7 +2649,7 @@ static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str - struct compound_hdr hdr = { - .nops = 0, - }; -- const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; -+ const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME, 0, 0 }; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); -@@ -2513,7 +2788,7 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p, - struct compound_hdr hdr = { - .minorversion = nfs4_xdr_minorversion(&args->la_seq_args), - }; -- const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; -+ const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME, 0, 0 }; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, req, &hdr); -@@ -2543,6 +2818,153 @@ static int nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, uint32_t *p, - return 0; - } - -+/* -+ * Encode GETDEVICELIST request -+ */ -+static int -+nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req, uint32_t *p, -+ struct nfs4_getdevicelist_args *args) -+{ -+ struct xdr_stream xdr; -+ struct compound_hdr hdr = { -+ .minorversion = nfs4_xdr_minorversion(&args->seq_args), -+ }; -+ -+ xdr_init_encode(&xdr, &req->rq_snd_buf, p); -+ encode_compound_hdr(&xdr, req, &hdr); -+ encode_sequence(&xdr, &args->seq_args, &hdr); -+ encode_putfh(&xdr, args->fh, &hdr); -+ encode_getdevicelist(&xdr, args, &hdr); -+ encode_nops(&hdr); -+ return 0; -+} -+ -+/* -+ * Encode GETDEVICEINFO request -+ */ -+static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p, -+ struct nfs4_getdeviceinfo_args *args) -+{ -+ struct xdr_stream xdr; -+ struct compound_hdr hdr = { -+ .minorversion = nfs4_xdr_minorversion(&args->seq_args), -+ }; -+ -+ xdr_init_encode(&xdr, &req->rq_snd_buf, p); -+ encode_compound_hdr(&xdr, req, &hdr); -+ encode_sequence(&xdr, &args->seq_args, &hdr); -+ encode_getdeviceinfo(&xdr, args, &hdr); -+ -+ /* set up reply kvec. 
Subtract notification bitmap max size (2) -+ * so that notification bitmap is put in xdr_buf tail */ -+ xdr_inline_pages(&req->rq_rcv_buf, (hdr.replen - 2) << 2, -+ args->pdev->pages, args->pdev->pgbase, -+ args->pdev->pglen); -+ -+ encode_nops(&hdr); -+ return 0; -+} -+ -+/* -+ * Encode LAYOUTGET request -+ */ -+static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p, -+ struct nfs4_layoutget_args *args) -+{ -+ struct xdr_stream xdr; -+ struct compound_hdr hdr = { -+ .minorversion = nfs4_xdr_minorversion(&args->seq_args), -+ }; -+ -+ xdr_init_encode(&xdr, &req->rq_snd_buf, p); -+ encode_compound_hdr(&xdr, req, &hdr); -+ encode_sequence(&xdr, &args->seq_args, &hdr); -+ encode_putfh(&xdr, NFS_FH(args->inode), &hdr); -+ encode_layoutget(&xdr, args, &hdr); -+ encode_nops(&hdr); -+ return 0; -+} -+ -+/* -+ * Encode LAYOUTCOMMIT request -+ */ -+static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req, uint32_t *p, -+ struct nfs4_layoutcommit_args *args) -+{ -+ struct xdr_stream xdr; -+ struct compound_hdr hdr = { -+ .minorversion = nfs4_xdr_minorversion(&args->seq_args), -+ }; -+ -+ xdr_init_encode(&xdr, &req->rq_snd_buf, p); -+ encode_compound_hdr(&xdr, req, &hdr); -+ encode_sequence(&xdr, &args->seq_args, &hdr); -+ encode_putfh(&xdr, args->fh, &hdr); -+ encode_layoutcommit(&xdr, args, &hdr); -+ encode_getfattr(&xdr, args->bitmask, &hdr); -+ encode_nops(&hdr); -+ return 0; -+} -+ -+/* -+ * Encode LAYOUTRETURN request -+ */ -+static int nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req, uint32_t *p, -+ struct nfs4_layoutreturn_args *args) -+{ -+ struct xdr_stream xdr; -+ struct compound_hdr hdr = { -+ .minorversion = nfs4_xdr_minorversion(&args->seq_args), -+ }; -+ -+ xdr_init_encode(&xdr, &req->rq_snd_buf, p); -+ encode_compound_hdr(&xdr, req, &hdr); -+ encode_sequence(&xdr, &args->seq_args, &hdr); -+ encode_putfh(&xdr, NFS_FH(args->inode), &hdr); -+ encode_layoutreturn(&xdr, args, &hdr); -+ encode_nops(&hdr); -+ return 0; -+} -+ -+/* -+ * Encode a pNFS File 
Layout Data Server WRITE request -+ */ -+static int nfs4_xdr_enc_dswrite(struct rpc_rqst *req, uint32_t *p, -+ struct nfs_writeargs *args) -+{ -+ struct xdr_stream xdr; -+ struct compound_hdr hdr = { -+ .minorversion = nfs4_xdr_minorversion(&args->seq_args), -+ }; -+ -+ xdr_init_encode(&xdr, &req->rq_snd_buf, p); -+ encode_compound_hdr(&xdr, req, &hdr); -+ encode_sequence(&xdr, &args->seq_args, &hdr); -+ encode_putfh(&xdr, args->fh, &hdr); -+ encode_write(&xdr, args, &hdr); -+ encode_nops(&hdr); -+ return 0; -+} -+ -+/* -+ * Encode a pNFS File Layout Data Server COMMIT request -+ */ -+static int nfs4_xdr_enc_dscommit(struct rpc_rqst *req, uint32_t *p, -+ struct nfs_writeargs *args) -+{ -+ struct xdr_stream xdr; -+ struct compound_hdr hdr = { -+ .minorversion = nfs4_xdr_minorversion(&args->seq_args), -+ }; -+ -+ xdr_init_encode(&xdr, &req->rq_snd_buf, p); -+ encode_compound_hdr(&xdr, req, &hdr); -+ encode_sequence(&xdr, &args->seq_args, &hdr); -+ encode_putfh(&xdr, args->fh, &hdr); -+ encode_commit(&xdr, args, &hdr); -+ encode_nops(&hdr); -+ return 0; -+} - #endif /* CONFIG_NFS_V4_1 */ - - static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) -@@ -2643,14 +3065,17 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) - goto out_overflow; - bmlen = be32_to_cpup(p); - -- bitmap[0] = bitmap[1] = 0; -+ bitmap[0] = bitmap[1] = bitmap[2] = 0; - p = xdr_inline_decode(xdr, (bmlen << 2)); - if (unlikely(!p)) - goto out_overflow; - if (bmlen > 0) { - bitmap[0] = be32_to_cpup(p++); -- if (bmlen > 1) -- bitmap[1] = be32_to_cpup(p); -+ if (bmlen > 1) { -+ bitmap[1] = be32_to_cpup(p++); -+ if (bmlen > 2) -+ bitmap[2] = be32_to_cpup(p); -+ } - } - return 0; - out_overflow: -@@ -2679,8 +3104,9 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3 - decode_attr_bitmap(xdr, bitmask); - bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS; - } else -- bitmask[0] = bitmask[1] = 0; -- dprintk("%s: bitmask=%08x:%08x\n", 
__func__, bitmask[0], bitmask[1]); -+ bitmask[0] = bitmask[1] = bitmask[2] = 0; -+ dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__, -+ bitmask[0], bitmask[1], bitmask[2]); - return 0; - } - -@@ -3665,7 +4091,7 @@ out_overflow: - static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) - { - __be32 *savep; -- uint32_t attrlen, bitmap[2] = {0}; -+ uint32_t attrlen, bitmap[3] = {0}; - int status; - - if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) -@@ -3691,7 +4117,7 @@ xdr_error: - static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat) - { - __be32 *savep; -- uint32_t attrlen, bitmap[2] = {0}; -+ uint32_t attrlen, bitmap[3] = {0}; - int status; - - if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) -@@ -3723,7 +4149,7 @@ xdr_error: - static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf) - { - __be32 *savep; -- uint32_t attrlen, bitmap[2] = {0}; -+ uint32_t attrlen, bitmap[3] = {0}; - int status; - - if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) -@@ -3749,7 +4175,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, - { - __be32 *savep; - uint32_t attrlen, -- bitmap[2] = {0}, -+ bitmap[3] = {0}, - type; - int status; - umode_t fmode = 0; -@@ -3868,11 +4294,87 @@ xdr_error: - return status; - } - -+/* -+ * Decode potentially multiple layout types. Currently we only support -+ * one layout driver per file system. 
-+ */ -+static int decode_first_pnfs_layout_type(struct xdr_stream *xdr, -+ uint32_t *layouttype) -+{ -+ uint32_t *p; -+ int num; -+ -+ p = xdr_inline_decode(xdr, 4); -+ if (unlikely(!p)) -+ goto out_overflow; -+ num = be32_to_cpup(p); -+ -+ /* pNFS is not supported by the underlying file system */ -+ if (num == 0) { -+ *layouttype = 0; -+ return 0; -+ } -+ if (num > 1) -+ printk(KERN_INFO "%s: Warning: Multiple pNFS layout drivers " -+ "per filesystem not supported\n", __func__); -+ -+ /* Decode and set first layout type, move xdr->p past unused types */ -+ p = xdr_inline_decode(xdr, num * 4); -+ if (unlikely(!p)) -+ goto out_overflow; -+ *layouttype = be32_to_cpup(p); -+ return 0; -+out_overflow: -+ print_overflow_msg(__func__, xdr); -+ return -EIO; -+} -+ -+/* -+ * The type of file system exported. -+ * Note we must ensure that layouttype is set in any non-error case. -+ */ -+static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, -+ uint32_t *layouttype) -+{ -+ int status = 0; -+ -+ dprintk("%s: bitmap is %x\n", __func__, bitmap[1]); -+ if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U))) -+ return -EIO; -+ if (bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES) { -+ status = decode_first_pnfs_layout_type(xdr, layouttype); -+ bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES; -+ } else -+ *layouttype = 0; -+ return status; -+} -+ -+/* -+ * The prefered block size for layout directed io -+ */ -+static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap, -+ uint32_t *res) -+{ -+ __be32 *p; -+ -+ dprintk("%s: bitmap is %x\n", __func__, bitmap[2]); -+ *res = 0; -+ if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) { -+ p = xdr_inline_decode(xdr, 4); -+ if (unlikely(!p)) { -+ print_overflow_msg(__func__, xdr); -+ return -EIO; -+ } -+ *res = be32_to_cpup(p); -+ bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE; -+ } -+ return 0; -+} - - static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) - { - __be32 *savep; -- uint32_t 
attrlen, bitmap[2]; -+ uint32_t attrlen, bitmap[3]; - int status; - - if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) -@@ -3894,6 +4396,12 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) - if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0) - goto xdr_error; - fsinfo->wtpref = fsinfo->wtmax; -+ status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype); -+ if (status) -+ goto xdr_error; -+ status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize); -+ if (status) -+ goto xdr_error; - - status = verify_attr_len(xdr, savep, attrlen); - xdr_error: -@@ -4382,7 +4890,7 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, - { - __be32 *savep; - uint32_t attrlen, -- bitmap[2] = {0}; -+ bitmap[3] = {0}; - struct kvec *iov = req->rq_rcv_buf.head; - int status; - -@@ -4731,16 +5239,238 @@ out_overflow: - #endif /* CONFIG_NFS_V4_1 */ - } - -+#if defined(CONFIG_NFS_V4_1) - /* -- * END OF "GENERIC" DECODE ROUTINES. 
-- */ -- --/* -- * Decode OPEN_DOWNGRADE response -+ * TODO: Need to handle case when EOF != true; - */ --static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res) -+static int decode_getdevicelist(struct xdr_stream *xdr, -+ struct pnfs_devicelist *res) - { -- struct xdr_stream xdr; -+ __be32 *p; -+ int status, i; -+ struct nfs_writeverf verftemp; -+ -+ status = decode_op_hdr(xdr, OP_GETDEVICELIST); -+ if (status) -+ return status; -+ -+ p = xdr_inline_decode(xdr, 8 + 8 + 4); -+ if (unlikely(!p)) -+ goto out_overflow; -+ -+ /* TODO: Skip cookie for now */ -+ p += 2; -+ -+ /* Read verifier */ -+ p = xdr_decode_opaque_fixed(p, verftemp.verifier, 8); -+ -+ res->num_devs = be32_to_cpup(p); -+ -+ dprintk("%s: num_dev %d\n", __func__, res->num_devs); -+ -+ if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) -+ return -NFS4ERR_REP_TOO_BIG; -+ -+ p = xdr_inline_decode(xdr, -+ res->num_devs * NFS4_DEVICEID4_SIZE + 4); -+ if (unlikely(!p)) -+ goto out_overflow; -+ for (i = 0; i < res->num_devs; i++) -+ p = xdr_decode_opaque_fixed(p, res->dev_id[i].data, -+ NFS4_DEVICEID4_SIZE); -+ res->eof = be32_to_cpup(p); -+ return 0; -+out_overflow: -+ print_overflow_msg(__func__, xdr); -+ return -EIO; -+} -+ -+static int decode_getdeviceinfo(struct xdr_stream *xdr, -+ struct pnfs_device *pdev) -+{ -+ __be32 *p; -+ uint32_t len, type; -+ int status; -+ -+ status = decode_op_hdr(xdr, OP_GETDEVICEINFO); -+ if (status) { -+ if (status == -ETOOSMALL) { -+ p = xdr_inline_decode(xdr, 4); -+ if (unlikely(!p)) -+ goto out_overflow; -+ pdev->mincount = be32_to_cpup(p); -+ dprintk("%s: Min count too small. 
mincnt = %u\n", -+ __func__, pdev->mincount); -+ } -+ return status; -+ } -+ -+ p = xdr_inline_decode(xdr, 8); -+ if (unlikely(!p)) -+ goto out_overflow; -+ type = be32_to_cpup(p++); -+ if (type != pdev->layout_type) { -+ dprintk("%s: layout mismatch req: %u pdev: %u\n", -+ __func__, pdev->layout_type, type); -+ return -EINVAL; -+ } -+ /* -+ * Get the length of the opaque device_addr4. xdr_read_pages places -+ * the opaque device_addr4 in the xdr_buf->pages (pnfs_device->pages) -+ * and places the remaining xdr data in xdr_buf->tail -+ */ -+ pdev->mincount = be32_to_cpup(p); -+ xdr_read_pages(xdr, pdev->mincount); /* include space for the length */ -+ -+ /* Parse notification bitmap, verifying that it is zero. */ -+ p = xdr_inline_decode(xdr, 4); -+ if (unlikely(!p)) -+ goto out_overflow; -+ len = be32_to_cpup(p); -+ if (len) { -+ int i; -+ -+ p = xdr_inline_decode(xdr, 4 * len); -+ if (unlikely(!p)) -+ goto out_overflow; -+ for (i = 0; i < len; i++, p++) { -+ if (be32_to_cpup(p)) { -+ dprintk("%s: notifications not supported\n", -+ __func__); -+ return -EIO; -+ } -+ } -+ } -+ return 0; -+out_overflow: -+ print_overflow_msg(__func__, xdr); -+ return -EIO; -+} -+ -+static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, -+ struct nfs4_layoutget_res *res) -+{ -+ __be32 *p; -+ int status; -+ u32 layout_count; -+ -+ status = decode_op_hdr(xdr, OP_LAYOUTGET); -+ if (status) -+ return status; -+ p = xdr_inline_decode(xdr, 8 + NFS4_STATEID_SIZE); -+ if (unlikely(!p)) -+ goto out_overflow; -+ res->return_on_close = be32_to_cpup(p++); -+ p = xdr_decode_opaque_fixed(p, res->stateid.data, NFS4_STATEID_SIZE); -+ layout_count = be32_to_cpup(p); -+ if (!layout_count) { -+ dprintk("%s: server responded with empty layout array\n", -+ __func__); -+ return -EINVAL; -+ } -+ -+ p = xdr_inline_decode(xdr, 24); -+ if (unlikely(!p)) -+ goto out_overflow; -+ p = xdr_decode_hyper(p, &res->range.offset); -+ p = xdr_decode_hyper(p, &res->range.length); -+ res->range.iomode 
= be32_to_cpup(p++); -+ res->type = be32_to_cpup(p++); -+ -+ status = decode_opaque_inline(xdr, &res->layout.len, (char **)&p); -+ if (unlikely(status)) -+ return status; -+ -+ dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n", -+ __func__, -+ (unsigned long)res->range.offset, -+ (unsigned long)res->range.length, -+ res->range.iomode, -+ res->type, -+ res->layout.len); -+ -+ /* nfs4_proc_layoutget allocated a single page */ -+ if (res->layout.len > PAGE_SIZE) -+ return -ENOMEM; -+ memcpy(res->layout.buf, p, res->layout.len); -+ -+ if (layout_count > 1) { -+ /* We only handle a length one array at the moment. Any -+ * further entries are just ignored. Note that this means -+ * the client may see a response that is less than the -+ * minimum it requested. -+ */ -+ dprintk("%s: server responded with %d layouts, dropping tail\n", -+ __func__, layout_count); -+ } -+ -+ return 0; -+out_overflow: -+ print_overflow_msg(__func__, xdr); -+ return -EIO; -+} -+ -+static int decode_layoutreturn(struct xdr_stream *xdr, -+ struct nfs4_layoutreturn_res *res) -+{ -+ __be32 *p; -+ int status; -+ -+ status = decode_op_hdr(xdr, OP_LAYOUTRETURN); -+ if (status) -+ return status; -+ p = xdr_inline_decode(xdr, 4); -+ if (unlikely(!p)) -+ goto out_overflow; -+ res->valid = true; -+ res->lrs_present = be32_to_cpup(p); -+ if (res->lrs_present) -+ status = decode_stateid(xdr, &res->stateid); -+ return status; -+out_overflow: -+ print_overflow_msg(__func__, xdr); -+ return -EIO; -+} -+ -+static int decode_layoutcommit(struct xdr_stream *xdr, -+ struct rpc_rqst *req, -+ struct nfs4_layoutcommit_res *res) -+{ -+ __be32 *p; -+ int status; -+ -+ status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT); -+ if (status) -+ return status; -+ -+ p = xdr_inline_decode(xdr, 4); -+ if (unlikely(!p)) -+ goto out_overflow; -+ res->sizechanged = be32_to_cpup(p); -+ -+ if (res->sizechanged) { -+ p = xdr_inline_decode(xdr, 8); -+ if (unlikely(!p)) -+ goto out_overflow; -+ xdr_decode_hyper(p, 
&res->newsize); -+ } -+ return 0; -+out_overflow: -+ print_overflow_msg(__func__, xdr); -+ return -EIO; -+} -+#endif /* CONFIG_NFS_V4_1 */ -+ -+/* -+ * END OF "GENERIC" DECODE ROUTINES. -+ */ -+ -+/* -+ * Decode OPEN_DOWNGRADE response -+ */ -+static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res) -+{ -+ struct xdr_stream xdr; - struct compound_hdr hdr; - int status; - -@@ -5758,6 +6488,186 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p, - status = decode_reclaim_complete(&xdr, (void *)NULL); - return status; - } -+ -+/* -+ * Decode GETDEVICELIST response -+ */ -+static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp, uint32_t *p, -+ struct nfs4_getdevicelist_res *res) -+{ -+ struct xdr_stream xdr; -+ struct compound_hdr hdr; -+ int status; -+ -+ dprintk("encoding getdevicelist!\n"); -+ -+ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -+ status = decode_compound_hdr(&xdr, &hdr); -+ if (status != 0) -+ goto out; -+ status = decode_sequence(&xdr, &res->seq_res, rqstp); -+ if (status != 0) -+ goto out; -+ status = decode_putfh(&xdr); -+ if (status != 0) -+ goto out; -+ status = decode_getdevicelist(&xdr, res->devlist); -+out: -+ return status; -+} -+ -+/* -+ * Decode GETDEVINFO response -+ */ -+static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, uint32_t *p, -+ struct nfs4_getdeviceinfo_res *res) -+{ -+ struct xdr_stream xdr; -+ struct compound_hdr hdr; -+ int status; -+ -+ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -+ status = decode_compound_hdr(&xdr, &hdr); -+ if (status != 0) -+ goto out; -+ status = decode_sequence(&xdr, &res->seq_res, rqstp); -+ if (status != 0) -+ goto out; -+ status = decode_getdeviceinfo(&xdr, res->pdev); -+out: -+ return status; -+} -+ -+/* -+ * Decode LAYOUTGET response -+ */ -+static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, uint32_t *p, -+ struct nfs4_layoutget_res *res) -+{ -+ struct xdr_stream xdr; -+ struct compound_hdr hdr; -+ int 
status; -+ -+ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -+ status = decode_compound_hdr(&xdr, &hdr); -+ if (status) -+ goto out; -+ status = decode_sequence(&xdr, &res->seq_res, rqstp); -+ if (status) -+ goto out; -+ status = decode_putfh(&xdr); -+ if (status) -+ goto out; -+ status = decode_layoutget(&xdr, rqstp, res); -+out: -+ return status; -+} -+ -+/* -+ * Decode LAYOUTRETURN response -+ */ -+static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp, uint32_t *p, -+ struct nfs4_layoutreturn_res *res) -+{ -+ struct xdr_stream xdr; -+ struct compound_hdr hdr; -+ int status; -+ -+ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -+ status = decode_compound_hdr(&xdr, &hdr); -+ if (status) -+ goto out; -+ status = decode_sequence(&xdr, &res->seq_res, rqstp); -+ if (status) -+ goto out; -+ status = decode_putfh(&xdr); -+ if (status) -+ goto out; -+ status = decode_layoutreturn(&xdr, res); -+out: -+ return status; -+} -+ -+/* -+ * Decode LAYOUTCOMMIT response -+ */ -+static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp, uint32_t *p, -+ struct nfs4_layoutcommit_res *res) -+{ -+ struct xdr_stream xdr; -+ struct compound_hdr hdr; -+ int status; -+ -+ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -+ status = decode_compound_hdr(&xdr, &hdr); -+ if (status) -+ goto out; -+ status = decode_sequence(&xdr, &res->seq_res, rqstp); -+ if (status) -+ goto out; -+ status = decode_putfh(&xdr); -+ if (status) -+ goto out; -+ status = decode_layoutcommit(&xdr, rqstp, res); -+ if (status) -+ goto out; -+ decode_getfattr(&xdr, res->fattr, res->server, -+ !RPC_IS_ASYNC(rqstp->rq_task)); -+out: -+ return status; -+} -+ -+/* -+ * Decode pNFS File Layout Data Server WRITE response -+ */ -+static int nfs4_xdr_dec_dswrite(struct rpc_rqst *rqstp, uint32_t *p, -+ struct nfs_writeres *res) -+{ -+ struct xdr_stream xdr; -+ struct compound_hdr hdr; -+ int status; -+ -+ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -+ status = decode_compound_hdr(&xdr, &hdr); -+ if (status) -+ goto out; 
-+ status = decode_sequence(&xdr, &res->seq_res, rqstp); -+ if (status) -+ goto out; -+ status = decode_putfh(&xdr); -+ if (status) -+ goto out; -+ status = decode_write(&xdr, res); -+ if (!status) -+ return res->count; -+out: -+ return status; -+} -+ -+/* -+ * Decode pNFS File Layout Data Server COMMIT response -+ */ -+static int nfs4_xdr_dec_dscommit(struct rpc_rqst *rqstp, uint32_t *p, -+ struct nfs_writeres *res) -+{ -+ struct xdr_stream xdr; -+ struct compound_hdr hdr; -+ int status; -+ -+ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -+ status = decode_compound_hdr(&xdr, &hdr); -+ if (status) -+ goto out; -+ status = decode_sequence(&xdr, &res->seq_res, rqstp); -+ if (status) -+ goto out; -+ status = decode_putfh(&xdr); -+ if (status) -+ goto out; -+ status = decode_commit(&xdr, res); -+out: -+ return status; -+} - #endif /* CONFIG_NFS_V4_1 */ - - __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) -@@ -5936,6 +6846,13 @@ struct rpc_procinfo nfs4_procedures[] = { - PROC(SEQUENCE, enc_sequence, dec_sequence), - PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), - PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), -+ PROC(GETDEVICELIST, enc_getdevicelist, dec_getdevicelist), -+ PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), -+ PROC(LAYOUTGET, enc_layoutget, dec_layoutget), -+ PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), -+ PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), -+ PROC(PNFS_WRITE, enc_dswrite, dec_dswrite), -+ PROC(PNFS_COMMIT, enc_dscommit, dec_dscommit), - #endif /* CONFIG_NFS_V4_1 */ - }; - -diff --git a/fs/nfs/objlayout/Kbuild b/fs/nfs/objlayout/Kbuild -new file mode 100644 -index 0000000..9addfe8 ---- /dev/null -+++ b/fs/nfs/objlayout/Kbuild -@@ -0,0 +1,11 @@ -+# -+# Makefile for the pNFS Objects Layout Driver kernel module -+# -+objlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o objio_osd.o -+obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o -+ -+# -+# Panasas pNFS 
Layout Driver kernel module -+# -+panlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o panfs_shim.o -+obj-$(CONFIG_PNFS_PANLAYOUT) += panlayoutdriver.o -diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c -new file mode 100644 -index 0000000..e945ace ---- /dev/null -+++ b/fs/nfs/objlayout/objio_osd.c -@@ -0,0 +1,1060 @@ -+/* -+ * objio_osd.c -+ * -+ * pNFS Objects layout implementation over open-osd initiator library -+ * -+ * Copyright (C) 2009 Panasas Inc. -+ * All rights reserved. -+ * -+ * Benny Halevy -+ * Boaz Harrosh -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 -+ * See the file COPYING included with this distribution for more details. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. Neither the name of the Panasas company nor the names of its -+ * contributors may be used to endorse or promote products derived -+ * from this software without specific prior written permission. -+ * -+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED -+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include "objlayout.h" -+ -+#define NFSDBG_FACILITY NFSDBG_PNFS_LD -+ -+#define _LLU(x) ((unsigned long long)x) -+ -+enum { BIO_MAX_PAGES_KMALLOC = -+ (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), -+}; -+ -+/* A per mountpoint struct currently for device cache */ -+struct objio_mount_type { -+ struct list_head dev_list; -+ spinlock_t dev_list_lock; -+}; -+ -+struct _dev_ent { -+ struct list_head list; -+ struct nfs4_deviceid d_id; -+ struct osd_dev *od; -+}; -+ -+static void _dev_list_remove_all(struct objio_mount_type *omt) -+{ -+ spin_lock(&omt->dev_list_lock); -+ -+ while (!list_empty(&omt->dev_list)) { -+ struct _dev_ent *de = list_entry(omt->dev_list.next, -+ struct _dev_ent, list); -+ -+ list_del_init(&de->list); -+ osduld_put_device(de->od); -+ kfree(de); -+ } -+ -+ spin_unlock(&omt->dev_list_lock); -+} -+ -+static struct osd_dev *___dev_list_find(struct objio_mount_type *omt, -+ struct nfs4_deviceid *d_id) -+{ -+ struct list_head *le; -+ -+ list_for_each(le, &omt->dev_list) { -+ struct _dev_ent *de = list_entry(le, struct _dev_ent, list); -+ -+ if (0 == memcmp(&de->d_id, d_id, sizeof(*d_id))) -+ return de->od; -+ } -+ -+ return NULL; -+} -+ -+static struct osd_dev *_dev_list_find(struct objio_mount_type *omt, -+ struct nfs4_deviceid *d_id) -+{ -+ struct osd_dev *od; -+ -+ spin_lock(&omt->dev_list_lock); -+ od = ___dev_list_find(omt, 
d_id); -+ spin_unlock(&omt->dev_list_lock); -+ return od; -+} -+ -+static int _dev_list_add(struct objio_mount_type *omt, -+ struct nfs4_deviceid *d_id, struct osd_dev *od) -+{ -+ struct _dev_ent *de = kzalloc(sizeof(*de), GFP_KERNEL); -+ -+ if (!de) -+ return -ENOMEM; -+ -+ spin_lock(&omt->dev_list_lock); -+ -+ if (___dev_list_find(omt, d_id)) { -+ kfree(de); -+ goto out; -+ } -+ -+ de->d_id = *d_id; -+ de->od = od; -+ list_add(&de->list, &omt->dev_list); -+ -+out: -+ spin_unlock(&omt->dev_list_lock); -+ return 0; -+} -+ -+struct objio_segment { -+ struct pnfs_osd_layout *layout; -+ -+ unsigned mirrors_p1; -+ unsigned stripe_unit; -+ unsigned group_width; /* Data stripe_units without integrity comps */ -+ u64 group_depth; -+ unsigned group_count; -+ -+ unsigned num_comps; -+ /* variable length */ -+ struct osd_dev *ods[1]; -+}; -+ -+struct objio_state; -+typedef ssize_t (*objio_done_fn)(struct objio_state *ios); -+ -+struct objio_state { -+ /* Generic layer */ -+ struct objlayout_io_state ol_state; -+ -+ struct objio_segment *objio_seg; -+ -+ struct kref kref; -+ objio_done_fn done; -+ void *private; -+ -+ unsigned long length; -+ unsigned numdevs; /* Actually used devs in this IO */ -+ /* A per-device variable array of size numdevs */ -+ struct _objio_per_comp { -+ struct bio *bio; -+ struct osd_request *or; -+ unsigned long length; -+ u64 offset; -+ unsigned dev; -+ } per_dev[]; -+}; -+ -+/* Send and wait for a get_device_info of devices in the layout, -+ then look them up with the osd_initiator library */ -+static struct osd_dev *_device_lookup(struct pnfs_layout_hdr *pnfslay, -+ struct objio_segment *objio_seg, unsigned comp) -+{ -+ struct pnfs_osd_layout *layout = objio_seg->layout; -+ struct pnfs_osd_deviceaddr *deviceaddr; -+ struct nfs4_deviceid *d_id; -+ struct osd_dev *od; -+ struct osd_dev_info odi; -+ struct objio_mount_type *omt = NFS_SERVER(pnfslay->inode)->pnfs_ld_data; -+ int err; -+ -+ d_id = &layout->olo_comps[comp].oc_object_id.oid_device_id; -+ 
-+ od = _dev_list_find(omt, d_id); -+ if (od) -+ return od; -+ -+ err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr); -+ if (unlikely(err)) { -+ dprintk("%s: objlayout_get_deviceinfo=>%d\n", __func__, err); -+ return ERR_PTR(err); -+ } -+ -+ odi.systemid_len = deviceaddr->oda_systemid.len; -+ if (odi.systemid_len > sizeof(odi.systemid)) { -+ err = -EINVAL; -+ goto out; -+ } else if (odi.systemid_len) -+ memcpy(odi.systemid, deviceaddr->oda_systemid.data, -+ odi.systemid_len); -+ odi.osdname_len = deviceaddr->oda_osdname.len; -+ odi.osdname = (u8 *)deviceaddr->oda_osdname.data; -+ -+ if (!odi.osdname_len && !odi.systemid_len) { -+ dprintk("%s: !odi.osdname_len && !odi.systemid_len\n", -+ __func__); -+ err = -ENODEV; -+ goto out; -+ } -+ -+ od = osduld_info_lookup(&odi); -+ if (unlikely(IS_ERR(od))) { -+ err = PTR_ERR(od); -+ dprintk("%s: osduld_info_lookup => %d\n", __func__, err); -+ goto out; -+ } -+ -+ _dev_list_add(omt, d_id, od); -+ -+out: -+ dprintk("%s: return=%d\n", __func__, err); -+ objlayout_put_deviceinfo(deviceaddr); -+ return err ? ERR_PTR(err) : od; -+} -+ -+static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, -+ struct objio_segment *objio_seg) -+{ -+ struct pnfs_osd_layout *layout = objio_seg->layout; -+ unsigned i, num_comps = layout->olo_num_comps; -+ int err; -+ -+ /* lookup all devices */ -+ for (i = 0; i < num_comps; i++) { -+ struct osd_dev *od; -+ -+ od = _device_lookup(pnfslay, objio_seg, i); -+ if (unlikely(IS_ERR(od))) { -+ err = PTR_ERR(od); -+ goto out; -+ } -+ objio_seg->ods[i] = od; -+ } -+ objio_seg->num_comps = num_comps; -+ err = 0; -+ -+out: -+ dprintk("%s: return=%d\n", __func__, err); -+ return err; -+} -+ -+static int _verify_data_map(struct pnfs_osd_layout *layout) -+{ -+ struct pnfs_osd_data_map *data_map = &layout->olo_map; -+ u64 stripe_length; -+ u32 group_width; -+ -+/* FIXME: Only raid0 for now. 
if not go through MDS */ -+ if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) { -+ printk(KERN_ERR "Only RAID_0 for now\n"); -+ return -ENOTSUPP; -+ } -+ if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) { -+ printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n", -+ data_map->odm_num_comps, data_map->odm_mirror_cnt); -+ return -EINVAL; -+ } -+ -+ if (data_map->odm_group_width) -+ group_width = data_map->odm_group_width; -+ else -+ group_width = data_map->odm_num_comps / -+ (data_map->odm_mirror_cnt + 1); -+ -+ stripe_length = (u64)data_map->odm_stripe_unit * group_width; -+ if (stripe_length >= (1ULL << 32)) { -+ printk(KERN_ERR "Total Stripe length(0x%llx)" -+ " >= 32bit is not supported\n", _LLU(stripe_length)); -+ return -ENOTSUPP; -+ } -+ -+ if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) { -+ printk(KERN_ERR "Stripe Unit(0x%llx)" -+ " must be Multples of PAGE_SIZE(0x%lx)\n", -+ _LLU(data_map->odm_stripe_unit), PAGE_SIZE); -+ return -ENOTSUPP; -+ } -+ -+ return 0; -+} -+ -+int objio_alloc_lseg(void **outp, -+ struct pnfs_layout_hdr *pnfslay, -+ struct pnfs_layout_segment *lseg, -+ struct pnfs_osd_layout *layout) -+{ -+ struct objio_segment *objio_seg; -+ int err; -+ -+ err = _verify_data_map(layout); -+ if (unlikely(err)) -+ return err; -+ -+ objio_seg = kzalloc(sizeof(*objio_seg) + -+ (layout->olo_num_comps - 1) * sizeof(objio_seg->ods[0]), -+ GFP_KERNEL); -+ if (!objio_seg) -+ return -ENOMEM; -+ -+ objio_seg->layout = layout; -+ err = objio_devices_lookup(pnfslay, objio_seg); -+ if (err) -+ goto free_seg; -+ -+ objio_seg->mirrors_p1 = layout->olo_map.odm_mirror_cnt + 1; -+ objio_seg->stripe_unit = layout->olo_map.odm_stripe_unit; -+ if (layout->olo_map.odm_group_width) { -+ objio_seg->group_width = layout->olo_map.odm_group_width; -+ objio_seg->group_depth = layout->olo_map.odm_group_depth; -+ objio_seg->group_count = layout->olo_map.odm_num_comps / -+ objio_seg->mirrors_p1 / -+ objio_seg->group_width; -+ } else { -+ 
objio_seg->group_width = layout->olo_map.odm_num_comps / -+ objio_seg->mirrors_p1; -+ objio_seg->group_depth = -1; -+ objio_seg->group_count = 1; -+ } -+ -+ *outp = objio_seg; -+ return 0; -+ -+free_seg: -+ dprintk("%s: Error: return %d\n", __func__, err); -+ kfree(objio_seg); -+ *outp = NULL; -+ return err; -+} -+ -+void objio_free_lseg(void *p) -+{ -+ struct objio_segment *objio_seg = p; -+ -+ kfree(objio_seg); -+} -+ -+int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp) -+{ -+ struct objio_segment *objio_seg = seg; -+ struct objio_state *ios; -+ const unsigned first_size = sizeof(*ios) + -+ objio_seg->num_comps * sizeof(ios->per_dev[0]); -+ const unsigned sec_size = objio_seg->num_comps * -+ sizeof(ios->ol_state.ioerrs[0]); -+ -+ dprintk("%s: num_comps=%d\n", __func__, objio_seg->num_comps); -+ ios = kzalloc(first_size + sec_size, GFP_KERNEL); -+ if (unlikely(!ios)) -+ return -ENOMEM; -+ -+ ios->objio_seg = objio_seg; -+ ios->ol_state.ioerrs = ((void *)ios) + first_size; -+ ios->ol_state.num_comps = objio_seg->num_comps; -+ -+ *outp = &ios->ol_state; -+ return 0; -+} -+ -+void objio_free_io_state(struct objlayout_io_state *ol_state) -+{ -+ struct objio_state *ios = container_of(ol_state, struct objio_state, -+ ol_state); -+ -+ kfree(ios); -+} -+ -+enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) -+{ -+ switch (oep) { -+ case OSD_ERR_PRI_NO_ERROR: -+ return (enum pnfs_osd_errno)0; -+ -+ case OSD_ERR_PRI_CLEAR_PAGES: -+ BUG_ON(1); -+ return 0; -+ -+ case OSD_ERR_PRI_RESOURCE: -+ return PNFS_OSD_ERR_RESOURCE; -+ case OSD_ERR_PRI_BAD_CRED: -+ return PNFS_OSD_ERR_BAD_CRED; -+ case OSD_ERR_PRI_NO_ACCESS: -+ return PNFS_OSD_ERR_NO_ACCESS; -+ case OSD_ERR_PRI_UNREACHABLE: -+ return PNFS_OSD_ERR_UNREACHABLE; -+ case OSD_ERR_PRI_NOT_FOUND: -+ return PNFS_OSD_ERR_NOT_FOUND; -+ case OSD_ERR_PRI_NO_SPACE: -+ return PNFS_OSD_ERR_NO_SPACE; -+ default: -+ WARN_ON(1); -+ /* fallthrough */ -+ case OSD_ERR_PRI_EIO: -+ return PNFS_OSD_ERR_EIO; -+ 
} -+} -+ -+static void _clear_bio(struct bio *bio) -+{ -+ struct bio_vec *bv; -+ unsigned i; -+ -+ __bio_for_each_segment(bv, bio, i, 0) { -+ unsigned this_count = bv->bv_len; -+ -+ if (likely(PAGE_SIZE == this_count)) -+ clear_highpage(bv->bv_page); -+ else -+ zero_user(bv->bv_page, bv->bv_offset, this_count); -+ } -+} -+ -+static int _io_check(struct objio_state *ios, bool is_write) -+{ -+ enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR; -+ int lin_ret = 0; -+ int i; -+ -+ for (i = 0; i < ios->numdevs; i++) { -+ struct osd_sense_info osi; -+ struct osd_request *or = ios->per_dev[i].or; -+ int ret; -+ -+ if (!or) -+ continue; -+ -+ ret = osd_req_decode_sense(or, &osi); -+ if (likely(!ret)) -+ continue; -+ -+ if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { -+ /* start read offset passed endof file */ -+ BUG_ON(is_write); -+ _clear_bio(ios->per_dev[i].bio); -+ dprintk("%s: start read offset passed end of file " -+ "offset=0x%llx, length=0x%lx\n", __func__, -+ _LLU(ios->per_dev[i].offset), -+ ios->per_dev[i].length); -+ -+ continue; /* we recovered */ -+ } -+ objlayout_io_set_result(&ios->ol_state, ios->per_dev[i].dev, -+ osd_pri_2_pnfs_err(osi.osd_err_pri), -+ ios->per_dev[i].offset, -+ ios->per_dev[i].length, -+ is_write); -+ -+ if (osi.osd_err_pri >= oep) { -+ oep = osi.osd_err_pri; -+ lin_ret = ret; -+ } -+ } -+ -+ return lin_ret; -+} -+ -+/* -+ * Common IO state helpers. 
-+ */ -+static void _io_free(struct objio_state *ios) -+{ -+ unsigned i; -+ -+ for (i = 0; i < ios->numdevs; i++) { -+ struct _objio_per_comp *per_dev = &ios->per_dev[i]; -+ -+ if (per_dev->or) { -+ osd_end_request(per_dev->or); -+ per_dev->or = NULL; -+ } -+ -+ if (per_dev->bio) { -+ bio_put(per_dev->bio); -+ per_dev->bio = NULL; -+ } -+ } -+} -+ -+struct osd_dev * _io_od(struct objio_state *ios, unsigned dev) -+{ -+ unsigned min_dev = ios->objio_seg->layout->olo_comps_index; -+ unsigned max_dev = min_dev + ios->ol_state.num_comps; -+ -+ BUG_ON(dev < min_dev || max_dev <= dev); -+ return ios->objio_seg->ods[dev - min_dev]; -+} -+ -+struct _striping_info { -+ u64 obj_offset; -+ u64 group_length; -+ u64 total_group_length; -+ u64 Major; -+ unsigned dev; -+ unsigned unit_off; -+}; -+ -+static void _calc_stripe_info(struct objio_state *ios, u64 file_offset, -+ struct _striping_info *si) -+{ -+ u32 stripe_unit = ios->objio_seg->stripe_unit; -+ u32 group_width = ios->objio_seg->group_width; -+ u64 group_depth = ios->objio_seg->group_depth; -+ u32 U = stripe_unit * group_width; -+ -+ u64 T = U * group_depth; -+ u64 S = T * ios->objio_seg->group_count; -+ u64 M = div64_u64(file_offset, S); -+ -+ /* -+ G = (L - (M * S)) / T -+ H = (L - (M * S)) % T -+ */ -+ u64 LmodU = file_offset - M * S; -+ u32 G = div64_u64(LmodU, T); -+ u64 H = LmodU - G * T; -+ -+ u32 N = div_u64(H, U); -+ -+ div_u64_rem(file_offset, stripe_unit, &si->unit_off); -+ si->obj_offset = si->unit_off + (N * stripe_unit) + -+ (M * group_depth * stripe_unit); -+ -+ /* "H - (N * U)" is just "H % U" so it's bound to u32 */ -+ si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; -+ si->dev *= ios->objio_seg->mirrors_p1; -+ -+ si->group_length = T - H; -+ si->total_group_length = T; -+ si->Major = M; -+} -+ -+static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, -+ unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len) -+{ -+ unsigned pg = *cur_pg; -+ struct request_queue *q = -+ 
osd_request_queue(_io_od(ios, per_dev->dev)); -+ -+ per_dev->length += cur_len; -+ -+ if (per_dev->bio == NULL) { -+ unsigned stripes = ios->ol_state.num_comps / -+ ios->objio_seg->mirrors_p1; -+ unsigned pages_in_stripe = stripes * -+ (ios->objio_seg->stripe_unit / PAGE_SIZE); -+ unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) / -+ stripes; -+ -+ per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); -+ if (unlikely(!per_dev->bio)) { -+ dprintk("Faild to allocate BIO size=%u\n", bio_size); -+ return -ENOMEM; -+ } -+ } -+ -+ while (cur_len > 0) { -+ unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); -+ unsigned added_len; -+ -+ BUG_ON(ios->ol_state.nr_pages <= pg); -+ cur_len -= pglen; -+ -+ added_len = bio_add_pc_page(q, per_dev->bio, -+ ios->ol_state.pages[pg], pglen, pgbase); -+ if (unlikely(pglen != added_len)) -+ return -ENOMEM; -+ pgbase = 0; -+ ++pg; -+ } -+ BUG_ON(cur_len); -+ -+ *cur_pg = pg; -+ return 0; -+} -+ -+static int _prepare_one_group(struct objio_state *ios, u64 length, -+ struct _striping_info *si, unsigned first_comp, -+ unsigned *last_pg) -+{ -+ unsigned stripe_unit = ios->objio_seg->stripe_unit; -+ unsigned mirrors_p1 = ios->objio_seg->mirrors_p1; -+ unsigned devs_in_group = ios->objio_seg->group_width * mirrors_p1; -+ unsigned dev = si->dev; -+ unsigned first_dev = dev - (dev % devs_in_group); -+ unsigned comp = first_comp + (dev - first_dev); -+ unsigned max_comp = ios->numdevs ? 
ios->numdevs - mirrors_p1 : 0; -+ unsigned cur_pg = *last_pg; -+ int ret = 0; -+ -+ while (length) { -+ struct _objio_per_comp *per_dev = &ios->per_dev[comp]; -+ unsigned cur_len, page_off = 0; -+ -+ if (!per_dev->length) { -+ per_dev->dev = dev; -+ if (dev < si->dev) { -+ per_dev->offset = si->obj_offset + stripe_unit - -+ si->unit_off; -+ cur_len = stripe_unit; -+ } else if (dev == si->dev) { -+ per_dev->offset = si->obj_offset; -+ cur_len = stripe_unit - si->unit_off; -+ page_off = si->unit_off & ~PAGE_MASK; -+ BUG_ON(page_off && -+ (page_off != ios->ol_state.pgbase)); -+ } else { /* dev > si->dev */ -+ per_dev->offset = si->obj_offset - si->unit_off; -+ cur_len = stripe_unit; -+ } -+ -+ if (max_comp < comp) -+ max_comp = comp; -+ -+ dev += mirrors_p1; -+ dev = (dev % devs_in_group) + first_dev; -+ } else { -+ cur_len = stripe_unit; -+ } -+ if (cur_len >= length) -+ cur_len = length; -+ -+ ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, -+ cur_len); -+ if (unlikely(ret)) -+ goto out; -+ -+ comp += mirrors_p1; -+ comp = (comp % devs_in_group) + first_comp; -+ -+ length -= cur_len; -+ ios->length += cur_len; -+ } -+out: -+ ios->numdevs = max_comp + mirrors_p1; -+ *last_pg = cur_pg; -+ return ret; -+} -+ -+static int _io_rw_pagelist(struct objio_state *ios) -+{ -+ u64 length = ios->ol_state.count; -+ struct _striping_info si; -+ unsigned devs_in_group = ios->objio_seg->group_width * -+ ios->objio_seg->mirrors_p1; -+ unsigned first_comp = 0; -+ unsigned num_comps = ios->objio_seg->layout->olo_map.odm_num_comps; -+ unsigned last_pg = 0; -+ int ret = 0; -+ -+ _calc_stripe_info(ios, ios->ol_state.offset, &si); -+ while (length) { -+ if (length < si.group_length) -+ si.group_length = length; -+ -+ ret = _prepare_one_group(ios, si.group_length, &si, first_comp, -+ &last_pg); -+ if (unlikely(ret)) -+ goto out; -+ -+ length -= si.group_length; -+ -+ si.group_length = si.total_group_length; -+ si.unit_off = 0; -+ ++si.Major; -+ si.obj_offset = si.Major * 
ios->objio_seg->stripe_unit * -+ ios->objio_seg->group_depth; -+ -+ si.dev = (si.dev - (si.dev % devs_in_group)) + devs_in_group; -+ si.dev %= num_comps; -+ -+ first_comp += devs_in_group; -+ first_comp %= num_comps; -+ } -+ -+out: -+ if (!ios->length) -+ return ret; -+ -+ return 0; -+} -+ -+static ssize_t _sync_done(struct objio_state *ios) -+{ -+ struct completion *waiting = ios->private; -+ -+ complete(waiting); -+ return 0; -+} -+ -+static void _last_io(struct kref *kref) -+{ -+ struct objio_state *ios = container_of(kref, struct objio_state, kref); -+ -+ ios->done(ios); -+} -+ -+static void _done_io(struct osd_request *or, void *p) -+{ -+ struct objio_state *ios = p; -+ -+ kref_put(&ios->kref, _last_io); -+} -+ -+static ssize_t _io_exec(struct objio_state *ios) -+{ -+ DECLARE_COMPLETION_ONSTACK(wait); -+ ssize_t status = 0; /* sync status */ -+ unsigned i; -+ objio_done_fn saved_done_fn = ios->done; -+ bool sync = ios->ol_state.sync; -+ -+ if (sync) { -+ ios->done = _sync_done; -+ ios->private = &wait; -+ } -+ -+ kref_init(&ios->kref); -+ -+ for (i = 0; i < ios->numdevs; i++) { -+ struct osd_request *or = ios->per_dev[i].or; -+ -+ if (!or) -+ continue; -+ -+ kref_get(&ios->kref); -+ osd_execute_request_async(or, _done_io, ios); -+ } -+ -+ kref_put(&ios->kref, _last_io); -+ -+ if (sync) { -+ wait_for_completion(&wait); -+ status = saved_done_fn(ios); -+ } -+ -+ return status; -+} -+ -+/* -+ * read -+ */ -+static ssize_t _read_done(struct objio_state *ios) -+{ -+ ssize_t status; -+ int ret = _io_check(ios, false); -+ -+ _io_free(ios); -+ -+ if (likely(!ret)) -+ status = ios->length; -+ else -+ status = ret; -+ -+ objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync); -+ return status; -+} -+ -+static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) -+{ -+ struct osd_request *or = NULL; -+ struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; -+ unsigned dev = per_dev->dev; -+ struct pnfs_osd_object_cred *cred = -+ 
&ios->objio_seg->layout->olo_comps[dev]; -+ struct osd_obj_id obj = { -+ .partition = cred->oc_object_id.oid_partition_id, -+ .id = cred->oc_object_id.oid_object_id, -+ }; -+ int ret; -+ -+ or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); -+ if (unlikely(!or)) { -+ ret = -ENOMEM; -+ goto err; -+ } -+ per_dev->or = or; -+ -+ osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length); -+ -+ ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); -+ if (ret) { -+ dprintk("%s: Faild to osd_finalize_request() => %d\n", -+ __func__, ret); -+ goto err; -+ } -+ -+ dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", -+ __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), -+ per_dev->length); -+ -+err: -+ return ret; -+} -+ -+static ssize_t _read_exec(struct objio_state *ios) -+{ -+ unsigned i; -+ int ret; -+ -+ for (i = 0; i < ios->numdevs; i += ios->objio_seg->mirrors_p1) { -+ if (!ios->per_dev[i].length) -+ continue; -+ ret = _read_mirrors(ios, i); -+ if (unlikely(ret)) -+ goto err; -+ } -+ -+ ios->done = _read_done; -+ return _io_exec(ios); /* In sync mode exec returns the io status */ -+ -+err: -+ _io_free(ios); -+ return ret; -+} -+ -+ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state) -+{ -+ struct objio_state *ios = container_of(ol_state, struct objio_state, -+ ol_state); -+ int ret; -+ -+ ret = _io_rw_pagelist(ios); -+ if (unlikely(ret)) -+ return ret; -+ -+ return _read_exec(ios); -+} -+ -+/* -+ * write -+ */ -+static ssize_t _write_done(struct objio_state *ios) -+{ -+ ssize_t status; -+ int ret = _io_check(ios, true); -+ -+ _io_free(ios); -+ -+ if (likely(!ret)) { -+ /* FIXME: should be based on the OSD's persistence model -+ * See OSD2r05 Section 4.13 Data persistence model */ -+ ios->ol_state.committed = NFS_UNSTABLE; //NFS_FILE_SYNC; -+ status = ios->length; -+ } else { -+ status = ret; -+ } -+ -+ objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync); -+ return status; -+} -+ -+static int 
_write_mirrors(struct objio_state *ios, unsigned cur_comp) -+{ -+ struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp]; -+ unsigned dev = ios->per_dev[cur_comp].dev; -+ unsigned last_comp = cur_comp + ios->objio_seg->mirrors_p1; -+ int ret; -+ -+ for (; cur_comp < last_comp; ++cur_comp, ++dev) { -+ struct osd_request *or = NULL; -+ struct pnfs_osd_object_cred *cred = -+ &ios->objio_seg->layout->olo_comps[dev]; -+ struct osd_obj_id obj = { -+ .partition = cred->oc_object_id.oid_partition_id, -+ .id = cred->oc_object_id.oid_object_id, -+ }; -+ struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; -+ struct bio *bio; -+ -+ or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); -+ if (unlikely(!or)) { -+ ret = -ENOMEM; -+ goto err; -+ } -+ per_dev->or = or; -+ -+ if (per_dev != master_dev) { -+ bio = bio_kmalloc(GFP_KERNEL, -+ master_dev->bio->bi_max_vecs); -+ if (unlikely(!bio)) { -+ dprintk("Faild to allocate BIO size=%u\n", -+ master_dev->bio->bi_max_vecs); -+ ret = -ENOMEM; -+ goto err; -+ } -+ -+ __bio_clone(bio, master_dev->bio); -+ bio->bi_bdev = NULL; -+ bio->bi_next = NULL; -+ per_dev->bio = bio; -+ per_dev->dev = dev; -+ per_dev->length = master_dev->length; -+ per_dev->offset = master_dev->offset; -+ } else { -+ bio = master_dev->bio; -+ /* FIXME: bio_set_dir() */ -+ bio->bi_rw |= REQ_WRITE; -+ } -+ -+ osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length); -+ -+ ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); -+ if (ret) { -+ dprintk("%s: Faild to osd_finalize_request() => %d\n", -+ __func__, ret); -+ goto err; -+ } -+ -+ dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", -+ __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), -+ per_dev->length); -+ } -+ -+err: -+ return ret; -+} -+ -+static ssize_t _write_exec(struct objio_state *ios) -+{ -+ unsigned i; -+ int ret; -+ -+ for (i = 0; i < ios->numdevs; i += ios->objio_seg->mirrors_p1) { -+ if (!ios->per_dev[i].length) -+ continue; -+ ret = _write_mirrors(ios, 
i); -+ if (unlikely(ret)) -+ goto err; -+ } -+ -+ ios->done = _write_done; -+ return _io_exec(ios); /* In sync mode exec returns the io->status */ -+ -+err: -+ _io_free(ios); -+ return ret; -+} -+ -+ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) -+{ -+ struct objio_state *ios = container_of(ol_state, struct objio_state, -+ ol_state); -+ int ret; -+ -+ /* TODO: ios->stable = stable; */ -+ ret = _io_rw_pagelist(ios); -+ if (unlikely(ret)) -+ return ret; -+ -+ return _write_exec(ios); -+} -+ -+/* -+ * Policy Operations -+ */ -+ -+/* -+ * Get the max [rw]size -+ */ -+static ssize_t -+objlayout_get_blocksize(void) -+{ -+ ssize_t sz = BIO_MAX_PAGES_KMALLOC * PAGE_SIZE; -+ -+ return sz; -+} -+ -+/* -+ * Don't gather across stripes, but rather gather (coalesce) up to -+ * the stripe size. -+ * -+ * FIXME: change interface to use merge_align, merge_count -+ */ -+static struct pnfs_layoutdriver_type objlayout_type = { -+ .id = LAYOUT_OSD2_OBJECTS, -+ .name = "LAYOUT_OSD2_OBJECTS", -+ .flags = PNFS_LAYOUTRET_ON_SETATTR, -+ -+ .initialize_mountpoint = objlayout_initialize_mountpoint, -+ .uninitialize_mountpoint = objlayout_uninitialize_mountpoint, -+ -+ .alloc_layout_hdr = objlayout_alloc_layout_hdr, -+ .free_layout_hdr = objlayout_free_layout_hdr, -+ -+ .alloc_lseg = objlayout_alloc_lseg, -+ .free_lseg = objlayout_free_lseg, -+ -+ .get_blocksize = objlayout_get_blocksize, -+ -+ .read_pagelist = objlayout_read_pagelist, -+ .write_pagelist = objlayout_write_pagelist, -+ .commit = objlayout_commit, -+ -+ .encode_layoutcommit = objlayout_encode_layoutcommit, -+ .encode_layoutreturn = objlayout_encode_layoutreturn, -+}; -+ -+void *objio_init_mt(void) -+{ -+ struct objio_mount_type *omt = kzalloc(sizeof(*omt), GFP_KERNEL); -+ -+ if (!omt) -+ return ERR_PTR(-ENOMEM); -+ -+ INIT_LIST_HEAD(&omt->dev_list); -+ spin_lock_init(&omt->dev_list_lock); -+ return omt; -+} -+ -+void objio_fini_mt(void *mountid) -+{ -+ _dev_list_remove_all(mountid); -+ 
kfree(mountid); -+} -+ -+MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects"); -+MODULE_AUTHOR("Benny Halevy "); -+MODULE_LICENSE("GPL"); -+ -+static int __init -+objlayout_init(void) -+{ -+ int ret = pnfs_register_layoutdriver(&objlayout_type); -+ -+ if (ret) -+ printk(KERN_INFO -+ "%s: Registering OSD pNFS Layout Driver failed: error=%d\n", -+ __func__, ret); -+ else -+ printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n", -+ __func__); -+ return ret; -+} -+ -+static void __exit -+objlayout_exit(void) -+{ -+ pnfs_unregister_layoutdriver(&objlayout_type); -+ printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n", -+ __func__); -+} -+ -+module_init(objlayout_init); -+module_exit(objlayout_exit); -diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c -new file mode 100644 -index 0000000..b647577 ---- /dev/null -+++ b/fs/nfs/objlayout/objlayout.c -@@ -0,0 +1,773 @@ -+/* -+ * objlayout.c -+ * -+ * pNFS layout driver for Panasas OSDs -+ * -+ * Copyright (C) 2007-2009 Panasas Inc. -+ * All rights reserved. -+ * -+ * Benny Halevy -+ * Boaz Harrosh -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 -+ * See the file COPYING included with this distribution for more details. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. 
Neither the name of the Panasas company nor the names of its -+ * contributors may be used to endorse or promote products derived -+ * from this software without specific prior written permission. -+ * -+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED -+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ */ -+ -+#include -+#include "objlayout.h" -+ -+#define NFSDBG_FACILITY NFSDBG_PNFS_LD -+ -+struct pnfs_client_operations *pnfs_client_ops; -+ -+/* -+ * Create a objlayout layout structure for the given inode and return it. -+ */ -+struct pnfs_layout_hdr * -+objlayout_alloc_layout_hdr(struct inode *inode) -+{ -+ struct objlayout *objlay; -+ -+ objlay = kzalloc(sizeof(struct objlayout), GFP_KERNEL); -+ if (objlay) { -+ spin_lock_init(&objlay->lock); -+ INIT_LIST_HEAD(&objlay->err_list); -+ } -+ dprintk("%s: Return %p\n", __func__, objlay); -+ return &objlay->pnfs_layout; -+} -+ -+/* -+ * Free an objlayout layout structure -+ */ -+void -+objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo) -+{ -+ struct objlayout *objlay = OBJLAYOUT(lo); -+ -+ dprintk("%s: objlay %p\n", __func__, objlay); -+ -+ WARN_ON(!list_empty(&objlay->err_list)); -+ kfree(objlay); -+} -+ -+/* -+ * Unmarshall layout and store it in pnfslay. 
-+ */ -+struct pnfs_layout_segment * -+objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay, -+ struct nfs4_layoutget_res *lgr) -+{ -+ int status; -+ void *layout = lgr->layout.buf; -+ struct objlayout_segment *objlseg; -+ struct pnfs_osd_layout *pnfs_osd_layout; -+ -+ dprintk("%s: Begin pnfslay %p layout %p\n", __func__, pnfslay, layout); -+ -+ BUG_ON(!layout); -+ -+ status = -ENOMEM; -+ objlseg = kzalloc(sizeof(*objlseg) + -+ pnfs_osd_layout_incore_sz(layout), GFP_KERNEL); -+ if (!objlseg) -+ goto err; -+ -+ pnfs_osd_layout = (struct pnfs_osd_layout *)objlseg->pnfs_osd_layout; -+ pnfs_osd_xdr_decode_layout(pnfs_osd_layout, layout); -+ -+ status = objio_alloc_lseg(&objlseg->internal, pnfslay, &objlseg->lseg, -+ pnfs_osd_layout); -+ if (status) -+ goto err; -+ -+ dprintk("%s: Return %p\n", __func__, &objlseg->lseg); -+ return &objlseg->lseg; -+ -+ err: -+ kfree(objlseg); -+ return ERR_PTR(status); -+} -+ -+/* -+ * Free a layout segement -+ */ -+void -+objlayout_free_lseg(struct pnfs_layout_segment *lseg) -+{ -+ struct objlayout_segment *objlseg; -+ -+ dprintk("%s: freeing layout segment %p\n", __func__, lseg); -+ -+ if (unlikely(!lseg)) -+ return; -+ -+ objlseg = container_of(lseg, struct objlayout_segment, lseg); -+ objio_free_lseg(objlseg->internal); -+ kfree(objlseg); -+} -+ -+/* -+ * I/O Operations -+ */ -+static inline u64 -+end_offset(u64 start, u64 len) -+{ -+ u64 end; -+ -+ end = start + len; -+ return end >= start ? end : NFS4_MAX_UINT64; -+} -+ -+/* last octet in a range */ -+static inline u64 -+last_byte_offset(u64 start, u64 len) -+{ -+ u64 end; -+ -+ BUG_ON(!len); -+ end = start + len; -+ return end > start ? 
end - 1 : NFS4_MAX_UINT64; -+} -+ -+static struct objlayout_io_state * -+objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, -+ struct page **pages, -+ unsigned pgbase, -+ unsigned nr_pages, -+ loff_t offset, -+ size_t count, -+ struct pnfs_layout_segment *lseg, -+ void *rpcdata) -+{ -+ struct objlayout_segment *objlseg = -+ container_of(lseg, struct objlayout_segment, lseg); -+ struct objlayout_io_state *state; -+ u64 lseg_end_offset; -+ size_t size_nr_pages; -+ -+ dprintk("%s: allocating io_state\n", __func__); -+ if (objio_alloc_io_state(objlseg->internal, &state)) -+ return NULL; -+ -+ BUG_ON(offset < lseg->range.offset); -+ lseg_end_offset = end_offset(lseg->range.offset, lseg->range.length); -+ BUG_ON(offset >= lseg_end_offset); -+ if (offset + count > lseg_end_offset) { -+ count = lseg->range.length - (offset - lseg->range.offset); -+ dprintk("%s: truncated count %Zd\n", __func__, count); -+ } -+ -+ if (pgbase > PAGE_SIZE) { -+ unsigned n = pgbase >> PAGE_SHIFT; -+ -+ pgbase &= ~PAGE_MASK; -+ pages += n; -+ nr_pages -= n; -+ } -+ -+ size_nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT; -+ BUG_ON(nr_pages < size_nr_pages); -+ if (nr_pages > size_nr_pages) -+ nr_pages = size_nr_pages; -+ -+ INIT_LIST_HEAD(&state->err_list); -+ state->objlseg = objlseg; -+ state->rpcdata = rpcdata; -+ state->pages = pages; -+ state->pgbase = pgbase; -+ state->nr_pages = nr_pages; -+ state->offset = offset; -+ state->count = count; -+ state->sync = 0; -+ -+ return state; -+} -+ -+static void -+objlayout_free_io_state(struct objlayout_io_state *state) -+{ -+ dprintk("%s: freeing io_state\n", __func__); -+ if (unlikely(!state)) -+ return; -+ -+ objio_free_io_state(state); -+} -+ -+/* -+ * I/O done common code -+ */ -+static void -+objlayout_iodone(struct objlayout_io_state *state) -+{ -+ dprintk("%s: state %p status\n", __func__, state); -+ -+ if (likely(state->status >= 0)) { -+ objlayout_free_io_state(state); -+ } else { -+ struct objlayout *objlay = 
OBJLAYOUT(state->objlseg->lseg.layout); -+ -+ spin_lock(&objlay->lock); -+ objlay->delta_space_valid = OBJ_DSU_INVALID; -+ list_add(&objlay->err_list, &state->err_list); -+ spin_unlock(&objlay->lock); -+ } -+} -+ -+/* -+ * objlayout_io_set_result - Set an osd_error code on a specific osd comp. -+ * -+ * The @index component IO failed (error returned from target). Register -+ * the error for later reporting at layout-return. -+ */ -+void -+objlayout_io_set_result(struct objlayout_io_state *state, unsigned index, -+ int osd_error, u64 offset, u64 length, bool is_write) -+{ -+ struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index]; -+ -+ BUG_ON(index >= state->num_comps); -+ if (osd_error) { -+ struct pnfs_osd_layout *layout = -+ (typeof(layout))state->objlseg->pnfs_osd_layout; -+ -+ ioerr->oer_component = layout->olo_comps[index].oc_object_id; -+ ioerr->oer_comp_offset = offset; -+ ioerr->oer_comp_length = length; -+ ioerr->oer_iswrite = is_write; -+ ioerr->oer_errno = osd_error; -+ -+ dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) " -+ "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n", -+ __func__, index, ioerr->oer_errno, -+ ioerr->oer_iswrite, -+ _DEVID_LO(&ioerr->oer_component.oid_device_id), -+ _DEVID_HI(&ioerr->oer_component.oid_device_id), -+ ioerr->oer_component.oid_partition_id, -+ ioerr->oer_component.oid_object_id, -+ ioerr->oer_comp_offset, -+ ioerr->oer_comp_length); -+ } else { -+ /* User need not call if no error is reported */ -+ ioerr->oer_errno = 0; -+ } -+} -+ -+static void _rpc_commit_complete(struct work_struct *work) -+{ -+ struct rpc_task *task; -+ struct nfs_write_data *wdata; -+ -+ dprintk("%s enter\n", __func__); -+ task = container_of(work, struct rpc_task, u.tk_work); -+ wdata = container_of(task, struct nfs_write_data, task); -+ -+ pnfs_commit_done(wdata); -+} -+ -+/* -+ * Commit data remotely on OSDs -+ */ -+enum pnfs_try_status -+objlayout_commit(struct nfs_write_data *wdata, int how) -+{ -+ int status = PNFS_ATTEMPTED; -+ 
-+ INIT_WORK(&wdata->task.u.tk_work, _rpc_commit_complete); -+ schedule_work(&wdata->task.u.tk_work); -+ dprintk("%s: Return %d\n", __func__, status); -+ return status; -+} -+ -+/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete(). -+ * This is because the osd completion is called with ints-off from -+ * the block layer -+ */ -+static void _rpc_read_complete(struct work_struct *work) -+{ -+ struct rpc_task *task; -+ struct nfs_read_data *rdata; -+ -+ dprintk("%s enter\n", __func__); -+ task = container_of(work, struct rpc_task, u.tk_work); -+ rdata = container_of(task, struct nfs_read_data, task); -+ -+ pnfs_read_done(rdata); -+} -+ -+void -+objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) -+{ -+ int eof = state->eof; -+ struct nfs_read_data *rdata; -+ -+ state->status = status; -+ dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof); -+ rdata = state->rpcdata; -+ rdata->task.tk_status = status; -+ if (status >= 0) { -+ rdata->res.count = status; -+ rdata->res.eof = eof; -+ } -+ objlayout_iodone(state); -+ /* must not use state after this point */ -+ -+ if (sync) -+ pnfs_read_done(rdata); -+ else { -+ INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete); -+ schedule_work(&rdata->task.u.tk_work); -+ } -+} -+ -+/* -+ * Perform sync or async reads. 
-+ */ -+enum pnfs_try_status -+objlayout_read_pagelist(struct nfs_read_data *rdata, unsigned nr_pages) -+{ -+ loff_t offset = rdata->args.offset; -+ size_t count = rdata->args.count; -+ struct objlayout_io_state *state; -+ ssize_t status = 0; -+ loff_t eof; -+ -+ dprintk("%s: Begin inode %p offset %llu count %d\n", -+ __func__, rdata->inode, offset, (int)count); -+ -+ eof = i_size_read(rdata->inode); -+ if (unlikely(offset + count > eof)) { -+ if (offset >= eof) { -+ status = 0; -+ rdata->res.count = 0; -+ rdata->res.eof = 1; -+ goto out; -+ } -+ count = eof - offset; -+ } -+ -+ state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout, -+ rdata->args.pages, rdata->args.pgbase, -+ nr_pages, offset, count, -+ rdata->pdata.lseg, rdata); -+ if (unlikely(!state)) { -+ status = -ENOMEM; -+ goto out; -+ } -+ -+ state->eof = state->offset + state->count >= eof; -+ -+ status = objio_read_pagelist(state); -+ out: -+ dprintk("%s: Return status %Zd\n", __func__, status); -+ rdata->pdata.pnfs_error = status; -+ return PNFS_ATTEMPTED; -+} -+ -+/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete(). 
-+ * This is because the osd completion is called with ints-off from -+ * the block layer -+ */ -+static void _rpc_write_complete(struct work_struct *work) -+{ -+ struct rpc_task *task; -+ struct nfs_write_data *wdata; -+ -+ dprintk("%s enter\n", __func__); -+ task = container_of(work, struct rpc_task, u.tk_work); -+ wdata = container_of(task, struct nfs_write_data, task); -+ -+ pnfs_writeback_done(wdata); -+} -+ -+void -+objlayout_write_done(struct objlayout_io_state *state, ssize_t status, -+ bool sync) -+{ -+ struct nfs_write_data *wdata; -+ -+ dprintk("%s: Begin\n", __func__); -+ wdata = state->rpcdata; -+ state->status = status; -+ wdata->task.tk_status = status; -+ if (status >= 0) { -+ wdata->res.count = status; -+ wdata->verf.committed = state->committed; -+ dprintk("%s: Return status %d committed %d\n", -+ __func__, wdata->task.tk_status, -+ wdata->verf.committed); -+ } else -+ dprintk("%s: Return status %d\n", -+ __func__, wdata->task.tk_status); -+ objlayout_iodone(state); -+ /* must not use state after this point */ -+ -+ if (sync) -+ pnfs_writeback_done(wdata); -+ else { -+ INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete); -+ schedule_work(&wdata->task.u.tk_work); -+ } -+} -+ -+/* -+ * Perform sync or async writes. 
-+ */ -+enum pnfs_try_status -+objlayout_write_pagelist(struct nfs_write_data *wdata, -+ unsigned nr_pages, -+ int how) -+{ -+ struct objlayout_io_state *state; -+ ssize_t status; -+ -+ dprintk("%s: Begin inode %p offset %llu count %u\n", -+ __func__, wdata->inode, wdata->args.offset, wdata->args.count); -+ -+ state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout, -+ wdata->args.pages, -+ wdata->args.pgbase, -+ nr_pages, -+ wdata->args.offset, -+ wdata->args.count, -+ wdata->pdata.lseg, wdata); -+ if (unlikely(!state)) { -+ status = -ENOMEM; -+ goto out; -+ } -+ -+ state->sync = how & FLUSH_SYNC; -+ -+ status = objio_write_pagelist(state, how & FLUSH_STABLE); -+ out: -+ dprintk("%s: Return status %Zd\n", __func__, status); -+ wdata->pdata.pnfs_error = status; -+ return PNFS_ATTEMPTED; -+} -+ -+void -+objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay, -+ struct xdr_stream *xdr, -+ const struct nfs4_layoutcommit_args *args) -+{ -+ struct objlayout *objlay = OBJLAYOUT(pnfslay); -+ struct pnfs_osd_layoutupdate lou; -+ __be32 *start; -+ -+ dprintk("%s: Begin\n", __func__); -+ -+ spin_lock(&objlay->lock); -+ lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID); -+ lou.dsu_delta = objlay->delta_space_used; -+ objlay->delta_space_used = 0; -+ objlay->delta_space_valid = OBJ_DSU_INIT; -+ lou.olu_ioerr_flag = !list_empty(&objlay->err_list); -+ spin_unlock(&objlay->lock); -+ -+ start = xdr_reserve_space(xdr, 4); -+ -+ BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou)); -+ -+ *start = cpu_to_be32((xdr->p - start - 1) * 4); -+ -+ dprintk("%s: Return delta_space_used %lld err %d\n", __func__, -+ lou.dsu_delta, lou.olu_ioerr_flag); -+} -+ -+static int -+err_prio(u32 oer_errno) -+{ -+ switch (oer_errno) { -+ case 0: -+ return 0; -+ -+ case PNFS_OSD_ERR_RESOURCE: -+ return OSD_ERR_PRI_RESOURCE; -+ case PNFS_OSD_ERR_BAD_CRED: -+ return OSD_ERR_PRI_BAD_CRED; -+ case PNFS_OSD_ERR_NO_ACCESS: -+ return OSD_ERR_PRI_NO_ACCESS; -+ case PNFS_OSD_ERR_UNREACHABLE: 
-+ return OSD_ERR_PRI_UNREACHABLE; -+ case PNFS_OSD_ERR_NOT_FOUND: -+ return OSD_ERR_PRI_NOT_FOUND; -+ case PNFS_OSD_ERR_NO_SPACE: -+ return OSD_ERR_PRI_NO_SPACE; -+ default: -+ WARN_ON(1); -+ /* fallthrough */ -+ case PNFS_OSD_ERR_EIO: -+ return OSD_ERR_PRI_EIO; -+ } -+} -+ -+static void -+merge_ioerr(struct pnfs_osd_ioerr *dest_err, -+ const struct pnfs_osd_ioerr *src_err) -+{ -+ u64 dest_end, src_end; -+ -+ if (!dest_err->oer_errno) { -+ *dest_err = *src_err; -+ /* accumulated device must be blank */ -+ memset(&dest_err->oer_component.oid_device_id, 0, -+ sizeof(dest_err->oer_component.oid_device_id)); -+ -+ return; -+ } -+ -+ if (dest_err->oer_component.oid_partition_id != -+ src_err->oer_component.oid_partition_id) -+ dest_err->oer_component.oid_partition_id = 0; -+ -+ if (dest_err->oer_component.oid_object_id != -+ src_err->oer_component.oid_object_id) -+ dest_err->oer_component.oid_object_id = 0; -+ -+ if (dest_err->oer_comp_offset > src_err->oer_comp_offset) -+ dest_err->oer_comp_offset = src_err->oer_comp_offset; -+ -+ dest_end = end_offset(dest_err->oer_comp_offset, -+ dest_err->oer_comp_length); -+ src_end = end_offset(src_err->oer_comp_offset, -+ src_err->oer_comp_length); -+ if (dest_end < src_end) -+ dest_end = src_end; -+ -+ dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset; -+ -+ if ((src_err->oer_iswrite == dest_err->oer_iswrite) && -+ (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) { -+ dest_err->oer_errno = src_err->oer_errno; -+ } else if (src_err->oer_iswrite) { -+ dest_err->oer_iswrite = true; -+ dest_err->oer_errno = src_err->oer_errno; -+ } -+} -+ -+static void -+encode_accumulated_error(struct objlayout *objlay, struct xdr_stream *xdr) -+{ -+ struct objlayout_io_state *state, *tmp; -+ struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; -+ -+ list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { -+ unsigned i; -+ -+ for (i = 0; i < state->num_comps; i++) { -+ struct pnfs_osd_ioerr *ioerr = 
&state->ioerrs[i]; -+ -+ if (!ioerr->oer_errno) -+ continue; -+ -+ printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d " -+ "dev(%llx:%llx) par=0x%llx obj=0x%llx " -+ "offset=0x%llx length=0x%llx\n", -+ __func__, i, ioerr->oer_errno, -+ ioerr->oer_iswrite, -+ _DEVID_LO(&ioerr->oer_component.oid_device_id), -+ _DEVID_HI(&ioerr->oer_component.oid_device_id), -+ ioerr->oer_component.oid_partition_id, -+ ioerr->oer_component.oid_object_id, -+ ioerr->oer_comp_offset, -+ ioerr->oer_comp_length); -+ -+ merge_ioerr(&accumulated_err, ioerr); -+ } -+ list_del(&state->err_list); -+ objlayout_free_io_state(state); -+ } -+ -+ BUG_ON(pnfs_osd_xdr_encode_ioerr(xdr, &accumulated_err)); -+} -+ -+void -+objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, -+ struct xdr_stream *xdr, -+ const struct nfs4_layoutreturn_args *args) -+{ -+ struct objlayout *objlay = OBJLAYOUT(pnfslay); -+ struct objlayout_io_state *state, *tmp; -+ __be32 *start, *uninitialized_var(last_xdr); -+ -+ dprintk("%s: Begin\n", __func__); -+ start = xdr_reserve_space(xdr, 4); -+ BUG_ON(!start); -+ -+ spin_lock(&objlay->lock); -+ -+ list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { -+ unsigned i; -+ int res = 0; -+ -+ for (i = 0; i < state->num_comps && !res; i++) { -+ struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; -+ -+ if (!ioerr->oer_errno) -+ continue; -+ -+ dprintk("%s: err[%d]: errno=%d is_write=%d " -+ "dev(%llx:%llx) par=0x%llx obj=0x%llx " -+ "offset=0x%llx length=0x%llx\n", -+ __func__, i, ioerr->oer_errno, -+ ioerr->oer_iswrite, -+ _DEVID_LO(&ioerr->oer_component.oid_device_id), -+ _DEVID_HI(&ioerr->oer_component.oid_device_id), -+ ioerr->oer_component.oid_partition_id, -+ ioerr->oer_component.oid_object_id, -+ ioerr->oer_comp_offset, -+ ioerr->oer_comp_length); -+ -+ last_xdr = xdr->p; -+ res = pnfs_osd_xdr_encode_ioerr(xdr, &state->ioerrs[i]); -+ } -+ if (unlikely(res)) { -+ /* no space for even one error descriptor */ -+ BUG_ON(last_xdr == start + 1); -+ -+ /* we've 
encountered a situation with lots and lots of -+ * errors and no space to encode them all. Use the last -+ * available slot to report the union of all the -+ * remaining errors. -+ */ -+ xdr_rewind_stream(xdr, last_xdr - -+ pnfs_osd_ioerr_xdr_sz() / 4); -+ encode_accumulated_error(objlay, xdr); -+ goto loop_done; -+ } -+ list_del(&state->err_list); -+ objlayout_free_io_state(state); -+ } -+loop_done: -+ spin_unlock(&objlay->lock); -+ -+ *start = cpu_to_be32((xdr->p - start - 1) * 4); -+ dprintk("%s: Return\n", __func__); -+} -+ -+struct objlayout_deviceinfo { -+ struct page *page; -+ struct pnfs_osd_deviceaddr da; /* This must be last */ -+}; -+ -+/* Initialize and call nfs_getdeviceinfo, then decode and return a -+ * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo() -+ * should be called. -+ */ -+int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, -+ struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr) -+{ -+ struct objlayout_deviceinfo *odi; -+ struct pnfs_device pd; -+ struct super_block *sb; -+ struct page *page; -+ size_t sz; -+ u32 *p; -+ int err; -+ -+ page = alloc_page(GFP_KERNEL); -+ if (!page) -+ return -ENOMEM; -+ -+ pd.area = page_address(page); -+ -+ memcpy(&pd.dev_id, d_id, sizeof(*d_id)); -+ pd.layout_type = LAYOUT_OSD2_OBJECTS; -+ pd.pages = &page; -+ pd.pgbase = 0; -+ pd.pglen = PAGE_SIZE; -+ pd.mincount = 0; -+ -+ sb = pnfslay->inode->i_sb; -+ err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->inode), &pd); -+ dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err); -+ if (err) -+ goto err_out; -+ -+ p = pd.area; -+ sz = pnfs_osd_xdr_deviceaddr_incore_sz(p); -+ odi = kzalloc(sz + (sizeof(*odi) - sizeof(odi->da)), GFP_KERNEL); -+ if (!odi) { -+ err = -ENOMEM; -+ goto err_out; -+ } -+ pnfs_osd_xdr_decode_deviceaddr(&odi->da, p); -+ odi->page = page; -+ *deviceaddr = &odi->da; -+ return 0; -+ -+err_out: -+ __free_page(page); -+ return err; -+} -+ -+void objlayout_put_deviceinfo(struct 
pnfs_osd_deviceaddr *deviceaddr) -+{ -+ struct objlayout_deviceinfo *odi = container_of(deviceaddr, -+ struct objlayout_deviceinfo, -+ da); -+ -+ __free_page(odi->page); -+ kfree(odi); -+} -+ -+/* -+ * Initialize a mountpoint by retrieving the list of -+ * available devices for it. -+ * Return the pnfs_mount_type structure so the -+ * pNFS_client can refer to the mount point later on. -+ */ -+int -+objlayout_initialize_mountpoint(struct nfs_server *server, -+ const struct nfs_fh *mntfh) -+{ -+ void *data; -+ -+ data = objio_init_mt(); -+ if (IS_ERR(data)) { -+ printk(KERN_INFO "%s: objlayout lib not ready err=%ld\n", -+ __func__, PTR_ERR(data)); -+ return PTR_ERR(data); -+ } -+ server->pnfs_ld_data = data; -+ -+ dprintk("%s: Return data=%p\n", __func__, data); -+ return 0; -+} -+ -+/* -+ * Uninitialize a mountpoint -+ */ -+int -+objlayout_uninitialize_mountpoint(struct nfs_server *server) -+{ -+ dprintk("%s: Begin %p\n", __func__, server->pnfs_ld_data); -+ objio_fini_mt(server->pnfs_ld_data); -+ return 0; -+} -diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h -new file mode 100644 -index 0000000..cad24a4 ---- /dev/null -+++ b/fs/nfs/objlayout/objlayout.h -@@ -0,0 +1,206 @@ -+/* -+ * objlayout.h -+ * -+ * Data types and function declerations for interfacing with the -+ * pNFS standard object layout driver. -+ * -+ * Copyright (C) 2007-2009 Panasas Inc. -+ * All rights reserved. -+ * -+ * Benny Halevy -+ * Boaz Harrosh -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 -+ * See the file COPYING included with this distribution for more details. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. 
-+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. Neither the name of the Panasas company nor the names of its -+ * contributors may be used to endorse or promote products derived -+ * from this software without specific prior written permission. -+ * -+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED -+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+ */ -+ -+#ifndef _OBJLAYOUT_H -+#define _OBJLAYOUT_H -+ -+#include -+#include -+#include "../pnfs.h" -+ -+/* -+ * in-core layout segment -+ */ -+struct objlayout_segment { -+ struct pnfs_layout_segment lseg; -+ void *internal; /* for provider internal use */ -+ u8 pnfs_osd_layout[]; -+}; -+ -+/* -+ * per-inode layout -+ */ -+struct objlayout { -+ struct pnfs_layout_hdr pnfs_layout; -+ -+ /* for layout_commit */ -+ enum osd_delta_space_valid_enum { -+ OBJ_DSU_INIT = 0, -+ OBJ_DSU_VALID, -+ OBJ_DSU_INVALID, -+ } delta_space_valid; -+ s64 delta_space_used; /* consumed by write ops */ -+ -+ /* for layout_return */ -+ spinlock_t lock; -+ struct list_head err_list; -+}; -+ -+static inline struct objlayout * -+OBJLAYOUT(struct pnfs_layout_hdr *lo) -+{ -+ return container_of(lo, struct objlayout, pnfs_layout); -+} -+ -+/* -+ * per-I/O operation state -+ * embedded in objects provider io_state data structure -+ */ -+struct objlayout_io_state { -+ struct objlayout_segment *objlseg; -+ -+ struct page **pages; -+ unsigned pgbase; -+ unsigned nr_pages; -+ unsigned long count; -+ loff_t offset; -+ bool sync; -+ -+ void *rpcdata; -+ int status; /* res */ -+ int eof; /* res */ -+ int committed; /* res */ -+ -+ /* Error reporting (layout_return) */ -+ struct list_head err_list; -+ unsigned num_comps; -+ /* Pointer to array of error descriptors of size num_comps. -+ * It should contain as many entries as devices in the osd_layout -+ * that participate in the I/O. It is up to the io_engine to allocate -+ * needed space and set num_comps. 
-+ */ -+ struct pnfs_osd_ioerr *ioerrs; -+}; -+ -+/* -+ * Raid engine I/O API -+ */ -+extern void *objio_init_mt(void); -+extern void objio_fini_mt(void *mt); -+ -+extern int objio_alloc_lseg(void **outp, -+ struct pnfs_layout_hdr *pnfslay, -+ struct pnfs_layout_segment *lseg, -+ struct pnfs_osd_layout *layout); -+extern void objio_free_lseg(void *p); -+ -+extern int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp); -+extern void objio_free_io_state(struct objlayout_io_state *state); -+ -+extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state); -+extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, -+ bool stable); -+ -+/* -+ * callback API -+ */ -+extern void objlayout_io_set_result(struct objlayout_io_state *state, -+ unsigned index, int osd_error, -+ u64 offset, u64 length, bool is_write); -+ -+static inline void -+objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used) -+{ -+ struct objlayout *objlay = OBJLAYOUT(state->objlseg->lseg.layout); -+ -+ /* If one of the I/Os errored out and the delta_space_used was -+ * invalid we render the complete report as invalid. Protocol mandate -+ * the DSU be accurate or not reported. 
-+ */ -+ spin_lock(&objlay->lock); -+ if (objlay->delta_space_valid != OBJ_DSU_INVALID) { -+ objlay->delta_space_valid = OBJ_DSU_VALID; -+ objlay->delta_space_used += space_used; -+ } -+ spin_unlock(&objlay->lock); -+} -+ -+extern void objlayout_read_done(struct objlayout_io_state *state, -+ ssize_t status, bool sync); -+extern void objlayout_write_done(struct objlayout_io_state *state, -+ ssize_t status, bool sync); -+ -+extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, -+ struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr); -+extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr); -+ -+/* -+ * exported generic objects function vectors -+ */ -+ -+extern int objlayout_initialize_mountpoint( -+ struct nfs_server *, -+ const struct nfs_fh *); -+extern int objlayout_uninitialize_mountpoint(struct nfs_server *); -+ -+extern struct pnfs_layout_hdr *objlayout_alloc_layout_hdr(struct inode *); -+extern void objlayout_free_layout_hdr(struct pnfs_layout_hdr *); -+ -+extern struct pnfs_layout_segment *objlayout_alloc_lseg( -+ struct pnfs_layout_hdr *, -+ struct nfs4_layoutget_res *); -+extern void objlayout_free_lseg(struct pnfs_layout_segment *); -+ -+extern enum pnfs_try_status objlayout_read_pagelist( -+ struct nfs_read_data *, -+ unsigned nr_pages); -+ -+extern enum pnfs_try_status objlayout_write_pagelist( -+ struct nfs_write_data *, -+ unsigned nr_pages, -+ int how); -+ -+extern enum pnfs_try_status objlayout_commit( -+ struct nfs_write_data *, -+ int how); -+ -+extern void objlayout_encode_layoutcommit( -+ struct pnfs_layout_hdr *, -+ struct xdr_stream *, -+ const struct nfs4_layoutcommit_args *); -+ -+extern void objlayout_encode_layoutreturn( -+ struct pnfs_layout_hdr *, -+ struct xdr_stream *, -+ const struct nfs4_layoutreturn_args *); -+ -+#endif /* _OBJLAYOUT_H */ -diff --git a/fs/nfs/objlayout/panfs_shim.c b/fs/nfs/objlayout/panfs_shim.c -new file mode 100644 -index 0000000..4d31856 ---- /dev/null -+++ 
b/fs/nfs/objlayout/panfs_shim.c -@@ -0,0 +1,702 @@ -+/* -+ * panfs_shim.c -+ * -+ * Shim layer for interfacing with the Panasas DirectFlow module I/O stack -+ * -+ * Copyright (C) 2007-2009 Panasas Inc. -+ * All rights reserved. -+ * -+ * Benny Halevy -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. Neither the name of the Panasas company nor the names of its -+ * contributors may be used to endorse or promote products derived -+ * from this software without specific prior written permission. -+ * -+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED -+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ * -+ * See the file COPYING included with this distribution for more details. 
-+ * -+ */ -+ -+#include -+#include -+#include -+ -+#include "objlayout.h" -+#include "panfs_shim.h" -+ -+#include -+ -+#define NFSDBG_FACILITY NFSDBG_PNFS_LD -+ -+struct panfs_export_operations *panfs_export_ops; -+ -+void * -+objio_init_mt(void) -+{ -+ return panfs_export_ops == NULL ? ERR_PTR(-EAGAIN) : NULL; -+} -+ -+void objio_fini_mt(void *mountid) -+{ -+} -+ -+static int -+panfs_shim_conv_raid01(struct pnfs_osd_layout *layout, -+ struct pnfs_osd_data_map *lo_map, -+ pan_agg_layout_hdr_t *hdr) -+{ -+ if (lo_map->odm_mirror_cnt) { -+ hdr->type = PAN_AGG_RAID1; -+ hdr->hdr.raid1.num_comps = lo_map->odm_mirror_cnt + 1; -+ } else if (layout->olo_num_comps > 1) { -+ hdr->type = PAN_AGG_RAID0; -+ hdr->hdr.raid0.num_comps = layout->olo_num_comps; -+ hdr->hdr.raid0.stripe_unit = lo_map->odm_stripe_unit; -+ } else -+ hdr->type = PAN_AGG_SIMPLE; -+ return 0; -+} -+ -+static int -+panfs_shim_conv_raid5(struct pnfs_osd_layout *layout, -+ struct pnfs_osd_data_map *lo_map, -+ pan_agg_layout_hdr_t *hdr) -+{ -+ if (lo_map->odm_mirror_cnt) -+ goto err; -+ -+ if (lo_map->odm_group_width || lo_map->odm_group_depth) { -+ if (!lo_map->odm_group_width || !lo_map->odm_group_depth) -+ goto err; -+ -+ hdr->type = PAN_AGG_GRP_RAID5_LEFT; -+ hdr->hdr.grp_raid5_left.num_comps = lo_map->odm_num_comps; -+ if (hdr->hdr.grp_raid5_left.num_comps != lo_map->odm_num_comps) -+ goto err; -+ hdr->hdr.grp_raid5_left.stripe_unit = lo_map->odm_stripe_unit; -+ hdr->hdr.grp_raid5_left.rg_width = lo_map->odm_group_width; -+ hdr->hdr.grp_raid5_left.rg_depth = lo_map->odm_group_depth; -+ /* this is a guess, panasas server is not supposed to -+ hand out layotu otherwise */ -+ hdr->hdr.grp_raid5_left.group_layout_policy = -+ PAN_AGG_GRP_RAID5_LEFT_POLICY_ROUND_ROBIN; -+ } else { -+ hdr->type = PAN_AGG_RAID5_LEFT; -+ hdr->hdr.raid5_left.num_comps = lo_map->odm_num_comps; -+ if (hdr->hdr.raid5_left.num_comps != lo_map->odm_num_comps) -+ goto err; -+ hdr->hdr.raid5_left.stripe_unit2 = -+ 
hdr->hdr.raid5_left.stripe_unit1 = -+ hdr->hdr.raid5_left.stripe_unit0 = lo_map->odm_stripe_unit; -+ } -+ -+ return 0; -+err: -+ return -EINVAL; -+} -+ -+/* -+ * Convert a pnfs_osd data map into Panasas aggregation layout header -+ */ -+static int -+panfs_shim_conv_pnfs_osd_data_map( -+ struct pnfs_osd_layout *layout, -+ pan_agg_layout_hdr_t *hdr) -+{ -+ int status = -EINVAL; -+ struct pnfs_osd_data_map *lo_map = &layout->olo_map; -+ -+ if (!layout->olo_num_comps) { -+ dprintk("%s: !!layout.n_comps(%u)\n", __func__, -+ layout->olo_num_comps); -+ goto err; -+ } -+ -+ switch (lo_map->odm_raid_algorithm) { -+ case PNFS_OSD_RAID_0: -+ if (layout->olo_num_comps != lo_map->odm_num_comps || -+ layout->olo_comps_index) { -+ dprintk("%s: !!PNFS_OSD_RAID_0 " -+ "layout.n_comps(%u) map.n_comps(%u) " -+ "comps_index(%u)\n", __func__, -+ layout->olo_num_comps, -+ lo_map->odm_num_comps, -+ layout->olo_comps_index); -+ goto err; -+ } -+ status = panfs_shim_conv_raid01(layout, lo_map, hdr); -+ break; -+ -+ case PNFS_OSD_RAID_5: -+ if (!lo_map->odm_group_width) { -+ if (layout->olo_num_comps != lo_map->odm_num_comps || -+ layout->olo_comps_index) { -+ dprintk("%s: !!PNFS_OSD_RAID_5 !group_width " -+ "layout.n_comps(%u)!=map.n_comps(%u) " -+ "|| comps_index(%u)\n", __func__, -+ layout->olo_num_comps, -+ lo_map->odm_num_comps, -+ layout->olo_comps_index); -+ goto err; -+ } -+ } else if ((layout->olo_num_comps != lo_map->odm_num_comps && -+ layout->olo_num_comps > lo_map->odm_group_width) || -+ (layout->olo_comps_index % lo_map->odm_group_width)){ -+ dprintk("%s: !!PNFS_OSD_RAID_5 group_width(%u) " -+ "layout.n_comps(%u) map.n_comps(%u) " -+ "comps_index(%u)\n", __func__, -+ lo_map->odm_group_width, -+ layout->olo_num_comps, -+ lo_map->odm_num_comps, -+ layout->olo_comps_index); -+ goto err; -+ } -+ status = panfs_shim_conv_raid5(layout, lo_map, hdr); -+ break; -+ -+ case PNFS_OSD_RAID_4: -+ case PNFS_OSD_RAID_PQ: -+ default: -+ dprintk("%s: !!PNFS_OSD_RAID_(%d)\n", __func__, -+ 
lo_map->odm_raid_algorithm); -+ goto err; -+ } -+ -+ return 0; -+ -+err: -+ return status; -+} -+ -+/* -+ * Convert pnfs_osd layout into Panasas map and caps type -+ */ -+int -+objio_alloc_lseg(void **outp, -+ struct pnfs_layout_hdr *pnfslay, -+ struct pnfs_layout_segment *lseg, -+ struct pnfs_osd_layout *layout) -+{ -+ int i, total_comps; -+ int status; -+ struct pnfs_osd_object_cred *lo_comp; -+ pan_size_t alloc_sz, local_sz; -+ pan_sm_map_cap_t *mcs = NULL; -+ u8 *buf; -+ pan_agg_comp_obj_t *pan_comp; -+ pan_sm_sec_t *pan_sec; -+ -+ status = -EINVAL; -+ if (layout->olo_num_comps < layout->olo_map.odm_group_width) { -+ total_comps = layout->olo_comps_index + layout->olo_num_comps; -+ } else { -+ /* allocate full map, otherwise SAM gets confused */ -+ total_comps = layout->olo_map.odm_num_comps; -+ } -+ alloc_sz = total_comps * -+ (sizeof(pan_agg_comp_obj_t) + sizeof(pan_sm_sec_t)); -+ for (i = 0; i < layout->olo_num_comps; i++) { -+ void *p = layout->olo_comps[i].oc_cap.cred; -+ if (panfs_export_ops->sm_sec_t_get_size_otw( -+ (pan_sm_sec_otw_t *)&p, &local_sz, NULL, NULL)) -+ goto err; -+ alloc_sz += local_sz; -+ } -+ -+ status = -ENOMEM; -+ mcs = kzalloc(sizeof(*mcs) + alloc_sz, GFP_KERNEL); -+ if (!mcs) -+ goto err; -+ buf = (u8 *)&mcs[1]; -+ -+ mcs->offset = lseg->range.offset; -+ mcs->length = lseg->range.length; -+#if 0 -+ /* FIXME: for now */ -+ mcs->expiration_time.ts_sec = 0; -+ mcs->expiration_time.ts_nsec = 0; -+#endif -+ mcs->full_map.map_hdr.avail_state = PAN_AGG_OBJ_STATE_NORMAL; -+ status = panfs_shim_conv_pnfs_osd_data_map(layout, -+ &mcs->full_map.layout_hdr); -+ if (status) -+ goto err; -+ -+ mcs->full_map.components.size = total_comps; -+ mcs->full_map.components.data = (pan_agg_comp_obj_t *)buf; -+ buf += total_comps * sizeof(pan_agg_comp_obj_t); -+ -+ mcs->secs.size = total_comps; -+ mcs->secs.data = (pan_sm_sec_t *)buf; -+ buf += total_comps * sizeof(pan_sm_sec_t); -+ -+ lo_comp = layout->olo_comps; -+ pan_comp = mcs->full_map.components.data 
+ layout->olo_comps_index; -+ pan_sec = mcs->secs.data + layout->olo_comps_index; -+ for (i = 0; i < layout->olo_num_comps; i++) { -+ void *p; -+ pan_stor_obj_id_t *obj_id = &mcs->full_map.map_hdr.obj_id; -+ struct pnfs_osd_objid *oc_obj_id = &lo_comp->oc_object_id; -+ u64 dev_id = __be64_to_cpup( -+ (__be64 *)oc_obj_id->oid_device_id.data + 1); -+ -+ dprintk("%s: i=%d deviceid=%Lx:%Lx partition=%Lx object=%Lx\n", -+ __func__, i, -+ __be64_to_cpup((__be64 *)oc_obj_id->oid_device_id.data), -+ __be64_to_cpup((__be64 *)oc_obj_id->oid_device_id.data + 1), -+ oc_obj_id->oid_partition_id, oc_obj_id->oid_object_id); -+ -+ if (i == 0) { -+ /* make up mgr_id to calm sam down */ -+ pan_mgr_id_construct_artificial(PAN_MGR_SM, 0, -+ &obj_id->dev_id); -+ obj_id->grp_id = oc_obj_id->oid_partition_id; -+ obj_id->obj_id = oc_obj_id->oid_object_id; -+ } -+ -+ if (obj_id->grp_id != lo_comp->oc_object_id.oid_partition_id) { -+ dprintk("%s: i=%d grp_id=0x%Lx oid_partition_id=0x%Lx\n", -+ __func__, i, (u64)obj_id->grp_id, -+ lo_comp->oc_object_id.oid_partition_id); -+ status = -EINVAL; -+ goto err; -+ } -+ -+ if (obj_id->obj_id != lo_comp->oc_object_id.oid_object_id) { -+ dprintk("%s: i=%d obj_id=0x%Lx oid_object_id=0x%Lx\n", -+ __func__, i, obj_id->obj_id, -+ lo_comp->oc_object_id.oid_object_id); -+ status = -EINVAL; -+ goto err; -+ } -+ -+ pan_comp->dev_id = dev_id; -+ if (!pan_stor_is_device_id_an_obsd_id(pan_comp->dev_id)) { -+ dprintk("%s: i=%d dev_id=0x%Lx not an obsd_id\n", -+ __func__, i, obj_id->dev_id); -+ status = -EINVAL; -+ goto err; -+ } -+ if (lo_comp->oc_osd_version == PNFS_OSD_MISSING) { -+ dprintk("%s: degraded maps not supported yet\n", -+ __func__); -+ status = -ENOTSUPP; -+ goto err; -+ } -+ pan_comp->avail_state = PAN_AGG_COMP_STATE_NORMAL; -+ if (lo_comp->oc_cap_key_sec != PNFS_OSD_CAP_KEY_SEC_NONE) { -+ dprintk("%s: cap key security not supported yet\n", -+ __func__); -+ status = -ENOTSUPP; -+ goto err; -+ } -+ -+ p = lo_comp->oc_cap.cred; -+ 
panfs_export_ops->sm_sec_t_unmarshall( -+ (pan_sm_sec_otw_t *)&p, -+ pan_sec, -+ buf, -+ alloc_sz, -+ NULL, -+ &local_sz); -+ buf += local_sz; -+ alloc_sz -= local_sz; -+ -+ lo_comp++; -+ pan_comp++; -+ pan_sec++; -+ } -+ -+ *outp = mcs; -+ dprintk("%s:Return mcs=%p\n", __func__, mcs); -+ return 0; -+ -+err: -+ objio_free_lseg(mcs); -+ dprintk("%s:Error %d\n", __func__, status); -+ return status; -+} -+ -+/* -+ * Free a Panasas map and caps type -+ */ -+void -+objio_free_lseg(void *p) -+{ -+ kfree(p); -+} -+ -+/* -+ * I/O routines -+ */ -+int -+objio_alloc_io_state(void *seg, struct objlayout_io_state **outp) -+{ -+ struct panfs_shim_io_state *p; -+ -+ dprintk("%s: allocating io_state\n", __func__); -+ p = kzalloc(sizeof(*p), GFP_KERNEL); -+ if (!p) -+ return -ENOMEM; -+ -+ *outp = &p->ol_state; -+ return 0; -+} -+ -+/* -+ * Free an I/O state -+ */ -+void -+objio_free_io_state(struct objlayout_io_state *ol_state) -+{ -+ struct panfs_shim_io_state *state = container_of(ol_state, -+ struct panfs_shim_io_state, ol_state); -+ int i; -+ -+ dprintk("%s: freeing io_state\n", __func__); -+ for (i = 0; i < state->ol_state.nr_pages; i++) -+ kunmap(state->ol_state.pages[i]); -+ -+ if (state->ucreds) -+ panfs_export_ops->ucreds_put(state->ucreds); -+ kfree(state->sg_list); -+ kfree(state); -+} -+ -+static int -+panfs_shim_pages_to_sg( -+ struct panfs_shim_io_state *state, -+ struct page **pages, -+ unsigned int pgbase, -+ unsigned nr_pages, -+ size_t count) -+{ -+ unsigned i, n; -+ pan_sg_entry_t *sg; -+ -+ dprintk("%s pgbase %u nr_pages %u count %d " -+ "pg0 %p flags 0x%x index %llu\n", -+ __func__, pgbase, nr_pages, (int)count, pages[0], -+ (unsigned)pages[0]->flags, (unsigned long long)pages[0]->index); -+ -+ sg = kmalloc(nr_pages * sizeof(*sg), GFP_KERNEL); -+ if (sg == NULL) -+ return -ENOMEM; -+ -+ dprintk("%s sg_list %p pages %p pgbase %u nr_pages %u\n", -+ __func__, sg, pages, pgbase, nr_pages); -+ -+ for (i = 0; i < nr_pages; i++) { -+ sg[i].buffer = (char 
*)kmap(pages[i]) + pgbase; -+ n = PAGE_SIZE - pgbase; -+ pgbase = 0; -+ if (n > count) -+ n = count; -+ sg[i].chunk_size = n; -+ count -= n; -+ if (likely(count)) { -+ sg[i].next = &sg[i+1]; -+ } else { -+ /* we're done */ -+ sg[i].next = NULL; -+ break; -+ } -+ } -+ BUG_ON(count); -+ -+ state->sg_list = sg; -+ return 0; -+} -+ -+/* -+ * Callback function for async reads -+ */ -+static void -+panfs_shim_read_done( -+ void *arg1, -+ void *arg2, -+ pan_sam_read_res_t *res_p, -+ pan_status_t rc) -+{ -+ struct panfs_shim_io_state *state = arg1; -+ ssize_t status; -+ -+ dprintk("%s: Begin\n", __func__); -+ if (!res_p) -+ res_p = &state->u.read.res; -+ if (rc == PAN_SUCCESS) -+ rc = res_p->result; -+ if (rc == PAN_SUCCESS) { -+ status = res_p->length; -+ WARN_ON(status < 0); -+ } else { -+ status = -panfs_export_ops->convert_rc(rc); -+ dprintk("%s: pan_sam_read rc %d: status %Zd\n", -+ __func__, rc, status); -+ } -+ dprintk("%s: Return status %Zd rc %d\n", __func__, status, rc); -+ objlayout_read_done(&state->ol_state, status, true); -+} -+ -+ssize_t -+objio_read_pagelist(struct objlayout_io_state *ol_state) -+{ -+ struct panfs_shim_io_state *state = container_of(ol_state, -+ struct panfs_shim_io_state, ol_state); -+ pan_sm_map_cap_t *mcs = (pan_sm_map_cap_t *)ol_state->objlseg->internal; -+ ssize_t status = 0; -+ pan_status_t rc = PAN_SUCCESS; -+ -+ dprintk("%s: Begin\n", __func__); -+ -+ status = panfs_shim_pages_to_sg(state, ol_state->pages, -+ ol_state->pgbase, ol_state->nr_pages, -+ ol_state->count); -+ if (unlikely(status)) -+ goto err; -+ -+ state->obj_sec.min_security = 0; -+ state->obj_sec.map_ccaps = mcs; -+ -+ rc = panfs_export_ops->ucreds_get(&state->ucreds); -+ if (unlikely(rc)) { -+ status = -EACCES; -+ goto err; -+ } -+ -+ state->u.read.args.obj_id = mcs->full_map.map_hdr.obj_id; -+ state->u.read.args.offset = ol_state->offset; -+ rc = panfs_export_ops->sam_read(PAN_SAM_ACCESS_BYPASS_TIMESTAMP, -+ &state->u.read.args, -+ &state->obj_sec, -+ state->sg_list, 
-+ state->ucreds, -+ ol_state->sync ? -+ NULL : panfs_shim_read_done, -+ state, NULL, -+ &state->u.read.res); -+ if (rc != PAN_ERR_IN_PROGRESS) -+ panfs_shim_read_done(state, NULL, &state->u.read.res, rc); -+ err: -+ dprintk("%s: Return %Zd\n", __func__, status); -+ return status; -+} -+ -+/* -+ * Callback function for async writes -+ */ -+static void -+panfs_shim_write_done( -+ void *arg1, -+ void *arg2, -+ pan_sam_write_res_t *res_p, -+ pan_status_t rc) -+{ -+ struct panfs_shim_io_state *state = arg1; -+ ssize_t status; -+ -+ dprintk("%s: Begin\n", __func__); -+ if (!res_p) -+ res_p = &state->u.write.res; -+ if (rc == PAN_SUCCESS) -+ rc = res_p->result; -+ if (rc == PAN_SUCCESS) { -+/* state->ol_state.committed = NFS_FILE_SYNC;*/ -+ state->ol_state.committed = NFS_UNSTABLE; -+ status = res_p->length; -+ WARN_ON(status < 0); -+ -+ objlayout_add_delta_space_used(&state->ol_state, -+ res_p->delta_capacity_used); -+ } else { -+ status = -panfs_export_ops->convert_rc(rc); -+ dprintk("%s: pan_sam_write rc %u: status %Zd\n", -+ __func__, rc, status); -+ } -+ dprintk("%s: Return status %Zd rc %d\n", __func__, status, rc); -+ objlayout_write_done(&state->ol_state, status, true); -+} -+ -+ssize_t -+objio_write_pagelist(struct objlayout_io_state *ol_state, -+ bool stable /* unused, PanOSD writes are stable */) -+{ -+ struct panfs_shim_io_state *state = container_of(ol_state, -+ struct panfs_shim_io_state, ol_state); -+ pan_sm_map_cap_t *mcs = (pan_sm_map_cap_t *)ol_state->objlseg->internal; -+ ssize_t status = 0; -+ pan_status_t rc = PAN_SUCCESS; -+ -+ dprintk("%s: Begin\n", __func__); -+ -+ status = panfs_shim_pages_to_sg(state, ol_state->pages, -+ ol_state->pgbase, ol_state->nr_pages, -+ ol_state->count); -+ if (unlikely(status)) -+ goto err; -+ -+ state->obj_sec.min_security = 0; -+ state->obj_sec.map_ccaps = mcs; -+ -+ rc = panfs_export_ops->ucreds_get(&state->ucreds); -+ if (unlikely(rc)) { -+ status = -EACCES; -+ goto err; -+ } -+ -+ state->u.write.args.obj_id = 
mcs->full_map.map_hdr.obj_id; -+ state->u.write.args.offset = ol_state->offset; -+ rc = panfs_export_ops->sam_write(PAN_SAM_ACCESS_NONE, -+ &state->u.write.args, -+ &state->obj_sec, -+ state->sg_list, -+ state->ucreds, -+ ol_state->sync ? -+ NULL : panfs_shim_write_done, -+ state, -+ NULL, -+ &state->u.write.res); -+ if (rc != PAN_ERR_IN_PROGRESS) -+ panfs_shim_write_done(state, NULL, &state->u.write.res, rc); -+ err: -+ dprintk("%s: Return %Zd\n", __func__, status); -+ return status; -+} -+ -+int -+panfs_shim_register(struct panfs_export_operations *ops) -+{ -+ if (panfs_export_ops) { -+ printk(KERN_INFO -+ "%s: panfs already registered (panfs ops %p)\n", -+ __func__, panfs_export_ops); -+ return -EINVAL; -+ } -+ -+ printk(KERN_INFO "%s: registering panfs ops %p\n", -+ __func__, ops); -+ -+ panfs_export_ops = ops; -+ return 0; -+} -+EXPORT_SYMBOL(panfs_shim_register); -+ -+int -+panfs_shim_unregister(void) -+{ -+ if (!panfs_export_ops) { -+ printk(KERN_INFO "%s: panfs is not registered\n", __func__); -+ return -EINVAL; -+ } -+ -+ printk(KERN_INFO "%s: unregistering panfs ops %p\n", -+ __func__, panfs_export_ops); -+ -+ panfs_export_ops = NULL; -+ return 0; -+} -+EXPORT_SYMBOL(panfs_shim_unregister); -+ -+/* -+ * Policy Operations -+ */ -+ -+#define PANLAYOUT_DEF_STRIPE_UNIT (64*1024) -+#define PANLAYOUT_DEF_STRIPE_WIDTH 9 -+#define PANLAYOUT_MAX_STRIPE_WIDTH 11 -+#define PANLAYOUT_MAX_GATHER_STRIPES 8 -+ -+/* -+ * Get the max [rw]size -+ */ -+static ssize_t -+panlayout_get_blocksize(void) -+{ -+ ssize_t sz = (PANLAYOUT_MAX_STRIPE_WIDTH-1) * -+ PANLAYOUT_DEF_STRIPE_UNIT * -+ PANLAYOUT_MAX_GATHER_STRIPES; -+ dprintk("%s: Return %Zd\n", __func__, sz); -+ return sz; -+} -+ -+/* -+ * Don't gather across stripes, but rather gather (coalesce) up to -+ * the stripe size. 
-+ * -+ * FIXME: change interface to use merge_align, merge_count -+ */ -+#define PNFS_LAYOUT_PANOSD (NFS4_PNFS_PRIVATE_LAYOUT | LAYOUT_OSD2_OBJECTS) -+ -+static struct pnfs_layoutdriver_type panlayout_type = { -+ .id = PNFS_LAYOUT_PANOSD, -+ .name = "PNFS_LAYOUT_PANOSD", -+ .flags = PNFS_LAYOUTRET_ON_SETATTR, -+ -+ .initialize_mountpoint = objlayout_initialize_mountpoint, -+ .uninitialize_mountpoint = objlayout_uninitialize_mountpoint, -+ -+ .alloc_layout_hdr = objlayout_alloc_layout_hdr, -+ .free_layout_hdr = objlayout_free_layout_hdr, -+ -+ .alloc_lseg = objlayout_alloc_lseg, -+ .free_lseg = objlayout_free_lseg, -+ -+ .get_blocksize = panlayout_get_blocksize, -+ -+ .read_pagelist = objlayout_read_pagelist, -+ .write_pagelist = objlayout_write_pagelist, -+ .commit = objlayout_commit, -+ -+ .encode_layoutcommit = objlayout_encode_layoutcommit, -+ .encode_layoutreturn = objlayout_encode_layoutreturn, -+}; -+ -+MODULE_DESCRIPTION("pNFS Layout Driver for Panasas OSDs"); -+MODULE_AUTHOR("Benny Halevy "); -+MODULE_LICENSE("GPL"); -+ -+static int __init -+panlayout_init(void) -+{ -+ int ret = pnfs_register_layoutdriver(&panlayout_type); -+ -+ if (ret) -+ printk(KERN_INFO -+ "%s: Registering Panasas OSD pNFS Layout Driver failed: error=%d\n", -+ __func__, ret); -+ else -+ printk(KERN_INFO "%s: Registered Panasas OSD pNFS Layout Driver\n", -+ __func__); -+ return ret; -+} -+ -+static void __exit -+panlayout_exit(void) -+{ -+ pnfs_unregister_layoutdriver(&panlayout_type); -+ printk(KERN_INFO "%s: Unregistered Panasas OSD pNFS Layout Driver\n", -+ __func__); -+} -+ -+module_init(panlayout_init); -+module_exit(panlayout_exit); -diff --git a/fs/nfs/objlayout/panfs_shim.h b/fs/nfs/objlayout/panfs_shim.h -new file mode 100644 -index 0000000..18ef6db ---- /dev/null -+++ b/fs/nfs/objlayout/panfs_shim.h -@@ -0,0 +1,482 @@ -+/* -+ * panfs_shim.h -+ * -+ * Data types and external function declerations for interfacing with -+ * panfs (Panasas DirectFlow) I/O stack -+ * -+ * Copyright 
(C) 2007 Panasas Inc. -+ * All rights reserved. -+ * -+ * Benny Halevy -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. Neither the name of the Panasas company nor the names of its -+ * contributors may be used to endorse or promote products derived -+ * from this software without specific prior written permission. -+ * -+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED -+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ * -+ * See the file COPYING included with this distribution for more details. 
-+ * -+ */ -+ -+#ifndef _PANLAYOUT_PANFS_SHIM_H -+#define _PANLAYOUT_PANFS_SHIM_H -+ -+typedef s8 pan_int8_t; -+typedef u8 pan_uint8_t; -+typedef s16 pan_int16_t; -+typedef u16 pan_uint16_t; -+typedef s32 pan_int32_t; -+typedef u32 pan_uint32_t; -+typedef s64 pan_int64_t; -+typedef u64 pan_uint64_t; -+ -+/* -+ * from pan_base_types.h -+ */ -+typedef pan_uint64_t pan_rpc_none_t; -+typedef pan_uint32_t pan_rpc_arrdim_t; -+typedef pan_uint32_t pan_status_t; -+typedef pan_uint8_t pan_otw_t; -+typedef pan_uint8_t pan_pad_t; -+ -+typedef pan_uint32_t pan_timespec_sec_t; -+typedef pan_uint32_t pan_timespec_nsec_t; -+ -+typedef struct pan_timespec_s pan_timespec_t; -+struct pan_timespec_s { -+ pan_timespec_sec_t ts_sec; -+ pan_timespec_nsec_t ts_nsec; -+}; -+ -+/* -+ * from pan_std_types.h -+ */ -+typedef pan_uint32_t pan_size_t; -+typedef int pan_bool_t; -+ -+/* -+ * from pan_common_error.h -+ */ -+#define PAN_SUCCESS ((pan_status_t)0) -+#define PAN_ERR_IN_PROGRESS ((pan_status_t)55) -+ -+/* -+ * from pan_sg.h -+ */ -+typedef struct pan_sg_entry_s pan_sg_entry_t; -+struct pan_sg_entry_s { -+ void *buffer; /* pointer to memory */ -+ pan_uint32_t chunk_size; /* size of each chunk (bytes) */ -+ pan_sg_entry_t *next; -+}; -+ -+/* -+ * from pan_storage.h -+ */ -+typedef pan_uint64_t pan_stor_dev_id_t; -+typedef pan_uint32_t pan_stor_obj_grp_id_t; -+typedef pan_uint64_t pan_stor_obj_uniq_t; -+typedef pan_uint32_t pan_stor_action_t; -+typedef pan_uint8_t pan_stor_cap_key_t[20]; -+ -+typedef pan_uint8_t pan_stor_key_type_t; -+typedef pan_uint64_t pan_stor_len_t; -+typedef pan_int64_t pan_stor_delta_len_t; -+typedef pan_uint64_t pan_stor_offset_t; -+typedef pan_uint16_t pan_stor_op_t; -+ -+typedef pan_uint16_t pan_stor_sec_level_t; -+ -+struct pan_stor_obj_id_s { -+ pan_stor_dev_id_t dev_id; -+ pan_stor_obj_uniq_t obj_id; -+ pan_stor_obj_grp_id_t grp_id; -+}; -+ -+typedef struct pan_stor_obj_id_s pan_stor_obj_id_t; -+ -+#define PAN_STOR_OP_NONE ((pan_stor_op_t) 0U) -+#define 
PAN_STOR_OP_READ ((pan_stor_op_t) 8U) -+#define PAN_STOR_OP_WRITE ((pan_stor_op_t) 9U) -+#define PAN_STOR_OP_APPEND ((pan_stor_op_t) 10U) -+#define PAN_STOR_OP_GETATTR ((pan_stor_op_t) 11U) -+#define PAN_STOR_OP_SETATTR ((pan_stor_op_t) 12U) -+#define PAN_STOR_OP_FLUSH ((pan_stor_op_t) 13U) -+#define PAN_STOR_OP_CLEAR ((pan_stor_op_t) 14U) -+ -+/* -+ * from pan_aggregation_map.h -+ */ -+typedef pan_uint8_t pan_agg_type_t; -+typedef pan_uint64_t pan_agg_map_version_t; -+typedef pan_uint8_t pan_agg_obj_state_t; -+typedef pan_uint8_t pan_agg_comp_state_t; -+typedef pan_uint8_t pan_agg_comp_flag_t; -+ -+#define PAN_AGG_OBJ_STATE_INVALID ((pan_agg_obj_state_t) 0x00) -+#define PAN_AGG_OBJ_STATE_NORMAL ((pan_agg_obj_state_t) 0x01) -+#define PAN_AGG_OBJ_STATE_DEGRADED ((pan_agg_obj_state_t) 0x02) -+#define PAN_AGG_OBJ_STATE_RECONSTRUCT ((pan_agg_obj_state_t) 0x03) -+#define PAN_AGG_OBJ_STATE_COPYBACK ((pan_agg_obj_state_t) 0x04) -+#define PAN_AGG_OBJ_STATE_UNAVAILABLE ((pan_agg_obj_state_t) 0x05) -+#define PAN_AGG_OBJ_STATE_CREATING ((pan_agg_obj_state_t) 0x06) -+#define PAN_AGG_OBJ_STATE_DELETED ((pan_agg_obj_state_t) 0x07) -+#define PAN_AGG_COMP_STATE_INVALID ((pan_agg_comp_state_t) 0x00) -+#define PAN_AGG_COMP_STATE_NORMAL ((pan_agg_comp_state_t) 0x01) -+#define PAN_AGG_COMP_STATE_UNAVAILABLE ((pan_agg_comp_state_t) 0x02) -+#define PAN_AGG_COMP_STATE_COPYBACK ((pan_agg_comp_state_t) 0x03) -+#define PAN_AGG_COMP_F_NONE ((pan_agg_comp_flag_t) 0x00) -+#define PAN_AGG_COMP_F_ATTR_STORING ((pan_agg_comp_flag_t) 0x01) -+#define PAN_AGG_COMP_F_OBJ_CORRUPT_OBS ((pan_agg_comp_flag_t) 0x02) -+#define PAN_AGG_COMP_F_TEMP ((pan_agg_comp_flag_t) 0x04) -+ -+struct pan_aggregation_map_s { -+ pan_agg_map_version_t version; -+ pan_agg_obj_state_t avail_state; -+ pan_stor_obj_id_t obj_id; -+}; -+ -+typedef struct pan_aggregation_map_s pan_aggregation_map_t; -+ -+struct pan_agg_comp_obj_s { -+ pan_stor_dev_id_t dev_id; -+ pan_agg_comp_state_t avail_state; -+ pan_agg_comp_flag_t 
comp_flags; -+}; -+ -+typedef struct pan_agg_comp_obj_s pan_agg_comp_obj_t; -+ -+struct pan_agg_simple_header_s { -+ pan_uint8_t unused; -+}; -+ -+typedef struct pan_agg_simple_header_s pan_agg_simple_header_t; -+ -+struct pan_agg_raid1_header_s { -+ pan_uint16_t num_comps; -+}; -+ -+typedef struct pan_agg_raid1_header_s pan_agg_raid1_header_t; -+ -+struct pan_agg_raid0_header_s { -+ pan_uint16_t num_comps; -+ pan_uint32_t stripe_unit; -+}; -+ -+typedef struct pan_agg_raid0_header_s pan_agg_raid0_header_t; -+ -+struct pan_agg_raid5_left_header_s { -+ pan_uint16_t num_comps; -+ pan_uint32_t stripe_unit0; -+ pan_uint32_t stripe_unit1; -+ pan_uint32_t stripe_unit2; -+}; -+ -+typedef struct pan_agg_raid5_left_header_s pan_agg_raid5_left_header_t; -+ -+typedef struct pan_agg_grp_raid5_left_header_s pan_agg_grp_raid5_left_header_t; -+ -+struct pan_agg_grp_raid5_left_header_s { -+ pan_uint16_t num_comps; -+ pan_uint32_t stripe_unit; -+ pan_uint16_t rg_width; -+ pan_uint16_t rg_depth; -+ pan_uint8_t group_layout_policy; -+}; -+ -+#define PAN_AGG_GRP_RAID5_LEFT_POLICY_INVALID ((pan_uint8_t) 0x00) -+#define PAN_AGG_GRP_RAID5_LEFT_POLICY_ROUND_ROBIN ((pan_uint8_t) 0x01) -+ -+#define PAN_AGG_NULL_MAP ((pan_agg_type_t) 0x00) -+#define PAN_AGG_SIMPLE ((pan_agg_type_t) 0x01) -+#define PAN_AGG_RAID1 ((pan_agg_type_t) 0x02) -+#define PAN_AGG_RAID0 ((pan_agg_type_t) 0x03) -+#define PAN_AGG_RAID5_LEFT ((pan_agg_type_t) 0x04) -+#define PAN_AGG_GRP_RAID5_LEFT ((pan_agg_type_t) 0x06) -+#define PAN_AGG_MINTYPE ((pan_agg_type_t) 0x01) -+#define PAN_AGG_MAXTYPE ((pan_agg_type_t) 0x06) -+ -+struct pan_agg_layout_hdr_s { -+ pan_agg_type_t type; -+ pan_pad_t pad[3]; -+ union { -+ pan_uint64_t null; -+ pan_agg_simple_header_t simple; -+ pan_agg_raid1_header_t raid1; -+ pan_agg_raid0_header_t raid0; -+ pan_agg_raid5_left_header_t raid5_left; -+ pan_agg_grp_raid5_left_header_t grp_raid5_left; -+ } hdr; -+}; -+ -+typedef struct pan_agg_layout_hdr_s pan_agg_layout_hdr_t; -+ -+struct 
pan_agg_comp_obj_a_s { -+ pan_rpc_arrdim_t size; -+ pan_agg_comp_obj_t *data; -+}; -+typedef struct pan_agg_comp_obj_a_s pan_agg_comp_obj_a; -+ -+struct pan_agg_full_map_s { -+ pan_aggregation_map_t map_hdr; -+ pan_agg_layout_hdr_t layout_hdr; -+ pan_agg_comp_obj_a components; -+}; -+ -+typedef struct pan_agg_full_map_s pan_agg_full_map_t; -+ -+/* -+ * from pan_obsd_rpc_types.h -+ */ -+typedef pan_uint8_t pan_obsd_security_key_a[16]; -+ -+typedef pan_uint8_t pan_obsd_capability_key_a[20]; -+ -+typedef pan_uint8_t pan_obsd_key_holder_id_t; -+ -+#define PAN_OBSD_KEY_HOLDER_BASIS_KEY ((pan_obsd_key_holder_id_t) 0x01) -+#define PAN_OBSD_KEY_HOLDER_CAP_KEY ((pan_obsd_key_holder_id_t) 0x02) -+ -+struct pan_obsd_key_holder_s { -+ pan_obsd_key_holder_id_t select; -+ pan_pad_t pad[3]; -+ union { -+ pan_obsd_security_key_a basis_key; -+ pan_obsd_capability_key_a cap_key; -+ } key; -+}; -+ -+typedef struct pan_obsd_key_holder_s pan_obsd_key_holder_t; -+ -+/* -+ * from pan_sm_sec.h -+ */ -+typedef pan_uint8_t pan_sm_sec_type_t; -+typedef pan_uint8_t pan_sm_sec_otw_allo_mode_t; -+ -+struct pan_obsd_capability_generic_otw_t_s { -+ pan_rpc_arrdim_t size; -+ pan_uint8_t *data; -+}; -+typedef struct pan_obsd_capability_generic_otw_t_s -+ pan_obsd_capability_generic_otw_t; -+ -+struct pan_sm_sec_obsd_s { -+ pan_obsd_key_holder_t key; -+ pan_obsd_capability_generic_otw_t cap_otw; -+ pan_sm_sec_otw_allo_mode_t allo_mode; -+}; -+ -+typedef struct pan_sm_sec_obsd_s pan_sm_sec_obsd_t; -+ -+struct pan_sm_sec_s { -+ pan_sm_sec_type_t type; -+ pan_pad_t pad[3]; -+ union { -+ pan_rpc_none_t none; -+ pan_sm_sec_obsd_t obsd; -+ } variant; -+}; -+ -+typedef struct pan_sm_sec_s pan_sm_sec_t; -+ -+struct pan_sm_sec_a_s { -+ pan_rpc_arrdim_t size; -+ pan_sm_sec_t *data; -+}; -+typedef struct pan_sm_sec_a_s pan_sm_sec_a; -+typedef pan_otw_t *pan_sm_sec_otw_t; -+ -+/* -+ * from pan_sm_types.h -+ */ -+typedef pan_uint64_t pan_sm_cap_handle_t; -+ -+struct pan_sm_map_cap_s { -+ pan_agg_full_map_t 
full_map; -+ pan_stor_offset_t offset; -+ pan_stor_len_t length; -+ pan_sm_sec_a secs; -+ pan_sm_cap_handle_t handle; -+ pan_timespec_t expiration_time; -+ pan_stor_action_t action_mask; -+ pan_uint32_t flags; -+}; -+ -+typedef struct pan_sm_map_cap_s pan_sm_map_cap_t; -+ -+/* -+ * from pan_sm_ops.h -+ */ -+typedef pan_rpc_none_t pan_sm_cache_ptr_t; -+ -+/* -+ * from pan_sam_api.h -+ */ -+typedef pan_uint32_t pan_sam_access_flags_t; -+ -+typedef struct pan_sam_dev_error_s pan_sam_dev_error_t; -+struct pan_sam_dev_error_s { -+ pan_stor_dev_id_t dev_id; -+ pan_stor_op_t stor_op; -+ pan_status_t error; -+}; -+ -+typedef struct pan_sam_ext_status_s pan_sam_ext_status_t; -+struct pan_sam_ext_status_s { -+ pan_uint32_t available; -+ pan_uint32_t size; -+ pan_sam_dev_error_t *errors; -+}; -+ -+enum pan_sam_rpc_sec_sel_e { -+ PAN_SAM_RPC_SEC_DEFAULT, -+ PAN_SAM_RPC_SEC_ATLEAST, -+ PAN_SAM_RPC_SEC_EXACTLY -+}; -+typedef enum pan_sam_rpc_sec_sel_e pan_sam_rpc_sec_sel_t; -+ -+typedef struct pan_sam_obj_sec_s pan_sam_obj_sec_t; -+struct pan_sam_obj_sec_s { -+ pan_stor_sec_level_t min_security; -+ pan_sm_map_cap_t *map_ccaps; -+}; -+ -+typedef struct pan_sam_rpc_sec_s pan_sam_rpc_sec_t; -+struct pan_sam_rpc_sec_s { -+ pan_sam_rpc_sec_sel_t selector; -+}; -+ -+typedef struct pan_sam_read_args_s pan_sam_read_args_t; -+struct pan_sam_read_args_s { -+ pan_stor_obj_id_t obj_id; -+ pan_sm_cache_ptr_t obj_ent; -+ void *return_attr; -+ void *checksum; -+ pan_stor_offset_t offset; -+ pan_uint16_t sm_options; -+ void *callout; -+ void *callout_arg; -+}; -+ -+typedef struct pan_sam_read_res_s pan_sam_read_res_t; -+struct pan_sam_read_res_s { -+ pan_status_t result; -+ pan_sam_ext_status_t ext_status; -+ pan_stor_len_t length; -+ void *attr; -+ void *checksum; -+}; -+ -+typedef void (*pan_sam_read_cb_t)( -+ void *user_arg1, -+ void *user_arg2, -+ pan_sam_read_res_t *res_p, -+ pan_status_t status); -+ -+#define PAN_SAM_ACCESS_NONE 0x0000 -+#define PAN_SAM_ACCESS_BYPASS_TIMESTAMP 0x0020 -+ 
-+typedef struct pan_sam_write_args_s pan_sam_write_args_t; -+struct pan_sam_write_args_s { -+ pan_stor_obj_id_t obj_id; -+ pan_sm_cache_ptr_t obj_ent; -+ pan_stor_offset_t offset; -+ void *attr; -+ void *return_attr; -+}; -+ -+typedef struct pan_sam_write_res_s pan_sam_write_res_t; -+struct pan_sam_write_res_s { -+ pan_status_t result; -+ pan_sam_ext_status_t ext_status; -+ pan_stor_len_t length; -+ pan_stor_delta_len_t delta_capacity_used; -+ pan_bool_t parity_dirty; -+ void *attr; -+}; -+ -+typedef void (*pan_sam_write_cb_t)( -+ void *user_arg1, -+ void *user_arg2, -+ pan_sam_write_res_t *res_p, -+ pan_status_t status); -+ -+/* -+ * from pan_mgr_types.h -+ */ -+#define PAN_MGR_ID_TYPE_SHIFT 56 -+#define PAN_MGR_ID_TYPE_MASK ((pan_mgr_id_t)18374686479671623680ULL) -+#define PAN_MGR_ID_UNIQ_MASK ((pan_mgr_id_t)72057594037927935ULL) -+ -+typedef pan_uint16_t pan_mgr_type_t; -+typedef pan_uint64_t pan_mgr_id_t; -+ -+#define PAN_MGR_SM ((pan_mgr_type_t) 2U) -+#define PAN_MGR_OBSD ((pan_mgr_type_t) 6U) -+ -+/* -+ * from pan_mgr_types_c.h -+ */ -+#define pan_mgr_id_construct_artificial(_mgr_type_, _mgr_uniq_, _mgr_id_p_) { \ -+ pan_mgr_id_t _id1, _id2; \ -+\ -+ _id1 = (_mgr_type_); \ -+ _id1 <<= PAN_MGR_ID_TYPE_SHIFT; \ -+ _id1 &= PAN_MGR_ID_TYPE_MASK; \ -+ _id2 = (_mgr_uniq_); \ -+ _id2 &= PAN_MGR_ID_UNIQ_MASK; \ -+ _id1 |= _id2; \ -+ *(_mgr_id_p_) = _id1; \ -+} -+ -+/* -+ * from pan_storage_c.h -+ */ -+#define pan_stor_is_device_id_an_obsd_id(_device_id_) \ -+ ((((_device_id_) & PAN_MGR_ID_TYPE_MASK) >> PAN_MGR_ID_TYPE_SHIFT) \ -+ == PAN_MGR_OBSD) -+ -+/* -+ * pnfs_shim internal definitions -+ */ -+ -+struct panfs_shim_io_state { -+ struct objlayout_io_state ol_state; -+ -+ pan_sg_entry_t *sg_list; -+ pan_sam_obj_sec_t obj_sec; -+ void *ucreds; -+ union { -+ struct { -+ pan_sam_read_args_t args; -+ pan_sam_read_res_t res; -+ } read; -+ struct { -+ pan_sam_write_args_t args; -+ pan_sam_write_res_t res; -+ } write; -+ } u; -+}; -+ -+#endif /* _PANLAYOUT_PANFS_SHIM_H */ 
-diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c -new file mode 100644 -index 0000000..d05c6be ---- /dev/null -+++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c -@@ -0,0 +1,435 @@ -+/* -+ * pnfs_osd_xdr.c -+ * -+ * Object-Based pNFS Layout XDR layer -+ * -+ * Copyright (C) 2007-2009 Panasas Inc. -+ * All rights reserved. -+ * -+ * Benny Halevy -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 -+ * See the file COPYING included with this distribution for more details. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. Neither the name of the Panasas company nor the names of its -+ * contributors may be used to endorse or promote products derived -+ * from this software without specific prior written permission. -+ * -+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED -+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ */ -+ -+#include -+ -+#define NFSDBG_FACILITY NFSDBG_PNFS_LD -+ -+/* -+ * The following implementation is based on these Internet Drafts: -+ * -+ * draft-ietf-nfsv4-minorversion-21 -+ * draft-ietf-nfsv4-pnfs-obj-12 -+ */ -+ -+/* -+ * struct pnfs_osd_objid { -+ * struct pnfs_deviceid oid_device_id; -+ * u64 oid_partition_id; -+ * u64 oid_object_id; -+ * }; -+ */ -+static inline u32 * -+pnfs_osd_xdr_decode_objid(u32 *p, struct pnfs_osd_objid *objid) -+{ -+ COPYMEM(objid->oid_device_id.data, sizeof(objid->oid_device_id.data)); -+ READ64(objid->oid_partition_id); -+ READ64(objid->oid_object_id); -+ return p; -+} -+ -+static inline u32 * -+pnfs_osd_xdr_decode_opaque_cred(u32 *p, -+ struct pnfs_osd_opaque_cred *opaque_cred) -+{ -+ READ32(opaque_cred->cred_len); -+ COPYMEM(opaque_cred->cred, opaque_cred->cred_len); -+ return p; -+} -+ -+/* -+ * struct pnfs_osd_object_cred { -+ * struct pnfs_osd_objid oc_object_id; -+ * u32 oc_osd_version; -+ * u32 oc_cap_key_sec; -+ * struct pnfs_osd_opaque_cred oc_cap_key -+ * struct pnfs_osd_opaque_cred oc_cap; -+ * }; -+ */ -+static inline u32 * -+pnfs_osd_xdr_decode_object_cred(u32 *p, struct pnfs_osd_object_cred *comp, -+ u8 **credp) -+{ -+ u8 *cred; -+ -+ p = pnfs_osd_xdr_decode_objid(p, &comp->oc_object_id); -+ READ32(comp->oc_osd_version); -+ READ32(comp->oc_cap_key_sec); -+ -+ cred = *credp; -+ comp->oc_cap_key.cred = cred; -+ p = pnfs_osd_xdr_decode_opaque_cred(p, &comp->oc_cap_key); -+ cred = 
(u8 *)((u32 *)cred + XDR_QUADLEN(comp->oc_cap_key.cred_len)); -+ comp->oc_cap.cred = cred; -+ p = pnfs_osd_xdr_decode_opaque_cred(p, &comp->oc_cap); -+ cred = (u8 *)((u32 *)cred + XDR_QUADLEN(comp->oc_cap.cred_len)); -+ *credp = cred; -+ -+ return p; -+} -+ -+/* -+ * struct pnfs_osd_data_map { -+ * u32 odm_num_comps; -+ * u64 odm_stripe_unit; -+ * u32 odm_group_width; -+ * u32 odm_group_depth; -+ * u32 odm_mirror_cnt; -+ * u32 odm_raid_algorithm; -+ * }; -+ */ -+static inline u32 * -+pnfs_osd_xdr_decode_data_map(u32 *p, struct pnfs_osd_data_map *data_map) -+{ -+ READ32(data_map->odm_num_comps); -+ READ64(data_map->odm_stripe_unit); -+ READ32(data_map->odm_group_width); -+ READ32(data_map->odm_group_depth); -+ READ32(data_map->odm_mirror_cnt); -+ READ32(data_map->odm_raid_algorithm); -+ dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u " -+ "odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n", -+ __func__, -+ data_map->odm_num_comps, -+ (unsigned long long)data_map->odm_stripe_unit, -+ data_map->odm_group_width, -+ data_map->odm_group_depth, -+ data_map->odm_mirror_cnt, -+ data_map->odm_raid_algorithm); -+ return p; -+} -+ -+struct pnfs_osd_layout * -+pnfs_osd_xdr_decode_layout(struct pnfs_osd_layout *layout, u32 *p) -+{ -+ int i; -+ u32 *start = p; -+ struct pnfs_osd_object_cred *comp; -+ u8 *cred; -+ -+ p = pnfs_osd_xdr_decode_data_map(p, &layout->olo_map); -+ READ32(layout->olo_comps_index); -+ READ32(layout->olo_num_comps); -+ layout->olo_comps = (struct pnfs_osd_object_cred *)(layout + 1); -+ comp = layout->olo_comps; -+ cred = (u8 *)(comp + layout->olo_num_comps); -+ dprintk("%s: comps_index=%u num_comps=%u\n", -+ __func__, layout->olo_comps_index, layout->olo_num_comps); -+ for (i = 0; i < layout->olo_num_comps; i++) { -+ p = pnfs_osd_xdr_decode_object_cred(p, comp, &cred); -+ dprintk("%s: comp[%d]=dev(%llx:%llx) par=0x%llx obj=0x%llx " -+ "key_len=%u cap_len=%u\n", -+ __func__, i, -+ _DEVID_LO(&comp->oc_object_id.oid_device_id), 
-+ _DEVID_HI(&comp->oc_object_id.oid_device_id), -+ comp->oc_object_id.oid_partition_id, -+ comp->oc_object_id.oid_object_id, -+ comp->oc_cap_key.cred_len, comp->oc_cap.cred_len); -+ comp++; -+ } -+ dprintk("%s: xdr_size=%Zd end=%p in_core_size=%Zd\n", __func__, -+ (char *)p - (char *)start, cred, (char *)cred - (char *)layout); -+ return layout; -+} -+ -+/* -+ * Get Device Information Decoding -+ * -+ * Note: since Device Information is currently done synchronously, most -+ * of the actual fields are left inside the rpc buffer and are only -+ * pointed to by the pnfs_osd_deviceaddr members. So the read buffer -+ * should not be freed while the returned information is in use. -+ */ -+ -+u32 *__xdr_read_calc_nfs4_string( -+ u32 *p, struct nfs4_string *str, u8 **freespace) -+{ -+ u32 len; -+ char *data; -+ bool need_copy; -+ -+ READ32(len); -+ data = (char *)p; -+ -+ if (data[len]) { /* Not null terminated we'll need extra space */ -+ data = *freespace; -+ *freespace += len + 1; -+ need_copy = true; -+ } else { -+ need_copy = false; -+ } -+ -+ if (str) { -+ str->len = len; -+ str->data = data; -+ if (need_copy) { -+ memcpy(data, p, len); -+ data[len] = 0; -+ } -+ } -+ -+ p += XDR_QUADLEN(len); -+ return p; -+} -+ -+u32 *__xdr_read_calc_u8_opaque( -+ u32 *p, struct nfs4_string *str) -+{ -+ u32 len; -+ -+ READ32(len); -+ -+ if (str) { -+ str->len = len; -+ str->data = (char *)p; -+ } -+ -+ p += XDR_QUADLEN(len); -+ return p; -+} -+ -+/* -+ * struct pnfs_osd_targetid { -+ * u32 oti_type; -+ * struct nfs4_string oti_scsi_device_id; -+ * }; -+ */ -+u32 *__xdr_read_calc_targetid( -+ u32 *p, struct pnfs_osd_targetid* targetid, u8 **freespace) -+{ -+ u32 oti_type; -+ -+ READ32(oti_type); -+ if (targetid) -+ targetid->oti_type = oti_type; -+ -+ switch (oti_type) { -+ case OBJ_TARGET_SCSI_NAME: -+ case OBJ_TARGET_SCSI_DEVICE_ID: -+ p = __xdr_read_calc_u8_opaque(p, -+ targetid ? 
&targetid->oti_scsi_device_id : NULL); -+ } -+ -+ return p; -+} -+ -+/* -+ * struct pnfs_osd_net_addr { -+ * struct nfs4_string r_netid; -+ * struct nfs4_string r_addr; -+ * }; -+ */ -+u32 *__xdr_read_calc_net_addr( -+ u32 *p, struct pnfs_osd_net_addr* netaddr, u8 **freespace) -+{ -+ -+ p = __xdr_read_calc_nfs4_string(p, -+ netaddr ? &netaddr->r_netid : NULL, -+ freespace); -+ -+ p = __xdr_read_calc_nfs4_string(p, -+ netaddr ? &netaddr->r_addr : NULL, -+ freespace); -+ -+ return p; -+} -+ -+/* -+ * struct pnfs_osd_targetaddr { -+ * u32 ota_available; -+ * struct pnfs_osd_net_addr ota_netaddr; -+ * }; -+ */ -+u32 *__xdr_read_calc_targetaddr( -+ u32 *p, struct pnfs_osd_targetaddr *targetaddr, u8 **freespace) -+{ -+ u32 ota_available; -+ -+ READ32(ota_available); -+ if (targetaddr) -+ targetaddr->ota_available = ota_available; -+ -+ if (ota_available) { -+ p = __xdr_read_calc_net_addr(p, -+ targetaddr ? &targetaddr->ota_netaddr : NULL, -+ freespace); -+ } -+ -+ return p; -+} -+ -+/* -+ * struct pnfs_osd_deviceaddr { -+ * struct pnfs_osd_targetid oda_targetid; -+ * struct pnfs_osd_targetaddr oda_targetaddr; -+ * u8 oda_lun[8]; -+ * struct nfs4_string oda_systemid; -+ * struct pnfs_osd_object_cred oda_root_obj_cred; -+ * struct nfs4_string oda_osdname; -+ * }; -+ */ -+u32 *__xdr_read_calc_deviceaddr( -+ u32 *p, struct pnfs_osd_deviceaddr *deviceaddr, u8 **freespace) -+{ -+ p = __xdr_read_calc_targetid(p, -+ deviceaddr ? &deviceaddr->oda_targetid : NULL, -+ freespace); -+ -+ p = __xdr_read_calc_targetaddr(p, -+ deviceaddr ? &deviceaddr->oda_targetaddr : NULL, -+ freespace); -+ -+ if (deviceaddr) -+ COPYMEM(deviceaddr->oda_lun, sizeof(deviceaddr->oda_lun)); -+ else -+ p += XDR_QUADLEN(sizeof(deviceaddr->oda_lun)); -+ -+ p = __xdr_read_calc_u8_opaque(p, -+ deviceaddr ? 
&deviceaddr->oda_systemid : NULL); -+ -+ if (deviceaddr) { -+ p = pnfs_osd_xdr_decode_object_cred(p, -+ &deviceaddr->oda_root_obj_cred, freespace); -+ } else { -+ *freespace += pnfs_osd_object_cred_incore_sz(p); -+ p += pnfs_osd_object_cred_xdr_sz(p); -+ } -+ -+ p = __xdr_read_calc_u8_opaque(p, -+ deviceaddr ? &deviceaddr->oda_osdname : NULL); -+ -+ return p; -+} -+ -+size_t pnfs_osd_xdr_deviceaddr_incore_sz(u32 *p) -+{ -+ u8 *null_freespace = NULL; -+ size_t sz; -+ -+ __xdr_read_calc_deviceaddr(p, NULL, &null_freespace); -+ sz = sizeof(struct pnfs_osd_deviceaddr) + (size_t)null_freespace; -+ -+ return sz; -+} -+ -+void pnfs_osd_xdr_decode_deviceaddr( -+ struct pnfs_osd_deviceaddr *deviceaddr, u32 *p) -+{ -+ u8 *freespace = (u8 *)(deviceaddr + 1); -+ -+ __xdr_read_calc_deviceaddr(p, deviceaddr, &freespace); -+} -+ -+/* -+ * struct pnfs_osd_layoutupdate { -+ * u32 dsu_valid; -+ * s64 dsu_delta; -+ * u32 olu_ioerr_flag; -+ * }; -+ */ -+int -+pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr, -+ struct pnfs_osd_layoutupdate *lou) -+{ -+ __be32 *p = xdr_reserve_space(xdr, 16); -+ -+ if (!p) -+ return -E2BIG; -+ -+ *p++ = cpu_to_be32(lou->dsu_valid); -+ if (lou->dsu_valid) -+ p = xdr_encode_hyper(p, lou->dsu_delta); -+ *p++ = cpu_to_be32(lou->olu_ioerr_flag); -+ return 0; -+} -+ -+/* -+ * struct pnfs_osd_objid { -+ * struct pnfs_deviceid oid_device_id; -+ * u64 oid_partition_id; -+ * u64 oid_object_id; -+ */ -+static inline int pnfs_osd_xdr_encode_objid(struct xdr_stream *xdr, -+ struct pnfs_osd_objid *object_id) -+{ -+ __be32 *p; -+ -+ p = xdr_reserve_space(xdr, 32); -+ if (!p) -+ return -E2BIG; -+ -+ p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data, -+ sizeof(object_id->oid_device_id.data)); -+ p = xdr_encode_hyper(p, object_id->oid_partition_id); -+ p = xdr_encode_hyper(p, object_id->oid_object_id); -+ -+ return 0; -+} -+ -+/* -+ * struct pnfs_osd_ioerr { -+ * struct pnfs_osd_objid oer_component; -+ * u64 oer_comp_offset; -+ * u64 oer_comp_length; 
-+ * u32 oer_iswrite; -+ * u32 oer_errno; -+ * }; -+ */ -+int pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr, -+ struct pnfs_osd_ioerr *ioerr) -+{ -+ __be32 *p; -+ int ret; -+ -+ ret = pnfs_osd_xdr_encode_objid(xdr, &ioerr->oer_component); -+ if (ret) -+ return ret; -+ -+ p = xdr_reserve_space(xdr, 24); -+ if (!p) -+ return -E2BIG; -+ -+ p = xdr_encode_hyper(p, ioerr->oer_comp_offset); -+ p = xdr_encode_hyper(p, ioerr->oer_comp_length); -+ *p++ = cpu_to_be32(ioerr->oer_iswrite); -+ *p = cpu_to_be32(ioerr->oer_errno); -+ -+ return 0; -+} -diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c -index 9194902..96e375e 100644 ---- a/fs/nfs/pagelist.c -+++ b/fs/nfs/pagelist.c -@@ -20,6 +20,7 @@ - #include - - #include "internal.h" -+#include "pnfs.h" - - static struct kmem_cache *nfs_page_cachep; - -@@ -56,7 +57,8 @@ nfs_page_free(struct nfs_page *p) - struct nfs_page * - nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, - struct page *page, -- unsigned int offset, unsigned int count) -+ unsigned int offset, unsigned int count, -+ struct pnfs_layout_segment *lseg) - { - struct nfs_page *req; - -@@ -81,6 +83,9 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, - req->wb_context = get_nfs_open_context(ctx); - req->wb_lock_context = nfs_get_lock_context(ctx); - kref_init(&req->wb_kref); -+ req->wb_lseg = lseg; -+ if (lseg) -+ get_lseg(lseg); - return req; - } - -@@ -156,9 +161,12 @@ void nfs_clear_request(struct nfs_page *req) - put_nfs_open_context(ctx); - req->wb_context = NULL; - } -+ if (req->wb_lseg != NULL) { -+ put_lseg(req->wb_lseg); -+ req->wb_lseg = NULL; -+ } - } - -- - /** - * nfs_release_request - Release the count on an NFS read/write request - * @req: request to release -@@ -237,7 +245,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, - * Return 'true' if this is the case, else return 'false'. 
- */ - static int nfs_can_coalesce_requests(struct nfs_page *prev, -- struct nfs_page *req) -+ struct nfs_page *req, -+ struct nfs_pageio_descriptor *pgio) - { - if (req->wb_context->cred != prev->wb_context->cred) - return 0; -@@ -251,6 +260,12 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev, - return 0; - if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) - return 0; -+ if (req->wb_lseg != prev->wb_lseg) -+ return 0; -+#ifdef CONFIG_NFS_V4_1 -+ if (pgio->pg_test && !pgio->pg_test(pgio, prev, req)) -+ return 0; -+#endif /* CONFIG_NFS_V4_1 */ - return 1; - } - -@@ -283,7 +298,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, - if (newlen > desc->pg_bsize) - return 0; - prev = nfs_list_entry(desc->pg_list.prev); -- if (!nfs_can_coalesce_requests(prev, req)) -+ if (!nfs_can_coalesce_requests(prev, req, desc)) - return 0; - } else - desc->pg_base = req->wb_pgbase; -@@ -372,6 +387,7 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) - * @idx_start: lower bound of page->index to scan - * @npages: idx_start + npages sets the upper bound to scan. - * @tag: tag to scan for -+ * @use_pnfs: will be set TRUE if commit needs to be handled by layout driver - * - * Moves elements from one of the inode request lists. 
- * If the number of requests is set to 0, the entire address_space -@@ -381,7 +397,7 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) - */ - int nfs_scan_list(struct nfs_inode *nfsi, - struct list_head *dst, pgoff_t idx_start, -- unsigned int npages, int tag) -+ unsigned int npages, int tag, int *use_pnfs) - { - struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES]; - struct nfs_page *req; -@@ -412,6 +428,8 @@ int nfs_scan_list(struct nfs_inode *nfsi, - radix_tree_tag_clear(&nfsi->nfs_page_tree, - req->wb_index, tag); - nfs_list_add_request(req, dst); -+ if (req->wb_lseg) -+ *use_pnfs = 1; - res++; - if (res == INT_MAX) - goto out; -diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c -new file mode 100644 -index 0000000..dfdf661 ---- /dev/null -+++ b/fs/nfs/pnfs.c -@@ -0,0 +1,1723 @@ -+/* -+ * pNFS functions to call and manage layout drivers. -+ * -+ * Copyright (c) 2002 [year of first publication] -+ * The Regents of the University of Michigan -+ * All Rights Reserved -+ * -+ * Dean Hildebrand -+ * -+ * Permission is granted to use, copy, create derivative works, and -+ * redistribute this software and such derivative works for any purpose, -+ * so long as the name of the University of Michigan is not used in -+ * any advertising or publicity pertaining to the use or distribution -+ * of this software without specific, written prior authorization. If -+ * the above copyright notice or any other identification of the -+ * University of Michigan is included in any copy of any portion of -+ * this software, then the disclaimer below must also be included. -+ * -+ * This software is provided as is, without representation or warranty -+ * of any kind either express or implied, including without limitation -+ * the implied warranties of merchantability, fitness for a particular -+ * purpose, or noninfringement. 
The Regents of the University of -+ * Michigan shall not be liable for any damages, including special, -+ * indirect, incidental, or consequential damages, with respect to any -+ * claim arising out of or in connection with the use of the software, -+ * even if it has been or is hereafter advised of the possibility of -+ * such damages. -+ */ -+ -+#include -+#include "internal.h" -+#include "pnfs.h" -+#include "iostat.h" -+ -+#define NFSDBG_FACILITY NFSDBG_PNFS -+ -+/* Locking: -+ * -+ * pnfs_spinlock: -+ * protects pnfs_modules_tbl. -+ */ -+static DEFINE_SPINLOCK(pnfs_spinlock); -+ -+/* -+ * pnfs_modules_tbl holds all pnfs modules -+ */ -+static LIST_HEAD(pnfs_modules_tbl); -+ -+/* Return the registered pnfs layout driver module matching given id */ -+static struct pnfs_layoutdriver_type * -+find_pnfs_driver_locked(u32 id) -+{ -+ struct pnfs_layoutdriver_type *local; -+ -+ list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid) -+ if (local->id == id) -+ goto out; -+ local = NULL; -+out: -+ dprintk("%s: Searching for id %u, found %p\n", __func__, id, local); -+ return local; -+} -+ -+static struct pnfs_layoutdriver_type * -+find_pnfs_driver(u32 id) -+{ -+ struct pnfs_layoutdriver_type *local; -+ -+ spin_lock(&pnfs_spinlock); -+ local = find_pnfs_driver_locked(id); -+ spin_unlock(&pnfs_spinlock); -+ return local; -+} -+ -+/* Set cred to indicate we require a layoutcommit -+ * If we don't even have a layout, we don't need to commit it. 
-+ */ -+void -+pnfs_need_layoutcommit(struct nfs_inode *nfsi, struct nfs_open_context *ctx) -+{ -+ dprintk("%s: has_layout=%d ctx=%p\n", __func__, has_layout(nfsi), ctx); -+ spin_lock(&nfsi->vfs_inode.i_lock); -+ if (has_layout(nfsi) && -+ !test_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->state)) { -+ nfsi->layout->cred = get_rpccred(ctx->state->owner->so_cred); -+ __set_bit(NFS_LAYOUT_NEED_LCOMMIT, -+ &nfsi->layout->state); -+ nfsi->change_attr++; -+ spin_unlock(&nfsi->vfs_inode.i_lock); -+ dprintk("%s: Set layoutcommit\n", __func__); -+ return; -+ } -+ spin_unlock(&nfsi->vfs_inode.i_lock); -+} -+ -+/* Update last_write_offset for layoutcommit. -+ * TODO: We should only use commited extents, but the current nfs -+ * implementation does not calculate the written range in nfs_commit_done. -+ * We therefore update this field in writeback_done. -+ */ -+void -+pnfs_update_last_write(struct nfs_inode *nfsi, loff_t offset, size_t extent) -+{ -+ loff_t end_pos; -+ -+ spin_lock(&nfsi->vfs_inode.i_lock); -+ if (offset < nfsi->layout->write_begin_pos) -+ nfsi->layout->write_begin_pos = offset; -+ end_pos = offset + extent - 1; /* I'm being inclusive */ -+ if (end_pos > nfsi->layout->write_end_pos) -+ nfsi->layout->write_end_pos = end_pos; -+ dprintk("%s: Wrote %lu@%lu bpos %lu, epos: %lu\n", -+ __func__, -+ (unsigned long) extent, -+ (unsigned long) offset , -+ (unsigned long) nfsi->layout->write_begin_pos, -+ (unsigned long) nfsi->layout->write_end_pos); -+ spin_unlock(&nfsi->vfs_inode.i_lock); -+} -+ -+void -+unset_pnfs_layoutdriver(struct nfs_server *nfss) -+{ -+ if (nfss->pnfs_curr_ld) { -+ nfss->pnfs_curr_ld->uninitialize_mountpoint(nfss); -+ module_put(nfss->pnfs_curr_ld->owner); -+ } -+ nfss->pnfs_curr_ld = NULL; -+} -+ -+/* -+ * Try to set the server's pnfs module to the pnfs layout type specified by id. -+ * Currently only one pNFS layout driver per filesystem is supported. -+ * -+ * @id layout type. Zero (illegal layout type) indicates pNFS not in use. 
-+ */ -+void -+set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh, -+ u32 id) -+{ -+ struct pnfs_layoutdriver_type *ld_type = NULL; -+ -+ if (id == 0) -+ goto out_no_driver; -+ if ((server->nfs_client->rpc_ops->version != 4) || -+ (server->nfs_client->cl_minorversion != 1)) -+ goto out_no_driver; -+ if (!(server->nfs_client->cl_exchange_flags & -+ (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) { -+ printk(KERN_ERR "%s: id %u cl_exchange_flags 0x%x\n", __func__, -+ id, server->nfs_client->cl_exchange_flags); -+ goto out_no_driver; -+ } -+ ld_type = find_pnfs_driver(id); -+ if (!ld_type) { -+ request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id); -+ ld_type = find_pnfs_driver(id); -+ if (!ld_type) { -+ dprintk("%s: No pNFS module found for %u.\n", -+ __func__, id); -+ goto out_no_driver; -+ } -+ } -+ if (!try_module_get(ld_type->owner)) { -+ dprintk("%s: Could not grab reference on module\n", __func__); -+ goto out_no_driver; -+ } -+ server->pnfs_curr_ld = ld_type; -+ if (ld_type->initialize_mountpoint(server, mntfh)) { -+ printk(KERN_ERR -+ "%s: Error initializing mount point for layout driver %u.\n", -+ __func__, id); -+ module_put(ld_type->owner); -+ goto out_no_driver; -+ } -+ dprintk("%s: pNFS module for %u set\n", __func__, id); -+ return; -+ -+out_no_driver: -+ dprintk("%s: Using NFSv4 I/O\n", __func__); -+ server->pnfs_curr_ld = NULL; -+} -+ -+int -+pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type) -+{ -+ int status = -EINVAL; -+ struct pnfs_layoutdriver_type *tmp; -+ -+ if (ld_type->id == 0) { -+ printk(KERN_ERR "%s id 0 is reserved\n", __func__); -+ return status; -+ } -+ if (!ld_type->alloc_lseg || !ld_type->free_lseg) { -+ printk(KERN_ERR "%s Layout driver must provide " -+ "alloc_lseg and free_lseg.\n", __func__); -+ return status; -+ } -+ -+ if (!ld_type->read_pagelist || !ld_type->write_pagelist || -+ !ld_type->commit) { -+ printk(KERN_ERR "%s Layout driver must provide " -+ "read_pagelist, 
write_pagelist, and commit.\n", -+ __func__); -+ return status; -+ } -+ -+ spin_lock(&pnfs_spinlock); -+ tmp = find_pnfs_driver_locked(ld_type->id); -+ if (!tmp) { -+ list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl); -+ status = 0; -+ dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id, -+ ld_type->name); -+ } else { -+ printk(KERN_ERR "%s Module with id %d already loaded!\n", -+ __func__, ld_type->id); -+ } -+ spin_unlock(&pnfs_spinlock); -+ -+ return status; -+} -+EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver); -+ -+void -+pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type) -+{ -+ dprintk("%s Deregistering id:%u\n", __func__, ld_type->id); -+ spin_lock(&pnfs_spinlock); -+ list_del(&ld_type->pnfs_tblid); -+ spin_unlock(&pnfs_spinlock); -+} -+EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver); -+ -+/* -+ * pNFS client layout cache -+ */ -+ -+static struct pnfs_layout_hdr * -+pnfs_alloc_layout_hdr(struct inode *ino) -+{ -+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; -+ return ld->alloc_layout_hdr ? ld->alloc_layout_hdr(ino) : -+ kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL); -+} -+ -+static void -+pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo) -+{ -+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->inode)->pnfs_curr_ld; -+ return ld->alloc_layout_hdr ? 
ld->free_layout_hdr(lo) : kfree(lo); -+} -+ -+static void -+get_layout_hdr_locked(struct pnfs_layout_hdr *lo) -+{ -+ assert_spin_locked(&lo->inode->i_lock); -+ lo->refcount++; -+} -+ -+static void -+put_layout_hdr_locked(struct pnfs_layout_hdr *lo) -+{ -+ assert_spin_locked(&lo->inode->i_lock); -+ BUG_ON(lo->refcount == 0); -+ -+ lo->refcount--; -+ if (!lo->refcount) { -+ dprintk("%s: freeing layout cache %p\n", __func__, lo); -+ BUG_ON(!list_empty(&lo->layouts)); -+ NFS_I(lo->inode)->layout = NULL; -+ pnfs_free_layout_hdr(lo); -+ } -+} -+ -+void -+put_layout_hdr(struct inode *inode) -+{ -+ spin_lock(&inode->i_lock); -+ put_layout_hdr_locked(NFS_I(inode)->layout); -+ spin_unlock(&inode->i_lock); -+ -+} -+ -+static void -+init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) -+{ -+ INIT_LIST_HEAD(&lseg->fi_list); -+ kref_init(&lseg->kref); -+ lseg->valid = true; -+ lseg->layout = lo; -+} -+ -+/* Called without i_lock held */ -+static void -+destroy_lseg(struct kref *kref) -+{ -+ struct pnfs_layout_segment *lseg = -+ container_of(kref, struct pnfs_layout_segment, kref); -+ struct pnfs_layout_hdr *local = lseg->layout; -+ -+ dprintk("--> %s\n", __func__); -+ NFS_SERVER(local->inode)->pnfs_curr_ld->free_lseg(lseg); -+ /* Matched by get_layout_hdr_locked in pnfs_insert_layout */ -+ put_layout_hdr(local->inode); -+} -+ -+void -+put_lseg(struct pnfs_layout_segment *lseg) -+{ -+ bool do_wake_up; -+ struct nfs_inode *nfsi; -+ -+ if (!lseg) -+ return; -+ -+ dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, -+ atomic_read(&lseg->kref.refcount), lseg->valid); -+ do_wake_up = !lseg->valid; -+ nfsi = NFS_I(lseg->layout->inode); -+ kref_put(&lseg->kref, destroy_lseg); -+ if (do_wake_up) -+ rpc_wake_up(&nfsi->lo_rpcwaitq); -+} -+EXPORT_SYMBOL_GPL(put_lseg); -+ -+void get_lseg(struct pnfs_layout_segment *lseg) -+{ -+ kref_get(&lseg->kref); -+} -+EXPORT_SYMBOL_GPL(get_lseg); -+ -+static inline u64 -+end_offset(u64 start, u64 len) -+{ -+ u64 end; -+ -+ end = 
start + len; -+ return end >= start ? end: NFS4_MAX_UINT64; -+} -+ -+/* last octet in a range */ -+static inline u64 -+last_byte_offset(u64 start, u64 len) -+{ -+ u64 end; -+ -+ BUG_ON(!len); -+ end = start + len; -+ return end > start ? end - 1: NFS4_MAX_UINT64; -+} -+ -+/* -+ * is l2 fully contained in l1? -+ * start1 end1 -+ * [----------------------------------) -+ * start2 end2 -+ * [----------------) -+ */ -+static inline int -+lo_seg_contained(struct pnfs_layout_range *l1, -+ struct pnfs_layout_range *l2) -+{ -+ u64 start1 = l1->offset; -+ u64 end1 = end_offset(start1, l1->length); -+ u64 start2 = l2->offset; -+ u64 end2 = end_offset(start2, l2->length); -+ -+ return (start1 <= start2) && (end1 >= end2); -+} -+ -+/* -+ * is l1 and l2 intersecting? -+ * start1 end1 -+ * [----------------------------------) -+ * start2 end2 -+ * [----------------) -+ */ -+static inline int -+lo_seg_intersecting(struct pnfs_layout_range *l1, -+ struct pnfs_layout_range *l2) -+{ -+ u64 start1 = l1->offset; -+ u64 end1 = end_offset(start1, l1->length); -+ u64 start2 = l2->offset; -+ u64 end2 = end_offset(start2, l2->length); -+ -+ return (end1 == NFS4_MAX_UINT64 || end1 > start2) && -+ (end2 == NFS4_MAX_UINT64 || end2 > start1); -+} -+ -+/* -+ * iomode matching rules: -+ * range lseg match -+ * ----- ----- ----- -+ * ANY READ true -+ * ANY RW true -+ * RW READ false -+ * RW RW true -+ * READ READ true -+ * READ RW false -+ */ -+static int -+should_free_lseg(struct pnfs_layout_segment *lseg, -+ struct pnfs_layout_range *range) -+{ -+ return (range->iomode == IOMODE_ANY || -+ lseg->range.iomode == range->iomode) && -+ lo_seg_intersecting(&lseg->range, range); -+} -+ -+static bool -+_pnfs_can_return_lseg(struct pnfs_layout_segment *lseg) -+{ -+ return atomic_read(&lseg->kref.refcount) == 1; -+} -+ -+static void -+pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, -+ struct pnfs_layout_range *range) -+{ -+ struct pnfs_layout_segment *lseg, *next; -+ -+ 
dprintk("%s:Begin lo %p offset %llu length %llu iomode %d\n", -+ __func__, lo, range->offset, range->length, range->iomode); -+ -+ assert_spin_locked(&lo->inode->i_lock); -+ list_for_each_entry_safe(lseg, next, &lo->segs, fi_list) { -+ if (!should_free_lseg(lseg, range) || -+ !_pnfs_can_return_lseg(lseg)) -+ continue; -+ dprintk("%s: freeing lseg %p iomode %d " -+ "offset %llu length %llu\n", __func__, -+ lseg, lseg->range.iomode, lseg->range.offset, -+ lseg->range.length); -+ list_move(&lseg->fi_list, tmp_list); -+ } -+ if (list_empty(&lo->segs)) { -+ struct nfs_client *clp; -+ -+ clp = NFS_SERVER(lo->inode)->nfs_client; -+ spin_lock(&clp->cl_lock); -+ /* List does not take a reference, so no need for put here */ -+ list_del_init(&lo->layouts); -+ spin_unlock(&clp->cl_lock); -+ pnfs_invalidate_layout_stateid(lo); -+ } -+ -+ dprintk("%s:Return\n", __func__); -+} -+ -+static void -+pnfs_free_lseg_list(struct list_head *tmp_list) -+{ -+ struct pnfs_layout_segment *lseg; -+ -+ while (!list_empty(tmp_list)) { -+ lseg = list_entry(tmp_list->next, struct pnfs_layout_segment, -+ fi_list); -+ dprintk("%s calling put_lseg on %p\n", __func__, lseg); -+ list_del(&lseg->fi_list); -+ put_lseg(lseg); -+ } -+} -+ -+void -+pnfs_destroy_layout(struct nfs_inode *nfsi) -+{ -+ struct pnfs_layout_hdr *lo; -+ struct pnfs_layout_range range = { -+ .iomode = IOMODE_ANY, -+ .offset = 0, -+ .length = NFS4_MAX_UINT64, -+ }; -+ LIST_HEAD(tmp_list); -+ -+ spin_lock(&nfsi->vfs_inode.i_lock); -+ lo = nfsi->layout; -+ if (lo) { -+ pnfs_clear_lseg_list(lo, &tmp_list, &range); -+ WARN_ON(!list_empty(&nfsi->layout->segs)); -+ WARN_ON(!list_empty(&nfsi->layout->layouts)); -+ WARN_ON(nfsi->layout->refcount != 1); -+ -+ /* Matched by refcount set to 1 in alloc_init_layout_hdr */ -+ put_layout_hdr_locked(lo); -+ } -+ spin_unlock(&nfsi->vfs_inode.i_lock); -+ pnfs_free_lseg_list(&tmp_list); -+} -+ -+/* -+ * Called by the state manger to remove all layouts established under an -+ * expired lease. 
-+ */ -+void -+pnfs_destroy_all_layouts(struct nfs_client *clp) -+{ -+ struct pnfs_layout_hdr *lo; -+ LIST_HEAD(tmp_list); -+ -+ spin_lock(&clp->cl_lock); -+ list_splice_init(&clp->cl_layouts, &tmp_list); -+ spin_unlock(&clp->cl_lock); -+ -+ while (!list_empty(&tmp_list)) { -+ lo = list_entry(tmp_list.next, struct pnfs_layout_hdr, -+ layouts); -+ dprintk("%s freeing layout for inode %lu\n", __func__, -+ lo->inode->i_ino); -+ pnfs_destroy_layout(NFS_I(lo->inode)); -+ } -+} -+ -+/* update lo->stateid with new if is more recent -+ * -+ * lo->stateid could be the open stateid, in which case we just use what given. -+ */ -+static void -+pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, -+ const nfs4_stateid *new) -+{ -+ nfs4_stateid *old = &lo->stateid; -+ bool overwrite = false; -+ -+ write_seqlock(&lo->seqlock); -+ if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state) || -+ memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other))) -+ overwrite = true; -+ else { -+ u32 oldseq, newseq; -+ -+ oldseq = be32_to_cpu(old->stateid.seqid); -+ newseq = be32_to_cpu(new->stateid.seqid); -+ if ((int)(newseq - oldseq) > 0) -+ overwrite = true; -+ } -+ if (overwrite) -+ memcpy(&old->stateid, &new->stateid, sizeof(new->stateid)); -+ write_sequnlock(&lo->seqlock); -+} -+ -+static void -+pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo, -+ struct nfs4_state *state) -+{ -+ int seq; -+ -+ dprintk("--> %s\n", __func__); -+ write_seqlock(&lo->seqlock); -+ do { -+ seq = read_seqbegin(&state->seqlock); -+ memcpy(lo->stateid.data, state->stateid.data, -+ sizeof(state->stateid.data)); -+ } while (read_seqretry(&state->seqlock, seq)); -+ set_bit(NFS_LAYOUT_STATEID_SET, &lo->state); -+ write_sequnlock(&lo->seqlock); -+ dprintk("<-- %s\n", __func__); -+} -+ -+void -+pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, -+ struct nfs4_state *open_state) -+{ -+ int seq; -+ -+ dprintk("--> %s\n", __func__); -+ do { -+ seq = read_seqbegin(&lo->seqlock); -+ if 
(!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state)) { -+ /* This will trigger retry of the read */ -+ pnfs_layout_from_open_stateid(lo, open_state); -+ } else -+ memcpy(dst->data, lo->stateid.data, -+ sizeof(lo->stateid.data)); -+ } while (read_seqretry(&lo->seqlock, seq)); -+ dprintk("<-- %s\n", __func__); -+} -+ -+/* -+* Get layout from server. -+* for now, assume that whole file layouts are requested. -+* arg->offset: 0 -+* arg->length: all ones -+*/ -+static struct pnfs_layout_segment * -+send_layoutget(struct pnfs_layout_hdr *lo, -+ struct nfs_open_context *ctx, -+ struct pnfs_layout_range *range) -+{ -+ struct inode *ino = lo->inode; -+ struct nfs_server *server = NFS_SERVER(ino); -+ struct nfs4_layoutget *lgp; -+ struct pnfs_layout_segment *lseg = NULL; -+ -+ dprintk("--> %s\n", __func__); -+ -+ BUG_ON(ctx == NULL); -+ lgp = kzalloc(sizeof(*lgp), GFP_KERNEL); -+ if (lgp == NULL) { -+ put_layout_hdr(ino); -+ return NULL; -+ } -+ lgp->args.minlength = PAGE_CACHE_SIZE; -+ if (lgp->args.minlength > range->length) -+ lgp->args.minlength = range->length; -+ lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; -+ lgp->args.range = *range; -+ lgp->args.type = server->pnfs_curr_ld->id; -+ lgp->args.inode = ino; -+ lgp->args.ctx = get_nfs_open_context(ctx); -+ lgp->lsegpp = &lseg; -+ -+ /* Synchronously retrieve layout information from server and -+ * store in lseg. 
-+ */ -+ nfs4_proc_layoutget(lgp); -+ if (!lseg) { -+ /* remember that LAYOUTGET failed and suspend trying */ -+ set_bit(lo_fail_bit(range->iomode), &lo->state); -+ } -+ return lseg; -+} -+ -+static struct pnfs_layout_segment * -+has_layout_to_return(struct pnfs_layout_hdr *lo, -+ struct pnfs_layout_range *range) -+{ -+ struct pnfs_layout_segment *out = NULL, *lseg; -+ dprintk("%s:Begin lo %p offset %llu length %llu iomode %d\n", -+ __func__, lo, range->offset, range->length, range->iomode); -+ -+ assert_spin_locked(&lo->inode->i_lock); -+ list_for_each_entry(lseg, &lo->segs, fi_list) -+ if (should_free_lseg(lseg, range)) { -+ out = lseg; -+ break; -+ } -+ -+ dprintk("%s:Return lseg=%p\n", __func__, out); -+ return out; -+} -+ -+bool -+pnfs_return_layout_barrier(struct nfs_inode *nfsi, -+ struct pnfs_layout_range *range) -+{ -+ struct pnfs_layout_segment *lseg; -+ bool ret = false; -+ -+ spin_lock(&nfsi->vfs_inode.i_lock); -+ list_for_each_entry(lseg, &nfsi->layout->segs, fi_list) { -+ if (!should_free_lseg(lseg, range)) -+ continue; -+ lseg->valid = false; -+ if (!_pnfs_can_return_lseg(lseg)) { -+ dprintk("%s: wait on lseg %p refcount %d\n", -+ __func__, lseg, -+ atomic_read(&lseg->kref.refcount)); -+ ret = true; -+ } -+ } -+ spin_unlock(&nfsi->vfs_inode.i_lock); -+ dprintk("%s:Return %d\n", __func__, ret); -+ return ret; -+} -+ -+void -+pnfs_layoutreturn_release(struct nfs4_layoutreturn *lrp) -+{ -+ struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout; -+ LIST_HEAD(tmp_list); -+ -+ if (lrp->args.return_type != RETURN_FILE) -+ return; -+ spin_lock(&lrp->args.inode->i_lock); -+ pnfs_clear_lseg_list(lo, &tmp_list, &lrp->args.range); -+ if (!lrp->res.valid) -+ ; /* forgetful model internal release */ -+ else if (!lrp->res.lrs_present) -+ pnfs_invalidate_layout_stateid(lo); -+ else -+ pnfs_set_layout_stateid(lo, &lrp->res.stateid); -+ put_layout_hdr_locked(lo); /* Matched in _pnfs_return_layout */ -+ spin_unlock(&lrp->args.inode->i_lock); -+ 
pnfs_free_lseg_list(&tmp_list); -+} -+ -+static int -+return_layout(struct inode *ino, struct pnfs_layout_range *range, -+ enum pnfs_layoutreturn_type type, struct pnfs_layout_hdr *lo, -+ bool wait, const nfs4_stateid *stateid) -+{ -+ struct nfs4_layoutreturn *lrp; -+ struct nfs_server *server = NFS_SERVER(ino); -+ int status = -ENOMEM; -+ -+ dprintk("--> %s\n", __func__); -+ -+ BUG_ON(type != RETURN_FILE); -+ -+ lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); -+ if (lrp == NULL) { -+ if (lo && (type == RETURN_FILE)) -+ put_layout_hdr(lo->inode); -+ goto out; -+ } -+ lrp->args.reclaim = 0; -+ lrp->args.layout_type = server->pnfs_curr_ld->id; -+ lrp->args.return_type = type; -+ lrp->args.range = *range; -+ lrp->args.inode = ino; -+ lrp->stateid = stateid; -+ -+ status = nfs4_proc_layoutreturn(lrp, wait); -+out: -+ dprintk("<-- %s status: %d\n", __func__, status); -+ return status; -+} -+ -+int -+_pnfs_return_layout(struct inode *ino, struct pnfs_layout_range *range, -+ const nfs4_stateid *stateid, /* optional */ -+ enum pnfs_layoutreturn_type type, -+ bool wait) -+{ -+ struct pnfs_layout_hdr *lo = NULL; -+ struct nfs_inode *nfsi = NFS_I(ino); -+ struct pnfs_layout_range arg; -+ int status = 0; -+ -+ dprintk("--> %s type %d\n", __func__, type); -+ -+ -+ arg.iomode = range ? 
range->iomode : IOMODE_ANY; -+ arg.offset = 0; -+ arg.length = NFS4_MAX_UINT64; -+ -+ if (type == RETURN_FILE) { -+ spin_lock(&ino->i_lock); -+ lo = nfsi->layout; -+ if (lo && !has_layout_to_return(lo, &arg)) -+ lo = NULL; -+ if (!lo) { -+ spin_unlock(&ino->i_lock); -+ dprintk("%s: no layout segments to return\n", __func__); -+ goto out; -+ } -+ -+ /* Reference matched in pnfs_layoutreturn_release */ -+ get_layout_hdr_locked(lo); -+ -+ spin_unlock(&ino->i_lock); -+ -+ if (layoutcommit_needed(nfsi)) { -+ if (stateid && !wait) { /* callback */ -+ dprintk("%s: layoutcommit pending\n", __func__); -+ status = -EAGAIN; -+ goto out_put; -+ } -+ status = pnfs_layoutcommit_inode(ino, wait); -+ if (status) { -+ /* Return layout even if layoutcommit fails */ -+ dprintk("%s: layoutcommit failed, status=%d. " -+ "Returning layout anyway\n", -+ __func__, status); -+ } -+ } -+ status = return_layout(ino, &arg, type, lo, wait, stateid); -+ } -+out: -+ dprintk("<-- %s status: %d\n", __func__, status); -+ return status; -+out_put: -+ put_layout_hdr(ino); -+ goto out; -+} -+ -+/* -+ * Compare two layout segments for sorting into layout cache. -+ * We want to preferentially return RW over RO layouts, so ensure those -+ * are seen first. 
-+ */ -+static s64 -+cmp_layout(struct pnfs_layout_range *l1, -+ struct pnfs_layout_range *l2) -+{ -+ s64 d; -+ -+ /* higher offset > lower offset */ -+ d = l1->offset - l2->offset; -+ if (d) -+ return d; -+ -+ /* longer length > shorter length */ -+ d = l1->length - l2->length; -+ if (d) -+ return d; -+ -+ /* read > read/write */ -+ return (int)(l2->iomode == IOMODE_READ) - -+ (int)(l1->iomode == IOMODE_READ); -+} -+ -+static void -+pnfs_insert_layout(struct pnfs_layout_hdr *lo, -+ struct pnfs_layout_segment *lseg) -+{ -+ struct pnfs_layout_segment *lp; -+ int found = 0; -+ -+ dprintk("%s:Begin\n", __func__); -+ -+ assert_spin_locked(&lo->inode->i_lock); -+ if (list_empty(&lo->segs)) { -+ struct nfs_client *clp = NFS_SERVER(lo->inode)->nfs_client; -+ -+ spin_lock(&clp->cl_lock); -+ BUG_ON(!list_empty(&lo->layouts)); -+ list_add_tail(&lo->layouts, &clp->cl_layouts); -+ spin_unlock(&clp->cl_lock); -+ } -+ list_for_each_entry(lp, &lo->segs, fi_list) { -+ if (cmp_layout(&lp->range, &lseg->range) > 0) -+ continue; -+ list_add_tail(&lseg->fi_list, &lp->fi_list); -+ dprintk("%s: inserted lseg %p " -+ "iomode %d offset %llu length %llu before " -+ "lp %p iomode %d offset %llu length %llu\n", -+ __func__, lseg, lseg->range.iomode, -+ lseg->range.offset, lseg->range.length, -+ lp, lp->range.iomode, lp->range.offset, -+ lp->range.length); -+ found = 1; -+ break; -+ } -+ if (!found) { -+ list_add_tail(&lseg->fi_list, &lo->segs); -+ dprintk("%s: inserted lseg %p " -+ "iomode %d offset %llu length %llu at tail\n", -+ __func__, lseg, lseg->range.iomode, -+ lseg->range.offset, lseg->range.length); -+ } -+ get_layout_hdr_locked(lo); -+ -+ dprintk("%s:Return\n", __func__); -+} -+ -+static struct pnfs_layout_hdr * -+alloc_init_layout_hdr(struct inode *ino) -+{ -+ struct pnfs_layout_hdr *lo; -+ -+ lo = pnfs_alloc_layout_hdr(ino); -+ if (!lo) -+ return NULL; -+ lo->refcount = 1; -+ INIT_LIST_HEAD(&lo->layouts); -+ INIT_LIST_HEAD(&lo->segs); -+ seqlock_init(&lo->seqlock); -+ lo->inode 
= ino; -+ return lo; -+} -+ -+static struct pnfs_layout_hdr * -+pnfs_find_alloc_layout(struct inode *ino) -+{ -+ struct nfs_inode *nfsi = NFS_I(ino); -+ struct pnfs_layout_hdr *new = NULL; -+ -+ dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout); -+ -+ assert_spin_locked(&ino->i_lock); -+ if (nfsi->layout) -+ return nfsi->layout; -+ -+ spin_unlock(&ino->i_lock); -+ new = alloc_init_layout_hdr(ino); -+ spin_lock(&ino->i_lock); -+ -+ if (likely(nfsi->layout == NULL)) /* Won the race? */ -+ nfsi->layout = new; -+ else -+ pnfs_free_layout_hdr(new); -+ return nfsi->layout; -+} -+ -+/* -+ * iomode matching rules: -+ * range lseg match -+ * ----- ----- ----- -+ * ANY READ true -+ * ANY RW true -+ * RW READ false -+ * RW RW true -+ * READ READ true -+ * READ RW true -+ */ -+static int -+is_matching_lseg(struct pnfs_layout_segment *lseg, -+ struct pnfs_layout_range *range) -+{ -+ struct pnfs_layout_range range1; -+ -+ if ((range->iomode == IOMODE_RW && lseg->range.iomode != IOMODE_RW) || -+ !lo_seg_intersecting(&lseg->range, range)) -+ return 0; -+ -+ /* range1 covers only the first byte in the range */ -+ range1 = *range; -+ range1.length = 1; -+ return lo_seg_contained(&lseg->range, &range1); -+} -+ -+/* -+ * lookup range in layout -+ */ -+struct pnfs_layout_segment * -+pnfs_has_layout(struct pnfs_layout_hdr *lo, -+ struct pnfs_layout_range *range) -+{ -+ struct pnfs_layout_segment *lseg, *ret = NULL; -+ -+ dprintk("%s:Begin\n", __func__); -+ -+ assert_spin_locked(&lo->inode->i_lock); -+ list_for_each_entry(lseg, &lo->segs, fi_list) { -+ if (is_matching_lseg(lseg, range)) { -+ ret = lseg; -+ break; -+ } -+ if (cmp_layout(range, &lseg->range) > 0) -+ break; -+ } -+ -+ dprintk("%s:Return lseg %p ref %d valid %d\n", -+ __func__, ret, ret ? atomic_read(&ret->kref.refcount) : 0, -+ ret ? ret->valid : 0); -+ return ret; -+} -+ -+/* -+ * Layout segment is retreived from the server if not cached. 
-+ * The appropriate layout segment is referenced and returned to the caller. -+ */ -+struct pnfs_layout_segment * -+pnfs_update_layout(struct inode *ino, -+ struct nfs_open_context *ctx, -+ loff_t pos, -+ u64 count, -+ enum pnfs_iomode iomode) -+{ -+ struct pnfs_layout_range arg = { -+ .iomode = iomode, -+ .offset = pos, -+ .length = count, -+ }; -+ struct nfs_inode *nfsi = NFS_I(ino); -+ struct pnfs_layout_hdr *lo; -+ struct pnfs_layout_segment *lseg = NULL; -+ -+ if (!pnfs_enabled_sb(NFS_SERVER(ino))) -+ return NULL; -+ spin_lock(&ino->i_lock); -+ lo = pnfs_find_alloc_layout(ino); -+ if (lo == NULL) { -+ dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__); -+ goto out_unlock; -+ } -+ -+ /* Check to see if the layout for the given range already exists */ -+ lseg = pnfs_has_layout(lo, &arg); -+ if (lseg) { -+ if (lseg->valid) { -+ dprintk("%s: Using cached lseg %p for %llu@%llu " -+ "iomode %d)\n", -+ __func__, -+ lseg, -+ arg.length, -+ arg.offset, -+ arg.iomode); -+ get_lseg(lseg); -+ goto out_unlock; -+ } -+ /* someone is cleaning the layout */ -+ lseg = NULL; -+ } -+ -+ /* if LAYOUTGET already failed once we don't try again */ -+ if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state)) -+ goto out_unlock; -+ -+ get_layout_hdr_locked(lo); /* Matched in pnfs_layoutget_release */ -+ spin_unlock(&ino->i_lock); -+ -+ lseg = send_layoutget(lo, ctx, &arg); -+out: -+ dprintk("%s end, state 0x%lx lseg %p\n", __func__, -+ nfsi->layout->state, lseg); -+ return lseg; -+out_unlock: -+ spin_unlock(&ino->i_lock); -+ goto out; -+} -+ -+int -+pnfs_layout_process(struct nfs4_layoutget *lgp) -+{ -+ struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout; -+ struct nfs4_layoutget_res *res = &lgp->res; -+ struct pnfs_layout_segment *lseg; -+ struct inode *ino = lo->inode; -+ int status = 0; -+ -+ /* Inject layout blob into I/O device driver */ -+ lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res); -+ if (!lseg || IS_ERR(lseg)) { -+ if (!lseg) -+ status = -ENOMEM; 
-+ else -+ status = PTR_ERR(lseg); -+ dprintk("%s: Could not allocate layout: error %d\n", -+ __func__, status); -+ goto out; -+ } -+ -+ spin_lock(&ino->i_lock); -+ init_lseg(lo, lseg); -+ lseg->range = res->range; -+ get_lseg(lseg); -+ *lgp->lsegpp = lseg; -+ pnfs_insert_layout(lo, lseg); -+ -+ if (res->return_on_close) { -+ /* FI: This needs to be re-examined. At lo level, -+ * all it needs is a bit indicating whether any of -+ * the lsegs in the list have the flags set. -+ */ -+ lo->roc_iomode |= res->range.iomode; -+ } -+ -+ /* Done processing layoutget. Set the layout stateid */ -+ pnfs_set_layout_stateid(lo, &res->stateid); -+ spin_unlock(&ino->i_lock); -+out: -+ return status; -+} -+ -+void -+readahead_range(struct inode *inode, struct list_head *pages, loff_t *offset, -+ size_t *count) -+{ -+ struct page *first, *last; -+ loff_t foff, i_size = i_size_read(inode); -+ pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; -+ size_t range; -+ -+ first = list_entry((pages)->prev, struct page, lru); -+ last = list_entry((pages)->next, struct page, lru); -+ -+ foff = (loff_t)first->index << PAGE_CACHE_SHIFT; -+ -+ range = (last->index - first->index) * PAGE_CACHE_SIZE; -+ if (last->index == end_index) -+ range += ((i_size - 1) & ~PAGE_CACHE_MASK) + 1; -+ else -+ range += PAGE_CACHE_SIZE; -+ dprintk("%s foff %lu, range %Zu\n", __func__, (unsigned long)foff, -+ range); -+ *offset = foff; -+ *count = range; -+} -+ -+void -+pnfs_set_pg_test(struct inode *inode, struct nfs_pageio_descriptor *pgio) -+{ -+ struct pnfs_layout_hdr *lo; -+ struct pnfs_layoutdriver_type *ld; -+ -+ pgio->pg_test = NULL; -+ -+ lo = NFS_I(inode)->layout; -+ ld = NFS_SERVER(inode)->pnfs_curr_ld; -+ if (!ld || !lo) -+ return; -+ -+ pgio->pg_test = ld->pg_test; -+} -+ -+/* -+ * rsize is already set by caller to MDS rsize. 
-+ */ -+void -+pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, -+ struct inode *inode, -+ struct nfs_open_context *ctx, -+ struct list_head *pages, -+ size_t *rsize) -+{ -+ struct nfs_server *nfss = NFS_SERVER(inode); -+ size_t count = 0; -+ loff_t loff; -+ -+ pgio->pg_iswrite = 0; -+ pgio->pg_test = NULL; -+ pgio->pg_lseg = NULL; -+ -+ if (!pnfs_enabled_sb(nfss)) -+ return; -+ -+ readahead_range(inode, pages, &loff, &count); -+ pgio->pg_lseg = pnfs_update_layout(inode, ctx, loff, count, IOMODE_READ); -+ if (pgio->pg_lseg) { -+ pnfs_set_pg_test(inode, pgio); -+ *rsize = NFS_SERVER(inode)->ds_rsize; -+ } -+} -+ -+void -+pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, -+ size_t *wsize) -+{ -+ struct nfs_server *server = NFS_SERVER(inode); -+ -+ pgio->pg_iswrite = 1; -+ if (!pnfs_enabled_sb(server)) -+ pgio->pg_test = NULL; -+ else { -+ pnfs_set_pg_test(inode, pgio); -+ *wsize = server->ds_wsize; -+ } -+} -+ -+/* Set buffer size for data servers */ -+void -+pnfs_set_ds_iosize(struct nfs_server *server) -+{ -+ unsigned dssize = 0; -+ -+ if (server->pnfs_curr_ld && server->pnfs_curr_ld->get_blocksize) -+ dssize = server->pnfs_curr_ld->get_blocksize(); -+ if (dssize) -+ server->ds_rsize = server->ds_wsize = -+ nfs_block_size(dssize, NULL); -+ else { -+ server->ds_wsize = server->wsize; -+ server->ds_rsize = server->rsize; -+ } -+} -+ -+static int -+pnfs_call_done(struct pnfs_call_data *pdata, struct rpc_task *task, void *data) -+{ -+ put_lseg(pdata->lseg); -+ pdata->lseg = NULL; -+ pdata->call_ops->rpc_call_done(task, data); -+ if (pdata->pnfs_error == -EAGAIN || task->tk_status == -EAGAIN) -+ return -EAGAIN; -+ if (pdata->pnfsflags & PNFS_NO_RPC) { -+ pdata->call_ops->rpc_release(data); -+ } else { -+ /* -+ * just restore original rpc call ops -+ * rpc_release will be called later by the rpc scheduling layer. 
-+ */ -+ task->tk_ops = pdata->call_ops; -+ } -+ return 0; -+} -+ -+/* Post-write completion function -+ * Invoked by all layout drivers when write_pagelist is done. -+ * -+ * NOTE: callers set data->pnfsflags PNFS_NO_RPC -+ * so that the NFS cleanup routines perform only the page cache -+ * cleanup. -+ */ -+static void -+pnfs_write_retry(struct work_struct *work) -+{ -+ struct rpc_task *task; -+ struct nfs_write_data *wdata; -+ struct pnfs_layout_range range; -+ -+ dprintk("%s enter\n", __func__); -+ task = container_of(work, struct rpc_task, u.tk_work); -+ wdata = container_of(task, struct nfs_write_data, task); -+ range.iomode = IOMODE_RW; -+ range.offset = wdata->args.offset; -+ range.length = wdata->args.count; -+ _pnfs_return_layout(wdata->inode, &range, NULL, RETURN_FILE, true); -+ pnfs_initiate_write(wdata, NFS_CLIENT(wdata->inode), -+ wdata->pdata.call_ops, wdata->pdata.how); -+} -+ -+void -+pnfs_writeback_done(struct nfs_write_data *data) -+{ -+ struct pnfs_call_data *pdata = &data->pdata; -+ -+ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status); -+ -+ /* update last write offset and need layout commit -+ * for non-files layout types (files layout calls -+ * pnfs4_write_done for this) -+ */ -+ if ((pdata->pnfsflags & PNFS_NO_RPC) && -+ data->task.tk_status >= 0 && data->res.count > 0) { -+ struct nfs_inode *nfsi = NFS_I(data->inode); -+ -+ pnfs_update_last_write(nfsi, data->args.offset, data->res.count); -+ pnfs_need_layoutcommit(nfsi, data->args.context); -+ } -+ -+ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) { -+ INIT_WORK(&data->task.u.tk_work, pnfs_write_retry); -+ queue_work(nfsiod_workqueue, &data->task.u.tk_work); -+ } -+} -+EXPORT_SYMBOL_GPL(pnfs_writeback_done); -+ -+static void _pnfs_clear_lseg_from_pages(struct list_head *head) -+{ -+ struct nfs_page *req; -+ -+ list_for_each_entry(req, head, wb_list) { -+ put_lseg(req->wb_lseg); -+ req->wb_lseg = NULL; -+ } -+} -+ -+/* -+ * Call the appropriate parallel I/O 
subsystem write function. -+ * If no I/O device driver exists, or one does match the returned -+ * fstype, then return a positive status for regular NFS processing. -+ * -+ * TODO: Is wdata->how and wdata->args.stable always the same value? -+ * TODO: It seems in NFS, the server may not do a stable write even -+ * though it was requested (and vice-versa?). To check, it looks -+ * in data->res.verf->committed. Do we need this ability -+ * for non-file layout drivers? -+ */ -+enum pnfs_try_status -+pnfs_try_to_write_data(struct nfs_write_data *wdata, -+ const struct rpc_call_ops *call_ops, int how) -+{ -+ struct inode *inode = wdata->inode; -+ enum pnfs_try_status trypnfs; -+ struct nfs_server *nfss = NFS_SERVER(inode); -+ struct pnfs_layout_segment *lseg = wdata->req->wb_lseg; -+ -+ wdata->pdata.call_ops = call_ops; -+ wdata->pdata.pnfs_error = 0; -+ wdata->pdata.how = how; -+ -+ dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, -+ inode->i_ino, wdata->args.count, wdata->args.offset, how); -+ -+ get_lseg(lseg); -+ -+ if (!pnfs_use_rpc(nfss)) -+ wdata->pdata.pnfsflags |= PNFS_NO_RPC; -+ wdata->pdata.lseg = lseg; -+ trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, -+ nfs_page_array_len(wdata->args.pgbase, wdata->args.count), -+ how); -+ -+ if (trypnfs == PNFS_NOT_ATTEMPTED) { -+ wdata->pdata.pnfsflags &= ~PNFS_NO_RPC; -+ wdata->pdata.lseg = NULL; -+ put_lseg(lseg); -+ _pnfs_clear_lseg_from_pages(&wdata->pages); -+ } else { -+ nfs_inc_stats(inode, NFSIOS_PNFS_WRITE); -+ } -+ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); -+ return trypnfs; -+} -+ -+/* Post-read completion function. 
Invoked by all layout drivers when -+ * read_pagelist is done -+ */ -+static void -+pnfs_read_retry(struct work_struct *work) -+{ -+ struct rpc_task *task; -+ struct nfs_read_data *rdata; -+ struct pnfs_layout_range range; -+ -+ dprintk("%s enter\n", __func__); -+ task = container_of(work, struct rpc_task, u.tk_work); -+ rdata = container_of(task, struct nfs_read_data, task); -+ range.iomode = IOMODE_RW; -+ range.offset = rdata->args.offset; -+ range.length = rdata->args.count; -+ _pnfs_return_layout(rdata->inode, &range, NULL, RETURN_FILE, true); -+ pnfs_initiate_read(rdata, NFS_CLIENT(rdata->inode), -+ rdata->pdata.call_ops); -+} -+ -+void -+pnfs_read_done(struct nfs_read_data *data) -+{ -+ struct pnfs_call_data *pdata = &data->pdata; -+ -+ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status); -+ -+ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) { -+ INIT_WORK(&data->task.u.tk_work, pnfs_read_retry); -+ queue_work(nfsiod_workqueue, &data->task.u.tk_work); -+ } -+} -+EXPORT_SYMBOL_GPL(pnfs_read_done); -+ -+/* -+ * Call the appropriate parallel I/O subsystem read function. -+ * If no I/O device driver exists, or one does match the returned -+ * fstype, then return a positive status for regular NFS processing. 
-+ */ -+enum pnfs_try_status -+pnfs_try_to_read_data(struct nfs_read_data *rdata, -+ const struct rpc_call_ops *call_ops) -+{ -+ struct inode *inode = rdata->inode; -+ struct nfs_server *nfss = NFS_SERVER(inode); -+ struct pnfs_layout_segment *lseg = rdata->req->wb_lseg; -+ enum pnfs_try_status trypnfs; -+ -+ rdata->pdata.call_ops = call_ops; -+ rdata->pdata.pnfs_error = 0; -+ -+ dprintk("%s: Reading ino:%lu %u@%llu\n", -+ __func__, inode->i_ino, rdata->args.count, rdata->args.offset); -+ -+ get_lseg(lseg); -+ -+ if (!pnfs_use_rpc(nfss)) -+ rdata->pdata.pnfsflags |= PNFS_NO_RPC; -+ rdata->pdata.lseg = lseg; -+ trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata, -+ nfs_page_array_len(rdata->args.pgbase, rdata->args.count)); -+ if (trypnfs == PNFS_NOT_ATTEMPTED) { -+ rdata->pdata.pnfsflags &= ~PNFS_NO_RPC; -+ rdata->pdata.lseg = NULL; -+ put_lseg(lseg); -+ _pnfs_clear_lseg_from_pages(&rdata->pages); -+ } else { -+ nfs_inc_stats(inode, NFSIOS_PNFS_READ); -+ } -+ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); -+ return trypnfs; -+} -+ -+/* -+ * This gives the layout driver an opportunity to read in page "around" -+ * the data to be written. It returns 0 on success, otherwise an error code -+ * which will either be passed up to user, or ignored if -+ * some previous part of write succeeded. -+ * Note the range [pos, pos+len-1] is entirely within the page. 
-+ */ -+int _pnfs_write_begin(struct inode *inode, struct page *page, -+ loff_t pos, unsigned len, -+ struct pnfs_layout_segment *lseg, -+ struct pnfs_fsdata **fsdata) -+{ -+ struct pnfs_fsdata *data; -+ int status = 0; -+ -+ dprintk("--> %s: pos=%llu len=%u\n", -+ __func__, (unsigned long long)pos, len); -+ data = kzalloc(sizeof(struct pnfs_fsdata), GFP_KERNEL); -+ if (!data) { -+ status = -ENOMEM; -+ goto out; -+ } -+ data->lseg = lseg; /* refcount passed into data to be managed there */ -+ status = NFS_SERVER(inode)->pnfs_curr_ld->write_begin( -+ lseg, page, pos, len, data); -+ if (status) { -+ kfree(data); -+ data = NULL; -+ } -+out: -+ *fsdata = data; -+ dprintk("<-- %s: status=%d\n", __func__, status); -+ return status; -+} -+ -+/* pNFS Commit callback function for all layout drivers */ -+void -+pnfs_commit_done(struct nfs_write_data *data) -+{ -+ struct pnfs_call_data *pdata = &data->pdata; -+ -+ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status); -+ -+ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) { -+ struct pnfs_layout_range range = { -+ .iomode = IOMODE_RW, -+ .offset = data->args.offset, -+ .length = data->args.count, -+ }; -+ dprintk("%s: retrying\n", __func__); -+ _pnfs_return_layout(data->inode, &range, NULL, RETURN_FILE, -+ true); -+ pnfs_initiate_commit(data, NFS_CLIENT(data->inode), -+ pdata->call_ops, pdata->how, 1); -+ } -+} -+EXPORT_SYMBOL_GPL(pnfs_commit_done); -+ -+enum pnfs_try_status -+pnfs_try_to_commit(struct nfs_write_data *data, -+ const struct rpc_call_ops *call_ops, int sync) -+{ -+ struct inode *inode = data->inode; -+ struct nfs_server *nfss = NFS_SERVER(data->inode); -+ enum pnfs_try_status trypnfs; -+ -+ dprintk("%s: Begin\n", __func__); -+ -+ if (!pnfs_use_rpc(nfss)) -+ data->pdata.pnfsflags |= PNFS_NO_RPC; -+ /* We need to account for possibility that -+ * each nfs_page can point to a different lseg (or be NULL). 
-+ * For the immediate case of whole-file-only layouts, we at -+ * least know there can be only a single lseg. -+ * We still have to account for the possibility of some being NULL. -+ * This will be done by passing the buck to the layout driver. -+ */ -+ data->pdata.call_ops = call_ops; -+ data->pdata.pnfs_error = 0; -+ data->pdata.how = sync; -+ data->pdata.lseg = NULL; -+ trypnfs = nfss->pnfs_curr_ld->commit(data, sync); -+ if (trypnfs == PNFS_NOT_ATTEMPTED) { -+ data->pdata.pnfsflags &= ~PNFS_NO_RPC; -+ _pnfs_clear_lseg_from_pages(&data->pages); -+ } else -+ nfs_inc_stats(inode, NFSIOS_PNFS_COMMIT); -+ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); -+ return trypnfs; -+} -+ -+void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) -+{ -+ struct nfs_server *nfss = NFS_SERVER(data->args.inode); -+ -+ /* TODO: Maybe we should avoid this by allowing the layout driver -+ * to directly xdr its layout on the wire. -+ */ -+ if (nfss->pnfs_curr_ld->cleanup_layoutcommit) -+ nfss->pnfs_curr_ld->cleanup_layoutcommit( -+ NFS_I(data->args.inode)->layout, -+ &data->args, data->status); -+} -+ -+/* -+ * Set up the argument/result storage required for the RPC call. 
-+ */ -+static int -+pnfs_layoutcommit_setup(struct inode *inode, -+ struct nfs4_layoutcommit_data *data, -+ loff_t write_begin_pos, loff_t write_end_pos) -+{ -+ struct nfs_server *nfss = NFS_SERVER(inode); -+ int result = 0; -+ -+ dprintk("--> %s\n", __func__); -+ -+ data->args.inode = inode; -+ data->args.fh = NFS_FH(inode); -+ data->args.layout_type = nfss->pnfs_curr_ld->id; -+ data->res.fattr = &data->fattr; -+ nfs_fattr_init(&data->fattr); -+ -+ /* TODO: Need to determine the correct values */ -+ data->args.time_modify_changed = 0; -+ -+ /* Set values from inode so it can be reset -+ */ -+ data->args.range.iomode = IOMODE_RW; -+ data->args.range.offset = write_begin_pos; -+ data->args.range.length = write_end_pos - write_begin_pos + 1; -+ data->args.lastbytewritten = min(write_end_pos, -+ i_size_read(inode) - 1); -+ data->args.bitmask = nfss->attr_bitmask; -+ data->res.server = nfss; -+ -+ /* Call layout driver to set the arguments */ -+ if (nfss->pnfs_curr_ld->setup_layoutcommit) -+ result = nfss->pnfs_curr_ld->setup_layoutcommit( -+ NFS_I(inode)->layout, &data->args); -+ -+ dprintk("<-- %s Status %d\n", __func__, result); -+ return result; -+} -+ -+/* Issue a async layoutcommit for an inode. 
-+ */ -+int -+pnfs_layoutcommit_inode(struct inode *inode, int sync) -+{ -+ struct nfs4_layoutcommit_data *data; -+ struct nfs_inode *nfsi = NFS_I(inode); -+ loff_t write_begin_pos; -+ loff_t write_end_pos; -+ -+ int status = 0; -+ -+ dprintk("%s Begin (sync:%d)\n", __func__, sync); -+ -+ BUG_ON(!has_layout(nfsi)); -+ -+ data = kzalloc(sizeof(*data), GFP_NOFS); -+ if (!data) -+ return -ENOMEM; -+ -+ spin_lock(&inode->i_lock); -+ if (!layoutcommit_needed(nfsi)) { -+ spin_unlock(&inode->i_lock); -+ goto out_free; -+ } -+ -+ /* Clear layoutcommit properties in the inode so -+ * new lc info can be generated -+ */ -+ write_begin_pos = nfsi->layout->write_begin_pos; -+ write_end_pos = nfsi->layout->write_end_pos; -+ data->cred = nfsi->layout->cred; -+ nfsi->layout->write_begin_pos = 0; -+ nfsi->layout->write_end_pos = 0; -+ nfsi->layout->cred = NULL; -+ __clear_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->state); -+ pnfs_get_layout_stateid(&data->args.stateid, nfsi->layout, NULL); -+ -+ /* Reference for layoutcommit matched in pnfs_layoutcommit_release */ -+ get_layout_hdr_locked(NFS_I(inode)->layout); -+ -+ spin_unlock(&inode->i_lock); -+ -+ /* Set up layout commit args */ -+ status = pnfs_layoutcommit_setup(inode, data, write_begin_pos, -+ write_end_pos); -+ if (status) { -+ /* The layout driver failed to setup the layoutcommit */ -+ put_rpccred(data->cred); -+ put_layout_hdr(inode); -+ goto out_free; -+ } -+ status = nfs4_proc_layoutcommit(data, sync); -+out: -+ dprintk("%s end (err:%d)\n", __func__, status); -+ return status; -+out_free: -+ kfree(data); -+ goto out; -+} -+ -+void pnfs_free_fsdata(struct pnfs_fsdata *fsdata) -+{ -+ /* lseg refcounting handled directly in nfs_write_end */ -+ kfree(fsdata); -+} -+ -+/* -+ * Device ID cache. Currently supports one layout type per struct nfs_client. -+ * Add layout type to the lookup key to expand to support multiple types. 
-+ */ -+int -+pnfs_alloc_init_deviceid_cache(struct nfs_client *clp, -+ void (*free_callback)(struct pnfs_deviceid_node *)) -+{ -+ struct pnfs_deviceid_cache *c; -+ -+ c = kzalloc(sizeof(struct pnfs_deviceid_cache), GFP_KERNEL); -+ if (!c) -+ return -ENOMEM; -+ spin_lock(&clp->cl_lock); -+ if (clp->cl_devid_cache != NULL) { -+ atomic_inc(&clp->cl_devid_cache->dc_ref); -+ dprintk("%s [kref [%d]]\n", __func__, -+ atomic_read(&clp->cl_devid_cache->dc_ref)); -+ kfree(c); -+ } else { -+ /* kzalloc initializes hlists */ -+ spin_lock_init(&c->dc_lock); -+ atomic_set(&c->dc_ref, 1); -+ c->dc_free_callback = free_callback; -+ clp->cl_devid_cache = c; -+ dprintk("%s [new]\n", __func__); -+ } -+ spin_unlock(&clp->cl_lock); -+ return 0; -+} -+EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache); -+ -+/* Must be called with locked c->dc_lock */ -+static struct pnfs_deviceid_node * -+pnfs_unhash_deviceid(struct pnfs_deviceid_cache *c, -+ struct nfs4_deviceid *id) -+{ -+ struct pnfs_deviceid_node *d; -+ struct hlist_node *n; -+ long h = nfs4_deviceid_hash(id); -+ -+ dprintk("%s hash %ld\n", __func__, h); -+ hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node) -+ if (!memcmp(&d->de_id, id, sizeof(*id))) { -+ hlist_del_rcu(&d->de_node); -+ synchronize_rcu(); -+ return d; -+ } -+ -+ return NULL; -+} -+ -+/* -+ * Called from pnfs_layoutdriver_type->free_lseg -+ * last layout segment reference frees deviceid -+ */ -+void -+pnfs_put_deviceid(struct pnfs_deviceid_cache *c, -+ struct pnfs_deviceid_node *devid) -+{ -+ dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref)); -+ if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock)) -+ return; -+ -+ pnfs_unhash_deviceid(c, &devid->de_id); -+ spin_unlock(&c->dc_lock); -+ -+ c->dc_free_callback(devid); -+} -+EXPORT_SYMBOL_GPL(pnfs_put_deviceid); -+ -+void -+pnfs_delete_deviceid(struct pnfs_deviceid_cache *c, -+ struct nfs4_deviceid *id) -+{ -+ struct pnfs_deviceid_node *devid; -+ -+ spin_lock(&c->dc_lock); -+ devid = 
pnfs_unhash_deviceid(c, id); -+ spin_unlock(&c->dc_lock); -+ -+ dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref)); -+ if (atomic_dec_and_test(&devid->de_ref)) -+ c->dc_free_callback(devid); -+} -+EXPORT_SYMBOL_GPL(pnfs_delete_deviceid); -+ -+/* Find and reference a deviceid */ -+struct pnfs_deviceid_node * -+pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id) -+{ -+ struct pnfs_deviceid_node *d; -+ struct hlist_node *n; -+ long hash = nfs4_deviceid_hash(id); -+ -+ dprintk("--> %s hash %ld\n", __func__, hash); -+ rcu_read_lock(); -+ hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) { -+ if (!memcmp(&d->de_id, id, sizeof(*id))) { -+ if (!atomic_inc_not_zero(&d->de_ref)) { -+ goto fail; -+ } else { -+ rcu_read_unlock(); -+ return d; -+ } -+ } -+ } -+fail: -+ rcu_read_unlock(); -+ return NULL; -+} -+EXPORT_SYMBOL_GPL(pnfs_find_get_deviceid); -+ -+/* -+ * Add a deviceid to the cache. -+ * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new -+ */ -+struct pnfs_deviceid_node * -+pnfs_add_deviceid(struct pnfs_deviceid_cache *c, struct pnfs_deviceid_node *new) -+{ -+ struct pnfs_deviceid_node *d; -+ long hash = nfs4_deviceid_hash(&new->de_id); -+ -+ dprintk("--> %s hash %ld\n", __func__, hash); -+ spin_lock(&c->dc_lock); -+ d = pnfs_find_get_deviceid(c, &new->de_id); -+ if (d) { -+ spin_unlock(&c->dc_lock); -+ dprintk("%s [discard]\n", __func__); -+ c->dc_free_callback(new); -+ return d; -+ } -+ INIT_HLIST_NODE(&new->de_node); -+ atomic_set(&new->de_ref, 1); -+ hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]); -+ spin_unlock(&c->dc_lock); -+ dprintk("%s [new]\n", __func__); -+ return new; -+} -+EXPORT_SYMBOL_GPL(pnfs_add_deviceid); -+ -+void -+pnfs_put_deviceid_cache(struct nfs_client *clp) -+{ -+ struct pnfs_deviceid_cache *local = clp->cl_devid_cache; -+ -+ dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache); -+ if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) { 
-+ int i; -+ /* Verify cache is empty */ -+ for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++) -+ BUG_ON(!hlist_empty(&local->dc_deviceids[i])); -+ clp->cl_devid_cache = NULL; -+ spin_unlock(&clp->cl_lock); -+ kfree(local); -+ } -+} -+EXPORT_SYMBOL_GPL(pnfs_put_deviceid_cache); -diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h -new file mode 100644 -index 0000000..0e90b0e ---- /dev/null -+++ b/fs/nfs/pnfs.h -@@ -0,0 +1,595 @@ -+/* -+ * pNFS client data structures. -+ * -+ * Copyright (c) 2002 -+ * The Regents of the University of Michigan -+ * All Rights Reserved -+ * -+ * Dean Hildebrand -+ * -+ * Permission is granted to use, copy, create derivative works, and -+ * redistribute this software and such derivative works for any purpose, -+ * so long as the name of the University of Michigan is not used in -+ * any advertising or publicity pertaining to the use or distribution -+ * of this software without specific, written prior authorization. If -+ * the above copyright notice or any other identification of the -+ * University of Michigan is included in any copy of any portion of -+ * this software, then the disclaimer below must also be included. -+ * -+ * This software is provided as is, without representation or warranty -+ * of any kind either express or implied, including without limitation -+ * the implied warranties of merchantability, fitness for a particular -+ * purpose, or noninfringement. The Regents of the University of -+ * Michigan shall not be liable for any damages, including special, -+ * indirect, incidental, or consequential damages, with respect to any -+ * claim arising out of or in connection with the use of the software, -+ * even if it has been or is hereafter advised of the possibility of -+ * such damages. 
-+ */ -+ -+#ifndef FS_NFS_PNFS_H -+#define FS_NFS_PNFS_H -+ -+#include -+ -+struct pnfs_layout_segment { -+ struct list_head fi_list; -+ struct pnfs_layout_range range; -+ struct kref kref; -+ bool valid; -+ struct pnfs_layout_hdr *layout; -+}; -+ -+enum pnfs_try_status { -+ PNFS_ATTEMPTED = 0, -+ PNFS_NOT_ATTEMPTED = 1, -+}; -+ -+struct pnfs_fsdata { -+ struct pnfs_layout_segment *lseg; -+ int bypass_eof; -+ void *private; -+}; -+ -+#ifdef CONFIG_NFS_V4_1 -+ -+#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4" -+ -+enum { -+ NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */ -+ NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ -+ NFS_LAYOUT_STATEID_SET, /* have a valid layout stateid */ -+ NFS_LAYOUT_NEED_LCOMMIT, /* LAYOUTCOMMIT needed */ -+}; -+ -+enum layoutdriver_policy_flags { -+ /* Should the full nfs rpc cleanup code be used after io */ -+ PNFS_USE_RPC_CODE = 1 << 0, -+ -+ /* Should the pNFS client commit and return the layout upon a setattr */ -+ PNFS_LAYOUTRET_ON_SETATTR = 1 << 1, -+}; -+ -+/* Per-layout driver specific registration structure */ -+struct pnfs_layoutdriver_type { -+ struct list_head pnfs_tblid; -+ const u32 id; -+ const char *name; -+ struct module *owner; -+ unsigned flags; -+ int (*initialize_mountpoint) (struct nfs_server *, const struct nfs_fh *); -+ int (*uninitialize_mountpoint) (struct nfs_server *); -+ -+ struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode); -+ void (*free_layout_hdr) (struct pnfs_layout_hdr *); -+ -+ struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr); -+ void (*free_lseg) (struct pnfs_layout_segment *lseg); -+ -+ /* test for nfs page cache coalescing */ -+ int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); -+ -+ /* Retreive the block size of the file system. -+ * If gather_across_stripes == 1, then the file system will gather -+ * requests into the block size. 
-+ * TODO: Where will the layout driver get this info? It is hard -+ * coded in PVFS2. -+ */ -+ ssize_t (*get_blocksize) (void); -+ -+/* read and write pagelist should return just 0 (to indicate that -+ * the layout code has taken control) or 1 (to indicate that the -+ * layout code wishes to fall back to normal nfs.) If 0 is returned, -+ * information can be passed back through nfs_data->res and -+ * nfs_data->task.tk_status, and the appropriate pnfs done function -+ * MUST be called. -+ */ -+ enum pnfs_try_status -+ (*read_pagelist) (struct nfs_read_data *nfs_data, unsigned nr_pages); -+ enum pnfs_try_status -+ (*write_pagelist) (struct nfs_write_data *nfs_data, unsigned nr_pages, int how); -+ int (*write_begin) (struct pnfs_layout_segment *lseg, struct page *page, -+ loff_t pos, unsigned count, -+ struct pnfs_fsdata *fsdata); -+ int (*write_end)(struct inode *inode, struct page *page, loff_t pos, -+ unsigned count, unsigned copied, -+ struct pnfs_layout_segment *lseg); -+ void (*write_end_cleanup)(struct file *filp, -+ struct pnfs_fsdata *fsdata); -+ -+ /* Consistency ops */ -+ /* 2 problems: -+ * 1) the page list contains nfs_pages, NOT pages -+ * 2) currently the NFS code doesn't create a page array (as it does with read/write) -+ */ -+ enum pnfs_try_status -+ (*commit) (struct nfs_write_data *nfs_data, int how); -+ -+ int (*setup_layoutcommit) (struct pnfs_layout_hdr *layoutid, -+ struct nfs4_layoutcommit_args *args); -+ -+ void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid, -+ struct xdr_stream *xdr, -+ const struct nfs4_layoutcommit_args *args); -+ -+ void (*cleanup_layoutcommit) (struct pnfs_layout_hdr *layoutid, -+ struct nfs4_layoutcommit_args *args, -+ int status); -+ -+ void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid, -+ struct xdr_stream *xdr, -+ const struct nfs4_layoutreturn_args *args); -+}; -+ -+struct pnfs_layout_hdr { -+ unsigned long refcount; -+ struct list_head layouts; /* other client layouts */ -+ struct list_head 
segs; /* layout segments list */ -+ int roc_iomode;/* return on close iomode, 0=none */ -+ seqlock_t seqlock; /* Protects the stateid */ -+ nfs4_stateid stateid; -+ unsigned long state; -+ struct rpc_cred *cred; /* layoutcommit credential */ -+ /* DH: These vars keep track of the maximum write range -+ * so the values can be used for layoutcommit. -+ */ -+ loff_t write_begin_pos; -+ loff_t write_end_pos; -+ struct inode *inode; -+}; -+ -+struct pnfs_device { -+ struct nfs4_deviceid dev_id; -+ unsigned int layout_type; -+ unsigned int mincount; -+ struct page **pages; -+ void *area; -+ unsigned int pgbase; -+ unsigned int pglen; -+}; -+ -+#define NFS4_PNFS_GETDEVLIST_MAXNUM 16 -+ -+struct pnfs_devicelist { -+ unsigned int eof; -+ unsigned int num_devs; -+ struct nfs4_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM]; -+}; -+ -+/* -+ * Device ID RCU cache. A device ID is unique per client ID and layout type. -+ */ -+#define NFS4_DEVICE_ID_HASH_BITS 5 -+#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS) -+#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1) -+ -+static inline u32 -+nfs4_deviceid_hash(struct nfs4_deviceid *id) -+{ -+ unsigned char *cptr = (unsigned char *)id->data; -+ unsigned int nbytes = NFS4_DEVICEID4_SIZE; -+ u32 x = 0; -+ -+ while (nbytes--) { -+ x *= 37; -+ x += *cptr++; -+ } -+ return x & NFS4_DEVICE_ID_HASH_MASK; -+} -+ -+struct pnfs_deviceid_node { -+ struct hlist_node de_node; -+ struct nfs4_deviceid de_id; -+ atomic_t de_ref; -+}; -+ -+struct pnfs_deviceid_cache { -+ spinlock_t dc_lock; -+ atomic_t dc_ref; -+ void (*dc_free_callback)(struct pnfs_deviceid_node *); -+ struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE]; -+}; -+ -+extern int pnfs_alloc_init_deviceid_cache(struct nfs_client *, -+ void (*free_callback)(struct pnfs_deviceid_node *)); -+extern void pnfs_put_deviceid_cache(struct nfs_client *); -+extern struct pnfs_deviceid_node *pnfs_find_get_deviceid( -+ struct pnfs_deviceid_cache *, -+ struct 
nfs4_deviceid *); -+extern struct pnfs_deviceid_node *pnfs_add_deviceid( -+ struct pnfs_deviceid_cache *, -+ struct pnfs_deviceid_node *); -+extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c, -+ struct pnfs_deviceid_node *devid); -+extern void pnfs_delete_deviceid(struct pnfs_deviceid_cache *, -+ struct nfs4_deviceid *); -+ -+extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); -+extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); -+ -+/* nfs4proc.c */ -+extern int nfs4_proc_getdevicelist(struct nfs_server *server, -+ const struct nfs_fh *fh, -+ struct pnfs_devicelist *devlist); -+extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, -+ struct pnfs_device *dev); -+extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); -+extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, -+ int issync); -+extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool wait); -+ -+/* pnfs.c */ -+void get_lseg(struct pnfs_layout_segment *lseg); -+void put_lseg(struct pnfs_layout_segment *lseg); -+struct pnfs_layout_segment * -+pnfs_has_layout(struct pnfs_layout_hdr *lo, struct pnfs_layout_range *range); -+struct pnfs_layout_segment * -+pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, -+ loff_t pos, u64 count, enum pnfs_iomode access_type); -+bool pnfs_return_layout_barrier(struct nfs_inode *, struct pnfs_layout_range *); -+int _pnfs_return_layout(struct inode *, struct pnfs_layout_range *, -+ const nfs4_stateid *stateid, /* optional */ -+ enum pnfs_layoutreturn_type, bool wait); -+void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *mntfh, u32 id); -+void unset_pnfs_layoutdriver(struct nfs_server *); -+enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *, -+ const struct rpc_call_ops *, int); -+enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *, -+ const struct rpc_call_ops *); -+void pnfs_cleanup_layoutcommit(struct 
nfs4_layoutcommit_data *data); -+int pnfs_layoutcommit_inode(struct inode *inode, int sync); -+void pnfs_update_last_write(struct nfs_inode *nfsi, loff_t offset, size_t extent); -+void pnfs_need_layoutcommit(struct nfs_inode *nfsi, struct nfs_open_context *ctx); -+void pnfs_set_ds_iosize(struct nfs_server *server); -+enum pnfs_try_status pnfs_try_to_commit(struct nfs_write_data *, -+ const struct rpc_call_ops *, int); -+void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *, -+ struct nfs_open_context *, struct list_head *, -+ size_t *); -+void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, -+ size_t *); -+void pnfs_free_fsdata(struct pnfs_fsdata *fsdata); -+int pnfs_layout_process(struct nfs4_layoutget *lgp); -+void pnfs_layoutreturn_release(struct nfs4_layoutreturn *lpr); -+void pnfs_destroy_layout(struct nfs_inode *); -+void pnfs_destroy_all_layouts(struct nfs_client *); -+void put_layout_hdr(struct inode *inode); -+void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, -+ struct nfs4_state *open_state); -+void pnfs_read_done(struct nfs_read_data *); -+void pnfs_writeback_done(struct nfs_write_data *); -+void pnfs_commit_done(struct nfs_write_data *); -+int _pnfs_write_begin(struct inode *inode, struct page *page, -+ loff_t pos, unsigned len, -+ struct pnfs_layout_segment *lseg, -+ struct pnfs_fsdata **fsdata); -+ -+static inline bool -+has_layout(struct nfs_inode *nfsi) -+{ -+ return nfsi->layout != NULL; -+} -+ -+static inline int lo_fail_bit(u32 iomode) -+{ -+ return iomode == IOMODE_RW ? 
-+ NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED; -+} -+ -+static inline void pnfs_invalidate_layout_stateid(struct pnfs_layout_hdr *lo) -+{ -+ write_seqlock(&lo->seqlock); -+ clear_bit(NFS_LAYOUT_STATEID_SET, &lo->state); -+ write_sequnlock(&lo->seqlock); -+} -+ -+/* Return true if a layout driver is being used for this mountpoint */ -+static inline int pnfs_enabled_sb(struct nfs_server *nfss) -+{ -+ return nfss->pnfs_curr_ld != NULL; -+} -+ -+static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg, -+ struct pnfs_fsdata *fsdata) -+{ -+ return !fsdata || ((struct pnfs_layout_segment *)fsdata == lseg) || -+ !fsdata->bypass_eof; -+} -+ -+/* Should the pNFS client commit and return the layout upon a setattr */ -+static inline bool -+pnfs_ld_layoutret_on_setattr(struct inode *inode) -+{ -+ if (!pnfs_enabled_sb(NFS_SERVER(inode))) -+ return false; -+ return NFS_SERVER(inode)->pnfs_curr_ld->flags & -+ PNFS_LAYOUTRET_ON_SETATTR; -+} -+ -+static inline bool pnfs_use_rpc(struct nfs_server *nfss) -+{ -+ if (pnfs_enabled_sb(nfss)) -+ return nfss->pnfs_curr_ld->flags & PNFS_USE_RPC_CODE; -+ -+ return true; -+} -+ -+/* Should the pNFS client commit and return the layout on close -+ */ -+static inline int -+pnfs_layout_roc_iomode(struct nfs_inode *nfsi) -+{ -+ return nfsi->layout->roc_iomode; -+} -+ -+static inline int pnfs_write_begin(struct file *filp, struct page *page, -+ loff_t pos, unsigned len, -+ struct pnfs_layout_segment *lseg, -+ void **fsdata) -+{ -+ struct inode *inode = filp->f_dentry->d_inode; -+ struct nfs_server *nfss = NFS_SERVER(inode); -+ int status = 0; -+ -+ *fsdata = lseg; -+ if (lseg && nfss->pnfs_curr_ld->write_begin) -+ status = _pnfs_write_begin(inode, page, pos, len, lseg, -+ (struct pnfs_fsdata **) fsdata); -+ return status; -+} -+ -+/* CAREFUL - what happens if copied < len??? 
*/ -+static inline int pnfs_write_end(struct file *filp, struct page *page, -+ loff_t pos, unsigned len, unsigned copied, -+ struct pnfs_layout_segment *lseg) -+{ -+ struct inode *inode = filp->f_dentry->d_inode; -+ struct nfs_server *nfss = NFS_SERVER(inode); -+ -+ if (nfss->pnfs_curr_ld && nfss->pnfs_curr_ld->write_end) -+ return nfss->pnfs_curr_ld->write_end(inode, page, pos, len, -+ copied, lseg); -+ else -+ return 0; -+} -+ -+static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata) -+{ -+ struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode); -+ -+ if (fsdata && nfss->pnfs_curr_ld) { -+ if (nfss->pnfs_curr_ld->write_end_cleanup) -+ nfss->pnfs_curr_ld->write_end_cleanup(filp, fsdata); -+ if (nfss->pnfs_curr_ld->write_begin) -+ pnfs_free_fsdata(fsdata); -+ } -+} -+ -+static inline int pnfs_return_layout(struct inode *ino, -+ struct pnfs_layout_range *range, -+ const nfs4_stateid *stateid, /* optional */ -+ enum pnfs_layoutreturn_type type, -+ bool wait) -+{ -+ struct nfs_inode *nfsi = NFS_I(ino); -+ struct nfs_server *nfss = NFS_SERVER(ino); -+ -+ if (pnfs_enabled_sb(nfss) && -+ (type != RETURN_FILE || has_layout(nfsi))) -+ return _pnfs_return_layout(ino, range, stateid, type, wait); -+ -+ return 0; -+} -+ -+static inline bool -+layoutcommit_needed(struct nfs_inode *nfsi) -+{ -+ return has_layout(nfsi) && -+ test_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->state); -+} -+ -+static inline int pnfs_get_write_status(struct nfs_write_data *data) -+{ -+ return data->pdata.pnfs_error; -+} -+ -+static inline int pnfs_get_read_status(struct nfs_read_data *data) -+{ -+ return data->pdata.pnfs_error; -+} -+ -+static inline struct pnfs_layout_segment * -+nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata) -+{ -+ if (fsdata) { -+ struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode); -+ -+ if (nfss->pnfs_curr_ld && nfss->pnfs_curr_ld->write_begin) -+ return ((struct pnfs_fsdata *) fsdata)->lseg; -+ return (struct pnfs_layout_segment 
*)fsdata; -+ } -+ return NULL; -+} -+ -+#else /* CONFIG_NFS_V4_1 */ -+ -+static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) -+{ -+} -+ -+static inline void pnfs_destroy_layout(struct nfs_inode *nfsi) -+{ -+} -+ -+static inline void get_lseg(struct pnfs_layout_segment *lseg) -+{ -+} -+ -+static inline void put_lseg(struct pnfs_layout_segment *lseg) -+{ -+} -+ -+static inline struct pnfs_layout_segment * -+pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, -+ loff_t pos, u64 count, enum pnfs_iomode access_type) -+{ -+ return NULL; -+} -+ -+static inline bool -+has_layout(struct nfs_inode *nfsi) -+{ -+ return false; -+} -+ -+static inline bool -+layoutcommit_needed(struct nfs_inode *nfsi) -+{ -+ return 0; -+} -+ -+static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg, -+ struct pnfs_fsdata *fsdata) -+{ -+ return 1; -+} -+ -+static inline enum pnfs_try_status -+pnfs_try_to_read_data(struct nfs_read_data *data, -+ const struct rpc_call_ops *call_ops) -+{ -+ return PNFS_NOT_ATTEMPTED; -+} -+ -+static inline enum pnfs_try_status -+pnfs_try_to_write_data(struct nfs_write_data *data, -+ const struct rpc_call_ops *call_ops, int how) -+{ -+ return PNFS_NOT_ATTEMPTED; -+} -+ -+static inline enum pnfs_try_status -+pnfs_try_to_commit(struct nfs_write_data *data, -+ const struct rpc_call_ops *call_ops, int how) -+{ -+ return PNFS_NOT_ATTEMPTED; -+} -+ -+static inline int pnfs_layoutcommit_inode(struct inode *inode, int sync) -+{ -+ return 0; -+} -+ -+static inline bool -+pnfs_ld_layoutret_on_setattr(struct inode *inode) -+{ -+ return false; -+} -+ -+static inline bool pnfs_use_rpc(struct nfs_server *nfss) -+{ -+ return true; -+} -+ -+static inline int -+pnfs_layout_roc_iomode(struct nfs_inode *nfsi) -+{ -+ return 0; -+} -+ -+static inline int pnfs_return_layout(struct inode *ino, -+ struct pnfs_layout_range *range, -+ const nfs4_stateid *stateid, /* optional */ -+ enum pnfs_layoutreturn_type type, -+ bool wait) -+{ -+ return 0; -+} -+ 
-+static inline void set_pnfs_layoutdriver(struct nfs_server *s, const struct nfs_fh *mntfh, u32 id) -+{ -+} -+ -+static inline void unset_pnfs_layoutdriver(struct nfs_server *s) -+{ -+} -+ -+static inline void pnfs_set_ds_iosize(struct nfs_server *server) -+{ -+ server->ds_wsize = server->ds_rsize = -1; -+} -+ -+static inline int pnfs_write_begin(struct file *filp, struct page *page, -+ loff_t pos, unsigned len, -+ struct pnfs_layout_segment *lseg, -+ void **fsdata) -+{ -+ *fsdata = NULL; -+ return 0; -+} -+ -+static inline int pnfs_write_end(struct file *filp, struct page *page, -+ loff_t pos, unsigned len, unsigned copied, -+ struct pnfs_layout_segment *lseg) -+{ -+ return 0; -+} -+ -+static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata) -+{ -+} -+ -+static inline int pnfs_get_write_status(struct nfs_write_data *data) -+{ -+ return 0; -+} -+ -+static inline int pnfs_get_read_status(struct nfs_read_data *data) -+{ -+ return 0; -+} -+ -+static inline void -+pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino, -+ struct nfs_open_context *ctx, struct list_head *pages, -+ size_t *rsize) -+{ -+ pgio->pg_lseg = NULL; -+} -+ -+static inline void -+pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino, -+ size_t *wsize) -+{ -+ pgio->pg_lseg = NULL; -+} -+ -+static inline struct pnfs_layout_segment * -+nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata) -+{ -+ return NULL; -+} -+ -+#endif /* CONFIG_NFS_V4_1 */ -+ -+#endif /* FS_NFS_PNFS_H */ -diff --git a/fs/nfs/read.c b/fs/nfs/read.c -index 87adc27..1df536a 100644 ---- a/fs/nfs/read.c -+++ b/fs/nfs/read.c -@@ -18,8 +18,12 @@ - #include - #include - #include -+#include -+#include - - #include -+#include -+#include "pnfs.h" - - #include "nfs4_fs.h" - #include "internal.h" -@@ -117,11 +121,16 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, - LIST_HEAD(one_request); - struct nfs_page *new; - unsigned int len; -+ loff_t 
pgoffs; -+ struct pnfs_layout_segment *lseg; - - len = nfs_page_length(page); - if (len == 0) - return nfs_return_empty_page(page); -- new = nfs_create_request(ctx, inode, page, 0, len); -+ pgoffs = (loff_t)page->index << PAGE_CACHE_SHIFT; -+ lseg = pnfs_update_layout(inode, ctx, pgoffs, len, IOMODE_READ); -+ new = nfs_create_request(ctx, inode, page, 0, len, lseg); -+ put_lseg(lseg); - if (IS_ERR(new)) { - unlock_page(page); - return PTR_ERR(new); -@@ -155,24 +164,20 @@ static void nfs_readpage_release(struct nfs_page *req) - nfs_release_request(req); - } - --/* -- * Set up the NFS read request struct -- */ --static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, -- const struct rpc_call_ops *call_ops, -- unsigned int count, unsigned int offset) -+int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, -+ const struct rpc_call_ops *call_ops) - { -- struct inode *inode = req->wb_context->path.dentry->d_inode; -+ struct inode *inode = data->inode; - int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; - struct rpc_task *task; - struct rpc_message msg = { - .rpc_argp = &data->args, - .rpc_resp = &data->res, -- .rpc_cred = req->wb_context->cred, -+ .rpc_cred = data->cred, - }; - struct rpc_task_setup task_setup_data = { - .task = &data->task, -- .rpc_client = NFS_CLIENT(inode), -+ .rpc_client = clnt, - .rpc_message = &msg, - .callback_ops = call_ops, - .callback_data = data, -@@ -180,9 +185,46 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, - .flags = RPC_TASK_ASYNC | swap_flags, - }; - -+ /* Set up the initial task struct. 
*/ -+ NFS_PROTO(inode)->read_setup(data, &msg); -+ -+ dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", -+ data->task.tk_pid, -+ inode->i_sb->s_id, -+ (long long)NFS_FILEID(inode), -+ data->args.count, -+ (unsigned long long)data->args.offset); -+ -+ task = rpc_run_task(&task_setup_data); -+ if (IS_ERR(task)) -+ return PTR_ERR(task); -+ rpc_put_task(task); -+ return 0; -+} -+EXPORT_SYMBOL(nfs_initiate_read); -+ -+int pnfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, -+ const struct rpc_call_ops *call_ops) -+{ -+ if (data->req->wb_lseg && -+ (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED)) -+ return pnfs_get_read_status(data); -+ -+ return nfs_initiate_read(data, clnt, call_ops); -+} -+ -+/* -+ * Set up the NFS read request struct -+ */ -+static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, -+ const struct rpc_call_ops *call_ops, -+ unsigned int count, unsigned int offset) -+{ -+ struct inode *inode = req->wb_context->path.dentry->d_inode; -+ - data->req = req; - data->inode = inode; -- data->cred = msg.rpc_cred; -+ data->cred = req->wb_context->cred; - - data->args.fh = NFS_FH(inode); - data->args.offset = req_offset(req) + offset; -@@ -197,21 +239,7 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, - data->res.eof = 0; - nfs_fattr_init(&data->fattr); - -- /* Set up the initial task struct. 
*/ -- NFS_PROTO(inode)->read_setup(data, &msg); -- -- dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", -- data->task.tk_pid, -- inode->i_sb->s_id, -- (long long)NFS_FILEID(inode), -- count, -- (unsigned long long)data->args.offset); -- -- task = rpc_run_task(&task_setup_data); -- if (IS_ERR(task)) -- return PTR_ERR(task); -- rpc_put_task(task); -- return 0; -+ return pnfs_initiate_read(data, NFS_CLIENT(inode), call_ops); - } - - static void -@@ -355,7 +383,14 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data - { - struct nfs_readargs *argp = &data->args; - struct nfs_readres *resp = &data->res; -+ struct nfs_client *clp = NFS_SERVER(data->inode)->nfs_client; - -+#ifdef CONFIG_NFS_V4_1 -+ if (data->fldata.ds_nfs_client) { -+ dprintk("%s DS read\n", __func__); -+ clp = data->fldata.ds_nfs_client; -+ } -+#endif /* CONFIG_NFS_V4_1 */ - if (resp->eof || resp->count == argp->count) - return; - -@@ -369,7 +404,10 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data - argp->offset += resp->count; - argp->pgbase += resp->count; - argp->count -= resp->count; -- nfs_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client); -+#ifdef CONFIG_NFS_V4_1 -+ data->pdata.pnfs_error = -EAGAIN; -+#endif /* CONFIG_NFS_V4_1 */ -+ nfs_restart_rpc(task, clp); - } - - /* -@@ -410,13 +448,19 @@ static void nfs_readpage_release_partial(void *calldata) - void nfs_read_prepare(struct rpc_task *task, void *calldata) - { - struct nfs_read_data *data = calldata; -+ struct nfs4_session *ds_session = NULL; - -- if (nfs4_setup_sequence(NFS_SERVER(data->inode), -+ if (data->fldata.ds_nfs_client) { -+ dprintk("%s DS read\n", __func__); -+ ds_session = data->fldata.ds_nfs_client->cl_session; -+ } -+ if (nfs4_setup_sequence(NFS_SERVER(data->inode), ds_session, - &data->args.seq_args, &data->res.seq_res, - 0, task)) - return; - rpc_call_start(task); - } -+EXPORT_SYMBOL(nfs_read_prepare); - #endif /* CONFIG_NFS_V4_1 
*/ - - static const struct rpc_call_ops nfs_read_partial_ops = { -@@ -569,7 +613,20 @@ readpage_async_filler(void *data, struct page *page) - if (len == 0) - return nfs_return_empty_page(page); - -- new = nfs_create_request(desc->ctx, inode, page, 0, len); -+ if (desc->pgio->pg_lseg) { -+ loff_t pgoff = (loff_t)page->index << PAGE_CACHE_SHIFT; -+ struct pnfs_layout_range *range = &desc->pgio->pg_lseg->range; -+ -+ /* retry later with the right lseg? */ -+ if (range->offset > pgoff + len || -+ range->offset + range->length < pgoff) { -+ new = ERR_PTR(-EAGAIN); -+ goto out_error; -+ } -+ } -+ -+ new = nfs_create_request(desc->ctx, inode, page, 0, len, -+ desc->pgio->pg_lseg); - if (IS_ERR(new)) - goto out_error; - -@@ -625,6 +682,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, - if (ret == 0) - goto read_complete; /* all pages were read */ - -+ pnfs_pageio_init_read(&pgio, inode, desc.ctx, pages, &rsize); - if (rsize < PAGE_CACHE_SIZE) - nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); - else -@@ -633,6 +691,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, - ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); - - nfs_pageio_complete(&pgio); -+ put_lseg(pgio.pg_lseg); - npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - nfs_add_stats(inode, NFSIOS_READPAGES, npages); - read_complete: -diff --git a/fs/nfs/super.c b/fs/nfs/super.c -index f4cbf0c..91606fb 100644 ---- a/fs/nfs/super.c -+++ b/fs/nfs/super.c -@@ -64,6 +64,7 @@ - #include "iostat.h" - #include "internal.h" - #include "fscache.h" -+#include "pnfs.h" - - #define NFSDBG_FACILITY NFSDBG_VFS - -@@ -687,6 +688,28 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt) - - return 0; - } -+#ifdef CONFIG_NFS_V4_1 -+void show_sessions(struct seq_file *m, struct nfs_server *server) -+{ -+ if (nfs4_has_session(server->nfs_client)) -+ seq_printf(m, ",sessions"); -+} -+#else -+void show_sessions(struct 
seq_file *m, struct nfs_server *server) {} -+#endif -+ -+#ifdef CONFIG_NFS_V4_1 -+void show_pnfs(struct seq_file *m, struct nfs_server *server) -+{ -+ seq_printf(m, ",pnfs="); -+ if (server->pnfs_curr_ld) -+ seq_printf(m, "%s", server->pnfs_curr_ld->name); -+ else -+ seq_printf(m, "not configured"); -+} -+#else /* CONFIG_NFS_V4_1 */ -+void show_pnfs(struct seq_file *m, struct nfs_server *server) {} -+#endif /* CONFIG_NFS_V4_1 */ - - /* - * Present statistical information for this VFS mountpoint -@@ -725,6 +748,8 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) - seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); - seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); - seq_printf(m, ",acl=0x%x", nfss->acl_bitmask); -+ show_sessions(m, nfss); -+ show_pnfs(m, nfss); - } - #endif - -diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c -index 2f84ada..51ae53b 100644 ---- a/fs/nfs/unlink.c -+++ b/fs/nfs/unlink.c -@@ -110,7 +110,7 @@ void nfs_unlink_prepare(struct rpc_task *task, void *calldata) - struct nfs_unlinkdata *data = calldata; - struct nfs_server *server = NFS_SERVER(data->dir); - -- if (nfs4_setup_sequence(server, &data->args.seq_args, -+ if (nfs4_setup_sequence(server, NULL, &data->args.seq_args, - &data->res.seq_res, 1, task)) - return; - rpc_call_start(task); -diff --git a/fs/nfs/write.c b/fs/nfs/write.c -index 874972d..988b65a 100644 ---- a/fs/nfs/write.c -+++ b/fs/nfs/write.c -@@ -28,6 +28,7 @@ - #include "iostat.h" - #include "nfs4_fs.h" - #include "fscache.h" -+#include "pnfs.h" - - #define NFSDBG_FACILITY NFSDBG_PAGECACHE - -@@ -59,6 +60,7 @@ struct nfs_write_data *nfs_commitdata_alloc(void) - } - return p; - } -+EXPORT_SYMBOL(nfs_commitdata_alloc); - - void nfs_commit_free(struct nfs_write_data *p) - { -@@ -429,6 +431,17 @@ static void nfs_inode_remove_request(struct nfs_page *req) - nfs_clear_request(req); - nfs_release_request(req); - } -+static void -+nfs_mark_request_nopnfs(struct nfs_page *req) -+{ -+ struct pnfs_layout_segment *lseg 
= req->wb_lseg; -+ -+ if (req->wb_lseg == NULL) -+ return; -+ req->wb_lseg = NULL; -+ put_lseg(lseg); -+ dprintk(" retry through MDS\n"); -+} - - static void - nfs_mark_request_dirty(struct nfs_page *req) -@@ -534,7 +547,7 @@ nfs_need_commit(struct nfs_inode *nfsi) - * The requests are *not* checked to ensure that they form a contiguous set. - */ - static int --nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) -+nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages, int *use_pnfs) - { - struct nfs_inode *nfsi = NFS_I(inode); - int ret; -@@ -542,7 +555,8 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, u - if (!nfs_need_commit(nfsi)) - return 0; - -- ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); -+ ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT, -+ use_pnfs); - if (ret > 0) - nfsi->ncommit -= ret; - if (nfs_need_commit(NFS_I(inode))) -@@ -571,7 +585,8 @@ static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pg - static struct nfs_page *nfs_try_to_update_request(struct inode *inode, - struct page *page, - unsigned int offset, -- unsigned int bytes) -+ unsigned int bytes, -+ struct pnfs_layout_segment *lseg) - { - struct nfs_page *req; - unsigned int rqend; -@@ -596,8 +611,8 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, - * Note: nfs_flush_incompatible() will already - * have flushed out requests having wrong owners. - */ -- if (offset > rqend -- || end < req->wb_offset) -+ if (offset > rqend || end < req->wb_offset || -+ req->wb_lseg != lseg) - goto out_flushme; - - if (nfs_set_page_tag_locked(req)) -@@ -645,16 +660,17 @@ out_err: - * already called nfs_flush_incompatible() if necessary. 
- */ - static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, -- struct page *page, unsigned int offset, unsigned int bytes) -+ struct page *page, unsigned int offset, unsigned int bytes, -+ struct pnfs_layout_segment *lseg) - { - struct inode *inode = page->mapping->host; - struct nfs_page *req; - int error; - -- req = nfs_try_to_update_request(inode, page, offset, bytes); -+ req = nfs_try_to_update_request(inode, page, offset, bytes, lseg); - if (req != NULL) - goto out; -- req = nfs_create_request(ctx, inode, page, offset, bytes); -+ req = nfs_create_request(ctx, inode, page, offset, bytes, lseg); - if (IS_ERR(req)) - goto out; - error = nfs_inode_add_request(inode, req); -@@ -667,23 +683,27 @@ out: - } - - static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, -- unsigned int offset, unsigned int count) -+ unsigned int offset, unsigned int count, -+ struct pnfs_layout_segment *lseg, -+ void *fsdata) - { - struct nfs_page *req; - -- req = nfs_setup_write_request(ctx, page, offset, count); -+ req = nfs_setup_write_request(ctx, page, offset, count, lseg); - if (IS_ERR(req)) - return PTR_ERR(req); - nfs_mark_request_dirty(req); - /* Update file length */ -- nfs_grow_file(page, offset, count); -+ if (pnfs_grow_ok(lseg, fsdata)) -+ nfs_grow_file(page, offset, count); - nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); - nfs_mark_request_dirty(req); - nfs_clear_page_tag_locked(req); - return 0; - } - --int nfs_flush_incompatible(struct file *file, struct page *page) -+int nfs_flush_incompatible(struct file *file, struct page *page, -+ struct pnfs_layout_segment *lseg) - { - struct nfs_open_context *ctx = nfs_file_open_context(file); - struct nfs_page *req; -@@ -702,7 +722,8 @@ int nfs_flush_incompatible(struct file *file, struct page *page) - return 0; - do_flush = req->wb_page != page || req->wb_context != ctx || - req->wb_lock_context->lockowner != current->files || -- req->wb_lock_context->pid != current->tgid; 
-+ req->wb_lock_context->pid != current->tgid || -+ req->wb_lseg != lseg; - nfs_release_request(req); - if (!do_flush) - return 0; -@@ -729,7 +750,8 @@ static int nfs_write_pageuptodate(struct page *page, struct inode *inode) - * things with a page scheduled for an RPC call (e.g. invalidate it). - */ - int nfs_updatepage(struct file *file, struct page *page, -- unsigned int offset, unsigned int count) -+ unsigned int offset, unsigned int count, -+ struct pnfs_layout_segment *lseg, void *fsdata) - { - struct nfs_open_context *ctx = nfs_file_open_context(file); - struct inode *inode = page->mapping->host; -@@ -754,7 +776,7 @@ int nfs_updatepage(struct file *file, struct page *page, - offset = 0; - } - -- status = nfs_writepage_setup(ctx, page, offset, count); -+ status = nfs_writepage_setup(ctx, page, offset, count, lseg, fsdata); - if (status < 0) - nfs_set_pageerror(page); - -@@ -784,25 +806,21 @@ static int flush_task_priority(int how) - return RPC_PRIORITY_NORMAL; - } - --/* -- * Set up the argument/result storage required for the RPC call. 
-- */ --static int nfs_write_rpcsetup(struct nfs_page *req, -- struct nfs_write_data *data, -- const struct rpc_call_ops *call_ops, -- unsigned int count, unsigned int offset, -- int how) -+int nfs_initiate_write(struct nfs_write_data *data, -+ struct rpc_clnt *clnt, -+ const struct rpc_call_ops *call_ops, -+ int how) - { -- struct inode *inode = req->wb_context->path.dentry->d_inode; -+ struct inode *inode = data->inode; - int priority = flush_task_priority(how); - struct rpc_task *task; - struct rpc_message msg = { - .rpc_argp = &data->args, - .rpc_resp = &data->res, -- .rpc_cred = req->wb_context->cred, -+ .rpc_cred = data->cred, - }; - struct rpc_task_setup task_setup_data = { -- .rpc_client = NFS_CLIENT(inode), -+ .rpc_client = clnt, - .task = &data->task, - .rpc_message = &msg, - .callback_ops = call_ops, -@@ -813,12 +831,62 @@ static int nfs_write_rpcsetup(struct nfs_page *req, - }; - int ret = 0; - -+ /* Set up the initial task struct. */ -+ NFS_PROTO(inode)->write_setup(data, &msg); -+ -+ dprintk("NFS: %5u initiated write call " -+ "(req %s/%lld, %u bytes @ offset %llu)\n", -+ data->task.tk_pid, -+ inode->i_sb->s_id, -+ (long long)NFS_FILEID(inode), -+ data->args.count, -+ (unsigned long long)data->args.offset); -+ -+ task = rpc_run_task(&task_setup_data); -+ if (IS_ERR(task)) { -+ ret = PTR_ERR(task); -+ goto out; -+ } -+ if (how & FLUSH_SYNC) { -+ ret = rpc_wait_for_completion_task(task); -+ if (ret == 0) -+ ret = task->tk_status; -+ } -+ rpc_put_task(task); -+out: -+ return ret; -+} -+EXPORT_SYMBOL(nfs_initiate_write); -+ -+int pnfs_initiate_write(struct nfs_write_data *data, -+ struct rpc_clnt *clnt, -+ const struct rpc_call_ops *call_ops, -+ int how) -+{ -+ if (data->req->wb_lseg && -+ (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED)) -+ return pnfs_get_write_status(data); -+ -+ return nfs_initiate_write(data, clnt, call_ops, how); -+} -+ -+/* -+ * Set up the argument/result storage required for the RPC call. 
-+ */ -+static int nfs_write_rpcsetup(struct nfs_page *req, -+ struct nfs_write_data *data, -+ const struct rpc_call_ops *call_ops, -+ unsigned int count, unsigned int offset, -+ int how) -+{ -+ struct inode *inode = req->wb_context->path.dentry->d_inode; -+ - /* Set up the RPC argument and reply structs - * NB: take care not to mess about with data->commit et al. */ - - data->req = req; - data->inode = inode = req->wb_context->path.dentry->d_inode; -- data->cred = msg.rpc_cred; -+ data->cred = req->wb_context->cred; - - data->args.fh = NFS_FH(inode); - data->args.offset = req_offset(req) + offset; -@@ -839,30 +907,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req, - data->res.verf = &data->verf; - nfs_fattr_init(&data->fattr); - -- /* Set up the initial task struct. */ -- NFS_PROTO(inode)->write_setup(data, &msg); -- -- dprintk("NFS: %5u initiated write call " -- "(req %s/%lld, %u bytes @ offset %llu)\n", -- data->task.tk_pid, -- inode->i_sb->s_id, -- (long long)NFS_FILEID(inode), -- count, -- (unsigned long long)data->args.offset); -- -- task = rpc_run_task(&task_setup_data); -- if (IS_ERR(task)) { -- ret = PTR_ERR(task); -- goto out; -- } -- if (how & FLUSH_SYNC) { -- ret = rpc_wait_for_completion_task(task); -- if (ret == 0) -- ret = task->tk_status; -- } -- rpc_put_task(task); --out: -- return ret; -+ return pnfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how); - } - - /* If a nfs_flush_* function fails, it should remove reqs from @head and -@@ -873,6 +918,7 @@ static void nfs_redirty_request(struct nfs_page *req) - { - struct page *page = req->wb_page; - -+ nfs_mark_request_nopnfs(req); - nfs_mark_request_dirty(req); - nfs_clear_page_tag_locked(req); - nfs_end_page_writeback(page); -@@ -985,6 +1031,8 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, - { - size_t wsize = NFS_SERVER(inode)->wsize; - -+ pnfs_pageio_init_write(pgio, inode, &wsize); -+ - if (wsize < PAGE_CACHE_SIZE) - nfs_pageio_init(pgio, inode, 
nfs_flush_multi, wsize, ioflags); - else -@@ -1050,13 +1098,27 @@ out: - void nfs_write_prepare(struct rpc_task *task, void *calldata) - { - struct nfs_write_data *data = calldata; -+ struct nfs4_session *ds_session = NULL; -+ -+ if (data->fldata.ds_nfs_client) { -+ dprintk("%s DS read\n", __func__); -+ ds_session = data->fldata.ds_nfs_client->cl_session; -+ } else if (data->args.count > NFS_SERVER(data->inode)->wsize) { -+ /* retrying via MDS? */ -+ data->pdata.orig_count = data->args.count; -+ data->args.count = NFS_SERVER(data->inode)->wsize; -+ dprintk("%s: trimmed count %u to wsize %u\n", __func__, -+ data->pdata.orig_count, data->args.count); -+ } else -+ data->pdata.orig_count = 0; - -- if (nfs4_setup_sequence(NFS_SERVER(data->inode), -+ if (nfs4_setup_sequence(NFS_SERVER(data->inode), ds_session, - &data->args.seq_args, - &data->res.seq_res, 1, task)) - return; - rpc_call_start(task); - } -+EXPORT_SYMBOL(nfs_write_prepare); - #endif /* CONFIG_NFS_V4_1 */ - - static const struct rpc_call_ops nfs_write_partial_ops = { -@@ -1140,10 +1202,11 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) - struct nfs_writeargs *argp = &data->args; - struct nfs_writeres *resp = &data->res; - struct nfs_server *server = NFS_SERVER(data->inode); -+ struct nfs_client *clp = server->nfs_client; - int status; - -- dprintk("NFS: %5u nfs_writeback_done (status %d)\n", -- task->tk_pid, task->tk_status); -+ dprintk("NFS: %5u nfs_writeback_done (status %d count %u)\n", -+ task->tk_pid, task->tk_status, resp->count); - - /* - * ->write_done will attempt to use post-op attributes to detect -@@ -1156,6 +1219,13 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) - if (status != 0) - return status; - nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count); -+#ifdef CONFIG_NFS_V4_1 -+ /* Is this a DS session */ -+ if (data->fldata.ds_nfs_client) { -+ dprintk("%s DS write\n", __func__); -+ clp = data->fldata.ds_nfs_client; -+ } 
-+#endif /* CONFIG_NFS_V4_1 */ - - #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) - if (resp->verf->committed < argp->stable && task->tk_status >= 0) { -@@ -1172,7 +1242,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) - if (time_before(complain, jiffies)) { - dprintk("NFS: faulty NFS server %s:" - " (committed = %d) != (stable = %d)\n", -- server->nfs_client->cl_hostname, -+ clp->cl_hostname, - resp->verf->committed, argp->stable); - complain = jiffies + 300 * HZ; - } -@@ -1198,6 +1268,9 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) - */ - argp->stable = NFS_FILE_SYNC; - } -+#ifdef CONFIG_NFS_V4_1 -+ data->pdata.pnfs_error = -EAGAIN; -+#endif /* CONFIG_NFS_V4_1 */ - nfs_restart_rpc(task, server->nfs_client); - return -EAGAIN; - } -@@ -1242,40 +1315,73 @@ static void nfs_commitdata_release(void *data) - nfs_commit_free(wdata); - } - --/* -- * Set up the argument/result storage required for the RPC call. -- */ --static int nfs_commit_rpcsetup(struct list_head *head, -- struct nfs_write_data *data, -- int how) -+int nfs_initiate_commit(struct nfs_write_data *data, -+ struct rpc_clnt *clnt, -+ const struct rpc_call_ops *call_ops, -+ int how) - { -- struct nfs_page *first = nfs_list_entry(head->next); -- struct inode *inode = first->wb_context->path.dentry->d_inode; -+ struct inode *inode = data->inode; - int priority = flush_task_priority(how); - struct rpc_task *task; - struct rpc_message msg = { - .rpc_argp = &data->args, - .rpc_resp = &data->res, -- .rpc_cred = first->wb_context->cred, -+ .rpc_cred = data->cred, - }; - struct rpc_task_setup task_setup_data = { - .task = &data->task, -- .rpc_client = NFS_CLIENT(inode), -+ .rpc_client = clnt, - .rpc_message = &msg, -- .callback_ops = &nfs_commit_ops, -+ .callback_ops = call_ops, - .callback_data = data, - .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC, - .priority = priority, - }; - -+ /* Set up the initial task struct. 
*/ -+ NFS_PROTO(inode)->commit_setup(data, &msg); -+ -+ dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); -+ -+ task = rpc_run_task(&task_setup_data); -+ if (IS_ERR(task)) -+ return PTR_ERR(task); -+ rpc_put_task(task); -+ return 0; -+} -+EXPORT_SYMBOL(nfs_initiate_commit); -+ -+ -+int pnfs_initiate_commit(struct nfs_write_data *data, -+ struct rpc_clnt *clnt, -+ const struct rpc_call_ops *call_ops, -+ int how, int pnfs) -+{ -+ if (pnfs && -+ (pnfs_try_to_commit(data, &nfs_commit_ops, how) == PNFS_ATTEMPTED)) -+ return pnfs_get_write_status(data); -+ -+ return nfs_initiate_commit(data, clnt, &nfs_commit_ops, how); -+} -+ -+/* -+ * Set up the argument/result storage required for the RPC call. -+ */ -+static int nfs_commit_rpcsetup(struct list_head *head, -+ struct nfs_write_data *data, -+ int how, int pnfs) -+{ -+ struct nfs_page *first = nfs_list_entry(head->next); -+ struct inode *inode = first->wb_context->path.dentry->d_inode; -+ - /* Set up the RPC argument and reply structs - * NB: take care not to mess about with data->commit et al. */ - - list_splice_init(head, &data->pages); - - data->inode = inode; -- data->cred = msg.rpc_cred; -+ data->cred = first->wb_context->cred; - - data->args.fh = NFS_FH(data->inode); - /* Note: we always request a commit of the entire inode */ -@@ -1286,45 +1392,47 @@ static int nfs_commit_rpcsetup(struct list_head *head, - data->res.fattr = &data->fattr; - data->res.verf = &data->verf; - nfs_fattr_init(&data->fattr); -+ kref_init(&data->refcount); -+ data->parent = NULL; -+ data->args.context = first->wb_context; /* used by commit done */ - -- /* Set up the initial task struct. 
*/ -- NFS_PROTO(inode)->commit_setup(data, &msg); -+ return pnfs_initiate_commit(data, NFS_CLIENT(inode), &nfs_commit_ops, -+ how, pnfs); -+} - -- dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); -+/* Handle memory error during commit */ -+void nfs_mark_list_commit(struct list_head *head) -+{ -+ struct nfs_page *req; - -- task = rpc_run_task(&task_setup_data); -- if (IS_ERR(task)) -- return PTR_ERR(task); -- rpc_put_task(task); -- return 0; -+ while (!list_empty(head)) { -+ req = nfs_list_entry(head->next); -+ nfs_list_remove_request(req); -+ nfs_mark_request_commit(req); -+ dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); -+ dec_bdi_stat(req->wb_page->mapping->backing_dev_info, -+ BDI_RECLAIMABLE); -+ nfs_clear_page_tag_locked(req); -+ } - } -+EXPORT_SYMBOL(nfs_mark_list_commit); - - /* - * Commit dirty pages - */ - static int --nfs_commit_list(struct inode *inode, struct list_head *head, int how) -+nfs_commit_list(struct inode *inode, struct list_head *head, int how, int pnfs) - { - struct nfs_write_data *data; -- struct nfs_page *req; - - data = nfs_commitdata_alloc(); -- - if (!data) - goto out_bad; - - /* Set up the argument struct */ -- return nfs_commit_rpcsetup(head, data, how); -+ return nfs_commit_rpcsetup(head, data, how, pnfs); - out_bad: -- while (!list_empty(head)) { -- req = nfs_list_entry(head->next); -- nfs_list_remove_request(req); -- nfs_mark_request_commit(req); -- dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); -- dec_bdi_stat(req->wb_page->mapping->backing_dev_info, -- BDI_RECLAIMABLE); -- nfs_clear_page_tag_locked(req); -- } -+ nfs_mark_list_commit(head); - nfs_commit_clear_lock(NFS_I(inode)); - return -ENOMEM; - } -@@ -1344,6 +1452,19 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata) - return; - } - -+static inline void nfs_commit_cleanup(struct kref *kref) -+{ -+ struct nfs_write_data *data; -+ -+ data = container_of(kref, struct nfs_write_data, refcount); -+ /* Clear lock only when all cloned 
commits are finished */ -+ if (data->parent) -+ kref_put(&data->parent->refcount, nfs_commit_cleanup); -+ else -+ nfs_commit_clear_lock(NFS_I(data->inode)); -+ nfs_commitdata_release(data); -+} -+ - static void nfs_commit_release(void *calldata) - { - struct nfs_write_data *data = calldata; -@@ -1361,6 +1482,11 @@ static void nfs_commit_release(void *calldata) - req->wb_bytes, - (long long)req_offset(req)); - if (status < 0) { -+ if (req->wb_lseg) { -+ nfs_mark_request_nopnfs(req); -+ nfs_mark_request_dirty(req); -+ goto next; -+ } - nfs_context_set_write_error(req->wb_context, status); - nfs_inode_remove_request(req); - dprintk(", error = %d\n", status); -@@ -1377,12 +1503,12 @@ static void nfs_commit_release(void *calldata) - } - /* We have a mismatch. Write the page again */ - dprintk(" mismatch\n"); -+ nfs_mark_request_nopnfs(req); - nfs_mark_request_dirty(req); - next: - nfs_clear_page_tag_locked(req); - } -- nfs_commit_clear_lock(NFS_I(data->inode)); -- nfs_commitdata_release(calldata); -+ kref_put(&data->refcount, nfs_commit_cleanup); - } - - static const struct rpc_call_ops nfs_commit_ops = { -@@ -1398,21 +1524,22 @@ int nfs_commit_inode(struct inode *inode, int how) - LIST_HEAD(head); - int may_wait = how & FLUSH_SYNC; - int res = 0; -+ int use_pnfs = 0; - - if (!nfs_commit_set_lock(NFS_I(inode), may_wait)) - goto out_mark_dirty; - spin_lock(&inode->i_lock); -- res = nfs_scan_commit(inode, &head, 0, 0); -+ res = nfs_scan_commit(inode, &head, 0, 0, &use_pnfs); - spin_unlock(&inode->i_lock); - if (res) { -- int error = nfs_commit_list(inode, &head, how); -+ int error = nfs_commit_list(inode, &head, how, use_pnfs); - if (error < 0) - return error; -- if (may_wait) -+ if (may_wait) { - wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT, - nfs_wait_bit_killable, - TASK_KILLABLE); -- else -+ } else - goto out_mark_dirty; - } else - nfs_commit_clear_lock(NFS_I(inode)); -@@ -1465,7 +1592,18 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct 
writeback_contr - - int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) - { -- return nfs_commit_unstable_pages(inode, wbc); -+ int ret; -+ ret = nfs_commit_unstable_pages(inode, wbc); -+ if (ret >= 0 && layoutcommit_needed(NFS_I(inode))) { -+ int err, sync = wbc->sync_mode; -+ -+ if (wbc->nonblocking || wbc->for_background) -+ sync = 0; -+ err = pnfs_layoutcommit_inode(inode, sync); -+ if (err < 0) -+ ret = err; -+ } -+ return ret; - } - - /* -diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig -index 4264377..62033eb 100644 ---- a/fs/nfsd/Kconfig -+++ b/fs/nfsd/Kconfig -@@ -79,3 +79,52 @@ config NFSD_V4 - available from http://linux-nfs.org/. - - If unsure, say N. -+ -+config PNFSD -+ bool "NFSv4.1 server support for Parallel NFS (pNFS) (DEVELOPER ONLY)" -+ depends on NFSD_V4 && EXPERIMENTAL -+ select EXPORTFS_FILE_LAYOUT -+ help -+ This option enables support for the parallel NFS features of the -+ minor version 1 of the NFSv4 protocol (draft-ietf-nfsv4-minorversion1) -+ in the kernel's NFS server. -+ -+ Unless you're an NFS developer, say N. -+ -+config PNFSD_LOCAL_EXPORT -+ bool "Enable pNFS support for exporting local filesystems for debugging purposes" -+ depends on PNFSD -+ help -+ Say Y here if you want your pNFS server to export local file systems -+ over the files layout type. With this option the MDS (metadata -+ server) functions also as a single DS (data server). This is mostly -+ useful for development and debugging purposes. -+ -+ If unsure, say N. -+ -+config SPNFS -+ bool "Provide spNFS server support (EXPERIMENTAL)" -+ depends on PNFSD -+ select RPCSEC_GSS_KRB5 -+ help -+ Say Y here if you want spNFS server support. -+ -+ If unsure, say N. -+ -+config SPNFS_LAYOUTSEGMENTS -+ bool "Allow spNFS to return partial file layouts (EXPERIMENTAL)" -+ depends on SPNFS -+ select RPCSEC_GSS_KRB5 -+ help -+ Say Y here if you want spNFS to be able to return layout segments. -+ -+ If unsure, say N. 
-+ -+config SPNFS_BLOCK -+ bool "Provide Block Layout server support (EXPERIMENTAL)" -+ depends on SPNFS -+ select EXPORTFS_BLOCK_LAYOUT -+ help -+ Say Y here if you want spNFS block layout support -+ -+ If unsure, say N. -diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile -index 9b118ee..fed6c25 100644 ---- a/fs/nfsd/Makefile -+++ b/fs/nfsd/Makefile -@@ -11,3 +11,7 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o - nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o - nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ - nfs4acl.o nfs4callback.o nfs4recover.o -+nfsd-$(CONFIG_PNFSD) += nfs4pnfsd.o nfs4pnfsdlm.o nfs4pnfsds.o -+nfsd-$(CONFIG_PNFSD_LOCAL_EXPORT) += pnfsd_lexp.o -+nfsd-$(CONFIG_SPNFS) += spnfs_com.o spnfs_ops.o -+nfsd-$(CONFIG_SPNFS_BLOCK) += bl_com.o bl_ops.o -diff --git a/fs/nfsd/bl_com.c b/fs/nfsd/bl_com.c -new file mode 100644 -index 0000000..aac98c7 ---- /dev/null -+++ b/fs/nfsd/bl_com.c -@@ -0,0 +1,292 @@ -+#if defined(CONFIG_SPNFS_BLOCK) -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+ -+#define NFSDDBG_FACILITY NFSDDBG_PNFS -+ -+static ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *, -+ char __user *, size_t); -+static ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); -+static void bl_pipe_destroy_msg(struct rpc_pipe_msg *); -+ -+static struct rpc_pipe_ops bl_upcall_ops = { -+ .upcall = bl_pipe_upcall, -+ .downcall = bl_pipe_downcall, -+ .destroy_msg = bl_pipe_destroy_msg, -+}; -+ -+bl_comm_t *bl_comm_global; -+ -+int -+nfsd_bl_start(void) -+{ -+ bl_comm_t *bl_comm = NULL; -+ struct path path; -+ struct nameidata nd; -+ int rc; -+ -+ dprintk("%s: starting pipe\n", __func__); -+ if (bl_comm_global) -+ return -EEXIST; -+ -+ path.mnt = rpc_get_mount(); -+ if (IS_ERR(path.mnt)) -+ return PTR_ERR(path.mnt); -+ -+ /* FIXME: do not abuse 
rpc_pipefs/nfs */ -+ rc = vfs_path_lookup(path.mnt->mnt_root, path.mnt, "/nfs", 0, &nd); -+ if (rc) -+ goto err; -+ -+ bl_comm = kzalloc(sizeof (*bl_comm), GFP_KERNEL); -+ if (!bl_comm) { -+ rc = -ENOMEM; -+ goto err; -+ } -+ -+ /* FIXME: rename to "spnfs_block" */ -+ bl_comm->pipe_dentry = rpc_mkpipe(nd.path.dentry, "pnfs_block", bl_comm, -+ &bl_upcall_ops, 0); -+ if (IS_ERR(bl_comm->pipe_dentry)) { -+ rc = -EPIPE; -+ goto err; -+ } -+ mutex_init(&bl_comm->lock); -+ mutex_init(&bl_comm->pipe_lock); -+ init_waitqueue_head(&bl_comm->pipe_wq); -+ -+ bl_comm_global = bl_comm; -+ return 0; -+err: -+ rpc_put_mount(); -+ kfree(bl_comm); -+ return rc; -+} -+ -+void -+nfsd_bl_stop(void) -+{ -+ bl_comm_t *c = bl_comm_global; -+ -+ dprintk("%s: stopping pipe\n", __func__); -+ if (!c) -+ return; -+ rpc_unlink(c->pipe_dentry); -+ rpc_put_mount(); -+ bl_comm_global = NULL; -+ kfree(c); -+} -+ -+static ssize_t -+bl_pipe_upcall(struct file *file, struct rpc_pipe_msg *msg, char __user *dst, -+ size_t buflen) -+{ -+ char *data = (char *)msg->data + msg->copied; -+ ssize_t mlen = msg->len - msg->copied, -+ left; -+ -+ if (mlen > buflen) -+ mlen = buflen; -+ -+ left = copy_to_user(dst, data, mlen); -+ if (left < 0) { -+ msg->errno = left; -+ return left; -+ } -+ mlen -= left; -+ msg->copied += mlen; -+ msg->errno = 0; -+ -+ return mlen; -+} -+ -+static ssize_t -+bl_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) -+{ -+ struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode); -+ bl_comm_t *bc = (bl_comm_t *)rpci->private; -+ bl_comm_msg_t *im = &bc->msg; -+ int ret; -+ bl_comm_res_t *res; -+ -+ -+ if (mlen == 0) { -+ im->msg_status = PNFS_BLOCK_FAILURE; -+ im->msg_res = NULL; -+ wake_up(&bc->pipe_wq); -+ return -EFAULT; -+ } -+ -+ if ((res = kmalloc(mlen, GFP_KERNEL)) == NULL) -+ return -ENOMEM; -+ -+ if (copy_from_user(res, src, mlen)) { -+ kfree(res); -+ return -EFAULT; -+ } -+ -+ mutex_lock(&bc->pipe_lock); -+ -+ ret = mlen; -+ im->msg_status = 
res->res_status; -+ im->msg_res = res; -+ -+ wake_up(&bc->pipe_wq); -+ mutex_unlock(&bc->pipe_lock); -+ return ret; -+} -+ -+static void -+bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) -+{ -+ bl_comm_msg_t *im = msg->data; -+ bl_comm_t *bc = container_of(im, struct bl_comm, msg); -+ -+ if (msg->errno >= 0) -+ return; -+ -+ mutex_lock(&bc->pipe_lock); -+ im->msg_status = PNFS_BLOCK_FAILURE; -+ wake_up(&bc->pipe_wq); -+ mutex_unlock(&bc->pipe_lock); -+} -+ -+int -+bl_upcall(bl_comm_t *bc, bl_comm_msg_t *upmsg, bl_comm_res_t **res) -+{ -+ struct rpc_pipe_msg msg; -+ DECLARE_WAITQUEUE(wq, current); -+ int rval = 1; -+ bl_comm_msg_t *m = &bc->msg; -+ -+ if (bc == NULL) { -+ dprintk("%s: No pNFS block daemon available\n", __func__); -+ return 1; -+ } -+ -+ mutex_lock(&bc->lock); -+ mutex_lock(&bc->pipe_lock); -+ -+ memcpy(m, upmsg, sizeof (*m)); -+ -+ memset(&msg, 0, sizeof (msg)); -+ msg.data = m; -+ msg.len = sizeof (*m); -+ -+ add_wait_queue(&bc->pipe_wq, &wq); -+ rval = rpc_queue_upcall(bc->pipe_dentry->d_inode, &msg); -+ if (rval < 0) { -+ remove_wait_queue(&bc->pipe_wq, &wq); -+ goto out; -+ } -+ -+ set_current_state(TASK_UNINTERRUPTIBLE); -+ mutex_unlock(&bc->pipe_lock); -+ schedule(); -+ __set_current_state(TASK_RUNNING); -+ remove_wait_queue(&bc->pipe_wq, &wq); -+ mutex_lock(&bc->pipe_lock); -+ -+ if (m->msg_status == PNFS_BLOCK_SUCCESS) { -+ *res = m->msg_res; -+ rval = 0; -+ } else -+ rval = 1; -+ -+out: -+ mutex_unlock(&bc->pipe_lock); -+ mutex_unlock(&bc->lock); -+ return rval; -+} -+ -+static ssize_t ctl_write(struct file *file, const char __user *buf, size_t len, -+ loff_t *offset) -+{ -+ int cmd, -+ rc; -+ bl_comm_t *bc = bl_comm_global; -+ bl_comm_msg_t msg; -+ bl_comm_res_t *res; -+ -+ if (copy_from_user((int *)&cmd, (int *)buf, sizeof (int))) -+ return -EFAULT; -+ switch (cmd) { -+ case PNFS_BLOCK_CTL_STOP: -+ msg.msg_type = PNFS_UPCALL_MSG_STOP; -+ (void) bl_upcall(bc, &msg, &res); -+ kfree(res); -+ nfsd_bl_stop(); -+ break; -+ -+ case 
PNFS_BLOCK_CTL_START: -+ rc = nfsd_bl_start(); -+ if (rc != 0) -+ return rc; -+ break; -+ -+ case PNFS_BLOCK_CTL_VERS: -+ msg.msg_type = PNFS_UPCALL_MSG_VERS; -+ msg.u.msg_vers = PNFS_UPCALL_VERS; -+ if (bl_upcall(bc, &msg, &res)) { -+ dprintk("%s: Failed to contact pNFS block daemon\n", -+ __func__); -+ return 0; -+ } -+ kfree(res); -+ break; -+ -+ default: -+ dprintk("%s: unknown ctl command %d\n", __func__, cmd); -+ break; -+ } -+ return len; -+} -+ -+static struct file_operations ctl_ops = { -+ .write = ctl_write, -+}; -+ -+/* -+ * bl_init_proc -- set up proc interfaces -+ * -+ * Creating a pnfs_block directory isn't really required at this point -+ * since we've only got a single node in that directory. If the need for -+ * more nodes doesn't present itself shortly this code should revert -+ * to a single top level node. McNeal 11-Aug-2008. -+ */ -+int -+bl_init_proc(void) -+{ -+ struct proc_dir_entry *e; -+ -+ e = proc_mkdir("fs/pnfs_block", NULL); -+ if (!e) -+ return -ENOMEM; -+ -+ e = create_proc_entry("fs/pnfs_block/ctl", 0, NULL); -+ if (!e) -+ return -ENOMEM; -+ e->proc_fops = &ctl_ops; -+ -+ return 0; -+} -+#endif /* CONFIG_SPNFS_BLOCK */ -diff --git a/fs/nfsd/bl_ops.c b/fs/nfsd/bl_ops.c -new file mode 100644 -index 0000000..e41b61b ---- /dev/null -+++ b/fs/nfsd/bl_ops.c -@@ -0,0 +1,1672 @@ -+/* -+ * bl_ops.c -+ * spNFS -+ * -+ * Created by Rick McNeal on 4/1/08. -+ * Copyright 2008 __MyCompanyName__. All rights reserved. -+ * -+ */ -+ -+/* -+ * Block layout operations. -+ * -+ * These functions, with the exception of pnfs_block_enabled, are assigned to -+ * the super block s_export_op structure. -+ */ -+#if defined(CONFIG_SPNFS_BLOCK) -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include "pnfsd.h" -+ -+#define NFSDDBG_FACILITY NFSDDBG_PNFS -+ -+#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) -+ -+#define BL_LAYOUT_HASH_BITS 4 -+#define BL_LAYOUT_HASH_SIZE (1 << BL_LAYOUT_HASH_BITS) -+#define BL_LAYOUT_HASH_MASK (BL_LAYOUT_HASH_SIZE - 1) -+#define BL_LIST_REQ (sizeof (struct dm_ioctl) + 256) -+ -+#define bl_layout_hashval(id) \ -+ ((id) & BL_LAYOUT_HASH_MASK) -+ -+#define BLL_F_END(p) ((p)->bll_foff + (p)->bll_len) -+#define BLL_S_END(p) ((p)->bll_soff + (p)->bll_len) -+#define _2SECTS(v) ((v) >> 9) -+ -+#ifndef READ32 -+#define READ32(x) (x) = ntohl(*p++) -+#define READ64(x) do { \ -+(x) = (u64)ntohl(*p++) << 32; \ -+(x) |= ntohl(*p++); \ -+} while (0) -+#endif -+ -+ -+typedef enum {True, False} boolean_t; -+/* ---- block layoutget and commit structure ---- */ -+typedef struct bl_layout_rec { -+ struct list_head blr_hash, -+ blr_layouts; -+ dev_t blr_rdev; -+ struct inode *blr_inode; -+ int blr_recalled; // debug -+ u64 blr_orig_size, -+ blr_commit_size, -+ blr_ext_size; -+ spinlock_t blr_lock; // Protects blr_layouts -+} bl_layout_rec_t; -+ -+static struct list_head layout_hash; -+static struct list_head layout_hashtbl[BL_LAYOUT_HASH_SIZE]; -+static spinlock_t layout_hashtbl_lock; -+ -+/* ---- prototypes ---- */ -+static boolean_t device_slice(dev_t devid); -+static boolean_t device_dm(dev_t devid); -+static boolean_t layout_inode_add(struct inode *i, bl_layout_rec_t **); -+static bl_layout_rec_t *layout_inode_find(struct inode *i); -+static void layout_inode_del(struct inode *i); -+static char *map_state2name(enum pnfs_block_extent_state4 s); -+static pnfs_blocklayout_devinfo_t *bld_alloc(struct list_head *volume, int type); -+static void bld_free(pnfs_blocklayout_devinfo_t *bld); -+static pnfs_blocklayout_devinfo_t *bld_simple(struct list_head *volumes, -+ dev_t devid, int local_index); -+static pnfs_blocklayout_devinfo_t *bld_slice(struct list_head *volumes, -+ dev_t devid, int my_loc, int idx); -+static int layout_cache_fill_from(bl_layout_rec_t *r, struct list_head *h, -+ struct nfsd4_layout_seg *seg); -+struct list_head 
*layout_cache_iter(bl_layout_rec_t *r, -+ struct list_head *bl_possible, struct nfsd4_layout_seg *seg); -+static void layout_cache_merge(bl_layout_rec_t *r, struct list_head *h); -+static int layout_cache_update(bl_layout_rec_t *r, struct list_head *h); -+static void layout_cache_del(bl_layout_rec_t *r, const struct nfsd4_layout_seg *seg); -+static void print_bll(pnfs_blocklayout_layout_t *b, char *); -+static inline boolean_t layout_cache_fill_from_list(bl_layout_rec_t *r, -+ struct list_head *h, struct nfsd4_layout_seg *seg); -+static inline void bll_collapse(bl_layout_rec_t *r, -+ pnfs_blocklayout_layout_t *c); -+static pnfs_blocklayout_layout_t *bll_alloc(u64 offset, u64 len, -+ enum bl_cache_state state, struct list_head *h); -+static pnfs_blocklayout_layout_t *bll_alloc_dup(pnfs_blocklayout_layout_t *b, -+ enum bl_cache_state c, struct list_head *h); -+static inline boolean_t layout_conflict(pnfs_blocklayout_layout_t *b, u32 iomode, -+ enum pnfs_block_extent_state4 *s); -+static void extents_setup(struct fiemap_extent_info *fei); -+static void extents_count(struct fiemap_extent_info *fei, struct inode *i, -+ u64 foff, u64 len); -+static boolean_t extents_get(struct fiemap_extent_info *fei, struct inode *i, -+ u64 foff, u64 len); -+static boolean_t extents_process(struct fiemap_extent_info *fei, -+ struct list_head *bl_candidates, struct nfsd4_layout_seg *, dev_t dev, -+ pnfs_blocklayout_layout_t *b); -+static void extents_cleanup(struct fiemap_extent_info *fei); -+ -+void -+nfsd_bl_init(void) -+{ -+ int i; -+ dprintk("%s loaded\n", __func__); -+ -+ spin_lock_init(&layout_hashtbl_lock); -+ INIT_LIST_HEAD(&layout_hash); -+ for (i = 0; i < BL_LAYOUT_HASH_SIZE; i++) -+ INIT_LIST_HEAD(&layout_hashtbl[i]); -+ bl_init_proc(); -+} -+ -+/* -+ * pnfs_block_enabled -- check to see if this file system should be export as -+ * block pnfs -+ */ -+int -+pnfs_block_enabled(struct inode *inode, int ex_flags) -+{ -+ bl_comm_msg_t msg; -+ bl_comm_res_t *res = NULL; -+ static 
int bl_comm_once = 0; -+ -+ dprintk("--> %s\n", __func__); -+ /* -+ * FIXME: Figure out method to determine if this file system should -+ * be exported. The following areas need to be checked. -+ * (1) Validate that this file system was exported as a pNFS -+ * block-layout -+ * (2) Has there been successful communication with the -+ * volume daemon? -+ */ -+ /* Check #1 */ -+#ifdef notyet -+ if (!(ex_flags & NFSEXP_PNFS_BLOCK)) { -+ dprintk("%s: pnfs_block not set in export\n", __func__); -+ return 0; -+ } -+#endif -+ -+ /* Check #1 */ -+ if (!bl_comm_once) { -+ msg.msg_type = PNFS_UPCALL_MSG_VERS; -+ msg.u.msg_vers = PNFS_UPCALL_VERS; -+ if (bl_upcall(bl_comm_global, &msg, &res)) { -+ dprintk("%s: Failed to contact pNFS block daemon\n", -+ __func__); -+ return 0; -+ } -+ if (msg.u.msg_vers != res->u.vers) { -+ dprintk("%s: vers mismatch, kernel != daemon\n", -+ __func__); -+ kfree(res); -+ return 0; -+ } -+ } -+ bl_comm_once = 1; -+ -+ kfree(res); -+ -+ dprintk("<-- %s okay\n", __func__); -+ return 1; -+} -+ -+int -+bl_layout_type(struct super_block *sb) -+{ -+ return LAYOUT_BLOCK_VOLUME; -+} -+ -+int -+bl_getdeviceiter(struct super_block *sb, -+ u32 layout_type, -+ struct nfsd4_pnfs_dev_iter_res *res) -+{ -+ res->gd_eof = 1; -+ if (res->gd_cookie) -+ return -ENOENT; -+ res->gd_devid = sb->s_dev; -+ res->gd_verf = 1; -+ res->gd_cookie = 1; -+ return 0; -+} -+ -+static int -+bl_getdeviceinfo_slice(struct super_block *sb, struct exp_xdr_stream *xdr, -+ const struct nfsd4_pnfs_deviceid *devid) -+{ -+ pnfs_blocklayout_devinfo_t *bld_slice_p, -+ *bld_simple_p, -+ *bld; -+ int status = -EIO, -+ location = 0; -+ struct list_head volumes; -+ -+ dprintk("--> %s\n", __func__); -+ INIT_LIST_HEAD(&volumes); -+ -+ bld_simple_p = bld_simple(&volumes, devid->devid, -+ location++); -+ if (!bld_simple_p) -+ goto out; -+ bld_slice_p = bld_slice(&volumes, devid->devid, location++, -+ bld_simple_p->bld_index_loc); -+ -+ if (!bld_slice_p) -+ goto out; -+ -+ status = 
blocklayout_encode_devinfo(xdr, &volumes); -+ -+out: -+ while (!list_empty(&volumes)) { -+ bld = list_entry(volumes.next, pnfs_blocklayout_devinfo_t, -+ bld_list); -+ if (bld->bld_type == PNFS_BLOCK_VOLUME_SIMPLE) -+ kfree(bld->u.simple.bld_sig); -+ bld_free(bld); -+ } -+ -+ dprintk("<-- %s (rval %d)\n", __func__, status); -+ return status; -+} -+ -+static int -+bl_getdeviceinfo_dm(struct super_block *sb, struct exp_xdr_stream *xdr, -+ const struct nfsd4_pnfs_deviceid *devid) -+{ -+ pnfs_blocklayout_devinfo_t *bld = NULL; -+ int status = -EIO, // default to error -+ i, -+ location = 0; -+ struct list_head volumes; -+ bl_comm_msg_t msg; -+ bl_comm_res_t *res; -+ -+ dprintk("--> %s\n", __func__); -+ INIT_LIST_HEAD(&volumes); -+ -+ msg.msg_type = PNFS_UPCALL_MSG_DMGET; -+ msg.u.msg_dev = devid->devid; -+ if (bl_upcall(bl_comm_global, &msg, &res)) { -+ dprintk("%s: upcall for DMGET failed\n", __func__); -+ goto out; -+ } -+ -+ /* -+ * Don't use bld_alloc() here. If used this will be the first volume -+ * type added to the list whereas the protocol requires it to be the -+ * last. 
-+ */ -+ bld = kmalloc(sizeof (*bld), GFP_KERNEL); -+ if (!bld) -+ goto out; -+ memset(bld, 0, sizeof (*bld)); -+ bld->bld_type = PNFS_BLOCK_VOLUME_STRIPE; -+ bld->u.stripe.bld_stripes = res->u.stripe.num_stripes; -+ bld->u.stripe.bld_chunk_size = res->u.stripe.stripe_size * 512LL; -+ dprintk("%s: stripes %d, chunk_size %Lu\n", __func__, -+ bld->u.stripe.bld_stripes, bld->u.stripe.bld_chunk_size / 512LL); -+ -+ bld->u.stripe.bld_stripe_indexs = kmalloc(bld->u.stripe.bld_stripes * -+ sizeof (int), GFP_KERNEL); -+ if (!bld->u.stripe.bld_stripe_indexs) -+ goto out; -+ -+ for (i = 0; i < bld->u.stripe.bld_stripes; i++) { -+ dev_t dev; -+ pnfs_blocklayout_devinfo_t *bldp; -+ -+ dev = MKDEV(res->u.stripe.devs[i].major, -+ res->u.stripe.devs[i].minor); -+ if (dev == 0) -+ goto out; -+ -+ bldp = bld_simple(&volumes, dev, location++); -+ if (!bldp) { -+ dprintk("%s: bld_simple failed\n", __func__); -+ goto out; -+ } -+ bldp = bld_slice(&volumes, dev, location++, bldp->bld_index_loc); -+ -+ if (!bldp) { -+ dprintk("%s: bld_slice failed\n", __func__); -+ goto out; -+ } -+ bld->u.stripe.bld_stripe_indexs[i] = bldp->bld_index_loc; -+ -+ } -+ list_add_tail(&bld->bld_list, &volumes); -+ status = blocklayout_encode_devinfo(xdr, &volumes); -+ -+out: -+ while (!list_empty(&volumes)) { -+ bld = list_entry(volumes.next, pnfs_blocklayout_devinfo_t, -+ bld_list); -+ switch (bld->bld_type) { -+ case PNFS_BLOCK_VOLUME_SLICE: -+ case PNFS_BLOCK_VOLUME_CONCAT: -+ // No memory to release for these -+ break; -+ case PNFS_BLOCK_VOLUME_SIMPLE: -+ kfree(bld->u.simple.bld_sig); -+ break; -+ case PNFS_BLOCK_VOLUME_STRIPE: -+ kfree(bld->u.stripe.bld_stripe_indexs); -+ break; -+ } -+ bld_free(bld); -+ } -+ kfree(res); -+ dprintk("<-- %s (rval %d)\n", __func__, status); -+ return status; -+} -+ -+/* -+ * bl_getdeviceinfo -- determine device tree for requested devid -+ */ -+int -+bl_getdeviceinfo(struct super_block *sb, struct exp_xdr_stream *xdr, -+ u32 layout_type, -+ const struct 
nfsd4_pnfs_deviceid *devid) -+{ -+ if (device_slice(devid->devid) == True) -+ return bl_getdeviceinfo_slice(sb, xdr, devid); -+ else if (device_dm(devid->devid) == True) -+ return bl_getdeviceinfo_dm(sb, xdr, devid); -+ return -EINVAL; -+} -+ -+enum nfsstat4 -+bl_layoutget(struct inode *i, struct exp_xdr_stream *xdr, -+ const struct nfsd4_pnfs_layoutget_arg *arg, -+ struct nfsd4_pnfs_layoutget_res *res) -+{ -+ pnfs_blocklayout_layout_t *b; -+ bl_layout_rec_t *r; -+ struct list_head bl_possible, -+ *bl_candidates = NULL; -+ boolean_t del_on_error = False; -+ int adj; -+ enum nfsstat4 nfserr = NFS4_OK; -+ -+ dprintk("--> %s (inode=[0x%x:%lu], offset=%Lu, len=%Lu, iomode=%d)\n", -+ __func__, i->i_sb->s_dev, i->i_ino, _2SECTS(res->lg_seg.offset), -+ _2SECTS(res->lg_seg.length), res->lg_seg.iomode); -+ -+ if (res->lg_seg.length == 0) { -+ printk("%s: request length of 0, error condition\n", __func__); -+ return NFS4ERR_BADLAYOUT; -+ } -+ -+ /* -+ * Adjust the length as required per spec. -+ * - First case is were the length is set to (u64)-1. Cheap means to -+ * define the end of the file. -+ * - Second case is were the I/O mode is read-only, but the request is -+ * past the end of the file so the request needs to be trimed. -+ */ -+ if ((res->lg_seg.length == NFS4_MAX_UINT64) || -+ (((res->lg_seg.offset + res->lg_seg.length) > i->i_size) && -+ (res->lg_seg.iomode == IOMODE_READ))) -+ res->lg_seg.length = i->i_size - res->lg_seg.offset; -+ -+ adj = (res->lg_seg.offset & 511) ? 
res->lg_seg.offset & 511 : 0; -+ res->lg_seg.offset -= adj; -+ res->lg_seg.length = (res->lg_seg.length + adj + 511) & ~511; -+ -+ if (res->lg_seg.iomode != IOMODE_READ) -+ if (i->i_op->fallocate(i, FALLOC_FL_KEEP_SIZE, -+ res->lg_seg.offset, res->lg_seg.length)) -+ return NFS4ERR_IO; -+ -+ INIT_LIST_HEAD(&bl_possible); -+ -+ if ((r = layout_inode_find(i)) == NULL) { -+ if (layout_inode_add(i, &r) == False) { -+ printk("%s: layout_inode_add failed\n", __func__); -+ return NFS4ERR_IO; -+ } -+ del_on_error = True; -+ } -+ BUG_ON(!r); -+ -+ spin_lock(&r->blr_lock); -+ -+ if (layout_cache_fill_from(r, &bl_possible, &res->lg_seg)) { -+ /* -+ * This will send LAYOUTTRYAGAIN error to the client. -+ */ -+ dprintk("%s: layout_cache_fill_from() failed\n", __func__); -+ nfserr = NFS4ERR_LAYOUTTRYLATER; -+ goto layoutget_cleanup; -+ } -+ -+ res->lg_return_on_close = 1; -+ res->lg_seg.length = 0; -+ -+ bl_candidates = layout_cache_iter(r, &bl_possible, &res->lg_seg); -+ if (!bl_candidates) { -+ nfserr = NFS4ERR_LAYOUTTRYLATER; -+ goto layoutget_cleanup; -+ } -+ -+ layout_cache_merge(r, bl_candidates); -+ if (layout_cache_update(r, bl_candidates)) { -+ /* ---- Failed to allocate memory. 
---- */ -+ dprintk("%s: layout_cache_update() failed\n", __func__); -+ nfserr = NFS4ERR_LAYOUTTRYLATER; -+ goto layoutget_cleanup; -+ } -+ -+ nfserr = blocklayout_encode_layout(xdr, bl_candidates); -+ if (nfserr) -+ dprintk("%s: layoutget xdr routine failed\n", __func__); -+ -+layoutget_cleanup: -+ if (bl_candidates) { -+ while (!list_empty(bl_candidates)) { -+ b = list_entry(bl_candidates->next, -+ struct pnfs_blocklayout_layout, bll_list); -+ list_del(&b->bll_list); -+ kfree(b); -+ } -+ } -+ -+ spin_unlock(&r->blr_lock); -+ if (unlikely(nfserr)) { -+ if (del_on_error == True) -+ layout_inode_del(i); -+ res->lg_seg.length = 0; -+ res->lg_seg.offset = 0; -+ } -+ -+ dprintk("<-- %s (rval %u)\n", __func__, nfserr); -+ return nfserr; -+} -+ -+/* -+ * bl_layoutcommit -- commit changes, especially size, to file systemj -+ * -+ * Currently this routine isn't called and everything is handled within -+ * nfsd4_layoutcommit(). By not calling this routine the server doesn't -+ * handle a partial return, a set of extents, of the layout. The extents -+ * are decoded here, but nothing is done with them. If this routine is -+ * be called the interface must change to pass the 'dentry' pointer such -+ * that notify_change() can be called. 
-+ */ -+int -+bl_layoutcommit(struct inode *i, -+ const struct nfsd4_pnfs_layoutcommit_arg *args, -+ struct nfsd4_pnfs_layoutcommit_res *res) -+{ -+ bl_layout_rec_t *r; -+ int status = 0; -+ u64 lw_plus; -+ -+ dprintk("--> %s (ino [0x%x:%lu])\n", __func__, i->i_sb->s_dev, i->i_ino); -+ r = layout_inode_find(i); -+ if (r) { -+ lw_plus = args->lc_last_wr + 1; -+ if (args->lc_newoffset) { -+ dprintk(" lc_last_wr %Lu\n", lw_plus); -+ if (r->blr_orig_size < lw_plus) { -+ r->blr_orig_size = lw_plus; -+ res->lc_size_chg = 1; -+ res->lc_newsize = lw_plus; -+ } -+ } -+ -+ if (args->lc_up_len) { -+ int extents, -+ i; -+ struct pnfs_blocklayout_layout *b; -+ __be32 *p = args->lc_up_layout; -+ -+ /* -+ * Client is returning a set of extents which -+ * should/could be used to update the file system. -+ * See section 2.3.2 in draft-ietf-nfsv4-pnfs-block-08 -+ */ -+ READ32(extents); -+ dprintk(" Client returning %d extents: data size %d\n", -+ extents, args->lc_up_len); -+ b = kmalloc(sizeof (struct pnfs_blocklayout_layout) * -+ extents, GFP_KERNEL); -+ if (b) { -+ for (i = 0; i < extents; i++) { -+ READ64(b[i].bll_vol_id.sbid); -+ READ64(b[i].bll_vol_id.devid); -+ READ64(b[i].bll_foff); -+ READ64(b[i].bll_len); -+ READ64(b[i].bll_soff); -+ READ32(b[i].bll_es); -+ dprintk(" %d: foff %Lu, len %Lu, soff %Lu " -+ "state %s\n", -+ i, _2SECTS(b[i].bll_foff), -+ _2SECTS(b[i].bll_len), -+ _2SECTS(b[i].bll_soff), -+ map_state2name(b[i].bll_es)); -+ } -+ kfree(b); -+ } else { -+ status = -ENOMEM; -+ } -+ } -+ } else -+ dprintk("%s: Unexpected commit to inode %p\n", __func__, i); -+ -+ dprintk("<-- %s (rval %d)\n", __func__, status); -+ return status; -+} -+ -+int -+bl_layoutreturn(struct inode *i, -+ const struct nfsd4_pnfs_layoutreturn_arg *args) -+{ -+ int status = 0; -+ bl_layout_rec_t *r; -+ -+ dprintk("--> %s (ino [0x%x:%lu])\n", __func__, i->i_sb->s_dev, i->i_ino); -+ -+ r = layout_inode_find(i); -+ if (r) { -+ spin_lock(&r->blr_lock); -+ layout_cache_del(r, &args->lr_seg); -+ 
spin_unlock(&r->blr_lock); -+ dprintk(" ext_size %Lu, i_size %Lu, orig_size %Lu\n", -+ r->blr_ext_size, i->i_size, r->blr_orig_size); -+ } -+ -+ layout_inode_del(i); -+ dprintk("<-- %s (rval %d)\n", __func__, status); -+ return status; -+} -+ -+int -+bl_layoutrecall(struct inode *inode, int type, u64 offset, u64 len) -+{ -+ struct super_block *sb; -+ struct nfsd4_pnfs_cb_layout lr; -+ bl_layout_rec_t *r; -+ pnfs_blocklayout_layout_t *b; -+ u64 adj; -+ -+ dprintk("--> %s\n", __func__); -+ BUG_ON(!len); -+ switch (type) { -+ case RETURN_FILE: -+ sb = inode->i_sb; -+ dprintk(" recalling layout [0x%x:%lu], %Lu:%Lu\n", -+ inode->i_sb->s_dev, inode->i_ino, -+ _2SECTS(offset), _2SECTS(len)); -+ break; -+ case RETURN_FSID: -+ sb = inode->i_sb; -+ dprintk("%s: recalling layout for fsid x (unimplemented)\n", -+ __func__); -+ return 0; -+ case RETURN_ALL: -+ /* -+ * XXX figure out how to get a sb since there's no -+ * inode ptr -+ */ -+ dprintk("%s: recalling all layouts (unimplemented)\n", -+ __func__); -+ return 0; -+ default: -+ return -EINVAL; -+ } -+ -+restart: -+ r = layout_inode_find(inode); -+ if (r && len && !r->blr_recalled) { -+ spin_lock(&r->blr_lock); -+ list_for_each_entry(b, &r->blr_layouts, bll_list) { -+ if (!r->blr_recalled && !b->bll_recalled && -+ (offset >= b->bll_foff) && (offset < BLL_F_END(b))) { -+ b->bll_recalled = 1; -+ lr.cbl_recall_type = type; -+ lr.cbl_seg.layout_type = LAYOUT_BLOCK_VOLUME; -+ lr.cbl_seg.clientid = 0; -+ lr.cbl_seg.offset = 0; -+ lr.cbl_seg.length = NFS4_MAX_UINT64; -+ r->blr_recalled = 1; -+ dprintk(" FULL LAYOUTRECALL\n"); -+ lr.cbl_seg.iomode = IOMODE_ANY; -+ -+ /* -+ * Currently there are only two cases where the -+ * layout is being returned. -+ * (1) Someone is issuing a NFS_WRITE operation -+ * to this layout. -+ * (2) The file has been truncated which means -+ * the layout is immediately made invalid. -+ * In both cases the client must write any -+ * uncommitted modifications to the server via -+ * NFS_WRITE. 
-+ */ -+ lr.cbl_layoutchanged = 1; -+ -+ /* -+ * Need to drop the lock because we'll get a -+ * layoutreturn which will block waiting for -+ * the lock. The request will come in on the -+ * same thread which will cause a deadlock. -+ */ -+ spin_unlock(&r->blr_lock); -+ nfsd_layout_recall_cb(sb, inode, &lr); -+ adj = MIN(b->bll_len - (offset - b->bll_foff), -+ len); -+ offset += adj; -+ len -= adj; -+ if (!len) { -+ spin_lock(&r->blr_lock); -+ break; -+ } -+ /* -+ * Since layoutreturn will have been called we -+ * can't assume blr_layouts is still valid, -+ * so restart. -+ */ -+ goto restart; -+ } -+ } -+ spin_unlock(&r->blr_lock); -+ } -+ -+ dprintk("<-- %s\n", __func__); -+ return 0; -+} -+ -+/* -+ * []------------------------------------------------------------------[] -+ * | Support functions from here on down. | -+ * []------------------------------------------------------------------[] -+ */ -+ -+/* -+ * bld_simple -- given a dev_t build a simple volume structure -+ * -+ * Simple volume contains the device signature and offset to that data in -+ * the storage volume. 
-+ */ -+static pnfs_blocklayout_devinfo_t * -+bld_simple(struct list_head *volumes, dev_t devid, int local_index) -+{ -+ pnfs_blocklayout_devinfo_t *bld = NULL; -+ bl_comm_msg_t msg; -+ bl_comm_res_t *res = NULL; -+ -+ msg.msg_type = PNFS_UPCALL_MSG_GETSIG; -+ msg.u.msg_dev = devid; -+ if (bl_upcall(bl_comm_global, &msg, &res)) { -+ dprintk("%s: Failed to get signature information\n", __func__); -+ goto error; -+ } -+ -+ bld = bld_alloc(volumes, PNFS_BLOCK_VOLUME_SIMPLE); -+ if (!bld) -+ return NULL; -+ -+ bld->u.simple.bld_offset = (res->u.sig.sector * 512LL) + res->u.sig.offset; -+ bld->u.simple.bld_sig_len = res->u.sig.len; -+ bld->u.simple.bld_sig = kmalloc(res->u.sig.len, GFP_KERNEL); -+ if (!bld->u.simple.bld_sig) -+ goto error; -+ -+ memcpy(bld->u.simple.bld_sig, res->u.sig.sig, res->u.sig.len); -+ kfree(res); -+ return bld; -+ -+error: -+ if (bld) -+ bld_free(bld); -+ if (res) -+ kfree(res); -+ dprintk("%s: error in bld_simple\n", __func__); -+ return NULL; -+} -+ -+/* -+ * bld_slice -- given a dev_t build a slice volume structure -+ * -+ * A slice volume contains the length of the slice/partition and its offset -+ * from the beginning of the storage volume. There's also a reference to -+ * the "simple" volume which contains this slice. 
-+ */ -+static pnfs_blocklayout_devinfo_t * -+bld_slice(struct list_head *volumes, dev_t devid, int my_loc, int simple_loc) -+{ -+ pnfs_blocklayout_devinfo_t *bld; -+ bl_comm_msg_t msg; -+ bl_comm_res_t *res; -+ -+ dprintk("--> %s\n", __func__); -+ bld = bld_alloc(volumes, PNFS_BLOCK_VOLUME_SLICE); -+ if (!bld) -+ return NULL; -+ -+ msg.msg_type = PNFS_UPCALL_MSG_GETSLICE; -+ msg.u.msg_dev = devid; -+ if (bl_upcall(bl_comm_global, &msg, &res)) { -+ dprintk("Upcall to get slice info failed\n"); -+ bld_free(bld); -+ return NULL; -+ } -+ -+ bld->bld_devid.devid = devid; -+ bld->bld_index_loc = my_loc; -+ bld->u.slice.bld_start = res->u.slice.start * 512LL; -+ bld->u.slice.bld_len = res->u.slice.length * 512LL; -+ bld->u.slice.bld_index = simple_loc; -+ -+ dprintk("%s: start %Lu, len %Lu\n", __func__, -+ bld->u.slice.bld_start / 512LL, bld->u.slice.bld_len / 512LL); -+ -+ kfree(res); -+ dprintk("<-- %s (rval %p)\n", __func__, bld); -+ return bld; -+} -+ -+static int -+layout_cache_fill_from(bl_layout_rec_t *r, struct list_head *h, -+ struct nfsd4_layout_seg *seg) -+{ -+ pnfs_blocklayout_layout_t *n; -+ -+ dprintk("--> %s\n", __func__); -+ -+ if (!list_empty(&r->blr_layouts)) -+ if (layout_cache_fill_from_list(r, h, seg) == False) -+ return -EIO; -+ -+ /* -+ * This deals with two conditions. -+ * (1) When blr_layouts is empty we need to create the first entry -+ * (2) When the range requested falls past the end of any current -+ * layout the residual must be taken care of. 
-+ */ -+ if (seg->length) { -+ n = bll_alloc(seg->offset, seg->length, BLOCK_LAYOUT_NEW, h); -+ if (!n) -+ return -ENOMEM; -+ dprintk(" remaining at %Lu, len %Lu\n", _2SECTS(n->bll_foff), -+ _2SECTS(n->bll_len)); -+ } -+ -+ dprintk("<-- %s\n", __func__); -+ return 0; -+} -+ -+struct list_head * -+layout_cache_iter(bl_layout_rec_t *r, struct list_head *bl_possible, -+ struct nfsd4_layout_seg *seg) -+{ -+ pnfs_blocklayout_layout_t *b, -+ *n = NULL; -+ struct list_head *bl_candidates = NULL; -+ struct fiemap_extent_info fei; -+ struct inode *i; -+ dev_t dev; -+ -+ dev = r->blr_rdev; -+ i = r->blr_inode; -+ -+ dprintk("--> %s\n", __func__); -+ bl_candidates = kmalloc(sizeof (*bl_candidates), GFP_KERNEL); -+ if (!bl_candidates) -+ return NULL; -+ INIT_LIST_HEAD(bl_candidates); -+ extents_setup(&fei); -+ -+ list_for_each_entry(b, bl_possible, bll_list) { -+ if (b->bll_cache_state == BLOCK_LAYOUT_NEW) { -+ -+ extents_count(&fei, i, b->bll_foff, b->bll_len); -+ if (fei.fi_extents_mapped) { -+ -+ /* -+ * Common case here. Got a range which has -+ * extents. Now get those extents and process -+ * them into pNFS extents. -+ */ -+ if (extents_get(&fei, i, b->bll_foff, -+ b->bll_len) == False) -+ goto cleanup; -+ if (extents_process(&fei, bl_candidates, -+ seg, dev, b) == False) -+ goto cleanup; -+ extents_cleanup(&fei); -+ -+ } else if (seg->iomode == IOMODE_READ) { -+ -+ /* -+ * Found a hole in a file while reading. No -+ * problem, just create a pNFS extent for the -+ * range and let the client know there's no -+ * backing store. -+ */ -+ n = bll_alloc(b->bll_foff, b->bll_len, -+ BLOCK_LAYOUT_NEW, bl_candidates); -+ n->bll_es = PNFS_BLOCK_NONE_DATA; -+ n->bll_vol_id.sbid = 0; -+ n->bll_vol_id.devid = dev; -+ seg->length += b->bll_len; -+ } else { -+ -+ /* -+ * There's a problem here. Since the iomode -+ * is read/write fallocate should have allocated -+ * any necessary storage for the given range. 
-+ */ -+ dprintk(" Extent count for RW is 0\n"); -+ goto cleanup; -+ } -+ -+ } else { -+ n = bll_alloc_dup(b, b->bll_cache_state, bl_candidates); -+ seg->length += n->bll_len; -+ } -+ -+ if (r->blr_ext_size < (b->bll_foff + b->bll_len)) -+ r->blr_ext_size = b->bll_foff + b->bll_len; -+ } -+ -+ while (!list_empty(bl_possible)) { -+ b = list_entry(bl_possible->next, -+ struct pnfs_blocklayout_layout, bll_list); -+ list_del(&b->bll_list); -+ kfree(b); -+ } -+ -+ b = list_first_entry(bl_candidates, struct pnfs_blocklayout_layout, -+ bll_list); -+ seg->offset = b->bll_foff; -+ dprintk("<-- %s okay\n", __func__); -+ return bl_candidates; -+ -+cleanup: -+ extents_cleanup(&fei); -+ if (bl_candidates) -+ kfree(bl_candidates); -+ dprintk("<-- %s, error occurred\n", __func__); -+ return NULL; -+} -+ -+/* -+ * layout_cache_merge -- collapse layouts which make up a contiguous range. -+ */ -+static void -+layout_cache_merge(bl_layout_rec_t *r, struct list_head *h) -+{ -+ pnfs_blocklayout_layout_t *b, -+ *p; -+ -+ dprintk("--> %s\n", __func__); -+restart: -+ p = NULL; -+ list_for_each_entry(b, h, bll_list) { -+ if (p && (BLL_S_END(p) == b->bll_soff) && -+ (p->bll_es == b->bll_es) && -+ (b->bll_es != PNFS_BLOCK_NONE_DATA)) { -+ /* -+ * We've got a condidate. 
-+ */ -+#ifdef too_verbose -+ dprintk(" merge %Lu(f):%Lu(l):%Lu(s) into %Lu(f):%Lu(l):%Lu(s)\n", -+ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), -+ _2SECTS(b->bll_soff), -+ _2SECTS(p->bll_foff), _2SECTS(p->bll_len), -+ _2SECTS(b->bll_soff)); -+#endif -+ -+ if (p->bll_cache_state == BLOCK_LAYOUT_CACHE) -+ p->bll_cache_state = BLOCK_LAYOUT_UPDATE; -+ p->bll_len += b->bll_len; -+ list_del(&b->bll_list); -+ kfree(b); -+ goto restart; -+ } else if (p && (BLL_F_END(p) == b->bll_foff) && -+ (p->bll_es == b->bll_es) && -+ (b->bll_es == PNFS_BLOCK_NONE_DATA)) { -+ p->bll_len += b->bll_len; -+ list_del(&b->bll_list); -+ kfree(b); -+ goto restart; -+ } else -+ p = b; -+ } -+ dprintk("<-- %s\n", __func__); -+} -+ -+static int -+layout_cache_update(bl_layout_rec_t *r, struct list_head *h) -+{ -+ pnfs_blocklayout_layout_t *b, -+ *c, -+ *n; -+ boolean_t status = 0; -+ -+ dprintk("--> %s\n", __func__); -+ if (list_empty(&r->blr_layouts)) { -+ /* ---- Just add entries and return ---- */ -+ dprintk(" cache empty for inode 0x%x:%ld\n", r->blr_rdev, -+ r->blr_inode->i_ino); -+ list_for_each_entry(b, h, bll_list) { -+ c = bll_alloc_dup(b, BLOCK_LAYOUT_CACHE, -+ &r->blr_layouts); -+ if (!c) { -+ status = -ENOMEM; -+ break; -+ } -+ dprintk(" adding %Lu(f):%Lu(l):%Lu(s):%d\n", -+ _2SECTS(c->bll_foff), _2SECTS(c->bll_len), -+ _2SECTS(c->bll_soff), c->bll_es); -+ } -+ return status; -+ } -+ -+ list_for_each_entry(b, h, bll_list) { -+ BUG_ON(!b->bll_vol_id.devid); -+ if (b->bll_cache_state == BLOCK_LAYOUT_UPDATE) { -+ boolean_t found = False; -+ list_for_each_entry(c, &r->blr_layouts, bll_list) { -+ if ((b->bll_soff >= c->bll_soff) && -+ (b->bll_soff < BLL_S_END(c)) && -+ (b->bll_es != PNFS_BLOCK_NONE_DATA)) { -+ u64 u; -+ -+ if ((b->bll_foff < c->bll_foff) || -+ (b->bll_foff > BLL_F_END(c))) -+ BUG(); -+ -+ u = BLL_S_END(b) - BLL_S_END(c); -+ /* -+ * The updated cache entry has to be -+ * different than the current. -+ * Otherwise the cache state for 'b' -+ * should be BLOCK_LAYOUT_CACHE. 
-+ */ -+ BUG_ON(BLL_S_END(b) < BLL_S_END(c)); -+ -+ dprintk(" " -+ "updating %Lu(f):%Lu(l):%Lu(s) to len %Lu\n", -+ _2SECTS(c->bll_foff), -+ _2SECTS(c->bll_len), -+ _2SECTS(c->bll_soff), -+ _2SECTS(c->bll_len + u)); -+ c->bll_len += u; -+ bll_collapse(r, c); -+ found = True; -+ break; -+ } -+ } -+ -+ if (found == False) { -+ dprintk(" ERROR Expected to find" -+ " %Lu(f):%Lu(l):%Lu(s), but didn't\n", -+ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), -+ _2SECTS(b->bll_soff)); -+ list_for_each_entry(c, &r->blr_layouts, bll_list) -+ print_bll(c, "Cached"); -+ BUG(); -+ } -+ } else if (b->bll_cache_state == BLOCK_LAYOUT_NEW) { -+ -+ c = list_first_entry(&r->blr_layouts, -+ struct pnfs_blocklayout_layout, bll_list); -+ if (b->bll_foff < c->bll_foff) { -+ /* -+ * Special case where new entry is before -+ * first cached entry. -+ */ -+ c = bll_alloc_dup(b, BLOCK_LAYOUT_CACHE, NULL); -+ list_add(&c->bll_list, &r->blr_layouts); -+ dprintk(" new entry at head of list at %Lu, " -+ "len %Lu\n", -+ _2SECTS(c->bll_foff), _2SECTS(c->bll_len)); -+ } else { -+ list_for_each_entry(c, &r->blr_layouts, -+ bll_list) { -+ n = list_entry(c->bll_list.next, -+ struct pnfs_blocklayout_layout, -+ bll_list); -+ /* -+ * This is ugly, but can't think of -+ * another way to examine this case. -+ * Consider the following. Need to -+ * add an entry which starts at 40 -+ * and the cache has the following -+ * entries: -+ * Start Length -+ * 10 5 -+ * 30 5 -+ * 50 5 -+ * So, need to look and see if the new -+ * entry starts after the current -+ * cache, but before the next one. -+ * There's a catch in that the next -+ * entry might not be valid as it's -+ * really just a pointer to the list -+ * head. 
-+ */ -+ if (((b->bll_foff >= -+ BLL_F_END(c)) && -+ (c->bll_list.next == &r->blr_layouts)) || -+ ((b->bll_foff >= -+ BLL_F_END(c)) && -+ (b->bll_foff < n->bll_foff))) { -+ -+ n = bll_alloc_dup(b, -+ BLOCK_LAYOUT_CACHE, NULL); -+ dprintk(" adding new %Lu:%Lu" -+ " after %Lu:%Lu\n", -+ _2SECTS(n->bll_foff), -+ _2SECTS(n->bll_len), -+ _2SECTS(c->bll_foff), -+ _2SECTS(c->bll_len)); -+ list_add(&n->bll_list, -+ &c->bll_list); -+ break; -+ } -+ } -+ } -+ } -+ } -+ dprintk("<-- %s\n", __func__); -+ return status; -+} -+ -+static void -+layout_cache_del(bl_layout_rec_t *r, const struct nfsd4_layout_seg *seg_in) -+{ -+ struct pnfs_blocklayout_layout *b, -+ *n; -+ u64 len; -+ struct nfsd4_layout_seg seg = *seg_in; -+ -+ dprintk("--> %s\n", __func__); -+ if (seg.length == NFS4_MAX_UINT64) { -+ r->blr_recalled = 0; -+ dprintk(" Fast return of all layouts\n"); -+ while (!list_empty(&r->blr_layouts)) { -+ b = list_entry(r->blr_layouts.next, -+ struct pnfs_blocklayout_layout, bll_list); -+ dprintk(" foff %Lu, len %Lu, soff %Lu\n", -+ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), -+ _2SECTS(b->bll_soff)); -+ list_del(&b->bll_list); -+ kfree(b); -+ } -+ dprintk("<-- %s\n", __func__); -+ return; -+ } -+ -+restart: -+ list_for_each_entry(b, &r->blr_layouts, bll_list) { -+ if (seg.offset == b->bll_foff) { -+ /* -+ * This handle the following three cases: -+ * (1) return layout matches entire cache layout -+ * (2) return layout matches beginning portion of cache -+ * (3) return layout matches entire cache layout and -+ * into next entry. Varies from #1 in end case. 
-+ */ -+ dprintk(" match on offsets, %Lu:%Lu\n", -+ _2SECTS(seg.offset), _2SECTS(seg.length)); -+ len = MIN(seg.length, b->bll_len); -+ b->bll_foff += len; -+ b->bll_soff += len; -+ b->bll_len -= len; -+ seg.length -= len; -+ seg.offset += len; -+ if (!b->bll_len) { -+ list_del(&b->bll_list); -+ kfree(b); -+ dprintk(" removing cache line\n"); -+ if (!seg.length) { -+ dprintk(" also finished\n"); -+ goto complete; -+ } -+ /* -+ * Since 'b' was freed we can't continue at the -+ * next entry which is referenced as -+ * b->bll_list.next by the list_for_each_entry -+ * macro. Need to restart the loop. -+ * TODO: Think about creating a dummy 'b' which -+ * would keep list_for_each_entry() happy. -+ */ -+ goto restart; -+ } -+ if (!seg.length) { -+ dprintk(" finished, but cache line not" -+ "empty\n"); -+ goto complete; -+ } -+ } else if ((seg.offset >= b->bll_foff) && -+ (seg.offset < BLL_F_END(b))) { -+ /* -+ * layout being returned is within this cache line. -+ */ -+ dprintk(" layout %Lu:%Lu within cache line %Lu:%Lu\n", -+ _2SECTS(seg.offset), _2SECTS(seg.length), -+ _2SECTS(b->bll_foff), _2SECTS(b->bll_len)); -+ BUG_ON(!seg.length); -+ if ((seg.offset + seg.length) >= BLL_F_END(b)) { -+ /* -+ * Layout returned starts in the middle of -+ * cache entry and just need to trim back -+ * cache to shorter length. -+ */ -+ dprintk(" trim back cache line\n"); -+ len = seg.offset - b->bll_foff; -+ seg.offset += b->bll_len - len; -+ seg.length -= b->bll_len - len; -+ b->bll_len = len; -+ if (!seg.length) -+ return; -+ } else { -+ /* -+ * Need to split current cache layout because -+ * chunk is being removed from the middle. 
-+ */ -+ dprintk(" split cache line\n"); -+ len = seg.offset + seg.length; -+ n = bll_alloc(len, -+ (b->bll_foff + b->bll_len) - len, -+ BLOCK_LAYOUT_CACHE, NULL); -+ n->bll_soff = b->bll_soff + len; -+ list_add(&n->bll_list, &b->bll_list); -+ b->bll_len = seg.offset - b->bll_foff; -+ return; -+ } -+ } -+ } -+complete: -+ if (list_empty(&r->blr_layouts)) -+ r->blr_recalled = 0; -+ dprintk("<-- %s\n", __func__); -+} -+ -+/* -+ * layout_cache_fill_from_list -- fills from cache list -+ * -+ * NOTE: This routine was only seperated out from layout_cache_file_from() -+ * to reduce the indentation level which makes the code easier to read. -+ */ -+static inline boolean_t -+layout_cache_fill_from_list(bl_layout_rec_t *r, struct list_head *h, -+ struct nfsd4_layout_seg *seg) -+{ -+ pnfs_blocklayout_layout_t *b, -+ *n; -+ enum pnfs_block_extent_state4 s; -+ -+ list_for_each_entry(b, &r->blr_layouts, bll_list) { -+ if (seg->offset < b->bll_foff) { -+ n = bll_alloc(seg->offset, -+ MIN(seg->length, b->bll_foff - seg->offset), -+ BLOCK_LAYOUT_NEW, NULL); -+ if (!n) -+ return False; -+ -+ list_add(&n->bll_list, h->prev); -+ dprintk(" new: %Lu:%Lu, added before %Lu:%Lu\n", -+ _2SECTS(n->bll_foff), _2SECTS(n->bll_len), -+ _2SECTS(b->bll_foff), _2SECTS(b->bll_len)); -+ seg->offset += n->bll_len; -+ seg->length -= n->bll_len; -+ if (!seg->length) -+ break; -+ } -+ -+ if ((seg->offset >= b->bll_foff) && -+ (seg->offset < BLL_F_END(b))) { -+ if (layout_conflict(b, seg->iomode, &s) == False) { -+ dprintk(" CONFLICT FOUND: " -+ "%Lu(f):%Lu(l):%Lu(s) state %d, iomode %d\n", -+ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), -+ _2SECTS(b->bll_soff), b->bll_es, -+ seg->iomode); -+ return False; -+ } -+ n = bll_alloc(seg->offset, -+ MIN(seg->length, BLL_F_END(b) - seg->offset), -+ BLOCK_LAYOUT_CACHE, h); -+ dprintk(" CACHE hit: Found %Lu(f):%Lu(l): " -+ "in %Lu(f):%Lu(l):%Lu(s):%d\n", -+ _2SECTS(n->bll_foff), _2SECTS(n->bll_len), -+ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), -+ 
_2SECTS(b->bll_soff), b->bll_es); -+ if (!n) -+ return False; -+ -+ n->bll_soff = b->bll_soff + seg->offset - b->bll_foff; -+ n->bll_vol_id.sbid = 0; -+ n->bll_vol_id.devid = b->bll_vol_id.devid; -+ n->bll_es = s; -+ seg->offset += n->bll_len; -+ seg->length -= n->bll_len; -+ if (!seg->length) -+ break; -+ } -+ } -+ return True; -+} -+ -+static u64 -+bll_alloc_holey(struct list_head *bl_candidates, u64 offset, u64 length, -+ dev_t dev) -+{ -+ pnfs_blocklayout_layout_t *n; -+ -+ n = bll_alloc(offset, length, BLOCK_LAYOUT_NEW, bl_candidates); -+ if (!n) -+ return 0; -+ n->bll_es = PNFS_BLOCK_NONE_DATA; -+ n->bll_vol_id.sbid = 0; -+ n->bll_vol_id.devid = dev; -+ -+ return n->bll_len; -+} -+ -+static void -+extents_setup(struct fiemap_extent_info *fei) -+{ -+ fei->fi_extents_start = NULL; -+} -+ -+/* -+ * extents_count -- Determine the number of extents for a given range. -+ * -+ * No need to call set_fs() here because the function -+ * doesn't use copy_to_user() if it's only counting -+ * the number of extents needed. -+ */ -+static void -+extents_count(struct fiemap_extent_info *fei, struct inode *i, u64 foff, u64 len) -+{ -+ dprintk(" Need fiemap of %Ld:%Ld\n", _2SECTS(foff), _2SECTS(len)); -+ fei->fi_flags = FIEMAP_FLAG_SYNC; -+ fei->fi_extents_max = 0; -+ fei->fi_extents_start = NULL; -+ fei->fi_extents_mapped = 0; -+ i->i_op->fiemap(i, fei, foff, len + (1 << i->i_sb->s_blocksize_bits) - 1); -+} -+ -+/* -+ * extents_get -- Get list of extents for range -+ * -+ * extents_count() must have been called before this routine such that -+ * fi_extents_mapped is known. -+ */ -+static boolean_t -+extents_get(struct fiemap_extent_info *fei, struct inode *i, u64 foff, u64 len) -+{ -+ int m_space, -+ rval; -+ struct fiemap_extent *fe; -+ mm_segment_t old_fs = get_fs(); -+ -+ /* -+ * Now malloc the correct amount of space -+ * needed. It's possible for the file to have changed -+ * between calls which would require more space for -+ * the extents. 
If that occurs the last extent will -+ * not have FIEMAP_EXTENT_LAST set and the error will -+ * be caught in extents_process(). -+ */ -+ m_space = fei->fi_extents_mapped * sizeof (struct fiemap_extent); -+ fe = kmalloc(m_space, GFP_KERNEL); -+ if (!fe) -+ return False; -+ memset(fe, 0, m_space); -+ -+ fei->fi_extents_max = fei->fi_extents_mapped; -+ fei->fi_extents_mapped = 0; -+ fei->fi_extents_start = fe; -+ -+ set_fs(KERNEL_DS); -+ rval = i->i_op->fiemap(i, fei, foff, len + -+ (1 << i->i_sb->s_blocksize_bits) - 1); -+ set_fs(old_fs); -+ -+ if (rval || !fei->fi_extents_mapped) { -+ dprintk(" No extents. Wanted %d, got %d\n", -+ fei->fi_extents_max, fei->fi_extents_mapped); -+ kfree(fe); -+ fei->fi_extents_start = NULL; -+ return False; -+ } else -+ return True; -+} -+ -+/* -+ * extents_process -- runs through the extent returned from the file system and -+ * creates block layout entries. -+ */ -+static boolean_t -+extents_process(struct fiemap_extent_info *fei, struct list_head *bl_candidates, -+ struct nfsd4_layout_seg *seg, dev_t dev, pnfs_blocklayout_layout_t *b) -+{ -+ struct fiemap_extent *fep, -+ *fep_last = NULL; -+ int i; -+ pnfs_blocklayout_layout_t *n; -+ u64 last_end, -+ rval; -+ -+ dprintk("--> %s\n", __func__); -+ for (fep = fei->fi_extents_start, i = 0; i < fei->fi_extents_mapped; -+ i++, fep++) { -+ -+ BUG_ON(!fep->fe_physical); -+ /* -+ * Deal with corner cases of hoel-y files. -+ */ -+ if (fep_last && ((fep_last->fe_logical + fep_last->fe_length) != -+ fep->fe_logical)) { -+ -+ /* -+ * If the last extent doesn't end logically -+ * at the beginning of the current we've got -+ * hole and need to create a pNFS extent. 
-+ */ -+ dprintk(" Got a hole at %Ld:%Ld \n", -+ _2SECTS(fep_last->fe_logical), -+ _2SECTS(fep_last->fe_length)); -+ last_end = fep_last->fe_logical + fep_last->fe_length; -+ rval = bll_alloc_holey(bl_candidates, last_end, -+ fep->fe_logical - last_end, dev); -+ if (!rval) -+ return False; -+ seg->length += rval; -+ } -+ -+ n = bll_alloc(fep->fe_logical, fep->fe_length, -+ BLOCK_LAYOUT_NEW, bl_candidates); -+ if (unlikely(n == NULL)) { -+ dprintk("%s: bll_alloc failed\n", __func__); -+ return False; -+ } -+ -+ n->bll_soff = fep->fe_physical; -+ n->bll_es = seg->iomode == IOMODE_READ ? -+ PNFS_BLOCK_READ_DATA : PNFS_BLOCK_READWRITE_DATA; -+ n->bll_vol_id.sbid = 0; -+ n->bll_vol_id.devid = dev; -+ seg->length += fep->fe_length; -+ print_bll(n, "New extent"); -+ fep_last = fep; -+ } -+ dprintk("<-- %s (i=%d)\n", __func__, i); -+ -+ return True; -+} -+ -+static void -+extents_cleanup(struct fiemap_extent_info *fei) -+{ -+ if (fei->fi_extents_start) { -+ kfree(fei->fi_extents_start); -+ fei->fi_extents_start = NULL; -+ } -+} -+ -+/* -+ * device_slice -- check to see if device is a slice or DM -+ */ -+static boolean_t -+device_slice(dev_t devid) -+{ -+ struct block_device *bd = open_by_devnum(devid, FMODE_READ); -+ boolean_t rval = False; -+ -+ if (bd) { -+ if (bd->bd_disk->minors > 1) -+ rval = True; -+ blkdev_put(bd, FMODE_READ); -+ } -+ return rval; -+} -+ -+/* -+ * device_dm -- check to see if device is a Device Mapper volume. 
-+ * -+ * Returns 1 for DM or 0 if not -+ */ -+static boolean_t -+device_dm(dev_t devid) -+{ -+ boolean_t rval = False; -+ bl_comm_msg_t msg; -+ bl_comm_res_t *res; -+ -+ msg.msg_type = PNFS_UPCALL_MSG_DMCHK; -+ msg.u.msg_dev = devid; -+ if (bl_upcall(bl_comm_global, &msg, &res)) { -+ dprintk("Failed upcall to check on DM status\n"); -+ } else if (res->u.dm_vol) { -+ rval = True; -+ dprintk("Device is DM volume\n"); -+ } else -+ dprintk("Device is not DM volume\n"); -+ kfree(res); -+ -+ return rval; -+} -+ -+static boolean_t -+layout_inode_add(struct inode *i, bl_layout_rec_t **p) -+{ -+ bl_layout_rec_t *r = NULL; -+ -+ if (!i->i_op->fiemap || !i->i_op->fallocate) { -+ printk("pNFS: file system doesn't support required fiemap or" -+ "fallocate methods\n"); -+ return False; -+ } -+ -+ r = kmalloc(sizeof (*r), GFP_KERNEL); -+ if (!r) -+ goto error; -+ -+ r->blr_rdev = i->i_sb->s_dev; -+ r->blr_inode = i; -+ r->blr_orig_size = i->i_size; -+ r->blr_ext_size = 0; -+ r->blr_recalled = 0; -+ INIT_LIST_HEAD(&r->blr_layouts); -+ spin_lock_init(&r->blr_lock); -+ spin_lock(&layout_hashtbl_lock); -+ list_add_tail(&r->blr_hash, &layout_hash); -+ spin_unlock(&layout_hashtbl_lock); -+ *p = r; -+ return True; -+ -+error: -+ if (r) -+ kfree(r); -+ return False; -+} -+ -+static bl_layout_rec_t * -+__layout_inode_find(struct inode *i) -+{ -+ bl_layout_rec_t *r; -+ -+ if (!list_empty(&layout_hash)) { -+ list_for_each_entry(r, &layout_hash, blr_hash) { -+ if ((r->blr_inode->i_ino == i->i_ino) && -+ (r->blr_rdev == i->i_sb->s_dev)) { -+ return r; -+ } -+ } -+ } -+ return NULL; -+} -+ -+static bl_layout_rec_t * -+layout_inode_find(struct inode *i) -+{ -+ bl_layout_rec_t *r; -+ -+ spin_lock(&layout_hashtbl_lock); -+ r = __layout_inode_find(i); -+ spin_unlock(&layout_hashtbl_lock); -+ -+ return r; -+} -+ -+static void -+layout_inode_del(struct inode *i) -+{ -+ bl_layout_rec_t *r; -+ -+ spin_lock(&layout_hashtbl_lock); -+ r = __layout_inode_find(i); -+ if (r) { -+ spin_lock(&r->blr_lock); 
-+ if (list_empty(&r->blr_layouts)) { -+ list_del(&r->blr_hash); -+ spin_unlock(&r->blr_lock); -+ kfree(r); -+ } else { -+ spin_unlock(&r->blr_lock); -+ } -+ } else { -+ dprintk("%s: failed to find inode [0x%x:%lu] in table for delete\n", -+ __func__, i->i_sb->s_dev, i->i_ino); -+ } -+ spin_unlock(&layout_hashtbl_lock); -+} -+ -+/* -+ * map_state2name -- converts state in ascii string. -+ * -+ * Used for debug messages only. -+ */ -+static char * -+map_state2name(enum pnfs_block_extent_state4 s) -+{ -+ switch (s) { -+ case PNFS_BLOCK_READWRITE_DATA: return " RW"; -+ case PNFS_BLOCK_READ_DATA: return " RO"; -+ case PNFS_BLOCK_INVALID_DATA: return "INVALID"; -+ case PNFS_BLOCK_NONE_DATA: return " NONE"; -+ default: -+ BUG(); -+ } -+} -+ -+static pnfs_blocklayout_devinfo_t * -+bld_alloc(struct list_head *volumes, int type) -+{ -+ pnfs_blocklayout_devinfo_t *bld; -+ -+ bld = kmalloc(sizeof (*bld), GFP_KERNEL); -+ if (!bld) -+ return NULL; -+ -+ memset(bld, 0, sizeof (*bld)); -+ bld->bld_type = type; -+ list_add_tail(&bld->bld_list, volumes); -+ -+ return bld; -+} -+ -+static void -+bld_free(pnfs_blocklayout_devinfo_t *bld) -+{ -+ list_del(&bld->bld_list); -+ kfree(bld); -+} -+ -+static void -+print_bll(pnfs_blocklayout_layout_t *b, char *text) -+{ -+ dprintk(" BLL: %s\n", text); -+ dprintk(" foff %Lu, soff %Lu, len %Lu, state %s\n", -+ _2SECTS(b->bll_foff), _2SECTS(b->bll_soff), _2SECTS(b->bll_len), -+ map_state2name(b->bll_es)); -+} -+ -+static inline void -+bll_collapse(bl_layout_rec_t *r, pnfs_blocklayout_layout_t *c) -+{ -+ pnfs_blocklayout_layout_t *n; -+ int dbg_count = 0; -+ u64 endpoint; -+ -+ BUG_ON(c->bll_es == PNFS_BLOCK_NONE_DATA); -+ while (c->bll_list.next != &r->blr_layouts) { -+ n = list_entry(c->bll_list.next, -+ struct pnfs_blocklayout_layout, bll_list); -+ endpoint = BLL_S_END(c); -+ if ((n->bll_soff >= c->bll_soff) && -+ (n->bll_soff < endpoint)) { -+ if (endpoint < BLL_S_END(n)) { -+ /* -+ * The following is possible. 
-+ * -+ * -+ * Existing: +---+ +---+ -+ * New: +-----------------------+ -+ * The client request merge entries together -+ * but didn't require picking up all of the -+ * last entry. So, we still need to delete -+ * the last entry and add the remaining space -+ * to the new entry. -+ */ -+ c->bll_len += BLL_S_END(n) - endpoint; -+ } -+ dbg_count++; -+ list_del(&n->bll_list); -+ kfree(n); -+ } else { -+ break; -+ } -+ } -+ /* ---- Debug only, remove before integration ---- */ -+ if (dbg_count) -+ dprintk(" Collapsed %d cache entries between %Lu(s) and %Lu(s)\n", -+ dbg_count, _2SECTS(c->bll_soff), _2SECTS(BLL_S_END(c))); -+} -+ -+static pnfs_blocklayout_layout_t * -+bll_alloc(u64 offset, u64 len, enum bl_cache_state state, struct list_head *h) -+{ -+ pnfs_blocklayout_layout_t *n = NULL; -+ -+ n = kmalloc(sizeof (*n), GFP_KERNEL); -+ if (n) { -+ memset(n, 0, sizeof (*n)); -+ n->bll_foff = offset; -+ n->bll_len = len; -+ n->bll_cache_state = state; -+ if (h) -+ list_add_tail(&n->bll_list, h); -+ } -+ return n; -+} -+ -+static pnfs_blocklayout_layout_t * -+bll_alloc_dup(pnfs_blocklayout_layout_t *b, enum bl_cache_state c, -+ struct list_head *h) -+{ -+ pnfs_blocklayout_layout_t *n = NULL; -+ -+ n = bll_alloc(b->bll_foff, b->bll_len, c, h); -+ if (n) { -+ n->bll_es = b->bll_es; -+ n->bll_soff = b->bll_soff; -+ n->bll_vol_id.devid = b->bll_vol_id.devid; -+ } -+ return n; -+} -+ -+static inline boolean_t -+layout_conflict(pnfs_blocklayout_layout_t *b, u32 iomode, -+ enum pnfs_block_extent_state4 *s) -+{ -+ /* ---- Normal case ---- */ -+ *s = b->bll_es; -+ -+ switch (b->bll_es) { -+ case PNFS_BLOCK_READWRITE_DATA: -+ if (iomode == IOMODE_READ) -+ *s = PNFS_BLOCK_READ_DATA; -+ /* ---- Any use is permitted. ---- */ -+ break; -+ case PNFS_BLOCK_READ_DATA: -+ /* ---- Committed as read only data. 
---- */ -+ if (iomode == IOMODE_RW) -+ return False; -+ break; -+ case PNFS_BLOCK_INVALID_DATA: -+ /* ---- Blocks have been allocated, but not initialized ---- */ -+ if (iomode == IOMODE_READ) -+ *s = PNFS_BLOCK_NONE_DATA; -+ break; -+ case PNFS_BLOCK_NONE_DATA: -+ /* ---- Hole-y file. No backing store avail. ---- */ -+ if (iomode != IOMODE_READ) -+ return False; -+ break; -+ default: -+ BUG(); -+ } -+ return True; -+} -+ -+#endif /* CONFIG_SPNFS_BLOCK */ -diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c -index c2a4f71..2e025c2 100644 ---- a/fs/nfsd/export.c -+++ b/fs/nfsd/export.c -@@ -17,11 +17,19 @@ - #include - #include - -+#include -+#if defined(CONFIG_SPNFS) -+#include -+#if defined(CONFIG_SPNFS_BLOCK) -+#include -+#endif -+#endif - #include - #include - - #include "nfsd.h" - #include "nfsfh.h" -+#include "pnfsd.h" - - #define NFSDDBG_FACILITY NFSDDBG_EXPORT - -@@ -352,10 +360,84 @@ static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h) - return sunrpc_cache_pipe_upcall(cd, h, svc_export_request); - } - -+#if defined(CONFIG_PNFSD) -+static struct pnfsd_cb_operations pnfsd_cb_op = { -+ .cb_layout_recall = nfsd_layout_recall_cb, -+ .cb_device_notify = nfsd_device_notify_cb, -+ -+ .cb_get_state = nfs4_pnfs_cb_get_state, -+ .cb_change_state = nfs4_pnfs_cb_change_state, -+}; -+ -+#if defined(CONFIG_SPNFS) -+static struct pnfs_export_operations spnfs_export_ops = { -+ .layout_type = spnfs_layout_type, -+ .get_device_info = spnfs_getdeviceinfo, -+ .get_device_iter = spnfs_getdeviceiter, -+ .layout_get = spnfs_layoutget, -+ .layout_return = spnfs_layoutreturn, -+}; -+ -+static struct pnfs_export_operations spnfs_ds_export_ops = { -+ .get_state = spnfs_get_state, -+}; -+ -+#if defined(CONFIG_SPNFS_BLOCK) -+static struct pnfs_export_operations bl_export_ops = { -+ .layout_type = bl_layout_type, -+ .get_device_info = bl_getdeviceinfo, -+ .get_device_iter = bl_getdeviceiter, -+ .layout_get = bl_layoutget, -+ .layout_return = bl_layoutreturn, -+}; 
-+#endif /* CONFIG_SPNFS_BLOCK */ -+#endif /* CONFIG_SPNFS */ -+#endif /* CONFIG_PNFSD */ -+ - static struct svc_export *svc_export_update(struct svc_export *new, - struct svc_export *old); - static struct svc_export *svc_export_lookup(struct svc_export *); - -+static int pnfsd_check_export(struct inode *inode, int *flags) -+{ -+#if defined(CONFIG_PNFSD) -+ -+#if defined(CONFIG_PNFSD_LOCAL_EXPORT) -+ if (!inode->i_sb->s_pnfs_op) -+ pnfsd_lexp_init(inode); -+ return 0; -+#endif /* CONFIG_PNFSD_LOCAL_EXPORT */ -+ -+#if defined(CONFIG_SPNFS) -+#if defined(CONFIG_SPNFS_BLOCK) -+ if (pnfs_block_enabled(inode, *flags)) { -+ dprintk("set pnfs block export structure... \n"); -+ inode->i_sb->s_pnfs_op = &bl_export_ops; -+ } else -+#endif /* CONFIG_SPNFS_BLOCK */ -+ /* -+ * spnfs_enabled() indicates we're an MDS. -+ * XXX Better to check an export time option as well. -+ */ -+ if (spnfs_enabled()) { -+ dprintk("set spnfs export structure...\n"); -+ inode->i_sb->s_pnfs_op = &spnfs_export_ops; -+ } else { -+ dprintk("%s spnfs not in use\n", __func__); -+ -+ /* -+ * get_state is needed if we're a DS using spnfs. -+ * XXX Better to check an export time option instead. 
-+ */ -+ inode->i_sb->s_pnfs_op = &spnfs_ds_export_ops; -+ } -+#endif /* CONFIG_SPNFS */ -+ -+#endif /* CONFIG_PNFSD */ -+ -+ return 0; -+} -+ - static int check_export(struct inode *inode, int *flags, unsigned char *uuid) - { - -@@ -395,8 +477,17 @@ static int check_export(struct inode *inode, int *flags, unsigned char *uuid) - return -EINVAL; - } - -- return 0; -+#if !defined(CONFIG_SPNFS) -+ if (inode->i_sb->s_pnfs_op && -+ (!inode->i_sb->s_pnfs_op->layout_type || -+ !inode->i_sb->s_pnfs_op->get_device_info || -+ !inode->i_sb->s_pnfs_op->layout_get)) { -+ dprintk("exp_export: export of invalid fs pnfs export ops.\n"); -+ return -EINVAL; -+ } -+#endif /* !CONFIG_SPNFS */ - -+ return pnfsd_check_export(inode, flags); - } - - #ifdef CONFIG_NFSD_V4 -@@ -586,6 +677,8 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) - if (exp.ex_uuid == NULL) - err = -ENOMEM; - } -+ } else if (strcmp(buf, "pnfs") == 0) { -+ exp.ex_pnfs = 1; - } else if (strcmp(buf, "secinfo") == 0) - err = secinfo_parse(&mesg, buf, &exp); - else -@@ -660,6 +753,8 @@ static int svc_export_show(struct seq_file *m, - seq_printf(m, "%02x", exp->ex_uuid[i]); - } - } -+ if (exp->ex_pnfs) -+ seq_puts(m, ",pnfs"); - show_secinfo(m, exp); - } - seq_puts(m, ")\n"); -@@ -687,6 +782,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) - new->ex_fslocs.locations = NULL; - new->ex_fslocs.locations_count = 0; - new->ex_fslocs.migrated = 0; -+ new->ex_pnfs = 0; - } - - static void export_update(struct cache_head *cnew, struct cache_head *citem) -@@ -699,6 +795,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem) - new->ex_anon_uid = item->ex_anon_uid; - new->ex_anon_gid = item->ex_anon_gid; - new->ex_fsid = item->ex_fsid; -+ new->ex_pnfs = item->ex_pnfs; - new->ex_uuid = item->ex_uuid; - item->ex_uuid = NULL; - new->ex_pathname = item->ex_pathname; -@@ -1635,8 +1732,17 @@ nfsd_export_init(void) - if (rv) - return rv; - rv = 
cache_register(&svc_expkey_cache); -- if (rv) -+ if (rv) { - cache_unregister(&svc_export_cache); -+ goto out; -+ } -+#if defined(CONFIG_PNFSD) -+ spin_lock(&pnfsd_cb_ctl.lock); -+ pnfsd_cb_ctl.module = THIS_MODULE; -+ pnfsd_cb_ctl.cb_op = &pnfsd_cb_op; -+ spin_unlock(&pnfsd_cb_ctl.lock); -+#endif /* CONFIG_PNFSD */ -+out: - return rv; - - } -@@ -1664,6 +1770,12 @@ nfsd_export_shutdown(void) - - exp_writelock(); - -+#if defined(CONFIG_PNFSD) -+ spin_lock(&pnfsd_cb_ctl.lock); -+ pnfsd_cb_ctl.module = NULL; -+ pnfsd_cb_ctl.cb_op = NULL; -+ spin_unlock(&pnfsd_cb_ctl.lock); -+#endif /* CONFIG_PNFSD */ - cache_unregister(&svc_expkey_cache); - cache_unregister(&svc_export_cache); - svcauth_unix_purge(); -diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c -index 988cbb3..fc8f498 100644 ---- a/fs/nfsd/nfs4callback.c -+++ b/fs/nfsd/nfs4callback.c -@@ -41,7 +41,6 @@ - - #define NFSPROC4_CB_NULL 0 - #define NFSPROC4_CB_COMPOUND 1 --#define NFS4_STATEID_SIZE 16 - - /* Index of predefined Linux callback client operations */ - -@@ -49,11 +48,17 @@ enum { - NFSPROC4_CLNT_CB_NULL = 0, - NFSPROC4_CLNT_CB_RECALL, - NFSPROC4_CLNT_CB_SEQUENCE, -+#if defined(CONFIG_PNFSD) -+ NFSPROC4_CLNT_CB_LAYOUT, -+ NFSPROC4_CLNT_CB_DEVICE, -+#endif - }; - - enum nfs_cb_opnum4 { - OP_CB_RECALL = 4, -+ OP_CB_LAYOUT = 5, - OP_CB_SEQUENCE = 11, -+ OP_CB_DEVICE = 14, - }; - - #define NFS4_MAXTAGLEN 20 -@@ -79,6 +84,19 @@ enum nfs_cb_opnum4 { - #define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \ - cb_sequence_dec_sz + \ - op_dec_sz) -+#define NFS4_enc_cb_layout_sz (cb_compound_enc_hdr_sz + \ -+ cb_sequence_enc_sz + \ -+ 1 + 3 + \ -+ enc_nfs4_fh_sz + 4) -+#define NFS4_dec_cb_layout_sz (cb_compound_dec_hdr_sz + \ -+ cb_sequence_dec_sz + \ -+ op_dec_sz) -+#define NFS4_enc_cb_device_sz (cb_compound_enc_hdr_sz + \ -+ cb_sequence_enc_sz + \ -+ 1 + 6) -+#define NFS4_dec_cb_device_sz (cb_compound_dec_hdr_sz + \ -+ cb_sequence_dec_sz + \ -+ op_dec_sz) - - /* - * Generic encode routines from 
fs/nfs/nfs4xdr.c -@@ -95,6 +113,10 @@ xdr_writemem(__be32 *p, const void *ptr, int nbytes) - } - - #define WRITE32(n) *p++ = htonl(n) -+#define WRITE64(n) do { \ -+ *p++ = htonl((u32)((n) >> 32)); \ -+ *p++ = htonl((u32)(n)); \ -+} while (0) - #define WRITEMEM(ptr,nbytes) do { \ - p = xdr_writemem(p, ptr, nbytes); \ - } while (0) -@@ -268,6 +290,111 @@ encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *args, - hdr->nops++; - } - -+#if defined(CONFIG_PNFSD) -+ -+#include "pnfsd.h" -+ -+static void -+encode_cb_layout(struct xdr_stream *xdr, struct nfs4_layoutrecall *clr, -+ struct nfs4_cb_compound_hdr *hdr) -+{ -+ u32 *p; -+ -+ BUG_ON(hdr->minorversion == 0); -+ -+ RESERVE_SPACE(20); -+ WRITE32(OP_CB_LAYOUT); -+ WRITE32(clr->cb.cbl_seg.layout_type); -+ WRITE32(clr->cb.cbl_seg.iomode); -+ WRITE32(clr->cb.cbl_layoutchanged); -+ WRITE32(clr->cb.cbl_recall_type); -+ if (unlikely(clr->cb.cbl_recall_type == RETURN_FSID)) { -+ struct nfs4_fsid fsid = clr->cb.cbl_fsid; -+ -+ RESERVE_SPACE(16); -+ WRITE64(fsid.major); -+ WRITE64(fsid.minor); -+ dprintk("%s: type %x iomode %d changed %d recall_type %d " -+ "fsid 0x%llx-0x%llx\n", -+ __func__, clr->cb.cbl_seg.layout_type, -+ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged, -+ clr->cb.cbl_recall_type, fsid.major, fsid.minor); -+ } else if (clr->cb.cbl_recall_type == RETURN_FILE) { -+ int len = clr->clr_file->fi_fhlen; -+ stateid_t *cbl_sid = (stateid_t *)&clr->cb.cbl_sid; -+ -+ RESERVE_SPACE(20 + len); -+ WRITE32(len); -+ WRITEMEM(clr->clr_file->fi_fhval, len); -+ WRITE64(clr->cb.cbl_seg.offset); -+ WRITE64(clr->cb.cbl_seg.length); -+ encode_stateid(xdr, cbl_sid); -+ dprintk("%s: type %x iomode %d changed %d recall_type %d " -+ "offset %lld length %lld stateid " STATEID_FMT "\n", -+ __func__, clr->cb.cbl_seg.layout_type, -+ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged, -+ clr->cb.cbl_recall_type, -+ clr->cb.cbl_seg.offset, clr->cb.cbl_seg.length, -+ STATEID_VAL(cbl_sid)); -+ } else { -+ dprintk("%s: type 
%x iomode %d changed %d recall_type %d\n", -+ __func__, clr->cb.cbl_seg.layout_type, -+ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged, -+ clr->cb.cbl_recall_type); -+ } -+ hdr->nops++; -+} -+ -+static void -+encode_cb_device(struct xdr_stream *xdr, struct nfs4_notify_device *nd, -+ struct nfs4_cb_compound_hdr *hdr) -+{ -+ u32 *p; -+ int i; -+ int len = nd->nd_list->cbd_len; -+ struct nfsd4_pnfs_cb_dev_item *cbd = nd->nd_list->cbd_list; -+ -+ dprintk("NFSD %s: --> num %d\n", __func__, len); -+ -+ BUG_ON(hdr->minorversion == 0); -+ -+ RESERVE_SPACE(8); -+ WRITE32(OP_CB_DEVICE); -+ -+ /* notify4 cnda_changes<>; */ -+ WRITE32(len); -+ for (i = 0; i < len; i++) { -+ dprintk("%s: nt %d lt %d devid x%llx-x%llx im %d i %d\n", -+ __func__, cbd[i].cbd_notify_type, -+ cbd[i].cbd_layout_type, -+ cbd[i].cbd_devid.sbid, -+ cbd[i].cbd_devid.devid, -+ cbd[i].cbd_immediate, i); -+ -+ BUG_ON(cbd[i].cbd_notify_type != NOTIFY_DEVICEID4_CHANGE && -+ cbd[i].cbd_notify_type != NOTIFY_DEVICEID4_DELETE); -+ RESERVE_SPACE(32); -+ /* bitmap4 notify_mask; */ -+ WRITE32(1); -+ WRITE32(cbd[i].cbd_notify_type); -+ /* opaque notify_vals<>; */ -+ if (cbd[i].cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) -+ WRITE32(24); -+ else -+ WRITE32(20); -+ WRITE32(cbd[i].cbd_layout_type); -+ WRITE64(cbd[i].cbd_devid.sbid); -+ WRITE64(cbd[i].cbd_devid.devid); -+ -+ if (cbd[i].cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) { -+ RESERVE_SPACE(4); -+ WRITE32(cbd[i].cbd_immediate); -+ } -+ } -+ hdr->nops++; -+} -+#endif /* CONFIG_PNFSD */ -+ - static int - nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p) - { -@@ -297,6 +424,45 @@ nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, - return 0; - } - -+#if defined(CONFIG_PNFSD) -+static int -+nfs4_xdr_enc_cb_layout(struct rpc_rqst *req, u32 *p, -+ struct nfs4_rpc_args *rpc_args) -+{ -+ struct xdr_stream xdr; -+ struct nfs4_layoutrecall *args = rpc_args->args_op; -+ struct nfs4_cb_compound_hdr hdr = { -+ .ident = 0, -+ .minorversion = 
rpc_args->args_seq.cbs_minorversion, -+ }; -+ -+ xdr_init_encode(&xdr, &req->rq_snd_buf, p); -+ encode_cb_compound_hdr(&xdr, &hdr); -+ encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr); -+ encode_cb_layout(&xdr, args, &hdr); -+ encode_cb_nops(&hdr); -+ return 0; -+} -+ -+static int -+nfs4_xdr_enc_cb_device(struct rpc_rqst *req, u32 *p, -+ struct nfs4_rpc_args *rpc_args) -+{ -+ struct xdr_stream xdr; -+ struct nfs4_notify_device *args = rpc_args->args_op; -+ struct nfs4_cb_compound_hdr hdr = { -+ .ident = 0, -+ .minorversion = rpc_args->args_seq.cbs_minorversion, -+ }; -+ -+ xdr_init_encode(&xdr, &req->rq_snd_buf, p); -+ encode_cb_compound_hdr(&xdr, &hdr); -+ encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr); -+ encode_cb_device(&xdr, args, &hdr); -+ encode_cb_nops(&hdr); -+ return 0; -+} -+#endif /* CONFIG_PNFSD */ - - static int - decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){ -@@ -413,6 +579,48 @@ out: - return status; - } - -+#if defined(CONFIG_PNFSD) -+static int -+nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, u32 *p, -+ struct nfsd4_cb_sequence *seq) -+{ -+ struct xdr_stream xdr; -+ struct nfs4_cb_compound_hdr hdr; -+ int status; -+ -+ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -+ status = decode_cb_compound_hdr(&xdr, &hdr); -+ if (status) -+ goto out; -+ status = decode_cb_sequence(&xdr, seq, rqstp); -+ if (status) -+ goto out; -+ status = decode_cb_op_hdr(&xdr, OP_CB_LAYOUT); -+out: -+ return status; -+} -+ -+static int -+nfs4_xdr_dec_cb_device(struct rpc_rqst *rqstp, u32 *p, -+ struct nfsd4_cb_sequence *seq) -+{ -+ struct xdr_stream xdr; -+ struct nfs4_cb_compound_hdr hdr; -+ int status; -+ -+ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -+ status = decode_cb_compound_hdr(&xdr, &hdr); -+ if (status) -+ goto out; -+ status = decode_cb_sequence(&xdr, seq, rqstp); -+ if (status) -+ goto out; -+ status = decode_cb_op_hdr(&xdr, OP_CB_DEVICE); -+out: -+ return status; -+} -+#endif /* CONFIG_PNFSD */ -+ - /* - * RPC 
procedure tables - */ -@@ -430,6 +638,10 @@ out: - static struct rpc_procinfo nfs4_cb_procedures[] = { - PROC(CB_NULL, NULL, enc_cb_null, dec_cb_null), - PROC(CB_RECALL, COMPOUND, enc_cb_recall, dec_cb_recall), -+#if defined(CONFIG_PNFSD) -+ PROC(CB_LAYOUT, COMPOUND, enc_cb_layout, dec_cb_layout), -+ PROC(CB_DEVICE, COMPOUND, enc_cb_device, dec_cb_device), -+#endif - }; - - static struct rpc_version nfs_cb_version4 = { -@@ -615,10 +827,9 @@ out: - * TODO: cb_sequence should support referring call lists, cachethis, multiple - * slots, and mark callback channel down on communication errors. - */ --static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) -+static void nfsd4_cb_prepare_sequence(struct rpc_task *task, -+ struct nfs4_client *clp) - { -- struct nfs4_delegation *dp = calldata; -- struct nfs4_client *clp = dp->dl_client; - struct nfs4_rpc_args *args = task->tk_msg.rpc_argp; - u32 minorversion = clp->cl_cb_conn.cb_minorversion; - int status = 0; -@@ -638,11 +849,15 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) - rpc_call_start(task); - } - --static void nfsd4_cb_done(struct rpc_task *task, void *calldata) -+static void nfsd4_cb_recall_prepare(struct rpc_task *task, void *calldata) - { - struct nfs4_delegation *dp = calldata; -- struct nfs4_client *clp = dp->dl_client; -+ nfsd4_cb_prepare_sequence(task, dp->dl_client); -+} - -+static void nfsd4_cb_done_sequence(struct rpc_task *task, -+ struct nfs4_client *clp) -+{ - dprintk("%s: minorversion=%d\n", __func__, - clp->cl_cb_conn.cb_minorversion); - -@@ -666,7 +881,7 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) - struct nfs4_client *clp = dp->dl_client; - struct rpc_clnt *current_rpc_client = clp->cl_cb_client; - -- nfsd4_cb_done(task, calldata); -+ nfsd4_cb_done_sequence(task, clp); - - if (current_rpc_client == NULL) { - /* We're shutting down; give up. 
*/ -@@ -713,7 +928,7 @@ static void nfsd4_cb_recall_release(void *calldata) - } - - static const struct rpc_call_ops nfsd4_cb_recall_ops = { -- .rpc_call_prepare = nfsd4_cb_prepare, -+ .rpc_call_prepare = nfsd4_cb_recall_prepare, - .rpc_call_done = nfsd4_cb_recall_done, - .rpc_release = nfsd4_cb_recall_release, - }; -@@ -788,3 +1003,173 @@ void nfsd4_cb_recall(struct nfs4_delegation *dp) - { - queue_work(callback_wq, &dp->dl_recall.cb_work); - } -+ -+#if defined(CONFIG_PNFSD) -+static void nfsd4_cb_layout_prepare(struct rpc_task *task, void *calldata) -+{ -+ struct nfs4_layoutrecall *clr = calldata; -+ nfsd4_cb_prepare_sequence(task, clr->clr_client); -+} -+ -+static void nfsd4_cb_layout_done(struct rpc_task *task, void *calldata) -+{ -+ struct nfs4_layoutrecall *clr = calldata; -+ struct nfs4_client *clp = clr->clr_client; -+ -+ nfsd4_cb_done_sequence(task, clp); -+ -+ if (!task->tk_status) -+ return; -+ -+ printk("%s: clp %p cb_client %p fp %p failed with status %d\n", -+ __func__, -+ clp, -+ clp->cl_cb_client, -+ clr->clr_file, -+ task->tk_status); -+ -+ switch (task->tk_status) { -+ case -EIO: -+ /* Network partition? 
*/ -+ atomic_set(&clp->cl_cb_set, 0); -+ warn_no_callback_path(clp, task->tk_status); -+ /* FIXME: -+ * The pnfs standard states that we need to only expire -+ * the client after at-least "lease time" .eg lease-time * 2 -+ * when failing to communicate a recall -+ */ -+ break; -+ case -NFS4ERR_DELAY: -+ /* Poll the client until it's done with the layout */ -+ rpc_delay(task, HZ/100); /* 10 mili-seconds */ -+ task->tk_status = 0; -+ rpc_restart_call_prepare(task); -+ break; -+ case -NFS4ERR_NOMATCHING_LAYOUT: -+ task->tk_status = 0; -+ nomatching_layout(clr); -+ } -+} -+ -+static void nfsd4_cb_layout_release(void *calldata) -+{ -+ struct nfs4_layoutrecall *clr = calldata; -+ kfree(clr->clr_args); -+ clr->clr_args = NULL; -+ put_layoutrecall(clr); -+} -+ -+static const struct rpc_call_ops nfsd4_cb_layout_ops = { -+ .rpc_call_prepare = nfsd4_cb_layout_prepare, -+ .rpc_call_done = nfsd4_cb_layout_done, -+ .rpc_release = nfsd4_cb_layout_release, -+}; -+ -+/* -+ * Called with state lock. -+ */ -+int -+nfsd4_cb_layout(struct nfs4_layoutrecall *clr) -+{ -+ struct nfs4_client *clp = clr->clr_client; -+ struct rpc_clnt *clnt = clp->cl_cb_client; -+ struct nfs4_rpc_args *args; -+ struct rpc_message msg = { -+ .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_LAYOUT], -+ .rpc_cred = callback_cred -+ }; -+ int status; -+ -+ args = kzalloc(sizeof(*args), GFP_KERNEL); -+ if (!args) { -+ status = -ENOMEM; -+ goto out; -+ } -+ clr->clr_args = args; -+ args->args_op = clr; -+ msg.rpc_argp = args; -+ status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT, -+ &nfsd4_cb_layout_ops, clr); -+out: -+ if (status) { -+ kfree(args); -+ put_layoutrecall(clr); -+ } -+ dprintk("NFSD: nfsd4_cb_layout: status %d\n", status); -+ return status; -+} -+ -+static void nfsd4_cb_device_prepare(struct rpc_task *task, void *calldata) -+{ -+ struct nfs4_notify_device *cbnd = calldata; -+ nfsd4_cb_prepare_sequence(task, cbnd->nd_client); -+} -+ -+static void nfsd4_cb_device_done(struct rpc_task *task, void 
*calldata) -+{ -+ struct nfs4_notify_device *cbnd = calldata; -+ struct nfs4_client *clp = cbnd->nd_client; -+ -+ nfsd4_cb_done_sequence(task, clp); -+ -+ dprintk("%s: clp %p cb_client %p: status %d\n", -+ __func__, -+ clp, -+ clp->cl_cb_client, -+ task->tk_status); -+ -+ if (task->tk_status == -EIO) { -+ /* Network partition? */ -+ atomic_set(&clp->cl_cb_set, 0); -+ warn_no_callback_path(clp, task->tk_status); -+ } -+} -+ -+static void nfsd4_cb_device_release(void *calldata) -+{ -+ struct nfs4_notify_device *cbnd = calldata; -+ kfree(cbnd->nd_args); -+ cbnd->nd_args = NULL; -+ kfree(cbnd); -+} -+ -+static const struct rpc_call_ops nfsd4_cb_device_ops = { -+ .rpc_call_prepare = nfsd4_cb_device_prepare, -+ .rpc_call_done = nfsd4_cb_device_done, -+ .rpc_release = nfsd4_cb_device_release, -+}; -+ -+/* -+ * Called with state lock. -+ */ -+int -+nfsd4_cb_notify_device(struct nfs4_notify_device *cbnd) -+{ -+ struct nfs4_client *clp = cbnd->nd_client; -+ struct rpc_clnt *clnt = clp->cl_cb_client; -+ struct nfs4_rpc_args *args; -+ struct rpc_message msg = { -+ .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_DEVICE], -+ .rpc_cred = callback_cred -+ }; -+ int status = -EIO; -+ -+ dprintk("%s: clp %p\n", __func__, clp); -+ -+ args = kzalloc(sizeof(*args), GFP_KERNEL); -+ if (!args) { -+ status = -ENOMEM; -+ goto out; -+ } -+ args->args_op = cbnd; -+ msg.rpc_argp = args; -+ -+ status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT, -+ &nfsd4_cb_device_ops, cbnd); -+out: -+ if (status) -+ kfree(args); -+ dprintk("%s: status %d\n", __func__, status); -+ return status; -+} -+#endif /* CONFIG_PNFSD */ -diff --git a/fs/nfsd/nfs4pnfsd.c b/fs/nfsd/nfs4pnfsd.c -new file mode 100644 -index 0000000..8e8bae3 ---- /dev/null -+++ b/fs/nfsd/nfs4pnfsd.c -@@ -0,0 +1,1688 @@ -+/****************************************************************************** -+ * -+ * (c) 2007 Network Appliance, Inc. All Rights Reserved. -+ * (c) 2009 NetApp. All Rights Reserved. 
-+ * -+ * NetApp provides this source code under the GPL v2 License. -+ * The GPL v2 license is available at -+ * http://opensource.org/licenses/gpl-license.php. -+ * -+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ * -+ *****************************************************************************/ -+ -+#include "pnfsd.h" -+ -+#define NFSDDBG_FACILITY NFSDDBG_PROC -+ -+/* Globals */ -+static u32 current_layoutid = 1; -+ -+/* -+ * Currently used for manipulating the layout state. 
-+ */
-+static DEFINE_SPINLOCK(layout_lock);
-+
-+/*
-+ * Assert that layout_lock is held.  Expands to nothing unless the kernel
-+ * is built with CONFIG_SMP or CONFIG_DEBUG_SPINLOCK.
-+ */
-+#if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_SMP)
-+# define BUG_ON_UNLOCKED_LAYOUT() BUG_ON(!spin_is_locked(&layout_lock))
-+#else
-+# define BUG_ON_UNLOCKED_LAYOUT()
-+#endif
-+
-+/*
-+ * Layout state - NFSv4.1 pNFS
-+ */
-+static struct kmem_cache *pnfs_layout_slab;
-+static struct kmem_cache *pnfs_layoutrecall_slab;
-+
-+/* hash table for nfsd4_pnfs_deviceid.sbid */
-+#define SBID_HASH_BITS	8
-+#define SBID_HASH_SIZE	(1 << SBID_HASH_BITS)
-+#define SBID_HASH_MASK	(SBID_HASH_SIZE - 1)
-+
-+/* Maps a super_block to the 64-bit id handed out for it. */
-+struct sbid_tracker {
-+	u64 id;
-+	struct super_block *sb;
-+	struct list_head hash;	/* link in sbid_hashtbl[], under layout_lock */
-+};
-+
-+static u64 current_sbid;
-+static struct list_head sbid_hashtbl[SBID_HASH_SIZE];
-+
-+/* Bucket index for @sb: SBID_HASH_BITS bits of the pointer hash. */
-+static inline unsigned long
-+sbid_hashval(struct super_block *sb)
-+{
-+	return hash_ptr(sb, SBID_HASH_BITS);
-+}
-+
-+/* Allocate an uninitialized tracker; returns NULL on allocation failure. */
-+static inline struct sbid_tracker *
-+alloc_sbid(void)
-+{
-+	return kmalloc(sizeof(struct sbid_tracker), GFP_KERNEL);
-+}
-+
-+/* Unhash @sbid under layout_lock and free it. */
-+static void
-+destroy_sbid(struct sbid_tracker *sbid)
-+{
-+	spin_lock(&layout_lock);
-+	list_del(&sbid->hash);
-+	spin_unlock(&layout_lock);
-+	kfree(sbid);
-+}
-+
-+/*
-+ * Release both pNFS slab caches and every sbid tracker still hashed.
-+ */
-+void
-+nfsd4_free_pnfs_slabs(void)
-+{
-+	int i;
-+	struct sbid_tracker *sbid;
-+
-+	nfsd4_free_slab(&pnfs_layout_slab);
-+	nfsd4_free_slab(&pnfs_layoutrecall_slab);
-+
-+	for (i = 0; i < SBID_HASH_SIZE; i++) {
-+		while (!list_empty(&sbid_hashtbl[i])) {
-+			sbid = list_first_entry(&sbid_hashtbl[i],
-+						struct sbid_tracker,
-+						hash);
-+			destroy_sbid(sbid);
-+		}
-+	}
-+}
-+
-+/*
-+ * Create the layout and layoutrecall slab caches and initialize the sbid
-+ * hash table.
-+ *
-+ * Returns 0 on success or -ENOMEM if either cache cannot be created.
-+ */
-+int
-+nfsd4_init_pnfs_slabs(void)
-+{
-+	int i;
-+
-+	/*
-+	 * Initialize the hash heads before anything can fail:
-+	 * nfsd4_free_pnfs_slabs() walks every bucket unconditionally, and a
-+	 * zero-filled list_head is not an empty list.
-+	 */
-+	for (i = 0; i < SBID_HASH_SIZE; i++)
-+		INIT_LIST_HEAD(&sbid_hashtbl[i]);
-+
-+	pnfs_layout_slab = kmem_cache_create("pnfs_layouts",
-+			sizeof(struct nfs4_layout), 0, 0, NULL);
-+	if (pnfs_layout_slab == NULL)
-+		return -ENOMEM;
-+	pnfs_layoutrecall_slab = kmem_cache_create("pnfs_layoutrecalls",
-+			sizeof(struct nfs4_layoutrecall), 0, 0, NULL);
-+	if (pnfs_layoutrecall_slab == NULL)
-+		return -ENOMEM;
-+
-+	return 0;
-+}
-+
-+/* XXX: Need to implement the notify types and track which
-+ * clients have which devices. */
-+/*
-+ * Bump cl_deviceref on the confirmed client identified by @clid, if any.
-+ * @types is currently unused.  Takes and releases the nfs4 state lock.
-+ */
-+void pnfs_set_device_notify(clientid_t *clid, unsigned int types)
-+{
-+	struct nfs4_client *clp;
-+	dprintk("%s: -->\n", __func__);
-+
-+	nfs4_lock_state();
-+	/* Indicate that client has a device so we can only notify
-+	 * the correct clients */
-+	clp = find_confirmed_client(clid);
-+	if (clp) {
-+		atomic_inc(&clp->cl_deviceref);
-+		dprintk("%s: Incr device count (clnt %p) to %d\n",
-+			__func__, clp, atomic_read(&clp->cl_deviceref));
-+	}
-+	nfs4_unlock_state();
-+}
-+
-+/* Clear notifications for this client
-+ * XXX: Do we need to loop through and clean up all
-+ * krefs when nfsd cleans up the client? */
-+void pnfs_clear_device_notify(struct nfs4_client *clp)
-+{
-+	atomic_dec(&clp->cl_deviceref);
-+	dprintk("%s: Decr device count (clnt %p) to %d\n",
-+		__func__, clp, atomic_read(&clp->cl_deviceref));
-+}
-+
-+/*
-+ * Allocate a layout state for (@clp, @fp), take a reference on @fp and
-+ * link the new state on fp->fi_layout_states.  The layout stateid copies
-+ * si_boot from @stateid, starts at generation 1 and gets the next
-+ * current_layoutid as its si_fileid.
-+ *
-+ * Returns the new state holding one reference, or NULL on allocation
-+ * failure.
-+ */
-+static struct nfs4_layout_state *
-+alloc_init_layout_state(struct nfs4_client *clp, struct nfs4_file *fp,
-+			stateid_t *stateid)
-+{
-+	struct nfs4_layout_state *new;
-+
-+	/* FIXME: use a kmem_cache */
-+	new = kzalloc(sizeof(*new), GFP_KERNEL);
-+	if (!new)
-+		return new;
-+	get_nfs4_file(fp);
-+	INIT_LIST_HEAD(&new->ls_perfile);
-+	INIT_LIST_HEAD(&new->ls_layouts);
-+	kref_init(&new->ls_ref);
-+	new->ls_client = clp;
-+	new->ls_file = fp;
-+	new->ls_stateid.si_boot = stateid->si_boot;
-+	new->ls_stateid.si_stateownerid = 0; /* identifies layout stateid */
-+	new->ls_stateid.si_generation = 1;
-+	spin_lock(&layout_lock);
-+	new->ls_stateid.si_fileid = current_layoutid++;
-+	list_add(&new->ls_perfile, &fp->fi_layout_states);
-+	spin_unlock(&layout_lock);
-+	return new;
-+}
-+
-+/* Take an additional reference on @ls. */
-+static inline void
-+get_layout_state(struct nfs4_layout_state *ls)
-+{
-+	kref_get(&ls->ls_ref);
-+}
-+
-+/*
-+ * Final teardown shared by both kref release callbacks: @ls must have no
-+ * layouts left; frees it and drops its reference on the nfs4_file.
-+ */
-+static void
-+destroy_layout_state_common(struct nfs4_layout_state *ls)
-+{
-+	struct nfs4_file *fp = ls->ls_file;
-+
-+	dprintk("pNFS %s: ls %p fp %p clp %p\n", __func__, ls, fp,
-+		ls->ls_client);
-+	BUG_ON(!list_empty(&ls->ls_layouts));
-+	kfree(ls);
-+	put_nfs4_file(fp);
-+}
-+
-+/* kref release callback for callers that do NOT hold layout_lock. */
-+static void
-+destroy_layout_state(struct kref *kref)
-+{
-+	struct nfs4_layout_state *ls =
-+			container_of(kref, struct nfs4_layout_state, ls_ref);
-+
-+	spin_lock(&layout_lock);
-+	list_del(&ls->ls_perfile);
-+	spin_unlock(&layout_lock);
-+	destroy_layout_state_common(ls);
-+}
-+
-+/* kref release callback for callers that already hold layout_lock. */
-+static void
-+destroy_layout_state_locked(struct kref *kref)
-+{
-+	struct nfs4_layout_state *ls =
-+			container_of(kref, struct nfs4_layout_state, ls_ref);
-+
-+	list_del(&ls->ls_perfile);
-+	destroy_layout_state_common(ls);
-+}
-+
-+/* Drop a reference on @ls; must be called without layout_lock held. */
-+static inline void
-+put_layout_state(struct nfs4_layout_state *ls)
-+{
-+	dprintk("pNFS %s: ls %p ls_ref %d\n", __func__, ls,
-+		atomic_read(&ls->ls_ref.refcount));
-+	kref_put(&ls->ls_ref, destroy_layout_state);
-+}
-+
-+/* Drop a reference on @ls; the caller holds layout_lock. */
-+static inline void
-+put_layout_state_locked(struct nfs4_layout_state *ls)
-+{
-+	dprintk("pNFS %s: ls %p ls_ref %d\n", __func__, ls,
-+		atomic_read(&ls->ls_ref.refcount));
-+	kref_put(&ls->ls_ref, destroy_layout_state_locked);
-+}
-+
-+/*
-+ * Search the fp->fi_layout_state list for a layout state with the clientid.
-+ * If not found, then this is a 'first open/delegation/lock stateid' from
-+ * the client for this file.
-+ * Called under the layout_lock.
-+ */
-+static struct nfs4_layout_state *
-+find_get_layout_state(struct nfs4_client *clp, struct nfs4_file *fp)
-+{
-+	struct nfs4_layout_state *ls;
-+
-+	BUG_ON_UNLOCKED_LAYOUT();
-+	list_for_each_entry(ls, &fp->fi_layout_states, ls_perfile) {
-+		if (ls->ls_client == clp) {
-+			dprintk("pNFS %s: before GET ls %p ls_ref %d\n",
-+				__func__, ls,
-+				atomic_read(&ls->ls_ref.refcount));
-+			get_layout_state(ls);
-+			return ls;
-+		}
-+	}
-+	return NULL;
-+}
-+
-+/*
-+ * Accept @stateid if it is a known open/lock stateid or a delegation
-+ * stateid on @fp's inode; otherwise return nfserr_bad_stateid.
-+ */
-+static __be32
-+verify_stateid(struct nfs4_file *fp, stateid_t *stateid)
-+{
-+	struct nfs4_stateid *local = NULL;
-+	struct nfs4_delegation *temp = NULL;
-+
-+	/* check if open or lock stateid */
-+	local = find_stateid(stateid, RD_STATE);
-+	if (local)
-+		return 0;
-+	temp = find_delegation_stateid(fp->fi_inode, stateid);
-+	if (temp)
-+		return 0;
-+	return nfserr_bad_stateid;
-+}
-+
-+/*
-+ * nfs4_process_layout_stateid ()
-+ *
-+ * We have looked up the nfs4_file corresponding to the current_fh, and
-+ * confirmed the clientid. Pull the few tests from nfs4_preprocess_stateid_op()
-+ * that make sense with a layout stateid.
-+ *
-+ * Called with the state_lock held
-+ * Returns zero and stateid is updated, or error.
-+ *
-+ * Note: the struct nfs4_layout_state pointer is only set by layoutget.
-+ * When @lsp is non-NULL and zero is returned, *lsp holds a reference that
-+ * the caller must drop with put_layout_state().
-+ */
-+static __be32
-+nfs4_process_layout_stateid(struct nfs4_client *clp, struct nfs4_file *fp,
-+			    stateid_t *stateid, struct nfs4_layout_state **lsp)
-+{
-+	struct nfs4_layout_state *ls = NULL;
-+	__be32 status = 0;
-+
-+	dprintk("--> %s clp %p fp %p \n", __func__, clp, fp);
-+
-+	dprintk("%s: operation stateid=" STATEID_FMT "\n", __func__,
-+		STATEID_VAL(stateid));
-+
-+	status = nfs4_check_stateid(stateid);
-+	if (status)
-+		goto out;
-+
-+	/* Is this the first use of this layout ? */
-+	spin_lock(&layout_lock);
-+	ls = find_get_layout_state(clp, fp);
-+	spin_unlock(&layout_lock);
-+	if (!ls) {
-+		/* Only alloc layout state on layoutget (which sets lsp). */
-+		if (!lsp) {
-+			dprintk("%s ERROR: Not layoutget & no layout stateid\n",
-+				__func__);
-+			status = nfserr_bad_stateid;
-+			goto out;
-+		}
-+		dprintk("%s Initial stateid for layout: file %p client %p\n",
-+			__func__, fp, clp);
-+
-+		/* verify input stateid */
-+		status = verify_stateid(fp, stateid);
-+		if (status) {
-+			dprintk("%s ERROR: invalid open/deleg/lock stateid\n",
-+				__func__);
-+			goto out;
-+		}
-+		ls = alloc_init_layout_state(clp, fp, stateid);
-+		if (!ls) {
-+			dprintk("%s pNFS ERROR: no memory for layout state\n",
-+				__func__);
-+			status = nfserr_resource;
-+			goto out;
-+		}
-+	} else {
-+		dprintk("%s Not initial stateid. Layout state %p file %p\n",
-+			__func__, ls, fp);
-+
-+		/* BAD STATEID */
-+		status = nfserr_bad_stateid;
-+		if (memcmp(&ls->ls_stateid.si_opaque, &stateid->si_opaque,
-+			sizeof(stateid_opaque_t)) != 0) {
-+
-+			/* if a LAYOUTGET operation and stateid is a valid
-+			 * open/deleg/lock stateid, accept it as a parallel
-+			 * initial layout stateid
-+			 */
-+			if (lsp && ((verify_stateid(fp, stateid)) == 0)) {
-+				dprintk("%s parallel initial layout state\n",
-+					__func__);
-+				goto verified;
-+			}
-+
-+			dprintk("%s ERROR bad opaque in stateid 1\n", __func__);
-+			goto out_put;
-+		}
-+
-+		/* stateid is a valid layout stateid for this file. */
-+		if (stateid->si_generation > ls->ls_stateid.si_generation) {
-+			dprintk("%s bad stateid 1\n", __func__);
-+			goto out_put;
-+		}
-+	}
-+verified:
-+	status = 0;
-+
-+	/* Return the layout state if requested */
-+	if (lsp) {
-+		get_layout_state(ls);
-+		*lsp = ls;
-+	}
-+	dprintk("%s: layout stateid=" STATEID_FMT "\n", __func__,
-+		STATEID_VAL(&ls->ls_stateid));
-+out_put:
-+	/* drops the lookup reference (or the initial one of a new state) */
-+	dprintk("%s PUT LO STATE:\n", __func__);
-+	put_layout_state(ls);
-+out:
-+	dprintk("<-- %s status %d\n", __func__, htonl(status));
-+
-+	return status;
-+}
-+
-+/* Allocate an uninitialized layout from the slab; NULL on failure. */
-+static inline struct nfs4_layout *
-+alloc_layout(void)
-+{
-+	return kmem_cache_alloc(pnfs_layout_slab, GFP_KERNEL);
-+}
-+
-+/* Return @lp to the layout slab. */
-+static inline void
-+free_layout(struct nfs4_layout *lp)
-+{
-+	kmem_cache_free(pnfs_layout_slab, lp);
-+}
-+
-+/*
-+ * Advance the layout state's stateid and copy the result to @sid.
-+ * Wrapped in do/while(0) so it expands to a single statement and is safe
-+ * as the body of an un-braced if/else.
-+ */
-+#define update_layout_stateid(ls, sid) do { \
-+	update_stateid(&(ls)->ls_stateid); \
-+	dprintk("%s Updated ls_stateid to %d on layoutstate %p\n", \
-+		__func__, (ls)->ls_stateid.si_generation, (ls)); \
-+	memcpy((sid), &(ls)->ls_stateid, sizeof(stateid_t)); \
-+} while (0)
-+
-+/*
-+ * Fill in a freshly allocated layout @lp for segment @seg, taking
-+ * references on @fp and @ls, then (under layout_lock) bump the layout
-+ * stateid into @stateid and queue @lp on the per-state, per-client and
-+ * per-file lists.  @current_fh is currently unused.
-+ */
-+static void
-+init_layout(struct nfs4_layout_state *ls,
-+	    struct nfs4_layout *lp,
-+	    struct nfs4_file *fp,
-+	    struct nfs4_client *clp,
-+	    struct svc_fh *current_fh,
-+	    struct nfsd4_layout_seg *seg,
-+	    stateid_t *stateid)
-+{
-+	dprintk("pNFS %s: ls %p lp %p clp %p fp %p ino %p\n", __func__,
-+		ls, lp, clp, fp, fp->fi_inode);
-+
-+	get_nfs4_file(fp);
-+	lp->lo_client = clp;
-+	lp->lo_file = fp;
-+	get_layout_state(ls);
-+	lp->lo_state = ls;
-+	memcpy(&lp->lo_seg, seg, sizeof(lp->lo_seg));
-+	spin_lock(&layout_lock);
-+	update_layout_stateid(ls, stateid);
-+	list_add_tail(&lp->lo_perstate, &ls->ls_layouts);
-+	list_add_tail(&lp->lo_perclnt, &clp->cl_layouts);
-+	list_add_tail(&lp->lo_perfile, &fp->fi_layouts);
-+	spin_unlock(&layout_lock);
-+	dprintk("pNFS %s end\n", __func__);
-+}
-+
-+/* Unlink @lp from all three lists.  Caller holds layout_lock. */
-+static void
-+dequeue_layout(struct nfs4_layout *lp)
-+{
-+	BUG_ON_UNLOCKED_LAYOUT();
-+	list_del(&lp->lo_perclnt);
-+	list_del(&lp->lo_perfile);
-+	list_del(&lp->lo_perstate);
-+}
-+
-+/*
-+ * Free @lp and drop the layout-state and file references taken by
-+ * init_layout().  Caller holds layout_lock.
-+ */
-+static void
-+destroy_layout(struct nfs4_layout *lp)
-+{
-+	struct nfs4_client *clp;
-+	struct nfs4_file *fp;
-+	struct nfs4_layout_state *ls;
-+
-+	BUG_ON_UNLOCKED_LAYOUT();
-+	clp = lp->lo_client;
-+	fp = lp->lo_file;
-+	ls = lp->lo_state;
-+	dprintk("pNFS %s: lp %p clp %p fp %p ino %p ls_layouts empty %d\n",
-+		__func__, lp, clp, fp, fp->fi_inode,
-+		list_empty(&ls->ls_layouts));
-+
-+	free_layout(lp);
-+	/* release references taken by init_layout */
-+	put_layout_state_locked(ls);
-+	put_nfs4_file(fp);
-+}
-+
-+/*
-+ * Forward a layout return to the file system's layout_return export
-+ * operation, if it provides one.  @flags and @recall_cookie are stored in
-+ * @lrp first.  A NULL @ino (FSID or ALL return) is replaced by the root
-+ * inode of @sb.  The operation's return value is only logged.
-+ */
-+void fs_layout_return(struct super_block *sb, struct inode *ino,
-+		      struct nfsd4_pnfs_layoutreturn *lrp, int flags,
-+		      void *recall_cookie)
-+{
-+	int ret;
-+
-+	if (unlikely(!sb->s_pnfs_op->layout_return))
-+		return;
-+
-+	lrp->lr_flags = flags;
-+	lrp->args.lr_cookie = recall_cookie;
-+
-+	if (!ino) /* FSID or ALL */
-+		ino = sb->s_root->d_inode;
-+
-+	ret = sb->s_pnfs_op->layout_return(ino, &lrp->args);
-+	dprintk("%s: inode %lu iomode=%d offset=0x%llx length=0x%llx "
-+		"cookie = %p flags 0x%x status=%d\n",
-+		__func__, ino->i_ino, lrp->args.lr_seg.iomode,
-+		lrp->args.lr_seg.offset, lrp->args.lr_seg.length,
-+		recall_cookie, flags, ret);
-+}
-+
-+/*
-+ * Allocate and hash a tracker for @sb.  The id is the incremented
-+ * current_sbid shifted left by SBID_HASH_BITS with the bucket index in
-+ * the low bits, so find_sbid_id() can recover the bucket from the id.
-+ * If another tracker for @sb was hashed in the meantime, the new one is
-+ * freed and the existing id is returned.
-+ *
-+ * Returns the id, or 0 if the allocation failed.
-+ */
-+static u64
-+alloc_init_sbid(struct super_block *sb)
-+{
-+	struct sbid_tracker *sbid;
-+	struct sbid_tracker *new = alloc_sbid();
-+	unsigned long hash_idx = sbid_hashval(sb);
-+	u64 id = 0;
-+
-+	if (likely(new)) {
-+		spin_lock(&layout_lock);
-+		id = ++current_sbid;
-+		new->id = (id << SBID_HASH_BITS) | (hash_idx & SBID_HASH_MASK);
-+		id = new->id;
-+		BUG_ON(id == 0);
-+		new->sb = sb;
-+
-+		list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash)
-+			if (sbid->sb == sb) {
-+				kfree(new);
-+				id = sbid->id;
-+				spin_unlock(&layout_lock);
-+				return id;
-+			}
-+		list_add(&new->hash, &sbid_hashtbl[hash_idx]);
-+		spin_unlock(&layout_lock);
-+	}
-+	return id;
-+}
-+
-+/*
-+ * Look up the super_block registered under @id; NULL if none.
-+ * A hit that is not already first in its bucket is moved to the front.
-+ */
-+struct super_block *
-+find_sbid_id(u64 id)
-+{
-+	struct sbid_tracker *sbid;
-+	struct super_block *sb = NULL;
-+	unsigned long hash_idx = id & SBID_HASH_MASK;
-+	int pos = 0;
-+
-+	spin_lock(&layout_lock);
-+	list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) {
-+		pos++;
-+		if (sbid->id != id)
-+			continue;
-+		if (pos > 1)
-+			list_move(&sbid->hash, &sbid_hashtbl[hash_idx]);
-+		sb = sbid->sb;
-+		break;
-+	}
-+	spin_unlock(&layout_lock);
-+	return sb;
-+}
-+
-+/*
-+ * Return the id registered for @sb, creating one if needed.
-+ * Returns 0 only if a new tracker could not be allocated.
-+ */
-+u64
-+find_create_sbid(struct super_block *sb)
-+{
-+	struct sbid_tracker *sbid;
-+	unsigned long hash_idx = sbid_hashval(sb);
-+	int pos = 0;
-+	u64 id = 0;
-+
-+	spin_lock(&layout_lock);
-+	list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) {
-+		pos++;
-+		if (sbid->sb != sb)
-+			continue;
-+		if (pos > 1)
-+			list_move(&sbid->hash, &sbid_hashtbl[hash_idx]);
-+		id = sbid->id;
-+		break;
-+	}
-+	spin_unlock(&layout_lock);
-+
-+	if (!id)
-+		id = alloc_init_sbid(sb);
-+
-+	return id;
-+}
-+
-+/*
-+ * Create a layoutrecall structure
-+ * An optional layoutrecall can be cloned (except for the layoutrecall lists)
-+ *
-+ * The new recall copies *@cbl, takes a reference on @lrfile when it is
-+ * non-NULL and starts with one reference.  Returns NULL on allocation
-+ * failure.
-+ */
-+static struct nfs4_layoutrecall *
-+alloc_init_layoutrecall(struct nfsd4_pnfs_cb_layout *cbl,
-+			struct nfs4_client *clp,
-+			struct nfs4_file *lrfile)
-+{
-+	struct nfs4_layoutrecall *clr;
-+
-+	dprintk("NFSD %s\n", __func__);
-+	clr = kmem_cache_zalloc(pnfs_layoutrecall_slab, GFP_KERNEL);
-+	if (clr == NULL)
-+		return clr;
-+
-+	dprintk("NFSD %s -->\n", __func__);
-+
-+	if (lrfile)
-+		get_nfs4_file(lrfile);
-+	clr->clr_client = clp;
-+	clr->clr_file = lrfile;
-+	clr->cb = *cbl;
-+
-+	kref_init(&clr->clr_ref);
-+	INIT_LIST_HEAD(&clr->clr_perclnt);
-+
-+	dprintk("NFSD %s return %p\n", __func__, clr);
-+	return clr;
-+}
-+
-+/* Take an additional reference on @clr. */
-+static void
-+get_layoutrecall(struct nfs4_layoutrecall *clr)
-+{
-+	dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr,
-+		atomic_read(&clr->clr_ref.refcount));
-+	kref_get(&clr->clr_ref);
-+}
-+
-+/*
-+ * kref release callback: @clr must already be off the per-client list.
-+ * Drops the file reference (if any) and frees the recall.
-+ */
-+static void
-+destroy_layoutrecall(struct kref *kref)
-+{
-+	struct nfs4_layoutrecall *clr =
-+			container_of(kref, struct nfs4_layoutrecall, clr_ref);
-+	dprintk("pNFS %s: clr %p fp %p clp %p\n", __func__, clr,
-+		clr->clr_file, clr->clr_client);
-+	BUG_ON(!list_empty(&clr->clr_perclnt));
-+	if (clr->clr_file)
-+		put_nfs4_file(clr->clr_file);
-+	kmem_cache_free(pnfs_layoutrecall_slab, clr);
-+}
-+
-+/* Drop a reference on @clr; returns the result of kref_put(). */
-+int
-+put_layoutrecall(struct nfs4_layoutrecall *clr)
-+{
-+	dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr,
-+		atomic_read(&clr->clr_ref.refcount));
-+	return kref_put(&clr->clr_ref, destroy_layoutrecall);
-+}
-+
-+/*
-+ * Retire @clr: unlink it from its client and drop a reference on it and
-+ * on its parent, if any.  Caller holds layout_lock.
-+ *
-+ * Returns the recall cookie, or NULL when @clr has a parent whose
-+ * reference count did not drop to zero.
-+ */
-+void *
-+layoutrecall_done(struct nfs4_layoutrecall *clr)
-+{
-+	void *recall_cookie = clr->cb.cbl_cookie;
-+	struct nfs4_layoutrecall *parent = clr->parent;
-+
-+	dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr,
-+		atomic_read(&clr->clr_ref.refcount));
-+	BUG_ON_UNLOCKED_LAYOUT();
-+	list_del_init(&clr->clr_perclnt);
-+	put_layoutrecall(clr);
-+
-+	if (parent && !put_layoutrecall(parent))
-+		recall_cookie = NULL;
-+
-+	return recall_cookie;
-+}
-+
-+/*
-+ * Free every pnfs_ds_dev_entry queued on stp->st_pnfs_ds_id.
-+ */
-+void
-+release_pnfs_ds_dev_list(struct nfs4_stateid *stp)
-+{
-+	struct pnfs_ds_dev_entry *ddp;
-+
-+	while (!list_empty(&stp->st_pnfs_ds_id)) {
-+		ddp = list_entry(stp->st_pnfs_ds_id.next,
-+				 struct pnfs_ds_dev_entry, dd_dev_entry);
-+		list_del(&ddp->dd_dev_entry);
-+		kfree(ddp);
-+	}
-+}
-+
-+/*
-+ * Record data-server id @dsid on @stp's st_pnfs_ds_id list.
-+ * Returns 0, or -ENOMEM if the entry cannot be allocated.
-+ */
-+static int
-+nfs4_add_pnfs_ds_dev(struct nfs4_stateid *stp, u32 dsid)
-+{
-+	struct pnfs_ds_dev_entry *ddp;
-+
-+	ddp = kmalloc(sizeof(*ddp), GFP_KERNEL);
-+	if (!ddp)
-+		return -ENOMEM;
-+
-+	INIT_LIST_HEAD(&ddp->dd_dev_entry);
-+	list_add(&ddp->dd_dev_entry, &stp->st_pnfs_ds_id);
-+	ddp->dd_dsid = dsid;
-+	return 0;
-+}
-+
-+/*
-+ * are two octet ranges overlapping?
-+ *   start1                last1
-+ *   |---------------------|
-+ *              start2                last2
-+ *              |---------------------|
-+ *
-+ * Both ends are inclusive (last byte offsets), so ranges that merely
-+ * touch in one byte count as overlapping.  Returns 1 or 0.
-+ */
-+static inline int
-+lo_seg_overlapping(struct nfsd4_layout_seg *l1, struct nfsd4_layout_seg *l2)
-+{
-+	u64 first_a = l1->offset;
-+	u64 last_a = last_byte_offset(first_a, l1->length);
-+	u64 first_b = l2->offset;
-+	u64 last_b = last_byte_offset(first_b, l2->length);
-+	int overlap = 0;
-+
-+	/* last_a == first_b is still a single byte of overlap */
-+	if (last_b >= first_a && last_a >= first_b)
-+		overlap = 1;
-+
-+	dprintk("%s: l1 %llu:%lld l2 %llu:%lld ret=%d\n", __func__,
-+		l1->offset, l1->length, l2->offset, l2->length, overlap);
-+	return overlap;
-+}
-+
-+/* Does @fsid carry the given major number? */
-+static inline int
-+same_fsid_major(struct nfs4_fsid *fsid, u64 major)
-+{
-+	return major == fsid->major;
-+}
-+
-+/* Does @fsid match the export fsid of the current filehandle? */
-+static inline int
-+same_fsid(struct nfs4_fsid *fsid, struct svc_fh *current_fh)
-+{
-+	u64 export_fsid = current_fh->fh_export->ex_fsid;
-+
-+	return same_fsid_major(fsid, export_fsid);
-+}
-+
-+/*
-+ * Check @clp's outstanding layout recalls for one that conflicts with a
-+ * layoutget for @seg on @current_fh.  Only recalls of the same layout
-+ * type are considered: RETURN_ALL always conflicts, RETURN_FSID conflicts
-+ * when the fsid matches, RETURN_FILE conflicts when the clientid matches
-+ * and the byte ranges overlap.
-+ *
-+ * Takes layout_lock.  Returns 1 if a conflicting recall exists, else 0.
-+ */
-+static int
-+is_layout_recalled(struct nfs4_client *clp,
-+		   struct svc_fh *current_fh,
-+		   struct nfsd4_layout_seg *seg)
-+{
-+	struct nfs4_layoutrecall *clr;
-+	int conflict = 0;
-+
-+	spin_lock(&layout_lock);
-+	list_for_each_entry(clr, &clp->cl_layoutrecalls, clr_perclnt) {
-+		if (clr->cb.cbl_seg.layout_type != seg->layout_type)
-+			continue;
-+
-+		switch (clr->cb.cbl_recall_type) {
-+		case RETURN_ALL:
-+			conflict = 1;
-+			break;
-+		case RETURN_FSID:
-+			conflict = same_fsid(&clr->cb.cbl_fsid, current_fh);
-+			break;
-+		default:
-+			/* anything else has to be a per-file recall */
-+			BUG_ON(clr->cb.cbl_recall_type != RETURN_FILE);
-+			conflict = clr->cb.cbl_seg.clientid == seg->clientid &&
-+				   lo_seg_overlapping(&clr->cb.cbl_seg, seg);
-+			break;
-+		}
-+
-+		if (conflict)
-+			break;
-+	}
-+	spin_unlock(&layout_lock);
-+
-+	return conflict;
-+}
-+
-+/*
-+ * are two octet ranges overlapping or adjacent?
-+ */
-+static inline int
-+lo_seg_mergeable(struct nfsd4_layout_seg *l1, struct nfsd4_layout_seg *l2)
-+{
-+	u64 start1 = l1->offset;
-+	u64 end1 = end_offset(start1, l1->length);
-+	u64 start2 = l2->offset;
-+	u64 end2 = end_offset(start2, l2->length);
-+
-+	/* if end1 == start2 the ranges are adjacent */
-+	return (end2 >= start1) && (end1 >= start2);
-+}
-+
-+/*
-+ * Grow segment @lo in place so that it also covers @lg.
-+ * Nothing is changed when @lo already covers @lg.
-+ */
-+static void
-+extend_layout(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *lg)
-+{
-+	u64 lo_start = lo->offset;
-+	u64 lo_end = end_offset(lo_start, lo->length);
-+	u64 lg_start = lg->offset;
-+	u64 lg_end = end_offset(lg_start, lg->length);
-+
-+	/* lo already covers lg? */
-+	if (lo_start <= lg_start && lg_end <= lo_end)
-+		return;
-+
-+	/* extend start offset */
-+	if (lo_start > lg_start)
-+		lo_start = lg_start;
-+
-+	/* extend end offset */
-+	if (lo_end < lg_end)
-+		lo_end = lg_end;
-+
-+	lo->offset = lo_start;
-+	/* an end of NFS4_MAX_UINT64 is stored as the length itself */
-+	lo->length = (lo_end == NFS4_MAX_UINT64) ?
-+		      lo_end : lo_end - lo_start;
-+}
-+
-+/*
-+ * Try to fold @seg into a layout already on @fp's list that has the same
-+ * layout type, clientid and iomode and is overlapping or adjacent.
-+ * (@clp is not consulted; the match uses the clientid stored in @seg.)
-+ *
-+ * Returns the layout that absorbed @seg, or NULL if none matched.
-+ */
-+static struct nfs4_layout *
-+merge_layout(struct nfs4_file *fp,
-+	     struct nfs4_client *clp,
-+	     struct nfsd4_layout_seg *seg)
-+{
-+	struct nfs4_layout *lp;
-+	struct nfs4_layout *merged = NULL;
-+
-+	spin_lock(&layout_lock);
-+	list_for_each_entry (lp, &fp->fi_layouts, lo_perfile) {
-+		if (lp->lo_seg.layout_type == seg->layout_type &&
-+		    lp->lo_seg.clientid == seg->clientid &&
-+		    lp->lo_seg.iomode == seg->iomode &&
-+		    lo_seg_mergeable(&lp->lo_seg, seg)) {
-+			extend_layout(&lp->lo_seg, seg);
-+			merged = lp;
-+			break;
-+		}
-+	}
-+	spin_unlock(&layout_lock);
-+
-+	/*
-+	 * Do not return the loop cursor: after a full traversal it points
-+	 * at the container of the list head and is never NULL.
-+	 */
-+	return merged;
-+}
-+
-+/*
-+ * Service a LAYOUTGET: validate the client and layout stateid, ask the
-+ * file system for the layout (which it encodes into @xdr), then either
-+ * merge the granted segment into an existing layout or record a new one.
-+ *
-+ * On success lgp->lg_seg and lgp->lg_roc hold what the file system
-+ * granted; when a new layout is recorded lgp->lg_sid is updated too.
-+ * Returns an NFSv4 status in network byte order.
-+ */
-+__be32
-+nfs4_pnfs_get_layout(struct nfsd4_pnfs_layoutget *lgp,
-+		     struct exp_xdr_stream *xdr)
-+{
-+	u32 status;
-+	__be32 nfserr;
-+	struct inode *ino = lgp->lg_fhp->fh_dentry->d_inode;
-+	struct super_block *sb = ino->i_sb;
-+	int can_merge;
-+	struct nfs4_file *fp;
-+	struct nfs4_client *clp;
-+	struct nfs4_layout *lp = NULL;
-+	struct nfs4_layout_state *ls = NULL;
-+	struct nfsd4_pnfs_layoutget_arg args = {
-+		.lg_minlength = lgp->lg_minlength,
-+		.lg_fh = &lgp->lg_fhp->fh_handle,
-+	};
-+	struct nfsd4_pnfs_layoutget_res res = {
-+		.lg_seg = lgp->lg_seg,
-+	};
-+
-+	dprintk("NFSD: %s Begin\n", __func__);
-+
-+	/* state lock not taken yet, so bail out straight to "out" */
-+	args.lg_sbid = find_create_sbid(sb);
-+	if (!args.lg_sbid) {
-+		nfserr = nfserr_layouttrylater;
-+		goto out;
-+	}
-+
-+	can_merge = sb->s_pnfs_op->can_merge_layouts != NULL &&
-+		    sb->s_pnfs_op->can_merge_layouts(lgp->lg_seg.layout_type);
-+
-+	nfs4_lock_state();
-+	fp = find_alloc_file(ino, lgp->lg_fhp);
-+	clp = find_confirmed_client((clientid_t *)&lgp->lg_seg.clientid);
-+	dprintk("pNFS %s: fp %p clp %p \n", __func__, fp, clp);
-+	if (!fp || !clp) {
-+		nfserr = nfserr_inval;
-+		goto out_unlock;
-+	}
-+
-+	/* Check decoded layout stateid */
-+	nfserr = nfs4_process_layout_stateid(clp, fp, &lgp->lg_sid, &ls);
-+	if (nfserr)
-+		goto out_unlock;
-+
-+	if (is_layout_recalled(clp, lgp->lg_fhp, &lgp->lg_seg)) {
-+		nfserr = nfserr_recallconflict;
-+		/*
-+		 * The state lock and the ls/fp references are held here,
-+		 * so leave through out_unlock rather than out.
-+		 */
-+		goto out_unlock;
-+	}
-+
-+	/* pre-alloc layout in case we can't merge after we call
-+	 * the file system
-+	 */
-+	lp = alloc_layout();
-+	if (!lp) {
-+		nfserr = nfserr_layouttrylater;
-+		goto out_unlock;
-+	}
-+
-+	dprintk("pNFS %s: pre-export type 0x%x maxcount %Zd "
-+		"iomode %u offset %llu length %llu\n",
-+		__func__, lgp->lg_seg.layout_type,
-+		exp_xdr_qbytes(xdr->end - xdr->p),
-+		lgp->lg_seg.iomode, lgp->lg_seg.offset, lgp->lg_seg.length);
-+
-+	/* FIXME: need to eliminate the use of the state lock */
-+	nfs4_unlock_state();
-+	status = sb->s_pnfs_op->layout_get(ino, xdr, &args, &res);
-+	nfs4_lock_state();
-+
-+	dprintk("pNFS %s: post-export status %u "
-+		"iomode %u offset %llu length %llu\n",
-+		__func__, status, res.lg_seg.iomode,
-+		res.lg_seg.offset, res.lg_seg.length);
-+
-+	/*
-+	 * The allowable error codes for the layout_get pNFS export
-+	 * operations vector function (from the file system) can be
-+	 * expanded as needed to include other errors defined for
-+	 * the RFC 5661 LAYOUTGET operation.
-+	 */
-+	switch (status) {
-+	case 0:
-+		nfserr = NFS4_OK;
-+		break;
-+	case NFS4ERR_ACCESS:
-+	case NFS4ERR_BADIOMODE:
-+		/* No support for LAYOUTIOMODE4_RW layouts */
-+	case NFS4ERR_BADLAYOUT:
-+		/* No layout matching loga_minlength rules */
-+	case NFS4ERR_INVAL:
-+	case NFS4ERR_IO:
-+	case NFS4ERR_LAYOUTTRYLATER:
-+	case NFS4ERR_LAYOUTUNAVAILABLE:
-+	case NFS4ERR_LOCKED:
-+	case NFS4ERR_NOSPC:
-+	case NFS4ERR_RECALLCONFLICT:
-+	case NFS4ERR_SERVERFAULT:
-+	case NFS4ERR_TOOSMALL:
-+		/* Requested layout too big for loga_maxcount */
-+	case NFS4ERR_WRONG_TYPE:
-+		/* Not a regular file */
-+		nfserr = cpu_to_be32(status);
-+		goto out_freelayout;
-+	default:
-+		BUG();
-+		nfserr = nfserr_serverfault;
-+		/* never record a layout for an unexpected status */
-+		goto out_freelayout;
-+	}
-+
-+	lgp->lg_seg = res.lg_seg;
-+	lgp->lg_roc = res.lg_return_on_close;
-+
-+	/* SUCCESS!
-+	 * Can the new layout be merged into an existing one?
-+	 * If so, free unused layout struct
-+	 */
-+	if (can_merge && merge_layout(fp, clp, &res.lg_seg))
-+		goto out_freelayout;
-+
-+	/* Can't merge, so let's initialize this new layout */
-+	init_layout(ls, lp, fp, clp, lgp->lg_fhp, &res.lg_seg, &lgp->lg_sid);
-+out_unlock:
-+	if (ls)
-+		put_layout_state(ls);
-+	if (fp)
-+		put_nfs4_file(fp);
-+	nfs4_unlock_state();
-+out:
-+	dprintk("pNFS %s: lp %p exit nfserr %u\n", __func__, lp,
-+		be32_to_cpu(nfserr));
-+	return nfserr;
-+out_freelayout:
-+	free_layout(lp);
-+	goto out_unlock;
-+}
-+
-+static void
-+trim_layout(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *lr)
-+{
-+	u64 lo_start = lo->offset;
-+	u64 lo_end = end_offset(lo_start, lo->length);
-+	u64 lr_start = lr->offset;
-+	u64 lr_end = end_offset(lr_start, lr->length);
-+
-+	dprintk("%s:Begin lo %llu:%lld lr %llu:%lld\n", __func__,
-+		lo->offset, lo->length, lr->offset, lr->length);
-+
-+	/* lr fully covers lo? */
-+	if (lr_start <= lo_start && lo_end <= lr_end) {
-+		lo->length = 0;
-+		goto out;
-+	}
-+
-+	/*
-+	 * split not supported yet. retain layout segment.
-+	 * remains must be returned by the client
-+	 * on the final layout return.
-+	 */
-+	if (lo_start < lr_start && lr_end < lo_end) {
-+		dprintk("%s: split not supported\n", __func__);
-+		goto out;
-+	}
-+
-+	/*
-+	 * NOTE(review): end_offset() is not visible here; if it yields an
-+	 * exclusive end, the -1/+1 adjustments below drop one byte from the
-+	 * retained segment -- confirm against its definition.
-+	 */
-+	if (lo_start < lr_start)
-+		lo_end = lr_start - 1;
-+	else /* lr_end < lo_end */
-+		lo_start = lr_end + 1;
-+
-+	lo->offset = lo_start;
-+	lo->length = (lo_end == NFS4_MAX_UINT64) ? lo_end : lo_end - lo_start;
-+out:
-+	dprintk("%s:End lo %llu:%lld\n", __func__, lo->offset, lo->length);
-+}
-+
-+/*
-+ * Apply a RETURN_FILE layoutreturn to @clp's layouts on @fp.
-+ *
-+ * Each layout of the same client and layout type whose iomode matches
-+ * (or the return uses IOMODE_ANY) and whose range overlaps the returned
-+ * segment is trimmed; a layout trimmed to zero length is destroyed and
-+ * lrp->lrs_present is cleared.  If anything matched and lrs_present is
-+ * still set, the layout stateid of @ls is advanced into lrp->lr_sid.
-+ *
-+ * Takes layout_lock.  Returns the number of matching layouts.
-+ */
-+static int
-+pnfs_return_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp,
-+			 struct nfsd4_pnfs_layoutreturn *lrp,
-+			 struct nfs4_layout_state *ls)
-+{
-+	struct nfsd4_layout_seg *ret_seg = &lrp->args.lr_seg;
-+	struct nfs4_layout *cur, *tmp;
-+	int matched = 0;
-+
-+	dprintk("%s: clp %p fp %p\n", __func__, clp, fp);
-+	spin_lock(&layout_lock);
-+	list_for_each_entry_safe(cur, tmp, &fp->fi_layouts, lo_perfile) {
-+		dprintk("%s: lp %p client %p,%p lo_type %x,%x iomode %d,%d\n",
-+			__func__, cur,
-+			cur->lo_client, clp,
-+			cur->lo_seg.layout_type, ret_seg->layout_type,
-+			cur->lo_seg.iomode, ret_seg->iomode);
-+
-+		/* other client's layout */
-+		if (cur->lo_client != clp)
-+			continue;
-+		/* different layout type */
-+		if (cur->lo_seg.layout_type != ret_seg->layout_type)
-+			continue;
-+		/* iomode mismatch and not a wildcard return */
-+		if (cur->lo_seg.iomode != ret_seg->iomode &&
-+		    ret_seg->iomode != IOMODE_ANY)
-+			continue;
-+		/* disjoint byte ranges */
-+		if (!lo_seg_overlapping(&cur->lo_seg, ret_seg))
-+			continue;
-+
-+		matched++;
-+		trim_layout(&cur->lo_seg, ret_seg);
-+		if (cur->lo_seg.length)
-+			continue;
-+
-+		/* nothing left of this layout */
-+		lrp->lrs_present = 0;
-+		dequeue_layout(cur);
-+		destroy_layout(cur);
-+	}
-+	if (ls && matched && lrp->lrs_present)
-+		update_layout_stateid(ls, &lrp->lr_sid);
-+	spin_unlock(&layout_lock);
-+
-+	return matched;
-+}
-+
-+/*
-+ * Apply a RETURN_FSID or RETURN_ALL layoutreturn: destroy every layout of
-+ * @clp with the returned layout type and a matching iomode (or
-+ * IOMODE_ANY); for RETURN_FSID only layouts whose file is on @ex_fsid.
-+ *
-+ * Takes layout_lock.  Returns the number of layouts destroyed.
-+ */
-+static int
-+pnfs_return_client_layouts(struct nfs4_client *clp,
-+			   struct nfsd4_pnfs_layoutreturn *lrp, u64 ex_fsid)
-+{
-+	struct nfsd4_layout_seg *ret_seg = &lrp->args.lr_seg;
-+	struct nfs4_layout *cur, *tmp;
-+	int destroyed = 0;
-+
-+	spin_lock(&layout_lock);
-+	list_for_each_entry_safe(cur, tmp, &clp->cl_layouts, lo_perclnt) {
-+		if (ret_seg->layout_type != cur->lo_seg.layout_type)
-+			continue;
-+		if (ret_seg->iomode != cur->lo_seg.iomode &&
-+		    ret_seg->iomode != IOMODE_ANY)
-+			continue;
-+
-+		if (lrp->args.lr_return_type == RETURN_FSID &&
-+		    !same_fsid_major(&cur->lo_file->fi_fsid, ex_fsid))
-+			continue;
-+
-+		destroyed++;
-+		dequeue_layout(cur);
-+		destroy_layout(cur);
-+	}
-+	spin_unlock(&layout_lock);
-+
-+	return destroyed;
-+}
-+
-+/*
-+ * Does the layoutreturn @lrp exactly answer the recall @clr?
-+ * Requires identical iomode and return type, and then: the same file,
-+ * offset and length for RETURN_FILE; the same fsid for RETURN_FSID;
-+ * nothing further for RETURN_ALL.  Returns 1 or 0.
-+ */
-+static int
-+recall_return_perfect_match(struct nfs4_layoutrecall *clr,
-+			    struct nfsd4_pnfs_layoutreturn *lrp,
-+			    struct nfs4_file *fp,
-+			    struct svc_fh *current_fh)
-+{
-+	if (clr->cb.cbl_seg.iomode != lrp->args.lr_seg.iomode)
-+		return 0;
-+	if (clr->cb.cbl_recall_type != lrp->args.lr_return_type)
-+		return 0;
-+
-+	switch (clr->cb.cbl_recall_type) {
-+	case RETURN_FILE:
-+		return clr->clr_file == fp &&
-+		       clr->cb.cbl_seg.offset == lrp->args.lr_seg.offset &&
-+		       clr->cb.cbl_seg.length == lrp->args.lr_seg.length;
-+	case RETURN_FSID:
-+		return same_fsid(&clr->cb.cbl_fsid, current_fh);
-+	case RETURN_ALL:
-+		return 1;
-+	default:
-+		return 0;
-+	}
-+}
-+
-+static int
-+recall_return_partial_match(struct nfs4_layoutrecall *clr,
-+			    struct nfsd4_pnfs_layoutreturn *lrp,
-+			    struct nfs4_file *fp,
-+			    struct svc_fh *current_fh)
-+{
-+	/* iomode matching? */
-+	if (clr->cb.cbl_seg.iomode != lrp->args.lr_seg.iomode &&
-+	    clr->cb.cbl_seg.iomode != IOMODE_ANY &&
-+	    lrp->args.lr_seg.iomode != IOMODE_ANY)
-+		return 0;
-+
-+	if (clr->cb.cbl_recall_type == RETURN_ALL ||
-+	    lrp->args.lr_return_type == RETURN_ALL)
-+		return 1;
-+
-+	/* fsid matches? */
-+	if (clr->cb.cbl_recall_type == RETURN_FSID ||
-+	    lrp->args.lr_return_type == RETURN_FSID)
-+		return same_fsid(&clr->cb.cbl_fsid, current_fh);
-+
-+	/* file matches, range overlapping? 
*/ -+ return clr->clr_file == fp && -+ lo_seg_overlapping(&clr->cb.cbl_seg, &lrp->args.lr_seg); -+} -+ -+int nfs4_pnfs_return_layout(struct super_block *sb, struct svc_fh *current_fh, -+ struct nfsd4_pnfs_layoutreturn *lrp) -+{ -+ int status = 0; -+ int layouts_found = 0; -+ struct inode *ino = current_fh->fh_dentry->d_inode; -+ struct nfs4_file *fp = NULL; -+ struct nfs4_client *clp; -+ struct nfs4_layout_state *ls = NULL; -+ struct nfs4_layoutrecall *clr, *nextclr; -+ u64 ex_fsid = current_fh->fh_export->ex_fsid; -+ void *recall_cookie = NULL; -+ -+ dprintk("NFSD: %s\n", __func__); -+ -+ nfs4_lock_state(); -+ clp = find_confirmed_client((clientid_t *)&lrp->args.lr_seg.clientid); -+ if (!clp) -+ goto out; -+ -+ if (lrp->args.lr_return_type == RETURN_FILE) { -+ fp = find_file(ino); -+ if (!fp) { -+ printk(KERN_ERR "%s: RETURN_FILE: no nfs4_file for " -+ "ino %p:%lu\n", -+ __func__, ino, ino ? ino->i_ino : 0L); -+ goto out; -+ } -+ -+ /* Check the stateid */ -+ dprintk("%s PROCESS LO_STATEID inode %p\n", __func__, ino); -+ status = nfs4_process_layout_stateid(clp, fp, &lrp->lr_sid, &ls); -+ if (status) -+ goto out_put_file; -+ -+ /* update layouts */ -+ layouts_found = pnfs_return_file_layouts(clp, fp, lrp, ls); -+ /* optimize for the all-empty case */ -+ if (list_empty(&fp->fi_layouts)) -+ recall_cookie = PNFS_LAST_LAYOUT_NO_RECALLS; -+ } else { -+ layouts_found = pnfs_return_client_layouts(clp, lrp, ex_fsid); -+ } -+ -+ dprintk("pNFS %s: clp %p fp %p layout_type 0x%x iomode %d " -+ "return_type %d fsid 0x%llx offset %llu length %llu: " -+ "layouts_found %d\n", -+ __func__, clp, fp, lrp->args.lr_seg.layout_type, -+ lrp->args.lr_seg.iomode, lrp->args.lr_return_type, -+ ex_fsid, -+ lrp->args.lr_seg.offset, lrp->args.lr_seg.length, layouts_found); -+ -+ /* update layoutrecalls -+ * note: for RETURN_{FSID,ALL}, fp may be NULL -+ */ -+ spin_lock(&layout_lock); -+ list_for_each_entry_safe (clr, nextclr, &clp->cl_layoutrecalls, -+ clr_perclnt) { -+ if 
(clr->cb.cbl_seg.layout_type != lrp->args.lr_seg.layout_type) -+ continue; -+ -+ if (recall_return_perfect_match(clr, lrp, fp, current_fh)) -+ recall_cookie = layoutrecall_done(clr); -+ else if (layouts_found && -+ recall_return_partial_match(clr, lrp, fp, current_fh)) -+ clr->clr_time = CURRENT_TIME; -+ } -+ spin_unlock(&layout_lock); -+ -+out_put_file: -+ if (fp) -+ put_nfs4_file(fp); -+ if (ls) -+ put_layout_state(ls); -+out: -+ nfs4_unlock_state(); -+ -+ /* call exported filesystem layout_return (ignore return-code) */ -+ fs_layout_return(sb, ino, lrp, 0, recall_cookie); -+ -+ dprintk("pNFS %s: exit status %d \n", __func__, status); -+ return status; -+} -+ -+/* -+ * PNFS Metadata server export operations callback for get_state -+ * -+ * called by the cluster fs when it receives a get_state() from a data -+ * server. -+ * returns status, or pnfs_get_state* with pnfs_get_state->status set. -+ * -+ */ -+int -+nfs4_pnfs_cb_get_state(struct super_block *sb, struct pnfs_get_state *arg) -+{ -+ struct nfs4_stateid *stp; -+ int flags = LOCK_STATE | OPEN_STATE; /* search both hash tables */ -+ int status = -EINVAL; -+ struct inode *ino; -+ struct nfs4_delegation *dl; -+ stateid_t *stid = (stateid_t *)&arg->stid; -+ -+ dprintk("NFSD: %s sid=" STATEID_FMT " ino %llu\n", __func__, -+ STATEID_VAL(stid), arg->ino); -+ -+ nfs4_lock_state(); -+ stp = find_stateid(stid, flags); -+ if (!stp) { -+ ino = iget_locked(sb, arg->ino); -+ if (!ino) -+ goto out; -+ -+ if (ino->i_state & I_NEW) { -+ iget_failed(ino); -+ goto out; -+ } -+ -+ dl = find_delegation_stateid(ino, stid); -+ if (dl) -+ status = 0; -+ -+ iput(ino); -+ } else { -+ /* XXX ANDROS: marc removed nfs4_check_fh - how come? 
*/ -+ -+ /* arg->devid is the Data server id, set by the cluster fs */ -+ status = nfs4_add_pnfs_ds_dev(stp, arg->dsid); -+ if (status) -+ goto out; -+ -+ arg->access = stp->st_access_bmap; -+ *(clientid_t *)&arg->clid = -+ stp->st_stateowner->so_client->cl_clientid; -+ } -+out: -+ nfs4_unlock_state(); -+ return status; -+} -+ -+static int -+cl_has_file_layout(struct nfs4_client *clp, struct nfs4_file *lrfile, -+ stateid_t *lsid) -+{ -+ int found = 0; -+ struct nfs4_layout *lp; -+ struct nfs4_layout_state *ls; -+ -+ spin_lock(&layout_lock); -+ list_for_each_entry(lp, &clp->cl_layouts, lo_perclnt) { -+ if (lp->lo_file != lrfile) -+ continue; -+ -+ ls = find_get_layout_state(clp, lrfile); -+ if (!ls) { -+ /* This shouldn't happen as the file should have a -+ * layout stateid if it has a layout. -+ */ -+ printk(KERN_ERR "%s: file %p has no layout stateid\n", -+ __func__, lrfile); -+ WARN_ON(1); -+ break; -+ } -+ update_layout_stateid(ls, lsid); -+ put_layout_state_locked(ls); -+ found = 1; -+ break; -+ } -+ spin_unlock(&layout_lock); -+ -+ return found; -+} -+ -+static int -+cl_has_fsid_layout(struct nfs4_client *clp, struct nfs4_fsid *fsid) -+{ -+ int found = 0; -+ struct nfs4_layout *lp; -+ -+ /* note: minor version unused */ -+ spin_lock(&layout_lock); -+ list_for_each_entry(lp, &clp->cl_layouts, lo_perclnt) -+ if (lp->lo_file->fi_fsid.major == fsid->major) { -+ found = 1; -+ break; -+ } -+ spin_unlock(&layout_lock); -+ return found; -+} -+ -+static int -+cl_has_any_layout(struct nfs4_client *clp) -+{ -+ return !list_empty(&clp->cl_layouts); -+} -+ -+static int -+cl_has_layout(struct nfs4_client *clp, struct nfsd4_pnfs_cb_layout *cbl, -+ struct nfs4_file *lrfile, stateid_t *lsid) -+{ -+ switch (cbl->cbl_recall_type) { -+ case RETURN_FILE: -+ return cl_has_file_layout(clp, lrfile, lsid); -+ case RETURN_FSID: -+ return cl_has_fsid_layout(clp, &cbl->cbl_fsid); -+ default: -+ return cl_has_any_layout(clp); -+ } -+} -+ -+/* -+ * Called without the layout_lock. 
-+ */ -+void -+nomatching_layout(struct nfs4_layoutrecall *clr) -+{ -+ struct nfsd4_pnfs_layoutreturn lr = { -+ .args.lr_return_type = clr->cb.cbl_recall_type, -+ .args.lr_seg = clr->cb.cbl_seg, -+ }; -+ struct inode *inode; -+ void *recall_cookie; -+ -+ if (clr->clr_file) { -+ inode = igrab(clr->clr_file->fi_inode); -+ if (WARN_ON(!inode)) -+ return; -+ } else { -+ inode = NULL; -+ } -+ -+ dprintk("%s: clp %p fp %p: simulating layout_return\n", __func__, -+ clr->clr_client, clr->clr_file); -+ -+ if (clr->cb.cbl_recall_type == RETURN_FILE) -+ pnfs_return_file_layouts(clr->clr_client, clr->clr_file, &lr, -+ NULL); -+ else -+ pnfs_return_client_layouts(clr->clr_client, &lr, -+ clr->cb.cbl_fsid.major); -+ -+ spin_lock(&layout_lock); -+ recall_cookie = layoutrecall_done(clr); -+ spin_unlock(&layout_lock); -+ -+ fs_layout_return(clr->clr_sb, inode, &lr, LR_FLAG_INTERN, -+ recall_cookie); -+ iput(inode); -+} -+ -+void pnfs_expire_client(struct nfs4_client *clp) -+{ -+ for (;;) { -+ struct nfs4_layoutrecall *lrp = NULL; -+ -+ spin_lock(&layout_lock); -+ if (!list_empty(&clp->cl_layoutrecalls)) { -+ lrp = list_entry(clp->cl_layoutrecalls.next, -+ struct nfs4_layoutrecall, clr_perclnt); -+ get_layoutrecall(lrp); -+ } -+ spin_unlock(&layout_lock); -+ if (!lrp) -+ break; -+ -+ dprintk("%s: lrp %p, fp %p\n", __func__, lrp, lrp->clr_file); -+ BUG_ON(lrp->clr_client != clp); -+ nomatching_layout(lrp); -+ put_layoutrecall(lrp); -+ } -+ -+ for (;;) { -+ struct nfs4_layout *lp = NULL; -+ struct inode *inode = NULL; -+ struct nfsd4_pnfs_layoutreturn lr; -+ bool empty = false; -+ -+ spin_lock(&layout_lock); -+ if (!list_empty(&clp->cl_layouts)) { -+ lp = list_entry(clp->cl_layouts.next, -+ struct nfs4_layout, lo_perclnt); -+ inode = igrab(lp->lo_file->fi_inode); -+ memset(&lr, 0, sizeof(lr)); -+ lr.args.lr_return_type = RETURN_FILE; -+ lr.args.lr_seg = lp->lo_seg; -+ empty = list_empty(&lp->lo_file->fi_layouts); -+ BUG_ON(lp->lo_client != clp); -+ dequeue_layout(lp); -+ 
destroy_layout(lp); /* do not access lp after this */ -+ } -+ spin_unlock(&layout_lock); -+ if (!lp) -+ break; -+ -+ if (WARN_ON(!inode)) -+ break; -+ -+ dprintk("%s: inode %lu lp %p clp %p\n", __func__, inode->i_ino, -+ lp, clp); -+ -+ fs_layout_return(inode->i_sb, inode, &lr, LR_FLAG_EXPIRE, -+ empty ? PNFS_LAST_LAYOUT_NO_RECALLS : NULL); -+ iput(inode); -+ } -+} -+ -+struct create_recall_list_arg { -+ struct nfsd4_pnfs_cb_layout *cbl; -+ struct nfs4_file *lrfile; -+ struct list_head *todolist; -+ unsigned todo_count; -+}; -+ -+/* -+ * look for matching layout for the given client -+ * and add a pending layout recall to the todo list -+ * if found any. -+ * returns: -+ * 0 if layouts found or negative error. -+ */ -+static int -+lo_recall_per_client(struct nfs4_client *clp, void *p) -+{ -+ stateid_t lsid; -+ struct nfs4_layoutrecall *pending; -+ struct create_recall_list_arg *arg = p; -+ -+ memset(&lsid, 0, sizeof(lsid)); -+ if (!cl_has_layout(clp, arg->cbl, arg->lrfile, &lsid)) -+ return 0; -+ -+ /* Matching put done by layoutreturn */ -+ pending = alloc_init_layoutrecall(arg->cbl, clp, arg->lrfile); -+ /* out of memory, drain todo queue */ -+ if (!pending) -+ return -ENOMEM; -+ -+ *(stateid_t *)&pending->cb.cbl_sid = lsid; -+ list_add(&pending->clr_perclnt, arg->todolist); -+ arg->todo_count++; -+ return 0; -+} -+ -+/* Create a layoutrecall structure for each client based on the -+ * original structure. 
*/ -+int -+create_layout_recall_list(struct list_head *todolist, unsigned *todo_len, -+ struct nfsd4_pnfs_cb_layout *cbl, -+ struct nfs4_file *lrfile) -+{ -+ struct nfs4_client *clp; -+ struct create_recall_list_arg arg = { -+ .cbl = cbl, -+ .lrfile = lrfile, -+ .todolist = todolist, -+ }; -+ int status = 0; -+ -+ dprintk("%s: -->\n", __func__); -+ -+ /* If client given by fs, just do single client */ -+ if (cbl->cbl_seg.clientid) { -+ clp = find_confirmed_client( -+ (clientid_t *)&cbl->cbl_seg.clientid); -+ if (!clp) { -+ status = -ENOENT; -+ dprintk("%s: clientid %llx not found\n", __func__, -+ (unsigned long long)cbl->cbl_seg.clientid); -+ goto out; -+ } -+ -+ status = lo_recall_per_client(clp, &arg); -+ } else { -+ /* Check all clients for layout matches */ -+ status = filter_confirmed_clients(lo_recall_per_client, &arg); -+ } -+ -+out: -+ *todo_len = arg.todo_count; -+ dprintk("%s: <-- list len %u status %d\n", __func__, *todo_len, status); -+ return status; -+} -+ -+/* -+ * Recall layouts asynchronously -+ * Called with state lock. -+ */ -+static int -+spawn_layout_recall(struct super_block *sb, struct list_head *todolist, -+ unsigned todo_len) -+{ -+ struct nfs4_layoutrecall *pending; -+ struct nfs4_layoutrecall *parent = NULL; -+ int status = 0; -+ -+ dprintk("%s: -->\n", __func__); -+ -+ if (todo_len > 1) { -+ pending = list_entry(todolist->next, struct nfs4_layoutrecall, -+ clr_perclnt); -+ -+ parent = alloc_init_layoutrecall(&pending->cb, NULL, -+ pending->clr_file); -+ if (unlikely(!parent)) { -+ /* We want forward progress. If parent cannot be -+ * allocated take the first one as parent but don't -+ * execute it. Caller must check for -EAGAIN, if so -+ * When the partial recalls return, -+ * nfsd_layout_recall_cb should be called again. 
-+ */ -+ list_del_init(&pending->clr_perclnt); -+ if (todo_len > 2) { -+ parent = pending; -+ } else { -+ parent = NULL; -+ put_layoutrecall(pending); -+ } -+ --todo_len; -+ status = -ENOMEM; -+ } -+ } -+ -+ while (!list_empty(todolist)) { -+ pending = list_entry(todolist->next, struct nfs4_layoutrecall, -+ clr_perclnt); -+ list_del_init(&pending->clr_perclnt); -+ dprintk("%s: clp %p cb_client %p fp %p\n", __func__, -+ pending->clr_client, -+ pending->clr_client->cl_cb_client, -+ pending->clr_file); -+ if (unlikely(!pending->clr_client->cl_cb_client)) { -+ printk(KERN_INFO -+ "%s: clientid %08x/%08x has no callback path\n", -+ __func__, -+ pending->clr_client->cl_clientid.cl_boot, -+ pending->clr_client->cl_clientid.cl_id); -+ put_layoutrecall(pending); -+ continue; -+ } -+ -+ pending->clr_time = CURRENT_TIME; -+ pending->clr_sb = sb; -+ if (parent) { -+ /* If we created a parent its initial ref count is 1. -+ * We will need to de-ref it eventually. So we just -+ * don't increment on behalf of the last one. 
-+ */ -+ if (todo_len != 1) -+ get_layoutrecall(parent); -+ } -+ pending->parent = parent; -+ get_layoutrecall(pending); -+ /* Add to list so corresponding layoutreturn can find req */ -+ list_add(&pending->clr_perclnt, -+ &pending->clr_client->cl_layoutrecalls); -+ -+ nfsd4_cb_layout(pending); -+ --todo_len; -+ } -+ -+ return status; -+} -+ -+/* -+ * Spawn a thread to perform a recall layout -+ * -+ */ -+int nfsd_layout_recall_cb(struct super_block *sb, struct inode *inode, -+ struct nfsd4_pnfs_cb_layout *cbl) -+{ -+ int status; -+ struct nfs4_file *lrfile = NULL; -+ struct list_head todolist; -+ unsigned todo_len = 0; -+ -+ dprintk("NFSD nfsd_layout_recall_cb: inode %p cbl %p\n", inode, cbl); -+ BUG_ON(!cbl); -+ BUG_ON(cbl->cbl_recall_type != RETURN_FILE && -+ cbl->cbl_recall_type != RETURN_FSID && -+ cbl->cbl_recall_type != RETURN_ALL); -+ BUG_ON(cbl->cbl_recall_type == RETURN_FILE && !inode); -+ BUG_ON(cbl->cbl_seg.iomode != IOMODE_READ && -+ cbl->cbl_seg.iomode != IOMODE_RW && -+ cbl->cbl_seg.iomode != IOMODE_ANY); -+ -+ if (nfsd_serv == NULL) { -+ dprintk("NFSD nfsd_layout_recall_cb: nfsd_serv == NULL\n"); -+ return -ENOENT; -+ } -+ -+ nfs4_lock_state(); -+ status = -ENOENT; -+ if (inode) { -+ lrfile = find_file(inode); -+ if (!lrfile) { -+ dprintk("NFSD nfsd_layout_recall_cb: " -+ "nfs4_file not found\n"); -+ goto err; -+ } -+ if (cbl->cbl_recall_type == RETURN_FSID) -+ cbl->cbl_fsid = lrfile->fi_fsid; -+ } -+ -+ INIT_LIST_HEAD(&todolist); -+ -+ /* If no cookie provided by FS, return a default one */ -+ if (!cbl->cbl_cookie) -+ cbl->cbl_cookie = PNFS_LAST_LAYOUT_NO_RECALLS; -+ -+ status = create_layout_recall_list(&todolist, &todo_len, cbl, lrfile); -+ if (list_empty(&todolist)) { -+ status = -ENOENT; -+ } else { -+ /* process todolist even if create_layout_recall_list -+ * returned an error */ -+ int status2 = spawn_layout_recall(sb, &todolist, todo_len); -+ if (status2) -+ status = status2; -+ } -+ -+err: -+ nfs4_unlock_state(); -+ if (lrfile) -+ 
put_nfs4_file(lrfile); -+ return (todo_len && status) ? -EAGAIN : status; -+} -+ -+struct create_device_notify_list_arg { -+ struct list_head *todolist; -+ struct nfsd4_pnfs_cb_dev_list *ndl; -+}; -+ -+static int -+create_device_notify_per_cl(struct nfs4_client *clp, void *p) -+{ -+ struct nfs4_notify_device *cbnd; -+ struct create_device_notify_list_arg *arg = p; -+ -+ if (atomic_read(&clp->cl_deviceref) <= 0) -+ return 0; -+ -+ cbnd = kmalloc(sizeof(*cbnd), GFP_KERNEL); -+ if (!cbnd) -+ return -ENOMEM; -+ -+ cbnd->nd_list = arg->ndl; -+ cbnd->nd_client = clp; -+ list_add(&cbnd->nd_perclnt, arg->todolist); -+ return 0; -+} -+ -+/* Create a list of clients to send device notifications. */ -+int -+create_device_notify_list(struct list_head *todolist, -+ struct nfsd4_pnfs_cb_dev_list *ndl) -+{ -+ int status; -+ struct create_device_notify_list_arg arg = { -+ .todolist = todolist, -+ .ndl = ndl, -+ }; -+ -+ nfs4_lock_state(); -+ status = filter_confirmed_clients(create_device_notify_per_cl, &arg); -+ nfs4_unlock_state(); -+ -+ return status; -+} -+ -+/* -+ * For each client that a device, send a device notification. -+ * XXX: Need to track which clients have which devices. 
-+ */ -+int nfsd_device_notify_cb(struct super_block *sb, -+ struct nfsd4_pnfs_cb_dev_list *ndl) -+{ -+ struct nfs4_notify_device *cbnd; -+ unsigned int notify_num = 0; -+ int status2, status = 0; -+ struct list_head todolist; -+ -+ BUG_ON(!ndl || ndl->cbd_len == 0 || !ndl->cbd_list); -+ -+ dprintk("NFSD %s: cbl %p len %u\n", __func__, ndl, ndl->cbd_len); -+ -+ if (nfsd_serv == NULL) -+ return -ENOENT; -+ -+ INIT_LIST_HEAD(&todolist); -+ -+ status = create_device_notify_list(&todolist, ndl); -+ -+ while (!list_empty(&todolist)) { -+ cbnd = list_entry(todolist.next, struct nfs4_notify_device, -+ nd_perclnt); -+ list_del_init(&cbnd->nd_perclnt); -+ status2 = nfsd4_cb_notify_device(cbnd); -+ pnfs_clear_device_notify(cbnd->nd_client); -+ if (status2) { -+ kfree(cbnd); -+ status = status2; -+ } -+ notify_num++; -+ } -+ -+ dprintk("NFSD %s: status %d clients %u\n", -+ __func__, status, notify_num); -+ return status; -+} -diff --git a/fs/nfsd/nfs4pnfsdlm.c b/fs/nfsd/nfs4pnfsdlm.c -new file mode 100644 -index 0000000..006ded5 ---- /dev/null -+++ b/fs/nfsd/nfs4pnfsdlm.c -@@ -0,0 +1,461 @@ -+/****************************************************************************** -+ * -+ * (c) 2007 Network Appliance, Inc. All Rights Reserved. -+ * (c) 2009 NetApp. All Rights Reserved. -+ * -+ * NetApp provides this source code under the GPL v2 License. -+ * The GPL v2 license is available at -+ * http://opensource.org/licenses/gpl-license.php. -+ * -+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -+ * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR -+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ * -+ ******************************************************************************/ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include "nfsfh.h" -+#include "nfsd.h" -+ -+#define NFSDDBG_FACILITY NFSDDBG_PROC -+ -+/* Just use a linked list. Do not expect more than 32 dlm_device_entries -+ * the first implementation will just use one device per cluster file system -+ */ -+ -+static LIST_HEAD(dlm_device_list); -+static DEFINE_SPINLOCK(dlm_device_list_lock); -+ -+struct dlm_device_entry { -+ struct list_head dlm_dev_list; -+ char disk_name[DISK_NAME_LEN]; -+ int num_ds; -+ char ds_list[NFSD_DLM_DS_LIST_MAX]; -+}; -+ -+static struct dlm_device_entry * -+_nfsd4_find_pnfs_dlm_device(char *disk_name) -+{ -+ struct dlm_device_entry *dlm_pdev; -+ -+ dprintk("--> %s disk name %s\n", __func__, disk_name); -+ spin_lock(&dlm_device_list_lock); -+ list_for_each_entry(dlm_pdev, &dlm_device_list, dlm_dev_list) { -+ dprintk("%s Look for dlm_pdev %s\n", __func__, -+ dlm_pdev->disk_name); -+ if (!memcmp(dlm_pdev->disk_name, disk_name, strlen(disk_name))) { -+ spin_unlock(&dlm_device_list_lock); -+ return dlm_pdev; -+ } -+ } -+ spin_unlock(&dlm_device_list_lock); -+ return NULL; -+} -+ -+static struct dlm_device_entry * -+nfsd4_find_pnfs_dlm_device(struct super_block *sb) { -+ char dname[BDEVNAME_SIZE]; -+ -+ bdevname(sb->s_bdev, dname); -+ return _nfsd4_find_pnfs_dlm_device(dname); -+} -+ -+ssize_t 
-+nfsd4_get_pnfs_dlm_device_list(char *buf, ssize_t buflen) -+{ -+ char *pos = buf; -+ ssize_t size = 0; -+ struct dlm_device_entry *dlm_pdev; -+ int ret = -EINVAL; -+ -+ spin_lock(&dlm_device_list_lock); -+ list_for_each_entry(dlm_pdev, &dlm_device_list, dlm_dev_list) -+ { -+ int advanced; -+ advanced = snprintf(pos, buflen - size, "%s:%s\n", dlm_pdev->disk_name, dlm_pdev->ds_list); -+ if (advanced >= buflen - size) -+ goto out; -+ size += advanced; -+ pos += advanced; -+ } -+ ret = size; -+ -+out: -+ spin_unlock(&dlm_device_list_lock); -+ return ret; -+} -+ -+bool nfsd4_validate_pnfs_dlm_device(char *ds_list, int *num_ds) -+{ -+ char *start = ds_list; -+ -+ *num_ds = 0; -+ -+ while (*start) { -+ struct sockaddr_storage tempAddr; -+ int ipLen = strcspn(start, ","); -+ -+ if (!rpc_pton(start, ipLen, (struct sockaddr *)&tempAddr, sizeof(tempAddr))) -+ return false; -+ (*num_ds)++; -+ start += ipLen + 1; -+ } -+ return true; -+} -+ -+/* -+ * pnfs_dlm_device string format: -+ * block-device-path:, -+ * -+ * Examples -+ * /dev/sda:192.168.1.96,192.168.1.97' creates a data server list with -+ * two data servers for the dlm cluster file system mounted on /dev/sda. -+ * -+ * /dev/sda:192.168.1.96,192.168.1.100' -+ * replaces the data server list for /dev/sda -+ * -+ * Only the deviceid == 1 is supported. Can add device id to -+ * pnfs_dlm_device string when needed. -+ * -+ * Only the round robin each data server once stripe index is supported. -+ */ -+int -+nfsd4_set_pnfs_dlm_device(char *pnfs_dlm_device, int len) -+ -+{ -+ struct dlm_device_entry *new, *found; -+ char *bufp = pnfs_dlm_device; -+ char *endp = bufp + strlen(bufp); -+ int err = -ENOMEM; -+ -+ dprintk("--> %s len %d\n", __func__, len); -+ -+ new = kzalloc(sizeof(*new), GFP_KERNEL); -+ if (!new) -+ return err; -+ -+ err = -EINVAL; -+ /* disk_name */ -+ /* FIXME: need to check for valid disk_name. search superblocks? -+ * check for slash dev slash ? 
-+ */ -+ len = strcspn(bufp, ":"); -+ if (len > DISK_NAME_LEN) -+ goto out_free; -+ memcpy(new->disk_name, bufp, len); -+ -+ err = -EINVAL; -+ bufp += len + 1; -+ if (bufp >= endp) -+ goto out_free; -+ -+ /* data server list */ -+ /* FIXME: need to check for comma separated valid ip format */ -+ len = strcspn(bufp, ":"); -+ if (len > NFSD_DLM_DS_LIST_MAX) -+ goto out_free; -+ memcpy(new->ds_list, bufp, len); -+ -+ -+ /* validate the ips */ -+ if (!nfsd4_validate_pnfs_dlm_device(new->ds_list, &(new->num_ds))) -+ goto out_free; -+ -+ dprintk("%s disk_name %s num_ds %d ds_list %s\n", __func__, -+ new->disk_name, new->num_ds, new->ds_list); -+ -+ found = _nfsd4_find_pnfs_dlm_device(new->disk_name); -+ if (found) { -+ /* FIXME: should compare found->ds_list with new->ds_list -+ * and if it is different, kick off a CB_NOTIFY change -+ * deviceid. -+ */ -+ dprintk("%s pnfs_dlm_device %s:%s already in cache " -+ " replace ds_list with new ds_list %s\n", __func__, -+ found->disk_name, found->ds_list, new->ds_list); -+ memset(found->ds_list, 0, DISK_NAME_LEN); -+ memcpy(found->ds_list, new->ds_list, strlen(new->ds_list)); -+ found->num_ds = new->num_ds; -+ kfree(new); -+ } else { -+ dprintk("%s Adding pnfs_dlm_device %s:%s\n", __func__, -+ new->disk_name, new->ds_list); -+ spin_lock(&dlm_device_list_lock); -+ list_add(&new->dlm_dev_list, &dlm_device_list); -+ spin_unlock(&dlm_device_list_lock); -+ } -+ dprintk("<-- %s Success\n", __func__); -+ return 0; -+ -+out_free: -+ kfree(new); -+ dprintk("<-- %s returns %d\n", __func__, err); -+ return err; -+} -+ -+void nfsd4_pnfs_dlm_shutdown(void) -+{ -+ struct dlm_device_entry *dlm_pdev, *next; -+ -+ dprintk("--> %s\n", __func__); -+ -+ spin_lock(&dlm_device_list_lock); -+ list_for_each_entry_safe (dlm_pdev, next, &dlm_device_list, -+ dlm_dev_list) { -+ list_del(&dlm_pdev->dlm_dev_list); -+ kfree(dlm_pdev); -+ } -+ spin_unlock(&dlm_device_list_lock); -+} -+ -+static int nfsd4_pnfs_dlm_getdeviter(struct super_block *sb, -+ u32 
layout_type, -+ struct nfsd4_pnfs_dev_iter_res *res) -+{ -+ if (layout_type != LAYOUT_NFSV4_1_FILES) { -+ printk(KERN_ERR "%s: ERROR: layout type isn't 'file' " -+ "(type: %x)\n", __func__, layout_type); -+ return -ENOTSUPP; -+ } -+ -+ res->gd_eof = 1; -+ if (res->gd_cookie) -+ return -ENOENT; -+ -+ res->gd_cookie = 1; -+ res->gd_verf = 1; -+ res->gd_devid = 1; -+ return 0; -+} -+ -+static int nfsd4_pnfs_dlm_getdevinfo(struct super_block *sb, -+ struct exp_xdr_stream *xdr, -+ u32 layout_type, -+ const struct nfsd4_pnfs_deviceid *devid) -+{ -+ int err, len, i = 0; -+ struct pnfs_filelayout_device fdev; -+ struct pnfs_filelayout_devaddr *daddr; -+ struct dlm_device_entry *dlm_pdev; -+ char *bufp; -+ -+ err = -ENOTSUPP; -+ if (layout_type != LAYOUT_NFSV4_1_FILES) { -+ dprintk("%s: ERROR: layout type isn't 'file' " -+ "(type: %x)\n", __func__, layout_type); -+ return err; -+ } -+ -+ /* We only hand out a deviceid of 1 in LAYOUTGET, so a GETDEVICEINFO -+ * with a gdia_device_id != 1 is invalid. 
-+ */ -+ err = -EINVAL; -+ if (devid->devid != 1) { -+ dprintk("%s: WARNING: didn't receive a deviceid of " -+ "1 (got: 0x%llx)\n", __func__, devid->devid); -+ return err; -+ } -+ -+ /* -+ * If the DS list has not been established, return -EINVAL -+ */ -+ dlm_pdev = nfsd4_find_pnfs_dlm_device(sb); -+ if (!dlm_pdev) { -+ dprintk("%s: DEBUG: disk %s Not Found\n", __func__, -+ sb->s_bdev->bd_disk->disk_name); -+ return err; -+ } -+ -+ dprintk("%s: Found disk %s with DS list |%s|\n", -+ __func__, dlm_pdev->disk_name, dlm_pdev->ds_list); -+ -+ memset(&fdev, '\0', sizeof(fdev)); -+ fdev.fl_device_length = dlm_pdev->num_ds; -+ -+ err = -ENOMEM; -+ len = sizeof(*fdev.fl_device_list) * fdev.fl_device_length; -+ fdev.fl_device_list = kzalloc(len, GFP_KERNEL); -+ if (!fdev.fl_device_list) { -+ printk(KERN_ERR "%s: ERROR: unable to kmalloc a device list " -+ "buffer for %d DSes.\n", __func__, i); -+ fdev.fl_device_length = 0; -+ goto out; -+ } -+ -+ /* Set a simple stripe indicie */ -+ fdev.fl_stripeindices_length = fdev.fl_device_length; -+ fdev.fl_stripeindices_list = kzalloc(sizeof(u32) * -+ fdev.fl_stripeindices_length, GFP_KERNEL); -+ -+ if (!fdev.fl_stripeindices_list) { -+ printk(KERN_ERR "%s: ERROR: unable to kmalloc a stripeindices " -+ "list buffer for %d DSes.\n", __func__, i); -+ goto out; -+ } -+ for (i = 0; i < fdev.fl_stripeindices_length; i++) -+ fdev.fl_stripeindices_list[i] = i; -+ -+ /* Transfer the data server list with a single multipath entry */ -+ bufp = dlm_pdev->ds_list; -+ for (i = 0; i < fdev.fl_device_length; i++) { -+ daddr = kmalloc(sizeof(*daddr), GFP_KERNEL); -+ if (!daddr) { -+ printk(KERN_ERR "%s: ERROR: unable to kmalloc a device " -+ "addr buffer.\n", __func__); -+ goto out; -+ } -+ -+ daddr->r_netid.data = "tcp"; -+ daddr->r_netid.len = 3; -+ -+ len = strcspn(bufp, ","); -+ daddr->r_addr.data = kmalloc(len + 4, GFP_KERNEL); -+ memcpy(daddr->r_addr.data, bufp, len); -+ /* -+ * append the port number. 
interpreted as two more bytes -+ * beyond the quad: ".8.1" -> 0x08.0x01 -> 0x0801 = port 2049. -+ */ -+ memcpy(daddr->r_addr.data + len, ".8.1", 4); -+ daddr->r_addr.len = len + 4; -+ -+ fdev.fl_device_list[i].fl_multipath_length = 1; -+ fdev.fl_device_list[i].fl_multipath_list = daddr; -+ -+ dprintk("%s: encoding DS |%s|\n", __func__, bufp); -+ -+ bufp += len + 1; -+ } -+ -+ /* have nfsd encode the device info */ -+ err = filelayout_encode_devinfo(xdr, &fdev); -+out: -+ for (i = 0; i < fdev.fl_device_length; i++) -+ kfree(fdev.fl_device_list[i].fl_multipath_list); -+ kfree(fdev.fl_device_list); -+ kfree(fdev.fl_stripeindices_list); -+ dprintk("<-- %s returns %d\n", __func__, err); -+ return err; -+} -+ -+static int get_stripe_unit(int blocksize) -+{ -+ if (blocksize >= NFSSVC_MAXBLKSIZE) -+ return blocksize; -+ return NFSSVC_MAXBLKSIZE - (NFSSVC_MAXBLKSIZE % blocksize); -+} -+ -+/* -+ * Look up inode block device in pnfs_dlm_device list. -+ * Hash on the inode->i_ino and number of data servers. 
-+ */ -+static int dlm_ino_hash(struct inode *ino) -+{ -+ struct dlm_device_entry *de; -+ u32 hash_mask = 0; -+ -+ /* If can't find the inode block device in the pnfs_dlm_deivce list -+ * then don't hand out a layout -+ */ -+ de = nfsd4_find_pnfs_dlm_device(ino->i_sb); -+ if (!de) -+ return -1; -+ hash_mask = de->num_ds - 1; -+ return ino->i_ino & hash_mask; -+} -+ -+static enum nfsstat4 nfsd4_pnfs_dlm_layoutget(struct inode *inode, -+ struct exp_xdr_stream *xdr, -+ const struct nfsd4_pnfs_layoutget_arg *args, -+ struct nfsd4_pnfs_layoutget_res *res) -+{ -+ struct pnfs_filelayout_layout *layout = NULL; -+ struct knfsd_fh *fhp = NULL; -+ int index; -+ enum nfsstat4 rc = NFS4_OK; -+ -+ dprintk("%s: LAYOUT_GET\n", __func__); -+ -+ /* DLM exported file systems only support layouts for READ */ -+ if (res->lg_seg.iomode == IOMODE_RW) -+ return NFS4ERR_BADIOMODE; -+ -+ index = dlm_ino_hash(inode); -+ dprintk("%s first stripe index %d i_ino %lu\n", __func__, index, -+ inode->i_ino); -+ if (index < 0) -+ return NFS4ERR_LAYOUTUNAVAILABLE; -+ -+ res->lg_seg.layout_type = LAYOUT_NFSV4_1_FILES; -+ /* Always give out whole file layouts */ -+ res->lg_seg.offset = 0; -+ res->lg_seg.length = NFS4_MAX_UINT64; -+ /* Always give out READ ONLY layouts */ -+ res->lg_seg.iomode = IOMODE_READ; -+ -+ layout = kzalloc(sizeof(*layout), GFP_KERNEL); -+ if (layout == NULL) { -+ rc = NFS4ERR_LAYOUTTRYLATER; -+ goto error; -+ } -+ -+ /* Set file layout response args */ -+ layout->lg_layout_type = LAYOUT_NFSV4_1_FILES; -+ layout->lg_stripe_type = STRIPE_SPARSE; -+ layout->lg_commit_through_mds = false; -+ layout->lg_stripe_unit = get_stripe_unit(inode->i_sb->s_blocksize); -+ layout->lg_fh_length = 1; -+ layout->device_id.sbid = args->lg_sbid; -+ layout->device_id.devid = 1; /*FSFTEMP*/ -+ layout->lg_first_stripe_index = index; /*FSFTEMP*/ -+ layout->lg_pattern_offset = 0; -+ -+ fhp = kmalloc(sizeof(*fhp), GFP_KERNEL); -+ if (fhp == NULL) { -+ rc = NFS4ERR_LAYOUTTRYLATER; -+ goto error; -+ } -+ -+ 
memcpy(fhp, args->lg_fh, sizeof(*fhp)); -+ pnfs_fh_mark_ds(fhp); -+ layout->lg_fh_list = fhp; -+ -+ /* Call nfsd to encode layout */ -+ rc = filelayout_encode_layout(xdr, layout); -+exit: -+ kfree(layout); -+ kfree(fhp); -+ return rc; -+ -+error: -+ res->lg_seg.length = 0; -+ goto exit; -+} -+ -+static int -+nfsd4_pnfs_dlm_layouttype(struct super_block *sb) -+{ -+ return LAYOUT_NFSV4_1_FILES; -+} -+ -+/* For use by DLM cluster file systems exported by pNFSD */ -+const struct pnfs_export_operations pnfs_dlm_export_ops = { -+ .layout_type = nfsd4_pnfs_dlm_layouttype, -+ .get_device_info = nfsd4_pnfs_dlm_getdevinfo, -+ .get_device_iter = nfsd4_pnfs_dlm_getdeviter, -+ .layout_get = nfsd4_pnfs_dlm_layoutget, -+}; -+EXPORT_SYMBOL(pnfs_dlm_export_ops); -diff --git a/fs/nfsd/nfs4pnfsds.c b/fs/nfsd/nfs4pnfsds.c -new file mode 100644 -index 0000000..8ebc64d ---- /dev/null -+++ b/fs/nfsd/nfs4pnfsds.c -@@ -0,0 +1,620 @@ -+/* -+* linux/fs/nfsd/nfs4pnfsds.c -+* -+* Copyright (c) 2005 The Regents of the University of Michigan. -+* All rights reserved. -+* -+* Andy Adamson -+* -+* Redistribution and use in source and binary forms, with or without -+* modification, are permitted provided that the following conditions -+* are met: -+* -+* 1. Redistributions of source code must retain the above copyright -+* notice, this list of conditions and the following disclaimer. -+* 2. Redistributions in binary form must reproduce the above copyright -+* notice, this list of conditions and the following disclaimer in the -+* documentation and/or other materials provided with the distribution. -+* 3. Neither the name of the University nor the names of its -+* contributors may be used to endorse or promote products derived -+* from this software without specific prior written permission. 
-+* -+* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED -+* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -+* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE -+* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -+* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -+* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -+* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -+* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+* -+*/ -+#if defined(CONFIG_PNFSD) -+ -+#define NFSDDBG_FACILITY NFSDDBG_PNFS -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include "nfsd.h" -+#include "pnfsd.h" -+#include "state.h" -+ -+/* -+ ******************* -+ * PNFS -+ ******************* -+ */ -+/* -+ * Hash tables for pNFS Data Server state -+ * -+ * mds_nodeid: list of struct pnfs_mds_id one per Metadata server (MDS) using -+ * this data server (DS). -+ * -+ * mds_clid_hashtbl[]: uses clientid_hashval(), hash of all clientids obtained -+ * from any MDS. -+ * -+ * ds_stid_hashtbl[]: uses stateid_hashval(), hash of all stateids obtained -+ * from any MDS. 
-+ * -+ */ -+/* Hash tables for clientid state */ -+#define CLIENT_HASH_BITS 4 -+#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) -+#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1) -+ -+#define clientid_hashval(id) \ -+ ((id) & CLIENT_HASH_MASK) -+ -+/* hash table for pnfs_ds_stateid */ -+#define STATEID_HASH_BITS 10 -+#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) -+#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) -+ -+#define stateid_hashval(owner_id, file_id) \ -+ (((owner_id) + (file_id)) & STATEID_HASH_MASK) -+ -+static struct list_head mds_id_tbl; -+static struct list_head mds_clid_hashtbl[CLIENT_HASH_SIZE]; -+static struct list_head ds_stid_hashtbl[STATEID_HASH_SIZE]; -+ -+static inline void put_ds_clientid(struct pnfs_ds_clientid *dcp); -+static inline void put_ds_mdsid(struct pnfs_mds_id *mdp); -+ -+/* Mutex for data server state. Needs to be separate from -+ * mds state mutex since a node can be both mds and ds */ -+static DEFINE_MUTEX(ds_mutex); -+static struct thread_info *ds_mutex_owner; -+ -+static void -+ds_lock_state(void) -+{ -+ mutex_lock(&ds_mutex); -+ ds_mutex_owner = current_thread_info(); -+} -+ -+static void -+ds_unlock_state(void) -+{ -+ BUG_ON(ds_mutex_owner != current_thread_info()); -+ ds_mutex_owner = NULL; -+ mutex_unlock(&ds_mutex); -+} -+ -+static int -+cmp_clid(const clientid_t *cl1, const clientid_t *cl2) -+{ -+ return (cl1->cl_boot == cl2->cl_boot) && -+ (cl1->cl_id == cl2->cl_id); -+} -+ -+void -+nfs4_pnfs_state_init(void) -+{ -+ int i; -+ -+ for (i = 0; i < CLIENT_HASH_SIZE; i++) -+ INIT_LIST_HEAD(&mds_clid_hashtbl[i]); -+ -+ for (i = 0; i < STATEID_HASH_SIZE; i++) -+ INIT_LIST_HEAD(&ds_stid_hashtbl[i]); -+ -+ INIT_LIST_HEAD(&mds_id_tbl); -+} -+ -+static struct pnfs_mds_id * -+find_pnfs_mds_id(u32 mdsid) -+{ -+ struct pnfs_mds_id *local = NULL; -+ -+ dprintk("pNFSD: %s\n", __func__); -+ list_for_each_entry(local, &mds_id_tbl, di_hash) { -+ if (local->di_mdsid == mdsid) -+ return local; -+ } -+ return NULL; -+} -+ -+static 
struct pnfs_ds_clientid * -+find_pnfs_ds_clientid(const clientid_t *clid) -+{ -+ struct pnfs_ds_clientid *local = NULL; -+ unsigned int hashval; -+ -+ dprintk("pNFSD: %s\n", __func__); -+ -+ hashval = clientid_hashval(clid->cl_id); -+ list_for_each_entry(local, &mds_clid_hashtbl[hashval], dc_hash) { -+ if (cmp_clid(&local->dc_mdsclid, clid)) -+ return local; -+ } -+ return NULL; -+} -+ -+static struct pnfs_ds_stateid * -+find_pnfs_ds_stateid(stateid_t *stid) -+{ -+ struct pnfs_ds_stateid *local = NULL; -+ u32 st_id = stid->si_stateownerid; -+ u32 f_id = stid->si_fileid; -+ unsigned int hashval; -+ -+ dprintk("pNFSD: %s\n", __func__); -+ -+ hashval = stateid_hashval(st_id, f_id); -+ list_for_each_entry(local, &ds_stid_hashtbl[hashval], ds_hash) -+ if ((local->ds_stid.si_stateownerid == st_id) && -+ (local->ds_stid.si_fileid == f_id) && -+ (local->ds_stid.si_boot == stid->si_boot)) { -+ stateid_t *sid = &local->ds_stid; -+ dprintk("NFSD: %s <-- %p ds_flags %lx " STATEID_FMT "\n", -+ __func__, local, local->ds_flags, -+ STATEID_VAL(sid)); -+ return local; -+ } -+ return NULL; -+} -+ -+static void -+release_ds_mdsid(struct kref *kref) -+{ -+ struct pnfs_mds_id *mdp = -+ container_of(kref, struct pnfs_mds_id, di_ref); -+ dprintk("pNFSD: %s\n", __func__); -+ -+ list_del(&mdp->di_hash); -+ list_del(&mdp->di_mdsclid); -+ kfree(mdp); -+} -+ -+static void -+release_ds_clientid(struct kref *kref) -+{ -+ struct pnfs_ds_clientid *dcp = -+ container_of(kref, struct pnfs_ds_clientid, dc_ref); -+ struct pnfs_mds_id *mdp; -+ dprintk("pNFSD: %s\n", __func__); -+ -+ mdp = find_pnfs_mds_id(dcp->dc_mdsid); -+ if (mdp) -+ put_ds_mdsid(mdp); -+ -+ list_del(&dcp->dc_hash); -+ list_del(&dcp->dc_stateid); -+ list_del(&dcp->dc_permdsid); -+ kfree(dcp); -+} -+ -+static void -+release_ds_stateid(struct kref *kref) -+{ -+ struct pnfs_ds_stateid *dsp = -+ container_of(kref, struct pnfs_ds_stateid, ds_ref); -+ struct pnfs_ds_clientid *dcp; -+ dprintk("pNFS %s: dsp %p\n", __func__, dsp); -+ -+ dcp 
= find_pnfs_ds_clientid(&dsp->ds_mdsclid); -+ if (dcp) -+ put_ds_clientid(dcp); -+ -+ list_del(&dsp->ds_hash); -+ list_del(&dsp->ds_perclid); -+ kfree(dsp); -+} -+ -+static inline void -+put_ds_clientid(struct pnfs_ds_clientid *dcp) -+{ -+ dprintk("pNFS %s: dcp %p ref %d\n", __func__, dcp, -+ atomic_read(&dcp->dc_ref.refcount)); -+ kref_put(&dcp->dc_ref, release_ds_clientid); -+} -+ -+static inline void -+get_ds_clientid(struct pnfs_ds_clientid *dcp) -+{ -+ dprintk("pNFS %s: dcp %p ref %d\n", __func__, dcp, -+ atomic_read(&dcp->dc_ref.refcount)); -+ kref_get(&dcp->dc_ref); -+} -+ -+static inline void -+put_ds_mdsid(struct pnfs_mds_id *mdp) -+{ -+ dprintk("pNFS %s: mdp %p ref %d\n", __func__, mdp, -+ atomic_read(&mdp->di_ref.refcount)); -+ kref_put(&mdp->di_ref, release_ds_mdsid); -+} -+ -+static inline void -+get_ds_mdsid(struct pnfs_mds_id *mdp) -+{ -+ dprintk("pNFS %s: mdp %p ref %d\n", __func__, mdp, -+ atomic_read(&mdp->di_ref.refcount)); -+ kref_get(&mdp->di_ref); -+} -+ -+static inline void -+put_ds_stateid(struct pnfs_ds_stateid *dsp) -+{ -+ dprintk("pNFS %s: dsp %p ref %d\n", __func__, dsp, -+ atomic_read(&dsp->ds_ref.refcount)); -+ kref_put(&dsp->ds_ref, release_ds_stateid); -+} -+ -+static inline void -+get_ds_stateid(struct pnfs_ds_stateid *dsp) -+{ -+ dprintk("pNFS %s: dsp %p ref %d\n", __func__, dsp, -+ atomic_read(&dsp->ds_ref.refcount)); -+ kref_get(&dsp->ds_ref); -+} -+ -+void -+nfs4_pnfs_state_shutdown(void) -+{ -+ struct pnfs_ds_stateid *dsp; -+ int i; -+ -+ dprintk("pNFSD %s: -->\n", __func__); -+ -+ ds_lock_state(); -+ for (i = 0; i < STATEID_HASH_SIZE; i++) { -+ while (!list_empty(&ds_stid_hashtbl[i])) { -+ dsp = list_entry(ds_stid_hashtbl[i].next, -+ struct pnfs_ds_stateid, ds_hash); -+ put_ds_stateid(dsp); -+ } -+ } -+ ds_unlock_state(); -+} -+ -+static struct pnfs_mds_id * -+alloc_init_mds_id(struct pnfs_get_state *gsp) -+{ -+ struct pnfs_mds_id *mdp; -+ -+ dprintk("pNFSD: %s\n", __func__); -+ -+ mdp = kmalloc(sizeof(*mdp), GFP_KERNEL); -+ 
if (!mdp) -+ return NULL; -+ INIT_LIST_HEAD(&mdp->di_hash); -+ INIT_LIST_HEAD(&mdp->di_mdsclid); -+ list_add(&mdp->di_hash, &mds_id_tbl); -+ mdp->di_mdsid = gsp->dsid; -+ mdp->di_mdsboot = 0; -+ kref_init(&mdp->di_ref); -+ return mdp; -+} -+ -+static struct pnfs_ds_clientid * -+alloc_init_ds_clientid(struct pnfs_get_state *gsp) -+{ -+ struct pnfs_mds_id *mdp; -+ struct pnfs_ds_clientid *dcp; -+ clientid_t *clid = (clientid_t *)&gsp->clid; -+ unsigned int hashval = clientid_hashval(clid->cl_id); -+ -+ dprintk("pNFSD: %s\n", __func__); -+ -+ mdp = find_pnfs_mds_id(gsp->dsid); -+ if (!mdp) { -+ mdp = alloc_init_mds_id(gsp); -+ if (!mdp) -+ return NULL; -+ } else { -+ get_ds_mdsid(mdp); -+ } -+ -+ dcp = kmalloc(sizeof(*dcp), GFP_KERNEL); -+ if (!dcp) -+ return NULL; -+ -+ INIT_LIST_HEAD(&dcp->dc_hash); -+ INIT_LIST_HEAD(&dcp->dc_stateid); -+ INIT_LIST_HEAD(&dcp->dc_permdsid); -+ list_add(&dcp->dc_hash, &mds_clid_hashtbl[hashval]); -+ list_add(&dcp->dc_permdsid, &mdp->di_mdsclid); -+ dcp->dc_mdsclid = *clid; -+ kref_init(&dcp->dc_ref); -+ dcp->dc_mdsid = gsp->dsid; -+ return dcp; -+} -+ -+static struct pnfs_ds_stateid * -+alloc_init_ds_stateid(struct svc_fh *cfh, stateid_t *stidp) -+{ -+ struct pnfs_ds_stateid *dsp; -+ u32 st_id = stidp->si_stateownerid; -+ u32 f_id = stidp->si_fileid; -+ unsigned int hashval; -+ -+ dprintk("pNFSD: %s\n", __func__); -+ -+ dsp = kmalloc(sizeof(*dsp), GFP_KERNEL); -+ if (!dsp) -+ return dsp; -+ -+ INIT_LIST_HEAD(&dsp->ds_hash); -+ INIT_LIST_HEAD(&dsp->ds_perclid); -+ memcpy(&dsp->ds_stid, stidp, sizeof(stateid_t)); -+ fh_copy_shallow(&dsp->ds_fh, &cfh->fh_handle); -+ dsp->ds_access = 0; -+ dsp->ds_status = 0; -+ dsp->ds_flags = 0L; -+ kref_init(&dsp->ds_ref); -+ set_bit(DS_STATEID_NEW, &dsp->ds_flags); -+ clear_bit(DS_STATEID_VALID, &dsp->ds_flags); -+ clear_bit(DS_STATEID_ERROR, &dsp->ds_flags); -+ init_waitqueue_head(&dsp->ds_waitq); -+ -+ hashval = stateid_hashval(st_id, f_id); -+ list_add(&dsp->ds_hash, &ds_stid_hashtbl[hashval]); -+ 
dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp); -+ return dsp; -+} -+ -+static int -+update_ds_stateid(struct pnfs_ds_stateid *dsp, struct svc_fh *cfh, -+ struct pnfs_get_state *gsp) -+{ -+ struct pnfs_ds_clientid *dcp; -+ int new = 0; -+ -+ dprintk("pNFSD: %s dsp %p\n", __func__, dsp); -+ -+ dcp = find_pnfs_ds_clientid((clientid_t *)&gsp->clid); -+ if (!dcp) { -+ dcp = alloc_init_ds_clientid(gsp); -+ if (!dcp) -+ return 1; -+ new = 1; -+ } -+ if (test_bit(DS_STATEID_NEW, &dsp->ds_flags)) { -+ list_add(&dsp->ds_perclid, &dcp->dc_stateid); -+ if (!new) -+ get_ds_clientid(dcp); -+ } -+ -+ memcpy(&dsp->ds_stid, &gsp->stid, sizeof(stateid_t)); -+ dsp->ds_access = gsp->access; -+ dsp->ds_status = 0; -+ dsp->ds_verifier[0] = gsp->verifier[0]; -+ dsp->ds_verifier[1] = gsp->verifier[1]; -+ memcpy(&dsp->ds_mdsclid, &gsp->clid, sizeof(clientid_t)); -+ set_bit(DS_STATEID_VALID, &dsp->ds_flags); -+ clear_bit(DS_STATEID_ERROR, &dsp->ds_flags); -+ clear_bit(DS_STATEID_NEW, &dsp->ds_flags); -+ return 0; -+} -+ -+int -+nfs4_pnfs_cb_change_state(struct pnfs_get_state *gs) -+{ -+ stateid_t *stid = (stateid_t *)&gs->stid; -+ struct pnfs_ds_stateid *dsp; -+ -+ dprintk("pNFSD: %s stateid=" STATEID_FMT "\n", __func__, -+ STATEID_VAL(stid)); -+ -+ ds_lock_state(); -+ dsp = find_pnfs_ds_stateid(stid); -+ if (dsp) -+ put_ds_stateid(dsp); -+ ds_unlock_state(); -+ -+ dprintk("pNFSD: %s dsp %p\n", __func__, dsp); -+ -+ if (dsp) -+ return 0; -+ return -ENOENT; -+} -+ -+/* Retrieves and validates stateid. -+ * If stateid exists and its fields match, return it. -+ * If stateid exists but either the generation or -+ * ownerids don't match, check with mds to see if it is valid. -+ * If the stateid doesn't exist, the first thread creates a -+ * invalid *marker* stateid, then checks to see if the -+ * stateid exists on the mds. If so, it validates the *marker* -+ * stateid and updates its fields. Subsequent threads that -+ * find the *marker* stateid wait until it is valid or an error -+ * occurs. 
-+ * Called with ds_state_lock. -+ */ -+static struct pnfs_ds_stateid * -+nfsv4_ds_get_state(struct svc_fh *cfh, stateid_t *stidp) -+{ -+ struct inode *ino = cfh->fh_dentry->d_inode; -+ struct super_block *sb; -+ struct pnfs_ds_stateid *dsp = NULL; -+ struct pnfs_get_state gs = { -+ .access = 0, -+ }; -+ int status = 0, waiter = 0; -+ -+ dprintk("pNFSD: %s -->\n", __func__); -+ -+ dsp = find_pnfs_ds_stateid(stidp); -+ if (dsp && test_bit(DS_STATEID_VALID, &dsp->ds_flags) && -+ (stidp->si_generation == dsp->ds_stid.si_generation)) -+ goto out_noput; -+ -+ sb = ino->i_sb; -+ if (!sb || !sb->s_pnfs_op->get_state) -+ goto out_noput; -+ -+ /* Uninitialize current state if it exists yet it doesn't match. -+ * If it is already invalid, another thread is checking state */ -+ if (dsp) { -+ if (!test_and_clear_bit(DS_STATEID_VALID, &dsp->ds_flags)) -+ waiter = 1; -+ } else { -+ dsp = alloc_init_ds_stateid(cfh, stidp); -+ if (!dsp) -+ goto out_noput; -+ } -+ -+ dprintk("pNFSD: %s Starting loop\n", __func__); -+ get_ds_stateid(dsp); -+ while (!test_bit(DS_STATEID_VALID, &dsp->ds_flags)) { -+ ds_unlock_state(); -+ -+ /* Another thread is checking the state */ -+ if (waiter) { -+ dprintk("pNFSD: %s waiting\n", __func__); -+ wait_event_interruptible_timeout(dsp->ds_waitq, -+ (test_bit(DS_STATEID_VALID, &dsp->ds_flags) || -+ test_bit(DS_STATEID_ERROR, &dsp->ds_flags)), -+ msecs_to_jiffies(1024)); -+ dprintk("pNFSD: %s awake\n", __func__); -+ ds_lock_state(); -+ if (test_bit(DS_STATEID_ERROR, &dsp->ds_flags)) -+ goto out; -+ -+ continue; -+ } -+ -+ /* Validate stateid on mds */ -+ dprintk("pNFSD: %s Checking state on MDS\n", __func__); -+ memcpy(&gs.stid, stidp, sizeof(stateid_t)); -+ status = sb->s_pnfs_op->get_state(ino, &cfh->fh_handle, &gs); -+ dprintk("pNFSD: %s from MDS status %d\n", __func__, status); -+ ds_lock_state(); -+ /* if !status and stateid is valid, update id and mark valid */ -+ if (status || update_ds_stateid(dsp, cfh, &gs)) { -+ set_bit(DS_STATEID_ERROR, 
&dsp->ds_flags); -+ /* remove invalid stateid from list */ -+ put_ds_stateid(dsp); -+ wake_up(&dsp->ds_waitq); -+ goto out; -+ } -+ -+ wake_up(&dsp->ds_waitq); -+ } -+out: -+ if (dsp) -+ put_ds_stateid(dsp); -+out_noput: -+ if (dsp) -+ dprintk("pNFSD: %s <-- dsp %p ds_flags %lx " STATEID_FMT "\n", -+ __func__, dsp, dsp->ds_flags, STATEID_VAL(&dsp->ds_stid)); -+ /* If error, return null */ -+ if (dsp && test_bit(DS_STATEID_ERROR, &dsp->ds_flags)) -+ dsp = NULL; -+ dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp); -+ return dsp; -+} -+ -+int -+nfs4_preprocess_pnfs_ds_stateid(struct svc_fh *cfh, stateid_t *stateid) -+{ -+ struct pnfs_ds_stateid *dsp; -+ int status = 0; -+ -+ dprintk("pNFSD: %s --> " STATEID_FMT "\n", __func__, -+ STATEID_VAL(stateid)); -+ -+ /* Must release state lock while verifying stateid on mds */ -+ nfs4_unlock_state(); -+ ds_lock_state(); -+ dsp = nfsv4_ds_get_state(cfh, stateid); -+ if (dsp) { -+ get_ds_stateid(dsp); -+ dprintk("pNFSD: %s Found " STATEID_FMT "\n", __func__, -+ STATEID_VAL(&dsp->ds_stid)); -+ -+ dprintk("NFSD: %s: dsp %p fh_size %u:%u " -+ "fh [%08x:%08x:%08x:%08x]:[%08x:%08x:%08x:%08x] " -+ "gen %x:%x\n", -+ __func__, dsp, -+ cfh->fh_handle.fh_size, dsp->ds_fh.fh_size, -+ ((unsigned *)&cfh->fh_handle.fh_base)[0], -+ ((unsigned *)&cfh->fh_handle.fh_base)[1], -+ ((unsigned *)&cfh->fh_handle.fh_base)[2], -+ ((unsigned *)&cfh->fh_handle.fh_base)[3], -+ ((unsigned *)&dsp->ds_fh.fh_base)[0], -+ ((unsigned *)&dsp->ds_fh.fh_base)[1], -+ ((unsigned *)&dsp->ds_fh.fh_base)[2], -+ ((unsigned *)&dsp->ds_fh.fh_base)[3], -+ stateid->si_generation, dsp->ds_stid.si_generation); -+ } -+ -+ if (!dsp || -+ (cfh->fh_handle.fh_size != dsp->ds_fh.fh_size) || -+ (memcmp(&cfh->fh_handle.fh_base, &dsp->ds_fh.fh_base, -+ dsp->ds_fh.fh_size) != 0) || -+ (stateid->si_generation > dsp->ds_stid.si_generation)) -+ status = nfserr_bad_stateid; -+ else if (stateid->si_generation < dsp->ds_stid.si_generation) -+ status = nfserr_old_stateid; -+ -+ if (dsp) -+ 
put_ds_stateid(dsp); -+ ds_unlock_state(); -+ nfs4_lock_state(); -+ dprintk("pNFSD: %s <-- status %d\n", __func__, be32_to_cpu(status)); -+ return status; -+} -+ -+void -+nfs4_ds_get_verifier(stateid_t *stateid, struct super_block *sb, u32 *p) -+{ -+ struct pnfs_ds_stateid *dsp = NULL; -+ -+ dprintk("pNFSD: %s --> stid %p\n", __func__, stateid); -+ -+ ds_lock_state(); -+ if (stateid != NULL) { -+ dsp = find_pnfs_ds_stateid(stateid); -+ if (dsp) -+ get_ds_stateid(dsp); -+ } -+ -+ /* XXX: Should we fetch the stateid or wait if some other -+ * thread is currently retrieving the stateid ? */ -+ if (dsp && test_bit(DS_STATEID_VALID, &dsp->ds_flags)) { -+ *p++ = dsp->ds_verifier[0]; -+ *p++ = dsp->ds_verifier[1]; -+ put_ds_stateid(dsp); -+ } else { -+ /* must be on MDS */ -+ ds_unlock_state(); -+ sb->s_pnfs_op->get_verifier(sb, p); -+ ds_lock_state(); -+ p += 2; -+ } -+ ds_unlock_state(); -+ dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp); -+ return; -+} -+ -+#endif /* CONFIG_PNFSD */ -diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c -index 59ec449..00cbf11 100644 ---- a/fs/nfsd/nfs4proc.c -+++ b/fs/nfsd/nfs4proc.c -@@ -34,10 +34,14 @@ - */ - #include - #include -+#include -+#include -+#include - - #include "cache.h" - #include "xdr4.h" - #include "vfs.h" -+#include "pnfsd.h" - - #define NFSDDBG_FACILITY NFSDDBG_PROC - -@@ -372,6 +376,24 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, - * set, (2) sets open->op_stateid, (3) sets open->op_delegation. - */ - status = nfsd4_process_open2(rqstp, &cstate->current_fh, open); -+#if defined(CONFIG_SPNFS) -+ if (!status && spnfs_enabled()) { -+ struct inode *inode = cstate->current_fh.fh_dentry->d_inode; -+ -+ status = spnfs_open(inode, open); -+ if (status) { -+ dprintk( -+ "nfsd: pNFS could not be enabled for inode: %lu\n", -+ inode->i_ino); -+ /* -+ * XXX When there's a failure then need to indicate to -+ * future ops that no pNFS is available. Should I save -+ * the status in the inode? 
It's kind of a big hammer. -+ * But there may be no stripes available? -+ */ -+ } -+ } -+#endif /* CONFIG_SPNFS */ - out: - if (open->op_stateowner) { - nfs4_get_stateowner(open->op_stateowner); -@@ -454,16 +476,30 @@ nfsd4_access(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, - &access->ac_supported); - } - -+static void -+nfsd4_get_verifier(struct super_block *sb, nfs4_verifier *verf) -+{ -+ u32 *p = (u32 *)verf->data; -+ -+#if defined(CONFIG_PNFSD) -+ if (sb->s_pnfs_op && sb->s_pnfs_op->get_verifier) { -+ nfs4_ds_get_verifier(NULL, sb, p); -+ return; -+ } -+#endif /* CONFIG_PNFSD */ -+ -+ *p++ = nfssvc_boot.tv_sec; -+ *p++ = nfssvc_boot.tv_usec; -+} -+ - static __be32 - nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, - struct nfsd4_commit *commit) - { - __be32 status; - -- u32 *p = (u32 *)commit->co_verf.data; -- *p++ = nfssvc_boot.tv_sec; -- *p++ = nfssvc_boot.tv_usec; -- -+ nfsd4_get_verifier(cstate->current_fh.fh_dentry->d_inode->i_sb, -+ &commit->co_verf); - status = nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, - commit->co_count); - if (status == nfserr_symlink) -@@ -816,7 +852,6 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, - { - stateid_t *stateid = &write->wr_stateid; - struct file *filp = NULL; -- u32 *p; - __be32 status = nfs_ok; - unsigned long cnt; - -@@ -838,13 +873,49 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, - - cnt = write->wr_buflen; - write->wr_how_written = write->wr_stable_how; -- p = (u32 *)write->wr_verifier.data; -- *p++ = nfssvc_boot.tv_sec; -- *p++ = nfssvc_boot.tv_usec; - -+ nfsd4_get_verifier(cstate->current_fh.fh_dentry->d_inode->i_sb, -+ &write->wr_verifier); -+#if defined(CONFIG_SPNFS) -+#if defined(CONFIG_SPNFS_BLOCK) -+ if (pnfs_block_enabled(cstate->current_fh.fh_dentry->d_inode, 0)) { -+ status = bl_layoutrecall(cstate->current_fh.fh_dentry->d_inode, -+ RETURN_FILE, write->wr_offset, write->wr_buflen); -+ if 
(!status) { -+ status = nfsd_write(rqstp, &cstate->current_fh, filp, -+ write->wr_offset, rqstp->rq_vec, write->wr_vlen, -+ &cnt, &write->wr_how_written); -+ } -+ } else -+#endif -+ -+ if (spnfs_enabled()) { -+ status = spnfs_write(cstate->current_fh.fh_dentry->d_inode, -+ write->wr_offset, write->wr_buflen, write->wr_vlen, -+ rqstp); -+ if (status == nfs_ok) { -+ /* DMXXX: HACK to get filesize set */ -+ /* write one byte at offset+length-1 */ -+ struct kvec k[1]; -+ char zero = 0; -+ unsigned long cnt = 1; -+ -+ k[0].iov_base = (void *)&zero; -+ k[0].iov_len = 1; -+ nfsd_write(rqstp, &cstate->current_fh, filp, -+ write->wr_offset+write->wr_buflen-1, k, 1, -+ &cnt, &write->wr_how_written); -+ } -+ } else /* we're not an MDS */ -+ status = nfsd_write(rqstp, &cstate->current_fh, filp, -+ write->wr_offset, rqstp->rq_vec, write->wr_vlen, -+ &cnt, &write->wr_how_written); -+#else - status = nfsd_write(rqstp, &cstate->current_fh, filp, - write->wr_offset, rqstp->rq_vec, write->wr_vlen, - &cnt, &write->wr_how_written); -+#endif /* CONFIG_SPNFS */ -+ - if (filp) - fput(filp); - -@@ -935,6 +1006,306 @@ nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, - return status == nfserr_same ? nfs_ok : status; - } - -+#if defined(CONFIG_PNFSD) -+ -+static __be32 -+nfsd4_layout_verify(struct super_block *sb, struct svc_export *exp, -+ unsigned int layout_type) -+{ -+ int status, type; -+ -+ /* check to see if pNFS is supported. */ -+ status = nfserr_layoutunavailable; -+ if (exp && exp->ex_pnfs == 0) { -+ dprintk("%s: Underlying file system " -+ "is not exported over pNFS\n", __func__); -+ goto out; -+ } -+ if (!sb->s_pnfs_op || !sb->s_pnfs_op->layout_type) { -+ dprintk("%s: Underlying file system " -+ "does not support pNFS\n", __func__); -+ goto out; -+ } -+ -+ type = sb->s_pnfs_op->layout_type(sb); -+ -+ /* check to see if requested layout type is supported. 
*/ -+ status = nfserr_unknown_layouttype; -+ if (!type) -+ dprintk("BUG: %s: layout_type 0 is reserved and must not be " -+ "used by filesystem\n", __func__); -+ else if (type != layout_type) -+ dprintk("%s: requested layout type %d " -+ "does not match supported type %d\n", -+ __func__, layout_type, type); -+ else -+ status = nfs_ok; -+out: -+ return status; -+} -+ -+static __be32 -+nfsd4_getdevlist(struct svc_rqst *rqstp, -+ struct nfsd4_compound_state *cstate, -+ struct nfsd4_pnfs_getdevlist *gdlp) -+{ -+ struct super_block *sb; -+ struct svc_fh *current_fh = &cstate->current_fh; -+ int status; -+ -+ dprintk("%s: type %u maxdevices %u cookie %llu verf %llu\n", -+ __func__, gdlp->gd_layout_type, gdlp->gd_maxdevices, -+ gdlp->gd_cookie, gdlp->gd_verf); -+ -+ -+ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); -+ if (status) -+ goto out; -+ -+ status = nfserr_inval; -+ sb = current_fh->fh_dentry->d_inode->i_sb; -+ if (!sb) -+ goto out; -+ -+ /* We must be able to encode at list one device */ -+ if (!gdlp->gd_maxdevices) -+ goto out; -+ -+ /* Ensure underlying file system supports pNFS and, -+ * if so, the requested layout type -+ */ -+ status = nfsd4_layout_verify(sb, current_fh->fh_export, -+ gdlp->gd_layout_type); -+ if (status) -+ goto out; -+ -+ /* Do nothing if underlying file system does not support -+ * getdevicelist */ -+ if (!sb->s_pnfs_op->get_device_iter) { -+ status = nfserr_notsupp; -+ goto out; -+ } -+ -+ /* Set up arguments so device can be retrieved at encode time */ -+ gdlp->gd_fhp = &cstate->current_fh; -+out: -+ return status; -+} -+ -+static __be32 -+nfsd4_getdevinfo(struct svc_rqst *rqstp, -+ struct nfsd4_compound_state *cstate, -+ struct nfsd4_pnfs_getdevinfo *gdp) -+{ -+ struct super_block *sb; -+ int status; -+ clientid_t clid; -+ -+ dprintk("%s: layout_type %u dev_id %llx:%llx maxcnt %u\n", -+ __func__, gdp->gd_layout_type, gdp->gd_devid.sbid, -+ gdp->gd_devid.devid, gdp->gd_maxcount); -+ -+ status = nfserr_inval; -+ sb = 
find_sbid_id(gdp->gd_devid.sbid); -+ dprintk("%s: sb %p\n", __func__, sb); -+ if (!sb) { -+ status = nfserr_noent; -+ goto out; -+ } -+ -+ /* Ensure underlying file system supports pNFS and, -+ * if so, the requested layout type -+ */ -+ status = nfsd4_layout_verify(sb, NULL, gdp->gd_layout_type); -+ if (status) -+ goto out; -+ -+ /* Set up arguments so device can be retrieved at encode time */ -+ gdp->gd_sb = sb; -+ -+ /* Update notifications */ -+ copy_clientid(&clid, cstate->session); -+ pnfs_set_device_notify(&clid, gdp->gd_notify_types); -+out: -+ return status; -+} -+ -+static __be32 -+nfsd4_layoutget(struct svc_rqst *rqstp, -+ struct nfsd4_compound_state *cstate, -+ struct nfsd4_pnfs_layoutget *lgp) -+{ -+ int status; -+ struct super_block *sb; -+ struct svc_fh *current_fh = &cstate->current_fh; -+ -+ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); -+ if (status) -+ goto out; -+ -+ status = nfserr_inval; -+ sb = current_fh->fh_dentry->d_inode->i_sb; -+ if (!sb) -+ goto out; -+ -+ /* Ensure underlying file system supports pNFS and, -+ * if so, the requested layout type -+ */ -+ status = nfsd4_layout_verify(sb, current_fh->fh_export, -+ lgp->lg_seg.layout_type); -+ if (status) -+ goto out; -+ -+ status = nfserr_badiomode; -+ if (lgp->lg_seg.iomode != IOMODE_READ && -+ lgp->lg_seg.iomode != IOMODE_RW) { -+ dprintk("pNFS %s: invalid iomode %d\n", __func__, -+ lgp->lg_seg.iomode); -+ goto out; -+ } -+ -+ /* Set up arguments so layout can be retrieved at encode time */ -+ lgp->lg_fhp = current_fh; -+ copy_clientid((clientid_t *)&lgp->lg_seg.clientid, cstate->session); -+ status = nfs_ok; -+out: -+ return status; -+} -+ -+static __be32 -+nfsd4_layoutcommit(struct svc_rqst *rqstp, -+ struct nfsd4_compound_state *cstate, -+ struct nfsd4_pnfs_layoutcommit *lcp) -+{ -+ int status; -+ struct inode *ino = NULL; -+ struct iattr ia; -+ struct super_block *sb; -+ struct svc_fh *current_fh = &cstate->current_fh; -+ -+ dprintk("NFSD: nfsd4_layoutcommit \n"); -+ status 
= fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); -+ if (status) -+ goto out; -+ -+ status = nfserr_inval; -+ ino = current_fh->fh_dentry->d_inode; -+ if (!ino) -+ goto out; -+ -+ status = nfserr_inval; -+ sb = ino->i_sb; -+ if (!sb) -+ goto out; -+ -+ /* Ensure underlying file system supports pNFS and, -+ * if so, the requested layout type -+ */ -+ status = nfsd4_layout_verify(sb, current_fh->fh_export, -+ lcp->args.lc_seg.layout_type); -+ if (status) -+ goto out; -+ -+ /* This will only extend the file length. Do a quick -+ * check to see if there is any point in waiting for the update -+ * locks. -+ * TODO: Is this correct for all back ends? -+ */ -+ dprintk("%s:new offset: %d new size: %llu old size: %lld\n", -+ __func__, lcp->args.lc_newoffset, lcp->args.lc_last_wr + 1, -+ ino->i_size); -+ -+ /* Set clientid from sessionid */ -+ copy_clientid((clientid_t *)&lcp->args.lc_seg.clientid, cstate->session); -+ lcp->res.lc_size_chg = 0; -+ if (sb->s_pnfs_op->layout_commit) { -+ status = sb->s_pnfs_op->layout_commit(ino, &lcp->args, &lcp->res); -+ dprintk("%s:layout_commit result %d\n", __func__, status); -+ } else { -+ fh_lock(current_fh); -+ if ((lcp->args.lc_newoffset == 0) || -+ ((lcp->args.lc_last_wr + 1) <= ino->i_size)) { -+ status = 0; -+ lcp->res.lc_size_chg = 0; -+ fh_unlock(current_fh); -+ goto out; -+ } -+ -+ /* Try our best to update the file size */ -+ dprintk("%s: Modifying file size\n", __func__); -+ ia.ia_valid = ATTR_SIZE; -+ ia.ia_size = lcp->args.lc_last_wr + 1; -+ status = notify_change(current_fh->fh_dentry, &ia); -+ fh_unlock(current_fh); -+ dprintk("%s:notify_change result %d\n", __func__, status); -+ } -+ -+ if (!status && lcp->res.lc_size_chg && -+ EX_ISSYNC(current_fh->fh_export)) { -+ dprintk("%s: Synchronously writing inode size %llu\n", -+ __func__, ino->i_size); -+ write_inode_now(ino, 1); -+ lcp->res.lc_newsize = i_size_read(ino); -+ } -+out: -+ return status; -+} -+ -+static __be32 -+nfsd4_layoutreturn(struct svc_rqst *rqstp, -+ struct 
nfsd4_compound_state *cstate, -+ struct nfsd4_pnfs_layoutreturn *lrp) -+{ -+ int status; -+ struct super_block *sb; -+ struct svc_fh *current_fh = &cstate->current_fh; -+ -+ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); -+ if (status) -+ goto out; -+ -+ status = nfserr_inval; -+ sb = current_fh->fh_dentry->d_inode->i_sb; -+ if (!sb) -+ goto out; -+ -+ /* Ensure underlying file system supports pNFS and, -+ * if so, the requested layout type -+ */ -+ status = nfsd4_layout_verify(sb, current_fh->fh_export, -+ lrp->args.lr_seg.layout_type); -+ if (status) -+ goto out; -+ -+ status = nfserr_inval; -+ if (lrp->args.lr_return_type != RETURN_FILE && -+ lrp->args.lr_return_type != RETURN_FSID && -+ lrp->args.lr_return_type != RETURN_ALL) { -+ dprintk("pNFS %s: invalid return_type %d\n", __func__, -+ lrp->args.lr_return_type); -+ goto out; -+ } -+ -+ status = nfserr_inval; -+ if (lrp->args.lr_seg.iomode != IOMODE_READ && -+ lrp->args.lr_seg.iomode != IOMODE_RW && -+ lrp->args.lr_seg.iomode != IOMODE_ANY) { -+ dprintk("pNFS %s: invalid iomode %d\n", __func__, -+ lrp->args.lr_seg.iomode); -+ goto out; -+ } -+ -+ /* Set clientid from sessionid */ -+ copy_clientid((clientid_t *)&lrp->args.lr_seg.clientid, cstate->session); -+ lrp->lrs_present = (lrp->args.lr_return_type == RETURN_FILE); -+ status = nfs4_pnfs_return_layout(sb, current_fh, lrp); -+out: -+ dprintk("pNFS %s: status %d return_type 0x%x lrs_present %d\n", -+ __func__, status, lrp->args.lr_return_type, lrp->lrs_present); -+ return status; -+} -+#endif /* CONFIG_PNFSD */ -+ - /* - * NULL call. 
- */ -@@ -1317,6 +1688,29 @@ static struct nfsd4_operation nfsd4_ops[] = { - .op_flags = ALLOWED_WITHOUT_FH, - .op_name = "OP_RECLAIM_COMPLETE", - }, -+#if defined(CONFIG_PNFSD) -+ [OP_GETDEVICELIST] = { -+ .op_func = (nfsd4op_func)nfsd4_getdevlist, -+ .op_name = "OP_GETDEVICELIST", -+ }, -+ [OP_GETDEVICEINFO] = { -+ .op_func = (nfsd4op_func)nfsd4_getdevinfo, -+ .op_flags = ALLOWED_WITHOUT_FH, -+ .op_name = "OP_GETDEVICEINFO", -+ }, -+ [OP_LAYOUTGET] = { -+ .op_func = (nfsd4op_func)nfsd4_layoutget, -+ .op_name = "OP_LAYOUTGET", -+ }, -+ [OP_LAYOUTCOMMIT] = { -+ .op_func = (nfsd4op_func)nfsd4_layoutcommit, -+ .op_name = "OP_LAYOUTCOMMIT", -+ }, -+ [OP_LAYOUTRETURN] = { -+ .op_func = (nfsd4op_func)nfsd4_layoutreturn, -+ .op_name = "OP_LAYOUTRETURN", -+ }, -+#endif /* CONFIG_PNFSD */ - }; - - static const char *nfsd4_op_name(unsigned opnum) -diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c -index cf0d2ff..770b494 100644 ---- a/fs/nfsd/nfs4state.c -+++ b/fs/nfsd/nfs4state.c -@@ -42,6 +42,8 @@ - #include "xdr4.h" - #include "vfs.h" - -+#include "pnfsd.h" -+ - #define NFSDDBG_FACILITY NFSDDBG_PROC - - /* Globals */ -@@ -59,8 +61,6 @@ static u64 current_sessionid = 1; - #define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) - - /* forward declarations */ --static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); --static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); - static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; - static void nfs4_set_recdir(char *recdir); - -@@ -68,6 +68,7 @@ static void nfs4_set_recdir(char *recdir); - - /* Currently used for almost all code touching nfsv4 state: */ - static DEFINE_MUTEX(client_mutex); -+struct task_struct *client_mutex_owner; - - /* - * Currently used for the del_recall_lru and file hash table. 
In an -@@ -85,11 +86,21 @@ void - nfs4_lock_state(void) - { - mutex_lock(&client_mutex); -+ client_mutex_owner = current; -+} -+ -+#define BUG_ON_UNLOCKED_STATE() BUG_ON(client_mutex_owner != current) -+ -+void -+nfs4_bug_on_unlocked_state(void) -+{ -+ BUG_ON(client_mutex_owner != current); - } - - void - nfs4_unlock_state(void) - { -+ client_mutex_owner = NULL; - mutex_unlock(&client_mutex); - } - -@@ -108,7 +119,7 @@ opaque_hashval(const void *ptr, int nbytes) - - static struct list_head del_recall_lru; - --static inline void -+inline void - put_nfs4_file(struct nfs4_file *fi) - { - if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) { -@@ -119,7 +130,7 @@ put_nfs4_file(struct nfs4_file *fi) - } - } - --static inline void -+inline void - get_nfs4_file(struct nfs4_file *fi) - { - atomic_inc(&fi->fi_ref); -@@ -179,10 +190,16 @@ static void nfs4_file_get_access(struct nfs4_file *fp, int oflag) - - static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag) - { -- if (fp->fi_fds[oflag]) { -- fput(fp->fi_fds[oflag]); -- fp->fi_fds[oflag] = NULL; -- } -+ struct file *fd = fp->fi_fds[oflag]; -+ -+ if (!fd) -+ return; -+ -+ fp->fi_fds[oflag] = NULL; -+ BUG_ON_UNLOCKED_STATE(); -+ nfs4_unlock_state(); /* allow nested layout recall/return */ -+ fput(fd); -+ nfs4_lock_state(); - } - - static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag) -@@ -308,8 +325,8 @@ static DEFINE_SPINLOCK(client_lock); - * reclaim_str_hashtbl[] holds known client info from previous reset/reboot - * used in reboot/reset lease grace period processing - * -- * conf_id_hashtbl[], and conf_str_hashtbl[] hold confirmed -- * setclientid_confirmed info. -+ * conf_id_hashtbl[], and conf_str_hashtbl[] hold -+ * confirmed setclientid_confirmed info. - * - * unconf_str_hastbl[] and unconf_id_hashtbl[] hold unconfirmed - * setclientid info. 
-@@ -334,6 +351,7 @@ static void unhash_generic_stateid(struct nfs4_stateid *stp) - list_del(&stp->st_hash); - list_del(&stp->st_perfile); - list_del(&stp->st_perstateowner); -+ release_pnfs_ds_dev_list(stp); - } - - static void free_generic_stateid(struct nfs4_stateid *stp) -@@ -856,6 +874,8 @@ expire_client(struct nfs4_client *clp) - struct nfs4_delegation *dp; - struct list_head reaplist; - -+ BUG_ON_UNLOCKED_STATE(); -+ - INIT_LIST_HEAD(&reaplist); - spin_lock(&recall_lock); - while (!list_empty(&clp->cl_delegations)) { -@@ -875,6 +895,7 @@ expire_client(struct nfs4_client *clp) - sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); - release_openowner(sop); - } -+ pnfs_expire_client(clp); - nfsd4_set_callback_client(clp, NULL); - if (clp->cl_cb_conn.cb_xprt) - svc_xprt_put(clp->cl_cb_conn.cb_xprt); -@@ -887,6 +908,13 @@ expire_client(struct nfs4_client *clp) - spin_unlock(&client_lock); - } - -+void expire_client_lock(struct nfs4_client *clp) -+{ -+ nfs4_lock_state(); -+ expire_client(clp); -+ nfs4_unlock_state(); -+} -+ - static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) - { - memcpy(target->cl_verifier.data, source->data, -@@ -976,6 +1004,11 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, - INIT_LIST_HEAD(&clp->cl_strhash); - INIT_LIST_HEAD(&clp->cl_openowners); - INIT_LIST_HEAD(&clp->cl_delegations); -+#if defined(CONFIG_PNFSD) -+ INIT_LIST_HEAD(&clp->cl_layouts); -+ INIT_LIST_HEAD(&clp->cl_layoutrecalls); -+ atomic_set(&clp->cl_deviceref, 0); -+#endif /* CONFIG_PNFSD */ - INIT_LIST_HEAD(&clp->cl_sessions); - INIT_LIST_HEAD(&clp->cl_lru); - clp->cl_time = get_seconds(); -@@ -1025,7 +1058,7 @@ move_to_confirmed(struct nfs4_client *clp) - renew_client(clp); - } - --static struct nfs4_client * -+struct nfs4_client * - find_confirmed_client(clientid_t *clid) - { - struct nfs4_client *clp; -@@ -1095,6 +1128,24 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int 
hashval, - return NULL; - } - -+int -+filter_confirmed_clients(int (* func)(struct nfs4_client *, void *), -+ void *arg) -+{ -+ struct nfs4_client *clp, *next; -+ int i, status = 0; -+ -+ for (i = 0; i < CLIENT_HASH_SIZE; i++) -+ list_for_each_entry_safe (clp, next, &conf_str_hashtbl[i], -+ cl_strhash) { -+ status = func(clp, arg); -+ if (status) -+ break; -+ } -+ -+ return status; -+} -+ - static void - gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid) - { -@@ -1227,8 +1278,12 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, - static void - nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid) - { -- /* pNFS is not supported */ -+#if defined(CONFIG_PNFSD) -+ new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS | -+ EXCHGID4_FLAG_USE_PNFS_DS; -+#else /* CONFIG_PNFSD */ - new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS; -+#endif /* CONFIG_PNFSD */ - - /* Referrals are supported, Migration is not. */ - new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER; -@@ -1418,6 +1473,13 @@ nfsd4_create_session(struct svc_rqst *rqstp, - struct nfsd4_clid_slot *cs_slot = NULL; - int status = 0; - -+#if defined(CONFIG_PNFSD_LOCAL_EXPORT) -+ /* XXX hack to get local ip address */ -+ memcpy(&pnfsd_lexp_addr, &rqstp->rq_xprt->xpt_local, -+ sizeof(pnfsd_lexp_addr)); -+ pnfs_lexp_addr_len = rqstp->rq_xprt->xpt_locallen; -+#endif /* CONFIG_PNFSD_LOCAL_EXPORT */ -+ - nfs4_lock_state(); - unconf = find_unconfirmed_client(&cr_ses->clientid); - conf = find_confirmed_client(&cr_ses->clientid); -@@ -1457,25 +1519,26 @@ nfsd4_create_session(struct svc_rqst *rqstp, - cs_slot->sl_seqid++; /* from 0 to 1 */ - move_to_confirmed(unconf); - -- if (cr_ses->flags & SESSION4_BACK_CHAN) { -- unconf->cl_cb_conn.cb_xprt = rqstp->rq_xprt; -- svc_xprt_get(rqstp->rq_xprt); -- rpc_copy_addr( -- (struct sockaddr *)&unconf->cl_cb_conn.cb_addr, -- sa); -- unconf->cl_cb_conn.cb_addrlen = svc_addr_len(sa); -- 
unconf->cl_cb_conn.cb_minorversion = -- cstate->minorversion; -- unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog; -- unconf->cl_cb_seq_nr = 1; -- nfsd4_probe_callback(unconf, &unconf->cl_cb_conn); -- } -+ if (is_ds_only_session(unconf->cl_exchange_flags)) -+ cr_ses->flags &= ~SESSION4_BACK_CHAN; -+ - conf = unconf; - } else { - status = nfserr_stale_clientid; - goto out; - } - -+ if (cr_ses->flags & SESSION4_BACK_CHAN) { -+ conf->cl_cb_conn.cb_xprt = rqstp->rq_xprt; -+ svc_xprt_get(rqstp->rq_xprt); -+ rpc_copy_addr((struct sockaddr *)&conf->cl_cb_conn.cb_addr, sa); -+ conf->cl_cb_conn.cb_addrlen = svc_addr_len(sa); -+ conf->cl_cb_conn.cb_minorversion = cstate->minorversion; -+ conf->cl_cb_conn.cb_prog = cr_ses->callback_prog; -+ conf->cl_cb_seq_nr = 1; -+ nfsd4_probe_callback(conf, &conf->cl_cb_conn); -+ } -+ - /* - * We do not support RDMA or persistent sessions - */ -@@ -1863,7 +1926,7 @@ out: - - /* OPEN Share state helper functions */ - static inline struct nfs4_file * --alloc_init_file(struct inode *ino) -+alloc_init_file(struct inode *ino, struct svc_fh *current_fh) - { - struct nfs4_file *fp; - unsigned int hashval = file_hashval(ino); -@@ -1879,6 +1942,16 @@ alloc_init_file(struct inode *ino) - fp->fi_had_conflict = false; - memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); - memset(fp->fi_access, 0, sizeof(fp->fi_access)); -+#if defined(CONFIG_PNFSD) -+ INIT_LIST_HEAD(&fp->fi_layouts); -+ INIT_LIST_HEAD(&fp->fi_layout_states); -+ fp->fi_fsid.major = current_fh->fh_export->ex_fsid; -+ fp->fi_fsid.minor = 0; -+ fp->fi_fhlen = current_fh->fh_handle.fh_size; -+ BUG_ON(fp->fi_fhlen > sizeof(fp->fi_fhval)); -+ memcpy(fp->fi_fhval, ¤t_fh->fh_handle.fh_base, -+ fp->fi_fhlen); -+#endif /* CONFIG_PNFSD */ - spin_lock(&recall_lock); - list_add(&fp->fi_hash, &file_hashtbl[hashval]); - spin_unlock(&recall_lock); -@@ -1887,7 +1960,7 @@ alloc_init_file(struct inode *ino) - return NULL; - } - --static void -+void - nfsd4_free_slab(struct kmem_cache **slab) - { - if (*slab == 
NULL) -@@ -1903,6 +1976,7 @@ nfsd4_free_slabs(void) - nfsd4_free_slab(&file_slab); - nfsd4_free_slab(&stateid_slab); - nfsd4_free_slab(&deleg_slab); -+ nfsd4_free_pnfs_slabs(); - } - - static int -@@ -1924,6 +1998,8 @@ nfsd4_init_slabs(void) - sizeof(struct nfs4_delegation), 0, 0, NULL); - if (deleg_slab == NULL) - goto out_nomem; -+ if (nfsd4_init_pnfs_slabs()) -+ goto out_nomem; - return 0; - out_nomem: - nfsd4_free_slabs(); -@@ -1997,6 +2073,9 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open * - INIT_LIST_HEAD(&stp->st_perstateowner); - INIT_LIST_HEAD(&stp->st_lockowners); - INIT_LIST_HEAD(&stp->st_perfile); -+#if defined(CONFIG_PNFSD) -+ INIT_LIST_HEAD(&stp->st_pnfs_ds_id); -+#endif /* CONFIG_PNFSD */ - list_add(&stp->st_hash, &stateid_hashtbl[hashval]); - list_add(&stp->st_perstateowner, &sop->so_stateids); - list_add(&stp->st_perfile, &fp->fi_stateids); -@@ -2038,6 +2117,7 @@ find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open) - { - struct nfs4_stateowner *so = NULL; - -+ BUG_ON_UNLOCKED_STATE(); - list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) { - if (same_owner_str(so, &open->op_owner, &open->op_clientid)) - return so; -@@ -2046,7 +2126,7 @@ find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open) - } - - /* search file_hashtbl[] for file */ --static struct nfs4_file * -+struct nfs4_file * - find_file(struct inode *ino) - { - unsigned int hashval = file_hashval(ino); -@@ -2064,6 +2144,18 @@ find_file(struct inode *ino) - return NULL; - } - -+struct nfs4_file * -+find_alloc_file(struct inode *ino, struct svc_fh *current_fh) -+{ -+ struct nfs4_file *fp; -+ -+ fp = find_file(ino); -+ if (fp) -+ return fp; -+ -+ return alloc_init_file(ino, current_fh); -+} -+ - static inline int access_valid(u32 x, u32 minorversion) - { - if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ) -@@ -2592,7 +2684,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct 
nf - if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) - goto out; - status = nfserr_resource; -- fp = alloc_init_file(ino); -+ fp = alloc_init_file(ino, current_fh); - if (fp == NULL) - goto out; - } -@@ -2813,7 +2905,7 @@ nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp) - return fhp->fh_dentry->d_inode != stp->st_file->fi_inode; - } - --static int -+int - STALE_STATEID(stateid_t *stateid) - { - if (stateid->si_boot == boot_time) -@@ -2823,6 +2915,16 @@ STALE_STATEID(stateid_t *stateid) - return 1; - } - -+__be32 -+nfs4_check_stateid(stateid_t *stateid) -+{ -+ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) -+ return nfserr_bad_stateid; -+ if (STALE_STATEID(stateid)) -+ return nfserr_stale_stateid; -+ return 0; -+} -+ - static inline int - access_permit_read(unsigned long access_bmap) - { -@@ -2934,6 +3036,24 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, - if (grace_disallows_io(ino)) - return nfserr_grace; - -+#if defined(CONFIG_PNFSD) -+ if (pnfs_fh_is_ds(¤t_fh->fh_handle)) { -+ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) -+ status = nfserr_bad_stateid; -+ else -+#ifdef CONFIG_GFS2_FS_LOCKING_DLM -+ { -+ dprintk("%s Don't check DS stateid\n", __func__); -+ return 0; -+ } -+#else /* CONFIG_GFS2_FS_LOCKING_DLM */ -+ status = nfs4_preprocess_pnfs_ds_stateid(current_fh, -+ stateid); -+#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ -+ goto out; -+ } -+#endif /* CONFIG_PNFSD */ -+ - if (nfsd4_has_session(cstate)) - flags |= HAS_SESSION; - -@@ -3015,13 +3135,9 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, - *stpp = NULL; - *sopp = NULL; - -- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { -- dprintk("NFSD: preprocess_seqid_op: magic stateid!\n"); -- return nfserr_bad_stateid; -- } -- -- if (STALE_STATEID(stateid)) -- return nfserr_stale_stateid; -+ status = nfs4_check_stateid(stateid); -+ if (status) -+ return status; - - if (nfsd4_has_session(cstate)) - flags |= HAS_SESSION; -@@ -3295,11 
+3411,8 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, - if (nfsd4_has_session(cstate)) - flags |= HAS_SESSION; - nfs4_lock_state(); -- status = nfserr_bad_stateid; -- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) -- goto out; -- status = nfserr_stale_stateid; -- if (STALE_STATEID(stateid)) -+ status = nfs4_check_stateid(stateid); -+ if (status) - goto out; - status = nfserr_bad_stateid; - if (!is_delegation_stateid(stateid)) -@@ -3328,26 +3441,6 @@ out: - #define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS) - #define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1) - --static inline u64 --end_offset(u64 start, u64 len) --{ -- u64 end; -- -- end = start + len; -- return end >= start ? end: NFS4_MAX_UINT64; --} -- --/* last octet in a range */ --static inline u64 --last_byte_offset(u64 start, u64 len) --{ -- u64 end; -- -- BUG_ON(!len); -- end = start + len; -- return end > start ? end - 1: NFS4_MAX_UINT64; --} -- - #define lockownerid_hashval(id) \ - ((id) & LOCK_HASH_MASK) - -@@ -3364,7 +3457,7 @@ static struct list_head lock_ownerid_hashtbl[LOCK_HASH_SIZE]; - static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; - static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE]; - --static struct nfs4_stateid * -+struct nfs4_stateid * - find_stateid(stateid_t *stid, int flags) - { - struct nfs4_stateid *local; -@@ -3393,7 +3486,7 @@ find_stateid(stateid_t *stid, int flags) - return NULL; - } - --static struct nfs4_delegation * -+struct nfs4_delegation * - find_delegation_stateid(struct inode *ino, stateid_t *stid) - { - struct nfs4_file *fp; -@@ -3524,6 +3617,9 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc - INIT_LIST_HEAD(&stp->st_perfile); - INIT_LIST_HEAD(&stp->st_perstateowner); - INIT_LIST_HEAD(&stp->st_lockowners); /* not used */ -+#if defined(CONFIG_PNFSD) -+ INIT_LIST_HEAD(&stp->st_pnfs_ds_id); -+#endif /* CONFIG_PNFSD */ - list_add(&stp->st_hash, &lockstateid_hashtbl[hashval]); - 
list_add(&stp->st_perfile, &fp->fi_stateids); - list_add(&stp->st_perstateowner, &sop->so_stateids); -@@ -4100,6 +4196,9 @@ nfs4_state_init(void) - INIT_LIST_HEAD(&client_lru); - INIT_LIST_HEAD(&del_recall_lru); - reclaim_str_hashtbl_size = 0; -+#if defined(CONFIG_PNFSD) -+ nfs4_pnfs_state_init(); -+#endif /* CONFIG_PNFSD */ - return 0; - } - -@@ -4204,6 +4303,7 @@ __nfs4_state_shutdown(void) - } - - nfsd4_shutdown_recdir(); -+ nfs4_pnfs_state_shutdown(); - } - - void -diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c -index 1a468bb..b93906b 100644 ---- a/fs/nfsd/nfs4xdr.c -+++ b/fs/nfsd/nfs4xdr.c -@@ -47,9 +47,14 @@ - #include - #include - #include -+#include -+#include -+#include -+#include - - #include "xdr4.h" - #include "vfs.h" -+#include "pnfsd.h" - - #define NFSDDBG_FACILITY NFSDDBG_XDR - -@@ -1244,6 +1249,138 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str - DECODE_TAIL; - } - -+#if defined(CONFIG_PNFSD) -+static __be32 -+nfsd4_decode_getdevlist(struct nfsd4_compoundargs *argp, -+ struct nfsd4_pnfs_getdevlist *gdevl) -+{ -+ DECODE_HEAD; -+ -+ READ_BUF(16 + sizeof(nfs4_verifier)); -+ READ32(gdevl->gd_layout_type); -+ READ32(gdevl->gd_maxdevices); -+ READ64(gdevl->gd_cookie); -+ COPYMEM(&gdevl->gd_verf, sizeof(nfs4_verifier)); -+ -+ DECODE_TAIL; -+} -+ -+static __be32 -+nfsd4_decode_getdevinfo(struct nfsd4_compoundargs *argp, -+ struct nfsd4_pnfs_getdevinfo *gdev) -+{ -+ u32 num; -+ DECODE_HEAD; -+ -+ READ_BUF(12 + sizeof(struct nfsd4_pnfs_deviceid)); -+ READ64(gdev->gd_devid.sbid); -+ READ64(gdev->gd_devid.devid); -+ READ32(gdev->gd_layout_type); -+ READ32(gdev->gd_maxcount); -+ READ32(num); -+ if (num) { -+ READ_BUF(4); -+ READ32(gdev->gd_notify_types); -+ } else { -+ gdev->gd_notify_types = 0; -+ } -+ -+ DECODE_TAIL; -+} -+ -+static __be32 -+nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp, -+ struct nfsd4_pnfs_layoutget *lgp) -+{ -+ DECODE_HEAD; -+ -+ READ_BUF(36); -+ READ32(lgp->lg_signal); -+ 
READ32(lgp->lg_seg.layout_type); -+ READ32(lgp->lg_seg.iomode); -+ READ64(lgp->lg_seg.offset); -+ READ64(lgp->lg_seg.length); -+ READ64(lgp->lg_minlength); -+ nfsd4_decode_stateid(argp, &lgp->lg_sid); -+ READ_BUF(4); -+ READ32(lgp->lg_maxcount); -+ -+ DECODE_TAIL; -+} -+ -+static __be32 -+nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp, -+ struct nfsd4_pnfs_layoutcommit *lcp) -+{ -+ DECODE_HEAD; -+ u32 timechange; -+ -+ READ_BUF(20); -+ READ64(lcp->args.lc_seg.offset); -+ READ64(lcp->args.lc_seg.length); -+ READ32(lcp->args.lc_reclaim); -+ nfsd4_decode_stateid(argp, &lcp->lc_sid); -+ READ_BUF(4); -+ READ32(lcp->args.lc_newoffset); -+ if (lcp->args.lc_newoffset) { -+ READ_BUF(8); -+ READ64(lcp->args.lc_last_wr); -+ } else -+ lcp->args.lc_last_wr = 0; -+ READ_BUF(4); -+ READ32(timechange); -+ if (timechange) { -+ READ_BUF(12); -+ READ64(lcp->args.lc_mtime.seconds); -+ READ32(lcp->args.lc_mtime.nseconds); -+ } else { -+ lcp->args.lc_mtime.seconds = 0; -+ lcp->args.lc_mtime.nseconds = 0; -+ } -+ READ_BUF(8); -+ READ32(lcp->args.lc_seg.layout_type); -+ /* XXX: saving XDR'ed layout update. Since we don't have the -+ * current_fh yet, and therefore no export_ops, we can't call -+ * the layout specific decode routines. File and pVFS2 -+ * do not use the layout update.... 
-+ */ -+ READ32(lcp->args.lc_up_len); -+ if (lcp->args.lc_up_len > 0) { -+ READ_BUF(lcp->args.lc_up_len); -+ READMEM(lcp->args.lc_up_layout, lcp->args.lc_up_len); -+ } -+ -+ DECODE_TAIL; -+} -+ -+static __be32 -+nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp, -+ struct nfsd4_pnfs_layoutreturn *lrp) -+{ -+ DECODE_HEAD; -+ -+ READ_BUF(16); -+ READ32(lrp->args.lr_reclaim); -+ READ32(lrp->args.lr_seg.layout_type); -+ READ32(lrp->args.lr_seg.iomode); -+ READ32(lrp->args.lr_return_type); -+ if (lrp->args.lr_return_type == RETURN_FILE) { -+ READ_BUF(16); -+ READ64(lrp->args.lr_seg.offset); -+ READ64(lrp->args.lr_seg.length); -+ nfsd4_decode_stateid(argp, &lrp->lr_sid); -+ READ_BUF(4); -+ READ32(lrp->args.lrf_body_len); -+ if (lrp->args.lrf_body_len > 0) { -+ READ_BUF(lrp->args.lrf_body_len); -+ READMEM(lrp->args.lrf_body, lrp->args.lrf_body_len); -+ } -+ } -+ -+ DECODE_TAIL; -+} -+#endif /* CONFIG_PNFSD */ -+ - static __be32 - nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) - { -@@ -1345,11 +1482,19 @@ static nfsd4_dec nfsd41_dec_ops[] = { - [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, - [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, -+#if defined(CONFIG_PNFSD) -+ [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdevinfo, -+ [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_getdevlist, -+ [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit, -+ [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget, -+ [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn, -+#else /* CONFIG_PNFSD */ - [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, -+#endif /* CONFIG_PNFSD */ - [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_SEQUENCE] = 
(nfsd4_dec)nfsd4_decode_sequence, - [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, -@@ -1805,19 +1950,23 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, - goto out_nfserr; - } - } -- if ((buflen -= 16) < 0) -- goto out_resource; - - if (unlikely(bmval2)) { -+ if ((buflen -= 16) < 0) -+ goto out_resource; - WRITE32(3); - WRITE32(bmval0); - WRITE32(bmval1); - WRITE32(bmval2); - } else if (likely(bmval1)) { -+ if ((buflen -= 12) < 0) -+ goto out_resource; - WRITE32(2); - WRITE32(bmval0); - WRITE32(bmval1); - } else { -+ if ((buflen -= 8) < 0) -+ goto out_resource; - WRITE32(1); - WRITE32(bmval0); - } -@@ -1828,15 +1977,17 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, - u32 word1 = nfsd_suppattrs1(minorversion); - u32 word2 = nfsd_suppattrs2(minorversion); - -- if ((buflen -= 12) < 0) -- goto out_resource; - if (!aclsupport) - word0 &= ~FATTR4_WORD0_ACL; - if (!word2) { -+ if ((buflen -= 12) < 0) -+ goto out_resource; - WRITE32(2); - WRITE32(word0); - WRITE32(word1); - } else { -+ if ((buflen -= 16) < 0) -+ goto out_resource; - WRITE32(3); - WRITE32(word0); - WRITE32(word1); -@@ -2150,6 +2301,36 @@ out_acl: - } - WRITE64(stat.ino); - } -+#if defined(CONFIG_PNFSD) -+ if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) { -+ struct super_block *sb = dentry->d_inode->i_sb; -+ int type = 0; -+ -+ /* Query the filesystem for supported pNFS layout types. -+ * Currently, we only support one layout type per file system. -+ * The export_ops->layout_type() returns the pnfs_layouttype4. 
-+ */ -+ buflen -= 4; -+ if (buflen < 0) /* length */ -+ goto out_resource; -+ -+ if (sb && sb->s_pnfs_op && sb->s_pnfs_op->layout_type) -+ type = sb->s_pnfs_op->layout_type(sb); -+ if (type) { -+ if ((buflen -= 4) < 0) /* type */ -+ goto out_resource; -+ WRITE32(1); /* length */ -+ WRITE32(type); /* type */ -+ } else -+ WRITE32(0); /* length */ -+ } -+ -+ if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) { -+ if ((buflen -= 4) < 0) -+ goto out_resource; -+ WRITE32(stat.blksize); -+ } -+#endif /* CONFIG_PNFSD */ - if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { - WRITE32(3); - WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0); -@@ -2380,6 +2561,10 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ - if (!nfserr) { - RESERVE_SPACE(8); - WRITEMEM(commit->co_verf.data, 8); -+ dprintk("NFSD: nfsd4_encode_commit: verifier %x:%x\n", -+ ((u32 *)(&commit->co_verf.data))[0], -+ ((u32 *)(&commit->co_verf.data))[1]); -+ - ADJUST_ARGS(); - } - return nfserr; -@@ -2634,6 +2819,13 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, - } - read->rd_vlen = v; - -+#if defined(CONFIG_SPNFS) -+ if (spnfs_enabled()) -+ nfserr = spnfs_read(read->rd_fhp->fh_dentry->d_inode, -+ read->rd_offset, &maxcount, read->rd_vlen, -+ resp->rqstp); -+ else /* we're not an MDS */ -+#endif /* CONFIG_SPNFS */ - nfserr = nfsd_read_file(read->rd_rqstp, read->rd_fhp, read->rd_filp, - read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, - &maxcount); -@@ -2940,6 +3132,9 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w - WRITE32(write->wr_bytes_written); - WRITE32(write->wr_how_written); - WRITEMEM(write->wr_verifier.data, 8); -+ dprintk("NFSD: nfsd4_encode_write: verifier %x:%x\n", -+ ((u32 *)(&write->wr_verifier.data))[0], -+ ((u32 *)(&write->wr_verifier.data))[1]); - ADJUST_ARGS(); - } - return nfserr; -@@ -3083,6 +3278,343 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr, - return 0; - } - -+#if defined(CONFIG_PNFSD) -+ 
-+/* Uses the export interface to iterate through the available devices -+ * and encodes them on the response stream. -+ */ -+static __be32 -+nfsd4_encode_devlist_iterator(struct nfsd4_compoundres *resp, -+ struct nfsd4_pnfs_getdevlist *gdevl, -+ unsigned int *dev_count) -+{ -+ struct super_block *sb = gdevl->gd_fhp->fh_dentry->d_inode->i_sb; -+ __be32 nfserr; -+ int status; -+ __be32 *p; -+ struct nfsd4_pnfs_dev_iter_res res = { -+ .gd_cookie = gdevl->gd_cookie, -+ .gd_verf = gdevl->gd_verf, -+ .gd_eof = 0 -+ }; -+ u64 sbid; -+ -+ dprintk("%s: Begin\n", __func__); -+ -+ sbid = find_create_sbid(sb); -+ *dev_count = 0; -+ do { -+ status = sb->s_pnfs_op->get_device_iter(sb, -+ gdevl->gd_layout_type, -+ &res); -+ if (status) { -+ if (status == -ENOENT) { -+ res.gd_eof = 1; -+ /* return success */ -+ break; -+ } -+ nfserr = nfserrno(status); -+ goto out_err; -+ } -+ -+ /* Encode device id and layout type */ -+ RESERVE_SPACE(sizeof(struct nfsd4_pnfs_deviceid)); -+ WRITE64((__be64)sbid); -+ WRITE64(res.gd_devid); /* devid minor */ -+ ADJUST_ARGS(); -+ (*dev_count)++; -+ } while (*dev_count < gdevl->gd_maxdevices && !res.gd_eof); -+ gdevl->gd_cookie = res.gd_cookie; -+ gdevl->gd_verf = res.gd_verf; -+ gdevl->gd_eof = res.gd_eof; -+ nfserr = nfs_ok; -+out_err: -+ dprintk("%s: Encoded %u devices\n", __func__, *dev_count); -+ return nfserr; -+} -+ -+/* Encodes the response of get device list. 
-+*/ -+static __be32 -+nfsd4_encode_getdevlist(struct nfsd4_compoundres *resp, __be32 nfserr, -+ struct nfsd4_pnfs_getdevlist *gdevl) -+{ -+ unsigned int dev_count = 0, lead_count; -+ u32 *p_in = resp->p; -+ __be32 *p; -+ -+ dprintk("%s: err %d\n", __func__, nfserr); -+ if (nfserr) -+ return nfserr; -+ -+ /* Ensure we have room for cookie, verifier, and devlist len, -+ * which we will backfill in after we encode as many devices as possible -+ */ -+ lead_count = 8 + sizeof(nfs4_verifier) + 4; -+ RESERVE_SPACE(lead_count); -+ /* skip past these values */ -+ p += XDR_QUADLEN(lead_count); -+ ADJUST_ARGS(); -+ -+ /* Iterate over as many device ids as possible on the xdr stream */ -+ nfserr = nfsd4_encode_devlist_iterator(resp, gdevl, &dev_count); -+ if (nfserr) -+ goto out_err; -+ -+ /* Backfill in cookie, verf and number of devices encoded */ -+ p = p_in; -+ WRITE64(gdevl->gd_cookie); -+ WRITEMEM(&gdevl->gd_verf, sizeof(nfs4_verifier)); -+ WRITE32(dev_count); -+ -+ /* Skip over devices */ -+ p += XDR_QUADLEN(dev_count * sizeof(struct nfsd4_pnfs_deviceid)); -+ ADJUST_ARGS(); -+ -+ /* are we at the end of devices? */ -+ RESERVE_SPACE(4); -+ WRITE32(gdevl->gd_eof); -+ ADJUST_ARGS(); -+ -+ dprintk("%s: done.\n", __func__); -+ -+ nfserr = nfs_ok; -+out: -+ return nfserr; -+out_err: -+ p = p_in; -+ ADJUST_ARGS(); -+ goto out; -+} -+ -+/* For a given device id, have the file system retrieve and encode the -+ * associated device. For file layout, the encoding function is -+ * passed down to the file system. The file system then has the option -+ * of using this encoding function or one of its own. -+ * -+ * Note: the file system must return the XDR size of struct device_addr4 -+ * da_addr_body in pnfs_xdr_info.bytes_written on NFS4ERR_TOOSMALL for the -+ * gdir_mincount calculation. 
-+ */ -+static __be32 -+nfsd4_encode_getdevinfo(struct nfsd4_compoundres *resp, __be32 nfserr, -+ struct nfsd4_pnfs_getdevinfo *gdev) -+{ -+ struct super_block *sb; -+ int maxcount = 0, type_notify_len = 12; -+ __be32 *p, *p_save = NULL, *p_in = resp->p; -+ struct exp_xdr_stream xdr; -+ -+ dprintk("%s: err %d\n", __func__, nfserr); -+ if (nfserr) -+ return nfserr; -+ -+ sb = gdev->gd_sb; -+ -+ if (gdev->gd_maxcount != 0) { -+ /* FIXME: this will be bound by the session max response */ -+ maxcount = svc_max_payload(resp->rqstp); -+ if (maxcount > gdev->gd_maxcount) -+ maxcount = gdev->gd_maxcount; -+ -+ /* Ensure have room for type and notify field */ -+ maxcount -= type_notify_len; -+ if (maxcount < 0) { -+ nfserr = -ETOOSMALL; -+ goto toosmall; -+ } -+ } -+ -+ RESERVE_SPACE(4); -+ WRITE32(gdev->gd_layout_type); -+ ADJUST_ARGS(); -+ -+ /* If maxcount is 0 then just update notifications */ -+ if (gdev->gd_maxcount == 0) -+ goto handle_notifications; -+ -+ xdr.p = p_save = resp->p; -+ xdr.end = resp->end; -+ if (xdr.end - xdr.p > exp_xdr_qwords(maxcount & ~3)) -+ xdr.end = xdr.p + exp_xdr_qwords(maxcount & ~3); -+ -+ nfserr = sb->s_pnfs_op->get_device_info(sb, &xdr, gdev->gd_layout_type, -+ &gdev->gd_devid); -+ if (nfserr) -+ goto err; -+ -+ /* The file system should never write 0 bytes without -+ * returning an error -+ */ -+ BUG_ON(xdr.p == p_save); -+ BUG_ON(xdr.p > xdr.end); -+ -+ /* Update the xdr stream with the number of bytes encoded -+ * by the file system. 
-+ */ -+ p = xdr.p; -+ ADJUST_ARGS(); -+ -+handle_notifications: -+ /* Encode supported device notifications */ -+ RESERVE_SPACE(4); -+ if (sb->s_pnfs_op->set_device_notify) { -+ struct pnfs_devnotify_arg dn_args; -+ -+ dn_args.dn_layout_type = gdev->gd_layout_type; -+ dn_args.dn_devid = gdev->gd_devid; -+ dn_args.dn_notify_types = gdev->gd_notify_types; -+ nfserr = sb->s_pnfs_op->set_device_notify(sb, &dn_args); -+ if (nfserr) -+ goto err; -+ WRITE32(dn_args.dn_notify_types); -+ } else { -+ WRITE32(0); -+ } -+ ADJUST_ARGS(); -+ -+out: -+ return nfserrno(nfserr); -+toosmall: -+ dprintk("%s: maxcount too small\n", __func__); -+ RESERVE_SPACE(4); -+ WRITE32((p_save ? (xdr.p - p_save) * 4 : 0) + type_notify_len); -+ ADJUST_ARGS(); -+ goto out; -+err: -+ /* Rewind to the beginning */ -+ p = p_in; -+ ADJUST_ARGS(); -+ if (nfserr == -ETOOSMALL) -+ goto toosmall; -+ printk(KERN_ERR "%s: export ERROR %d\n", __func__, nfserr); -+ goto out; -+} -+ -+static __be32 -+nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, -+ __be32 nfserr, -+ struct nfsd4_pnfs_layoutget *lgp) -+{ -+ int maxcount, leadcount; -+ struct super_block *sb; -+ struct exp_xdr_stream xdr; -+ __be32 *p, *p_save, *p_start = resp->p; -+ -+ dprintk("%s: err %d\n", __func__, nfserr); -+ if (nfserr) -+ return nfserr; -+ -+ sb = lgp->lg_fhp->fh_dentry->d_inode->i_sb; -+ maxcount = PAGE_SIZE; -+ if (maxcount > lgp->lg_maxcount) -+ maxcount = lgp->lg_maxcount; -+ -+ /* Check for space on xdr stream */ -+ leadcount = 36 + sizeof(stateid_opaque_t); -+ RESERVE_SPACE(leadcount); -+ /* encode layout metadata after file system encodes layout */ -+ p += XDR_QUADLEN(leadcount); -+ ADJUST_ARGS(); -+ -+ /* Ensure have room for ret_on_close, off, len, iomode, type */ -+ maxcount -= leadcount; -+ if (maxcount < 0) { -+ printk(KERN_ERR "%s: buffer too small\n", __func__); -+ nfserr = nfserr_toosmall; -+ goto err; -+ } -+ -+ /* Set xdr info so file system can encode layout */ -+ xdr.p = p_save = resp->p; -+ xdr.end = 
resp->end; -+ if (xdr.end - xdr.p > exp_xdr_qwords(maxcount & ~3)) -+ xdr.end = xdr.p + exp_xdr_qwords(maxcount & ~3); -+ -+ /* Retrieve, encode, and merge layout; process stateid */ -+ nfserr = nfs4_pnfs_get_layout(lgp, &xdr); -+ if (nfserr) -+ goto err; -+ -+ /* Ensure file system returned enough bytes for the client -+ * to access. -+ */ -+ if (lgp->lg_seg.length < lgp->lg_minlength) { -+ nfserr = nfserr_badlayout; -+ goto err; -+ } -+ -+ /* The file system should never write 0 bytes without -+ * returning an error -+ */ -+ BUG_ON(xdr.p == p_save); -+ -+ /* Rewind to beginning and encode attrs */ -+ resp->p = p_start; -+ RESERVE_SPACE(4); -+ WRITE32(lgp->lg_roc); /* return on close */ -+ ADJUST_ARGS(); -+ nfsd4_encode_stateid(resp, &lgp->lg_sid); -+ RESERVE_SPACE(28); -+ /* Note: response logr_layout array count, always one for now */ -+ WRITE32(1); -+ WRITE64(lgp->lg_seg.offset); -+ WRITE64(lgp->lg_seg.length); -+ WRITE32(lgp->lg_seg.iomode); -+ WRITE32(lgp->lg_seg.layout_type); -+ -+ /* Update the xdr stream with the number of bytes written -+ * by the file system -+ */ -+ p = xdr.p; -+ ADJUST_ARGS(); -+ -+ return nfs_ok; -+err: -+ resp->p = p_start; -+ return nfserr; -+} -+ -+static __be32 -+nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, -+ struct nfsd4_pnfs_layoutcommit *lcp) -+{ -+ __be32 *p; -+ -+ if (nfserr) -+ goto out; -+ -+ RESERVE_SPACE(4); -+ WRITE32(lcp->res.lc_size_chg); -+ ADJUST_ARGS(); -+ if (lcp->res.lc_size_chg) { -+ RESERVE_SPACE(8); -+ WRITE64(lcp->res.lc_newsize); -+ ADJUST_ARGS(); -+ } -+out: -+ return nfserr; -+} -+ -+static __be32 -+nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, -+ struct nfsd4_pnfs_layoutreturn *lrp) -+{ -+ __be32 *p; -+ -+ if (nfserr) -+ goto out; -+ -+ RESERVE_SPACE(4); -+ WRITE32(lrp->lrs_present != 0); /* got stateid? 
*/ -+ ADJUST_ARGS(); -+ if (lrp->lrs_present) -+ nfsd4_encode_stateid(resp, &lrp->lr_sid); -+out: -+ return nfserr; -+} -+#endif /* CONFIG_PNFSD */ -+ - static __be32 - nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) - { -@@ -3143,11 +3675,19 @@ static nfsd4_enc nfsd4_enc_ops[] = { - [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, - [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, - [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, -+#if defined(CONFIG_PNFSD) -+ [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdevinfo, -+ [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_getdevlist, -+ [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_layoutcommit, -+ [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget, -+ [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn, -+#else /* CONFIG_PNFSD */ - [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, - [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, -+#endif /* CONFIG_PNFSD */ - [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop, - [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, - [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, -diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c -index b53b1d0..1bbd9c2 100644 ---- a/fs/nfsd/nfsctl.c -+++ b/fs/nfsd/nfsctl.c -@@ -13,10 +13,15 @@ - #include - #include - #include -+#include - - #include "nfsd.h" - #include "cache.h" - -+#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS) -+#include -+#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */ -+ - /* - * We have a single directory with 9 nodes in it. 
- */ -@@ -49,6 +54,9 @@ enum { - NFSD_Gracetime, - NFSD_RecoveryDir, - #endif -+#ifdef CONFIG_PNFSD -+ NFSD_pnfs_dlm_device, -+#endif - }; - - /* -@@ -74,6 +82,9 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size); - static ssize_t write_gracetime(struct file *file, char *buf, size_t size); - static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); - #endif -+#ifdef CONFIG_PNFSD -+static ssize_t write_pnfs_dlm_device(struct file *file, char *buf, size_t size); -+#endif - - static ssize_t (*write_op[])(struct file *, char *, size_t) = { - [NFSD_Svc] = write_svc, -@@ -96,6 +107,9 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = { - [NFSD_Gracetime] = write_gracetime, - [NFSD_RecoveryDir] = write_recoverydir, - #endif -+#ifdef CONFIG_PNFSD -+ [NFSD_pnfs_dlm_device] = write_pnfs_dlm_device, -+#endif - }; - - static ssize_t nfsctl_transaction_write(struct file *file, const char __user *buf, size_t size, loff_t *pos) -@@ -1347,6 +1361,68 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) - - #endif - -+#ifdef CONFIG_PNFSD -+ -+static ssize_t __write_pnfs_dlm_device(struct file *file, char *buf, -+ size_t size) -+{ -+ char *mesg = buf; -+ char *pnfs_dlm_device; -+ int max_size = NFSD_PNFS_DLM_DEVICE_MAX; -+ int len, ret = 0; -+ -+ if (size > 0) { -+ ret = -EINVAL; -+ if (size > max_size || buf[size-1] != '\n') -+ return ret; -+ buf[size-1] = 0; -+ -+ pnfs_dlm_device = mesg; -+ len = qword_get(&mesg, pnfs_dlm_device, size); -+ if (len <= 0) -+ return ret; -+ -+ ret = nfsd4_set_pnfs_dlm_device(pnfs_dlm_device, len); -+ } else -+ return nfsd4_get_pnfs_dlm_device_list(buf, SIMPLE_TRANSACTION_LIMIT); -+ -+ return ret <= 0 ? 
ret : strlen(buf); -+} -+ -+/** -+ * write_pnfs_dlm_device - Set or report the current pNFS data server list -+ * -+ * Input: -+ * buf: ignored -+ * size: zero -+ * -+ * OR -+ * -+ * Input: -+ * buf: C string containing a block device name, -+ * a colon, and then a comma separated -+ * list of pNFS data server IPv4 addresses -+ * size: non-zero length of C string in @buf -+ * Output: -+ * On success: passed-in buffer filled with '\n'-terminated C -+ * string containing a block device name, a colon, and -+ * then a comma separated list of pNFS -+ * data server IPv4 addresses. -+ * return code is the size in bytes of the string -+ * On error: return code is a negative errno value -+ */ -+static ssize_t write_pnfs_dlm_device(struct file *file, char *buf, size_t size) -+{ -+ ssize_t rv; -+ -+ mutex_lock(&nfsd_mutex); -+ rv = __write_pnfs_dlm_device(file, buf, size); -+ mutex_unlock(&nfsd_mutex); -+ return rv; -+} -+ -+#endif /* CONFIG_PNFSD */ -+ - /*----------------------------------------------------------------------------*/ - /* - * populating the filesystem. 
-@@ -1381,6 +1457,10 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) - [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, - [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, - #endif -+#ifdef CONFIG_PNFSD -+ [NFSD_pnfs_dlm_device] = {"pnfs_dlm_device", &transaction_ops, -+ S_IWUSR|S_IRUSR}, -+#endif - /* last one */ {""} - }; - return simple_fill_super(sb, 0x6e667364, nfsd_files); -@@ -1419,6 +1499,9 @@ static int create_proc_exports_entry(void) - } - #endif - -+#if defined(CONFIG_SPNFS_BLOCK) -+int nfsd_bl_init(void); -+#endif - static int __init init_nfsd(void) - { - int retval; -@@ -1441,6 +1524,15 @@ static int __init init_nfsd(void) - retval = create_proc_exports_entry(); - if (retval) - goto out_free_idmap; -+#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS) -+ retval = spnfs_init_proc(); -+ if (retval != 0) -+ goto out_free_idmap; -+#if defined(CONFIG_SPNFS_BLOCK) -+ nfsd_bl_init(); -+#endif /* CONFIG_SPNFS_BLOCK */ -+#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */ -+ - retval = register_filesystem(&nfsd_fs_type); - if (retval) - goto out_free_all; -@@ -1463,7 +1555,22 @@ out_free_stat: - - static void __exit exit_nfsd(void) - { -+#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS) -+ remove_proc_entry("fs/nfs/spnfs/recall", NULL); -+ remove_proc_entry("fs/nfs/spnfs/layoutseg", NULL); -+ remove_proc_entry("fs/nfs/spnfs/getfh", NULL); -+ remove_proc_entry("fs/nfs/spnfs/config", NULL); -+ remove_proc_entry("fs/nfs/spnfs/ctl", NULL); -+ remove_proc_entry("fs/nfs/spnfs", NULL); -+#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */ -+ -+#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS_LAYOUTSEGMENTS) -+ remove_proc_entry("fs/nfs/spnfs/layoutseg", NULL); -+ remove_proc_entry("fs/nfs/spnfs/layoutsegsize", NULL); -+#endif /* CONFIG_PROC_FS && CONFIG_SPNFS_LAYOUTSEGMENTS */ -+ - nfsd_export_shutdown(); -+ nfsd4_pnfs_dlm_shutdown(); - nfsd_reply_cache_shutdown(); - 
remove_proc_entry("fs/nfs/exports", NULL); - remove_proc_entry("fs/nfs", NULL); -diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h -index b76ac3a..cef6770 100644 ---- a/fs/nfsd/nfsd.h -+++ b/fs/nfsd/nfsd.h -@@ -286,11 +286,22 @@ extern time_t nfsd4_grace; - #define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ - NFSD4_SUPPORTED_ATTRS_WORD0 - -+#if defined(CONFIG_PNFSD) -+#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ -+ (NFSD4_SUPPORTED_ATTRS_WORD1 | FATTR4_WORD1_FS_LAYOUT_TYPES) -+#else /* CONFIG_PNFSD */ - #define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ - NFSD4_SUPPORTED_ATTRS_WORD1 -+#endif /* CONFIG_PNFSD */ - -+#if defined(CONFIG_PNFSD) -+#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \ -+ (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT | \ -+ FATTR4_WORD2_LAYOUT_BLKSIZE) -+#else /* CONFIG_PNFSD */ - #define NFSD4_1_SUPPORTED_ATTRS_WORD2 \ - (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT) -+#endif /* CONFIG_PNFSD */ - - static inline u32 nfsd_suppattrs0(u32 minorversion) - { -diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c -index 55c8e63..544c957 100644 ---- a/fs/nfsd/nfsfh.c -+++ b/fs/nfsd/nfsfh.c -@@ -10,6 +10,7 @@ - #include - - #include -+#include - #include "nfsd.h" - #include "vfs.h" - #include "auth.h" -@@ -139,6 +140,7 @@ static inline __be32 check_pseudo_root(struct svc_rqst *rqstp, - static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) - { - struct knfsd_fh *fh = &fhp->fh_handle; -+ int fsid_type; - struct fid *fid = NULL, sfid; - struct svc_export *exp; - struct dentry *dentry; -@@ -159,7 +161,8 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) - return error; - if (fh->fh_auth_type != 0) - return error; -- len = key_len(fh->fh_fsid_type) / 4; -+ fsid_type = pnfs_fh_fsid_type(fh); -+ len = key_len(fsid_type) / 4; - if (len == 0) - return error; - if (fh->fh_fsid_type == FSID_MAJOR_MINOR) { -@@ -172,7 +175,7 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) - data_left -= len; - 
if (data_left < 0) - return error; -- exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_auth); -+ exp = rqst_exp_find(rqstp, fsid_type, fh->fh_auth); - fid = (struct fid *)(fh->fh_auth + len); - } else { - __u32 tfh[2]; -diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h -index c16f8d8..4263812 100644 ---- a/fs/nfsd/nfsfh.h -+++ b/fs/nfsd/nfsfh.h -@@ -14,6 +14,7 @@ enum nfsd_fsid { - FSID_UUID8, - FSID_UUID16, - FSID_UUID16_INUM, -+ FSID_MAX - }; - - enum fsid_source { -@@ -203,4 +204,42 @@ fh_unlock(struct svc_fh *fhp) - } - } - -+#if defined(CONFIG_PNFSD) -+ -+/* -+ * fh_fsid_type is overloaded to indicate whether a filehandle was one supplied -+ * to a DS by LAYOUTGET. nfs4_preprocess_stateid_op() uses this to decide how -+ * to handle a given stateid. -+ */ -+static inline int pnfs_fh_is_ds(struct knfsd_fh *fh) -+{ -+ return fh->fh_fsid_type >= FSID_MAX; -+} -+ -+static inline void pnfs_fh_mark_ds(struct knfsd_fh *fh) -+{ -+ BUG_ON(fh->fh_version != 1); -+ BUG_ON(pnfs_fh_is_ds(fh)); -+ fh->fh_fsid_type += FSID_MAX; -+} -+ -+#else /* CONFIG_PNFSD */ -+ -+static inline int pnfs_fh_is_ds(struct knfsd_fh *fh) -+{ -+ return 0; -+} -+ -+#endif /* CONFIG_PNFSD */ -+ -+/* allows fh_verify() to check the real fsid_type (i.e., not overloaded). 
*/ -+static inline int pnfs_fh_fsid_type(struct knfsd_fh *fh) -+{ -+ int fsid_type = fh->fh_fsid_type; -+ -+ if (pnfs_fh_is_ds(fh)) -+ return fsid_type - FSID_MAX; -+ return fsid_type; -+} -+ - #endif /* _LINUX_NFSD_FH_INT_H */ -diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c -index e2c4346..d1262ec 100644 ---- a/fs/nfsd/nfssvc.c -+++ b/fs/nfsd/nfssvc.c -@@ -115,7 +115,7 @@ struct svc_program nfsd_program = { - - }; - --u32 nfsd_supported_minorversion; -+u32 nfsd_supported_minorversion = NFSD_SUPPORTED_MINOR_VERSION; - - int nfsd_vers(int vers, enum vers_op change) - { -diff --git a/fs/nfsd/pnfsd.h b/fs/nfsd/pnfsd.h -new file mode 100644 -index 0000000..a181bc3 ---- /dev/null -+++ b/fs/nfsd/pnfsd.h -@@ -0,0 +1,143 @@ -+/* -+ * Copyright (c) 2005 The Regents of the University of Michigan. -+ * All rights reserved. -+ * -+ * Andy Adamson -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. Neither the name of the University nor the names of its -+ * contributors may be used to endorse or promote products derived -+ * from this software without specific prior written permission. -+ * -+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED -+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ * -+ */ -+ -+#ifndef LINUX_NFSD_PNFSD_H -+#define LINUX_NFSD_PNFSD_H -+ -+#include -+#include -+ -+#include "state.h" -+#include "xdr4.h" -+ -+/* outstanding layout stateid */ -+struct nfs4_layout_state { -+ struct list_head ls_perfile; -+ struct list_head ls_layouts; /* list of nfs4_layouts */ -+ struct kref ls_ref; -+ struct nfs4_client *ls_client; -+ struct nfs4_file *ls_file; -+ stateid_t ls_stateid; -+}; -+ -+/* outstanding layout */ -+struct nfs4_layout { -+ struct list_head lo_perfile; /* hash by f_id */ -+ struct list_head lo_perclnt; /* hash by clientid */ -+ struct list_head lo_perstate; -+ struct nfs4_file *lo_file; /* backpointer */ -+ struct nfs4_client *lo_client; -+ struct nfs4_layout_state *lo_state; -+ struct nfsd4_layout_seg lo_seg; -+}; -+ -+struct pnfs_inval_state { -+ struct knfsd_fh mdsfh; /* needed only by invalidate all */ -+ stateid_t stid; -+ clientid_t clid; -+ u32 status; -+}; -+ -+/* pNFS Data Server state */ -+#define DS_STATEID_VALID 0 -+#define DS_STATEID_ERROR 1 -+#define DS_STATEID_NEW 2 -+ -+struct pnfs_ds_stateid { -+ struct list_head ds_hash; /* ds_stateid hash entry */ -+ struct list_head ds_perclid; /* per client hash entry */ -+ stateid_t ds_stid; -+ struct knfsd_fh ds_fh; -+ unsigned long ds_access; -+ u32 ds_status; /* from MDS */ -+ u32 ds_verifier[2]; /* from MDS */ -+ wait_queue_head_t ds_waitq; -+ unsigned long ds_flags; -+ struct kref ds_ref; -+ clientid_t ds_mdsclid; -+}; -+ -+struct 
pnfs_ds_clientid { -+ struct list_head dc_hash; /* mds_clid_hashtbl entry */ -+ struct list_head dc_stateid; /* ds_stateid head */ -+ struct list_head dc_permdsid; /* per mdsid hash entry */ -+ clientid_t dc_mdsclid; -+ struct kref dc_ref; -+ uint32_t dc_mdsid; -+}; -+ -+struct pnfs_mds_id { -+ struct list_head di_hash; /* mds_nodeid list entry */ -+ struct list_head di_mdsclid; /* mds_clientid head */ -+ uint32_t di_mdsid; -+ time_t di_mdsboot; /* mds boot time */ -+ struct kref di_ref; -+}; -+ -+/* notify device request (from exported filesystem) */ -+struct nfs4_notify_device { -+ struct nfsd4_pnfs_cb_dev_list *nd_list; -+ struct nfs4_client *nd_client; -+ struct list_head nd_perclnt; -+ -+ void *nd_args; /* nfsd internal */ -+}; -+ -+u64 find_create_sbid(struct super_block *); -+struct super_block *find_sbid_id(u64); -+__be32 nfs4_pnfs_get_layout(struct nfsd4_pnfs_layoutget *, struct exp_xdr_stream *); -+int nfs4_pnfs_return_layout(struct super_block *, struct svc_fh *, -+ struct nfsd4_pnfs_layoutreturn *); -+int nfs4_pnfs_cb_get_state(struct super_block *, struct pnfs_get_state *); -+int nfs4_pnfs_cb_change_state(struct pnfs_get_state *); -+void nfs4_ds_get_verifier(stateid_t *, struct super_block *, u32 *); -+int put_layoutrecall(struct nfs4_layoutrecall *); -+void nomatching_layout(struct nfs4_layoutrecall *); -+void *layoutrecall_done(struct nfs4_layoutrecall *); -+int nfsd4_cb_layout(struct nfs4_layoutrecall *); -+int nfsd_layout_recall_cb(struct super_block *, struct inode *, -+ struct nfsd4_pnfs_cb_layout *); -+int nfsd_device_notify_cb(struct super_block *, -+ struct nfsd4_pnfs_cb_dev_list *); -+int nfsd4_cb_notify_device(struct nfs4_notify_device *); -+void pnfs_set_device_notify(clientid_t *, unsigned int types); -+void pnfs_clear_device_notify(struct nfs4_client *); -+ -+#if defined(CONFIG_PNFSD_LOCAL_EXPORT) -+extern struct sockaddr pnfsd_lexp_addr; -+extern size_t pnfs_lexp_addr_len; -+ -+extern void pnfsd_lexp_init(struct inode *); -+#endif /* 
CONFIG_PNFSD_LOCAL_EXPORT */ -+ -+#endif /* LINUX_NFSD_PNFSD_H */ -diff --git a/fs/nfsd/pnfsd_lexp.c b/fs/nfsd/pnfsd_lexp.c -new file mode 100644 -index 0000000..bf2f403 ---- /dev/null -+++ b/fs/nfsd/pnfsd_lexp.c -@@ -0,0 +1,225 @@ -+/* -+ * linux/fs/nfsd/pnfs_lexp.c -+ * -+ * pNFS export of local filesystems. -+ * -+ * Export local file systems over the files layout type. -+ * The MDS (metadata server) functions also as a single DS (data server). -+ * This is mostly useful for development and debugging purposes. -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * Copyright (C) 2008 Benny Halevy, -+ * -+ * Initial implementation was based on the pnfs-gfs2 patches done -+ * by David M. Richter -+ */ -+ -+#include -+#include -+ -+#include "pnfsd.h" -+ -+#define NFSDDBG_FACILITY NFSDDBG_PNFS -+ -+struct sockaddr pnfsd_lexp_addr; -+size_t pnfs_lexp_addr_len; -+ -+static int -+pnfsd_lexp_layout_type(struct super_block *sb) -+{ -+ int ret = LAYOUT_NFSV4_1_FILES; -+ dprintk("<-- %s: return %d\n", __func__, ret); -+ return ret; -+} -+ -+static int -+pnfsd_lexp_get_device_iter(struct super_block *sb, -+ u32 layout_type, -+ struct nfsd4_pnfs_dev_iter_res *res) -+{ -+ dprintk("--> %s: sb=%p\n", __func__, sb); -+ -+ BUG_ON(layout_type != LAYOUT_NFSV4_1_FILES); -+ -+ res->gd_eof = 1; -+ if (res->gd_cookie) -+ return -ENOENT; -+ res->gd_cookie = 1; -+ res->gd_verf = 1; -+ res->gd_devid = 1; -+ -+ dprintk("<-- %s: return 0\n", __func__); -+ return 0; -+} -+ -+static int -+pnfsd_lexp_get_device_info(struct super_block *sb, -+ struct exp_xdr_stream *xdr, -+ u32 layout_type, -+ const struct nfsd4_pnfs_deviceid *devid) -+{ -+ int err; -+ struct pnfs_filelayout_device fdev; -+ struct pnfs_filelayout_multipath fl_devices[1]; -+ u32 fl_stripe_indices[1] = { 0 }; -+ 
struct pnfs_filelayout_devaddr daddr; -+ /* %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x.%03u.%03u */ -+ char daddr_buf[8*4 + 2*3 + 10]; -+ -+ dprintk("--> %s: sb=%p\n", __func__, sb); -+ -+ BUG_ON(layout_type != LAYOUT_NFSV4_1_FILES); -+ -+ memset(&fdev, '\0', sizeof(fdev)); -+ -+ if (devid->devid != 1) { -+ printk(KERN_ERR "%s: WARNING: didn't receive a deviceid of 1 " -+ "(got: 0x%llx)\n", __func__, devid->devid); -+ err = -EINVAL; -+ goto out; -+ } -+ -+ /* count the number of comma-delimited DS IPs */ -+ fdev.fl_device_length = 1; -+ fdev.fl_device_list = fl_devices; -+ -+ fdev.fl_stripeindices_length = fdev.fl_device_length; -+ fdev.fl_stripeindices_list = fl_stripe_indices; -+ -+ daddr.r_addr.data = daddr_buf; -+ daddr.r_addr.len = sizeof(daddr_buf); -+ err = __svc_print_netaddr(&pnfsd_lexp_addr, &daddr.r_addr); -+ if (err < 0) -+ goto out; -+ daddr.r_addr.len = err; -+ switch (pnfsd_lexp_addr.sa_family) { -+ case AF_INET: -+ daddr.r_netid.data = "tcp"; -+ daddr.r_netid.len = 3; -+ break; -+ case AF_INET6: -+ daddr.r_netid.data = "tcp6"; -+ daddr.r_netid.len = 4; -+ break; -+ default: -+ BUG(); -+ } -+ fdev.fl_device_list[0].fl_multipath_length = 1; -+ fdev.fl_device_list[0].fl_multipath_list = &daddr; -+ -+ /* have nfsd encode the device info */ -+ err = filelayout_encode_devinfo(xdr, &fdev); -+out: -+ dprintk("<-- %s: return %d\n", __func__, err); -+ return err; -+} -+ -+static int get_stripe_unit(int blocksize) -+{ -+ if (blocksize < NFSSVC_MAXBLKSIZE) -+ blocksize = NFSSVC_MAXBLKSIZE - (NFSSVC_MAXBLKSIZE % blocksize); -+ dprintk("%s: return %d\n", __func__, blocksize); -+ return blocksize; -+} -+ -+static enum nfsstat4 -+pnfsd_lexp_layout_get(struct inode *inode, -+ struct exp_xdr_stream *xdr, -+ const struct nfsd4_pnfs_layoutget_arg *arg, -+ struct nfsd4_pnfs_layoutget_res *res) -+{ -+ enum nfsstat4 rc = NFS4_OK; -+ struct pnfs_filelayout_layout *layout = NULL; -+ struct knfsd_fh *fhp = NULL; -+ -+ dprintk("--> %s: inode=%p\n", __func__, inode); -+ -+ 
res->lg_seg.layout_type = LAYOUT_NFSV4_1_FILES; -+ res->lg_seg.offset = 0; -+ res->lg_seg.length = NFS4_MAX_UINT64; -+ -+ layout = kzalloc(sizeof(*layout), GFP_KERNEL); -+ if (layout == NULL) { -+ rc = -ENOMEM; -+ goto error; -+ } -+ -+ /* Set file layout response args */ -+ layout->lg_layout_type = LAYOUT_NFSV4_1_FILES; -+ layout->lg_stripe_type = STRIPE_SPARSE; -+ layout->lg_commit_through_mds = true; -+ layout->lg_stripe_unit = get_stripe_unit(inode->i_sb->s_blocksize); -+ layout->lg_fh_length = 1; -+ layout->device_id.sbid = arg->lg_sbid; -+ layout->device_id.devid = 1; /*FSFTEMP*/ -+ layout->lg_first_stripe_index = 0; /*FSFTEMP*/ -+ layout->lg_pattern_offset = 0; -+ -+ fhp = kmalloc(sizeof(*fhp), GFP_KERNEL); -+ if (fhp == NULL) { -+ rc = -ENOMEM; -+ goto error; -+ } -+ -+ memcpy(fhp, arg->lg_fh, sizeof(*fhp)); -+ pnfs_fh_mark_ds(fhp); -+ layout->lg_fh_list = fhp; -+ -+ /* Call nfsd to encode layout */ -+ rc = filelayout_encode_layout(xdr, layout); -+exit: -+ kfree(layout); -+ kfree(fhp); -+ dprintk("<-- %s: return %d\n", __func__, rc); -+ return rc; -+ -+error: -+ res->lg_seg.length = 0; -+ goto exit; -+} -+ -+static int -+pnfsd_lexp_layout_commit(struct inode *inode, -+ const struct nfsd4_pnfs_layoutcommit_arg *args, -+ struct nfsd4_pnfs_layoutcommit_res *res) -+{ -+ dprintk("%s: (unimplemented)\n", __func__); -+ -+ return 0; -+} -+ -+static int -+pnfsd_lexp_layout_return(struct inode *inode, -+ const struct nfsd4_pnfs_layoutreturn_arg *args) -+{ -+ dprintk("%s: (unimplemented)\n", __func__); -+ -+ return 0; -+} -+ -+static int pnfsd_lexp_get_state(struct inode *inode, struct knfsd_fh *fh, -+ struct pnfs_get_state *p) -+{ -+ return 0; /* just use the current stateid */ -+} -+ -+static struct pnfs_export_operations pnfsd_lexp_ops = { -+ .layout_type = pnfsd_lexp_layout_type, -+ .get_device_info = pnfsd_lexp_get_device_info, -+ .get_device_iter = pnfsd_lexp_get_device_iter, -+ .layout_get = pnfsd_lexp_layout_get, -+ .layout_commit = pnfsd_lexp_layout_commit, 
-+ .layout_return = pnfsd_lexp_layout_return, -+ .get_state = pnfsd_lexp_get_state, -+}; -+ -+void -+pnfsd_lexp_init(struct inode *inode) -+{ -+ dprintk("%s: &pnfsd_lexp_ops=%p\n", __func__, &pnfsd_lexp_ops); -+ inode->i_sb->s_pnfs_op = &pnfsd_lexp_ops; -+} -diff --git a/fs/nfsd/spnfs_com.c b/fs/nfsd/spnfs_com.c -new file mode 100644 -index 0000000..1ce9ee6 ---- /dev/null -+++ b/fs/nfsd/spnfs_com.c -@@ -0,0 +1,535 @@ -+/* -+ * fs/nfsd/spnfs_com.c -+ * -+ * Communcation layer between spNFS kernel and userspace -+ * Based heavily on idmap.c -+ * -+ */ -+ -+/* -+ * Copyright (c) 2002 The Regents of the University of Michigan. -+ * All rights reserved. -+ * -+ * Marius Aamodt Eriksen -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. Neither the name of the University nor the names of its -+ * contributors may be used to endorse or promote products derived -+ * from this software without specific prior written permission. -+ * -+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED -+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#define NFSDDBG_FACILITY NFSDDBG_PROC -+ -+static ssize_t spnfs_pipe_upcall(struct file *, struct rpc_pipe_msg *, -+ char __user *, size_t); -+static ssize_t spnfs_pipe_downcall(struct file *, const char __user *, -+ size_t); -+static void spnfs_pipe_destroy_msg(struct rpc_pipe_msg *); -+ -+static struct rpc_pipe_ops spnfs_upcall_ops = { -+ .upcall = spnfs_pipe_upcall, -+ .downcall = spnfs_pipe_downcall, -+ .destroy_msg = spnfs_pipe_destroy_msg, -+}; -+ -+/* evil global variable */ -+struct spnfs *global_spnfs; -+struct spnfs_config *spnfs_config; -+#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS -+int spnfs_use_layoutsegments; -+uint64_t layoutsegment_size; -+#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ -+ -+/* -+ * Used by spnfs_enabled() -+ * Tracks if the subsystem has been initialized at some point. It doesn't -+ * matter if it's not currently initialized. 
-+ */ -+static int spnfs_enabled_at_some_point; -+ -+/* call this to start the ball rolling */ -+/* code it like we're going to avoid the global variable in the future */ -+int -+nfsd_spnfs_new(void) -+{ -+ struct spnfs *spnfs = NULL; -+ struct path path; -+ struct nameidata nd; -+ int rc; -+ -+ if (global_spnfs != NULL) -+ return -EEXIST; -+ -+ path.mnt = rpc_get_mount(); -+ if (IS_ERR(path.mnt)) -+ return PTR_ERR(path.mnt); -+ -+ /* FIXME: do not abuse rpc_pipefs/nfs */ -+ rc = vfs_path_lookup(path.mnt->mnt_root, path.mnt, "/nfs", 0, &nd); -+ if (rc) -+ goto err; -+ -+ spnfs = kzalloc(sizeof(*spnfs), GFP_KERNEL); -+ if (spnfs == NULL){ -+ rc = -ENOMEM; -+ goto err; -+ } -+ -+ spnfs->spnfs_dentry = rpc_mkpipe(nd.path.dentry, "spnfs", spnfs, -+ &spnfs_upcall_ops, 0); -+ if (IS_ERR(spnfs->spnfs_dentry)) { -+ rc = -EPIPE; -+ goto err; -+ } -+ -+ mutex_init(&spnfs->spnfs_lock); -+ mutex_init(&spnfs->spnfs_plock); -+ init_waitqueue_head(&spnfs->spnfs_wq); -+ -+ global_spnfs = spnfs; -+ spnfs_enabled_at_some_point = 1; -+ -+ return 0; -+err: -+ rpc_put_mount(); -+ kfree(spnfs); -+ return rc; -+} -+ -+/* again, code it like we're going to remove the global variable */ -+void -+nfsd_spnfs_delete(void) -+{ -+ struct spnfs *spnfs = global_spnfs; -+ -+ if (!spnfs) -+ return; -+ rpc_unlink(spnfs->spnfs_dentry); -+ rpc_put_mount(); -+ global_spnfs = NULL; -+ kfree(spnfs); -+} -+ -+/* RPC pipefs upcall/downcall routines */ -+/* looks like this code is invoked by the rpc_pipe code */ -+/* to handle upcalls on things we've queued elsewhere */ -+/* See nfs_idmap_id for an exmaple of enqueueing */ -+static ssize_t -+spnfs_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, -+ char __user *dst, size_t buflen) -+{ -+ char *data = (char *)msg->data + msg->copied; -+ ssize_t mlen = msg->len - msg->copied; -+ ssize_t left; -+ -+ if (mlen > buflen) -+ mlen = buflen; -+ -+ left = copy_to_user(dst, data, mlen); -+ if (left < 0) { -+ msg->errno = left; -+ return left; -+ } -+ mlen -= 
left; -+ msg->copied += mlen; -+ msg->errno = 0; -+ return mlen; -+} -+ -+static ssize_t -+spnfs_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) -+{ -+ struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode); -+ struct spnfs *spnfs = (struct spnfs *)rpci->private; -+ struct spnfs_msg *im_in = NULL, *im = &spnfs->spnfs_im; -+ int ret; -+ -+ if (mlen != sizeof(struct spnfs_msg)) -+ return -ENOSPC; -+ -+ im_in = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); -+ if (im_in == NULL) -+ return -ENOMEM; -+ -+ if (copy_from_user(im_in, src, mlen) != 0) -+ return -EFAULT; -+ -+ mutex_lock(&spnfs->spnfs_plock); -+ -+ ret = mlen; -+ im->im_status = im_in->im_status; -+ /* If we got an error, terminate now, and wake up pending upcalls */ -+ if (!(im_in->im_status & SPNFS_STATUS_SUCCESS)) { -+ wake_up(&spnfs->spnfs_wq); -+ goto out; -+ } -+ -+ ret = -EINVAL; -+ /* Did we match the current upcall? */ -+ /* DMXXX: do not understand the comment above, from original code */ -+ /* DMXXX: when do we _not_ match the current upcall? */ -+ /* DMXXX: anyway, let's to a simplistic check */ -+ if (im_in->im_type == im->im_type) { -+ /* copy the response into the spnfs struct */ -+ memcpy(&im->im_res, &im_in->im_res, sizeof(im->im_res)); -+ ret = mlen; -+ } else -+ dprintk("spnfs: downcall type != upcall type\n"); -+ -+ -+ wake_up(&spnfs->spnfs_wq); -+/* DMXXX handle rval processing */ -+out: -+ mutex_unlock(&spnfs->spnfs_plock); -+ kfree(im_in); -+ return ret; -+} -+ -+static void -+spnfs_pipe_destroy_msg(struct rpc_pipe_msg *msg) -+{ -+ struct spnfs_msg *im = msg->data; -+ struct spnfs *spnfs = container_of(im, struct spnfs, spnfs_im); -+ -+ if (msg->errno >= 0) -+ return; -+ mutex_lock(&spnfs->spnfs_plock); -+ im->im_status = SPNFS_STATUS_FAIL; /* DMXXX */ -+ wake_up(&spnfs->spnfs_wq); -+ mutex_unlock(&spnfs->spnfs_plock); -+} -+ -+/* generic upcall. 
called by functions in spnfs_ops.c */ -+int -+spnfs_upcall(struct spnfs *spnfs, struct spnfs_msg *upmsg, -+ union spnfs_msg_res *res) -+{ -+ struct rpc_pipe_msg msg; -+ struct spnfs_msg *im; -+ DECLARE_WAITQUEUE(wq, current); -+ int ret = -EIO; -+ int rval; -+ -+ im = &spnfs->spnfs_im; -+ -+ mutex_lock(&spnfs->spnfs_lock); -+ mutex_lock(&spnfs->spnfs_plock); -+ -+ memset(im, 0, sizeof(*im)); -+ memcpy(im, upmsg, sizeof(*upmsg)); -+ -+ memset(&msg, 0, sizeof(msg)); -+ msg.data = im; -+ msg.len = sizeof(*im); -+ -+ add_wait_queue(&spnfs->spnfs_wq, &wq); -+ rval = rpc_queue_upcall(spnfs->spnfs_dentry->d_inode, &msg); -+ if (rval < 0) { -+ remove_wait_queue(&spnfs->spnfs_wq, &wq); -+ goto out; -+ } -+ -+ set_current_state(TASK_UNINTERRUPTIBLE); -+ mutex_unlock(&spnfs->spnfs_plock); -+ schedule(); -+ current->state = TASK_RUNNING; -+ remove_wait_queue(&spnfs->spnfs_wq, &wq); -+ mutex_lock(&spnfs->spnfs_plock); -+ -+ if (im->im_status & SPNFS_STATUS_SUCCESS) { -+ /* copy our result from the upcall */ -+ memcpy(res, &im->im_res, sizeof(*res)); -+ ret = 0; -+ } -+ -+out: -+ memset(im, 0, sizeof(*im)); -+ mutex_unlock(&spnfs->spnfs_plock); -+ mutex_unlock(&spnfs->spnfs_lock); -+ return(ret); -+} -+ -+/* -+ * This is used to determine if the spnfsd daemon has been started at -+ * least once since the system came up. This is used to by the export -+ * mechanism to decide if spnfs is in use. -+ * -+ * Returns non-zero if the spnfsd has initialized the communication pipe -+ * at least once. -+ */ -+int spnfs_enabled(void) -+{ -+ return spnfs_enabled_at_some_point; -+} -+ -+#ifdef CONFIG_PROC_FS -+ -+/* -+ * procfs virtual files for user/kernel space communication: -+ * -+ * ctl - currently just an on/off switch...can be expanded -+ * getfh - fd to fh conversion -+ * recall - recall a layout from the command line, for example: -+ * echo > /proc/fs/spnfs/recall -+ * config - configuration info, e.g., stripe size, num ds, etc. 
-+ */ -+ -+/*-------------- start ctl -------------------------*/ -+static ssize_t ctl_write(struct file *file, const char __user *buf, -+ size_t count, loff_t *offset) -+{ -+ int cmd, rc; -+ -+ if (copy_from_user((int *)&cmd, (int *)buf, sizeof(int))) -+ return -EFAULT; -+ if (cmd) { -+ rc = nfsd_spnfs_new(); -+ if (rc != 0) -+ return rc; -+ } else -+ nfsd_spnfs_delete(); -+ -+ return count; -+} -+ -+static const struct file_operations ctl_ops = { -+ .write = ctl_write, -+}; -+/*-------------- end ctl ---------------------------*/ -+ -+/*-------------- start config -------------------------*/ -+static ssize_t config_write(struct file *file, const char __user *buf, -+ size_t count, loff_t *offset) -+{ -+ static struct spnfs_config cfg; -+ -+ if (copy_from_user(&cfg, buf, count)) -+ return -EFAULT; -+ -+ spnfs_config = &cfg; -+ return 0; -+} -+ -+static const struct file_operations config_ops = { -+ .write = config_write, -+}; -+/*-------------- end config ---------------------------*/ -+ -+/*-------------- start getfh -----------------------*/ -+static int getfh_open(struct inode *inode, struct file *file) -+{ -+ file->private_data = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL); -+ if (file->private_data == NULL) -+ return -ENOMEM; -+ -+ return 0; -+} -+ -+static ssize_t getfh_read(struct file *file, char __user *buf, size_t count, -+ loff_t *offset) -+{ -+ if (copy_to_user(buf, file->private_data, sizeof(struct nfs_fh))) -+ return -EFAULT; -+ -+ return count; -+} -+ -+static ssize_t getfh_write(struct file *file, const char __user *buf, -+ size_t count, loff_t *offset) -+{ -+ int fd; -+ -+ if (copy_from_user((int *)&fd, (int *)buf, sizeof(int))) -+ return -EFAULT; -+ if (spnfs_getfh(fd, file->private_data) != 0) -+ return -EIO; -+ -+ return count; -+} -+ -+static int getfh_release(struct inode *inode, struct file *file) -+{ -+ kfree(file->private_data); -+ return 0; -+} -+ -+static const struct file_operations getfh_ops = { -+ .open = getfh_open, -+ .read = 
getfh_read, -+ .write = getfh_write, -+ .release = getfh_release, -+}; -+/*-------------- end getfh ------------------------*/ -+ -+ -+/*-------------- start recall layout --------------*/ -+static ssize_t recall_write(struct file *file, const char __user *buf, -+ size_t count, loff_t *offset) -+{ -+ char input[128]; -+ char *path, *str, *p; -+ int rc; -+ u64 off = 0, len = 0; -+ -+ if (count > 128) -+ return -EINVAL; -+ -+ if (copy_from_user(input, buf, count)) -+ return -EFAULT; -+ -+ /* assumes newline-terminated path */ -+ p = memchr(input, '\n', count); -+ if (p == NULL) -+ return -EINVAL; -+ *p = '\0'; -+ -+ /* -+ * Scan for path and, optionally, an offset and length -+ * of a layout segment to be recalled; if there are two -+ * fields, they're assumed to be path and offset. -+ */ -+ p = input; -+ path = strsep(&p, " "); -+ if (path == NULL) -+ return -EINVAL; -+ -+ str = strsep(&p, " "); -+ if (str != NULL) { -+ rc = strict_strtoull(str, 10, &off); -+ if (rc != 0) -+ return -EINVAL; -+ -+ str = strsep(&p, " "); -+ if (str != NULL) { -+ rc = strict_strtoull(str, 10, &len); -+ if (rc != 0) -+ return -EINVAL; -+ } -+ } -+ -+ rc = spnfs_test_layoutrecall(path, off, len); -+ if (rc != 0) -+ return rc; -+ -+ return count; -+} -+ -+static const struct file_operations recall_ops = { -+ .write = recall_write, -+}; -+/*-------------- end recall layout --------------*/ -+ -+ -+#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS -+/*-------------- start layoutseg -------------------------*/ -+static ssize_t layoutseg_write(struct file *file, const char __user *buf, -+ size_t count, loff_t *offset) -+{ -+ char cmd[3]; -+ -+ if (copy_from_user(cmd, buf, 1)) -+ return -EFAULT; -+ if (cmd[0] == '0') -+ spnfs_use_layoutsegments = 0; -+ else -+ spnfs_use_layoutsegments = 1; -+ -+ return count; -+} -+ -+static const struct file_operations layoutseg_ops = { -+ .write = layoutseg_write, -+}; -+/*-------------- end layoutseg ---------------------------*/ -+ -+/*-------------- start layoutsegsize 
-------------------------*/ -+static ssize_t layoutsegsize_write(struct file *file, const char __user *buf, -+ size_t count, loff_t *offset) -+{ -+ char cmd[50]; -+ -+ if (copy_from_user(cmd, buf, 49)) -+ return -EFAULT; -+ layoutsegment_size = simple_strtoull(cmd, NULL, 10); -+ -+ return count; -+} -+ -+static const struct file_operations layoutsegsize_ops = { -+ .write = layoutsegsize_write, -+}; -+/*-------------- end layoutsegsize ---------------------------*/ -+#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ -+ -+int -+spnfs_init_proc(void) -+{ -+ struct proc_dir_entry *entry; -+ -+ entry = proc_mkdir("fs/spnfs", NULL); -+ if (!entry) -+ return -ENOMEM; -+ -+ entry = create_proc_entry("fs/spnfs/ctl", 0, NULL); -+ if (!entry) -+ return -ENOMEM; -+ entry->proc_fops = &ctl_ops; -+ -+ entry = create_proc_entry("fs/spnfs/config", 0, NULL); -+ if (!entry) -+ return -ENOMEM; -+ entry->proc_fops = &config_ops; -+ -+ entry = create_proc_entry("fs/spnfs/getfh", 0, NULL); -+ if (!entry) -+ return -ENOMEM; -+ entry->proc_fops = &getfh_ops; -+ -+ entry = create_proc_entry("fs/spnfs/recall", 0, NULL); -+ if (!entry) -+ return -ENOMEM; -+ entry->proc_fops = &recall_ops; -+ -+#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS -+ entry = create_proc_entry("fs/spnfs/layoutseg", 0, NULL); -+ if (!entry) -+ return -ENOMEM; -+ entry->proc_fops = &layoutseg_ops; -+ -+ entry = create_proc_entry("fs/spnfs/layoutsegsize", 0, NULL); -+ if (!entry) -+ return -ENOMEM; -+ entry->proc_fops = &layoutsegsize_ops; -+#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ -+ -+ return 0; -+} -+#endif /* CONFIG_PROC_FS */ -diff --git a/fs/nfsd/spnfs_ops.c b/fs/nfsd/spnfs_ops.c -new file mode 100644 -index 0000000..b97a5af ---- /dev/null -+++ b/fs/nfsd/spnfs_ops.c -@@ -0,0 +1,878 @@ -+/* -+ * fs/nfsd/spnfs_ops.c -+ * -+ * Communcation layer between spNFS kernel and userspace -+ * -+ */ -+/****************************************************************************** -+ -+(c) 2007 Network Appliance, Inc. All Rights Reserved. 
-+ -+Network Appliance provides this source code under the GPL v2 License. -+The GPL v2 license is available at -+http://opensource.org/licenses/gpl-license.php. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ -+******************************************************************************/ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include "pnfsd.h" -+ -+/* comment out CONFIG_SPNFS_TEST for non-test behaviour */ -+/* #define CONFIG_SPNFS_TEST 1 */ -+ -+#define NFSDDBG_FACILITY NFSDDBG_PNFS -+ -+/* -+ * The functions that are called from elsewhere in the kernel -+ * to perform tasks in userspace -+ * -+ */ -+ -+#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS -+extern int spnfs_use_layoutsegments; -+extern uint64_t layoutsegment_size; -+#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ -+extern struct spnfs *global_spnfs; -+ -+int -+spnfs_layout_type(struct super_block *sb) -+{ -+ return LAYOUT_NFSV4_1_FILES; -+} -+ -+enum nfsstat4 -+spnfs_layoutget(struct inode *inode, struct exp_xdr_stream *xdr, -+ const struct nfsd4_pnfs_layoutget_arg *lg_arg, -+ struct nfsd4_pnfs_layoutget_res *lg_res) -+{ -+ struct spnfs *spnfs = global_spnfs; /* keep up the pretence */ -+ struct spnfs_msg *im = NULL; -+ union spnfs_msg_res 
*res = NULL; -+ struct pnfs_filelayout_layout *flp = NULL; -+ int status, i; -+ enum nfsstat4 nfserr; -+ -+ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); -+ if (im == NULL) { -+ nfserr = NFS4ERR_LAYOUTTRYLATER; -+ goto layoutget_cleanup; -+ } -+ -+ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); -+ if (res == NULL) { -+ nfserr = NFS4ERR_LAYOUTTRYLATER; -+ goto layoutget_cleanup; -+ } -+ -+ im->im_type = SPNFS_TYPE_LAYOUTGET; -+ im->im_args.layoutget_args.inode = inode->i_ino; -+ im->im_args.layoutget_args.generation = inode->i_generation; -+ -+ /* call function to queue the msg for upcall */ -+ if (spnfs_upcall(spnfs, im, res) != 0) { -+ dprintk("failed spnfs upcall: layoutget\n"); -+ nfserr = NFS4ERR_LAYOUTUNAVAILABLE; -+ goto layoutget_cleanup; -+ } -+ status = res->layoutget_res.status; -+ if (status != 0) { -+ /* FIXME? until user mode is fixed, translate system error */ -+ switch (status) { -+ case -E2BIG: -+ case -ETOOSMALL: -+ nfserr = NFS4ERR_TOOSMALL; -+ break; -+ case -ENOMEM: -+ case -EAGAIN: -+ case -EINTR: -+ nfserr = NFS4ERR_LAYOUTTRYLATER; -+ break; -+ case -ENOENT: -+ nfserr = NFS4ERR_BADLAYOUT; -+ break; -+ default: -+ nfserr = NFS4ERR_LAYOUTUNAVAILABLE; -+ } -+ dprintk("spnfs layout_get upcall: status=%d nfserr=%u\n", -+ status, nfserr); -+ goto layoutget_cleanup; -+ } -+ -+ lg_res->lg_return_on_close = 0; -+#if defined(CONFIG_SPNFS_LAYOUTSEGMENTS) -+ /* if spnfs_use_layoutsegments & layoutsegment_size == 0, use */ -+ /* the amount requested by the client. 
*/ -+ if (spnfs_use_layoutsegments) { -+ if (layoutsegment_size != 0) -+ lg_res->lg_seg.length = layoutsegment_size; -+ } else -+ lg_res->lg_seg.length = NFS4_MAX_UINT64; -+#else -+ lg_res->lg_seg.length = NFS4_MAX_UINT64; -+#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ -+ -+ flp = kmalloc(sizeof(struct pnfs_filelayout_layout), GFP_KERNEL); -+ if (flp == NULL) { -+ nfserr = NFS4ERR_LAYOUTTRYLATER; -+ goto layoutget_cleanup; -+ } -+ flp->device_id.sbid = lg_arg->lg_sbid; -+ flp->device_id.devid = res->layoutget_res.devid; -+ flp->lg_layout_type = 1; /* XXX */ -+ flp->lg_stripe_type = res->layoutget_res.stripe_type; -+ flp->lg_commit_through_mds = 0; -+ flp->lg_stripe_unit = res->layoutget_res.stripe_size; -+ flp->lg_first_stripe_index = 0; -+ flp->lg_pattern_offset = 0; -+ flp->lg_fh_length = res->layoutget_res.stripe_count; -+ -+ flp->lg_fh_list = kmalloc(flp->lg_fh_length * sizeof(struct knfsd_fh), -+ GFP_KERNEL); -+ if (flp->lg_fh_list == NULL) { -+ nfserr = NFS4ERR_LAYOUTTRYLATER; -+ goto layoutget_cleanup; -+ } -+ /* -+ * FIX: Doing an extra copy here. Should group res.flist's fh_len -+ * and fh_val into a knfsd_fh structure. 
-+ */ -+ for (i = 0; i < flp->lg_fh_length; i++) { -+ flp->lg_fh_list[i].fh_size = res->layoutget_res.flist[i].fh_len; -+ memcpy(&flp->lg_fh_list[i].fh_base, -+ res->layoutget_res.flist[i].fh_val, -+ res->layoutget_res.flist[i].fh_len); -+ } -+ -+ /* encode the layoutget body */ -+ nfserr = filelayout_encode_layout(xdr, flp); -+ -+layoutget_cleanup: -+ if (flp) { -+ if (flp->lg_fh_list) -+ kfree(flp->lg_fh_list); -+ kfree(flp); -+ } -+ kfree(im); -+ kfree(res); -+ -+ return nfserr; -+} -+ -+int -+spnfs_layoutcommit(void) -+{ -+ return 0; -+} -+ -+int -+spnfs_layoutreturn(struct inode *inode, -+ const struct nfsd4_pnfs_layoutreturn_arg *args) -+{ -+ return 0; -+} -+ -+int -+spnfs_layoutrecall(struct inode *inode, int type, u64 offset, u64 len) -+{ -+ struct super_block *sb; -+ struct nfsd4_pnfs_cb_layout lr; -+ -+ switch (type) { -+ case RETURN_FILE: -+ sb = inode->i_sb; -+ dprintk("%s: recalling layout for ino = %lu\n", -+ __func__, inode->i_ino); -+ break; -+ case RETURN_FSID: -+ sb = inode->i_sb; -+ dprintk("%s: recalling layout for fsid x (unimplemented)\n", -+ __func__); -+ return 0; -+ case RETURN_ALL: -+ /* XXX figure out how to get a sb since there's no inode ptr */ -+ dprintk("%s: recalling all layouts (unimplemented)\n", -+ __func__); -+ return 0; -+ default: -+ return -EINVAL; -+ } -+ -+ lr.cbl_recall_type = type; -+ lr.cbl_seg.layout_type = LAYOUT_NFSV4_1_FILES; -+ lr.cbl_seg.clientid = 0; -+ lr.cbl_seg.offset = offset; -+ lr.cbl_seg.length = len; -+ lr.cbl_seg.iomode = IOMODE_ANY; -+ lr.cbl_layoutchanged = 0; -+ -+ nfsd_layout_recall_cb(sb, inode, &lr); -+ -+ return 0; -+} -+ -+ -+int -+spnfs_test_layoutrecall(char *path, u64 offset, u64 len) -+{ -+ struct nameidata nd; -+ struct inode *inode; -+ int type, rc; -+ -+ dprintk("%s: path=%s, offset=%llu, len=%llu\n", -+ __func__, path, offset, len); -+ -+ if (strcmp(path, "all") == 0) { -+ inode = NULL; -+ type = RETURN_ALL; -+ } else { -+ rc = path_lookup(path, 0, &nd); -+ if (rc != 0) -+ return -ENOENT; 
-+ -+ /* -+ * XXX todo: add a RETURN_FSID scenario here...maybe if -+ * inode is a dir... -+ */ -+ -+ inode = nd.path.dentry->d_inode; -+ type = RETURN_FILE; -+ } -+ -+ if (len == 0) -+ len = NFS4_MAX_UINT64; -+ -+ rc = spnfs_layoutrecall(inode, type, offset, len); -+ -+ if (type != RETURN_ALL) -+ path_put(&nd.path); -+ return rc; -+} -+ -+int -+spnfs_getdeviceiter(struct super_block *sb, -+ u32 layout_type, -+ struct nfsd4_pnfs_dev_iter_res *gd_res) -+{ -+ struct spnfs *spnfs = global_spnfs; /* XXX keep up the pretence */ -+ struct spnfs_msg *im = NULL; -+ union spnfs_msg_res *res = NULL; -+ int status = 0; -+ -+ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); -+ if (im == NULL) { -+ status = -ENOMEM; -+ goto getdeviceiter_out; -+ } -+ -+ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); -+ if (res == NULL) { -+ status = -ENOMEM; -+ goto getdeviceiter_out; -+ } -+ -+ im->im_type = SPNFS_TYPE_GETDEVICEITER; -+ im->im_args.getdeviceiter_args.cookie = gd_res->gd_cookie; -+ im->im_args.getdeviceiter_args.verf = gd_res->gd_verf; -+ -+ /* call function to queue the msg for upcall */ -+ status = spnfs_upcall(spnfs, im, res); -+ if (status != 0) { -+ dprintk("%s spnfs upcall failure: %d\n", __func__, status); -+ status = -EIO; -+ goto getdeviceiter_out; -+ } -+ status = res->getdeviceiter_res.status; -+ -+ if (res->getdeviceiter_res.eof) -+ gd_res->gd_eof = 1; -+ else { -+ gd_res->gd_devid = res->getdeviceiter_res.devid; -+ gd_res->gd_cookie = res->getdeviceiter_res.cookie; -+ gd_res->gd_verf = res->getdeviceiter_res.verf; -+ gd_res->gd_eof = 0; -+ } -+ -+getdeviceiter_out: -+ kfree(im); -+ kfree(res); -+ -+ return status; -+} -+ -+#ifdef CONFIG_SPNFS_TEST -+/* -+ * Setup the rq_res xdr_buf. The svc_rqst rq_respages[1] page contains the -+ * 1024 encoded stripe indices. -+ * -+ * Skip the devaddr4 length and encode the indicies count (1024) in the -+ * rq_res.head and set the rq_res.head length. 
-+ * -+ * Set the rq_res page_len to 4096 (for the 1024 stripe indices). -+ * Set the rq_res xdr_buf tail base to rq_respages[0] just after the -+ * rq_res head to hold the rest of the getdeviceinfo return. -+ * -+ * So rq_respages[rq_resused - 1] contains the rq_res.head and rq_res.tail and -+ * rq_respages[rq_resused] contains the rq_res.pages. -+ */ -+static int spnfs_test_indices_xdr(struct pnfs_xdr_info *info, -+ const struct pnfs_filelayout_device *fdev) -+{ -+ struct nfsd4_compoundres *resp = info->resp; -+ struct svc_rqst *rqstp = resp->rqstp; -+ struct xdr_buf *xb = &resp->rqstp->rq_res; -+ __be32 *p; -+ -+ p = nfsd4_xdr_reserve_space(resp, 8); -+ p++; /* Fill in length later */ -+ *p++ = cpu_to_be32(fdev->fl_stripeindices_length); /* 1024 */ -+ resp->p = p; -+ -+ xb->head[0].iov_len = (char *)resp->p - (char *)xb->head[0].iov_base; -+ xb->pages = &rqstp->rq_respages[rqstp->rq_resused]; -+ xb->page_base = 0; -+ xb->page_len = PAGE_SIZE; /* page of 1024 encoded indices */ -+ xb->tail[0].iov_base = resp->p; -+ resp->end = xb->head[0].iov_base + PAGE_SIZE; -+ xb->tail[0].iov_len = (char *)resp->end - (char *)resp->p; -+ return 0; -+} -+/* -+ * Return a stripeindices of length 1024 to test -+ * the pNFS client multipage getdeviceinfo implementation. -+ * -+ * Encode a page of stripe indices. 
-+ */ -+static void spnfs_set_test_indices(struct pnfs_filelayout_device *fldev, -+ struct spnfs_device *dev, -+ struct pnfs_devinfo_arg *info) -+{ -+ struct svc_rqst *rqstp = info->xdr.resp->rqstp; -+ __be32 *p; -+ int i, j = 0; -+ -+ p = (__be32 *)page_address(rqstp->rq_respages[rqstp->rq_resused]); -+ fldev->fl_stripeindices_length = 1024; -+ /* round-robin the data servers device index into the stripe indicie */ -+ for (i = 0; i < 1024; i++) { -+ *p++ = cpu_to_be32(j); -+ if (j < dev->dscount - 1) -+ j++; -+ else -+ j = 0; -+ } -+ fldev->fl_stripeindices_list = NULL; -+} -+#endif /* CONFIG_SPNFS_TEST */ -+ -+int -+spnfs_getdeviceinfo(struct super_block *sb, struct exp_xdr_stream *xdr, -+ u32 layout_type, -+ const struct nfsd4_pnfs_deviceid *devid) -+{ -+ struct spnfs *spnfs = global_spnfs; -+ struct spnfs_msg *im = NULL; -+ union spnfs_msg_res *res = NULL; -+ struct spnfs_device *dev; -+ struct pnfs_filelayout_device *fldev = NULL; -+ struct pnfs_filelayout_multipath *mp = NULL; -+ struct pnfs_filelayout_devaddr *fldap = NULL; -+ int status = 0, i, len; -+ -+ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); -+ if (im == NULL) { -+ status = -ENOMEM; -+ goto getdeviceinfo_out; -+ } -+ -+ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); -+ if (res == NULL) { -+ status = -ENOMEM; -+ goto getdeviceinfo_out; -+ } -+ -+ im->im_type = SPNFS_TYPE_GETDEVICEINFO; -+ /* XXX FIX: figure out what to do about fsid */ -+ im->im_args.getdeviceinfo_args.devid = devid->devid; -+ -+ /* call function to queue the msg for upcall */ -+ status = spnfs_upcall(spnfs, im, res); -+ if (status != 0) { -+ dprintk("%s spnfs upcall failure: %d\n", __func__, status); -+ status = -EIO; -+ goto getdeviceinfo_out; -+ } -+ status = res->getdeviceinfo_res.status; -+ if (status != 0) -+ goto getdeviceinfo_out; -+ -+ dev = &res->getdeviceinfo_res.devinfo; -+ -+ /* Fill in the device data, i.e., nfs4_1_file_layout_ds_addr4 */ -+ fldev = kzalloc(sizeof(struct pnfs_filelayout_device), 
GFP_KERNEL); -+ if (fldev == NULL) { -+ status = -ENOMEM; -+ goto getdeviceinfo_out; -+ } -+ -+ /* -+ * Stripe count is the same as data server count for our purposes -+ */ -+ fldev->fl_stripeindices_length = dev->dscount; -+ fldev->fl_device_length = dev->dscount; -+ -+ /* Set stripe indices */ -+#ifdef CONFIG_SPNFS_TEST -+ spnfs_set_test_indices(fldev, dev, info); -+ fldev->fl_enc_stripe_indices = spnfs_test_indices_xdr; -+#else /* CONFIG_SPNFS_TEST */ -+ fldev->fl_stripeindices_list = -+ kmalloc(fldev->fl_stripeindices_length * sizeof(u32), -+ GFP_KERNEL); -+ if (fldev->fl_stripeindices_list == NULL) { -+ status = -ENOMEM; -+ goto getdeviceinfo_out; -+ } -+ for (i = 0; i < fldev->fl_stripeindices_length; i++) -+ fldev->fl_stripeindices_list[i] = i; -+#endif /* CONFIG_SPNFS_TEST */ -+ -+ /* -+ * Set the device's data server addresses No multipath for spnfs, -+ * so mp length is always 1. -+ * -+ */ -+ fldev->fl_device_list = -+ kmalloc(fldev->fl_device_length * -+ sizeof(struct pnfs_filelayout_multipath), -+ GFP_KERNEL); -+ if (fldev->fl_device_list == NULL) { -+ status = -ENOMEM; -+ goto getdeviceinfo_out; -+ } -+ for (i = 0; i < fldev->fl_device_length; i++) { -+ mp = &fldev->fl_device_list[i]; -+ mp->fl_multipath_length = 1; -+ mp->fl_multipath_list = -+ kmalloc(sizeof(struct pnfs_filelayout_devaddr), -+ GFP_KERNEL); -+ if (mp->fl_multipath_list == NULL) { -+ status = -ENOMEM; -+ goto getdeviceinfo_out; -+ } -+ fldap = mp->fl_multipath_list; -+ -+ /* -+ * Copy the netid into the device address, for example: "tcp" -+ */ -+ len = strlen(dev->dslist[i].netid); -+ fldap->r_netid.data = kmalloc(len, GFP_KERNEL); -+ if (fldap->r_netid.data == NULL) { -+ status = -ENOMEM; -+ goto getdeviceinfo_out; -+ } -+ memcpy(fldap->r_netid.data, dev->dslist[i].netid, len); -+ fldap->r_netid.len = len; -+ -+ /* -+ * Copy the network address into the device address, -+ * for example: "10.35.9.16.08.01" -+ */ -+ len = strlen(dev->dslist[i].addr); -+ fldap->r_addr.data = 
kmalloc(len, GFP_KERNEL); -+ if (fldap->r_addr.data == NULL) { -+ status = -ENOMEM; -+ goto getdeviceinfo_out; -+ } -+ memcpy(fldap->r_addr.data, dev->dslist[i].addr, len); -+ fldap->r_addr.len = len; -+ } -+ -+ /* encode the device data */ -+ status = filelayout_encode_devinfo(xdr, fldev); -+ -+getdeviceinfo_out: -+ if (fldev) { -+ kfree(fldev->fl_stripeindices_list); -+ if (fldev->fl_device_list) { -+ for (i = 0; i < fldev->fl_device_length; i++) { -+ fldap = -+ fldev->fl_device_list[i].fl_multipath_list; -+ kfree(fldap->r_netid.data); -+ kfree(fldap->r_addr.data); -+ kfree(fldap); -+ } -+ kfree(fldev->fl_device_list); -+ } -+ kfree(fldev); -+ } -+ -+ kfree(im); -+ kfree(res); -+ -+ return status; -+} -+ -+int -+spnfs_setattr(void) -+{ -+ return 0; -+} -+ -+int -+spnfs_open(struct inode *inode, struct nfsd4_open *open) -+{ -+ struct spnfs *spnfs = global_spnfs; /* keep up the pretence */ -+ struct spnfs_msg *im = NULL; -+ union spnfs_msg_res *res = NULL; -+ int status = 0; -+ -+ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); -+ if (im == NULL) { -+ status = -ENOMEM; -+ goto open_out; -+ } -+ -+ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); -+ if (res == NULL) { -+ status = -ENOMEM; -+ goto open_out; -+ } -+ -+ im->im_type = SPNFS_TYPE_OPEN; -+ im->im_args.open_args.inode = inode->i_ino; -+ im->im_args.open_args.generation = inode->i_generation; -+ im->im_args.open_args.create = open->op_create; -+ im->im_args.open_args.createmode = open->op_createmode; -+ im->im_args.open_args.truncate = open->op_truncate; -+ -+ /* call function to queue the msg for upcall */ -+ status = spnfs_upcall(spnfs, im, res); -+ if (status != 0) { -+ dprintk("%s spnfs upcall failure: %d\n", __func__, status); -+ status = -EIO; -+ goto open_out; -+ } -+ status = res->open_res.status; -+ -+open_out: -+ kfree(im); -+ kfree(res); -+ -+ return status; -+} -+ -+int -+spnfs_create(void) -+{ -+ return 0; -+} -+ -+/* -+ * Invokes the spnfsd with the inode number of the object to 
remove. -+ * The file has already been removed on the MDS, so all the spnsfd -+ * daemon does is remove the stripes. -+ * Returns 0 on success otherwise error code -+ */ -+int -+spnfs_remove(unsigned long ino, unsigned long generation) -+{ -+ struct spnfs *spnfs = global_spnfs; /* keep up the pretence */ -+ struct spnfs_msg *im = NULL; -+ union spnfs_msg_res *res = NULL; -+ int status = 0; -+ -+ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); -+ if (im == NULL) { -+ status = -ENOMEM; -+ goto remove_out; -+ } -+ -+ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); -+ if (res == NULL) { -+ status = -ENOMEM; -+ goto remove_out; -+ } -+ -+ im->im_type = SPNFS_TYPE_REMOVE; -+ im->im_args.remove_args.inode = ino; -+ im->im_args.remove_args.generation = generation; -+ -+ /* call function to queue the msg for upcall */ -+ status = spnfs_upcall(spnfs, im, res); -+ if (status != 0) { -+ dprintk("%s spnfs upcall failure: %d\n", __func__, status); -+ status = -EIO; -+ goto remove_out; -+ } -+ status = res->remove_res.status; -+ -+remove_out: -+ kfree(im); -+ kfree(res); -+ -+ return status; -+} -+ -+static int -+read_one(struct inode *inode, loff_t offset, size_t len, char *buf, -+ struct file **filp) -+{ -+ loff_t bufoffset = 0, soffset, pos, snum, soff, tmp; -+ size_t iolen; -+ int completed = 0, ds, err; -+ -+ while (len > 0) { -+ tmp = offset; -+ soff = do_div(tmp, spnfs_config->stripe_size); -+ snum = tmp; -+ ds = do_div(tmp, spnfs_config->num_ds); -+ if (spnfs_config->dense_striping == 0) -+ soffset = offset; -+ else { -+ tmp = snum; -+ do_div(tmp, spnfs_config->num_ds); -+ soffset = tmp * spnfs_config->stripe_size + soff; -+ } -+ if (len < spnfs_config->stripe_size - soff) -+ iolen = len; -+ else -+ iolen = spnfs_config->stripe_size - soff; -+ -+ pos = soffset; -+ err = vfs_read(filp[ds], buf + bufoffset, iolen, &pos); -+ if (err < 0) -+ return -EIO; -+ if (err == 0) -+ break; -+ filp[ds]->f_pos = pos; -+ iolen = err; -+ completed += iolen; -+ len -= iolen; -+ 
offset += iolen; -+ bufoffset += iolen; -+ } -+ -+ return completed; -+} -+ -+static __be32 -+read(struct inode *inode, loff_t offset, unsigned long *lenp, int vlen, -+ struct svc_rqst *rqstp) -+{ -+ int i, vnum, err, bytecount = 0; -+ char path[128]; -+ struct file *filp[SPNFS_MAX_DATA_SERVERS]; -+ size_t iolen; -+ __be32 status = nfs_ok; -+ -+ /* -+ * XXX We should just be doing this at open time, but it gets -+ * kind of messy storing this info in nfsd's state structures -+ * and piggybacking its path through the various state handling -+ * functions. Revisit this. -+ */ -+ memset(filp, 0, SPNFS_MAX_DATA_SERVERS * sizeof(struct file *)); -+ for (i = 0; i < spnfs_config->num_ds; i++) { -+ sprintf(path, "%s/%ld.%u", spnfs_config->ds_dir[i], -+ inode->i_ino, inode->i_generation); -+ filp[i] = filp_open(path, O_RDONLY | O_LARGEFILE, 0); -+ if (filp[i] == NULL) { -+ status = nfserr_io; -+ goto read_out; -+ } -+ get_file(filp[i]); -+ } -+ -+ for (vnum = 0 ; vnum < vlen ; vnum++) { -+ iolen = rqstp->rq_vec[vnum].iov_len; -+ err = read_one(inode, offset + bytecount, iolen, -+ (char *)rqstp->rq_vec[vnum].iov_base, filp); -+ if (err < 0) { -+ status = nfserr_io; -+ goto read_out; -+ } -+ if (err < iolen) { -+ bytecount += err; -+ goto read_out; -+ } -+ bytecount += rqstp->rq_vec[vnum].iov_len; -+ } -+ -+read_out: -+ *lenp = bytecount; -+ for (i = 0; i < spnfs_config->num_ds; i++) { -+ if (filp[i]) { -+ filp_close(filp[i], current->files); -+ fput(filp[i]); -+ } -+ } -+ return status; -+} -+ -+__be32 -+spnfs_read(struct inode *inode, loff_t offset, unsigned long *lenp, int vlen, -+ struct svc_rqst *rqstp) -+{ -+ if (spnfs_config) -+ return read(inode, offset, lenp, vlen, rqstp); -+ else { -+ printk(KERN_ERR "Please upgrade to latest spnfsd\n"); -+ return nfserr_notsupp; -+ } -+} -+ -+static int -+write_one(struct inode *inode, loff_t offset, size_t len, char *buf, -+ struct file **filp) -+{ -+ loff_t bufoffset = 0, soffset, pos, snum, soff, tmp; -+ size_t iolen; -+ int 
completed = 0, ds, err; -+ -+ while (len > 0) { -+ tmp = offset; -+ soff = do_div(tmp, spnfs_config->stripe_size); -+ snum = tmp; -+ ds = do_div(tmp, spnfs_config->num_ds); -+ if (spnfs_config->dense_striping == 0) -+ soffset = offset; -+ else { -+ tmp = snum; -+ do_div(tmp, spnfs_config->num_ds); -+ soffset = tmp * spnfs_config->stripe_size + soff; -+ } -+ if (len < spnfs_config->stripe_size - soff) -+ iolen = len; -+ else -+ iolen = spnfs_config->stripe_size - soff; -+ -+ pos = soffset; -+ err = vfs_write(filp[ds], buf + bufoffset, iolen, &pos); -+ if (err < 0) -+ return -EIO; -+ filp[ds]->f_pos = pos; -+ iolen = err; -+ completed += iolen; -+ len -= iolen; -+ offset += iolen; -+ bufoffset += iolen; -+ } -+ -+ return completed; -+} -+ -+static __be32 -+write(struct inode *inode, loff_t offset, size_t len, int vlen, -+ struct svc_rqst *rqstp) -+{ -+ int i, vnum, err, bytecount = 0; -+ char path[128]; -+ struct file *filp[SPNFS_MAX_DATA_SERVERS]; -+ size_t iolen; -+ __be32 status = nfs_ok; -+ -+ /* -+ * XXX We should just be doing this at open time, but it gets -+ * kind of messy storing this info in nfsd's state structures -+ * and piggybacking its path through the various state handling -+ * functions. Revisit this. 
-+ */ -+ memset(filp, 0, SPNFS_MAX_DATA_SERVERS * sizeof(struct file *)); -+ for (i = 0; i < spnfs_config->num_ds; i++) { -+ sprintf(path, "%s/%ld.%u", spnfs_config->ds_dir[i], -+ inode->i_ino, inode->i_generation); -+ filp[i] = filp_open(path, O_RDWR | O_LARGEFILE, 0); -+ if (filp[i] == NULL) { -+ status = nfserr_io; -+ goto write_out; -+ } -+ get_file(filp[i]); -+ } -+ -+ for (vnum = 0; vnum < vlen; vnum++) { -+ iolen = rqstp->rq_vec[vnum].iov_len; -+ err = write_one(inode, offset + bytecount, iolen, -+ (char *)rqstp->rq_vec[vnum].iov_base, filp); -+ if (err != iolen) { -+ dprintk("spnfs_write: err=%d expected %Zd\n", err, len); -+ status = nfserr_io; -+ goto write_out; -+ } -+ bytecount += rqstp->rq_vec[vnum].iov_len; -+ } -+ -+write_out: -+ for (i = 0; i < spnfs_config->num_ds; i++) { -+ if (filp[i]) { -+ filp_close(filp[i], current->files); -+ fput(filp[i]); -+ } -+ } -+ -+ return status; -+} -+ -+__be32 -+spnfs_write(struct inode *inode, loff_t offset, size_t len, int vlen, -+ struct svc_rqst *rqstp) -+{ -+ if (spnfs_config) -+ return write(inode, offset, len, vlen, rqstp); -+ else { -+ printk(KERN_ERR "Please upgrade to latest spnfsd\n"); -+ return nfserr_notsupp; -+ } -+} -+ -+int -+spnfs_commit(void) -+{ -+ return 0; -+} -+ -+/* -+ * Return the state for this object. 
-+ * At this time simply return 0 to indicate success and use the existing state -+ */ -+int -+spnfs_get_state(struct inode *inode, struct knfsd_fh *fh, struct pnfs_get_state *arg) -+{ -+ return 0; -+} -+ -+/* -+ * Return the filehandle for the specified file descriptor -+ */ -+int -+spnfs_getfh(int fd, struct nfs_fh *fh) -+{ -+ struct file *file; -+ -+ file = fget(fd); -+ if (file == NULL) -+ return -EIO; -+ -+ memcpy(fh, NFS_FH(file->f_dentry->d_inode), sizeof(struct nfs_fh)); -+ fput(file); -+ return 0; -+} -diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h -index 322518c..2536d06 100644 ---- a/fs/nfsd/state.h -+++ b/fs/nfsd/state.h -@@ -241,6 +241,12 @@ struct nfs4_client { - u32 cl_cb_seq_nr; - struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ - /* wait here for slots */ -+#if defined(CONFIG_PNFSD) -+ struct list_head cl_layouts; /* outstanding layouts */ -+ struct list_head cl_layoutrecalls; /* outstanding layoutrecall -+ callbacks */ -+ atomic_t cl_deviceref; /* Num outstanding devs */ -+#endif /* CONFIG_PNFSD */ - }; - - static inline void -@@ -357,6 +363,14 @@ struct nfs4_file { - u32 fi_id; /* used with stateowner->so_id - * for stateid_hashtbl hash */ - bool fi_had_conflict; -+#if defined(CONFIG_PNFSD) -+ struct list_head fi_layouts; -+ struct list_head fi_layout_states; -+ /* used by layoutget / layoutrecall */ -+ struct nfs4_fsid fi_fsid; -+ u32 fi_fhlen; -+ u8 fi_fhval[NFS4_FHSIZE]; -+#endif /* CONFIG_PNFSD */ - }; - - /* XXX: for first cut may fall back on returning file that doesn't work -@@ -385,6 +399,15 @@ static inline struct file *find_any_file(struct nfs4_file *f) - return f->fi_fds[O_RDONLY]; - } - -+#if defined(CONFIG_PNFSD) -+/* pNFS Metadata server state */ -+ -+struct pnfs_ds_dev_entry { -+ struct list_head dd_dev_entry; /* st_pnfs_ds_id entry */ -+ u32 dd_dsid; -+}; -+#endif /* CONFIG_PNFSD */ -+ - /* - * nfs4_stateid can either be an open stateid or (eventually) a lock stateid - * -@@ -407,6 +430,9 @@ struct nfs4_stateid { - 
struct list_head st_perfile; - struct list_head st_perstateowner; - struct list_head st_lockowners; -+#if defined(CONFIG_PNFSD) -+ struct list_head st_pnfs_ds_id; -+#endif /* CONFIG_PNFSD */ - struct nfs4_stateowner * st_stateowner; - struct nfs4_file * st_file; - stateid_t st_stateid; -@@ -457,6 +483,34 @@ extern void nfsd4_recdir_purge_old(void); - extern int nfsd4_create_clid_dir(struct nfs4_client *clp); - extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); - extern void release_session_client(struct nfsd4_session *); -+extern void nfsd4_free_slab(struct kmem_cache **); -+extern struct nfs4_file *find_file(struct inode *); -+extern struct nfs4_file *find_alloc_file(struct inode *, struct svc_fh *); -+extern void put_nfs4_file(struct nfs4_file *); -+extern void get_nfs4_file(struct nfs4_file *); -+extern struct nfs4_client *find_confirmed_client(clientid_t *); -+extern struct nfs4_stateid *find_stateid(stateid_t *, int flags); -+extern struct nfs4_delegation *find_delegation_stateid(struct inode *, stateid_t *); -+extern __be32 nfs4_check_stateid(stateid_t *); -+extern void expire_client_lock(struct nfs4_client *); -+extern int filter_confirmed_clients(int (* func)(struct nfs4_client *, void *), void *); -+ -+#if defined(CONFIG_PNFSD) -+extern int nfsd4_init_pnfs_slabs(void); -+extern void nfsd4_free_pnfs_slabs(void); -+extern void pnfs_expire_client(struct nfs4_client *); -+extern void release_pnfs_ds_dev_list(struct nfs4_stateid *); -+extern void nfs4_pnfs_state_init(void); -+extern void nfs4_pnfs_state_shutdown(void); -+extern void nfs4_ds_get_verifier(stateid_t *, struct super_block *, u32 *); -+extern int nfs4_preprocess_pnfs_ds_stateid(struct svc_fh *, stateid_t *); -+#else /* CONFIG_PNFSD */ -+static inline void nfsd4_free_pnfs_slabs(void) {} -+static inline int nfsd4_init_pnfs_slabs(void) { return 0; } -+static inline void pnfs_expire_client(struct nfs4_client *clp) {} -+static inline void release_pnfs_ds_dev_list(struct nfs4_stateid *stp) {} 
-+static inline void nfs4_pnfs_state_shutdown(void) {} -+#endif /* CONFIG_PNFSD */ - - static inline void - nfs4_put_stateowner(struct nfs4_stateowner *so) -@@ -470,4 +524,24 @@ nfs4_get_stateowner(struct nfs4_stateowner *so) - kref_get(&so->so_ref); - } - -+static inline u64 -+end_offset(u64 start, u64 len) -+{ -+ u64 end; -+ -+ end = start + len; -+ return end >= start ? end : NFS4_MAX_UINT64; -+} -+ -+/* last octet in a range */ -+static inline u64 -+last_byte_offset(u64 start, u64 len) -+{ -+ u64 end; -+ -+ BUG_ON(!len); -+ end = start + len; -+ return end > start ? end - 1 : NFS4_MAX_UINT64; -+} -+ - #endif /* NFSD4_STATE_H */ -diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c -index 661a6cf..ed3a1b7 100644 ---- a/fs/nfsd/vfs.c -+++ b/fs/nfsd/vfs.c -@@ -37,7 +37,12 @@ - #ifdef CONFIG_NFSD_V4 - #include - #include -+#include -+#include - #endif /* CONFIG_NFSD_V4 */ -+#if defined(CONFIG_SPNFS_BLOCK) -+#include -+#endif - - #include "nfsd.h" - #include "vfs.h" -@@ -383,6 +388,12 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, - NFSD_MAY_TRUNC|NFSD_MAY_OWNER_OVERRIDE); - if (err) - goto out; -+#if defined(CONFIG_SPNFS_BLOCK) -+ if (pnfs_block_enabled(inode, 0)) { -+ err = bl_layoutrecall(inode, RETURN_FILE, -+ iap->ia_size, inode->i_size - iap->ia_size); -+ } -+#endif /* CONFIG_SPNFS_BLOCK */ - } - - /* -@@ -1716,6 +1727,11 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, - struct inode *fdir, *tdir; - __be32 err; - int host_err; -+#ifdef CONFIG_SPNFS -+ unsigned long ino = 0; -+ unsigned long generation = 0; -+ unsigned int nlink = 0; -+#endif /* CONFIG_SPNFS */ - - err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE); - if (err) -@@ -1779,7 +1795,26 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, - if (host_err) - goto out_dput_new; - -+#ifdef CONFIG_SPNFS -+ /* -+ * if the target is a preexisting regular file, remember the -+ * inode number and generation so we can 
delete the stripes; -+ * save the link count as well so that the stripes only get -+ * get deleted when the last link is deleted -+ */ -+ if (ndentry && ndentry->d_inode && S_ISREG(ndentry->d_inode->i_mode)) { -+ ino = ndentry->d_inode->i_ino; -+ generation = ndentry->d_inode->i_generation; -+ nlink = ndentry->d_inode->i_nlink; -+ } -+#endif /* CONFIG_SPNFS */ -+ - host_err = vfs_rename(fdir, odentry, tdir, ndentry); -+#ifdef CONFIG_SPNFS -+ if (spnfs_enabled() && (!host_err && ino && nlink == 1)) -+ spnfs_remove(ino, generation); -+#endif /* CONFIG_SPNFS */ -+ - if (!host_err) { - host_err = commit_metadata(tfhp); - if (!host_err) -@@ -1820,6 +1855,11 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, - struct inode *dirp; - __be32 err; - int host_err; -+#if defined(CONFIG_SPNFS) -+ unsigned long ino; -+ unsigned long generation; -+ unsigned int nlink; -+#endif /* defined(CONFIG_SPNFS) */ - - err = nfserr_acces; - if (!flen || isdotent(fname, flen)) -@@ -1843,6 +1883,17 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, - goto out; - } - -+#if defined(CONFIG_SPNFS) -+ /* -+ * Remember the inode number to communicate to the spnfsd -+ * for removal of stripes; save the link count as well so that -+ * the stripes only get get deleted when the last link is deleted -+ */ -+ ino = rdentry->d_inode->i_ino; -+ generation = rdentry->d_inode->i_generation; -+ nlink = rdentry->d_inode->i_nlink; -+#endif /* defined(CONFIG_SPNFS) */ -+ - if (!type) - type = rdentry->d_inode->i_mode & S_IFMT; - -@@ -1867,6 +1918,29 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, - if (!host_err) - host_err = commit_metadata(fhp); - -+#if defined(CONFIG_SPNFS) -+ /* -+ * spnfs: notify spnfsd of removal to destroy stripes -+ */ -+/* -+ sb = current_fh->fh_dentry->d_inode->i_sb; -+ if (sb->s_export_op->spnfs_remove) { -+*/ -+ dprintk("%s check if spnfs_enabled\n", __FUNCTION__); -+ if (spnfs_enabled() && nlink == 1) { -+ BUG_ON(ino == 0); -+ 
dprintk("%s calling spnfs_remove inumber=%ld\n", -+ __FUNCTION__, ino); -+ if (spnfs_remove(ino, generation) == 0) { -+ dprintk("%s spnfs_remove success\n", __FUNCTION__); -+ } else { -+ /* XXX How do we make this atomic? */ -+ printk(KERN_WARNING "nfsd: pNFS could not " -+ "remove stripes for inode: %ld\n", ino); -+ } -+ } -+#endif /* defined(CONFIG_SPNFS) */ -+ - mnt_drop_write(fhp->fh_export->ex_path.mnt); - out_nfserr: - err = nfserrno(host_err); -diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h -index 4d476ff..4cc7558 100644 ---- a/fs/nfsd/xdr4.h -+++ b/fs/nfsd/xdr4.h -@@ -37,6 +37,8 @@ - #ifndef _LINUX_NFSD_XDR4_H - #define _LINUX_NFSD_XDR4_H - -+#include -+ - #include "state.h" - #include "nfsd.h" - -@@ -385,6 +387,51 @@ struct nfsd4_reclaim_complete { - u32 rca_one_fs; - }; - -+struct nfsd4_pnfs_getdevinfo { -+ struct nfsd4_pnfs_deviceid gd_devid; /* request */ -+ u32 gd_layout_type; /* request */ -+ u32 gd_maxcount; /* request */ -+ u32 gd_notify_types;/* request */ -+ struct super_block *gd_sb; -+}; -+ -+struct nfsd4_pnfs_getdevlist { -+ u32 gd_layout_type; /* request */ -+ u32 gd_maxdevices; /* request */ -+ u64 gd_cookie; /* request - response */ -+ u64 gd_verf; /* request - response */ -+ struct svc_fh *gd_fhp; /* response */ -+ u32 gd_eof; /* response */ -+}; -+ -+struct nfsd4_pnfs_layoutget { -+ u64 lg_minlength; /* request */ -+ u32 lg_signal; /* request */ -+ u32 lg_maxcount; /* request */ -+ struct svc_fh *lg_fhp; /* request */ -+ stateid_t lg_sid; /* request/response */ -+ struct nfsd4_layout_seg lg_seg; /* request/response */ -+ u32 lg_roc; /* response */ -+}; -+ -+struct nfsd4_pnfs_layoutcommit { -+ struct nfsd4_pnfs_layoutcommit_arg args; -+ stateid_t lc_sid; /* request */ -+ struct nfsd4_pnfs_layoutcommit_res res; -+}; -+ -+enum layoutreturn_flags { -+ LR_FLAG_INTERN = 1 << 0, /* internal return */ -+ LR_FLAG_EXPIRE = 1 << 1, /* return on client expiration */ -+}; -+ -+struct nfsd4_pnfs_layoutreturn { -+ struct nfsd4_pnfs_layoutreturn_arg args; 
-+ u32 lr_flags; -+ stateid_t lr_sid; /* request/resopnse */ -+ u32 lrs_present; /* response */ -+}; -+ - struct nfsd4_op { - int opnum; - __be32 status; -@@ -426,6 +473,13 @@ struct nfsd4_op { - struct nfsd4_destroy_session destroy_session; - struct nfsd4_sequence sequence; - struct nfsd4_reclaim_complete reclaim_complete; -+#if defined(CONFIG_PNFSD) -+ struct nfsd4_pnfs_getdevlist pnfs_getdevlist; -+ struct nfsd4_pnfs_getdevinfo pnfs_getdevinfo; -+ struct nfsd4_pnfs_layoutget pnfs_layoutget; -+ struct nfsd4_pnfs_layoutcommit pnfs_layoutcommit; -+ struct nfsd4_pnfs_layoutreturn pnfs_layoutreturn; -+#endif /* CONFIG_PNFSD */ - } u; - struct nfs4_replay * replay; - }; -diff --git a/include/linux/exp_xdr.h b/include/linux/exp_xdr.h -new file mode 100644 -index 0000000..b69c309 ---- /dev/null -+++ b/include/linux/exp_xdr.h -@@ -0,0 +1,141 @@ -+#ifndef _LINUX_EXP_XDR_H -+#define _LINUX_EXP_XDR_H -+ -+#include -+#include -+#include -+ -+struct exp_xdr_stream { -+ __be32 *p; -+ __be32 *end; -+}; -+ -+/** -+ * exp_xdr_qwords - Calculate the number of quad-words holding nbytes -+ * @nbytes: number of bytes to encode -+ */ -+static inline size_t -+exp_xdr_qwords(__u32 nbytes) -+{ -+ return DIV_ROUND_UP(nbytes, 4); -+} -+ -+/** -+ * exp_xdr_qbytes - Calculate the number of bytes holding qwords -+ * @qwords: number of quad-words to encode -+ */ -+static inline size_t -+exp_xdr_qbytes(size_t qwords) -+{ -+ return qwords << 2; -+} -+ -+/** -+ * exp_xdr_reserve_space - Reserve buffer space for sending -+ * @xdr: pointer to exp_xdr_stream -+ * @nbytes: number of bytes to reserve -+ * -+ * Checks that we have enough buffer space to encode 'nbytes' more -+ * bytes of data. If so, update the xdr stream. 
-+ */ -+static inline __be32 * -+exp_xdr_reserve_space(struct exp_xdr_stream *xdr, size_t nbytes) -+{ -+ __be32 *p = xdr->p; -+ __be32 *q; -+ -+ /* align nbytes on the next 32-bit boundary */ -+ q = p + exp_xdr_qwords(nbytes); -+ if (unlikely(q > xdr->end || q < p)) -+ return NULL; -+ xdr->p = q; -+ return p; -+} -+ -+/** -+ * exp_xdr_reserve_qwords - Reserve buffer space for sending -+ * @xdr: pointer to exp_xdr_stream -+ * @nwords: number of quad words (u32's) to reserve -+ */ -+static inline __be32 * -+exp_xdr_reserve_qwords(struct exp_xdr_stream *xdr, size_t qwords) -+{ -+ return exp_xdr_reserve_space(xdr, exp_xdr_qbytes(qwords)); -+} -+ -+/** -+ * exp_xdr_encode_u32 - Encode an unsigned 32-bit value onto a xdr stream -+ * @p: pointer to encoding destination -+ * @val: value to encode -+ */ -+static inline __be32 * -+exp_xdr_encode_u32(__be32 *p, __u32 val) -+{ -+ *p = cpu_to_be32(val); -+ return p + 1; -+} -+ -+/** -+ * exp_xdr_encode_u64 - Encode an unsigned 64-bit value onto a xdr stream -+ * @p: pointer to encoding destination -+ * @val: value to encode -+ */ -+static inline __be32 * -+exp_xdr_encode_u64(__be32 *p, __u64 val) -+{ -+ put_unaligned_be64(val, p); -+ return p + 2; -+} -+ -+/** -+ * exp_xdr_encode_bytes - Encode an array of bytes onto a xdr stream -+ * @p: pointer to encoding destination -+ * @ptr: pointer to the array of bytes -+ * @nbytes: number of bytes to encode -+ */ -+static inline __be32 * -+exp_xdr_encode_bytes(__be32 *p, const void *ptr, __u32 nbytes) -+{ -+ if (likely(nbytes != 0)) { -+ unsigned int qwords = exp_xdr_qwords(nbytes); -+ unsigned int padding = exp_xdr_qbytes(qwords) - nbytes; -+ -+ memcpy(p, ptr, nbytes); -+ if (padding != 0) -+ memset((char *)p + nbytes, 0, padding); -+ p += qwords; -+ } -+ return p; -+} -+ -+/** -+ * exp_xdr_encode_opaque - Encode an opaque type onto a xdr stream -+ * @p: pointer to encoding destination -+ * @ptr: pointer to the opaque array -+ * @nbytes: number of bytes to encode -+ * -+ * Encodes the 
32-bit opaque size in bytes followed by the opaque value. -+ */ -+static inline __be32 * -+exp_xdr_encode_opaque(__be32 *p, const void *ptr, __u32 nbytes) -+{ -+ p = exp_xdr_encode_u32(p, nbytes); -+ return exp_xdr_encode_bytes(p, ptr, nbytes); -+} -+ -+/** -+ * exp_xdr_encode_opaque_qlen - Encode the opaque length onto a xdr stream -+ * @lenp: pointer to the opaque length destination -+ * @endp: pointer to the end of the opaque array -+ * -+ * Encodes the 32-bit opaque size in bytes given the start and end pointers -+ */ -+static inline __be32 * -+exp_xdr_encode_opaque_len(__be32 *lenp, const void *endp) -+{ -+ size_t nbytes = (char *)endp - (char *)(lenp + 1); -+ -+ exp_xdr_encode_u32(lenp, nbytes); -+ return lenp + 1 + exp_xdr_qwords(nbytes); -+} -+#endif /* _LINUX_EXP_XDR_H */ -diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h -index a9cd507..225560d 100644 ---- a/include/linux/exportfs.h -+++ b/include/linux/exportfs.h -@@ -2,6 +2,7 @@ - #define LINUX_EXPORTFS_H 1 - - #include -+#include - - struct dentry; - struct inode; -@@ -175,4 +176,62 @@ extern struct dentry *generic_fh_to_parent(struct super_block *sb, - struct fid *fid, int fh_len, int fh_type, - struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen)); - -+#if defined(CONFIG_EXPORTFS_FILE_LAYOUT) -+struct pnfs_filelayout_device; -+struct pnfs_filelayout_layout; -+ -+extern int filelayout_encode_devinfo(struct exp_xdr_stream *xdr, -+ const struct pnfs_filelayout_device *fdev); -+extern enum nfsstat4 filelayout_encode_layout(struct exp_xdr_stream *xdr, -+ const struct pnfs_filelayout_layout *flp); -+#endif /* defined(CONFIG_EXPORTFS_FILE_LAYOUT) */ -+ -+#if defined(CONFIG_EXPORTFS_FILE_LAYOUT) -+struct list_head; -+ -+extern int blocklayout_encode_devinfo(struct exp_xdr_stream *xdr, -+ const struct list_head *volumes); -+ -+extern enum nfsstat4 blocklayout_encode_layout(struct exp_xdr_stream *xdr, -+ const struct list_head *layouts); -+#endif /* 
defined(CONFIG_EXPORTFS_FILE_LAYOUT) */ -+ -+#if defined(CONFIG_PNFSD) -+#include -+ -+struct pnfsd_cb_operations; -+ -+struct pnfsd_cb_ctl { -+ spinlock_t lock; -+ struct module *module; -+ const struct pnfsd_cb_operations *cb_op; -+}; -+ -+/* in expfs.c so that file systems can depend on it */ -+extern struct pnfsd_cb_ctl pnfsd_cb_ctl; -+ -+static inline int -+pnfsd_get_cb_op(struct pnfsd_cb_ctl *ctl) -+{ -+ int ret = -ENOENT; -+ -+ spin_lock(&pnfsd_cb_ctl.lock); -+ if (!pnfsd_cb_ctl.cb_op) -+ goto out; -+ if (!try_module_get(pnfsd_cb_ctl.module)) -+ goto out; -+ ctl->cb_op = pnfsd_cb_ctl.cb_op; -+ ctl->module = pnfsd_cb_ctl.module; -+ ret = 0; -+out: -+ spin_unlock(&pnfsd_cb_ctl.lock); -+ return ret; -+} -+ -+static inline void -+pnfsd_put_cb_op(struct pnfsd_cb_ctl *ctl) -+{ -+ module_put(ctl->module); -+} -+#endif /* CONFIG_PNFSD */ - #endif /* LINUX_EXPORTFS_H */ -diff --git a/include/linux/fs.h b/include/linux/fs.h -index 63d069b..3a8601a 100644 ---- a/include/linux/fs.h -+++ b/include/linux/fs.h -@@ -388,6 +388,7 @@ struct inodes_stat_t { - #include - - struct export_operations; -+struct pnfs_export_operations; - struct hd_geometry; - struct iovec; - struct nameidata; -@@ -1327,6 +1328,7 @@ struct super_block { - const struct dquot_operations *dq_op; - const struct quotactl_ops *s_qcop; - const struct export_operations *s_export_op; -+ const struct pnfs_export_operations *s_pnfs_op; - unsigned long s_flags; - unsigned long s_magic; - struct dentry *s_root; -diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h -index 07e40c6..df29296 100644 ---- a/include/linux/nfs4.h -+++ b/include/linux/nfs4.h -@@ -17,7 +17,10 @@ - - #define NFS4_BITMAP_SIZE 2 - #define NFS4_VERIFIER_SIZE 8 --#define NFS4_STATEID_SIZE 16 -+#define NFS4_CLIENTID_SIZE 8 -+#define NFS4_STATEID_SEQID_SIZE 4 -+#define NFS4_STATEID_OTHER_SIZE 12 -+#define NFS4_STATEID_SIZE (NFS4_STATEID_SEQID_SIZE + NFS4_STATEID_OTHER_SIZE) - #define NFS4_FHSIZE 128 - #define NFS4_MAXPATHLEN PATH_MAX - 
#define NFS4_MAXNAMLEN NAME_MAX -@@ -119,6 +122,13 @@ - #define EXCHGID4_FLAG_MASK_A 0x40070003 - #define EXCHGID4_FLAG_MASK_R 0x80070003 - -+static inline bool -+is_ds_only_session(u32 exchange_flags) -+{ -+ u32 mask = EXCHGID4_FLAG_USE_PNFS_DS | EXCHGID4_FLAG_USE_PNFS_MDS; -+ return (exchange_flags & mask) == EXCHGID4_FLAG_USE_PNFS_DS; -+} -+ - #define SEQ4_STATUS_CB_PATH_DOWN 0x00000001 - #define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRING 0x00000002 - #define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRED 0x00000004 -@@ -166,8 +176,23 @@ struct nfs4_acl { - struct nfs4_ace aces[0]; - }; - -+struct nfs4_fsid { -+ u64 major; -+ u64 minor; -+}; -+ - typedef struct { char data[NFS4_VERIFIER_SIZE]; } nfs4_verifier; --typedef struct { char data[NFS4_STATEID_SIZE]; } nfs4_stateid; -+typedef struct { char data[NFS4_CLIENTID_SIZE]; } nfs4_clientid; -+ -+struct nfs41_stateid { -+ __be32 seqid; -+ char other[NFS4_STATEID_OTHER_SIZE]; -+} __attribute__ ((packed)); -+ -+typedef union { -+ char data[NFS4_STATEID_SIZE]; -+ struct nfs41_stateid stateid; -+} nfs4_stateid; - - enum nfs_opnum4 { - OP_ACCESS = 3, -@@ -471,6 +496,8 @@ enum lock_type4 { - #define FATTR4_WORD1_TIME_MODIFY (1UL << 21) - #define FATTR4_WORD1_TIME_MODIFY_SET (1UL << 22) - #define FATTR4_WORD1_MOUNTED_ON_FILEID (1UL << 23) -+#define FATTR4_WORD1_FS_LAYOUT_TYPES (1UL << 30) -+#define FATTR4_WORD2_LAYOUT_BLKSIZE (1UL << 1) - - #define NFSPROC4_NULL 0 - #define NFSPROC4_COMPOUND 1 -@@ -532,6 +559,13 @@ enum { - NFSPROC4_CLNT_SEQUENCE, - NFSPROC4_CLNT_GET_LEASE_TIME, - NFSPROC4_CLNT_RECLAIM_COMPLETE, -+ NFSPROC4_CLNT_LAYOUTGET, -+ NFSPROC4_CLNT_LAYOUTCOMMIT, -+ NFSPROC4_CLNT_LAYOUTRETURN, -+ NFSPROC4_CLNT_GETDEVICELIST, -+ NFSPROC4_CLNT_GETDEVICEINFO, -+ NFSPROC4_CLNT_PNFS_WRITE, -+ NFSPROC4_CLNT_PNFS_COMMIT, - }; - - /* nfs41 types */ -@@ -550,6 +584,51 @@ enum state_protect_how4 { - SP4_SSV = 2 - }; - -+enum pnfs_layouttype { -+ LAYOUT_NFSV4_1_FILES = 1, -+ LAYOUT_OSD2_OBJECTS = 2, -+ LAYOUT_BLOCK_VOLUME = 3, -+ -+ 
NFS4_PNFS_PRIVATE_LAYOUT = 0x80000000 -+}; -+ -+/* used for both layout return and recall */ -+enum pnfs_layoutreturn_type { -+ RETURN_FILE = 1, -+ RETURN_FSID = 2, -+ RETURN_ALL = 3 -+}; -+ -+enum pnfs_iomode { -+ IOMODE_READ = 1, -+ IOMODE_RW = 2, -+ IOMODE_ANY = 3, -+}; -+ -+enum pnfs_notify_deviceid_type4 { -+ NOTIFY_DEVICEID4_CHANGE = 1 << 1, -+ NOTIFY_DEVICEID4_DELETE = 1 << 2, -+}; -+ -+#define NFL4_UFLG_MASK 0x0000003F -+#define NFL4_UFLG_DENSE 0x00000001 -+#define NFL4_UFLG_COMMIT_THRU_MDS 0x00000002 -+#define NFL4_UFLG_STRIPE_UNIT_SIZE_MASK 0xFFFFFFC0 -+ -+/* Encoded in the loh_body field of type layouthint4 */ -+enum filelayout_hint_care4 { -+ NFLH4_CARE_DENSE = NFL4_UFLG_DENSE, -+ NFLH4_CARE_COMMIT_THRU_MDS = NFL4_UFLG_COMMIT_THRU_MDS, -+ NFLH4_CARE_STRIPE_UNIT_SIZE = 0x00000040, -+ NFLH4_CARE_STRIPE_COUNT = 0x00000080 -+}; -+ -+#define NFS4_DEVICEID4_SIZE 16 -+ -+struct nfs4_deviceid { -+ char data[NFS4_DEVICEID4_SIZE]; -+}; -+ - #endif - #endif - -diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h -index 508f8cf..27c45cc 100644 ---- a/include/linux/nfs_fs.h -+++ b/include/linux/nfs_fs.h -@@ -188,6 +188,10 @@ struct nfs_inode { - struct nfs_delegation *delegation; - fmode_t delegation_state; - struct rw_semaphore rwsem; -+ -+ /* pNFS layout information */ -+ struct rpc_wait_queue lo_rpcwaitq; -+ struct pnfs_layout_hdr *layout; - #endif /* CONFIG_NFS_V4*/ - #ifdef CONFIG_NFS_FSCACHE - struct fscache_cookie *fscache; -@@ -490,8 +494,12 @@ extern void nfs_unblock_sillyrename(struct dentry *dentry); - extern int nfs_congestion_kb; - extern int nfs_writepage(struct page *page, struct writeback_control *wbc); - extern int nfs_writepages(struct address_space *, struct writeback_control *); --extern int nfs_flush_incompatible(struct file *file, struct page *page); --extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int); -+struct pnfs_layout_segment; -+extern int nfs_flush_incompatible(struct file *file, struct page 
*page, -+ struct pnfs_layout_segment *lseg); -+extern int nfs_updatepage(struct file *, struct page *, -+ unsigned int offset, unsigned int count, -+ struct pnfs_layout_segment *lseg, void *fsdata); - extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *); - - /* -@@ -613,6 +621,8 @@ extern void * nfs_root_data(void); - #define NFSDBG_CLIENT 0x0200 - #define NFSDBG_MOUNT 0x0400 - #define NFSDBG_FSCACHE 0x0800 -+#define NFSDBG_PNFS 0x1000 -+#define NFSDBG_PNFS_LD 0x2000 - #define NFSDBG_ALL 0xFFFF - - #ifdef __KERNEL__ -diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h -index c82ee7c..81121d0 100644 ---- a/include/linux/nfs_fs_sb.h -+++ b/include/linux/nfs_fs_sb.h -@@ -82,6 +82,8 @@ struct nfs_client { - /* The flags used for obtaining the clientid during EXCHANGE_ID */ - u32 cl_exchange_flags; - struct nfs4_session *cl_session; /* sharred session */ -+ struct list_head cl_layouts; -+ struct pnfs_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */ - #endif /* CONFIG_NFS_V4_1 */ - - #ifdef CONFIG_NFS_FSCACHE -@@ -89,6 +91,16 @@ struct nfs_client { - #endif - }; - -+static inline bool -+is_ds_only_client(struct nfs_client *clp) -+{ -+#ifdef CONFIG_NFS_V4_1 -+ return is_ds_only_session(clp->cl_exchange_flags); -+#else -+ return false; -+#endif -+} -+ - /* - * NFS client parameters stored in the superblock. 
- */ -@@ -133,7 +145,7 @@ struct nfs_server { - #endif - - #ifdef CONFIG_NFS_V4 -- u32 attr_bitmask[2];/* V4 bitmask representing the set -+ u32 attr_bitmask[3];/* V4 bitmask representing the set - of attributes supported on this - filesystem */ - u32 cache_consistency_bitmask[2]; -@@ -144,6 +156,11 @@ struct nfs_server { - u32 acl_bitmask; /* V4 bitmask representing the ACEs - that are supported on this - filesystem */ -+ struct pnfs_layoutdriver_type *pnfs_curr_ld; /* Active layout driver */ -+ void *pnfs_ld_data; /* Per-mount data */ -+ unsigned int ds_rsize; /* Data server read size */ -+ unsigned int ds_wsize; /* Data server write size */ -+ u32 pnfs_blksize; /* layout_blksize attr */ - #endif - void (*destroy)(struct nfs_server *); - -diff --git a/include/linux/nfs_iostat.h b/include/linux/nfs_iostat.h -index 68b10f5..f9b5f44 100644 ---- a/include/linux/nfs_iostat.h -+++ b/include/linux/nfs_iostat.h -@@ -113,6 +113,9 @@ enum nfs_stat_eventcounters { - NFSIOS_SHORTREAD, - NFSIOS_SHORTWRITE, - NFSIOS_DELAY, -+ NFSIOS_PNFS_READ, -+ NFSIOS_PNFS_WRITE, -+ NFSIOS_PNFS_COMMIT, - __NFSIOS_COUNTSMAX, - }; - -diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h -index f8b60e7..6fa43c7 100644 ---- a/include/linux/nfs_page.h -+++ b/include/linux/nfs_page.h -@@ -48,6 +48,7 @@ struct nfs_page { - struct kref wb_kref; /* reference count */ - unsigned long wb_flags; - struct nfs_writeverf wb_verf; /* Commit cookie */ -+ struct pnfs_layout_segment *wb_lseg; /* Pnfs layout info */ - }; - - struct nfs_pageio_descriptor { -@@ -61,6 +62,11 @@ struct nfs_pageio_descriptor { - int (*pg_doio)(struct inode *, struct list_head *, unsigned int, size_t, int); - int pg_ioflags; - int pg_error; -+ struct pnfs_layout_segment *pg_lseg; -+#ifdef CONFIG_NFS_V4_1 -+ int pg_iswrite; -+ int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); -+#endif /* CONFIG_NFS_V4_1 */ - }; - - #define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags)) -@@ 
-69,13 +75,15 @@ extern struct nfs_page *nfs_create_request(struct nfs_open_context *ctx, - struct inode *inode, - struct page *page, - unsigned int offset, -- unsigned int count); -+ unsigned int count, -+ struct pnfs_layout_segment *lseg); - extern void nfs_clear_request(struct nfs_page *req); - extern void nfs_release_request(struct nfs_page *req); - - - extern int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *dst, -- pgoff_t idx_start, unsigned int npages, int tag); -+ pgoff_t idx_start, unsigned int npages, int tag, -+ int *use_pnfs); - extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc, - struct inode *inode, - int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int), -diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h -index fc46192..63c0301 100644 ---- a/include/linux/nfs_xdr.h -+++ b/include/linux/nfs_xdr.h -@@ -3,6 +3,8 @@ - - #include - #include -+#include -+#include - - /* - * To change the maximum rsize and wsize supported by the NFS client, adjust -@@ -10,7 +12,7 @@ - * support a megabyte or more. The default is left at 4096 bytes, which is - * reasonable for NFS over UDP. - */ --#define NFS_MAX_FILE_IO_SIZE (1048576U) -+#define NFS_MAX_FILE_IO_SIZE (4U * 1048576U) - #define NFS_DEF_FILE_IO_SIZE (4096U) - #define NFS_MIN_FILE_IO_SIZE (1024U) - -@@ -113,6 +115,8 @@ struct nfs_fsinfo { - __u32 dtpref; /* pref. 
readdir transfer size */ - __u64 maxfilesize; - __u32 lease_time; /* in seconds */ -+ __u32 layouttype; /* supported pnfs layout driver */ -+ __u32 blksize; /* preferred pnfs io block size */ - }; - - struct nfs_fsstat { -@@ -185,6 +189,123 @@ struct nfs4_get_lease_time_res { - struct nfs4_sequence_res lr_seq_res; - }; - -+#define PNFS_LAYOUT_MAXSIZE 4096 -+ -+struct nfs4_layoutdriver_data { -+ __u32 len; -+ void *buf; -+}; -+ -+struct pnfs_layout_range { -+ u32 iomode; -+ u64 offset; -+ u64 length; -+}; -+ -+struct nfs4_layoutget_args { -+ __u32 type; -+ struct pnfs_layout_range range; -+ __u64 minlength; -+ __u32 maxcount; -+ struct inode *inode; -+ struct nfs_open_context *ctx; -+ struct nfs4_sequence_args seq_args; -+}; -+ -+struct nfs4_layoutget_res { -+ __u32 return_on_close; -+ struct pnfs_layout_range range; -+ __u32 type; -+ nfs4_stateid stateid; -+ struct nfs4_layoutdriver_data layout; -+ struct nfs4_sequence_res seq_res; -+}; -+ -+struct nfs4_layoutget { -+ struct nfs4_layoutget_args args; -+ struct nfs4_layoutget_res res; -+ struct pnfs_layout_segment **lsegpp; -+ int status; -+}; -+ -+struct nfs4_layoutcommit_args { -+ nfs4_stateid stateid; -+ __u64 lastbytewritten; -+ __u32 time_modify_changed; -+ struct timespec time_modify; -+ const u32 *bitmask; -+ struct nfs_fh *fh; -+ struct inode *inode; -+ -+ /* Values set by layout driver */ -+ struct pnfs_layout_range range; -+ __u32 layout_type; -+ void *layoutdriver_data; -+ struct nfs4_sequence_args seq_args; -+}; -+ -+struct nfs4_layoutcommit_res { -+ __u32 sizechanged; -+ __u64 newsize; -+ struct nfs_fattr *fattr; -+ const struct nfs_server *server; -+ struct nfs4_sequence_res seq_res; -+}; -+ -+struct nfs4_layoutcommit_data { -+ struct rpc_task task; -+ struct rpc_cred *cred; -+ struct nfs_fattr fattr; -+ struct nfs4_layoutcommit_args args; -+ struct nfs4_layoutcommit_res res; -+ int status; -+}; -+ -+struct nfs4_layoutreturn_args { -+ __u32 reclaim; -+ __u32 layout_type; -+ __u32 return_type; -+ struct 
pnfs_layout_range range; -+ struct inode *inode; -+ struct nfs4_sequence_args seq_args; -+}; -+ -+struct nfs4_layoutreturn_res { -+ struct nfs4_sequence_res seq_res; -+ bool valid; /* internal, true if received reply */ -+ u32 lrs_present; -+ nfs4_stateid stateid; -+}; -+ -+struct nfs4_layoutreturn { -+ struct nfs4_layoutreturn_args args; -+ struct nfs4_layoutreturn_res res; -+ struct rpc_cred *cred; -+ const nfs4_stateid *stateid; -+ int rpc_status; -+}; -+ -+struct nfs4_getdevicelist_args { -+ const struct nfs_fh *fh; -+ u32 layoutclass; -+ struct nfs4_sequence_args seq_args; -+}; -+ -+struct nfs4_getdevicelist_res { -+ struct pnfs_devicelist *devlist; -+ struct nfs4_sequence_res seq_res; -+}; -+ -+struct nfs4_getdeviceinfo_args { -+ struct pnfs_device *pdev; -+ struct nfs4_sequence_args seq_args; -+}; -+ -+struct nfs4_getdeviceinfo_res { -+ struct pnfs_device *pdev; -+ struct nfs4_sequence_res seq_res; -+}; -+ - /* - * Arguments to the open call. - */ -@@ -854,7 +975,7 @@ struct nfs4_server_caps_arg { - }; - - struct nfs4_server_caps_res { -- u32 attr_bitmask[2]; -+ u32 attr_bitmask[3]; - u32 acl_bitmask; - u32 has_links; - u32 has_symlinks; -@@ -969,6 +1090,30 @@ struct nfs_page; - - #define NFS_PAGEVEC_SIZE (8U) - -+#if defined(CONFIG_NFS_V4_1) -+ -+/* pnfsflag values */ -+enum pnfs_flags { -+ PNFS_NO_RPC = 1 << 0, /* non rpc result callback switch */ -+}; -+ -+/* pnfs-specific data needed for read, write, and commit calls */ -+struct pnfs_call_data { -+ struct pnfs_layout_segment *lseg; -+ const struct rpc_call_ops *call_ops; -+ u32 orig_count; /* for retry via MDS */ -+ int pnfs_error; -+ u8 pnfsflags; -+ u8 how; /* for FLUSH_STABLE */ -+}; -+ -+/* files layout-type specific data for read, write, and commit */ -+struct pnfs_fl_call_data { -+ struct nfs_client *ds_nfs_client; -+ __u64 orig_offset; -+}; -+#endif /* CONFIG_NFS_V4_1 */ -+ - struct nfs_read_data { - int flags; - struct rpc_task task; -@@ -984,10 +1129,16 @@ struct nfs_read_data { - #ifdef 
CONFIG_NFS_V4 - unsigned long timestamp; /* For lease renewal */ - #endif -+#if defined(CONFIG_NFS_V4_1) -+ struct pnfs_call_data pdata; -+ struct pnfs_fl_call_data fldata; -+#endif /* CONFIG_NFS_V4_1 */ - struct page *page_array[NFS_PAGEVEC_SIZE]; - }; - - struct nfs_write_data { -+ struct kref refcount; /* For pnfs commit splitting */ -+ struct nfs_write_data *parent; /* For pnfs commit splitting */ - int flags; - struct rpc_task task; - struct inode *inode; -@@ -1003,6 +1154,10 @@ struct nfs_write_data { - #ifdef CONFIG_NFS_V4 - unsigned long timestamp; /* For lease renewal */ - #endif -+#if defined(CONFIG_NFS_V4_1) -+ struct pnfs_call_data pdata; -+ struct pnfs_fl_call_data fldata; -+#endif /* CONFIG_NFS_V4_1 */ - struct page *page_array[NFS_PAGEVEC_SIZE]; - }; - -diff --git a/include/linux/nfsd/const.h b/include/linux/nfsd/const.h -index 323f8cf..520fcfb 100644 ---- a/include/linux/nfsd/const.h -+++ b/include/linux/nfsd/const.h -@@ -29,6 +29,7 @@ - #ifdef __KERNEL__ - - #include -+#include - - /* - * Largest number of bytes we need to allocate for an NFS -diff --git a/include/linux/nfsd/debug.h b/include/linux/nfsd/debug.h -index ee4aa91..aad7013 100644 ---- a/include/linux/nfsd/debug.h -+++ b/include/linux/nfsd/debug.h -@@ -32,6 +32,8 @@ - #define NFSDDBG_REPCACHE 0x0080 - #define NFSDDBG_XDR 0x0100 - #define NFSDDBG_LOCKD 0x0200 -+#define NFSDDBG_PNFS 0x0400 -+#define NFSDDBG_FILELAYOUT 0x0800 - #define NFSDDBG_ALL 0x7FFF - #define NFSDDBG_NOCHANGE 0xFFFF - -diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h -index 8ae78a6..933ab78 100644 ---- a/include/linux/nfsd/export.h -+++ b/include/linux/nfsd/export.h -@@ -100,6 +100,7 @@ struct svc_export { - uid_t ex_anon_uid; - gid_t ex_anon_gid; - int ex_fsid; -+ int ex_pnfs; - unsigned char * ex_uuid; /* 16 byte fsid */ - struct nfsd4_fs_locations ex_fslocs; - int ex_nflavors; -diff --git a/include/linux/nfsd/nfs4layoutxdr.h b/include/linux/nfsd/nfs4layoutxdr.h -new file mode 100644 -index 
0000000..b02d96a ---- /dev/null -+++ b/include/linux/nfsd/nfs4layoutxdr.h -@@ -0,0 +1,132 @@ -+/* -+ * Copyright (c) 2006 The Regents of the University of Michigan. -+ * All rights reserved. -+ * -+ * Andy Adamson -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. Neither the name of the University nor the names of its -+ * contributors may be used to endorse or promote products derived -+ * from this software without specific prior written permission. -+ * -+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED -+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+ * -+ */ -+ -+#ifndef NFSD_NFS4LAYOUTXDR_H -+#define NFSD_NFS4LAYOUTXDR_H -+ -+#include -+#include -+ -+/* the nfsd4_pnfs_devlist dev_addr for the file layout type */ -+struct pnfs_filelayout_devaddr { -+ struct xdr_netobj r_netid; -+ struct xdr_netobj r_addr; -+}; -+ -+/* list of multipath servers */ -+struct pnfs_filelayout_multipath { -+ u32 fl_multipath_length; -+ struct pnfs_filelayout_devaddr *fl_multipath_list; -+}; -+ -+struct pnfs_filelayout_device { -+ u32 fl_stripeindices_length; -+ u32 *fl_stripeindices_list; -+ u32 fl_device_length; -+ struct pnfs_filelayout_multipath *fl_device_list; -+}; -+ -+struct pnfs_filelayout_layout { -+ u32 lg_layout_type; /* response */ -+ u32 lg_stripe_type; /* response */ -+ u32 lg_commit_through_mds; /* response */ -+ u64 lg_stripe_unit; /* response */ -+ u64 lg_pattern_offset; /* response */ -+ u32 lg_first_stripe_index; /* response */ -+ struct nfsd4_pnfs_deviceid device_id; /* response */ -+ u32 lg_fh_length; /* response */ -+ struct knfsd_fh *lg_fh_list; /* response */ -+}; -+ -+enum stripetype4 { -+ STRIPE_SPARSE = 1, -+ STRIPE_DENSE = 2 -+}; -+ -+enum pnfs_block_extent_state4 { -+ PNFS_BLOCK_READWRITE_DATA = 0, -+ PNFS_BLOCK_READ_DATA = 1, -+ PNFS_BLOCK_INVALID_DATA = 2, -+ PNFS_BLOCK_NONE_DATA = 3 -+}; -+ -+enum pnfs_block_volume_type4 { -+ PNFS_BLOCK_VOLUME_SIMPLE = 0, -+ PNFS_BLOCK_VOLUME_SLICE = 1, -+ PNFS_BLOCK_VOLUME_CONCAT = 2, -+ PNFS_BLOCK_VOLUME_STRIPE = 3, -+}; -+typedef enum pnfs_block_volume_type4 pnfs_block_volume_type4; -+ -+enum bl_cache_state { -+ BLOCK_LAYOUT_NEW = 0, -+ BLOCK_LAYOUT_CACHE = 1, -+ BLOCK_LAYOUT_UPDATE = 2, -+}; -+ -+typedef struct pnfs_blocklayout_layout { -+ struct list_head bll_list; -+ struct nfsd4_pnfs_deviceid bll_vol_id; -+ u64 bll_foff; // file offset -+ u64 bll_len; -+ u64 bll_soff; // storage offset -+ int bll_recalled; -+ enum pnfs_block_extent_state4 bll_es; -+ enum bl_cache_state bll_cache_state; -+} pnfs_blocklayout_layout_t; -+ -+typedef struct 
pnfs_blocklayout_devinfo { -+ struct list_head bld_list; -+ pnfs_block_volume_type4 bld_type; -+ struct nfsd4_pnfs_deviceid bld_devid; -+ int bld_index_loc; -+ union { -+ struct { -+ u64 bld_offset; -+ u32 bld_sig_len, -+ *bld_sig; -+ } simple; -+ struct { -+ u64 bld_start, -+ bld_len; -+ u32 bld_index; /* Index of Simple Volume */ -+ } slice; -+ struct { -+ u32 bld_stripes; -+ u64 bld_chunk_size; -+ u32 *bld_stripe_indexs; -+ } stripe; -+ } u; -+} pnfs_blocklayout_devinfo_t; -+ -+#endif /* NFSD_NFS4LAYOUTXDR_H */ -diff --git a/include/linux/nfsd/nfs4pnfsdlm.h b/include/linux/nfsd/nfs4pnfsdlm.h -new file mode 100644 -index 0000000..eb31123 ---- /dev/null -+++ b/include/linux/nfsd/nfs4pnfsdlm.h -@@ -0,0 +1,54 @@ -+/****************************************************************************** -+ * -+ * (c) 2007 Network Appliance, Inc. All Rights Reserved. -+ * (c) 2009 NetApp. All Rights Reserved. -+ * -+ * NetApp provides this source code under the GPL v2 License. -+ * The GPL v2 license is available at -+ * http://opensource.org/licenses/gpl-license.php. -+ * -+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+ * -+ ******************************************************************************/ -+#include -+ -+/* -+ * Length of comma separated pnfs data server IPv4 addresses. Enough room for -+ * 32 addresses. -+ */ -+#define NFSD_DLM_DS_LIST_MAX 512 -+/* -+ * Length of colon separated pnfs dlm device of the form -+ * disk_name:comma separated data server IPv4 address -+ */ -+#define NFSD_PNFS_DLM_DEVICE_MAX (NFSD_DLM_DS_LIST_MAX + DISK_NAME_LEN + 1) -+ -+#ifdef CONFIG_PNFSD -+ -+/* For use by DLM cluster file systems exported by pNFSD */ -+extern const struct pnfs_export_operations pnfs_dlm_export_ops; -+ -+int nfsd4_set_pnfs_dlm_device(char *pnfs_dlm_device, int len); -+ -+void nfsd4_pnfs_dlm_shutdown(void); -+ -+ssize_t nfsd4_get_pnfs_dlm_device_list(char *buf, ssize_t buflen); -+ -+#else /* CONFIG_PNFSD */ -+ -+static inline void nfsd4_pnfs_dlm_shutdown(void) -+{ -+ return; -+} -+ -+#endif /* CONFIG_PNFSD */ -diff --git a/include/linux/nfsd/nfsd4_pnfs.h b/include/linux/nfsd/nfsd4_pnfs.h -new file mode 100644 -index 0000000..2e66837 ---- /dev/null -+++ b/include/linux/nfsd/nfsd4_pnfs.h -@@ -0,0 +1,271 @@ -+/* -+ * Copyright (c) 2006 The Regents of the University of Michigan. -+ * All rights reserved. -+ * -+ * Andy Adamson -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. Neither the name of the University nor the names of its -+ * contributors may be used to endorse or promote products derived -+ * from this software without specific prior written permission. 
-+ * -+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED -+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ * -+ */ -+ -+#ifndef _LINUX_NFSD_NFSD4_PNFS_H -+#define _LINUX_NFSD_NFSD4_PNFS_H -+ -+#include -+#include -+#include -+ -+struct nfsd4_pnfs_deviceid { -+ u64 sbid; /* per-superblock unique ID */ -+ u64 devid; /* filesystem-wide unique device ID */ -+}; -+ -+struct nfsd4_pnfs_dev_iter_res { -+ u64 gd_cookie; /* request/repsonse */ -+ u64 gd_verf; /* request/repsonse */ -+ u64 gd_devid; /* response */ -+ u32 gd_eof; /* response */ -+}; -+ -+/* Arguments for set_device_notify */ -+struct pnfs_devnotify_arg { -+ struct nfsd4_pnfs_deviceid dn_devid; /* request */ -+ u32 dn_layout_type; /* request */ -+ u32 dn_notify_types; /* request/response */ -+}; -+ -+struct nfsd4_layout_seg { -+ u64 clientid; -+ u32 layout_type; -+ u32 iomode; -+ u64 offset; -+ u64 length; -+}; -+ -+/* Used by layout_get to encode layout (loc_body var in spec) -+ * Args: -+ * minlength - min number of accessible bytes given by layout -+ * fsid - Major part of struct pnfs_deviceid. File system uses this -+ * to build the deviceid returned in the layout. 
-+ * fh - fs can modify the file handle for use on data servers -+ * seg - layout info requested and layout info returned -+ * xdr - xdr info -+ * return_on_close - true if layout to be returned on file close -+ */ -+ -+struct nfsd4_pnfs_layoutget_arg { -+ u64 lg_minlength; -+ u64 lg_sbid; -+ const struct knfsd_fh *lg_fh; -+}; -+ -+struct nfsd4_pnfs_layoutget_res { -+ struct nfsd4_layout_seg lg_seg; /* request/resopnse */ -+ u32 lg_return_on_close; -+}; -+ -+struct nfsd4_pnfs_layoutcommit_arg { -+ struct nfsd4_layout_seg lc_seg; /* request */ -+ u32 lc_reclaim; /* request */ -+ u32 lc_newoffset; /* request */ -+ u64 lc_last_wr; /* request */ -+ struct nfstime4 lc_mtime; /* request */ -+ u32 lc_up_len; /* layout length */ -+ void *lc_up_layout; /* decoded by callback */ -+}; -+ -+struct nfsd4_pnfs_layoutcommit_res { -+ u32 lc_size_chg; /* boolean for response */ -+ u64 lc_newsize; /* response */ -+}; -+ -+#define PNFS_LAST_LAYOUT_NO_RECALLS ((void *)-1) /* used with lr_cookie below */ -+ -+struct nfsd4_pnfs_layoutreturn_arg { -+ u32 lr_return_type; /* request */ -+ struct nfsd4_layout_seg lr_seg; /* request */ -+ u32 lr_reclaim; /* request */ -+ u32 lrf_body_len; /* request */ -+ void *lrf_body; /* request */ -+ void *lr_cookie; /* fs private */ -+}; -+ -+/* pNFS Metadata to Data server state communication */ -+struct pnfs_get_state { -+ u32 dsid; /* request */ -+ u64 ino; /* request */ -+ nfs4_stateid stid; /* request;response */ -+ nfs4_clientid clid; /* response */ -+ u32 access; /* response */ -+ u32 stid_gen; /* response */ -+ u32 verifier[2]; /* response */ -+}; -+ -+/* -+ * pNFS export operations vector. -+ * -+ * The filesystem must implement the following methods: -+ * layout_type -+ * get_device_info -+ * layout_get -+ * -+ * All other methods are optional and can be set to NULL if not implemented. -+ */ -+struct pnfs_export_operations { -+ /* Returns the supported pnfs_layouttype4. 
*/ -+ int (*layout_type) (struct super_block *); -+ -+ /* Encode device info onto the xdr stream. */ -+ int (*get_device_info) (struct super_block *, -+ struct exp_xdr_stream *, -+ u32 layout_type, -+ const struct nfsd4_pnfs_deviceid *); -+ -+ /* Retrieve all available devices via an iterator. -+ * arg->cookie == 0 indicates the beginning of the list, -+ * otherwise arg->verf is used to verify that the list hasn't changed -+ * while retrieved. -+ * -+ * On output, the filesystem sets the devid based on the current cookie -+ * and sets res->cookie and res->verf corresponding to the next entry. -+ * When the last entry in the list is retrieved, res->eof is set to 1. -+ */ -+ int (*get_device_iter) (struct super_block *, -+ u32 layout_type, -+ struct nfsd4_pnfs_dev_iter_res *); -+ -+ int (*set_device_notify) (struct super_block *, -+ struct pnfs_devnotify_arg *); -+ -+ /* Retrieve and encode a layout for inode onto the xdr stream. -+ * arg->minlength is the minimum number of accessible bytes required -+ * by the client. -+ * The maximum number of bytes to encode the layout is given by -+ * the xdr stream end pointer. -+ * arg->fsid contains the major part of struct pnfs_deviceid. -+ * The file system uses this to build the deviceid returned -+ * in the layout. -+ * res->seg - layout segment requested and layout info returned. 
-+ * res->fh can be modified the file handle for use on data servers -+ * res->return_on_close - true if layout to be returned on file close -+ * -+ * return one of the following nfs errors: -+ * NFS_OK Success -+ * NFS4ERR_ACCESS Permission error -+ * NFS4ERR_BADIOMODE Server does not support requested iomode -+ * NFS4ERR_BADLAYOUT No layout matching loga_minlength rules -+ * NFS4ERR_INVAL Parameter other than layout is invalid -+ * NFS4ERR_IO I/O error -+ * NFS4ERR_LAYOUTTRYLATER Layout may be retrieved later -+ * NFS4ERR_LAYOUTUNAVAILABLE Layout unavailable for this file -+ * NFS4ERR_LOCKED Lock conflict -+ * NFS4ERR_NOSPC Out-of-space error occured -+ * NFS4ERR_RECALLCONFLICT Layout currently unavialable due to -+ * a conflicting CB_LAYOUTRECALL -+ * NFS4ERR_SERVERFAULT Server went bezerk -+ * NFS4ERR_TOOSMALL loga_maxcount too small to fit layout -+ * NFS4ERR_WRONG_TYPE Wrong file type (not a regular file) -+ */ -+ enum nfsstat4 (*layout_get) (struct inode *, -+ struct exp_xdr_stream *xdr, -+ const struct nfsd4_pnfs_layoutget_arg *, -+ struct nfsd4_pnfs_layoutget_res *); -+ -+ /* Commit changes to layout */ -+ int (*layout_commit) (struct inode *, -+ const struct nfsd4_pnfs_layoutcommit_arg *, -+ struct nfsd4_pnfs_layoutcommit_res *); -+ -+ /* Returns the layout */ -+ int (*layout_return) (struct inode *, -+ const struct nfsd4_pnfs_layoutreturn_arg *); -+ -+ /* Can layout segments be merged for this layout type? 
*/ -+ int (*can_merge_layouts) (u32 layout_type); -+ -+ /* pNFS Files layout specific operations */ -+ -+ /* Get the write verifier for DS (called on MDS only) */ -+ void (*get_verifier) (struct super_block *, u32 *p); -+ /* Call fs on DS only */ -+ int (*get_state) (struct inode *, struct knfsd_fh *, -+ struct pnfs_get_state *); -+}; -+ -+struct nfsd4_pnfs_cb_layout { -+ u32 cbl_recall_type; /* request */ -+ struct nfsd4_layout_seg cbl_seg; /* request */ -+ u32 cbl_layoutchanged; /* request */ -+ nfs4_stateid cbl_sid; /* request */ -+ struct nfs4_fsid cbl_fsid; -+ void *cbl_cookie; /* fs private */ -+}; -+ -+/* layoutrecall request (from exported filesystem) */ -+struct nfs4_layoutrecall { -+ struct kref clr_ref; -+ struct nfsd4_pnfs_cb_layout cb; /* request */ -+ struct list_head clr_perclnt; /* on cl_layoutrecalls */ -+ struct nfs4_client *clr_client; -+ struct nfs4_file *clr_file; -+ struct timespec clr_time; /* last activity */ -+ struct super_block *clr_sb; /* We might not have a file */ -+ struct nfs4_layoutrecall *parent; /* The initiating recall */ -+ -+ void *clr_args; /* nfsd internal */ -+}; -+ -+struct nfsd4_pnfs_cb_dev_item { -+ u32 cbd_notify_type; /* request */ -+ u32 cbd_layout_type; /* request */ -+ struct nfsd4_pnfs_deviceid cbd_devid; /* request */ -+ u32 cbd_immediate; /* request */ -+}; -+ -+struct nfsd4_pnfs_cb_dev_list { -+ u32 cbd_len; /* request */ -+ struct nfsd4_pnfs_cb_dev_item *cbd_list; /* request */ -+}; -+ -+/* -+ * callbacks provided by the nfsd -+ */ -+struct pnfsd_cb_operations { -+ /* Generic callbacks */ -+ int (*cb_layout_recall) (struct super_block *, struct inode *, -+ struct nfsd4_pnfs_cb_layout *); -+ int (*cb_device_notify) (struct super_block *, -+ struct nfsd4_pnfs_cb_dev_list *); -+ -+ /* pNFS Files layout specific callbacks */ -+ -+ /* Callback from fs on MDS only */ -+ int (*cb_get_state) (struct super_block *, struct pnfs_get_state *); -+ /* Callback from fs on DS only */ -+ int (*cb_change_state) (struct 
pnfs_get_state *); -+}; -+ -+#endif /* _LINUX_NFSD_NFSD4_PNFS_H */ -diff --git a/include/linux/nfsd/syscall.h b/include/linux/nfsd/syscall.h -index 812bc1e..df667d0 100644 ---- a/include/linux/nfsd/syscall.h -+++ b/include/linux/nfsd/syscall.h -@@ -29,6 +29,7 @@ - /*#define NFSCTL_GETFH 6 / * get an fh by ino DISCARDED */ - #define NFSCTL_GETFD 7 /* get an fh by path (used by mountd) */ - #define NFSCTL_GETFS 8 /* get an fh by path with max FH len */ -+#define NFSCTL_FD2FH 9 /* get a fh from a fd */ - - /* SVC */ - struct nfsctl_svc { -@@ -71,6 +72,11 @@ struct nfsctl_fsparm { - int gd_maxlen; - }; - -+/* FD2FH */ -+struct nfsctl_fd2fh { -+ int fd; -+}; -+ - /* - * This is the argument union. - */ -@@ -82,6 +88,7 @@ struct nfsctl_arg { - struct nfsctl_export u_export; - struct nfsctl_fdparm u_getfd; - struct nfsctl_fsparm u_getfs; -+ struct nfsctl_fd2fh u_fd2fh; - /* - * The following dummy member is needed to preserve binary compatibility - * on platforms where alignof(void*)>alignof(int). It's needed because -@@ -95,6 +102,7 @@ struct nfsctl_arg { - #define ca_export u.u_export - #define ca_getfd u.u_getfd - #define ca_getfs u.u_getfs -+#define ca_fd2fh u.u_fd2fh - }; - - union nfsctl_res { -diff --git a/include/linux/nfsd4_block.h b/include/linux/nfsd4_block.h -new file mode 100644 -index 0000000..b0d5177 ---- /dev/null -+++ b/include/linux/nfsd4_block.h -@@ -0,0 +1,101 @@ -+#ifndef NFSD4_BLOCK -+#define NFSD4_BLOCK -+ -+#include -+#include -+#include -+#include -+ -+#define PNFS_BLOCK_SUCCESS 1 -+#define PNFS_BLOCK_FAILURE 0 -+ -+#define PNFS_BLOCK_CTL_START 1 -+#define PNFS_BLOCK_CTL_STOP 2 -+#define PNFS_BLOCK_CTL_VERS 3 /* Allows daemon to request current -+ * version from kernel via an upcall. 
-+ */ -+ -+#define PNFS_UPCALL_MSG_STOP 0 -+#define PNFS_UPCALL_MSG_GETSIG 1 -+#define PNFS_UPCALL_MSG_GETSLICE 2 -+#define PNFS_UPCALL_MSG_DMCHK 3 // See if dev_t is a DM volume -+#define PNFS_UPCALL_MSG_DMGET 4 -+#define PNFS_UPCALL_MSG_VERS 5 -+ -+#define PNFS_UPCALL_VERS 8 -+ -+typedef struct stripe_dev { -+ int major, -+ minor, -+ offset; -+} stripe_dev_t; -+ -+typedef struct bl_comm_res { -+ int res_status; -+ union { -+ struct { -+ long long start, -+ length; -+ } slice; -+ struct { -+ int num_stripes, -+ stripe_size; -+ stripe_dev_t devs[]; -+ } stripe; -+ struct { -+ long long sector; -+ int offset, -+ len; -+ char sig[]; -+ } sig; -+ int vers, -+ dm_vol; -+ } u; -+} bl_comm_res_t; -+ -+typedef struct bl_comm_msg { -+ int msg_type, -+ msg_status; -+ union { -+ dev_t msg_dev; -+ int msg_vers; -+ } u; -+ bl_comm_res_t *msg_res; -+} bl_comm_msg_t; -+ -+#ifdef __KERNEL__ -+ -+typedef struct bl_comm { -+ /* ---- protects access to this structure ---- */ -+ struct mutex lock; -+ /* ---- protects access to rpc pipe ---- */ -+ struct mutex pipe_lock; -+ struct dentry *pipe_dentry; -+ wait_queue_head_t pipe_wq; -+ bl_comm_msg_t msg; -+} bl_comm_t; -+ -+int pnfs_block_enabled(struct inode *, int); -+int bl_layout_type(struct super_block *sb); -+int bl_getdeviceiter(struct super_block *, u32 layout_type, -+ struct nfsd4_pnfs_dev_iter_res *); -+int bl_getdeviceinfo(struct super_block *, struct exp_xdr_stream *, -+ u32 layout_type, -+ const struct nfsd4_pnfs_deviceid *); -+enum nfsstat4 bl_layoutget(struct inode *, struct exp_xdr_stream *, -+ const struct nfsd4_pnfs_layoutget_arg *, -+ struct nfsd4_pnfs_layoutget_res *); -+int bl_layoutcommit(struct inode *, -+ const struct nfsd4_pnfs_layoutcommit_arg *, -+ struct nfsd4_pnfs_layoutcommit_res *); -+int bl_layoutreturn(struct inode *, -+ const struct nfsd4_pnfs_layoutreturn_arg *); -+int bl_layoutrecall(struct inode *inode, int type, u64 offset, u64 len); -+int bl_init_proc(void); -+int bl_upcall(bl_comm_t *, 
bl_comm_msg_t *, bl_comm_res_t **); -+ -+extern bl_comm_t *bl_comm_global; // Ugly... -+#endif /* __KERNEL__ */ -+ -+#endif /* NFSD4_BLOCK */ -+ -diff --git a/include/linux/nfsd4_spnfs.h b/include/linux/nfsd4_spnfs.h -new file mode 100644 -index 0000000..ea828e4 ---- /dev/null -+++ b/include/linux/nfsd4_spnfs.h -@@ -0,0 +1,345 @@ -+/* -+ * include/linux/nfsd4_spnfs.h -+ * -+ * spNFS - simple pNFS implementation with userspace daemon -+ * -+ */ -+ -+/****************************************************************************** -+ -+(c) 2007 Network Appliance, Inc. All Rights Reserved. -+ -+Network Appliance provides this source code under the GPL v2 License. -+The GPL v2 license is available at -+http://opensource.org/licenses/gpl-license.php. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+ -+******************************************************************************/ -+ -+#ifndef NFS_SPNFS_H -+#define NFS_SPNFS_H -+ -+ -+#ifdef __KERNEL__ -+#include "exportfs.h" -+#include "sunrpc/svc.h" -+#include "nfsd/nfsfh.h" -+#else -+#include -+#endif /* __KERNEL__ */ -+ -+#define SPNFS_STATUS_INVALIDMSG 0x01 -+#define SPNFS_STATUS_AGAIN 0x02 -+#define SPNFS_STATUS_FAIL 0x04 -+#define SPNFS_STATUS_SUCCESS 0x08 -+ -+#define SPNFS_TYPE_LAYOUTGET 0x01 -+#define SPNFS_TYPE_LAYOUTCOMMIT 0x02 -+#define SPNFS_TYPE_LAYOUTRETURN 0x03 -+#define SPNFS_TYPE_GETDEVICEITER 0x04 -+#define SPNFS_TYPE_GETDEVICEINFO 0x05 -+#define SPNFS_TYPE_SETATTR 0x06 -+#define SPNFS_TYPE_OPEN 0x07 -+#define SPNFS_TYPE_CLOSE 0x08 -+#define SPNFS_TYPE_CREATE 0x09 -+#define SPNFS_TYPE_REMOVE 0x0a -+#define SPNFS_TYPE_COMMIT 0x0b -+#define SPNFS_TYPE_READ 0x0c -+#define SPNFS_TYPE_WRITE 0x0d -+ -+#define SPNFS_MAX_DEVICES 1 -+#define SPNFS_MAX_DATA_SERVERS 16 -+#define SPNFS_MAX_IO 512 -+ -+/* layout */ -+struct spnfs_msg_layoutget_args { -+ unsigned long inode; -+ unsigned long generation; -+}; -+ -+struct spnfs_filelayout_list { -+ u_int32_t fh_len; -+ unsigned char fh_val[128]; /* DMXXX fix this const */ -+}; -+ -+struct spnfs_msg_layoutget_res { -+ int status; -+ u_int64_t devid; -+ u_int64_t stripe_size; -+ u_int32_t stripe_type; -+ u_int32_t stripe_count; -+ struct spnfs_filelayout_list flist[SPNFS_MAX_DATA_SERVERS]; -+}; -+ -+/* layoutcommit */ -+struct spnfs_msg_layoutcommit_args { -+ unsigned long inode; -+ unsigned long generation; -+ u_int64_t file_size; -+}; -+ -+struct spnfs_msg_layoutcommit_res { -+ int status; -+}; -+ -+/* layoutreturn */ -+/* No op for the daemon */ -+/* -+struct spnfs_msg_layoutreturn_args { -+}; -+ -+struct spnfs_msg_layoutreturn_res { -+}; -+*/ -+ -+/* getdeviceiter */ -+struct spnfs_msg_getdeviceiter_args { -+ unsigned long inode; -+ u_int64_t cookie; -+ u_int64_t verf; -+}; -+ -+struct spnfs_msg_getdeviceiter_res { -+ int status; -+ u_int64_t devid; -+ 
u_int64_t cookie; -+ u_int64_t verf; -+ u_int32_t eof; -+}; -+ -+/* getdeviceinfo */ -+struct spnfs_data_server { -+ u_int32_t dsid; -+ char netid[5]; -+ char addr[29]; -+}; -+ -+struct spnfs_device { -+ u_int64_t devid; -+ int dscount; -+ struct spnfs_data_server dslist[SPNFS_MAX_DATA_SERVERS]; -+}; -+ -+struct spnfs_msg_getdeviceinfo_args { -+ u_int64_t devid; -+}; -+ -+struct spnfs_msg_getdeviceinfo_res { -+ int status; -+ struct spnfs_device devinfo; -+}; -+ -+/* setattr */ -+struct spnfs_msg_setattr_args { -+ unsigned long inode; -+ unsigned long generation; -+ int file_size; -+}; -+ -+struct spnfs_msg_setattr_res { -+ int status; -+}; -+ -+/* open */ -+struct spnfs_msg_open_args { -+ unsigned long inode; -+ unsigned long generation; -+ int create; -+ int createmode; -+ int truncate; -+}; -+ -+struct spnfs_msg_open_res { -+ int status; -+}; -+ -+/* close */ -+/* No op for daemon */ -+struct spnfs_msg_close_args { -+ int x; -+}; -+ -+struct spnfs_msg_close_res { -+ int y; -+}; -+ -+/* create */ -+/* -+struct spnfs_msg_create_args { -+ int x; -+}; -+ -+struct spnfs_msg_create_res { -+ int y; -+}; -+*/ -+ -+/* remove */ -+struct spnfs_msg_remove_args { -+ unsigned long inode; -+ unsigned long generation; -+}; -+ -+struct spnfs_msg_remove_res { -+ int status; -+}; -+ -+/* commit */ -+/* -+struct spnfs_msg_commit_args { -+ int x; -+}; -+ -+struct spnfs_msg_commit_res { -+ int y; -+}; -+*/ -+ -+/* read */ -+struct spnfs_msg_read_args { -+ unsigned long inode; -+ unsigned long generation; -+ loff_t offset; -+ unsigned long len; -+}; -+ -+struct spnfs_msg_read_res { -+ int status; -+ char data[SPNFS_MAX_IO]; -+}; -+ -+/* write */ -+struct spnfs_msg_write_args { -+ unsigned long inode; -+ unsigned long generation; -+ loff_t offset; -+ unsigned long len; -+ char data[SPNFS_MAX_IO]; -+}; -+ -+struct spnfs_msg_write_res { -+ int status; -+}; -+ -+/* bundle args and responses */ -+union spnfs_msg_args { -+ struct spnfs_msg_layoutget_args layoutget_args; -+ struct 
spnfs_msg_layoutcommit_args layoutcommit_args; -+/* -+ struct spnfs_msg_layoutreturn_args layoutreturn_args; -+*/ -+ struct spnfs_msg_getdeviceiter_args getdeviceiter_args; -+ struct spnfs_msg_getdeviceinfo_args getdeviceinfo_args; -+ struct spnfs_msg_setattr_args setattr_args; -+ struct spnfs_msg_open_args open_args; -+ struct spnfs_msg_close_args close_args; -+/* -+ struct spnfs_msg_create_args create_args; -+*/ -+ struct spnfs_msg_remove_args remove_args; -+/* -+ struct spnfs_msg_commit_args commit_args; -+*/ -+ struct spnfs_msg_read_args read_args; -+ struct spnfs_msg_write_args write_args; -+}; -+ -+union spnfs_msg_res { -+ struct spnfs_msg_layoutget_res layoutget_res; -+ struct spnfs_msg_layoutcommit_res layoutcommit_res; -+/* -+ struct spnfs_msg_layoutreturn_res layoutreturn_res; -+*/ -+ struct spnfs_msg_getdeviceiter_res getdeviceiter_res; -+ struct spnfs_msg_getdeviceinfo_res getdeviceinfo_res; -+ struct spnfs_msg_setattr_res setattr_res; -+ struct spnfs_msg_open_res open_res; -+ struct spnfs_msg_close_res close_res; -+/* -+ struct spnfs_msg_create_res create_res; -+*/ -+ struct spnfs_msg_remove_res remove_res; -+/* -+ struct spnfs_msg_commit_res commit_res; -+*/ -+ struct spnfs_msg_read_res read_res; -+ struct spnfs_msg_write_res write_res; -+}; -+ -+/* a spnfs message, args and response */ -+struct spnfs_msg { -+ unsigned char im_type; -+ unsigned char im_status; -+ union spnfs_msg_args im_args; -+ union spnfs_msg_res im_res; -+}; -+ -+/* spnfs configuration info */ -+struct spnfs_config { -+ unsigned char dense_striping; -+ int stripe_size; -+ int num_ds; -+ char ds_dir[SPNFS_MAX_DATA_SERVERS][80]; /* XXX */ -+}; -+ -+#if defined(__KERNEL__) && defined(CONFIG_SPNFS) -+ -+#include -+ -+/* pipe mgmt structure. 
messages flow through here */ -+struct spnfs { -+ struct dentry *spnfs_dentry; /* dentry for pipe */ -+ wait_queue_head_t spnfs_wq; -+ struct spnfs_msg spnfs_im; /* spnfs message */ -+ struct mutex spnfs_lock; /* Serializes upcalls */ -+ struct mutex spnfs_plock; -+}; -+ -+struct nfsd4_open; -+ -+int spnfs_layout_type(struct super_block *); -+enum nfsstat4 spnfs_layoutget(struct inode *, struct exp_xdr_stream *xdr, -+ const struct nfsd4_pnfs_layoutget_arg *, -+ struct nfsd4_pnfs_layoutget_res *); -+int spnfs_layoutcommit(void); -+int spnfs_layoutreturn(struct inode *, -+ const struct nfsd4_pnfs_layoutreturn_arg *); -+int spnfs_getdeviceiter(struct super_block *, -+ u32 layout_type, -+ struct nfsd4_pnfs_dev_iter_res *); -+int spnfs_getdeviceinfo(struct super_block *, struct exp_xdr_stream *, -+ u32 layout_type, -+ const struct nfsd4_pnfs_deviceid *); -+int spnfs_setattr(void); -+int spnfs_open(struct inode *, struct nfsd4_open *); -+int spnfs_get_state(struct inode *, struct knfsd_fh *, struct pnfs_get_state *); -+int spnfs_remove(unsigned long, unsigned long); -+__be32 spnfs_read(struct inode *, loff_t, unsigned long *, -+ int, struct svc_rqst *); -+__be32 spnfs_write(struct inode *, loff_t, size_t, int, struct svc_rqst *); -+int spnfs_getfh(int, struct nfs_fh *); -+int spnfs_test_layoutrecall(char *, u64, u64); -+int spnfs_layoutrecall(struct inode *, int, u64, u64); -+ -+int nfsd_spnfs_new(void); -+void nfsd_spnfs_delete(void); -+int spnfs_upcall(struct spnfs *, struct spnfs_msg *, union spnfs_msg_res *); -+int spnfs_enabled(void); -+int spnfs_init_proc(void); -+ -+extern struct spnfs_config *spnfs_config; -+ -+#endif /* __KERNEL__ && CONFIG_SPNFS */ -+ -+#endif /* NFS_SPNFS_H */ -diff --git a/include/linux/panfs_shim_api.h b/include/linux/panfs_shim_api.h -new file mode 100644 -index 0000000..3b44e19 ---- /dev/null -+++ b/include/linux/panfs_shim_api.h -@@ -0,0 +1,57 @@ -+#ifndef _PANFS_SHIM_API_H -+#define _PANFS_SHIM_API_H -+ -+/* -+ * imported panfs functions 
-+ */ -+struct panfs_export_operations { -+ int (*convert_rc)(pan_status_t rc); -+ -+ int (*sm_sec_t_get_size_otw)( -+ pan_sm_sec_otw_t *var, -+ pan_size_t *core_sizep, -+ pan_size_t *wire_size, -+ void *buf_end); -+ -+ int (*sm_sec_t_unmarshall)( -+ pan_sm_sec_otw_t *in, -+ pan_sm_sec_t *out, -+ void *buf, -+ pan_size_t size, -+ pan_size_t *otw_consumed, -+ pan_size_t *in_core_consumed); -+ -+ int (*ucreds_get)(void **ucreds_pp); -+ -+ void (*ucreds_put)(void *ucreds); -+ -+ int (*sam_read)( -+ pan_sam_access_flags_t flags, -+ pan_sam_read_args_t *args_p, -+ pan_sam_obj_sec_t *obj_sec_p, -+ pan_sg_entry_t *data_p, -+ void *ucreds, -+ pan_sam_read_cb_t closure, -+ void *user_arg1, -+ void *user_arg2, -+ pan_sam_read_res_t *res_p); -+ -+ int (*sam_write)( -+ pan_sam_access_flags_t flags, -+ pan_sam_write_args_t *args_p, -+ pan_sam_obj_sec_t *obj_sec_p, -+ pan_sg_entry_t *data_p, -+ void *ucreds, -+ pan_sam_write_cb_t closure, -+ void *user_arg1, -+ void *user_arg2, -+ pan_sam_write_res_t *res_p); -+}; -+ -+extern int -+panfs_shim_register(struct panfs_export_operations *ops); -+ -+extern int -+panfs_shim_unregister(void); -+ -+#endif /* _PANFS_SHIM_API_H */ -diff --git a/include/linux/pnfs_osd_xdr.h b/include/linux/pnfs_osd_xdr.h -new file mode 100644 -index 0000000..b404f33 ---- /dev/null -+++ b/include/linux/pnfs_osd_xdr.h -@@ -0,0 +1,439 @@ -+/* -+ * pnfs_osd_xdr.h -+ * -+ * pNFS-osd on-the-wire data structures -+ * -+ * Copyright (C) 2007-2009 Panasas Inc. -+ * All rights reserved. -+ * -+ * Benny Halevy -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 -+ * See the file COPYING included with this distribution for more details. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * -+ * 1. 
Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. Neither the name of the Panasas company nor the names of its -+ * contributors may be used to endorse or promote products derived -+ * from this software without specific prior written permission. -+ * -+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED -+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ */ -+#ifndef __PNFS_OSD_XDR_H__ -+#define __PNFS_OSD_XDR_H__ -+ -+#include -+#include -+#include -+#include -+ -+#define PNFS_OSD_OSDNAME_MAXSIZE 256 -+ -+/* -+ * START OF "GENERIC" DECODE ROUTINES. -+ * These may look a little ugly since they are imported from a "generic" -+ * set of XDR encode/decode routines which are intended to be shared by -+ * all of our NFSv4 implementations (OpenBSD, MacOS X...). -+ * -+ * If the pain of reading these is too great, it should be a straightforward -+ * task to translate them into Linux-specific versions which are more -+ * consistent with the style used in NFSv2/v3... 
-+ */ -+#define READ32(x) (x) = ntohl(*p++) -+#define READ64(x) do { \ -+ (x) = (u64)ntohl(*p++) << 32; \ -+ (x) |= ntohl(*p++); \ -+} while (0) -+#define COPYMEM(x, nbytes) do { \ -+ memcpy((x), p, nbytes); \ -+ p += XDR_QUADLEN(nbytes); \ -+} while (0) -+ -+/* -+ * draft-ietf-nfsv4-minorversion-22 -+ * draft-ietf-nfsv4-pnfs-obj-12 -+ */ -+ -+/* Layout Structure */ -+ -+enum pnfs_osd_raid_algorithm4 { -+ PNFS_OSD_RAID_0 = 1, -+ PNFS_OSD_RAID_4 = 2, -+ PNFS_OSD_RAID_5 = 3, -+ PNFS_OSD_RAID_PQ = 4 /* Reed-Solomon P+Q */ -+}; -+ -+/* struct pnfs_osd_data_map4 { -+ * uint32_t odm_num_comps; -+ * length4 odm_stripe_unit; -+ * uint32_t odm_group_width; -+ * uint32_t odm_group_depth; -+ * uint32_t odm_mirror_cnt; -+ * pnfs_osd_raid_algorithm4 odm_raid_algorithm; -+ * }; -+ */ -+struct pnfs_osd_data_map { -+ u32 odm_num_comps; -+ u64 odm_stripe_unit; -+ u32 odm_group_width; -+ u32 odm_group_depth; -+ u32 odm_mirror_cnt; -+ u32 odm_raid_algorithm; -+}; -+ -+static inline int -+pnfs_osd_data_map_xdr_sz(void) -+{ -+ return 1 + 2 + 1 + 1 + 1 + 1; -+} -+ -+static inline size_t -+pnfs_osd_data_map_incore_sz(void) -+{ -+ return sizeof(struct pnfs_osd_data_map); -+} -+ -+/* struct pnfs_osd_objid4 { -+ * deviceid4 oid_device_id; -+ * uint64_t oid_partition_id; -+ * uint64_t oid_object_id; -+ * }; -+ */ -+struct pnfs_osd_objid { -+ struct nfs4_deviceid oid_device_id; -+ u64 oid_partition_id; -+ u64 oid_object_id; -+}; -+ -+/* For printout. 
I use "dev(%llx:%llx)", _DEVID_LO(), _DEVID_HI BE style */ -+#define _DEVID_LO(oid_device_id) \ -+ (unsigned long long)be64_to_cpup((__be64 *)oid_device_id.data) -+ -+#define _DEVID_HI(oid_device_id) \ -+ (unsigned long long)be64_to_cpup(((__be64 *)oid_device_id.data) + 1) -+ -+static inline int -+pnfs_osd_objid_xdr_sz(void) -+{ -+ return (NFS4_DEVICEID4_SIZE / 4) + 2 + 2; -+} -+ -+static inline size_t -+pnfs_osd_objid_incore_sz(void) -+{ -+ return sizeof(struct pnfs_osd_objid); -+} -+ -+enum pnfs_osd_version { -+ PNFS_OSD_MISSING = 0, -+ PNFS_OSD_VERSION_1 = 1, -+ PNFS_OSD_VERSION_2 = 2 -+}; -+ -+struct pnfs_osd_opaque_cred { -+ u32 cred_len; -+ u8 *cred; -+}; -+ -+static inline int -+pnfs_osd_opaque_cred_xdr_sz(u32 *p) -+{ -+ u32 *start = p; -+ u32 n; -+ -+ READ32(n); -+ p += XDR_QUADLEN(n); -+ return p - start; -+} -+ -+static inline size_t -+pnfs_osd_opaque_cred_incore_sz(u32 *p) -+{ -+ u32 n; -+ -+ READ32(n); -+ return XDR_QUADLEN(n) * 4; -+} -+ -+enum pnfs_osd_cap_key_sec { -+ PNFS_OSD_CAP_KEY_SEC_NONE = 0, -+ PNFS_OSD_CAP_KEY_SEC_SSV = 1, -+}; -+ -+/* struct pnfs_osd_object_cred4 { -+ * pnfs_osd_objid4 oc_object_id; -+ * pnfs_osd_version4 oc_osd_version; -+ * pnfs_osd_cap_key_sec4 oc_cap_key_sec; -+ * opaque oc_capability_key<>; -+ * opaque oc_capability<>; -+ * }; -+ */ -+struct pnfs_osd_object_cred { -+ struct pnfs_osd_objid oc_object_id; -+ u32 oc_osd_version; -+ u32 oc_cap_key_sec; -+ struct pnfs_osd_opaque_cred oc_cap_key; -+ struct pnfs_osd_opaque_cred oc_cap; -+}; -+ -+static inline int -+pnfs_osd_object_cred_xdr_sz(u32 *p) -+{ -+ u32 *start = p; -+ -+ p += pnfs_osd_objid_xdr_sz() + 2; -+ p += pnfs_osd_opaque_cred_xdr_sz(p); -+ p += pnfs_osd_opaque_cred_xdr_sz(p); -+ return p - start; -+} -+ -+static inline size_t -+pnfs_osd_object_cred_incore_sz(u32 *p) -+{ -+ size_t sz = sizeof(struct pnfs_osd_object_cred); -+ -+ p += pnfs_osd_objid_xdr_sz() + 2; -+ sz += pnfs_osd_opaque_cred_incore_sz(p); -+ p += pnfs_osd_opaque_cred_xdr_sz(p); -+ sz += 
pnfs_osd_opaque_cred_incore_sz(p); -+ return sz; -+} -+ -+/* struct pnfs_osd_layout4 { -+ * pnfs_osd_data_map4 olo_map; -+ * uint32_t olo_comps_index; -+ * pnfs_osd_object_cred4 olo_components<>; -+ * }; -+ */ -+struct pnfs_osd_layout { -+ struct pnfs_osd_data_map olo_map; -+ u32 olo_comps_index; -+ u32 olo_num_comps; -+ struct pnfs_osd_object_cred *olo_comps; -+}; -+ -+static inline int -+pnfs_osd_layout_xdr_sz(u32 *p) -+{ -+ u32 *start = p; -+ u32 n; -+ -+ p += pnfs_osd_data_map_xdr_sz() + 1; -+ READ32(n); -+ while ((int)(n--) > 0) -+ p += pnfs_osd_object_cred_xdr_sz(p); -+ return p - start; -+} -+ -+static inline size_t -+pnfs_osd_layout_incore_sz(u32 *p) -+{ -+ u32 n; -+ size_t sz; -+ -+ p += pnfs_osd_data_map_xdr_sz() + 1; -+ READ32(n); -+ sz = sizeof(struct pnfs_osd_layout); -+ while ((int)(n--) > 0) { -+ sz += pnfs_osd_object_cred_incore_sz(p); -+ p += pnfs_osd_object_cred_xdr_sz(p); -+ } -+ return sz; -+} -+ -+/* Device Address */ -+ -+enum pnfs_osd_targetid_type { -+ OBJ_TARGET_ANON = 1, -+ OBJ_TARGET_SCSI_NAME = 2, -+ OBJ_TARGET_SCSI_DEVICE_ID = 3, -+}; -+ -+/* union pnfs_osd_targetid4 switch (pnfs_osd_targetid_type4 oti_type) { -+ * case OBJ_TARGET_SCSI_NAME: -+ * string oti_scsi_name<>; -+ * -+ * case OBJ_TARGET_SCSI_DEVICE_ID: -+ * opaque oti_scsi_device_id<>; -+ * -+ * default: -+ * void; -+ * }; -+ * -+ * union pnfs_osd_targetaddr4 switch (bool ota_available) { -+ * case TRUE: -+ * netaddr4 ota_netaddr; -+ * case FALSE: -+ * void; -+ * }; -+ * -+ * struct pnfs_osd_deviceaddr4 { -+ * pnfs_osd_targetid4 oda_targetid; -+ * pnfs_osd_targetaddr4 oda_targetaddr; -+ * uint64_t oda_lun; -+ * opaque oda_systemid<>; -+ * pnfs_osd_object_cred4 oda_root_obj_cred; -+ * opaque oda_osdname<>; -+ * }; -+ */ -+struct pnfs_osd_targetid { -+ u32 oti_type; -+ struct nfs4_string oti_scsi_device_id; -+}; -+ -+enum { PNFS_OSD_TARGETID_MAX = 1 + PNFS_OSD_OSDNAME_MAXSIZE / 4 }; -+ -+/* struct netaddr4 { -+ * // see struct rpcb in RFC1833 -+ * string r_netid<>; // network id 
-+ * string r_addr<>; // universal address -+ * }; -+ */ -+struct pnfs_osd_net_addr { -+ struct nfs4_string r_netid; -+ struct nfs4_string r_addr; -+}; -+ -+struct pnfs_osd_targetaddr { -+ u32 ota_available; -+ struct pnfs_osd_net_addr ota_netaddr; -+}; -+ -+enum { -+ NETWORK_ID_MAX = 16 / 4, -+ UNIVERSAL_ADDRESS_MAX = 64 / 4, -+ PNFS_OSD_TARGETADDR_MAX = 3 + NETWORK_ID_MAX + UNIVERSAL_ADDRESS_MAX, -+}; -+ -+struct pnfs_osd_deviceaddr { -+ struct pnfs_osd_targetid oda_targetid; -+ struct pnfs_osd_targetaddr oda_targetaddr; -+ u8 oda_lun[8]; -+ struct nfs4_string oda_systemid; -+ struct pnfs_osd_object_cred oda_root_obj_cred; -+ struct nfs4_string oda_osdname; -+}; -+ -+enum { -+ ODA_OSDNAME_MAX = PNFS_OSD_OSDNAME_MAXSIZE / 4, -+ PNFS_OSD_DEVICEADDR_MAX = -+ PNFS_OSD_TARGETID_MAX + PNFS_OSD_TARGETADDR_MAX + -+ 2 /*oda_lun*/ + -+ 1 + OSD_SYSTEMID_LEN + -+ 1 + ODA_OSDNAME_MAX, -+}; -+ -+/* LAYOUTCOMMIT: layoutupdate */ -+ -+/* union pnfs_osd_deltaspaceused4 switch (bool dsu_valid) { -+ * case TRUE: -+ * int64_t dsu_delta; -+ * case FALSE: -+ * void; -+ * }; -+ * -+ * struct pnfs_osd_layoutupdate4 { -+ * pnfs_osd_deltaspaceused4 olu_delta_space_used; -+ * bool olu_ioerr_flag; -+ * }; -+ */ -+struct pnfs_osd_layoutupdate { -+ u32 dsu_valid; -+ s64 dsu_delta; -+ u32 olu_ioerr_flag; -+}; -+ -+/* LAYOUTRETURN: I/O Rrror Report */ -+ -+enum pnfs_osd_errno { -+ PNFS_OSD_ERR_EIO = 1, -+ PNFS_OSD_ERR_NOT_FOUND = 2, -+ PNFS_OSD_ERR_NO_SPACE = 3, -+ PNFS_OSD_ERR_BAD_CRED = 4, -+ PNFS_OSD_ERR_NO_ACCESS = 5, -+ PNFS_OSD_ERR_UNREACHABLE = 6, -+ PNFS_OSD_ERR_RESOURCE = 7 -+}; -+ -+/* struct pnfs_osd_ioerr4 { -+ * pnfs_osd_objid4 oer_component; -+ * length4 oer_comp_offset; -+ * length4 oer_comp_length; -+ * bool oer_iswrite; -+ * pnfs_osd_errno4 oer_errno; -+ * }; -+ */ -+struct pnfs_osd_ioerr { -+ struct pnfs_osd_objid oer_component; -+ u64 oer_comp_offset; -+ u64 oer_comp_length; -+ u32 oer_iswrite; -+ u32 oer_errno; -+}; -+ -+static inline unsigned -+pnfs_osd_ioerr_xdr_sz(void) 
-+{ -+ return pnfs_osd_objid_xdr_sz() + 2 + 2 + 1 + 1; -+} -+ -+/* OSD XDR API */ -+ -+/* Layout helpers */ -+extern struct pnfs_osd_layout *pnfs_osd_xdr_decode_layout( -+ struct pnfs_osd_layout *layout, u32 *p); -+ -+extern int pnfs_osd_xdr_encode_layout( -+ struct exp_xdr_stream *xdr, -+ struct pnfs_osd_layout *layout); -+ -+/* Device Info helpers */ -+ -+/* First pass calculate total size for space needed */ -+extern size_t pnfs_osd_xdr_deviceaddr_incore_sz(u32 *p); -+ -+/* Note: some strings pointed to inside @deviceaddr might point -+ * to space inside @p. @p should stay valid while @deviceaddr -+ * is in use. -+ * It is assumed that @deviceaddr points to bigger memory of size -+ * calculated in first pass by pnfs_osd_xdr_deviceaddr_incore_sz() -+ */ -+extern void pnfs_osd_xdr_decode_deviceaddr( -+ struct pnfs_osd_deviceaddr *deviceaddr, u32 *p); -+ -+/* For Servers */ -+extern int pnfs_osd_xdr_encode_deviceaddr( -+ struct exp_xdr_stream *xdr, struct pnfs_osd_deviceaddr *devaddr); -+ -+/* layoutupdate (layout_commit) xdr helpers */ -+extern int -+pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr, -+ struct pnfs_osd_layoutupdate *lou); -+extern __be32 * -+pnfs_osd_xdr_decode_layoutupdate(struct pnfs_osd_layoutupdate *lou, __be32 *p); -+ -+/* osd_ioerror encoding/decoding (layout_return) */ -+extern int -+pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr, struct pnfs_osd_ioerr *ioerr); -+extern __be32 * -+pnfs_osd_xdr_decode_ioerr(struct pnfs_osd_ioerr *ioerr, __be32 *p); -+ -+#endif /* __PNFS_OSD_XDR_H__ */ -diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h -index 6760816..fc3d2fc 100644 ---- a/include/linux/posix_acl.h -+++ b/include/linux/posix_acl.h -@@ -8,6 +8,7 @@ - #ifndef __LINUX_POSIX_ACL_H - #define __LINUX_POSIX_ACL_H - -+#include - #include - - #define ACL_UNDEFINED_ID (-1) -diff --git a/include/linux/sunrpc/msg_prot.h b/include/linux/sunrpc/msg_prot.h -index 77e6248..1b26fff 100644 ---- a/include/linux/sunrpc/msg_prot.h -+++ 
b/include/linux/sunrpc/msg_prot.h -@@ -14,6 +14,8 @@ - /* size of an XDR encoding unit in bytes, i.e. 32bit */ - #define XDR_UNIT (4) - -+#include -+ - /* spec defines authentication flavor as an unsigned 32 bit integer */ - typedef u32 rpc_authflavor_t; - -diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h -index cf14db9..2177d50 100644 ---- a/include/linux/sunrpc/rpc_pipe_fs.h -+++ b/include/linux/sunrpc/rpc_pipe_fs.h -@@ -3,6 +3,7 @@ - - #ifdef __KERNEL__ - -+#include - #include - - struct rpc_pipe_msg { -@@ -11,6 +12,10 @@ struct rpc_pipe_msg { - size_t len; - size_t copied; - int errno; -+#define PIPEFS_AUTOFREE_RPCMSG 0x01 /* frees rpc_pipe_msg */ -+#define PIPEFS_AUTOFREE_RPCMSG_DATA 0x02 /* frees rpc_pipe_msg->data */ -+#define PIPEFS_AUTOFREE_UPCALL_MSG PIPEFS_AUTOFREE_RPCMSG_DATA -+ u8 flags; - }; - - struct rpc_pipe_ops { -diff --git a/include/linux/sunrpc/simple_rpc_pipefs.h b/include/linux/sunrpc/simple_rpc_pipefs.h -new file mode 100644 -index 0000000..f6a1227 ---- /dev/null -+++ b/include/linux/sunrpc/simple_rpc_pipefs.h -@@ -0,0 +1,105 @@ -+/* -+ * Copyright (c) 2008 The Regents of the University of Michigan. -+ * All rights reserved. -+ * -+ * David M. Richter -+ * -+ * Drawing on work done by Andy Adamson and -+ * Marius Eriksen . Thanks for the help over the -+ * years, guys. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. 
Neither the name of the University nor the names of its -+ * contributors may be used to endorse or promote products derived -+ * from this software without specific prior written permission. -+ * -+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED -+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ * -+ * With thanks to CITI's project sponsor and partner, IBM. -+ */ -+ -+#ifndef _SIMPLE_RPC_PIPEFS_H_ -+#define _SIMPLE_RPC_PIPEFS_H_ -+ -+#include -+ -+#define payload_of(headerp) ((void *)(headerp + 1)) -+ -+/* -+ * struct pipefs_hdr -- the generic message format for simple_rpc_pipefs. -+ * Messages may simply be the header itself, although having an optional -+ * data payload follow the header allows much more flexibility. -+ * -+ * Messages are created using pipefs_alloc_init_msg() and -+ * pipefs_alloc_init_msg_padded(), both of which accept a pointer to an -+ * (optional) data payload. 
-+ * -+ * Given a struct pipefs_hdr *msg that has a struct foo payload, the data -+ * can be accessed using: struct foo *foop = payload_of(msg) -+ */ -+struct pipefs_hdr { -+ u32 msgid; -+ u8 type; -+ u8 flags; -+ u16 totallen; /* length of entire message, including hdr itself */ -+ u32 status; -+}; -+ -+/* -+ * struct pipefs_list -- a type of list used for tracking callers who've made an -+ * upcall and are blocked waiting for a reply. -+ * -+ * See pipefs_queue_upcall_waitreply() and pipefs_assign_upcall_reply(). -+ */ -+struct pipefs_list { -+ struct list_head list; -+ spinlock_t list_lock; -+}; -+ -+ -+/* See net/sunrpc/simple_rpc_pipefs.c for more info on using these functions. */ -+extern struct dentry *pipefs_mkpipe(const char *name, -+ const struct rpc_pipe_ops *ops, -+ int wait_for_open); -+extern void pipefs_closepipe(struct dentry *pipe); -+extern void pipefs_init_list(struct pipefs_list *list); -+extern struct pipefs_hdr *pipefs_alloc_init_msg(u32 msgid, u8 type, u8 flags, -+ void *data, u16 datalen); -+extern struct pipefs_hdr *pipefs_alloc_init_msg_padded(u32 msgid, u8 type, -+ u8 flags, void *data, -+ u16 datalen, u16 padlen); -+extern struct pipefs_hdr *pipefs_queue_upcall_waitreply(struct dentry *pipe, -+ struct pipefs_hdr *msg, -+ struct pipefs_list -+ *uplist, u8 upflags, -+ u32 timeout); -+extern int pipefs_queue_upcall_noreply(struct dentry *pipe, -+ struct pipefs_hdr *msg, u8 upflags); -+extern int pipefs_assign_upcall_reply(struct pipefs_hdr *reply, -+ struct pipefs_list *uplist); -+extern struct pipefs_hdr *pipefs_readmsg(struct file *filp, -+ const char __user *src, size_t len); -+extern ssize_t pipefs_generic_upcall(struct file *filp, -+ struct rpc_pipe_msg *rpcmsg, -+ char __user *dst, size_t buflen); -+extern void pipefs_generic_destroy_msg(struct rpc_pipe_msg *rpcmsg); -+ -+#endif /* _SIMPLE_RPC_PIPEFS_H_ */ -diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h -index 5f4e18b..f7a0358 100644 ---- 
a/include/linux/sunrpc/svc_xprt.h -+++ b/include/linux/sunrpc/svc_xprt.h -@@ -166,4 +166,41 @@ static inline char *__svc_print_addr(const struct sockaddr *addr, - - return buf; - } -+ -+/* -+ * Print a network address in a universal format (see rfc1833 and nfsv4.1) -+ */ -+static inline int __svc_print_netaddr(struct sockaddr *addr, -+ struct xdr_netobj *na) -+{ -+ u16 port; -+ ssize_t len; -+ -+ switch (addr->sa_family) { -+ case AF_INET: { -+ struct sockaddr_in *sin = (struct sockaddr_in *)addr; -+ port = ntohs(sin->sin_port); -+ -+ len = snprintf(na->data, na->len, "%pI4.%u.%u", -+ &sin->sin_addr, -+ port >> 8, port & 0xff); -+ break; -+ } -+ case AF_INET6: { -+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; -+ port = ntohs(sin6->sin6_port); -+ -+ len = snprintf(na->data, na->len, "%pI6.%u.%u", -+ &sin6->sin6_addr, -+ port >> 8, port & 0xff); -+ break; -+ } -+ default: -+ snprintf(na->data, na->len, "unknown address type: %d", -+ addr->sa_family); -+ len = -EINVAL; -+ break; -+ } -+ return len; -+} - #endif /* SUNRPC_SVC_XPRT_H */ -diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h -index 35cf2e8..bb5f3fd 100644 ---- a/include/linux/sunrpc/xdr.h -+++ b/include/linux/sunrpc/xdr.h -@@ -131,6 +131,13 @@ xdr_decode_hyper(__be32 *p, __u64 *valp) - return p + 2; - } - -+static inline __be32 * -+xdr_decode_opaque_fixed(__be32 *p, void *ptr, unsigned int len) -+{ -+ memcpy(ptr, p, len); -+ return p + XDR_QUADLEN(len); -+} -+ - /* - * Adjust kvec to reflect end of xdr'ed data (RPC client XDR) - */ -@@ -197,6 +204,7 @@ struct xdr_stream { - - extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); - extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes); -+extern __be32 *xdr_rewind_stream(struct xdr_stream *xdr, __be32 *q); - extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, - unsigned int base, unsigned int len); - extern void xdr_init_decode(struct xdr_stream *xdr, struct 
xdr_buf *buf, __be32 *p); -diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile -index 9d2fca5..e102040 100644 ---- a/net/sunrpc/Makefile -+++ b/net/sunrpc/Makefile -@@ -12,7 +12,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \ - svc.o svcsock.o svcauth.o svcauth_unix.o \ - addr.o rpcb_clnt.o timer.o xdr.o \ - sunrpc_syms.o cache.o rpc_pipe.o \ -- svc_xprt.o -+ svc_xprt.o simple_rpc_pipefs.o - sunrpc-$(CONFIG_NFS_V4_1) += backchannel_rqst.o bc_svc.o - sunrpc-$(CONFIG_PROC_FS) += stats.o - sunrpc-$(CONFIG_SYSCTL) += sysctl.o -diff --git a/net/sunrpc/simple_rpc_pipefs.c b/net/sunrpc/simple_rpc_pipefs.c -new file mode 100644 -index 0000000..24af0a1 ---- /dev/null -+++ b/net/sunrpc/simple_rpc_pipefs.c -@@ -0,0 +1,423 @@ -+/* -+ * net/sunrpc/simple_rpc_pipefs.c -+ * -+ * Copyright (c) 2008 The Regents of the University of Michigan. -+ * All rights reserved. -+ * -+ * David M. Richter -+ * -+ * Drawing on work done by Andy Adamson and -+ * Marius Eriksen . Thanks for the help over the -+ * years, guys. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. Neither the name of the University nor the names of its -+ * contributors may be used to endorse or promote products derived -+ * from this software without specific prior written permission. -+ * -+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED -+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ * -+ * With thanks to CITI's project sponsor and partner, IBM. -+ */ -+ -+#include -+#include -+#include -+ -+ -+/* -+ * Make an rpc_pipefs pipe named @name at the root of the mounted rpc_pipefs -+ * filesystem. -+ * -+ * If @wait_for_open is non-zero and an upcall is later queued but the userland -+ * end of the pipe has not yet been opened, the upcall will remain queued until -+ * the pipe is opened; otherwise, the upcall queueing will return with -EPIPE. -+ */ -+struct dentry *pipefs_mkpipe(const char *name, const struct rpc_pipe_ops *ops, -+ int wait_for_open) -+{ -+ struct dentry *dir, *pipe; -+ struct vfsmount *mnt; -+ -+ mnt = rpc_get_mount(); -+ if (IS_ERR(mnt)) { -+ pipe = ERR_CAST(mnt); -+ goto out; -+ } -+ dir = mnt->mnt_root; -+ if (!dir) { -+ pipe = ERR_PTR(-ENOENT); -+ goto out; -+ } -+ pipe = rpc_mkpipe(dir, name, NULL, ops, -+ wait_for_open ? RPC_PIPE_WAIT_FOR_OPEN : 0); -+out: -+ return pipe; -+} -+EXPORT_SYMBOL(pipefs_mkpipe); -+ -+/* -+ * Shutdown a pipe made by pipefs_mkpipe(). -+ * XXX: do we need to retain an extra reference on the mount? -+ */ -+void pipefs_closepipe(struct dentry *pipe) -+{ -+ rpc_unlink(pipe); -+ rpc_put_mount(); -+} -+EXPORT_SYMBOL(pipefs_closepipe); -+ -+/* -+ * Initialize a struct pipefs_list -- which are a way to keep track of callers -+ * who're blocked having made an upcall and are awaiting a reply. 
-+ * -+ * See pipefs_queue_upcall_waitreply() and pipefs_find_upcall_msgid() for how -+ * to use them. -+ */ -+inline void pipefs_init_list(struct pipefs_list *list) -+{ -+ INIT_LIST_HEAD(&list->list); -+ spin_lock_init(&list->list_lock); -+} -+EXPORT_SYMBOL(pipefs_init_list); -+ -+/* -+ * Alloc/init a generic pipefs message header and copy into its message body -+ * an arbitrary data payload. -+ * -+ * struct pipefs_hdr's are meant to serve as generic, general-purpose message -+ * headers for easy rpc_pipefs I/O. When an upcall is made, the -+ * struct pipefs_hdr is assigned to a struct rpc_pipe_msg and delivered -+ * therein. --And yes, the naming can seem a little confusing at first: -+ * -+ * When one thinks of an upcall "message", in simple_rpc_pipefs that's a -+ * struct pipefs_hdr (possibly with an attached message body). A -+ * struct rpc_pipe_msg is actually only the -vehicle- by which the "real" -+ * message is delivered and processed. -+ */ -+struct pipefs_hdr *pipefs_alloc_init_msg_padded(u32 msgid, u8 type, u8 flags, -+ void *data, u16 datalen, u16 padlen) -+{ -+ u16 totallen; -+ struct pipefs_hdr *msg = NULL; -+ -+ totallen = sizeof(*msg) + datalen + padlen; -+ if (totallen > PAGE_SIZE) { -+ msg = ERR_PTR(-E2BIG); -+ goto out; -+ } -+ -+ msg = kzalloc(totallen, GFP_KERNEL); -+ if (!msg) { -+ msg = ERR_PTR(-ENOMEM); -+ goto out; -+ } -+ -+ msg->msgid = msgid; -+ msg->type = type; -+ msg->flags = flags; -+ msg->totallen = totallen; -+ memcpy(payload_of(msg), data, datalen); -+out: -+ return msg; -+} -+EXPORT_SYMBOL(pipefs_alloc_init_msg_padded); -+ -+/* -+ * See the description of pipefs_alloc_init_msg_padded(). 
-+ */ -+struct pipefs_hdr *pipefs_alloc_init_msg(u32 msgid, u8 type, u8 flags, -+ void *data, u16 datalen) -+{ -+ return pipefs_alloc_init_msg_padded(msgid, type, flags, data, -+ datalen, 0); -+} -+EXPORT_SYMBOL(pipefs_alloc_init_msg); -+ -+ -+static void pipefs_init_rpcmsg(struct rpc_pipe_msg *rpcmsg, -+ struct pipefs_hdr *msg, u8 upflags) -+{ -+ memset(rpcmsg, 0, sizeof(*rpcmsg)); -+ rpcmsg->data = msg; -+ rpcmsg->len = msg->totallen; -+ rpcmsg->flags = upflags; -+} -+ -+static struct rpc_pipe_msg *pipefs_alloc_init_rpcmsg(struct pipefs_hdr *msg, -+ u8 upflags) -+{ -+ struct rpc_pipe_msg *rpcmsg; -+ -+ rpcmsg = kmalloc(sizeof(*rpcmsg), GFP_KERNEL); -+ if (!rpcmsg) -+ return ERR_PTR(-ENOMEM); -+ -+ pipefs_init_rpcmsg(rpcmsg, msg, upflags); -+ return rpcmsg; -+} -+ -+ -+/* represents an upcall that'll block and wait for a reply */ -+struct pipefs_upcall { -+ u32 msgid; -+ struct rpc_pipe_msg rpcmsg; -+ struct list_head list; -+ wait_queue_head_t waitq; -+ struct pipefs_hdr *reply; -+}; -+ -+ -+static void pipefs_init_upcall_waitreply(struct pipefs_upcall *upcall, -+ struct pipefs_hdr *msg, u8 upflags) -+{ -+ upcall->reply = NULL; -+ upcall->msgid = msg->msgid; -+ INIT_LIST_HEAD(&upcall->list); -+ init_waitqueue_head(&upcall->waitq); -+ pipefs_init_rpcmsg(&upcall->rpcmsg, msg, upflags); -+} -+ -+static int __pipefs_queue_upcall_waitreply(struct dentry *pipe, -+ struct pipefs_upcall *upcall, -+ struct pipefs_list *uplist, -+ u32 timeout) -+{ -+ int err = 0; -+ DECLARE_WAITQUEUE(wq, current); -+ -+ add_wait_queue(&upcall->waitq, &wq); -+ spin_lock(&uplist->list_lock); -+ list_add(&upcall->list, &uplist->list); -+ spin_unlock(&uplist->list_lock); -+ -+ err = rpc_queue_upcall(pipe->d_inode, &upcall->rpcmsg); -+ if (err < 0) -+ goto out; -+ -+ if (timeout) { -+ /* retval of 0 means timer expired */ -+ err = schedule_timeout_uninterruptible(timeout); -+ if (err == 0 && upcall->reply == NULL) -+ err = -ETIMEDOUT; -+ } else { -+ set_current_state(TASK_UNINTERRUPTIBLE); -+ 
schedule(); -+ __set_current_state(TASK_RUNNING); -+ } -+ -+out: -+ spin_lock(&uplist->list_lock); -+ list_del_init(&upcall->list); -+ spin_unlock(&uplist->list_lock); -+ remove_wait_queue(&upcall->waitq, &wq); -+ return err; -+} -+ -+/* -+ * Queue a pipefs msg for an upcall to userspace, place the calling thread -+ * on @uplist, and block the thread to wait for a reply. If @timeout is -+ * nonzero, the thread will be blocked for at most @timeout jiffies. -+ * -+ * (To convert time units into jiffies, consider the functions -+ * msecs_to_jiffies(), usecs_to_jiffies(), timeval_to_jiffies(), and -+ * timespec_to_jiffies().) -+ * -+ * Once a reply is received by your downcall handler, call -+ * pipefs_assign_upcall_reply() with @uplist to find the corresponding upcall, -+ * assign the reply, and wake the waiting thread. -+ * -+ * This function's return value pointer may be an error and should be checked -+ * with IS_ERR() before attempting to access the reply message. -+ * -+ * Callers are responsible for freeing @msg, unless pipefs_generic_destroy_msg() -+ * is used as the ->destroy_msg() callback and the PIPEFS_AUTOFREE_UPCALL_MSG -+ * flag is set in @upflags. See also rpc_pipe_fs.h. -+ */ -+struct pipefs_hdr *pipefs_queue_upcall_waitreply(struct dentry *pipe, -+ struct pipefs_hdr *msg, -+ struct pipefs_list *uplist, -+ u8 upflags, u32 timeout) -+{ -+ int err = 0; -+ struct pipefs_upcall upcall; -+ -+ pipefs_init_upcall_waitreply(&upcall, msg, upflags); -+ err = __pipefs_queue_upcall_waitreply(pipe, &upcall, uplist, timeout); -+ if (err < 0) { -+ kfree(upcall.reply); -+ upcall.reply = ERR_PTR(err); -+ } -+ -+ return upcall.reply; -+} -+EXPORT_SYMBOL(pipefs_queue_upcall_waitreply); -+ -+/* -+ * Queue a pipefs msg for an upcall to userspace and immediately return (i.e., -+ * no reply is expected). 
-+ * -+ * Callers are responsible for freeing @msg, unless pipefs_generic_destroy_msg() -+ * is used as the ->destroy_msg() callback and the PIPEFS_AUTOFREE_UPCALL_MSG -+ * flag is set in @upflags. See also rpc_pipe_fs.h. -+ */ -+int pipefs_queue_upcall_noreply(struct dentry *pipe, struct pipefs_hdr *msg, -+ u8 upflags) -+{ -+ int err = 0; -+ struct rpc_pipe_msg *rpcmsg; -+ -+ upflags |= PIPEFS_AUTOFREE_RPCMSG; -+ rpcmsg = pipefs_alloc_init_rpcmsg(msg, upflags); -+ if (IS_ERR(rpcmsg)) { -+ err = PTR_ERR(rpcmsg); -+ goto out; -+ } -+ err = rpc_queue_upcall(pipe->d_inode, rpcmsg); -+out: -+ return err; -+} -+EXPORT_SYMBOL(pipefs_queue_upcall_noreply); -+ -+ -+static struct pipefs_upcall *pipefs_find_upcall_msgid(u32 msgid, -+ struct pipefs_list *uplist) -+{ -+ struct pipefs_upcall *upcall; -+ -+ spin_lock(&uplist->list_lock); -+ list_for_each_entry(upcall, &uplist->list, list) -+ if (upcall->msgid == msgid) -+ goto out; -+ upcall = NULL; -+out: -+ spin_unlock(&uplist->list_lock); -+ return upcall; -+} -+ -+/* -+ * In your rpc_pipe_ops->downcall() handler, once you've read in a downcall -+ * message and have determined that it is a reply to a waiting upcall, -+ * you can use this function to find the appropriate upcall, assign the result, -+ * and wake the upcall thread. -+ * -+ * The reply message must have the same msgid as the original upcall message's. -+ * -+ * See also pipefs_queue_upcall_waitreply() and pipefs_readmsg(). 
-+ */ -+int pipefs_assign_upcall_reply(struct pipefs_hdr *reply, -+ struct pipefs_list *uplist) -+{ -+ int err = 0; -+ struct pipefs_upcall *upcall; -+ -+ upcall = pipefs_find_upcall_msgid(reply->msgid, uplist); -+ if (!upcall) { -+ printk(KERN_ERR "%s: ERROR: have reply but no matching upcall " -+ "for msgid %d\n", __func__, reply->msgid); -+ err = -ENOENT; -+ goto out; -+ } -+ upcall->reply = reply; -+ wake_up(&upcall->waitq); -+out: -+ return err; -+} -+EXPORT_SYMBOL(pipefs_assign_upcall_reply); -+ -+/* -+ * Generic method to read-in and return a newly-allocated message which begins -+ * with a struct pipefs_hdr. -+ */ -+struct pipefs_hdr *pipefs_readmsg(struct file *filp, const char __user *src, -+ size_t len) -+{ -+ int err = 0, hdrsize; -+ struct pipefs_hdr *msg = NULL; -+ -+ hdrsize = sizeof(*msg); -+ if (len < hdrsize) { -+ printk(KERN_ERR "%s: ERROR: header is too short (%d vs %d)\n", -+ __func__, (int) len, hdrsize); -+ err = -EINVAL; -+ goto out; -+ } -+ -+ msg = kzalloc(len, GFP_KERNEL); -+ if (!msg) { -+ err = -ENOMEM; -+ goto out; -+ } -+ if (copy_from_user(msg, src, len)) -+ err = -EFAULT; -+out: -+ if (err) { -+ kfree(msg); -+ msg = ERR_PTR(err); -+ } -+ return msg; -+} -+EXPORT_SYMBOL(pipefs_readmsg); -+ -+/* -+ * Generic rpc_pipe_ops->upcall() handler implementation. -+ * -+ * Don't call this directly: to make an upcall, use -+ * pipefs_queue_upcall_waitreply() or pipefs_queue_upcall_noreply(). 
-+ */ -+ssize_t pipefs_generic_upcall(struct file *filp, struct rpc_pipe_msg *rpcmsg, -+ char __user *dst, size_t buflen) -+{ -+ char *data; -+ ssize_t len, left; -+ -+ data = (char *)rpcmsg->data + rpcmsg->copied; -+ len = rpcmsg->len - rpcmsg->copied; -+ if (len > buflen) -+ len = buflen; -+ -+ left = copy_to_user(dst, data, len); -+ if (left < 0) { -+ rpcmsg->errno = left; -+ return left; -+ } -+ -+ len -= left; -+ rpcmsg->copied += len; -+ rpcmsg->errno = 0; -+ return len; -+} -+EXPORT_SYMBOL(pipefs_generic_upcall); -+ -+/* -+ * Generic rpc_pipe_ops->destroy_msg() handler implementation. -+ * -+ * Items are only freed if @rpcmsg->flags has been set appropriately. -+ * See pipefs_queue_upcall_noreply() and rpc_pipe_fs.h. -+ */ -+void pipefs_generic_destroy_msg(struct rpc_pipe_msg *rpcmsg) -+{ -+ if (rpcmsg->flags & PIPEFS_AUTOFREE_UPCALL_MSG) -+ kfree(rpcmsg->data); -+ if (rpcmsg->flags & PIPEFS_AUTOFREE_RPCMSG) -+ kfree(rpcmsg); -+} -+EXPORT_SYMBOL(pipefs_generic_destroy_msg); -diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c -index a1f82a8..98a59f6 100644 ---- a/net/sunrpc/xdr.c -+++ b/net/sunrpc/xdr.c -@@ -395,24 +395,29 @@ xdr_shrink_pagelen(struct xdr_buf *buf, size_t len) - { - struct kvec *tail; - size_t copy; -- char *p; - unsigned int pglen = buf->page_len; -+ unsigned int tailbuf_len; - - tail = buf->tail; - BUG_ON (len > pglen); - -+ tailbuf_len = buf->buflen - buf->head->iov_len - buf->page_len; -+ - /* Shift the tail first */ -- if (tail->iov_len != 0) { -- p = (char *)tail->iov_base + len; -+ if (tailbuf_len != 0) { -+ unsigned int free_space = tailbuf_len - tail->iov_len; -+ -+ if (len < free_space) -+ free_space = len; -+ tail->iov_len += free_space; -+ -+ copy = len; - if (tail->iov_len > len) { -- copy = tail->iov_len - len; -- memmove(p, tail->iov_base, copy); -+ char *p = (char *)tail->iov_base + len; -+ memmove(p, tail->iov_base, tail->iov_len - len); - } else -- buf->buflen -= len; -- /* Copy from the inlined pages into the tail */ -- copy 
= len; -- if (copy > tail->iov_len) - copy = tail->iov_len; -+ /* Copy from the inlined pages into the tail */ - _copy_from_pages((char *)tail->iov_base, - buf->pages, buf->page_base + pglen - len, - copy); -@@ -496,6 +501,27 @@ __be32 * xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes) - EXPORT_SYMBOL_GPL(xdr_reserve_space); - - /** -+ * xdr_rewind_stream - rewind a stream back to some checkpoint -+ * @xdr: pointer to xdr_stream -+ * @q: some checkpoint at historical place of @xdr -+ * -+ * Restors an xdr stream to some historical point. @q must be -+ * a logical xdr point in the past that was sampled by @q = @xdr->p. -+ */ -+__be32 *xdr_rewind_stream(struct xdr_stream *xdr, __be32 *q) -+{ -+ size_t nbytes = (xdr->p - q) << 2; -+ -+ BUG_ON(xdr->p < q); -+ BUG_ON(nbytes > xdr->iov->iov_len || nbytes > xdr->buf->len); -+ xdr->p = q; -+ xdr->iov->iov_len -= nbytes; -+ xdr->buf->len -= nbytes; -+ return q; -+} -+EXPORT_SYMBOL_GPL(xdr_rewind_stream); -+ -+/** - * xdr_write_pages - Insert a list of pages into an XDR buffer for sending - * @xdr: pointer to xdr_stream - * @pages: list of pages diff --git a/pnfs-all-2.6.38-rc2-2011-01-27.patch b/pnfs-all-2.6.38-rc2-2011-01-27.patch new file mode 100644 index 0000000..5f476b5 --- /dev/null +++ b/pnfs-all-2.6.38-rc2-2011-01-27.patch @@ -0,0 +1,27592 @@ +diff -up linux-2.6.37.noarch/Documentation/filesystems/spnfs.txt.orig linux-2.6.37.noarch/Documentation/filesystems/spnfs.txt +--- linux-2.6.37.noarch/Documentation/filesystems/spnfs.txt.orig 2011-01-28 09:43:53.292780695 -0500 ++++ linux-2.6.37.noarch/Documentation/filesystems/spnfs.txt 2011-01-28 09:43:53.292780695 -0500 +@@ -0,0 +1,211 @@ ++(c) 2007 Network Appliance Inc. ++ ++spNFS ++----- ++ ++An spNFS system consists of a Meta Data Server (MDS), a number of Client machines (C) and a number of Data Servers (DS). ++ ++A file system is mounted by the clients from the MDS, and all file data ++is striped across the DSs. 
++ ++Identify the machines that will be filling each of these roles. ++ ++The spnfs kernel will be installed on all machines: clients, the MDS and DSs. ++ ++ ++Building and installing the spNFS kernel ++---------------------------------------- ++ ++Get the spNFS kernel from: ++ ++ git://linux-nfs.org/~bhalevy/linux-pnfs.git ++ ++Use the pnfs-all-latest branch and add these options to your .config file ++ ++ CONFIG_NETWORK_FILESYSTEMS=y ++ CONFIG_NFS_FS=m ++ CONFIG_NFS_V4=y ++ CONFIG_NFS_V4_1=y ++ CONFIG_PNFS=y ++ CONFIG_NFSD=m ++ CONFIG_PNFSD=y ++ # CONFIG_PNFSD_LOCAL_EXPORT is not set ++ CONFIG_SPNFS=y ++ ++By default, spNFS uses whole-file layouts. Layout segments can be enabled ++by adding: ++ ++ CONFIG_SPNFS_LAYOUTSEGMENTS=y ++ ++to your .config file. ++ ++Building and installation of kernel+modules is as usual. ++This kernel should be installed and booted on the client, MDS and DSs. ++ ++Note that CONFIG_PNFSD_LOCAL_EXPORT must be disabled for spnfs as it ++takes over the pnfs export interface. ++ ++Building nfs-utils ++------------------ ++ ++Get the nfs-utils package containing spnfsd from: ++ ++ git://linux-nfs.org/~bhalevy/pnfs-nfs-utils.git ++ ++Follow the standard instructions for building nfs-utils. ++ ++After building, the spnfsd daemon will be located in utils/spnfsd. The spnfsd ++daemon will only be needed on the MDS. ++ ++ ++Installation ++------------ ++ ++The nfs-utils package contains a default spnfsd.conf file in ++utils/spnfsd/spnfsd.conf. Copy this file to /etc/spnfsd.conf. ++ ++By default, the DS-Mount-Directory is set to /spnfs (see spnfsd.conf). Under ++this directory, mount points must be created for each DS to ++be used for pNFS data stripes. These mount points are named by the ip address ++of the corresponding DS. In the sample spnfsd.conf, there are two ++DSs defined (172.16.28.134 and 172.16.28.141). ++ ++Following the sample spnfsd.conf, ++ ++ mkdir /spnfs ++ ++on the MDS (corresponding to DS-Mount-Directory). 
Then ++ ++ mkdir /spnfs/172.16.28.134 ++ mkdir /spnfs/172.16.28.141 ++ ++to create the mount points for the DSs. ++ ++On the DSs, choose a directory where data stripes will be created by the MDS. ++For the sample file, this directory is /pnfs, so on each DS execute: ++ ++ mkdir /pnfs ++ ++This directory is specified in the spnfsd.conf file by the DS*_ROOT option ++(where * is replaced by the DS number). DS_ROOT is specified relative to ++the directory being exported by the DSs. In our example, our DSs are exporting ++the root directory (/) and therefore our DS_ROOT is /pnfs. On the DSs, we have ++the following entry in /etc/exports: ++ ++ / *(rw,fsid=0,insecure,no_root_squash,sync,no_subtree_check) ++ ++N.B. If we had created a /exports directory and a /pnfs directory under ++/exports, and if we were exporting /exports, then DS_ROOT would still be /pnfs ++(not /exports/pnfs). ++ ++It may be useful to add entries to /etc/fstab on the MDS to automatically ++mount the DS_ROOT file systems. For this example, our MDS fstab would ++contain: ++ ++ 172.16.28.134:/pnfs /spnfs/172.16.28.134 nfs defaults 1 2 ++ 172.16.28.141:/pnfs /spnfs/172.16.28.141 nfs defaults 1 2 ++ ++The DS mounts must be performed manually or via fstab at this time (automatic ++mounting, directory creation, etc. are on the todo list). To perform I/O ++through the MDS, the DS mounts MUST use NFSv3 at this time (this restriction ++will eventually be removed). ++ ++ ++On the MDS, choose a file system to use with spNFS and export it, e.g.: ++ ++ / *(rw,fsid=0,insecure,no_root_squash,sync,no_subtree_check,pnfs) ++ ++Make sure nfsd and all supporting processes are running on the MDS and DSs. ++ ++ ++Running ++------- ++ ++If rpc_pipefs is not already mounted (if you're running idmapd it probably is), ++you may want to add the following line to /etc/fstab: ++ ++ rpc_pipefs /var/lib/nfs/rpc_pipefs rpc_pipefs defaults 0 0 ++ ++to automatically mount rpc_pipefs.
++ ++With spnfsd.conf configured for your environment and the mounts mounted as ++described above, spnfsd can now be started. ++ ++On the MDS, execute spnfsd: ++ ++ spnfsd ++ ++The executable is located in the directory where it was built, and ++may also have been installed elsewhere depending on how you built nfs-utils. ++It will run in the foreground by default, and in fact will do so despite ++any options suggesting the contrary (it's still a debugging build). ++ ++On the client, make sure the nfslayoutdriver module is loaded: ++ ++ modprobe nfslayoutdriver ++ ++Then mount the file system from the MDS: ++ ++ mount -t nfs4 -o minorversion=1 mds:/ /mnt ++ ++I/O through the MDS is now supported. To use it, do not load the ++nfslayoutdriver on the client, and mount the MDS using NFSv4 or 4.1 ++(NFSv2 and v3 are not yet supported). ++ ++You may now use spNFS by performing file system activities in /mnt. ++If you create files in /mnt, you should see stripe files corresponding to ++new files being created on the DSs. The current implementation names the ++stripe files based on the inode number of the file on the MDS. For example, ++if you create a file foo in /mnt and do an 'ls -li /mnt/foo': ++ ++ # ls -li foo ++ 1233 -rw-r--r-- 1 root root 0 Nov 29 15:54 foo ++ ++You should see stripe files on each DS under /pnfs (per the sample) named ++1233. The file /pnfs/1233 on DS1 will contain the first bytes ++of data written to foo, DS2 will contain the next bytes, etc. ++Removing /mnt/foo will remove the corresponding stripe files on the DSs. ++Other file system operations should behave (mostly :-) as expected. ++ ++ ++Layout Segments ++--------------- ++ ++If the kernel is compiled to support layout segments, there will ++be two files created under /proc/fs/spnfs for controlling layout ++segment functionality.
++ ++To enable layout segments, write a '1' to /proc/fs/spnfs/layoutseg, e.g.: ++ ++ echo 1 > /proc/fs/spnfs/layoutseg ++ ++Layout segments can be disabled (returning to whole-file layouts) by ++writing a '0' to /proc/fs/spnfs/layoutseg: ++ ++ echo 0 > /proc/fs/spnfs/layoutseg ++ ++When layout segments are enabled, the size of the layouts returned can ++be specified by writing a decimal number (ascii representation) to ++/proc/fs/spnfs/layoutsegsize: ++ ++ echo 1024 > /proc/fs/spnfs/layoutsegsize ++ ++The value'0' has a special meaning--it causes the server to return a ++layout that is exactly the size requested by the client: ++ ++ echo 0 > /proc/fs/spnfs/layoutsegsize ++ ++ ++Troubleshooting ++--------------- ++ ++If you see data being written to the files on the MDS rather than ++the stripe files, make sure the nfslayoutdriver is loaded on the client ++(see above). ++ ++If you get a "permission denied" error, make sure mountd is running on the mds ++(it occasionally fails to start). ++ ++Bugs, enhancements, compliments, complaints to: dmuntz@netapp.com ++ ++ +diff -up linux-2.6.37.noarch/drivers/md/dm-ioctl.c.orig linux-2.6.37.noarch/drivers/md/dm-ioctl.c +--- linux-2.6.37.noarch/drivers/md/dm-ioctl.c.orig 2011-01-28 09:37:28.883106954 -0500 ++++ linux-2.6.37.noarch/drivers/md/dm-ioctl.c 2011-01-28 09:43:53.293780446 -0500 +@@ -713,6 +713,12 @@ static int dev_create(struct dm_ioctl *p + return 0; + } + ++int dm_dev_create(struct dm_ioctl *param) ++{ ++ return dev_create(param, sizeof(*param)); ++} ++EXPORT_SYMBOL(dm_dev_create); ++ + /* + * Always use UUID for lookups if it's present, otherwise use name or dev. + */ +@@ -808,6 +814,12 @@ static int dev_remove(struct dm_ioctl *p + return 0; + } + ++int dm_dev_remove(struct dm_ioctl *param) ++{ ++ return dev_remove(param, sizeof(*param)); ++} ++EXPORT_SYMBOL(dm_dev_remove); ++ + /* + * Check a string doesn't overrun the chunk of + * memory we copied from userland. 
+@@ -990,6 +1002,12 @@ static int do_resume(struct dm_ioctl *pa + return r; + } + ++int dm_do_resume(struct dm_ioctl *param) ++{ ++ return do_resume(param); ++} ++EXPORT_SYMBOL(dm_do_resume); ++ + /* + * Set or unset the suspension state of a device. + * If the device already is in the requested state we just return its status. +@@ -1256,6 +1274,12 @@ out: + return r; + } + ++int dm_table_load(struct dm_ioctl *param, size_t param_size) ++{ ++ return table_load(param, param_size); ++} ++EXPORT_SYMBOL(dm_table_load); ++ + static int table_clear(struct dm_ioctl *param, size_t param_size) + { + struct hash_cell *hc; +diff -up linux-2.6.37.noarch/drivers/scsi/hosts.c.orig linux-2.6.37.noarch/drivers/scsi/hosts.c +--- linux-2.6.37.noarch/drivers/scsi/hosts.c.orig 2011-01-04 19:50:19.000000000 -0500 ++++ linux-2.6.37.noarch/drivers/scsi/hosts.c 2011-01-28 09:43:53.294780201 -0500 +@@ -50,10 +50,11 @@ static void scsi_host_cls_release(struct + put_device(&class_to_shost(dev)->shost_gendev); + } + +-static struct class shost_class = { ++struct class shost_class = { + .name = "scsi_host", + .dev_release = scsi_host_cls_release, + }; ++EXPORT_SYMBOL(shost_class); + + /** + * scsi_host_set_state - Take the given host through the host state model. +diff -up linux-2.6.37.noarch/fs/exofs/exofs.h.orig linux-2.6.37.noarch/fs/exofs/exofs.h +--- linux-2.6.37.noarch/fs/exofs/exofs.h.orig 2011-01-04 19:50:19.000000000 -0500 ++++ linux-2.6.37.noarch/fs/exofs/exofs.h 2011-01-28 09:43:53.296779718 -0500 +@@ -36,13 +36,9 @@ + #include + #include + #include ++#include + #include "common.h" + +-/* FIXME: Remove once pnfs hits mainline +- * #include +- */ +-#include "pnfs.h" +- + #define EXOFS_ERR(fmt, a...) 
printk(KERN_ERR "exofs: " fmt, ##a) + + #ifdef CONFIG_EXOFS_DEBUG +@@ -103,6 +99,7 @@ struct exofs_sb_info { + struct exofs_i_info { + struct inode vfs_inode; /* normal in-memory inode */ + wait_queue_head_t i_wq; /* wait queue for inode */ ++ spinlock_t i_layout_lock; /* lock for layout/return/recall */ + unsigned long i_flags; /* various atomic flags */ + uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ + uint32_t i_dir_start_lookup; /* which page to start lookup */ +@@ -166,6 +163,9 @@ static inline unsigned exofs_io_state_si + */ + #define OBJ_2BCREATED 0 /* object will be created soon*/ + #define OBJ_CREATED 1 /* object has been created on the osd*/ ++/* Below are not used atomic but reuse the same i_flags */ ++#define OBJ_LAYOUT_IS_GIVEN 2 /* inode has given layouts to clients*/ ++#define OBJ_IN_LAYOUT_RECALL 3 /* inode is in the middle of a layout recall*/ + + static inline int obj_2bcreated(struct exofs_i_info *oi) + { +@@ -303,4 +303,21 @@ extern const struct inode_operations exo + extern const struct inode_operations exofs_symlink_inode_operations; + extern const struct inode_operations exofs_fast_symlink_inode_operations; + ++/* export.c */ ++typedef int (exofs_recall_fn)(struct inode *inode, u64 data); ++#ifdef CONFIG_PNFSD ++int exofs_inode_recall_layout(struct inode *inode, enum pnfs_iomode iomode, ++ exofs_recall_fn todo, u64 todo_data); ++void exofs_init_export(struct super_block *sb); ++#else ++static inline int ++exofs_inode_recall_layout(struct inode *inode, enum pnfs_iomode iomode, ++exofs_recall_fn todo, u64 todo_data) ++{ ++ return todo(inode, todo_data); ++} ++ ++static inline void exofs_init_export(struct super_block *sb) {} ++#endif ++ + #endif +diff -up linux-2.6.37.noarch/fs/exofs/export.c.orig linux-2.6.37.noarch/fs/exofs/export.c +--- linux-2.6.37.noarch/fs/exofs/export.c.orig 2011-01-28 09:43:53.297779480 -0500 ++++ linux-2.6.37.noarch/fs/exofs/export.c 2011-01-28 09:43:53.297779480 -0500 +@@ -0,0 +1,396 @@ ++/* ++ * 
export.c - Implementation of the pnfs_export_operations ++ * ++ * Copyright (C) 2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Boaz Harrosh ++ * ++ * This file is part of exofs. ++ * ++ * exofs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation. Since it is based on ext2, and the only ++ * valid version of GPL for the Linux kernel is version 2, the only valid ++ * version of GPL for exofs is version 2. ++ * ++ * exofs is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with exofs; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include ++#include "exofs.h" ++ ++static int exofs_layout_type(struct super_block *sb) ++{ ++ return LAYOUT_OSD2_OBJECTS; ++} ++ ++static void set_dev_id(struct nfs4_deviceid *pnfs_devid, u64 sbid, u64 devid) ++{ ++ struct nfsd4_pnfs_deviceid *dev_id = ++ (struct nfsd4_pnfs_deviceid *)pnfs_devid; ++ ++ dev_id->sbid = sbid; ++ dev_id->devid = devid; ++} ++ ++static int cb_layout_recall(struct inode *inode, enum pnfs_iomode iomode, ++ u64 offset, u64 length, void *cookie) ++{ ++ struct nfsd4_pnfs_cb_layout cbl; ++ struct pnfsd_cb_ctl cb_ctl; ++ int status; ++ ++ memset(&cb_ctl, 0, sizeof(cb_ctl)); ++ status = pnfsd_get_cb_op(&cb_ctl); ++ if (unlikely(status)) { ++ EXOFS_ERR("%s: nfsd unloaded!! 
inode (0x%lx) status=%d\n", ++ __func__, inode->i_ino, status); ++ goto err; ++ } ++ ++ memset(&cbl, 0, sizeof(cbl)); ++ cbl.cbl_recall_type = RETURN_FILE; ++ cbl.cbl_seg.layout_type = LAYOUT_OSD2_OBJECTS; ++ cbl.cbl_seg.iomode = iomode; ++ cbl.cbl_seg.offset = offset; ++ cbl.cbl_seg.length = length; ++ cbl.cbl_cookie = cookie; ++ ++ status = cb_ctl.cb_op->cb_layout_recall(inode->i_sb, inode, &cbl); ++ pnfsd_put_cb_op(&cb_ctl); ++ ++err: ++ return status; ++} ++ ++static enum nfsstat4 exofs_layout_get( ++ struct inode *inode, ++ struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *args, ++ struct nfsd4_pnfs_layoutget_res *res) ++{ ++ struct exofs_i_info *oi = exofs_i(inode); ++ struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; ++ struct exofs_layout *el = &sbi->layout; ++ struct pnfs_osd_object_cred *creds = NULL; ++ struct pnfs_osd_layout layout; ++ __be32 *start; ++ bool in_recall; ++ int i, err; ++ enum nfsstat4 nfserr; ++ ++ res->lg_seg.offset = 0; ++ res->lg_seg.length = NFS4_MAX_UINT64; ++ res->lg_seg.iomode = IOMODE_RW; ++ res->lg_return_on_close = true; /* TODO: unused but will be soon */ ++ ++ /* skip opaque size, will be filled-in later */ ++ start = exp_xdr_reserve_qwords(xdr, 1); ++ if (!start) { ++ nfserr = NFS4ERR_TOOSMALL; ++ goto out; ++ } ++ ++ creds = kcalloc(el->s_numdevs, sizeof(*creds), GFP_KERNEL); ++ if (!creds) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto out; ++ } ++ ++ /* Fill in a pnfs_osd_layout struct */ ++ layout.olo_map = sbi->data_map; ++ ++ for (i = 0; i < el->s_numdevs; i++) { ++ struct pnfs_osd_object_cred *cred = &creds[i]; ++ osd_id id = exofs_oi_objno(oi); ++ unsigned dev = exofs_layout_od_id(el, id, i); ++ ++ set_dev_id(&cred->oc_object_id.oid_device_id, args->lg_sbid, ++ dev); ++ cred->oc_object_id.oid_partition_id = el->s_pid; ++ cred->oc_object_id.oid_object_id = id; ++ cred->oc_osd_version = osd_dev_is_ver1(el->s_ods[dev]) ? 
++ PNFS_OSD_VERSION_1 : ++ PNFS_OSD_VERSION_2; ++ cred->oc_cap_key_sec = PNFS_OSD_CAP_KEY_SEC_NONE; ++ ++ cred->oc_cap_key.cred_len = 0; ++ cred->oc_cap_key.cred = NULL; ++ ++ cred->oc_cap.cred_len = OSD_CAP_LEN; ++ cred->oc_cap.cred = oi->i_cred; ++ } ++ ++ layout.olo_comps_index = 0; ++ layout.olo_num_comps = el->s_numdevs; ++ layout.olo_comps = creds; ++ ++ err = pnfs_osd_xdr_encode_layout(xdr, &layout); ++ if (err) { ++ nfserr = NFS4ERR_TOOSMALL; /* FIXME: Change osd_xdr error codes */ ++ goto out; ++ } ++ ++ exp_xdr_encode_opaque_len(start, xdr->p); ++ ++ spin_lock(&oi->i_layout_lock); ++ in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ if (!in_recall) { ++ __set_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ nfserr = NFS4_OK; ++ } else { ++ nfserr = NFS4ERR_RECALLCONFLICT; ++ } ++ spin_unlock(&oi->i_layout_lock); ++ ++out: ++ kfree(creds); ++ EXOFS_DBGMSG("(0x%lx) nfserr=%u xdr_bytes=%zu\n", ++ inode->i_ino, nfserr, exp_xdr_qbytes(xdr->p - start)); ++ return nfserr; ++} ++ ++/* NOTE: inode mutex must NOT be held */ ++static int exofs_layout_commit( ++ struct inode *inode, ++ const struct nfsd4_pnfs_layoutcommit_arg *args, ++ struct nfsd4_pnfs_layoutcommit_res *res) ++{ ++ struct exofs_i_info *oi = exofs_i(inode); ++ struct timespec mtime; ++ loff_t i_size; ++ int in_recall; ++ ++ /* In case of a recall we ignore the new size and mtime since they ++ * are going to be changed again by truncate, and since we cannot take ++ * the inode lock in that case. ++ */ ++ spin_lock(&oi->i_layout_lock); ++ in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ if (in_recall) { ++ EXOFS_DBGMSG("(0x%lx) commit was called during recall\n", ++ inode->i_ino); ++ return 0; ++ } ++ ++ /* NOTE: I would love to call inode_setattr here ++ * but i cannot since this will cause an eventual vmtruncate, ++ * which will cause a layout_recall. So open code the i_size ++ * and mtime/atime changes under i_mutex. 
++ */ ++ mutex_lock_nested(&inode->i_mutex, I_MUTEX_NORMAL); ++ ++ if (args->lc_mtime.seconds) { ++ mtime.tv_sec = args->lc_mtime.seconds; ++ mtime.tv_nsec = args->lc_mtime.nseconds; ++ ++ /* layout commit may only make time bigger, since there might ++ * be reordering of the notifications and it might arrive after ++ * A local change. ++ * TODO: if mtime > ctime then we know set_attr did an mtime ++ * in the future. and we can let this update through ++ */ ++ if (0 <= timespec_compare(&mtime, &inode->i_mtime)) ++ mtime = inode->i_mtime; ++ } else { ++ mtime = current_fs_time(inode->i_sb); ++ } ++ ++ /* TODO: Will below work? since mark_inode_dirty has it's own ++ * Time handling ++ */ ++ inode->i_atime = inode->i_mtime = mtime; ++ ++ i_size = i_size_read(inode); ++ if (args->lc_newoffset) { ++ loff_t new_size = args->lc_last_wr + 1; ++ ++ if (i_size < new_size) { ++ i_size_write(inode, i_size = new_size); ++ res->lc_size_chg = 1; ++ res->lc_newsize = new_size; ++ } ++ } ++ /* TODO: else { i_size = osd_get_object_length() } */ ++ ++/* TODO: exofs does not currently use the osd_xdr part of the layout_commit */ ++ ++ mark_inode_dirty_sync(inode); ++ ++ mutex_unlock(&inode->i_mutex); ++ EXOFS_DBGMSG("(0x%lx) i_size=0x%llx lcp->off=0x%llx\n", ++ inode->i_ino, i_size, args->lc_last_wr); ++ return 0; ++} ++ ++static void exofs_handle_error(struct pnfs_osd_ioerr *ioerr) ++{ ++ EXOFS_ERR("exofs_handle_error: errno=%d is_write=%d obj=0x%llx " ++ "offset=0x%llx length=0x%llx\n", ++ ioerr->oer_errno, ioerr->oer_iswrite, ++ _LLU(ioerr->oer_component.oid_object_id), ++ _LLU(ioerr->oer_comp_offset), ++ _LLU(ioerr->oer_comp_length)); ++} ++ ++static int exofs_layout_return( ++ struct inode *inode, ++ const struct nfsd4_pnfs_layoutreturn_arg *args) ++{ ++ __be32 *p = args->lrf_body; ++ unsigned len = exp_xdr_qwords(args->lrf_body_len); ++ ++ EXOFS_DBGMSG("(0x%lx) cookie %p xdr_len %d\n", ++ inode->i_ino, args->lr_cookie, len); ++ ++ while (len >= pnfs_osd_ioerr_xdr_sz()) { ++ 
struct pnfs_osd_ioerr ioerr; ++ ++ p = pnfs_osd_xdr_decode_ioerr(&ioerr, p); ++ len -= pnfs_osd_ioerr_xdr_sz(); ++ exofs_handle_error(&ioerr); ++ } ++ ++ if (args->lr_cookie) { ++ struct exofs_i_info *oi = exofs_i(inode); ++ bool in_recall; ++ ++ spin_lock(&oi->i_layout_lock); ++ in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ __clear_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ ++ /* TODO: how to communicate cookie with the waiter */ ++ if (in_recall) ++ wake_up(&oi->i_wq); /* wakeup any recalls */ ++ } ++ ++ return 0; ++} ++ ++int exofs_get_device_info(struct super_block *sb, struct exp_xdr_stream *xdr, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ struct exofs_sb_info *sbi = sb->s_fs_info; ++ struct pnfs_osd_deviceaddr devaddr; ++ const struct osd_dev_info *odi; ++ u64 devno = devid->devid; ++ __be32 *start; ++ int err; ++ ++ memset(&devaddr, 0, sizeof(devaddr)); ++ ++ if (unlikely(devno >= sbi->layout.s_numdevs)) ++ return -ENODEV; ++ ++ odi = osduld_device_info(sbi->layout.s_ods[devno]); ++ ++ devaddr.oda_systemid.len = odi->systemid_len; ++ devaddr.oda_systemid.data = (void *)odi->systemid; /* !const cast */ ++ ++ devaddr.oda_osdname.len = odi->osdname_len ; ++ devaddr.oda_osdname.data = (void *)odi->osdname;/* !const cast */ ++ ++ /* skip opaque size, will be filled-in later */ ++ start = exp_xdr_reserve_qwords(xdr, 1); ++ if (!start) { ++ err = -E2BIG; ++ goto err; ++ } ++ ++ err = pnfs_osd_xdr_encode_deviceaddr(xdr, &devaddr); ++ if (err) ++ goto err; ++ ++ exp_xdr_encode_opaque_len(start, xdr->p); ++ ++ EXOFS_DBGMSG("xdr_bytes=%Zu devno=%lld osdname-%s\n", ++ exp_xdr_qbytes(xdr->p - start), devno, odi->osdname); ++ return 0; ++ ++err: ++ EXOFS_DBGMSG("Error: err=%d at_byte=%zu\n", ++ err, exp_xdr_qbytes(xdr->p - start)); ++ return err; ++} ++ ++struct pnfs_export_operations exofs_pnfs_ops = { ++ .layout_type = exofs_layout_type, ++ .layout_get = exofs_layout_get, ++ .layout_commit = 
exofs_layout_commit, ++ .layout_return = exofs_layout_return, ++ .get_device_info = exofs_get_device_info, ++}; ++ ++static bool is_layout_returned(struct exofs_i_info *oi) ++{ ++ bool layout_given; ++ ++ spin_lock(&oi->i_layout_lock); ++ layout_given = test_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ ++ return !layout_given; ++} ++ ++int exofs_inode_recall_layout(struct inode *inode, enum pnfs_iomode iomode, ++ exofs_recall_fn todo, u64 todo_data) ++{ ++ struct exofs_i_info *oi = exofs_i(inode); ++ int layout_given; ++ int error = 0; ++ ++ spin_lock(&oi->i_layout_lock); ++ layout_given = test_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ __set_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ ++ if (!layout_given) ++ goto exec; ++ ++ for (;;) { ++ EXOFS_DBGMSG("(0x%lx) has_layout issue a recall\n", ++ inode->i_ino); ++ error = cb_layout_recall(inode, iomode, 0, NFS4_MAX_UINT64, ++ &oi->i_wq); ++ switch (error) { ++ case 0: ++ case -EAGAIN: ++ break; ++ case -ENOENT: ++ goto exec; ++ default: ++ goto err; ++ } ++ ++ error = wait_event_interruptible(oi->i_wq, ++ is_layout_returned(oi)); ++ if (error) ++ goto err; ++ } ++ ++exec: ++ error = todo(inode, todo_data); ++ ++err: ++ spin_lock(&oi->i_layout_lock); ++ __clear_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ EXOFS_DBGMSG("(0x%lx) return=>%d\n", inode->i_ino, error); ++ return error; ++} ++ ++void exofs_init_export(struct super_block *sb) ++{ ++ sb->s_pnfs_op = &exofs_pnfs_ops; ++} +diff -up linux-2.6.37.noarch/fs/exofs/inode.c.orig linux-2.6.37.noarch/fs/exofs/inode.c +--- linux-2.6.37.noarch/fs/exofs/inode.c.orig 2011-01-04 19:50:19.000000000 -0500 ++++ linux-2.6.37.noarch/fs/exofs/inode.c 2011-01-28 09:43:53.298779243 -0500 +@@ -820,8 +820,9 @@ static inline int exofs_inode_is_fast_sy + const struct osd_attr g_attr_logical_length = ATTR_DEF( + OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8); + +-static 
int _do_truncate(struct inode *inode, loff_t newsize) ++static int _do_truncate(struct inode *inode, u64 data) + { ++ loff_t newsize = data; + struct exofs_i_info *oi = exofs_i(inode); + int ret; + +@@ -858,7 +859,8 @@ int exofs_setattr(struct dentry *dentry, + + if ((iattr->ia_valid & ATTR_SIZE) && + iattr->ia_size != i_size_read(inode)) { +- error = _do_truncate(inode, iattr->ia_size); ++ error = exofs_inode_recall_layout(inode, IOMODE_ANY, ++ _do_truncate, iattr->ia_size); + if (unlikely(error)) + return error; + } +@@ -971,6 +973,7 @@ static void __oi_init(struct exofs_i_inf + { + init_waitqueue_head(&oi->i_wq); + oi->i_flags = 0; ++ spin_lock_init(&oi->i_layout_lock); + } + /* + * Fill in an inode read from the OSD and set it up for use +diff -up linux-2.6.37.noarch/fs/exofs/Kbuild.orig linux-2.6.37.noarch/fs/exofs/Kbuild +--- linux-2.6.37.noarch/fs/exofs/Kbuild.orig 2011-01-04 19:50:19.000000000 -0500 ++++ linux-2.6.37.noarch/fs/exofs/Kbuild 2011-01-28 09:43:53.295779958 -0500 +@@ -13,4 +13,5 @@ + # + + exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o ++exofs-$(CONFIG_PNFSD) += export.o + obj-$(CONFIG_EXOFS_FS) += exofs.o +diff -up linux-2.6.37.noarch/fs/exofs/Kconfig.orig linux-2.6.37.noarch/fs/exofs/Kconfig +--- linux-2.6.37.noarch/fs/exofs/Kconfig.orig 2011-01-04 19:50:19.000000000 -0500 ++++ linux-2.6.37.noarch/fs/exofs/Kconfig 2011-01-28 09:43:53.295779958 -0500 +@@ -1,6 +1,7 @@ + config EXOFS_FS + tristate "exofs: OSD based file system support" + depends on SCSI_OSD_ULD ++ select EXPORTFS_OSD_LAYOUT if PNFSD + help + EXOFS is a file system that uses an OSD storage device, + as its backing storage. 
+diff -up linux-2.6.37.noarch/fs/exofs/super.c.orig linux-2.6.37.noarch/fs/exofs/super.c +--- linux-2.6.37.noarch/fs/exofs/super.c.orig 2011-01-28 09:37:32.381985535 -0500 ++++ linux-2.6.37.noarch/fs/exofs/super.c 2011-01-28 09:43:53.300778781 -0500 +@@ -627,6 +627,7 @@ static int exofs_fill_super(struct super + sb->s_fs_info = sbi; + sb->s_op = &exofs_sops; + sb->s_export_op = &exofs_export_ops; ++ exofs_init_export(sb); + root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF); + if (IS_ERR(root)) { + EXOFS_ERR("ERROR: exofs_iget failed\n"); +diff -up linux-2.6.37.noarch/fs/exportfs/expfs.c.orig linux-2.6.37.noarch/fs/exportfs/expfs.c +--- linux-2.6.37.noarch/fs/exportfs/expfs.c.orig 2011-01-28 09:37:32.382985501 -0500 ++++ linux-2.6.37.noarch/fs/exportfs/expfs.c 2011-01-28 09:43:53.301778557 -0500 +@@ -16,6 +16,13 @@ + #include + #include + ++#if defined(CONFIG_PNFSD) ++struct pnfsd_cb_ctl pnfsd_cb_ctl = { ++ .lock = __SPIN_LOCK_UNLOCKED(pnfsd_cb_ctl.lock) ++}; ++EXPORT_SYMBOL(pnfsd_cb_ctl); ++#endif /* CONFIG_PNFSD */ ++ + #define dprintk(fmt, args...) 
do{}while(0) + + +diff -up linux-2.6.37.noarch/fs/exportfs/Makefile.orig linux-2.6.37.noarch/fs/exportfs/Makefile +--- linux-2.6.37.noarch/fs/exportfs/Makefile.orig 2011-01-04 19:50:19.000000000 -0500 ++++ linux-2.6.37.noarch/fs/exportfs/Makefile 2011-01-28 09:43:53.300778781 -0500 +@@ -3,4 +3,7 @@ + + obj-$(CONFIG_EXPORTFS) += exportfs.o + +-exportfs-objs := expfs.o ++exportfs-y := expfs.o ++exportfs-$(CONFIG_EXPORTFS_FILE_LAYOUT) += nfs4filelayoutxdr.o ++exportfs-$(CONFIG_EXPORTFS_OSD_LAYOUT) += pnfs_osd_xdr_srv.o ++exportfs-$(CONFIG_EXPORTFS_BLOCK_LAYOUT) += nfs4blocklayoutxdr.o +diff -up linux-2.6.37.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig linux-2.6.37.noarch/fs/exportfs/nfs4blocklayoutxdr.c +--- linux-2.6.37.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig 2011-01-28 09:43:53.301778557 -0500 ++++ linux-2.6.37.noarch/fs/exportfs/nfs4blocklayoutxdr.c 2011-01-28 09:43:53.301778557 -0500 +@@ -0,0 +1,158 @@ ++/* ++ * linux/fs/nfsd/nfs4blocklayoutxdr.c ++ * ++ * ++ * Created by Rick McNeal on 3/31/08. ++ * Copyright 2008 __MyCompanyName__. All rights reserved. 
++ * ++ */ ++#include ++#include ++#include ++#include ++ ++static int ++bl_encode_simple(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ __be32 *p = exp_xdr_reserve_space(xdr, ++ 12 + 4 + bld->u.simple.bld_sig_len); ++ ++ if (!p) ++ return -ETOOSMALL; ++ ++ p = exp_xdr_encode_u32(p, 1); ++ p = exp_xdr_encode_u64(p, bld->u.simple.bld_offset); ++ exp_xdr_encode_opaque(p, bld->u.simple.bld_sig, ++ bld->u.simple.bld_sig_len); ++ ++ return 0; ++} ++ ++static int ++bl_encode_slice(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ __be32 *p = exp_xdr_reserve_qwords(xdr, 2 + 2 + 1); ++ ++ if (!p) ++ return -ETOOSMALL; ++ ++ p = exp_xdr_encode_u64(p, bld->u.slice.bld_start); ++ p = exp_xdr_encode_u64(p, bld->u.slice.bld_len); ++ exp_xdr_encode_u32(p, bld->u.slice.bld_index); ++ ++ return 0; ++} ++ ++static int ++bl_encode_concat(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ return -ENOTSUPP; ++} ++ ++static int ++bl_encode_stripe(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ int i; ++ __be32 *p = exp_xdr_reserve_space(xdr, ++ 2 + 1 + bld->u.stripe.bld_stripes); ++ ++ p = exp_xdr_encode_u64(p, bld->u.stripe.bld_chunk_size); ++ p = exp_xdr_encode_u32(p, bld->u.stripe.bld_stripes); ++ for (i = 0; i < bld->u.stripe.bld_stripes; i++) ++ p = exp_xdr_encode_u32(p, bld->u.stripe.bld_stripe_indexs[i]); ++ ++ return 0; ++} ++ ++int ++blocklayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct list_head *volumes) ++{ ++ u32 num_vols = 0, ++ *layoutlen_p = xdr->p; ++ pnfs_blocklayout_devinfo_t *bld; ++ int status = 0; ++ __be32 *p; ++ ++ p = exp_xdr_reserve_qwords(xdr, 2); ++ if (!p) ++ return -ETOOSMALL; ++ p += 2; ++ ++ /* ++ * All simple volumes with their signature are required to be listed ++ * first. 
++ */ ++ list_for_each_entry(bld, volumes, bld_list) { ++ num_vols++; ++ p = exp_xdr_reserve_qwords(xdr, 1); ++ if (!p) ++ return -ETOOSMALL; ++ p = exp_xdr_encode_u32(p, bld->bld_type); ++ switch (bld->bld_type) { ++ case PNFS_BLOCK_VOLUME_SIMPLE: ++ status = bl_encode_simple(xdr, bld); ++ break; ++ case PNFS_BLOCK_VOLUME_SLICE: ++ status = bl_encode_slice(xdr, bld); ++ break; ++ case PNFS_BLOCK_VOLUME_CONCAT: ++ status = bl_encode_concat(xdr, bld); ++ break; ++ case PNFS_BLOCK_VOLUME_STRIPE: ++ status = bl_encode_stripe(xdr, bld); ++ break; ++ default: ++ BUG(); ++ } ++ if (status) ++ goto error; ++ } ++ ++ /* ---- Fill in the overall length and number of volumes ---- */ ++ p = exp_xdr_encode_u32(layoutlen_p, (xdr->p - layoutlen_p - 1) * 4); ++ exp_xdr_encode_u32(p, num_vols); ++ ++error: ++ return status; ++} ++EXPORT_SYMBOL_GPL(blocklayout_encode_devinfo); ++ ++enum nfsstat4 ++blocklayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct list_head *bl_head) ++{ ++ struct pnfs_blocklayout_layout *b; ++ u32 *layoutlen_p = xdr->p, ++ extents = 0; ++ __be32 *p; ++ ++ /* ++ * Save spot for opaque block layout length and number of extents, ++ * fill-in later. 
++ */ ++ p = exp_xdr_reserve_qwords(xdr, 2); ++ if (!p) ++ return NFS4ERR_TOOSMALL; ++ p += 2; ++ ++ list_for_each_entry(b, bl_head, bll_list) { ++ extents++; ++ p = exp_xdr_reserve_qwords(xdr, 5 * 2 + 1); ++ if (!p) ++ return NFS4ERR_TOOSMALL; ++ p = exp_xdr_encode_u64(p, b->bll_vol_id.sbid); ++ p = exp_xdr_encode_u64(p, b->bll_vol_id.devid); ++ p = exp_xdr_encode_u64(p, b->bll_foff); ++ p = exp_xdr_encode_u64(p, b->bll_len); ++ p = exp_xdr_encode_u64(p, b->bll_soff); ++ p = exp_xdr_encode_u32(p, b->bll_es); ++ } ++ ++ /* ---- Fill in the overall length and number of extents ---- */ ++ p = exp_xdr_encode_u32(layoutlen_p, (p - layoutlen_p - 1) * 4); ++ exp_xdr_encode_u32(p, extents); ++ ++ return NFS4_OK; ++} ++EXPORT_SYMBOL_GPL(blocklayout_encode_layout); +diff -up linux-2.6.37.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig linux-2.6.37.noarch/fs/exportfs/nfs4filelayoutxdr.c +--- linux-2.6.37.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig 2011-01-28 09:43:53.302778335 -0500 ++++ linux-2.6.37.noarch/fs/exportfs/nfs4filelayoutxdr.c 2011-01-28 09:43:53.302778335 -0500 +@@ -0,0 +1,218 @@ ++/* ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. 
++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++#include ++#include ++#include ++#include ++#include ++ ++/* We do our-own dprintk so filesystems are not dependent on sunrpc */ ++#ifdef dprintk ++#undef dprintk ++#endif ++#define dprintk(fmt, args, ...) do { } while (0) ++ ++/* Calculate the XDR length of the GETDEVICEINFO4resok structure ++ * excluding the gdir_notification and the gdir_device_addr da_layout_type. ++ */ ++static int fl_devinfo_xdr_words(const struct pnfs_filelayout_device *fdev) ++{ ++ struct pnfs_filelayout_devaddr *fl_addr; ++ struct pnfs_filelayout_multipath *mp; ++ int i, j, nwords; ++ ++ /* da_addr_body length, indice length, indices, ++ * multipath_list4 length */ ++ nwords = 1 + 1 + fdev->fl_stripeindices_length + 1; ++ for (i = 0; i < fdev->fl_device_length; i++) { ++ mp = &fdev->fl_device_list[i]; ++ nwords++; /* multipath list length */ ++ for (j = 0; j < mp->fl_multipath_length; j++) { ++ fl_addr = mp->fl_multipath_list; ++ nwords += 1 + exp_xdr_qwords(fl_addr->r_netid.len); ++ nwords += 1 + exp_xdr_qwords(fl_addr->r_addr.len); ++ } ++ } ++ dprintk("<-- %s nwords %d\n", __func__, nwords); ++ return nwords; ++} ++ ++/* Encodes the nfsv4_1_file_layout_ds_addr4 structure from draft 13 ++ * on the response stream. 
++ * Use linux error codes (not nfs) since these values are being ++ * returned to the file system. ++ */ ++int ++filelayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct pnfs_filelayout_device *fdev) ++{ ++ unsigned int i, j, len = 0, opaque_words; ++ u32 *p_in; ++ u32 index_count = fdev->fl_stripeindices_length; ++ u32 dev_count = fdev->fl_device_length; ++ int error = 0; ++ __be32 *p; ++ ++ opaque_words = fl_devinfo_xdr_words(fdev); ++ dprintk("%s: Begin indx_cnt: %u dev_cnt: %u total size %u\n", ++ __func__, ++ index_count, ++ dev_count, ++ opaque_words*4); ++ ++ /* check space for opaque length */ ++ p = p_in = exp_xdr_reserve_qwords(xdr, opaque_words); ++ if (!p) { ++ error = -ETOOSMALL; ++ goto out; ++ } ++ ++ /* Fill in length later */ ++ p++; ++ ++ /* encode device list indices */ ++ p = exp_xdr_encode_u32(p, index_count); ++ for (i = 0; i < index_count; i++) ++ p = exp_xdr_encode_u32(p, fdev->fl_stripeindices_list[i]); ++ ++ /* encode device list */ ++ p = exp_xdr_encode_u32(p, dev_count); ++ for (i = 0; i < dev_count; i++) { ++ struct pnfs_filelayout_multipath *mp = &fdev->fl_device_list[i]; ++ ++ p = exp_xdr_encode_u32(p, mp->fl_multipath_length); ++ for (j = 0; j < mp->fl_multipath_length; j++) { ++ struct pnfs_filelayout_devaddr *da = ++ &mp->fl_multipath_list[j]; ++ ++ /* Encode device info */ ++ p = exp_xdr_encode_opaque(p, da->r_netid.data, ++ da->r_netid.len); ++ p = exp_xdr_encode_opaque(p, da->r_addr.data, ++ da->r_addr.len); ++ } ++ } ++ ++ /* backfill in length. Subtract 4 for da_addr_body size */ ++ len = (char *)p - (char *)p_in; ++ exp_xdr_encode_u32(p_in, len - 4); ++ ++ error = 0; ++out: ++ dprintk("%s: End err %d xdrlen %d\n", ++ __func__, error, len); ++ return error; ++} ++EXPORT_SYMBOL(filelayout_encode_devinfo); ++ ++/* Encodes the loc_body structure from draft 13 ++ * on the response stream. ++ * Use linux error codes (not nfs) since these values are being ++ * returned to the file system. 
++ */ ++enum nfsstat4 ++filelayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct pnfs_filelayout_layout *flp) ++{ ++ u32 len = 0, nfl_util, fhlen, i; ++ u32 *layoutlen_p; ++ enum nfsstat4 nfserr; ++ __be32 *p; ++ ++ dprintk("%s: device_id %llx:%llx fsi %u, numfh %u\n", ++ __func__, ++ flp->device_id.pnfs_fsid, ++ flp->device_id.pnfs_devid, ++ flp->lg_first_stripe_index, ++ flp->lg_fh_length); ++ ++ /* Ensure file system added at least one file handle */ ++ if (flp->lg_fh_length <= 0) { ++ dprintk("%s: File Layout has no file handles!!\n", __func__); ++ nfserr = NFS4ERR_LAYOUTUNAVAILABLE; ++ goto out; ++ } ++ ++ /* Ensure room for len, devid, util, first_stripe_index, ++ * pattern_offset, number of filehandles */ ++ p = layoutlen_p = exp_xdr_reserve_qwords(xdr, 1+2+2+1+1+2+1); ++ if (!p) { ++ nfserr = NFS4ERR_TOOSMALL; ++ goto out; ++ } ++ ++ /* save spot for opaque file layout length, fill-in later*/ ++ p++; ++ ++ /* encode device id */ ++ p = exp_xdr_encode_u64(p, flp->device_id.sbid); ++ p = exp_xdr_encode_u64(p, flp->device_id.devid); ++ ++ /* set and encode flags */ ++ nfl_util = flp->lg_stripe_unit; ++ if (flp->lg_commit_through_mds) ++ nfl_util |= NFL4_UFLG_COMMIT_THRU_MDS; ++ if (flp->lg_stripe_type == STRIPE_DENSE) ++ nfl_util |= NFL4_UFLG_DENSE; ++ p = exp_xdr_encode_u32(p, nfl_util); ++ ++ /* encode first stripe index */ ++ p = exp_xdr_encode_u32(p, flp->lg_first_stripe_index); ++ ++ /* encode striping pattern start */ ++ p = exp_xdr_encode_u64(p, flp->lg_pattern_offset); ++ ++ /* encode number of file handles */ ++ p = exp_xdr_encode_u32(p, flp->lg_fh_length); ++ ++ /* encode file handles */ ++ for (i = 0; i < flp->lg_fh_length; i++) { ++ fhlen = flp->lg_fh_list[i].fh_size; ++ p = exp_xdr_reserve_space(xdr, 4 + fhlen); ++ if (!p) { ++ nfserr = NFS4ERR_TOOSMALL; ++ goto out; ++ } ++ p = exp_xdr_encode_opaque(p, &flp->lg_fh_list[i].fh_base, fhlen); ++ } ++ ++ /* Set number of bytes encoded = total_bytes_encoded - length var */ ++ len = (char 
*)p - (char *)layoutlen_p; ++ exp_xdr_encode_u32(layoutlen_p, len - 4); ++ ++ nfserr = NFS4_OK; ++out: ++ dprintk("%s: End err %u xdrlen %d\n", ++ __func__, nfserr, len); ++ return nfserr; ++} ++EXPORT_SYMBOL(filelayout_encode_layout); +diff -up linux-2.6.37.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig linux-2.6.37.noarch/fs/exportfs/pnfs_osd_xdr_srv.c +--- linux-2.6.37.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig 2011-01-28 09:43:53.303778113 -0500 ++++ linux-2.6.37.noarch/fs/exportfs/pnfs_osd_xdr_srv.c 2011-01-28 09:43:53.303778113 -0500 +@@ -0,0 +1,289 @@ ++/* ++ * pnfs_osd_xdr_enc.c ++ * ++ * Object-Based pNFS Layout XDR layer ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++#include ++ ++/* ++ * struct pnfs_osd_data_map { ++ * u32 odm_num_comps; ++ * u64 odm_stripe_unit; ++ * u32 odm_group_width; ++ * u32 odm_group_depth; ++ * u32 odm_mirror_cnt; ++ * u32 odm_raid_algorithm; ++ * }; ++ */ ++static int pnfs_osd_xdr_encode_data_map( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_data_map *data_map) ++{ ++ __be32 *p = exp_xdr_reserve_qwords(xdr, 1+2+1+1+1+1); ++ ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u32(p, data_map->odm_num_comps); ++ p = exp_xdr_encode_u64(p, data_map->odm_stripe_unit); ++ p = exp_xdr_encode_u32(p, data_map->odm_group_width); ++ p = exp_xdr_encode_u32(p, data_map->odm_group_depth); ++ p = exp_xdr_encode_u32(p, data_map->odm_mirror_cnt); ++ p = exp_xdr_encode_u32(p, data_map->odm_raid_algorithm); ++ ++ return 0; ++} ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ * }; ++ */ ++static inline int pnfs_osd_xdr_encode_objid( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_objid *object_id) ++{ ++ __be32 *p = exp_xdr_reserve_qwords(xdr, 2+2+2+2); ++ struct nfsd4_pnfs_deviceid *dev_id = ++ (struct nfsd4_pnfs_deviceid *)&object_id->oid_device_id; ++ ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u64(p, dev_id->sbid); ++ p = exp_xdr_encode_u64(p, dev_id->devid); ++ p = exp_xdr_encode_u64(p, object_id->oid_partition_id); ++ p = exp_xdr_encode_u64(p, 
object_id->oid_object_id); ++ ++ return 0; ++} ++ ++/* ++ * enum pnfs_osd_cap_key_sec4 { ++ * PNFS_OSD_CAP_KEY_SEC_NONE = 0, ++ * PNFS_OSD_CAP_KEY_SEC_SSV = 1 ++ * }; ++ * ++ * struct pnfs_osd_object_cred { ++ * struct pnfs_osd_objid oc_object_id; ++ * u32 oc_osd_version; ++ * u32 oc_cap_key_sec; ++ * struct pnfs_osd_opaque_cred oc_cap_key ++ * struct pnfs_osd_opaque_cred oc_cap; ++ * }; ++ */ ++static int pnfs_osd_xdr_encode_object_cred( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_object_cred *olo_comp) ++{ ++ __be32 *p; ++ int err; ++ ++ err = pnfs_osd_xdr_encode_objid(xdr, &olo_comp->oc_object_id); ++ if (err) ++ return err; ++ ++ p = exp_xdr_reserve_space(xdr, 3*4 + 4+olo_comp->oc_cap.cred_len); ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u32(p, olo_comp->oc_osd_version); ++ ++ /* No sec for now */ ++ p = exp_xdr_encode_u32(p, PNFS_OSD_CAP_KEY_SEC_NONE); ++ p = exp_xdr_encode_u32(p, 0); /* opaque oc_capability_key<> */ ++ ++ exp_xdr_encode_opaque(p, olo_comp->oc_cap.cred, ++ olo_comp->oc_cap.cred_len); ++ ++ return 0; ++} ++ ++/* ++ * struct pnfs_osd_layout { ++ * struct pnfs_osd_data_map olo_map; ++ * u32 olo_comps_index; ++ * u32 olo_num_comps; ++ * struct pnfs_osd_object_cred *olo_comps; ++ * }; ++ */ ++int pnfs_osd_xdr_encode_layout( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_layout *pol) ++{ ++ __be32 *p; ++ u32 i; ++ int err; ++ ++ err = pnfs_osd_xdr_encode_data_map(xdr, &pol->olo_map); ++ if (err) ++ return err; ++ ++ p = exp_xdr_reserve_qwords(xdr, 2); ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u32(p, pol->olo_comps_index); ++ p = exp_xdr_encode_u32(p, pol->olo_num_comps); ++ ++ for (i = 0; i < pol->olo_num_comps; i++) { ++ err = pnfs_osd_xdr_encode_object_cred(xdr, &pol->olo_comps[i]); ++ if (err) ++ return err; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_encode_layout); ++ ++static int _encode_string(struct exp_xdr_stream *xdr, ++ const struct nfs4_string *str) ++{ ++ __be32 *p = exp_xdr_reserve_space(xdr, 4 
+ str->len); ++ ++ if (!p) ++ return -E2BIG; ++ exp_xdr_encode_opaque(p, str->data, str->len); ++ return 0; ++} ++ ++/* struct pnfs_osd_deviceaddr { ++ * struct pnfs_osd_targetid oda_targetid; ++ * struct pnfs_osd_targetaddr oda_targetaddr; ++ * u8 oda_lun[8]; ++ * struct nfs4_string oda_systemid; ++ * struct pnfs_osd_object_cred oda_root_obj_cred; ++ * struct nfs4_string oda_osdname; ++ * }; ++ */ ++int pnfs_osd_xdr_encode_deviceaddr( ++ struct exp_xdr_stream *xdr, struct pnfs_osd_deviceaddr *devaddr) ++{ ++ __be32 *p; ++ int err; ++ ++ p = exp_xdr_reserve_space(xdr, 4 + 4 + sizeof(devaddr->oda_lun)); ++ if (!p) ++ return -E2BIG; ++ ++ /* Empty oda_targetid */ ++ p = exp_xdr_encode_u32(p, OBJ_TARGET_ANON); ++ ++ /* Empty oda_targetaddr for now */ ++ p = exp_xdr_encode_u32(p, 0); ++ ++ /* oda_lun */ ++ exp_xdr_encode_bytes(p, devaddr->oda_lun, sizeof(devaddr->oda_lun)); ++ ++ err = _encode_string(xdr, &devaddr->oda_systemid); ++ if (err) ++ return err; ++ ++ err = pnfs_osd_xdr_encode_object_cred(xdr, ++ &devaddr->oda_root_obj_cred); ++ if (err) ++ return err; ++ ++ err = _encode_string(xdr, &devaddr->oda_osdname); ++ if (err) ++ return err; ++ ++ return 0; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_encode_deviceaddr); ++ ++/* ++ * struct pnfs_osd_layoutupdate { ++ * u32 dsu_valid; ++ * s64 dsu_delta; ++ * u32 olu_ioerr_flag; ++ * }; ++ */ ++__be32 * ++pnfs_osd_xdr_decode_layoutupdate(struct pnfs_osd_layoutupdate *lou, __be32 *p) ++{ ++ lou->dsu_valid = be32_to_cpu(*p++); ++ if (lou->dsu_valid) ++ p = xdr_decode_hyper(p, &lou->dsu_delta); ++ lou->olu_ioerr_flag = be32_to_cpu(*p++); ++ return p; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_decode_layoutupdate); ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ * }; ++ */ ++static inline __be32 * ++pnfs_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid) ++{ ++ /* FIXME: p = xdr_decode_fixed(...) 
*/ ++ memcpy(objid->oid_device_id.data, p, sizeof(objid->oid_device_id.data)); ++ p += XDR_QUADLEN(sizeof(objid->oid_device_id.data)); ++ ++ p = xdr_decode_hyper(p, &objid->oid_partition_id); ++ p = xdr_decode_hyper(p, &objid->oid_object_id); ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_ioerr { ++ * struct pnfs_osd_objid oer_component; ++ * u64 oer_comp_offset; ++ * u64 oer_comp_length; ++ * u32 oer_iswrite; ++ * u32 oer_errno; ++ * }; ++ */ ++__be32 * ++pnfs_osd_xdr_decode_ioerr(struct pnfs_osd_ioerr *ioerr, __be32 *p) ++{ ++ p = pnfs_osd_xdr_decode_objid(p, &ioerr->oer_component); ++ p = xdr_decode_hyper(p, &ioerr->oer_comp_offset); ++ p = xdr_decode_hyper(p, &ioerr->oer_comp_length); ++ ioerr->oer_iswrite = be32_to_cpu(*p++); ++ ioerr->oer_errno = be32_to_cpu(*p++); ++ return p; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_decode_ioerr); +diff -up linux-2.6.37.noarch/fs/gfs2/ops_fstype.c.orig linux-2.6.37.noarch/fs/gfs2/ops_fstype.c +--- linux-2.6.37.noarch/fs/gfs2/ops_fstype.c.orig 2011-01-28 09:37:32.445983313 -0500 ++++ linux-2.6.37.noarch/fs/gfs2/ops_fstype.c 2011-01-28 09:43:53.303778113 -0500 +@@ -18,6 +18,7 @@ + #include + #include + #include ++#include + + #include "gfs2.h" + #include "incore.h" +@@ -1107,6 +1108,9 @@ static int fill_super(struct super_block + sb->s_op = &gfs2_super_ops; + sb->s_d_op = &gfs2_dops; + sb->s_export_op = &gfs2_export_ops; ++#if defined(CONFIG_PNFSD) ++ sb->s_pnfs_op = &pnfs_dlm_export_ops; ++#endif /* CONFIG_PNFSD */ + sb->s_xattr = gfs2_xattr_handlers; + sb->s_qcop = &gfs2_quotactl_ops; + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; +diff -up linux-2.6.37.noarch/fs/Kconfig.orig linux-2.6.37.noarch/fs/Kconfig +--- linux-2.6.37.noarch/fs/Kconfig.orig 2011-01-28 09:37:32.257989837 -0500 ++++ linux-2.6.37.noarch/fs/Kconfig 2011-01-28 09:43:53.295779958 -0500 +@@ -49,6 +49,28 @@ config FS_POSIX_ACL + config EXPORTFS + tristate + ++config EXPORTFS_FILE_LAYOUT ++ bool ++ depends on PNFSD && EXPORTFS ++ help ++ Exportfs support for the 
NFSv4.1 files layout type. ++ Must be automatically selected by supporting filesystems. ++ ++config EXPORTFS_OSD_LAYOUT ++ bool ++ depends on PNFSD && EXPORTFS ++ help ++ Exportfs support for the NFSv4.1 objects layout type. ++ Must be automatically selected by supporting osd ++ filesystems. ++ ++config EXPORTFS_BLOCK_LAYOUT ++ bool ++ depends on PNFSD && EXPORTFS ++ help ++ Exportfs support for the NFSv4.1 blocks layout type. ++ Must be automatically selected by supporting filesystems. ++ + config FILE_LOCKING + bool "Enable POSIX file locking API" if EXPERT + default y +diff -up linux-2.6.37.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig linux-2.6.37.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c +--- linux-2.6.37.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig 2011-01-28 09:43:53.306777474 -0500 ++++ linux-2.6.37.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c 2011-01-28 09:43:53.306777474 -0500 +@@ -0,0 +1,66 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "blocklayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++struct pipefs_list bl_device_list; ++struct dentry *bl_device_pipe; ++ ++ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, size_t len) ++{ ++ int err; ++ struct pipefs_hdr *msg; ++ ++ dprintk("Entering %s...\n", __func__); ++ ++ msg = pipefs_readmsg(filp, src, len); ++ if (IS_ERR(msg)) { ++ dprintk("ERROR: unable to read pipefs message.\n"); ++ return PTR_ERR(msg); ++ } ++ ++ /* now assign the result, which wakes the blocked thread */ ++ err = pipefs_assign_upcall_reply(msg, &bl_device_list); ++ if (err) { ++ dprintk("ERROR: failed to assign upcall with id %u\n", ++ msg->msgid); ++ kfree(msg); ++ } ++ return len; ++} ++ ++static const struct rpc_pipe_ops bl_pipe_ops = { ++ .upcall = pipefs_generic_upcall, ++ .downcall = bl_pipe_downcall, ++ .destroy_msg = pipefs_generic_destroy_msg, ++}; ++ ++int bl_pipe_init(void) ++{ ++ dprintk("%s: 
block_device pipefs registering...\n", __func__); ++ bl_device_pipe = pipefs_mkpipe("bl_device_pipe", &bl_pipe_ops, 1); ++ if (IS_ERR(bl_device_pipe)) ++ dprintk("ERROR, unable to make block_device pipe\n"); ++ ++ if (!bl_device_pipe) ++ dprintk("bl_device_pipe is NULL!\n"); ++ else ++ dprintk("bl_device_pipe created!\n"); ++ pipefs_init_list(&bl_device_list); ++ return 0; ++} ++ ++void bl_pipe_exit(void) ++{ ++ dprintk("%s: block_device pipefs unregistering...\n", __func__); ++ if (IS_ERR(bl_device_pipe)) ++ return ; ++ pipefs_closepipe(bl_device_pipe); ++ return; ++} +diff -up linux-2.6.37.noarch/fs/nfs/blocklayout/blocklayout.c.orig linux-2.6.37.noarch/fs/nfs/blocklayout/blocklayout.c +--- linux-2.6.37.noarch/fs/nfs/blocklayout/blocklayout.c.orig 2011-01-28 09:43:53.307777263 -0500 ++++ linux-2.6.37.noarch/fs/nfs/blocklayout/blocklayout.c 2011-01-28 09:43:53.307777263 -0500 +@@ -0,0 +1,1146 @@ ++/* ++ * linux/fs/nfs/blocklayout/blocklayout.c ++ * ++ * Module for the NFSv4.1 pNFS block layout driver. ++ * ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * Fred Isaman ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. 
++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++#include ++#include ++ ++#include /* various write calls */ ++#include /* struct bio */ ++#include ++#include "blocklayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Andy Adamson "); ++MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver"); ++ ++/* Callback operations to the pNFS client */ ++ ++static void print_page(struct page *page) ++{ ++ dprintk("PRINTPAGE page %p\n", page); ++ dprintk(" PagePrivate %d\n", PagePrivate(page)); ++ dprintk(" PageUptodate %d\n", PageUptodate(page)); ++ dprintk(" PageError %d\n", PageError(page)); ++ dprintk(" PageDirty %d\n", PageDirty(page)); ++ dprintk(" PageReferenced %d\n", PageReferenced(page)); ++ dprintk(" PageLocked %d\n", PageLocked(page)); ++ dprintk(" PageWriteback %d\n", PageWriteback(page)); ++ dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page)); ++ dprintk("\n"); ++} ++ ++/* Given the be associated with isect, determine if page data needs to be ++ * initialized. 
++ */ ++static int is_hole(struct pnfs_block_extent *be, sector_t isect) ++{ ++ if (be->be_state == PNFS_BLOCK_NONE_DATA) ++ return 1; ++ else if (be->be_state != PNFS_BLOCK_INVALID_DATA) ++ return 0; ++ else ++ return !is_sector_initialized(be->be_inval, isect); ++} ++ ++/* Given the be associated with isect, determine if page data can be ++ * written to disk. ++ */ ++static int is_writable(struct pnfs_block_extent *be, sector_t isect) ++{ ++ if (be->be_state == PNFS_BLOCK_READWRITE_DATA) ++ return 1; ++ else if (be->be_state != PNFS_BLOCK_INVALID_DATA) ++ return 0; ++ else ++ return is_sector_initialized(be->be_inval, isect); ++} ++ ++static int ++dont_like_caller(struct nfs_page *req) ++{ ++ if (atomic_read(&req->wb_complete)) { ++ /* Called by _multi */ ++ return 1; ++ } else { ++ /* Called by _one */ ++ return 0; ++ } ++} ++ ++static enum pnfs_try_status ++bl_commit(struct nfs_write_data *nfs_data, ++ int sync) ++{ ++ dprintk("%s enter\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++/* The data we are handed might be spread across several bios. We need ++ * to track when the last one is finished. 
++ */ ++struct parallel_io { ++ struct kref refcnt; ++ struct rpc_call_ops call_ops; ++ void (*pnfs_callback) (void *data); ++ void *data; ++}; ++ ++static inline struct parallel_io *alloc_parallel(void *data) ++{ ++ struct parallel_io *rv; ++ ++ rv = kmalloc(sizeof(*rv), GFP_KERNEL); ++ if (rv) { ++ rv->data = data; ++ kref_init(&rv->refcnt); ++ } ++ return rv; ++} ++ ++static inline void get_parallel(struct parallel_io *p) ++{ ++ kref_get(&p->refcnt); ++} ++ ++static void destroy_parallel(struct kref *kref) ++{ ++ struct parallel_io *p = container_of(kref, struct parallel_io, refcnt); ++ ++ dprintk("%s enter\n", __func__); ++ p->pnfs_callback(p->data); ++ kfree(p); ++} ++ ++static inline void put_parallel(struct parallel_io *p) ++{ ++ kref_put(&p->refcnt, destroy_parallel); ++} ++ ++static struct bio * ++bl_submit_bio(int rw, struct bio *bio) ++{ ++ if (bio) { ++ get_parallel(bio->bi_private); ++ dprintk("%s submitting %s bio %u@%llu\n", __func__, ++ rw == READ ? "read" : "write", ++ bio->bi_size, (u64)bio->bi_sector); ++ submit_bio(rw, bio); ++ } ++ return NULL; ++} ++ ++static inline void ++bl_done_with_rpage(struct page *page, const int ok) ++{ ++ if (ok) { ++ ClearPagePnfsErr(page); ++ SetPageUptodate(page); ++ } else { ++ ClearPageUptodate(page); ++ SetPageError(page); ++ SetPagePnfsErr(page); ++ } ++ /* Page is unlocked via rpc_release. Should really be done here. 
*/ ++} ++ ++/* This is basically copied from mpage_end_io_read */ ++static void bl_end_io_read(struct bio *bio, int err) ++{ ++ void *data = bio->bi_private; ++ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); ++ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; ++ ++ do { ++ struct page *page = bvec->bv_page; ++ ++ if (--bvec >= bio->bi_io_vec) ++ prefetchw(&bvec->bv_page->flags); ++ bl_done_with_rpage(page, uptodate); ++ } while (bvec >= bio->bi_io_vec); ++ bio_put(bio); ++ put_parallel(data); ++} ++ ++static void bl_read_cleanup(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_read_data *rdata; ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ rdata = container_of(task, struct nfs_read_data, task); ++ pnfs_read_done(rdata); ++} ++ ++static void ++bl_end_par_io_read(void *data) ++{ ++ struct nfs_read_data *rdata = data; ++ ++ INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup); ++ schedule_work(&rdata->task.u.tk_work); ++} ++ ++/* We don't want normal .rpc_call_done callback used, so we replace it ++ * with this stub. 
++ */ ++static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata) ++{ ++ return; ++} ++ ++static enum pnfs_try_status ++bl_read_pagelist(struct nfs_read_data *rdata, ++ unsigned nr_pages) ++{ ++ int i, hole; ++ struct bio *bio = NULL; ++ struct pnfs_block_extent *be = NULL, *cow_read = NULL; ++ sector_t isect, extent_length = 0; ++ struct parallel_io *par; ++ loff_t f_offset = rdata->args.offset; ++ size_t count = rdata->args.count; ++ struct page **pages = rdata->args.pages; ++ int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT; ++ ++ dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__, ++ nr_pages, f_offset, count); ++ ++ if (dont_like_caller(rdata->req)) { ++ dprintk("%s dont_like_caller failed\n", __func__); ++ goto use_mds; ++ } ++ if ((nr_pages == 1) && PagePnfsErr(rdata->req->wb_page)) { ++ /* We want to fall back to mds in case of read_page ++ * after error on read_pages. ++ */ ++ dprintk("%s PG_pnfserr set\n", __func__); ++ goto use_mds; ++ } ++ par = alloc_parallel(rdata); ++ if (!par) ++ goto use_mds; ++ par->call_ops = *rdata->pdata.call_ops; ++ par->call_ops.rpc_call_done = bl_rpc_do_nothing; ++ par->pnfs_callback = bl_end_par_io_read; ++ /* At this point, we can no longer jump to use_mds */ ++ ++ isect = (sector_t) (f_offset >> 9); ++ /* Code assumes extents are page-aligned */ ++ for (i = pg_index; i < nr_pages; i++) { ++ if (!extent_length) { ++ /* We've used up the previous extent */ ++ put_extent(be); ++ put_extent(cow_read); ++ bio = bl_submit_bio(READ, bio); ++ /* Get the next one */ ++ be = find_get_extent(BLK_LSEG2EXT(rdata->pdata.lseg), ++ isect, &cow_read); ++ if (!be) { ++ /* Error out this page */ ++ bl_done_with_rpage(pages[i], 0); ++ break; ++ } ++ extent_length = be->be_length - ++ (isect - be->be_f_offset); ++ if (cow_read) { ++ sector_t cow_length = cow_read->be_length - ++ (isect - cow_read->be_f_offset); ++ extent_length = min(extent_length, cow_length); ++ } ++ } ++ hole = is_hole(be, isect); ++ if (hole 
&& !cow_read) { ++ bio = bl_submit_bio(READ, bio); ++ /* Fill hole w/ zeroes w/o accessing device */ ++ dprintk("%s Zeroing page for hole\n", __func__); ++ zero_user(pages[i], 0, ++ min_t(int, PAGE_CACHE_SIZE, count)); ++ print_page(pages[i]); ++ bl_done_with_rpage(pages[i], 1); ++ } else { ++ struct pnfs_block_extent *be_read; ++ ++ be_read = (hole && cow_read) ? cow_read : be; ++ for (;;) { ++ if (!bio) { ++ bio = bio_alloc(GFP_NOIO, nr_pages - i); ++ if (!bio) { ++ /* Error out this page */ ++ bl_done_with_rpage(pages[i], 0); ++ break; ++ } ++ bio->bi_sector = isect - ++ be_read->be_f_offset + ++ be_read->be_v_offset; ++ bio->bi_bdev = be_read->be_mdev; ++ bio->bi_end_io = bl_end_io_read; ++ bio->bi_private = par; ++ } ++ if (bio_add_page(bio, pages[i], PAGE_SIZE, 0)) ++ break; ++ bio = bl_submit_bio(READ, bio); ++ } ++ } ++ isect += PAGE_CACHE_SIZE >> 9; ++ extent_length -= PAGE_CACHE_SIZE >> 9; ++ } ++ if ((isect << 9) >= rdata->inode->i_size) { ++ rdata->res.eof = 1; ++ rdata->res.count = rdata->inode->i_size - f_offset; ++ } else { ++ rdata->res.count = (isect << 9) - f_offset; ++ } ++ put_extent(be); ++ put_extent(cow_read); ++ bl_submit_bio(READ, bio); ++ put_parallel(par); ++ return PNFS_ATTEMPTED; ++ ++ use_mds: ++ dprintk("Giving up and using normal NFS\n"); ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++static void mark_extents_written(struct pnfs_block_layout *bl, ++ __u64 offset, __u32 count) ++{ ++ sector_t isect, end; ++ struct pnfs_block_extent *be; ++ ++ dprintk("%s(%llu, %u)\n", __func__, offset, count); ++ if (count == 0) ++ return; ++ isect = (offset & (long)(PAGE_CACHE_MASK)) >> 9; ++ end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK); ++ end >>= 9; ++ while (isect < end) { ++ sector_t len; ++ be = find_get_extent(bl, isect, NULL); ++ BUG_ON(!be); /* FIXME */ ++ len = min(end, be->be_f_offset + be->be_length) - isect; ++ if (be->be_state == PNFS_BLOCK_INVALID_DATA) ++ mark_for_commit(be, isect, len); /* What if fails? 
*/ ++ isect += len; ++ put_extent(be); ++ } ++} ++ ++/* STUB - this needs thought */ ++static inline void ++bl_done_with_wpage(struct page *page, const int ok) ++{ ++ if (!ok) { ++ SetPageError(page); ++ SetPagePnfsErr(page); ++ /* This is an inline copy of nfs_zap_mapping */ ++ /* This is oh so fishy, and needs deep thought */ ++ if (page->mapping->nrpages != 0) { ++ struct inode *inode = page->mapping->host; ++ spin_lock(&inode->i_lock); ++ NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; ++ spin_unlock(&inode->i_lock); ++ } ++ } ++ /* end_page_writeback called in rpc_release. Should be done here. */ ++} ++ ++/* This is basically copied from mpage_end_io_read */ ++static void bl_end_io_write(struct bio *bio, int err) ++{ ++ void *data = bio->bi_private; ++ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); ++ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; ++ ++ do { ++ struct page *page = bvec->bv_page; ++ ++ if (--bvec >= bio->bi_io_vec) ++ prefetchw(&bvec->bv_page->flags); ++ bl_done_with_wpage(page, uptodate); ++ } while (bvec >= bio->bi_io_vec); ++ bio_put(bio); ++ put_parallel(data); ++} ++ ++/* Function scheduled for call during bl_end_par_io_write, ++ * it marks sectors as written and extends the commitlist. 
++ */ ++static void bl_write_cleanup(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_write_data *wdata; ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ wdata = container_of(task, struct nfs_write_data, task); ++ if (!wdata->task.tk_status) { ++ /* Marks for LAYOUTCOMMIT */ ++ /* BUG - this should be called after each bio, not after ++ * all finish, unless have some way of storing success/failure ++ */ ++ mark_extents_written(BLK_LSEG2EXT(wdata->pdata.lseg), ++ wdata->args.offset, wdata->args.count); ++ } ++ pnfs_writeback_done(wdata); ++} ++ ++/* Called when last of bios associated with a bl_write_pagelist call finishes */ ++static void ++bl_end_par_io_write(void *data) ++{ ++ struct nfs_write_data *wdata = data; ++ ++ /* STUB - ignoring error handling */ ++ wdata->task.tk_status = 0; ++ wdata->verf.committed = NFS_FILE_SYNC; ++ INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup); ++ schedule_work(&wdata->task.u.tk_work); ++} ++ ++static enum pnfs_try_status ++bl_write_pagelist(struct nfs_write_data *wdata, ++ unsigned nr_pages, ++ int sync) ++{ ++ int i; ++ struct bio *bio = NULL; ++ struct pnfs_block_extent *be = NULL; ++ sector_t isect, extent_length = 0; ++ struct parallel_io *par; ++ loff_t offset = wdata->args.offset; ++ size_t count = wdata->args.count; ++ struct page **pages = wdata->args.pages; ++ int pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT; ++ ++ dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); ++ if (!wdata->req->wb_lseg) { ++ dprintk("%s no lseg, falling back to MDS\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++ } ++ if (dont_like_caller(wdata->req)) { ++ dprintk("%s dont_like_caller failed\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++ } ++ /* At this point, wdata->pages is a (sequential) list of nfs_pages. ++ * We want to write each, and if there is an error remove it from ++ * list and call ++ * nfs_retry_request(req) to have it redone using nfs. ++ * QUEST? 
Do as block or per req? Think have to do per block ++ * as part of end_bio ++ */ ++ par = alloc_parallel(wdata); ++ if (!par) ++ return PNFS_NOT_ATTEMPTED; ++ par->call_ops = *wdata->pdata.call_ops; ++ par->call_ops.rpc_call_done = bl_rpc_do_nothing; ++ par->pnfs_callback = bl_end_par_io_write; ++ /* At this point, have to be more careful with error handling */ ++ ++ isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> 9); ++ for (i = pg_index; i < nr_pages; i++) { ++ if (!extent_length) { ++ /* We've used up the previous extent */ ++ put_extent(be); ++ bio = bl_submit_bio(WRITE, bio); ++ /* Get the next one */ ++ be = find_get_extent(BLK_LSEG2EXT(wdata->pdata.lseg), ++ isect, NULL); ++ if (!be || !is_writable(be, isect)) { ++ /* FIXME */ ++ bl_done_with_wpage(pages[i], 0); ++ break; ++ } ++ extent_length = be->be_length - ++ (isect - be->be_f_offset); ++ } ++ for (;;) { ++ if (!bio) { ++ bio = bio_alloc(GFP_NOIO, nr_pages - i); ++ if (!bio) { ++ /* Error out this page */ ++ /* FIXME */ ++ bl_done_with_wpage(pages[i], 0); ++ break; ++ } ++ bio->bi_sector = isect - be->be_f_offset + ++ be->be_v_offset; ++ bio->bi_bdev = be->be_mdev; ++ bio->bi_end_io = bl_end_io_write; ++ bio->bi_private = par; ++ } ++ if (bio_add_page(bio, pages[i], PAGE_SIZE, 0)) ++ break; ++ bio = bl_submit_bio(WRITE, bio); ++ } ++ isect += PAGE_CACHE_SIZE >> 9; ++ extent_length -= PAGE_CACHE_SIZE >> 9; ++ } ++ wdata->res.count = (isect << 9) - (offset & (long)PAGE_CACHE_MASK); ++ put_extent(be); ++ bl_submit_bio(WRITE, bio); ++ put_parallel(par); ++ return PNFS_ATTEMPTED; ++} ++ ++/* FIXME - range ignored */ ++static void ++release_extents(struct pnfs_block_layout *bl, ++ struct pnfs_layout_range *range) ++{ ++ int i; ++ struct pnfs_block_extent *be; ++ ++ spin_lock(&bl->bl_ext_lock); ++ for (i = 0; i < EXTENT_LISTS; i++) { ++ while (!list_empty(&bl->bl_extents[i])) { ++ be = list_first_entry(&bl->bl_extents[i], ++ struct pnfs_block_extent, ++ be_node); ++ list_del(&be->be_node); ++ 
put_extent(be); ++ } ++ } ++ spin_unlock(&bl->bl_ext_lock); ++} ++ ++static void ++release_inval_marks(struct pnfs_inval_markings *marks) ++{ ++ struct pnfs_inval_tracking *pos, *temp; ++ ++ list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) { ++ list_del(&pos->it_link); ++ kfree(pos); ++ } ++ return; ++} ++ ++/* Note we are relying on caller locking to prevent nasty races. */ ++static void ++bl_free_layout_hdr(struct pnfs_layout_hdr *lo) ++{ ++ struct pnfs_block_layout *bl = BLK_LO2EXT(lo); ++ ++ dprintk("%s enter\n", __func__); ++ release_extents(bl, NULL); ++ release_inval_marks(&bl->bl_inval); ++ kfree(bl); ++} ++ ++static struct pnfs_layout_hdr * ++bl_alloc_layout_hdr(struct inode *inode) ++{ ++ struct pnfs_block_layout *bl; ++ ++ dprintk("%s enter\n", __func__); ++ bl = kzalloc(sizeof(*bl), GFP_KERNEL); ++ if (!bl) ++ return NULL; ++ spin_lock_init(&bl->bl_ext_lock); ++ INIT_LIST_HEAD(&bl->bl_extents[0]); ++ INIT_LIST_HEAD(&bl->bl_extents[1]); ++ INIT_LIST_HEAD(&bl->bl_commit); ++ bl->bl_count = 0; ++ bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> 9; ++ INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize); ++ return &bl->bl_layout; ++} ++ ++static void ++bl_free_lseg(struct pnfs_layout_segment *lseg) ++{ ++ dprintk("%s enter\n", __func__); ++ kfree(lseg); ++} ++ ++/* Because the generic infrastructure does not correctly merge layouts, ++ * we pretty much ignore lseg, and store all data layout wide, so we ++ * can correctly merge. Eventually we should push some correct merge ++ * behavior up to the generic code, as the current behavior tends to ++ * cause lots of unnecessary overlapping LAYOUTGET requests. 
++ */ ++static struct pnfs_layout_segment * ++bl_alloc_lseg(struct pnfs_layout_hdr *lo, ++ struct nfs4_layoutget_res *lgr) ++{ ++ struct pnfs_layout_segment *lseg; ++ int status; ++ ++ dprintk("%s enter\n", __func__); ++ lseg = kzalloc(sizeof(*lseg) + 0, GFP_KERNEL); ++ if (!lseg) ++ return NULL; ++ status = nfs4_blk_process_layoutget(lo, lgr); ++ if (status) { ++ /* We don't want to call the full-blown bl_free_lseg, ++ * since on error extents were not touched. ++ */ ++ /* STUB - we really want to distinguish between 2 error ++ * conditions here. This lseg failed, but lo data structures ++ * are OK, or we hosed the lo data structures. The calling ++ * code probably needs to distinguish this too. ++ */ ++ kfree(lseg); ++ return ERR_PTR(status); ++ } ++ return lseg; ++} ++ ++static int ++bl_setup_layoutcommit(struct pnfs_layout_hdr *lo, ++ struct nfs4_layoutcommit_args *arg) ++{ ++ struct nfs_server *nfss = NFS_SERVER(lo->inode); ++ struct bl_layoutupdate_data *layoutupdate_data; ++ ++ dprintk("%s enter\n", __func__); ++ /* Need to ensure commit is block-size aligned */ ++ if (nfss->pnfs_blksize) { ++ u64 mask = nfss->pnfs_blksize - 1; ++ u64 offset = arg->range.offset & mask; ++ ++ arg->range.offset -= offset; ++ arg->range.length += offset + mask; ++ arg->range.length &= ~mask; ++ } ++ ++ layoutupdate_data = kmalloc(sizeof(struct bl_layoutupdate_data), ++ GFP_KERNEL); ++ if (unlikely(!layoutupdate_data)) ++ return -ENOMEM; ++ INIT_LIST_HEAD(&layoutupdate_data->ranges); ++ arg->layoutdriver_data = layoutupdate_data; ++ ++ return 0; ++} ++ ++static void ++bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr, ++ const struct nfs4_layoutcommit_args *arg) ++{ ++ dprintk("%s enter\n", __func__); ++ encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg); ++} ++ ++static void ++bl_cleanup_layoutcommit(struct pnfs_layout_hdr *lo, ++ struct nfs4_layoutcommit_data *lcdata) ++{ ++ dprintk("%s enter\n", __func__); ++ 
clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status);
++	kfree(lcdata->args.layoutdriver_data);
++}
++
++/* Free every pnfs_block_dev queued on @mid->bm_devlist, then @mid itself.
++ * A NULL @mid is accepted and ignored.
++ */
++static void free_blk_mountid(struct block_mount_id *mid)
++{
++	if (mid) {
++		struct pnfs_block_dev *dev;
++		spin_lock(&mid->bm_lock);
++		while (!list_empty(&mid->bm_devlist)) {
++			dev = list_first_entry(&mid->bm_devlist,
++					       struct pnfs_block_dev,
++					       bm_node);
++			list_del(&dev->bm_node);
++			free_block_dev(dev);
++		}
++		spin_unlock(&mid->bm_lock);
++		kfree(mid);
++	}
++}
++
++/* This is mostly copied from the filelayout's get_device_info function.
++ * It seems much of this should be at the generic pnfs level.
++ *
++ * Allocates a page array sized from the session's max response size,
++ * maps it with vmap(), issues GETDEVICEINFO for @d_id and passes the
++ * result to nfs4_blk_decode_device().  Every temporary buffer is released
++ * before returning.  @fh is not used here.
++ *
++ * Returns the decoded block device, or NULL on any failure.
++ */
++static struct pnfs_block_dev *
++nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
++			struct nfs4_deviceid *d_id,
++			struct list_head *sdlist)
++{
++	struct pnfs_device *dev;
++	struct pnfs_block_dev *rv = NULL;
++	u32 max_resp_sz;
++	int max_pages;
++	struct page **pages = NULL;
++	int i, rc;
++
++	/*
++	 * Use the session max response size as the basis for setting
++	 * GETDEVICEINFO's maxcount
++	 */
++	max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
++	max_pages = max_resp_sz >> PAGE_SHIFT;
++	dprintk("%s max_resp_sz %u max_pages %d\n",
++		__func__, max_resp_sz, max_pages);
++
++	/* Zeroed so that dev->area is NULL if we bail out before vmap() */
++	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
++	if (!dev) {
++		dprintk("%s kmalloc failed\n", __func__);
++		return NULL;
++	}
++
++	pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL);
++	if (pages == NULL) {
++		kfree(dev);
++		return NULL;
++	}
++	for (i = 0; i < max_pages; i++) {
++		pages[i] = alloc_page(GFP_KERNEL);
++		if (!pages[i])
++			goto out_free;
++	}
++
++	/* set dev->area */
++	dev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL);
++	if (!dev->area)
++		goto out_free;
++
++	memcpy(&dev->dev_id, d_id, sizeof(*d_id));
++	dev->layout_type = LAYOUT_BLOCK_VOLUME;
++	dev->pages = pages;
++	dev->pgbase = 0;
++	dev->pglen = PAGE_SIZE * max_pages;
++	dev->mincount = 0;
++
++	dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
++	rc = nfs4_proc_getdeviceinfo(server, dev);
++	dprintk("%s getdevice info returns %d\n", __func__, rc);
++	if (rc)
++		goto out_free;
++
++	rv = nfs4_blk_decode_device(server, dev, sdlist);
++ out_free:
++	if (dev->area != NULL)
++		vunmap(dev->area);
++	for (i = 0; i < max_pages; i++) {
++		/* Slots past a failed alloc_page() are still NULL */
++		if (pages[i])
++			__free_page(pages[i]);
++	}
++	kfree(pages);
++	kfree(dev);
++	return rv;
++}
++
++
++/*
++ * Retrieve the list of available devices for the mountpoint.
++ *
++ * Loops on GETDEVICELIST until the server reports eof, resolving each
++ * returned device id through nfs4_blk_get_deviceinfo() and queueing the
++ * result on a new block_mount_id, which is stored in
++ * server->pnfs_ld_data on success.
++ *
++ * Returns 0 on success, -EINVAL if the server gave no block size,
++ * -ENOMEM on allocation failure, -ENODEV if a device could not be
++ * resolved, or the status returned by nfs4_proc_getdevicelist().
++ */
++static int
++bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
++{
++	struct block_mount_id *b_mt_id = NULL;
++	struct pnfs_devicelist *dlist = NULL;
++	struct pnfs_block_dev *bdev;
++	LIST_HEAD(block_disklist);
++	int status = 0, i;
++
++	dprintk("%s enter\n", __func__);
++
++	if (server->pnfs_blksize == 0) {
++		dprintk("%s Server did not return blksize\n", __func__);
++		return -EINVAL;
++	}
++	b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_KERNEL);
++	if (!b_mt_id) {
++		status = -ENOMEM;
++		goto out_error;
++	}
++	/* Initialize nfs4 block layout mount id */
++	spin_lock_init(&b_mt_id->bm_lock);
++	INIT_LIST_HEAD(&b_mt_id->bm_devlist);
++
++	dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_KERNEL);
++	if (!dlist) {
++		/* Must not fall through with status == 0 */
++		status = -ENOMEM;
++		goto out_error;
++	}
++	dlist->eof = 0;
++	while (!dlist->eof) {
++		status = nfs4_proc_getdevicelist(server, fh, dlist);
++		if (status)
++			goto out_error;
++		dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n",
++			__func__, dlist->num_devs, dlist->eof);
++		/* For each device returned in dlist, call GETDEVICEINFO, and
++		 * decode the opaque topology encoding to create a flat
++		 * volume topology, matching VOLUME_SIMPLE disk signatures
++		 * to disks in the visible block disk list.
++		 * Construct an LVM meta device from the flat volume topology.
++		 */
++		for (i = 0; i < dlist->num_devs; i++) {
++			bdev = nfs4_blk_get_deviceinfo(server, fh,
++						     &dlist->dev_id[i],
++						     &block_disklist);
++			if (!bdev) {
++				status = -ENODEV;
++				goto out_error;
++			}
++			spin_lock(&b_mt_id->bm_lock);
++			list_add(&bdev->bm_node, &b_mt_id->bm_devlist);
++			spin_unlock(&b_mt_id->bm_lock);
++		}
++	}
++	dprintk("%s SUCCESS\n", __func__);
++	server->pnfs_ld_data = b_mt_id;
++
++ out_return:
++	kfree(dlist);
++	return status;
++
++ out_error:
++	/* Drops any devices already queued; tolerates a NULL b_mt_id */
++	free_blk_mountid(b_mt_id);
++	goto out_return;
++}
++
++/* Release the block_mount_id installed by bl_set_layoutdriver(). */
++static int
++bl_clear_layoutdriver(struct nfs_server *server)
++{
++	struct block_mount_id *b_mt_id = server->pnfs_ld_data;
++
++	dprintk("%s enter\n", __func__);
++	free_blk_mountid(b_mt_id);
++	dprintk("%s RETURNS\n", __func__);
++	return 0;
++}
++
++/* STUB - mark intersection of layout and page as bad, so is not
++ * used again.
++ */
++static void mark_bad_read(void)
++{
++	return;
++}
++
++/* Copied from buffer.c */
++static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
++{
++	if (uptodate) {
++		set_buffer_uptodate(bh);
++	} else {
++		/* This happens, due to failed READA attempts. 
*/
++		clear_buffer_uptodate(bh);
++	}
++	unlock_buffer(bh);
++}
++
++/* Copied from buffer.c */
++static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
++{
++	__end_buffer_read_notouch(bh, uptodate);
++}
++
++/*
++ * map_block:  map a requested I/O block (isect) into an offset in the LVM
++ * meta block_device
++ *
++ * Marks @bh mapped, points it at the extent's device and converts the
++ * 512-byte sector offset within the extent into that device's block units.
++ */
++static void
++map_block(sector_t isect, struct pnfs_block_extent *be, struct buffer_head *bh)
++{
++	dprintk("%s enter be=%p\n", __func__, be);
++
++	set_buffer_mapped(bh);
++	bh->b_bdev = be->be_mdev;
++	bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
++		(be->be_mdev->bd_inode->i_blkbits - 9);
++
++	dprintk("%s isect %ld, bh->b_blocknr %ld, using bsize %Zd\n",
++		__func__, (long)isect,
++		(long)bh->b_blocknr,
++		bh->b_size);
++	return;
++}
++
++/* Given an unmapped page, zero it (or read in page for COW),
++ * and set appropriate flags/markings, but it is safe to not initialize
++ * the range given in [from, to).
++ *
++ * Returns 0 on success, -ENOMEM if no buffer head could be allocated,
++ * -EIO if no usable extent is found or the read fails, or the error from
++ * mark_initialized_sectors().  On any error mark_bad_read() is called.
++ */
++/* This is loosely based on nobh_write_begin */
++static int
++init_page_for_write(struct pnfs_block_layout *bl, struct page *page,
++		    unsigned from, unsigned to, sector_t **pages_to_mark)
++{
++	struct buffer_head *bh;
++	int inval, ret = -EIO;
++	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
++	sector_t isect;
++
++	dprintk("%s enter, %p\n", __func__, page);
++	bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
++	if (!bh) {
++		ret = -ENOMEM;
++		goto cleanup;
++	}
++
++	isect = (sector_t)page->index << (PAGE_CACHE_SHIFT - 9);
++	be = find_get_extent(bl, isect, &cow_read);
++	if (!be)
++		goto cleanup;
++	inval = is_hole(be, isect);
++	dprintk("%s inval=%i, from=%u, to=%u\n", __func__, inval, from, to);
++	if (inval) {
++		if (be->be_state == PNFS_BLOCK_NONE_DATA) {
++			dprintk("%s PANIC - got NONE_DATA extent %p\n",
++				__func__, be);
++			goto cleanup;
++		}
++		map_block(isect, be, bh);
++		unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
++	}
++	if (PageUptodate(page)) {
++		/* Do nothing */
++	} else if (inval && !cow_read) {
++		/* Hole with no COW source: zero everything outside [from, to) */
++		zero_user_segments(page, 0, from, to, PAGE_CACHE_SIZE);
++	} else if (0 < from || PAGE_CACHE_SIZE > to) {
++		struct pnfs_block_extent *read_extent;
++
++		read_extent = (inval && cow_read) ? cow_read : be;
++		map_block(isect, read_extent, bh);
++		lock_buffer(bh);
++		bh->b_end_io = end_buffer_read_nobh;
++		submit_bh(READ, bh);
++		dprintk("%s: Waiting for buffer read\n", __func__);
++		/* XXX Don't really want to hold layout lock here */
++		wait_on_buffer(bh);
++		if (!buffer_uptodate(bh))
++			goto cleanup;
++	}
++	if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
++		/* There is a BUG here if is a short copy after write_begin,
++		 * but I think this is a generic fs bug.  The problem is that
++		 * we have marked the page as initialized, but it is possible
++		 * that the section not copied may never get copied.
++		 */
++		ret = mark_initialized_sectors(be->be_inval, isect,
++					       PAGE_CACHE_SECTORS,
++					       pages_to_mark);
++		/* Want to preallocate mem so above can't fail */
++		if (ret)
++			goto cleanup;
++	}
++	SetPageMappedToDisk(page);
++	ret = 0;
++
++cleanup:
++	/* bh is NULL when alloc_page_buffers() failed */
++	if (bh)
++		free_buffer_head(bh);
++	put_extent(be);
++	put_extent(cow_read);
++	if (ret) {
++		/* Need to mark layout with bad read...should now
++		 * just use nfs4 for reads and writes.
++		 */
++		mark_bad_read();
++	}
++	return ret;
++}
++
++/* Prepare @page for a write of @count bytes at @pos through the block
++ * layout.  Always returns 0: when the block size is smaller than a page or
++ * init_page_for_write() fails, the layout segment reference in @fsdata is
++ * dropped and cleared instead.  On success the sector list filled in by
++ * init_page_for_write() is stored in fsdata->private.
++ */
++static int
++bl_write_begin(struct pnfs_layout_segment *lseg, struct page *page, loff_t pos,
++	       unsigned count, struct pnfs_fsdata *fsdata)
++{
++	unsigned from, to;
++	int ret;
++	sector_t *pages_to_mark = NULL;
++	struct pnfs_block_layout *bl = BLK_LSEG2EXT(lseg);
++
++	dprintk("%s enter, %u@%lld\n", __func__, count, pos);
++	print_page(page);
++	/* The following code assumes blocksize >= PAGE_CACHE_SIZE */
++	if (bl->bl_blocksize < (PAGE_CACHE_SIZE >> 9)) {
++		dprintk("%s Can't handle blocksize %llu\n", __func__,
++			(u64)bl->bl_blocksize);
++		put_lseg(fsdata->lseg);
++		fsdata->lseg = NULL;
++		return 0;
++	}
++	if (PageMappedToDisk(page)) {
++		/* Basically, this is a flag that says we have
++		 * successfully called write_begin already on this page.
++		 */
++		/* NOTE - there are cache consistency issues here.
++		 * For example, what if the layout is recalled, then regained?
++		 * If the file is closed and reopened, will the page flags
++		 * be reset?  If not, we'll have to use layout info instead of
++		 * the page flag.
++		 */
++		return 0;
++	}
++	from = pos & (PAGE_CACHE_SIZE - 1);
++	to = from + count;
++	ret = init_page_for_write(bl, page, from, to, &pages_to_mark);
++	if (ret) {
++		dprintk("%s init page failed with %i", __func__, ret);
++		/* Revert back to plain NFS and just continue on with
++		 * write.  This assumes there is no request attached, which
++		 * should be true if we get here.
++		 */
++		BUG_ON(PagePrivate(page));
++		put_lseg(fsdata->lseg);
++		fsdata->lseg = NULL;
++		kfree(pages_to_mark);
++		ret = 0;
++	} else {
++		fsdata->private = pages_to_mark;
++	}
++	return ret;
++}
++
++/* CAREFUL - what happens if copied < count??? 
*/ ++static int ++bl_write_end(struct inode *inode, struct page *page, loff_t pos, ++ unsigned count, unsigned copied, struct pnfs_layout_segment *lseg) ++{ ++ dprintk("%s enter, %u@%lld, lseg=%p\n", __func__, count, pos, lseg); ++ print_page(page); ++ if (lseg) ++ SetPageUptodate(page); ++ return 0; ++} ++ ++/* Return any memory allocated to fsdata->private, and take advantage ++ * of no page locks to mark pages noted in write_begin as needing ++ * initialization. ++ */ ++static void ++bl_write_end_cleanup(struct file *filp, struct pnfs_fsdata *fsdata) ++{ ++ struct page *page; ++ pgoff_t index; ++ sector_t *pos; ++ struct address_space *mapping = filp->f_mapping; ++ struct pnfs_fsdata *fake_data; ++ struct pnfs_layout_segment *lseg; ++ ++ if (!fsdata) ++ return; ++ lseg = fsdata->lseg; ++ if (!lseg) ++ return; ++ pos = fsdata->private; ++ if (!pos) ++ return; ++ dprintk("%s enter with pos=%llu\n", __func__, (u64)(*pos)); ++ for (; *pos != ~0; pos++) { ++ index = *pos >> (PAGE_CACHE_SHIFT - 9); ++ /* XXX How do we properly deal with failures here??? */ ++ page = grab_cache_page_write_begin(mapping, index, 0); ++ if (!page) { ++ printk(KERN_ERR "%s BUG BUG BUG NoMem\n", __func__); ++ continue; ++ } ++ dprintk("%s: Examining block page\n", __func__); ++ print_page(page); ++ if (!PageMappedToDisk(page)) { ++ /* XXX How do we properly deal with failures here??? 
*/ ++ dprintk("%s Marking block page\n", __func__); ++ init_page_for_write(BLK_LSEG2EXT(fsdata->lseg), page, ++ PAGE_CACHE_SIZE, PAGE_CACHE_SIZE, ++ NULL); ++ print_page(page); ++ fake_data = kzalloc(sizeof(*fake_data), GFP_KERNEL); ++ if (!fake_data) { ++ printk(KERN_ERR "%s BUG BUG BUG NoMem\n", ++ __func__); ++ unlock_page(page); ++ continue; ++ } ++ get_lseg(lseg); ++ fake_data->lseg = lseg; ++ fake_data->bypass_eof = 1; ++ mapping->a_ops->write_end(filp, mapping, ++ index << PAGE_CACHE_SHIFT, ++ PAGE_CACHE_SIZE, ++ PAGE_CACHE_SIZE, ++ page, fake_data); ++ /* Note fake_data is freed by nfs_write_end */ ++ } else ++ unlock_page(page); ++ } ++ kfree(fsdata->private); ++ fsdata->private = NULL; ++} ++ ++/* This is called by nfs_can_coalesce_requests via nfs_pageio_do_add_request. ++ * Should return False if there is a reason requests can not be coalesced, ++ * otherwise, should default to returning True. ++ */ ++static int ++bl_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, ++ struct nfs_page *req) ++{ ++ dprintk("%s enter\n", __func__); ++ if (pgio->pg_iswrite) ++ return prev->wb_lseg == req->wb_lseg; ++ else ++ return 1; ++} ++ ++static struct pnfs_layoutdriver_type blocklayout_type = { ++ .id = LAYOUT_BLOCK_VOLUME, ++ .name = "LAYOUT_BLOCK_VOLUME", ++ .commit = bl_commit, ++ .read_pagelist = bl_read_pagelist, ++ .write_pagelist = bl_write_pagelist, ++ .write_begin = bl_write_begin, ++ .write_end = bl_write_end, ++ .write_end_cleanup = bl_write_end_cleanup, ++ .alloc_layout_hdr = bl_alloc_layout_hdr, ++ .free_layout_hdr = bl_free_layout_hdr, ++ .alloc_lseg = bl_alloc_lseg, ++ .free_lseg = bl_free_lseg, ++ .setup_layoutcommit = bl_setup_layoutcommit, ++ .encode_layoutcommit = bl_encode_layoutcommit, ++ .cleanup_layoutcommit = bl_cleanup_layoutcommit, ++ .set_layoutdriver = bl_set_layoutdriver, ++ .clear_layoutdriver = bl_clear_layoutdriver, ++ .pg_test = bl_pg_test, ++}; ++ ++static int __init nfs4blocklayout_init(void) ++{ ++ int ret; ++ ++ 
dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__); ++ ++ ret = pnfs_register_layoutdriver(&blocklayout_type); ++ if (!ret) ++ bl_pipe_init(); ++ return ret; ++} ++ ++static void __exit nfs4blocklayout_exit(void) ++{ ++ dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n", ++ __func__); ++ ++ pnfs_unregister_layoutdriver(&blocklayout_type); ++ bl_pipe_exit(); ++} ++ ++module_init(nfs4blocklayout_init); ++module_exit(nfs4blocklayout_exit); +diff -up linux-2.6.37.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig linux-2.6.37.noarch/fs/nfs/blocklayout/blocklayoutdev.c +--- linux-2.6.37.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig 2011-01-28 09:43:53.309776857 -0500 ++++ linux-2.6.37.noarch/fs/nfs/blocklayout/blocklayoutdev.c 2011-01-28 09:43:53.309776857 -0500 +@@ -0,0 +1,334 @@ ++/* ++ * linux/fs/nfs/blocklayout/blocklayoutdev.c ++ * ++ * Device operations for the pnfs nfs4 file layout driver. ++ * ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * Fred Isaman ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. ++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. 
the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++#include ++#include /* __bread */ ++ ++#include ++#include ++#include ++ ++#include "blocklayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++uint32_t *blk_overflow(uint32_t *p, uint32_t *end, size_t nbytes) ++{ ++ uint32_t *q = p + XDR_QUADLEN(nbytes); ++ if (unlikely(q > end || q < p)) ++ return NULL; ++ return p; ++} ++EXPORT_SYMBOL(blk_overflow); ++ ++/* Open a block_device by device number. */ ++struct block_device *nfs4_blkdev_get(dev_t dev) ++{ ++ struct block_device *bd; ++ ++ dprintk("%s enter\n", __func__); ++ bd = blkdev_get_by_dev(dev, FMODE_READ, NULL); ++ if (IS_ERR(bd)) ++ goto fail; ++ return bd; ++fail: ++ dprintk("%s failed to open device : %ld\n", ++ __func__, PTR_ERR(bd)); ++ return NULL; ++} ++ ++/* ++ * Release the block device ++ */ ++int nfs4_blkdev_put(struct block_device *bdev) ++{ ++ dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev), ++ MINOR(bdev->bd_dev)); ++ return blkdev_put(bdev, FMODE_READ); ++} ++ ++/* Decodes pnfs_block_deviceaddr4 (draft-8) which is XDR encoded ++ * in dev->dev_addr_buf. 
++ */ ++struct pnfs_block_dev * ++nfs4_blk_decode_device(struct nfs_server *server, ++ struct pnfs_device *dev, ++ struct list_head *sdlist) ++{ ++ struct pnfs_block_dev *rv = NULL; ++ struct block_device *bd = NULL; ++ struct pipefs_hdr *msg = NULL, *reply = NULL; ++ uint32_t major, minor; ++ ++ dprintk("%s enter\n", __func__); ++ ++ if (IS_ERR(bl_device_pipe)) ++ return NULL; ++ dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); ++ dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, ++ dev->mincount); ++ msg = pipefs_alloc_init_msg(0, BL_DEVICE_MOUNT, 0, dev->area, ++ dev->mincount); ++ if (IS_ERR(msg)) { ++ dprintk("ERROR: couldn't make pipefs message.\n"); ++ goto out_err; ++ } ++ msg->msgid = hash_ptr(&msg, sizeof(msg->msgid) * 8); ++ msg->status = BL_DEVICE_REQUEST_INIT; ++ ++ dprintk("%s CALLING USERSPACE DAEMON\n", __func__); ++ reply = pipefs_queue_upcall_waitreply(bl_device_pipe, msg, ++ &bl_device_list, 0, 0); ++ ++ if (IS_ERR(reply)) { ++ dprintk("ERROR: upcall_waitreply failed\n"); ++ goto out_err; ++ } ++ if (reply->status != BL_DEVICE_REQUEST_PROC) { ++ dprintk("%s failed to open device: %ld\n", ++ __func__, PTR_ERR(bd)); ++ goto out_err; ++ } ++ memcpy(&major, (uint32_t *)(payload_of(reply)), sizeof(uint32_t)); ++ memcpy(&minor, (uint32_t *)(payload_of(reply) + sizeof(uint32_t)), ++ sizeof(uint32_t)); ++ bd = nfs4_blkdev_get(MKDEV(major, minor)); ++ if (IS_ERR(bd)) { ++ dprintk("%s failed to open device : %ld\n", ++ __func__, PTR_ERR(bd)); ++ goto out_err; ++ } ++ ++ rv = kzalloc(sizeof(*rv), GFP_KERNEL); ++ if (!rv) ++ goto out_err; ++ ++ rv->bm_mdev = bd; ++ memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid)); ++ dprintk("%s Created device %s with bd_block_size %u\n", ++ __func__, ++ bd->bd_disk->disk_name, ++ bd->bd_block_size); ++ kfree(reply); ++ kfree(msg); ++ return rv; ++ ++out_err: ++ kfree(rv); ++ if (!IS_ERR(reply)) ++ kfree(reply); ++ if (!IS_ERR(msg)) ++ kfree(msg); ++ return NULL; ++} ++ ++/* Map 
deviceid returned by the server to constructed block_device */ ++static struct block_device *translate_devid(struct pnfs_layout_hdr *lo, ++ struct nfs4_deviceid *id) ++{ ++ struct block_device *rv = NULL; ++ struct block_mount_id *mid; ++ struct pnfs_block_dev *dev; ++ ++ dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id); ++ mid = BLK_ID(lo); ++ spin_lock(&mid->bm_lock); ++ list_for_each_entry(dev, &mid->bm_devlist, bm_node) { ++ if (memcmp(id->data, dev->bm_mdevid.data, ++ NFS4_DEVICEID4_SIZE) == 0) { ++ rv = dev->bm_mdev; ++ goto out; ++ } ++ } ++ out: ++ spin_unlock(&mid->bm_lock); ++ dprintk("%s returning %p\n", __func__, rv); ++ return rv; ++} ++ ++/* Tracks info needed to ensure extents in layout obey constraints of spec */ ++struct layout_verification { ++ u32 mode; /* R or RW */ ++ u64 start; /* Expected start of next non-COW extent */ ++ u64 inval; /* Start of INVAL coverage */ ++ u64 cowread; /* End of COW read coverage */ ++}; ++ ++/* Verify the extent meets the layout requirements of the pnfs-block draft, ++ * section 2.3.1. 
++ */ ++static int verify_extent(struct pnfs_block_extent *be, ++ struct layout_verification *lv) ++{ ++ if (lv->mode == IOMODE_READ) { ++ if (be->be_state == PNFS_BLOCK_READWRITE_DATA || ++ be->be_state == PNFS_BLOCK_INVALID_DATA) ++ return -EIO; ++ if (be->be_f_offset != lv->start) ++ return -EIO; ++ lv->start += be->be_length; ++ return 0; ++ } ++ /* lv->mode == IOMODE_RW */ ++ if (be->be_state == PNFS_BLOCK_READWRITE_DATA) { ++ if (be->be_f_offset != lv->start) ++ return -EIO; ++ if (lv->cowread > lv->start) ++ return -EIO; ++ lv->start += be->be_length; ++ lv->inval = lv->start; ++ return 0; ++ } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) { ++ if (be->be_f_offset != lv->start) ++ return -EIO; ++ lv->start += be->be_length; ++ return 0; ++ } else if (be->be_state == PNFS_BLOCK_READ_DATA) { ++ if (be->be_f_offset > lv->start) ++ return -EIO; ++ if (be->be_f_offset < lv->inval) ++ return -EIO; ++ if (be->be_f_offset < lv->cowread) ++ return -EIO; ++ /* It looks like you might want to min this with lv->start, ++ * but you really don't. 
++ */ ++ lv->inval = lv->inval + be->be_length; ++ lv->cowread = be->be_f_offset + be->be_length; ++ return 0; ++ } else ++ return -EIO; ++} ++ ++/* XDR decode pnfs_block_layout4 structure */ ++int ++nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, ++ struct nfs4_layoutget_res *lgr) ++{ ++ struct pnfs_block_layout *bl = BLK_LO2EXT(lo); ++ uint32_t *p = (uint32_t *)lgr->layout.buf; ++ uint32_t *end = (uint32_t *)((char *)lgr->layout.buf + lgr->layout.len); ++ int i, status = -EIO; ++ uint32_t count; ++ struct pnfs_block_extent *be = NULL, *save; ++ uint64_t tmp; /* Used by READSECTOR */ ++ struct layout_verification lv = { ++ .mode = lgr->range.iomode, ++ .start = lgr->range.offset >> 9, ++ .inval = lgr->range.offset >> 9, ++ .cowread = lgr->range.offset >> 9, ++ }; ++ ++ LIST_HEAD(extents); ++ ++ BLK_READBUF(p, end, 4); ++ READ32(count); ++ ++ dprintk("%s enter, number of extents %i\n", __func__, count); ++ BLK_READBUF(p, end, (28 + NFS4_DEVICEID4_SIZE) * count); ++ ++ /* Decode individual extents, putting them in temporary ++ * staging area until whole layout is decoded to make error ++ * recovery easier. 
++ */ ++ for (i = 0; i < count; i++) { ++ be = alloc_extent(); ++ if (!be) { ++ status = -ENOMEM; ++ goto out_err; ++ } ++ READ_DEVID(&be->be_devid); ++ be->be_mdev = translate_devid(lo, &be->be_devid); ++ if (!be->be_mdev) ++ goto out_err; ++ /* The next three values are read in as bytes, ++ * but stored as 512-byte sector lengths ++ */ ++ READ_SECTOR(be->be_f_offset); ++ READ_SECTOR(be->be_length); ++ READ_SECTOR(be->be_v_offset); ++ READ32(be->be_state); ++ if (be->be_state == PNFS_BLOCK_INVALID_DATA) ++ be->be_inval = &bl->bl_inval; ++ if (verify_extent(be, &lv)) { ++ dprintk("%s verify failed\n", __func__); ++ goto out_err; ++ } ++ list_add_tail(&be->be_node, &extents); ++ } ++ if (p != end) { ++ dprintk("%s Undecoded cruft at end of opaque\n", __func__); ++ be = NULL; ++ goto out_err; ++ } ++ if (lgr->range.offset + lgr->range.length != lv.start << 9) { ++ dprintk("%s Final length mismatch\n", __func__); ++ be = NULL; ++ goto out_err; ++ } ++ if (lv.start < lv.cowread) { ++ dprintk("%s Final uncovered COW extent\n", __func__); ++ be = NULL; ++ goto out_err; ++ } ++ /* Extents decoded properly, now try to merge them in to ++ * existing layout extents. ++ */ ++ spin_lock(&bl->bl_ext_lock); ++ list_for_each_entry_safe(be, save, &extents, be_node) { ++ list_del(&be->be_node); ++ status = add_and_merge_extent(bl, be); ++ if (status) { ++ spin_unlock(&bl->bl_ext_lock); ++ /* This is a fairly catastrophic error, as the ++ * entire layout extent lists are now corrupted. ++ * We should have some way to distinguish this. 
++ */ ++ be = NULL; ++ goto out_err; ++ } ++ } ++ spin_unlock(&bl->bl_ext_lock); ++ status = 0; ++ out: ++ dprintk("%s returns %i\n", __func__, status); ++ return status; ++ ++ out_err: ++ put_extent(be); ++ while (!list_empty(&extents)) { ++ be = list_first_entry(&extents, struct pnfs_block_extent, ++ be_node); ++ list_del(&be->be_node); ++ put_extent(be); ++ } ++ goto out; ++} +diff -up linux-2.6.37.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig linux-2.6.37.noarch/fs/nfs/blocklayout/blocklayoutdm.c +--- linux-2.6.37.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig 2011-01-28 09:43:53.309776857 -0500 ++++ linux-2.6.37.noarch/fs/nfs/blocklayout/blocklayoutdm.c 2011-01-28 09:43:53.309776857 -0500 +@@ -0,0 +1,120 @@ ++/* ++ * linux/fs/nfs/blocklayout/blocklayoutdm.c ++ * ++ * Module for the NFSv4.1 pNFS block layout driver. ++ * ++ * Copyright (c) 2007 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Fred Isaman ++ * Andy Adamson ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. ++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. 
the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++ ++#include /* gendisk - used in a dprintk*/ ++#include ++#include ++ ++#include "blocklayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++/* Defines used for calculating memory usage in nfs4_blk_flatten() */ ++#define ARGSIZE 24 /* Max bytes needed for linear target arg string */ ++#define SPECSIZE (sizeof8(struct dm_target_spec) + ARGSIZE) ++#define SPECS_PER_PAGE (PAGE_SIZE / SPECSIZE) ++#define SPEC_HEADER_ADJUST (SPECS_PER_PAGE - \ ++ (PAGE_SIZE - sizeof8(struct dm_ioctl)) / SPECSIZE) ++#define roundup8(x) (((x)+7) & ~7) ++#define sizeof8(x) roundup8(sizeof(x)) ++ ++static int dev_remove(dev_t dev) ++{ ++ int ret = 1; ++ struct pipefs_hdr *msg = NULL, *reply = NULL; ++ uint64_t bl_dev; ++ uint32_t major = MAJOR(dev), minor = MINOR(dev); ++ ++ dprintk("Entering %s\n", __func__); ++ ++ if (IS_ERR(bl_device_pipe)) ++ return ret; ++ ++ memcpy((void *)&bl_dev, &major, sizeof(uint32_t)); ++ memcpy((void *)&bl_dev + sizeof(uint32_t), &minor, sizeof(uint32_t)); ++ msg = pipefs_alloc_init_msg(0, BL_DEVICE_UMOUNT, 0, (void *)&bl_dev, ++ sizeof(uint64_t)); ++ if (IS_ERR(msg)) { ++ dprintk("ERROR: couldn't make pipefs message.\n"); ++ goto out; ++ } ++ msg->msgid = hash_ptr(&msg, sizeof(msg->msgid) * 8); ++ msg->status = BL_DEVICE_REQUEST_INIT; ++ ++ reply = pipefs_queue_upcall_waitreply(bl_device_pipe, msg, ++ &bl_device_list, 0, 0); ++ if (IS_ERR(reply)) { ++ dprintk("ERROR: upcall_waitreply failed\n"); ++ goto out; ++ } ++ ++ if (reply->status == BL_DEVICE_REQUEST_PROC) ++ ret = 0; /*TODO: what to return*/ ++out: ++ if (!IS_ERR(reply)) ++ kfree(reply); ++ if (!IS_ERR(msg)) ++ kfree(msg); ++ return ret; ++} ++ ++/* ++ * 
Release meta device ++ */ ++static int nfs4_blk_metadev_release(struct pnfs_block_dev *bdev) ++{ ++ int rv; ++ ++ dprintk("%s Releasing\n", __func__); ++ /* XXX Check return? */ ++ rv = nfs4_blkdev_put(bdev->bm_mdev); ++ dprintk("%s nfs4_blkdev_put returns %d\n", __func__, rv); ++ ++ rv = dev_remove(bdev->bm_mdev->bd_dev); ++ dprintk("%s Returns %d\n", __func__, rv); ++ return rv; ++} ++ ++void free_block_dev(struct pnfs_block_dev *bdev) ++{ ++ if (bdev) { ++ if (bdev->bm_mdev) { ++ dprintk("%s Removing DM device: %d:%d\n", ++ __func__, ++ MAJOR(bdev->bm_mdev->bd_dev), ++ MINOR(bdev->bm_mdev->bd_dev)); ++ /* XXX Check status ?? */ ++ nfs4_blk_metadev_release(bdev); ++ } ++ kfree(bdev); ++ } ++} +diff -up linux-2.6.37.noarch/fs/nfs/blocklayout/blocklayout.h.orig linux-2.6.37.noarch/fs/nfs/blocklayout/blocklayout.h +--- linux-2.6.37.noarch/fs/nfs/blocklayout/blocklayout.h.orig 2011-01-28 09:43:53.308777059 -0500 ++++ linux-2.6.37.noarch/fs/nfs/blocklayout/blocklayout.h 2011-01-28 09:43:53.308777059 -0500 +@@ -0,0 +1,302 @@ ++/* ++ * linux/fs/nfs/blocklayout/blocklayout.h ++ * ++ * Module for the NFSv4.1 pNFS block layout driver. ++ * ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * Fred Isaman ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. 
++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++#ifndef FS_NFS_NFS4BLOCKLAYOUT_H ++#define FS_NFS_NFS4BLOCKLAYOUT_H ++ ++#include ++#include /* Needed for struct dm_ioctl*/ ++#include "../pnfs.h" ++ ++#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> 9) ++ ++#define PG_pnfserr PG_owner_priv_1 ++#define PagePnfsErr(page) test_bit(PG_pnfserr, &(page)->flags) ++#define SetPagePnfsErr(page) set_bit(PG_pnfserr, &(page)->flags) ++#define ClearPagePnfsErr(page) clear_bit(PG_pnfserr, &(page)->flags) ++ ++extern int dm_dev_create(struct dm_ioctl *param); /* from dm-ioctl.c */ ++extern int dm_dev_remove(struct dm_ioctl *param); /* from dm-ioctl.c */ ++extern int dm_do_resume(struct dm_ioctl *param); ++extern int dm_table_load(struct dm_ioctl *param, size_t param_size); ++ ++struct block_mount_id { ++ spinlock_t bm_lock; /* protects list */ ++ struct list_head bm_devlist; /* holds pnfs_block_dev */ ++}; ++ ++struct pnfs_block_dev { ++ struct list_head bm_node; ++ struct nfs4_deviceid bm_mdevid; /* associated devid */ ++ struct block_device *bm_mdev; /* meta device itself */ ++}; ++ ++/* holds visible disks that can be matched against VOLUME_SIMPLE signatures */ ++struct visible_block_device { ++ struct list_head vi_node; ++ struct block_device *vi_bdev; ++ int vi_mapped; ++ int vi_put_done; ++}; ++ ++enum blk_vol_type { ++ 
PNFS_BLOCK_VOLUME_SIMPLE = 0, /* maps to a single LU */ ++ PNFS_BLOCK_VOLUME_SLICE = 1, /* slice of another volume */ ++ PNFS_BLOCK_VOLUME_CONCAT = 2, /* concatenation of multiple volumes */ ++ PNFS_BLOCK_VOLUME_STRIPE = 3 /* striped across multiple volumes */ ++}; ++ ++/* All disk offset/lengths are stored in 512-byte sectors */ ++struct pnfs_blk_volume { ++ uint32_t bv_type; ++ sector_t bv_size; ++ struct pnfs_blk_volume **bv_vols; ++ int bv_vol_n; ++ union { ++ dev_t bv_dev; ++ sector_t bv_stripe_unit; ++ sector_t bv_offset; ++ }; ++}; ++ ++/* Since components need not be aligned, cannot use sector_t */ ++struct pnfs_blk_sig_comp { ++ int64_t bs_offset; /* In bytes */ ++ uint32_t bs_length; /* In bytes */ ++ char *bs_string; ++}; ++ ++/* Maximum number of signatures components in a simple volume */ ++# define PNFS_BLOCK_MAX_SIG_COMP 16 ++ ++struct pnfs_blk_sig { ++ int si_num_comps; ++ struct pnfs_blk_sig_comp si_comps[PNFS_BLOCK_MAX_SIG_COMP]; ++}; ++ ++enum exstate4 { ++ PNFS_BLOCK_READWRITE_DATA = 0, ++ PNFS_BLOCK_READ_DATA = 1, ++ PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */ ++ PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */ ++}; ++ ++#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */ ++ ++struct my_tree_t { ++ sector_t mtt_step_size; /* Internal sector alignment */ ++ struct list_head mtt_stub; /* Should be a radix tree */ ++}; ++ ++struct pnfs_inval_markings { ++ spinlock_t im_lock; ++ struct my_tree_t im_tree; /* Sectors that need LAYOUTCOMMIT */ ++ sector_t im_block_size; /* Server blocksize in sectors */ ++}; ++ ++struct pnfs_inval_tracking { ++ struct list_head it_link; ++ int it_sector; ++ int it_tags; ++}; ++ ++/* sector_t fields are all in 512-byte sectors */ ++struct pnfs_block_extent { ++ struct kref be_refcnt; ++ struct list_head be_node; /* link into lseg list */ ++ struct nfs4_deviceid be_devid; /* STUB - remevable??? 
*/ ++ struct block_device *be_mdev; ++ sector_t be_f_offset; /* the starting offset in the file */ ++ sector_t be_length; /* the size of the extent */ ++ sector_t be_v_offset; /* the starting offset in the volume */ ++ enum exstate4 be_state; /* the state of this extent */ ++ struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */ ++}; ++ ++/* Shortened extent used by LAYOUTCOMMIT */ ++struct pnfs_block_short_extent { ++ struct list_head bse_node; ++ struct nfs4_deviceid bse_devid; /* STUB - removable??? */ ++ struct block_device *bse_mdev; ++ sector_t bse_f_offset; /* the starting offset in the file */ ++ sector_t bse_length; /* the size of the extent */ ++}; ++ ++static inline void ++INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize) ++{ ++ spin_lock_init(&marks->im_lock); ++ INIT_LIST_HEAD(&marks->im_tree.mtt_stub); ++ marks->im_block_size = blocksize; ++ marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS, ++ blocksize); ++} ++ ++enum extentclass4 { ++ RW_EXTENT = 0, /* READWRTE and INVAL */ ++ RO_EXTENT = 1, /* READ and NONE */ ++ EXTENT_LISTS = 2, ++}; ++ ++static inline int choose_list(enum exstate4 state) ++{ ++ if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA) ++ return RO_EXTENT; ++ else ++ return RW_EXTENT; ++} ++ ++struct pnfs_block_layout { ++ struct pnfs_layout_hdr bl_layout; ++ struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */ ++ spinlock_t bl_ext_lock; /* Protects list manipulation */ ++ struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */ ++ struct list_head bl_commit; /* Needs layout commit */ ++ unsigned int bl_count; /* entries in bl_commit */ ++ sector_t bl_blocksize; /* Server blocksize in sectors */ ++}; ++ ++/* this struct is comunicated between: ++ * bl_setup_layoutcommit && bl_encode_layoutcommit && bl_cleanup_layoutcommit ++ */ ++struct bl_layoutupdate_data { ++ struct list_head ranges; ++}; ++ ++#define BLK_ID(lo) ((struct block_mount_id 
*)(NFS_SERVER(lo->inode)->pnfs_ld_data)) ++ ++static inline struct pnfs_block_layout * ++BLK_LO2EXT(struct pnfs_layout_hdr *lo) ++{ ++ return container_of(lo, struct pnfs_block_layout, bl_layout); ++} ++ ++static inline struct pnfs_block_layout * ++BLK_LSEG2EXT(struct pnfs_layout_segment *lseg) ++{ ++ return BLK_LO2EXT(lseg->layout); ++} ++ ++uint32_t *blk_overflow(uint32_t *p, uint32_t *end, size_t nbytes); ++ ++#define BLK_READBUF(p, e, nbytes) do { \ ++ p = blk_overflow(p, e, nbytes); \ ++ if (!p) { \ ++ printk(KERN_WARNING \ ++ "%s: reply buffer overflowed in line %d.\n", \ ++ __func__, __LINE__); \ ++ goto out_err; \ ++ } \ ++} while (0) ++ ++#define READ32(x) (x) = ntohl(*p++) ++#define READ64(x) do { \ ++ (x) = (uint64_t)ntohl(*p++) << 32; \ ++ (x) |= ntohl(*p++); \ ++} while (0) ++#define COPYMEM(x, nbytes) do { \ ++ memcpy((x), p, nbytes); \ ++ p += XDR_QUADLEN(nbytes); \ ++} while (0) ++#define READ_DEVID(x) COPYMEM((x)->data, NFS4_DEVICEID4_SIZE) ++#define READ_SECTOR(x) do { \ ++ READ64(tmp); \ ++ if (tmp & 0x1ff) { \ ++ printk(KERN_WARNING \ ++ "%s Value not 512-byte aligned at line %d\n", \ ++ __func__, __LINE__); \ ++ goto out_err; \ ++ } \ ++ (x) = tmp >> 9; \ ++} while (0) ++ ++#define WRITE32(n) do { \ ++ *p++ = htonl(n); \ ++ } while (0) ++#define WRITE64(n) do { \ ++ *p++ = htonl((uint32_t)((n) >> 32)); \ ++ *p++ = htonl((uint32_t)(n)); \ ++} while (0) ++#define WRITEMEM(ptr, nbytes) do { \ ++ p = xdr_encode_opaque_fixed(p, ptr, nbytes); \ ++} while (0) ++#define WRITE_DEVID(x) WRITEMEM((x)->data, NFS4_DEVICEID4_SIZE) ++ ++/* blocklayoutdev.c */ ++struct block_device *nfs4_blkdev_get(dev_t dev); ++int nfs4_blkdev_put(struct block_device *bdev); ++struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server, ++ struct pnfs_device *dev, ++ struct list_head *sdlist); ++int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, ++ struct nfs4_layoutget_res *lgr); ++int nfs4_blk_create_block_disk_list(struct list_head *); ++void 
nfs4_blk_destroy_disk_list(struct list_head *); ++/* blocklayoutdm.c */ ++int nfs4_blk_flatten(struct pnfs_blk_volume *, int, struct pnfs_block_dev *); ++void free_block_dev(struct pnfs_block_dev *bdev); ++/* extents.c */ ++struct pnfs_block_extent * ++find_get_extent(struct pnfs_block_layout *bl, sector_t isect, ++ struct pnfs_block_extent **cow_read); ++int mark_initialized_sectors(struct pnfs_inval_markings *marks, ++ sector_t offset, sector_t length, ++ sector_t **pages); ++void put_extent(struct pnfs_block_extent *be); ++struct pnfs_block_extent *alloc_extent(void); ++struct pnfs_block_extent *get_extent(struct pnfs_block_extent *be); ++int is_sector_initialized(struct pnfs_inval_markings *marks, sector_t isect); ++int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutcommit_args *arg); ++void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ const struct nfs4_layoutcommit_args *arg, ++ int status); ++int add_and_merge_extent(struct pnfs_block_layout *bl, ++ struct pnfs_block_extent *new); ++int mark_for_commit(struct pnfs_block_extent *be, ++ sector_t offset, sector_t length); ++ ++#include ++ ++extern struct pipefs_list bl_device_list; ++extern struct dentry *bl_device_pipe; ++ ++int bl_pipe_init(void); ++void bl_pipe_exit(void); ++ ++#define BL_DEVICE_UMOUNT 0x0 /* Umount--delete devices */ ++#define BL_DEVICE_MOUNT 0x1 /* Mount--create devices*/ ++#define BL_DEVICE_REQUEST_INIT 0x0 /* Start request */ ++#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */ ++#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ ++ ++#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ +diff -up linux-2.6.37.noarch/fs/nfs/blocklayout/extents.c.orig linux-2.6.37.noarch/fs/nfs/blocklayout/extents.c +--- linux-2.6.37.noarch/fs/nfs/blocklayout/extents.c.orig 2011-01-28 09:43:53.310776657 -0500 ++++ linux-2.6.37.noarch/fs/nfs/blocklayout/extents.c 2011-01-28 09:43:53.311776459 -0500 +@@ -0,0 
+1,948 @@ ++/* ++ * linux/fs/nfs/blocklayout/blocklayout.h ++ * ++ * Module for the NFSv4.1 pNFS block layout driver. ++ * ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * Fred Isaman ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. ++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++ ++#include "blocklayout.h" ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++/* Bit numbers */ ++#define EXTENT_INITIALIZED 0 ++#define EXTENT_WRITTEN 1 ++#define EXTENT_IN_COMMIT 2 ++#define INTERNAL_EXISTS MY_MAX_TAGS ++#define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1) ++ ++/* Returns largest t<=s s.t. 
t%base==0 */ ++static inline sector_t normalize(sector_t s, int base) ++{ ++ sector_t tmp = s; /* Since do_div modifies its argument */ ++ return s - do_div(tmp, base); ++} ++ ++static inline sector_t normalize_up(sector_t s, int base) ++{ ++ return normalize(s + base - 1, base); ++} ++ ++/* Complete stub using list while determine API wanted */ ++ ++/* Returns tags, or negative */ ++static int32_t _find_entry(struct my_tree_t *tree, u64 s) ++{ ++ struct pnfs_inval_tracking *pos; ++ ++ dprintk("%s(%llu) enter\n", __func__, s); ++ list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { ++ if (pos->it_sector > s) ++ continue; ++ else if (pos->it_sector == s) ++ return pos->it_tags & INTERNAL_MASK; ++ else ++ break; ++ } ++ return -ENOENT; ++} ++ ++static inline ++int _has_tag(struct my_tree_t *tree, u64 s, int32_t tag) ++{ ++ int32_t tags; ++ ++ dprintk("%s(%llu, %i) enter\n", __func__, s, tag); ++ s = normalize(s, tree->mtt_step_size); ++ tags = _find_entry(tree, s); ++ if ((tags < 0) || !(tags & (1 << tag))) ++ return 0; ++ else ++ return 1; ++} ++ ++/* Creates entry with tag, or if entry already exists, unions tag to it. ++ * If storage is not NULL, newly created entry will use it. ++ * Returns number of entries added, or negative on error. 
++ */ ++static int _add_entry(struct my_tree_t *tree, u64 s, int32_t tag, ++ struct pnfs_inval_tracking *storage) ++{ ++ int found = 0; ++ struct pnfs_inval_tracking *pos; ++ ++ dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage); ++ list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { ++ if (pos->it_sector > s) ++ continue; ++ else if (pos->it_sector == s) { ++ found = 1; ++ break; ++ } else ++ break; ++ } ++ if (found) { ++ pos->it_tags |= (1 << tag); ++ return 0; ++ } else { ++ struct pnfs_inval_tracking *new; ++ if (storage) ++ new = storage; ++ else { ++ new = kmalloc(sizeof(*new), GFP_KERNEL); ++ if (!new) ++ return -ENOMEM; ++ } ++ new->it_sector = s; ++ new->it_tags = (1 << tag); ++ list_add(&new->it_link, &pos->it_link); ++ return 1; ++ } ++} ++ ++/* XXXX Really want option to not create */ ++/* Over range, unions tag with existing entries, else creates entry with tag */ ++static int _set_range(struct my_tree_t *tree, int32_t tag, u64 s, u64 length) ++{ ++ u64 i; ++ ++ dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length); ++ for (i = normalize(s, tree->mtt_step_size); i < s + length; ++ i += tree->mtt_step_size) ++ if (_add_entry(tree, i, tag, NULL)) ++ return -ENOMEM; ++ return 0; ++} ++ ++/* Ensure that future operations on given range of tree will not malloc */ ++static int _preload_range(struct my_tree_t *tree, u64 offset, u64 length) ++{ ++ u64 start, end, s; ++ int count, i, used = 0, status = -ENOMEM; ++ struct pnfs_inval_tracking **storage; ++ ++ dprintk("%s(%llu, %llu) enter\n", __func__, offset, length); ++ start = normalize(offset, tree->mtt_step_size); ++ end = normalize_up(offset + length, tree->mtt_step_size); ++ count = (int)(end - start) / (int)tree->mtt_step_size; ++ ++ /* Pre-malloc what memory we might need */ ++ storage = kmalloc(sizeof(*storage) * count, GFP_KERNEL); ++ if (!storage) ++ return -ENOMEM; ++ for (i = 0; i < count; i++) { ++ storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking), ++ 
GFP_KERNEL); ++ if (!storage[i]) ++ goto out_cleanup; ++ } ++ ++ /* Now need lock - HOW??? */ ++ ++ for (s = start; s < end; s += tree->mtt_step_size) ++ used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]); ++ ++ /* Unlock - HOW??? */ ++ status = 0; ++ ++ out_cleanup: ++ for (i = used; i < count; i++) { ++ if (!storage[i]) ++ break; ++ kfree(storage[i]); ++ } ++ kfree(storage); ++ return status; ++} ++ ++static void set_needs_init(sector_t *array, sector_t offset) ++{ ++ sector_t *p = array; ++ ++ dprintk("%s enter\n", __func__); ++ if (!p) ++ return; ++ while (*p < offset) ++ p++; ++ if (*p == offset) ++ return; ++ else if (*p == ~0) { ++ *p++ = offset; ++ *p = ~0; ++ return; ++ } else { ++ sector_t *save = p; ++ dprintk("%s Adding %llu\n", __func__, (u64)offset); ++ while (*p != ~0) ++ p++; ++ p++; ++ memmove(save + 1, save, (char *)p - (char *)save); ++ *save = offset; ++ return; ++ } ++} ++ ++/* We are relying on page lock to serialize this */ ++int is_sector_initialized(struct pnfs_inval_markings *marks, sector_t isect) ++{ ++ int rv; ++ ++ spin_lock(&marks->im_lock); ++ rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED); ++ spin_unlock(&marks->im_lock); ++ return rv; ++} ++ ++/* Assume start, end already sector aligned */ ++static int ++_range_has_tag(struct my_tree_t *tree, u64 start, u64 end, int32_t tag) ++{ ++ struct pnfs_inval_tracking *pos; ++ u64 expect = 0; ++ ++ dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag); ++ list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { ++ if (pos->it_sector >= end) ++ continue; ++ if (!expect) { ++ if ((pos->it_sector == end - tree->mtt_step_size) && ++ (pos->it_tags & (1 << tag))) { ++ expect = pos->it_sector - tree->mtt_step_size; ++ if (expect < start) ++ return 1; ++ continue; ++ } else { ++ return 0; ++ } ++ } ++ if (pos->it_sector != expect || !(pos->it_tags & (1 << tag))) ++ return 0; ++ expect -= tree->mtt_step_size; ++ if (expect < start) ++ return 1; ++ } ++ return 0; ++} 
++ ++static int is_range_written(struct pnfs_inval_markings *marks, ++ sector_t start, sector_t end) ++{ ++ int rv; ++ ++ spin_lock(&marks->im_lock); ++ rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN); ++ spin_unlock(&marks->im_lock); ++ return rv; ++} ++ ++/* Marks sectors in [offest, offset_length) as having been initialized. ++ * All lengths are step-aligned, where step is min(pagesize, blocksize). ++ * Notes where partial block is initialized, and helps prepare it for ++ * complete initialization later. ++ */ ++/* Currently assumes offset is page-aligned */ ++int mark_initialized_sectors(struct pnfs_inval_markings *marks, ++ sector_t offset, sector_t length, ++ sector_t **pages) ++{ ++ sector_t s, start, end; ++ sector_t *array = NULL; /* Pages to mark */ ++ ++ dprintk("%s(offset=%llu,len=%llu) enter\n", ++ __func__, (u64)offset, (u64)length); ++ s = max((sector_t) 3, ++ 2 * (marks->im_block_size / (PAGE_CACHE_SECTORS))); ++ dprintk("%s set max=%llu\n", __func__, (u64)s); ++ if (pages) { ++ array = kmalloc(s * sizeof(sector_t), GFP_KERNEL); ++ if (!array) ++ goto outerr; ++ array[0] = ~0; ++ } ++ ++ start = normalize(offset, marks->im_block_size); ++ end = normalize_up(offset + length, marks->im_block_size); ++ if (_preload_range(&marks->im_tree, start, end - start)) ++ goto outerr; ++ ++ spin_lock(&marks->im_lock); ++ ++ for (s = normalize_up(start, PAGE_CACHE_SECTORS); ++ s < offset; s += PAGE_CACHE_SECTORS) { ++ dprintk("%s pre-area pages\n", __func__); ++ /* Portion of used block is not initialized */ ++ if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED)) ++ set_needs_init(array, s); ++ } ++ if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length)) ++ goto out_unlock; ++ for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS); ++ s < end; s += PAGE_CACHE_SECTORS) { ++ dprintk("%s post-area pages\n", __func__); ++ if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED)) ++ set_needs_init(array, s); ++ } ++ ++ 
spin_unlock(&marks->im_lock); ++ ++ if (pages) { ++ if (array[0] == ~0) { ++ kfree(array); ++ *pages = NULL; ++ } else ++ *pages = array; ++ } ++ return 0; ++ ++ out_unlock: ++ spin_unlock(&marks->im_lock); ++ outerr: ++ if (pages) { ++ kfree(array); ++ *pages = NULL; ++ } ++ return -ENOMEM; ++} ++ ++/* Marks sectors in [offest, offset+length) as having been written to disk. ++ * All lengths should be block aligned. ++ */ ++int mark_written_sectors(struct pnfs_inval_markings *marks, ++ sector_t offset, sector_t length) ++{ ++ int status; ++ ++ dprintk("%s(offset=%llu,len=%llu) enter\n", __func__, ++ (u64)offset, (u64)length); ++ spin_lock(&marks->im_lock); ++ status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length); ++ spin_unlock(&marks->im_lock); ++ return status; ++} ++ ++static void print_short_extent(struct pnfs_block_short_extent *be) ++{ ++ dprintk("PRINT SHORT EXTENT extent %p\n", be); ++ if (be) { ++ dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset); ++ dprintk(" be_length %llu\n", (u64)be->bse_length); ++ } ++} ++ ++void print_clist(struct list_head *list, unsigned int count) ++{ ++ struct pnfs_block_short_extent *be; ++ unsigned int i = 0; ++ ++ dprintk("****************\n"); ++ dprintk("Extent list looks like:\n"); ++ list_for_each_entry(be, list, bse_node) { ++ i++; ++ print_short_extent(be); ++ } ++ if (i != count) ++ dprintk("\n\nExpected %u entries\n\n\n", count); ++ dprintk("****************\n"); ++} ++ ++/* Note: In theory, we should do more checking that devid's match between ++ * old and new, but if they don't, the lists are too corrupt to salvage anyway. 
++ */ ++/* Note this is very similar to add_and_merge_extent */ ++static void add_to_commitlist(struct pnfs_block_layout *bl, ++ struct pnfs_block_short_extent *new) ++{ ++ struct list_head *clist = &bl->bl_commit; ++ struct pnfs_block_short_extent *old, *save; ++ sector_t end = new->bse_f_offset + new->bse_length; ++ ++ dprintk("%s enter\n", __func__); ++ print_short_extent(new); ++ print_clist(clist, bl->bl_count); ++ bl->bl_count++; ++ /* Scan for proper place to insert, extending new to the left ++ * as much as possible. ++ */ ++ list_for_each_entry_safe(old, save, clist, bse_node) { ++ if (new->bse_f_offset < old->bse_f_offset) ++ break; ++ if (end <= old->bse_f_offset + old->bse_length) { ++ /* Range is already in list */ ++ bl->bl_count--; ++ kfree(new); ++ return; ++ } else if (new->bse_f_offset <= ++ old->bse_f_offset + old->bse_length) { ++ /* new overlaps or abuts existing be */ ++ if (new->bse_mdev == old->bse_mdev) { ++ /* extend new to fully replace old */ ++ new->bse_length += new->bse_f_offset - ++ old->bse_f_offset; ++ new->bse_f_offset = old->bse_f_offset; ++ list_del(&old->bse_node); ++ bl->bl_count--; ++ kfree(old); ++ } ++ } ++ } ++ /* Note that if we never hit the above break, old will not point to a ++ * valid extent. However, in that case &old->bse_node==list. ++ */ ++ list_add_tail(&new->bse_node, &old->bse_node); ++ /* Scan forward for overlaps. If we find any, extend new and ++ * remove the overlapped extent. 
++ */ ++ old = list_prepare_entry(new, clist, bse_node); ++ list_for_each_entry_safe_continue(old, save, clist, bse_node) { ++ if (end < old->bse_f_offset) ++ break; ++ /* new overlaps or abuts old */ ++ if (new->bse_mdev == old->bse_mdev) { ++ if (end < old->bse_f_offset + old->bse_length) { ++ /* extend new to fully cover old */ ++ end = old->bse_f_offset + old->bse_length; ++ new->bse_length = end - new->bse_f_offset; ++ } ++ list_del(&old->bse_node); ++ bl->bl_count--; ++ kfree(old); ++ } ++ } ++ dprintk("%s: after merging\n", __func__); ++ print_clist(clist, bl->bl_count); ++} ++ ++/* Note the range described by offset, length is guaranteed to be contained ++ * within be. ++ */ ++int mark_for_commit(struct pnfs_block_extent *be, ++ sector_t offset, sector_t length) ++{ ++ sector_t new_end, end = offset + length; ++ struct pnfs_block_short_extent *new; ++ struct pnfs_block_layout *bl = container_of(be->be_inval, ++ struct pnfs_block_layout, ++ bl_inval); ++ ++ new = kmalloc(sizeof(*new), GFP_KERNEL); ++ if (!new) ++ return -ENOMEM; ++ ++ mark_written_sectors(be->be_inval, offset, length); ++ /* We want to add the range to commit list, but it must be ++ * block-normalized, and verified that the normalized range has ++ * been entirely written to disk. 
++ */ ++ new->bse_f_offset = offset; ++ offset = normalize(offset, bl->bl_blocksize); ++ if (offset < new->bse_f_offset) { ++ if (is_range_written(be->be_inval, offset, new->bse_f_offset)) ++ new->bse_f_offset = offset; ++ else ++ new->bse_f_offset = offset + bl->bl_blocksize; ++ } ++ new_end = normalize_up(end, bl->bl_blocksize); ++ if (end < new_end) { ++ if (is_range_written(be->be_inval, end, new_end)) ++ end = new_end; ++ else ++ end = new_end - bl->bl_blocksize; ++ } ++ if (end <= new->bse_f_offset) { ++ kfree(new); ++ return 0; ++ } ++ new->bse_length = end - new->bse_f_offset; ++ new->bse_devid = be->be_devid; ++ new->bse_mdev = be->be_mdev; ++ ++ spin_lock(&bl->bl_ext_lock); ++ /* new will be freed, either by add_to_commitlist if it decides not ++ * to use it, or after LAYOUTCOMMIT uses it in the commitlist. ++ */ ++ add_to_commitlist(bl, new); ++ spin_unlock(&bl->bl_ext_lock); ++ return 0; ++} ++ ++static void print_bl_extent(struct pnfs_block_extent *be) ++{ ++ dprintk("PRINT EXTENT extent %p\n", be); ++ if (be) { ++ dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset); ++ dprintk(" be_length %llu\n", (u64)be->be_length); ++ dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset); ++ dprintk(" be_state %d\n", be->be_state); ++ } ++} ++ ++static void ++destroy_extent(struct kref *kref) ++{ ++ struct pnfs_block_extent *be; ++ ++ be = container_of(kref, struct pnfs_block_extent, be_refcnt); ++ dprintk("%s be=%p\n", __func__, be); ++ kfree(be); ++} ++ ++void ++put_extent(struct pnfs_block_extent *be) ++{ ++ if (be) { ++ dprintk("%s enter %p (%i)\n", __func__, be, ++ atomic_read(&be->be_refcnt.refcount)); ++ kref_put(&be->be_refcnt, destroy_extent); ++ } ++} ++ ++struct pnfs_block_extent *alloc_extent(void) ++{ ++ struct pnfs_block_extent *be; ++ ++ be = kmalloc(sizeof(struct pnfs_block_extent), GFP_KERNEL); ++ if (!be) ++ return NULL; ++ INIT_LIST_HEAD(&be->be_node); ++ kref_init(&be->be_refcnt); ++ be->be_inval = NULL; ++ return be; ++} ++ ++struct 
pnfs_block_extent * ++get_extent(struct pnfs_block_extent *be) ++{ ++ if (be) ++ kref_get(&be->be_refcnt); ++ return be; ++} ++ ++void print_elist(struct list_head *list) ++{ ++ struct pnfs_block_extent *be; ++ dprintk("****************\n"); ++ dprintk("Extent list looks like:\n"); ++ list_for_each_entry(be, list, be_node) { ++ print_bl_extent(be); ++ } ++ dprintk("****************\n"); ++} ++ ++static inline int ++extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new) ++{ ++ /* Note this assumes new->be_f_offset >= old->be_f_offset */ ++ return (new->be_state == old->be_state) && ++ ((new->be_state == PNFS_BLOCK_NONE_DATA) || ++ ((new->be_v_offset - old->be_v_offset == ++ new->be_f_offset - old->be_f_offset) && ++ new->be_mdev == old->be_mdev)); ++} ++ ++/* Adds new to appropriate list in bl, modifying new and removing existing ++ * extents as appropriate to deal with overlaps. ++ * ++ * See find_get_extent for list constraints. ++ * ++ * Refcount on new is already set. If end up not using it, or error out, ++ * need to put the reference. ++ * ++ * Lock is held by caller. ++ */ ++int ++add_and_merge_extent(struct pnfs_block_layout *bl, ++ struct pnfs_block_extent *new) ++{ ++ struct pnfs_block_extent *be, *tmp; ++ sector_t end = new->be_f_offset + new->be_length; ++ struct list_head *list; ++ ++ dprintk("%s enter with be=%p\n", __func__, new); ++ print_bl_extent(new); ++ list = &bl->bl_extents[choose_list(new->be_state)]; ++ print_elist(list); ++ ++ /* Scan for proper place to insert, extending new to the left ++ * as much as possible. 
++ */ ++ list_for_each_entry_safe_reverse(be, tmp, list, be_node) { ++ if (new->be_f_offset >= be->be_f_offset + be->be_length) ++ break; ++ if (new->be_f_offset >= be->be_f_offset) { ++ if (end <= be->be_f_offset + be->be_length) { ++ /* new is a subset of existing be*/ ++ if (extents_consistent(be, new)) { ++ dprintk("%s: new is subset, ignoring\n", ++ __func__); ++ put_extent(new); ++ return 0; ++ } else { ++ goto out_err; ++ } ++ } else { ++ /* |<-- be -->| ++ * |<-- new -->| */ ++ if (extents_consistent(be, new)) { ++ /* extend new to fully replace be */ ++ new->be_length += new->be_f_offset - ++ be->be_f_offset; ++ new->be_f_offset = be->be_f_offset; ++ new->be_v_offset = be->be_v_offset; ++ dprintk("%s: removing %p\n", __func__, be); ++ list_del(&be->be_node); ++ put_extent(be); ++ } else { ++ goto out_err; ++ } ++ } ++ } else if (end >= be->be_f_offset + be->be_length) { ++ /* new extent overlap existing be */ ++ if (extents_consistent(be, new)) { ++ /* extend new to fully replace be */ ++ dprintk("%s: removing %p\n", __func__, be); ++ list_del(&be->be_node); ++ put_extent(be); ++ } else { ++ goto out_err; ++ } ++ } else if (end > be->be_f_offset) { ++ /* |<-- be -->| ++ *|<-- new -->| */ ++ if (extents_consistent(new, be)) { ++ /* extend new to fully replace be */ ++ new->be_length += be->be_f_offset + be->be_length - ++ new->be_f_offset - new->be_length; ++ dprintk("%s: removing %p\n", __func__, be); ++ list_del(&be->be_node); ++ put_extent(be); ++ } else { ++ goto out_err; ++ } ++ } ++ } ++ /* Note that if we never hit the above break, be will not point to a ++ * valid extent. However, in that case &be->be_node==list. ++ */ ++ list_add(&new->be_node, &be->be_node); ++ dprintk("%s: inserting new\n", __func__); ++ print_elist(list); ++ /* STUB - The per-list consistency checks have all been done, ++ * should now check cross-list consistency. ++ */ ++ return 0; ++ ++ out_err: ++ put_extent(new); ++ return -EIO; ++} ++ ++/* Returns extent, or NULL. 
If a second READ extent exists, it is returned ++ * in cow_read, if given. ++ * ++ * The extents are kept in two seperate ordered lists, one for READ and NONE, ++ * one for READWRITE and INVALID. Within each list, we assume: ++ * 1. Extents are ordered by file offset. ++ * 2. For any given isect, there is at most one extents that matches. ++ */ ++struct pnfs_block_extent * ++find_get_extent(struct pnfs_block_layout *bl, sector_t isect, ++ struct pnfs_block_extent **cow_read) ++{ ++ struct pnfs_block_extent *be, *cow, *ret; ++ int i; ++ ++ dprintk("%s enter with isect %llu\n", __func__, (u64)isect); ++ cow = ret = NULL; ++ spin_lock(&bl->bl_ext_lock); ++ for (i = 0; i < EXTENT_LISTS; i++) { ++ if (ret && ++ (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA)) ++ break; ++ list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { ++ if (isect >= be->be_f_offset + be->be_length) ++ break; ++ if (isect >= be->be_f_offset) { ++ /* We have found an extent */ ++ dprintk("%s Get %p (%i)\n", __func__, be, ++ atomic_read(&be->be_refcnt.refcount)); ++ kref_get(&be->be_refcnt); ++ if (!ret) ++ ret = be; ++ else if (be->be_state != PNFS_BLOCK_READ_DATA) ++ put_extent(be); ++ else ++ cow = be; ++ break; ++ } ++ } ++ } ++ spin_unlock(&bl->bl_ext_lock); ++ if (cow_read) ++ *cow_read = cow; ++ print_bl_extent(ret); ++ return ret; ++} ++ ++/* Similar to find_get_extent, but called with lock held, and ignores cow */ ++static struct pnfs_block_extent * ++find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect) ++{ ++ struct pnfs_block_extent *be, *ret = NULL; ++ int i; ++ ++ dprintk("%s enter with isect %llu\n", __func__, (u64)isect); ++ for (i = 0; i < EXTENT_LISTS; i++) { ++ if (ret) ++ break; ++ list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { ++ if (isect >= be->be_f_offset + be->be_length) ++ break; ++ if (isect >= be->be_f_offset) { ++ /* We have found an extent */ ++ dprintk("%s Get %p (%i)\n", __func__, be, ++ 
atomic_read(&be->be_refcnt.refcount)); ++ kref_get(&be->be_refcnt); ++ ret = be; ++ break; ++ } ++ } ++ } ++ print_bl_extent(ret); ++ return ret; ++} ++ ++int ++encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutcommit_args *arg) ++{ ++ sector_t start, end; ++ struct pnfs_block_short_extent *lce, *save; ++ unsigned int count = 0; ++ struct bl_layoutupdate_data *bld = arg->layoutdriver_data; ++ struct list_head *ranges = &bld->ranges; ++ __be32 *p, *xdr_start; ++ ++ dprintk("%s enter\n", __func__); ++ start = arg->range.offset >> 9; ++ end = start + (arg->range.length >> 9); ++ dprintk("%s set start=%llu, end=%llu\n", ++ __func__, (u64)start, (u64)end); ++ ++ /* BUG - creation of bl_commit is buggy - need to wait for ++ * entire block to be marked WRITTEN before it can be added. ++ */ ++ spin_lock(&bl->bl_ext_lock); ++ /* Want to adjust for possible truncate */ ++ /* We now want to adjust argument range */ ++ ++ /* XDR encode the ranges found */ ++ xdr_start = xdr_reserve_space(xdr, 8); ++ if (!xdr_start) ++ goto out; ++ list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) { ++ p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data)); ++ if (!p) ++ break; ++ WRITE_DEVID(&lce->bse_devid); ++ WRITE64(lce->bse_f_offset << 9); ++ WRITE64(lce->bse_length << 9); ++ WRITE64(0LL); ++ WRITE32(PNFS_BLOCK_READWRITE_DATA); ++ list_del(&lce->bse_node); ++ list_add_tail(&lce->bse_node, ranges); ++ bl->bl_count--; ++ count++; ++ } ++ xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4); ++ xdr_start[1] = cpu_to_be32(count); ++out: ++ spin_unlock(&bl->bl_ext_lock); ++ dprintk("%s found %i ranges\n", __func__, count); ++ return 0; ++} ++ ++/* Helper function to set_to_rw that initialize a new extent */ ++static void ++_prep_new_extent(struct pnfs_block_extent *new, ++ struct pnfs_block_extent *orig, ++ sector_t offset, sector_t length, int state) ++{ ++ kref_init(&new->be_refcnt); ++ /* don't need to 
INIT_LIST_HEAD(&new->be_node) */ ++ memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid)); ++ new->be_mdev = orig->be_mdev; ++ new->be_f_offset = offset; ++ new->be_length = length; ++ new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset; ++ new->be_state = state; ++ new->be_inval = orig->be_inval; ++} ++ ++/* Tries to merge be with extent in front of it in list. ++ * Frees storage if not used. ++ */ ++static struct pnfs_block_extent * ++_front_merge(struct pnfs_block_extent *be, struct list_head *head, ++ struct pnfs_block_extent *storage) ++{ ++ struct pnfs_block_extent *prev; ++ ++ if (!storage) ++ goto no_merge; ++ if (&be->be_node == head || be->be_node.prev == head) ++ goto no_merge; ++ prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node); ++ if ((prev->be_f_offset + prev->be_length != be->be_f_offset) || ++ !extents_consistent(prev, be)) ++ goto no_merge; ++ _prep_new_extent(storage, prev, prev->be_f_offset, ++ prev->be_length + be->be_length, prev->be_state); ++ list_replace(&prev->be_node, &storage->be_node); ++ put_extent(prev); ++ list_del(&be->be_node); ++ put_extent(be); ++ return storage; ++ ++ no_merge: ++ kfree(storage); ++ return be; ++} ++ ++static u64 ++set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length) ++{ ++ u64 rv = offset + length; ++ struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old; ++ struct pnfs_block_extent *children[3]; ++ struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL; ++ int i = 0, j; ++ ++ dprintk("%s(%llu, %llu)\n", __func__, offset, length); ++ /* Create storage for up to three new extents e1, e2, e3 */ ++ e1 = kmalloc(sizeof(*e1), GFP_KERNEL); ++ e2 = kmalloc(sizeof(*e2), GFP_KERNEL); ++ e3 = kmalloc(sizeof(*e3), GFP_KERNEL); ++ /* BUG - we are ignoring any failure */ ++ if (!e1 || !e2 || !e3) ++ goto out_nosplit; ++ ++ spin_lock(&bl->bl_ext_lock); ++ be = find_get_extent_locked(bl, offset); ++ rv = be->be_f_offset + be->be_length; ++ if (be->be_state != 
PNFS_BLOCK_INVALID_DATA) { ++ spin_unlock(&bl->bl_ext_lock); ++ goto out_nosplit; ++ } ++ /* Add e* to children, bumping e*'s krefs */ ++ if (be->be_f_offset != offset) { ++ _prep_new_extent(e1, be, be->be_f_offset, ++ offset - be->be_f_offset, ++ PNFS_BLOCK_INVALID_DATA); ++ children[i++] = e1; ++ print_bl_extent(e1); ++ } else ++ merge1 = e1; ++ _prep_new_extent(e2, be, offset, ++ min(length, be->be_f_offset + be->be_length - offset), ++ PNFS_BLOCK_READWRITE_DATA); ++ children[i++] = e2; ++ print_bl_extent(e2); ++ if (offset + length < be->be_f_offset + be->be_length) { ++ _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length, ++ be->be_f_offset + be->be_length - ++ offset - length, ++ PNFS_BLOCK_INVALID_DATA); ++ children[i++] = e3; ++ print_bl_extent(e3); ++ } else ++ merge2 = e3; ++ ++ /* Remove be from list, and insert the e* */ ++ /* We don't get refs on e*, since this list is the base reference ++ * set when init'ed. ++ */ ++ if (i < 3) ++ children[i] = NULL; ++ new = children[0]; ++ list_replace(&be->be_node, &new->be_node); ++ put_extent(be); ++ new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1); ++ for (j = 1; j < i; j++) { ++ old = new; ++ new = children[j]; ++ list_add(&new->be_node, &old->be_node); ++ } ++ if (merge2) { ++ /* This is a HACK, should just create a _back_merge function */ ++ new = list_entry(new->be_node.next, ++ struct pnfs_block_extent, be_node); ++ new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2); ++ } ++ spin_unlock(&bl->bl_ext_lock); ++ ++ /* Since we removed the base reference above, be is now scheduled for ++ * destruction. 
++ */ ++ put_extent(be); ++ dprintk("%s returns %llu after split\n", __func__, rv); ++ return rv; ++ ++ out_nosplit: ++ kfree(e1); ++ kfree(e2); ++ kfree(e3); ++ dprintk("%s returns %llu without splitting\n", __func__, rv); ++ return rv; ++} ++ ++void ++clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ const struct nfs4_layoutcommit_args *arg, ++ int status) ++{ ++ struct bl_layoutupdate_data *bld = arg->layoutdriver_data; ++ struct pnfs_block_short_extent *lce, *save; ++ ++ dprintk("%s status %d\n", __func__, status); ++ list_for_each_entry_safe_reverse(lce, save, &bld->ranges, bse_node) { ++ if (likely(!status)) { ++ u64 offset = lce->bse_f_offset; ++ u64 end = offset + lce->bse_length; ++ ++ do { ++ offset = set_to_rw(bl, offset, end - offset); ++ } while (offset < end); ++ ++ kfree(lce); ++ } else { ++ spin_lock(&bl->bl_ext_lock); ++ add_to_commitlist(bl, lce); ++ spin_unlock(&bl->bl_ext_lock); ++ } ++ } ++} +diff -up linux-2.6.37.noarch/fs/nfs/blocklayout/Makefile.orig linux-2.6.37.noarch/fs/nfs/blocklayout/Makefile +--- linux-2.6.37.noarch/fs/nfs/blocklayout/Makefile.orig 2011-01-28 09:43:53.305777685 -0500 ++++ linux-2.6.37.noarch/fs/nfs/blocklayout/Makefile 2011-01-28 09:43:53.305777685 -0500 +@@ -0,0 +1,6 @@ ++# ++# Makefile for the pNFS block layout driver kernel module ++# ++obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o ++blocklayoutdriver-objs := blocklayout.o blocklayoutdev.o blocklayoutdm.o \ ++ extents.o block-device-discovery-pipe.o +diff -up linux-2.6.37.noarch/fs/nfs/callback.h.orig linux-2.6.37.noarch/fs/nfs/callback.h +--- linux-2.6.37.noarch/fs/nfs/callback.h.orig 2011-01-28 09:37:32.520980712 -0500 ++++ linux-2.6.37.noarch/fs/nfs/callback.h 2011-01-28 09:43:53.311776459 -0500 +@@ -164,9 +164,39 @@ struct cb_layoutrecallargs { + extern unsigned nfs4_callback_layoutrecall( + struct cb_layoutrecallargs *args, + void *dummy, struct cb_process_state *cps); ++extern bool matches_outstanding_recall(struct inode *ino, ++ struct 
pnfs_layout_range *range); ++extern void notify_drained(struct nfs_client *clp, u64 mask); ++extern void nfs_client_return_layouts(struct nfs_client *clp); + + extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses); + extern void nfs4_cb_take_slot(struct nfs_client *clp); ++ ++struct cb_devicenotifyitem { ++ uint32_t cbd_notify_type; ++ uint32_t cbd_layout_type; ++ struct nfs4_deviceid cbd_dev_id; ++ uint32_t cbd_immediate; ++}; ++ ++/* XXX: Should be dynamic up to max compound size */ ++#define NFS4_DEV_NOTIFY_MAXENTRIES 10 ++struct cb_devicenotifyargs { ++ struct sockaddr *addr; ++ int ndevs; ++ struct cb_devicenotifyitem devs[NFS4_DEV_NOTIFY_MAXENTRIES]; ++}; ++ ++extern __be32 nfs4_callback_devicenotify( ++ struct cb_devicenotifyargs *args, ++ void *dummy, struct cb_process_state *cps); ++ ++#else /* CONFIG_NFS_V4_1 */ ++ ++static inline void nfs_client_return_layouts(struct nfs_client *clp) ++{ ++} ++ + #endif /* CONFIG_NFS_V4_1 */ + + extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, +diff -up linux-2.6.37.noarch/fs/nfs/callback_proc.c.orig linux-2.6.37.noarch/fs/nfs/callback_proc.c +--- linux-2.6.37.noarch/fs/nfs/callback_proc.c.orig 2011-01-28 09:37:32.521980677 -0500 ++++ linux-2.6.37.noarch/fs/nfs/callback_proc.c 2011-01-28 09:43:53.312776264 -0500 +@@ -108,106 +108,277 @@ int nfs4_validate_delegation_stateid(str + + #if defined(CONFIG_NFS_V4_1) + +-static u32 initiate_file_draining(struct nfs_client *clp, +- struct cb_layoutrecallargs *args) ++static bool ++_recall_matches_lget(struct pnfs_cb_lrecall_info *cb_info, ++ struct inode *ino, struct pnfs_layout_range *range) + { +- struct pnfs_layout_hdr *lo; +- struct inode *ino; +- bool found = false; +- u32 rv = NFS4ERR_NOMATCHING_LAYOUT; +- LIST_HEAD(free_me_list); ++ struct cb_layoutrecallargs *cb_args = &cb_info->pcl_args; + +- spin_lock(&clp->cl_lock); +- list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) { +- if (nfs_compare_fh(&args->cbl_fh, +- 
&NFS_I(lo->plh_inode)->fh)) +- continue; +- ino = igrab(lo->plh_inode); +- if (!ino) +- continue; +- found = true; +- /* Without this, layout can be freed as soon +- * as we release cl_lock. ++ switch (cb_args->cbl_recall_type) { ++ case RETURN_ALL: ++ return true; ++ case RETURN_FSID: ++ return !memcmp(&NFS_SERVER(ino)->fsid, &cb_args->cbl_fsid, ++ sizeof(struct nfs_fsid)); ++ case RETURN_FILE: ++ return (ino == cb_info->pcl_ino) && ++ should_free_lseg(range, &cb_args->cbl_range); ++ default: ++ /* Should never hit here, as decode_layoutrecall_args() ++ * will verify cb_info from server. + */ +- get_layout_hdr(lo); +- break; ++ BUG(); + } +- spin_unlock(&clp->cl_lock); +- if (!found) +- return NFS4ERR_NOMATCHING_LAYOUT; ++} + +- spin_lock(&ino->i_lock); +- if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || +- mark_matching_lsegs_invalid(lo, &free_me_list, +- args->cbl_range.iomode)) +- rv = NFS4ERR_DELAY; +- else +- rv = NFS4ERR_NOMATCHING_LAYOUT; +- pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); +- spin_unlock(&ino->i_lock); +- pnfs_free_lseg_list(&free_me_list); +- put_layout_hdr(lo); +- iput(ino); ++bool ++matches_outstanding_recall(struct inode *ino, struct pnfs_layout_range *range) ++{ ++ struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; ++ struct pnfs_cb_lrecall_info *cb_info; ++ bool rv = false; ++ ++ assert_spin_locked(&clp->cl_lock); ++ list_for_each_entry(cb_info, &clp->cl_layoutrecalls, pcl_list) { ++ if (_recall_matches_lget(cb_info, ino, range)) { ++ rv = true; ++ break; ++ } ++ } + return rv; + } + +-static u32 initiate_bulk_draining(struct nfs_client *clp, +- struct cb_layoutrecallargs *args) ++/* Send a synchronous LAYOUTRETURN. By the time this is called, we know ++ * all IO has been drained, any matching lsegs deleted, and that no ++ * overlapping LAYOUTGETs will be sent or processed for the duration ++ * of this call. ++ * Note that it is possible that when this is called, the stateid has ++ * been invalidated. 
But will not be cleared, so can still use. ++ */ ++static int ++pnfs_send_layoutreturn(struct nfs_client *clp, ++ struct pnfs_cb_lrecall_info *cb_info) ++{ ++ struct cb_layoutrecallargs *args = &cb_info->pcl_args; ++ struct nfs4_layoutreturn *lrp; ++ ++ lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); ++ if (!lrp) ++ return -ENOMEM; ++ lrp->args.reclaim = 0; ++ lrp->args.layout_type = args->cbl_layout_type; ++ lrp->args.return_type = args->cbl_recall_type; ++ lrp->clp = clp; ++ if (args->cbl_recall_type == RETURN_FILE) { ++ lrp->args.range = args->cbl_range; ++ lrp->args.inode = cb_info->pcl_ino; ++ } else { ++ lrp->args.range.iomode = IOMODE_ANY; ++ lrp->args.inode = NULL; ++ } ++ return nfs4_proc_layoutreturn(lrp, true); ++} ++ ++/* Called by state manager to finish CB_LAYOUTRECALLS initiated by ++ * nfs4_callback_layoutrecall(). ++ */ ++void nfs_client_return_layouts(struct nfs_client *clp) + { +- struct pnfs_layout_hdr *lo; +- struct inode *ino; +- u32 rv = NFS4ERR_NOMATCHING_LAYOUT; +- struct pnfs_layout_hdr *tmp; +- LIST_HEAD(recall_list); +- LIST_HEAD(free_me_list); +- struct pnfs_layout_range range = { +- .iomode = IOMODE_ANY, +- .offset = 0, +- .length = NFS4_MAX_UINT64, +- }; ++ struct pnfs_cb_lrecall_info *cb_info; + + spin_lock(&clp->cl_lock); +- list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) { +- if ((args->cbl_recall_type == RETURN_FSID) && +- memcmp(&NFS_SERVER(lo->plh_inode)->fsid, +- &args->cbl_fsid, sizeof(struct nfs_fsid))) +- continue; +- if (!igrab(lo->plh_inode)) +- continue; +- get_layout_hdr(lo); +- BUG_ON(!list_empty(&lo->plh_bulk_recall)); +- list_add(&lo->plh_bulk_recall, &recall_list); ++ while (true) { ++ if (list_empty(&clp->cl_layoutrecalls)) { ++ spin_unlock(&clp->cl_lock); ++ break; ++ } ++ cb_info = list_first_entry(&clp->cl_layoutrecalls, ++ struct pnfs_cb_lrecall_info, ++ pcl_list); ++ spin_unlock(&clp->cl_lock); ++ if (atomic_read(&cb_info->pcl_count) != 0) ++ break; ++ /* What do on error return? 
These layoutreturns are ++ * required by the protocol. So if do not get ++ * successful reply, probably have to do something ++ * more drastic. ++ */ ++ pnfs_send_layoutreturn(clp, cb_info); ++ spin_lock(&clp->cl_lock); ++ /* Removing from the list unblocks LAYOUTGETs */ ++ list_del(&cb_info->pcl_list); ++ clp->cl_cb_lrecall_count--; ++ clp->cl_drain_notification[1 << cb_info->pcl_notify_bit] = NULL; ++ rpc_wake_up(&clp->cl_rpcwaitq_recall); ++ kfree(cb_info); + } +- spin_unlock(&clp->cl_lock); +- list_for_each_entry_safe(lo, tmp, +- &recall_list, plh_bulk_recall) { +- ino = lo->plh_inode; +- spin_lock(&ino->i_lock); +- set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); +- if (mark_matching_lsegs_invalid(lo, &free_me_list, range.iomode)) +- rv = NFS4ERR_DELAY; +- list_del_init(&lo->plh_bulk_recall); +- spin_unlock(&ino->i_lock); +- put_layout_hdr(lo); +- iput(ino); ++} ++ ++void notify_drained(struct nfs_client *clp, u64 mask) ++{ ++ atomic_t **ptr = clp->cl_drain_notification; ++ bool done = false; ++ ++ /* clp lock not needed except to remove used up entries */ ++ /* Should probably use functions defined in bitmap.h */ ++ while (mask) { ++ if ((mask & 1) && (atomic_dec_and_test(*ptr))) ++ done = true; ++ mask >>= 1; ++ ptr++; ++ } ++ if (done) { ++ set_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state); ++ nfs4_schedule_state_manager(clp); ++ } ++} ++ ++static int initiate_layout_draining(struct pnfs_cb_lrecall_info *cb_info) ++{ ++ struct nfs_client *clp = cb_info->pcl_clp; ++ struct pnfs_layout_hdr *lo; ++ int rv = NFS4ERR_NOMATCHING_LAYOUT; ++ struct cb_layoutrecallargs *args = &cb_info->pcl_args; ++ ++ if (args->cbl_recall_type == RETURN_FILE) { ++ LIST_HEAD(free_me_list); ++ ++ spin_lock(&clp->cl_lock); ++ list_for_each_entry(lo, &clp->cl_layouts, layouts) { ++ if (nfs_compare_fh(&args->cbl_fh, ++ &NFS_I(lo->inode)->fh)) ++ continue; ++ if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) ++ rv = NFS4ERR_DELAY; ++ else { ++ /* FIXME I need to better understand igrab 
and ++ * does having a layout ref keep ino around? ++ * It should. ++ */ ++ /* We need to hold the reference until any ++ * potential LAYOUTRETURN is finished. ++ */ ++ get_layout_hdr(lo); ++ cb_info->pcl_ino = lo->inode; ++ rv = NFS4_OK; ++ } ++ break; ++ } ++ spin_unlock(&clp->cl_lock); ++ ++ spin_lock(&lo->inode->i_lock); ++ if (rv == NFS4_OK) { ++ lo->plh_block_lgets++; ++ nfs4_asynch_forget_layouts(lo, &args->cbl_range, ++ cb_info->pcl_notify_bit, ++ &cb_info->pcl_count, ++ &free_me_list); ++ } ++ pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); ++ spin_unlock(&lo->inode->i_lock); ++ pnfs_free_lseg_list(&free_me_list); ++ } else { ++ struct pnfs_layout_hdr *tmp; ++ LIST_HEAD(recall_list); ++ LIST_HEAD(free_me_list); ++ struct pnfs_layout_range range = { ++ .iomode = IOMODE_ANY, ++ .offset = 0, ++ .length = NFS4_MAX_UINT64, ++ }; ++ ++ spin_lock(&clp->cl_lock); ++ /* Per RFC 5661, 12.5.5.2.1.5, bulk recall must be serialized */ ++ if (!list_is_singular(&clp->cl_layoutrecalls)) { ++ spin_unlock(&clp->cl_lock); ++ return NFS4ERR_DELAY; ++ } ++ list_for_each_entry(lo, &clp->cl_layouts, layouts) { ++ if ((args->cbl_recall_type == RETURN_FSID) && ++ memcmp(&NFS_SERVER(lo->inode)->fsid, ++ &args->cbl_fsid, sizeof(struct nfs_fsid))) ++ continue; ++ get_layout_hdr(lo); ++ /* We could list_del(&lo->layouts) here */ ++ BUG_ON(!list_empty(&lo->plh_bulk_recall)); ++ list_add(&lo->plh_bulk_recall, &recall_list); ++ } ++ spin_unlock(&clp->cl_lock); ++ list_for_each_entry_safe(lo, tmp, ++ &recall_list, plh_bulk_recall) { ++ spin_lock(&lo->inode->i_lock); ++ set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); ++ nfs4_asynch_forget_layouts(lo, &range, ++ cb_info->pcl_notify_bit, ++ &cb_info->pcl_count, ++ &free_me_list); ++ list_del_init(&lo->plh_bulk_recall); ++ spin_unlock(&lo->inode->i_lock); ++ put_layout_hdr(lo); ++ rv = NFS4_OK; ++ } ++ pnfs_free_lseg_list(&free_me_list); + } +- pnfs_free_lseg_list(&free_me_list); + return rv; + } + + static u32 
do_callback_layoutrecall(struct nfs_client *clp, + struct cb_layoutrecallargs *args) + { +- u32 res = NFS4ERR_DELAY; ++ struct pnfs_cb_lrecall_info *new; ++ atomic_t **ptr; ++ int bit_num; ++ u32 res; + + dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type); +- if (test_and_set_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state)) ++ new = kmalloc(sizeof(*new), GFP_KERNEL); ++ if (!new) { ++ res = NFS4ERR_DELAY; + goto out; +- if (args->cbl_recall_type == RETURN_FILE) +- res = initiate_file_draining(clp, args); +- else +- res = initiate_bulk_draining(clp, args); +- clear_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state); ++ } ++ memcpy(&new->pcl_args, args, sizeof(*args)); ++ atomic_set(&new->pcl_count, 1); ++ new->pcl_clp = clp; ++ new->pcl_ino = NULL; ++ spin_lock(&clp->cl_lock); ++ if (clp->cl_cb_lrecall_count >= PNFS_MAX_CB_LRECALLS) { ++ kfree(new); ++ res = NFS4ERR_DELAY; ++ spin_unlock(&clp->cl_lock); ++ goto out; ++ } ++ clp->cl_cb_lrecall_count++; ++ /* Adding to the list will block conflicting LGET activity */ ++ list_add_tail(&new->pcl_list, &clp->cl_layoutrecalls); ++ for (bit_num = 0, ptr = clp->cl_drain_notification; *ptr; ptr++) ++ bit_num++; ++ *ptr = &new->pcl_count; ++ new->pcl_notify_bit = bit_num; ++ spin_unlock(&clp->cl_lock); ++ res = initiate_layout_draining(new); ++ if (res || atomic_dec_and_test(&new->pcl_count)) { ++ spin_lock(&clp->cl_lock); ++ list_del(&new->pcl_list); ++ clp->cl_cb_lrecall_count--; ++ clp->cl_drain_notification[1 << bit_num] = NULL; ++ rpc_wake_up(&clp->cl_rpcwaitq_recall); ++ spin_unlock(&clp->cl_lock); ++ if (res == NFS4_OK) { ++ if (args->cbl_recall_type == RETURN_FILE) { ++ struct pnfs_layout_hdr *lo; ++ ++ lo = NFS_I(new->pcl_ino)->layout; ++ spin_lock(&lo->inode->i_lock); ++ lo->plh_block_lgets--; ++ if (!pnfs_layoutgets_blocked(lo, NULL)) ++ rpc_wake_up(&NFS_I(lo->inode)->lo_rpcwaitq_stateid); ++ spin_unlock(&lo->inode->i_lock); ++ put_layout_hdr(lo); ++ } ++ res = NFS4ERR_NOMATCHING_LAYOUT; ++ } ++ kfree(new); ++ } + 
out: + dprintk("%s returning %i\n", __func__, res); + return res; +@@ -241,6 +412,36 @@ static void pnfs_recall_all_layouts(stru + do_callback_layoutrecall(clp, &args); + } + ++__be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args, ++ void *dummy, struct cb_process_state *cps) ++{ ++ int i; ++ u32 type, res = 0; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ if (!cps->clp) { ++ res = NFS4ERR_OP_NOT_IN_SESSION; ++ goto out; ++ } ++ ++ for (i = 0; i < args->ndevs; i++) { ++ struct cb_devicenotifyitem *dev = &args->devs[i]; ++ type = dev->cbd_notify_type; ++ if (type == NOTIFY_DEVICEID4_DELETE && cps->clp->cl_devid_cache) ++ pnfs_delete_deviceid(cps->clp->cl_devid_cache, ++ &dev->cbd_dev_id); ++ else if (type == NOTIFY_DEVICEID4_CHANGE) ++ printk(KERN_ERR "%s: NOTIFY_DEVICEID4_CHANGE " ++ "not supported\n", __func__); ++ } ++ ++out: ++ dprintk("%s: exit with status = %u\n", ++ __func__, res); ++ return cpu_to_be32(res); ++} ++ + int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) + { + if (delegation == NULL) +diff -up linux-2.6.37.noarch/fs/nfs/callback_xdr.c.orig linux-2.6.37.noarch/fs/nfs/callback_xdr.c +--- linux-2.6.37.noarch/fs/nfs/callback_xdr.c.orig 2011-01-28 09:37:32.522980641 -0500 ++++ linux-2.6.37.noarch/fs/nfs/callback_xdr.c 2011-01-28 09:43:53.313776069 -0500 +@@ -25,6 +25,7 @@ + + #if defined(CONFIG_NFS_V4_1) + #define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) ++#define CB_OP_DEVICENOTIFY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) + #define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ + 4 + 1 + 3) + #define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +@@ -284,6 +285,93 @@ out: + return status; + } + ++static ++__be32 decode_devicenotify_args(struct svc_rqst *rqstp, ++ struct xdr_stream *xdr, ++ struct cb_devicenotifyargs *args) ++{ ++ __be32 *p; ++ __be32 status = 0; ++ u32 tmp; ++ int n, i; ++ args->ndevs = 0; ++ ++ args->addr = svc_addr(rqstp); ++ ++ /* Num of device notifications */ 
++ p = read_buf(xdr, sizeof(uint32_t)); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_RESOURCE); ++ goto out; ++ } ++ n = ntohl(*p++); ++ if (n <= 0) ++ goto out; ++ ++ /* XXX: need to possibly return error in this case */ ++ if (n > NFS4_DEV_NOTIFY_MAXENTRIES) { ++ dprintk("%s: Processing (%d) notifications out of (%d)\n", ++ __func__, NFS4_DEV_NOTIFY_MAXENTRIES, n); ++ n = NFS4_DEV_NOTIFY_MAXENTRIES; ++ } ++ ++ /* Decode each dev notification */ ++ for (i = 0; i < n; i++) { ++ struct cb_devicenotifyitem *dev = &args->devs[i]; ++ ++ p = read_buf(xdr, (4 * sizeof(uint32_t)) + NFS4_DEVICEID4_SIZE); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_RESOURCE); ++ goto out; ++ } ++ ++ tmp = ntohl(*p++); /* bitmap size */ ++ if (tmp != 1) { ++ status = htonl(NFS4ERR_INVAL); ++ goto out; ++ } ++ dev->cbd_notify_type = ntohl(*p++); ++ if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE && ++ dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) { ++ status = htonl(NFS4ERR_INVAL); ++ goto out; ++ } ++ ++ tmp = ntohl(*p++); /* opaque size */ ++ if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) && ++ (tmp != NFS4_DEVICEID4_SIZE + 8)) || ++ ((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) && ++ (tmp != NFS4_DEVICEID4_SIZE + 4))) { ++ status = htonl(NFS4ERR_INVAL); ++ goto out; ++ } ++ dev->cbd_layout_type = ntohl(*p++); ++ memcpy(dev->cbd_dev_id.data, p, NFS4_DEVICEID4_SIZE); ++ p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); ++ ++ if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) { ++ p = read_buf(xdr, sizeof(uint32_t)); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_DELAY); ++ goto out; ++ } ++ dev->cbd_immediate = ntohl(*p++); ++ } else { ++ dev->cbd_immediate = 0; ++ } ++ ++ args->ndevs++; ++ ++ dprintk("%s: type %d layout 0x%x immediate %d\n", ++ __func__, dev->cbd_notify_type, dev->cbd_layout_type, ++ dev->cbd_immediate); ++ } ++out: ++ dprintk("%s: status %d ndevs %d\n", ++ __func__, ntohl(status), args->ndevs); ++ return status; ++} ++ + static 
__be32 decode_sessionid(struct xdr_stream *xdr, + struct nfs4_sessionid *sid) + { +@@ -639,10 +727,10 @@ preprocess_nfs41_op(int nop, unsigned in + case OP_CB_RECALL_ANY: + case OP_CB_RECALL_SLOT: + case OP_CB_LAYOUTRECALL: ++ case OP_CB_NOTIFY_DEVICEID: + *op = &callback_ops[op_nr]; + break; + +- case OP_CB_NOTIFY_DEVICEID: + case OP_CB_NOTIFY: + case OP_CB_PUSH_DELEG: + case OP_CB_RECALLABLE_OBJ_AVAIL: +@@ -850,6 +938,12 @@ static struct callback_op callback_ops[] + (callback_decode_arg_t)decode_layoutrecall_args, + .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ, + }, ++ [OP_CB_NOTIFY_DEVICEID] = { ++ .process_op = (callback_process_op_t)nfs4_callback_devicenotify, ++ .decode_args = ++ (callback_decode_arg_t)decode_devicenotify_args, ++ .res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ, ++ }, + [OP_CB_SEQUENCE] = { + .process_op = (callback_process_op_t)nfs4_callback_sequence, + .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, +diff -up linux-2.6.37.noarch/fs/nfs/client.c.orig linux-2.6.37.noarch/fs/nfs/client.c +--- linux-2.6.37.noarch/fs/nfs/client.c.orig 2011-01-28 09:37:32.523980606 -0500 ++++ linux-2.6.37.noarch/fs/nfs/client.c 2011-01-28 09:43:53.314775880 -0500 +@@ -185,6 +185,9 @@ static struct nfs_client *nfs_alloc_clie + clp->cl_machine_cred = cred; + #if defined(CONFIG_NFS_V4_1) + INIT_LIST_HEAD(&clp->cl_layouts); ++ INIT_LIST_HEAD(&clp->cl_layoutrecalls); ++ rpc_init_wait_queue(&clp->cl_rpcwaitq_recall, ++ "NFS client CB_LAYOUTRECALLS"); + #endif + nfs_fscache_get_client_cookie(clp); + +@@ -243,11 +246,6 @@ static void nfs_cb_idr_remove_locked(str + idr_remove(&cb_ident_idr, clp->cl_cb_ident); + } + +-static void pnfs_init_server(struct nfs_server *server) +-{ +- rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC"); +-} +- + #else + static void nfs4_shutdown_client(struct nfs_client *clp) + { +@@ -261,10 +259,6 @@ static void nfs_cb_idr_remove_locked(str + { + } + +-static void pnfs_init_server(struct nfs_server *server) +-{ +-} +- + #endif /* 
CONFIG_NFS_V4 */ + + /* +@@ -404,7 +398,7 @@ static int nfs_sockaddr_match_ipaddr(con + * Test if two socket addresses represent the same actual socket, + * by comparing (only) relevant fields, including the port number. + */ +-static int nfs_sockaddr_cmp(const struct sockaddr *sa1, ++int nfs_sockaddr_cmp(const struct sockaddr *sa1, + const struct sockaddr *sa2) + { + if (sa1->sa_family != sa2->sa_family) +@@ -418,6 +412,7 @@ static int nfs_sockaddr_cmp(const struct + } + return 0; + } ++EXPORT_SYMBOL(nfs_sockaddr_cmp); + + /* Common match routine for v4.0 and v4.1 callback services */ + bool +@@ -567,6 +562,7 @@ int nfs4_check_client_ready(struct nfs_c + return -EPROTONOSUPPORT; + return 0; + } ++EXPORT_SYMBOL(nfs4_check_client_ready); + + /* + * Initialise the timeout values for a connection +@@ -889,7 +885,7 @@ error: + /* + * Load up the server record from information gained in an fsinfo record + */ +-static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo) ++static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fsinfo *fsinfo) + { + unsigned long max_rpc_payload; + +@@ -919,7 +915,9 @@ static void nfs_server_set_fsinfo(struct + if (server->wsize > NFS_MAX_FILE_IO_SIZE) + server->wsize = NFS_MAX_FILE_IO_SIZE; + server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; +- set_pnfs_layoutdriver(server, fsinfo->layouttype); ++ server->pnfs_blksize = fsinfo->blksize; ++ set_pnfs_layoutdriver(server, mntfh, fsinfo->layouttype); ++ pnfs_set_ds_iosize(server); + + server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); + +@@ -965,7 +963,7 @@ static int nfs_probe_fsinfo(struct nfs_s + if (error < 0) + goto out_error; + +- nfs_server_set_fsinfo(server, &fsinfo); ++ nfs_server_set_fsinfo(server, mntfh, &fsinfo); + + /* Get some general file system info */ + if (server->namelen == 0) { +@@ -1055,8 +1053,6 @@ static struct nfs_server *nfs_alloc_serv + return NULL; + } + +- 
pnfs_init_server(server); +- + return server; + } + +@@ -1360,7 +1356,7 @@ error: + /* + * Set up an NFS4 client + */ +-static int nfs4_set_client(struct nfs_server *server, ++int nfs4_set_client(struct nfs_server *server, + const char *hostname, + const struct sockaddr *addr, + const size_t addrlen, +@@ -1403,6 +1399,7 @@ error: + dprintk("<-- nfs4_set_client() = xerror %d\n", error); + return error; + } ++EXPORT_SYMBOL(nfs4_set_client); + + + /* +diff -up linux-2.6.37.noarch/fs/nfsd/bl_com.c.orig linux-2.6.37.noarch/fs/nfsd/bl_com.c +--- linux-2.6.37.noarch/fs/nfsd/bl_com.c.orig 2011-01-28 09:43:53.347770803 -0500 ++++ linux-2.6.37.noarch/fs/nfsd/bl_com.c 2011-01-28 09:43:53.347770803 -0500 +@@ -0,0 +1,292 @@ ++#if defined(CONFIG_SPNFS_BLOCK) ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++static ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *, ++ char __user *, size_t); ++static ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); ++static void bl_pipe_destroy_msg(struct rpc_pipe_msg *); ++ ++static struct rpc_pipe_ops bl_upcall_ops = { ++ .upcall = bl_pipe_upcall, ++ .downcall = bl_pipe_downcall, ++ .destroy_msg = bl_pipe_destroy_msg, ++}; ++ ++bl_comm_t *bl_comm_global; ++ ++int ++nfsd_bl_start(void) ++{ ++ bl_comm_t *bl_comm = NULL; ++ struct path path; ++ struct nameidata nd; ++ int rc; ++ ++ dprintk("%s: starting pipe\n", __func__); ++ if (bl_comm_global) ++ return -EEXIST; ++ ++ path.mnt = rpc_get_mount(); ++ if (IS_ERR(path.mnt)) ++ return PTR_ERR(path.mnt); ++ ++ /* FIXME: do not abuse rpc_pipefs/nfs */ ++ rc = vfs_path_lookup(path.mnt->mnt_root, path.mnt, "/nfs", 0, &nd); ++ if (rc) ++ goto err; ++ ++ bl_comm = kzalloc(sizeof (*bl_comm), GFP_KERNEL); ++ if (!bl_comm) { ++ rc = -ENOMEM; ++ goto err; ++ } ++ ++ /* 
FIXME: rename to "spnfs_block" */ ++ bl_comm->pipe_dentry = rpc_mkpipe(nd.path.dentry, "pnfs_block", bl_comm, ++ &bl_upcall_ops, 0); ++ if (IS_ERR(bl_comm->pipe_dentry)) { ++ rc = -EPIPE; ++ goto err; ++ } ++ mutex_init(&bl_comm->lock); ++ mutex_init(&bl_comm->pipe_lock); ++ init_waitqueue_head(&bl_comm->pipe_wq); ++ ++ bl_comm_global = bl_comm; ++ return 0; ++err: ++ rpc_put_mount(); ++ kfree(bl_comm); ++ return rc; ++} ++ ++void ++nfsd_bl_stop(void) ++{ ++ bl_comm_t *c = bl_comm_global; ++ ++ dprintk("%s: stopping pipe\n", __func__); ++ if (!c) ++ return; ++ rpc_unlink(c->pipe_dentry); ++ rpc_put_mount(); ++ bl_comm_global = NULL; ++ kfree(c); ++} ++ ++static ssize_t ++bl_pipe_upcall(struct file *file, struct rpc_pipe_msg *msg, char __user *dst, ++ size_t buflen) ++{ ++ char *data = (char *)msg->data + msg->copied; ++ ssize_t mlen = msg->len - msg->copied, ++ left; ++ ++ if (mlen > buflen) ++ mlen = buflen; ++ ++ left = copy_to_user(dst, data, mlen); ++ if (left < 0) { ++ msg->errno = left; ++ return left; ++ } ++ mlen -= left; ++ msg->copied += mlen; ++ msg->errno = 0; ++ ++ return mlen; ++} ++ ++static ssize_t ++bl_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) ++{ ++ struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode); ++ bl_comm_t *bc = (bl_comm_t *)rpci->private; ++ bl_comm_msg_t *im = &bc->msg; ++ int ret; ++ bl_comm_res_t *res; ++ ++ ++ if (mlen == 0) { ++ im->msg_status = PNFS_BLOCK_FAILURE; ++ im->msg_res = NULL; ++ wake_up(&bc->pipe_wq); ++ return -EFAULT; ++ } ++ ++ if ((res = kmalloc(mlen, GFP_KERNEL)) == NULL) ++ return -ENOMEM; ++ ++ if (copy_from_user(res, src, mlen)) { ++ kfree(res); ++ return -EFAULT; ++ } ++ ++ mutex_lock(&bc->pipe_lock); ++ ++ ret = mlen; ++ im->msg_status = res->res_status; ++ im->msg_res = res; ++ ++ wake_up(&bc->pipe_wq); ++ mutex_unlock(&bc->pipe_lock); ++ return ret; ++} ++ ++static void ++bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) ++{ ++ bl_comm_msg_t *im = msg->data; ++ bl_comm_t *bc = 
container_of(im, struct bl_comm, msg); ++ ++ if (msg->errno >= 0) ++ return; ++ ++ mutex_lock(&bc->pipe_lock); ++ im->msg_status = PNFS_BLOCK_FAILURE; ++ wake_up(&bc->pipe_wq); ++ mutex_unlock(&bc->pipe_lock); ++} ++ ++int ++bl_upcall(bl_comm_t *bc, bl_comm_msg_t *upmsg, bl_comm_res_t **res) ++{ ++ struct rpc_pipe_msg msg; ++ DECLARE_WAITQUEUE(wq, current); ++ int rval = 1; ++ bl_comm_msg_t *m = &bc->msg; ++ ++ if (bc == NULL) { ++ dprintk("%s: No pNFS block daemon available\n", __func__); ++ return 1; ++ } ++ ++ mutex_lock(&bc->lock); ++ mutex_lock(&bc->pipe_lock); ++ ++ memcpy(m, upmsg, sizeof (*m)); ++ ++ memset(&msg, 0, sizeof (msg)); ++ msg.data = m; ++ msg.len = sizeof (*m); ++ ++ add_wait_queue(&bc->pipe_wq, &wq); ++ rval = rpc_queue_upcall(bc->pipe_dentry->d_inode, &msg); ++ if (rval < 0) { ++ remove_wait_queue(&bc->pipe_wq, &wq); ++ goto out; ++ } ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ mutex_unlock(&bc->pipe_lock); ++ schedule(); ++ __set_current_state(TASK_RUNNING); ++ remove_wait_queue(&bc->pipe_wq, &wq); ++ mutex_lock(&bc->pipe_lock); ++ ++ if (m->msg_status == PNFS_BLOCK_SUCCESS) { ++ *res = m->msg_res; ++ rval = 0; ++ } else ++ rval = 1; ++ ++out: ++ mutex_unlock(&bc->pipe_lock); ++ mutex_unlock(&bc->lock); ++ return rval; ++} ++ ++static ssize_t ctl_write(struct file *file, const char __user *buf, size_t len, ++ loff_t *offset) ++{ ++ int cmd, ++ rc; ++ bl_comm_t *bc = bl_comm_global; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res; ++ ++ if (copy_from_user((int *)&cmd, (int *)buf, sizeof (int))) ++ return -EFAULT; ++ switch (cmd) { ++ case PNFS_BLOCK_CTL_STOP: ++ msg.msg_type = PNFS_UPCALL_MSG_STOP; ++ (void) bl_upcall(bc, &msg, &res); ++ kfree(res); ++ nfsd_bl_stop(); ++ break; ++ ++ case PNFS_BLOCK_CTL_START: ++ rc = nfsd_bl_start(); ++ if (rc != 0) ++ return rc; ++ break; ++ ++ case PNFS_BLOCK_CTL_VERS: ++ msg.msg_type = PNFS_UPCALL_MSG_VERS; ++ msg.u.msg_vers = PNFS_UPCALL_VERS; ++ if (bl_upcall(bc, &msg, &res)) { ++ dprintk("%s: Failed 
to contact pNFS block daemon\n", ++ __func__); ++ return 0; ++ } ++ kfree(res); ++ break; ++ ++ default: ++ dprintk("%s: unknown ctl command %d\n", __func__, cmd); ++ break; ++ } ++ return len; ++} ++ ++static struct file_operations ctl_ops = { ++ .write = ctl_write, ++}; ++ ++/* ++ * bl_init_proc -- set up proc interfaces ++ * ++ * Creating a pnfs_block directory isn't really required at this point ++ * since we've only got a single node in that directory. If the need for ++ * more nodes doesn't present itself shortly this code should revert ++ * to a single top level node. McNeal 11-Aug-2008. ++ */ ++int ++bl_init_proc(void) ++{ ++ struct proc_dir_entry *e; ++ ++ e = proc_mkdir("fs/pnfs_block", NULL); ++ if (!e) ++ return -ENOMEM; ++ ++ e = create_proc_entry("fs/pnfs_block/ctl", 0, NULL); ++ if (!e) ++ return -ENOMEM; ++ e->proc_fops = &ctl_ops; ++ ++ return 0; ++} ++#endif /* CONFIG_SPNFS_BLOCK */ +diff -up linux-2.6.37.noarch/fs/nfsd/bl_ops.c.orig linux-2.6.37.noarch/fs/nfsd/bl_ops.c +--- linux-2.6.37.noarch/fs/nfsd/bl_ops.c.orig 2011-01-28 09:43:53.349770555 -0500 ++++ linux-2.6.37.noarch/fs/nfsd/bl_ops.c 2011-01-28 09:43:53.349770555 -0500 +@@ -0,0 +1,1672 @@ ++/* ++ * bl_ops.c ++ * spNFS ++ * ++ * Created by Rick McNeal on 4/1/08. ++ * Copyright 2008 __MyCompanyName__. All rights reserved. ++ * ++ */ ++ ++/* ++ * Block layout operations. ++ * ++ * These functions, with the exception of pnfs_block_enabled, are assigned to ++ * the super block s_export_op structure. ++ */ ++#if defined(CONFIG_SPNFS_BLOCK) ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "pnfsd.h" ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) ++ ++#define BL_LAYOUT_HASH_BITS 4 ++#define BL_LAYOUT_HASH_SIZE (1 << BL_LAYOUT_HASH_BITS) ++#define BL_LAYOUT_HASH_MASK (BL_LAYOUT_HASH_SIZE - 1) ++#define BL_LIST_REQ (sizeof (struct dm_ioctl) + 256) ++ ++#define bl_layout_hashval(id) \ ++ ((id) & BL_LAYOUT_HASH_MASK) ++ ++#define BLL_F_END(p) ((p)->bll_foff + (p)->bll_len) ++#define BLL_S_END(p) ((p)->bll_soff + (p)->bll_len) ++#define _2SECTS(v) ((v) >> 9) ++ ++#ifndef READ32 ++#define READ32(x) (x) = ntohl(*p++) ++#define READ64(x) do { \ ++(x) = (u64)ntohl(*p++) << 32; \ ++(x) |= ntohl(*p++); \ ++} while (0) ++#endif ++ ++ ++typedef enum {True, False} boolean_t; ++/* ---- block layoutget and commit structure ---- */ ++typedef struct bl_layout_rec { ++ struct list_head blr_hash, ++ blr_layouts; ++ dev_t blr_rdev; ++ struct inode *blr_inode; ++ int blr_recalled; // debug ++ u64 blr_orig_size, ++ blr_commit_size, ++ blr_ext_size; ++ spinlock_t blr_lock; // Protects blr_layouts ++} bl_layout_rec_t; ++ ++static struct list_head layout_hash; ++static struct list_head layout_hashtbl[BL_LAYOUT_HASH_SIZE]; ++static spinlock_t layout_hashtbl_lock; ++ ++/* ---- prototypes ---- */ ++static boolean_t device_slice(dev_t devid); ++static boolean_t device_dm(dev_t devid); ++static boolean_t layout_inode_add(struct inode *i, bl_layout_rec_t **); ++static bl_layout_rec_t *layout_inode_find(struct inode *i); ++static void layout_inode_del(struct inode *i); ++static char *map_state2name(enum pnfs_block_extent_state4 s); ++static pnfs_blocklayout_devinfo_t *bld_alloc(struct list_head *volume, int type); ++static void bld_free(pnfs_blocklayout_devinfo_t *bld); ++static pnfs_blocklayout_devinfo_t *bld_simple(struct list_head *volumes, ++ dev_t devid, int local_index); ++static pnfs_blocklayout_devinfo_t *bld_slice(struct list_head *volumes, ++ dev_t devid, int my_loc, int idx); ++static int layout_cache_fill_from(bl_layout_rec_t *r, struct list_head *h, ++ struct nfsd4_layout_seg *seg); ++struct list_head 
*layout_cache_iter(bl_layout_rec_t *r, ++ struct list_head *bl_possible, struct nfsd4_layout_seg *seg); ++static void layout_cache_merge(bl_layout_rec_t *r, struct list_head *h); ++static int layout_cache_update(bl_layout_rec_t *r, struct list_head *h); ++static void layout_cache_del(bl_layout_rec_t *r, const struct nfsd4_layout_seg *seg); ++static void print_bll(pnfs_blocklayout_layout_t *b, char *); ++static inline boolean_t layout_cache_fill_from_list(bl_layout_rec_t *r, ++ struct list_head *h, struct nfsd4_layout_seg *seg); ++static inline void bll_collapse(bl_layout_rec_t *r, ++ pnfs_blocklayout_layout_t *c); ++static pnfs_blocklayout_layout_t *bll_alloc(u64 offset, u64 len, ++ enum bl_cache_state state, struct list_head *h); ++static pnfs_blocklayout_layout_t *bll_alloc_dup(pnfs_blocklayout_layout_t *b, ++ enum bl_cache_state c, struct list_head *h); ++static inline boolean_t layout_conflict(pnfs_blocklayout_layout_t *b, u32 iomode, ++ enum pnfs_block_extent_state4 *s); ++static void extents_setup(struct fiemap_extent_info *fei); ++static void extents_count(struct fiemap_extent_info *fei, struct inode *i, ++ u64 foff, u64 len); ++static boolean_t extents_get(struct fiemap_extent_info *fei, struct inode *i, ++ u64 foff, u64 len); ++static boolean_t extents_process(struct fiemap_extent_info *fei, ++ struct list_head *bl_candidates, struct nfsd4_layout_seg *, dev_t dev, ++ pnfs_blocklayout_layout_t *b); ++static void extents_cleanup(struct fiemap_extent_info *fei); ++ ++void ++nfsd_bl_init(void) ++{ ++ int i; ++ dprintk("%s loaded\n", __func__); ++ ++ spin_lock_init(&layout_hashtbl_lock); ++ INIT_LIST_HEAD(&layout_hash); ++ for (i = 0; i < BL_LAYOUT_HASH_SIZE; i++) ++ INIT_LIST_HEAD(&layout_hashtbl[i]); ++ bl_init_proc(); ++} ++ ++/* ++ * pnfs_block_enabled -- check to see if this file system should be export as ++ * block pnfs ++ */ ++int ++pnfs_block_enabled(struct inode *inode, int ex_flags) ++{ ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res = NULL; ++ static 
int bl_comm_once = 0; ++ ++ dprintk("--> %s\n", __func__); ++ /* ++ * FIXME: Figure out method to determine if this file system should ++ * be exported. The following areas need to be checked. ++ * (1) Validate that this file system was exported as a pNFS ++ * block-layout ++ * (2) Has there been successful communication with the ++ * volume daemon? ++ */ ++ /* Check #1 */ ++#ifdef notyet ++ if (!(ex_flags & NFSEXP_PNFS_BLOCK)) { ++ dprintk("%s: pnfs_block not set in export\n", __func__); ++ return 0; ++ } ++#endif ++ ++ /* Check #1 */ ++ if (!bl_comm_once) { ++ msg.msg_type = PNFS_UPCALL_MSG_VERS; ++ msg.u.msg_vers = PNFS_UPCALL_VERS; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("%s: Failed to contact pNFS block daemon\n", ++ __func__); ++ return 0; ++ } ++ if (msg.u.msg_vers != res->u.vers) { ++ dprintk("%s: vers mismatch, kernel != daemon\n", ++ __func__); ++ kfree(res); ++ return 0; ++ } ++ } ++ bl_comm_once = 1; ++ ++ kfree(res); ++ ++ dprintk("<-- %s okay\n", __func__); ++ return 1; ++} ++ ++int ++bl_layout_type(struct super_block *sb) ++{ ++ return LAYOUT_BLOCK_VOLUME; ++} ++ ++int ++bl_getdeviceiter(struct super_block *sb, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *res) ++{ ++ res->gd_eof = 1; ++ if (res->gd_cookie) ++ return -ENOENT; ++ res->gd_devid = sb->s_dev; ++ res->gd_verf = 1; ++ res->gd_cookie = 1; ++ return 0; ++} ++ ++static int ++bl_getdeviceinfo_slice(struct super_block *sb, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ pnfs_blocklayout_devinfo_t *bld_slice_p, ++ *bld_simple_p, ++ *bld; ++ int status = -EIO, ++ location = 0; ++ struct list_head volumes; ++ ++ dprintk("--> %s\n", __func__); ++ INIT_LIST_HEAD(&volumes); ++ ++ bld_simple_p = bld_simple(&volumes, devid->devid, ++ location++); ++ if (!bld_simple_p) ++ goto out; ++ bld_slice_p = bld_slice(&volumes, devid->devid, location++, ++ bld_simple_p->bld_index_loc); ++ ++ if (!bld_slice_p) ++ goto out; ++ ++ status = 
blocklayout_encode_devinfo(xdr, &volumes); ++ ++out: ++ while (!list_empty(&volumes)) { ++ bld = list_entry(volumes.next, pnfs_blocklayout_devinfo_t, ++ bld_list); ++ if (bld->bld_type == PNFS_BLOCK_VOLUME_SIMPLE) ++ kfree(bld->u.simple.bld_sig); ++ bld_free(bld); ++ } ++ ++ dprintk("<-- %s (rval %d)\n", __func__, status); ++ return status; ++} ++ ++static int ++bl_getdeviceinfo_dm(struct super_block *sb, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ pnfs_blocklayout_devinfo_t *bld = NULL; ++ int status = -EIO, // default to error ++ i, ++ location = 0; ++ struct list_head volumes; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res; ++ ++ dprintk("--> %s\n", __func__); ++ INIT_LIST_HEAD(&volumes); ++ ++ msg.msg_type = PNFS_UPCALL_MSG_DMGET; ++ msg.u.msg_dev = devid->devid; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("%s: upcall for DMGET failed\n", __func__); ++ goto out; ++ } ++ ++ /* ++ * Don't use bld_alloc() here. If used this will be the first volume ++ * type added to the list whereas the protocol requires it to be the ++ * last. 
++ */ ++ bld = kmalloc(sizeof (*bld), GFP_KERNEL); ++ if (!bld) ++ goto out; ++ memset(bld, 0, sizeof (*bld)); ++ bld->bld_type = PNFS_BLOCK_VOLUME_STRIPE; ++ bld->u.stripe.bld_stripes = res->u.stripe.num_stripes; ++ bld->u.stripe.bld_chunk_size = res->u.stripe.stripe_size * 512LL; ++ dprintk("%s: stripes %d, chunk_size %Lu\n", __func__, ++ bld->u.stripe.bld_stripes, bld->u.stripe.bld_chunk_size / 512LL); ++ ++ bld->u.stripe.bld_stripe_indexs = kmalloc(bld->u.stripe.bld_stripes * ++ sizeof (int), GFP_KERNEL); ++ if (!bld->u.stripe.bld_stripe_indexs) ++ goto out; ++ ++ for (i = 0; i < bld->u.stripe.bld_stripes; i++) { ++ dev_t dev; ++ pnfs_blocklayout_devinfo_t *bldp; ++ ++ dev = MKDEV(res->u.stripe.devs[i].major, ++ res->u.stripe.devs[i].minor); ++ if (dev == 0) ++ goto out; ++ ++ bldp = bld_simple(&volumes, dev, location++); ++ if (!bldp) { ++ dprintk("%s: bld_simple failed\n", __func__); ++ goto out; ++ } ++ bldp = bld_slice(&volumes, dev, location++, bldp->bld_index_loc); ++ ++ if (!bldp) { ++ dprintk("%s: bld_slice failed\n", __func__); ++ goto out; ++ } ++ bld->u.stripe.bld_stripe_indexs[i] = bldp->bld_index_loc; ++ ++ } ++ list_add_tail(&bld->bld_list, &volumes); ++ status = blocklayout_encode_devinfo(xdr, &volumes); ++ ++out: ++ while (!list_empty(&volumes)) { ++ bld = list_entry(volumes.next, pnfs_blocklayout_devinfo_t, ++ bld_list); ++ switch (bld->bld_type) { ++ case PNFS_BLOCK_VOLUME_SLICE: ++ case PNFS_BLOCK_VOLUME_CONCAT: ++ // No memory to release for these ++ break; ++ case PNFS_BLOCK_VOLUME_SIMPLE: ++ kfree(bld->u.simple.bld_sig); ++ break; ++ case PNFS_BLOCK_VOLUME_STRIPE: ++ kfree(bld->u.stripe.bld_stripe_indexs); ++ break; ++ } ++ bld_free(bld); ++ } ++ kfree(res); ++ dprintk("<-- %s (rval %d)\n", __func__, status); ++ return status; ++} ++ ++/* ++ * bl_getdeviceinfo -- determine device tree for requested devid ++ */ ++int ++bl_getdeviceinfo(struct super_block *sb, struct exp_xdr_stream *xdr, ++ u32 layout_type, ++ const struct 
nfsd4_pnfs_deviceid *devid) ++{ ++ if (device_slice(devid->devid) == True) ++ return bl_getdeviceinfo_slice(sb, xdr, devid); ++ else if (device_dm(devid->devid) == True) ++ return bl_getdeviceinfo_dm(sb, xdr, devid); ++ return -EINVAL; ++} ++ ++enum nfsstat4 ++bl_layoutget(struct inode *i, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *arg, ++ struct nfsd4_pnfs_layoutget_res *res) ++{ ++ pnfs_blocklayout_layout_t *b; ++ bl_layout_rec_t *r; ++ struct list_head bl_possible, ++ *bl_candidates = NULL; ++ boolean_t del_on_error = False; ++ int adj; ++ enum nfsstat4 nfserr = NFS4_OK; ++ ++ dprintk("--> %s (inode=[0x%x:%lu], offset=%Lu, len=%Lu, iomode=%d)\n", ++ __func__, i->i_sb->s_dev, i->i_ino, _2SECTS(res->lg_seg.offset), ++ _2SECTS(res->lg_seg.length), res->lg_seg.iomode); ++ ++ if (res->lg_seg.length == 0) { ++ printk("%s: request length of 0, error condition\n", __func__); ++ return NFS4ERR_BADLAYOUT; ++ } ++ ++ /* ++ * Adjust the length as required per spec. ++ * - First case is were the length is set to (u64)-1. Cheap means to ++ * define the end of the file. ++ * - Second case is were the I/O mode is read-only, but the request is ++ * past the end of the file so the request needs to be trimed. ++ */ ++ if ((res->lg_seg.length == NFS4_MAX_UINT64) || ++ (((res->lg_seg.offset + res->lg_seg.length) > i->i_size) && ++ (res->lg_seg.iomode == IOMODE_READ))) ++ res->lg_seg.length = i->i_size - res->lg_seg.offset; ++ ++ adj = (res->lg_seg.offset & 511) ? 
res->lg_seg.offset & 511 : 0; ++ res->lg_seg.offset -= adj; ++ res->lg_seg.length = (res->lg_seg.length + adj + 511) & ~511; ++ ++ if (res->lg_seg.iomode != IOMODE_READ) ++ if (i->i_fop->fallocate(i, FALLOC_FL_KEEP_SIZE, ++ res->lg_seg.offset, res->lg_seg.length)) ++ return NFS4ERR_IO; ++ ++ INIT_LIST_HEAD(&bl_possible); ++ ++ if ((r = layout_inode_find(i)) == NULL) { ++ if (layout_inode_add(i, &r) == False) { ++ printk("%s: layout_inode_add failed\n", __func__); ++ return NFS4ERR_IO; ++ } ++ del_on_error = True; ++ } ++ BUG_ON(!r); ++ ++ spin_lock(&r->blr_lock); ++ ++ if (layout_cache_fill_from(r, &bl_possible, &res->lg_seg)) { ++ /* ++ * This will send LAYOUTTRYAGAIN error to the client. ++ */ ++ dprintk("%s: layout_cache_fill_from() failed\n", __func__); ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ ++ res->lg_return_on_close = 1; ++ res->lg_seg.length = 0; ++ ++ bl_candidates = layout_cache_iter(r, &bl_possible, &res->lg_seg); ++ if (!bl_candidates) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ ++ layout_cache_merge(r, bl_candidates); ++ if (layout_cache_update(r, bl_candidates)) { ++ /* ---- Failed to allocate memory. 
---- */ ++ dprintk("%s: layout_cache_update() failed\n", __func__); ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ ++ nfserr = blocklayout_encode_layout(xdr, bl_candidates); ++ if (nfserr) ++ dprintk("%s: layoutget xdr routine failed\n", __func__); ++ ++layoutget_cleanup: ++ if (bl_candidates) { ++ while (!list_empty(bl_candidates)) { ++ b = list_entry(bl_candidates->next, ++ struct pnfs_blocklayout_layout, bll_list); ++ list_del(&b->bll_list); ++ kfree(b); ++ } ++ } ++ ++ spin_unlock(&r->blr_lock); ++ if (unlikely(nfserr)) { ++ if (del_on_error == True) ++ layout_inode_del(i); ++ res->lg_seg.length = 0; ++ res->lg_seg.offset = 0; ++ } ++ ++ dprintk("<-- %s (rval %u)\n", __func__, nfserr); ++ return nfserr; ++} ++ ++/* ++ * bl_layoutcommit -- commit changes, especially size, to file systemj ++ * ++ * Currently this routine isn't called and everything is handled within ++ * nfsd4_layoutcommit(). By not calling this routine the server doesn't ++ * handle a partial return, a set of extents, of the layout. The extents ++ * are decoded here, but nothing is done with them. If this routine is ++ * be called the interface must change to pass the 'dentry' pointer such ++ * that notify_change() can be called. 
++ */ ++int ++bl_layoutcommit(struct inode *i, ++ const struct nfsd4_pnfs_layoutcommit_arg *args, ++ struct nfsd4_pnfs_layoutcommit_res *res) ++{ ++ bl_layout_rec_t *r; ++ int status = 0; ++ u64 lw_plus; ++ ++ dprintk("--> %s (ino [0x%x:%lu])\n", __func__, i->i_sb->s_dev, i->i_ino); ++ r = layout_inode_find(i); ++ if (r) { ++ lw_plus = args->lc_last_wr + 1; ++ if (args->lc_newoffset) { ++ dprintk(" lc_last_wr %Lu\n", lw_plus); ++ if (r->blr_orig_size < lw_plus) { ++ r->blr_orig_size = lw_plus; ++ res->lc_size_chg = 1; ++ res->lc_newsize = lw_plus; ++ } ++ } ++ ++ if (args->lc_up_len) { ++ int extents, ++ i; ++ struct pnfs_blocklayout_layout *b; ++ __be32 *p = args->lc_up_layout; ++ ++ /* ++ * Client is returning a set of extents which ++ * should/could be used to update the file system. ++ * See section 2.3.2 in draft-ietf-nfsv4-pnfs-block-08 ++ */ ++ READ32(extents); ++ dprintk(" Client returning %d extents: data size %d\n", ++ extents, args->lc_up_len); ++ b = kmalloc(sizeof (struct pnfs_blocklayout_layout) * ++ extents, GFP_KERNEL); ++ if (b) { ++ for (i = 0; i < extents; i++) { ++ READ64(b[i].bll_vol_id.sbid); ++ READ64(b[i].bll_vol_id.devid); ++ READ64(b[i].bll_foff); ++ READ64(b[i].bll_len); ++ READ64(b[i].bll_soff); ++ READ32(b[i].bll_es); ++ dprintk(" %d: foff %Lu, len %Lu, soff %Lu " ++ "state %s\n", ++ i, _2SECTS(b[i].bll_foff), ++ _2SECTS(b[i].bll_len), ++ _2SECTS(b[i].bll_soff), ++ map_state2name(b[i].bll_es)); ++ } ++ kfree(b); ++ } else { ++ status = -ENOMEM; ++ } ++ } ++ } else ++ dprintk("%s: Unexpected commit to inode %p\n", __func__, i); ++ ++ dprintk("<-- %s (rval %d)\n", __func__, status); ++ return status; ++} ++ ++int ++bl_layoutreturn(struct inode *i, ++ const struct nfsd4_pnfs_layoutreturn_arg *args) ++{ ++ int status = 0; ++ bl_layout_rec_t *r; ++ ++ dprintk("--> %s (ino [0x%x:%lu])\n", __func__, i->i_sb->s_dev, i->i_ino); ++ ++ r = layout_inode_find(i); ++ if (r) { ++ spin_lock(&r->blr_lock); ++ layout_cache_del(r, &args->lr_seg); ++ 
spin_unlock(&r->blr_lock); ++ dprintk(" ext_size %Lu, i_size %Lu, orig_size %Lu\n", ++ r->blr_ext_size, i->i_size, r->blr_orig_size); ++ } ++ ++ layout_inode_del(i); ++ dprintk("<-- %s (rval %d)\n", __func__, status); ++ return status; ++} ++ ++int ++bl_layoutrecall(struct inode *inode, int type, u64 offset, u64 len) ++{ ++ struct super_block *sb; ++ struct nfsd4_pnfs_cb_layout lr; ++ bl_layout_rec_t *r; ++ pnfs_blocklayout_layout_t *b; ++ u64 adj; ++ ++ dprintk("--> %s\n", __func__); ++ BUG_ON(!len); ++ switch (type) { ++ case RETURN_FILE: ++ sb = inode->i_sb; ++ dprintk(" recalling layout [0x%x:%lu], %Lu:%Lu\n", ++ inode->i_sb->s_dev, inode->i_ino, ++ _2SECTS(offset), _2SECTS(len)); ++ break; ++ case RETURN_FSID: ++ sb = inode->i_sb; ++ dprintk("%s: recalling layout for fsid x (unimplemented)\n", ++ __func__); ++ return 0; ++ case RETURN_ALL: ++ /* ++ * XXX figure out how to get a sb since there's no ++ * inode ptr ++ */ ++ dprintk("%s: recalling all layouts (unimplemented)\n", ++ __func__); ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ ++restart: ++ r = layout_inode_find(inode); ++ if (r && len && !r->blr_recalled) { ++ spin_lock(&r->blr_lock); ++ list_for_each_entry(b, &r->blr_layouts, bll_list) { ++ if (!r->blr_recalled && !b->bll_recalled && ++ (offset >= b->bll_foff) && (offset < BLL_F_END(b))) { ++ b->bll_recalled = 1; ++ lr.cbl_recall_type = type; ++ lr.cbl_seg.layout_type = LAYOUT_BLOCK_VOLUME; ++ lr.cbl_seg.clientid = 0; ++ lr.cbl_seg.offset = 0; ++ lr.cbl_seg.length = NFS4_MAX_UINT64; ++ r->blr_recalled = 1; ++ dprintk(" FULL LAYOUTRECALL\n"); ++ lr.cbl_seg.iomode = IOMODE_ANY; ++ ++ /* ++ * Currently there are only two cases where the ++ * layout is being returned. ++ * (1) Someone is issuing a NFS_WRITE operation ++ * to this layout. ++ * (2) The file has been truncated which means ++ * the layout is immediately made invalid. ++ * In both cases the client must write any ++ * uncommitted modifications to the server via ++ * NFS_WRITE. 
++ */ ++ lr.cbl_layoutchanged = 1; ++ ++ /* ++ * Need to drop the lock because we'll get a ++ * layoutreturn which will block waiting for ++ * the lock. The request will come in on the ++ * same thread which will cause a deadlock. ++ */ ++ spin_unlock(&r->blr_lock); ++ nfsd_layout_recall_cb(sb, inode, &lr); ++ adj = MIN(b->bll_len - (offset - b->bll_foff), ++ len); ++ offset += adj; ++ len -= adj; ++ if (!len) { ++ spin_lock(&r->blr_lock); ++ break; ++ } ++ /* ++ * Since layoutreturn will have been called we ++ * can't assume blr_layouts is still valid, ++ * so restart. ++ */ ++ goto restart; ++ } ++ } ++ spin_unlock(&r->blr_lock); ++ } ++ ++ dprintk("<-- %s\n", __func__); ++ return 0; ++} ++ ++/* ++ * []------------------------------------------------------------------[] ++ * | Support functions from here on down. | ++ * []------------------------------------------------------------------[] ++ */ ++ ++/* ++ * bld_simple -- given a dev_t build a simple volume structure ++ * ++ * Simple volume contains the device signature and offset to that data in ++ * the storage volume. 
++ */ ++static pnfs_blocklayout_devinfo_t * ++bld_simple(struct list_head *volumes, dev_t devid, int local_index) ++{ ++ pnfs_blocklayout_devinfo_t *bld = NULL; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res = NULL; ++ ++ msg.msg_type = PNFS_UPCALL_MSG_GETSIG; ++ msg.u.msg_dev = devid; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("%s: Failed to get signature information\n", __func__); ++ goto error; ++ } ++ ++ bld = bld_alloc(volumes, PNFS_BLOCK_VOLUME_SIMPLE); ++ if (!bld) ++ return NULL; ++ ++ bld->u.simple.bld_offset = (res->u.sig.sector * 512LL) + res->u.sig.offset; ++ bld->u.simple.bld_sig_len = res->u.sig.len; ++ bld->u.simple.bld_sig = kmalloc(res->u.sig.len, GFP_KERNEL); ++ if (!bld->u.simple.bld_sig) ++ goto error; ++ ++ memcpy(bld->u.simple.bld_sig, res->u.sig.sig, res->u.sig.len); ++ kfree(res); ++ return bld; ++ ++error: ++ if (bld) ++ bld_free(bld); ++ if (res) ++ kfree(res); ++ dprintk("%s: error in bld_simple\n", __func__); ++ return NULL; ++} ++ ++/* ++ * bld_slice -- given a dev_t build a slice volume structure ++ * ++ * A slice volume contains the length of the slice/partition and its offset ++ * from the beginning of the storage volume. There's also a reference to ++ * the "simple" volume which contains this slice. 
++ */ ++static pnfs_blocklayout_devinfo_t * ++bld_slice(struct list_head *volumes, dev_t devid, int my_loc, int simple_loc) ++{ ++ pnfs_blocklayout_devinfo_t *bld; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res; ++ ++ dprintk("--> %s\n", __func__); ++ bld = bld_alloc(volumes, PNFS_BLOCK_VOLUME_SLICE); ++ if (!bld) ++ return NULL; ++ ++ msg.msg_type = PNFS_UPCALL_MSG_GETSLICE; ++ msg.u.msg_dev = devid; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("Upcall to get slice info failed\n"); ++ bld_free(bld); ++ return NULL; ++ } ++ ++ bld->bld_devid.devid = devid; ++ bld->bld_index_loc = my_loc; ++ bld->u.slice.bld_start = res->u.slice.start * 512LL; ++ bld->u.slice.bld_len = res->u.slice.length * 512LL; ++ bld->u.slice.bld_index = simple_loc; ++ ++ dprintk("%s: start %Lu, len %Lu\n", __func__, ++ bld->u.slice.bld_start / 512LL, bld->u.slice.bld_len / 512LL); ++ ++ kfree(res); ++ dprintk("<-- %s (rval %p)\n", __func__, bld); ++ return bld; ++} ++ ++static int ++layout_cache_fill_from(bl_layout_rec_t *r, struct list_head *h, ++ struct nfsd4_layout_seg *seg) ++{ ++ pnfs_blocklayout_layout_t *n; ++ ++ dprintk("--> %s\n", __func__); ++ ++ if (!list_empty(&r->blr_layouts)) ++ if (layout_cache_fill_from_list(r, h, seg) == False) ++ return -EIO; ++ ++ /* ++ * This deals with two conditions. ++ * (1) When blr_layouts is empty we need to create the first entry ++ * (2) When the range requested falls past the end of any current ++ * layout the residual must be taken care of. 
++ */ ++ if (seg->length) { ++ n = bll_alloc(seg->offset, seg->length, BLOCK_LAYOUT_NEW, h); ++ if (!n) ++ return -ENOMEM; ++ dprintk(" remaining at %Lu, len %Lu\n", _2SECTS(n->bll_foff), ++ _2SECTS(n->bll_len)); ++ } ++ ++ dprintk("<-- %s\n", __func__); ++ return 0; ++} ++ ++struct list_head * ++layout_cache_iter(bl_layout_rec_t *r, struct list_head *bl_possible, ++ struct nfsd4_layout_seg *seg) ++{ ++ pnfs_blocklayout_layout_t *b, ++ *n = NULL; ++ struct list_head *bl_candidates = NULL; ++ struct fiemap_extent_info fei; ++ struct inode *i; ++ dev_t dev; ++ ++ dev = r->blr_rdev; ++ i = r->blr_inode; ++ ++ dprintk("--> %s\n", __func__); ++ bl_candidates = kmalloc(sizeof (*bl_candidates), GFP_KERNEL); ++ if (!bl_candidates) ++ return NULL; ++ INIT_LIST_HEAD(bl_candidates); ++ extents_setup(&fei); ++ ++ list_for_each_entry(b, bl_possible, bll_list) { ++ if (b->bll_cache_state == BLOCK_LAYOUT_NEW) { ++ ++ extents_count(&fei, i, b->bll_foff, b->bll_len); ++ if (fei.fi_extents_mapped) { ++ ++ /* ++ * Common case here. Got a range which has ++ * extents. Now get those extents and process ++ * them into pNFS extents. ++ */ ++ if (extents_get(&fei, i, b->bll_foff, ++ b->bll_len) == False) ++ goto cleanup; ++ if (extents_process(&fei, bl_candidates, ++ seg, dev, b) == False) ++ goto cleanup; ++ extents_cleanup(&fei); ++ ++ } else if (seg->iomode == IOMODE_READ) { ++ ++ /* ++ * Found a hole in a file while reading. No ++ * problem, just create a pNFS extent for the ++ * range and let the client know there's no ++ * backing store. ++ */ ++ n = bll_alloc(b->bll_foff, b->bll_len, ++ BLOCK_LAYOUT_NEW, bl_candidates); ++ n->bll_es = PNFS_BLOCK_NONE_DATA; ++ n->bll_vol_id.sbid = 0; ++ n->bll_vol_id.devid = dev; ++ seg->length += b->bll_len; ++ } else { ++ ++ /* ++ * There's a problem here. Since the iomode ++ * is read/write fallocate should have allocated ++ * any necessary storage for the given range. 
++ */ ++ dprintk(" Extent count for RW is 0\n"); ++ goto cleanup; ++ } ++ ++ } else { ++ n = bll_alloc_dup(b, b->bll_cache_state, bl_candidates); ++ seg->length += n->bll_len; ++ } ++ ++ if (r->blr_ext_size < (b->bll_foff + b->bll_len)) ++ r->blr_ext_size = b->bll_foff + b->bll_len; ++ } ++ ++ while (!list_empty(bl_possible)) { ++ b = list_entry(bl_possible->next, ++ struct pnfs_blocklayout_layout, bll_list); ++ list_del(&b->bll_list); ++ kfree(b); ++ } ++ ++ b = list_first_entry(bl_candidates, struct pnfs_blocklayout_layout, ++ bll_list); ++ seg->offset = b->bll_foff; ++ dprintk("<-- %s okay\n", __func__); ++ return bl_candidates; ++ ++cleanup: ++ extents_cleanup(&fei); ++ if (bl_candidates) ++ kfree(bl_candidates); ++ dprintk("<-- %s, error occurred\n", __func__); ++ return NULL; ++} ++ ++/* ++ * layout_cache_merge -- collapse layouts which make up a contiguous range. ++ */ ++static void ++layout_cache_merge(bl_layout_rec_t *r, struct list_head *h) ++{ ++ pnfs_blocklayout_layout_t *b, ++ *p; ++ ++ dprintk("--> %s\n", __func__); ++restart: ++ p = NULL; ++ list_for_each_entry(b, h, bll_list) { ++ if (p && (BLL_S_END(p) == b->bll_soff) && ++ (p->bll_es == b->bll_es) && ++ (b->bll_es != PNFS_BLOCK_NONE_DATA)) { ++ /* ++ * We've got a condidate. 
++ */ ++#ifdef too_verbose ++ dprintk(" merge %Lu(f):%Lu(l):%Lu(s) into %Lu(f):%Lu(l):%Lu(s)\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ _2SECTS(b->bll_soff), ++ _2SECTS(p->bll_foff), _2SECTS(p->bll_len), ++ _2SECTS(b->bll_soff)); ++#endif ++ ++ if (p->bll_cache_state == BLOCK_LAYOUT_CACHE) ++ p->bll_cache_state = BLOCK_LAYOUT_UPDATE; ++ p->bll_len += b->bll_len; ++ list_del(&b->bll_list); ++ kfree(b); ++ goto restart; ++ } else if (p && (BLL_F_END(p) == b->bll_foff) && ++ (p->bll_es == b->bll_es) && ++ (b->bll_es == PNFS_BLOCK_NONE_DATA)) { ++ p->bll_len += b->bll_len; ++ list_del(&b->bll_list); ++ kfree(b); ++ goto restart; ++ } else ++ p = b; ++ } ++ dprintk("<-- %s\n", __func__); ++} ++ ++static int ++layout_cache_update(bl_layout_rec_t *r, struct list_head *h) ++{ ++ pnfs_blocklayout_layout_t *b, ++ *c, ++ *n; ++ boolean_t status = 0; ++ ++ dprintk("--> %s\n", __func__); ++ if (list_empty(&r->blr_layouts)) { ++ /* ---- Just add entries and return ---- */ ++ dprintk(" cache empty for inode 0x%x:%ld\n", r->blr_rdev, ++ r->blr_inode->i_ino); ++ list_for_each_entry(b, h, bll_list) { ++ c = bll_alloc_dup(b, BLOCK_LAYOUT_CACHE, ++ &r->blr_layouts); ++ if (!c) { ++ status = -ENOMEM; ++ break; ++ } ++ dprintk(" adding %Lu(f):%Lu(l):%Lu(s):%d\n", ++ _2SECTS(c->bll_foff), _2SECTS(c->bll_len), ++ _2SECTS(c->bll_soff), c->bll_es); ++ } ++ return status; ++ } ++ ++ list_for_each_entry(b, h, bll_list) { ++ BUG_ON(!b->bll_vol_id.devid); ++ if (b->bll_cache_state == BLOCK_LAYOUT_UPDATE) { ++ boolean_t found = False; ++ list_for_each_entry(c, &r->blr_layouts, bll_list) { ++ if ((b->bll_soff >= c->bll_soff) && ++ (b->bll_soff < BLL_S_END(c)) && ++ (b->bll_es != PNFS_BLOCK_NONE_DATA)) { ++ u64 u; ++ ++ if ((b->bll_foff < c->bll_foff) || ++ (b->bll_foff > BLL_F_END(c))) ++ BUG(); ++ ++ u = BLL_S_END(b) - BLL_S_END(c); ++ /* ++ * The updated cache entry has to be ++ * different than the current. ++ * Otherwise the cache state for 'b' ++ * should be BLOCK_LAYOUT_CACHE. 
++ */ ++ BUG_ON(BLL_S_END(b) < BLL_S_END(c)); ++ ++ dprintk(" " ++ "updating %Lu(f):%Lu(l):%Lu(s) to len %Lu\n", ++ _2SECTS(c->bll_foff), ++ _2SECTS(c->bll_len), ++ _2SECTS(c->bll_soff), ++ _2SECTS(c->bll_len + u)); ++ c->bll_len += u; ++ bll_collapse(r, c); ++ found = True; ++ break; ++ } ++ } ++ ++ if (found == False) { ++ dprintk(" ERROR Expected to find" ++ " %Lu(f):%Lu(l):%Lu(s), but didn't\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ _2SECTS(b->bll_soff)); ++ list_for_each_entry(c, &r->blr_layouts, bll_list) ++ print_bll(c, "Cached"); ++ BUG(); ++ } ++ } else if (b->bll_cache_state == BLOCK_LAYOUT_NEW) { ++ ++ c = list_first_entry(&r->blr_layouts, ++ struct pnfs_blocklayout_layout, bll_list); ++ if (b->bll_foff < c->bll_foff) { ++ /* ++ * Special case where new entry is before ++ * first cached entry. ++ */ ++ c = bll_alloc_dup(b, BLOCK_LAYOUT_CACHE, NULL); ++ list_add(&c->bll_list, &r->blr_layouts); ++ dprintk(" new entry at head of list at %Lu, " ++ "len %Lu\n", ++ _2SECTS(c->bll_foff), _2SECTS(c->bll_len)); ++ } else { ++ list_for_each_entry(c, &r->blr_layouts, ++ bll_list) { ++ n = list_entry(c->bll_list.next, ++ struct pnfs_blocklayout_layout, ++ bll_list); ++ /* ++ * This is ugly, but can't think of ++ * another way to examine this case. ++ * Consider the following. Need to ++ * add an entry which starts at 40 ++ * and the cache has the following ++ * entries: ++ * Start Length ++ * 10 5 ++ * 30 5 ++ * 50 5 ++ * So, need to look and see if the new ++ * entry starts after the current ++ * cache, but before the next one. ++ * There's a catch in that the next ++ * entry might not be valid as it's ++ * really just a pointer to the list ++ * head. 
++ */ ++ if (((b->bll_foff >= ++ BLL_F_END(c)) && ++ (c->bll_list.next == &r->blr_layouts)) || ++ ((b->bll_foff >= ++ BLL_F_END(c)) && ++ (b->bll_foff < n->bll_foff))) { ++ ++ n = bll_alloc_dup(b, ++ BLOCK_LAYOUT_CACHE, NULL); ++ dprintk(" adding new %Lu:%Lu" ++ " after %Lu:%Lu\n", ++ _2SECTS(n->bll_foff), ++ _2SECTS(n->bll_len), ++ _2SECTS(c->bll_foff), ++ _2SECTS(c->bll_len)); ++ list_add(&n->bll_list, ++ &c->bll_list); ++ break; ++ } ++ } ++ } ++ } ++ } ++ dprintk("<-- %s\n", __func__); ++ return status; ++} ++ ++static void ++layout_cache_del(bl_layout_rec_t *r, const struct nfsd4_layout_seg *seg_in) ++{ ++ struct pnfs_blocklayout_layout *b, ++ *n; ++ u64 len; ++ struct nfsd4_layout_seg seg = *seg_in; ++ ++ dprintk("--> %s\n", __func__); ++ if (seg.length == NFS4_MAX_UINT64) { ++ r->blr_recalled = 0; ++ dprintk(" Fast return of all layouts\n"); ++ while (!list_empty(&r->blr_layouts)) { ++ b = list_entry(r->blr_layouts.next, ++ struct pnfs_blocklayout_layout, bll_list); ++ dprintk(" foff %Lu, len %Lu, soff %Lu\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ _2SECTS(b->bll_soff)); ++ list_del(&b->bll_list); ++ kfree(b); ++ } ++ dprintk("<-- %s\n", __func__); ++ return; ++ } ++ ++restart: ++ list_for_each_entry(b, &r->blr_layouts, bll_list) { ++ if (seg.offset == b->bll_foff) { ++ /* ++ * This handle the following three cases: ++ * (1) return layout matches entire cache layout ++ * (2) return layout matches beginning portion of cache ++ * (3) return layout matches entire cache layout and ++ * into next entry. Varies from #1 in end case. 
++ */ ++ dprintk(" match on offsets, %Lu:%Lu\n", ++ _2SECTS(seg.offset), _2SECTS(seg.length)); ++ len = MIN(seg.length, b->bll_len); ++ b->bll_foff += len; ++ b->bll_soff += len; ++ b->bll_len -= len; ++ seg.length -= len; ++ seg.offset += len; ++ if (!b->bll_len) { ++ list_del(&b->bll_list); ++ kfree(b); ++ dprintk(" removing cache line\n"); ++ if (!seg.length) { ++ dprintk(" also finished\n"); ++ goto complete; ++ } ++ /* ++ * Since 'b' was freed we can't continue at the ++ * next entry which is referenced as ++ * b->bll_list.next by the list_for_each_entry ++ * macro. Need to restart the loop. ++ * TODO: Think about creating a dummy 'b' which ++ * would keep list_for_each_entry() happy. ++ */ ++ goto restart; ++ } ++ if (!seg.length) { ++ dprintk(" finished, but cache line not" ++ "empty\n"); ++ goto complete; ++ } ++ } else if ((seg.offset >= b->bll_foff) && ++ (seg.offset < BLL_F_END(b))) { ++ /* ++ * layout being returned is within this cache line. ++ */ ++ dprintk(" layout %Lu:%Lu within cache line %Lu:%Lu\n", ++ _2SECTS(seg.offset), _2SECTS(seg.length), ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len)); ++ BUG_ON(!seg.length); ++ if ((seg.offset + seg.length) >= BLL_F_END(b)) { ++ /* ++ * Layout returned starts in the middle of ++ * cache entry and just need to trim back ++ * cache to shorter length. ++ */ ++ dprintk(" trim back cache line\n"); ++ len = seg.offset - b->bll_foff; ++ seg.offset += b->bll_len - len; ++ seg.length -= b->bll_len - len; ++ b->bll_len = len; ++ if (!seg.length) ++ return; ++ } else { ++ /* ++ * Need to split current cache layout because ++ * chunk is being removed from the middle. 
++				 */
++				dprintk(" split cache line\n");
++				len = seg.offset + seg.length;
++				n = bll_alloc(len,
++				    (b->bll_foff + b->bll_len) - len,
++				    BLOCK_LAYOUT_CACHE, NULL);	/* NOTE(review): result is not NULL-checked */
++				n->bll_soff = b->bll_soff + (len - b->bll_foff);	/* len is an absolute file offset */
++				list_add(&n->bll_list, &b->bll_list);
++				b->bll_len = seg.offset - b->bll_foff;
++				return;
++			}
++		}
++	}
++complete:
++	if (list_empty(&r->blr_layouts))
++		r->blr_recalled = 0;
++	dprintk("<-- %s\n", __func__);
++}
++
++/*
++ * layout_cache_fill_from_list -- fills from cache list
++ *
++ * NOTE: This routine was only separated out from layout_cache_fill_from()
++ * to reduce the indentation level which makes the code easier to read.
++ */
++static inline boolean_t
++layout_cache_fill_from_list(bl_layout_rec_t *r, struct list_head *h,
++    struct nfsd4_layout_seg *seg)
++{
++	pnfs_blocklayout_layout_t *b,
++	    *n;
++	enum pnfs_block_extent_state4 s;
++
++	list_for_each_entry(b, &r->blr_layouts, bll_list) {
++		if (seg->offset < b->bll_foff) {
++			n = bll_alloc(seg->offset,
++			    MIN(seg->length, b->bll_foff - seg->offset),
++			    BLOCK_LAYOUT_NEW, NULL);
++			if (!n)
++				return False;
++
++			list_add(&n->bll_list, h->prev);	/* i.e. append at the tail of h */
++			dprintk(" new: %Lu:%Lu, added before %Lu:%Lu\n",
++			    _2SECTS(n->bll_foff), _2SECTS(n->bll_len),
++			    _2SECTS(b->bll_foff), _2SECTS(b->bll_len));
++			seg->offset += n->bll_len;
++			seg->length -= n->bll_len;
++			if (!seg->length)
++				break;
++		}
++
++		if ((seg->offset >= b->bll_foff) &&
++		    (seg->offset < BLL_F_END(b))) {
++			if (layout_conflict(b, seg->iomode, &s) == False) {
++				dprintk(" CONFLICT FOUND: "
++				    "%Lu(f):%Lu(l):%Lu(s) state %d, iomode %d\n",
++				    _2SECTS(b->bll_foff), _2SECTS(b->bll_len),
++				    _2SECTS(b->bll_soff), b->bll_es,
++				    seg->iomode);
++				return False;
++			}
++			n = bll_alloc(seg->offset,
++			    MIN(seg->length, BLL_F_END(b) - seg->offset),
++			    BLOCK_LAYOUT_CACHE, h);
++			if (!n)	/* must be checked before the dprintk below reads n */
++				return False;
++			dprintk(" CACHE hit: Found %Lu(f):%Lu(l): "
++			    "in %Lu(f):%Lu(l):%Lu(s):%d\n",
++			    _2SECTS(n->bll_foff), _2SECTS(n->bll_len),
++			    _2SECTS(b->bll_foff), _2SECTS(b->bll_len),
++			    _2SECTS(b->bll_soff), b->bll_es);
++
++			n->bll_soff = b->bll_soff + seg->offset - b->bll_foff;
++			n->bll_vol_id.sbid = 0;
++			n->bll_vol_id.devid = b->bll_vol_id.devid;
++			n->bll_es = s;	/* extent state as adjusted by layout_conflict() */
++			seg->offset += n->bll_len;
++			seg->length -= n->bll_len;
++			if (!seg->length)
++				break;
++		}
++	}
++	return True;
++}
++
++static u64
++bll_alloc_holey(struct list_head *bl_candidates, u64 offset, u64 length,
++    dev_t dev)
++{
++	pnfs_blocklayout_layout_t *n;
++
++	n = bll_alloc(offset, length, BLOCK_LAYOUT_NEW, bl_candidates);
++	if (!n)
++		return 0;
++	n->bll_es = PNFS_BLOCK_NONE_DATA;
++	n->bll_vol_id.sbid = 0;
++	n->bll_vol_id.devid = dev;
++
++	return n->bll_len;
++}
++
++static void
++extents_setup(struct fiemap_extent_info *fei)
++{
++	fei->fi_extents_start = NULL;
++}
++
++/*
++ * extents_count -- Determine the number of extents for a given range.
++ *
++ * No need to call set_fs() here because the function
++ * doesn't use copy_to_user() if it's only counting
++ * the number of extents needed.
++ */
++static void
++extents_count(struct fiemap_extent_info *fei, struct inode *i, u64 foff, u64 len)
++{
++	dprintk(" Need fiemap of %Ld:%Ld\n", _2SECTS(foff), _2SECTS(len));
++	fei->fi_flags = FIEMAP_FLAG_SYNC;
++	fei->fi_extents_max = 0;
++	fei->fi_extents_start = NULL;
++	fei->fi_extents_mapped = 0;
++	i->i_op->fiemap(i, fei, foff, len + (1 << i->i_sb->s_blocksize_bits) - 1);
++}
++
++/*
++ * extents_get -- Get list of extents for range
++ *
++ * extents_count() must have been called before this routine such that
++ * fi_extents_mapped is known.
++ */
++static boolean_t
++extents_get(struct fiemap_extent_info *fei, struct inode *i, u64 foff, u64 len)
++{
++	int m_space,
++	    rval;
++	struct fiemap_extent *fe;
++	mm_segment_t old_fs = get_fs();
++
++	/*
++	 * Now malloc the correct amount of space
++	 * needed. It's possible for the file to have changed
++	 * between calls which would require more space for
++	 * the extents.
If that occurs the last extent will ++ * not have FIEMAP_EXTENT_LAST set and the error will ++ * be caught in extents_process(). ++ */ ++ m_space = fei->fi_extents_mapped * sizeof (struct fiemap_extent); ++ fe = kmalloc(m_space, GFP_KERNEL); ++ if (!fe) ++ return False; ++ memset(fe, 0, m_space); ++ ++ fei->fi_extents_max = fei->fi_extents_mapped; ++ fei->fi_extents_mapped = 0; ++ fei->fi_extents_start = fe; ++ ++ set_fs(KERNEL_DS); ++ rval = i->i_op->fiemap(i, fei, foff, len + ++ (1 << i->i_sb->s_blocksize_bits) - 1); ++ set_fs(old_fs); ++ ++ if (rval || !fei->fi_extents_mapped) { ++ dprintk(" No extents. Wanted %d, got %d\n", ++ fei->fi_extents_max, fei->fi_extents_mapped); ++ kfree(fe); ++ fei->fi_extents_start = NULL; ++ return False; ++ } else ++ return True; ++} ++ ++/* ++ * extents_process -- runs through the extent returned from the file system and ++ * creates block layout entries. ++ */ ++static boolean_t ++extents_process(struct fiemap_extent_info *fei, struct list_head *bl_candidates, ++ struct nfsd4_layout_seg *seg, dev_t dev, pnfs_blocklayout_layout_t *b) ++{ ++ struct fiemap_extent *fep, ++ *fep_last = NULL; ++ int i; ++ pnfs_blocklayout_layout_t *n; ++ u64 last_end, ++ rval; ++ ++ dprintk("--> %s\n", __func__); ++ for (fep = fei->fi_extents_start, i = 0; i < fei->fi_extents_mapped; ++ i++, fep++) { ++ ++ BUG_ON(!fep->fe_physical); ++ /* ++ * Deal with corner cases of hoel-y files. ++ */ ++ if (fep_last && ((fep_last->fe_logical + fep_last->fe_length) != ++ fep->fe_logical)) { ++ ++ /* ++ * If the last extent doesn't end logically ++ * at the beginning of the current we've got ++ * hole and need to create a pNFS extent. 
++			 */
++			dprintk(" Got a hole at %Ld:%Ld \n",
++			    _2SECTS(fep_last->fe_logical),
++			    _2SECTS(fep_last->fe_length));
++			last_end = fep_last->fe_logical + fep_last->fe_length;
++			rval = bll_alloc_holey(bl_candidates, last_end,
++			    fep->fe_logical - last_end, dev);
++			if (!rval)
++				return False;
++			seg->length += rval;
++		}
++
++		n = bll_alloc(fep->fe_logical, fep->fe_length,
++		    BLOCK_LAYOUT_NEW, bl_candidates);
++		if (unlikely(n == NULL)) {
++			dprintk("%s: bll_alloc failed\n", __func__);
++			return False;
++		}
++
++		n->bll_soff = fep->fe_physical;
++		n->bll_es = seg->iomode == IOMODE_READ ?
++		    PNFS_BLOCK_READ_DATA : PNFS_BLOCK_READWRITE_DATA;
++		n->bll_vol_id.sbid = 0;
++		n->bll_vol_id.devid = dev;
++		seg->length += fep->fe_length;
++		print_bll(n, "New extent");
++		fep_last = fep;
++	}
++	dprintk("<-- %s (i=%d)\n", __func__, i);
++
++	return True;
++}
++
++static void
++extents_cleanup(struct fiemap_extent_info *fei)
++{
++	if (fei->fi_extents_start) {
++		kfree(fei->fi_extents_start);
++		fei->fi_extents_start = NULL;
++	}
++}
++
++/*
++ * device_slice -- True if devid can be opened and its gendisk has minors > 1
++ */
++static boolean_t
++device_slice(dev_t devid)
++{
++	struct block_device *bd = blkdev_get_by_dev(devid, FMODE_READ, NULL);
++	boolean_t rval = False;
++
++	if (!IS_ERR_OR_NULL(bd)) {	/* open failure is reported as ERR_PTR(), not NULL */
++		if (bd->bd_disk->minors > 1)
++			rval = True;
++		blkdev_put(bd, FMODE_READ);	/* drop the reference taken by the open above */
++	}
++	return rval;
++}
++
++/*
++ * device_dm -- check to see if device is a Device Mapper volume.
++ *
++ * Returns True for DM; False if not DM or if the upcall fails
++ */
++static boolean_t
++device_dm(dev_t devid)
++{
++	boolean_t rval = False;
++	bl_comm_msg_t msg;
++	bl_comm_res_t *res = NULL;	/* kfree() below runs even when the upcall fails */
++
++	msg.msg_type = PNFS_UPCALL_MSG_DMCHK;
++	msg.u.msg_dev = devid;
++	if (bl_upcall(bl_comm_global, &msg, &res)) {
++		dprintk("Failed upcall to check on DM status\n");
++	} else if (res->u.dm_vol) {
++		rval = True;
++		dprintk("Device is DM volume\n");
++	} else
++		dprintk("Device is not DM volume\n");
++	kfree(res);
++
++	return rval;
++}
++
++static boolean_t
++layout_inode_add(struct inode *i, bl_layout_rec_t **p)
++{
++	bl_layout_rec_t *r = NULL;
++
++	if (!i->i_op->fiemap || !i->i_fop->fallocate) {
++		printk("pNFS: file system doesn't support required fiemap or "
++		    "fallocate methods\n");
++		return False;
++	}
++
++	r = kmalloc(sizeof (*r), GFP_KERNEL);
++	if (!r)
++		goto error;
++
++	r->blr_rdev = i->i_sb->s_dev;
++	r->blr_inode = i;
++	r->blr_orig_size = i->i_size;
++	r->blr_ext_size = 0;
++	r->blr_recalled = 0;
++	INIT_LIST_HEAD(&r->blr_layouts);
++	spin_lock_init(&r->blr_lock);
++	spin_lock(&layout_hashtbl_lock);
++	list_add_tail(&r->blr_hash, &layout_hash);
++	spin_unlock(&layout_hashtbl_lock);
++	*p = r;
++	return True;
++
++error:
++	if (r)
++		kfree(r);
++	return False;
++}
++
++static bl_layout_rec_t *
++__layout_inode_find(struct inode *i)
++{
++	bl_layout_rec_t *r;
++
++	if (!list_empty(&layout_hash)) {
++		list_for_each_entry(r, &layout_hash, blr_hash) {
++			if ((r->blr_inode->i_ino == i->i_ino) &&
++			    (r->blr_rdev == i->i_sb->s_dev)) {
++				return r;
++			}
++		}
++	}
++	return NULL;
++}
++
++static bl_layout_rec_t *
++layout_inode_find(struct inode *i)
++{
++	bl_layout_rec_t *r;
++
++	spin_lock(&layout_hashtbl_lock);
++	r = __layout_inode_find(i);
++	spin_unlock(&layout_hashtbl_lock);
++
++	return r;
++}
++
++static void
++layout_inode_del(struct inode *i)
++{
++	bl_layout_rec_t *r;
++
++	spin_lock(&layout_hashtbl_lock);
++	r = __layout_inode_find(i);
++	if (r) {
++		spin_lock(&r->blr_lock);
++ if (list_empty(&r->blr_layouts)) { ++ list_del(&r->blr_hash); ++ spin_unlock(&r->blr_lock); ++ kfree(r); ++ } else { ++ spin_unlock(&r->blr_lock); ++ } ++ } else { ++ dprintk("%s: failed to find inode [0x%x:%lu] in table for delete\n", ++ __func__, i->i_sb->s_dev, i->i_ino); ++ } ++ spin_unlock(&layout_hashtbl_lock); ++} ++ ++/* ++ * map_state2name -- converts state in ascii string. ++ * ++ * Used for debug messages only. ++ */ ++static char * ++map_state2name(enum pnfs_block_extent_state4 s) ++{ ++ switch (s) { ++ case PNFS_BLOCK_READWRITE_DATA: return " RW"; ++ case PNFS_BLOCK_READ_DATA: return " RO"; ++ case PNFS_BLOCK_INVALID_DATA: return "INVALID"; ++ case PNFS_BLOCK_NONE_DATA: return " NONE"; ++ default: ++ BUG(); ++ } ++} ++ ++static pnfs_blocklayout_devinfo_t * ++bld_alloc(struct list_head *volumes, int type) ++{ ++ pnfs_blocklayout_devinfo_t *bld; ++ ++ bld = kmalloc(sizeof (*bld), GFP_KERNEL); ++ if (!bld) ++ return NULL; ++ ++ memset(bld, 0, sizeof (*bld)); ++ bld->bld_type = type; ++ list_add_tail(&bld->bld_list, volumes); ++ ++ return bld; ++} ++ ++static void ++bld_free(pnfs_blocklayout_devinfo_t *bld) ++{ ++ list_del(&bld->bld_list); ++ kfree(bld); ++} ++ ++static void ++print_bll(pnfs_blocklayout_layout_t *b, char *text) ++{ ++ dprintk(" BLL: %s\n", text); ++ dprintk(" foff %Lu, soff %Lu, len %Lu, state %s\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_soff), _2SECTS(b->bll_len), ++ map_state2name(b->bll_es)); ++} ++ ++static inline void ++bll_collapse(bl_layout_rec_t *r, pnfs_blocklayout_layout_t *c) ++{ ++ pnfs_blocklayout_layout_t *n; ++ int dbg_count = 0; ++ u64 endpoint; ++ ++ BUG_ON(c->bll_es == PNFS_BLOCK_NONE_DATA); ++ while (c->bll_list.next != &r->blr_layouts) { ++ n = list_entry(c->bll_list.next, ++ struct pnfs_blocklayout_layout, bll_list); ++ endpoint = BLL_S_END(c); ++ if ((n->bll_soff >= c->bll_soff) && ++ (n->bll_soff < endpoint)) { ++ if (endpoint < BLL_S_END(n)) { ++ /* ++ * The following is possible. 
++ * ++ * ++ * Existing: +---+ +---+ ++ * New: +-----------------------+ ++ * The client request merge entries together ++ * but didn't require picking up all of the ++ * last entry. So, we still need to delete ++ * the last entry and add the remaining space ++ * to the new entry. ++ */ ++ c->bll_len += BLL_S_END(n) - endpoint; ++ } ++ dbg_count++; ++ list_del(&n->bll_list); ++ kfree(n); ++ } else { ++ break; ++ } ++ } ++ /* ---- Debug only, remove before integration ---- */ ++ if (dbg_count) ++ dprintk(" Collapsed %d cache entries between %Lu(s) and %Lu(s)\n", ++ dbg_count, _2SECTS(c->bll_soff), _2SECTS(BLL_S_END(c))); ++} ++ ++static pnfs_blocklayout_layout_t * ++bll_alloc(u64 offset, u64 len, enum bl_cache_state state, struct list_head *h) ++{ ++ pnfs_blocklayout_layout_t *n = NULL; ++ ++ n = kmalloc(sizeof (*n), GFP_KERNEL); ++ if (n) { ++ memset(n, 0, sizeof (*n)); ++ n->bll_foff = offset; ++ n->bll_len = len; ++ n->bll_cache_state = state; ++ if (h) ++ list_add_tail(&n->bll_list, h); ++ } ++ return n; ++} ++ ++static pnfs_blocklayout_layout_t * ++bll_alloc_dup(pnfs_blocklayout_layout_t *b, enum bl_cache_state c, ++ struct list_head *h) ++{ ++ pnfs_blocklayout_layout_t *n = NULL; ++ ++ n = bll_alloc(b->bll_foff, b->bll_len, c, h); ++ if (n) { ++ n->bll_es = b->bll_es; ++ n->bll_soff = b->bll_soff; ++ n->bll_vol_id.devid = b->bll_vol_id.devid; ++ } ++ return n; ++} ++ ++static inline boolean_t ++layout_conflict(pnfs_blocklayout_layout_t *b, u32 iomode, ++ enum pnfs_block_extent_state4 *s) ++{ ++ /* ---- Normal case ---- */ ++ *s = b->bll_es; ++ ++ switch (b->bll_es) { ++ case PNFS_BLOCK_READWRITE_DATA: ++ if (iomode == IOMODE_READ) ++ *s = PNFS_BLOCK_READ_DATA; ++ /* ---- Any use is permitted. ---- */ ++ break; ++ case PNFS_BLOCK_READ_DATA: ++ /* ---- Committed as read only data. 
---- */ ++ if (iomode == IOMODE_RW) ++ return False; ++ break; ++ case PNFS_BLOCK_INVALID_DATA: ++ /* ---- Blocks have been allocated, but not initialized ---- */ ++ if (iomode == IOMODE_READ) ++ *s = PNFS_BLOCK_NONE_DATA; ++ break; ++ case PNFS_BLOCK_NONE_DATA: ++ /* ---- Hole-y file. No backing store avail. ---- */ ++ if (iomode != IOMODE_READ) ++ return False; ++ break; ++ default: ++ BUG(); ++ } ++ return True; ++} ++ ++#endif /* CONFIG_SPNFS_BLOCK */ +diff -up linux-2.6.37.noarch/fs/nfsd/export.c.orig linux-2.6.37.noarch/fs/nfsd/export.c +--- linux-2.6.37.noarch/fs/nfsd/export.c.orig 2011-01-28 09:37:32.554979531 -0500 ++++ linux-2.6.37.noarch/fs/nfsd/export.c 2011-01-28 09:43:53.350770434 -0500 +@@ -16,11 +16,19 @@ + #include + #include + ++#include ++#if defined(CONFIG_SPNFS) ++#include ++#if defined(CONFIG_SPNFS_BLOCK) ++#include ++#endif ++#endif + #include + #include + + #include "nfsd.h" + #include "nfsfh.h" ++#include "pnfsd.h" + + #define NFSDDBG_FACILITY NFSDDBG_EXPORT + +@@ -348,10 +356,84 @@ static int svc_export_upcall(struct cach + return sunrpc_cache_pipe_upcall(cd, h, svc_export_request); + } + ++#if defined(CONFIG_PNFSD) ++static struct pnfsd_cb_operations pnfsd_cb_op = { ++ .cb_layout_recall = nfsd_layout_recall_cb, ++ .cb_device_notify = nfsd_device_notify_cb, ++ ++ .cb_get_state = nfs4_pnfs_cb_get_state, ++ .cb_change_state = nfs4_pnfs_cb_change_state, ++}; ++ ++#if defined(CONFIG_SPNFS) ++static struct pnfs_export_operations spnfs_export_ops = { ++ .layout_type = spnfs_layout_type, ++ .get_device_info = spnfs_getdeviceinfo, ++ .get_device_iter = spnfs_getdeviceiter, ++ .layout_get = spnfs_layoutget, ++ .layout_return = spnfs_layoutreturn, ++}; ++ ++static struct pnfs_export_operations spnfs_ds_export_ops = { ++ .get_state = spnfs_get_state, ++}; ++ ++#if defined(CONFIG_SPNFS_BLOCK) ++static struct pnfs_export_operations bl_export_ops = { ++ .layout_type = bl_layout_type, ++ .get_device_info = bl_getdeviceinfo, ++ .get_device_iter = 
bl_getdeviceiter, ++ .layout_get = bl_layoutget, ++ .layout_return = bl_layoutreturn, ++}; ++#endif /* CONFIG_SPNFS_BLOCK */ ++#endif /* CONFIG_SPNFS */ ++#endif /* CONFIG_PNFSD */ ++ + static struct svc_export *svc_export_update(struct svc_export *new, + struct svc_export *old); + static struct svc_export *svc_export_lookup(struct svc_export *); + ++static int pnfsd_check_export(struct inode *inode, int *flags) ++{ ++#if defined(CONFIG_PNFSD) ++ ++#if defined(CONFIG_PNFSD_LOCAL_EXPORT) ++ if (!inode->i_sb->s_pnfs_op) ++ pnfsd_lexp_init(inode); ++ return 0; ++#endif /* CONFIG_PNFSD_LOCAL_EXPORT */ ++ ++#if defined(CONFIG_SPNFS) ++#if defined(CONFIG_SPNFS_BLOCK) ++ if (pnfs_block_enabled(inode, *flags)) { ++ dprintk("set pnfs block export structure... \n"); ++ inode->i_sb->s_pnfs_op = &bl_export_ops; ++ } else ++#endif /* CONFIG_SPNFS_BLOCK */ ++ /* ++ * spnfs_enabled() indicates we're an MDS. ++ * XXX Better to check an export time option as well. ++ */ ++ if (spnfs_enabled()) { ++ dprintk("set spnfs export structure...\n"); ++ inode->i_sb->s_pnfs_op = &spnfs_export_ops; ++ } else { ++ dprintk("%s spnfs not in use\n", __func__); ++ ++ /* ++ * get_state is needed if we're a DS using spnfs. ++ * XXX Better to check an export time option instead. 
++ */ ++ inode->i_sb->s_pnfs_op = &spnfs_ds_export_ops; ++ } ++#endif /* CONFIG_SPNFS */ ++ ++#endif /* CONFIG_PNFSD */ ++ ++ return 0; ++} ++ + static int check_export(struct inode *inode, int *flags, unsigned char *uuid) + { + +@@ -391,8 +473,17 @@ static int check_export(struct inode *in + return -EINVAL; + } + +- return 0; ++#if !defined(CONFIG_SPNFS) ++ if (inode->i_sb->s_pnfs_op && ++ (!inode->i_sb->s_pnfs_op->layout_type || ++ !inode->i_sb->s_pnfs_op->get_device_info || ++ !inode->i_sb->s_pnfs_op->layout_get)) { ++ dprintk("exp_export: export of invalid fs pnfs export ops.\n"); ++ return -EINVAL; ++ } ++#endif /* !CONFIG_SPNFS */ + ++ return pnfsd_check_export(inode, flags); + } + + #ifdef CONFIG_NFSD_V4 +@@ -582,6 +673,8 @@ static int svc_export_parse(struct cache + if (exp.ex_uuid == NULL) + err = -ENOMEM; + } ++ } else if (strcmp(buf, "pnfs") == 0) { ++ exp.ex_pnfs = 1; + } else if (strcmp(buf, "secinfo") == 0) + err = secinfo_parse(&mesg, buf, &exp); + else +@@ -656,6 +749,8 @@ static int svc_export_show(struct seq_fi + seq_printf(m, "%02x", exp->ex_uuid[i]); + } + } ++ if (exp->ex_pnfs) ++ seq_puts(m, ",pnfs"); + show_secinfo(m, exp); + } + seq_puts(m, ")\n"); +@@ -683,6 +778,7 @@ static void svc_export_init(struct cache + new->ex_fslocs.locations = NULL; + new->ex_fslocs.locations_count = 0; + new->ex_fslocs.migrated = 0; ++ new->ex_pnfs = 0; + } + + static void export_update(struct cache_head *cnew, struct cache_head *citem) +@@ -695,6 +791,7 @@ static void export_update(struct cache_h + new->ex_anon_uid = item->ex_anon_uid; + new->ex_anon_gid = item->ex_anon_gid; + new->ex_fsid = item->ex_fsid; ++ new->ex_pnfs = item->ex_pnfs; + new->ex_uuid = item->ex_uuid; + item->ex_uuid = NULL; + new->ex_pathname = item->ex_pathname; +@@ -1662,8 +1759,17 @@ nfsd_export_init(void) + if (rv) + return rv; + rv = cache_register(&svc_expkey_cache); +- if (rv) ++ if (rv) { + cache_unregister(&svc_export_cache); ++ goto out; ++ } ++#if defined(CONFIG_PNFSD) ++ 
spin_lock(&pnfsd_cb_ctl.lock); ++ pnfsd_cb_ctl.module = THIS_MODULE; ++ pnfsd_cb_ctl.cb_op = &pnfsd_cb_op; ++ spin_unlock(&pnfsd_cb_ctl.lock); ++#endif /* CONFIG_PNFSD */ ++out: + return rv; + + } +@@ -1691,6 +1797,12 @@ nfsd_export_shutdown(void) + + exp_writelock(); + ++#if defined(CONFIG_PNFSD) ++ spin_lock(&pnfsd_cb_ctl.lock); ++ pnfsd_cb_ctl.module = NULL; ++ pnfsd_cb_ctl.cb_op = NULL; ++ spin_unlock(&pnfsd_cb_ctl.lock); ++#endif /* CONFIG_PNFSD */ + cache_unregister(&svc_expkey_cache); + cache_unregister(&svc_export_cache); + svcauth_unix_purge(); +diff -up linux-2.6.37.noarch/fs/nfs/direct.c.orig linux-2.6.37.noarch/fs/nfs/direct.c +--- linux-2.6.37.noarch/fs/nfs/direct.c.orig 2011-01-04 19:50:19.000000000 -0500 ++++ linux-2.6.37.noarch/fs/nfs/direct.c 2011-01-28 09:43:53.315775694 -0500 +@@ -271,6 +271,38 @@ static const struct rpc_call_ops nfs_rea + .rpc_release = nfs_direct_read_release, + }; + ++static long nfs_direct_read_execute(struct nfs_read_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg) ++{ ++ struct inode *inode = data->inode; ++ struct rpc_task *task; ++ ++ nfs_fattr_init(&data->fattr); ++ msg->rpc_argp = &data->args; ++ msg->rpc_resp = &data->res; ++ ++ task_setup_data->task = &data->task; ++ task_setup_data->callback_data = data; ++ NFS_PROTO(inode)->read_setup(data, msg); ++ ++ task = rpc_run_task(task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ ++ rpc_put_task(task); ++ ++ dprintk("NFS: %5u initiated direct read call " ++ "(req %s/%lld, %u bytes @ offset %llu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ return 0; ++} ++ + /* + * For each rsize'd chunk of the user's buffer, dispatch an NFS READ + * operation. 
If nfs_readdata_alloc() or get_user_pages() fails, +@@ -287,7 +319,6 @@ static ssize_t nfs_direct_read_schedule_ + unsigned long user_addr = (unsigned long)iov->iov_base; + size_t count = iov->iov_len; + size_t rsize = NFS_SERVER(inode)->rsize; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = ctx->cred, + }; +@@ -348,26 +379,9 @@ static ssize_t nfs_direct_read_schedule_ + data->res.fattr = &data->fattr; + data->res.eof = 0; + data->res.count = bytes; +- nfs_fattr_init(&data->fattr); +- msg.rpc_argp = &data->args; +- msg.rpc_resp = &data->res; + +- task_setup_data.task = &data->task; +- task_setup_data.callback_data = data; +- NFS_PROTO(inode)->read_setup(data, &msg); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- break; +- rpc_put_task(task); +- +- dprintk("NFS: %5u initiated direct read call " +- "(req %s/%Ld, %zu bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- bytes, +- (unsigned long long)data->args.offset); ++ if (nfs_direct_read_execute(data, &task_setup_data, &msg)) ++ break; + + started += bytes; + user_addr += bytes; +@@ -457,12 +471,15 @@ static void nfs_direct_free_writedata(st + } + + #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) ++static long nfs_direct_write_execute(struct nfs_write_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg); ++ + static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) + { + struct inode *inode = dreq->inode; + struct list_head *p; + struct nfs_write_data *data; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = dreq->ctx->cred, + }; +@@ -496,25 +513,7 @@ static void nfs_direct_write_reschedule( + * Reuse data->task; data->args should not have changed + * since the original request was sent. 
+ */ +- task_setup_data.task = &data->task; +- task_setup_data.callback_data = data; +- msg.rpc_argp = &data->args; +- msg.rpc_resp = &data->res; +- NFS_PROTO(inode)->write_setup(data, &msg); +- +- /* +- * We're called via an RPC callback, so BKL is already held. +- */ +- task = rpc_run_task(&task_setup_data); +- if (!IS_ERR(task)) +- rpc_put_task(task); +- +- dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- data->args.count, +- (unsigned long long)data->args.offset); ++ nfs_direct_write_execute(data, &task_setup_data, &msg); + } + + if (put_dreq(dreq)) +@@ -557,10 +556,31 @@ static const struct rpc_call_ops nfs_com + .rpc_release = nfs_direct_commit_release, + }; + ++static long nfs_direct_commit_execute(struct nfs_direct_req *dreq, ++ struct nfs_write_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg) ++{ ++ struct rpc_task *task; ++ ++ NFS_PROTO(data->inode)->commit_setup(data, msg); ++ ++ /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ ++ dreq->commit_data = NULL; ++ ++ dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); ++ ++ task = rpc_run_task(task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ ++ rpc_put_task(task); ++ return 0; ++} ++ + static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) + { + struct nfs_write_data *data = dreq->commit_data; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +@@ -589,16 +609,7 @@ static void nfs_direct_commit_schedule(s + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + +- NFS_PROTO(data->inode)->commit_setup(data, &msg); +- +- /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ +- dreq->commit_data = NULL; +- +- dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); +- +- task = rpc_run_task(&task_setup_data); +- if 
(!IS_ERR(task)) +- rpc_put_task(task); ++ nfs_direct_commit_execute(dreq, data, &task_setup_data, &msg); + } + + static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) +@@ -700,6 +711,36 @@ static const struct rpc_call_ops nfs_wri + .rpc_release = nfs_direct_write_release, + }; + ++static long nfs_direct_write_execute(struct nfs_write_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg) ++{ ++ struct inode *inode = data->inode; ++ struct rpc_task *task; ++ ++ task_setup_data->task = &data->task; ++ task_setup_data->callback_data = data; ++ msg->rpc_argp = &data->args; ++ msg->rpc_resp = &data->res; ++ NFS_PROTO(inode)->write_setup(data, msg); ++ ++ task = rpc_run_task(task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ ++ rpc_put_task(task); ++ ++ dprintk("NFS: %5u initiated direct write call " ++ "(req %s/%lld, %u bytes @ offset %llu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ return 0; ++} ++ + /* + * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE + * operation. 
If nfs_writedata_alloc() or get_user_pages() fails, +@@ -715,7 +756,6 @@ static ssize_t nfs_direct_write_schedule + struct inode *inode = ctx->path.dentry->d_inode; + unsigned long user_addr = (unsigned long)iov->iov_base; + size_t count = iov->iov_len; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = ctx->cred, + }; +@@ -782,24 +822,8 @@ static ssize_t nfs_direct_write_schedule + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + +- task_setup_data.task = &data->task; +- task_setup_data.callback_data = data; +- msg.rpc_argp = &data->args; +- msg.rpc_resp = &data->res; +- NFS_PROTO(inode)->write_setup(data, &msg); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- break; +- rpc_put_task(task); +- +- dprintk("NFS: %5u initiated direct write call " +- "(req %s/%Ld, %zu bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- bytes, +- (unsigned long long)data->args.offset); ++ if (nfs_direct_write_execute(data, &task_setup_data, &msg)) ++ break; + + started += bytes; + user_addr += bytes; +diff -up linux-2.6.37.noarch/fs/nfsd/Kconfig.orig linux-2.6.37.noarch/fs/nfsd/Kconfig +--- linux-2.6.37.noarch/fs/nfsd/Kconfig.orig 2011-01-04 19:50:19.000000000 -0500 ++++ linux-2.6.37.noarch/fs/nfsd/Kconfig 2011-01-28 09:43:53.346770928 -0500 +@@ -91,3 +91,52 @@ config NFSD_V4 + available from http://linux-nfs.org/. + + If unsure, say N. ++ ++config PNFSD ++ bool "NFSv4.1 server support for Parallel NFS (pNFS) (DEVELOPER ONLY)" ++ depends on NFSD_V4 && EXPERIMENTAL ++ select EXPORTFS_FILE_LAYOUT ++ help ++ This option enables support for the parallel NFS features of the ++ minor version 1 of the NFSv4 protocol (draft-ietf-nfsv4-minorversion1) ++ in the kernel's NFS server. ++ ++ Unless you're an NFS developer, say N. 
++ ++config PNFSD_LOCAL_EXPORT ++ bool "Enable pNFS support for exporting local filesystems for debugging purposes" ++ depends on PNFSD ++ help ++ Say Y here if you want your pNFS server to export local file systems ++ over the files layout type. With this option the MDS (metadata ++ server) functions also as a single DS (data server). This is mostly ++ useful for development and debugging purposes. ++ ++ If unsure, say N. ++ ++config SPNFS ++ bool "Provide spNFS server support (EXPERIMENTAL)" ++ depends on PNFSD ++ select RPCSEC_GSS_KRB5 ++ help ++ Say Y here if you want spNFS server support. ++ ++ If unsure, say N. ++ ++config SPNFS_LAYOUTSEGMENTS ++ bool "Allow spNFS to return partial file layouts (EXPERIMENTAL)" ++ depends on SPNFS ++ select RPCSEC_GSS_KRB5 ++ help ++ Say Y here if you want spNFS to be able to return layout segments. ++ ++ If unsure, say N. ++ ++config SPNFS_BLOCK ++ bool "Provide Block Layout server support (EXPERIMENTAL)" ++ depends on SPNFS ++ select EXPORTFS_BLOCK_LAYOUT ++ help ++ Say Y here if you want spNFS block layout support ++ ++ If unsure, say N. 
+diff -up linux-2.6.37.noarch/fs/nfsd/Makefile.orig linux-2.6.37.noarch/fs/nfsd/Makefile +--- linux-2.6.37.noarch/fs/nfsd/Makefile.orig 2011-01-04 19:50:19.000000000 -0500 ++++ linux-2.6.37.noarch/fs/nfsd/Makefile 2011-01-28 09:43:53.347770803 -0500 +@@ -11,3 +11,7 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs + nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o + nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ + nfs4acl.o nfs4callback.o nfs4recover.o ++nfsd-$(CONFIG_PNFSD) += nfs4pnfsd.o nfs4pnfsdlm.o nfs4pnfsds.o ++nfsd-$(CONFIG_PNFSD_LOCAL_EXPORT) += pnfsd_lexp.o ++nfsd-$(CONFIG_SPNFS) += spnfs_com.o spnfs_ops.o ++nfsd-$(CONFIG_SPNFS_BLOCK) += bl_com.o bl_ops.o +diff -up linux-2.6.37.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.37.noarch/fs/nfsd/nfs4callback.c +--- linux-2.6.37.noarch/fs/nfsd/nfs4callback.c.orig 2011-01-28 09:37:32.557979427 -0500 ++++ linux-2.6.37.noarch/fs/nfsd/nfs4callback.c 2011-01-28 09:43:53.352770195 -0500 +@@ -48,6 +48,8 @@ enum { + NFSPROC4_CLNT_CB_NULL = 0, + NFSPROC4_CLNT_CB_RECALL, + NFSPROC4_CLNT_CB_SEQUENCE, ++ NFSPROC4_CLNT_CB_LAYOUT, ++ NFSPROC4_CLNT_CB_DEVICE, + }; + + #define NFS4_MAXTAGLEN 20 +@@ -73,6 +75,19 @@ enum { + #define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ + op_dec_sz) ++#define NFS4_enc_cb_layout_sz (cb_compound_enc_hdr_sz + \ ++ cb_sequence_enc_sz + \ ++ 1 + 3 + \ ++ enc_nfs4_fh_sz + 4) ++#define NFS4_dec_cb_layout_sz (cb_compound_dec_hdr_sz + \ ++ cb_sequence_dec_sz + \ ++ op_dec_sz) ++#define NFS4_enc_cb_device_sz (cb_compound_enc_hdr_sz + \ ++ cb_sequence_enc_sz + \ ++ 1 + 6) ++#define NFS4_dec_cb_device_sz (cb_compound_dec_hdr_sz + \ ++ cb_sequence_dec_sz + \ ++ op_dec_sz) + + struct nfs4_cb_compound_hdr { + /* args */ +@@ -361,6 +376,151 @@ static void encode_cb_recall4args(struct + hdr->nops++; + } + ++#if defined(CONFIG_PNFSD) ++ ++#include "pnfsd.h" ++ ++/* ++ * CB_LAYOUTRECALL4args ++ * ++ * struct layoutrecall_file4 { ++ * nfs_fh4 lor_fh; ++ * offset4 
lor_offset; ++ * length4 lor_length; ++ * stateid4 lor_stateid; ++ * }; ++ * ++ * union layoutrecall4 switch(layoutrecall_type4 lor_recalltype) { ++ * case LAYOUTRECALL4_FILE: ++ * layoutrecall_file4 lor_layout; ++ * case LAYOUTRECALL4_FSID: ++ * fsid4 lor_fsid; ++ * case LAYOUTRECALL4_ALL: ++ * void; ++ * }; ++ * ++ * struct CB_LAYOUTRECALL4args { ++ * layouttype4 clora_type; ++ * layoutiomode4 clora_iomode; ++ * bool clora_changed; ++ * layoutrecall4 clora_recall; ++ * }; ++ */ ++static void encode_cb_layout4args(struct xdr_stream *xdr, ++ const struct nfs4_layoutrecall *clr, ++ struct nfs4_cb_compound_hdr *hdr) ++{ ++ u32 *p; ++ ++ BUG_ON(hdr->minorversion == 0); ++ ++ p = xdr_reserve_space(xdr, 5 * 4); ++ *p++ = cpu_to_be32(OP_CB_LAYOUTRECALL); ++ *p++ = cpu_to_be32(clr->cb.cbl_seg.layout_type); ++ *p++ = cpu_to_be32(clr->cb.cbl_seg.iomode); ++ *p++ = cpu_to_be32(clr->cb.cbl_layoutchanged); ++ *p = cpu_to_be32(clr->cb.cbl_recall_type); ++ if (unlikely(clr->cb.cbl_recall_type == RETURN_FSID)) { ++ struct nfs4_fsid fsid = clr->cb.cbl_fsid; ++ ++ p = xdr_reserve_space(xdr, 2 * 8); ++ p = xdr_encode_hyper(p, fsid.major); ++ xdr_encode_hyper(p, fsid.minor); ++ dprintk("%s: type %x iomode %d changed %d recall_type %d " ++ "fsid 0x%llx-0x%llx\n", ++ __func__, clr->cb.cbl_seg.layout_type, ++ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged, ++ clr->cb.cbl_recall_type, fsid.major, fsid.minor); ++ } else if (clr->cb.cbl_recall_type == RETURN_FILE) { ++ int len = clr->clr_file->fi_fhlen; ++ stateid_t *cbl_sid = (stateid_t *)&clr->cb.cbl_sid; ++ ++ p = xdr_reserve_space(xdr, 4 + len + 2 * 8); ++ *p++ = cpu_to_be32(len); ++ xdr_encode_opaque_fixed(p, clr->clr_file->fi_fhval, len); ++ p += XDR_QUADLEN(len); ++ p = xdr_encode_hyper(p, clr->cb.cbl_seg.offset); ++ xdr_encode_hyper(p, clr->cb.cbl_seg.length); ++ encode_stateid4(xdr, cbl_sid); ++ dprintk("%s: type %x iomode %d changed %d recall_type %d " ++ "offset %lld length %lld stateid " STATEID_FMT "\n", ++ __func__, 
clr->cb.cbl_seg.layout_type, ++ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged, ++ clr->cb.cbl_recall_type, ++ clr->cb.cbl_seg.offset, clr->cb.cbl_seg.length, ++ STATEID_VAL(cbl_sid)); ++ } else { ++ dprintk("%s: type %x iomode %d changed %d recall_type %d\n", ++ __func__, clr->cb.cbl_seg.layout_type, ++ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged, ++ clr->cb.cbl_recall_type); ++ } ++ hdr->nops++; ++} ++ ++/* ++ * CB_NOTIFY_DEVICEID4args ++ * ++ * typedef opaque notifylist4<>; ++ * ++ * struct notify4 { ++ * bitmap4 notify_mask; ++ * notifylist4 notify_vals; ++ * }; ++ * ++ * struct CB_NOTIFY_DEVICEID4args { ++ * notify4 cnda_changes<>; ++ * }; ++ */ ++static void encode_cb_device4args(struct xdr_stream *xdr, ++ const struct nfs4_notify_device *nd, ++ struct nfs4_cb_compound_hdr *hdr) ++{ ++ u32 *p; ++ int i; ++ int len = nd->nd_list->cbd_len; ++ struct nfsd4_pnfs_cb_dev_item *cbd = nd->nd_list->cbd_list; ++ ++ dprintk("NFSD %s: --> num %d\n", __func__, len); ++ ++ BUG_ON(hdr->minorversion == 0); ++ ++ p = xdr_reserve_space(xdr, 2 * 4); ++ *p++ = cpu_to_be32(OP_CB_NOTIFY_DEVICEID); ++ /* notify4 cnda_changes<>; */ ++ *p = cpu_to_be32(len); ++ for (i = 0; i < len; i++) { ++ dprintk("%s: nt %d lt %d devid x%llx-x%llx im %d i %d\n", ++ __func__, cbd[i].cbd_notify_type, ++ cbd[i].cbd_layout_type, ++ cbd[i].cbd_devid.sbid, ++ cbd[i].cbd_devid.devid, ++ cbd[i].cbd_immediate, i); ++ ++ BUG_ON(cbd[i].cbd_notify_type != NOTIFY_DEVICEID4_CHANGE && ++ cbd[i].cbd_notify_type != NOTIFY_DEVICEID4_DELETE); ++ p = xdr_reserve_space(xdr, 4 * 4 + 2 * 8); ++ /* bitmap4 notify_mask; */ ++ *p++ = cpu_to_be32(1); ++ *p++ = cpu_to_be32(cbd[i].cbd_notify_type); ++ /* opaque notify_vals<>; */ ++ if (cbd[i].cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) ++ *p++ = cpu_to_be32(24); ++ else ++ *p++ = cpu_to_be32(20); ++ *p++ = cpu_to_be32(cbd[i].cbd_layout_type); ++ p = xdr_encode_hyper(p, cbd[i].cbd_devid.sbid); ++ xdr_encode_hyper(p, cbd[i].cbd_devid.devid); ++ ++ if 
(cbd[i].cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) { ++ p = xdr_reserve_space(xdr, 4); ++ *p = cpu_to_be32(cbd[i].cbd_immediate); ++ } ++ } ++ hdr->nops++; ++} ++#endif /* CONFIG_PNFSD */ ++ + /* + * CB_SEQUENCE4args + * +@@ -484,7 +644,7 @@ static int decode_cb_sequence4res(struct + out: + return status; + out_default: +- return nfs_cb_stat_to_errno(status); ++ return nfs_cb_stat_to_errno(nfserr); + } + + /* +@@ -523,6 +683,39 @@ static void nfs4_xdr_enc_cb_recall(struc + encode_cb_nops(&hdr); + } + ++#if defined(CONFIG_PNFSD) ++static void nfs4_xdr_enc_cb_layout(struct rpc_rqst *req, ++ struct xdr_stream *xdr, ++ const struct nfsd4_callback *cb) ++{ ++ const struct nfs4_layoutrecall *args = cb->cb_op; ++ struct nfs4_cb_compound_hdr hdr = { ++ .ident = 0, ++ .minorversion = cb->cb_minorversion, ++ }; ++ ++ encode_cb_compound4args(xdr, &hdr); ++ encode_cb_sequence4args(xdr, cb, &hdr); ++ encode_cb_layout4args(xdr, args, &hdr); ++ encode_cb_nops(&hdr); ++} ++ ++static void nfs4_xdr_enc_cb_device(struct rpc_rqst *req, ++ struct xdr_stream *xdr, ++ const struct nfsd4_callback *cb) ++{ ++ struct nfs4_notify_device *args = cb->cb_op; ++ struct nfs4_cb_compound_hdr hdr = { ++ .ident = 0, ++ .minorversion = cb->cb_minorversion, ++ }; ++ ++ encode_cb_compound4args(xdr, &hdr); ++ encode_cb_sequence4args(xdr, cb, &hdr); ++ encode_cb_device4args(xdr, args, &hdr); ++ encode_cb_nops(&hdr); ++} ++#endif /* CONFIG_PNFSD */ + + /* + * NFSv4.0 and NFSv4.1 XDR decode functions +@@ -564,13 +757,63 @@ static int nfs4_xdr_dec_cb_recall(struct + if (unlikely(status)) + goto out; + if (unlikely(nfserr != NFS4_OK)) +- goto out_default; ++ status = nfs_cb_stat_to_errno(nfserr); ++out: ++ return status; ++} ++ ++#if defined(CONFIG_PNFSD) ++static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, ++ struct xdr_stream *xdr, ++ struct nfsd4_callback *cb) ++{ ++ struct nfs4_cb_compound_hdr hdr; ++ enum nfsstat4 nfserr; ++ int status; ++ ++ status = decode_cb_compound4res(xdr, &hdr); ++ if 
(unlikely(status)) ++ goto out; ++ if (cb) { ++ status = decode_cb_sequence4res(xdr, cb); ++ if (unlikely(status)) ++ goto out; ++ } ++ status = decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &nfserr); ++ if (unlikely(status)) ++ goto out; ++ if (unlikely(nfserr != NFS4_OK)) ++ status = nfs_cb_stat_to_errno(nfserr); + out: + return status; +-out_default: +- return nfs_cb_stat_to_errno(status); + } + ++static int nfs4_xdr_dec_cb_device(struct rpc_rqst *rqstp, ++ struct xdr_stream *xdr, ++ struct nfsd4_callback *cb) ++{ ++ struct nfs4_cb_compound_hdr hdr; ++ enum nfsstat4 nfserr; ++ int status; ++ ++ status = decode_cb_compound4res(xdr, &hdr); ++ if (unlikely(status)) ++ goto out; ++ if (cb) { ++ status = decode_cb_sequence4res(xdr, cb); ++ if (unlikely(status)) ++ goto out; ++ } ++ status = decode_cb_op_status(xdr, OP_CB_NOTIFY_DEVICEID, &nfserr); ++ if (unlikely(status)) ++ goto out; ++ if (unlikely(nfserr != NFS4_OK)) ++ status = nfs_cb_stat_to_errno(nfserr); ++out: ++ return status; ++} ++#endif /* CONFIG_PNFSD */ ++ + /* + * RPC procedure tables + */ +@@ -588,6 +831,10 @@ out_default: + static struct rpc_procinfo nfs4_cb_procedures[] = { + PROC(CB_NULL, NULL, cb_null, cb_null), + PROC(CB_RECALL, COMPOUND, cb_recall, cb_recall), ++#if defined(CONFIG_PNFSD) ++ PROC(CB_LAYOUT, COMPOUND, cb_layout, cb_layout), ++ PROC(CB_DEVICE, COMPOUND, cb_device, cb_device), ++#endif + }; + + static struct rpc_version nfs_cb_version4 = { +@@ -785,11 +1032,10 @@ static bool nfsd41_cb_get_slot(struct nf + * TODO: cb_sequence should support referring call lists, cachethis, multiple + * slots, and mark callback channel down on communication errors. 
+ */ +-static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) ++static void nfsd4_cb_prepare_sequence(struct rpc_task *task, ++ struct nfsd4_callback *cb, ++ struct nfs4_client *clp) + { +- struct nfsd4_callback *cb = calldata; +- struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); +- struct nfs4_client *clp = dp->dl_client; + u32 minorversion = clp->cl_minorversion; + + cb->cb_minorversion = minorversion; +@@ -807,12 +1053,17 @@ static void nfsd4_cb_prepare(struct rpc_ + rpc_call_start(task); + } + +-static void nfsd4_cb_done(struct rpc_task *task, void *calldata) ++static void nfsd4_cb_recall_prepare(struct rpc_task *task, void *calldata) + { + struct nfsd4_callback *cb = calldata; + struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); +- struct nfs4_client *clp = dp->dl_client; + ++ nfsd4_cb_prepare_sequence(task, cb, dp->dl_client); ++} ++ ++static void nfsd4_cb_done_sequence(struct rpc_task *task, ++ struct nfs4_client *clp) ++{ + dprintk("%s: minorversion=%d\n", __func__, + clp->cl_minorversion); + +@@ -837,7 +1088,7 @@ static void nfsd4_cb_recall_done(struct + struct nfs4_client *clp = dp->dl_client; + struct rpc_clnt *current_rpc_client = clp->cl_cb_client; + +- nfsd4_cb_done(task, calldata); ++ nfsd4_cb_done_sequence(task, clp); + + if (current_rpc_client != task->tk_client) { + /* We're shutting down or changing cl_cb_client; leave +@@ -886,7 +1137,7 @@ static void nfsd4_cb_recall_release(void + } + + static const struct rpc_call_ops nfsd4_cb_recall_ops = { +- .rpc_call_prepare = nfsd4_cb_prepare, ++ .rpc_call_prepare = nfsd4_cb_recall_prepare, + .rpc_call_done = nfsd4_cb_recall_done, + .rpc_release = nfsd4_cb_recall_release, + }; +@@ -1026,3 +1277,188 @@ void nfsd4_cb_recall(struct nfs4_delegat + + run_nfsd4_cb(&dp->dl_recall); + } ++ ++#if defined(CONFIG_PNFSD) ++static void nfsd4_cb_layout_prepare(struct rpc_task *task, void *calldata) ++{ ++ struct nfsd4_callback *cb = calldata; 
++ struct nfs4_layoutrecall *clr = container_of(cb, struct nfs4_layoutrecall, clr_recall); ++ ++ nfsd4_cb_prepare_sequence(task, cb, clr->clr_client); ++} ++ ++static void nfsd4_cb_layout_done(struct rpc_task *task, void *calldata) ++{ ++ struct nfsd4_callback *cb = calldata; ++ struct nfs4_layoutrecall *clr = container_of(cb, struct nfs4_layoutrecall, clr_recall); ++ struct nfs4_client *clp = clr->clr_client; ++ struct rpc_clnt *current_rpc_client = clp->cl_cb_client; ++ ++ nfsd4_cb_done_sequence(task, clp); ++ ++ if (current_rpc_client != task->tk_client) { ++ /* We're shutting down or changing cl_cb_client; leave ++ * it to nfsd4_process_cb_update to restart the call if ++ * necessary. */ ++ return; ++ } ++ ++ if (cb->cb_done) ++ return; ++ ++ if (task->tk_status) ++ printk("%s: clp %p cb_client %p fp %p failed with status %d\n", ++ __func__, ++ clp, ++ clp->cl_cb_client, ++ clr->clr_file, ++ task->tk_status); ++ ++ switch (task->tk_status) { ++ case 0: ++ goto done; ++ ++ case -NFS4ERR_NOMATCHING_LAYOUT: ++ task->tk_status = 0; ++ nomatching_layout(clr); ++ goto done; ++ ++ case -NFS4ERR_DELAY: ++ /* Poll the client until it's done with the layout */ ++ /* FIXME: cap number of retries. 
++ * The pnfs standard states that we need to only expire ++ * the client after at-least "lease time" .eg lease-time * 2 ++ * when failing to communicate a recall ++ */ ++ rpc_delay(task, HZ/100); /* 10 mili-seconds */ ++ task->tk_status = 0; ++ rpc_restart_call_prepare(task); ++ return; ++ ++ case -NFS4ERR_BADHANDLE: ++ /* FIXME: handle more gracefully */ ++ goto done; ++ ++ case -NFS4ERR_BAD_STATEID: ++ case -NFS4ERR_BADIOMODE: ++ case -NFS4ERR_BADXDR: ++ case -NFS4ERR_INVAL: ++ case -NFS4ERR_NOTSUPP: ++ case -NFS4ERR_OP_NOT_IN_SESSION: ++ case -NFS4ERR_REP_TOO_BIG: ++ case -NFS4ERR_REP_TOO_BIG_TO_CACHE: ++ case -NFS4ERR_REQ_TOO_BIG: ++ case -NFS4ERR_RETRY_UNCACHED_REP: ++ case -NFS4ERR_TOO_MANY_OPS: ++ case -NFS4ERR_UNKNOWN_LAYOUTTYPE: ++ case -NFS4ERR_WRONG_TYPE: ++ /* We should never get these, yet it could be a result of a ++ * buggy client, therefore no BUG here. ++ */ ++ goto done; ++ ++ default: ++ break; ++ } ++ ++ /* Network partition? */ ++ nfsd4_mark_cb_down(clp, task->tk_status); ++done: ++ cb->cb_done = true; ++} ++ ++static void nfsd4_cb_layout_release(void *calldata) ++{ ++ struct nfsd4_callback *cb = calldata; ++ struct nfs4_layoutrecall *clr = container_of(cb, struct nfs4_layoutrecall, clr_recall); ++ ++ put_layoutrecall(clr); ++} ++ ++static const struct rpc_call_ops nfsd4_cb_layout_ops = { ++ .rpc_call_prepare = nfsd4_cb_layout_prepare, ++ .rpc_call_done = nfsd4_cb_layout_done, ++ .rpc_release = nfsd4_cb_layout_release, ++}; ++ ++/* ++ * Called with state lock. 
++ */ ++void ++nfsd4_cb_layout(struct nfs4_layoutrecall *clr) ++{ ++ struct nfsd4_callback *cb = &clr->clr_recall; ++ ++ cb->cb_op = clr; ++ cb->cb_clp = clr->clr_client; ++ cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_LAYOUT]; ++ cb->cb_msg.rpc_argp = cb; ++ cb->cb_msg.rpc_resp = cb; ++ cb->cb_msg.rpc_cred = callback_cred; ++ ++ cb->cb_ops = &nfsd4_cb_layout_ops; ++ run_nfsd4_cb(cb); ++} ++ ++static void nfsd4_cb_device_prepare(struct rpc_task *task, void *calldata) ++{ ++ struct nfsd4_callback *cb = calldata; ++ struct nfs4_notify_device *cbnd = container_of(cb, struct nfs4_notify_device, nd_recall); ++ ++ nfsd4_cb_prepare_sequence(task, cb, cbnd->nd_client); ++} ++ ++static void nfsd4_cb_device_done(struct rpc_task *task, void *calldata) ++{ ++ struct nfsd4_callback *cb = calldata; ++ struct nfs4_notify_device *cbnd = container_of(cb, struct nfs4_notify_device, nd_recall); ++ struct nfs4_client *clp = cbnd->nd_client; ++ ++ nfsd4_cb_done_sequence(task, clp); ++ ++ dprintk("%s: clp %p cb_client %p: status %d\n", ++ __func__, ++ clp, ++ clp->cl_cb_client, ++ task->tk_status); ++ ++ if (task->tk_status == -EIO) { ++ /* Network partition? */ ++ nfsd4_mark_cb_down(clp, task->tk_status); ++ } ++ cb->cb_done = true; ++} ++ ++static void nfsd4_cb_device_release(void *calldata) ++{ ++ struct nfsd4_callback *cb = calldata; ++ struct nfs4_notify_device *cbnd = container_of(cb, struct nfs4_notify_device, nd_recall); ++ ++ kfree(cbnd); ++} ++ ++static const struct rpc_call_ops nfsd4_cb_device_ops = { ++ .rpc_call_prepare = nfsd4_cb_device_prepare, ++ .rpc_call_done = nfsd4_cb_device_done, ++ .rpc_release = nfsd4_cb_device_release, ++}; ++ ++/* ++ * Called with state lock. 
++ */ ++void ++nfsd4_cb_notify_device(struct nfs4_notify_device *cbnd) ++{ ++ struct nfsd4_callback *cb = &cbnd->nd_recall; ++ ++ cb->cb_op = cbnd; ++ cb->cb_clp = cbnd->nd_client; ++ cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_DEVICE]; ++ cb->cb_msg.rpc_argp = cb; ++ cb->cb_msg.rpc_resp = cb; ++ cb->cb_msg.rpc_cred = callback_cred; ++ ++ cb->cb_ops = &nfsd4_cb_device_ops; ++ run_nfsd4_cb(cb); ++} ++#endif /* CONFIG_PNFSD */ +diff -up linux-2.6.37.noarch/fs/nfsd/nfs4pnfsd.c.orig linux-2.6.37.noarch/fs/nfsd/nfs4pnfsd.c +--- linux-2.6.37.noarch/fs/nfsd/nfs4pnfsd.c.orig 2011-01-28 09:43:53.353770077 -0500 ++++ linux-2.6.37.noarch/fs/nfsd/nfs4pnfsd.c 2011-01-28 09:43:53.353770077 -0500 +@@ -0,0 +1,1688 @@ ++/****************************************************************************** ++ * ++ * (c) 2007 Network Appliance, Inc. All Rights Reserved. ++ * (c) 2009 NetApp. All Rights Reserved. ++ * ++ * NetApp provides this source code under the GPL v2 License. ++ * The GPL v2 license is available at ++ * http://opensource.org/licenses/gpl-license.php. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ * ++ *****************************************************************************/ ++ ++#include "pnfsd.h" ++ ++#define NFSDDBG_FACILITY NFSDDBG_PROC ++ ++/* Globals */ ++static u32 current_layoutid = 1; ++ ++/* ++ * Currently used for manipulating the layout state. ++ */ ++static DEFINE_SPINLOCK(layout_lock); ++ ++#if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_SMP) ++# define BUG_ON_UNLOCKED_LAYOUT() BUG_ON(!spin_is_locked(&layout_lock)) ++#else ++# define BUG_ON_UNLOCKED_LAYOUT() ++#endif ++ ++/* ++ * Layout state - NFSv4.1 pNFS ++ */ ++static struct kmem_cache *pnfs_layout_slab; ++static struct kmem_cache *pnfs_layoutrecall_slab; ++ ++/* hash table for nfsd4_pnfs_deviceid.sbid */ ++#define SBID_HASH_BITS 8 ++#define SBID_HASH_SIZE (1 << SBID_HASH_BITS) ++#define SBID_HASH_MASK (SBID_HASH_SIZE - 1) ++ ++struct sbid_tracker { ++ u64 id; ++ struct super_block *sb; ++ struct list_head hash; ++}; ++ ++static u64 current_sbid; ++static struct list_head sbid_hashtbl[SBID_HASH_SIZE]; ++ ++static inline unsigned long ++sbid_hashval(struct super_block *sb) ++{ ++ return hash_ptr(sb, SBID_HASH_BITS); ++} ++ ++static inline struct sbid_tracker * ++alloc_sbid(void) ++{ ++ return kmalloc(sizeof(struct sbid_tracker), GFP_KERNEL); ++} ++ ++static void ++destroy_sbid(struct sbid_tracker *sbid) ++{ ++ spin_lock(&layout_lock); ++ list_del(&sbid->hash); ++ spin_unlock(&layout_lock); ++ kfree(sbid); ++} ++ ++void ++nfsd4_free_pnfs_slabs(void) ++{ ++ int i; ++ struct sbid_tracker *sbid; ++ ++ nfsd4_free_slab(&pnfs_layout_slab); ++ nfsd4_free_slab(&pnfs_layoutrecall_slab); ++ ++ for (i = 0; i < SBID_HASH_SIZE; i++) { ++ while (!list_empty(&sbid_hashtbl[i])) { ++ sbid = list_first_entry(&sbid_hashtbl[i], ++ struct sbid_tracker, ++ hash); ++ destroy_sbid(sbid); ++ } ++ } ++} ++ ++int ++nfsd4_init_pnfs_slabs(void) ++{ ++ int i; ++ ++ pnfs_layout_slab = kmem_cache_create("pnfs_layouts", ++ sizeof(struct nfs4_layout), 0, 0, NULL); ++ if (pnfs_layout_slab == NULL) ++ return 
-ENOMEM; ++ pnfs_layoutrecall_slab = kmem_cache_create("pnfs_layoutrecalls", ++ sizeof(struct nfs4_layoutrecall), 0, 0, NULL); ++ if (pnfs_layoutrecall_slab == NULL) ++ return -ENOMEM; ++ ++ for (i = 0; i < SBID_HASH_SIZE; i++) { ++ INIT_LIST_HEAD(&sbid_hashtbl[i]); ++ } ++ ++ return 0; ++} ++ ++/* XXX: Need to implement the notify types and track which ++ * clients have which devices. */ ++void pnfs_set_device_notify(clientid_t *clid, unsigned int types) ++{ ++ struct nfs4_client *clp; ++ dprintk("%s: -->\n", __func__); ++ ++ nfs4_lock_state(); ++ /* Indicate that client has a device so we can only notify ++ * the correct clients */ ++ clp = find_confirmed_client(clid); ++ if (clp) { ++ atomic_inc(&clp->cl_deviceref); ++ dprintk("%s: Incr device count (clnt %p) to %d\n", ++ __func__, clp, atomic_read(&clp->cl_deviceref)); ++ } ++ nfs4_unlock_state(); ++} ++ ++/* Clear notifications for this client ++ * XXX: Do we need to loop through a clean up all ++ * krefs when nfsd cleans up the client? 
*/ ++void pnfs_clear_device_notify(struct nfs4_client *clp) ++{ ++ atomic_dec(&clp->cl_deviceref); ++ dprintk("%s: Decr device count (clnt %p) to %d\n", ++ __func__, clp, atomic_read(&clp->cl_deviceref)); ++} ++ ++static struct nfs4_layout_state * ++alloc_init_layout_state(struct nfs4_client *clp, struct nfs4_file *fp, ++ stateid_t *stateid) ++{ ++ struct nfs4_layout_state *new; ++ ++ /* FIXME: use a kmem_cache */ ++ new = kzalloc(sizeof(*new), GFP_KERNEL); ++ if (!new) ++ return new; ++ get_nfs4_file(fp); ++ INIT_LIST_HEAD(&new->ls_perfile); ++ INIT_LIST_HEAD(&new->ls_layouts); ++ kref_init(&new->ls_ref); ++ new->ls_client = clp; ++ new->ls_file = fp; ++ new->ls_stateid.si_boot = stateid->si_boot; ++ new->ls_stateid.si_stateownerid = 0; /* identifies layout stateid */ ++ new->ls_stateid.si_generation = 1; ++ spin_lock(&layout_lock); ++ new->ls_stateid.si_fileid = current_layoutid++; ++ list_add(&new->ls_perfile, &fp->fi_layout_states); ++ spin_unlock(&layout_lock); ++ return new; ++} ++ ++static inline void ++get_layout_state(struct nfs4_layout_state *ls) ++{ ++ kref_get(&ls->ls_ref); ++} ++ ++static void ++destroy_layout_state_common(struct nfs4_layout_state *ls) ++{ ++ struct nfs4_file *fp = ls->ls_file; ++ ++ dprintk("pNFS %s: ls %p fp %p clp %p\n", __func__, ls, fp, ++ ls->ls_client); ++ BUG_ON(!list_empty(&ls->ls_layouts)); ++ kfree(ls); ++ put_nfs4_file(fp); ++} ++ ++static void ++destroy_layout_state(struct kref *kref) ++{ ++ struct nfs4_layout_state *ls = ++ container_of(kref, struct nfs4_layout_state, ls_ref); ++ ++ spin_lock(&layout_lock); ++ list_del(&ls->ls_perfile); ++ spin_unlock(&layout_lock); ++ destroy_layout_state_common(ls); ++} ++ ++static void ++destroy_layout_state_locked(struct kref *kref) ++{ ++ struct nfs4_layout_state *ls = ++ container_of(kref, struct nfs4_layout_state, ls_ref); ++ ++ list_del(&ls->ls_perfile); ++ destroy_layout_state_common(ls); ++} ++ ++static inline void ++put_layout_state(struct nfs4_layout_state *ls) ++{ ++ 
dprintk("pNFS %s: ls %p ls_ref %d\n", __func__, ls, ++ atomic_read(&ls->ls_ref.refcount)); ++ kref_put(&ls->ls_ref, destroy_layout_state); ++} ++ ++static inline void ++put_layout_state_locked(struct nfs4_layout_state *ls) ++{ ++ dprintk("pNFS %s: ls %p ls_ref %d\n", __func__, ls, ++ atomic_read(&ls->ls_ref.refcount)); ++ kref_put(&ls->ls_ref, destroy_layout_state_locked); ++} ++ ++/* ++ * Search the fp->fi_layout_state list for a layout state with the clientid. ++ * If not found, then this is a 'first open/delegation/lock stateid' from ++ * the client for this file. ++ * Called under the layout_lock. ++ */ ++static struct nfs4_layout_state * ++find_get_layout_state(struct nfs4_client *clp, struct nfs4_file *fp) ++{ ++ struct nfs4_layout_state *ls; ++ ++ BUG_ON_UNLOCKED_LAYOUT(); ++ list_for_each_entry(ls, &fp->fi_layout_states, ls_perfile) { ++ if (ls->ls_client == clp) { ++ dprintk("pNFS %s: before GET ls %p ls_ref %d\n", ++ __func__, ls, ++ atomic_read(&ls->ls_ref.refcount)); ++ get_layout_state(ls); ++ return ls; ++ } ++ } ++ return NULL; ++} ++ ++static __be32 ++verify_stateid(struct nfs4_file *fp, stateid_t *stateid) ++{ ++ struct nfs4_stateid *local = NULL; ++ struct nfs4_delegation *temp = NULL; ++ ++ /* check if open or lock stateid */ ++ local = find_stateid(stateid, RD_STATE); ++ if (local) ++ return 0; ++ temp = find_delegation_stateid(fp->fi_inode, stateid); ++ if (temp) ++ return 0; ++ return nfserr_bad_stateid; ++} ++ ++/* ++ * nfs4_preocess_layout_stateid () ++ * ++ * We have looked up the nfs4_file corresponding to the current_fh, and ++ * confirmed the clientid. Pull the few tests from nfs4_preprocess_stateid_op() ++ * that make sense with a layout stateid. ++ * ++ * Called with the state_lock held ++ * Returns zero and stateid is updated, or error. ++ * ++ * Note: the struct nfs4_layout_state pointer is only set by layoutget. 
++ */ ++static __be32 ++nfs4_process_layout_stateid(struct nfs4_client *clp, struct nfs4_file *fp, ++ stateid_t *stateid, struct nfs4_layout_state **lsp) ++{ ++ struct nfs4_layout_state *ls = NULL; ++ __be32 status = 0; ++ ++ dprintk("--> %s clp %p fp %p \n", __func__, clp, fp); ++ ++ dprintk("%s: operation stateid=" STATEID_FMT "\n", __func__, ++ STATEID_VAL(stateid)); ++ ++ status = nfs4_check_stateid(stateid); ++ if (status) ++ goto out; ++ ++ /* Is this the first use of this layout ? */ ++ spin_lock(&layout_lock); ++ ls = find_get_layout_state(clp, fp); ++ spin_unlock(&layout_lock); ++ if (!ls) { ++ /* Only alloc layout state on layoutget (which sets lsp). */ ++ if (!lsp) { ++ dprintk("%s ERROR: Not layoutget & no layout stateid\n", ++ __func__); ++ status = nfserr_bad_stateid; ++ goto out; ++ } ++ dprintk("%s Initial stateid for layout: file %p client %p\n", ++ __func__, fp, clp); ++ ++ /* verify input stateid */ ++ status = verify_stateid(fp, stateid); ++ if (status) { ++ dprintk("%s ERROR: invalid open/deleg/lock stateid\n", ++ __func__); ++ goto out; ++ } ++ ls = alloc_init_layout_state(clp, fp, stateid); ++ if (!ls) { ++ dprintk("%s pNFS ERROR: no memory for layout state\n", ++ __func__); ++ status = nfserr_resource; ++ goto out; ++ } ++ } else { ++ dprintk("%s Not initial stateid. Layout state %p file %p\n", ++ __func__, ls, fp); ++ ++ /* BAD STATEID */ ++ status = nfserr_bad_stateid; ++ if (memcmp(&ls->ls_stateid.si_opaque, &stateid->si_opaque, ++ sizeof(stateid_opaque_t)) != 0) { ++ ++ /* if a LAYOUTGET operation and stateid is a valid ++ * open/deleg/lock stateid, accept it as a parallel ++ * initial layout stateid ++ */ ++ if (lsp && ((verify_stateid(fp, stateid)) == 0)) { ++ dprintk("%s parallel initial layout state\n", ++ __func__); ++ goto verified; ++ } ++ ++ dprintk("%s ERROR bad opaque in stateid 1\n", __func__); ++ goto out_put; ++ } ++ ++ /* stateid is a valid layout stateid for this file. 
*/ ++ if (stateid->si_generation > ls->ls_stateid.si_generation) { ++ dprintk("%s bad stateid 1\n", __func__); ++ goto out_put; ++ } ++ } ++verified: ++ status = 0; ++ ++ /* Return the layout state if requested */ ++ if (lsp) { ++ get_layout_state(ls); ++ *lsp = ls; ++ } ++ dprintk("%s: layout stateid=" STATEID_FMT "\n", __func__, ++ STATEID_VAL(&ls->ls_stateid)); ++out_put: ++ dprintk("%s PUT LO STATE:\n", __func__); ++ put_layout_state(ls); ++out: ++ dprintk("<-- %s status %d\n", __func__, htonl(status)); ++ ++ return status; ++} ++ ++static inline struct nfs4_layout * ++alloc_layout(void) ++{ ++ return kmem_cache_alloc(pnfs_layout_slab, GFP_KERNEL); ++} ++ ++static inline void ++free_layout(struct nfs4_layout *lp) ++{ ++ kmem_cache_free(pnfs_layout_slab, lp); ++} ++ ++#define update_layout_stateid(ls, sid) do { /* one statement for callers */ \ ++ update_stateid(&(ls)->ls_stateid); \ ++ dprintk("%s Updated ls_stateid to %d on layoutstate %p\n", \ ++ __func__, (ls)->ls_stateid.si_generation, (ls)); \ ++ memcpy((sid), &(ls)->ls_stateid, sizeof(stateid_t)); \ ++} while (0) ++ ++static void ++init_layout(struct nfs4_layout_state *ls, ++ struct nfs4_layout *lp, ++ struct nfs4_file *fp, ++ struct nfs4_client *clp, ++ struct svc_fh *current_fh, ++ struct nfsd4_layout_seg *seg, ++ stateid_t *stateid) ++{ ++ dprintk("pNFS %s: ls %p lp %p clp %p fp %p ino %p\n", __func__, ++ ls, lp, clp, fp, fp->fi_inode); ++ ++ get_nfs4_file(fp); ++ lp->lo_client = clp; ++ lp->lo_file = fp; ++ get_layout_state(ls); ++ lp->lo_state = ls; ++ memcpy(&lp->lo_seg, seg, sizeof(lp->lo_seg)); ++ spin_lock(&layout_lock); ++ update_layout_stateid(ls, stateid); ++ list_add_tail(&lp->lo_perstate, &ls->ls_layouts); ++ list_add_tail(&lp->lo_perclnt, &clp->cl_layouts); ++ list_add_tail(&lp->lo_perfile, &fp->fi_layouts); ++ spin_unlock(&layout_lock); ++ dprintk("pNFS %s end\n", __func__); ++} ++ ++static void ++dequeue_layout(struct nfs4_layout *lp) ++{ ++ BUG_ON_UNLOCKED_LAYOUT(); ++ list_del(&lp->lo_perclnt); ++ list_del(&lp->lo_perfile); ++ 
list_del(&lp->lo_perstate); ++} ++ ++static void ++destroy_layout(struct nfs4_layout *lp) ++{ ++ struct nfs4_client *clp; ++ struct nfs4_file *fp; ++ struct nfs4_layout_state *ls; ++ ++ BUG_ON_UNLOCKED_LAYOUT(); ++ clp = lp->lo_client; ++ fp = lp->lo_file; ++ ls = lp->lo_state; ++ dprintk("pNFS %s: lp %p clp %p fp %p ino %p ls_layouts empty %d\n", ++ __func__, lp, clp, fp, fp->fi_inode, ++ list_empty(&ls->ls_layouts)); ++ ++ kmem_cache_free(pnfs_layout_slab, lp); ++ /* release references taken by init_layout */ ++ put_layout_state_locked(ls); ++ put_nfs4_file(fp); ++} ++ ++void fs_layout_return(struct super_block *sb, struct inode *ino, ++ struct nfsd4_pnfs_layoutreturn *lrp, int flags, ++ void *recall_cookie) ++{ ++ int ret; ++ ++ if (unlikely(!sb->s_pnfs_op->layout_return)) ++ return; ++ ++ lrp->lr_flags = flags; ++ lrp->args.lr_cookie = recall_cookie; ++ ++ if (!ino) /* FSID or ALL */ ++ ino = sb->s_root->d_inode; ++ ++ ret = sb->s_pnfs_op->layout_return(ino, &lrp->args); ++ dprintk("%s: inode %lu iomode=%d offset=0x%llx length=0x%llx " ++ "cookie = %p flags 0x%x status=%d\n", ++ __func__, ino->i_ino, lrp->args.lr_seg.iomode, ++ lrp->args.lr_seg.offset, lrp->args.lr_seg.length, ++ recall_cookie, flags, ret); ++} ++ ++static u64 ++alloc_init_sbid(struct super_block *sb) ++{ ++ struct sbid_tracker *sbid; ++ struct sbid_tracker *new = alloc_sbid(); ++ unsigned long hash_idx = sbid_hashval(sb); ++ u64 id = 0; ++ ++ if (likely(new)) { ++ spin_lock(&layout_lock); ++ id = ++current_sbid; ++ new->id = (id << SBID_HASH_BITS) | (hash_idx & SBID_HASH_MASK); ++ id = new->id; ++ BUG_ON(id == 0); ++ new->sb = sb; ++ ++ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) ++ if (sbid->sb == sb) { ++ kfree(new); ++ id = sbid->id; ++ spin_unlock(&layout_lock); ++ return id; ++ } ++ list_add(&new->hash, &sbid_hashtbl[hash_idx]); ++ spin_unlock(&layout_lock); ++ } ++ return id; ++} ++ ++struct super_block * ++find_sbid_id(u64 id) ++{ ++ struct sbid_tracker *sbid; ++ struct 
super_block *sb = NULL; ++ unsigned long hash_idx = id & SBID_HASH_MASK; ++ int pos = 0; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) { ++ pos++; ++ if (sbid->id != id) ++ continue; ++ if (pos > 1) ++ list_move(&sbid->hash, &sbid_hashtbl[hash_idx]); ++ sb = sbid->sb; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ return sb; ++} ++ ++u64 ++find_create_sbid(struct super_block *sb) ++{ ++ struct sbid_tracker *sbid; ++ unsigned long hash_idx = sbid_hashval(sb); ++ int pos = 0; ++ u64 id = 0; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) { ++ pos++; ++ if (sbid->sb != sb) ++ continue; ++ if (pos > 1) ++ list_move(&sbid->hash, &sbid_hashtbl[hash_idx]); ++ id = sbid->id; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ ++ if (!id) ++ id = alloc_init_sbid(sb); ++ ++ return id; ++} ++ ++/* ++ * Create a layoutrecall structure ++ * An optional layoutrecall can be cloned (except for the layoutrecall lists) ++ */ ++static struct nfs4_layoutrecall * ++alloc_init_layoutrecall(struct nfsd4_pnfs_cb_layout *cbl, ++ struct nfs4_client *clp, ++ struct nfs4_file *lrfile) ++{ ++ struct nfs4_layoutrecall *clr; ++ ++ dprintk("NFSD %s\n", __func__); ++ clr = kmem_cache_alloc(pnfs_layoutrecall_slab, GFP_KERNEL); ++ if (clr == NULL) ++ return clr; ++ ++ dprintk("NFSD %s -->\n", __func__); ++ ++ memset(clr, 0, sizeof(*clr)); ++ if (lrfile) ++ get_nfs4_file(lrfile); ++ clr->clr_client = clp; ++ clr->clr_file = lrfile; ++ clr->cb = *cbl; ++ ++ kref_init(&clr->clr_ref); ++ INIT_LIST_HEAD(&clr->clr_perclnt); ++ INIT_WORK(&clr->clr_recall.cb_work, nfsd4_do_callback_rpc); ++ ++ dprintk("NFSD %s return %p\n", __func__, clr); ++ return clr; ++} ++ ++static void ++get_layoutrecall(struct nfs4_layoutrecall *clr) ++{ ++ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr, ++ atomic_read(&clr->clr_ref.refcount)); ++ kref_get(&clr->clr_ref); ++} ++ ++static void ++destroy_layoutrecall(struct kref *kref) ++{ ++ 
struct nfs4_layoutrecall *clr = ++ container_of(kref, struct nfs4_layoutrecall, clr_ref); ++ dprintk("pNFS %s: clr %p fp %p clp %p\n", __func__, clr, ++ clr->clr_file, clr->clr_client); ++ BUG_ON(!list_empty(&clr->clr_perclnt)); ++ if (clr->clr_file) ++ put_nfs4_file(clr->clr_file); ++ kmem_cache_free(pnfs_layoutrecall_slab, clr); ++} ++ ++int ++put_layoutrecall(struct nfs4_layoutrecall *clr) ++{ ++ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr, ++ atomic_read(&clr->clr_ref.refcount)); ++ return kref_put(&clr->clr_ref, destroy_layoutrecall); ++} ++ ++void * ++layoutrecall_done(struct nfs4_layoutrecall *clr) ++{ ++ void *recall_cookie = clr->cb.cbl_cookie; ++ struct nfs4_layoutrecall *parent = clr->parent; ++ ++ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr, ++ atomic_read(&clr->clr_ref.refcount)); ++ BUG_ON_UNLOCKED_LAYOUT(); ++ list_del_init(&clr->clr_perclnt); ++ put_layoutrecall(clr); ++ ++ if (parent && !put_layoutrecall(parent)) ++ recall_cookie = NULL; ++ ++ return recall_cookie; ++} ++ ++/* ++ * get_state() and cb_get_state() are ++ */ ++void ++release_pnfs_ds_dev_list(struct nfs4_stateid *stp) ++{ ++ struct pnfs_ds_dev_entry *ddp; ++ ++ while (!list_empty(&stp->st_pnfs_ds_id)) { ++ ddp = list_entry(stp->st_pnfs_ds_id.next, ++ struct pnfs_ds_dev_entry, dd_dev_entry); ++ list_del(&ddp->dd_dev_entry); ++ kfree(ddp); ++ } ++} ++ ++static int ++nfs4_add_pnfs_ds_dev(struct nfs4_stateid *stp, u32 dsid) ++{ ++ struct pnfs_ds_dev_entry *ddp; ++ ++ ddp = kmalloc(sizeof(*ddp), GFP_KERNEL); ++ if (!ddp) ++ return -ENOMEM; ++ ++ INIT_LIST_HEAD(&ddp->dd_dev_entry); ++ list_add(&ddp->dd_dev_entry, &stp->st_pnfs_ds_id); ++ ddp->dd_dsid = dsid; ++ return 0; ++} ++ ++/* ++ * are two octet ranges overlapping? 
++ * start1 last1 ++ * |-----------------| ++ * start2 last2 ++ * |----------------| ++ */ ++static inline int ++lo_seg_overlapping(struct nfsd4_layout_seg *l1, struct nfsd4_layout_seg *l2) ++{ ++ u64 start1 = l1->offset; ++ u64 last1 = last_byte_offset(start1, l1->length); ++ u64 start2 = l2->offset; ++ u64 last2 = last_byte_offset(start2, l2->length); ++ int ret; ++ ++ /* if last1 == start2 there's a single byte overlap */ ++ ret = (last2 >= start1) && (last1 >= start2); ++ dprintk("%s: l1 %llu:%lld l2 %llu:%lld ret=%d\n", __func__, ++ l1->offset, l1->length, l2->offset, l2->length, ret); ++ return ret; ++} ++ ++static inline int ++same_fsid_major(struct nfs4_fsid *fsid, u64 major) ++{ ++ return fsid->major == major; ++} ++ ++static inline int ++same_fsid(struct nfs4_fsid *fsid, struct svc_fh *current_fh) ++{ ++ return same_fsid_major(fsid, current_fh->fh_export->ex_fsid); ++} ++ ++/* ++ * find a layout recall conflicting with the specified layoutget ++ */ ++static int ++is_layout_recalled(struct nfs4_client *clp, ++ struct svc_fh *current_fh, ++ struct nfsd4_layout_seg *seg) ++{ ++ struct nfs4_layoutrecall *clr; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry (clr, &clp->cl_layoutrecalls, clr_perclnt) { ++ if (clr->cb.cbl_seg.layout_type != seg->layout_type) ++ continue; ++ if (clr->cb.cbl_recall_type == RETURN_ALL) ++ goto found; ++ if (clr->cb.cbl_recall_type == RETURN_FSID) { ++ if (same_fsid(&clr->cb.cbl_fsid, current_fh)) ++ goto found; ++ else ++ continue; ++ } ++ BUG_ON(clr->cb.cbl_recall_type != RETURN_FILE); ++ if (clr->cb.cbl_seg.clientid == seg->clientid && ++ lo_seg_overlapping(&clr->cb.cbl_seg, seg)) ++ goto found; ++ } ++ spin_unlock(&layout_lock); ++ return 0; ++found: ++ spin_unlock(&layout_lock); ++ return 1; ++} ++ ++/* ++ * are two octet ranges overlapping or adjacent? 
++ */ ++static inline int ++lo_seg_mergeable(struct nfsd4_layout_seg *l1, struct nfsd4_layout_seg *l2) ++{ ++ u64 start1 = l1->offset; ++ u64 end1 = end_offset(start1, l1->length); ++ u64 start2 = l2->offset; ++ u64 end2 = end_offset(start2, l2->length); ++ ++ /* if end1 == start2 the ranges are adjacent */ ++ return (end2 >= start1) && (end1 >= start2); ++} ++ ++static void ++extend_layout(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *lg) ++{ ++ u64 lo_start = lo->offset; ++ u64 lo_end = end_offset(lo_start, lo->length); ++ u64 lg_start = lg->offset; ++ u64 lg_end = end_offset(lg_start, lg->length); ++ ++ /* lo already covers lg? */ ++ if (lo_start <= lg_start && lg_end <= lo_end) ++ return; ++ ++ /* extend start offset */ ++ if (lo_start > lg_start) ++ lo_start = lg_start; ++ ++ /* extend end offset */ ++ if (lo_end < lg_end) ++ lo_end = lg_end; ++ ++ lo->offset = lo_start; ++ lo->length = (lo_end == NFS4_MAX_UINT64) ? ++ lo_end : lo_end - lo_start; ++} ++ ++static struct nfs4_layout * ++merge_layout(struct nfs4_file *fp, ++ struct nfs4_client *clp, ++ struct nfsd4_layout_seg *seg) ++{ ++ struct nfs4_layout *lp, *found = NULL; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry (lp, &fp->fi_layouts, lo_perfile) ++ if (lp->lo_seg.layout_type == seg->layout_type && ++ lp->lo_seg.clientid == seg->clientid && ++ lp->lo_seg.iomode == seg->iomode && ++ lo_seg_mergeable(&lp->lo_seg, seg)) { ++ extend_layout(&lp->lo_seg, seg); ++ found = lp; /* lp is not NULL at list end */ ++ break; ++ } ++ spin_unlock(&layout_lock); ++ return found; /* NULL if nothing was merged */ ++} ++ ++__be32 ++nfs4_pnfs_get_layout(struct nfsd4_pnfs_layoutget *lgp, ++ struct exp_xdr_stream *xdr) ++{ ++ u32 status; ++ __be32 nfserr; ++ struct inode *ino = lgp->lg_fhp->fh_dentry->d_inode; ++ struct super_block *sb = ino->i_sb; ++ int can_merge; ++ struct nfs4_file *fp; ++ struct nfs4_client *clp; ++ struct nfs4_layout *lp = NULL; ++ struct nfs4_layout_state *ls = NULL; ++ struct nfsd4_pnfs_layoutget_arg args = { ++ .lg_minlength = lgp->lg_minlength, ++ .lg_fh = 
&lgp->lg_fhp->fh_handle, ++ }; ++ struct nfsd4_pnfs_layoutget_res res = { ++ .lg_seg = lgp->lg_seg, ++ }; ++ ++ dprintk("NFSD: %s Begin\n", __func__); ++ ++ args.lg_sbid = find_create_sbid(sb); ++ if (!args.lg_sbid) { ++ nfserr = nfserr_layouttrylater; ++ goto out; ++ } ++ ++ can_merge = sb->s_pnfs_op->can_merge_layouts != NULL && ++ sb->s_pnfs_op->can_merge_layouts(lgp->lg_seg.layout_type); ++ ++ nfs4_lock_state(); ++ fp = find_alloc_file(ino, lgp->lg_fhp); ++ clp = find_confirmed_client((clientid_t *)&lgp->lg_seg.clientid); ++ dprintk("pNFS %s: fp %p clp %p \n", __func__, fp, clp); ++ if (!fp || !clp) { ++ nfserr = nfserr_inval; ++ goto out_unlock; ++ } ++ ++ /* Check decoded layout stateid */ ++ nfserr = nfs4_process_layout_stateid(clp, fp, &lgp->lg_sid, &ls); ++ if (nfserr) ++ goto out_unlock; ++ ++ if (is_layout_recalled(clp, lgp->lg_fhp, &lgp->lg_seg)) { ++ nfserr = nfserr_recallconflict; ++ goto out; ++ } ++ ++ /* pre-alloc layout in case we can't merge after we call ++ * the file system ++ */ ++ lp = alloc_layout(); ++ if (!lp) { ++ nfserr = nfserr_layouttrylater; ++ goto out_unlock; ++ } ++ ++ dprintk("pNFS %s: pre-export type 0x%x maxcount %Zd " ++ "iomode %u offset %llu length %llu\n", ++ __func__, lgp->lg_seg.layout_type, ++ exp_xdr_qbytes(xdr->end - xdr->p), ++ lgp->lg_seg.iomode, lgp->lg_seg.offset, lgp->lg_seg.length); ++ ++ /* FIXME: need to eliminate the use of the state lock */ ++ nfs4_unlock_state(); ++ status = sb->s_pnfs_op->layout_get(ino, xdr, &args, &res); ++ nfs4_lock_state(); ++ ++ dprintk("pNFS %s: post-export status %u " ++ "iomode %u offset %llu length %llu\n", ++ __func__, status, res.lg_seg.iomode, ++ res.lg_seg.offset, res.lg_seg.length); ++ ++ /* ++ * The allowable error codes for the layout_get pNFS export ++ * operations vector function (from the file system) can be ++ * expanded as needed to include other errors defined for ++ * the RFC 5561 LAYOUTGET operation. 
++ */ ++ switch (status) { ++ case 0: ++ nfserr = NFS4_OK; ++ break; ++ case NFS4ERR_ACCESS: ++ case NFS4ERR_BADIOMODE: ++ /* No support for LAYOUTIOMODE4_RW layouts */ ++ case NFS4ERR_BADLAYOUT: ++ /* No layout matching loga_minlength rules */ ++ case NFS4ERR_INVAL: ++ case NFS4ERR_IO: ++ case NFS4ERR_LAYOUTTRYLATER: ++ case NFS4ERR_LAYOUTUNAVAILABLE: ++ case NFS4ERR_LOCKED: ++ case NFS4ERR_NOSPC: ++ case NFS4ERR_RECALLCONFLICT: ++ case NFS4ERR_SERVERFAULT: ++ case NFS4ERR_TOOSMALL: ++ /* Requested layout too big for loga_maxcount */ ++ case NFS4ERR_WRONG_TYPE: ++ /* Not a regular file */ ++ nfserr = cpu_to_be32(status); ++ goto out_freelayout; ++ default: ++ BUG(); ++ nfserr = nfserr_serverfault; ++ } ++ ++ lgp->lg_seg = res.lg_seg; ++ lgp->lg_roc = res.lg_return_on_close; ++ ++ /* SUCCESS! ++ * Can the new layout be merged into an existing one? ++ * If so, free unused layout struct ++ */ ++ if (can_merge && merge_layout(fp, clp, &res.lg_seg)) ++ goto out_freelayout; ++ ++ /* Can't merge, so let's initialize this new layout */ ++ init_layout(ls, lp, fp, clp, lgp->lg_fhp, &res.lg_seg, &lgp->lg_sid); ++out_unlock: ++ if (ls) ++ put_layout_state(ls); ++ if (fp) ++ put_nfs4_file(fp); ++ nfs4_unlock_state(); ++out: ++ dprintk("pNFS %s: lp %p exit nfserr %u\n", __func__, lp, ++ be32_to_cpu(nfserr)); ++ return nfserr; ++out_freelayout: ++ free_layout(lp); ++ goto out_unlock; ++} ++ ++static void ++trim_layout(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *lr) ++{ ++ u64 lo_start = lo->offset; ++ u64 lo_end = end_offset(lo_start, lo->length); ++ u64 lr_start = lr->offset; ++ u64 lr_end = end_offset(lr_start, lr->length); ++ ++ dprintk("%s:Begin lo %llu:%lld lr %llu:%lld\n", __func__, ++ lo->offset, lo->length, lr->offset, lr->length); ++ ++ /* lr fully covers lo? */ ++ if (lr_start <= lo_start && lo_end <= lr_end) { ++ lo->length = 0; ++ goto out; ++ } ++ ++ /* ++ * split not supported yet. retain layout segment. 
++ * remains must be returned by the client ++ * on the final layout return. ++ */ ++ if (lo_start < lr_start && lr_end < lo_end) { ++ dprintk("%s: split not supported\n", __func__); ++ goto out; ++ } ++ ++ if (lo_start < lr_start) ++ lo_end = lr_start - 1; ++ else /* lr_end < lo_end */ ++ lo_start = lr_end + 1; ++ ++ lo->offset = lo_start; ++ lo->length = (lo_end == NFS4_MAX_UINT64) ? lo_end : lo_end - lo_start; ++out: ++ dprintk("%s:End lo %llu:%lld\n", __func__, lo->offset, lo->length); ++} ++ ++static int ++pnfs_return_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp, ++ struct nfsd4_pnfs_layoutreturn *lrp, ++ struct nfs4_layout_state *ls) ++{ ++ int layouts_found = 0; ++ struct nfs4_layout *lp, *nextlp; ++ ++ dprintk("%s: clp %p fp %p\n", __func__, clp, fp); ++ spin_lock(&layout_lock); ++ list_for_each_entry_safe (lp, nextlp, &fp->fi_layouts, lo_perfile) { ++ dprintk("%s: lp %p client %p,%p lo_type %x,%x iomode %d,%d\n", ++ __func__, lp, ++ lp->lo_client, clp, ++ lp->lo_seg.layout_type, lrp->args.lr_seg.layout_type, ++ lp->lo_seg.iomode, lrp->args.lr_seg.iomode); ++ if (lp->lo_client != clp || ++ lp->lo_seg.layout_type != lrp->args.lr_seg.layout_type || ++ (lp->lo_seg.iomode != lrp->args.lr_seg.iomode && ++ lrp->args.lr_seg.iomode != IOMODE_ANY) || ++ !lo_seg_overlapping(&lp->lo_seg, &lrp->args.lr_seg)) ++ continue; ++ layouts_found++; ++ trim_layout(&lp->lo_seg, &lrp->args.lr_seg); ++ if (!lp->lo_seg.length) { ++ lrp->lrs_present = 0; ++ dequeue_layout(lp); ++ destroy_layout(lp); ++ } ++ } ++ if (ls && layouts_found && lrp->lrs_present) ++ update_layout_stateid(ls, &lrp->lr_sid); ++ spin_unlock(&layout_lock); ++ ++ return layouts_found; ++} ++ ++static int ++pnfs_return_client_layouts(struct nfs4_client *clp, ++ struct nfsd4_pnfs_layoutreturn *lrp, u64 ex_fsid) ++{ ++ int layouts_found = 0; ++ struct nfs4_layout *lp, *nextlp; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry_safe (lp, nextlp, &clp->cl_layouts, lo_perclnt) { ++ if 
(lrp->args.lr_seg.layout_type != lp->lo_seg.layout_type || ++ (lrp->args.lr_seg.iomode != lp->lo_seg.iomode && ++ lrp->args.lr_seg.iomode != IOMODE_ANY)) ++ continue; ++ ++ if (lrp->args.lr_return_type == RETURN_FSID && ++ !same_fsid_major(&lp->lo_file->fi_fsid, ex_fsid)) ++ continue; ++ ++ layouts_found++; ++ dequeue_layout(lp); ++ destroy_layout(lp); ++ } ++ spin_unlock(&layout_lock); ++ ++ return layouts_found; ++} ++ ++static int ++recall_return_perfect_match(struct nfs4_layoutrecall *clr, ++ struct nfsd4_pnfs_layoutreturn *lrp, ++ struct nfs4_file *fp, ++ struct svc_fh *current_fh) ++{ ++ if (clr->cb.cbl_seg.iomode != lrp->args.lr_seg.iomode || ++ clr->cb.cbl_recall_type != lrp->args.lr_return_type) ++ return 0; ++ ++ return (clr->cb.cbl_recall_type == RETURN_FILE && ++ clr->clr_file == fp && ++ clr->cb.cbl_seg.offset == lrp->args.lr_seg.offset && ++ clr->cb.cbl_seg.length == lrp->args.lr_seg.length) || ++ ++ (clr->cb.cbl_recall_type == RETURN_FSID && ++ same_fsid(&clr->cb.cbl_fsid, current_fh)) || ++ ++ clr->cb.cbl_recall_type == RETURN_ALL; ++} ++ ++static int ++recall_return_partial_match(struct nfs4_layoutrecall *clr, ++ struct nfsd4_pnfs_layoutreturn *lrp, ++ struct nfs4_file *fp, ++ struct svc_fh *current_fh) ++{ ++ /* iomode matching? */ ++ if (clr->cb.cbl_seg.iomode != lrp->args.lr_seg.iomode && ++ clr->cb.cbl_seg.iomode != IOMODE_ANY && ++ lrp->args.lr_seg.iomode != IOMODE_ANY) ++ return 0; ++ ++ if (clr->cb.cbl_recall_type == RETURN_ALL || ++ lrp->args.lr_return_type == RETURN_ALL) ++ return 1; ++ ++ /* fsid matches? */ ++ if (clr->cb.cbl_recall_type == RETURN_FSID || ++ lrp->args.lr_return_type == RETURN_FSID) ++ return same_fsid(&clr->cb.cbl_fsid, current_fh); ++ ++ /* file matches, range overlapping? 
*/ ++ return clr->clr_file == fp && ++ lo_seg_overlapping(&clr->cb.cbl_seg, &lrp->args.lr_seg); ++} ++ ++int nfs4_pnfs_return_layout(struct super_block *sb, struct svc_fh *current_fh, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ int status = 0; ++ int layouts_found = 0; ++ struct inode *ino = current_fh->fh_dentry->d_inode; ++ struct nfs4_file *fp = NULL; ++ struct nfs4_client *clp; ++ struct nfs4_layout_state *ls = NULL; ++ struct nfs4_layoutrecall *clr, *nextclr; ++ u64 ex_fsid = current_fh->fh_export->ex_fsid; ++ void *recall_cookie = NULL; ++ ++ dprintk("NFSD: %s\n", __func__); ++ ++ nfs4_lock_state(); ++ clp = find_confirmed_client((clientid_t *)&lrp->args.lr_seg.clientid); ++ if (!clp) ++ goto out; ++ ++ if (lrp->args.lr_return_type == RETURN_FILE) { ++ fp = find_file(ino); ++ if (!fp) { ++ printk(KERN_ERR "%s: RETURN_FILE: no nfs4_file for " ++ "ino %p:%lu\n", ++ __func__, ino, ino ? ino->i_ino : 0L); ++ goto out; ++ } ++ ++ /* Check the stateid */ ++ dprintk("%s PROCESS LO_STATEID inode %p\n", __func__, ino); ++ status = nfs4_process_layout_stateid(clp, fp, &lrp->lr_sid, &ls); ++ if (status) ++ goto out_put_file; ++ ++ /* update layouts */ ++ layouts_found = pnfs_return_file_layouts(clp, fp, lrp, ls); ++ /* optimize for the all-empty case */ ++ if (list_empty(&fp->fi_layouts)) ++ recall_cookie = PNFS_LAST_LAYOUT_NO_RECALLS; ++ } else { ++ layouts_found = pnfs_return_client_layouts(clp, lrp, ex_fsid); ++ } ++ ++ dprintk("pNFS %s: clp %p fp %p layout_type 0x%x iomode %d " ++ "return_type %d fsid 0x%llx offset %llu length %llu: " ++ "layouts_found %d\n", ++ __func__, clp, fp, lrp->args.lr_seg.layout_type, ++ lrp->args.lr_seg.iomode, lrp->args.lr_return_type, ++ ex_fsid, ++ lrp->args.lr_seg.offset, lrp->args.lr_seg.length, layouts_found); ++ ++ /* update layoutrecalls ++ * note: for RETURN_{FSID,ALL}, fp may be NULL ++ */ ++ spin_lock(&layout_lock); ++ list_for_each_entry_safe (clr, nextclr, &clp->cl_layoutrecalls, ++ clr_perclnt) { ++ if 
(clr->cb.cbl_seg.layout_type != lrp->args.lr_seg.layout_type) ++ continue; ++ ++ if (recall_return_perfect_match(clr, lrp, fp, current_fh)) ++ recall_cookie = layoutrecall_done(clr); ++ else if (layouts_found && ++ recall_return_partial_match(clr, lrp, fp, current_fh)) ++ clr->clr_time = CURRENT_TIME; ++ } ++ spin_unlock(&layout_lock); ++ ++out_put_file: ++ if (fp) ++ put_nfs4_file(fp); ++ if (ls) ++ put_layout_state(ls); ++out: ++ nfs4_unlock_state(); ++ ++ /* call exported filesystem layout_return (ignore return-code) */ ++ fs_layout_return(sb, ino, lrp, 0, recall_cookie); ++ ++ dprintk("pNFS %s: exit status %d \n", __func__, status); ++ return status; ++} ++ ++/* ++ * PNFS Metadata server export operations callback for get_state ++ * ++ * called by the cluster fs when it receives a get_state() from a data ++ * server. ++ * returns status, or pnfs_get_state* with pnfs_get_state->status set. ++ * ++ */ ++int ++nfs4_pnfs_cb_get_state(struct super_block *sb, struct pnfs_get_state *arg) ++{ ++ struct nfs4_stateid *stp; ++ int flags = LOCK_STATE | OPEN_STATE; /* search both hash tables */ ++ int status = -EINVAL; ++ struct inode *ino; ++ struct nfs4_delegation *dl; ++ stateid_t *stid = (stateid_t *)&arg->stid; ++ ++ dprintk("NFSD: %s sid=" STATEID_FMT " ino %llu\n", __func__, ++ STATEID_VAL(stid), arg->ino); ++ ++ nfs4_lock_state(); ++ stp = find_stateid(stid, flags); ++ if (!stp) { ++ ino = iget_locked(sb, arg->ino); ++ if (!ino) ++ goto out; ++ ++ if (ino->i_state & I_NEW) { ++ iget_failed(ino); ++ goto out; ++ } ++ ++ dl = find_delegation_stateid(ino, stid); ++ if (dl) ++ status = 0; ++ ++ iput(ino); ++ } else { ++ /* XXX ANDROS: marc removed nfs4_check_fh - how come? 
*/ ++ ++ /* arg->devid is the Data server id, set by the cluster fs */ ++ status = nfs4_add_pnfs_ds_dev(stp, arg->dsid); ++ if (status) ++ goto out; ++ ++ arg->access = stp->st_access_bmap; ++ *(clientid_t *)&arg->clid = ++ stp->st_stateowner->so_client->cl_clientid; ++ } ++out: ++ nfs4_unlock_state(); ++ return status; ++} ++ ++static int ++cl_has_file_layout(struct nfs4_client *clp, struct nfs4_file *lrfile, ++ stateid_t *lsid) ++{ ++ int found = 0; ++ struct nfs4_layout *lp; ++ struct nfs4_layout_state *ls; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry(lp, &clp->cl_layouts, lo_perclnt) { ++ if (lp->lo_file != lrfile) ++ continue; ++ ++ ls = find_get_layout_state(clp, lrfile); ++ if (!ls) { ++ /* This shouldn't happen as the file should have a ++ * layout stateid if it has a layout. ++ */ ++ printk(KERN_ERR "%s: file %p has no layout stateid\n", ++ __func__, lrfile); ++ WARN_ON(1); ++ break; ++ } ++ update_layout_stateid(ls, lsid); ++ put_layout_state_locked(ls); ++ found = 1; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ ++ return found; ++} ++ ++static int ++cl_has_fsid_layout(struct nfs4_client *clp, struct nfs4_fsid *fsid) ++{ ++ int found = 0; ++ struct nfs4_layout *lp; ++ ++ /* note: minor version unused */ ++ spin_lock(&layout_lock); ++ list_for_each_entry(lp, &clp->cl_layouts, lo_perclnt) ++ if (lp->lo_file->fi_fsid.major == fsid->major) { ++ found = 1; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ return found; ++} ++ ++static int ++cl_has_any_layout(struct nfs4_client *clp) ++{ ++ return !list_empty(&clp->cl_layouts); ++} ++ ++static int ++cl_has_layout(struct nfs4_client *clp, struct nfsd4_pnfs_cb_layout *cbl, ++ struct nfs4_file *lrfile, stateid_t *lsid) ++{ ++ switch (cbl->cbl_recall_type) { ++ case RETURN_FILE: ++ return cl_has_file_layout(clp, lrfile, lsid); ++ case RETURN_FSID: ++ return cl_has_fsid_layout(clp, &cbl->cbl_fsid); ++ default: ++ return cl_has_any_layout(clp); ++ } ++} ++ ++/* ++ * Called without the layout_lock. 
++ */ ++void ++nomatching_layout(struct nfs4_layoutrecall *clr) ++{ ++ struct nfsd4_pnfs_layoutreturn lr = { ++ .args.lr_return_type = clr->cb.cbl_recall_type, ++ .args.lr_seg = clr->cb.cbl_seg, ++ }; ++ struct inode *inode; ++ void *recall_cookie; ++ ++ if (clr->clr_file) { ++ inode = igrab(clr->clr_file->fi_inode); ++ if (WARN_ON(!inode)) ++ return; ++ } else { ++ inode = NULL; ++ } ++ ++ dprintk("%s: clp %p fp %p: simulating layout_return\n", __func__, ++ clr->clr_client, clr->clr_file); ++ ++ if (clr->cb.cbl_recall_type == RETURN_FILE) ++ pnfs_return_file_layouts(clr->clr_client, clr->clr_file, &lr, ++ NULL); ++ else ++ pnfs_return_client_layouts(clr->clr_client, &lr, ++ clr->cb.cbl_fsid.major); ++ ++ spin_lock(&layout_lock); ++ recall_cookie = layoutrecall_done(clr); ++ spin_unlock(&layout_lock); ++ ++ fs_layout_return(clr->clr_sb, inode, &lr, LR_FLAG_INTERN, ++ recall_cookie); ++ iput(inode); ++} ++ ++void pnfs_expire_client(struct nfs4_client *clp) ++{ ++ for (;;) { ++ struct nfs4_layoutrecall *lrp = NULL; ++ ++ spin_lock(&layout_lock); ++ if (!list_empty(&clp->cl_layoutrecalls)) { ++ lrp = list_entry(clp->cl_layoutrecalls.next, ++ struct nfs4_layoutrecall, clr_perclnt); ++ get_layoutrecall(lrp); ++ } ++ spin_unlock(&layout_lock); ++ if (!lrp) ++ break; ++ ++ dprintk("%s: lrp %p, fp %p\n", __func__, lrp, lrp->clr_file); ++ BUG_ON(lrp->clr_client != clp); ++ nomatching_layout(lrp); ++ put_layoutrecall(lrp); ++ } ++ ++ for (;;) { ++ struct nfs4_layout *lp = NULL; ++ struct inode *inode = NULL; ++ struct nfsd4_pnfs_layoutreturn lr; ++ bool empty = false; ++ ++ spin_lock(&layout_lock); ++ if (!list_empty(&clp->cl_layouts)) { ++ lp = list_entry(clp->cl_layouts.next, ++ struct nfs4_layout, lo_perclnt); ++ inode = igrab(lp->lo_file->fi_inode); ++ memset(&lr, 0, sizeof(lr)); ++ lr.args.lr_return_type = RETURN_FILE; ++ lr.args.lr_seg = lp->lo_seg; ++ BUG_ON(lp->lo_client != clp); ++ dequeue_layout(lp); ++ empty = list_empty(&lp->lo_file->fi_layouts); /* after unlink */ ++ 
destroy_layout(lp); /* do not access lp after this */ ++ } ++ spin_unlock(&layout_lock); ++ if (!lp) ++ break; ++ ++ if (WARN_ON(!inode)) ++ break; ++ ++ dprintk("%s: inode %lu lp %p clp %p\n", __func__, inode->i_ino, ++ lp, clp); ++ ++ fs_layout_return(inode->i_sb, inode, &lr, LR_FLAG_EXPIRE, ++ empty ? PNFS_LAST_LAYOUT_NO_RECALLS : NULL); ++ iput(inode); ++ } ++} ++ ++struct create_recall_list_arg { ++ struct nfsd4_pnfs_cb_layout *cbl; ++ struct nfs4_file *lrfile; ++ struct list_head *todolist; ++ unsigned todo_count; ++}; ++ ++/* ++ * look for matching layout for the given client ++ * and add a pending layout recall to the todo list ++ * if found any. ++ * returns: ++ * 0 if layouts found or negative error. ++ */ ++static int ++lo_recall_per_client(struct nfs4_client *clp, void *p) ++{ ++ stateid_t lsid; ++ struct nfs4_layoutrecall *pending; ++ struct create_recall_list_arg *arg = p; ++ ++ memset(&lsid, 0, sizeof(lsid)); ++ if (!cl_has_layout(clp, arg->cbl, arg->lrfile, &lsid)) ++ return 0; ++ ++ /* Matching put done by layoutreturn */ ++ pending = alloc_init_layoutrecall(arg->cbl, clp, arg->lrfile); ++ /* out of memory, drain todo queue */ ++ if (!pending) ++ return -ENOMEM; ++ ++ *(stateid_t *)&pending->cb.cbl_sid = lsid; ++ list_add(&pending->clr_perclnt, arg->todolist); ++ arg->todo_count++; ++ return 0; ++} ++ ++/* Create a layoutrecall structure for each client based on the ++ * original structure. 
*/ ++int ++create_layout_recall_list(struct list_head *todolist, unsigned *todo_len, ++ struct nfsd4_pnfs_cb_layout *cbl, ++ struct nfs4_file *lrfile) ++{ ++ struct nfs4_client *clp; ++ struct create_recall_list_arg arg = { ++ .cbl = cbl, ++ .lrfile = lrfile, ++ .todolist = todolist, ++ }; ++ int status = 0; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ /* If client given by fs, just do single client */ ++ if (cbl->cbl_seg.clientid) { ++ clp = find_confirmed_client( ++ (clientid_t *)&cbl->cbl_seg.clientid); ++ if (!clp) { ++ status = -ENOENT; ++ dprintk("%s: clientid %llx not found\n", __func__, ++ (unsigned long long)cbl->cbl_seg.clientid); ++ goto out; ++ } ++ ++ status = lo_recall_per_client(clp, &arg); ++ } else { ++ /* Check all clients for layout matches */ ++ status = filter_confirmed_clients(lo_recall_per_client, &arg); ++ } ++ ++out: ++ *todo_len = arg.todo_count; ++ dprintk("%s: <-- list len %u status %d\n", __func__, *todo_len, status); ++ return status; ++} ++ ++/* ++ * Recall layouts asynchronously ++ * Called with state lock. ++ */ ++static int ++spawn_layout_recall(struct super_block *sb, struct list_head *todolist, ++ unsigned todo_len) ++{ ++ struct nfs4_layoutrecall *pending; ++ struct nfs4_layoutrecall *parent = NULL; ++ int status = 0; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ if (todo_len > 1) { ++ pending = list_entry(todolist->next, struct nfs4_layoutrecall, ++ clr_perclnt); ++ ++ parent = alloc_init_layoutrecall(&pending->cb, NULL, ++ pending->clr_file); ++ if (unlikely(!parent)) { ++ /* We want forward progress. If parent cannot be ++ * allocated take the first one as parent but don't ++ * execute it. Caller must check for -EAGAIN, if so ++ * When the partial recalls return, ++ * nfsd_layout_recall_cb should be called again. 
++ */ ++ list_del_init(&pending->clr_perclnt); ++ if (todo_len > 2) { ++ parent = pending; ++ } else { ++ parent = NULL; ++ put_layoutrecall(pending); ++ } ++ --todo_len; ++ status = -ENOMEM; ++ } ++ } ++ ++ while (!list_empty(todolist)) { ++ pending = list_entry(todolist->next, struct nfs4_layoutrecall, ++ clr_perclnt); ++ list_del_init(&pending->clr_perclnt); ++ dprintk("%s: clp %p cb_client %p fp %p\n", __func__, ++ pending->clr_client, ++ pending->clr_client->cl_cb_client, ++ pending->clr_file); ++ if (unlikely(!pending->clr_client->cl_cb_client)) { ++ printk(KERN_INFO ++ "%s: clientid %08x/%08x has no callback path\n", ++ __func__, ++ pending->clr_client->cl_clientid.cl_boot, ++ pending->clr_client->cl_clientid.cl_id); ++ put_layoutrecall(pending); ++ continue; ++ } ++ ++ pending->clr_time = CURRENT_TIME; ++ pending->clr_sb = sb; ++ if (parent) { ++ /* If we created a parent its initial ref count is 1. ++ * We will need to de-ref it eventually. So we just ++ * don't increment on behalf of the last one. 
++ */
++			if (todo_len != 1)
++				get_layoutrecall(parent);
++		}
++		pending->parent = parent;
++		get_layoutrecall(pending);
++		/* Add to list so corresponding layoutreturn can find req */
++		list_add(&pending->clr_perclnt,
++			 &pending->clr_client->cl_layoutrecalls);
++
++		nfsd4_cb_layout(pending);
++		--todo_len;
++	}
++
++	return status;
++}
++
++/*
++ * nfsd_layout_recall_cb - start a layout recall requested by the file system
++ * @sb:    super block, passed through to spawn_layout_recall()
++ * @inode: inode whose nfs4_file is looked up; required for RETURN_FILE
++ * @cbl:   recall request; cbl_cookie is defaulted to
++ *         PNFS_LAST_LAYOUT_NO_RECALLS when zero, and cbl_fsid is filled in
++ *         from the file for RETURN_FSID when @inode is given
++ *
++ * Invalid recall types or iomodes trip BUG_ON().  The recall list is built
++ * by create_layout_recall_list() and processed by spawn_layout_recall()
++ * while holding nfs4_lock_state(); a partially built list is still
++ * processed when list creation reports an error.
++ *
++ * Returns -ENOENT right away if nfsd is not running (nfsd_serv == NULL).
++ * Otherwise the status is -ENOENT when @inode has no nfs4_file or the
++ * recall list is empty, else the result of list creation or of
++ * spawn_layout_recall(); a non-zero status is reported as -EAGAIN when
++ * todo_len is non-zero.
++ */
++int nfsd_layout_recall_cb(struct super_block *sb, struct inode *inode,
++			  struct nfsd4_pnfs_cb_layout *cbl)
++{
++	int status;
++	struct nfs4_file *lrfile = NULL;
++	struct list_head todolist;
++	unsigned todo_len = 0;
++
++	dprintk("NFSD nfsd_layout_recall_cb: inode %p cbl %p\n", inode, cbl);
++	BUG_ON(!cbl);
++	BUG_ON(cbl->cbl_recall_type != RETURN_FILE &&
++	       cbl->cbl_recall_type != RETURN_FSID &&
++	       cbl->cbl_recall_type != RETURN_ALL);
++	BUG_ON(cbl->cbl_recall_type == RETURN_FILE && !inode);
++	BUG_ON(cbl->cbl_seg.iomode != IOMODE_READ &&
++	       cbl->cbl_seg.iomode != IOMODE_RW &&
++	       cbl->cbl_seg.iomode != IOMODE_ANY);
++
++	if (nfsd_serv == NULL) {
++		dprintk("NFSD nfsd_layout_recall_cb: nfsd_serv == NULL\n");
++		return -ENOENT;
++	}
++
++	nfs4_lock_state();
++	status = -ENOENT;	/* reported if the file lookup below fails */
++	if (inode) {
++		lrfile = find_file(inode);
++		if (!lrfile) {
++			dprintk("NFSD nfsd_layout_recall_cb: "
++				"nfs4_file not found\n");
++			goto err;
++		}
++		if (cbl->cbl_recall_type == RETURN_FSID)
++			cbl->cbl_fsid = lrfile->fi_fsid;
++	}
++
++	INIT_LIST_HEAD(&todolist);
++
++	/* If no cookie provided by FS, return a default one */
++	if (!cbl->cbl_cookie)
++		cbl->cbl_cookie = PNFS_LAST_LAYOUT_NO_RECALLS;
++
++	status = create_layout_recall_list(&todolist, &todo_len, cbl, lrfile);
++	if (list_empty(&todolist)) {
++		status = -ENOENT;
++	} else {
++		/* process todolist even if create_layout_recall_list
++		 * returned an error; a spawn failure overrides the
++		 * list-creation status */
++		int status2 = spawn_layout_recall(sb, &todolist, todo_len);
++		if (status2)
++			status = status2;
++	}
++
++err:
++	nfs4_unlock_state();
++	if (lrfile)
++		put_nfs4_file(lrfile);	/* drop the reference obtained from find_file() */
++	return (todo_len && status) ? -EAGAIN : status;
++}
++/* Argument bundle handed to create_device_notify_per_cl() through the void * of filter_confirmed_clients(). */
++struct create_device_notify_list_arg {
++	struct list_head *todolist;	/* list the per-client requests are added to */
++	struct nfsd4_pnfs_cb_dev_list *ndl;	/* device list stored in each request */
++};
++/*
++ * Per-client callback for filter_confirmed_clients(): allocate a
++ * nfs4_notify_device for @clp and add it to the todo list in @p.
++ * Clients whose cl_deviceref count is not positive are skipped.
++ * Returns 0, or -ENOMEM when the request cannot be allocated.
++ */
++static int
++create_device_notify_per_cl(struct nfs4_client *clp, void *p)
++{
++	struct nfs4_notify_device *cbnd;
++	struct create_device_notify_list_arg *arg = p;
++
++	if (atomic_read(&clp->cl_deviceref) <= 0)
++		return 0;
++
++	cbnd = kzalloc(sizeof(*cbnd), GFP_KERNEL);
++	if (!cbnd)
++		return -ENOMEM;
++
++	cbnd->nd_list = arg->ndl;
++	cbnd->nd_client = clp;
++	INIT_WORK(&cbnd->nd_recall.cb_work, nfsd4_do_callback_rpc);
++	list_add(&cbnd->nd_perclnt, arg->todolist);
++	return 0;
++}
++
++/*
++ * Create a list of clients to send device notifications.
++ *
++ * Runs create_device_notify_per_cl() through filter_confirmed_clients()
++ * while holding nfs4_lock_state(); requests are added to @todolist and the
++ * status returned by filter_confirmed_clients() is passed back.
++ */
++int
++create_device_notify_list(struct list_head *todolist,
++			  struct nfsd4_pnfs_cb_dev_list *ndl)
++{
++	int status;
++	struct create_device_notify_list_arg arg = {
++		.todolist = todolist,
++		.ndl = ndl,
++	};
++
++	nfs4_lock_state();
++	status = filter_confirmed_clients(create_device_notify_per_cl, &arg);
++	nfs4_unlock_state();
++
++	return status;
++}
++
++/*
++ * For each client that a device, send a device notification.
++ * XXX: Need to track which clients have which devices.
++ */ ++int nfsd_device_notify_cb(struct super_block *sb, ++ struct nfsd4_pnfs_cb_dev_list *ndl) ++{ ++ struct nfs4_notify_device *cbnd; ++ struct nfs4_client *nd_client; ++ unsigned int notify_num = 0; ++ int status = 0; ++ struct list_head todolist; ++ ++ BUG_ON(!ndl || ndl->cbd_len == 0 || !ndl->cbd_list); ++ ++ dprintk("NFSD %s: cbl %p len %u\n", __func__, ndl, ndl->cbd_len); ++ ++ if (nfsd_serv == NULL) ++ return -ENOENT; ++ ++ INIT_LIST_HEAD(&todolist); ++ ++ status = create_device_notify_list(&todolist, ndl); ++ ++ while (!list_empty(&todolist)) { ++ cbnd = list_entry(todolist.next, struct nfs4_notify_device, ++ nd_perclnt); ++ list_del_init(&cbnd->nd_perclnt); ++ nd_client = cbnd->nd_client; ++ nfsd4_cb_notify_device(cbnd); ++ pnfs_clear_device_notify(nd_client); ++ notify_num++; ++ } ++ ++ dprintk("NFSD %s: status %d clients %u\n", ++ __func__, status, notify_num); ++ return status; ++} +diff -up linux-2.6.37.noarch/fs/nfsd/nfs4pnfsdlm.c.orig linux-2.6.37.noarch/fs/nfsd/nfs4pnfsdlm.c +--- linux-2.6.37.noarch/fs/nfsd/nfs4pnfsdlm.c.orig 2011-01-28 09:43:53.354769959 -0500 ++++ linux-2.6.37.noarch/fs/nfsd/nfs4pnfsdlm.c 2011-01-28 09:43:53.354769959 -0500 +@@ -0,0 +1,461 @@ ++/****************************************************************************** ++ * ++ * (c) 2007 Network Appliance, Inc. All Rights Reserved. ++ * (c) 2009 NetApp. All Rights Reserved. ++ * ++ * NetApp provides this source code under the GPL v2 License. ++ * The GPL v2 license is available at ++ * http://opensource.org/licenses/gpl-license.php. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ ******************************************************************************/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "nfsfh.h" ++#include "nfsd.h" ++ ++#define NFSDDBG_FACILITY NFSDDBG_PROC ++ ++/* Just use a linked list. Do not expect more than 32 dlm_device_entries ++ * the first implementation will just use one device per cluster file system ++ */ ++ ++static LIST_HEAD(dlm_device_list); ++static DEFINE_SPINLOCK(dlm_device_list_lock); ++ ++struct dlm_device_entry { ++ struct list_head dlm_dev_list; ++ char disk_name[DISK_NAME_LEN]; ++ int num_ds; ++ char ds_list[NFSD_DLM_DS_LIST_MAX]; ++}; ++ ++static struct dlm_device_entry * ++_nfsd4_find_pnfs_dlm_device(char *disk_name) ++{ ++ struct dlm_device_entry *dlm_pdev; ++ ++ dprintk("--> %s disk name %s\n", __func__, disk_name); ++ spin_lock(&dlm_device_list_lock); ++ list_for_each_entry(dlm_pdev, &dlm_device_list, dlm_dev_list) { ++ dprintk("%s Look for dlm_pdev %s\n", __func__, ++ dlm_pdev->disk_name); ++ if (!memcmp(dlm_pdev->disk_name, disk_name, strlen(disk_name))) { ++ spin_unlock(&dlm_device_list_lock); ++ return dlm_pdev; ++ } ++ } ++ spin_unlock(&dlm_device_list_lock); ++ return NULL; ++} ++ ++static struct dlm_device_entry * ++nfsd4_find_pnfs_dlm_device(struct super_block *sb) { ++ char dname[BDEVNAME_SIZE]; ++ ++ bdevname(sb->s_bdev, dname); ++ return _nfsd4_find_pnfs_dlm_device(dname); ++} ++ ++ssize_t 
++nfsd4_get_pnfs_dlm_device_list(char *buf, ssize_t buflen)
++{
++	char *pos = buf;
++	ssize_t size = 0;
++	struct dlm_device_entry *dlm_pdev;
++	int ret = -EINVAL;
++
++	/*
++	 * Format one "disk_name:ds_list\n" line per entry into buf.  Returns
++	 * the number of bytes written, or -EINVAL if buflen is too small.
++	 */
++	spin_lock(&dlm_device_list_lock);
++	list_for_each_entry(dlm_pdev, &dlm_device_list, dlm_dev_list)
++	{
++		int advanced;
++		advanced = snprintf(pos, buflen - size, "%s:%s\n", dlm_pdev->disk_name, dlm_pdev->ds_list);
++		/* snprintf() reports the untruncated length */
++		if (advanced >= buflen - size)
++			goto out;
++		size += advanced;
++		pos += advanced;
++	}
++	ret = size;
++
++out:
++	spin_unlock(&dlm_device_list_lock);
++	return ret;
++}
++
++/*
++ * Validate a NUL terminated, comma separated list of data server addresses.
++ *
++ * Every element must be accepted by rpc_pton().  On success *num_ds holds
++ * the number of addresses and true is returned; on the first bad element
++ * false is returned and *num_ds holds the count parsed so far.
++ */
++bool nfsd4_validate_pnfs_dlm_device(char *ds_list, int *num_ds)
++{
++	char *start = ds_list;
++
++	*num_ds = 0;
++
++	while (*start) {
++		struct sockaddr_storage tempAddr;
++		int ipLen = strcspn(start, ",");
++
++		if (!rpc_pton(start, ipLen, (struct sockaddr *)&tempAddr, sizeof(tempAddr)))
++			return false;
++		(*num_ds)++;
++		/*
++		 * Step over the address and, only when present, the comma
++		 * after it.  Adding ipLen + 1 unconditionally moved past the
++		 * terminating NUL after the last address, so the loop test
++		 * read beyond the end of the string.
++		 */
++		start += ipLen;
++		if (*start == ',')
++			start++;
++	}
++	return true;
++}
++
++/*
++ * pnfs_dlm_device string format:
++ *	block-device-path:ds1-address,ds2-address[,...]
++ *
++ * Examples
++ * /dev/sda:192.168.1.96,192.168.1.97' creates a data server list with
++ * two data servers for the dlm cluster file system mounted on /dev/sda.
++ *
++ * /dev/sda:192.168.1.96,192.168.1.100'
++ * replaces the data server list for /dev/sda
++ *
++ * Only the deviceid == 1 is supported. Can add device id to
++ * pnfs_dlm_device string when needed.
++ *
++ * Only the round robin each data server once stripe index is supported.
++ *
++ * @pnfs_dlm_device must be NUL terminated.  @len is only logged; the value
++ * passed in is then overwritten and the variable reused as scratch.
++ *
++ * Returns 0 on success, -ENOMEM if the entry cannot be allocated, or -EINVAL
++ * if a field is missing, does not fit (including its terminating NUL) or
++ * holds an address rpc_pton() rejects.
++ */
++int
++nfsd4_set_pnfs_dlm_device(char *pnfs_dlm_device, int len)
++
++{
++	struct dlm_device_entry *new, *found;
++	char *bufp = pnfs_dlm_device;
++	char *endp = bufp + strlen(bufp);
++	int err = -ENOMEM;
++
++	dprintk("--> %s len %d\n", __func__, len);
++
++	new = kzalloc(sizeof(*new), GFP_KERNEL);
++	if (!new)
++		return err;
++
++	err = -EINVAL;
++	/* disk_name */
++	/* FIXME: need to check for valid disk_name. search superblocks?
++	 * check for slash dev slash ?
++	 */
++	len = strcspn(bufp, ":");
++	/* >= keeps one zeroed byte so disk_name stays NUL terminated */
++	if (len >= DISK_NAME_LEN)
++		goto out_free;
++	memcpy(new->disk_name, bufp, len);
++
++	err = -EINVAL;
++	bufp += len + 1;
++	if (bufp >= endp)
++		goto out_free;
++
++	/* data server list */
++	/* FIXME: need to check for comma separated valid ip format */
++	len = strcspn(bufp, ":");
++	/* >= keeps one zeroed byte: the validator and %s need the NUL */
++	if (len >= NFSD_DLM_DS_LIST_MAX)
++		goto out_free;
++	memcpy(new->ds_list, bufp, len);
++
++
++	/* validate the ips */
++	if (!nfsd4_validate_pnfs_dlm_device(new->ds_list, &(new->num_ds)))
++		goto out_free;
++
++	dprintk("%s disk_name %s num_ds %d ds_list %s\n", __func__,
++		new->disk_name, new->num_ds, new->ds_list);
++
++	found = _nfsd4_find_pnfs_dlm_device(new->disk_name);
++	if (found) {
++		/* FIXME: should compare found->ds_list with new->ds_list
++		 * and if it is different, kick off a CB_NOTIFY change
++		 * deviceid.
++		 */
++		dprintk("%s pnfs_dlm_device %s:%s already in cache "
++			" replace ds_list with new ds_list %s\n", __func__,
++			found->disk_name, found->ds_list, new->ds_list);
++		/*
++		 * new->ds_list is zero padded (kzalloc), so copying the whole
++		 * array replaces the old list and clears its tail.  The old
++		 * code cleared only DISK_NAME_LEN bytes, which could leave
++		 * part of a longer previous list behind the new one.
++		 */
++		memcpy(found->ds_list, new->ds_list, sizeof(found->ds_list));
++		found->num_ds = new->num_ds;
++		kfree(new);
++	} else {
++		dprintk("%s Adding pnfs_dlm_device %s:%s\n", __func__,
++			new->disk_name, new->ds_list);
++		spin_lock(&dlm_device_list_lock);
++		list_add(&new->dlm_dev_list, &dlm_device_list);
++		spin_unlock(&dlm_device_list_lock);
++	}
++	dprintk("<-- %s Success\n", __func__);
++	return 0;
++
++out_free:
++	kfree(new);
++	dprintk("<-- %s returns %d\n", __func__, err);
++	return err;
++}
++
++/* Unlink and free every entry on dlm_device_list, under the list lock. */
++void nfsd4_pnfs_dlm_shutdown(void)
++{
++	struct dlm_device_entry *dlm_pdev, *next;
++
++	dprintk("--> %s\n", __func__);
++
++	spin_lock(&dlm_device_list_lock);
++	list_for_each_entry_safe (dlm_pdev, next, &dlm_device_list,
++				  dlm_dev_list) {
++		list_del(&dlm_pdev->dlm_dev_list);
++		kfree(dlm_pdev);
++	}
++	spin_unlock(&dlm_device_list_lock);
++}
++
++static int nfsd4_pnfs_dlm_getdeviter(struct super_block *sb,
++				     u32
layout_type,
++				     struct nfsd4_pnfs_dev_iter_res *res)
++{
++	if (layout_type != LAYOUT_NFSV4_1_FILES) {
++		printk(KERN_ERR "%s: ERROR: layout type isn't 'file' "
++			"(type: %x)\n", __func__, layout_type);
++		return -ENOTSUPP;
++	}
++
++	/* There is exactly one device: a non-zero cookie means it was
++	 * already returned, so the iteration is over. */
++	res->gd_eof = 1;
++	if (res->gd_cookie)
++		return -ENOENT;
++
++	res->gd_cookie = 1;
++	res->gd_verf = 1;
++	res->gd_devid = 1;
++	return 0;
++}
++
++/*
++ * Encode the device info for the single supported device (devid 1).
++ *
++ * Builds a pnfs_filelayout_device from the data server list configured for
++ * the block device behind @sb: one multipath entry per data server over
++ * "tcp", with ".8.1" (port 2049) appended to each address, and an identity
++ * stripe index list.  The result is encoded into @xdr by
++ * filelayout_encode_devinfo().
++ *
++ * Returns -ENOTSUPP for a layout type other than LAYOUT_NFSV4_1_FILES,
++ * -EINVAL for a device id other than 1 or when no data server list exists
++ * for the device, -ENOMEM on allocation failure, otherwise the result of
++ * filelayout_encode_devinfo().
++ */
++static int nfsd4_pnfs_dlm_getdevinfo(struct super_block *sb,
++				     struct exp_xdr_stream *xdr,
++				     u32 layout_type,
++				     const struct nfsd4_pnfs_deviceid *devid)
++{
++	int err, len, i = 0;
++	struct pnfs_filelayout_device fdev;
++	struct pnfs_filelayout_devaddr *daddr;
++	struct dlm_device_entry *dlm_pdev;
++	char *bufp;
++
++	err = -ENOTSUPP;
++	if (layout_type != LAYOUT_NFSV4_1_FILES) {
++		dprintk("%s: ERROR: layout type isn't 'file' "
++			"(type: %x)\n", __func__, layout_type);
++		return err;
++	}
++
++	/* We only hand out a deviceid of 1 in LAYOUTGET, so a GETDEVICEINFO
++	 * with a gdia_device_id != 1 is invalid.
++	 */
++	err = -EINVAL;
++	if (devid->devid != 1) {
++		dprintk("%s: WARNING: didn't receive a deviceid of "
++			"1 (got: 0x%llx)\n", __func__, devid->devid);
++		return err;
++	}
++
++	/*
++	 * If the DS list has not been established, return -EINVAL
++	 */
++	dlm_pdev = nfsd4_find_pnfs_dlm_device(sb);
++	if (!dlm_pdev) {
++		dprintk("%s: DEBUG: disk %s Not Found\n", __func__,
++			sb->s_bdev->bd_disk->disk_name);
++		return err;
++	}
++
++	dprintk("%s: Found disk %s with DS list |%s|\n",
++		__func__, dlm_pdev->disk_name, dlm_pdev->ds_list);
++
++	memset(&fdev, '\0', sizeof(fdev));
++	fdev.fl_device_length = dlm_pdev->num_ds;
++
++	/* every failure from here to the encode call is an allocation failure */
++	err = -ENOMEM;
++	len = sizeof(*fdev.fl_device_list) * fdev.fl_device_length;
++	fdev.fl_device_list = kzalloc(len, GFP_KERNEL);
++	if (!fdev.fl_device_list) {
++		/* report the DS count; 'i' is still 0 at this point */
++		printk(KERN_ERR "%s: ERROR: unable to kmalloc a device list "
++			"buffer for %d DSes.\n", __func__, dlm_pdev->num_ds);
++		/* keeps the cleanup loop from walking a NULL list */
++		fdev.fl_device_length = 0;
++		goto out;
++	}
++
++	/* Set a simple stripe indicie */
++	fdev.fl_stripeindices_length = fdev.fl_device_length;
++	fdev.fl_stripeindices_list = kzalloc(sizeof(u32) *
++				fdev.fl_stripeindices_length, GFP_KERNEL);
++
++	if (!fdev.fl_stripeindices_list) {
++		printk(KERN_ERR "%s: ERROR: unable to kmalloc a stripeindices "
++			"list buffer for %d DSes.\n", __func__,
++			dlm_pdev->num_ds);
++		goto out;
++	}
++	for (i = 0; i < fdev.fl_stripeindices_length; i++)
++		fdev.fl_stripeindices_list[i] = i;
++
++	/* Transfer the data server list with a single multipath entry */
++	bufp = dlm_pdev->ds_list;
++	for (i = 0; i < fdev.fl_device_length; i++) {
++		daddr = kmalloc(sizeof(*daddr), GFP_KERNEL);
++		if (!daddr) {
++			printk(KERN_ERR "%s: ERROR: unable to kmalloc a device "
++				"addr buffer.\n", __func__);
++			goto out;
++		}
++
++		daddr->r_netid.data = "tcp";
++		daddr->r_netid.len = 3;
++
++		len = strcspn(bufp, ",");
++		daddr->r_addr.data = kmalloc(len + 4, GFP_KERNEL);
++		if (!daddr->r_addr.data) {
++			printk(KERN_ERR "%s: ERROR: unable to kmalloc a device "
++				"addr string buffer.\n", __func__);
++			/* daddr is not on the device list yet, free it here */
++			kfree(daddr);
++			goto out;
++		}
++		memcpy(daddr->r_addr.data, bufp, len);
++		/*
++		 * append the port number. interpreted as two more bytes
++		 * beyond the quad: ".8.1" -> 0x08.0x01 -> 0x0801 = port 2049.
++		 */
++		memcpy(daddr->r_addr.data + len, ".8.1", 4);
++		daddr->r_addr.len = len + 4;
++
++		fdev.fl_device_list[i].fl_multipath_length = 1;
++		fdev.fl_device_list[i].fl_multipath_list = daddr;
++
++		dprintk("%s: encoding DS |%s|\n", __func__, bufp);
++
++		bufp += len + 1;
++	}
++
++	/* have nfsd encode the device info */
++	err = filelayout_encode_devinfo(xdr, &fdev);
++out:
++	/*
++	 * The list was zero-allocated, so slots that were never filled are
++	 * NULL.  Each filled slot owns its address string, which the old
++	 * cleanup never freed.
++	 * NOTE(review): assumes filelayout_encode_devinfo() does not take
++	 * ownership of r_addr.data -- confirm.
++	 */
++	for (i = 0; i < fdev.fl_device_length; i++) {
++		daddr = fdev.fl_device_list[i].fl_multipath_list;
++		if (!daddr)
++			continue;
++		kfree(daddr->r_addr.data);
++		kfree(daddr);
++	}
++	kfree(fdev.fl_device_list);
++	kfree(fdev.fl_stripeindices_list);
++	dprintk("<-- %s returns %d\n", __func__, err);
++	return err;
++}
++
++/*
++ * Stripe unit for a given block size: the block size itself when it is at
++ * least NFSSVC_MAXBLKSIZE, otherwise NFSSVC_MAXBLKSIZE rounded down to a
++ * multiple of the block size.  blocksize must be non-zero.
++ */
++static int get_stripe_unit(int blocksize)
++{
++	if (blocksize >= NFSSVC_MAXBLKSIZE)
++		return blocksize;
++	return NFSSVC_MAXBLKSIZE - (NFSSVC_MAXBLKSIZE % blocksize);
++}
++
++/*
++ * Look up inode block device in pnfs_dlm_device list.
++ * Hash on the inode->i_ino and number of data servers.
++ */
++static int dlm_ino_hash(struct inode *ino)
++{
++	struct dlm_device_entry *de;
++	u32 hash_mask = 0;
++
++	/* If can't find the inode block device in the pnfs_dlm_device list
++	 * then don't hand out a layout (signalled by returning -1).
++	 */
++	de = nfsd4_find_pnfs_dlm_device(ino->i_sb);
++	if (!de)
++		return -1;
++	hash_mask = de->num_ds - 1;	/* NOTE(review): a proper mask only when num_ds is a power of two */
++	return ino->i_ino & hash_mask;
++}
++/*
++ * LAYOUTGET for DLM exports: always a whole-file, read-only file layout.
++ *
++ * The returned segment is forced to offset 0, length NFS4_MAX_UINT64 and
++ * IOMODE_READ.  The layout carries one file handle (a copy of args->lg_fh
++ * marked by pnfs_fh_mark_ds()), device id 1, and the first stripe index
++ * chosen by dlm_ino_hash().  It is encoded into @xdr by
++ * filelayout_encode_layout().
++ *
++ * Returns NFS4ERR_BADIOMODE when res->lg_seg.iomode is IOMODE_RW,
++ * NFS4ERR_LAYOUTUNAVAILABLE when dlm_ino_hash() finds no device entry,
++ * NFS4ERR_LAYOUTTRYLATER on allocation failure (res->lg_seg.length is then
++ * reset to 0), otherwise the result of filelayout_encode_layout().
++ */
++static enum nfsstat4 nfsd4_pnfs_dlm_layoutget(struct inode *inode,
++			   struct exp_xdr_stream *xdr,
++			   const struct nfsd4_pnfs_layoutget_arg *args,
++			   struct nfsd4_pnfs_layoutget_res *res)
++{
++	struct pnfs_filelayout_layout *layout = NULL;
++	struct knfsd_fh *fhp = NULL;
++	int index;
++	enum nfsstat4 rc = NFS4_OK;
++
++	dprintk("%s: LAYOUT_GET\n", __func__);
++
++	/* DLM exported file systems only support layouts for READ */
++	if (res->lg_seg.iomode == IOMODE_RW)
++		return NFS4ERR_BADIOMODE;
++
++	index = dlm_ino_hash(inode);
++	dprintk("%s first stripe index %d i_ino %lu\n", __func__, index,
++		inode->i_ino);
++	if (index < 0)
++		return NFS4ERR_LAYOUTUNAVAILABLE;
++
++	res->lg_seg.layout_type = LAYOUT_NFSV4_1_FILES;
++	/* Always give out whole file layouts */
++	res->lg_seg.offset = 0;
++	res->lg_seg.length = NFS4_MAX_UINT64;
++	/* Always give out READ ONLY layouts */
++	res->lg_seg.iomode = IOMODE_READ;
++
++	layout = kzalloc(sizeof(*layout), GFP_KERNEL);
++	if (layout == NULL) {
++		rc = NFS4ERR_LAYOUTTRYLATER;
++		goto error;
++	}
++
++	/* Set file layout response args */
++	layout->lg_layout_type = LAYOUT_NFSV4_1_FILES;
++	layout->lg_stripe_type = STRIPE_SPARSE;
++	layout->lg_commit_through_mds = false;
++	layout->lg_stripe_unit = get_stripe_unit(inode->i_sb->s_blocksize);
++	layout->lg_fh_length = 1;
++	layout->device_id.sbid = args->lg_sbid;
++	layout->device_id.devid = 1;				/*FSFTEMP*/
++	layout->lg_first_stripe_index = index;			/*FSFTEMP*/
++	layout->lg_pattern_offset = 0;
++
++	fhp = kmalloc(sizeof(*fhp), GFP_KERNEL);
++	if (fhp == NULL) {
++		rc = NFS4ERR_LAYOUTTRYLATER;
++		goto error;
++	}
++
++	memcpy(fhp, args->lg_fh, sizeof(*fhp));
++	pnfs_fh_mark_ds(fhp);
++	layout->lg_fh_list = fhp;
++
++	/* Call nfsd to encode layout */
++	rc = filelayout_encode_layout(xdr, layout);
++exit:
++	kfree(layout);	/* both frees are safe when the pointer is still NULL */
++	kfree(fhp);
++	return rc;
++
++error:
++	res->lg_seg.length = 0;
++	goto exit;
++}
++/* The only layout type offered for DLM exports is the NFSv4.1 file layout. */
++static int
++nfsd4_pnfs_dlm_layouttype(struct super_block *sb)
++{
++	return LAYOUT_NFSV4_1_FILES;
++}
++
++/* For use by DLM cluster file systems exported by pNFSD */
++const struct pnfs_export_operations pnfs_dlm_export_ops = {
++	.layout_type = nfsd4_pnfs_dlm_layouttype,
++	.get_device_info = nfsd4_pnfs_dlm_getdevinfo,
++	.get_device_iter = nfsd4_pnfs_dlm_getdeviter,
++	.layout_get = nfsd4_pnfs_dlm_layoutget,
++};
++EXPORT_SYMBOL(pnfs_dlm_export_ops);
+diff -up linux-2.6.37.noarch/fs/nfsd/nfs4pnfsds.c.orig linux-2.6.37.noarch/fs/nfsd/nfs4pnfsds.c
+--- linux-2.6.37.noarch/fs/nfsd/nfs4pnfsds.c.orig 2011-01-28 09:43:53.355769845 -0500
++++ linux-2.6.37.noarch/fs/nfsd/nfs4pnfsds.c 2011-01-28 09:43:53.355769845 -0500
+@@ -0,0 +1,620 @@
++/*
++* linux/fs/nfsd/nfs4pnfsds.c
++*
++* Copyright (c) 2005 The Regents of the University of Michigan.
++* All rights reserved.
++*
++* Andy Adamson
++*
++* Redistribution and use in source and binary forms, with or without
++* modification, are permitted provided that the following conditions
++* are met:
++*
++* 1. Redistributions of source code must retain the above copyright
++* notice, this list of conditions and the following disclaimer.
++* 2. Redistributions in binary form must reproduce the above copyright
++* notice, this list of conditions and the following disclaimer in the
++* documentation and/or other materials provided with the distribution.
++* 3. Neither the name of the University nor the names of its
++* contributors may be used to endorse or promote products derived
++* from this software without specific prior written permission.
++*
++* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*
++*/
++#if defined(CONFIG_PNFSD)
++
++#define NFSDDBG_FACILITY NFSDDBG_PNFS
++
++#include
++#include
++#include
++#include
++#include
++#include
++
++#include "nfsd.h"
++#include "pnfsd.h"
++#include "state.h"
++
++/*
++ *******************
++ * PNFS
++ *******************
++ */
++/*
++ * Hash tables for pNFS Data Server state
++ *
++ * mds_id_tbl: list of struct pnfs_mds_id one per Metadata server (MDS) using
++ * this data server (DS).
++ *
++ * mds_clid_hashtbl[]: uses clientid_hashval(), hash of all clientids obtained
++ * from any MDS.
++ *
++ * ds_stid_hashtbl[]: uses stateid_hashval(), hash of all stateids obtained
++ * from any MDS.
++ *
++ */
++/* Hash tables for clientid state */
++#define CLIENT_HASH_BITS 4
++#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS)
++#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1)
++/* bucket index: the low CLIENT_HASH_BITS bits of the id */
++#define clientid_hashval(id) \
++	((id) & CLIENT_HASH_MASK)
++
++/* hash table for pnfs_ds_stateid */
++#define STATEID_HASH_BITS 10
++#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS)
++#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1)
++/* bucket index: low STATEID_HASH_BITS bits of owner id plus file id */
++#define stateid_hashval(owner_id, file_id) \
++	(((owner_id) + (file_id)) & STATEID_HASH_MASK)
++
++static struct list_head mds_id_tbl;
++static struct list_head mds_clid_hashtbl[CLIENT_HASH_SIZE];
++static struct list_head ds_stid_hashtbl[STATEID_HASH_SIZE];
++/* forward declarations: used by the kref release functions defined first */
++static inline void put_ds_clientid(struct pnfs_ds_clientid *dcp);
++static inline void put_ds_mdsid(struct pnfs_mds_id *mdp);
++
++/* Mutex for data server state. Needs to be separate from
++ * mds state mutex since a node can be both mds and ds.
++ * ds_mutex_owner records the holder so that ds_unlock_state()
++ * can check it is called by the thread that took the lock. */
++static DEFINE_MUTEX(ds_mutex);
++static struct thread_info *ds_mutex_owner;
++/* Take ds_mutex and record the calling thread as its owner. */
++static void
++ds_lock_state(void)
++{
++	mutex_lock(&ds_mutex);
++	ds_mutex_owner = current_thread_info();
++}
++/* Release ds_mutex; BUG()s when the caller is not the recorded owner. */
++static void
++ds_unlock_state(void)
++{
++	BUG_ON(ds_mutex_owner != current_thread_info());
++	ds_mutex_owner = NULL;
++	mutex_unlock(&ds_mutex);
++}
++/*
++ * Returns non-zero when the two client ids are equal (cl_boot and cl_id
++ * both match), zero otherwise -- the opposite sense of memcmp().
++ */
++static int
++cmp_clid(const clientid_t *cl1, const clientid_t *cl2)
++{
++	return (cl1->cl_boot == cl2->cl_boot) &&
++		(cl1->cl_id == cl2->cl_id);
++}
++/* Initialise every bucket of both hash tables and the mds_id_tbl list head. */
++void
++nfs4_pnfs_state_init(void)
++{
++	int i;
++
++	for (i = 0; i < CLIENT_HASH_SIZE; i++)
++		INIT_LIST_HEAD(&mds_clid_hashtbl[i]);
++
++	for (i = 0; i < STATEID_HASH_SIZE; i++)
++		INIT_LIST_HEAD(&ds_stid_hashtbl[i]);
++
++	INIT_LIST_HEAD(&mds_id_tbl);
++}
++/*
++ * Linear search of mds_id_tbl for the entry whose di_mdsid equals @mdsid.
++ * Returns the entry without taking a reference, or NULL.
++ */
++static struct pnfs_mds_id *
++find_pnfs_mds_id(u32 mdsid)
++{
++	struct pnfs_mds_id *local = NULL;
++
++	dprintk("pNFSD: %s\n", __func__);
++	list_for_each_entry(local, &mds_id_tbl, di_hash) {
++		if (local->di_mdsid == mdsid)
++			return local;
++	}
++	return NULL;
++}
++
++static
struct pnfs_ds_clientid *
++find_pnfs_ds_clientid(const clientid_t *clid)
++{
++	struct pnfs_ds_clientid *local = NULL;
++	unsigned int hashval;
++
++	dprintk("pNFSD: %s\n", __func__);
++	/* Only cl_id selects the bucket; cmp_clid() then also checks cl_boot. No reference is taken on the result. */
++	hashval = clientid_hashval(clid->cl_id);
++	list_for_each_entry(local, &mds_clid_hashtbl[hashval], dc_hash) {
++		if (cmp_clid(&local->dc_mdsclid, clid))
++			return local;
++	}
++	return NULL;
++}
++/*
++ * Find the cached stateid whose si_stateownerid, si_fileid and si_boot match
++ * @stid; si_generation is not compared here.  Returns the entry without
++ * taking a reference, or NULL.
++ */
++static struct pnfs_ds_stateid *
++find_pnfs_ds_stateid(stateid_t *stid)
++{
++	struct pnfs_ds_stateid *local = NULL;
++	u32 st_id = stid->si_stateownerid;
++	u32 f_id = stid->si_fileid;
++	unsigned int hashval;
++
++	dprintk("pNFSD: %s\n", __func__);
++
++	hashval = stateid_hashval(st_id, f_id);
++	list_for_each_entry(local, &ds_stid_hashtbl[hashval], ds_hash)
++		if ((local->ds_stid.si_stateownerid == st_id) &&
++		    (local->ds_stid.si_fileid == f_id) &&
++		    (local->ds_stid.si_boot == stid->si_boot)) {
++			stateid_t *sid = &local->ds_stid;
++			dprintk("NFSD: %s <-- %p ds_flags %lx " STATEID_FMT "\n",
++				__func__, local, local->ds_flags,
++				STATEID_VAL(sid));
++			return local;
++		}
++	return NULL;
++}
++/* kref release for a pnfs_mds_id: unlink it from both of its lists and free it. */
++static void
++release_ds_mdsid(struct kref *kref)
++{
++	struct pnfs_mds_id *mdp =
++		container_of(kref, struct pnfs_mds_id, di_ref);
++	dprintk("pNFSD: %s\n", __func__);
++
++	list_del(&mdp->di_hash);
++	list_del(&mdp->di_mdsclid);
++	kfree(mdp);
++}
++/*
++ * kref release for a pnfs_ds_clientid: drop one reference on the
++ * pnfs_mds_id found through dc_mdsid (if any), then unlink and free.
++ */
++static void
++release_ds_clientid(struct kref *kref)
++{
++	struct pnfs_ds_clientid *dcp =
++		container_of(kref, struct pnfs_ds_clientid, dc_ref);
++	struct pnfs_mds_id *mdp;
++	dprintk("pNFSD: %s\n", __func__);
++
++	mdp = find_pnfs_mds_id(dcp->dc_mdsid);
++	if (mdp)
++		put_ds_mdsid(mdp);
++
++	list_del(&dcp->dc_hash);
++	list_del(&dcp->dc_stateid);
++	list_del(&dcp->dc_permdsid);
++	kfree(dcp);
++}
++/*
++ * kref release for a pnfs_ds_stateid: drop one reference on the
++ * pnfs_ds_clientid found through ds_mdsclid (if any), then unlink and free.
++ * NOTE(review): ds_mdsclid is filled in by update_ds_stateid() but not by
++ * alloc_init_ds_stateid(); if a stateid is released before a successful
++ * update the lookup key looks indeterminate -- confirm.
++ */
++static void
++release_ds_stateid(struct kref *kref)
++{
++	struct pnfs_ds_stateid *dsp =
++		container_of(kref, struct pnfs_ds_stateid, ds_ref);
++	struct pnfs_ds_clientid *dcp;
++	dprintk("pNFS %s: dsp %p\n", __func__, dsp);
++
++	dcp = find_pnfs_ds_clientid(&dsp->ds_mdsclid);
++	if (dcp)
++		put_ds_clientid(dcp);
++
++	list_del(&dsp->ds_hash);
++	list_del(&dsp->ds_perclid);
++	kfree(dsp);
++}
++/* Log and drop a client id reference; release_ds_clientid() runs on the last put. */
++static inline void
++put_ds_clientid(struct pnfs_ds_clientid *dcp)
++{
++	dprintk("pNFS %s: dcp %p ref %d\n", __func__, dcp,
++		atomic_read(&dcp->dc_ref.refcount));
++	kref_put(&dcp->dc_ref, release_ds_clientid);
++}
++/* Log and take a client id reference (the count printed is the value before the get). */
++static inline void
++get_ds_clientid(struct pnfs_ds_clientid *dcp)
++{
++	dprintk("pNFS %s: dcp %p ref %d\n", __func__, dcp,
++		atomic_read(&dcp->dc_ref.refcount));
++	kref_get(&dcp->dc_ref);
++}
++/* Log and drop an MDS id reference; release_ds_mdsid() runs on the last put. */
++static inline void
++put_ds_mdsid(struct pnfs_mds_id *mdp)
++{
++	dprintk("pNFS %s: mdp %p ref %d\n", __func__, mdp,
++		atomic_read(&mdp->di_ref.refcount));
++	kref_put(&mdp->di_ref, release_ds_mdsid);
++}
++/* Log and take an MDS id reference. */
++static inline void
++get_ds_mdsid(struct pnfs_mds_id *mdp)
++{
++	dprintk("pNFS %s: mdp %p ref %d\n", __func__, mdp,
++		atomic_read(&mdp->di_ref.refcount));
++	kref_get(&mdp->di_ref);
++}
++/* Log and drop a stateid reference; release_ds_stateid() runs on the last put and frees dsp. */
++static inline void
++put_ds_stateid(struct pnfs_ds_stateid *dsp)
++{
++	dprintk("pNFS %s: dsp %p ref %d\n", __func__, dsp,
++		atomic_read(&dsp->ds_ref.refcount));
++	kref_put(&dsp->ds_ref, release_ds_stateid);
++}
++/* Log and take a stateid reference. */
++static inline void
++get_ds_stateid(struct pnfs_ds_stateid *dsp)
++{
++	dprintk("pNFS %s: dsp %p ref %d\n", __func__, dsp,
++		atomic_read(&dsp->ds_ref.refcount));
++	kref_get(&dsp->ds_ref);
++}
++/*
++ * Empty every ds_stid_hashtbl bucket, under the ds lock, by putting the
++ * first entry until the bucket is empty.  An entry only leaves its bucket
++ * when release_ds_stateid() runs, so an entry holding extra references is
++ * put repeatedly until its count reaches zero.
++ */
++void
++nfs4_pnfs_state_shutdown(void)
++{
++	struct pnfs_ds_stateid *dsp;
++	int i;
++
++	dprintk("pNFSD %s: -->\n", __func__);
++
++	ds_lock_state();
++	for (i = 0; i < STATEID_HASH_SIZE; i++) {
++		while (!list_empty(&ds_stid_hashtbl[i])) {
++			dsp = list_entry(ds_stid_hashtbl[i].next,
++					 struct pnfs_ds_stateid, ds_hash);
++			put_ds_stateid(dsp);
++		}
++	}
++	ds_unlock_state();
++}
++/*
++ * Allocate a pnfs_mds_id for gsp->dsid and link it on mds_id_tbl.  The new
++ * entry starts with one reference (kref_init).  Returns NULL on allocation
++ * failure.
++ */
++static struct pnfs_mds_id *
++alloc_init_mds_id(struct pnfs_get_state *gsp)
++{
++	struct pnfs_mds_id *mdp;
++
++	dprintk("pNFSD: %s\n", __func__);
++
++	mdp = kmalloc(sizeof(*mdp), GFP_KERNEL);
++
if (!mdp)
++		return NULL;
++	INIT_LIST_HEAD(&mdp->di_hash);
++	INIT_LIST_HEAD(&mdp->di_mdsclid);
++	list_add(&mdp->di_hash, &mds_id_tbl);
++	mdp->di_mdsid = gsp->dsid;
++	mdp->di_mdsboot = 0;
++	kref_init(&mdp->di_ref);
++	return mdp;
++}
++
++/*
++ * Allocate a pnfs_ds_clientid for gsp->clid, hash it on mds_clid_hashtbl and
++ * link it to the pnfs_mds_id for gsp->dsid.
++ *
++ * The client id holds one reference on its pnfs_mds_id: the initial
++ * reference of a newly allocated one, or an extra reference on an existing
++ * one.  release_ds_clientid() drops it again.  The new client id itself
++ * starts with one reference (kref_init).
++ *
++ * Returns the new client id, or NULL on allocation failure.
++ */
++static struct pnfs_ds_clientid *
++alloc_init_ds_clientid(struct pnfs_get_state *gsp)
++{
++	struct pnfs_mds_id *mdp;
++	struct pnfs_ds_clientid *dcp;
++	clientid_t *clid = (clientid_t *)&gsp->clid;
++	unsigned int hashval = clientid_hashval(clid->cl_id);
++
++	dprintk("pNFSD: %s\n", __func__);
++
++	mdp = find_pnfs_mds_id(gsp->dsid);
++	if (!mdp) {
++		mdp = alloc_init_mds_id(gsp);
++		if (!mdp)
++			return NULL;
++	} else {
++		get_ds_mdsid(mdp);
++	}
++
++	dcp = kmalloc(sizeof(*dcp), GFP_KERNEL);
++	if (!dcp) {
++		/*
++		 * Give back the mds id reference taken above.  Without this
++		 * a freshly allocated mds id stayed on mds_id_tbl forever and
++		 * an existing one kept an extra count nobody would drop.
++		 */
++		put_ds_mdsid(mdp);
++		return NULL;
++	}
++
++	INIT_LIST_HEAD(&dcp->dc_hash);
++	INIT_LIST_HEAD(&dcp->dc_stateid);
++	INIT_LIST_HEAD(&dcp->dc_permdsid);
++	list_add(&dcp->dc_hash, &mds_clid_hashtbl[hashval]);
++	list_add(&dcp->dc_permdsid, &mdp->di_mdsclid);
++	dcp->dc_mdsclid = *clid;
++	kref_init(&dcp->dc_ref);
++	dcp->dc_mdsid = gsp->dsid;
++	return dcp;
++}
++
++static struct pnfs_ds_stateid *
++alloc_init_ds_stateid(struct svc_fh *cfh, stateid_t *stidp)
++{
++	struct pnfs_ds_stateid *dsp;
++	u32 st_id = stidp->si_stateownerid;
++	u32 f_id = stidp->si_fileid;
++	unsigned int hashval;
++
++	dprintk("pNFSD: %s\n", __func__);
++
++	dsp = kmalloc(sizeof(*dsp), GFP_KERNEL);
++	if (!dsp)
++		return dsp;
++
++	INIT_LIST_HEAD(&dsp->ds_hash);
++	INIT_LIST_HEAD(&dsp->ds_perclid);
++	memcpy(&dsp->ds_stid, stidp, sizeof(stateid_t));
++	fh_copy_shallow(&dsp->ds_fh, &cfh->fh_handle);
++	dsp->ds_access = 0;
++	dsp->ds_status = 0;
++	dsp->ds_flags = 0L;
++	kref_init(&dsp->ds_ref);
++	set_bit(DS_STATEID_NEW, &dsp->ds_flags);
++	clear_bit(DS_STATEID_VALID, &dsp->ds_flags);
++	clear_bit(DS_STATEID_ERROR, &dsp->ds_flags);
++	init_waitqueue_head(&dsp->ds_waitq);
++
++	hashval = stateid_hashval(st_id, f_id);
++	list_add(&dsp->ds_hash, &ds_stid_hashtbl[hashval]);
++
dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp);
++	return dsp;
++}
++/*
++ * Fill @dsp from the state returned by the MDS in @gsp and mark it valid.
++ *
++ * Finds or allocates the pnfs_ds_clientid for gsp->clid.  A stateid still
++ * flagged DS_STATEID_NEW is linked onto that client's dc_stateid list and,
++ * when the client id already existed, takes a reference on it.  Stateid,
++ * access, verifier and MDS client id are then copied from @gsp and the
++ * flags become VALID, !ERROR, !NEW.
++ *
++ * @cfh is not used.  Returns 0 on success, 1 when no client id could be
++ * allocated.
++ */
++static int
++update_ds_stateid(struct pnfs_ds_stateid *dsp, struct svc_fh *cfh,
++		  struct pnfs_get_state *gsp)
++{
++	struct pnfs_ds_clientid *dcp;
++	int new = 0;
++
++	dprintk("pNFSD: %s dsp %p\n", __func__, dsp);
++
++	dcp = find_pnfs_ds_clientid((clientid_t *)&gsp->clid);
++	if (!dcp) {
++		dcp = alloc_init_ds_clientid(gsp);
++		if (!dcp)
++			return 1;
++		new = 1;
++	}
++	if (test_bit(DS_STATEID_NEW, &dsp->ds_flags)) {
++		list_add(&dsp->ds_perclid, &dcp->dc_stateid);
++		if (!new)	/* a new client id already starts with the one reference from kref_init() */
++			get_ds_clientid(dcp);
++	}
++
++	memcpy(&dsp->ds_stid, &gsp->stid, sizeof(stateid_t));
++	dsp->ds_access = gsp->access;
++	dsp->ds_status = 0;
++	dsp->ds_verifier[0] = gsp->verifier[0];
++	dsp->ds_verifier[1] = gsp->verifier[1];
++	memcpy(&dsp->ds_mdsclid, &gsp->clid, sizeof(clientid_t));
++	set_bit(DS_STATEID_VALID, &dsp->ds_flags);
++	clear_bit(DS_STATEID_ERROR, &dsp->ds_flags);
++	clear_bit(DS_STATEID_NEW, &dsp->ds_flags);
++	return 0;
++}
++/*
++ * Drop one reference on the cached stateid matching gs->stid, under the ds
++ * lock.  Returns 0 when such a stateid was found, -ENOENT otherwise.  After
++ * the put, dsp is only logged and tested for NULL, never dereferenced.
++ */
++int
++nfs4_pnfs_cb_change_state(struct pnfs_get_state *gs)
++{
++	stateid_t *stid = (stateid_t *)&gs->stid;
++	struct pnfs_ds_stateid *dsp;
++
++	dprintk("pNFSD: %s stateid=" STATEID_FMT "\n", __func__,
++		STATEID_VAL(stid));
++
++	ds_lock_state();
++	dsp = find_pnfs_ds_stateid(stid);
++	if (dsp)
++		put_ds_stateid(dsp);
++	ds_unlock_state();
++
++	dprintk("pNFSD: %s dsp %p\n", __func__, dsp);
++
++	if (dsp)
++		return 0;
++	return -ENOENT;
++}
++
++/* Retrieves and validates stateid.
++ * If stateid exists and its fields match, return it.
++ * If stateid exists but either the generation or
++ * ownerids don't match, check with mds to see if it is valid.
++ * If the stateid doesn't exist, the first thread creates a
++ * invalid *marker* stateid, then checks to see if the
++ * stateid exists on the mds. If so, it validates the *marker*
++ * stateid and updates its fields. Subsequent threads that
++ * find the *marker* stateid wait until it is valid or an error
++ * occurs.
++ * Called with ds_state_lock.
++ */
++static struct pnfs_ds_stateid *
++nfsv4_ds_get_state(struct svc_fh *cfh, stateid_t *stidp)
++{
++	struct inode *ino = cfh->fh_dentry->d_inode;
++	struct super_block *sb;
++	struct pnfs_ds_stateid *dsp = NULL;
++	struct pnfs_get_state gs = {
++		.access = 0,
++	};
++	int status = 0, waiter = 0;
++
++	/*
++	 * The ds lock is dropped and retaken around the MDS query and while
++	 * waiting for another thread.  The stateid is returned without a
++	 * reference for the caller, which must take its own before dropping
++	 * the ds lock.  NULL is returned when the stateid cannot be
++	 * allocated or ends up flagged DS_STATEID_ERROR.
++	 */
++	dprintk("pNFSD: %s -->\n", __func__);
++
++	dsp = find_pnfs_ds_stateid(stidp);
++	if (dsp && test_bit(DS_STATEID_VALID, &dsp->ds_flags) &&
++	    (stidp->si_generation == dsp->ds_stid.si_generation))
++		goto out_noput;
++
++	sb = ino->i_sb;
++	/* s_pnfs_op may be NULL: nfsd4_get_verifier() checks it too */
++	if (!sb || !sb->s_pnfs_op || !sb->s_pnfs_op->get_state)
++		goto out_noput;
++
++	/* Uninitialize current state if it exists yet it doesn't match.
++	 * If it is already invalid, another thread is checking state */
++	if (dsp) {
++		if (!test_and_clear_bit(DS_STATEID_VALID, &dsp->ds_flags))
++			waiter = 1;
++	} else {
++		dsp = alloc_init_ds_stateid(cfh, stidp);
++		if (!dsp)
++			goto out_noput;
++	}
++
++	dprintk("pNFSD: %s Starting loop\n", __func__);
++	/* keeps dsp alive while the ds lock is dropped inside the loop */
++	get_ds_stateid(dsp);
++	while (!test_bit(DS_STATEID_VALID, &dsp->ds_flags)) {
++		ds_unlock_state();
++
++		/* Another thread is checking the state */
++		if (waiter) {
++			dprintk("pNFSD: %s waiting\n", __func__);
++			wait_event_interruptible_timeout(dsp->ds_waitq,
++				(test_bit(DS_STATEID_VALID, &dsp->ds_flags) ||
++				 test_bit(DS_STATEID_ERROR, &dsp->ds_flags)),
++				msecs_to_jiffies(1024));
++			dprintk("pNFSD: %s awake\n", __func__);
++			ds_lock_state();
++			if (test_bit(DS_STATEID_ERROR, &dsp->ds_flags))
++				goto out;
++
++			continue;
++		}
++
++		/* Validate stateid on mds */
++		dprintk("pNFSD: %s Checking state on MDS\n", __func__);
++		memcpy(&gs.stid, stidp, sizeof(stateid_t));
++		status = sb->s_pnfs_op->get_state(ino, &cfh->fh_handle, &gs);
++		dprintk("pNFSD: %s from MDS status %d\n", __func__, status);
++		ds_lock_state();
++		/* if !status and stateid is valid, update id and mark valid */
++		if (status || update_ds_stateid(dsp, cfh, &gs)) {
++			set_bit(DS_STATEID_ERROR, &dsp->ds_flags);
++			/* remove invalid stateid from list: this put drops
++			 * the hash table's reference, ours is still held */
++			put_ds_stateid(dsp);
++			wake_up(&dsp->ds_waitq);
++			goto out;
++		}
++
++		wake_up(&dsp->ds_waitq);
++	}
++out:
++	if (dsp) {
++		/*
++		 * Sample the error bit before dropping our reference.  On
++		 * the error paths this can be the final put, which frees
++		 * dsp; the old code went on to read dsp->ds_flags after it.
++		 */
++		int failed = test_bit(DS_STATEID_ERROR, &dsp->ds_flags);
++
++		put_ds_stateid(dsp);
++		if (failed)
++			dsp = NULL;
++	}
++out_noput:
++	if (dsp)
++		dprintk("pNFSD: %s <-- dsp %p ds_flags %lx " STATEID_FMT "\n",
++			__func__, dsp, dsp->ds_flags, STATEID_VAL(&dsp->ds_stid));
++	/* If error, return null */
++	if (dsp && test_bit(DS_STATEID_ERROR, &dsp->ds_flags))
++		dsp = NULL;
++	dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp);
++	return dsp;
++}
++
++int
++nfs4_preprocess_pnfs_ds_stateid(struct svc_fh *cfh, stateid_t *stateid)
++{
++	struct pnfs_ds_stateid *dsp;
++	int status = 0;
++
++	dprintk("pNFSD: %s --> " STATEID_FMT "\n", __func__,
++		STATEID_VAL(stateid));
++
++	/* Must release state lock while verifying stateid on mds */
++	nfs4_unlock_state();
++	ds_lock_state();
++	dsp = nfsv4_ds_get_state(cfh, stateid);
++	if (dsp) {
++		get_ds_stateid(dsp);
++		dprintk("pNFSD: %s Found " STATEID_FMT "\n", __func__,
++			STATEID_VAL(&dsp->ds_stid));
++
++		dprintk("NFSD: %s: dsp %p fh_size %u:%u "
++			"fh [%08x:%08x:%08x:%08x]:[%08x:%08x:%08x:%08x] "
++			"gen %x:%x\n",
++			__func__, dsp,
++			cfh->fh_handle.fh_size, dsp->ds_fh.fh_size,
++			((unsigned *)&cfh->fh_handle.fh_base)[0],
++			((unsigned *)&cfh->fh_handle.fh_base)[1],
++			((unsigned *)&cfh->fh_handle.fh_base)[2],
++			((unsigned *)&cfh->fh_handle.fh_base)[3],
++			((unsigned *)&dsp->ds_fh.fh_base)[0],
++			((unsigned *)&dsp->ds_fh.fh_base)[1],
++			((unsigned *)&dsp->ds_fh.fh_base)[2],
++			((unsigned *)&dsp->ds_fh.fh_base)[3],
++			stateid->si_generation, dsp->ds_stid.si_generation);
++	}
++
++	if (!dsp ||
++	    (cfh->fh_handle.fh_size != dsp->ds_fh.fh_size) ||
++	    (memcmp(&cfh->fh_handle.fh_base, &dsp->ds_fh.fh_base,
++		    dsp->ds_fh.fh_size) != 0) ||
++	    (stateid->si_generation > dsp->ds_stid.si_generation))
++		status = nfserr_bad_stateid;
++	else if (stateid->si_generation < dsp->ds_stid.si_generation)
++		status = nfserr_old_stateid;
++
++	if (dsp)
++
put_ds_stateid(dsp); ++ ds_unlock_state(); ++ nfs4_lock_state(); ++ dprintk("pNFSD: %s <-- status %d\n", __func__, be32_to_cpu(status)); ++ return status; ++} ++ ++void ++nfs4_ds_get_verifier(stateid_t *stateid, struct super_block *sb, u32 *p) ++{ ++ struct pnfs_ds_stateid *dsp = NULL; ++ ++ dprintk("pNFSD: %s --> stid %p\n", __func__, stateid); ++ ++ ds_lock_state(); ++ if (stateid != NULL) { ++ dsp = find_pnfs_ds_stateid(stateid); ++ if (dsp) ++ get_ds_stateid(dsp); ++ } ++ ++ /* XXX: Should we fetch the stateid or wait if some other ++ * thread is currently retrieving the stateid ? */ ++ if (dsp && test_bit(DS_STATEID_VALID, &dsp->ds_flags)) { ++ *p++ = dsp->ds_verifier[0]; ++ *p++ = dsp->ds_verifier[1]; ++ put_ds_stateid(dsp); ++ } else { ++ /* must be on MDS */ ++ ds_unlock_state(); ++ sb->s_pnfs_op->get_verifier(sb, p); ++ ds_lock_state(); ++ p += 2; ++ } ++ ds_unlock_state(); ++ dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp); ++ return; ++} ++ ++#endif /* CONFIG_PNFSD */ +diff -up linux-2.6.37.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.37.noarch/fs/nfsd/nfs4proc.c +--- linux-2.6.37.noarch/fs/nfsd/nfs4proc.c.orig 2011-01-28 09:37:32.559979357 -0500 ++++ linux-2.6.37.noarch/fs/nfsd/nfs4proc.c 2011-01-28 09:43:53.357769620 -0500 +@@ -34,10 +34,14 @@ + */ + #include + #include ++#include ++#include ++#include + + #include "cache.h" + #include "xdr4.h" + #include "vfs.h" ++#include "pnfsd.h" + + #define NFSDDBG_FACILITY NFSDDBG_PROC + +@@ -372,6 +376,24 @@ nfsd4_open(struct svc_rqst *rqstp, struc + * set, (2) sets open->op_stateid, (3) sets open->op_delegation. 
+ */ + status = nfsd4_process_open2(rqstp, &cstate->current_fh, open); ++#if defined(CONFIG_SPNFS) ++ if (!status && spnfs_enabled()) { ++ struct inode *inode = cstate->current_fh.fh_dentry->d_inode; ++ ++ status = spnfs_open(inode, open); ++ if (status) { ++ dprintk( ++ "nfsd: pNFS could not be enabled for inode: %lu\n", ++ inode->i_ino); ++ /* ++ * XXX When there's a failure then need to indicate to ++ * future ops that no pNFS is available. Should I save ++ * the status in the inode? It's kind of a big hammer. ++ * But there may be no stripes available? ++ */ ++ } ++ } ++#endif /* CONFIG_SPNFS */ + out: + if (open->op_stateowner) { + nfs4_get_stateowner(open->op_stateowner); +@@ -454,16 +476,30 @@ nfsd4_access(struct svc_rqst *rqstp, str + &access->ac_supported); + } + ++static void ++nfsd4_get_verifier(struct super_block *sb, nfs4_verifier *verf) ++{ ++ u32 *p = (u32 *)verf->data; ++ ++#if defined(CONFIG_PNFSD) ++ if (sb->s_pnfs_op && sb->s_pnfs_op->get_verifier) { ++ nfs4_ds_get_verifier(NULL, sb, p); ++ return; ++ } ++#endif /* CONFIG_PNFSD */ ++ ++ *p++ = nfssvc_boot.tv_sec; ++ *p++ = nfssvc_boot.tv_usec; ++} ++ + static __be32 + nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_commit *commit) + { + __be32 status; + +- u32 *p = (u32 *)commit->co_verf.data; +- *p++ = nfssvc_boot.tv_sec; +- *p++ = nfssvc_boot.tv_usec; +- ++ nfsd4_get_verifier(cstate->current_fh.fh_dentry->d_inode->i_sb, ++ &commit->co_verf); + status = nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, + commit->co_count); + if (status == nfserr_symlink) +@@ -846,7 +882,6 @@ nfsd4_write(struct svc_rqst *rqstp, stru + { + stateid_t *stateid = &write->wr_stateid; + struct file *filp = NULL; +- u32 *p; + __be32 status = nfs_ok; + unsigned long cnt; + +@@ -868,13 +903,49 @@ nfsd4_write(struct svc_rqst *rqstp, stru + + cnt = write->wr_buflen; + write->wr_how_written = write->wr_stable_how; +- p = (u32 *)write->wr_verifier.data; +- *p++ = 
nfssvc_boot.tv_sec; +- *p++ = nfssvc_boot.tv_usec; + ++ nfsd4_get_verifier(cstate->current_fh.fh_dentry->d_inode->i_sb, ++ &write->wr_verifier); ++#if defined(CONFIG_SPNFS) ++#if defined(CONFIG_SPNFS_BLOCK) ++ if (pnfs_block_enabled(cstate->current_fh.fh_dentry->d_inode, 0)) { ++ status = bl_layoutrecall(cstate->current_fh.fh_dentry->d_inode, ++ RETURN_FILE, write->wr_offset, write->wr_buflen); ++ if (!status) { ++ status = nfsd_write(rqstp, &cstate->current_fh, filp, ++ write->wr_offset, rqstp->rq_vec, write->wr_vlen, ++ &cnt, &write->wr_how_written); ++ } ++ } else ++#endif ++ ++ if (spnfs_enabled()) { ++ status = spnfs_write(cstate->current_fh.fh_dentry->d_inode, ++ write->wr_offset, write->wr_buflen, write->wr_vlen, ++ rqstp); ++ if (status == nfs_ok) { ++ /* DMXXX: HACK to get filesize set */ ++ /* write one byte at offset+length-1 */ ++ struct kvec k[1]; ++ char zero = 0; ++ unsigned long cnt = 1; ++ ++ k[0].iov_base = (void *)&zero; ++ k[0].iov_len = 1; ++ nfsd_write(rqstp, &cstate->current_fh, filp, ++ write->wr_offset+write->wr_buflen-1, k, 1, ++ &cnt, &write->wr_how_written); ++ } ++ } else /* we're not an MDS */ ++ status = nfsd_write(rqstp, &cstate->current_fh, filp, ++ write->wr_offset, rqstp->rq_vec, write->wr_vlen, ++ &cnt, &write->wr_how_written); ++#else + status = nfsd_write(rqstp, &cstate->current_fh, filp, + write->wr_offset, rqstp->rq_vec, write->wr_vlen, + &cnt, &write->wr_how_written); ++#endif /* CONFIG_SPNFS */ ++ + if (filp) + fput(filp); + +@@ -965,6 +1036,306 @@ nfsd4_verify(struct svc_rqst *rqstp, str + return status == nfserr_same ? nfs_ok : status; + } + ++#if defined(CONFIG_PNFSD) ++ ++static __be32 ++nfsd4_layout_verify(struct super_block *sb, struct svc_export *exp, ++ unsigned int layout_type) ++{ ++ int status, type; ++ ++ /* check to see if pNFS is supported. 
*/ ++ status = nfserr_layoutunavailable; ++ if (exp && exp->ex_pnfs == 0) { ++ dprintk("%s: Underlying file system " ++ "is not exported over pNFS\n", __func__); ++ goto out; ++ } ++ if (!sb->s_pnfs_op || !sb->s_pnfs_op->layout_type) { ++ dprintk("%s: Underlying file system " ++ "does not support pNFS\n", __func__); ++ goto out; ++ } ++ ++ type = sb->s_pnfs_op->layout_type(sb); ++ ++ /* check to see if requested layout type is supported. */ ++ status = nfserr_unknown_layouttype; ++ if (!type) ++ dprintk("BUG: %s: layout_type 0 is reserved and must not be " ++ "used by filesystem\n", __func__); ++ else if (type != layout_type) ++ dprintk("%s: requested layout type %d " ++ "does not match supported type %d\n", ++ __func__, layout_type, type); ++ else ++ status = nfs_ok; ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_getdevlist(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_getdevlist *gdlp) ++{ ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ int status; ++ ++ dprintk("%s: type %u maxdevices %u cookie %llu verf %llu\n", ++ __func__, gdlp->gd_layout_type, gdlp->gd_maxdevices, ++ gdlp->gd_cookie, gdlp->gd_verf); ++ ++ ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* We must be able to encode at list one device */ ++ if (!gdlp->gd_maxdevices) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ gdlp->gd_layout_type); ++ if (status) ++ goto out; ++ ++ /* Do nothing if underlying file system does not support ++ * getdevicelist */ ++ if (!sb->s_pnfs_op->get_device_iter) { ++ status = nfserr_notsupp; ++ goto out; ++ } ++ ++ /* Set up arguments so device can be retrieved at encode time */ ++ gdlp->gd_fhp = &cstate->current_fh; 
++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_getdevinfo(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_getdevinfo *gdp) ++{ ++ struct super_block *sb; ++ int status; ++ clientid_t clid; ++ ++ dprintk("%s: layout_type %u dev_id %llx:%llx maxcnt %u\n", ++ __func__, gdp->gd_layout_type, gdp->gd_devid.sbid, ++ gdp->gd_devid.devid, gdp->gd_maxcount); ++ ++ status = nfserr_inval; ++ sb = find_sbid_id(gdp->gd_devid.sbid); ++ dprintk("%s: sb %p\n", __func__, sb); ++ if (!sb) { ++ status = nfserr_noent; ++ goto out; ++ } ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, NULL, gdp->gd_layout_type); ++ if (status) ++ goto out; ++ ++ /* Set up arguments so device can be retrieved at encode time */ ++ gdp->gd_sb = sb; ++ ++ /* Update notifications */ ++ copy_clientid(&clid, cstate->session); ++ pnfs_set_device_notify(&clid, gdp->gd_notify_types); ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_layoutget(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_layoutget *lgp) ++{ ++ int status; ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ lgp->lg_seg.layout_type); ++ if (status) ++ goto out; ++ ++ status = nfserr_badiomode; ++ if (lgp->lg_seg.iomode != IOMODE_READ && ++ lgp->lg_seg.iomode != IOMODE_RW) { ++ dprintk("pNFS %s: invalid iomode %d\n", __func__, ++ lgp->lg_seg.iomode); ++ goto out; ++ } ++ ++ /* Set up arguments so layout can be retrieved at encode time */ ++ lgp->lg_fhp = current_fh; ++ copy_clientid((clientid_t 
*)&lgp->lg_seg.clientid, cstate->session); ++ status = nfs_ok; ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_layoutcommit(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_layoutcommit *lcp) ++{ ++ int status; ++ struct inode *ino = NULL; ++ struct iattr ia; ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ ++ dprintk("NFSD: nfsd4_layoutcommit \n"); ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ ino = current_fh->fh_dentry->d_inode; ++ if (!ino) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = ino->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ lcp->args.lc_seg.layout_type); ++ if (status) ++ goto out; ++ ++ /* This will only extend the file length. Do a quick ++ * check to see if there is any point in waiting for the update ++ * locks. ++ * TODO: Is this correct for all back ends? 
++ */ ++ dprintk("%s:new offset: %d new size: %llu old size: %lld\n", ++ __func__, lcp->args.lc_newoffset, lcp->args.lc_last_wr + 1, ++ ino->i_size); ++ ++ /* Set clientid from sessionid */ ++ copy_clientid((clientid_t *)&lcp->args.lc_seg.clientid, cstate->session); ++ lcp->res.lc_size_chg = 0; ++ if (sb->s_pnfs_op->layout_commit) { ++ status = sb->s_pnfs_op->layout_commit(ino, &lcp->args, &lcp->res); ++ dprintk("%s:layout_commit result %d\n", __func__, status); ++ } else { ++ fh_lock(current_fh); ++ if ((lcp->args.lc_newoffset == 0) || ++ ((lcp->args.lc_last_wr + 1) <= ino->i_size)) { ++ status = 0; ++ lcp->res.lc_size_chg = 0; ++ fh_unlock(current_fh); ++ goto out; ++ } ++ ++ /* Try our best to update the file size */ ++ dprintk("%s: Modifying file size\n", __func__); ++ ia.ia_valid = ATTR_SIZE; ++ ia.ia_size = lcp->args.lc_last_wr + 1; ++ status = notify_change(current_fh->fh_dentry, &ia); ++ fh_unlock(current_fh); ++ dprintk("%s:notify_change result %d\n", __func__, status); ++ } ++ ++ if (!status && lcp->res.lc_size_chg && ++ EX_ISSYNC(current_fh->fh_export)) { ++ dprintk("%s: Synchronously writing inode size %llu\n", ++ __func__, ino->i_size); ++ write_inode_now(ino, 1); ++ lcp->res.lc_newsize = i_size_read(ino); ++ } ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_layoutreturn(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ int status; ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ lrp->args.lr_seg.layout_type); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ if (lrp->args.lr_return_type != 
RETURN_FILE && ++ lrp->args.lr_return_type != RETURN_FSID && ++ lrp->args.lr_return_type != RETURN_ALL) { ++ dprintk("pNFS %s: invalid return_type %d\n", __func__, ++ lrp->args.lr_return_type); ++ goto out; ++ } ++ ++ status = nfserr_inval; ++ if (lrp->args.lr_seg.iomode != IOMODE_READ && ++ lrp->args.lr_seg.iomode != IOMODE_RW && ++ lrp->args.lr_seg.iomode != IOMODE_ANY) { ++ dprintk("pNFS %s: invalid iomode %d\n", __func__, ++ lrp->args.lr_seg.iomode); ++ goto out; ++ } ++ ++ /* Set clientid from sessionid */ ++ copy_clientid((clientid_t *)&lrp->args.lr_seg.clientid, cstate->session); ++ lrp->lrs_present = (lrp->args.lr_return_type == RETURN_FILE); ++ status = nfs4_pnfs_return_layout(sb, current_fh, lrp); ++out: ++ dprintk("pNFS %s: status %d return_type 0x%x lrs_present %d\n", ++ __func__, status, lrp->args.lr_return_type, lrp->lrs_present); ++ return status; ++} ++#endif /* CONFIG_PNFSD */ ++ + /* + * NULL call. + */ +@@ -1355,6 +1726,29 @@ static struct nfsd4_operation nfsd4_ops[ + .op_func = (nfsd4op_func)nfsd4_secinfo_no_name, + .op_name = "OP_SECINFO_NO_NAME", + }, ++#if defined(CONFIG_PNFSD) ++ [OP_GETDEVICELIST] = { ++ .op_func = (nfsd4op_func)nfsd4_getdevlist, ++ .op_name = "OP_GETDEVICELIST", ++ }, ++ [OP_GETDEVICEINFO] = { ++ .op_func = (nfsd4op_func)nfsd4_getdevinfo, ++ .op_flags = ALLOWED_WITHOUT_FH, ++ .op_name = "OP_GETDEVICEINFO", ++ }, ++ [OP_LAYOUTGET] = { ++ .op_func = (nfsd4op_func)nfsd4_layoutget, ++ .op_name = "OP_LAYOUTGET", ++ }, ++ [OP_LAYOUTCOMMIT] = { ++ .op_func = (nfsd4op_func)nfsd4_layoutcommit, ++ .op_name = "OP_LAYOUTCOMMIT", ++ }, ++ [OP_LAYOUTRETURN] = { ++ .op_func = (nfsd4op_func)nfsd4_layoutreturn, ++ .op_name = "OP_LAYOUTRETURN", ++ }, ++#endif /* CONFIG_PNFSD */ + }; + + static const char *nfsd4_op_name(unsigned opnum) +diff -up linux-2.6.37.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.37.noarch/fs/nfsd/nfs4state.c +--- linux-2.6.37.noarch/fs/nfsd/nfs4state.c.orig 2011-01-28 09:37:32.562979253 -0500 ++++ 
linux-2.6.37.noarch/fs/nfsd/nfs4state.c 2011-01-28 09:43:53.359769399 -0500 +@@ -42,6 +42,8 @@ + #include "xdr4.h" + #include "vfs.h" + ++#include "pnfsd.h" ++ + #define NFSDDBG_FACILITY NFSDDBG_PROC + + /* Globals */ +@@ -59,8 +61,6 @@ static u64 current_sessionid = 1; + #define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) + + /* forward declarations */ +-static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); +-static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); + static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; + static void nfs4_set_recdir(char *recdir); + +@@ -68,6 +68,7 @@ static void nfs4_set_recdir(char *recdir + + /* Currently used for almost all code touching nfsv4 state: */ + static DEFINE_MUTEX(client_mutex); ++struct task_struct *client_mutex_owner; + + /* + * Currently used for the del_recall_lru and file hash table. In an +@@ -85,11 +86,21 @@ void + nfs4_lock_state(void) + { + mutex_lock(&client_mutex); ++ client_mutex_owner = current; ++} ++ ++#define BUG_ON_UNLOCKED_STATE() BUG_ON(client_mutex_owner != current) ++ ++void ++nfs4_bug_on_unlocked_state(void) ++{ ++ BUG_ON(client_mutex_owner != current); + } + + void + nfs4_unlock_state(void) + { ++ client_mutex_owner = NULL; + mutex_unlock(&client_mutex); + } + +@@ -108,7 +119,7 @@ opaque_hashval(const void *ptr, int nbyt + + static struct list_head del_recall_lru; + +-static inline void ++inline void + put_nfs4_file(struct nfs4_file *fi) + { + if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) { +@@ -119,7 +130,7 @@ put_nfs4_file(struct nfs4_file *fi) + } + } + +-static inline void ++inline void + get_nfs4_file(struct nfs4_file *fi) + { + atomic_inc(&fi->fi_ref); +@@ -179,10 +190,16 @@ static void nfs4_file_get_access(struct + + static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag) + { +- if (fp->fi_fds[oflag]) { +- fput(fp->fi_fds[oflag]); +- fp->fi_fds[oflag] = NULL; +- } ++ struct 
file *fd = fp->fi_fds[oflag]; ++ ++ if (!fd) ++ return; ++ ++ fp->fi_fds[oflag] = NULL; ++ BUG_ON_UNLOCKED_STATE(); ++ nfs4_unlock_state(); /* allow nested layout recall/return */ ++ fput(fd); ++ nfs4_lock_state(); + } + + static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag) +@@ -306,8 +323,8 @@ static DEFINE_SPINLOCK(client_lock); + * reclaim_str_hashtbl[] holds known client info from previous reset/reboot + * used in reboot/reset lease grace period processing + * +- * conf_id_hashtbl[], and conf_str_hashtbl[] hold confirmed +- * setclientid_confirmed info. ++ * conf_id_hashtbl[], and conf_str_hashtbl[] hold ++ * confirmed setclientid_confirmed info. + * + * unconf_str_hastbl[] and unconf_id_hashtbl[] hold unconfirmed + * setclientid info. +@@ -332,6 +349,7 @@ static void unhash_generic_stateid(struc + list_del(&stp->st_hash); + list_del(&stp->st_perfile); + list_del(&stp->st_perstateowner); ++ release_pnfs_ds_dev_list(stp); + } + + static void free_generic_stateid(struct nfs4_stateid *stp) +@@ -954,6 +972,8 @@ expire_client(struct nfs4_client *clp) + struct nfs4_delegation *dp; + struct list_head reaplist; + ++ BUG_ON_UNLOCKED_STATE(); ++ + INIT_LIST_HEAD(&reaplist); + spin_lock(&recall_lock); + while (!list_empty(&clp->cl_delegations)) { +@@ -973,6 +993,7 @@ expire_client(struct nfs4_client *clp) + sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); + release_openowner(sop); + } ++ pnfs_expire_client(clp); + nfsd4_shutdown_callback(clp); + if (clp->cl_cb_conn.cb_xprt) + svc_xprt_put(clp->cl_cb_conn.cb_xprt); +@@ -985,6 +1006,13 @@ expire_client(struct nfs4_client *clp) + spin_unlock(&client_lock); + } + ++void expire_client_lock(struct nfs4_client *clp) ++{ ++ nfs4_lock_state(); ++ expire_client(clp); ++ nfs4_unlock_state(); ++} ++ + static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) + { + memcpy(target->cl_verifier.data, source->data, +@@ -1076,6 +1104,11 @@ static struct nfs4_client 
*create_client + INIT_LIST_HEAD(&clp->cl_strhash); + INIT_LIST_HEAD(&clp->cl_openowners); + INIT_LIST_HEAD(&clp->cl_delegations); ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&clp->cl_layouts); ++ INIT_LIST_HEAD(&clp->cl_layoutrecalls); ++ atomic_set(&clp->cl_deviceref, 0); ++#endif /* CONFIG_PNFSD */ + INIT_LIST_HEAD(&clp->cl_lru); + INIT_LIST_HEAD(&clp->cl_callbacks); + spin_lock_init(&clp->cl_lock); +@@ -1127,7 +1160,7 @@ move_to_confirmed(struct nfs4_client *cl + renew_client(clp); + } + +-static struct nfs4_client * ++struct nfs4_client * + find_confirmed_client(clientid_t *clid) + { + struct nfs4_client *clp; +@@ -1182,6 +1215,24 @@ find_unconfirmed_client_by_str(const cha + return NULL; + } + ++int ++filter_confirmed_clients(int (* func)(struct nfs4_client *, void *), ++ void *arg) ++{ ++ struct nfs4_client *clp, *next; ++ int i, status = 0; ++ ++ for (i = 0; i < CLIENT_HASH_SIZE; i++) ++ list_for_each_entry_safe (clp, next, &conf_str_hashtbl[i], ++ cl_strhash) { ++ status = func(clp, arg); ++ if (status) ++ break; ++ } ++ ++ return status; ++} ++ + static void rpc_svcaddr2sockaddr(struct sockaddr *sa, unsigned short family, union svc_addr_u *svcaddr) + { + switch (family) { +@@ -1330,8 +1381,12 @@ nfsd4_replay_cache_entry(struct nfsd4_co + static void + nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid) + { +- /* pNFS is not supported */ ++#if defined(CONFIG_PNFSD) ++ new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS | ++ EXCHGID4_FLAG_USE_PNFS_DS; ++#else /* CONFIG_PNFSD */ + new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS; ++#endif /* CONFIG_PNFSD */ + + /* Referrals are supported, Migration is not. 
*/ + new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER; +@@ -1527,6 +1582,13 @@ nfsd4_create_session(struct svc_rqst *rq + bool confirm_me = false; + int status = 0; + ++#if defined(CONFIG_PNFSD_LOCAL_EXPORT) ++ /* XXX hack to get local ip address */ ++ memcpy(&pnfsd_lexp_addr, &rqstp->rq_xprt->xpt_local, ++ sizeof(pnfsd_lexp_addr)); ++ pnfs_lexp_addr_len = rqstp->rq_xprt->xpt_locallen; ++#endif /* CONFIG_PNFSD_LOCAL_EXPORT */ ++ + nfs4_lock_state(); + unconf = find_unconfirmed_client(&cr_ses->clientid); + conf = find_confirmed_client(&cr_ses->clientid); +@@ -1562,6 +1624,9 @@ nfsd4_create_session(struct svc_rqst *rq + goto out; + } + ++ if (is_ds_only_session(unconf->cl_exchange_flags)) ++ cr_ses->flags &= ~SESSION4_BACK_CHAN; ++ + confirm_me = true; + conf = unconf; + } else { +@@ -2064,7 +2129,7 @@ out: + + /* OPEN Share state helper functions */ + static inline struct nfs4_file * +-alloc_init_file(struct inode *ino) ++alloc_init_file(struct inode *ino, struct svc_fh *current_fh) + { + struct nfs4_file *fp; + unsigned int hashval = file_hashval(ino); +@@ -2080,6 +2145,16 @@ alloc_init_file(struct inode *ino) + fp->fi_had_conflict = false; + memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); + memset(fp->fi_access, 0, sizeof(fp->fi_access)); ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&fp->fi_layouts); ++ INIT_LIST_HEAD(&fp->fi_layout_states); ++ fp->fi_fsid.major = current_fh->fh_export->ex_fsid; ++ fp->fi_fsid.minor = 0; ++ fp->fi_fhlen = current_fh->fh_handle.fh_size; ++ BUG_ON(fp->fi_fhlen > sizeof(fp->fi_fhval)); ++ memcpy(fp->fi_fhval, ¤t_fh->fh_handle.fh_base, ++ fp->fi_fhlen); ++#endif /* CONFIG_PNFSD */ + spin_lock(&recall_lock); + list_add(&fp->fi_hash, &file_hashtbl[hashval]); + spin_unlock(&recall_lock); +@@ -2088,7 +2163,7 @@ alloc_init_file(struct inode *ino) + return NULL; + } + +-static void ++void + nfsd4_free_slab(struct kmem_cache **slab) + { + if (*slab == NULL) +@@ -2104,6 +2179,7 @@ nfsd4_free_slabs(void) + nfsd4_free_slab(&file_slab); + 
nfsd4_free_slab(&stateid_slab); + nfsd4_free_slab(&deleg_slab); ++ nfsd4_free_pnfs_slabs(); + } + + static int +@@ -2125,6 +2201,8 @@ nfsd4_init_slabs(void) + sizeof(struct nfs4_delegation), 0, 0, NULL); + if (deleg_slab == NULL) + goto out_nomem; ++ if (nfsd4_init_pnfs_slabs()) ++ goto out_nomem; + return 0; + out_nomem: + nfsd4_free_slabs(); +@@ -2198,6 +2276,9 @@ init_stateid(struct nfs4_stateid *stp, s + INIT_LIST_HEAD(&stp->st_perstateowner); + INIT_LIST_HEAD(&stp->st_lockowners); + INIT_LIST_HEAD(&stp->st_perfile); ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&stp->st_pnfs_ds_id); ++#endif /* CONFIG_PNFSD */ + list_add(&stp->st_hash, &stateid_hashtbl[hashval]); + list_add(&stp->st_perstateowner, &sop->so_stateids); + list_add(&stp->st_perfile, &fp->fi_stateids); +@@ -2239,6 +2320,7 @@ find_openstateowner_str(unsigned int has + { + struct nfs4_stateowner *so = NULL; + ++ BUG_ON_UNLOCKED_STATE(); + list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) { + if (same_owner_str(so, &open->op_owner, &open->op_clientid)) + return so; +@@ -2247,7 +2329,7 @@ find_openstateowner_str(unsigned int has + } + + /* search file_hashtbl[] for file */ +-static struct nfs4_file * ++struct nfs4_file * + find_file(struct inode *ino) + { + unsigned int hashval = file_hashval(ino); +@@ -2265,6 +2347,18 @@ find_file(struct inode *ino) + return NULL; + } + ++struct nfs4_file * ++find_alloc_file(struct inode *ino, struct svc_fh *current_fh) ++{ ++ struct nfs4_file *fp; ++ ++ fp = find_file(ino); ++ if (fp) ++ return fp; ++ ++ return alloc_init_file(ino, current_fh); ++} ++ + static inline int access_valid(u32 x, u32 minorversion) + { + if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ) +@@ -2758,7 +2852,7 @@ nfsd4_process_open2(struct svc_rqst *rqs + if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) + goto out; + status = nfserr_resource; +- fp = alloc_init_file(ino); ++ fp = alloc_init_file(ino, current_fh); + if (fp == NULL) + goto out; + } +@@ -2979,7 
+3073,7 @@ nfs4_check_fh(struct svc_fh *fhp, struct + return fhp->fh_dentry->d_inode != stp->st_file->fi_inode; + } + +-static int ++int + STALE_STATEID(stateid_t *stateid) + { + if (stateid->si_boot == boot_time) +@@ -2989,6 +3083,16 @@ STALE_STATEID(stateid_t *stateid) + return 1; + } + ++__be32 ++nfs4_check_stateid(stateid_t *stateid) ++{ ++ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) ++ return nfserr_bad_stateid; ++ if (STALE_STATEID(stateid)) ++ return nfserr_stale_stateid; ++ return 0; ++} ++ + static inline int + access_permit_read(unsigned long access_bmap) + { +@@ -3100,6 +3204,24 @@ nfs4_preprocess_stateid_op(struct nfsd4_ + if (grace_disallows_io(ino)) + return nfserr_grace; + ++#if defined(CONFIG_PNFSD) ++ if (pnfs_fh_is_ds(¤t_fh->fh_handle)) { ++ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) ++ status = nfserr_bad_stateid; ++ else ++#ifdef CONFIG_GFS2_FS_LOCKING_DLM ++ { ++ dprintk("%s Don't check DS stateid\n", __func__); ++ return 0; ++ } ++#else /* CONFIG_GFS2_FS_LOCKING_DLM */ ++ status = nfs4_preprocess_pnfs_ds_stateid(current_fh, ++ stateid); ++#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ ++ goto out; ++ } ++#endif /* CONFIG_PNFSD */ ++ + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; + +@@ -3187,13 +3309,9 @@ nfs4_preprocess_seqid_op(struct nfsd4_co + *stpp = NULL; + *sopp = NULL; + +- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { +- dprintk("NFSD: preprocess_seqid_op: magic stateid!\n"); +- return nfserr_bad_stateid; +- } +- +- if (STALE_STATEID(stateid)) +- return nfserr_stale_stateid; ++ status = nfs4_check_stateid(stateid); ++ if (status) ++ return status; + + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; +@@ -3468,11 +3586,8 @@ nfsd4_delegreturn(struct svc_rqst *rqstp + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; + nfs4_lock_state(); +- status = nfserr_bad_stateid; +- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) +- goto out; +- status = nfserr_stale_stateid; +- if (STALE_STATEID(stateid)) ++ 
status = nfs4_check_stateid(stateid); ++ if (status) + goto out; + status = nfserr_bad_stateid; + if (!is_delegation_stateid(stateid)) +@@ -3502,26 +3617,6 @@ out: + #define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS) + #define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1) + +-static inline u64 +-end_offset(u64 start, u64 len) +-{ +- u64 end; +- +- end = start + len; +- return end >= start ? end: NFS4_MAX_UINT64; +-} +- +-/* last octet in a range */ +-static inline u64 +-last_byte_offset(u64 start, u64 len) +-{ +- u64 end; +- +- BUG_ON(!len); +- end = start + len; +- return end > start ? end - 1: NFS4_MAX_UINT64; +-} +- + #define lockownerid_hashval(id) \ + ((id) & LOCK_HASH_MASK) + +@@ -3538,7 +3633,7 @@ static struct list_head lock_ownerid_has + static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; + static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE]; + +-static struct nfs4_stateid * ++struct nfs4_stateid * + find_stateid(stateid_t *stid, int flags) + { + struct nfs4_stateid *local; +@@ -3567,7 +3662,7 @@ find_stateid(stateid_t *stid, int flags) + return NULL; + } + +-static struct nfs4_delegation * ++struct nfs4_delegation * + find_delegation_stateid(struct inode *ino, stateid_t *stid) + { + struct nfs4_file *fp; +@@ -3698,6 +3793,9 @@ alloc_init_lock_stateid(struct nfs4_stat + INIT_LIST_HEAD(&stp->st_perfile); + INIT_LIST_HEAD(&stp->st_perstateowner); + INIT_LIST_HEAD(&stp->st_lockowners); /* not used */ ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&stp->st_pnfs_ds_id); ++#endif /* CONFIG_PNFSD */ + list_add(&stp->st_hash, &lockstateid_hashtbl[hashval]); + list_add(&stp->st_perfile, &fp->fi_stateids); + list_add(&stp->st_perstateowner, &sop->so_stateids); +@@ -4274,6 +4372,9 @@ nfs4_state_init(void) + INIT_LIST_HEAD(&client_lru); + INIT_LIST_HEAD(&del_recall_lru); + reclaim_str_hashtbl_size = 0; ++#if defined(CONFIG_PNFSD) ++ nfs4_pnfs_state_init(); ++#endif /* CONFIG_PNFSD */ + return 0; + } + +@@ -4378,6 +4479,7 @@ __nfs4_state_shutdown(void) + } + + 
nfsd4_shutdown_recdir(); ++ nfs4_pnfs_state_shutdown(); + } + + void +diff -up linux-2.6.37.noarch/fs/nfsd/nfs4xdr.c.orig linux-2.6.37.noarch/fs/nfsd/nfs4xdr.c +--- linux-2.6.37.noarch/fs/nfsd/nfs4xdr.c.orig 2011-01-28 09:37:32.564979184 -0500 ++++ linux-2.6.37.noarch/fs/nfsd/nfs4xdr.c 2011-01-28 09:43:53.361769183 -0500 +@@ -45,11 +45,16 @@ + #include + #include + #include ++#include ++#include ++#include ++#include + + #include "idmap.h" + #include "acl.h" + #include "xdr4.h" + #include "vfs.h" ++#include "pnfsd.h" + + + #define NFSDDBG_FACILITY NFSDDBG_XDR +@@ -1279,6 +1284,138 @@ static __be32 nfsd4_decode_reclaim_compl + DECODE_TAIL; + } + ++#if defined(CONFIG_PNFSD) ++static __be32 ++nfsd4_decode_getdevlist(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_getdevlist *gdevl) ++{ ++ DECODE_HEAD; ++ ++ READ_BUF(16 + sizeof(nfs4_verifier)); ++ READ32(gdevl->gd_layout_type); ++ READ32(gdevl->gd_maxdevices); ++ READ64(gdevl->gd_cookie); ++ COPYMEM(&gdevl->gd_verf, sizeof(nfs4_verifier)); ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_getdevinfo(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_getdevinfo *gdev) ++{ ++ u32 num; ++ DECODE_HEAD; ++ ++ READ_BUF(12 + sizeof(struct nfsd4_pnfs_deviceid)); ++ READ64(gdev->gd_devid.sbid); ++ READ64(gdev->gd_devid.devid); ++ READ32(gdev->gd_layout_type); ++ READ32(gdev->gd_maxcount); ++ READ32(num); ++ if (num) { ++ READ_BUF(4); ++ READ32(gdev->gd_notify_types); ++ } else { ++ gdev->gd_notify_types = 0; ++ } ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_layoutget *lgp) ++{ ++ DECODE_HEAD; ++ ++ READ_BUF(36); ++ READ32(lgp->lg_signal); ++ READ32(lgp->lg_seg.layout_type); ++ READ32(lgp->lg_seg.iomode); ++ READ64(lgp->lg_seg.offset); ++ READ64(lgp->lg_seg.length); ++ READ64(lgp->lg_minlength); ++ nfsd4_decode_stateid(argp, &lgp->lg_sid); ++ READ_BUF(4); ++ READ32(lgp->lg_maxcount); ++ ++ DECODE_TAIL; ++} ++ ++static __be32 
++nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_layoutcommit *lcp) ++{ ++ DECODE_HEAD; ++ u32 timechange; ++ ++ READ_BUF(20); ++ READ64(lcp->args.lc_seg.offset); ++ READ64(lcp->args.lc_seg.length); ++ READ32(lcp->args.lc_reclaim); ++ nfsd4_decode_stateid(argp, &lcp->lc_sid); ++ READ_BUF(4); ++ READ32(lcp->args.lc_newoffset); ++ if (lcp->args.lc_newoffset) { ++ READ_BUF(8); ++ READ64(lcp->args.lc_last_wr); ++ } else ++ lcp->args.lc_last_wr = 0; ++ READ_BUF(4); ++ READ32(timechange); ++ if (timechange) { ++ READ_BUF(12); ++ READ64(lcp->args.lc_mtime.seconds); ++ READ32(lcp->args.lc_mtime.nseconds); ++ } else { ++ lcp->args.lc_mtime.seconds = 0; ++ lcp->args.lc_mtime.nseconds = 0; ++ } ++ READ_BUF(8); ++ READ32(lcp->args.lc_seg.layout_type); ++ /* XXX: saving XDR'ed layout update. Since we don't have the ++ * current_fh yet, and therefore no export_ops, we can't call ++ * the layout specific decode routines. File and pVFS2 ++ * do not use the layout update.... 
++ */ ++ READ32(lcp->args.lc_up_len); ++ if (lcp->args.lc_up_len > 0) { ++ READ_BUF(lcp->args.lc_up_len); ++ READMEM(lcp->args.lc_up_layout, lcp->args.lc_up_len); ++ } ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ DECODE_HEAD; ++ ++ READ_BUF(16); ++ READ32(lrp->args.lr_reclaim); ++ READ32(lrp->args.lr_seg.layout_type); ++ READ32(lrp->args.lr_seg.iomode); ++ READ32(lrp->args.lr_return_type); ++ if (lrp->args.lr_return_type == RETURN_FILE) { ++ READ_BUF(16); ++ READ64(lrp->args.lr_seg.offset); ++ READ64(lrp->args.lr_seg.length); ++ nfsd4_decode_stateid(argp, &lrp->lr_sid); ++ READ_BUF(4); ++ READ32(lrp->args.lrf_body_len); ++ if (lrp->args.lrf_body_len > 0) { ++ READ_BUF(lrp->args.lrf_body_len); ++ READMEM(lrp->args.lrf_body, lrp->args.lrf_body_len); ++ } ++ } ++ ++ DECODE_TAIL; ++} ++#endif /* CONFIG_PNFSD */ ++ + static __be32 + nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) + { +@@ -1380,11 +1517,19 @@ static nfsd4_dec nfsd41_dec_ops[] = { + [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, + [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, ++#if defined(CONFIG_PNFSD) ++ [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdevinfo, ++ [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_getdevlist, ++ [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit, ++ [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget, ++ [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn, ++#else /* CONFIG_PNFSD */ + [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, ++#endif /* CONFIG_PNFSD */ + [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name, + [OP_SEQUENCE] = 
(nfsd4_dec)nfsd4_decode_sequence, + [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, +@@ -2191,6 +2336,36 @@ out_acl: + } + WRITE64(stat.ino); + } ++#if defined(CONFIG_PNFSD) ++ if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) { ++ struct super_block *sb = dentry->d_inode->i_sb; ++ int type = 0; ++ ++ /* Query the filesystem for supported pNFS layout types. ++ * Currently, we only support one layout type per file system. ++ * The export_ops->layout_type() returns the pnfs_layouttype4. ++ */ ++ buflen -= 4; ++ if (buflen < 0) /* length */ ++ goto out_resource; ++ ++ if (sb && sb->s_pnfs_op && sb->s_pnfs_op->layout_type) ++ type = sb->s_pnfs_op->layout_type(sb); ++ if (type) { ++ if ((buflen -= 4) < 0) /* type */ ++ goto out_resource; ++ WRITE32(1); /* length */ ++ WRITE32(type); /* type */ ++ } else ++ WRITE32(0); /* length */ ++ } ++ ++ if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) { ++ if ((buflen -= 4) < 0) ++ goto out_resource; ++ WRITE32(stat.blksize); ++ } ++#endif /* CONFIG_PNFSD */ + if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { + WRITE32(3); + WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0); +@@ -2434,6 +2609,10 @@ nfsd4_encode_commit(struct nfsd4_compoun + if (!nfserr) { + RESERVE_SPACE(8); + WRITEMEM(commit->co_verf.data, 8); ++ dprintk("NFSD: nfsd4_encode_commit: verifier %x:%x\n", ++ ((u32 *)(&commit->co_verf.data))[0], ++ ((u32 *)(&commit->co_verf.data))[1]); ++ + ADJUST_ARGS(); + } + return nfserr; +@@ -2688,6 +2867,13 @@ nfsd4_encode_read(struct nfsd4_compoundr + } + read->rd_vlen = v; + ++#if defined(CONFIG_SPNFS) ++ if (spnfs_enabled()) ++ nfserr = spnfs_read(read->rd_fhp->fh_dentry->d_inode, ++ read->rd_offset, &maxcount, read->rd_vlen, ++ resp->rqstp); ++ else /* we're not an MDS */ ++#endif /* CONFIG_SPNFS */ + nfserr = nfsd_read_file(read->rd_rqstp, read->rd_fhp, read->rd_filp, + read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, + &maxcount); +@@ -3007,6 +3193,9 @@ nfsd4_encode_write(struct nfsd4_compound + WRITE32(write->wr_bytes_written); + 
WRITE32(write->wr_how_written); + WRITEMEM(write->wr_verifier.data, 8); ++ dprintk("NFSD: nfsd4_encode_write: verifier %x:%x\n", ++ ((u32 *)(&write->wr_verifier.data))[0], ++ ((u32 *)(&write->wr_verifier.data))[1]); + ADJUST_ARGS(); + } + return nfserr; +@@ -3146,6 +3335,343 @@ nfsd4_encode_sequence(struct nfsd4_compo + return 0; + } + ++#if defined(CONFIG_PNFSD) ++ ++/* Uses the export interface to iterate through the available devices ++ * and encodes them on the response stream. ++ */ ++static __be32 ++nfsd4_encode_devlist_iterator(struct nfsd4_compoundres *resp, ++ struct nfsd4_pnfs_getdevlist *gdevl, ++ unsigned int *dev_count) ++{ ++ struct super_block *sb = gdevl->gd_fhp->fh_dentry->d_inode->i_sb; ++ __be32 nfserr; ++ int status; ++ __be32 *p; ++ struct nfsd4_pnfs_dev_iter_res res = { ++ .gd_cookie = gdevl->gd_cookie, ++ .gd_verf = gdevl->gd_verf, ++ .gd_eof = 0 ++ }; ++ u64 sbid; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ sbid = find_create_sbid(sb); ++ *dev_count = 0; ++ do { ++ status = sb->s_pnfs_op->get_device_iter(sb, ++ gdevl->gd_layout_type, ++ &res); ++ if (status) { ++ if (status == -ENOENT) { ++ res.gd_eof = 1; ++ /* return success */ ++ break; ++ } ++ nfserr = nfserrno(status); ++ goto out_err; ++ } ++ ++ /* Encode device id and layout type */ ++ RESERVE_SPACE(sizeof(struct nfsd4_pnfs_deviceid)); ++ WRITE64((__be64)sbid); ++ WRITE64(res.gd_devid); /* devid minor */ ++ ADJUST_ARGS(); ++ (*dev_count)++; ++ } while (*dev_count < gdevl->gd_maxdevices && !res.gd_eof); ++ gdevl->gd_cookie = res.gd_cookie; ++ gdevl->gd_verf = res.gd_verf; ++ gdevl->gd_eof = res.gd_eof; ++ nfserr = nfs_ok; ++out_err: ++ dprintk("%s: Encoded %u devices\n", __func__, *dev_count); ++ return nfserr; ++} ++ ++/* Encodes the response of get device list. 
++*/ ++static __be32 ++nfsd4_encode_getdevlist(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_getdevlist *gdevl) ++{ ++ unsigned int dev_count = 0, lead_count; ++ u32 *p_in = resp->p; ++ __be32 *p; ++ ++ dprintk("%s: err %d\n", __func__, nfserr); ++ if (nfserr) ++ return nfserr; ++ ++ /* Ensure we have room for cookie, verifier, and devlist len, ++ * which we will backfill in after we encode as many devices as possible ++ */ ++ lead_count = 8 + sizeof(nfs4_verifier) + 4; ++ RESERVE_SPACE(lead_count); ++ /* skip past these values */ ++ p += XDR_QUADLEN(lead_count); ++ ADJUST_ARGS(); ++ ++ /* Iterate over as many device ids as possible on the xdr stream */ ++ nfserr = nfsd4_encode_devlist_iterator(resp, gdevl, &dev_count); ++ if (nfserr) ++ goto out_err; ++ ++ /* Backfill in cookie, verf and number of devices encoded */ ++ p = p_in; ++ WRITE64(gdevl->gd_cookie); ++ WRITEMEM(&gdevl->gd_verf, sizeof(nfs4_verifier)); ++ WRITE32(dev_count); ++ ++ /* Skip over devices */ ++ p += XDR_QUADLEN(dev_count * sizeof(struct nfsd4_pnfs_deviceid)); ++ ADJUST_ARGS(); ++ ++ /* are we at the end of devices? */ ++ RESERVE_SPACE(4); ++ WRITE32(gdevl->gd_eof); ++ ADJUST_ARGS(); ++ ++ dprintk("%s: done.\n", __func__); ++ ++ nfserr = nfs_ok; ++out: ++ return nfserr; ++out_err: ++ p = p_in; ++ ADJUST_ARGS(); ++ goto out; ++} ++ ++/* For a given device id, have the file system retrieve and encode the ++ * associated device. For file layout, the encoding function is ++ * passed down to the file system. The file system then has the option ++ * of using this encoding function or one of its own. ++ * ++ * Note: the file system must return the XDR size of struct device_addr4 ++ * da_addr_body in pnfs_xdr_info.bytes_written on NFS4ERR_TOOSMALL for the ++ * gdir_mincount calculation. 
++ */ ++static __be32 ++nfsd4_encode_getdevinfo(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_getdevinfo *gdev) ++{ ++ struct super_block *sb; ++ int maxcount = 0, type_notify_len = 12; ++ __be32 *p, *p_save = NULL, *p_in = resp->p; ++ struct exp_xdr_stream xdr; ++ ++ dprintk("%s: err %d\n", __func__, nfserr); ++ if (nfserr) ++ return nfserr; ++ ++ sb = gdev->gd_sb; ++ ++ if (gdev->gd_maxcount != 0) { ++ /* FIXME: this will be bound by the session max response */ ++ maxcount = svc_max_payload(resp->rqstp); ++ if (maxcount > gdev->gd_maxcount) ++ maxcount = gdev->gd_maxcount; ++ ++ /* Ensure have room for type and notify field */ ++ maxcount -= type_notify_len; ++ if (maxcount < 0) { ++ nfserr = -ETOOSMALL; ++ goto toosmall; ++ } ++ } ++ ++ RESERVE_SPACE(4); ++ WRITE32(gdev->gd_layout_type); ++ ADJUST_ARGS(); ++ ++ /* If maxcount is 0 then just update notifications */ ++ if (gdev->gd_maxcount == 0) ++ goto handle_notifications; ++ ++ xdr.p = p_save = resp->p; ++ xdr.end = resp->end; ++ if (xdr.end - xdr.p > exp_xdr_qwords(maxcount & ~3)) ++ xdr.end = xdr.p + exp_xdr_qwords(maxcount & ~3); ++ ++ nfserr = sb->s_pnfs_op->get_device_info(sb, &xdr, gdev->gd_layout_type, ++ &gdev->gd_devid); ++ if (nfserr) ++ goto err; ++ ++ /* The file system should never write 0 bytes without ++ * returning an error ++ */ ++ BUG_ON(xdr.p == p_save); ++ BUG_ON(xdr.p > xdr.end); ++ ++ /* Update the xdr stream with the number of bytes encoded ++ * by the file system. 
++ */ ++ p = xdr.p; ++ ADJUST_ARGS(); ++ ++handle_notifications: ++ /* Encode supported device notifications */ ++ RESERVE_SPACE(4); ++ if (sb->s_pnfs_op->set_device_notify) { ++ struct pnfs_devnotify_arg dn_args; ++ ++ dn_args.dn_layout_type = gdev->gd_layout_type; ++ dn_args.dn_devid = gdev->gd_devid; ++ dn_args.dn_notify_types = gdev->gd_notify_types; ++ nfserr = sb->s_pnfs_op->set_device_notify(sb, &dn_args); ++ if (nfserr) ++ goto err; ++ WRITE32(dn_args.dn_notify_types); ++ } else { ++ WRITE32(0); ++ } ++ ADJUST_ARGS(); ++ ++out: ++ return nfserrno(nfserr); ++toosmall: ++ dprintk("%s: maxcount too small\n", __func__); ++ RESERVE_SPACE(4); ++ WRITE32((p_save ? (xdr.p - p_save) * 4 : 0) + type_notify_len); ++ ADJUST_ARGS(); ++ goto out; ++err: ++ /* Rewind to the beginning */ ++ p = p_in; ++ ADJUST_ARGS(); ++ if (nfserr == -ETOOSMALL) ++ goto toosmall; ++ printk(KERN_ERR "%s: export ERROR %d\n", __func__, nfserr); ++ goto out; ++} ++ ++static __be32 ++nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, ++ __be32 nfserr, ++ struct nfsd4_pnfs_layoutget *lgp) ++{ ++ int maxcount, leadcount; ++ struct super_block *sb; ++ struct exp_xdr_stream xdr; ++ __be32 *p, *p_save, *p_start = resp->p; ++ ++ dprintk("%s: err %d\n", __func__, nfserr); ++ if (nfserr) ++ return nfserr; ++ ++ sb = lgp->lg_fhp->fh_dentry->d_inode->i_sb; ++ maxcount = PAGE_SIZE; ++ if (maxcount > lgp->lg_maxcount) ++ maxcount = lgp->lg_maxcount; ++ ++ /* Check for space on xdr stream */ ++ leadcount = 36 + sizeof(stateid_opaque_t); ++ RESERVE_SPACE(leadcount); ++ /* encode layout metadata after file system encodes layout */ ++ p += XDR_QUADLEN(leadcount); ++ ADJUST_ARGS(); ++ ++ /* Ensure have room for ret_on_close, off, len, iomode, type */ ++ maxcount -= leadcount; ++ if (maxcount < 0) { ++ printk(KERN_ERR "%s: buffer too small\n", __func__); ++ nfserr = nfserr_toosmall; ++ goto err; ++ } ++ ++ /* Set xdr info so file system can encode layout */ ++ xdr.p = p_save = resp->p; ++ xdr.end = 
resp->end; ++ if (xdr.end - xdr.p > exp_xdr_qwords(maxcount & ~3)) ++ xdr.end = xdr.p + exp_xdr_qwords(maxcount & ~3); ++ ++ /* Retrieve, encode, and merge layout; process stateid */ ++ nfserr = nfs4_pnfs_get_layout(lgp, &xdr); ++ if (nfserr) ++ goto err; ++ ++ /* Ensure file system returned enough bytes for the client ++ * to access. ++ */ ++ if (lgp->lg_seg.length < lgp->lg_minlength) { ++ nfserr = nfserr_badlayout; ++ goto err; ++ } ++ ++ /* The file system should never write 0 bytes without ++ * returning an error ++ */ ++ BUG_ON(xdr.p == p_save); ++ ++ /* Rewind to beginning and encode attrs */ ++ resp->p = p_start; ++ RESERVE_SPACE(4); ++ WRITE32(lgp->lg_roc); /* return on close */ ++ ADJUST_ARGS(); ++ nfsd4_encode_stateid(resp, &lgp->lg_sid); ++ RESERVE_SPACE(28); ++ /* Note: response logr_layout array count, always one for now */ ++ WRITE32(1); ++ WRITE64(lgp->lg_seg.offset); ++ WRITE64(lgp->lg_seg.length); ++ WRITE32(lgp->lg_seg.iomode); ++ WRITE32(lgp->lg_seg.layout_type); ++ ++ /* Update the xdr stream with the number of bytes written ++ * by the file system ++ */ ++ p = xdr.p; ++ ADJUST_ARGS(); ++ ++ return nfs_ok; ++err: ++ resp->p = p_start; ++ return nfserr; ++} ++ ++static __be32 ++nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_layoutcommit *lcp) ++{ ++ __be32 *p; ++ ++ if (nfserr) ++ goto out; ++ ++ RESERVE_SPACE(4); ++ WRITE32(lcp->res.lc_size_chg); ++ ADJUST_ARGS(); ++ if (lcp->res.lc_size_chg) { ++ RESERVE_SPACE(8); ++ WRITE64(lcp->res.lc_newsize); ++ ADJUST_ARGS(); ++ } ++out: ++ return nfserr; ++} ++ ++static __be32 ++nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ __be32 *p; ++ ++ if (nfserr) ++ goto out; ++ ++ RESERVE_SPACE(4); ++ WRITE32(lrp->lrs_present != 0); /* got stateid? 
*/ ++ ADJUST_ARGS(); ++ if (lrp->lrs_present) ++ nfsd4_encode_stateid(resp, &lrp->lr_sid); ++out: ++ return nfserr; ++} ++#endif /* CONFIG_PNFSD */ ++ + static __be32 + nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) + { +@@ -3206,11 +3732,19 @@ static nfsd4_enc nfsd4_enc_ops[] = { + [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, + [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, ++#if defined(CONFIG_PNFSD) ++ [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdevinfo, ++ [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_getdevlist, ++ [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_layoutcommit, ++ [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget, ++ [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn, ++#else /* CONFIG_PNFSD */ + [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, ++#endif /* CONFIG_PNFSD */ + [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name, + [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, + [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, +diff -up linux-2.6.37.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.37.noarch/fs/nfsd/nfsctl.c +--- linux-2.6.37.noarch/fs/nfsd/nfsctl.c.orig 2011-01-28 09:37:32.565979149 -0500 ++++ linux-2.6.37.noarch/fs/nfsd/nfsctl.c 2011-01-28 09:43:53.363768972 -0500 +@@ -12,11 +12,16 @@ + #include + #include + #include ++#include + + #include "idmap.h" + #include "nfsd.h" + #include "cache.h" + ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS) ++#include ++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */ ++ + /* + * We have a single directory with 9 nodes in it. 
+ */ +@@ -51,6 +56,9 @@ enum { + NFSD_Gracetime, + NFSD_RecoveryDir, + #endif ++#ifdef CONFIG_PNFSD ++ NFSD_pnfs_dlm_device, ++#endif + }; + + /* +@@ -78,6 +86,9 @@ static ssize_t write_leasetime(struct fi + static ssize_t write_gracetime(struct file *file, char *buf, size_t size); + static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); + #endif ++#ifdef CONFIG_PNFSD ++static ssize_t write_pnfs_dlm_device(struct file *file, char *buf, size_t size); ++#endif + + static ssize_t (*write_op[])(struct file *, char *, size_t) = { + #ifdef CONFIG_NFSD_DEPRECATED +@@ -102,6 +113,9 @@ static ssize_t (*write_op[])(struct file + [NFSD_Gracetime] = write_gracetime, + [NFSD_RecoveryDir] = write_recoverydir, + #endif ++#ifdef CONFIG_PNFSD ++ [NFSD_pnfs_dlm_device] = write_pnfs_dlm_device, ++#endif + }; + + static ssize_t nfsctl_transaction_write(struct file *file, const char __user *buf, size_t size, loff_t *pos) +@@ -1366,6 +1380,68 @@ static ssize_t write_recoverydir(struct + + #endif + ++#ifdef CONFIG_PNFSD ++ ++static ssize_t __write_pnfs_dlm_device(struct file *file, char *buf, ++ size_t size) ++{ ++ char *mesg = buf; ++ char *pnfs_dlm_device; ++ int max_size = NFSD_PNFS_DLM_DEVICE_MAX; ++ int len, ret = 0; ++ ++ if (size > 0) { ++ ret = -EINVAL; ++ if (size > max_size || buf[size-1] != '\n') ++ return ret; ++ buf[size-1] = 0; ++ ++ pnfs_dlm_device = mesg; ++ len = qword_get(&mesg, pnfs_dlm_device, size); ++ if (len <= 0) ++ return ret; ++ ++ ret = nfsd4_set_pnfs_dlm_device(pnfs_dlm_device, len); ++ } else ++ return nfsd4_get_pnfs_dlm_device_list(buf, SIMPLE_TRANSACTION_LIMIT); ++ ++ return ret <= 0 ? 
ret : strlen(buf); ++} ++ ++/** ++ * write_pnfs_dlm_device - Set or report the current pNFS data server list ++ * ++ * Input: ++ * buf: ignored ++ * size: zero ++ * ++ * OR ++ * ++ * Input: ++ * buf: C string containing a block device name, ++ * a colon, and then a comma separated ++ * list of pNFS data server IPv4 addresses ++ * size: non-zero length of C string in @buf ++ * Output: ++ * On success: passed-in buffer filled with '\n'-terminated C ++ * string containing a block device name, a colon, and ++ * then a comma separated list of pNFS ++ * data server IPv4 addresses. ++ * return code is the size in bytes of the string ++ * On error: return code is a negative errno value ++ */ ++static ssize_t write_pnfs_dlm_device(struct file *file, char *buf, size_t size) ++{ ++ ssize_t rv; ++ ++ mutex_lock(&nfsd_mutex); ++ rv = __write_pnfs_dlm_device(file, buf, size); ++ mutex_unlock(&nfsd_mutex); ++ return rv; ++} ++ ++#endif /* CONFIG_PNFSD */ ++ + /*----------------------------------------------------------------------------*/ + /* + * populating the filesystem. 
+@@ -1402,6 +1478,10 @@ static int nfsd_fill_super(struct super_ + [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, + #endif ++#ifdef CONFIG_PNFSD ++ [NFSD_pnfs_dlm_device] = {"pnfs_dlm_device", &transaction_ops, ++ S_IWUSR|S_IRUSR}, ++#endif + /* last one */ {""} + }; + return simple_fill_super(sb, 0x6e667364, nfsd_files); +@@ -1440,6 +1520,9 @@ static int create_proc_exports_entry(voi + } + #endif + ++#if defined(CONFIG_SPNFS_BLOCK) ++int nfsd_bl_init(void); ++#endif + static int __init init_nfsd(void) + { + int retval; +@@ -1462,6 +1545,15 @@ static int __init init_nfsd(void) + retval = create_proc_exports_entry(); + if (retval) + goto out_free_idmap; ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS) ++ retval = spnfs_init_proc(); ++ if (retval != 0) ++ goto out_free_idmap; ++#if defined(CONFIG_SPNFS_BLOCK) ++ nfsd_bl_init(); ++#endif /* CONFIG_SPNFS_BLOCK */ ++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */ ++ + retval = register_filesystem(&nfsd_fs_type); + if (retval) + goto out_free_all; +@@ -1484,7 +1576,22 @@ out_free_stat: + + static void __exit exit_nfsd(void) + { ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS) ++ remove_proc_entry("fs/nfs/spnfs/recall", NULL); ++ remove_proc_entry("fs/nfs/spnfs/layoutseg", NULL); ++ remove_proc_entry("fs/nfs/spnfs/getfh", NULL); ++ remove_proc_entry("fs/nfs/spnfs/config", NULL); ++ remove_proc_entry("fs/nfs/spnfs/ctl", NULL); ++ remove_proc_entry("fs/nfs/spnfs", NULL); ++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */ ++ ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS_LAYOUTSEGMENTS) ++ remove_proc_entry("fs/nfs/spnfs/layoutseg", NULL); ++ remove_proc_entry("fs/nfs/spnfs/layoutsegsize", NULL); ++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS_LAYOUTSEGMENTS */ ++ + nfsd_export_shutdown(); ++ nfsd4_pnfs_dlm_shutdown(); + nfsd_reply_cache_shutdown(); + remove_proc_entry("fs/nfs/exports", NULL); + 
remove_proc_entry("fs/nfs", NULL); +diff -up linux-2.6.37.noarch/fs/nfsd/nfsd.h.orig linux-2.6.37.noarch/fs/nfsd/nfsd.h +--- linux-2.6.37.noarch/fs/nfsd/nfsd.h.orig 2011-01-28 09:37:32.566979114 -0500 ++++ linux-2.6.37.noarch/fs/nfsd/nfsd.h 2011-01-28 09:43:53.363768972 -0500 +@@ -287,11 +287,22 @@ extern time_t nfsd4_grace; + #define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ + NFSD4_SUPPORTED_ATTRS_WORD0 + ++#if defined(CONFIG_PNFSD) ++#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ ++ (NFSD4_SUPPORTED_ATTRS_WORD1 | FATTR4_WORD1_FS_LAYOUT_TYPES) ++#else /* CONFIG_PNFSD */ + #define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ + NFSD4_SUPPORTED_ATTRS_WORD1 ++#endif /* CONFIG_PNFSD */ + ++#if defined(CONFIG_PNFSD) ++#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \ ++ (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT | \ ++ FATTR4_WORD2_LAYOUT_BLKSIZE) ++#else /* CONFIG_PNFSD */ + #define NFSD4_1_SUPPORTED_ATTRS_WORD2 \ + (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT) ++#endif /* CONFIG_PNFSD */ + + static inline u32 nfsd_suppattrs0(u32 minorversion) + { +diff -up linux-2.6.37.noarch/fs/nfsd/nfsfh.c.orig linux-2.6.37.noarch/fs/nfsd/nfsfh.c +--- linux-2.6.37.noarch/fs/nfsd/nfsfh.c.orig 2011-01-04 19:50:19.000000000 -0500 ++++ linux-2.6.37.noarch/fs/nfsd/nfsfh.c 2011-01-28 09:43:53.364768868 -0500 +@@ -10,6 +10,7 @@ + #include + + #include ++#include + #include "nfsd.h" + #include "vfs.h" + #include "auth.h" +@@ -139,6 +140,7 @@ static inline __be32 check_pseudo_root(s + static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) + { + struct knfsd_fh *fh = &fhp->fh_handle; ++ int fsid_type; + struct fid *fid = NULL, sfid; + struct svc_export *exp; + struct dentry *dentry; +@@ -159,7 +161,8 @@ static __be32 nfsd_set_fh_dentry(struct + return error; + if (fh->fh_auth_type != 0) + return error; +- len = key_len(fh->fh_fsid_type) / 4; ++ fsid_type = pnfs_fh_fsid_type(fh); ++ len = key_len(fsid_type) / 4; + if (len == 0) + return error; + if (fh->fh_fsid_type == 
FSID_MAJOR_MINOR) { +@@ -172,7 +175,7 @@ static __be32 nfsd_set_fh_dentry(struct + data_left -= len; + if (data_left < 0) + return error; +- exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_auth); ++ exp = rqst_exp_find(rqstp, fsid_type, fh->fh_auth); + fid = (struct fid *)(fh->fh_auth + len); + } else { + __u32 tfh[2]; +diff -up linux-2.6.37.noarch/fs/nfsd/nfsfh.h.orig linux-2.6.37.noarch/fs/nfsd/nfsfh.h +--- linux-2.6.37.noarch/fs/nfsd/nfsfh.h.orig 2011-01-04 19:50:19.000000000 -0500 ++++ linux-2.6.37.noarch/fs/nfsd/nfsfh.h 2011-01-28 09:43:53.365768765 -0500 +@@ -14,6 +14,7 @@ enum nfsd_fsid { + FSID_UUID8, + FSID_UUID16, + FSID_UUID16_INUM, ++ FSID_MAX + }; + + enum fsid_source { +@@ -203,4 +204,42 @@ fh_unlock(struct svc_fh *fhp) + } + } + ++#if defined(CONFIG_PNFSD) ++ ++/* ++ * fh_fsid_type is overloaded to indicate whether a filehandle was one supplied ++ * to a DS by LAYOUTGET. nfs4_preprocess_stateid_op() uses this to decide how ++ * to handle a given stateid. ++ */ ++static inline int pnfs_fh_is_ds(struct knfsd_fh *fh) ++{ ++ return fh->fh_fsid_type >= FSID_MAX; ++} ++ ++static inline void pnfs_fh_mark_ds(struct knfsd_fh *fh) ++{ ++ BUG_ON(fh->fh_version != 1); ++ BUG_ON(pnfs_fh_is_ds(fh)); ++ fh->fh_fsid_type += FSID_MAX; ++} ++ ++#else /* CONFIG_PNFSD */ ++ ++static inline int pnfs_fh_is_ds(struct knfsd_fh *fh) ++{ ++ return 0; ++} ++ ++#endif /* CONFIG_PNFSD */ ++ ++/* allows fh_verify() to check the real fsid_type (i.e., not overloaded). 
*/ ++static inline int pnfs_fh_fsid_type(struct knfsd_fh *fh) ++{ ++ int fsid_type = fh->fh_fsid_type; ++ ++ if (pnfs_fh_is_ds(fh)) ++ return fsid_type - FSID_MAX; ++ return fsid_type; ++} ++ + #endif /* _LINUX_NFSD_FH_INT_H */ +diff -up linux-2.6.37.noarch/fs/nfsd/nfssvc.c.orig linux-2.6.37.noarch/fs/nfsd/nfssvc.c +--- linux-2.6.37.noarch/fs/nfsd/nfssvc.c.orig 2011-01-28 09:37:32.567979080 -0500 ++++ linux-2.6.37.noarch/fs/nfsd/nfssvc.c 2011-01-28 09:43:53.365768765 -0500 +@@ -116,7 +116,7 @@ struct svc_program nfsd_program = { + + }; + +-u32 nfsd_supported_minorversion; ++u32 nfsd_supported_minorversion = NFSD_SUPPORTED_MINOR_VERSION; + + int nfsd_vers(int vers, enum vers_op change) + { +diff -up linux-2.6.37.noarch/fs/nfsd/pnfsd.h.orig linux-2.6.37.noarch/fs/nfsd/pnfsd.h +--- linux-2.6.37.noarch/fs/nfsd/pnfsd.h.orig 2011-01-28 09:43:53.366768664 -0500 ++++ linux-2.6.37.noarch/fs/nfsd/pnfsd.h 2011-01-28 09:43:53.366768664 -0500 +@@ -0,0 +1,144 @@ ++/* ++ * Copyright (c) 2005 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. 
++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ */ ++ ++#ifndef LINUX_NFSD_PNFSD_H ++#define LINUX_NFSD_PNFSD_H ++ ++#include ++#include ++ ++#include "state.h" ++#include "xdr4.h" ++ ++/* outstanding layout stateid */ ++struct nfs4_layout_state { ++ struct list_head ls_perfile; ++ struct list_head ls_layouts; /* list of nfs4_layouts */ ++ struct kref ls_ref; /* reference count */ ++ struct nfs4_client *ls_client; ++ struct nfs4_file *ls_file; ++ stateid_t ls_stateid; ++}; ++ ++/* outstanding layout */ ++struct nfs4_layout { ++ struct list_head lo_perfile; /* hash by f_id */ ++ struct list_head lo_perclnt; /* hash by clientid */ ++ struct list_head lo_perstate; /* presumably an entry on lo_state->ls_layouts -- confirm */ ++ struct nfs4_file *lo_file; /* backpointer */ ++ struct nfs4_client *lo_client; ++ struct nfs4_layout_state *lo_state; ++ struct nfsd4_layout_seg lo_seg; ++}; ++ ++struct pnfs_inval_state { ++ struct knfsd_fh mdsfh; /* needed only by invalidate all */ ++ stateid_t stid; ++ clientid_t clid; ++ u32 status; ++}; ++ ++/* pNFS Data Server state */ ++#define DS_STATEID_VALID 0 ++#define DS_STATEID_ERROR 1 ++#define DS_STATEID_NEW 2 ++ ++struct pnfs_ds_stateid { ++ struct list_head ds_hash; /* ds_stateid hash entry */ ++ struct list_head ds_perclid; /* per client hash entry */ ++ stateid_t ds_stid; ++ struct knfsd_fh
ds_fh; ++ unsigned long ds_access; ++ u32 ds_status; /* from MDS */ ++ u32 ds_verifier[2]; /* from MDS */ ++ wait_queue_head_t ds_waitq; ++ unsigned long ds_flags; /* presumably DS_STATEID_* bit numbers -- confirm */ ++ struct kref ds_ref; /* reference count */ ++ clientid_t ds_mdsclid; ++}; ++ ++struct pnfs_ds_clientid { ++ struct list_head dc_hash; /* mds_clid_hashtbl entry */ ++ struct list_head dc_stateid; /* ds_stateid head */ ++ struct list_head dc_permdsid; /* per mdsid hash entry */ ++ clientid_t dc_mdsclid; ++ struct kref dc_ref; /* reference count */ ++ uint32_t dc_mdsid; ++}; ++ ++struct pnfs_mds_id { ++ struct list_head di_hash; /* mds_nodeid list entry */ ++ struct list_head di_mdsclid; /* mds_clientid head */ ++ uint32_t di_mdsid; ++ time_t di_mdsboot; /* mds boot time */ ++ struct kref di_ref; /* reference count */ ++}; ++ ++/* notify device request (from exported filesystem) */ ++struct nfs4_notify_device { ++ struct nfsd4_pnfs_cb_dev_list *nd_list; ++ struct nfs4_client *nd_client; ++ struct list_head nd_perclnt; ++ ++ /* nfsd internal */ ++ struct nfsd4_callback nd_recall; ++}; ++ ++u64 find_create_sbid(struct super_block *); ++struct super_block *find_sbid_id(u64); ++__be32 nfs4_pnfs_get_layout(struct nfsd4_pnfs_layoutget *, struct exp_xdr_stream *); ++int nfs4_pnfs_return_layout(struct super_block *, struct svc_fh *, ++ struct nfsd4_pnfs_layoutreturn *); ++int nfs4_pnfs_cb_get_state(struct super_block *, struct pnfs_get_state *); ++int nfs4_pnfs_cb_change_state(struct pnfs_get_state *); ++void nfs4_ds_get_verifier(stateid_t *, struct super_block *, u32 *); ++int put_layoutrecall(struct nfs4_layoutrecall *); ++void nomatching_layout(struct nfs4_layoutrecall *); ++void *layoutrecall_done(struct nfs4_layoutrecall *); ++void nfsd4_cb_layout(struct nfs4_layoutrecall *); ++int nfsd_layout_recall_cb(struct super_block *, struct inode *, ++ struct nfsd4_pnfs_cb_layout *); ++int nfsd_device_notify_cb(struct super_block *, ++ struct nfsd4_pnfs_cb_dev_list *); ++void nfsd4_cb_notify_device(struct nfs4_notify_device *); ++void pnfs_set_device_notify(clientid_t *, unsigned
int types); ++void pnfs_clear_device_notify(struct nfs4_client *); ++ ++#if defined(CONFIG_PNFSD_LOCAL_EXPORT) ++extern struct sockaddr pnfsd_lexp_addr; /* address pnfsd_lexp_get_device_info() advertises as the data server */ ++extern size_t pnfs_lexp_addr_len; ++ ++extern void pnfsd_lexp_init(struct inode *); /* installs the local-export pNFS operations on inode->i_sb */ ++#endif /* CONFIG_PNFSD_LOCAL_EXPORT */ ++ ++#endif /* LINUX_NFSD_PNFSD_H */ +diff -up linux-2.6.37.noarch/fs/nfsd/pnfsd_lexp.c.orig linux-2.6.37.noarch/fs/nfsd/pnfsd_lexp.c +--- linux-2.6.37.noarch/fs/nfsd/pnfsd_lexp.c.orig 2011-01-28 09:43:53.367768598 -0500 ++++ linux-2.6.37.noarch/fs/nfsd/pnfsd_lexp.c 2011-01-28 09:43:53.367768598 -0500 +@@ -0,0 +1,225 @@ ++/* ++ * linux/fs/nfsd/pnfsd_lexp.c ++ * ++ * pNFS export of local filesystems. ++ * ++ * Export local file systems over the files layout type. ++ * The MDS (metadata server) functions also as a single DS (data server). ++ * This is mostly useful for development and debugging purposes. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * Copyright (C) 2008 Benny Halevy, ++ * ++ * Initial implementation was based on the pnfs-gfs2 patches done ++ * by David M.
Richter ++ */ ++ ++#include ++#include ++ ++#include "pnfsd.h" ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++struct sockaddr pnfsd_lexp_addr; ++size_t pnfs_lexp_addr_len; ++ ++static int ++pnfsd_lexp_layout_type(struct super_block *sb) ++{ ++ int ret = LAYOUT_NFSV4_1_FILES; ++ dprintk("<-- %s: return %d\n", __func__, ret); ++ return ret; ++} ++ ++static int ++pnfsd_lexp_get_device_iter(struct super_block *sb, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *res) ++{ ++ dprintk("--> %s: sb=%p\n", __func__, sb); ++ ++ BUG_ON(layout_type != LAYOUT_NFSV4_1_FILES); ++ ++ res->gd_eof = 1; ++ if (res->gd_cookie) ++ return -ENOENT; ++ res->gd_cookie = 1; ++ res->gd_verf = 1; ++ res->gd_devid = 1; ++ ++ dprintk("<-- %s: return 0\n", __func__); ++ return 0; ++} ++ ++static int ++pnfsd_lexp_get_device_info(struct super_block *sb, ++ struct exp_xdr_stream *xdr, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ int err; ++ struct pnfs_filelayout_device fdev; ++ struct pnfs_filelayout_multipath fl_devices[1]; ++ u32 fl_stripe_indices[1] = { 0 }; ++ struct pnfs_filelayout_devaddr daddr; ++ /* %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x.%03u.%03u */ ++ char daddr_buf[8*4 + 2*3 + 10]; ++ ++ dprintk("--> %s: sb=%p\n", __func__, sb); ++ ++ BUG_ON(layout_type != LAYOUT_NFSV4_1_FILES); ++ ++ memset(&fdev, '\0', sizeof(fdev)); ++ ++ if (devid->devid != 1) { ++ printk(KERN_ERR "%s: WARNING: didn't receive a deviceid of 1 " ++ "(got: 0x%llx)\n", __func__, devid->devid); ++ err = -EINVAL; ++ goto out; ++ } ++ ++ /* count the number of comma-delimited DS IPs */ ++ fdev.fl_device_length = 1; ++ fdev.fl_device_list = fl_devices; ++ ++ fdev.fl_stripeindices_length = fdev.fl_device_length; ++ fdev.fl_stripeindices_list = fl_stripe_indices; ++ ++ daddr.r_addr.data = daddr_buf; ++ daddr.r_addr.len = sizeof(daddr_buf); ++ err = __svc_print_netaddr(&pnfsd_lexp_addr, &daddr.r_addr); ++ if (err < 0) ++ goto out; ++ daddr.r_addr.len = err; ++ switch (pnfsd_lexp_addr.sa_family) 
{ ++ case AF_INET: ++ daddr.r_netid.data = "tcp"; ++ daddr.r_netid.len = 3; ++ break; ++ case AF_INET6: ++ daddr.r_netid.data = "tcp6"; ++ daddr.r_netid.len = 4; ++ break; ++ default: ++ BUG(); ++ } ++ fdev.fl_device_list[0].fl_multipath_length = 1; ++ fdev.fl_device_list[0].fl_multipath_list = &daddr; ++ ++ /* have nfsd encode the device info */ ++ err = filelayout_encode_devinfo(xdr, &fdev); ++out: ++ dprintk("<-- %s: return %d\n", __func__, err); ++ return err; ++} ++ ++static int get_stripe_unit(int blocksize) ++{ ++ if (blocksize < NFSSVC_MAXBLKSIZE) ++ blocksize = NFSSVC_MAXBLKSIZE - (NFSSVC_MAXBLKSIZE % blocksize); ++ dprintk("%s: return %d\n", __func__, blocksize); ++ return blocksize; ++} ++ ++static enum nfsstat4 ++pnfsd_lexp_layout_get(struct inode *inode, ++ struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *arg, ++ struct nfsd4_pnfs_layoutget_res *res) ++{ ++ enum nfsstat4 rc = NFS4_OK; ++ struct pnfs_filelayout_layout *layout = NULL; ++ struct knfsd_fh *fhp = NULL; ++ ++ dprintk("--> %s: inode=%p\n", __func__, inode); ++ ++ res->lg_seg.layout_type = LAYOUT_NFSV4_1_FILES; ++ res->lg_seg.offset = 0; ++ res->lg_seg.length = NFS4_MAX_UINT64; ++ ++ layout = kzalloc(sizeof(*layout), GFP_KERNEL); ++ if (layout == NULL) { ++ rc = -ENOMEM; ++ goto error; ++ } ++ ++ /* Set file layout response args */ ++ layout->lg_layout_type = LAYOUT_NFSV4_1_FILES; ++ layout->lg_stripe_type = STRIPE_SPARSE; ++ layout->lg_commit_through_mds = true; ++ layout->lg_stripe_unit = get_stripe_unit(inode->i_sb->s_blocksize); ++ layout->lg_fh_length = 1; ++ layout->device_id.sbid = arg->lg_sbid; ++ layout->device_id.devid = 1; /*FSFTEMP*/ ++ layout->lg_first_stripe_index = 0; /*FSFTEMP*/ ++ layout->lg_pattern_offset = 0; ++ ++ fhp = kmalloc(sizeof(*fhp), GFP_KERNEL); ++ if (fhp == NULL) { ++ rc = -ENOMEM; ++ goto error; ++ } ++ ++ memcpy(fhp, arg->lg_fh, sizeof(*fhp)); ++ pnfs_fh_mark_ds(fhp); ++ layout->lg_fh_list = fhp; ++ ++ /* Call nfsd to encode layout */ ++ rc 
= filelayout_encode_layout(xdr, layout); ++exit: ++ kfree(layout); ++ kfree(fhp); ++ dprintk("<-- %s: return %d\n", __func__, rc); ++ return rc; ++ ++error: ++ res->lg_seg.length = 0; ++ goto exit; ++} ++ ++static int ++pnfsd_lexp_layout_commit(struct inode *inode, ++ const struct nfsd4_pnfs_layoutcommit_arg *args, ++ struct nfsd4_pnfs_layoutcommit_res *res) ++{ ++ dprintk("%s: (unimplemented)\n", __func__); ++ ++ return 0; ++} ++ ++static int ++pnfsd_lexp_layout_return(struct inode *inode, ++ const struct nfsd4_pnfs_layoutreturn_arg *args) ++{ ++ dprintk("%s: (unimplemented)\n", __func__); ++ ++ return 0; ++} ++ ++static int pnfsd_lexp_get_state(struct inode *inode, struct knfsd_fh *fh, ++ struct pnfs_get_state *p) ++{ ++ return 0; /* just use the current stateid */ ++} ++ ++static struct pnfs_export_operations pnfsd_lexp_ops = { ++ .layout_type = pnfsd_lexp_layout_type, ++ .get_device_info = pnfsd_lexp_get_device_info, ++ .get_device_iter = pnfsd_lexp_get_device_iter, ++ .layout_get = pnfsd_lexp_layout_get, ++ .layout_commit = pnfsd_lexp_layout_commit, ++ .layout_return = pnfsd_lexp_layout_return, ++ .get_state = pnfsd_lexp_get_state, ++}; ++ ++void ++pnfsd_lexp_init(struct inode *inode) ++{ ++ dprintk("%s: &pnfsd_lexp_ops=%p\n", __func__, &pnfsd_lexp_ops); ++ inode->i_sb->s_pnfs_op = &pnfsd_lexp_ops; ++} +diff -up linux-2.6.37.noarch/fs/nfsd/spnfs_com.c.orig linux-2.6.37.noarch/fs/nfsd/spnfs_com.c +--- linux-2.6.37.noarch/fs/nfsd/spnfs_com.c.orig 2011-01-28 09:43:53.368768479 -0500 ++++ linux-2.6.37.noarch/fs/nfsd/spnfs_com.c 2011-01-28 09:43:53.368768479 -0500 +@@ -0,0 +1,535 @@ ++/* ++ * fs/nfsd/spnfs_com.c ++ * ++ * Communcation layer between spNFS kernel and userspace ++ * Based heavily on idmap.c ++ * ++ */ ++ ++/* ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. 
++ * ++ * Marius Aamodt Eriksen ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#define NFSDDBG_FACILITY NFSDDBG_PROC ++ ++static ssize_t spnfs_pipe_upcall(struct file *, struct rpc_pipe_msg *, ++ char __user *, size_t); ++static ssize_t spnfs_pipe_downcall(struct file *, const char __user *, ++ size_t); ++static void spnfs_pipe_destroy_msg(struct rpc_pipe_msg *); ++ ++static struct rpc_pipe_ops spnfs_upcall_ops = { ++ .upcall = spnfs_pipe_upcall, ++ .downcall = spnfs_pipe_downcall, ++ .destroy_msg = spnfs_pipe_destroy_msg, ++}; ++ ++/* evil global variable */ ++struct spnfs *global_spnfs; ++struct spnfs_config *spnfs_config; ++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS ++int spnfs_use_layoutsegments; ++uint64_t layoutsegment_size; ++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ ++ ++/* ++ * Used by spnfs_enabled() ++ * Tracks if the subsystem has been initialized at some point. It doesn't ++ * matter if it's not currently initialized. ++ */ ++static int spnfs_enabled_at_some_point; ++ ++/* call this to start the ball rolling */ ++/* code it like we're going to avoid the global variable in the future */ ++int ++nfsd_spnfs_new(void) ++{ ++ struct spnfs *spnfs = NULL; ++ struct path path; ++ struct nameidata nd; ++ int rc; ++ ++ if (global_spnfs != NULL) ++ return -EEXIST; ++ ++ path.mnt = rpc_get_mount(); ++ if (IS_ERR(path.mnt)) ++ return PTR_ERR(path.mnt); ++ ++ /* FIXME: do not abuse rpc_pipefs/nfs */ ++ rc = vfs_path_lookup(path.mnt->mnt_root, path.mnt, "/nfs", 0, &nd); ++ if (rc) ++ goto err; ++ ++ spnfs = kzalloc(sizeof(*spnfs), GFP_KERNEL); ++ if (spnfs == NULL){ ++ rc = -ENOMEM; ++ goto err; ++ } ++ ++ spnfs->spnfs_dentry = rpc_mkpipe(nd.path.dentry, "spnfs", spnfs, ++ &spnfs_upcall_ops, 0); ++ if (IS_ERR(spnfs->spnfs_dentry)) { ++ rc = -EPIPE; ++ goto err; ++ } ++ ++ mutex_init(&spnfs->spnfs_lock); ++ mutex_init(&spnfs->spnfs_plock); ++ init_waitqueue_head(&spnfs->spnfs_wq); ++ ++ global_spnfs = spnfs; ++ spnfs_enabled_at_some_point = 1; ++ ++ return 0; 
++err: ++ rpc_put_mount(); ++ kfree(spnfs); ++ return rc; ++} ++ ++/* again, code it like we're going to remove the global variable */ ++void ++nfsd_spnfs_delete(void) ++{ ++ struct spnfs *spnfs = global_spnfs; ++ ++ if (!spnfs) ++ return; ++ rpc_unlink(spnfs->spnfs_dentry); ++ rpc_put_mount(); ++ global_spnfs = NULL; ++ kfree(spnfs); ++} ++ ++/* RPC pipefs upcall/downcall routines */ ++/* looks like this code is invoked by the rpc_pipe code */ ++/* to handle upcalls on things we've queued elsewhere */ ++/* See nfs_idmap_id for an exmaple of enqueueing */ ++static ssize_t ++spnfs_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, ++ char __user *dst, size_t buflen) ++{ ++ char *data = (char *)msg->data + msg->copied; ++ ssize_t mlen = msg->len - msg->copied; ++ ssize_t left; ++ ++ if (mlen > buflen) ++ mlen = buflen; ++ ++ left = copy_to_user(dst, data, mlen); ++ if (left < 0) { ++ msg->errno = left; ++ return left; ++ } ++ mlen -= left; ++ msg->copied += mlen; ++ msg->errno = 0; ++ return mlen; ++} ++ ++static ssize_t ++spnfs_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) ++{ ++ struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode); ++ struct spnfs *spnfs = (struct spnfs *)rpci->private; ++ struct spnfs_msg *im_in = NULL, *im = &spnfs->spnfs_im; ++ int ret; ++ ++ if (mlen != sizeof(struct spnfs_msg)) ++ return -ENOSPC; ++ ++ im_in = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); ++ if (im_in == NULL) ++ return -ENOMEM; ++ ++ if (copy_from_user(im_in, src, mlen) != 0) ++ return -EFAULT; ++ ++ mutex_lock(&spnfs->spnfs_plock); ++ ++ ret = mlen; ++ im->im_status = im_in->im_status; ++ /* If we got an error, terminate now, and wake up pending upcalls */ ++ if (!(im_in->im_status & SPNFS_STATUS_SUCCESS)) { ++ wake_up(&spnfs->spnfs_wq); ++ goto out; ++ } ++ ++ ret = -EINVAL; ++ /* Did we match the current upcall? 
*/ ++ /* DMXXX: do not understand the comment above, from original code */ ++ /* DMXXX: when do we _not_ match the current upcall? */ ++ /* DMXXX: anyway, let's to a simplistic check */ ++ if (im_in->im_type == im->im_type) { ++ /* copy the response into the spnfs struct */ ++ memcpy(&im->im_res, &im_in->im_res, sizeof(im->im_res)); ++ ret = mlen; ++ } else ++ dprintk("spnfs: downcall type != upcall type\n"); ++ ++ ++ wake_up(&spnfs->spnfs_wq); ++/* DMXXX handle rval processing */ ++out: ++ mutex_unlock(&spnfs->spnfs_plock); ++ kfree(im_in); ++ return ret; ++} ++ ++static void ++spnfs_pipe_destroy_msg(struct rpc_pipe_msg *msg) ++{ ++ struct spnfs_msg *im = msg->data; ++ struct spnfs *spnfs = container_of(im, struct spnfs, spnfs_im); ++ ++ if (msg->errno >= 0) ++ return; ++ mutex_lock(&spnfs->spnfs_plock); ++ im->im_status = SPNFS_STATUS_FAIL; /* DMXXX */ ++ wake_up(&spnfs->spnfs_wq); ++ mutex_unlock(&spnfs->spnfs_plock); ++} ++ ++/* generic upcall. called by functions in spnfs_ops.c */ ++int ++spnfs_upcall(struct spnfs *spnfs, struct spnfs_msg *upmsg, ++ union spnfs_msg_res *res) ++{ ++ struct rpc_pipe_msg msg; ++ struct spnfs_msg *im; ++ DECLARE_WAITQUEUE(wq, current); ++ int ret = -EIO; ++ int rval; ++ ++ im = &spnfs->spnfs_im; ++ ++ mutex_lock(&spnfs->spnfs_lock); ++ mutex_lock(&spnfs->spnfs_plock); ++ ++ memset(im, 0, sizeof(*im)); ++ memcpy(im, upmsg, sizeof(*upmsg)); ++ ++ memset(&msg, 0, sizeof(msg)); ++ msg.data = im; ++ msg.len = sizeof(*im); ++ ++ add_wait_queue(&spnfs->spnfs_wq, &wq); ++ rval = rpc_queue_upcall(spnfs->spnfs_dentry->d_inode, &msg); ++ if (rval < 0) { ++ remove_wait_queue(&spnfs->spnfs_wq, &wq); ++ goto out; ++ } ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ mutex_unlock(&spnfs->spnfs_plock); ++ schedule(); ++ current->state = TASK_RUNNING; ++ remove_wait_queue(&spnfs->spnfs_wq, &wq); ++ mutex_lock(&spnfs->spnfs_plock); ++ ++ if (im->im_status & SPNFS_STATUS_SUCCESS) { ++ /* copy our result from the upcall */ ++ memcpy(res, 
&im->im_res, sizeof(*res)); ++ ret = 0; ++ } ++ ++out: ++ memset(im, 0, sizeof(*im)); ++ mutex_unlock(&spnfs->spnfs_plock); ++ mutex_unlock(&spnfs->spnfs_lock); ++ return(ret); ++} ++ ++/* ++ * This is used to determine if the spnfsd daemon has been started at ++ * least once since the system came up. This is used to by the export ++ * mechanism to decide if spnfs is in use. ++ * ++ * Returns non-zero if the spnfsd has initialized the communication pipe ++ * at least once. ++ */ ++int spnfs_enabled(void) ++{ ++ return spnfs_enabled_at_some_point; ++} ++ ++#ifdef CONFIG_PROC_FS ++ ++/* ++ * procfs virtual files for user/kernel space communication: ++ * ++ * ctl - currently just an on/off switch...can be expanded ++ * getfh - fd to fh conversion ++ * recall - recall a layout from the command line, for example: ++ * echo > /proc/fs/spnfs/recall ++ * config - configuration info, e.g., stripe size, num ds, etc. ++ */ ++ ++/*-------------- start ctl -------------------------*/ ++static ssize_t ctl_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ int cmd, rc; ++ ++ if (copy_from_user((int *)&cmd, (int *)buf, sizeof(int))) ++ return -EFAULT; ++ if (cmd) { ++ rc = nfsd_spnfs_new(); ++ if (rc != 0) ++ return rc; ++ } else ++ nfsd_spnfs_delete(); ++ ++ return count; ++} ++ ++static const struct file_operations ctl_ops = { ++ .write = ctl_write, ++}; ++/*-------------- end ctl ---------------------------*/ ++ ++/*-------------- start config -------------------------*/ ++static ssize_t config_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ static struct spnfs_config cfg; ++ ++ if (copy_from_user(&cfg, buf, count)) ++ return -EFAULT; ++ ++ spnfs_config = &cfg; ++ return 0; ++} ++ ++static const struct file_operations config_ops = { ++ .write = config_write, ++}; ++/*-------------- end config ---------------------------*/ ++ ++/*-------------- start getfh -----------------------*/ ++static int 
getfh_open(struct inode *inode, struct file *file) ++{ ++ file->private_data = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL); ++ if (file->private_data == NULL) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++static ssize_t getfh_read(struct file *file, char __user *buf, size_t count, ++ loff_t *offset) ++{ ++ if (copy_to_user(buf, file->private_data, sizeof(struct nfs_fh))) ++ return -EFAULT; ++ ++ return count; ++} ++ ++static ssize_t getfh_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ int fd; ++ ++ if (copy_from_user((int *)&fd, (int *)buf, sizeof(int))) ++ return -EFAULT; ++ if (spnfs_getfh(fd, file->private_data) != 0) ++ return -EIO; ++ ++ return count; ++} ++ ++static int getfh_release(struct inode *inode, struct file *file) ++{ ++ kfree(file->private_data); ++ return 0; ++} ++ ++static const struct file_operations getfh_ops = { ++ .open = getfh_open, ++ .read = getfh_read, ++ .write = getfh_write, ++ .release = getfh_release, ++}; ++/*-------------- end getfh ------------------------*/ ++ ++ ++/*-------------- start recall layout --------------*/ ++static ssize_t recall_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ char input[128]; ++ char *path, *str, *p; ++ int rc; ++ u64 off = 0, len = 0; ++ ++ if (count > 128) ++ return -EINVAL; ++ ++ if (copy_from_user(input, buf, count)) ++ return -EFAULT; ++ ++ /* assumes newline-terminated path */ ++ p = memchr(input, '\n', count); ++ if (p == NULL) ++ return -EINVAL; ++ *p = '\0'; ++ ++ /* ++ * Scan for path and, optionally, an offset and length ++ * of a layout segment to be recalled; if there are two ++ * fields, they're assumed to be path and offset. 
++ */ ++ p = input; ++ path = strsep(&p, " "); ++ if (path == NULL) ++ return -EINVAL; ++ ++ str = strsep(&p, " "); ++ if (str != NULL) { ++ rc = strict_strtoull(str, 10, &off); ++ if (rc != 0) ++ return -EINVAL; ++ ++ str = strsep(&p, " "); ++ if (str != NULL) { ++ rc = strict_strtoull(str, 10, &len); ++ if (rc != 0) ++ return -EINVAL; ++ } ++ } ++ ++ rc = spnfs_test_layoutrecall(path, off, len); ++ if (rc != 0) ++ return rc; ++ ++ return count; ++} ++ ++static const struct file_operations recall_ops = { ++ .write = recall_write, ++}; ++/*-------------- end recall layout --------------*/ ++ ++ ++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS ++/*-------------- start layoutseg -------------------------*/ ++static ssize_t layoutseg_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ char cmd[3]; ++ ++ if (copy_from_user(cmd, buf, 1)) ++ return -EFAULT; ++ if (cmd[0] == '0') ++ spnfs_use_layoutsegments = 0; ++ else ++ spnfs_use_layoutsegments = 1; ++ ++ return count; ++} ++ ++static const struct file_operations layoutseg_ops = { ++ .write = layoutseg_write, ++}; ++/*-------------- end layoutseg ---------------------------*/ ++ ++/*-------------- start layoutsegsize -------------------------*/ ++static ssize_t layoutsegsize_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ char cmd[50]; ++ ++ if (copy_from_user(cmd, buf, 49)) ++ return -EFAULT; ++ layoutsegment_size = simple_strtoull(cmd, NULL, 10); ++ ++ return count; ++} ++ ++static const struct file_operations layoutsegsize_ops = { ++ .write = layoutsegsize_write, ++}; ++/*-------------- end layoutsegsize ---------------------------*/ ++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ ++ ++int ++spnfs_init_proc(void) ++{ ++ struct proc_dir_entry *entry; ++ ++ entry = proc_mkdir("fs/spnfs", NULL); ++ if (!entry) ++ return -ENOMEM; ++ ++ entry = create_proc_entry("fs/spnfs/ctl", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &ctl_ops; ++ ++ 
entry = create_proc_entry("fs/spnfs/config", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &config_ops; ++ ++ entry = create_proc_entry("fs/spnfs/getfh", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &getfh_ops; ++ ++ entry = create_proc_entry("fs/spnfs/recall", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &recall_ops; ++ ++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS ++ entry = create_proc_entry("fs/spnfs/layoutseg", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &layoutseg_ops; ++ ++ entry = create_proc_entry("fs/spnfs/layoutsegsize", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &layoutsegsize_ops; ++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ ++ ++ return 0; ++} ++#endif /* CONFIG_PROC_FS */ +diff -up linux-2.6.37.noarch/fs/nfsd/spnfs_ops.c.orig linux-2.6.37.noarch/fs/nfsd/spnfs_ops.c +--- linux-2.6.37.noarch/fs/nfsd/spnfs_ops.c.orig 2011-01-28 09:43:53.369768328 -0500 ++++ linux-2.6.37.noarch/fs/nfsd/spnfs_ops.c 2011-01-28 09:43:53.369768328 -0500 +@@ -0,0 +1,878 @@ ++/* ++ * fs/nfsd/spnfs_ops.c ++ * ++ * Communcation layer between spNFS kernel and userspace ++ * ++ */ ++/****************************************************************************** ++ ++(c) 2007 Network Appliance, Inc. All Rights Reserved. ++ ++Network Appliance provides this source code under the GPL v2 License. ++The GPL v2 license is available at ++http://opensource.org/licenses/gpl-license.php. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++******************************************************************************/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "pnfsd.h" ++ ++/* comment out CONFIG_SPNFS_TEST for non-test behaviour */ ++/* #define CONFIG_SPNFS_TEST 1 */ ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++/* ++ * The functions that are called from elsewhere in the kernel ++ * to perform tasks in userspace ++ * ++ */ ++ ++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS ++extern int spnfs_use_layoutsegments; ++extern uint64_t layoutsegment_size; ++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ ++extern struct spnfs *global_spnfs; ++ ++int ++spnfs_layout_type(struct super_block *sb) ++{ ++ return LAYOUT_NFSV4_1_FILES; ++} ++ ++enum nfsstat4 ++spnfs_layoutget(struct inode *inode, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *lg_arg, ++ struct nfsd4_pnfs_layoutget_res *lg_res) ++{ ++ struct spnfs *spnfs = global_spnfs; /* keep up the pretence */ ++ struct spnfs_msg *im = NULL; ++ union spnfs_msg_res *res = NULL; ++ struct pnfs_filelayout_layout *flp = NULL; ++ int status, i; ++ enum nfsstat4 nfserr; ++ ++ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); ++ if (im == NULL) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ ++ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); ++ if (res == NULL) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ ++ 
im->im_type = SPNFS_TYPE_LAYOUTGET; ++ im->im_args.layoutget_args.inode = inode->i_ino; ++ im->im_args.layoutget_args.generation = inode->i_generation; ++ ++ /* call function to queue the msg for upcall */ ++ if (spnfs_upcall(spnfs, im, res) != 0) { ++ dprintk("failed spnfs upcall: layoutget\n"); ++ nfserr = NFS4ERR_LAYOUTUNAVAILABLE; ++ goto layoutget_cleanup; ++ } ++ status = res->layoutget_res.status; ++ if (status != 0) { ++ /* FIXME? until user mode is fixed, translate system error */ ++ switch (status) { ++ case -E2BIG: ++ case -ETOOSMALL: ++ nfserr = NFS4ERR_TOOSMALL; ++ break; ++ case -ENOMEM: ++ case -EAGAIN: ++ case -EINTR: ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ break; ++ case -ENOENT: ++ nfserr = NFS4ERR_BADLAYOUT; ++ break; ++ default: ++ nfserr = NFS4ERR_LAYOUTUNAVAILABLE; ++ } ++ dprintk("spnfs layout_get upcall: status=%d nfserr=%u\n", ++ status, nfserr); ++ goto layoutget_cleanup; ++ } ++ ++ lg_res->lg_return_on_close = 0; ++#if defined(CONFIG_SPNFS_LAYOUTSEGMENTS) ++ /* if spnfs_use_layoutsegments & layoutsegment_size == 0, use */ ++ /* the amount requested by the client. 
*/ ++ if (spnfs_use_layoutsegments) { ++ if (layoutsegment_size != 0) ++ lg_res->lg_seg.length = layoutsegment_size; ++ } else ++ lg_res->lg_seg.length = NFS4_MAX_UINT64; ++#else ++ lg_res->lg_seg.length = NFS4_MAX_UINT64; ++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ ++ ++ flp = kmalloc(sizeof(struct pnfs_filelayout_layout), GFP_KERNEL); ++ if (flp == NULL) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ flp->device_id.sbid = lg_arg->lg_sbid; ++ flp->device_id.devid = res->layoutget_res.devid; ++ flp->lg_layout_type = 1; /* XXX */ ++ flp->lg_stripe_type = res->layoutget_res.stripe_type; ++ flp->lg_commit_through_mds = 0; ++ flp->lg_stripe_unit = res->layoutget_res.stripe_size; ++ flp->lg_first_stripe_index = 0; ++ flp->lg_pattern_offset = 0; ++ flp->lg_fh_length = res->layoutget_res.stripe_count; ++ ++ flp->lg_fh_list = kmalloc(flp->lg_fh_length * sizeof(struct knfsd_fh), ++ GFP_KERNEL); ++ if (flp->lg_fh_list == NULL) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ /* ++ * FIX: Doing an extra copy here. Should group res.flist's fh_len ++ * and fh_val into a knfsd_fh structure. 
++ */ ++ for (i = 0; i < flp->lg_fh_length; i++) { ++ flp->lg_fh_list[i].fh_size = res->layoutget_res.flist[i].fh_len; ++ memcpy(&flp->lg_fh_list[i].fh_base, ++ res->layoutget_res.flist[i].fh_val, ++ res->layoutget_res.flist[i].fh_len); ++ } ++ ++ /* encode the layoutget body */ ++ nfserr = filelayout_encode_layout(xdr, flp); ++ ++layoutget_cleanup: ++ if (flp) { ++ if (flp->lg_fh_list) ++ kfree(flp->lg_fh_list); ++ kfree(flp); ++ } ++ kfree(im); ++ kfree(res); ++ ++ return nfserr; ++} ++ ++int ++spnfs_layoutcommit(void) ++{ ++ return 0; ++} ++ ++int ++spnfs_layoutreturn(struct inode *inode, ++ const struct nfsd4_pnfs_layoutreturn_arg *args) ++{ ++ return 0; ++} ++ ++int ++spnfs_layoutrecall(struct inode *inode, int type, u64 offset, u64 len) ++{ ++ struct super_block *sb; ++ struct nfsd4_pnfs_cb_layout lr; ++ ++ switch (type) { ++ case RETURN_FILE: ++ sb = inode->i_sb; ++ dprintk("%s: recalling layout for ino = %lu\n", ++ __func__, inode->i_ino); ++ break; ++ case RETURN_FSID: ++ sb = inode->i_sb; ++ dprintk("%s: recalling layout for fsid x (unimplemented)\n", ++ __func__); ++ return 0; ++ case RETURN_ALL: ++ /* XXX figure out how to get a sb since there's no inode ptr */ ++ dprintk("%s: recalling all layouts (unimplemented)\n", ++ __func__); ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ ++ lr.cbl_recall_type = type; ++ lr.cbl_seg.layout_type = LAYOUT_NFSV4_1_FILES; ++ lr.cbl_seg.clientid = 0; ++ lr.cbl_seg.offset = offset; ++ lr.cbl_seg.length = len; ++ lr.cbl_seg.iomode = IOMODE_ANY; ++ lr.cbl_layoutchanged = 0; ++ ++ nfsd_layout_recall_cb(sb, inode, &lr); ++ ++ return 0; ++} ++ ++ ++int ++spnfs_test_layoutrecall(char *path, u64 offset, u64 len) ++{ ++ struct nameidata nd; ++ struct inode *inode; ++ int type, rc; ++ ++ dprintk("%s: path=%s, offset=%llu, len=%llu\n", ++ __func__, path, offset, len); ++ ++ if (strcmp(path, "all") == 0) { ++ inode = NULL; ++ type = RETURN_ALL; ++ } else { ++ rc = path_lookup(path, 0, &nd); ++ if (rc != 0) ++ return -ENOENT; 
++ ++ /* ++ * XXX todo: add a RETURN_FSID scenario here...maybe if ++ * inode is a dir... ++ */ ++ ++ inode = nd.path.dentry->d_inode; ++ type = RETURN_FILE; ++ } ++ ++ if (len == 0) ++ len = NFS4_MAX_UINT64; ++ ++ rc = spnfs_layoutrecall(inode, type, offset, len); ++ ++ if (type != RETURN_ALL) ++ path_put(&nd.path); ++ return rc; ++} ++ ++int ++spnfs_getdeviceiter(struct super_block *sb, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *gd_res) ++{ ++ struct spnfs *spnfs = global_spnfs; /* XXX keep up the pretence */ ++ struct spnfs_msg *im = NULL; ++ union spnfs_msg_res *res = NULL; ++ int status = 0; ++ ++ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); ++ if (im == NULL) { ++ status = -ENOMEM; ++ goto getdeviceiter_out; ++ } ++ ++ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); ++ if (res == NULL) { ++ status = -ENOMEM; ++ goto getdeviceiter_out; ++ } ++ ++ im->im_type = SPNFS_TYPE_GETDEVICEITER; ++ im->im_args.getdeviceiter_args.cookie = gd_res->gd_cookie; ++ im->im_args.getdeviceiter_args.verf = gd_res->gd_verf; ++ ++ /* call function to queue the msg for upcall */ ++ status = spnfs_upcall(spnfs, im, res); ++ if (status != 0) { ++ dprintk("%s spnfs upcall failure: %d\n", __func__, status); ++ status = -EIO; ++ goto getdeviceiter_out; ++ } ++ status = res->getdeviceiter_res.status; ++ ++ if (res->getdeviceiter_res.eof) ++ gd_res->gd_eof = 1; ++ else { ++ gd_res->gd_devid = res->getdeviceiter_res.devid; ++ gd_res->gd_cookie = res->getdeviceiter_res.cookie; ++ gd_res->gd_verf = res->getdeviceiter_res.verf; ++ gd_res->gd_eof = 0; ++ } ++ ++getdeviceiter_out: ++ kfree(im); ++ kfree(res); ++ ++ return status; ++} ++ ++#ifdef CONFIG_SPNFS_TEST ++/* ++ * Setup the rq_res xdr_buf. The svc_rqst rq_respages[1] page contains the ++ * 1024 encoded stripe indices. ++ * ++ * Skip the devaddr4 length and encode the indicies count (1024) in the ++ * rq_res.head and set the rq_res.head length. 
++ * ++ * Set the rq_res page_len to 4096 (for the 1024 stripe indices). ++ * Set the rq_res xdr_buf tail base to rq_respages[0] just after the ++ * rq_res head to hold the rest of the getdeviceinfo return. ++ * ++ * So rq_respages[rq_resused - 1] contains the rq_res.head and rq_res.tail and ++ * rq_respages[rq_resused] contains the rq_res.pages. ++ * ++ * Only built when CONFIG_SPNFS_TEST is defined. Always returns 0. ++ */ ++static int spnfs_test_indices_xdr(struct pnfs_xdr_info *info, ++ const struct pnfs_filelayout_device *fdev) ++{ ++ struct nfsd4_compoundres *resp = info->resp; ++ struct svc_rqst *rqstp = resp->rqstp; ++ struct xdr_buf *xb = &resp->rqstp->rq_res; ++ __be32 *p; ++ ++ p = nfsd4_xdr_reserve_space(resp, 8); /* two 4-byte words: length, then index count; NOTE(review): result is not checked -- confirm it cannot be NULL here */ ++ p++; /* Fill in length later */ ++ *p++ = cpu_to_be32(fdev->fl_stripeindices_length); /* 1024 */ ++ resp->p = p; ++ ++ xb->head[0].iov_len = (char *)resp->p - (char *)xb->head[0].iov_base; /* head ends right after the count word */ ++ xb->pages = &rqstp->rq_respages[rqstp->rq_resused]; /* page data starts at page index rq_resused */ ++ xb->page_base = 0; ++ xb->page_len = PAGE_SIZE; /* page of 1024 encoded indices */ ++ xb->tail[0].iov_base = resp->p; /* tail begins where head ends */ ++ resp->end = xb->head[0].iov_base + PAGE_SIZE; /* encoding limit: PAGE_SIZE bytes from the head base */ ++ xb->tail[0].iov_len = (char *)resp->end - (char *)resp->p; ++ return 0; ++} ++/* ++ * Return a stripeindices of length 1024 to test ++ * the pNFS client multipage getdeviceinfo implementation. ++ * ++ * Encode a page of stripe indices.
++ */ ++static void spnfs_set_test_indices(struct pnfs_filelayout_device *fldev, ++ struct spnfs_device *dev, ++ struct pnfs_devinfo_arg *info) ++{ ++ struct svc_rqst *rqstp = info->xdr.resp->rqstp; ++ __be32 *p; ++ int i, j = 0; ++ ++ p = (__be32 *)page_address(rqstp->rq_respages[rqstp->rq_resused]); ++ fldev->fl_stripeindices_length = 1024; ++ /* round-robin the data servers device index into the stripe indicie */ ++ for (i = 0; i < 1024; i++) { ++ *p++ = cpu_to_be32(j); ++ if (j < dev->dscount - 1) ++ j++; ++ else ++ j = 0; ++ } ++ fldev->fl_stripeindices_list = NULL; ++} ++#endif /* CONFIG_SPNFS_TEST */ ++ ++int ++spnfs_getdeviceinfo(struct super_block *sb, struct exp_xdr_stream *xdr, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ struct spnfs *spnfs = global_spnfs; ++ struct spnfs_msg *im = NULL; ++ union spnfs_msg_res *res = NULL; ++ struct spnfs_device *dev; ++ struct pnfs_filelayout_device *fldev = NULL; ++ struct pnfs_filelayout_multipath *mp = NULL; ++ struct pnfs_filelayout_devaddr *fldap = NULL; ++ int status = 0, i, len; ++ ++ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); ++ if (im == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ ++ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); ++ if (res == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ ++ im->im_type = SPNFS_TYPE_GETDEVICEINFO; ++ /* XXX FIX: figure out what to do about fsid */ ++ im->im_args.getdeviceinfo_args.devid = devid->devid; ++ ++ /* call function to queue the msg for upcall */ ++ status = spnfs_upcall(spnfs, im, res); ++ if (status != 0) { ++ dprintk("%s spnfs upcall failure: %d\n", __func__, status); ++ status = -EIO; ++ goto getdeviceinfo_out; ++ } ++ status = res->getdeviceinfo_res.status; ++ if (status != 0) ++ goto getdeviceinfo_out; ++ ++ dev = &res->getdeviceinfo_res.devinfo; ++ ++ /* Fill in the device data, i.e., nfs4_1_file_layout_ds_addr4 */ ++ fldev = kzalloc(sizeof(struct pnfs_filelayout_device), 
GFP_KERNEL); ++ if (fldev == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ ++ /* ++ * Stripe count is the same as data server count for our purposes ++ */ ++ fldev->fl_stripeindices_length = dev->dscount; ++ fldev->fl_device_length = dev->dscount; ++ ++ /* Set stripe indices */ ++#ifdef CONFIG_SPNFS_TEST ++ spnfs_set_test_indices(fldev, dev, info); ++ fldev->fl_enc_stripe_indices = spnfs_test_indices_xdr; ++#else /* CONFIG_SPNFS_TEST */ ++ fldev->fl_stripeindices_list = ++ kmalloc(fldev->fl_stripeindices_length * sizeof(u32), ++ GFP_KERNEL); ++ if (fldev->fl_stripeindices_list == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ for (i = 0; i < fldev->fl_stripeindices_length; i++) ++ fldev->fl_stripeindices_list[i] = i; ++#endif /* CONFIG_SPNFS_TEST */ ++ ++ /* ++ * Set the device's data server addresses No multipath for spnfs, ++ * so mp length is always 1. ++ * ++ */ ++ fldev->fl_device_list = ++ kmalloc(fldev->fl_device_length * ++ sizeof(struct pnfs_filelayout_multipath), ++ GFP_KERNEL); ++ if (fldev->fl_device_list == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ for (i = 0; i < fldev->fl_device_length; i++) { ++ mp = &fldev->fl_device_list[i]; ++ mp->fl_multipath_length = 1; ++ mp->fl_multipath_list = ++ kmalloc(sizeof(struct pnfs_filelayout_devaddr), ++ GFP_KERNEL); ++ if (mp->fl_multipath_list == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ fldap = mp->fl_multipath_list; ++ ++ /* ++ * Copy the netid into the device address, for example: "tcp" ++ */ ++ len = strlen(dev->dslist[i].netid); ++ fldap->r_netid.data = kmalloc(len, GFP_KERNEL); ++ if (fldap->r_netid.data == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ memcpy(fldap->r_netid.data, dev->dslist[i].netid, len); ++ fldap->r_netid.len = len; ++ ++ /* ++ * Copy the network address into the device address, ++ * for example: "10.35.9.16.08.01" ++ */ ++ len = strlen(dev->dslist[i].addr); ++ fldap->r_addr.data = 
kmalloc(len, GFP_KERNEL); ++ if (fldap->r_addr.data == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ memcpy(fldap->r_addr.data, dev->dslist[i].addr, len); ++ fldap->r_addr.len = len; ++ } ++ ++ /* encode the device data */ ++ status = filelayout_encode_devinfo(xdr, fldev); ++ ++getdeviceinfo_out: ++ if (fldev) { ++ kfree(fldev->fl_stripeindices_list); ++ if (fldev->fl_device_list) { ++ for (i = 0; i < fldev->fl_device_length; i++) { ++ fldap = ++ fldev->fl_device_list[i].fl_multipath_list; ++ kfree(fldap->r_netid.data); ++ kfree(fldap->r_addr.data); ++ kfree(fldap); ++ } ++ kfree(fldev->fl_device_list); ++ } ++ kfree(fldev); ++ } ++ ++ kfree(im); ++ kfree(res); ++ ++ return status; ++} ++ ++int ++spnfs_setattr(void) ++{ ++ return 0; ++} ++ ++int ++spnfs_open(struct inode *inode, struct nfsd4_open *open) ++{ ++ struct spnfs *spnfs = global_spnfs; /* keep up the pretence */ ++ struct spnfs_msg *im = NULL; ++ union spnfs_msg_res *res = NULL; ++ int status = 0; ++ ++ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); ++ if (im == NULL) { ++ status = -ENOMEM; ++ goto open_out; ++ } ++ ++ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); ++ if (res == NULL) { ++ status = -ENOMEM; ++ goto open_out; ++ } ++ ++ im->im_type = SPNFS_TYPE_OPEN; ++ im->im_args.open_args.inode = inode->i_ino; ++ im->im_args.open_args.generation = inode->i_generation; ++ im->im_args.open_args.create = open->op_create; ++ im->im_args.open_args.createmode = open->op_createmode; ++ im->im_args.open_args.truncate = open->op_truncate; ++ ++ /* call function to queue the msg for upcall */ ++ status = spnfs_upcall(spnfs, im, res); ++ if (status != 0) { ++ dprintk("%s spnfs upcall failure: %d\n", __func__, status); ++ status = -EIO; ++ goto open_out; ++ } ++ status = res->open_res.status; ++ ++open_out: ++ kfree(im); ++ kfree(res); ++ ++ return status; ++} ++ ++int ++spnfs_create(void) ++{ ++ return 0; ++} ++ ++/* ++ * Invokes the spnfsd with the inode number of the object to 
remove. ++ * The file has already been removed on the MDS, so all the spnsfd ++ * daemon does is remove the stripes. ++ * Returns 0 on success otherwise error code ++ */ ++int ++spnfs_remove(unsigned long ino, unsigned long generation) ++{ ++ struct spnfs *spnfs = global_spnfs; /* keep up the pretence */ ++ struct spnfs_msg *im = NULL; ++ union spnfs_msg_res *res = NULL; ++ int status = 0; ++ ++ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); ++ if (im == NULL) { ++ status = -ENOMEM; ++ goto remove_out; ++ } ++ ++ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); ++ if (res == NULL) { ++ status = -ENOMEM; ++ goto remove_out; ++ } ++ ++ im->im_type = SPNFS_TYPE_REMOVE; ++ im->im_args.remove_args.inode = ino; ++ im->im_args.remove_args.generation = generation; ++ ++ /* call function to queue the msg for upcall */ ++ status = spnfs_upcall(spnfs, im, res); ++ if (status != 0) { ++ dprintk("%s spnfs upcall failure: %d\n", __func__, status); ++ status = -EIO; ++ goto remove_out; ++ } ++ status = res->remove_res.status; ++ ++remove_out: ++ kfree(im); ++ kfree(res); ++ ++ return status; ++} ++ ++static int ++read_one(struct inode *inode, loff_t offset, size_t len, char *buf, ++ struct file **filp) ++{ ++ loff_t bufoffset = 0, soffset, pos, snum, soff, tmp; ++ size_t iolen; ++ int completed = 0, ds, err; ++ ++ while (len > 0) { ++ tmp = offset; ++ soff = do_div(tmp, spnfs_config->stripe_size); ++ snum = tmp; ++ ds = do_div(tmp, spnfs_config->num_ds); ++ if (spnfs_config->dense_striping == 0) ++ soffset = offset; ++ else { ++ tmp = snum; ++ do_div(tmp, spnfs_config->num_ds); ++ soffset = tmp * spnfs_config->stripe_size + soff; ++ } ++ if (len < spnfs_config->stripe_size - soff) ++ iolen = len; ++ else ++ iolen = spnfs_config->stripe_size - soff; ++ ++ pos = soffset; ++ err = vfs_read(filp[ds], buf + bufoffset, iolen, &pos); ++ if (err < 0) ++ return -EIO; ++ if (err == 0) ++ break; ++ filp[ds]->f_pos = pos; ++ iolen = err; ++ completed += iolen; ++ len -= iolen; ++ 
offset += iolen; ++ bufoffset += iolen; ++ } ++ ++ return completed; ++} ++ ++static __be32 ++read(struct inode *inode, loff_t offset, unsigned long *lenp, int vlen, ++ struct svc_rqst *rqstp) ++{ ++ int i, vnum, err, bytecount = 0; ++ char path[128]; ++ struct file *filp[SPNFS_MAX_DATA_SERVERS]; ++ size_t iolen; ++ __be32 status = nfs_ok; ++ ++ /* ++ * XXX We should just be doing this at open time, but it gets ++ * kind of messy storing this info in nfsd's state structures ++ * and piggybacking its path through the various state handling ++ * functions. Revisit this. ++ */ ++ memset(filp, 0, SPNFS_MAX_DATA_SERVERS * sizeof(struct file *)); ++ for (i = 0; i < spnfs_config->num_ds; i++) { ++ sprintf(path, "%s/%ld.%u", spnfs_config->ds_dir[i], ++ inode->i_ino, inode->i_generation); ++ filp[i] = filp_open(path, O_RDONLY | O_LARGEFILE, 0); ++ if (filp[i] == NULL) { ++ status = nfserr_io; ++ goto read_out; ++ } ++ get_file(filp[i]); ++ } ++ ++ for (vnum = 0 ; vnum < vlen ; vnum++) { ++ iolen = rqstp->rq_vec[vnum].iov_len; ++ err = read_one(inode, offset + bytecount, iolen, ++ (char *)rqstp->rq_vec[vnum].iov_base, filp); ++ if (err < 0) { ++ status = nfserr_io; ++ goto read_out; ++ } ++ if (err < iolen) { ++ bytecount += err; ++ goto read_out; ++ } ++ bytecount += rqstp->rq_vec[vnum].iov_len; ++ } ++ ++read_out: ++ *lenp = bytecount; ++ for (i = 0; i < spnfs_config->num_ds; i++) { ++ if (filp[i]) { ++ filp_close(filp[i], current->files); ++ fput(filp[i]); ++ } ++ } ++ return status; ++} ++ ++__be32 ++spnfs_read(struct inode *inode, loff_t offset, unsigned long *lenp, int vlen, ++ struct svc_rqst *rqstp) ++{ ++ if (spnfs_config) ++ return read(inode, offset, lenp, vlen, rqstp); ++ else { ++ printk(KERN_ERR "Please upgrade to latest spnfsd\n"); ++ return nfserr_notsupp; ++ } ++} ++ ++static int ++write_one(struct inode *inode, loff_t offset, size_t len, char *buf, ++ struct file **filp) ++{ ++ loff_t bufoffset = 0, soffset, pos, snum, soff, tmp; ++ size_t iolen; ++ int 
completed = 0, ds, err; ++ ++ while (len > 0) { ++ tmp = offset; ++ soff = do_div(tmp, spnfs_config->stripe_size); ++ snum = tmp; ++ ds = do_div(tmp, spnfs_config->num_ds); ++ if (spnfs_config->dense_striping == 0) ++ soffset = offset; ++ else { ++ tmp = snum; ++ do_div(tmp, spnfs_config->num_ds); ++ soffset = tmp * spnfs_config->stripe_size + soff; ++ } ++ if (len < spnfs_config->stripe_size - soff) ++ iolen = len; ++ else ++ iolen = spnfs_config->stripe_size - soff; ++ ++ pos = soffset; ++ err = vfs_write(filp[ds], buf + bufoffset, iolen, &pos); ++ if (err < 0) ++ return -EIO; ++ filp[ds]->f_pos = pos; ++ iolen = err; ++ completed += iolen; ++ len -= iolen; ++ offset += iolen; ++ bufoffset += iolen; ++ } ++ ++ return completed; ++} ++ ++static __be32 ++write(struct inode *inode, loff_t offset, size_t len, int vlen, ++ struct svc_rqst *rqstp) ++{ ++ int i, vnum, err, bytecount = 0; ++ char path[128]; ++ struct file *filp[SPNFS_MAX_DATA_SERVERS]; ++ size_t iolen; ++ __be32 status = nfs_ok; ++ ++ /* ++ * XXX We should just be doing this at open time, but it gets ++ * kind of messy storing this info in nfsd's state structures ++ * and piggybacking its path through the various state handling ++ * functions. Revisit this. 
++ */ ++ memset(filp, 0, SPNFS_MAX_DATA_SERVERS * sizeof(struct file *)); ++ for (i = 0; i < spnfs_config->num_ds; i++) { ++ sprintf(path, "%s/%ld.%u", spnfs_config->ds_dir[i], ++ inode->i_ino, inode->i_generation); ++ filp[i] = filp_open(path, O_RDWR | O_LARGEFILE, 0); ++ if (filp[i] == NULL) { ++ status = nfserr_io; ++ goto write_out; ++ } ++ get_file(filp[i]); ++ } ++ ++ for (vnum = 0; vnum < vlen; vnum++) { ++ iolen = rqstp->rq_vec[vnum].iov_len; ++ err = write_one(inode, offset + bytecount, iolen, ++ (char *)rqstp->rq_vec[vnum].iov_base, filp); ++ if (err != iolen) { ++ dprintk("spnfs_write: err=%d expected %Zd\n", err, len); ++ status = nfserr_io; ++ goto write_out; ++ } ++ bytecount += rqstp->rq_vec[vnum].iov_len; ++ } ++ ++write_out: ++ for (i = 0; i < spnfs_config->num_ds; i++) { ++ if (filp[i]) { ++ filp_close(filp[i], current->files); ++ fput(filp[i]); ++ } ++ } ++ ++ return status; ++} ++ ++__be32 ++spnfs_write(struct inode *inode, loff_t offset, size_t len, int vlen, ++ struct svc_rqst *rqstp) ++{ ++ if (spnfs_config) ++ return write(inode, offset, len, vlen, rqstp); ++ else { ++ printk(KERN_ERR "Please upgrade to latest spnfsd\n"); ++ return nfserr_notsupp; ++ } ++} ++ ++int ++spnfs_commit(void) ++{ ++ return 0; ++} ++ ++/* ++ * Return the state for this object. 
++ * At this time simply return 0 to indicate success and use the existing state ++ */ ++int ++spnfs_get_state(struct inode *inode, struct knfsd_fh *fh, struct pnfs_get_state *arg) ++{ ++ return 0; ++} ++ ++/* ++ * Return the filehandle for the specified file descriptor ++ */ ++int ++spnfs_getfh(int fd, struct nfs_fh *fh) ++{ ++ struct file *file; ++ ++ file = fget(fd); ++ if (file == NULL) ++ return -EIO; ++ ++ memcpy(fh, NFS_FH(file->f_dentry->d_inode), sizeof(struct nfs_fh)); ++ fput(file); ++ return 0; ++} +diff -up linux-2.6.37.noarch/fs/nfsd/state.h.orig linux-2.6.37.noarch/fs/nfsd/state.h +--- linux-2.6.37.noarch/fs/nfsd/state.h.orig 2011-01-28 09:37:32.568979046 -0500 ++++ linux-2.6.37.noarch/fs/nfsd/state.h 2011-01-28 09:43:53.370768171 -0500 +@@ -37,6 +37,7 @@ + + #include + #include ++#include + #include "nfsfh.h" + + typedef struct { +@@ -65,17 +66,6 @@ typedef struct { + (s)->si_fileid, \ + (s)->si_generation + +-struct nfsd4_callback { +- void *cb_op; +- struct nfs4_client *cb_clp; +- struct list_head cb_per_client; +- u32 cb_minorversion; +- struct rpc_message cb_msg; +- const struct rpc_call_ops *cb_ops; +- struct work_struct cb_work; +- bool cb_done; +-}; +- + struct nfs4_delegation { + struct list_head dl_perfile; + struct list_head dl_perclnt; +@@ -267,6 +257,12 @@ struct nfs4_client { + unsigned long cl_cb_slot_busy; + struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ + /* wait here for slots */ ++#if defined(CONFIG_PNFSD) ++ struct list_head cl_layouts; /* outstanding layouts */ ++ struct list_head cl_layoutrecalls; /* outstanding layoutrecall ++ callbacks */ ++ atomic_t cl_deviceref; /* Num outstanding devs */ ++#endif /* CONFIG_PNFSD */ + }; + + static inline void +@@ -383,6 +379,14 @@ struct nfs4_file { + u32 fi_id; /* used with stateowner->so_id + * for stateid_hashtbl hash */ + bool fi_had_conflict; ++#if defined(CONFIG_PNFSD) ++ struct list_head fi_layouts; ++ struct list_head fi_layout_states; ++ /* used by layoutget / 
layoutrecall */ ++ struct nfs4_fsid fi_fsid; ++ u32 fi_fhlen; ++ u8 fi_fhval[NFS4_FHSIZE]; ++#endif /* CONFIG_PNFSD */ + }; + + /* XXX: for first cut may fall back on returning file that doesn't work +@@ -411,6 +415,15 @@ static inline struct file *find_any_file + return f->fi_fds[O_RDONLY]; + } + ++#if defined(CONFIG_PNFSD) ++/* pNFS Metadata server state */ ++ ++struct pnfs_ds_dev_entry { ++ struct list_head dd_dev_entry; /* st_pnfs_ds_id entry */ ++ u32 dd_dsid; ++}; ++#endif /* CONFIG_PNFSD */ ++ + /* + * nfs4_stateid can either be an open stateid or (eventually) a lock stateid + * +@@ -433,6 +446,9 @@ struct nfs4_stateid { + struct list_head st_perfile; + struct list_head st_perstateowner; + struct list_head st_lockowners; ++#if defined(CONFIG_PNFSD) ++ struct list_head st_pnfs_ds_id; ++#endif /* CONFIG_PNFSD */ + struct nfs4_stateowner * st_stateowner; + struct nfs4_file * st_file; + stateid_t st_stateid; +@@ -485,6 +501,34 @@ extern void nfsd4_recdir_purge_old(void) + extern int nfsd4_create_clid_dir(struct nfs4_client *clp); + extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); + extern void release_session_client(struct nfsd4_session *); ++extern void nfsd4_free_slab(struct kmem_cache **); ++extern struct nfs4_file *find_file(struct inode *); ++extern struct nfs4_file *find_alloc_file(struct inode *, struct svc_fh *); ++extern void put_nfs4_file(struct nfs4_file *); ++extern void get_nfs4_file(struct nfs4_file *); ++extern struct nfs4_client *find_confirmed_client(clientid_t *); ++extern struct nfs4_stateid *find_stateid(stateid_t *, int flags); ++extern struct nfs4_delegation *find_delegation_stateid(struct inode *, stateid_t *); ++extern __be32 nfs4_check_stateid(stateid_t *); ++extern void expire_client_lock(struct nfs4_client *); ++extern int filter_confirmed_clients(int (* func)(struct nfs4_client *, void *), void *); ++ ++#if defined(CONFIG_PNFSD) ++extern int nfsd4_init_pnfs_slabs(void); ++extern void nfsd4_free_pnfs_slabs(void); ++extern 
void pnfs_expire_client(struct nfs4_client *); ++extern void release_pnfs_ds_dev_list(struct nfs4_stateid *); ++extern void nfs4_pnfs_state_init(void); ++extern void nfs4_pnfs_state_shutdown(void); ++extern void nfs4_ds_get_verifier(stateid_t *, struct super_block *, u32 *); ++extern int nfs4_preprocess_pnfs_ds_stateid(struct svc_fh *, stateid_t *); ++#else /* CONFIG_PNFSD */ ++static inline void nfsd4_free_pnfs_slabs(void) {} ++static inline int nfsd4_init_pnfs_slabs(void) { return 0; } ++static inline void pnfs_expire_client(struct nfs4_client *clp) {} ++static inline void release_pnfs_ds_dev_list(struct nfs4_stateid *stp) {} ++static inline void nfs4_pnfs_state_shutdown(void) {} ++#endif /* CONFIG_PNFSD */ + + static inline void + nfs4_put_stateowner(struct nfs4_stateowner *so) +@@ -498,4 +542,24 @@ nfs4_get_stateowner(struct nfs4_stateown + kref_get(&so->so_ref); + } + ++static inline u64 ++end_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ end = start + len; ++ return end >= start ? end : NFS4_MAX_UINT64; ++} ++ ++/* last octet in a range */ ++static inline u64 ++last_byte_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ BUG_ON(!len); ++ end = start + len; ++ return end > start ? 
end - 1 : NFS4_MAX_UINT64; ++} ++ + #endif /* NFSD4_STATE_H */ +diff -up linux-2.6.37.noarch/fs/nfsd/vfs.c.orig linux-2.6.37.noarch/fs/nfsd/vfs.c +--- linux-2.6.37.noarch/fs/nfsd/vfs.c.orig 2011-01-28 09:37:32.569979012 -0500 ++++ linux-2.6.37.noarch/fs/nfsd/vfs.c 2011-01-28 09:43:53.371768014 -0500 +@@ -36,7 +36,11 @@ + #ifdef CONFIG_NFSD_V4 + #include "acl.h" + #include "idmap.h" ++#include + #endif /* CONFIG_NFSD_V4 */ ++#if defined(CONFIG_SPNFS_BLOCK) ++#include ++#endif + + #include "nfsd.h" + #include "vfs.h" +@@ -380,6 +384,12 @@ nfsd_setattr(struct svc_rqst *rqstp, str + NFSD_MAY_TRUNC|NFSD_MAY_OWNER_OVERRIDE); + if (err) + goto out; ++#if defined(CONFIG_SPNFS_BLOCK) ++ if (pnfs_block_enabled(inode, 0)) { ++ err = bl_layoutrecall(inode, RETURN_FILE, ++ iap->ia_size, inode->i_size - iap->ia_size); ++ } ++#endif /* CONFIG_SPNFS_BLOCK */ + } + + host_err = get_write_access(inode); +@@ -1685,6 +1695,11 @@ nfsd_rename(struct svc_rqst *rqstp, stru + struct inode *fdir, *tdir; + __be32 err; + int host_err; ++#ifdef CONFIG_SPNFS ++ unsigned long ino = 0; ++ unsigned long generation = 0; ++ unsigned int nlink = 0; ++#endif /* CONFIG_SPNFS */ + + err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE); + if (err) +@@ -1744,7 +1759,27 @@ nfsd_rename(struct svc_rqst *rqstp, stru + host_err = nfsd_break_lease(odentry->d_inode); + if (host_err) + goto out_drop_write; ++ ++#ifdef CONFIG_SPNFS ++ /* ++ * if the target is a preexisting regular file, remember the ++ * inode number and generation so we can delete the stripes; ++ * save the link count as well so that the stripes only get ++ * get deleted when the last link is deleted ++ */ ++ if (ndentry && ndentry->d_inode && S_ISREG(ndentry->d_inode->i_mode)) { ++ ino = ndentry->d_inode->i_ino; ++ generation = ndentry->d_inode->i_generation; ++ nlink = ndentry->d_inode->i_nlink; ++ } ++#endif /* CONFIG_SPNFS */ ++ + host_err = vfs_rename(fdir, odentry, tdir, ndentry); ++#ifdef CONFIG_SPNFS ++ if (spnfs_enabled() && (!host_err 
&& ino && nlink == 1)) ++ spnfs_remove(ino, generation); ++#endif /* CONFIG_SPNFS */ ++ + if (!host_err) { + host_err = commit_metadata(tfhp); + if (!host_err) +@@ -1784,6 +1819,11 @@ nfsd_unlink(struct svc_rqst *rqstp, stru + struct inode *dirp; + __be32 err; + int host_err; ++#if defined(CONFIG_SPNFS) ++ unsigned long ino; ++ unsigned long generation; ++ unsigned int nlink; ++#endif /* defined(CONFIG_SPNFS) */ + + err = nfserr_acces; + if (!flen || isdotent(fname, flen)) +@@ -1807,6 +1847,17 @@ nfsd_unlink(struct svc_rqst *rqstp, stru + goto out; + } + ++#if defined(CONFIG_SPNFS) ++ /* ++ * Remember the inode number to communicate to the spnfsd ++ * for removal of stripes; save the link count as well so that ++ * the stripes only get get deleted when the last link is deleted ++ */ ++ ino = rdentry->d_inode->i_ino; ++ generation = rdentry->d_inode->i_generation; ++ nlink = rdentry->d_inode->i_nlink; ++#endif /* defined(CONFIG_SPNFS) */ ++ + if (!type) + type = rdentry->d_inode->i_mode & S_IFMT; + +@@ -1827,6 +1878,29 @@ out_put: + if (!host_err) + host_err = commit_metadata(fhp); + ++#if defined(CONFIG_SPNFS) ++ /* ++ * spnfs: notify spnfsd of removal to destroy stripes ++ */ ++/* ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (sb->s_export_op->spnfs_remove) { ++*/ ++ dprintk("%s check if spnfs_enabled\n", __FUNCTION__); ++ if (spnfs_enabled() && nlink == 1) { ++ BUG_ON(ino == 0); ++ dprintk("%s calling spnfs_remove inumber=%ld\n", ++ __FUNCTION__, ino); ++ if (spnfs_remove(ino, generation) == 0) { ++ dprintk("%s spnfs_remove success\n", __FUNCTION__); ++ } else { ++ /* XXX How do we make this atomic? 
*/ ++ printk(KERN_WARNING "nfsd: pNFS could not " ++ "remove stripes for inode: %ld\n", ino); ++ } ++ } ++#endif /* defined(CONFIG_SPNFS) */ ++ + mnt_drop_write(fhp->fh_export->ex_path.mnt); + out_nfserr: + err = nfserrno(host_err); +diff -up linux-2.6.37.noarch/fs/nfsd/xdr4.h.orig linux-2.6.37.noarch/fs/nfsd/xdr4.h +--- linux-2.6.37.noarch/fs/nfsd/xdr4.h.orig 2011-01-28 09:37:32.570978977 -0500 ++++ linux-2.6.37.noarch/fs/nfsd/xdr4.h 2011-01-28 09:43:53.372767858 -0500 +@@ -37,6 +37,8 @@ + #ifndef _LINUX_NFSD_XDR4_H + #define _LINUX_NFSD_XDR4_H + ++#include ++ + #include "state.h" + #include "nfsd.h" + +@@ -390,6 +392,51 @@ struct nfsd4_reclaim_complete { + u32 rca_one_fs; + }; + ++struct nfsd4_pnfs_getdevinfo { ++ struct nfsd4_pnfs_deviceid gd_devid; /* request */ ++ u32 gd_layout_type; /* request */ ++ u32 gd_maxcount; /* request */ ++ u32 gd_notify_types;/* request */ ++ struct super_block *gd_sb; ++}; ++ ++struct nfsd4_pnfs_getdevlist { ++ u32 gd_layout_type; /* request */ ++ u32 gd_maxdevices; /* request */ ++ u64 gd_cookie; /* request - response */ ++ u64 gd_verf; /* request - response */ ++ struct svc_fh *gd_fhp; /* response */ ++ u32 gd_eof; /* response */ ++}; ++ ++struct nfsd4_pnfs_layoutget { ++ u64 lg_minlength; /* request */ ++ u32 lg_signal; /* request */ ++ u32 lg_maxcount; /* request */ ++ struct svc_fh *lg_fhp; /* request */ ++ stateid_t lg_sid; /* request/response */ ++ struct nfsd4_layout_seg lg_seg; /* request/response */ ++ u32 lg_roc; /* response */ ++}; ++ ++struct nfsd4_pnfs_layoutcommit { ++ struct nfsd4_pnfs_layoutcommit_arg args; ++ stateid_t lc_sid; /* request */ ++ struct nfsd4_pnfs_layoutcommit_res res; ++}; ++ ++enum layoutreturn_flags { ++ LR_FLAG_INTERN = 1 << 0, /* internal return */ ++ LR_FLAG_EXPIRE = 1 << 1, /* return on client expiration */ ++}; ++ ++struct nfsd4_pnfs_layoutreturn { ++ struct nfsd4_pnfs_layoutreturn_arg args; ++ u32 lr_flags; ++ stateid_t lr_sid; /* request/resopnse */ ++ u32 lrs_present; /* response */ ++}; 
++ + struct nfsd4_op { + int opnum; + __be32 status; +@@ -432,6 +479,13 @@ struct nfsd4_op { + struct nfsd4_destroy_session destroy_session; + struct nfsd4_sequence sequence; + struct nfsd4_reclaim_complete reclaim_complete; ++#if defined(CONFIG_PNFSD) ++ struct nfsd4_pnfs_getdevlist pnfs_getdevlist; ++ struct nfsd4_pnfs_getdevinfo pnfs_getdevinfo; ++ struct nfsd4_pnfs_layoutget pnfs_layoutget; ++ struct nfsd4_pnfs_layoutcommit pnfs_layoutcommit; ++ struct nfsd4_pnfs_layoutreturn pnfs_layoutreturn; ++#endif /* CONFIG_PNFSD */ + } u; + struct nfs4_replay * replay; + }; +diff -up linux-2.6.37.noarch/fs/nfs/file.c.orig linux-2.6.37.noarch/fs/nfs/file.c +--- linux-2.6.37.noarch/fs/nfs/file.c.orig 2011-01-04 19:50:19.000000000 -0500 ++++ linux-2.6.37.noarch/fs/nfs/file.c 2011-01-28 09:43:53.316775510 -0500 +@@ -381,16 +381,16 @@ static int nfs_write_begin(struct file * + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct page *page; + int once_thru = 0; ++ struct pnfs_layout_segment *lseg; + + dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, + mapping->host->i_ino, len, (long long) pos); + +- pnfs_update_layout(mapping->host, +- nfs_file_open_context(file), +- IOMODE_RW); +- ++ lseg = pnfs_update_layout(mapping->host, ++ nfs_file_open_context(file), ++ pos, len, IOMODE_RW); + start: + /* + * Prevent starvation issues if someone is doing a consistency +@@ -399,17 +399,22 @@ start: + ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (ret) +- return ret; ++ goto out; + + page = grab_cache_page_write_begin(mapping, index, flags); +- if (!page) +- return -ENOMEM; ++ if (!page) { ++ ret = -ENOMEM; ++ goto out; ++ } + *pagep = page; + +- ret = nfs_flush_incompatible(file, page); ++ ret = nfs_flush_incompatible(file, page, lseg); + if (ret) { + unlock_page(page); + page_cache_release(page); ++ *pagep = NULL; ++ *fsdata = 
NULL; ++ goto out; + } else if (!once_thru && + nfs_want_read_modify_write(file, page, pos, len)) { + once_thru = 1; +@@ -418,6 +423,12 @@ start: + if (!ret) + goto start; + } ++ ret = pnfs_write_begin(file, page, pos, len, lseg, fsdata); ++ out: ++ if (ret) { ++ put_lseg(lseg); ++ *fsdata = NULL; ++ } + return ret; + } + +@@ -427,6 +438,7 @@ static int nfs_write_end(struct file *fi + { + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + int status; ++ struct pnfs_layout_segment *lseg; + + dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n", + file->f_path.dentry->d_parent->d_name.name, +@@ -453,10 +465,17 @@ static int nfs_write_end(struct file *fi + zero_user_segment(page, pglen, PAGE_CACHE_SIZE); + } + +- status = nfs_updatepage(file, page, offset, copied); ++ lseg = nfs4_pull_lseg_from_fsdata(file, fsdata); ++ status = pnfs_write_end(file, page, pos, len, copied, lseg); ++ if (status) ++ goto out; ++ status = nfs_updatepage(file, page, offset, copied, lseg, fsdata); + ++ out: + unlock_page(page); + page_cache_release(page); ++ pnfs_write_end_cleanup(file, fsdata); ++ put_lseg(lseg); + + if (status < 0) + return status; +@@ -567,6 +586,8 @@ static int nfs_vm_page_mkwrite(struct vm + /* make sure the cache has finished storing the page */ + nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page); + ++ /* XXX Do we want to call pnfs_update_layout here? 
*/ ++ + lock_page(page); + mapping = page->mapping; + if (mapping != dentry->d_inode->i_mapping) +@@ -577,8 +598,8 @@ static int nfs_vm_page_mkwrite(struct vm + goto out_unlock; + + ret = VM_FAULT_LOCKED; +- if (nfs_flush_incompatible(filp, page) == 0 && +- nfs_updatepage(filp, page, 0, pagelen) == 0) ++ if (nfs_flush_incompatible(filp, page, NULL) == 0 && ++ nfs_updatepage(filp, page, 0, pagelen, NULL, NULL) == 0) + goto out; + + ret = VM_FAULT_SIGBUS; +diff -up linux-2.6.37.noarch/fs/nfs/inode.c.orig linux-2.6.37.noarch/fs/nfs/inode.c +--- linux-2.6.37.noarch/fs/nfs/inode.c.orig 2011-01-28 09:37:32.529980398 -0500 ++++ linux-2.6.37.noarch/fs/nfs/inode.c 2011-01-28 09:43:53.317775328 -0500 +@@ -648,6 +648,7 @@ struct nfs_open_context *get_nfs_open_co + atomic_inc(&ctx->lock_context.count); + return ctx; + } ++EXPORT_SYMBOL(get_nfs_open_context); + + static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) + { +@@ -1003,6 +1004,7 @@ void nfs_fattr_init(struct nfs_fattr *fa + fattr->time_start = jiffies; + fattr->gencount = nfs_inc_attr_generation_counter(); + } ++EXPORT_SYMBOL(nfs_fattr_init); + + struct nfs_fattr *nfs_alloc_fattr(void) + { +@@ -1212,6 +1214,14 @@ static int nfs_update_inode(struct inode + server->fsid = fattr->fsid; + + /* ++ * file needs layout commit, server attributes may be stale ++ */ ++ if (layoutcommit_needed(nfsi) && nfsi->change_attr >= fattr->change_attr) { ++ dprintk("NFS: %s: layoutcommit is needed for file %s/%ld\n", ++ __func__, inode->i_sb->s_id, inode->i_ino); ++ return 0; ++ } ++ /* + * Update the read time so we don't revalidate too often. 
+ */ + nfsi->read_cache_jiffies = fattr->time_start; +@@ -1410,9 +1420,10 @@ static int nfs_update_inode(struct inode + */ + void nfs4_evict_inode(struct inode *inode) + { +- pnfs_destroy_layout(NFS_I(inode)); ++ pnfs_return_layout(inode, NULL, true); + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); ++ pnfs_destroy_layout(NFS_I(inode)); + /* If we are holding a delegation, return it! */ + nfs_inode_return_delegation_noreclaim(inode); + /* First call standard NFS clear_inode() code */ +@@ -1457,6 +1468,8 @@ static inline void nfs4_init_once(struct + nfsi->delegation = NULL; + nfsi->delegation_state = 0; + init_rwsem(&nfsi->rwsem); ++ rpc_init_wait_queue(&nfsi->lo_rpcwaitq, "pNFS Layoutreturn"); ++ rpc_init_wait_queue(&nfsi->lo_rpcwaitq_stateid, "pNFS Layoutstateid"); + nfsi->layout = NULL; + #endif + } +diff -up linux-2.6.37.noarch/fs/nfs/internal.h.orig linux-2.6.37.noarch/fs/nfs/internal.h +--- linux-2.6.37.noarch/fs/nfs/internal.h.orig 2011-01-28 09:37:32.529980398 -0500 ++++ linux-2.6.37.noarch/fs/nfs/internal.h 2011-01-28 09:43:53.318775148 -0500 +@@ -149,6 +149,16 @@ extern struct nfs_server *nfs_clone_serv + struct nfs_fattr *); + extern void nfs_mark_client_ready(struct nfs_client *clp, int state); + extern int nfs4_check_client_ready(struct nfs_client *clp); ++extern int nfs_sockaddr_cmp(const struct sockaddr *sa1, ++ const struct sockaddr *sa2); ++extern int nfs4_set_client(struct nfs_server *server, ++ const char *hostname, ++ const struct sockaddr *addr, ++ const size_t addrlen, ++ const char *ip_addr, ++ rpc_authflavor_t authflavour, ++ int proto, const struct rpc_timeout *timeparms, ++ u32 minorversion); + #ifdef CONFIG_PROC_FS + extern int __init nfs_fs_proc_init(void); + extern void nfs_fs_proc_exit(void); +@@ -214,6 +224,8 @@ extern const u32 nfs41_maxwrite_overhead + extern struct rpc_procinfo nfs4_procedures[]; + #endif + ++extern int nfs4_recover_expired_lease(struct nfs_client *clp); ++ + /* proc.c */ + void 
nfs_close_context(struct nfs_open_context *ctx, int is_sync); + +@@ -263,10 +275,31 @@ extern int nfs4_get_rootfh(struct nfs_se + #endif + + /* read.c */ ++extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops); ++extern int pnfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops); + extern void nfs_read_prepare(struct rpc_task *task, void *calldata); + + /* write.c */ ++extern int nfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how); ++extern int pnfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how); ++extern int nfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how); ++extern int pnfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how, int pnfs); + extern void nfs_write_prepare(struct rpc_task *task, void *calldata); ++extern void nfs_mark_list_commit(struct list_head *head); + #ifdef CONFIG_MIGRATION + extern int nfs_migrate_page(struct address_space *, + struct page *, struct page *); +diff -up linux-2.6.37.noarch/fs/nfs/Kconfig.orig linux-2.6.37.noarch/fs/nfs/Kconfig +--- linux-2.6.37.noarch/fs/nfs/Kconfig.orig 2011-01-04 19:50:19.000000000 -0500 ++++ linux-2.6.37.noarch/fs/nfs/Kconfig 2011-01-28 09:43:53.304777898 -0500 +@@ -87,6 +87,34 @@ config NFS_V4_1 + config PNFS_FILE_LAYOUT + tristate + ++config PNFS_OBJLAYOUT ++ tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" ++ depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD ++ help ++ Say M here if you want your pNFS client to support the Objects Layout Driver. ++ Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and ++ upper level driver (SCSI_OSD_ULD). 
++ ++ If unsure, say N. ++ ++config PNFS_PANLAYOUT ++ tristate "Provide support for the Panasas OSD Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" ++ depends on PNFS_OBJLAYOUT ++ help ++ Say M or y here if you want your pNFS client to support the Panasas OSD Layout Driver. ++ ++ If unsure, say N. ++ ++config PNFS_BLOCK ++ tristate "Provide a pNFS block client (EXPERIMENTAL)" ++ depends on NFS_FS && NFS_V4_1 ++ select MD ++ select BLK_DEV_DM ++ help ++ Say M or y here if you want your pNfs client to support the block protocol ++ ++ If unsure, say N. ++ + config ROOT_NFS + bool "Root file system on NFS" + depends on NFS_FS=y && IP_PNP +diff -up linux-2.6.37.noarch/fs/nfs/Makefile.orig linux-2.6.37.noarch/fs/nfs/Makefile +--- linux-2.6.37.noarch/fs/nfs/Makefile.orig 2011-01-04 19:50:19.000000000 -0500 ++++ linux-2.6.37.noarch/fs/nfs/Makefile 2011-01-28 09:43:53.305777685 -0500 +@@ -21,3 +21,6 @@ nfs-$(CONFIG_NFS_FSCACHE) += fscache.o f + + obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o + nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o ++ ++obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ ++obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ +diff -up linux-2.6.37.noarch/fs/nfs/nfs4filelayout.c.orig linux-2.6.37.noarch/fs/nfs/nfs4filelayout.c +--- linux-2.6.37.noarch/fs/nfs/nfs4filelayout.c.orig 2011-01-28 09:37:32.537980121 -0500 ++++ linux-2.6.37.noarch/fs/nfs/nfs4filelayout.c 2011-01-28 09:43:53.320774796 -0500 +@@ -41,7 +41,7 @@ MODULE_AUTHOR("Dean Hildebrand nfs_client, + nfs4_fl_free_deviceid_callback); +@@ -66,6 +66,200 @@ filelayout_clear_layoutdriver(struct nfs + return 0; + } + ++/* This function is used by the layout driver to calculate the ++ * offset of the file on the dserver based on whether the ++ * layout type is STRIPE_DENSE or STRIPE_SPARSE ++ */ ++static loff_t ++filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); ++ ++ switch 
(flseg->stripe_type) { ++ case STRIPE_SPARSE: ++ return offset; ++ ++ case STRIPE_DENSE: ++ { ++ u32 stripe_width; ++ u64 tmp, off; ++ u32 unit = flseg->stripe_unit; ++ ++ stripe_width = unit * flseg->dsaddr->stripe_count; ++ tmp = off = offset - flseg->pattern_offset; ++ do_div(tmp, stripe_width); ++ return tmp * unit + do_div(off, unit); ++ } ++ default: ++ BUG(); ++ } ++ ++ /* We should never get here... just to stop the gcc warning */ ++ return 0; ++} ++ ++/* ++ * Call ops for the async read/write cases ++ * In the case of dense layouts, the offset needs to be reset to its ++ * original value. ++ */ ++static void filelayout_read_call_done(struct rpc_task *task, void *data) ++{ ++ struct nfs_read_data *rdata = (struct nfs_read_data *)data; ++ ++ if (rdata->fldata.orig_offset) { ++ dprintk("%s new off %llu orig offset %llu\n", __func__, ++ rdata->args.offset, rdata->fldata.orig_offset); ++ rdata->args.offset = rdata->fldata.orig_offset; ++ } ++ ++ /* Note this may cause RPC to be resent */ ++ rdata->pdata.call_ops->rpc_call_done(task, data); ++} ++ ++static void filelayout_read_release(void *data) ++{ ++ struct nfs_read_data *rdata = (struct nfs_read_data *)data; ++ ++ put_lseg(rdata->pdata.lseg); ++ rdata->pdata.lseg = NULL; ++ rdata->pdata.call_ops->rpc_release(data); ++} ++ ++static void filelayout_write_call_done(struct rpc_task *task, void *data) ++{ ++ struct nfs_write_data *wdata = (struct nfs_write_data *)data; ++ ++ if (wdata->fldata.orig_offset) { ++ dprintk("%s new off %llu orig offset %llu\n", __func__, ++ wdata->args.offset, wdata->fldata.orig_offset); ++ wdata->args.offset = wdata->fldata.orig_offset; ++ } ++ ++ /* Note this may cause RPC to be resent */ ++ wdata->pdata.call_ops->rpc_call_done(task, data); ++} ++ ++static void filelayout_write_release(void *data) ++{ ++ struct nfs_write_data *wdata = (struct nfs_write_data *)data; ++ ++ put_lseg(wdata->pdata.lseg); ++ wdata->pdata.lseg = NULL; ++ wdata->pdata.call_ops->rpc_release(data); ++} ++ 
++struct rpc_call_ops filelayout_read_call_ops = { ++ .rpc_call_prepare = nfs_read_prepare, ++ .rpc_call_done = filelayout_read_call_done, ++ .rpc_release = filelayout_read_release, ++}; ++ ++struct rpc_call_ops filelayout_write_call_ops = { ++ .rpc_call_prepare = nfs_write_prepare, ++ .rpc_call_done = filelayout_write_call_done, ++ .rpc_release = filelayout_write_release, ++}; ++ ++/* Perform sync or async reads. ++ * ++ * An optimization for the NFS file layout driver ++ * allows the original read/write data structs to be passed in the ++ * last argument. ++ * ++ * TODO: join with write_pagelist? ++ */ ++static enum pnfs_try_status ++filelayout_read_pagelist(struct nfs_read_data *data, unsigned nr_pages) ++{ ++ struct pnfs_layout_segment *lseg = data->pdata.lseg; ++ struct nfs4_pnfs_ds *ds; ++ loff_t offset = data->args.offset; ++ u32 idx; ++ struct nfs_fh *fh; ++ ++ dprintk("--> %s ino %lu nr_pages %d pgbase %u req %Zu@%llu\n", ++ __func__, data->inode->i_ino, nr_pages, ++ data->args.pgbase, (size_t)data->args.count, offset); ++ ++ /* Retrieve the correct rpc_client for the byte range */ ++ idx = nfs4_fl_calc_ds_index(lseg, offset); ++ ds = nfs4_fl_prepare_ds(lseg, idx); ++ if (!ds) { ++ printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++ } ++ dprintk("%s USE DS:ip %x %hu\n", __func__, ++ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); ++ ++ /* just try the first data server for the index..*/ ++ data->fldata.ds_nfs_client = ds->ds_clp; ++ fh = nfs4_fl_select_ds_fh(lseg, offset); ++ if (fh) ++ data->args.fh = fh; ++ ++ /* ++ * Now get the file offset on the dserver ++ * Set the read offset to this offset, and ++ * save the original offset in orig_offset ++ * In the case of async reads, the offset will be reset in the ++ * call_ops->rpc_call_done() routine.
++ */ ++ data->args.offset = filelayout_get_dserver_offset(lseg, offset); ++ data->fldata.orig_offset = offset; ++ ++ /* Perform an asynchronous read */ ++ nfs_initiate_read(data, ds->ds_clp->cl_rpcclient, ++ &filelayout_read_call_ops); ++ ++ data->pdata.pnfs_error = 0; ++ ++ return PNFS_ATTEMPTED; ++} ++ ++/* Perform async writes. */ ++static enum pnfs_try_status ++filelayout_write_pagelist(struct nfs_write_data *data, unsigned nr_pages, int sync) ++{ ++ struct pnfs_layout_segment *lseg = data->pdata.lseg; ++ struct nfs4_pnfs_ds *ds; ++ loff_t offset = data->args.offset; ++ u32 idx; ++ struct nfs_fh *fh; ++ ++ /* Retrieve the correct rpc_client for the byte range */ ++ idx = nfs4_fl_calc_ds_index(lseg, offset); ++ ds = nfs4_fl_prepare_ds(lseg, idx); ++ if (!ds) { ++ printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++ } ++ dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu\n", __func__, ++ data->inode->i_ino, sync, (size_t) data->args.count, offset, ++ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); ++ ++ data->fldata.ds_nfs_client = ds->ds_clp; ++ fh = nfs4_fl_select_ds_fh(lseg, offset); ++ if (fh) ++ data->args.fh = fh; ++ /* ++ * Get the file offset on the dserver. Set the write offset to ++ * this offset and save the original offset. 
++ */ ++ data->args.offset = filelayout_get_dserver_offset(lseg, offset); ++ data->fldata.orig_offset = offset; ++ ++ /* ++ * Perform an asynchronous write The offset will be reset in the ++ * call_ops->rpc_call_done() routine ++ */ ++ nfs_initiate_write(data, ds->ds_clp->cl_rpcclient, ++ &filelayout_write_call_ops, sync); ++ ++ data->pdata.pnfs_error = 0; ++ return PNFS_ATTEMPTED; ++} ++ + /* + * filelayout_check_layout() + * +@@ -82,7 +276,7 @@ filelayout_check_layout(struct pnfs_layo + { + struct nfs4_file_layout_dsaddr *dsaddr; + int status = -EINVAL; +- struct nfs_server *nfss = NFS_SERVER(lo->plh_inode); ++ struct nfs_server *nfss = NFS_SERVER(lo->inode); + + dprintk("--> %s\n", __func__); + +@@ -101,7 +295,7 @@ filelayout_check_layout(struct pnfs_layo + /* find and reference the deviceid */ + dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id); + if (dsaddr == NULL) { +- dsaddr = get_device_info(lo->plh_inode, id); ++ dsaddr = get_device_info(lo->inode, id); + if (dsaddr == NULL) + goto out; + } +@@ -243,7 +437,7 @@ filelayout_alloc_lseg(struct pnfs_layout + static void + filelayout_free_lseg(struct pnfs_layout_segment *lseg) + { +- struct nfs_server *nfss = NFS_SERVER(lseg->pls_layout->plh_inode); ++ struct nfs_server *nfss = NFS_SERVER(lseg->layout->inode); + struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); + + dprintk("--> %s\n", __func__); +@@ -252,14 +446,229 @@ filelayout_free_lseg(struct pnfs_layout_ + _filelayout_free_lseg(fl); + } + ++/* Allocate a new nfs_write_data struct and initialize */ ++static struct nfs_write_data * ++filelayout_clone_write_data(struct nfs_write_data *old) ++{ ++ static struct nfs_write_data *new; ++ ++ new = nfs_commitdata_alloc(); ++ if (!new) ++ goto out; ++ kref_init(&new->refcount); ++ new->parent = old; ++ kref_get(&old->refcount); ++ new->inode = old->inode; ++ new->cred = old->cred; ++ new->args.offset = 0; ++ new->args.count = 0; ++ new->res.count = 0; ++ new->res.fattr = &new->fattr; ++ 
nfs_fattr_init(&new->fattr); ++ new->res.verf = &new->verf; ++ new->args.context = get_nfs_open_context(old->args.context); ++ new->pdata.lseg = NULL; ++ new->pdata.call_ops = old->pdata.call_ops; ++ new->pdata.how = old->pdata.how; ++out: ++ return new; ++} ++ ++static void filelayout_commit_call_done(struct rpc_task *task, void *data) ++{ ++ struct nfs_write_data *wdata = (struct nfs_write_data *)data; ++ ++ wdata->pdata.call_ops->rpc_call_done(task, data); ++} ++ ++static struct rpc_call_ops filelayout_commit_call_ops = { ++ .rpc_call_prepare = nfs_write_prepare, ++ .rpc_call_done = filelayout_commit_call_done, ++ .rpc_release = filelayout_write_release, ++}; ++ ++/* ++ * Execute a COMMIT op to the MDS or to each data server on which a page ++ * in 'pages' exists. ++ * Invoke the pnfs_commit_complete callback. ++ */ ++enum pnfs_try_status ++filelayout_commit(struct nfs_write_data *data, int sync) ++{ ++ LIST_HEAD(head); ++ struct nfs_page *req; ++ loff_t file_offset = 0; ++ u16 idx, i; ++ struct list_head **ds_page_list = NULL; ++ u16 *indices_used; ++ int num_indices_seen = 0; ++ bool used_mds = false; ++ const struct rpc_call_ops *call_ops; ++ struct rpc_clnt *clnt; ++ struct nfs_write_data **clone_list = NULL; ++ struct nfs_write_data *dsdata; ++ struct nfs4_pnfs_ds *ds; ++ ++ dprintk("%s data %p sync %d\n", __func__, data, sync); ++ ++ /* Alloc room for both in one go */ ++ ds_page_list = kzalloc((NFS4_PNFS_MAX_MULTI_CNT + 1) * ++ (sizeof(u16) + sizeof(struct list_head *)), ++ GFP_KERNEL); ++ if (!ds_page_list) ++ goto mem_error; ++ indices_used = (u16 *) (ds_page_list + NFS4_PNFS_MAX_MULTI_CNT + 1); ++ /* ++ * Sort pages based on which ds to send to. ++ * MDS is given index equal to NFS4_PNFS_MAX_MULTI_CNT. ++ * Note we are assuming there is only a single lseg in play. ++ * When that is not true, we could first sort on lseg, then ++ * sort within each as we do here. 
++ */ ++ while (!list_empty(&data->pages)) { ++ req = nfs_list_entry(data->pages.next); ++ nfs_list_remove_request(req); ++ if (!req->wb_lseg || ++ ((struct nfs4_filelayout_segment *) ++ FILELAYOUT_LSEG(req->wb_lseg))->commit_through_mds) ++ idx = NFS4_PNFS_MAX_MULTI_CNT; ++ else { ++ file_offset = (loff_t)req->wb_index << PAGE_CACHE_SHIFT; ++ idx = nfs4_fl_calc_ds_index(req->wb_lseg, file_offset); ++ } ++ if (ds_page_list[idx]) { ++ /* Already seen this idx */ ++ list_add(&req->wb_list, ds_page_list[idx]); ++ } else { ++ /* New idx not seen so far */ ++ list_add_tail(&req->wb_list, &head); ++ indices_used[num_indices_seen++] = idx; ++ } ++ ds_page_list[idx] = &req->wb_list; ++ } ++ /* Once created, clone must be released via call_op */ ++ clone_list = kzalloc(num_indices_seen * ++ sizeof(struct nfs_write_data *), GFP_KERNEL); ++ if (!clone_list) ++ goto mem_error; ++ for (i = 0; i < num_indices_seen - 1; i++) { ++ if (indices_used[i] == NFS4_PNFS_MAX_MULTI_CNT) { ++ used_mds = true; ++ clone_list[i] = data; ++ } else { ++ clone_list[i] = filelayout_clone_write_data(data); ++ if (!clone_list[i]) ++ goto mem_error; ++ } ++ } ++ if (used_mds) { ++ clone_list[i] = filelayout_clone_write_data(data); ++ if (!clone_list[i]) ++ goto mem_error; ++ } else ++ clone_list[i] = data; ++ /* ++ * Now send off the RPCs to each ds. Note that it is important ++ * that any RPC to the MDS be sent last (or at least after all ++ * clones have been made.) 
++ */ ++ for (i = 0; i < num_indices_seen; i++) { ++ dsdata = clone_list[i]; ++ idx = indices_used[i]; ++ list_cut_position(&dsdata->pages, &head, ds_page_list[idx]); ++ if (idx == NFS4_PNFS_MAX_MULTI_CNT) { ++ call_ops = data->pdata.call_ops;; ++ clnt = NFS_CLIENT(dsdata->inode); ++ ds = NULL; ++ } else { ++ struct nfs_fh *fh; ++ ++ call_ops = &filelayout_commit_call_ops; ++ req = nfs_list_entry(dsdata->pages.next); ++ ds = nfs4_fl_prepare_ds(req->wb_lseg, idx); ++ if (!ds) { ++ /* Trigger retry of this chunk through MDS */ ++ dsdata->task.tk_status = -EIO; ++ data->pdata.call_ops->rpc_release(dsdata); ++ continue; ++ } ++ clnt = ds->ds_clp->cl_rpcclient; ++ dsdata->fldata.ds_nfs_client = ds->ds_clp; ++ file_offset = (loff_t)req->wb_index << PAGE_CACHE_SHIFT; ++ fh = nfs4_fl_select_ds_fh(req->wb_lseg, file_offset); ++ if (fh) ++ dsdata->args.fh = fh; ++ } ++ dprintk("%s: Initiating commit: %llu USE DS:\n", ++ __func__, file_offset); ++ ifdebug(FACILITY) ++ print_ds(ds); ++ ++ /* Send COMMIT to data server */ ++ nfs_initiate_commit(dsdata, clnt, call_ops, sync); ++ } ++ kfree(clone_list); ++ kfree(ds_page_list); ++ data->pdata.pnfs_error = 0; ++ return PNFS_ATTEMPTED; ++ ++ mem_error: ++ if (clone_list) { ++ for (i = 0; i < num_indices_seen - 1; i++) { ++ if (!clone_list[i]) ++ break; ++ data->pdata.call_ops->rpc_release(clone_list[i]); ++ } ++ kfree(clone_list); ++ } ++ kfree(ds_page_list); ++ /* One of these will be empty, but doesn't hurt to do both */ ++ nfs_mark_list_commit(&head); ++ nfs_mark_list_commit(&data->pages); ++ data->pdata.call_ops->rpc_release(data); ++ return PNFS_ATTEMPTED; ++} ++ ++/* ++ * filelayout_pg_test(). 
Called by nfs_can_coalesce_requests() ++ * ++ * return 1 : coalesce page ++ * return 0 : don't coalesce page ++ * ++ * By the time this is called, we know req->wb_lseg == prev->wb_lseg ++ */ ++int ++filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, ++ struct nfs_page *req) ++{ ++ u64 p_stripe, r_stripe; ++ u32 stripe_unit; ++ ++ if (!req->wb_lseg) ++ return 1; ++ p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT; ++ r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT; ++ stripe_unit = FILELAYOUT_LSEG(req->wb_lseg)->stripe_unit; ++ ++ do_div(p_stripe, stripe_unit); ++ do_div(r_stripe, stripe_unit); ++ ++ return (p_stripe == r_stripe); ++} ++ + static struct pnfs_layoutdriver_type filelayout_type = { + .id = LAYOUT_NFSV4_1_FILES, + .name = "LAYOUT_NFSV4_1_FILES", + .owner = THIS_MODULE, ++ .flags = PNFS_USE_RPC_CODE, + .set_layoutdriver = filelayout_set_layoutdriver, + .clear_layoutdriver = filelayout_clear_layoutdriver, + .alloc_lseg = filelayout_alloc_lseg, + .free_lseg = filelayout_free_lseg, ++ .pg_test = filelayout_pg_test, ++ .read_pagelist = filelayout_read_pagelist, ++ .write_pagelist = filelayout_write_pagelist, ++ .commit = filelayout_commit, + }; + + static int __init nfs4filelayout_init(void) +diff -up linux-2.6.37.noarch/fs/nfs/nfs4filelayoutdev.c.orig linux-2.6.37.noarch/fs/nfs/nfs4filelayoutdev.c +--- linux-2.6.37.noarch/fs/nfs/nfs4filelayoutdev.c.orig 2011-01-04 19:50:19.000000000 -0500 ++++ linux-2.6.37.noarch/fs/nfs/nfs4filelayoutdev.c 2011-01-28 09:43:53.321774623 -0500 +@@ -104,6 +104,109 @@ _data_server_lookup_locked(u32 ip_addr, + return NULL; + } + ++/* Create an rpc to the data server defined in 'dev_list' */ ++static int ++nfs4_pnfs_ds_create(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) ++{ ++ struct nfs_server *tmp; ++ struct sockaddr_in sin; ++ struct rpc_clnt *mds_clnt = mds_srv->client; ++ struct nfs_client *clp = mds_srv->nfs_client; ++ struct sockaddr *mds_addr; ++ int err = 0; ++ ++ dprintk("--> %s 
ip:port %x:%hu au_flavor %d\n", __func__, ++ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), ++ mds_clnt->cl_auth->au_flavor); ++ ++ sin.sin_family = AF_INET; ++ sin.sin_addr.s_addr = ds->ds_ip_addr; ++ sin.sin_port = ds->ds_port; ++ ++ /* ++ * If this DS is also the MDS, use the MDS session only if the ++ * MDS exchangeid flags show the EXCHGID4_FLAG_USE_PNFS_DS pNFS role. ++ */ ++ mds_addr = (struct sockaddr *)&clp->cl_addr; ++ if (nfs_sockaddr_cmp((struct sockaddr *)&sin, mds_addr)) { ++ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS)) { ++ printk(KERN_INFO ++ "ip:port %x:%hu is not a pNFS Data Server\n", ++ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); ++ err = -ENODEV; ++ } else { ++ atomic_inc(&clp->cl_count); ++ ds->ds_clp = clp; ++ dprintk("%s Using MDS Session for DS\n", __func__); ++ } ++ goto out; ++ } ++ ++ /* Temporary server for nfs4_set_client */ ++ tmp = kzalloc(sizeof(struct nfs_server), GFP_KERNEL); ++ if (!tmp) ++ goto out; ++ ++ /* ++ * Set a retrans, timeout interval, and authflavor equal to the MDS ++ * values. Use the MDS nfs_client cl_ipaddr field so as to use the ++ * same co_ownerid as the MDS.
++ */ ++ err = nfs4_set_client(tmp, ++ mds_srv->nfs_client->cl_hostname, ++ (struct sockaddr *)&sin, ++ sizeof(struct sockaddr), ++ mds_srv->nfs_client->cl_ipaddr, ++ mds_clnt->cl_auth->au_flavor, ++ IPPROTO_TCP, ++ mds_clnt->cl_xprt->timeout, ++ 1 /* minorversion */); ++ if (err < 0) ++ goto out_free; ++ ++ clp = tmp->nfs_client; ++ ++ /* Ask for only the EXCHGID4_FLAG_USE_PNFS_DS pNFS role */ ++ dprintk("%s EXCHANGE_ID for clp %p\n", __func__, clp); ++ clp->cl_exchange_flags = EXCHGID4_FLAG_USE_PNFS_DS; ++ ++ err = nfs4_recover_expired_lease(clp); ++ if (!err) ++ err = nfs4_check_client_ready(clp); ++ if (err) ++ goto out_put; ++ ++ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS)) { ++ printk(KERN_INFO "ip:port %x:%hu is not a pNFS Data Server\n", ++ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); ++ err = -ENODEV; ++ goto out_put; ++ } ++ /* ++ * Set DS lease equal to the MDS lease, renewal is scheduled in ++ * create_session ++ */ ++ spin_lock(&mds_srv->nfs_client->cl_lock); ++ clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time; ++ spin_unlock(&mds_srv->nfs_client->cl_lock); ++ clp->cl_last_renewal = jiffies; ++ ++ clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); ++ ds->ds_clp = clp; ++ ++ dprintk("%s: ip=%x, port=%hu, rpcclient %p\n", __func__, ++ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), ++ clp->cl_rpcclient); ++out_free: ++ kfree(tmp); ++out: ++ dprintk("%s Returns %d\n", __func__, err); ++ return err; ++out_put: ++ nfs_put_client(clp); ++ goto out_free; ++} ++ + static void + destroy_ds(struct nfs4_pnfs_ds *ds) + { +@@ -446,3 +549,72 @@ nfs4_fl_find_get_deviceid(struct nfs_cli + return (d == NULL) ? 
NULL : + container_of(d, struct nfs4_file_layout_dsaddr, deviceid); + } ++ ++/* ++ * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit ++ * Then: ((res + fsi) % dsaddr->stripe_count) ++ */ ++static u32 ++_nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); ++ u64 tmp; ++ ++ tmp = offset - flseg->pattern_offset; ++ do_div(tmp, flseg->stripe_unit); ++ tmp += flseg->first_stripe_index; ++ return do_div(tmp, flseg->dsaddr->stripe_count); ++} ++ ++u32 ++nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ u32 j; ++ ++ j = _nfs4_fl_calc_j_index(lseg, offset); ++ return FILELAYOUT_LSEG(lseg)->dsaddr->stripe_indices[j]; ++} ++ ++struct nfs_fh * ++nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); ++ u32 i; ++ ++ if (flseg->stripe_type == STRIPE_SPARSE) { ++ if (flseg->num_fh == 1) ++ i = 0; ++ else if (flseg->num_fh == 0) ++ return NULL; ++ else ++ i = nfs4_fl_calc_ds_index(lseg, offset); ++ } else ++ i = _nfs4_fl_calc_j_index(lseg, offset); ++ return flseg->fh_array[i]; ++} ++ ++struct nfs4_pnfs_ds * ++nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) ++{ ++ struct nfs4_file_layout_dsaddr *dsaddr; ++ ++ dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr; ++ if (dsaddr->ds_list[ds_idx] == NULL) { ++ printk(KERN_ERR "%s: No data server for device id!\n", ++ __func__); ++ return NULL; ++ } ++ ++ if (!dsaddr->ds_list[ds_idx]->ds_clp) { ++ int err; ++ ++ err = nfs4_pnfs_ds_create(NFS_SERVER(lseg->layout->inode), ++ dsaddr->ds_list[ds_idx]); ++ if (err) { ++ printk(KERN_ERR "%s nfs4_pnfs_ds_create error %d\n", ++ __func__, err); ++ return NULL; ++ } ++ } ++ return dsaddr->ds_list[ds_idx]; ++} +diff -up linux-2.6.37.noarch/fs/nfs/nfs4filelayout.h.orig linux-2.6.37.noarch/fs/nfs/nfs4filelayout.h +--- linux-2.6.37.noarch/fs/nfs/nfs4filelayout.h.orig 2011-01-04 
19:50:19.000000000 -0500 ++++ linux-2.6.37.noarch/fs/nfs/nfs4filelayout.h 2011-01-28 09:43:53.321774623 -0500 +@@ -83,9 +83,15 @@ FILELAYOUT_LSEG(struct pnfs_layout_segme + generic_hdr); + } + ++extern struct nfs_fh * ++nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, loff_t offset); ++ + extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *); + extern void print_ds(struct nfs4_pnfs_ds *ds); + extern void print_deviceid(struct nfs4_deviceid *dev_id); ++u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, loff_t offset); ++struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, ++ u32 ds_idx); + extern struct nfs4_file_layout_dsaddr * + nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id); + struct nfs4_file_layout_dsaddr * +diff -up linux-2.6.37.noarch/fs/nfs/nfs4_fs.h.orig linux-2.6.37.noarch/fs/nfs/nfs4_fs.h +--- linux-2.6.37.noarch/fs/nfs/nfs4_fs.h.orig 2011-01-28 09:37:32.536980156 -0500 ++++ linux-2.6.37.noarch/fs/nfs/nfs4_fs.h 2011-01-28 09:43:53.319774971 -0500 +@@ -44,9 +44,9 @@ enum nfs4_client_state { + NFS4CLNT_RECLAIM_REBOOT, + NFS4CLNT_RECLAIM_NOGRACE, + NFS4CLNT_DELEGRETURN, +- NFS4CLNT_LAYOUTRECALL, + NFS4CLNT_SESSION_RESET, + NFS4CLNT_RECALL_SLOT, ++ NFS4CLNT_LAYOUT_RECALL, + }; + + enum nfs4_session_state { +@@ -236,7 +236,7 @@ extern int nfs4_proc_async_renew(struct + extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); + extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); + extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); +-extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc); ++extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait); + extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); + extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, + struct nfs4_fs_locations 
*fs_locations, struct page *page); +@@ -250,10 +250,12 @@ static inline struct nfs4_session *nfs4_ + } + + extern int nfs4_setup_sequence(const struct nfs_server *server, ++ struct nfs4_session *ds_session, + struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, + int cache_reply, struct rpc_task *task); + extern void nfs4_destroy_session(struct nfs4_session *session); + extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); ++extern int nfs4_proc_exchange_id(struct nfs_client *, struct rpc_cred *); + extern int nfs4_proc_create_session(struct nfs_client *); + extern int nfs4_proc_destroy_session(struct nfs4_session *); + extern int nfs4_init_session(struct nfs_server *server); +@@ -266,6 +268,7 @@ static inline struct nfs4_session *nfs4_ + } + + static inline int nfs4_setup_sequence(const struct nfs_server *server, ++ struct nfs4_session *ds_session, + struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, + int cache_reply, struct rpc_task *task) + { +@@ -283,7 +286,7 @@ extern const struct nfs4_minor_version_o + extern const u32 nfs4_fattr_bitmap[2]; + extern const u32 nfs4_statfs_bitmap[2]; + extern const u32 nfs4_pathconf_bitmap[2]; +-extern const u32 nfs4_fsinfo_bitmap[2]; ++extern const u32 nfs4_fsinfo_bitmap[3]; + extern const u32 nfs4_fs_locations_bitmap[2]; + + /* nfs4renewd.c */ +@@ -293,13 +296,24 @@ extern void nfs4_kill_renewd(struct nfs_ + extern void nfs4_renew_state(struct work_struct *); + + /* nfs4state.c */ ++struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp); + struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp); + struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp); + #if defined(CONFIG_NFS_V4_1) +-struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp); + struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp); + #endif /* CONFIG_NFS_V4_1 */ + ++static inline struct rpc_cred * ++nfs4_get_machine_cred(struct nfs_client *clp) ++{ ++ 
struct rpc_cred *cred; ++ ++ spin_lock(&clp->cl_lock); ++ cred = nfs4_get_machine_cred_locked(clp); ++ spin_unlock(&clp->cl_lock); ++ return cred; ++} ++ + extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); + extern void nfs4_put_state_owner(struct nfs4_state_owner *); + extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *); +diff -up linux-2.6.37.noarch/fs/nfs/nfs4proc.c.orig linux-2.6.37.noarch/fs/nfs/nfs4proc.c +--- linux-2.6.37.noarch/fs/nfs/nfs4proc.c.orig 2011-01-28 09:37:32.539980051 -0500 ++++ linux-2.6.37.noarch/fs/nfs/nfs4proc.c 2011-01-28 09:43:53.324774117 -0500 +@@ -69,7 +69,7 @@ struct nfs4_opendata; + static int _nfs4_proc_open(struct nfs4_opendata *data); + static int _nfs4_recover_proc_open(struct nfs4_opendata *data); + static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); +-static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); ++static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, struct nfs_client *); + static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); + static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); + static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, +@@ -127,12 +127,13 @@ const u32 nfs4_pathconf_bitmap[2] = { + 0 + }; + +-const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE ++const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE + | FATTR4_WORD0_MAXREAD + | FATTR4_WORD0_MAXWRITE + | FATTR4_WORD0_LEASE_TIME, + FATTR4_WORD1_TIME_DELTA +- | FATTR4_WORD1_FS_LAYOUT_TYPES ++ | FATTR4_WORD1_FS_LAYOUT_TYPES, ++ FATTR4_WORD2_LAYOUT_BLKSIZE + }; + + const u32 nfs4_fs_locations_bitmap[2] = { +@@ -572,6 +573,7 @@ static int nfs41_setup_sequence(struct n + } + + int 
nfs4_setup_sequence(const struct nfs_server *server, ++ struct nfs4_session *ds_session, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply, +@@ -580,6 +582,8 @@ int nfs4_setup_sequence(const struct nfs + struct nfs4_session *session = nfs4_get_session(server); + int ret = 0; + ++ if (ds_session) ++ session = ds_session; + if (session == NULL) { + args->sa_session = NULL; + res->sr_session = NULL; +@@ -610,7 +614,7 @@ static void nfs41_call_sync_prepare(stru + + dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server); + +- if (nfs4_setup_sequence(data->seq_server, data->seq_args, ++ if (nfs4_setup_sequence(data->seq_server, NULL, data->seq_args, + data->seq_res, data->cache_reply, task)) + return; + rpc_call_start(task); +@@ -1398,7 +1402,7 @@ static void nfs4_open_prepare(struct rpc + nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); + } + data->timestamp = jiffies; +- if (nfs4_setup_sequence(data->o_arg.server, ++ if (nfs4_setup_sequence(data->o_arg.server, NULL, + &data->o_arg.seq_args, + &data->o_res.seq_res, 1, task)) + return; +@@ -1573,9 +1577,8 @@ static int _nfs4_proc_open(struct nfs4_o + return 0; + } + +-static int nfs4_recover_expired_lease(struct nfs_server *server) ++int nfs4_recover_expired_lease(struct nfs_client *clp) + { +- struct nfs_client *clp = server->nfs_client; + unsigned int loop; + int ret; + +@@ -1591,6 +1594,7 @@ static int nfs4_recover_expired_lease(st + } + return ret; + } ++EXPORT_SYMBOL(nfs4_recover_expired_lease); + + /* + * OPEN_EXPIRED: +@@ -1679,7 +1683,7 @@ static int _nfs4_do_open(struct inode *d + dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n"); + goto out_err; + } +- status = nfs4_recover_expired_lease(server); ++ status = nfs4_recover_expired_lease(server->nfs_client); + if (status != 0) + goto err_put_state_owner; + if (path->dentry->d_inode != NULL) +@@ -1839,8 +1843,6 @@ struct nfs4_closedata { + struct nfs_closeres res; + struct nfs_fattr fattr; + unsigned long 
timestamp; +- bool roc; +- u32 roc_barrier; + }; + + static void nfs4_free_closedata(void *data) +@@ -1848,8 +1850,6 @@ static void nfs4_free_closedata(void *da + struct nfs4_closedata *calldata = data; + struct nfs4_state_owner *sp = calldata->state->owner; + +- if (calldata->roc) +- pnfs_roc_release(calldata->state->inode); + nfs4_put_open_state(calldata->state); + nfs_free_seqid(calldata->arg.seqid); + nfs4_put_state_owner(sp); +@@ -1882,9 +1882,6 @@ static void nfs4_close_done(struct rpc_t + */ + switch (task->tk_status) { + case 0: +- if (calldata->roc) +- pnfs_roc_set_barrier(state->inode, +- calldata->roc_barrier); + nfs_set_open_stateid(state, &calldata->res.stateid, 0); + renew_lease(server, calldata->timestamp); + nfs4_close_clear_stateid_flags(state, +@@ -1897,7 +1894,7 @@ static void nfs4_close_done(struct rpc_t + if (calldata->arg.fmode == 0) + break; + default: +- if (nfs4_async_handle_error(task, server, state) == -EAGAIN) ++ if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) + rpc_restart_call_prepare(task); + } + nfs_release_seqid(calldata->arg.seqid); +@@ -1937,19 +1934,12 @@ static void nfs4_close_prepare(struct rp + return; + } + +- if (calldata->arg.fmode == 0) { ++ if (calldata->arg.fmode == 0) + task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; +- if (calldata->roc && +- pnfs_roc_drain(calldata->inode, &calldata->roc_barrier)) { +- rpc_sleep_on(&NFS_SERVER(calldata->inode)->roc_rpcwaitq, +- task, NULL); +- return; +- } +- } + + nfs_fattr_init(calldata->res.fattr); + calldata->timestamp = jiffies; +- if (nfs4_setup_sequence(NFS_SERVER(calldata->inode), ++ if (nfs4_setup_sequence(NFS_SERVER(calldata->inode), NULL, + &calldata->arg.seq_args, &calldata->res.seq_res, + 1, task)) + return; +@@ -1973,7 +1963,7 @@ static const struct rpc_call_ops nfs4_cl + * + * NOTE: Caller must be holding the sp->so_owner semaphore! 
+ */ +-int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc) ++int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait) + { + struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs4_closedata *calldata; +@@ -2008,7 +1998,6 @@ int nfs4_do_close(struct path *path, str + calldata->res.fattr = &calldata->fattr; + calldata->res.seqid = calldata->arg.seqid; + calldata->res.server = server; +- calldata->roc = roc; + path_get(path); + calldata->path = *path; + +@@ -2026,8 +2015,6 @@ int nfs4_do_close(struct path *path, str + out_free_calldata: + kfree(calldata); + out: +- if (roc) +- pnfs_roc_release(state->inode); + nfs4_put_open_state(state); + nfs4_put_state_owner(sp); + return status; +@@ -2269,6 +2256,9 @@ nfs4_proc_setattr(struct dentry *dentry, + struct nfs4_state *state = NULL; + int status; + ++ if (pnfs_ld_layoutret_on_setattr(inode)) ++ pnfs_return_layout(inode, NULL, true); ++ + nfs_fattr_init(fattr); + + /* Search for an existing open(O_WRITE) file */ +@@ -2596,7 +2586,7 @@ static int nfs4_proc_unlink_done(struct + + if (!nfs4_sequence_done(task, &res->seq_res)) + return 0; +- if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) ++ if (nfs4_async_handle_error(task, res->server, NULL, NULL) == -EAGAIN) + return 0; + update_changeattr(dir, &res->cinfo); + nfs_post_op_update_inode(dir, res->dir_attr); +@@ -2621,7 +2611,7 @@ static int nfs4_proc_rename_done(struct + + if (!nfs4_sequence_done(task, &res->seq_res)) + return 0; +- if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) ++ if (nfs4_async_handle_error(task, res->server, NULL, NULL) == -EAGAIN) + return 0; + + update_changeattr(old_dir, &res->old_cinfo); +@@ -3072,19 +3062,31 @@ static int nfs4_proc_pathconf(struct nfs + static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) + { + struct nfs_server *server = NFS_SERVER(data->inode); ++ struct nfs_client *client = 
server->nfs_client; + + dprintk("--> %s\n", __func__); + ++#ifdef CONFIG_NFS_V4_1 ++ if (data->pdata.pnfsflags & PNFS_NO_RPC) ++ return 0; ++ ++ /* Is this a DS session */ ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS read\n", __func__); ++ client = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ ++ + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return -EAGAIN; + +- if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { +- nfs_restart_rpc(task, server->nfs_client); ++ if (nfs4_async_handle_error(task, server, data->args.context->state, client) == -EAGAIN) { ++ nfs_restart_rpc(task, client); + return -EAGAIN; + } + + nfs_invalidate_atime(data->inode); +- if (task->tk_status > 0) ++ if (task->tk_status > 0 && client == server->nfs_client) + renew_lease(server, data->timestamp); + return 0; + } +@@ -3095,20 +3097,56 @@ static void nfs4_proc_read_setup(struct + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; + } + ++static void pnfs4_update_write_done(struct nfs_inode *nfsi, struct nfs_write_data *data) ++{ ++#ifdef CONFIG_NFS_V4_1 ++ pnfs_update_last_write(nfsi, data->args.offset, data->res.count); ++ pnfs_need_layoutcommit(nfsi, data->args.context); ++#endif /* CONFIG_NFS_V4_1 */ ++} ++ + static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) + { + struct inode *inode = data->inode; +- ++ struct nfs_server *server = NFS_SERVER(inode); ++ struct nfs_client *client = server->nfs_client; ++ + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return -EAGAIN; + +- if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { +- nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); ++#ifdef CONFIG_NFS_V4_1 ++ /* restore original count after retry? 
*/ ++ if (data->pdata.orig_count) { ++ dprintk("%s: restoring original count %u\n", __func__, ++ data->pdata.orig_count); ++ data->args.count = data->pdata.orig_count; ++ } ++ ++ if (data->pdata.pnfsflags & PNFS_NO_RPC) ++ return 0; ++ ++ /* Is this a DS session */ ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS write\n", __func__); ++ client = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ ++ ++ if (nfs4_async_handle_error(task, server, data->args.context->state, client) == -EAGAIN) { ++ nfs_restart_rpc(task, client); + return -EAGAIN; + } ++ ++ /* ++ * MDS write: renew lease ++ * DS write: update lastbyte written, mark for layout commit ++ */ + if (task->tk_status >= 0) { +- renew_lease(NFS_SERVER(inode), data->timestamp); +- nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); ++ if (client == server->nfs_client) { ++ renew_lease(server, data->timestamp); ++ nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); ++ } else ++ pnfs4_update_write_done(NFS_I(inode), data); + } + return 0; + } +@@ -3121,21 +3159,42 @@ static void nfs4_proc_write_setup(struct + data->res.server = server; + data->timestamp = jiffies; + ++#ifdef CONFIG_NFS_V4_1 ++ /* writes to DS use pnfs vector */ ++ if (data->fldata.ds_nfs_client) { ++ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_WRITE]; ++ return; ++ } ++#endif /* CONFIG_NFS_V4_1 */ + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE]; + } + + static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data) + { + struct inode *inode = data->inode; +- ++ struct nfs_server *server = NFS_SERVER(data->inode); ++ struct nfs_client *client = server->nfs_client; ++ ++#ifdef CONFIG_NFS_V4_1 ++ if (data->pdata.pnfsflags & PNFS_NO_RPC) ++ return 0; ++ ++ /* Is this a DS session */ ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS commit\n", __func__); ++ client = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ ++ + if (!nfs4_sequence_done(task, 
&data->res.seq_res)) + return -EAGAIN; + +- if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { ++ if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL, NULL) == -EAGAIN) { + nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); + return -EAGAIN; + } +- nfs_refresh_inode(inode, data->res.fattr); ++ if (client == server->nfs_client) ++ nfs_refresh_inode(inode, data->res.fattr); + return 0; + } + +@@ -3145,6 +3204,12 @@ static void nfs4_proc_commit_setup(struc + + data->args.bitmask = server->cache_consistency_bitmask; + data->res.server = server; ++#if defined(CONFIG_NFS_V4_1) ++ if (data->fldata.ds_nfs_client) { ++ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_COMMIT]; ++ return; ++ } ++#endif /* CONFIG_NFS_V4_1 */ + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; + } + +@@ -3451,9 +3516,10 @@ static int nfs4_proc_set_acl(struct inod + } + + static int +-nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state) ++nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state, struct nfs_client *clp) + { +- struct nfs_client *clp = server->nfs_client; ++ if (!clp) ++ clp = server->nfs_client; + + if (task->tk_status >= 0) + return 0; +@@ -3477,14 +3543,16 @@ nfs4_async_handle_error(struct rpc_task + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_SEQ_FALSE_RETRY: + case -NFS4ERR_SEQ_MISORDERED: +- dprintk("%s ERROR %d, Reset session\n", __func__, +- task->tk_status); ++ dprintk("%s ERROR %d, Reset session. 
Exchangeid " ++ "flags 0x%x\n", __func__, task->tk_status, ++ clp->cl_exchange_flags); + nfs4_schedule_state_recovery(clp); + task->tk_status = 0; + return -EAGAIN; + #endif /* CONFIG_NFS_V4_1 */ + case -NFS4ERR_DELAY: +- nfs_inc_server_stats(server, NFSIOS_DELAY); ++ if (server) ++ nfs_inc_server_stats(server, NFSIOS_DELAY); + case -NFS4ERR_GRACE: + case -EKEYEXPIRED: + rpc_delay(task, NFS4_POLL_RETRY_MAX); +@@ -3497,6 +3565,8 @@ nfs4_async_handle_error(struct rpc_task + task->tk_status = nfs4_map_errors(task->tk_status); + return 0; + do_state_recovery: ++ if (is_ds_only_client(clp)) ++ return 0; + rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); + nfs4_schedule_state_recovery(clp); + if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) +@@ -3630,8 +3700,8 @@ static void nfs4_delegreturn_done(struct + renew_lease(data->res.server, data->timestamp); + break; + default: +- if (nfs4_async_handle_error(task, data->res.server, NULL) == +- -EAGAIN) { ++ if (nfs4_async_handle_error(task, data->res.server, NULL, NULL) ++ == -EAGAIN) { + nfs_restart_rpc(task, data->res.server->nfs_client); + return; + } +@@ -3651,7 +3721,7 @@ static void nfs4_delegreturn_prepare(str + + d_data = (struct nfs4_delegreturndata *)data; + +- if (nfs4_setup_sequence(d_data->res.server, ++ if (nfs4_setup_sequence(d_data->res.server, NULL, + &d_data->args.seq_args, + &d_data->res.seq_res, 1, task)) + return; +@@ -3885,7 +3955,7 @@ static void nfs4_locku_done(struct rpc_t + case -NFS4ERR_EXPIRED: + break; + default: +- if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) ++ if (nfs4_async_handle_error(task, calldata->server, NULL, NULL) == -EAGAIN) + nfs_restart_rpc(task, + calldata->server->nfs_client); + } +@@ -3903,7 +3973,7 @@ static void nfs4_locku_prepare(struct rp + return; + } + calldata->timestamp = jiffies; +- if (nfs4_setup_sequence(calldata->server, ++ if (nfs4_setup_sequence(calldata->server, NULL, + &calldata->arg.seq_args, + &calldata->res.seq_res, 1, task)) + 
return; +@@ -4058,7 +4128,7 @@ static void nfs4_lock_prepare(struct rpc + } else + data->arg.new_lock_owner = 0; + data->timestamp = jiffies; +- if (nfs4_setup_sequence(data->server, ++ if (nfs4_setup_sequence(data->server, NULL, + &data->arg.seq_args, + &data->res.seq_res, 1, task)) + return; +@@ -5077,7 +5147,7 @@ int nfs4_init_session(struct nfs_server + session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead; + session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead; + +- ret = nfs4_recover_expired_lease(server); ++ ret = nfs4_recover_expired_lease(server->nfs_client); + if (!ret) + ret = nfs4_check_client_ready(clp); + return ret; +@@ -5330,24 +5400,53 @@ static void + nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) + { + struct nfs4_layoutget *lgp = calldata; +- struct nfs_server *server = NFS_SERVER(lgp->args.inode); ++ struct inode *ino = lgp->args.inode; ++ struct nfs_inode *nfsi = NFS_I(ino); ++ struct nfs_server *server = NFS_SERVER(ino); ++ struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; + + dprintk("--> %s\n", __func__); ++ spin_lock(&clp->cl_lock); ++ if (matches_outstanding_recall(ino, &lgp->args.range)) { ++ rpc_sleep_on(&clp->cl_rpcwaitq_recall, task, NULL); ++ spin_unlock(&clp->cl_lock); ++ return; ++ } ++ spin_unlock(&clp->cl_lock); + /* Note the is a race here, where a CB_LAYOUTRECALL can come in + * right now covering the LAYOUTGET we are about to send. + * However, that is not so catastrophic, and there seems + * to be no way to prevent it completely. + */ +- if (nfs4_setup_sequence(server, &lgp->args.seq_args, +- &lgp->res.seq_res, 0, task)) ++ spin_lock(&ino->i_lock); ++ if (pnfs_layoutgets_blocked(nfsi->layout, NULL)) { ++ rpc_sleep_on(&nfsi->lo_rpcwaitq_stateid, task, NULL); ++ spin_unlock(&ino->i_lock); + return; ++ } ++ /* This needs after above check but atomic with it in order to properly ++ * serialize openstateid LAYOUTGETs. 
++ */ ++ atomic_inc(&nfsi->layout->plh_outstanding); + if (pnfs_choose_layoutget_stateid(&lgp->args.stateid, + NFS_I(lgp->args.inode)->layout, + lgp->args.ctx->state)) { + rpc_exit(task, NFS4_OK); +- return; ++ goto err_out_locked; ++ } ++ spin_unlock(&ino->i_lock); ++ ++ if (nfs4_setup_sequence(server, NULL, &lgp->args.seq_args, ++ &lgp->res.seq_res, 0, task)) { ++ goto err_out; + } + rpc_call_start(task); ++ return; ++err_out: ++ spin_lock(&ino->i_lock); ++err_out_locked: ++ atomic_dec(&nfsi->layout->plh_outstanding); ++ spin_unlock(&ino->i_lock); + } + + static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) +@@ -5357,9 +5456,14 @@ static void nfs4_layoutget_done(struct r + + dprintk("--> %s\n", __func__); + +- if (!nfs4_sequence_done(task, &lgp->res.seq_res)) ++ if (!nfs4_sequence_done(task, &lgp->res.seq_res)) { ++ /* layout code relies on fact that in this case ++ * code falls back to tk_action=call_start, but not ++ * back to rpc_prepare_task, to keep plh_outstanding ++ * correct. 
++ */ + return; +- ++ } + switch (task->tk_status) { + case 0: + break; +@@ -5368,7 +5472,12 @@ static void nfs4_layoutget_done(struct r + task->tk_status = -NFS4ERR_DELAY; + /* Fall through */ + default: +- if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { ++ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) { ++ struct inode *ino = lgp->args.inode; ++ ++ spin_lock(&ino->i_lock); ++ atomic_dec(&NFS_I(ino)->layout->plh_outstanding); ++ spin_unlock(&ino->i_lock); + rpc_restart_call_prepare(task); + return; + } +@@ -5381,6 +5490,7 @@ static void nfs4_layoutget_release(void + struct nfs4_layoutget *lgp = calldata; + + dprintk("--> %s\n", __func__); ++ put_layout_hdr(NFS_I(lgp->args.inode)->layout); + if (lgp->res.layout.buf != NULL) + free_page((unsigned long) lgp->res.layout.buf); + put_nfs_open_context(lgp->args.ctx); +@@ -5429,11 +5539,279 @@ int nfs4_proc_layoutget(struct nfs4_layo + status = task->tk_status; + if (status == 0) + status = pnfs_layout_process(lgp); ++ else { ++ struct inode *ino = lgp->args.inode; ++ struct pnfs_layout_hdr *lo = NFS_I(ino)->layout; ++ ++ spin_lock(&ino->i_lock); ++ atomic_dec(&lo->plh_outstanding); ++ if (!pnfs_layoutgets_blocked(lo, NULL)) ++ rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq_stateid); ++ spin_unlock(&ino->i_lock); ++ } ++ rpc_put_task(task); ++ dprintk("<-- %s status=%d\n", __func__, status); ++ return status; ++} ++ ++static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *data) ++{ ++ struct nfs4_layoutcommit_data *ldata = ++ (struct nfs4_layoutcommit_data *)data; ++ struct nfs_server *server = NFS_SERVER(ldata->args.inode); ++ ++ if (nfs4_setup_sequence(server, NULL, &ldata->args.seq_args, ++ &ldata->res.seq_res, 1, task)) ++ return; ++ ldata->res.status = -1; ++ rpc_call_start(task); ++} ++ ++static void ++nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutcommit_data *data = ++ (struct nfs4_layoutcommit_data *)calldata; ++ struct nfs_server *server 
= NFS_SERVER(data->args.inode); ++ ++ if (!nfs4_sequence_done(task, &data->res.seq_res)) ++ return; ++ ++ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) ++ nfs_restart_rpc(task, server->nfs_client); ++} ++ ++static void nfs4_layoutcommit_release(void *lcdata) ++{ ++ struct nfs4_layoutcommit_data *data = ++ (struct nfs4_layoutcommit_data *)lcdata; ++ ++ pnfs_cleanup_layoutcommit(data->args.inode, data); ++ /* Matched by get_layout in pnfs_layoutcommit_inode */ ++ put_layout_hdr(NFS_I(data->args.inode)->layout); ++ put_rpccred(data->cred); ++ kfree(lcdata); ++} ++ ++static const struct rpc_call_ops nfs4_layoutcommit_ops = { ++ .rpc_call_prepare = nfs4_layoutcommit_prepare, ++ .rpc_call_done = nfs4_layoutcommit_done, ++ .rpc_release = nfs4_layoutcommit_release, ++}; ++ ++/* Execute a layoutcommit to the server */ ++int ++nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, int issync) ++{ ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTCOMMIT], ++ .rpc_argp = &data->args, ++ .rpc_resp = &data->res, ++ .rpc_cred = data->cred, ++ }; ++ struct rpc_task_setup task_setup_data = { ++ .task = &data->task, ++ .rpc_client = NFS_CLIENT(data->args.inode), ++ .rpc_message = &msg, ++ .callback_ops = &nfs4_layoutcommit_ops, ++ .callback_data = data, ++ .flags = RPC_TASK_ASYNC, ++ }; ++ struct rpc_task *task; ++ int status = 0; ++ ++ dprintk("NFS: %4d initiating layoutcommit call. 
%llu@%llu lbw: %llu " ++ "type: %d issync %d\n", ++ data->task.tk_pid, ++ data->args.range.length, ++ data->args.range.offset, ++ data->args.lastbytewritten, ++ data->args.layout_type, issync); ++ ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ if (!issync) ++ goto out; ++ status = nfs4_wait_for_completion_rpc_task(task); ++ if (status != 0) ++ goto out; ++ status = task->tk_status; ++out: ++ dprintk("%s: status %d\n", __func__, status); ++ rpc_put_task(task); ++ return status; ++} ++ ++static void ++nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutreturn *lrp = calldata; ++ ++ dprintk("--> %s\n", __func__); ++ if (lrp->args.return_type == RETURN_FILE) { ++ struct nfs_inode *nfsi = NFS_I(lrp->args.inode); ++ ++ if (pnfs_return_layout_barrier(nfsi, &lrp->args.range)) { ++ dprintk("%s: waiting on barrier\n", __func__); ++ rpc_sleep_on(&nfsi->lo_rpcwaitq, task, NULL); ++ return; ++ } ++ } ++ if (nfs41_setup_sequence(lrp->clp->cl_session, &lrp->args.seq_args, ++ &lrp->res.seq_res, 0, task)) ++ return; ++ rpc_call_start(task); ++} ++ ++static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutreturn *lrp = calldata; ++ struct nfs_server *server; ++ ++ dprintk("--> %s\n", __func__); ++ ++ if (!nfs4_sequence_done(task, &lrp->res.seq_res)) ++ return; ++ ++ if (lrp->args.return_type == RETURN_FILE) ++ server = NFS_SERVER(lrp->args.inode); ++ else ++ server = NULL; ++ if (nfs4_async_handle_error(task, server, NULL, lrp->clp) == -EAGAIN) { ++ nfs_restart_rpc(task, lrp->clp); ++ return; ++ } ++ if ((task->tk_status == 0) && (lrp->args.return_type == RETURN_FILE)) { ++ struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout; ++ ++ spin_lock(&lo->inode->i_lock); ++ if (lrp->res.lrs_present) ++ pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); ++ else ++ BUG_ON(!list_empty(&lo->segs)); ++ spin_unlock(&lo->inode->i_lock); ++ } ++ dprintk("<-- %s\n", 
__func__); ++} ++ ++static void nfs4_layoutreturn_release(void *calldata) ++{ ++ struct nfs4_layoutreturn *lrp = calldata; ++ ++ dprintk("--> %s return_type %d\n", __func__, lrp->args.return_type); ++ if (lrp->args.return_type == RETURN_FILE) { ++ struct inode *ino = lrp->args.inode; ++ struct pnfs_layout_hdr *lo = NFS_I(ino)->layout; ++ ++ spin_lock(&ino->i_lock); ++ lo->plh_block_lgets--; ++ atomic_dec(&lo->plh_outstanding); ++ if (!pnfs_layoutgets_blocked(lo, NULL)) ++ rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq_stateid); ++ spin_unlock(&ino->i_lock); ++ put_layout_hdr(lo); ++ } ++ kfree(calldata); ++ dprintk("<-- %s\n", __func__); ++} ++ ++static const struct rpc_call_ops nfs4_layoutreturn_call_ops = { ++ .rpc_call_prepare = nfs4_layoutreturn_prepare, ++ .rpc_call_done = nfs4_layoutreturn_done, ++ .rpc_release = nfs4_layoutreturn_release, ++}; ++ ++int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool issync) ++{ ++ struct rpc_task *task; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN], ++ .rpc_argp = &lrp->args, ++ .rpc_resp = &lrp->res, ++ }; ++ struct rpc_task_setup task_setup_data = { ++ .rpc_client = lrp->clp->cl_rpcclient, ++ .rpc_message = &msg, ++ .callback_ops = &nfs4_layoutreturn_call_ops, ++ .callback_data = lrp, ++ .flags = RPC_TASK_ASYNC, ++ }; ++ int status = 0; ++ ++ dprintk("--> %s\n", __func__); ++ if (lrp->args.return_type == RETURN_FILE) { ++ struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout; ++ /* FIXME we should test for BULK here */ ++ spin_lock(&lo->inode->i_lock); ++ BUG_ON(lo->plh_block_lgets == 0); ++ atomic_inc(&lo->plh_outstanding); ++ spin_unlock(&lo->inode->i_lock); ++ } ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ if (!issync) ++ goto out; ++ status = nfs4_wait_for_completion_rpc_task(task); ++ if (status != 0) ++ goto out; ++ status = task->tk_status; ++out: ++ dprintk("<-- %s\n", __func__); + rpc_put_task(task); ++ return status; 
++} ++ ++/* ++ * Retrieve the list of Data Server devices from the MDS. ++ */ ++static int _nfs4_getdevicelist(struct nfs_server *server, ++ const struct nfs_fh *fh, ++ struct pnfs_devicelist *devlist) ++{ ++ struct nfs4_getdevicelist_args args = { ++ .fh = fh, ++ .layoutclass = server->pnfs_curr_ld->id, ++ }; ++ struct nfs4_getdevicelist_res res = { ++ .devlist = devlist, ++ }; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST], ++ .rpc_argp = &args, ++ .rpc_resp = &res, ++ .rpc_cred = nfs4_get_machine_cred(server->nfs_client), ++ }; ++ int status; ++ ++ dprintk("--> %s\n", __func__); ++ status = nfs4_call_sync(server, &msg, &args, &res, 0); ++ put_rpccred(msg.rpc_cred); + dprintk("<-- %s status=%d\n", __func__, status); + return status; + } + ++int nfs4_proc_getdevicelist(struct nfs_server *server, ++ const struct nfs_fh *fh, ++ struct pnfs_devicelist *devlist) ++{ ++ struct nfs4_exception exception = { }; ++ int err; ++ ++ do { ++ err = nfs4_handle_exception(server, ++ _nfs4_getdevicelist(server, fh, devlist), ++ &exception); ++ } while (exception.retry); ++ ++ dprintk("%s: err=%d, num_devs=%u\n", __func__, ++ err, devlist->num_devs); ++ ++ return err; ++} ++EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist); ++ + static int + _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) + { +@@ -5447,11 +5825,13 @@ _nfs4_proc_getdeviceinfo(struct nfs_serv + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO], + .rpc_argp = &args, + .rpc_resp = &res, ++ .rpc_cred = nfs4_get_machine_cred(server->nfs_client), + }; + int status; + + dprintk("--> %s\n", __func__); + status = nfs4_call_sync(server, &msg, &args, &res, 0); ++ put_rpccred(msg.rpc_cred); + dprintk("<-- %s status=%d\n", __func__, status); + + return status; +diff -up linux-2.6.37.noarch/fs/nfs/nfs4renewd.c.orig linux-2.6.37.noarch/fs/nfs/nfs4renewd.c +--- linux-2.6.37.noarch/fs/nfs/nfs4renewd.c.orig 2011-01-28 09:37:32.540980017 -0500 ++++ 
linux-2.6.37.noarch/fs/nfs/nfs4renewd.c 2011-01-28 09:43:53.326773791 -0500 +@@ -65,7 +65,7 @@ nfs4_renew_state(struct work_struct *wor + dprintk("%s: start\n", __func__); + + rcu_read_lock(); +- if (list_empty(&clp->cl_superblocks)) { ++ if (list_empty(&clp->cl_superblocks) && !is_ds_only_client(clp)) { + rcu_read_unlock(); + goto out; + } +diff -up linux-2.6.37.noarch/fs/nfs/nfs4state.c.orig linux-2.6.37.noarch/fs/nfs/nfs4state.c +--- linux-2.6.37.noarch/fs/nfs/nfs4state.c.orig 2011-01-28 09:37:32.542979947 -0500 ++++ linux-2.6.37.noarch/fs/nfs/nfs4state.c 2011-01-28 09:43:53.327773630 -0500 +@@ -153,6 +153,11 @@ static int nfs41_setup_state_renewal(str + int status; + struct nfs_fsinfo fsinfo; + ++ if (is_ds_only_client(clp)) { ++ nfs4_schedule_state_renewal(clp); ++ return 0; ++ } ++ + status = nfs4_proc_get_lease_time(clp, &fsinfo); + if (status == 0) { + /* Update lease time and schedule renewal */ +@@ -224,6 +229,7 @@ static int nfs4_begin_drain_session(stru + int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) + { + int status; ++ u32 req_exchange_flags = clp->cl_exchange_flags; + + nfs4_begin_drain_session(clp); + status = nfs4_proc_exchange_id(clp, cred); +@@ -238,6 +244,16 @@ int nfs41_init_clientid(struct nfs_clien + nfs_callback_down(1); + status = 0; + } ++ if (is_ds_only_session(req_exchange_flags)) { ++ clp->cl_exchange_flags &= ++ ~(EXCHGID4_FLAG_USE_PNFS_MDS | EXCHGID4_FLAG_USE_NON_PNFS); ++ if (!is_ds_only_session(clp->cl_exchange_flags)) { ++ nfs4_destroy_session(clp->cl_session); ++ clp->cl_session = NULL; ++ status = -ENOTSUPP; ++ goto out; ++ } ++ } + nfs41_setup_state_renewal(clp); + nfs_mark_client_ready(clp, NFS_CS_READY); + out: +@@ -669,9 +685,22 @@ static void __nfs4_close(struct path *pa + nfs4_put_open_state(state); + nfs4_put_state_owner(owner); + } else { +- bool roc = pnfs_roc(state->inode); ++ u32 roc_iomode; ++ struct nfs_inode *nfsi = NFS_I(state->inode); ++ ++ /* FIXME: should return the layout only on last 
close */ ++ if (has_layout(nfsi) && ++ (roc_iomode = pnfs_layout_roc_iomode(nfsi)) != 0) { ++ struct pnfs_layout_range range = { ++ .iomode = roc_iomode, ++ .offset = 0, ++ .length = NFS4_MAX_UINT64, ++ }; ++ ++ pnfs_return_layout(state->inode, &range, wait); ++ } + +- nfs4_do_close(path, state, gfp_mask, wait, roc); ++ nfs4_do_close(path, state, gfp_mask, wait); + } + } + +@@ -1661,6 +1690,10 @@ static void nfs4_state_manager(struct nf + nfs_client_return_marked_delegations(clp); + continue; + } ++ if (test_and_clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state)) { ++ nfs_client_return_layouts(clp); ++ continue; ++ } + /* Recall session slots */ + if (test_and_clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state) + && nfs4_has_session(clp)) { +diff -up linux-2.6.37.noarch/fs/nfs/nfs4xdr.c.orig linux-2.6.37.noarch/fs/nfs/nfs4xdr.c +--- linux-2.6.37.noarch/fs/nfs/nfs4xdr.c.orig 2011-01-28 09:37:32.546979809 -0500 ++++ linux-2.6.37.noarch/fs/nfs/nfs4xdr.c 2011-01-28 09:43:53.330773162 -0500 +@@ -90,7 +90,7 @@ static int nfs4_stat_to_errno(int); + #define encode_getfh_maxsz (op_encode_hdr_maxsz) + #define decode_getfh_maxsz (op_decode_hdr_maxsz + 1 + \ + ((3+NFS4_FHSIZE) >> 2)) +-#define nfs4_fattr_bitmap_maxsz 3 ++#define nfs4_fattr_bitmap_maxsz 4 + #define encode_getattr_maxsz (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz) + #define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2)) + #define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2)) +@@ -112,7 +112,11 @@ static int nfs4_stat_to_errno(int); + #define encode_restorefh_maxsz (op_encode_hdr_maxsz) + #define decode_restorefh_maxsz (op_decode_hdr_maxsz) + #define encode_fsinfo_maxsz (encode_getattr_maxsz) +-#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 11) ++/* The 5 accounts for the PNFS attributes, and assumes that at most three ++ * layout types will be returned. 
++ */ ++#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + \ ++ nfs4_fattr_bitmap_maxsz + 8 + 5) + #define encode_renew_maxsz (op_encode_hdr_maxsz + 3) + #define decode_renew_maxsz (op_decode_hdr_maxsz) + #define encode_setclientid_maxsz \ +@@ -311,6 +315,17 @@ static int nfs4_stat_to_errno(int); + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) + #define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) + #define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) ++#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \ ++ encode_verifier_maxsz) ++#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \ ++ 2 /* nfs_cookie4 gdlr_cookie */ + \ ++ decode_verifier_maxsz \ ++ /* verifier4 gdlr_verifier */ + \ ++ 1 /* gdlr_deviceid_list count */ + \ ++ XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \ ++ NFS4_DEVICEID4_SIZE) \ ++ /* gdlr_deviceid_list */ + \ ++ 1 /* bool gdlr_eof */) + #define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \ + XDR_QUADLEN(NFS4_DEVICEID4_SIZE)) + #define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ +@@ -324,6 +339,17 @@ static int nfs4_stat_to_errno(int); + #define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ + decode_stateid_maxsz + \ + XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE)) ++#define encode_layoutcommit_maxsz (18 + \ ++ XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE) + \ ++ op_encode_hdr_maxsz + \ ++ encode_stateid_maxsz) ++#define decode_layoutcommit_maxsz (3 + op_decode_hdr_maxsz) ++#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \ ++ encode_stateid_maxsz + \ ++ 1 /* FIXME: opaque lrf_body always empty at ++ *the moment */) ++#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \ ++ 1 + decode_stateid_maxsz) + #else /* CONFIG_NFS_V4_1 */ + #define encode_sequence_maxsz 0 + #define decode_sequence_maxsz 0 +@@ -713,6 +739,14 @@ static int nfs4_stat_to_errno(int); + #define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_reclaim_complete_maxsz) ++#define 
NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz + \ ++ encode_putfh_maxsz + \ ++ encode_getdevicelist_maxsz) ++#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_getdevicelist_maxsz) + #define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz +\ + encode_getdeviceinfo_maxsz) +@@ -727,6 +761,38 @@ static int nfs4_stat_to_errno(int); + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_layoutget_maxsz) ++#define NFS4_enc_layoutcommit_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz +\ ++ encode_putfh_maxsz + \ ++ encode_layoutcommit_maxsz + \ ++ encode_getattr_maxsz) ++#define NFS4_dec_layoutcommit_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_layoutcommit_maxsz + \ ++ decode_getattr_maxsz) ++#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz + \ ++ encode_putfh_maxsz + \ ++ encode_layoutreturn_maxsz) ++#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_layoutreturn_maxsz) ++#define NFS4_enc_dswrite_sz (compound_encode_hdr_maxsz + \ ++ encode_sequence_maxsz +\ ++ encode_putfh_maxsz + \ ++ encode_write_maxsz) ++#define NFS4_dec_dswrite_sz (compound_decode_hdr_maxsz + \ ++ decode_sequence_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_write_maxsz) ++#define NFS4_enc_dscommit_sz (compound_encode_hdr_maxsz + \ ++ encode_putfh_maxsz + \ ++ encode_commit_maxsz) ++#define NFS4_dec_dscommit_sz (compound_decode_hdr_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_commit_maxsz) + + const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + + compound_encode_hdr_maxsz + +@@ -1031,6 +1097,35 @@ static void encode_getattr_two(struct xd + hdr->replen += decode_getattr_maxsz; + } + ++static void ++encode_getattr_three(struct xdr_stream *xdr, ++ uint32_t 
bm0, uint32_t bm1, uint32_t bm2, ++ struct compound_hdr *hdr) ++{ ++ __be32 *p; ++ ++ p = reserve_space(xdr, 4); ++ *p = cpu_to_be32(OP_GETATTR); ++ if (bm2) { ++ p = reserve_space(xdr, 16); ++ *p++ = cpu_to_be32(3); ++ *p++ = cpu_to_be32(bm0); ++ *p++ = cpu_to_be32(bm1); ++ *p = cpu_to_be32(bm2); ++ } else if (bm1) { ++ p = reserve_space(xdr, 12); ++ *p++ = cpu_to_be32(2); ++ *p++ = cpu_to_be32(bm0); ++ *p = cpu_to_be32(bm1); ++ } else { ++ p = reserve_space(xdr, 8); ++ *p++ = cpu_to_be32(1); ++ *p = cpu_to_be32(bm0); ++ } ++ hdr->nops++; ++ hdr->replen += decode_getattr_maxsz; ++} ++ + static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) + { + encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0], +@@ -1039,8 +1134,11 @@ static void encode_getfattr(struct xdr_s + + static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) + { +- encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0], +- bitmask[1] & nfs4_fsinfo_bitmap[1], hdr); ++ encode_getattr_three(xdr, ++ bitmask[0] & nfs4_fsinfo_bitmap[0], ++ bitmask[1] & nfs4_fsinfo_bitmap[1], ++ bitmask[2] & nfs4_fsinfo_bitmap[2], ++ hdr); + } + + static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) +@@ -1767,6 +1865,26 @@ static void encode_sequence(struct xdr_s + + #ifdef CONFIG_NFS_V4_1 + static void ++encode_getdevicelist(struct xdr_stream *xdr, ++ const struct nfs4_getdevicelist_args *args, ++ struct compound_hdr *hdr) ++{ ++ __be32 *p; ++ nfs4_verifier dummy = { ++ .data = "dummmmmy", ++ }; ++ ++ p = reserve_space(xdr, 20); ++ *p++ = cpu_to_be32(OP_GETDEVICELIST); ++ *p++ = cpu_to_be32(args->layoutclass); ++ *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM); ++ xdr_encode_hyper(p, 0ULL); /* cookie */ ++ encode_nfs4_verifier(xdr, &dummy); ++ hdr->nops++; ++ hdr->replen += decode_getdevicelist_maxsz; ++} ++ ++static void + encode_getdeviceinfo(struct xdr_stream *xdr, + const struct 
nfs4_getdeviceinfo_args *args, + struct compound_hdr *hdr) +@@ -1812,6 +1930,102 @@ encode_layoutget(struct xdr_stream *xdr, + hdr->nops++; + hdr->replen += decode_layoutget_maxsz; + } ++ ++static void ++encode_layoutcommit(struct xdr_stream *xdr, ++ struct inode *inode, ++ const struct nfs4_layoutcommit_args *args, ++ struct compound_hdr *hdr) ++{ ++ __be32 *p; ++ ++ dprintk("%s: %llu@%llu lbw: %llu type: %d\n", __func__, ++ args->range.length, args->range.offset, args->lastbytewritten, ++ args->layout_type); ++ ++ p = reserve_space(xdr, 40 + NFS4_STATEID_SIZE); ++ *p++ = cpu_to_be32(OP_LAYOUTCOMMIT); ++ p = xdr_encode_hyper(p, args->range.offset); ++ p = xdr_encode_hyper(p, args->range.length); ++ *p++ = cpu_to_be32(0); /* reclaim */ ++ p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE); ++ *p++ = cpu_to_be32(1); /* newoffset = TRUE */ ++ p = xdr_encode_hyper(p, args->lastbytewritten); ++ *p = cpu_to_be32(args->time_modify_changed != 0); ++ if (args->time_modify_changed) { ++ p = reserve_space(xdr, 12); ++ *p++ = cpu_to_be32(0); ++ *p++ = cpu_to_be32(args->time_modify.tv_sec); ++ *p = cpu_to_be32(args->time_modify.tv_nsec); ++ } ++ ++ p = reserve_space(xdr, 4); ++ *p = cpu_to_be32(args->layout_type); ++ ++ if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) { ++ NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit( ++ NFS_I(inode)->layout, xdr, args); ++ } else { ++ p = reserve_space(xdr, 4); ++ xdr_encode_opaque(p, NULL, 0); ++ } ++ ++ hdr->nops++; ++ hdr->replen += decode_layoutcommit_maxsz; ++} ++ ++static void ++encode_layoutreturn(struct xdr_stream *xdr, ++ const struct nfs4_layoutreturn_args *args, ++ struct compound_hdr *hdr) ++{ ++ nfs4_stateid stateid; ++ __be32 *p; ++ ++ p = reserve_space(xdr, 20); ++ *p++ = cpu_to_be32(OP_LAYOUTRETURN); ++ *p++ = cpu_to_be32(args->reclaim); ++ *p++ = cpu_to_be32(args->layout_type); ++ *p++ = cpu_to_be32(args->range.iomode); ++ *p = cpu_to_be32(args->return_type); ++ if (args->return_type == 
RETURN_FILE) { ++ p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE); ++ p = xdr_encode_hyper(p, args->range.offset); ++ p = xdr_encode_hyper(p, args->range.length); ++ spin_lock(&args->inode->i_lock); ++ memcpy(stateid.data, NFS_I(args->inode)->layout->stateid.data, ++ NFS4_STATEID_SIZE); ++ spin_unlock(&args->inode->i_lock); ++ p = xdr_encode_opaque_fixed(p, &stateid.data, ++ NFS4_STATEID_SIZE); ++ if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) { ++ NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn( ++ NFS_I(args->inode)->layout, xdr, args); ++ } else { ++ p = reserve_space(xdr, 4); ++ *p = cpu_to_be32(0); ++ } ++ } ++ hdr->nops++; ++ hdr->replen += decode_layoutreturn_maxsz; ++} ++#else /* CONFIG_NFS_V4_1 */ ++static int ++encode_layoutcommit(struct xdr_stream *xdr, ++ struct inode *inode, ++ const struct nfs4_layoutcommit_args *args, ++ struct compound_hdr *hdr) ++{ ++ return 0; ++} ++ ++static void ++encode_layoutreturn(struct xdr_stream *xdr, ++ const struct nfs4_layoutreturn_args *args, ++ struct compound_hdr *hdr) ++{ ++} ++ + #endif /* CONFIG_NFS_V4_1 */ + + /* +@@ -2408,7 +2622,7 @@ static void nfs4_xdr_enc_setclientid_con + struct compound_hdr hdr = { + .nops = 0, + }; +- const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; ++ const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME, 0, 0 }; + + encode_compound_hdr(xdr, req, &hdr); + encode_setclientid_confirm(xdr, arg, &hdr); +@@ -2534,7 +2748,7 @@ static void nfs4_xdr_enc_get_lease_time( + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->la_seq_args), + }; +- const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; ++ const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME, 0, 0 }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->la_seq_args, &hdr); +@@ -2561,6 +2775,24 @@ static void nfs4_xdr_enc_reclaim_complet + } + + /* ++ * Encode GETDEVICELIST request ++ */ ++static void nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req, ++ 
struct xdr_stream *xdr, ++ struct nfs4_getdevicelist_args *args) ++{ ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ encode_compound_hdr(xdr, req, &hdr); ++ encode_sequence(xdr, &args->seq_args, &hdr); ++ encode_putfh(xdr, args->fh, &hdr); ++ encode_getdevicelist(xdr, args, &hdr); ++ encode_nops(&hdr); ++} ++ ++/* + * Encode GETDEVICEINFO request + */ + static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, +@@ -2601,6 +2833,81 @@ static void nfs4_xdr_enc_layoutget(struc + encode_layoutget(xdr, args, &hdr); + encode_nops(&hdr); + } ++ ++/* ++ * Encode LAYOUTCOMMIT request ++ */ ++static void nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req, ++ struct xdr_stream *xdr, ++ struct nfs4_layoutcommit_args *args) ++{ ++ struct nfs4_layoutcommit_data *data = ++ container_of(args, struct nfs4_layoutcommit_data, args); ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ encode_compound_hdr(xdr, req, &hdr); ++ encode_sequence(xdr, &args->seq_args, &hdr); ++ encode_putfh(xdr, args->fh, &hdr); ++ encode_layoutcommit(xdr, data->args.inode, args, &hdr); ++ encode_getfattr(xdr, args->bitmask, &hdr); ++ encode_nops(&hdr); ++} ++ ++/* ++ * Encode LAYOUTRETURN request ++ */ ++static void nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req, ++ struct xdr_stream *xdr, ++ struct nfs4_layoutreturn_args *args) ++{ ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ encode_compound_hdr(xdr, req, &hdr); ++ encode_sequence(xdr, &args->seq_args, &hdr); ++ encode_putfh(xdr, NFS_FH(args->inode), &hdr); ++ encode_layoutreturn(xdr, args, &hdr); ++ encode_nops(&hdr); ++} ++ ++/* ++ * Encode a pNFS File Layout Data Server WRITE request ++ */ ++static void nfs4_xdr_enc_dswrite(struct rpc_rqst *req, ++ struct xdr_stream *xdr, ++ struct nfs_writeargs *args) ++{ ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), 
++ }; ++ ++ encode_compound_hdr(xdr, req, &hdr); ++ encode_sequence(xdr, &args->seq_args, &hdr); ++ encode_putfh(xdr, args->fh, &hdr); ++ encode_write(xdr, args, &hdr); ++ encode_nops(&hdr); ++} ++ ++/* ++ * Encode a pNFS File Layout Data Server COMMIT request ++ */ ++static void nfs4_xdr_enc_dscommit(struct rpc_rqst *req, ++ struct xdr_stream *xdr, ++ struct nfs_writeargs *args) ++{ ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ encode_compound_hdr(xdr, req, &hdr); ++ encode_sequence(xdr, &args->seq_args, &hdr); ++ encode_putfh(xdr, args->fh, &hdr); ++ encode_commit(xdr, args, &hdr); ++ encode_nops(&hdr); ++} + #endif /* CONFIG_NFS_V4_1 */ + + static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +@@ -2701,14 +3008,17 @@ static int decode_attr_bitmap(struct xdr + goto out_overflow; + bmlen = be32_to_cpup(p); + +- bitmap[0] = bitmap[1] = 0; ++ bitmap[0] = bitmap[1] = bitmap[2] = 0; + p = xdr_inline_decode(xdr, (bmlen << 2)); + if (unlikely(!p)) + goto out_overflow; + if (bmlen > 0) { + bitmap[0] = be32_to_cpup(p++); +- if (bmlen > 1) +- bitmap[1] = be32_to_cpup(p); ++ if (bmlen > 1) { ++ bitmap[1] = be32_to_cpup(p++); ++ if (bmlen > 2) ++ bitmap[2] = be32_to_cpup(p); ++ } + } + return 0; + out_overflow: +@@ -2740,8 +3050,9 @@ static int decode_attr_supported(struct + return ret; + bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS; + } else +- bitmask[0] = bitmask[1] = 0; +- dprintk("%s: bitmask=%08x:%08x\n", __func__, bitmask[0], bitmask[1]); ++ bitmask[0] = bitmask[1] = bitmask[2] = 0; ++ dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__, ++ bitmask[0], bitmask[1], bitmask[2]); + return 0; + } + +@@ -3794,7 +4105,7 @@ out_overflow: + static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) + { + __be32 *savep; +- uint32_t attrlen, bitmap[2] = {0}; ++ uint32_t attrlen, bitmap[3] = {0}; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) +@@ 
-3820,7 +4131,7 @@ xdr_error: + static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat) + { + __be32 *savep; +- uint32_t attrlen, bitmap[2] = {0}; ++ uint32_t attrlen, bitmap[3] = {0}; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) +@@ -3852,7 +4163,7 @@ xdr_error: + static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf) + { + __be32 *savep; +- uint32_t attrlen, bitmap[2] = {0}; ++ uint32_t attrlen, bitmap[3] = {0}; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) +@@ -3994,7 +4305,7 @@ static int decode_getfattr_generic(struc + { + __be32 *savep; + uint32_t attrlen, +- bitmap[2] = {0}; ++ bitmap[3] = {0}; + int status; + + status = decode_op_hdr(xdr, OP_GETATTR); +@@ -4080,10 +4391,32 @@ static int decode_attr_pnfstype(struct x + return status; + } + ++/* ++ * The prefered block size for layout directed io ++ */ ++static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap, ++ uint32_t *res) ++{ ++ __be32 *p; ++ ++ dprintk("%s: bitmap is %x\n", __func__, bitmap[2]); ++ *res = 0; ++ if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) { ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) { ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++ } ++ *res = be32_to_cpup(p); ++ bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE; ++ } ++ return 0; ++} ++ + static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) + { + __be32 *savep; +- uint32_t attrlen, bitmap[2]; ++ uint32_t attrlen, bitmap[3]; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) +@@ -4111,6 +4444,9 @@ static int decode_fsinfo(struct xdr_stre + status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype); + if (status != 0) + goto xdr_error; ++ status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize); ++ if (status) ++ goto xdr_error; + + status = verify_attr_len(xdr, savep, attrlen); + xdr_error: +@@ -4530,7 +4866,7 @@ static int decode_getacl(struct 
xdr_stre + { + __be32 *savep; + uint32_t attrlen, +- bitmap[2] = {0}; ++ bitmap[3] = {0}; + struct kvec *iov = req->rq_rcv_buf.head; + int status; + +@@ -4878,6 +5214,50 @@ out_overflow: + } + + #if defined(CONFIG_NFS_V4_1) ++/* ++ * TODO: Need to handle case when EOF != true; ++ */ ++static int decode_getdevicelist(struct xdr_stream *xdr, ++ struct pnfs_devicelist *res) ++{ ++ __be32 *p; ++ int status, i; ++ struct nfs_writeverf verftemp; ++ ++ status = decode_op_hdr(xdr, OP_GETDEVICELIST); ++ if (status) ++ return status; ++ ++ p = xdr_inline_decode(xdr, 8 + 8 + 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ ++ /* TODO: Skip cookie for now */ ++ p += 2; ++ ++ /* Read verifier */ ++ p = xdr_decode_opaque_fixed(p, verftemp.verifier, 8); ++ ++ res->num_devs = be32_to_cpup(p); ++ ++ dprintk("%s: num_dev %d\n", __func__, res->num_devs); ++ ++ if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) ++ return -NFS4ERR_REP_TOO_BIG; ++ ++ p = xdr_inline_decode(xdr, ++ res->num_devs * NFS4_DEVICEID4_SIZE + 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ for (i = 0; i < res->num_devs; i++) ++ p = xdr_decode_opaque_fixed(p, res->dev_id[i].data, ++ NFS4_DEVICEID4_SIZE); ++ res->eof = be32_to_cpup(p); ++ return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} + + static int decode_getdeviceinfo(struct xdr_stream *xdr, + struct pnfs_device *pdev) +@@ -5003,6 +5383,56 @@ out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; + } ++ ++static int decode_layoutreturn(struct xdr_stream *xdr, ++ struct nfs4_layoutreturn_res *res) ++{ ++ __be32 *p; ++ int status; ++ ++ status = decode_op_hdr(xdr, OP_LAYOUTRETURN); ++ if (status) ++ return status; ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ res->lrs_present = be32_to_cpup(p); ++ if (res->lrs_present) ++ status = decode_stateid(xdr, &res->stateid); ++ return status; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++ ++static int 
decode_layoutcommit(struct xdr_stream *xdr, ++ struct rpc_rqst *req, ++ struct nfs4_layoutcommit_res *res) ++{ ++ __be32 *p; ++ int status; ++ ++ status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT); ++ res->status = status; ++ if (status) ++ return status; ++ ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ res->sizechanged = be32_to_cpup(p); ++ ++ if (res->sizechanged) { ++ p = xdr_inline_decode(xdr, 8); ++ if (unlikely(!p)) ++ goto out_overflow; ++ xdr_decode_hyper(p, &res->newsize); ++ } ++ return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} + #endif /* CONFIG_NFS_V4_1 */ + + /* +@@ -6019,6 +6449,32 @@ static int nfs4_xdr_dec_reclaim_complete + } + + /* ++ * Decode GETDEVICELIST response ++ */ ++static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp, ++ struct xdr_stream *xdr, ++ struct nfs4_getdevicelist_res *res) ++{ ++ struct compound_hdr hdr; ++ int status; ++ ++ dprintk("encoding getdevicelist!\n"); ++ ++ status = decode_compound_hdr(xdr, &hdr); ++ if (status != 0) ++ goto out; ++ status = decode_sequence(xdr, &res->seq_res, rqstp); ++ if (status != 0) ++ goto out; ++ status = decode_putfh(xdr); ++ if (status != 0) ++ goto out; ++ status = decode_getdevicelist(xdr, res->devlist); ++out: ++ return status; ++} ++ ++/* + * Decode GETDEVINFO response + */ + static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, +@@ -6062,6 +6518,108 @@ static int nfs4_xdr_dec_layoutget(struct + out: + return status; + } ++ ++/* ++ * Decode LAYOUTRETURN response ++ */ ++static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp, ++ struct xdr_stream *xdr, ++ struct nfs4_layoutreturn_res *res) ++{ ++ struct compound_hdr hdr; ++ int status; ++ ++ status = decode_compound_hdr(xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(xdr); ++ if (status) ++ goto out; ++ status = decode_layoutreturn(xdr, res); ++out: ++ 
return status; ++} ++ ++/* ++ * Decode LAYOUTCOMMIT response ++ */ ++static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp, ++ struct xdr_stream *xdr, ++ struct nfs4_layoutcommit_res *res) ++{ ++ struct compound_hdr hdr; ++ int status; ++ ++ status = decode_compound_hdr(xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(xdr); ++ if (status) ++ goto out; ++ status = decode_layoutcommit(xdr, rqstp, res); ++ if (status) ++ goto out; ++ decode_getfattr(xdr, res->fattr, res->server, ++ !RPC_IS_ASYNC(rqstp->rq_task)); ++out: ++ return status; ++} ++ ++/* ++ * Decode pNFS File Layout Data Server WRITE response ++ */ ++static int nfs4_xdr_dec_dswrite(struct rpc_rqst *rqstp, ++ struct xdr_stream *xdr, ++ struct nfs_writeres *res) ++{ ++ struct compound_hdr hdr; ++ int status; ++ ++ status = decode_compound_hdr(xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(xdr); ++ if (status) ++ goto out; ++ status = decode_write(xdr, res); ++ if (!status) ++ return res->count; ++out: ++ return status; ++} ++ ++/* ++ * Decode pNFS File Layout Data Server COMMIT response ++ */ ++static int nfs4_xdr_dec_dscommit(struct rpc_rqst *rqstp, ++ struct xdr_stream *xdr, ++ struct nfs_writeres *res) ++{ ++ struct compound_hdr hdr; ++ int status; ++ ++ status = decode_compound_hdr(xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(xdr); ++ if (status) ++ goto out; ++ status = decode_commit(xdr, res); ++out: ++ return status; ++} + #endif /* CONFIG_NFS_V4_1 */ + + /** +@@ -6081,7 +6639,7 @@ out: + int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, + int plus) + { +- uint32_t bitmap[2] = {0}; ++ uint32_t bitmap[3] = {0}; + uint32_t len; + __be32 *p = xdr_inline_decode(xdr, 4); + if 
(unlikely(!p)) +@@ -6266,8 +6824,13 @@ struct rpc_procinfo nfs4_procedures[] = + PROC(SEQUENCE, enc_sequence, dec_sequence), + PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), + PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), ++ PROC(GETDEVICELIST, enc_getdevicelist, dec_getdevicelist), + PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), + PROC(LAYOUTGET, enc_layoutget, dec_layoutget), ++ PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), ++ PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), ++ PROC(PNFS_WRITE, enc_dswrite, dec_dswrite), ++ PROC(PNFS_COMMIT, enc_dscommit, dec_dscommit), + #endif /* CONFIG_NFS_V4_1 */ + }; + +diff -up linux-2.6.37.noarch/fs/nfs/objlayout/Kbuild.orig linux-2.6.37.noarch/fs/nfs/objlayout/Kbuild +--- linux-2.6.37.noarch/fs/nfs/objlayout/Kbuild.orig 2011-01-28 09:43:53.331773009 -0500 ++++ linux-2.6.37.noarch/fs/nfs/objlayout/Kbuild 2011-01-28 09:43:53.331773009 -0500 +@@ -0,0 +1,11 @@ ++# ++# Makefile for the pNFS Objects Layout Driver kernel module ++# ++objlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o objio_osd.o ++obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o ++ ++# ++# Panasas pNFS Layout Driver kernel module ++# ++panlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o panfs_shim.o ++obj-$(CONFIG_PNFS_PANLAYOUT) += panlayoutdriver.o +diff -up linux-2.6.37.noarch/fs/nfs/objlayout/objio_osd.c.orig linux-2.6.37.noarch/fs/nfs/objlayout/objio_osd.c +--- linux-2.6.37.noarch/fs/nfs/objlayout/objio_osd.c.orig 2011-01-28 09:43:53.333772709 -0500 ++++ linux-2.6.37.noarch/fs/nfs/objlayout/objio_osd.c 2011-01-28 09:43:53.333772709 -0500 +@@ -0,0 +1,1060 @@ ++/* ++ * objio_osd.c ++ * ++ * pNFS Objects layout implementation over open-osd initiator library ++ * ++ * Copyright (C) 2009 Panasas Inc. ++ * All rights reserved. 
++ * ++ * Benny Halevy ++ * Boaz Harrosh ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "objlayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++#define _LLU(x) ((unsigned long long)x) ++ ++enum { BIO_MAX_PAGES_KMALLOC = ++ (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), ++}; ++ ++/* A per mountpoint struct currently for device cache */ ++struct objio_mount_type { ++ struct list_head dev_list; ++ spinlock_t dev_list_lock; ++}; ++ ++struct _dev_ent { ++ struct list_head list; ++ struct nfs4_deviceid d_id; ++ struct osd_dev *od; ++}; ++ ++static void _dev_list_remove_all(struct objio_mount_type *omt) ++{ ++ spin_lock(&omt->dev_list_lock); ++ ++ while (!list_empty(&omt->dev_list)) { ++ struct _dev_ent *de = list_entry(omt->dev_list.next, ++ struct _dev_ent, list); ++ ++ list_del_init(&de->list); ++ osduld_put_device(de->od); ++ kfree(de); ++ } ++ ++ spin_unlock(&omt->dev_list_lock); ++} ++ ++static struct osd_dev *___dev_list_find(struct objio_mount_type *omt, ++ struct nfs4_deviceid *d_id) ++{ ++ struct list_head *le; ++ ++ list_for_each(le, &omt->dev_list) { ++ struct _dev_ent *de = list_entry(le, struct _dev_ent, list); ++ ++ if (0 == memcmp(&de->d_id, d_id, sizeof(*d_id))) ++ return de->od; ++ } ++ ++ return NULL; ++} ++ ++static struct osd_dev *_dev_list_find(struct objio_mount_type *omt, ++ struct nfs4_deviceid *d_id) ++{ ++ struct osd_dev *od; ++ ++ spin_lock(&omt->dev_list_lock); ++ od = ___dev_list_find(omt, d_id); ++ spin_unlock(&omt->dev_list_lock); ++ return od; ++} ++ ++static int _dev_list_add(struct objio_mount_type *omt, ++ struct nfs4_deviceid *d_id, struct osd_dev *od) ++{ ++ struct _dev_ent *de = kzalloc(sizeof(*de), GFP_KERNEL); ++ ++ if (!de) ++ return -ENOMEM; ++ ++ spin_lock(&omt->dev_list_lock); ++ ++ if (___dev_list_find(omt, d_id)) { ++ kfree(de); ++ goto out; ++ } ++ ++ de->d_id = *d_id; ++ de->od = od; ++ list_add(&de->list, &omt->dev_list); ++ ++out: ++ spin_unlock(&omt->dev_list_lock); ++ return 0; ++} ++ ++struct 
objio_segment { ++ struct pnfs_osd_layout *layout; ++ ++ unsigned mirrors_p1; ++ unsigned stripe_unit; ++ unsigned group_width; /* Data stripe_units without integrity comps */ ++ u64 group_depth; ++ unsigned group_count; ++ ++ unsigned num_comps; ++ /* variable length */ ++ struct osd_dev *ods[1]; ++}; ++ ++struct objio_state; ++typedef ssize_t (*objio_done_fn)(struct objio_state *ios); ++ ++struct objio_state { ++ /* Generic layer */ ++ struct objlayout_io_state ol_state; ++ ++ struct objio_segment *objio_seg; ++ ++ struct kref kref; ++ objio_done_fn done; ++ void *private; ++ ++ unsigned long length; ++ unsigned numdevs; /* Actually used devs in this IO */ ++ /* A per-device variable array of size numdevs */ ++ struct _objio_per_comp { ++ struct bio *bio; ++ struct osd_request *or; ++ unsigned long length; ++ u64 offset; ++ unsigned dev; ++ } per_dev[]; ++}; ++ ++/* Send and wait for a get_device_info of devices in the layout, ++ then look them up with the osd_initiator library */ ++static struct osd_dev *_device_lookup(struct pnfs_layout_hdr *pnfslay, ++ struct objio_segment *objio_seg, unsigned comp) ++{ ++ struct pnfs_osd_layout *layout = objio_seg->layout; ++ struct pnfs_osd_deviceaddr *deviceaddr; ++ struct nfs4_deviceid *d_id; ++ struct osd_dev *od; ++ struct osd_dev_info odi; ++ struct objio_mount_type *omt = NFS_SERVER(pnfslay->inode)->pnfs_ld_data; ++ int err; ++ ++ d_id = &layout->olo_comps[comp].oc_object_id.oid_device_id; ++ ++ od = _dev_list_find(omt, d_id); ++ if (od) ++ return od; ++ ++ err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr); ++ if (unlikely(err)) { ++ dprintk("%s: objlayout_get_deviceinfo=>%d\n", __func__, err); ++ return ERR_PTR(err); ++ } ++ ++ odi.systemid_len = deviceaddr->oda_systemid.len; ++ if (odi.systemid_len > sizeof(odi.systemid)) { ++ err = -EINVAL; ++ goto out; ++ } else if (odi.systemid_len) ++ memcpy(odi.systemid, deviceaddr->oda_systemid.data, ++ odi.systemid_len); ++ odi.osdname_len = 
deviceaddr->oda_osdname.len; ++ odi.osdname = (u8 *)deviceaddr->oda_osdname.data; ++ ++ if (!odi.osdname_len && !odi.systemid_len) { ++ dprintk("%s: !odi.osdname_len && !odi.systemid_len\n", ++ __func__); ++ err = -ENODEV; ++ goto out; ++ } ++ ++ od = osduld_info_lookup(&odi); ++ if (unlikely(IS_ERR(od))) { ++ err = PTR_ERR(od); ++ dprintk("%s: osduld_info_lookup => %d\n", __func__, err); ++ goto out; ++ } ++ ++ _dev_list_add(omt, d_id, od); ++ ++out: ++ dprintk("%s: return=%d\n", __func__, err); ++ objlayout_put_deviceinfo(deviceaddr); ++ return err ? ERR_PTR(err) : od; ++} ++ ++static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, ++ struct objio_segment *objio_seg) ++{ ++ struct pnfs_osd_layout *layout = objio_seg->layout; ++ unsigned i, num_comps = layout->olo_num_comps; ++ int err; ++ ++ /* lookup all devices */ ++ for (i = 0; i < num_comps; i++) { ++ struct osd_dev *od; ++ ++ od = _device_lookup(pnfslay, objio_seg, i); ++ if (unlikely(IS_ERR(od))) { ++ err = PTR_ERR(od); ++ goto out; ++ } ++ objio_seg->ods[i] = od; ++ } ++ objio_seg->num_comps = num_comps; ++ err = 0; ++ ++out: ++ dprintk("%s: return=%d\n", __func__, err); ++ return err; ++} ++ ++static int _verify_data_map(struct pnfs_osd_layout *layout) ++{ ++ struct pnfs_osd_data_map *data_map = &layout->olo_map; ++ u64 stripe_length; ++ u32 group_width; ++ ++/* FIXME: Only raid0 for now. 
if not go through MDS */ ++ if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) { ++ printk(KERN_ERR "Only RAID_0 for now\n"); ++ return -ENOTSUPP; ++ } ++ if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) { ++ printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n", ++ data_map->odm_num_comps, data_map->odm_mirror_cnt); ++ return -EINVAL; ++ } ++ ++ if (data_map->odm_group_width) ++ group_width = data_map->odm_group_width; ++ else ++ group_width = data_map->odm_num_comps / ++ (data_map->odm_mirror_cnt + 1); ++ ++ stripe_length = (u64)data_map->odm_stripe_unit * group_width; ++ if (stripe_length >= (1ULL << 32)) { ++ printk(KERN_ERR "Total Stripe length(0x%llx)" ++ " >= 32bit is not supported\n", _LLU(stripe_length)); ++ return -ENOTSUPP; ++ } ++ ++ if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) { ++ printk(KERN_ERR "Stripe Unit(0x%llx)" ++ " must be Multples of PAGE_SIZE(0x%lx)\n", ++ _LLU(data_map->odm_stripe_unit), PAGE_SIZE); ++ return -ENOTSUPP; ++ } ++ ++ return 0; ++} ++ ++int objio_alloc_lseg(void **outp, ++ struct pnfs_layout_hdr *pnfslay, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_osd_layout *layout) ++{ ++ struct objio_segment *objio_seg; ++ int err; ++ ++ err = _verify_data_map(layout); ++ if (unlikely(err)) ++ return err; ++ ++ objio_seg = kzalloc(sizeof(*objio_seg) + ++ (layout->olo_num_comps - 1) * sizeof(objio_seg->ods[0]), ++ GFP_KERNEL); ++ if (!objio_seg) ++ return -ENOMEM; ++ ++ objio_seg->layout = layout; ++ err = objio_devices_lookup(pnfslay, objio_seg); ++ if (err) ++ goto free_seg; ++ ++ objio_seg->mirrors_p1 = layout->olo_map.odm_mirror_cnt + 1; ++ objio_seg->stripe_unit = layout->olo_map.odm_stripe_unit; ++ if (layout->olo_map.odm_group_width) { ++ objio_seg->group_width = layout->olo_map.odm_group_width; ++ objio_seg->group_depth = layout->olo_map.odm_group_depth; ++ objio_seg->group_count = layout->olo_map.odm_num_comps / ++ objio_seg->mirrors_p1 / ++ objio_seg->group_width; ++ } else { ++ 
objio_seg->group_width = layout->olo_map.odm_num_comps / ++ objio_seg->mirrors_p1; ++ objio_seg->group_depth = -1; ++ objio_seg->group_count = 1; ++ } ++ ++ *outp = objio_seg; ++ return 0; ++ ++free_seg: ++ dprintk("%s: Error: return %d\n", __func__, err); ++ kfree(objio_seg); ++ *outp = NULL; ++ return err; ++} ++ ++void objio_free_lseg(void *p) ++{ ++ struct objio_segment *objio_seg = p; ++ ++ kfree(objio_seg); ++} ++ ++int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp) ++{ ++ struct objio_segment *objio_seg = seg; ++ struct objio_state *ios; ++ const unsigned first_size = sizeof(*ios) + ++ objio_seg->num_comps * sizeof(ios->per_dev[0]); ++ const unsigned sec_size = objio_seg->num_comps * ++ sizeof(ios->ol_state.ioerrs[0]); ++ ++ dprintk("%s: num_comps=%d\n", __func__, objio_seg->num_comps); ++ ios = kzalloc(first_size + sec_size, GFP_KERNEL); ++ if (unlikely(!ios)) ++ return -ENOMEM; ++ ++ ios->objio_seg = objio_seg; ++ ios->ol_state.ioerrs = ((void *)ios) + first_size; ++ ios->ol_state.num_comps = objio_seg->num_comps; ++ ++ *outp = &ios->ol_state; ++ return 0; ++} ++ ++void objio_free_io_state(struct objlayout_io_state *ol_state) ++{ ++ struct objio_state *ios = container_of(ol_state, struct objio_state, ++ ol_state); ++ ++ kfree(ios); ++} ++ ++enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) ++{ ++ switch (oep) { ++ case OSD_ERR_PRI_NO_ERROR: ++ return (enum pnfs_osd_errno)0; ++ ++ case OSD_ERR_PRI_CLEAR_PAGES: ++ BUG_ON(1); ++ return 0; ++ ++ case OSD_ERR_PRI_RESOURCE: ++ return PNFS_OSD_ERR_RESOURCE; ++ case OSD_ERR_PRI_BAD_CRED: ++ return PNFS_OSD_ERR_BAD_CRED; ++ case OSD_ERR_PRI_NO_ACCESS: ++ return PNFS_OSD_ERR_NO_ACCESS; ++ case OSD_ERR_PRI_UNREACHABLE: ++ return PNFS_OSD_ERR_UNREACHABLE; ++ case OSD_ERR_PRI_NOT_FOUND: ++ return PNFS_OSD_ERR_NOT_FOUND; ++ case OSD_ERR_PRI_NO_SPACE: ++ return PNFS_OSD_ERR_NO_SPACE; ++ default: ++ WARN_ON(1); ++ /* fallthrough */ ++ case OSD_ERR_PRI_EIO: ++ return PNFS_OSD_ERR_EIO; ++ 
} ++} ++ ++static void _clear_bio(struct bio *bio) ++{ ++ struct bio_vec *bv; ++ unsigned i; ++ ++ __bio_for_each_segment(bv, bio, i, 0) { ++ unsigned this_count = bv->bv_len; ++ ++ if (likely(PAGE_SIZE == this_count)) ++ clear_highpage(bv->bv_page); ++ else ++ zero_user(bv->bv_page, bv->bv_offset, this_count); ++ } ++} ++ ++static int _io_check(struct objio_state *ios, bool is_write) ++{ ++ enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR; ++ int lin_ret = 0; ++ int i; ++ ++ for (i = 0; i < ios->numdevs; i++) { ++ struct osd_sense_info osi; ++ struct osd_request *or = ios->per_dev[i].or; ++ int ret; ++ ++ if (!or) ++ continue; ++ ++ ret = osd_req_decode_sense(or, &osi); ++ if (likely(!ret)) ++ continue; ++ ++ if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { ++ /* start read offset passed endof file */ ++ BUG_ON(is_write); ++ _clear_bio(ios->per_dev[i].bio); ++ dprintk("%s: start read offset passed end of file " ++ "offset=0x%llx, length=0x%lx\n", __func__, ++ _LLU(ios->per_dev[i].offset), ++ ios->per_dev[i].length); ++ ++ continue; /* we recovered */ ++ } ++ objlayout_io_set_result(&ios->ol_state, ios->per_dev[i].dev, ++ osd_pri_2_pnfs_err(osi.osd_err_pri), ++ ios->per_dev[i].offset, ++ ios->per_dev[i].length, ++ is_write); ++ ++ if (osi.osd_err_pri >= oep) { ++ oep = osi.osd_err_pri; ++ lin_ret = ret; ++ } ++ } ++ ++ return lin_ret; ++} ++ ++/* ++ * Common IO state helpers. 
++ */ ++static void _io_free(struct objio_state *ios) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ios->numdevs; i++) { ++ struct _objio_per_comp *per_dev = &ios->per_dev[i]; ++ ++ if (per_dev->or) { ++ osd_end_request(per_dev->or); ++ per_dev->or = NULL; ++ } ++ ++ if (per_dev->bio) { ++ bio_put(per_dev->bio); ++ per_dev->bio = NULL; ++ } ++ } ++} ++ ++struct osd_dev * _io_od(struct objio_state *ios, unsigned dev) ++{ ++ unsigned min_dev = ios->objio_seg->layout->olo_comps_index; ++ unsigned max_dev = min_dev + ios->ol_state.num_comps; ++ ++ BUG_ON(dev < min_dev || max_dev <= dev); ++ return ios->objio_seg->ods[dev - min_dev]; ++} ++ ++struct _striping_info { ++ u64 obj_offset; ++ u64 group_length; ++ u64 total_group_length; ++ u64 Major; ++ unsigned dev; ++ unsigned unit_off; ++}; ++ ++static void _calc_stripe_info(struct objio_state *ios, u64 file_offset, ++ struct _striping_info *si) ++{ ++ u32 stripe_unit = ios->objio_seg->stripe_unit; ++ u32 group_width = ios->objio_seg->group_width; ++ u64 group_depth = ios->objio_seg->group_depth; ++ u32 U = stripe_unit * group_width; ++ ++ u64 T = U * group_depth; ++ u64 S = T * ios->objio_seg->group_count; ++ u64 M = div64_u64(file_offset, S); ++ ++ /* ++ G = (L - (M * S)) / T ++ H = (L - (M * S)) % T ++ */ ++ u64 LmodU = file_offset - M * S; ++ u32 G = div64_u64(LmodU, T); ++ u64 H = LmodU - G * T; ++ ++ u32 N = div_u64(H, U); ++ ++ div_u64_rem(file_offset, stripe_unit, &si->unit_off); ++ si->obj_offset = si->unit_off + (N * stripe_unit) + ++ (M * group_depth * stripe_unit); ++ ++ /* "H - (N * U)" is just "H % U" so it's bound to u32 */ ++ si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; ++ si->dev *= ios->objio_seg->mirrors_p1; ++ ++ si->group_length = T - H; ++ si->total_group_length = T; ++ si->Major = M; ++} ++ ++static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, ++ unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len) ++{ ++ unsigned pg = *cur_pg; ++ struct request_queue *q = ++ 
osd_request_queue(_io_od(ios, per_dev->dev)); ++ ++ per_dev->length += cur_len; ++ ++ if (per_dev->bio == NULL) { ++ unsigned stripes = ios->ol_state.num_comps / ++ ios->objio_seg->mirrors_p1; ++ unsigned pages_in_stripe = stripes * ++ (ios->objio_seg->stripe_unit / PAGE_SIZE); ++ unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) / ++ stripes; ++ ++ per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); ++ if (unlikely(!per_dev->bio)) { ++ dprintk("Faild to allocate BIO size=%u\n", bio_size); ++ return -ENOMEM; ++ } ++ } ++ ++ while (cur_len > 0) { ++ unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); ++ unsigned added_len; ++ ++ BUG_ON(ios->ol_state.nr_pages <= pg); ++ cur_len -= pglen; ++ ++ added_len = bio_add_pc_page(q, per_dev->bio, ++ ios->ol_state.pages[pg], pglen, pgbase); ++ if (unlikely(pglen != added_len)) ++ return -ENOMEM; ++ pgbase = 0; ++ ++pg; ++ } ++ BUG_ON(cur_len); ++ ++ *cur_pg = pg; ++ return 0; ++} ++ ++static int _prepare_one_group(struct objio_state *ios, u64 length, ++ struct _striping_info *si, unsigned first_comp, ++ unsigned *last_pg) ++{ ++ unsigned stripe_unit = ios->objio_seg->stripe_unit; ++ unsigned mirrors_p1 = ios->objio_seg->mirrors_p1; ++ unsigned devs_in_group = ios->objio_seg->group_width * mirrors_p1; ++ unsigned dev = si->dev; ++ unsigned first_dev = dev - (dev % devs_in_group); ++ unsigned comp = first_comp + (dev - first_dev); ++ unsigned max_comp = ios->numdevs ? 
ios->numdevs - mirrors_p1 : 0; ++ unsigned cur_pg = *last_pg; ++ int ret = 0; ++ ++ while (length) { ++ struct _objio_per_comp *per_dev = &ios->per_dev[comp]; ++ unsigned cur_len, page_off = 0; ++ ++ if (!per_dev->length) { ++ per_dev->dev = dev; ++ if (dev < si->dev) { ++ per_dev->offset = si->obj_offset + stripe_unit - ++ si->unit_off; ++ cur_len = stripe_unit; ++ } else if (dev == si->dev) { ++ per_dev->offset = si->obj_offset; ++ cur_len = stripe_unit - si->unit_off; ++ page_off = si->unit_off & ~PAGE_MASK; ++ BUG_ON(page_off && ++ (page_off != ios->ol_state.pgbase)); ++ } else { /* dev > si->dev */ ++ per_dev->offset = si->obj_offset - si->unit_off; ++ cur_len = stripe_unit; ++ } ++ ++ if (max_comp < comp) ++ max_comp = comp; ++ ++ dev += mirrors_p1; ++ dev = (dev % devs_in_group) + first_dev; ++ } else { ++ cur_len = stripe_unit; ++ } ++ if (cur_len >= length) ++ cur_len = length; ++ ++ ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, ++ cur_len); ++ if (unlikely(ret)) ++ goto out; ++ ++ comp += mirrors_p1; ++ comp = (comp % devs_in_group) + first_comp; ++ ++ length -= cur_len; ++ ios->length += cur_len; ++ } ++out: ++ ios->numdevs = max_comp + mirrors_p1; ++ *last_pg = cur_pg; ++ return ret; ++} ++ ++static int _io_rw_pagelist(struct objio_state *ios) ++{ ++ u64 length = ios->ol_state.count; ++ struct _striping_info si; ++ unsigned devs_in_group = ios->objio_seg->group_width * ++ ios->objio_seg->mirrors_p1; ++ unsigned first_comp = 0; ++ unsigned num_comps = ios->objio_seg->layout->olo_map.odm_num_comps; ++ unsigned last_pg = 0; ++ int ret = 0; ++ ++ _calc_stripe_info(ios, ios->ol_state.offset, &si); ++ while (length) { ++ if (length < si.group_length) ++ si.group_length = length; ++ ++ ret = _prepare_one_group(ios, si.group_length, &si, first_comp, ++ &last_pg); ++ if (unlikely(ret)) ++ goto out; ++ ++ length -= si.group_length; ++ ++ si.group_length = si.total_group_length; ++ si.unit_off = 0; ++ ++si.Major; ++ si.obj_offset = si.Major * 
ios->objio_seg->stripe_unit * ++ ios->objio_seg->group_depth; ++ ++ si.dev = (si.dev - (si.dev % devs_in_group)) + devs_in_group; ++ si.dev %= num_comps; ++ ++ first_comp += devs_in_group; ++ first_comp %= num_comps; ++ } ++ ++out: ++ if (!ios->length) ++ return ret; ++ ++ return 0; ++} ++ ++static ssize_t _sync_done(struct objio_state *ios) ++{ ++ struct completion *waiting = ios->private; ++ ++ complete(waiting); ++ return 0; ++} ++ ++static void _last_io(struct kref *kref) ++{ ++ struct objio_state *ios = container_of(kref, struct objio_state, kref); ++ ++ ios->done(ios); ++} ++ ++static void _done_io(struct osd_request *or, void *p) ++{ ++ struct objio_state *ios = p; ++ ++ kref_put(&ios->kref, _last_io); ++} ++ ++static ssize_t _io_exec(struct objio_state *ios) ++{ ++ DECLARE_COMPLETION_ONSTACK(wait); ++ ssize_t status = 0; /* sync status */ ++ unsigned i; ++ objio_done_fn saved_done_fn = ios->done; ++ bool sync = ios->ol_state.sync; ++ ++ if (sync) { ++ ios->done = _sync_done; ++ ios->private = &wait; ++ } ++ ++ kref_init(&ios->kref); ++ ++ for (i = 0; i < ios->numdevs; i++) { ++ struct osd_request *or = ios->per_dev[i].or; ++ ++ if (!or) ++ continue; ++ ++ kref_get(&ios->kref); ++ osd_execute_request_async(or, _done_io, ios); ++ } ++ ++ kref_put(&ios->kref, _last_io); ++ ++ if (sync) { ++ wait_for_completion(&wait); ++ status = saved_done_fn(ios); ++ } ++ ++ return status; ++} ++ ++/* ++ * read ++ */ ++static ssize_t _read_done(struct objio_state *ios) ++{ ++ ssize_t status; ++ int ret = _io_check(ios, false); ++ ++ _io_free(ios); ++ ++ if (likely(!ret)) ++ status = ios->length; ++ else ++ status = ret; ++ ++ objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync); ++ return status; ++} ++ ++static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) ++{ ++ struct osd_request *or = NULL; ++ struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; ++ unsigned dev = per_dev->dev; ++ struct pnfs_osd_object_cred *cred = ++ 
/*
 * Prepare the OSD write requests for the component at @cur_comp and for the
 * mirror slots that follow it (mirrors_p1 slots in total).
 *
 * The slot at @cur_comp is the "master": its bio, offset and length are
 * expected to be filled in already. Every other slot in the group receives a
 * clone of the master bio together with a copy of the master's length and
 * offset, so each mirror is sent the same data. The device index @dev is
 * advanced in lock-step with the slot index.
 *
 * Returns 0 when all requests were built and finalized, or a negative error
 * (-ENOMEM, or whatever osd_finalize_request() returned). Requests and bios
 * already stored in ios->per_dev[] are NOT released here on failure; the
 * visible caller, _write_exec(), calls _io_free() on error.
 * NOTE(review): presumably _io_free() releases them - confirm.
 */
static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
{
	struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp];
	unsigned dev = ios->per_dev[cur_comp].dev;
	unsigned last_comp = cur_comp + ios->objio_seg->mirrors_p1;
	int ret;

	for (; cur_comp < last_comp; ++cur_comp, ++dev) {
		struct osd_request *or = NULL;
		/* credentials and object id of the target device */
		struct pnfs_osd_object_cred *cred =
					&ios->objio_seg->layout->olo_comps[dev];
		struct osd_obj_id obj = {
			.partition = cred->oc_object_id.oid_partition_id,
			.id = cred->oc_object_id.oid_object_id,
		};
		struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
		struct bio *bio;

		or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
		if (unlikely(!or)) {
			ret = -ENOMEM;
			goto err;
		}
		/* recorded immediately so the request is reachable from
		 * ios->per_dev[] even if a later step fails */
		per_dev->or = or;

		if (per_dev != master_dev) {
			/* mirror slot: clone the master bio and copy the
			 * master's I/O extent */
			bio = bio_kmalloc(GFP_KERNEL,
					  master_dev->bio->bi_max_vecs);
			if (unlikely(!bio)) {
				dprintk("Faild to allocate BIO size=%u\n",
					master_dev->bio->bi_max_vecs);
				ret = -ENOMEM;
				goto err;
			}

			__bio_clone(bio, master_dev->bio);
			bio->bi_bdev = NULL;
			bio->bi_next = NULL;
			per_dev->bio = bio;
			per_dev->dev = dev;
			per_dev->length = master_dev->length;
			per_dev->offset = master_dev->offset;
		} else {
			/* master slot: reuse its bio, marked as a write */
			bio = master_dev->bio;
			/* FIXME: bio_set_dir() */
			bio->bi_rw |= REQ_WRITE;
		}

		osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);

		ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
		if (ret) {
			dprintk("%s: Faild to osd_finalize_request() => %d\n",
				__func__, ret);
			goto err;
		}

		dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
			__func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
			per_dev->length);
	}

	/* success falls through with ret == 0 from the last finalize */
err:
	return ret;
}
i); ++ if (unlikely(ret)) ++ goto err; ++ } ++ ++ ios->done = _write_done; ++ return _io_exec(ios); /* In sync mode exec returns the io->status */ ++ ++err: ++ _io_free(ios); ++ return ret; ++} ++ ++ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) ++{ ++ struct objio_state *ios = container_of(ol_state, struct objio_state, ++ ol_state); ++ int ret; ++ ++ /* TODO: ios->stable = stable; */ ++ ret = _io_rw_pagelist(ios); ++ if (unlikely(ret)) ++ return ret; ++ ++ return _write_exec(ios); ++} ++ ++/* ++ * Policy Operations ++ */ ++ ++/* ++ * Get the max [rw]size ++ */ ++static ssize_t ++objlayout_get_blocksize(void) ++{ ++ ssize_t sz = BIO_MAX_PAGES_KMALLOC * PAGE_SIZE; ++ ++ return sz; ++} ++ ++/* ++ * Don't gather across stripes, but rather gather (coalesce) up to ++ * the stripe size. ++ * ++ * FIXME: change interface to use merge_align, merge_count ++ */ ++static struct pnfs_layoutdriver_type objlayout_type = { ++ .id = LAYOUT_OSD2_OBJECTS, ++ .name = "LAYOUT_OSD2_OBJECTS", ++ .flags = PNFS_LAYOUTRET_ON_SETATTR, ++ ++ .set_layoutdriver = objlayout_set_layoutdriver, ++ .clear_layoutdriver = objlayout_clear_layoutdriver, ++ ++ .alloc_layout_hdr = objlayout_alloc_layout_hdr, ++ .free_layout_hdr = objlayout_free_layout_hdr, ++ ++ .alloc_lseg = objlayout_alloc_lseg, ++ .free_lseg = objlayout_free_lseg, ++ ++ .get_blocksize = objlayout_get_blocksize, ++ ++ .read_pagelist = objlayout_read_pagelist, ++ .write_pagelist = objlayout_write_pagelist, ++ .commit = objlayout_commit, ++ ++ .encode_layoutcommit = objlayout_encode_layoutcommit, ++ .encode_layoutreturn = objlayout_encode_layoutreturn, ++}; ++ ++void *objio_init_mt(void) ++{ ++ struct objio_mount_type *omt = kzalloc(sizeof(*omt), GFP_KERNEL); ++ ++ if (!omt) ++ return ERR_PTR(-ENOMEM); ++ ++ INIT_LIST_HEAD(&omt->dev_list); ++ spin_lock_init(&omt->dev_list_lock); ++ return omt; ++} ++ ++void objio_fini_mt(void *mountid) ++{ ++ _dev_list_remove_all(mountid); ++ kfree(mountid); ++} ++ 
++MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects"); ++MODULE_AUTHOR("Benny Halevy "); ++MODULE_LICENSE("GPL"); ++ ++static int __init ++objlayout_init(void) ++{ ++ int ret = pnfs_register_layoutdriver(&objlayout_type); ++ ++ if (ret) ++ printk(KERN_INFO ++ "%s: Registering OSD pNFS Layout Driver failed: error=%d\n", ++ __func__, ret); ++ else ++ printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n", ++ __func__); ++ return ret; ++} ++ ++static void __exit ++objlayout_exit(void) ++{ ++ pnfs_unregister_layoutdriver(&objlayout_type); ++ printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n", ++ __func__); ++} ++ ++module_init(objlayout_init); ++module_exit(objlayout_exit); +diff -up linux-2.6.37.noarch/fs/nfs/objlayout/objlayout.c.orig linux-2.6.37.noarch/fs/nfs/objlayout/objlayout.c +--- linux-2.6.37.noarch/fs/nfs/objlayout/objlayout.c.orig 2011-01-28 09:43:53.334772561 -0500 ++++ linux-2.6.37.noarch/fs/nfs/objlayout/objlayout.c 2011-01-28 09:43:53.334772561 -0500 +@@ -0,0 +1,773 @@ ++/* ++ * objlayout.c ++ * ++ * pNFS layout driver for Panasas OSDs ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * Boaz Harrosh ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. 
Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++#include "objlayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++struct pnfs_client_operations *pnfs_client_ops; ++ ++/* ++ * Create a objlayout layout structure for the given inode and return it. ++ */ ++struct pnfs_layout_hdr * ++objlayout_alloc_layout_hdr(struct inode *inode) ++{ ++ struct objlayout *objlay; ++ ++ objlay = kzalloc(sizeof(struct objlayout), GFP_KERNEL); ++ if (objlay) { ++ spin_lock_init(&objlay->lock); ++ INIT_LIST_HEAD(&objlay->err_list); ++ } ++ dprintk("%s: Return %p\n", __func__, objlay); ++ return &objlay->pnfs_layout; ++} ++ ++/* ++ * Free an objlayout layout structure ++ */ ++void ++objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo) ++{ ++ struct objlayout *objlay = OBJLAYOUT(lo); ++ ++ dprintk("%s: objlay %p\n", __func__, objlay); ++ ++ WARN_ON(!list_empty(&objlay->err_list)); ++ kfree(objlay); ++} ++ ++/* ++ * Unmarshall layout and store it in pnfslay. 
++ */ ++struct pnfs_layout_segment * ++objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay, ++ struct nfs4_layoutget_res *lgr) ++{ ++ int status; ++ void *layout = lgr->layout.buf; ++ struct objlayout_segment *objlseg; ++ struct pnfs_osd_layout *pnfs_osd_layout; ++ ++ dprintk("%s: Begin pnfslay %p layout %p\n", __func__, pnfslay, layout); ++ ++ BUG_ON(!layout); ++ ++ status = -ENOMEM; ++ objlseg = kzalloc(sizeof(*objlseg) + ++ pnfs_osd_layout_incore_sz(layout), GFP_KERNEL); ++ if (!objlseg) ++ goto err; ++ ++ pnfs_osd_layout = (struct pnfs_osd_layout *)objlseg->pnfs_osd_layout; ++ pnfs_osd_xdr_decode_layout(pnfs_osd_layout, layout); ++ ++ objlseg->lseg.range = lgr->range; ++ status = objio_alloc_lseg(&objlseg->internal, pnfslay, &objlseg->lseg, ++ pnfs_osd_layout); ++ if (status) ++ goto err; ++ ++ dprintk("%s: Return %p\n", __func__, &objlseg->lseg); ++ return &objlseg->lseg; ++ ++ err: ++ kfree(objlseg); ++ return ERR_PTR(status); ++} ++ ++/* ++ * Free a layout segement ++ */ ++void ++objlayout_free_lseg(struct pnfs_layout_segment *lseg) ++{ ++ struct objlayout_segment *objlseg; ++ ++ dprintk("%s: freeing layout segment %p\n", __func__, lseg); ++ ++ if (unlikely(!lseg)) ++ return; ++ ++ objlseg = container_of(lseg, struct objlayout_segment, lseg); ++ objio_free_lseg(objlseg->internal); ++ kfree(objlseg); ++} ++ ++/* ++ * I/O Operations ++ */ ++static inline u64 ++end_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ end = start + len; ++ return end >= start ? end : NFS4_MAX_UINT64; ++} ++ ++/* last octet in a range */ ++static inline u64 ++last_byte_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ BUG_ON(!len); ++ end = start + len; ++ return end > start ? 
end - 1 : NFS4_MAX_UINT64; ++} ++ ++static struct objlayout_io_state * ++objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, ++ struct page **pages, ++ unsigned pgbase, ++ unsigned nr_pages, ++ loff_t offset, ++ size_t count, ++ struct pnfs_layout_segment *lseg, ++ void *rpcdata) ++{ ++ struct objlayout_segment *objlseg = ++ container_of(lseg, struct objlayout_segment, lseg); ++ struct objlayout_io_state *state; ++ u64 lseg_end_offset; ++ size_t size_nr_pages; ++ ++ dprintk("%s: allocating io_state\n", __func__); ++ if (objio_alloc_io_state(objlseg->internal, &state)) ++ return NULL; ++ ++ BUG_ON(offset < lseg->range.offset); ++ lseg_end_offset = end_offset(lseg->range.offset, lseg->range.length); ++ BUG_ON(offset >= lseg_end_offset); ++ if (offset + count > lseg_end_offset) { ++ count = lseg->range.length - (offset - lseg->range.offset); ++ dprintk("%s: truncated count %Zd\n", __func__, count); ++ } ++ ++ if (pgbase > PAGE_SIZE) { ++ unsigned n = pgbase >> PAGE_SHIFT; ++ ++ pgbase &= ~PAGE_MASK; ++ pages += n; ++ nr_pages -= n; ++ } ++ ++ size_nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT; ++ BUG_ON(nr_pages < size_nr_pages); ++ if (nr_pages > size_nr_pages) ++ nr_pages = size_nr_pages; ++ ++ INIT_LIST_HEAD(&state->err_list); ++ state->objlseg = objlseg; ++ state->rpcdata = rpcdata; ++ state->pages = pages; ++ state->pgbase = pgbase; ++ state->nr_pages = nr_pages; ++ state->offset = offset; ++ state->count = count; ++ state->sync = 0; ++ ++ return state; ++} ++ ++static void ++objlayout_free_io_state(struct objlayout_io_state *state) ++{ ++ dprintk("%s: freeing io_state\n", __func__); ++ if (unlikely(!state)) ++ return; ++ ++ objio_free_io_state(state); ++} ++ ++/* ++ * I/O done common code ++ */ ++static void ++objlayout_iodone(struct objlayout_io_state *state) ++{ ++ dprintk("%s: state %p status\n", __func__, state); ++ ++ if (likely(state->status >= 0)) { ++ objlayout_free_io_state(state); ++ } else { ++ struct objlayout *objlay = 
OBJLAYOUT(state->objlseg->lseg.layout); ++ ++ spin_lock(&objlay->lock); ++ objlay->delta_space_valid = OBJ_DSU_INVALID; ++ list_add(&objlay->err_list, &state->err_list); ++ spin_unlock(&objlay->lock); ++ } ++} ++ ++/* ++ * objlayout_io_set_result - Set an osd_error code on a specific osd comp. ++ * ++ * The @index component IO failed (error returned from target). Register ++ * the error for later reporting at layout-return. ++ */ ++void ++objlayout_io_set_result(struct objlayout_io_state *state, unsigned index, ++ int osd_error, u64 offset, u64 length, bool is_write) ++{ ++ struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index]; ++ ++ BUG_ON(index >= state->num_comps); ++ if (osd_error) { ++ struct pnfs_osd_layout *layout = ++ (typeof(layout))state->objlseg->pnfs_osd_layout; ++ ++ ioerr->oer_component = layout->olo_comps[index].oc_object_id; ++ ioerr->oer_comp_offset = offset; ++ ioerr->oer_comp_length = length; ++ ioerr->oer_iswrite = is_write; ++ ioerr->oer_errno = osd_error; ++ ++ dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) " ++ "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n", ++ __func__, index, ioerr->oer_errno, ++ ioerr->oer_iswrite, ++ _DEVID_LO(&ioerr->oer_component.oid_device_id), ++ _DEVID_HI(&ioerr->oer_component.oid_device_id), ++ ioerr->oer_component.oid_partition_id, ++ ioerr->oer_component.oid_object_id, ++ ioerr->oer_comp_offset, ++ ioerr->oer_comp_length); ++ } else { ++ /* User need not call if no error is reported */ ++ ioerr->oer_errno = 0; ++ } ++} ++ ++static void _rpc_commit_complete(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_write_data *wdata; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ wdata = container_of(task, struct nfs_write_data, task); ++ ++ pnfs_commit_done(wdata); ++} ++ ++/* ++ * Commit data remotely on OSDs ++ */ ++enum pnfs_try_status ++objlayout_commit(struct nfs_write_data *wdata, int how) ++{ ++ int status = PNFS_ATTEMPTED; ++ 
++ INIT_WORK(&wdata->task.u.tk_work, _rpc_commit_complete); ++ schedule_work(&wdata->task.u.tk_work); ++ dprintk("%s: Return %d\n", __func__, status); ++ return status; ++} ++ ++/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete(). ++ * This is because the osd completion is called with ints-off from ++ * the block layer ++ */ ++static void _rpc_read_complete(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_read_data *rdata; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ rdata = container_of(task, struct nfs_read_data, task); ++ ++ pnfs_read_done(rdata); ++} ++ ++void ++objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) ++{ ++ int eof = state->eof; ++ struct nfs_read_data *rdata; ++ ++ state->status = status; ++ dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof); ++ rdata = state->rpcdata; ++ rdata->task.tk_status = status; ++ if (status >= 0) { ++ rdata->res.count = status; ++ rdata->res.eof = eof; ++ } ++ objlayout_iodone(state); ++ /* must not use state after this point */ ++ ++ if (sync) ++ pnfs_read_done(rdata); ++ else { ++ INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete); ++ schedule_work(&rdata->task.u.tk_work); ++ } ++} ++ ++/* ++ * Perform sync or async reads. 
++ */ ++enum pnfs_try_status ++objlayout_read_pagelist(struct nfs_read_data *rdata, unsigned nr_pages) ++{ ++ loff_t offset = rdata->args.offset; ++ size_t count = rdata->args.count; ++ struct objlayout_io_state *state; ++ ssize_t status = 0; ++ loff_t eof; ++ ++ dprintk("%s: Begin inode %p offset %llu count %d\n", ++ __func__, rdata->inode, offset, (int)count); ++ ++ eof = i_size_read(rdata->inode); ++ if (unlikely(offset + count > eof)) { ++ if (offset >= eof) { ++ status = 0; ++ rdata->res.count = 0; ++ rdata->res.eof = 1; ++ goto out; ++ } ++ count = eof - offset; ++ } ++ ++ state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout, ++ rdata->args.pages, rdata->args.pgbase, ++ nr_pages, offset, count, ++ rdata->pdata.lseg, rdata); ++ if (unlikely(!state)) { ++ status = -ENOMEM; ++ goto out; ++ } ++ ++ state->eof = state->offset + state->count >= eof; ++ ++ status = objio_read_pagelist(state); ++ out: ++ dprintk("%s: Return status %Zd\n", __func__, status); ++ rdata->pdata.pnfs_error = status; ++ return PNFS_ATTEMPTED; ++} ++ ++/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete(). 
++ * This is because the osd completion is called with ints-off from ++ * the block layer ++ */ ++static void _rpc_write_complete(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_write_data *wdata; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ wdata = container_of(task, struct nfs_write_data, task); ++ ++ pnfs_writeback_done(wdata); ++} ++ ++void ++objlayout_write_done(struct objlayout_io_state *state, ssize_t status, ++ bool sync) ++{ ++ struct nfs_write_data *wdata; ++ ++ dprintk("%s: Begin\n", __func__); ++ wdata = state->rpcdata; ++ state->status = status; ++ wdata->task.tk_status = status; ++ if (status >= 0) { ++ wdata->res.count = status; ++ wdata->verf.committed = state->committed; ++ dprintk("%s: Return status %d committed %d\n", ++ __func__, wdata->task.tk_status, ++ wdata->verf.committed); ++ } else ++ dprintk("%s: Return status %d\n", ++ __func__, wdata->task.tk_status); ++ objlayout_iodone(state); ++ /* must not use state after this point */ ++ ++ if (sync) ++ pnfs_writeback_done(wdata); ++ else { ++ INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete); ++ schedule_work(&wdata->task.u.tk_work); ++ } ++} ++ ++/* ++ * Perform sync or async writes. 
++ */ ++enum pnfs_try_status ++objlayout_write_pagelist(struct nfs_write_data *wdata, ++ unsigned nr_pages, ++ int how) ++{ ++ struct objlayout_io_state *state; ++ ssize_t status; ++ ++ dprintk("%s: Begin inode %p offset %llu count %u\n", ++ __func__, wdata->inode, wdata->args.offset, wdata->args.count); ++ ++ state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout, ++ wdata->args.pages, ++ wdata->args.pgbase, ++ nr_pages, ++ wdata->args.offset, ++ wdata->args.count, ++ wdata->pdata.lseg, wdata); ++ if (unlikely(!state)) { ++ status = -ENOMEM; ++ goto out; ++ } ++ ++ state->sync = how & FLUSH_SYNC; ++ ++ status = objio_write_pagelist(state, how & FLUSH_STABLE); ++ out: ++ dprintk("%s: Return status %Zd\n", __func__, status); ++ wdata->pdata.pnfs_error = status; ++ return PNFS_ATTEMPTED; ++} ++ ++void ++objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutcommit_args *args) ++{ ++ struct objlayout *objlay = OBJLAYOUT(pnfslay); ++ struct pnfs_osd_layoutupdate lou; ++ __be32 *start; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ spin_lock(&objlay->lock); ++ lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID); ++ lou.dsu_delta = objlay->delta_space_used; ++ objlay->delta_space_used = 0; ++ objlay->delta_space_valid = OBJ_DSU_INIT; ++ lou.olu_ioerr_flag = !list_empty(&objlay->err_list); ++ spin_unlock(&objlay->lock); ++ ++ start = xdr_reserve_space(xdr, 4); ++ ++ BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou)); ++ ++ *start = cpu_to_be32((xdr->p - start - 1) * 4); ++ ++ dprintk("%s: Return delta_space_used %lld err %d\n", __func__, ++ lou.dsu_delta, lou.olu_ioerr_flag); ++} ++ ++static int ++err_prio(u32 oer_errno) ++{ ++ switch (oer_errno) { ++ case 0: ++ return 0; ++ ++ case PNFS_OSD_ERR_RESOURCE: ++ return OSD_ERR_PRI_RESOURCE; ++ case PNFS_OSD_ERR_BAD_CRED: ++ return OSD_ERR_PRI_BAD_CRED; ++ case PNFS_OSD_ERR_NO_ACCESS: ++ return OSD_ERR_PRI_NO_ACCESS; ++ case PNFS_OSD_ERR_UNREACHABLE: 
++ return OSD_ERR_PRI_UNREACHABLE; ++ case PNFS_OSD_ERR_NOT_FOUND: ++ return OSD_ERR_PRI_NOT_FOUND; ++ case PNFS_OSD_ERR_NO_SPACE: ++ return OSD_ERR_PRI_NO_SPACE; ++ default: ++ WARN_ON(1); ++ /* fallthrough */ ++ case PNFS_OSD_ERR_EIO: ++ return OSD_ERR_PRI_EIO; ++ } ++} ++ ++static void ++merge_ioerr(struct pnfs_osd_ioerr *dest_err, ++ const struct pnfs_osd_ioerr *src_err) ++{ ++ u64 dest_end, src_end; ++ ++ if (!dest_err->oer_errno) { ++ *dest_err = *src_err; ++ /* accumulated device must be blank */ ++ memset(&dest_err->oer_component.oid_device_id, 0, ++ sizeof(dest_err->oer_component.oid_device_id)); ++ ++ return; ++ } ++ ++ if (dest_err->oer_component.oid_partition_id != ++ src_err->oer_component.oid_partition_id) ++ dest_err->oer_component.oid_partition_id = 0; ++ ++ if (dest_err->oer_component.oid_object_id != ++ src_err->oer_component.oid_object_id) ++ dest_err->oer_component.oid_object_id = 0; ++ ++ if (dest_err->oer_comp_offset > src_err->oer_comp_offset) ++ dest_err->oer_comp_offset = src_err->oer_comp_offset; ++ ++ dest_end = end_offset(dest_err->oer_comp_offset, ++ dest_err->oer_comp_length); ++ src_end = end_offset(src_err->oer_comp_offset, ++ src_err->oer_comp_length); ++ if (dest_end < src_end) ++ dest_end = src_end; ++ ++ dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset; ++ ++ if ((src_err->oer_iswrite == dest_err->oer_iswrite) && ++ (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) { ++ dest_err->oer_errno = src_err->oer_errno; ++ } else if (src_err->oer_iswrite) { ++ dest_err->oer_iswrite = true; ++ dest_err->oer_errno = src_err->oer_errno; ++ } ++} ++ ++static void ++encode_accumulated_error(struct objlayout *objlay, struct xdr_stream *xdr) ++{ ++ struct objlayout_io_state *state, *tmp; ++ struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; ++ ++ list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { ++ unsigned i; ++ ++ for (i = 0; i < state->num_comps; i++) { ++ struct pnfs_osd_ioerr *ioerr = 
&state->ioerrs[i]; ++ ++ if (!ioerr->oer_errno) ++ continue; ++ ++ printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d " ++ "dev(%llx:%llx) par=0x%llx obj=0x%llx " ++ "offset=0x%llx length=0x%llx\n", ++ __func__, i, ioerr->oer_errno, ++ ioerr->oer_iswrite, ++ _DEVID_LO(&ioerr->oer_component.oid_device_id), ++ _DEVID_HI(&ioerr->oer_component.oid_device_id), ++ ioerr->oer_component.oid_partition_id, ++ ioerr->oer_component.oid_object_id, ++ ioerr->oer_comp_offset, ++ ioerr->oer_comp_length); ++ ++ merge_ioerr(&accumulated_err, ioerr); ++ } ++ list_del(&state->err_list); ++ objlayout_free_io_state(state); ++ } ++ ++ BUG_ON(pnfs_osd_xdr_encode_ioerr(xdr, &accumulated_err)); ++} ++ ++void ++objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutreturn_args *args) ++{ ++ struct objlayout *objlay = OBJLAYOUT(pnfslay); ++ struct objlayout_io_state *state, *tmp; ++ __be32 *start, *uninitialized_var(last_xdr); ++ ++ dprintk("%s: Begin\n", __func__); ++ start = xdr_reserve_space(xdr, 4); ++ BUG_ON(!start); ++ ++ spin_lock(&objlay->lock); ++ ++ list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { ++ unsigned i; ++ int res = 0; ++ ++ for (i = 0; i < state->num_comps && !res; i++) { ++ struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; ++ ++ if (!ioerr->oer_errno) ++ continue; ++ ++ dprintk("%s: err[%d]: errno=%d is_write=%d " ++ "dev(%llx:%llx) par=0x%llx obj=0x%llx " ++ "offset=0x%llx length=0x%llx\n", ++ __func__, i, ioerr->oer_errno, ++ ioerr->oer_iswrite, ++ _DEVID_LO(&ioerr->oer_component.oid_device_id), ++ _DEVID_HI(&ioerr->oer_component.oid_device_id), ++ ioerr->oer_component.oid_partition_id, ++ ioerr->oer_component.oid_object_id, ++ ioerr->oer_comp_offset, ++ ioerr->oer_comp_length); ++ ++ last_xdr = xdr->p; ++ res = pnfs_osd_xdr_encode_ioerr(xdr, &state->ioerrs[i]); ++ } ++ if (unlikely(res)) { ++ /* no space for even one error descriptor */ ++ BUG_ON(last_xdr == start + 1); ++ ++ /* we've 
encountered a situation with lots and lots of ++ * errors and no space to encode them all. Use the last ++ * available slot to report the union of all the ++ * remaining errors. ++ */ ++ xdr_rewind_stream(xdr, last_xdr - ++ pnfs_osd_ioerr_xdr_sz() / 4); ++ encode_accumulated_error(objlay, xdr); ++ goto loop_done; ++ } ++ list_del(&state->err_list); ++ objlayout_free_io_state(state); ++ } ++loop_done: ++ spin_unlock(&objlay->lock); ++ ++ *start = cpu_to_be32((xdr->p - start - 1) * 4); ++ dprintk("%s: Return\n", __func__); ++} ++ ++struct objlayout_deviceinfo { ++ struct page *page; ++ struct pnfs_osd_deviceaddr da; /* This must be last */ ++}; ++ ++/* Initialize and call nfs_getdeviceinfo, then decode and return a ++ * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo() ++ * should be called. ++ */ ++int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, ++ struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr) ++{ ++ struct objlayout_deviceinfo *odi; ++ struct pnfs_device pd; ++ struct super_block *sb; ++ struct page *page; ++ size_t sz; ++ u32 *p; ++ int err; ++ ++ page = alloc_page(GFP_KERNEL); ++ if (!page) ++ return -ENOMEM; ++ ++ pd.area = page_address(page); ++ ++ memcpy(&pd.dev_id, d_id, sizeof(*d_id)); ++ pd.layout_type = LAYOUT_OSD2_OBJECTS; ++ pd.pages = &page; ++ pd.pgbase = 0; ++ pd.pglen = PAGE_SIZE; ++ pd.mincount = 0; ++ ++ sb = pnfslay->inode->i_sb; ++ err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->inode), &pd); ++ dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err); ++ if (err) ++ goto err_out; ++ ++ p = pd.area; ++ sz = pnfs_osd_xdr_deviceaddr_incore_sz(p); ++ odi = kzalloc(sz + (sizeof(*odi) - sizeof(odi->da)), GFP_KERNEL); ++ if (!odi) { ++ err = -ENOMEM; ++ goto err_out; ++ } ++ pnfs_osd_xdr_decode_deviceaddr(&odi->da, p); ++ odi->page = page; ++ *deviceaddr = &odi->da; ++ return 0; ++ ++err_out: ++ __free_page(page); ++ return err; ++} ++ ++void objlayout_put_deviceinfo(struct 
pnfs_osd_deviceaddr *deviceaddr) ++{ ++ struct objlayout_deviceinfo *odi = container_of(deviceaddr, ++ struct objlayout_deviceinfo, ++ da); ++ ++ __free_page(odi->page); ++ kfree(odi); ++} ++ ++/* ++ * Perform the objio specific init_mt method. ++ * Set the layout driver private data pointer for later use. ++ */ ++int ++objlayout_set_layoutdriver(struct nfs_server *server, ++ const struct nfs_fh *mntfh) ++{ ++ void *data; ++ ++ data = objio_init_mt(); ++ if (IS_ERR(data)) { ++ printk(KERN_INFO "%s: objlayout lib not ready err=%ld\n", ++ __func__, PTR_ERR(data)); ++ return PTR_ERR(data); ++ } ++ server->pnfs_ld_data = data; ++ ++ dprintk("%s: Return data=%p\n", __func__, data); ++ return 0; ++} ++ ++/* ++ * Perform the objio specific fini_mt method to release the ++ * layoutdriver private data. ++ */ ++int ++objlayout_clear_layoutdriver(struct nfs_server *server) ++{ ++ dprintk("%s: Begin %p\n", __func__, server->pnfs_ld_data); ++ objio_fini_mt(server->pnfs_ld_data); ++ return 0; ++} +diff -up linux-2.6.37.noarch/fs/nfs/objlayout/objlayout.h.orig linux-2.6.37.noarch/fs/nfs/objlayout/objlayout.h +--- linux-2.6.37.noarch/fs/nfs/objlayout/objlayout.h.orig 2011-01-28 09:43:53.334772561 -0500 ++++ linux-2.6.37.noarch/fs/nfs/objlayout/objlayout.h 2011-01-28 09:43:53.334772561 -0500 +@@ -0,0 +1,206 @@ ++/* ++ * objlayout.h ++ * ++ * Data types and function declerations for interfacing with the ++ * pNFS standard object layout driver. ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * Boaz Harrosh ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. 
Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/*
 * per-inode layout
 *
 * Wraps the generic pnfs_layout_hdr; OBJLAYOUT() converts a header pointer
 * back to this structure with container_of().
 */
struct objlayout {
	/* generic pNFS layout header handed to the pNFS client core */
	struct pnfs_layout_hdr pnfs_layout;

	 /* for layout_commit */
	/* State of delta_space_used:
	 *   OBJ_DSU_INIT    - nothing accumulated since the last report
	 *   OBJ_DSU_VALID   - delta_space_used holds a reportable value
	 *   OBJ_DSU_INVALID - an I/O failed; the value must not be reported
	 */
	enum osd_delta_space_valid_enum {
		OBJ_DSU_INIT = 0,
		OBJ_DSU_VALID,
		OBJ_DSU_INVALID,
	} delta_space_valid;
	s64 delta_space_used;  /* consumed by write ops */

	 /* for layout_return */
	/* taken around updates of delta_space_* and err_list */
	spinlock_t lock;
	/* failed I/O states kept for error reporting at layout-return */
	struct list_head err_list;
};
++ */ ++ struct pnfs_osd_ioerr *ioerrs; ++}; ++ ++/* ++ * Raid engine I/O API ++ */ ++extern void *objio_init_mt(void); ++extern void objio_fini_mt(void *mt); ++ ++extern int objio_alloc_lseg(void **outp, ++ struct pnfs_layout_hdr *pnfslay, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_osd_layout *layout); ++extern void objio_free_lseg(void *p); ++ ++extern int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp); ++extern void objio_free_io_state(struct objlayout_io_state *state); ++ ++extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state); ++extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, ++ bool stable); ++ ++/* ++ * callback API ++ */ ++extern void objlayout_io_set_result(struct objlayout_io_state *state, ++ unsigned index, int osd_error, ++ u64 offset, u64 length, bool is_write); ++ ++static inline void ++objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used) ++{ ++ struct objlayout *objlay = OBJLAYOUT(state->objlseg->lseg.layout); ++ ++ /* If one of the I/Os errored out and the delta_space_used was ++ * invalid we render the complete report as invalid. Protocol mandate ++ * the DSU be accurate or not reported. 
++ */ ++ spin_lock(&objlay->lock); ++ if (objlay->delta_space_valid != OBJ_DSU_INVALID) { ++ objlay->delta_space_valid = OBJ_DSU_VALID; ++ objlay->delta_space_used += space_used; ++ } ++ spin_unlock(&objlay->lock); ++} ++ ++extern void objlayout_read_done(struct objlayout_io_state *state, ++ ssize_t status, bool sync); ++extern void objlayout_write_done(struct objlayout_io_state *state, ++ ssize_t status, bool sync); ++ ++extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, ++ struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr); ++extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr); ++ ++/* ++ * exported generic objects function vectors ++ */ ++ ++extern int objlayout_set_layoutdriver( ++ struct nfs_server *, ++ const struct nfs_fh *); ++extern int objlayout_clear_layoutdriver(struct nfs_server *); ++ ++extern struct pnfs_layout_hdr *objlayout_alloc_layout_hdr(struct inode *); ++extern void objlayout_free_layout_hdr(struct pnfs_layout_hdr *); ++ ++extern struct pnfs_layout_segment *objlayout_alloc_lseg( ++ struct pnfs_layout_hdr *, ++ struct nfs4_layoutget_res *); ++extern void objlayout_free_lseg(struct pnfs_layout_segment *); ++ ++extern enum pnfs_try_status objlayout_read_pagelist( ++ struct nfs_read_data *, ++ unsigned nr_pages); ++ ++extern enum pnfs_try_status objlayout_write_pagelist( ++ struct nfs_write_data *, ++ unsigned nr_pages, ++ int how); ++ ++extern enum pnfs_try_status objlayout_commit( ++ struct nfs_write_data *, ++ int how); ++ ++extern void objlayout_encode_layoutcommit( ++ struct pnfs_layout_hdr *, ++ struct xdr_stream *, ++ const struct nfs4_layoutcommit_args *); ++ ++extern void objlayout_encode_layoutreturn( ++ struct pnfs_layout_hdr *, ++ struct xdr_stream *, ++ const struct nfs4_layoutreturn_args *); ++ ++#endif /* _OBJLAYOUT_H */ +diff -up linux-2.6.37.noarch/fs/nfs/objlayout/panfs_shim.c.orig linux-2.6.37.noarch/fs/nfs/objlayout/panfs_shim.c +--- 
linux-2.6.37.noarch/fs/nfs/objlayout/panfs_shim.c.orig 2011-01-28 09:43:53.335772417 -0500 ++++ linux-2.6.37.noarch/fs/nfs/objlayout/panfs_shim.c 2011-01-28 09:43:53.335772417 -0500 +@@ -0,0 +1,702 @@ ++/* ++ * panfs_shim.c ++ * ++ * Shim layer for interfacing with the Panasas DirectFlow module I/O stack ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * See the file COPYING included with this distribution for more details. 
++ * ++ */ ++ ++#include ++#include ++#include ++ ++#include "objlayout.h" ++#include "panfs_shim.h" ++ ++#include ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++struct panfs_export_operations *panfs_export_ops; ++ ++void * ++objio_init_mt(void) ++{ ++ return panfs_export_ops == NULL ? ERR_PTR(-EAGAIN) : NULL; ++} ++ ++void objio_fini_mt(void *mountid) ++{ ++} ++ ++static int ++panfs_shim_conv_raid01(struct pnfs_osd_layout *layout, ++ struct pnfs_osd_data_map *lo_map, ++ pan_agg_layout_hdr_t *hdr) ++{ ++ if (lo_map->odm_mirror_cnt) { ++ hdr->type = PAN_AGG_RAID1; ++ hdr->hdr.raid1.num_comps = lo_map->odm_mirror_cnt + 1; ++ } else if (layout->olo_num_comps > 1) { ++ hdr->type = PAN_AGG_RAID0; ++ hdr->hdr.raid0.num_comps = layout->olo_num_comps; ++ hdr->hdr.raid0.stripe_unit = lo_map->odm_stripe_unit; ++ } else ++ hdr->type = PAN_AGG_SIMPLE; ++ return 0; ++} ++ ++static int ++panfs_shim_conv_raid5(struct pnfs_osd_layout *layout, ++ struct pnfs_osd_data_map *lo_map, ++ pan_agg_layout_hdr_t *hdr) ++{ ++ if (lo_map->odm_mirror_cnt) ++ goto err; ++ ++ if (lo_map->odm_group_width || lo_map->odm_group_depth) { ++ if (!lo_map->odm_group_width || !lo_map->odm_group_depth) ++ goto err; ++ ++ hdr->type = PAN_AGG_GRP_RAID5_LEFT; ++ hdr->hdr.grp_raid5_left.num_comps = lo_map->odm_num_comps; ++ if (hdr->hdr.grp_raid5_left.num_comps != lo_map->odm_num_comps) ++ goto err; ++ hdr->hdr.grp_raid5_left.stripe_unit = lo_map->odm_stripe_unit; ++ hdr->hdr.grp_raid5_left.rg_width = lo_map->odm_group_width; ++ hdr->hdr.grp_raid5_left.rg_depth = lo_map->odm_group_depth; ++ /* this is a guess, panasas server is not supposed to ++ hand out layotu otherwise */ ++ hdr->hdr.grp_raid5_left.group_layout_policy = ++ PAN_AGG_GRP_RAID5_LEFT_POLICY_ROUND_ROBIN; ++ } else { ++ hdr->type = PAN_AGG_RAID5_LEFT; ++ hdr->hdr.raid5_left.num_comps = lo_map->odm_num_comps; ++ if (hdr->hdr.raid5_left.num_comps != lo_map->odm_num_comps) ++ goto err; ++ hdr->hdr.raid5_left.stripe_unit2 = ++ 
hdr->hdr.raid5_left.stripe_unit1 = ++ hdr->hdr.raid5_left.stripe_unit0 = lo_map->odm_stripe_unit; ++ } ++ ++ return 0; ++err: ++ return -EINVAL; ++} ++ ++/* ++ * Convert a pnfs_osd data map into Panasas aggregation layout header ++ */ ++static int ++panfs_shim_conv_pnfs_osd_data_map( ++ struct pnfs_osd_layout *layout, ++ pan_agg_layout_hdr_t *hdr) ++{ ++ int status = -EINVAL; ++ struct pnfs_osd_data_map *lo_map = &layout->olo_map; ++ ++ if (!layout->olo_num_comps) { ++ dprintk("%s: !!layout.n_comps(%u)\n", __func__, ++ layout->olo_num_comps); ++ goto err; ++ } ++ ++ switch (lo_map->odm_raid_algorithm) { ++ case PNFS_OSD_RAID_0: ++ if (layout->olo_num_comps != lo_map->odm_num_comps || ++ layout->olo_comps_index) { ++ dprintk("%s: !!PNFS_OSD_RAID_0 " ++ "layout.n_comps(%u) map.n_comps(%u) " ++ "comps_index(%u)\n", __func__, ++ layout->olo_num_comps, ++ lo_map->odm_num_comps, ++ layout->olo_comps_index); ++ goto err; ++ } ++ status = panfs_shim_conv_raid01(layout, lo_map, hdr); ++ break; ++ ++ case PNFS_OSD_RAID_5: ++ if (!lo_map->odm_group_width) { ++ if (layout->olo_num_comps != lo_map->odm_num_comps || ++ layout->olo_comps_index) { ++ dprintk("%s: !!PNFS_OSD_RAID_5 !group_width " ++ "layout.n_comps(%u)!=map.n_comps(%u) " ++ "|| comps_index(%u)\n", __func__, ++ layout->olo_num_comps, ++ lo_map->odm_num_comps, ++ layout->olo_comps_index); ++ goto err; ++ } ++ } else if ((layout->olo_num_comps != lo_map->odm_num_comps && ++ layout->olo_num_comps > lo_map->odm_group_width) || ++ (layout->olo_comps_index % lo_map->odm_group_width)){ ++ dprintk("%s: !!PNFS_OSD_RAID_5 group_width(%u) " ++ "layout.n_comps(%u) map.n_comps(%u) " ++ "comps_index(%u)\n", __func__, ++ lo_map->odm_group_width, ++ layout->olo_num_comps, ++ lo_map->odm_num_comps, ++ layout->olo_comps_index); ++ goto err; ++ } ++ status = panfs_shim_conv_raid5(layout, lo_map, hdr); ++ break; ++ ++ case PNFS_OSD_RAID_4: ++ case PNFS_OSD_RAID_PQ: ++ default: ++ dprintk("%s: !!PNFS_OSD_RAID_(%d)\n", __func__, ++ 
lo_map->odm_raid_algorithm); ++ goto err; ++ } ++ ++ return status; /* hand back the raid01/raid5 converter's result instead of a fixed 0 */ ++ ++err: ++ return status; ++} ++ ++/* ++ * Convert pnfs_osd layout into Panasas map and caps type ++ */ ++int ++objio_alloc_lseg(void **outp, ++ struct pnfs_layout_hdr *pnfslay, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_osd_layout *layout) ++{ ++ int i, total_comps; ++ int status; ++ struct pnfs_osd_object_cred *lo_comp; ++ pan_size_t alloc_sz, local_sz; ++ pan_sm_map_cap_t *mcs = NULL; ++ u8 *buf; ++ pan_agg_comp_obj_t *pan_comp; ++ pan_sm_sec_t *pan_sec; ++ ++ status = -EINVAL; ++ if (layout->olo_num_comps < layout->olo_map.odm_group_width) { ++ total_comps = layout->olo_comps_index + layout->olo_num_comps; ++ } else { ++ /* allocate full map, otherwise SAM gets confused */ ++ total_comps = layout->olo_map.odm_num_comps; ++ } ++ alloc_sz = total_comps * ++ (sizeof(pan_agg_comp_obj_t) + sizeof(pan_sm_sec_t)); ++ for (i = 0; i < layout->olo_num_comps; i++) { ++ void *p = layout->olo_comps[i].oc_cap.cred; ++ if (panfs_export_ops->sm_sec_t_get_size_otw( ++ (pan_sm_sec_otw_t *)&p, &local_sz, NULL, NULL)) ++ goto err; ++ alloc_sz += local_sz; ++ } ++ ++ status = -ENOMEM; ++ mcs = kzalloc(sizeof(*mcs) + alloc_sz, GFP_KERNEL); ++ if (!mcs) ++ goto err; ++ buf = (u8 *)&mcs[1]; ++ ++ mcs->offset = lseg->range.offset; ++ mcs->length = lseg->range.length; ++#if 0 ++ /* FIXME: for now */ ++ mcs->expiration_time.ts_sec = 0; ++ mcs->expiration_time.ts_nsec = 0; ++#endif ++ mcs->full_map.map_hdr.avail_state = PAN_AGG_OBJ_STATE_NORMAL; ++ status = panfs_shim_conv_pnfs_osd_data_map(layout, ++ &mcs->full_map.layout_hdr); ++ if (status) ++ goto err; ++ ++ mcs->full_map.components.size = total_comps; ++ mcs->full_map.components.data = (pan_agg_comp_obj_t *)buf; ++ buf += total_comps * sizeof(pan_agg_comp_obj_t); ++ ++ mcs->secs.size = total_comps; ++ mcs->secs.data = (pan_sm_sec_t *)buf; ++ buf += total_comps * sizeof(pan_sm_sec_t); ++ ++ lo_comp = layout->olo_comps; ++ pan_comp = mcs->full_map.components.data 
+ layout->olo_comps_index; ++ pan_sec = mcs->secs.data + layout->olo_comps_index; ++ for (i = 0; i < layout->olo_num_comps; i++) { ++ void *p; ++ pan_stor_obj_id_t *obj_id = &mcs->full_map.map_hdr.obj_id; ++ struct pnfs_osd_objid *oc_obj_id = &lo_comp->oc_object_id; ++ u64 dev_id = __be64_to_cpup( ++ (__be64 *)oc_obj_id->oid_device_id.data + 1); ++ ++ dprintk("%s: i=%d deviceid=%Lx:%Lx partition=%Lx object=%Lx\n", ++ __func__, i, ++ __be64_to_cpup((__be64 *)oc_obj_id->oid_device_id.data), ++ __be64_to_cpup((__be64 *)oc_obj_id->oid_device_id.data + 1), ++ oc_obj_id->oid_partition_id, oc_obj_id->oid_object_id); ++ ++ if (i == 0) { ++ /* make up mgr_id to calm sam down */ ++ pan_mgr_id_construct_artificial(PAN_MGR_SM, 0, ++ &obj_id->dev_id); ++ obj_id->grp_id = oc_obj_id->oid_partition_id; ++ obj_id->obj_id = oc_obj_id->oid_object_id; ++ } ++ ++ if (obj_id->grp_id != lo_comp->oc_object_id.oid_partition_id) { ++ dprintk("%s: i=%d grp_id=0x%Lx oid_partition_id=0x%Lx\n", ++ __func__, i, (u64)obj_id->grp_id, ++ lo_comp->oc_object_id.oid_partition_id); ++ status = -EINVAL; ++ goto err; ++ } ++ ++ if (obj_id->obj_id != lo_comp->oc_object_id.oid_object_id) { ++ dprintk("%s: i=%d obj_id=0x%Lx oid_object_id=0x%Lx\n", ++ __func__, i, obj_id->obj_id, ++ lo_comp->oc_object_id.oid_object_id); ++ status = -EINVAL; ++ goto err; ++ } ++ ++ pan_comp->dev_id = dev_id; ++ if (!pan_stor_is_device_id_an_obsd_id(pan_comp->dev_id)) { ++ dprintk("%s: i=%d dev_id=0x%Lx not an obsd_id\n", ++ __func__, i, pan_comp->dev_id); /* log the id that failed the check, not the artificial map id */ ++ status = -EINVAL; ++ goto err; ++ } ++ if (lo_comp->oc_osd_version == PNFS_OSD_MISSING) { ++ dprintk("%s: degraded maps not supported yet\n", ++ __func__); ++ status = -ENOTSUPP; ++ goto err; ++ } ++ pan_comp->avail_state = PAN_AGG_COMP_STATE_NORMAL; ++ if (lo_comp->oc_cap_key_sec != PNFS_OSD_CAP_KEY_SEC_NONE) { ++ dprintk("%s: cap key security not supported yet\n", ++ __func__); ++ status = -ENOTSUPP; ++ goto err; ++ } ++ ++ p = lo_comp->oc_cap.cred; ++ 
panfs_export_ops->sm_sec_t_unmarshall( ++ (pan_sm_sec_otw_t *)&p, ++ pan_sec, ++ buf, ++ alloc_sz, ++ NULL, ++ &local_sz); ++ buf += local_sz; ++ alloc_sz -= local_sz; ++ ++ lo_comp++; ++ pan_comp++; ++ pan_sec++; ++ } ++ ++ *outp = mcs; ++ dprintk("%s:Return mcs=%p\n", __func__, mcs); ++ return 0; ++ ++err: ++ objio_free_lseg(mcs); ++ dprintk("%s:Error %d\n", __func__, status); ++ return status; ++} ++ ++/* ++ * Free a Panasas map and caps type ++ */ ++void ++objio_free_lseg(void *p) ++{ ++ kfree(p); ++} ++ ++/* ++ * I/O routines ++ */ ++int ++objio_alloc_io_state(void *seg, struct objlayout_io_state **outp) ++{ ++ struct panfs_shim_io_state *p; ++ ++ dprintk("%s: allocating io_state\n", __func__); ++ p = kzalloc(sizeof(*p), GFP_KERNEL); ++ if (!p) ++ return -ENOMEM; ++ ++ *outp = &p->ol_state; ++ return 0; ++} ++ ++/* ++ * Free an I/O state ++ */ ++void ++objio_free_io_state(struct objlayout_io_state *ol_state) ++{ ++ struct panfs_shim_io_state *state = container_of(ol_state, ++ struct panfs_shim_io_state, ol_state); ++ int i; ++ ++ dprintk("%s: freeing io_state\n", __func__); ++ for (i = 0; i < state->ol_state.nr_pages; i++) ++ kunmap(state->ol_state.pages[i]); ++ ++ if (state->ucreds) ++ panfs_export_ops->ucreds_put(state->ucreds); ++ kfree(state->sg_list); ++ kfree(state); ++} ++ ++static int ++panfs_shim_pages_to_sg( ++ struct panfs_shim_io_state *state, ++ struct page **pages, ++ unsigned int pgbase, ++ unsigned nr_pages, ++ size_t count) ++{ ++ unsigned i, n; ++ pan_sg_entry_t *sg; ++ ++ dprintk("%s pgbase %u nr_pages %u count %d " ++ "pg0 %p flags 0x%x index %llu\n", ++ __func__, pgbase, nr_pages, (int)count, pages[0], ++ (unsigned)pages[0]->flags, (unsigned long long)pages[0]->index); ++ ++ sg = kmalloc(nr_pages * sizeof(*sg), GFP_KERNEL); ++ if (sg == NULL) ++ return -ENOMEM; ++ ++ dprintk("%s sg_list %p pages %p pgbase %u nr_pages %u\n", ++ __func__, sg, pages, pgbase, nr_pages); ++ ++ for (i = 0; i < nr_pages; i++) { ++ sg[i].buffer = (char 
*)kmap(pages[i]) + pgbase; ++ n = PAGE_SIZE - pgbase; ++ pgbase = 0; ++ if (n > count) ++ n = count; ++ sg[i].chunk_size = n; ++ count -= n; ++ if (likely(count)) { ++ sg[i].next = &sg[i+1]; ++ } else { ++ /* we're done */ ++ sg[i].next = NULL; ++ break; ++ } ++ } ++ BUG_ON(count); ++ ++ state->sg_list = sg; ++ return 0; ++} ++ ++/* ++ * Callback function for async reads ++ */ ++static void ++panfs_shim_read_done( ++ void *arg1, ++ void *arg2, ++ pan_sam_read_res_t *res_p, ++ pan_status_t rc) ++{ ++ struct panfs_shim_io_state *state = arg1; ++ ssize_t status; ++ ++ dprintk("%s: Begin\n", __func__); ++ if (!res_p) ++ res_p = &state->u.read.res; ++ if (rc == PAN_SUCCESS) ++ rc = res_p->result; ++ if (rc == PAN_SUCCESS) { ++ status = res_p->length; ++ WARN_ON(status < 0); ++ } else { ++ status = -panfs_export_ops->convert_rc(rc); ++ dprintk("%s: pan_sam_read rc %d: status %Zd\n", ++ __func__, rc, status); ++ } ++ dprintk("%s: Return status %Zd rc %d\n", __func__, status, rc); ++ objlayout_read_done(&state->ol_state, status, true); ++} ++ ++ssize_t ++objio_read_pagelist(struct objlayout_io_state *ol_state) ++{ ++ struct panfs_shim_io_state *state = container_of(ol_state, ++ struct panfs_shim_io_state, ol_state); ++ pan_sm_map_cap_t *mcs = (pan_sm_map_cap_t *)ol_state->objlseg->internal; ++ ssize_t status = 0; ++ pan_status_t rc = PAN_SUCCESS; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ status = panfs_shim_pages_to_sg(state, ol_state->pages, ++ ol_state->pgbase, ol_state->nr_pages, ++ ol_state->count); ++ if (unlikely(status)) ++ goto err; ++ ++ state->obj_sec.min_security = 0; ++ state->obj_sec.map_ccaps = mcs; ++ ++ rc = panfs_export_ops->ucreds_get(&state->ucreds); ++ if (unlikely(rc)) { ++ status = -EACCES; ++ goto err; ++ } ++ ++ state->u.read.args.obj_id = mcs->full_map.map_hdr.obj_id; ++ state->u.read.args.offset = ol_state->offset; ++ rc = panfs_export_ops->sam_read(PAN_SAM_ACCESS_BYPASS_TIMESTAMP, ++ &state->u.read.args, ++ &state->obj_sec, ++ state->sg_list, 
++ state->ucreds, ++ ol_state->sync ? ++ NULL : panfs_shim_read_done, ++ state, NULL, ++ &state->u.read.res); ++ if (rc != PAN_ERR_IN_PROGRESS) ++ panfs_shim_read_done(state, NULL, &state->u.read.res, rc); ++ err: ++ dprintk("%s: Return %Zd\n", __func__, status); ++ return status; ++} ++ ++/* ++ * Callback function for async writes ++ */ ++static void ++panfs_shim_write_done( ++ void *arg1, ++ void *arg2, ++ pan_sam_write_res_t *res_p, ++ pan_status_t rc) ++{ ++ struct panfs_shim_io_state *state = arg1; ++ ssize_t status; ++ ++ dprintk("%s: Begin\n", __func__); ++ if (!res_p) ++ res_p = &state->u.write.res; ++ if (rc == PAN_SUCCESS) ++ rc = res_p->result; ++ if (rc == PAN_SUCCESS) { ++/* state->ol_state.committed = NFS_FILE_SYNC;*/ ++ state->ol_state.committed = NFS_UNSTABLE; ++ status = res_p->length; ++ WARN_ON(status < 0); ++ ++ objlayout_add_delta_space_used(&state->ol_state, ++ res_p->delta_capacity_used); ++ } else { ++ status = -panfs_export_ops->convert_rc(rc); ++ dprintk("%s: pan_sam_write rc %u: status %Zd\n", ++ __func__, rc, status); ++ } ++ dprintk("%s: Return status %Zd rc %d\n", __func__, status, rc); ++ objlayout_write_done(&state->ol_state, status, true); ++} ++ ++ssize_t ++objio_write_pagelist(struct objlayout_io_state *ol_state, ++ bool stable /* unused, PanOSD writes are stable */) ++{ ++ struct panfs_shim_io_state *state = container_of(ol_state, ++ struct panfs_shim_io_state, ol_state); ++ pan_sm_map_cap_t *mcs = (pan_sm_map_cap_t *)ol_state->objlseg->internal; ++ ssize_t status = 0; ++ pan_status_t rc = PAN_SUCCESS; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ status = panfs_shim_pages_to_sg(state, ol_state->pages, ++ ol_state->pgbase, ol_state->nr_pages, ++ ol_state->count); ++ if (unlikely(status)) ++ goto err; ++ ++ state->obj_sec.min_security = 0; ++ state->obj_sec.map_ccaps = mcs; ++ ++ rc = panfs_export_ops->ucreds_get(&state->ucreds); ++ if (unlikely(rc)) { ++ status = -EACCES; ++ goto err; ++ } ++ ++ state->u.write.args.obj_id = 
mcs->full_map.map_hdr.obj_id; ++ state->u.write.args.offset = ol_state->offset; ++ rc = panfs_export_ops->sam_write(PAN_SAM_ACCESS_NONE, ++ &state->u.write.args, ++ &state->obj_sec, ++ state->sg_list, ++ state->ucreds, ++ ol_state->sync ? ++ NULL : panfs_shim_write_done, ++ state, ++ NULL, ++ &state->u.write.res); ++ if (rc != PAN_ERR_IN_PROGRESS) ++ panfs_shim_write_done(state, NULL, &state->u.write.res, rc); ++ err: ++ dprintk("%s: Return %Zd\n", __func__, status); ++ return status; ++} ++ ++int ++panfs_shim_register(struct panfs_export_operations *ops) ++{ ++ if (panfs_export_ops) { ++ printk(KERN_INFO ++ "%s: panfs already registered (panfs ops %p)\n", ++ __func__, panfs_export_ops); ++ return -EINVAL; ++ } ++ ++ printk(KERN_INFO "%s: registering panfs ops %p\n", ++ __func__, ops); ++ ++ panfs_export_ops = ops; ++ return 0; ++} ++EXPORT_SYMBOL(panfs_shim_register); ++ ++int ++panfs_shim_unregister(void) ++{ ++ if (!panfs_export_ops) { ++ printk(KERN_INFO "%s: panfs is not registered\n", __func__); ++ return -EINVAL; ++ } ++ ++ printk(KERN_INFO "%s: unregistering panfs ops %p\n", ++ __func__, panfs_export_ops); ++ ++ panfs_export_ops = NULL; ++ return 0; ++} ++EXPORT_SYMBOL(panfs_shim_unregister); ++ ++/* ++ * Policy Operations ++ */ ++ ++#define PANLAYOUT_DEF_STRIPE_UNIT (64*1024) ++#define PANLAYOUT_DEF_STRIPE_WIDTH 9 ++#define PANLAYOUT_MAX_STRIPE_WIDTH 11 ++#define PANLAYOUT_MAX_GATHER_STRIPES 8 ++ ++/* ++ * Get the max [rw]size ++ */ ++static ssize_t ++panlayout_get_blocksize(void) ++{ ++ ssize_t sz = (PANLAYOUT_MAX_STRIPE_WIDTH-1) * ++ PANLAYOUT_DEF_STRIPE_UNIT * ++ PANLAYOUT_MAX_GATHER_STRIPES; ++ dprintk("%s: Return %Zd\n", __func__, sz); ++ return sz; ++} ++ ++/* ++ * Don't gather across stripes, but rather gather (coalesce) up to ++ * the stripe size. 
++ * ++ * FIXME: change interface to use merge_align, merge_count ++ */ ++#define PNFS_LAYOUT_PANOSD (NFS4_PNFS_PRIVATE_LAYOUT | LAYOUT_OSD2_OBJECTS) ++ ++static struct pnfs_layoutdriver_type panlayout_type = { ++ .id = PNFS_LAYOUT_PANOSD, ++ .name = "PNFS_LAYOUT_PANOSD", ++ .flags = PNFS_LAYOUTRET_ON_SETATTR, ++ ++ .set_layoutdriver = objlayout_set_layoutdriver, ++ .clear_layoutdriver = objlayout_clear_layoutdriver, ++ ++ .alloc_layout_hdr = objlayout_alloc_layout_hdr, ++ .free_layout_hdr = objlayout_free_layout_hdr, ++ ++ .alloc_lseg = objlayout_alloc_lseg, ++ .free_lseg = objlayout_free_lseg, ++ ++ .get_blocksize = panlayout_get_blocksize, ++ ++ .read_pagelist = objlayout_read_pagelist, ++ .write_pagelist = objlayout_write_pagelist, ++ .commit = objlayout_commit, ++ ++ .encode_layoutcommit = objlayout_encode_layoutcommit, ++ .encode_layoutreturn = objlayout_encode_layoutreturn, ++}; ++ ++MODULE_DESCRIPTION("pNFS Layout Driver for Panasas OSDs"); ++MODULE_AUTHOR("Benny Halevy "); ++MODULE_LICENSE("GPL"); ++ ++static int __init ++panlayout_init(void) ++{ ++ int ret = pnfs_register_layoutdriver(&panlayout_type); ++ ++ if (ret) ++ printk(KERN_INFO ++ "%s: Registering Panasas OSD pNFS Layout Driver failed: error=%d\n", ++ __func__, ret); ++ else ++ printk(KERN_INFO "%s: Registered Panasas OSD pNFS Layout Driver\n", ++ __func__); ++ return ret; ++} ++ ++static void __exit ++panlayout_exit(void) ++{ ++ pnfs_unregister_layoutdriver(&panlayout_type); ++ printk(KERN_INFO "%s: Unregistered Panasas OSD pNFS Layout Driver\n", ++ __func__); ++} ++ ++module_init(panlayout_init); ++module_exit(panlayout_exit); +diff -up linux-2.6.37.noarch/fs/nfs/objlayout/panfs_shim.h.orig linux-2.6.37.noarch/fs/nfs/objlayout/panfs_shim.h +--- linux-2.6.37.noarch/fs/nfs/objlayout/panfs_shim.h.orig 2011-01-28 09:43:53.336772273 -0500 ++++ linux-2.6.37.noarch/fs/nfs/objlayout/panfs_shim.h 2011-01-28 09:43:53.336772273 -0500 +@@ -0,0 +1,482 @@ ++/* ++ * panfs_shim.h ++ * ++ * Data types and 
external function declarations for interfacing with ++ * panfs (Panasas DirectFlow) I/O stack ++ * ++ * Copyright (C) 2007 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * See the file COPYING included with this distribution for more details. 
++ * ++ */ ++ ++#ifndef _PANLAYOUT_PANFS_SHIM_H ++#define _PANLAYOUT_PANFS_SHIM_H ++ ++typedef s8 pan_int8_t; ++typedef u8 pan_uint8_t; ++typedef s16 pan_int16_t; ++typedef u16 pan_uint16_t; ++typedef s32 pan_int32_t; ++typedef u32 pan_uint32_t; ++typedef s64 pan_int64_t; ++typedef u64 pan_uint64_t; ++ ++/* ++ * from pan_base_types.h ++ */ ++typedef pan_uint64_t pan_rpc_none_t; ++typedef pan_uint32_t pan_rpc_arrdim_t; ++typedef pan_uint32_t pan_status_t; ++typedef pan_uint8_t pan_otw_t; ++typedef pan_uint8_t pan_pad_t; ++ ++typedef pan_uint32_t pan_timespec_sec_t; ++typedef pan_uint32_t pan_timespec_nsec_t; ++ ++typedef struct pan_timespec_s pan_timespec_t; ++struct pan_timespec_s { ++ pan_timespec_sec_t ts_sec; ++ pan_timespec_nsec_t ts_nsec; ++}; ++ ++/* ++ * from pan_std_types.h ++ */ ++typedef pan_uint32_t pan_size_t; ++typedef int pan_bool_t; ++ ++/* ++ * from pan_common_error.h ++ */ ++#define PAN_SUCCESS ((pan_status_t)0) ++#define PAN_ERR_IN_PROGRESS ((pan_status_t)55) ++ ++/* ++ * from pan_sg.h ++ */ ++typedef struct pan_sg_entry_s pan_sg_entry_t; ++struct pan_sg_entry_s { ++ void *buffer; /* pointer to memory */ ++ pan_uint32_t chunk_size; /* size of each chunk (bytes) */ ++ pan_sg_entry_t *next; ++}; ++ ++/* ++ * from pan_storage.h ++ */ ++typedef pan_uint64_t pan_stor_dev_id_t; ++typedef pan_uint32_t pan_stor_obj_grp_id_t; ++typedef pan_uint64_t pan_stor_obj_uniq_t; ++typedef pan_uint32_t pan_stor_action_t; ++typedef pan_uint8_t pan_stor_cap_key_t[20]; ++ ++typedef pan_uint8_t pan_stor_key_type_t; ++typedef pan_uint64_t pan_stor_len_t; ++typedef pan_int64_t pan_stor_delta_len_t; ++typedef pan_uint64_t pan_stor_offset_t; ++typedef pan_uint16_t pan_stor_op_t; ++ ++typedef pan_uint16_t pan_stor_sec_level_t; ++ ++struct pan_stor_obj_id_s { ++ pan_stor_dev_id_t dev_id; ++ pan_stor_obj_uniq_t obj_id; ++ pan_stor_obj_grp_id_t grp_id; ++}; ++ ++typedef struct pan_stor_obj_id_s pan_stor_obj_id_t; ++ ++#define PAN_STOR_OP_NONE ((pan_stor_op_t) 0U) ++#define 
PAN_STOR_OP_READ ((pan_stor_op_t) 8U) ++#define PAN_STOR_OP_WRITE ((pan_stor_op_t) 9U) ++#define PAN_STOR_OP_APPEND ((pan_stor_op_t) 10U) ++#define PAN_STOR_OP_GETATTR ((pan_stor_op_t) 11U) ++#define PAN_STOR_OP_SETATTR ((pan_stor_op_t) 12U) ++#define PAN_STOR_OP_FLUSH ((pan_stor_op_t) 13U) ++#define PAN_STOR_OP_CLEAR ((pan_stor_op_t) 14U) ++ ++/* ++ * from pan_aggregation_map.h ++ */ ++typedef pan_uint8_t pan_agg_type_t; ++typedef pan_uint64_t pan_agg_map_version_t; ++typedef pan_uint8_t pan_agg_obj_state_t; ++typedef pan_uint8_t pan_agg_comp_state_t; ++typedef pan_uint8_t pan_agg_comp_flag_t; ++ ++#define PAN_AGG_OBJ_STATE_INVALID ((pan_agg_obj_state_t) 0x00) ++#define PAN_AGG_OBJ_STATE_NORMAL ((pan_agg_obj_state_t) 0x01) ++#define PAN_AGG_OBJ_STATE_DEGRADED ((pan_agg_obj_state_t) 0x02) ++#define PAN_AGG_OBJ_STATE_RECONSTRUCT ((pan_agg_obj_state_t) 0x03) ++#define PAN_AGG_OBJ_STATE_COPYBACK ((pan_agg_obj_state_t) 0x04) ++#define PAN_AGG_OBJ_STATE_UNAVAILABLE ((pan_agg_obj_state_t) 0x05) ++#define PAN_AGG_OBJ_STATE_CREATING ((pan_agg_obj_state_t) 0x06) ++#define PAN_AGG_OBJ_STATE_DELETED ((pan_agg_obj_state_t) 0x07) ++#define PAN_AGG_COMP_STATE_INVALID ((pan_agg_comp_state_t) 0x00) ++#define PAN_AGG_COMP_STATE_NORMAL ((pan_agg_comp_state_t) 0x01) ++#define PAN_AGG_COMP_STATE_UNAVAILABLE ((pan_agg_comp_state_t) 0x02) ++#define PAN_AGG_COMP_STATE_COPYBACK ((pan_agg_comp_state_t) 0x03) ++#define PAN_AGG_COMP_F_NONE ((pan_agg_comp_flag_t) 0x00) ++#define PAN_AGG_COMP_F_ATTR_STORING ((pan_agg_comp_flag_t) 0x01) ++#define PAN_AGG_COMP_F_OBJ_CORRUPT_OBS ((pan_agg_comp_flag_t) 0x02) ++#define PAN_AGG_COMP_F_TEMP ((pan_agg_comp_flag_t) 0x04) ++ ++struct pan_aggregation_map_s { ++ pan_agg_map_version_t version; ++ pan_agg_obj_state_t avail_state; ++ pan_stor_obj_id_t obj_id; ++}; ++ ++typedef struct pan_aggregation_map_s pan_aggregation_map_t; ++ ++struct pan_agg_comp_obj_s { ++ pan_stor_dev_id_t dev_id; ++ pan_agg_comp_state_t avail_state; ++ pan_agg_comp_flag_t 
comp_flags; ++}; ++ ++typedef struct pan_agg_comp_obj_s pan_agg_comp_obj_t; ++ ++struct pan_agg_simple_header_s { ++ pan_uint8_t unused; ++}; ++ ++typedef struct pan_agg_simple_header_s pan_agg_simple_header_t; ++ ++struct pan_agg_raid1_header_s { ++ pan_uint16_t num_comps; ++}; ++ ++typedef struct pan_agg_raid1_header_s pan_agg_raid1_header_t; ++ ++struct pan_agg_raid0_header_s { ++ pan_uint16_t num_comps; ++ pan_uint32_t stripe_unit; ++}; ++ ++typedef struct pan_agg_raid0_header_s pan_agg_raid0_header_t; ++ ++struct pan_agg_raid5_left_header_s { ++ pan_uint16_t num_comps; ++ pan_uint32_t stripe_unit0; ++ pan_uint32_t stripe_unit1; ++ pan_uint32_t stripe_unit2; ++}; ++ ++typedef struct pan_agg_raid5_left_header_s pan_agg_raid5_left_header_t; ++ ++typedef struct pan_agg_grp_raid5_left_header_s pan_agg_grp_raid5_left_header_t; ++ ++struct pan_agg_grp_raid5_left_header_s { ++ pan_uint16_t num_comps; ++ pan_uint32_t stripe_unit; ++ pan_uint16_t rg_width; ++ pan_uint16_t rg_depth; ++ pan_uint8_t group_layout_policy; ++}; ++ ++#define PAN_AGG_GRP_RAID5_LEFT_POLICY_INVALID ((pan_uint8_t) 0x00) ++#define PAN_AGG_GRP_RAID5_LEFT_POLICY_ROUND_ROBIN ((pan_uint8_t) 0x01) ++ ++#define PAN_AGG_NULL_MAP ((pan_agg_type_t) 0x00) ++#define PAN_AGG_SIMPLE ((pan_agg_type_t) 0x01) ++#define PAN_AGG_RAID1 ((pan_agg_type_t) 0x02) ++#define PAN_AGG_RAID0 ((pan_agg_type_t) 0x03) ++#define PAN_AGG_RAID5_LEFT ((pan_agg_type_t) 0x04) ++#define PAN_AGG_GRP_RAID5_LEFT ((pan_agg_type_t) 0x06) ++#define PAN_AGG_MINTYPE ((pan_agg_type_t) 0x01) ++#define PAN_AGG_MAXTYPE ((pan_agg_type_t) 0x06) ++ ++struct pan_agg_layout_hdr_s { ++ pan_agg_type_t type; ++ pan_pad_t pad[3]; ++ union { ++ pan_uint64_t null; ++ pan_agg_simple_header_t simple; ++ pan_agg_raid1_header_t raid1; ++ pan_agg_raid0_header_t raid0; ++ pan_agg_raid5_left_header_t raid5_left; ++ pan_agg_grp_raid5_left_header_t grp_raid5_left; ++ } hdr; ++}; ++ ++typedef struct pan_agg_layout_hdr_s pan_agg_layout_hdr_t; ++ ++struct 
pan_agg_comp_obj_a_s { ++ pan_rpc_arrdim_t size; ++ pan_agg_comp_obj_t *data; ++}; ++typedef struct pan_agg_comp_obj_a_s pan_agg_comp_obj_a; ++ ++struct pan_agg_full_map_s { ++ pan_aggregation_map_t map_hdr; ++ pan_agg_layout_hdr_t layout_hdr; ++ pan_agg_comp_obj_a components; ++}; ++ ++typedef struct pan_agg_full_map_s pan_agg_full_map_t; ++ ++/* ++ * from pan_obsd_rpc_types.h ++ */ ++typedef pan_uint8_t pan_obsd_security_key_a[16]; ++ ++typedef pan_uint8_t pan_obsd_capability_key_a[20]; ++ ++typedef pan_uint8_t pan_obsd_key_holder_id_t; ++ ++#define PAN_OBSD_KEY_HOLDER_BASIS_KEY ((pan_obsd_key_holder_id_t) 0x01) ++#define PAN_OBSD_KEY_HOLDER_CAP_KEY ((pan_obsd_key_holder_id_t) 0x02) ++ ++struct pan_obsd_key_holder_s { ++ pan_obsd_key_holder_id_t select; ++ pan_pad_t pad[3]; ++ union { ++ pan_obsd_security_key_a basis_key; ++ pan_obsd_capability_key_a cap_key; ++ } key; ++}; ++ ++typedef struct pan_obsd_key_holder_s pan_obsd_key_holder_t; ++ ++/* ++ * from pan_sm_sec.h ++ */ ++typedef pan_uint8_t pan_sm_sec_type_t; ++typedef pan_uint8_t pan_sm_sec_otw_allo_mode_t; ++ ++struct pan_obsd_capability_generic_otw_t_s { ++ pan_rpc_arrdim_t size; ++ pan_uint8_t *data; ++}; ++typedef struct pan_obsd_capability_generic_otw_t_s ++ pan_obsd_capability_generic_otw_t; ++ ++struct pan_sm_sec_obsd_s { ++ pan_obsd_key_holder_t key; ++ pan_obsd_capability_generic_otw_t cap_otw; ++ pan_sm_sec_otw_allo_mode_t allo_mode; ++}; ++ ++typedef struct pan_sm_sec_obsd_s pan_sm_sec_obsd_t; ++ ++struct pan_sm_sec_s { ++ pan_sm_sec_type_t type; ++ pan_pad_t pad[3]; ++ union { ++ pan_rpc_none_t none; ++ pan_sm_sec_obsd_t obsd; ++ } variant; ++}; ++ ++typedef struct pan_sm_sec_s pan_sm_sec_t; ++ ++struct pan_sm_sec_a_s { ++ pan_rpc_arrdim_t size; ++ pan_sm_sec_t *data; ++}; ++typedef struct pan_sm_sec_a_s pan_sm_sec_a; ++typedef pan_otw_t *pan_sm_sec_otw_t; ++ ++/* ++ * from pan_sm_types.h ++ */ ++typedef pan_uint64_t pan_sm_cap_handle_t; ++ ++struct pan_sm_map_cap_s { ++ pan_agg_full_map_t 
full_map; ++ pan_stor_offset_t offset; ++ pan_stor_len_t length; ++ pan_sm_sec_a secs; ++ pan_sm_cap_handle_t handle; ++ pan_timespec_t expiration_time; ++ pan_stor_action_t action_mask; ++ pan_uint32_t flags; ++}; ++ ++typedef struct pan_sm_map_cap_s pan_sm_map_cap_t; ++ ++/* ++ * from pan_sm_ops.h ++ */ ++typedef pan_rpc_none_t pan_sm_cache_ptr_t; ++ ++/* ++ * from pan_sam_api.h ++ */ ++typedef pan_uint32_t pan_sam_access_flags_t; ++ ++typedef struct pan_sam_dev_error_s pan_sam_dev_error_t; ++struct pan_sam_dev_error_s { ++ pan_stor_dev_id_t dev_id; ++ pan_stor_op_t stor_op; ++ pan_status_t error; ++}; ++ ++typedef struct pan_sam_ext_status_s pan_sam_ext_status_t; ++struct pan_sam_ext_status_s { ++ pan_uint32_t available; ++ pan_uint32_t size; ++ pan_sam_dev_error_t *errors; ++}; ++ ++enum pan_sam_rpc_sec_sel_e { ++ PAN_SAM_RPC_SEC_DEFAULT, ++ PAN_SAM_RPC_SEC_ATLEAST, ++ PAN_SAM_RPC_SEC_EXACTLY ++}; ++typedef enum pan_sam_rpc_sec_sel_e pan_sam_rpc_sec_sel_t; ++ ++typedef struct pan_sam_obj_sec_s pan_sam_obj_sec_t; ++struct pan_sam_obj_sec_s { ++ pan_stor_sec_level_t min_security; ++ pan_sm_map_cap_t *map_ccaps; ++}; ++ ++typedef struct pan_sam_rpc_sec_s pan_sam_rpc_sec_t; ++struct pan_sam_rpc_sec_s { ++ pan_sam_rpc_sec_sel_t selector; ++}; ++ ++typedef struct pan_sam_read_args_s pan_sam_read_args_t; ++struct pan_sam_read_args_s { ++ pan_stor_obj_id_t obj_id; ++ pan_sm_cache_ptr_t obj_ent; ++ void *return_attr; ++ void *checksum; ++ pan_stor_offset_t offset; ++ pan_uint16_t sm_options; ++ void *callout; ++ void *callout_arg; ++}; ++ ++typedef struct pan_sam_read_res_s pan_sam_read_res_t; ++struct pan_sam_read_res_s { ++ pan_status_t result; ++ pan_sam_ext_status_t ext_status; ++ pan_stor_len_t length; ++ void *attr; ++ void *checksum; ++}; ++ ++typedef void (*pan_sam_read_cb_t)( ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_read_res_t *res_p, ++ pan_status_t status); ++ ++#define PAN_SAM_ACCESS_NONE 0x0000 ++#define PAN_SAM_ACCESS_BYPASS_TIMESTAMP 0x0020 ++ 
++typedef struct pan_sam_write_args_s pan_sam_write_args_t; ++struct pan_sam_write_args_s { ++ pan_stor_obj_id_t obj_id; ++ pan_sm_cache_ptr_t obj_ent; ++ pan_stor_offset_t offset; ++ void *attr; ++ void *return_attr; ++}; ++ ++typedef struct pan_sam_write_res_s pan_sam_write_res_t; ++struct pan_sam_write_res_s { ++ pan_status_t result; ++ pan_sam_ext_status_t ext_status; ++ pan_stor_len_t length; ++ pan_stor_delta_len_t delta_capacity_used; ++ pan_bool_t parity_dirty; ++ void *attr; ++}; ++ ++typedef void (*pan_sam_write_cb_t)( ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_write_res_t *res_p, ++ pan_status_t status); ++ ++/* ++ * from pan_mgr_types.h ++ */ ++#define PAN_MGR_ID_TYPE_SHIFT 56 ++#define PAN_MGR_ID_TYPE_MASK ((pan_mgr_id_t)18374686479671623680ULL) ++#define PAN_MGR_ID_UNIQ_MASK ((pan_mgr_id_t)72057594037927935ULL) ++ ++typedef pan_uint16_t pan_mgr_type_t; ++typedef pan_uint64_t pan_mgr_id_t; ++ ++#define PAN_MGR_SM ((pan_mgr_type_t) 2U) ++#define PAN_MGR_OBSD ((pan_mgr_type_t) 6U) ++ ++/* ++ * from pan_mgr_types_c.h ++ */ ++#define pan_mgr_id_construct_artificial(_mgr_type_, _mgr_uniq_, _mgr_id_p_) { \ ++ pan_mgr_id_t _id1, _id2; \ ++\ ++ _id1 = (_mgr_type_); \ ++ _id1 <<= PAN_MGR_ID_TYPE_SHIFT; \ ++ _id1 &= PAN_MGR_ID_TYPE_MASK; \ ++ _id2 = (_mgr_uniq_); \ ++ _id2 &= PAN_MGR_ID_UNIQ_MASK; \ ++ _id1 |= _id2; \ ++ *(_mgr_id_p_) = _id1; \ ++} ++ ++/* ++ * from pan_storage_c.h ++ */ ++#define pan_stor_is_device_id_an_obsd_id(_device_id_) \ ++ ((((_device_id_) & PAN_MGR_ID_TYPE_MASK) >> PAN_MGR_ID_TYPE_SHIFT) \ ++ == PAN_MGR_OBSD) ++ ++/* ++ * pnfs_shim internal definitions ++ */ ++ ++struct panfs_shim_io_state { ++ struct objlayout_io_state ol_state; ++ ++ pan_sg_entry_t *sg_list; ++ pan_sam_obj_sec_t obj_sec; ++ void *ucreds; ++ union { ++ struct { ++ pan_sam_read_args_t args; ++ pan_sam_read_res_t res; ++ } read; ++ struct { ++ pan_sam_write_args_t args; ++ pan_sam_write_res_t res; ++ } write; ++ } u; ++}; ++ ++#endif /* _PANLAYOUT_PANFS_SHIM_H */ 
+diff -up linux-2.6.37.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig linux-2.6.37.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c +--- linux-2.6.37.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig 2011-01-28 09:43:53.337772132 -0500 ++++ linux-2.6.37.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c 2011-01-28 09:43:53.337772132 -0500 +@@ -0,0 +1,435 @@ ++/* ++ * pnfs_osd_xdr.c ++ * ++ * Object-Based pNFS Layout XDR layer ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++/* ++ * The following implementation is based on these Internet Drafts: ++ * ++ * draft-ietf-nfsv4-minorversion-21 ++ * draft-ietf-nfsv4-pnfs-obj-12 ++ */ ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ * }; ++ */ ++static inline u32 * ++pnfs_osd_xdr_decode_objid(u32 *p, struct pnfs_osd_objid *objid) ++{ ++ COPYMEM(objid->oid_device_id.data, sizeof(objid->oid_device_id.data)); ++ READ64(objid->oid_partition_id); ++ READ64(objid->oid_object_id); ++ return p; ++} ++ ++static inline u32 * ++pnfs_osd_xdr_decode_opaque_cred(u32 *p, ++ struct pnfs_osd_opaque_cred *opaque_cred) ++{ ++ READ32(opaque_cred->cred_len); ++ COPYMEM(opaque_cred->cred, opaque_cred->cred_len); ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_object_cred { ++ * struct pnfs_osd_objid oc_object_id; ++ * u32 oc_osd_version; ++ * u32 oc_cap_key_sec; ++ * struct pnfs_osd_opaque_cred oc_cap_key ++ * struct pnfs_osd_opaque_cred oc_cap; ++ * }; ++ */ ++static inline u32 * ++pnfs_osd_xdr_decode_object_cred(u32 *p, struct pnfs_osd_object_cred *comp, ++ u8 **credp) ++{ ++ u8 *cred; ++ ++ p = pnfs_osd_xdr_decode_objid(p, &comp->oc_object_id); ++ READ32(comp->oc_osd_version); ++ READ32(comp->oc_cap_key_sec); ++ ++ cred = *credp; ++ comp->oc_cap_key.cred = cred; ++ p = pnfs_osd_xdr_decode_opaque_cred(p, &comp->oc_cap_key); ++ cred = 
(u8 *)((u32 *)cred + XDR_QUADLEN(comp->oc_cap_key.cred_len)); ++ comp->oc_cap.cred = cred; ++ p = pnfs_osd_xdr_decode_opaque_cred(p, &comp->oc_cap); ++ cred = (u8 *)((u32 *)cred + XDR_QUADLEN(comp->oc_cap.cred_len)); ++ *credp = cred; ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_data_map { ++ * u32 odm_num_comps; ++ * u64 odm_stripe_unit; ++ * u32 odm_group_width; ++ * u32 odm_group_depth; ++ * u32 odm_mirror_cnt; ++ * u32 odm_raid_algorithm; ++ * }; ++ */ ++static inline u32 * ++pnfs_osd_xdr_decode_data_map(u32 *p, struct pnfs_osd_data_map *data_map) ++{ ++ READ32(data_map->odm_num_comps); ++ READ64(data_map->odm_stripe_unit); ++ READ32(data_map->odm_group_width); ++ READ32(data_map->odm_group_depth); ++ READ32(data_map->odm_mirror_cnt); ++ READ32(data_map->odm_raid_algorithm); ++ dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u " ++ "odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n", ++ __func__, ++ data_map->odm_num_comps, ++ (unsigned long long)data_map->odm_stripe_unit, ++ data_map->odm_group_width, ++ data_map->odm_group_depth, ++ data_map->odm_mirror_cnt, ++ data_map->odm_raid_algorithm); ++ return p; ++} ++ ++struct pnfs_osd_layout * ++pnfs_osd_xdr_decode_layout(struct pnfs_osd_layout *layout, u32 *p) ++{ ++ int i; ++ u32 *start = p; ++ struct pnfs_osd_object_cred *comp; ++ u8 *cred; ++ ++ p = pnfs_osd_xdr_decode_data_map(p, &layout->olo_map); ++ READ32(layout->olo_comps_index); ++ READ32(layout->olo_num_comps); ++ layout->olo_comps = (struct pnfs_osd_object_cred *)(layout + 1); ++ comp = layout->olo_comps; ++ cred = (u8 *)(comp + layout->olo_num_comps); ++ dprintk("%s: comps_index=%u num_comps=%u\n", ++ __func__, layout->olo_comps_index, layout->olo_num_comps); ++ for (i = 0; i < layout->olo_num_comps; i++) { ++ p = pnfs_osd_xdr_decode_object_cred(p, comp, &cred); ++ dprintk("%s: comp[%d]=dev(%llx:%llx) par=0x%llx obj=0x%llx " ++ "key_len=%u cap_len=%u\n", ++ __func__, i, ++ _DEVID_LO(&comp->oc_object_id.oid_device_id), 
++ _DEVID_HI(&comp->oc_object_id.oid_device_id), ++ comp->oc_object_id.oid_partition_id, ++ comp->oc_object_id.oid_object_id, ++ comp->oc_cap_key.cred_len, comp->oc_cap.cred_len); ++ comp++; ++ } ++ dprintk("%s: xdr_size=%Zd end=%p in_core_size=%Zd\n", __func__, ++ (char *)p - (char *)start, cred, (char *)cred - (char *)layout); ++ return layout; ++} ++ ++/* ++ * Get Device Information Decoding ++ * ++ * Note: since Device Information is currently done synchronously, most ++ * of the actual fields are left inside the rpc buffer and are only ++ * pointed to by the pnfs_osd_deviceaddr members. So the read buffer ++ * should not be freed while the returned information is in use. ++ */ ++ ++u32 *__xdr_read_calc_nfs4_string( ++ u32 *p, struct nfs4_string *str, u8 **freespace) ++{ ++ u32 len; ++ char *data; ++ bool need_copy; ++ ++ READ32(len); ++ data = (char *)p; ++ ++ if (data[len]) { /* Not null terminated we'll need extra space */ ++ data = *freespace; ++ *freespace += len + 1; ++ need_copy = true; ++ } else { ++ need_copy = false; ++ } ++ ++ if (str) { ++ str->len = len; ++ str->data = data; ++ if (need_copy) { ++ memcpy(data, p, len); ++ data[len] = 0; ++ } ++ } ++ ++ p += XDR_QUADLEN(len); ++ return p; ++} ++ ++u32 *__xdr_read_calc_u8_opaque( ++ u32 *p, struct nfs4_string *str) ++{ ++ u32 len; ++ ++ READ32(len); ++ ++ if (str) { ++ str->len = len; ++ str->data = (char *)p; ++ } ++ ++ p += XDR_QUADLEN(len); ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_targetid { ++ * u32 oti_type; ++ * struct nfs4_string oti_scsi_device_id; ++ * }; ++ */ ++u32 *__xdr_read_calc_targetid( ++ u32 *p, struct pnfs_osd_targetid* targetid, u8 **freespace) ++{ ++ u32 oti_type; ++ ++ READ32(oti_type); ++ if (targetid) ++ targetid->oti_type = oti_type; ++ ++ switch (oti_type) { ++ case OBJ_TARGET_SCSI_NAME: ++ case OBJ_TARGET_SCSI_DEVICE_ID: ++ p = __xdr_read_calc_u8_opaque(p, ++ targetid ? 
&targetid->oti_scsi_device_id : NULL); ++ } ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_net_addr { ++ * struct nfs4_string r_netid; ++ * struct nfs4_string r_addr; ++ * }; ++ */ ++u32 *__xdr_read_calc_net_addr( ++ u32 *p, struct pnfs_osd_net_addr* netaddr, u8 **freespace) ++{ ++ ++ p = __xdr_read_calc_nfs4_string(p, ++ netaddr ? &netaddr->r_netid : NULL, ++ freespace); ++ ++ p = __xdr_read_calc_nfs4_string(p, ++ netaddr ? &netaddr->r_addr : NULL, ++ freespace); ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_targetaddr { ++ * u32 ota_available; ++ * struct pnfs_osd_net_addr ota_netaddr; ++ * }; ++ */ ++u32 *__xdr_read_calc_targetaddr( ++ u32 *p, struct pnfs_osd_targetaddr *targetaddr, u8 **freespace) ++{ ++ u32 ota_available; ++ ++ READ32(ota_available); ++ if (targetaddr) ++ targetaddr->ota_available = ota_available; ++ ++ if (ota_available) { ++ p = __xdr_read_calc_net_addr(p, ++ targetaddr ? &targetaddr->ota_netaddr : NULL, ++ freespace); ++ } ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_deviceaddr { ++ * struct pnfs_osd_targetid oda_targetid; ++ * struct pnfs_osd_targetaddr oda_targetaddr; ++ * u8 oda_lun[8]; ++ * struct nfs4_string oda_systemid; ++ * struct pnfs_osd_object_cred oda_root_obj_cred; ++ * struct nfs4_string oda_osdname; ++ * }; ++ */ ++u32 *__xdr_read_calc_deviceaddr( ++ u32 *p, struct pnfs_osd_deviceaddr *deviceaddr, u8 **freespace) ++{ ++ p = __xdr_read_calc_targetid(p, ++ deviceaddr ? &deviceaddr->oda_targetid : NULL, ++ freespace); ++ ++ p = __xdr_read_calc_targetaddr(p, ++ deviceaddr ? &deviceaddr->oda_targetaddr : NULL, ++ freespace); ++ ++ if (deviceaddr) ++ COPYMEM(deviceaddr->oda_lun, sizeof(deviceaddr->oda_lun)); ++ else ++ p += XDR_QUADLEN(sizeof(deviceaddr->oda_lun)); ++ ++ p = __xdr_read_calc_u8_opaque(p, ++ deviceaddr ? 
&deviceaddr->oda_systemid : NULL); ++ ++ if (deviceaddr) { ++ p = pnfs_osd_xdr_decode_object_cred(p, ++ &deviceaddr->oda_root_obj_cred, freespace); ++ } else { ++ *freespace += pnfs_osd_object_cred_incore_sz(p); ++ p += pnfs_osd_object_cred_xdr_sz(p); ++ } ++ ++ p = __xdr_read_calc_u8_opaque(p, ++ deviceaddr ? &deviceaddr->oda_osdname : NULL); ++ ++ return p; ++} ++ ++size_t pnfs_osd_xdr_deviceaddr_incore_sz(u32 *p) ++{ ++ u8 *null_freespace = NULL; ++ size_t sz; ++ ++ __xdr_read_calc_deviceaddr(p, NULL, &null_freespace); ++ sz = sizeof(struct pnfs_osd_deviceaddr) + (size_t)null_freespace; ++ ++ return sz; ++} ++ ++void pnfs_osd_xdr_decode_deviceaddr( ++ struct pnfs_osd_deviceaddr *deviceaddr, u32 *p) ++{ ++ u8 *freespace = (u8 *)(deviceaddr + 1); ++ ++ __xdr_read_calc_deviceaddr(p, deviceaddr, &freespace); ++} ++ ++/* ++ * struct pnfs_osd_layoutupdate { ++ * u32 dsu_valid; ++ * s64 dsu_delta; ++ * u32 olu_ioerr_flag; ++ * }; ++ */ ++int ++pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr, ++ struct pnfs_osd_layoutupdate *lou) ++{ ++ __be32 *p = xdr_reserve_space(xdr, 16); ++ ++ if (!p) ++ return -E2BIG; ++ ++ *p++ = cpu_to_be32(lou->dsu_valid); ++ if (lou->dsu_valid) ++ p = xdr_encode_hyper(p, lou->dsu_delta); ++ *p++ = cpu_to_be32(lou->olu_ioerr_flag); ++ return 0; ++} ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ */ ++static inline int pnfs_osd_xdr_encode_objid(struct xdr_stream *xdr, ++ struct pnfs_osd_objid *object_id) ++{ ++ __be32 *p; ++ ++ p = xdr_reserve_space(xdr, 32); ++ if (!p) ++ return -E2BIG; ++ ++ p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data, ++ sizeof(object_id->oid_device_id.data)); ++ p = xdr_encode_hyper(p, object_id->oid_partition_id); ++ p = xdr_encode_hyper(p, object_id->oid_object_id); ++ ++ return 0; ++} ++ ++/* ++ * struct pnfs_osd_ioerr { ++ * struct pnfs_osd_objid oer_component; ++ * u64 oer_comp_offset; ++ * u64 oer_comp_length; 
++ * u32 oer_iswrite; ++ * u32 oer_errno; ++ * }; ++ */ ++int pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr, ++ struct pnfs_osd_ioerr *ioerr) ++{ ++ __be32 *p; ++ int ret; ++ ++ ret = pnfs_osd_xdr_encode_objid(xdr, &ioerr->oer_component); ++ if (ret) ++ return ret; ++ ++ p = xdr_reserve_space(xdr, 24); ++ if (!p) ++ return -E2BIG; ++ ++ p = xdr_encode_hyper(p, ioerr->oer_comp_offset); ++ p = xdr_encode_hyper(p, ioerr->oer_comp_length); ++ *p++ = cpu_to_be32(ioerr->oer_iswrite); ++ *p = cpu_to_be32(ioerr->oer_errno); ++ ++ return 0; ++} +diff -up linux-2.6.37.noarch/fs/nfs/pagelist.c.orig linux-2.6.37.noarch/fs/nfs/pagelist.c +--- linux-2.6.37.noarch/fs/nfs/pagelist.c.orig 2011-01-28 09:37:32.547979774 -0500 ++++ linux-2.6.37.noarch/fs/nfs/pagelist.c 2011-01-28 09:43:53.338771991 -0500 +@@ -20,6 +20,7 @@ + #include + + #include "internal.h" ++#include "pnfs.h" + + static struct kmem_cache *nfs_page_cachep; + +@@ -53,7 +54,8 @@ nfs_page_free(struct nfs_page *p) + struct nfs_page * + nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, + struct page *page, +- unsigned int offset, unsigned int count) ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg) + { + struct nfs_page *req; + +@@ -84,6 +86,9 @@ nfs_create_request(struct nfs_open_conte + req->wb_bytes = count; + req->wb_context = get_nfs_open_context(ctx); + kref_init(&req->wb_kref); ++ req->wb_lseg = lseg; ++ if (lseg) ++ get_lseg(lseg); + return req; + } + +@@ -159,9 +164,12 @@ void nfs_clear_request(struct nfs_page * + put_nfs_open_context(ctx); + req->wb_context = NULL; + } ++ if (req->wb_lseg != NULL) { ++ put_lseg(req->wb_lseg); ++ req->wb_lseg = NULL; ++ } + } + +- + /** + * nfs_release_request - Release the count on an NFS read/write request + * @req: request to release +@@ -240,7 +248,8 @@ void nfs_pageio_init(struct nfs_pageio_d + * Return 'true' if this is the case, else return 'false'. 
+ */ + static int nfs_can_coalesce_requests(struct nfs_page *prev, +- struct nfs_page *req) ++ struct nfs_page *req, ++ struct nfs_pageio_descriptor *pgio) + { + if (req->wb_context->cred != prev->wb_context->cred) + return 0; +@@ -254,6 +263,12 @@ static int nfs_can_coalesce_requests(str + return 0; + if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) + return 0; ++ if (req->wb_lseg != prev->wb_lseg) ++ return 0; ++#ifdef CONFIG_NFS_V4_1 ++ if (pgio->pg_test && !pgio->pg_test(pgio, prev, req)) ++ return 0; ++#endif /* CONFIG_NFS_V4_1 */ + return 1; + } + +@@ -286,7 +301,7 @@ static int nfs_pageio_do_add_request(str + if (newlen > desc->pg_bsize) + return 0; + prev = nfs_list_entry(desc->pg_list.prev); +- if (!nfs_can_coalesce_requests(prev, req)) ++ if (!nfs_can_coalesce_requests(prev, req, desc)) + return 0; + } else + desc->pg_base = req->wb_pgbase; +@@ -375,6 +390,7 @@ void nfs_pageio_cond_complete(struct nfs + * @idx_start: lower bound of page->index to scan + * @npages: idx_start + npages sets the upper bound to scan. + * @tag: tag to scan for ++ * @use_pnfs: will be set TRUE if commit needs to be handled by layout driver + * + * Moves elements from one of the inode request lists. 
+ * If the number of requests is set to 0, the entire address_space +@@ -384,7 +400,7 @@ void nfs_pageio_cond_complete(struct nfs + */ + int nfs_scan_list(struct nfs_inode *nfsi, + struct list_head *dst, pgoff_t idx_start, +- unsigned int npages, int tag) ++ unsigned int npages, int tag, int *use_pnfs) + { + struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES]; + struct nfs_page *req; +@@ -415,6 +431,8 @@ int nfs_scan_list(struct nfs_inode *nfsi + radix_tree_tag_clear(&nfsi->nfs_page_tree, + req->wb_index, tag); + nfs_list_add_request(req, dst); ++ if (req->wb_lseg) ++ *use_pnfs = 1; + res++; + if (res == INT_MAX) + goto out; +diff -up linux-2.6.37.noarch/fs/nfs/pnfs.c.orig linux-2.6.37.noarch/fs/nfs/pnfs.c +--- linux-2.6.37.noarch/fs/nfs/pnfs.c.orig 2011-01-28 09:37:32.548979739 -0500 ++++ linux-2.6.37.noarch/fs/nfs/pnfs.c 2011-01-28 09:43:53.339771853 -0500 +@@ -30,6 +30,7 @@ + #include + #include "internal.h" + #include "pnfs.h" ++#include "iostat.h" + + #define NFSDBG_FACILITY NFSDBG_PNFS + +@@ -71,6 +72,52 @@ find_pnfs_driver(u32 id) + return local; + } + ++/* Set cred to indicate we require a layoutcommit ++ * If we don't even have a layout, we don't need to commit it. ++ */ ++void ++pnfs_need_layoutcommit(struct nfs_inode *nfsi, struct nfs_open_context *ctx) ++{ ++ dprintk("%s: has_layout=%d ctx=%p\n", __func__, has_layout(nfsi), ctx); ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ if (has_layout(nfsi) && ++ !test_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->plh_flags)) { ++ nfsi->layout->cred = get_rpccred(ctx->state->owner->so_cred); ++ __set_bit(NFS_LAYOUT_NEED_LCOMMIT, ++ &nfsi->layout->plh_flags); ++ nfsi->change_attr++; ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++ dprintk("%s: Set layoutcommit\n", __func__); ++ return; ++ } ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++} ++ ++/* Update last_write_offset for layoutcommit. ++ * TODO: We should only use commited extents, but the current nfs ++ * implementation does not calculate the written range in nfs_commit_done. 
++ * We therefore update this field in writeback_done. ++ */ ++void ++pnfs_update_last_write(struct nfs_inode *nfsi, loff_t offset, size_t extent) ++{ ++ loff_t end_pos; ++ ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ if (offset < nfsi->layout->write_begin_pos) ++ nfsi->layout->write_begin_pos = offset; ++ end_pos = offset + extent - 1; /* I'm being inclusive */ ++ if (end_pos > nfsi->layout->write_end_pos) ++ nfsi->layout->write_end_pos = end_pos; ++ dprintk("%s: Wrote %lu@%lu bpos %lu, epos: %lu\n", ++ __func__, ++ (unsigned long) extent, ++ (unsigned long) offset , ++ (unsigned long) nfsi->layout->write_begin_pos, ++ (unsigned long) nfsi->layout->write_end_pos); ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++} ++ + void + unset_pnfs_layoutdriver(struct nfs_server *nfss) + { +@@ -88,7 +135,8 @@ unset_pnfs_layoutdriver(struct nfs_serve + * @id layout type. Zero (illegal layout type) indicates pNFS not in use. + */ + void +-set_pnfs_layoutdriver(struct nfs_server *server, u32 id) ++set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh, ++ u32 id) + { + struct pnfs_layoutdriver_type *ld_type = NULL; + +@@ -115,7 +163,7 @@ set_pnfs_layoutdriver(struct nfs_server + goto out_no_driver; + } + server->pnfs_curr_ld = ld_type; +- if (ld_type->set_layoutdriver(server)) { ++ if (ld_type->set_layoutdriver(server, mntfh)) { + printk(KERN_ERR + "%s: Error initializing mount point for layout driver %u.\n", + __func__, id); +@@ -146,6 +194,14 @@ pnfs_register_layoutdriver(struct pnfs_l + return status; + } + ++ if (!ld_type->read_pagelist || !ld_type->write_pagelist || ++ !ld_type->commit) { ++ printk(KERN_ERR "%s Layout driver must provide " ++ "read_pagelist, write_pagelist, and commit.\n", ++ __func__); ++ return status; ++ } ++ + spin_lock(&pnfs_spinlock); + tmp = find_pnfs_driver_locked(ld_type->id); + if (!tmp) { +@@ -184,18 +240,35 @@ get_layout_hdr(struct pnfs_layout_hdr *l + atomic_inc(&lo->plh_refcount); + } + ++static struct pnfs_layout_hdr * 
++pnfs_alloc_layout_hdr(struct inode *ino) ++{ ++ struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; ++ return ld->alloc_layout_hdr ? ld->alloc_layout_hdr(ino) : ++ kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL); ++} ++ ++static void ++pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo) ++{ ++ struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->inode)->pnfs_curr_ld; ++ return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo); ++} ++ + static void + destroy_layout_hdr(struct pnfs_layout_hdr *lo) + { + dprintk("%s: freeing layout cache %p\n", __func__, lo); +- BUG_ON(!list_empty(&lo->plh_layouts)); +- NFS_I(lo->plh_inode)->layout = NULL; +- kfree(lo); ++ BUG_ON(!list_empty(&lo->layouts)); ++ NFS_I(lo->inode)->layout = NULL; ++ pnfs_free_layout_hdr(lo); + } + + static void + put_layout_hdr_locked(struct pnfs_layout_hdr *lo) + { ++ assert_spin_locked(&lo->inode->i_lock); ++ BUG_ON(atomic_read(&lo->plh_refcount) == 0); + if (atomic_dec_and_test(&lo->plh_refcount)) + destroy_layout_hdr(lo); + } +@@ -203,8 +276,9 @@ put_layout_hdr_locked(struct pnfs_layout + void + put_layout_hdr(struct pnfs_layout_hdr *lo) + { +- struct inode *inode = lo->plh_inode; ++ struct inode *inode = lo->inode; + ++ BUG_ON(atomic_read(&lo->plh_refcount) == 0); + if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { + destroy_layout_hdr(lo); + spin_unlock(&inode->i_lock); +@@ -214,27 +288,52 @@ put_layout_hdr(struct pnfs_layout_hdr *l + static void + init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) + { +- INIT_LIST_HEAD(&lseg->pls_list); ++ INIT_LIST_HEAD(&lseg->fi_list); + atomic_set(&lseg->pls_refcount, 1); + smp_mb(); + set_bit(NFS_LSEG_VALID, &lseg->pls_flags); +- lseg->pls_layout = lo; ++ lseg->layout = lo; ++ lseg->pls_notify_mask = 0; + } + + static void free_lseg(struct pnfs_layout_segment *lseg) + { +- struct inode *ino = lseg->pls_layout->plh_inode; ++ struct inode *ino = lseg->layout->inode; ++ u64 mask = lseg->pls_notify_mask; + 
++ BUG_ON(atomic_read(&lseg->pls_refcount) != 0); + NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); +- /* Matched by get_layout_hdr in pnfs_insert_layout */ ++ notify_drained(NFS_SERVER(ino)->nfs_client, mask); ++ /* Matched by get_layout_hdr_locked in pnfs_insert_layout */ + put_layout_hdr(NFS_I(ino)->layout); + } + ++static void ++_put_lseg_common(struct pnfs_layout_segment *lseg) ++{ ++ struct inode *ino = lseg->layout->inode; ++ ++ BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); ++ list_del(&lseg->fi_list); ++ if (list_empty(&lseg->layout->segs)) { ++ struct nfs_client *clp; ++ ++ clp = NFS_SERVER(ino)->nfs_client; ++ spin_lock(&clp->cl_lock); ++ /* List does not take a reference, so no need for put here */ ++ list_del_init(&lseg->layout->layouts); ++ spin_unlock(&clp->cl_lock); ++ clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->layout->plh_flags); ++ if (!pnfs_layoutgets_blocked(lseg->layout, NULL)) ++ rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq_stateid); ++ } ++ rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq); ++} ++ + /* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg + * could sleep, so must be called outside of the lock. +- * Returns 1 if object was removed, otherwise return 0. 
+ */ +-static int ++static void + put_lseg_locked(struct pnfs_layout_segment *lseg, + struct list_head *tmp_list) + { +@@ -242,74 +341,142 @@ put_lseg_locked(struct pnfs_layout_segme + atomic_read(&lseg->pls_refcount), + test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); + if (atomic_dec_and_test(&lseg->pls_refcount)) { +- struct inode *ino = lseg->pls_layout->plh_inode; ++ _put_lseg_common(lseg); ++ list_add(&lseg->fi_list, tmp_list); ++ } ++} + +- BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); +- list_del(&lseg->pls_list); +- if (list_empty(&lseg->pls_layout->plh_segs)) { +- struct nfs_client *clp; ++void ++put_lseg(struct pnfs_layout_segment *lseg) ++{ ++ struct inode *ino; + +- clp = NFS_SERVER(ino)->nfs_client; +- spin_lock(&clp->cl_lock); +- /* List does not take a reference, so no need for put here */ +- list_del_init(&lseg->pls_layout->plh_layouts); +- spin_unlock(&clp->cl_lock); +- clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags); +- } +- rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq); +- list_add(&lseg->pls_list, tmp_list); +- return 1; ++ if (!lseg) ++ return; ++ ++ dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, ++ atomic_read(&lseg->pls_refcount), ++ test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); ++ ino = lseg->layout->inode; ++ if (atomic_dec_and_lock(&lseg->pls_refcount, &ino->i_lock)) { ++ _put_lseg_common(lseg); ++ spin_unlock(&ino->i_lock); ++ free_lseg(lseg); + } +- return 0; + } ++EXPORT_SYMBOL_GPL(put_lseg); + +-static bool +-should_free_lseg(u32 lseg_iomode, u32 recall_iomode) ++void get_lseg(struct pnfs_layout_segment *lseg) + { +- return (recall_iomode == IOMODE_ANY || +- lseg_iomode == recall_iomode); ++ atomic_inc(&lseg->pls_refcount); ++ smp_mb__after_atomic_inc(); + } ++EXPORT_SYMBOL_GPL(get_lseg); + +-/* Returns 1 if lseg is removed from list, 0 otherwise */ +-static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, +- struct list_head *tmp_list) ++static inline u64 ++end_offset(u64 start, u64 len) + { +- int rv 
= 0; ++ u64 end; ++ ++ end = start + len; ++ return end >= start ? end: NFS4_MAX_UINT64; ++} + ++/* last octet in a range */ ++static inline u64 ++last_byte_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ BUG_ON(!len); ++ end = start + len; ++ return end > start ? end - 1: NFS4_MAX_UINT64; ++} ++ ++/* ++ * is l2 fully contained in l1? ++ * start1 end1 ++ * [----------------------------------) ++ * start2 end2 ++ * [----------------) ++ */ ++static inline int ++lo_seg_contained(struct pnfs_layout_range *l1, ++ struct pnfs_layout_range *l2) ++{ ++ u64 start1 = l1->offset; ++ u64 end1 = end_offset(start1, l1->length); ++ u64 start2 = l2->offset; ++ u64 end2 = end_offset(start2, l2->length); ++ ++ return (start1 <= start2) && (end1 >= end2); ++} ++ ++/* ++ * is l1 and l2 intersecting? ++ * start1 end1 ++ * [----------------------------------) ++ * start2 end2 ++ * [----------------) ++ */ ++static inline int ++lo_seg_intersecting(struct pnfs_layout_range *l1, ++ struct pnfs_layout_range *l2) ++{ ++ u64 start1 = l1->offset; ++ u64 end1 = end_offset(start1, l1->length); ++ u64 start2 = l2->offset; ++ u64 end2 = end_offset(start2, l2->length); ++ ++ return (end1 == NFS4_MAX_UINT64 || end1 > start2) && ++ (end2 == NFS4_MAX_UINT64 || end2 > start1); ++} ++ ++bool ++should_free_lseg(struct pnfs_layout_range *lseg_range, ++ struct pnfs_layout_range *recall_range) ++{ ++ return (recall_range->iomode == IOMODE_ANY || ++ lseg_range->iomode == recall_range->iomode) && ++ lo_seg_intersecting(lseg_range, recall_range); ++} ++ ++static void mark_lseg_invalid(struct pnfs_layout_segment *lseg, ++ struct list_head *tmp_list) ++{ ++ assert_spin_locked(&lseg->layout->inode->i_lock); + if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) { + /* Remove the reference keeping the lseg in the + * list. It will now be removed when all + * outstanding io is finished. 
+ */ +- rv = put_lseg_locked(lseg, tmp_list); ++ put_lseg_locked(lseg, tmp_list); + } +- return rv; + } + +-/* Returns count of number of matching invalid lsegs remaining in list +- * after call. +- */ +-int +-mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, +- struct list_head *tmp_list, +- u32 iomode) ++/* Returns false if there was nothing to do, true otherwise */ ++static bool ++pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, ++ struct pnfs_layout_range *range) + { + struct pnfs_layout_segment *lseg, *next; +- int invalid = 0, removed = 0; ++ bool rv = false; + +- dprintk("%s:Begin lo %p\n", __func__, lo); ++ dprintk("%s:Begin lo %p offset %llu length %llu iomode %d\n", ++ __func__, lo, range->offset, range->length, range->iomode); + +- list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) +- if (should_free_lseg(lseg->pls_range.iomode, iomode)) { ++ assert_spin_locked(&lo->inode->i_lock); ++ list_for_each_entry_safe(lseg, next, &lo->segs, fi_list) ++ if (should_free_lseg(&lseg->range, range)) { + dprintk("%s: freeing lseg %p iomode %d " + "offset %llu length %llu\n", __func__, +- lseg, lseg->pls_range.iomode, lseg->pls_range.offset, +- lseg->pls_range.length); +- invalid++; +- removed += mark_lseg_invalid(lseg, tmp_list); ++ lseg, lseg->range.iomode, lseg->range.offset, ++ lseg->range.length); ++ mark_lseg_invalid(lseg, tmp_list); ++ rv = true; + } +- dprintk("%s:Return %i\n", __func__, invalid - removed); +- return invalid - removed; ++ dprintk("%s:Return\n", __func__); ++ return rv; + } + + void +@@ -317,23 +484,29 @@ pnfs_free_lseg_list(struct list_head *fr + { + struct pnfs_layout_segment *lseg, *tmp; + +- list_for_each_entry_safe(lseg, tmp, free_me, pls_list) { +- list_del(&lseg->pls_list); ++ list_for_each_entry_safe(lseg, tmp, free_me, fi_list) + free_lseg(lseg); +- } ++ INIT_LIST_HEAD(free_me); + } + + void + pnfs_destroy_layout(struct nfs_inode *nfsi) + { + struct pnfs_layout_hdr *lo; ++ struct 
pnfs_layout_range range = { ++ .iomode = IOMODE_ANY, ++ .offset = 0, ++ .length = NFS4_MAX_UINT64, ++ }; + LIST_HEAD(tmp_list); + + spin_lock(&nfsi->vfs_inode.i_lock); + lo = nfsi->layout; + if (lo) { +- set_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags); +- mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY); ++ pnfs_clear_lseg_list(lo, &tmp_list, &range); ++ WARN_ON(!list_empty(&nfsi->layout->segs)); ++ WARN_ON(!list_empty(&nfsi->layout->layouts)); ++ + /* Matched by refcount set to 1 in alloc_init_layout_hdr */ + put_layout_hdr_locked(lo); + } +@@ -357,30 +530,28 @@ pnfs_destroy_all_layouts(struct nfs_clie + + while (!list_empty(&tmp_list)) { + lo = list_entry(tmp_list.next, struct pnfs_layout_hdr, +- plh_layouts); ++ layouts); + dprintk("%s freeing layout for inode %lu\n", __func__, +- lo->plh_inode->i_ino); +- pnfs_destroy_layout(NFS_I(lo->plh_inode)); ++ lo->inode->i_ino); ++ pnfs_destroy_layout(NFS_I(lo->inode)); + } + } + +-/* update lo->plh_stateid with new if is more recent */ ++/* update lo->stateid with new if is more recent */ + void + pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, + bool update_barrier) + { + u32 oldseq, newseq; + +- oldseq = be32_to_cpu(lo->plh_stateid.stateid.seqid); ++ assert_spin_locked(&lo->inode->i_lock); ++ oldseq = be32_to_cpu(lo->stateid.stateid.seqid); + newseq = be32_to_cpu(new->stateid.seqid); + if ((int)(newseq - oldseq) > 0) { +- memcpy(&lo->plh_stateid, &new->stateid, sizeof(new->stateid)); +- if (update_barrier) { +- u32 new_barrier = be32_to_cpu(new->stateid.seqid); +- +- if ((int)(new_barrier - lo->plh_barrier)) +- lo->plh_barrier = new_barrier; +- } else { ++ memcpy(&lo->stateid, &new->stateid, sizeof(new->stateid)); ++ if (update_barrier) ++ lo->plh_barrier = be32_to_cpu(new->stateid.seqid); ++ else { + /* Because of wraparound, we want to keep the barrier + * "close" to the current seqids. 
It needs to be + * within 2**31 to count as "behind", so if it +@@ -394,20 +565,6 @@ pnfs_set_layout_stateid(struct pnfs_layo + } + } + +-/* lget is set to 1 if called from inside send_layoutget call chain */ +-static bool +-pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid, +- int lget) +-{ +- if ((stateid) && +- (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0) +- return true; +- return lo->plh_block_lgets || +- test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || +- (list_empty(&lo->plh_segs) && +- (atomic_read(&lo->plh_outstanding) > lget)); +-} +- + int + pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, + struct nfs4_state *open_state) +@@ -415,10 +572,11 @@ pnfs_choose_layoutget_stateid(nfs4_state + int status = 0; + + dprintk("--> %s\n", __func__); +- spin_lock(&lo->plh_inode->i_lock); +- if (pnfs_layoutgets_blocked(lo, NULL, 1)) { ++ assert_spin_locked(&lo->inode->i_lock); ++ if (lo->plh_block_lgets || ++ test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { + status = -EAGAIN; +- } else if (list_empty(&lo->plh_segs)) { ++ } else if (list_empty(&lo->segs)) { + int seq; + + do { +@@ -427,9 +585,8 @@ pnfs_choose_layoutget_stateid(nfs4_state + sizeof(open_state->stateid.data)); + } while (read_seqretry(&open_state->seqlock, seq)); + } else +- memcpy(dst->data, lo->plh_stateid.data, sizeof(lo->plh_stateid.data)); +- spin_unlock(&lo->plh_inode->i_lock); +- dprintk("<-- %s\n", __func__); ++ memcpy(dst->data, lo->stateid.data, sizeof(lo->stateid.data)); ++ dprintk("<-- %s status=%d\n", __func__, status); + return status; + } + +@@ -442,9 +599,9 @@ pnfs_choose_layoutget_stateid(nfs4_state + static struct pnfs_layout_segment * + send_layoutget(struct pnfs_layout_hdr *lo, + struct nfs_open_context *ctx, +- u32 iomode) ++ struct pnfs_layout_range *range) + { +- struct inode *ino = lo->plh_inode; ++ struct inode *ino = lo->inode; + struct nfs_server *server = NFS_SERVER(ino); + struct nfs4_layoutget 
*lgp; + struct pnfs_layout_segment *lseg = NULL; +@@ -453,13 +610,15 @@ send_layoutget(struct pnfs_layout_hdr *l + + BUG_ON(ctx == NULL); + lgp = kzalloc(sizeof(*lgp), GFP_KERNEL); +- if (lgp == NULL) ++ if (lgp == NULL) { ++ put_layout_hdr(lo); + return NULL; +- lgp->args.minlength = NFS4_MAX_UINT64; ++ } ++ lgp->args.minlength = PAGE_CACHE_SIZE; ++ if (lgp->args.minlength > range->length) ++ lgp->args.minlength = range->length; + lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; +- lgp->args.range.iomode = iomode; +- lgp->args.range.offset = 0; +- lgp->args.range.length = NFS4_MAX_UINT64; ++ lgp->args.range = *range; + lgp->args.type = server->pnfs_curr_ld->id; + lgp->args.inode = ino; + lgp->args.ctx = get_nfs_open_context(ctx); +@@ -471,86 +630,119 @@ send_layoutget(struct pnfs_layout_hdr *l + nfs4_proc_layoutget(lgp); + if (!lseg) { + /* remember that LAYOUTGET failed and suspend trying */ +- set_bit(lo_fail_bit(iomode), &lo->plh_flags); ++ set_bit(lo_fail_bit(range->iomode), &lo->plh_flags); + } + return lseg; + } + +-bool pnfs_roc(struct inode *ino) ++void nfs4_asynch_forget_layouts(struct pnfs_layout_hdr *lo, ++ struct pnfs_layout_range *range, ++ int notify_bit, atomic_t *notify_count, ++ struct list_head *tmp_list) + { +- struct pnfs_layout_hdr *lo; + struct pnfs_layout_segment *lseg, *tmp; +- LIST_HEAD(tmp_list); +- bool found = false; + +- spin_lock(&ino->i_lock); +- lo = NFS_I(ino)->layout; +- if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) || +- test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) +- goto out_nolayout; +- list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) +- if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { +- mark_lseg_invalid(lseg, &tmp_list); +- found = true; ++ assert_spin_locked(&lo->inode->i_lock); ++ list_for_each_entry_safe(lseg, tmp, &lo->segs, fi_list) ++ if (should_free_lseg(&lseg->range, range)) { ++ lseg->pls_notify_mask |= (1 << notify_bit); ++ atomic_inc(notify_count); ++ mark_lseg_invalid(lseg, tmp_list); + 
} +- if (!found) +- goto out_nolayout; +- lo->plh_block_lgets++; +- get_layout_hdr(lo); /* matched in pnfs_roc_release */ +- spin_unlock(&ino->i_lock); +- pnfs_free_lseg_list(&tmp_list); +- return true; +- +-out_nolayout: +- spin_unlock(&ino->i_lock); +- return false; + } + +-void pnfs_roc_release(struct inode *ino) ++/* Return true if there is layout based io in progress in the given range. ++ * Assumes range has already been marked invalid, and layout marked to ++ * prevent any new lseg from being inserted. ++ */ ++bool ++pnfs_return_layout_barrier(struct nfs_inode *nfsi, ++ struct pnfs_layout_range *range) + { +- struct pnfs_layout_hdr *lo; ++ struct pnfs_layout_segment *lseg; ++ bool ret = false; + +- spin_lock(&ino->i_lock); +- lo = NFS_I(ino)->layout; +- lo->plh_block_lgets--; +- put_layout_hdr_locked(lo); +- spin_unlock(&ino->i_lock); ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ list_for_each_entry(lseg, &nfsi->layout->segs, fi_list) ++ if (should_free_lseg(&lseg->range, range)) { ++ ret = true; ++ break; ++ } ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++ dprintk("%s:Return %d\n", __func__, ret); ++ return ret; + } + +-void pnfs_roc_set_barrier(struct inode *ino, u32 barrier) ++static int ++return_layout(struct inode *ino, struct pnfs_layout_range *range, bool wait) + { +- struct pnfs_layout_hdr *lo; ++ struct nfs4_layoutreturn *lrp; ++ struct nfs_server *server = NFS_SERVER(ino); ++ int status = -ENOMEM; + +- spin_lock(&ino->i_lock); +- lo = NFS_I(ino)->layout; +- if ((int)(barrier - lo->plh_barrier) > 0) +- lo->plh_barrier = barrier; +- spin_unlock(&ino->i_lock); ++ dprintk("--> %s\n", __func__); ++ ++ lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); ++ if (lrp == NULL) { ++ put_layout_hdr(NFS_I(ino)->layout); ++ goto out; ++ } ++ lrp->args.reclaim = 0; ++ lrp->args.layout_type = server->pnfs_curr_ld->id; ++ lrp->args.return_type = RETURN_FILE; ++ lrp->args.range = *range; ++ lrp->args.inode = ino; ++ lrp->clp = server->nfs_client; ++ ++ status = 
nfs4_proc_layoutreturn(lrp, wait); ++out: ++ dprintk("<-- %s status: %d\n", __func__, status); ++ return status; + } + +-bool pnfs_roc_drain(struct inode *ino, u32 *barrier) ++/* Initiates a LAYOUTRETURN(FILE) */ ++int ++_pnfs_return_layout(struct inode *ino, struct pnfs_layout_range *range, ++ bool wait) + { ++ struct pnfs_layout_hdr *lo = NULL; + struct nfs_inode *nfsi = NFS_I(ino); +- struct pnfs_layout_segment *lseg; +- bool found = false; ++ struct pnfs_layout_range arg; ++ LIST_HEAD(tmp_list); ++ int status = 0; + +- spin_lock(&ino->i_lock); +- list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) +- if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { +- found = true; +- break; +- } +- if (!found) { +- struct pnfs_layout_hdr *lo = nfsi->layout; +- u32 current_seqid = be32_to_cpu(lo->plh_stateid.stateid.seqid); ++ dprintk("--> %s\n", __func__); + +- /* Since close does not return a layout stateid for use as +- * a barrier, we choose the worst-case barrier. +- */ +- *barrier = current_seqid + atomic_read(&lo->plh_outstanding); ++ arg.iomode = range ? range->iomode : IOMODE_ANY; ++ arg.offset = 0; ++ arg.length = NFS4_MAX_UINT64; ++ ++ spin_lock(&ino->i_lock); ++ lo = nfsi->layout; ++ if (!lo || !pnfs_clear_lseg_list(lo, &tmp_list, &arg)) { ++ spin_unlock(&ino->i_lock); ++ dprintk("%s: no layout segments to return\n", __func__); ++ goto out; + } ++ lo->plh_block_lgets++; ++ /* Reference matched in nfs4_layoutreturn_release */ ++ get_layout_hdr(lo); + spin_unlock(&ino->i_lock); +- return found; ++ pnfs_free_lseg_list(&tmp_list); ++ ++ if (layoutcommit_needed(nfsi)) { ++ status = pnfs_layoutcommit_inode(ino, wait); ++ if (status) { ++ /* Return layout even if layoutcommit fails */ ++ dprintk("%s: layoutcommit failed, status=%d. 
" ++ "Returning layout anyway\n", ++ __func__, status); ++ } ++ } ++ status = return_layout(ino, &arg, wait); ++out: ++ dprintk("<-- %s status: %d\n", __func__, status); ++ return status; + } + + /* +@@ -559,10 +751,24 @@ bool pnfs_roc_drain(struct inode *ino, u + * are seen first. + */ + static s64 +-cmp_layout(u32 iomode1, u32 iomode2) ++cmp_layout(struct pnfs_layout_range *l1, ++ struct pnfs_layout_range *l2) + { ++ s64 d; ++ ++ /* higher offset > lower offset */ ++ d = l1->offset - l2->offset; ++ if (d) ++ return d; ++ ++ /* longer length > shorter length */ ++ d = l1->length - l2->length; ++ if (d) ++ return d; ++ + /* read > read/write */ +- return (int)(iomode2 == IOMODE_READ) - (int)(iomode1 == IOMODE_READ); ++ return (int)(l2->iomode == IOMODE_READ) - ++ (int)(l1->iomode == IOMODE_READ); + } + + static void +@@ -574,27 +780,30 @@ pnfs_insert_layout(struct pnfs_layout_hd + + dprintk("%s:Begin\n", __func__); + +- assert_spin_locked(&lo->plh_inode->i_lock); +- list_for_each_entry(lp, &lo->plh_segs, pls_list) { +- if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0) ++ assert_spin_locked(&lo->inode->i_lock); ++ list_for_each_entry(lp, &lo->segs, fi_list) { ++ if (cmp_layout(&lp->range, &lseg->range) > 0) + continue; +- list_add_tail(&lseg->pls_list, &lp->pls_list); ++ list_add_tail(&lseg->fi_list, &lp->fi_list); + dprintk("%s: inserted lseg %p " + "iomode %d offset %llu length %llu before " + "lp %p iomode %d offset %llu length %llu\n", +- __func__, lseg, lseg->pls_range.iomode, +- lseg->pls_range.offset, lseg->pls_range.length, +- lp, lp->pls_range.iomode, lp->pls_range.offset, +- lp->pls_range.length); ++ __func__, lseg, lseg->range.iomode, ++ lseg->range.offset, lseg->range.length, ++ lp, lp->range.iomode, lp->range.offset, ++ lp->range.length); + found = 1; + break; + } + if (!found) { +- list_add_tail(&lseg->pls_list, &lo->plh_segs); ++ list_add_tail(&lseg->fi_list, &lo->segs); ++ if (list_is_singular(&lo->segs) && ++ 
!pnfs_layoutgets_blocked(lo, NULL)) ++ rpc_wake_up(&NFS_I(lo->inode)->lo_rpcwaitq_stateid); + dprintk("%s: inserted lseg %p " + "iomode %d offset %llu length %llu at tail\n", +- __func__, lseg, lseg->pls_range.iomode, +- lseg->pls_range.offset, lseg->pls_range.length); ++ __func__, lseg, lseg->range.iomode, ++ lseg->range.offset, lseg->range.length); + } + get_layout_hdr(lo); + +@@ -606,14 +815,14 @@ alloc_init_layout_hdr(struct inode *ino) + { + struct pnfs_layout_hdr *lo; + +- lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL); ++ lo = pnfs_alloc_layout_hdr(ino); + if (!lo) + return NULL; + atomic_set(&lo->plh_refcount, 1); +- INIT_LIST_HEAD(&lo->plh_layouts); +- INIT_LIST_HEAD(&lo->plh_segs); ++ INIT_LIST_HEAD(&lo->layouts); ++ INIT_LIST_HEAD(&lo->segs); + INIT_LIST_HEAD(&lo->plh_bulk_recall); +- lo->plh_inode = ino; ++ lo->inode = ino; + return lo; + } + +@@ -626,12 +835,9 @@ pnfs_find_alloc_layout(struct inode *ino + dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout); + + assert_spin_locked(&ino->i_lock); +- if (nfsi->layout) { +- if (test_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags)) +- return NULL; +- else +- return nfsi->layout; +- } ++ if (nfsi->layout) ++ return nfsi->layout; ++ + spin_unlock(&ino->i_lock); + new = alloc_init_layout_hdr(ino); + spin_lock(&ino->i_lock); +@@ -639,13 +845,13 @@ pnfs_find_alloc_layout(struct inode *ino + if (likely(nfsi->layout == NULL)) /* Won the race? 
*/ + nfsi->layout = new; + else +- kfree(new); ++ pnfs_free_layout_hdr(new); + return nfsi->layout; + } + + /* + * iomode matching rules: +- * iomode lseg match ++ * range lseg match + * ----- ----- ----- + * ANY READ true + * ANY RW true +@@ -655,34 +861,47 @@ pnfs_find_alloc_layout(struct inode *ino + * READ RW true + */ + static int +-is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode) ++is_matching_lseg(struct pnfs_layout_segment *lseg, ++ struct pnfs_layout_range *range) + { +- return (iomode != IOMODE_RW || lseg->pls_range.iomode == IOMODE_RW); ++ struct pnfs_layout_range range1; ++ ++ if ((range->iomode == IOMODE_RW && lseg->range.iomode != IOMODE_RW) || ++ !lo_seg_intersecting(&lseg->range, range)) ++ return 0; ++ ++ /* range1 covers only the first byte in the range */ ++ range1 = *range; ++ range1.length = 1; ++ return lo_seg_contained(&lseg->range, &range1); + } + + /* + * lookup range in layout + */ + static struct pnfs_layout_segment * +-pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode) ++pnfs_find_lseg(struct pnfs_layout_hdr *lo, ++ struct pnfs_layout_range *range) + { + struct pnfs_layout_segment *lseg, *ret = NULL; + + dprintk("%s:Begin\n", __func__); + +- assert_spin_locked(&lo->plh_inode->i_lock); +- list_for_each_entry(lseg, &lo->plh_segs, pls_list) { ++ assert_spin_locked(&lo->inode->i_lock); ++ list_for_each_entry(lseg, &lo->segs, fi_list) { + if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && +- is_matching_lseg(lseg, iomode)) { ++ is_matching_lseg(lseg, range)) { ++ get_lseg(lseg); + ret = lseg; + break; + } +- if (cmp_layout(iomode, lseg->pls_range.iomode) > 0) ++ if (cmp_layout(range, &lseg->range) > 0) + break; + } + +- dprintk("%s:Return lseg %p ref %d\n", +- __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0); ++ dprintk("%s:Return lseg %p ref %d valid %d\n", ++ __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0, ++ ret ? 
test_bit(NFS_LSEG_VALID, &ret->pls_flags) : 0); + return ret; + } + +@@ -693,8 +912,15 @@ pnfs_find_lseg(struct pnfs_layout_hdr *l + struct pnfs_layout_segment * + pnfs_update_layout(struct inode *ino, + struct nfs_open_context *ctx, ++ loff_t pos, ++ u64 count, + enum pnfs_iomode iomode) + { ++ struct pnfs_layout_range arg = { ++ .iomode = iomode, ++ .offset = pos, ++ .length = count, ++ }; + struct nfs_inode *nfsi = NFS_I(ino); + struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; + struct pnfs_layout_hdr *lo; +@@ -709,14 +935,8 @@ pnfs_update_layout(struct inode *ino, + goto out_unlock; + } + +- /* Do we even need to bother with this? */ +- if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) || +- test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { +- dprintk("%s matches recall, use MDS\n", __func__); +- goto out_unlock; +- } + /* Check to see if the layout for the given range already exists */ +- lseg = pnfs_find_lseg(lo, iomode); ++ lseg = pnfs_find_lseg(lo, &arg); + if (lseg) + goto out_unlock; + +@@ -724,35 +944,29 @@ pnfs_update_layout(struct inode *ino, + if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags)) + goto out_unlock; + +- if (pnfs_layoutgets_blocked(lo, NULL, 0)) +- goto out_unlock; +- atomic_inc(&lo->plh_outstanding); +- +- get_layout_hdr(lo); +- if (list_empty(&lo->plh_segs)) { ++ get_layout_hdr(lo); /* Matched in pnfs_layoutget_release */ ++ if (list_empty(&lo->segs)) { + /* The lo must be on the clp list if there is any + * chance of a CB_LAYOUTRECALL(FILE) coming in. 
+ */ + spin_lock(&clp->cl_lock); +- BUG_ON(!list_empty(&lo->plh_layouts)); +- list_add_tail(&lo->plh_layouts, &clp->cl_layouts); ++ BUG_ON(!list_empty(&lo->layouts)); ++ list_add_tail(&lo->layouts, &clp->cl_layouts); + spin_unlock(&clp->cl_lock); + } + spin_unlock(&ino->i_lock); + +- lseg = send_layoutget(lo, ctx, iomode); ++ lseg = send_layoutget(lo, ctx, &arg); + if (!lseg) { + spin_lock(&ino->i_lock); +- if (list_empty(&lo->plh_segs)) { ++ if (list_empty(&lo->segs)) { + spin_lock(&clp->cl_lock); +- list_del_init(&lo->plh_layouts); ++ list_del_init(&lo->layouts); + spin_unlock(&clp->cl_lock); + clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); + } + spin_unlock(&ino->i_lock); + } +- atomic_dec(&lo->plh_outstanding); +- put_layout_hdr(lo); + out: + dprintk("%s end, state 0x%lx lseg %p\n", __func__, + nfsi->layout->plh_flags, lseg); +@@ -762,27 +976,29 @@ out_unlock: + goto out; + } + ++bool ++pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid) ++{ ++ assert_spin_locked(&lo->inode->i_lock); ++ if ((stateid) && ++ (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0) ++ return true; ++ return lo->plh_block_lgets || ++ test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || ++ (list_empty(&lo->segs) && ++ (atomic_read(&lo->plh_outstanding) != 0)); ++} ++ + int + pnfs_layout_process(struct nfs4_layoutget *lgp) + { + struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout; + struct nfs4_layoutget_res *res = &lgp->res; + struct pnfs_layout_segment *lseg; +- struct inode *ino = lo->plh_inode; ++ struct inode *ino = lo->inode; + struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; + int status = 0; + +- /* Verify we got what we asked for. +- * Note that because the xdr parsing only accepts a single +- * element array, this can fail even if the server is behaving +- * correctly. 
+- */ +- if (lgp->args.range.iomode > res->range.iomode || +- res->range.offset != 0 || +- res->range.length != NFS4_MAX_UINT64) { +- status = -EINVAL; +- goto out; +- } + /* Inject layout blob into I/O device driver */ + lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res); + if (!lseg || IS_ERR(lseg)) { +@@ -792,43 +1008,572 @@ pnfs_layout_process(struct nfs4_layoutge + status = PTR_ERR(lseg); + dprintk("%s: Could not allocate layout: error %d\n", + __func__, status); ++ spin_lock(&ino->i_lock); + goto out; + } + + spin_lock(&ino->i_lock); +- if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) || +- test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { ++ /* decrement needs to be done before call to pnfs_layoutget_blocked */ ++ atomic_dec(&lo->plh_outstanding); ++ spin_lock(&clp->cl_lock); ++ if (matches_outstanding_recall(ino, &res->range)) { ++ spin_unlock(&clp->cl_lock); + dprintk("%s forget reply due to recall\n", __func__); + goto out_forget_reply; + } ++ spin_unlock(&clp->cl_lock); + +- if (pnfs_layoutgets_blocked(lo, &res->stateid, 1)) { ++ if (pnfs_layoutgets_blocked(lo, &res->stateid)) { + dprintk("%s forget reply due to state\n", __func__); + goto out_forget_reply; + } + init_lseg(lo, lseg); +- lseg->pls_range = res->range; ++ lseg->range = res->range; ++ get_lseg(lseg); + *lgp->lsegpp = lseg; + pnfs_insert_layout(lo, lseg); + + if (res->return_on_close) { +- set_bit(NFS_LSEG_ROC, &lseg->pls_flags); +- set_bit(NFS_LAYOUT_ROC, &lo->plh_flags); ++ /* FI: This needs to be re-examined. At lo level, ++ * all it needs is a bit indicating whether any of ++ * the lsegs in the list have the flags set. ++ */ ++ lo->roc_iomode |= res->range.iomode; + } + + /* Done processing layoutget. 
Set the layout stateid */ + pnfs_set_layout_stateid(lo, &res->stateid, false); +- spin_unlock(&ino->i_lock); + out: ++ if (!pnfs_layoutgets_blocked(lo, NULL)) ++ rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq_stateid); ++ spin_unlock(&ino->i_lock); + return status; + + out_forget_reply: + spin_unlock(&ino->i_lock); +- lseg->pls_layout = lo; ++ lseg->layout = lo; + NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); ++ spin_lock(&ino->i_lock); ++ goto out; ++} ++ ++void ++readahead_range(struct inode *inode, struct list_head *pages, loff_t *offset, ++ size_t *count) ++{ ++ struct page *first, *last; ++ loff_t foff, i_size = i_size_read(inode); ++ pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; ++ size_t range; ++ ++ first = list_entry((pages)->prev, struct page, lru); ++ last = list_entry((pages)->next, struct page, lru); ++ ++ foff = (loff_t)first->index << PAGE_CACHE_SHIFT; ++ ++ range = (last->index - first->index) * PAGE_CACHE_SIZE; ++ if (last->index == end_index) ++ range += ((i_size - 1) & ~PAGE_CACHE_MASK) + 1; ++ else ++ range += PAGE_CACHE_SIZE; ++ dprintk("%s foff %lu, range %Zu\n", __func__, (unsigned long)foff, ++ range); ++ *offset = foff; ++ *count = range; ++} ++ ++void ++pnfs_set_pg_test(struct inode *inode, struct nfs_pageio_descriptor *pgio) ++{ ++ struct pnfs_layout_hdr *lo; ++ struct pnfs_layoutdriver_type *ld; ++ ++ pgio->pg_test = NULL; ++ ++ lo = NFS_I(inode)->layout; ++ ld = NFS_SERVER(inode)->pnfs_curr_ld; ++ if (!ld || !lo) ++ return; ++ ++ pgio->pg_test = ld->pg_test; ++} ++ ++/* ++ * rsize is already set by caller to MDS rsize. 
++ */ ++void ++pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, ++ struct inode *inode, ++ struct nfs_open_context *ctx, ++ struct list_head *pages, ++ size_t *rsize) ++{ ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ size_t count = 0; ++ loff_t loff; ++ ++ pgio->pg_iswrite = 0; ++ pgio->pg_test = NULL; ++ pgio->pg_lseg = NULL; ++ ++ if (!pnfs_enabled_sb(nfss)) ++ return; ++ ++ readahead_range(inode, pages, &loff, &count); ++ pgio->pg_lseg = pnfs_update_layout(inode, ctx, loff, count, IOMODE_READ); ++ if (pgio->pg_lseg) { ++ pnfs_set_pg_test(inode, pgio); ++ *rsize = NFS_SERVER(inode)->ds_rsize; ++ } ++} ++ ++void ++pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, ++ size_t *wsize) ++{ ++ struct nfs_server *server = NFS_SERVER(inode); ++ ++ pgio->pg_iswrite = 1; ++ if (!pnfs_enabled_sb(server)) ++ pgio->pg_test = NULL; ++ else { ++ pnfs_set_pg_test(inode, pgio); ++ *wsize = server->ds_wsize; ++ } ++} ++ ++/* Set buffer size for data servers */ ++void ++pnfs_set_ds_iosize(struct nfs_server *server) ++{ ++ unsigned dssize = 0; ++ ++ if (server->pnfs_curr_ld && server->pnfs_curr_ld->get_blocksize) ++ dssize = server->pnfs_curr_ld->get_blocksize(); ++ if (dssize) ++ server->ds_rsize = server->ds_wsize = ++ nfs_block_size(dssize, NULL); ++ else { ++ server->ds_wsize = server->wsize; ++ server->ds_rsize = server->rsize; ++ } ++} ++ ++static int ++pnfs_call_done(struct pnfs_call_data *pdata, struct rpc_task *task, void *data) ++{ ++ put_lseg(pdata->lseg); ++ pdata->lseg = NULL; ++ pdata->call_ops->rpc_call_done(task, data); ++ if (pdata->pnfs_error == -EAGAIN || task->tk_status == -EAGAIN) ++ return -EAGAIN; ++ if (pdata->pnfsflags & PNFS_NO_RPC) { ++ pdata->call_ops->rpc_release(data); ++ } else { ++ /* ++ * just restore original rpc call ops ++ * rpc_release will be called later by the rpc scheduling layer. 
++ */ ++ task->tk_ops = pdata->call_ops; ++ } ++ return 0; ++} ++ ++/* Post-write completion function ++ * Invoked by all layout drivers when write_pagelist is done. ++ * ++ * NOTE: callers set data->pnfsflags PNFS_NO_RPC ++ * so that the NFS cleanup routines perform only the page cache ++ * cleanup. ++ */ ++static void ++pnfs_write_retry(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_write_data *wdata; ++ struct pnfs_layout_range range; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ wdata = container_of(task, struct nfs_write_data, task); ++ range.iomode = IOMODE_RW; ++ range.offset = wdata->args.offset; ++ range.length = wdata->args.count; ++ _pnfs_return_layout(wdata->inode, &range, true); ++ pnfs_initiate_write(wdata, NFS_CLIENT(wdata->inode), ++ wdata->pdata.call_ops, wdata->pdata.how); ++} ++ ++void ++pnfs_writeback_done(struct nfs_write_data *data) ++{ ++ struct pnfs_call_data *pdata = &data->pdata; ++ ++ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status); ++ ++ /* update last write offset and need layout commit ++ * for non-files layout types (files layout calls ++ * pnfs4_write_done for this) ++ */ ++ if ((pdata->pnfsflags & PNFS_NO_RPC) && ++ data->task.tk_status >= 0 && data->res.count > 0) { ++ struct nfs_inode *nfsi = NFS_I(data->inode); ++ ++ pnfs_update_last_write(nfsi, data->args.offset, data->res.count); ++ pnfs_need_layoutcommit(nfsi, data->args.context); ++ } ++ ++ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) { ++ INIT_WORK(&data->task.u.tk_work, pnfs_write_retry); ++ queue_work(nfsiod_workqueue, &data->task.u.tk_work); ++ } ++} ++EXPORT_SYMBOL_GPL(pnfs_writeback_done); ++ ++static void _pnfs_clear_lseg_from_pages(struct list_head *head) ++{ ++ struct nfs_page *req; ++ ++ list_for_each_entry(req, head, wb_list) { ++ put_lseg(req->wb_lseg); ++ req->wb_lseg = NULL; ++ } ++} ++ ++/* ++ * Call the appropriate parallel I/O subsystem write function. 
++ * If no I/O device driver exists, or one does match the returned ++ * fstype, then return a positive status for regular NFS processing. ++ * ++ * TODO: Is wdata->how and wdata->args.stable always the same value? ++ * TODO: It seems in NFS, the server may not do a stable write even ++ * though it was requested (and vice-versa?). To check, it looks ++ * in data->res.verf->committed. Do we need this ability ++ * for non-file layout drivers? ++ */ ++enum pnfs_try_status ++pnfs_try_to_write_data(struct nfs_write_data *wdata, ++ const struct rpc_call_ops *call_ops, int how) ++{ ++ struct inode *inode = wdata->inode; ++ enum pnfs_try_status trypnfs; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ struct pnfs_layout_segment *lseg = wdata->req->wb_lseg; ++ ++ wdata->pdata.call_ops = call_ops; ++ wdata->pdata.pnfs_error = 0; ++ wdata->pdata.how = how; ++ ++ dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, ++ inode->i_ino, wdata->args.count, wdata->args.offset, how); ++ ++ get_lseg(lseg); ++ ++ if (!pnfs_use_rpc(nfss)) ++ wdata->pdata.pnfsflags |= PNFS_NO_RPC; ++ wdata->pdata.lseg = lseg; ++ trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, ++ nfs_page_array_len(wdata->args.pgbase, wdata->args.count), ++ how); ++ ++ if (trypnfs == PNFS_NOT_ATTEMPTED) { ++ wdata->pdata.pnfsflags &= ~PNFS_NO_RPC; ++ wdata->pdata.lseg = NULL; ++ put_lseg(lseg); ++ _pnfs_clear_lseg_from_pages(&wdata->pages); ++ } else { ++ nfs_inc_stats(inode, NFSIOS_PNFS_WRITE); ++ } ++ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); ++ return trypnfs; ++} ++ ++/* Post-read completion function. 
Invoked by all layout drivers when ++ * read_pagelist is done ++ */ ++static void ++pnfs_read_retry(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_read_data *rdata; ++ struct pnfs_layout_range range; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ rdata = container_of(task, struct nfs_read_data, task); ++ range.iomode = IOMODE_RW; ++ range.offset = rdata->args.offset; ++ range.length = rdata->args.count; ++ _pnfs_return_layout(rdata->inode, &range, true); ++ pnfs_initiate_read(rdata, NFS_CLIENT(rdata->inode), ++ rdata->pdata.call_ops); ++} ++ ++void ++pnfs_read_done(struct nfs_read_data *data) ++{ ++ struct pnfs_call_data *pdata = &data->pdata; ++ ++ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status); ++ ++ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) { ++ INIT_WORK(&data->task.u.tk_work, pnfs_read_retry); ++ queue_work(nfsiod_workqueue, &data->task.u.tk_work); ++ } ++} ++EXPORT_SYMBOL_GPL(pnfs_read_done); ++ ++/* ++ * Call the appropriate parallel I/O subsystem read function. ++ * If no I/O device driver exists, or one does match the returned ++ * fstype, then return a positive status for regular NFS processing. 
++ */ ++enum pnfs_try_status ++pnfs_try_to_read_data(struct nfs_read_data *rdata, ++ const struct rpc_call_ops *call_ops) ++{ ++ struct inode *inode = rdata->inode; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ struct pnfs_layout_segment *lseg = rdata->req->wb_lseg; ++ enum pnfs_try_status trypnfs; ++ ++ rdata->pdata.call_ops = call_ops; ++ rdata->pdata.pnfs_error = 0; ++ ++ dprintk("%s: Reading ino:%lu %u@%llu\n", ++ __func__, inode->i_ino, rdata->args.count, rdata->args.offset); ++ ++ get_lseg(lseg); ++ ++ if (!pnfs_use_rpc(nfss)) ++ rdata->pdata.pnfsflags |= PNFS_NO_RPC; ++ rdata->pdata.lseg = lseg; ++ trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata, ++ nfs_page_array_len(rdata->args.pgbase, rdata->args.count)); ++ if (trypnfs == PNFS_NOT_ATTEMPTED) { ++ rdata->pdata.pnfsflags &= ~PNFS_NO_RPC; ++ rdata->pdata.lseg = NULL; ++ put_lseg(lseg); ++ _pnfs_clear_lseg_from_pages(&rdata->pages); ++ } else { ++ nfs_inc_stats(inode, NFSIOS_PNFS_READ); ++ } ++ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); ++ return trypnfs; ++} ++ ++/* ++ * This gives the layout driver an opportunity to read in page "around" ++ * the data to be written. It returns 0 on success, otherwise an error code ++ * which will either be passed up to user, or ignored if ++ * some previous part of write succeeded. ++ * Note the range [pos, pos+len-1] is entirely within the page. 
++ */ ++int _pnfs_write_begin(struct inode *inode, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata **fsdata) ++{ ++ struct pnfs_fsdata *data; ++ int status = 0; ++ ++ dprintk("--> %s: pos=%llu len=%u\n", ++ __func__, (unsigned long long)pos, len); ++ data = kzalloc(sizeof(struct pnfs_fsdata), GFP_KERNEL); ++ if (!data) { ++ status = -ENOMEM; ++ goto out; ++ } ++ data->lseg = lseg; /* refcount passed into data to be managed there */ ++ status = NFS_SERVER(inode)->pnfs_curr_ld->write_begin( ++ lseg, page, pos, len, data); ++ if (status) { ++ kfree(data); ++ data = NULL; ++ } ++out: ++ *fsdata = data; ++ dprintk("<-- %s: status=%d\n", __func__, status); ++ return status; ++} ++ ++/* pNFS Commit callback function for all layout drivers */ ++void ++pnfs_commit_done(struct nfs_write_data *data) ++{ ++ struct pnfs_call_data *pdata = &data->pdata; ++ ++ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status); ++ ++ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) { ++ struct pnfs_layout_range range = { ++ .iomode = IOMODE_RW, ++ .offset = data->args.offset, ++ .length = data->args.count, ++ }; ++ dprintk("%s: retrying\n", __func__); ++ _pnfs_return_layout(data->inode, &range, true); ++ pnfs_initiate_commit(data, NFS_CLIENT(data->inode), ++ pdata->call_ops, pdata->how, 1); ++ } ++} ++EXPORT_SYMBOL_GPL(pnfs_commit_done); ++ ++enum pnfs_try_status ++pnfs_try_to_commit(struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, int sync) ++{ ++ struct inode *inode = data->inode; ++ struct nfs_server *nfss = NFS_SERVER(data->inode); ++ enum pnfs_try_status trypnfs; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ if (!pnfs_use_rpc(nfss)) ++ data->pdata.pnfsflags |= PNFS_NO_RPC; ++ /* We need to account for possibility that ++ * each nfs_page can point to a different lseg (or be NULL). ++ * For the immediate case of whole-file-only layouts, we at ++ * least know there can be only a single lseg. 
++ * We still have to account for the possibility of some being NULL. ++ * This will be done by passing the buck to the layout driver. ++ */ ++ data->pdata.call_ops = call_ops; ++ data->pdata.pnfs_error = 0; ++ data->pdata.how = sync; ++ data->pdata.lseg = NULL; ++ trypnfs = nfss->pnfs_curr_ld->commit(data, sync); ++ if (trypnfs == PNFS_NOT_ATTEMPTED) { ++ data->pdata.pnfsflags &= ~PNFS_NO_RPC; ++ _pnfs_clear_lseg_from_pages(&data->pages); ++ } else ++ nfs_inc_stats(inode, NFSIOS_PNFS_COMMIT); ++ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); ++ return trypnfs; ++} ++ ++void pnfs_cleanup_layoutcommit(struct inode *inode, ++ struct nfs4_layoutcommit_data *data) ++{ ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ ++ /* TODO: Maybe we should avoid this by allowing the layout driver ++ * to directly xdr its layout on the wire. ++ */ ++ if (nfss->pnfs_curr_ld->cleanup_layoutcommit) ++ nfss->pnfs_curr_ld->cleanup_layoutcommit( ++ NFS_I(inode)->layout, data); ++} ++ ++/* ++ * Set up the argument/result storage required for the RPC call. 
++ */ ++static int ++pnfs_setup_layoutcommit(struct inode *inode, ++ struct nfs4_layoutcommit_data *data, ++ loff_t write_begin_pos, loff_t write_end_pos) ++{ ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ int result = 0; ++ ++ dprintk("--> %s\n", __func__); ++ ++ data->args.inode = inode; ++ data->args.fh = NFS_FH(inode); ++ data->args.layout_type = nfss->pnfs_curr_ld->id; ++ data->res.fattr = &data->fattr; ++ nfs_fattr_init(&data->fattr); ++ ++ /* TODO: Need to determine the correct values */ ++ data->args.time_modify_changed = 0; ++ ++ /* Set values from inode so it can be reset ++ */ ++ data->args.range.iomode = IOMODE_RW; ++ data->args.range.offset = write_begin_pos; ++ data->args.range.length = write_end_pos - write_begin_pos + 1; ++ data->args.lastbytewritten = min(write_end_pos, ++ i_size_read(inode) - 1); ++ data->args.bitmask = nfss->attr_bitmask; ++ data->res.server = nfss; ++ ++ /* Call layout driver to set the arguments */ ++ if (nfss->pnfs_curr_ld->setup_layoutcommit) ++ result = nfss->pnfs_curr_ld->setup_layoutcommit( ++ NFS_I(inode)->layout, &data->args); ++ ++ dprintk("<-- %s Status %d\n", __func__, result); ++ return result; ++} ++ ++/* Issue a async layoutcommit for an inode. 
++ */ ++int ++pnfs_layoutcommit_inode(struct inode *inode, int sync) ++{ ++ struct nfs4_layoutcommit_data *data; ++ struct nfs_inode *nfsi = NFS_I(inode); ++ loff_t write_begin_pos; ++ loff_t write_end_pos; ++ ++ int status = 0; ++ ++ dprintk("%s Begin (sync:%d)\n", __func__, sync); ++ ++ BUG_ON(!has_layout(nfsi)); ++ ++ data = kzalloc(sizeof(*data), GFP_NOFS); ++ if (!data) ++ return -ENOMEM; ++ ++ spin_lock(&inode->i_lock); ++ if (!layoutcommit_needed(nfsi)) { ++ spin_unlock(&inode->i_lock); ++ goto out_free; ++ } ++ ++ /* Clear layoutcommit properties in the inode so ++ * new lc info can be generated ++ */ ++ write_begin_pos = nfsi->layout->write_begin_pos; ++ write_end_pos = nfsi->layout->write_end_pos; ++ data->cred = nfsi->layout->cred; ++ nfsi->layout->write_begin_pos = 0; ++ nfsi->layout->write_end_pos = 0; ++ nfsi->layout->cred = NULL; ++ __clear_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->plh_flags); ++ memcpy(data->args.stateid.data, nfsi->layout->stateid.data, ++ NFS4_STATEID_SIZE); ++ ++ /* Reference for layoutcommit matched in pnfs_layoutcommit_release */ ++ get_layout_hdr(NFS_I(inode)->layout); ++ ++ spin_unlock(&inode->i_lock); ++ ++ /* Set up layout commit args */ ++ status = pnfs_setup_layoutcommit(inode, data, write_begin_pos, ++ write_end_pos); ++ if (status) { ++ /* The layout driver failed to setup the layoutcommit */ ++ put_rpccred(data->cred); ++ put_layout_hdr(NFS_I(inode)->layout); ++ goto out_free; ++ } ++ status = nfs4_proc_layoutcommit(data, sync); ++out: ++ dprintk("%s end (err:%d)\n", __func__, status); ++ return status; ++out_free: ++ kfree(data); + goto out; + } + ++void pnfs_free_fsdata(struct pnfs_fsdata *fsdata) ++{ ++ /* lseg refcounting handled directly in nfs_write_end */ ++ kfree(fsdata); ++} ++ + /* + * Device ID cache. Currently supports one layout type per struct nfs_client. + * Add layout type to the lookup key to expand to support multiple types. 
+@@ -861,6 +1606,25 @@ pnfs_alloc_init_deviceid_cache(struct nf + } + EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache); + ++/* Must be called with locked c->dc_lock */ ++static struct pnfs_deviceid_node * ++pnfs_unhash_deviceid(struct pnfs_deviceid_cache *c, ++ struct nfs4_deviceid *id) ++{ ++ struct pnfs_deviceid_node *d; ++ struct hlist_node *n; ++ long h = nfs4_deviceid_hash(id); ++ ++ dprintk("%s hash %ld\n", __func__, h); ++ hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node) ++ if (!memcmp(&d->de_id, id, sizeof(*id))) { ++ hlist_del_rcu(&d->de_node); ++ return d; ++ } ++ ++ return NULL; ++} ++ + /* + * Called from pnfs_layoutdriver_type->free_lseg + * last layout segment reference frees deviceid +@@ -869,29 +1633,41 @@ void + pnfs_put_deviceid(struct pnfs_deviceid_cache *c, + struct pnfs_deviceid_node *devid) + { +- struct nfs4_deviceid *id = &devid->de_id; +- struct pnfs_deviceid_node *d; +- struct hlist_node *n; +- long h = nfs4_deviceid_hash(id); +- + dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref)); + if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock)) + return; + +- hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node) +- if (!memcmp(&d->de_id, id, sizeof(*id))) { +- hlist_del_rcu(&d->de_node); +- spin_unlock(&c->dc_lock); +- synchronize_rcu(); +- c->dc_free_callback(devid); +- return; +- } ++ pnfs_unhash_deviceid(c, &devid->de_id); + spin_unlock(&c->dc_lock); +- /* Why wasn't it found in the list?
*/ +- BUG(); ++ synchronize_rcu(); ++ c->dc_free_callback(devid); + } + EXPORT_SYMBOL_GPL(pnfs_put_deviceid); + ++/* ++ * Unhash the deviceid matching @id, wait for an RCU grace period, then drop ++ * one reference on it, calling dc_free_callback when the count hits zero. ++ * Does nothing when no hashed deviceid matches @id. ++ */ ++void ++pnfs_delete_deviceid(struct pnfs_deviceid_cache *c, ++ struct nfs4_deviceid *id) ++{ ++ struct pnfs_deviceid_node *devid; ++ ++ spin_lock(&c->dc_lock); ++ devid = pnfs_unhash_deviceid(c, id); ++ spin_unlock(&c->dc_lock); ++ /* pnfs_unhash_deviceid() returns NULL when @id is not hashed */ ++ if (!devid) ++ return; ++ synchronize_rcu(); ++ dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref)); ++ if (atomic_dec_and_test(&devid->de_ref)) ++ c->dc_free_callback(devid); ++} ++EXPORT_SYMBOL_GPL(pnfs_delete_deviceid); ++ + /* Find and reference a deviceid */ + struct pnfs_deviceid_node * + pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id) +diff -up linux-2.6.37.noarch/fs/nfs/pnfs.h.orig linux-2.6.37.noarch/fs/nfs/pnfs.h +--- linux-2.6.37.noarch/fs/nfs/pnfs.h.orig 2011-01-28 09:37:32.549979704 -0500 ++++ linux-2.6.37.noarch/fs/nfs/pnfs.h 2011-01-28 09:43:53.341771581 -0500 +@@ -30,17 +30,31 @@ + #ifndef FS_NFS_PNFS_H + #define FS_NFS_PNFS_H + ++#include ++#include "callback.h" ++ + enum { + NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ +- NFS_LSEG_ROC, /* roc bit received from server */ + }; + + struct pnfs_layout_segment { +- struct list_head pls_list; +- struct pnfs_layout_range pls_range; ++ struct list_head fi_list; ++ struct pnfs_layout_range range; + atomic_t pls_refcount; + unsigned long pls_flags; +- struct pnfs_layout_hdr *pls_layout; ++ struct pnfs_layout_hdr *layout; ++ u64 pls_notify_mask; ++}; ++ ++enum pnfs_try_status { ++ PNFS_ATTEMPTED = 0, ++ PNFS_NOT_ATTEMPTED = 1, ++}; ++ ++struct pnfs_fsdata { ++ struct pnfs_layout_segment *lseg; ++ int bypass_eof; ++ void *private; + }; + + #ifdef CONFIG_NFS_V4_1 +@@ -51,8 +65,15 @@ enum { + NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */ + NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ + NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ +- NFS_LAYOUT_ROC, /* some lseg had roc bit set */ +-
NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */ ++ NFS_LAYOUT_NEED_LCOMMIT, /* LAYOUTCOMMIT needed */ ++}; ++ ++enum layoutdriver_policy_flags { ++ /* Should the full nfs rpc cleanup code be used after io */ ++ PNFS_USE_RPC_CODE = 1 << 0, ++ ++ /* Should the pNFS client commit and return the layout upon a setattr */ ++ PNFS_LAYOUTRET_ON_SETATTR = 1 << 1, + }; + + /* Per-layout driver specific registration structure */ +@@ -61,23 +82,88 @@ struct pnfs_layoutdriver_type { + const u32 id; + const char *name; + struct module *owner; +- int (*set_layoutdriver) (struct nfs_server *); ++ unsigned flags; ++ int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *); + int (*clear_layoutdriver) (struct nfs_server *); ++ ++ struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode); ++ void (*free_layout_hdr) (struct pnfs_layout_hdr *); ++ + struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr); + void (*free_lseg) (struct pnfs_layout_segment *lseg); ++ ++ /* test for nfs page cache coalescing */ ++ int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); ++ ++ /* Retreive the block size of the file system. ++ * If gather_across_stripes == 1, then the file system will gather ++ * requests into the block size. ++ * TODO: Where will the layout driver get this info? It is hard ++ * coded in PVFS2. ++ */ ++ ssize_t (*get_blocksize) (void); ++ ++/* read and write pagelist should return just 0 (to indicate that ++ * the layout code has taken control) or 1 (to indicate that the ++ * layout code wishes to fall back to normal nfs.) If 0 is returned, ++ * information can be passed back through nfs_data->res and ++ * nfs_data->task.tk_status, and the appropriate pnfs done function ++ * MUST be called. 
++ */ ++ enum pnfs_try_status ++ (*read_pagelist) (struct nfs_read_data *nfs_data, unsigned nr_pages); ++ enum pnfs_try_status ++ (*write_pagelist) (struct nfs_write_data *nfs_data, unsigned nr_pages, int how); ++ int (*write_begin) (struct pnfs_layout_segment *lseg, struct page *page, ++ loff_t pos, unsigned count, ++ struct pnfs_fsdata *fsdata); ++ int (*write_end)(struct inode *inode, struct page *page, loff_t pos, ++ unsigned count, unsigned copied, ++ struct pnfs_layout_segment *lseg); ++ void (*write_end_cleanup)(struct file *filp, ++ struct pnfs_fsdata *fsdata); ++ ++ /* Consistency ops */ ++ /* 2 problems: ++ * 1) the page list contains nfs_pages, NOT pages ++ * 2) currently the NFS code doesn't create a page array (as it does with read/write) ++ */ ++ enum pnfs_try_status ++ (*commit) (struct nfs_write_data *nfs_data, int how); ++ ++ int (*setup_layoutcommit) (struct pnfs_layout_hdr *layoutid, ++ struct nfs4_layoutcommit_args *args); ++ ++ void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutcommit_args *args); ++ ++ void (*cleanup_layoutcommit) (struct pnfs_layout_hdr *layoutid, ++ struct nfs4_layoutcommit_data *data); ++ ++ void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutreturn_args *args); + }; + + struct pnfs_layout_hdr { + atomic_t plh_refcount; +- struct list_head plh_layouts; /* other client layouts */ ++ struct list_head layouts; /* other client layouts */ + struct list_head plh_bulk_recall; /* clnt list of bulk recalls */ +- struct list_head plh_segs; /* layout segments list */ +- nfs4_stateid plh_stateid; ++ struct list_head segs; /* layout segments list */ ++ int roc_iomode;/* return on close iomode, 0=none */ ++ nfs4_stateid stateid; + atomic_t plh_outstanding; /* number of RPCs out */ + unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */ + u32 plh_barrier; /* ignore lower seqids */ + unsigned long 
plh_flags; +- struct inode *plh_inode; ++ struct rpc_cred *cred; /* layoutcommit credential */ ++ /* DH: These vars keep track of the maximum write range ++ * so the values can be used for layoutcommit. ++ */ ++ loff_t write_begin_pos; ++ loff_t write_end_pos; ++ struct inode *inode; + }; + + struct pnfs_device { +@@ -90,6 +176,23 @@ struct pnfs_device { + unsigned int pglen; + }; + ++struct pnfs_cb_lrecall_info { ++ struct list_head pcl_list; /* hook into cl_layoutrecalls list */ ++ atomic_t pcl_count; ++ int pcl_notify_bit; ++ struct nfs_client *pcl_clp; ++ struct inode *pcl_ino; ++ struct cb_layoutrecallargs pcl_args; ++}; ++ ++#define NFS4_PNFS_GETDEVLIST_MAXNUM 16 ++ ++struct pnfs_devicelist { ++ unsigned int eof; ++ unsigned int num_devs; ++ struct nfs4_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM]; ++}; ++ + /* + * Device ID RCU cache. A device ID is unique per client ID and layout type. + */ +@@ -135,22 +238,55 @@ extern struct pnfs_deviceid_node *pnfs_a + struct pnfs_deviceid_node *); + extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c, + struct pnfs_deviceid_node *devid); ++extern void pnfs_delete_deviceid(struct pnfs_deviceid_cache *, ++ struct nfs4_deviceid *); + + extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); + extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); + + /* nfs4proc.c */ ++extern int nfs4_proc_getdevicelist(struct nfs_server *server, ++ const struct nfs_fh *fh, ++ struct pnfs_devicelist *devlist); + extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, + struct pnfs_device *dev); + extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); ++extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, ++ int issync); ++extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool wait); + + /* pnfs.c */ + void get_layout_hdr(struct pnfs_layout_hdr *lo); ++void get_lseg(struct pnfs_layout_segment *lseg); ++void put_lseg(struct pnfs_layout_segment *lseg); 
++bool should_free_lseg(struct pnfs_layout_range *lseg_range, ++ struct pnfs_layout_range *recall_range); + struct pnfs_layout_segment * + pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, +- enum pnfs_iomode access_type); +-void set_pnfs_layoutdriver(struct nfs_server *, u32 id); ++ loff_t pos, u64 count, enum pnfs_iomode access_type); ++bool pnfs_return_layout_barrier(struct nfs_inode *, struct pnfs_layout_range *); ++int _pnfs_return_layout(struct inode *, struct pnfs_layout_range *, bool wait); ++void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *mntfh, u32 id); + void unset_pnfs_layoutdriver(struct nfs_server *); ++enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *, ++ const struct rpc_call_ops *, int); ++enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *, ++ const struct rpc_call_ops *); ++void pnfs_cleanup_layoutcommit(struct inode *, ++ struct nfs4_layoutcommit_data *); ++int pnfs_layoutcommit_inode(struct inode *inode, int sync); ++void pnfs_update_last_write(struct nfs_inode *nfsi, loff_t offset, size_t extent); ++void pnfs_need_layoutcommit(struct nfs_inode *nfsi, struct nfs_open_context *ctx); ++void pnfs_set_ds_iosize(struct nfs_server *server); ++enum pnfs_try_status pnfs_try_to_commit(struct nfs_write_data *, ++ const struct rpc_call_ops *, int); ++void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *, ++ struct nfs_open_context *, struct list_head *, ++ size_t *); ++void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, ++ size_t *); ++void pnfs_free_fsdata(struct pnfs_fsdata *fsdata); ++bool pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid); + int pnfs_layout_process(struct nfs4_layoutget *lgp); + void pnfs_free_lseg_list(struct list_head *tmp_list); + void pnfs_destroy_layout(struct nfs_inode *); +@@ -162,14 +298,23 @@ void pnfs_set_layout_stateid(struct pnfs + int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, + 
struct pnfs_layout_hdr *lo, + struct nfs4_state *open_state); +-int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, +- struct list_head *tmp_list, +- u32 iomode); +-bool pnfs_roc(struct inode *ino); +-void pnfs_roc_release(struct inode *ino); +-void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); +-bool pnfs_roc_drain(struct inode *ino, u32 *barrier); ++void nfs4_asynch_forget_layouts(struct pnfs_layout_hdr *lo, ++ struct pnfs_layout_range *range, ++ int notify_bit, atomic_t *notify_count, ++ struct list_head *tmp_list); ++void pnfs_read_done(struct nfs_read_data *); ++void pnfs_writeback_done(struct nfs_write_data *); ++void pnfs_commit_done(struct nfs_write_data *); ++int _pnfs_write_begin(struct inode *inode, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata **fsdata); + ++static inline bool ++has_layout(struct nfs_inode *nfsi) ++{ ++ return nfsi->layout != NULL; ++} + + static inline int lo_fail_bit(u32 iomode) + { +@@ -183,6 +328,125 @@ static inline int pnfs_enabled_sb(struct + return nfss->pnfs_curr_ld != NULL; + } + ++static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata *fsdata) ++{ ++ return !fsdata || ((struct pnfs_layout_segment *)fsdata == lseg) || ++ !fsdata->bypass_eof; ++} ++ ++/* Should the pNFS client commit and return the layout upon a setattr */ ++static inline bool ++pnfs_ld_layoutret_on_setattr(struct inode *inode) ++{ ++ if (!pnfs_enabled_sb(NFS_SERVER(inode))) ++ return false; ++ return NFS_SERVER(inode)->pnfs_curr_ld->flags & ++ PNFS_LAYOUTRET_ON_SETATTR; ++} ++ ++static inline bool pnfs_use_rpc(struct nfs_server *nfss) ++{ ++ if (pnfs_enabled_sb(nfss)) ++ return nfss->pnfs_curr_ld->flags & PNFS_USE_RPC_CODE; ++ ++ return true; ++} ++ ++/* Should the pNFS client commit and return the layout on close ++ */ ++static inline int ++pnfs_layout_roc_iomode(struct nfs_inode *nfsi) ++{ ++ return nfsi->layout->roc_iomode; ++} ++ ++static inline int 
pnfs_write_begin(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ void **fsdata) ++{ ++ struct inode *inode = filp->f_dentry->d_inode; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ int status = 0; ++ ++ *fsdata = lseg; ++ if (lseg && nfss->pnfs_curr_ld->write_begin) ++ status = _pnfs_write_begin(inode, page, pos, len, lseg, ++ (struct pnfs_fsdata **) fsdata); ++ return status; ++} ++ ++/* CAREFUL - what happens if copied < len??? */ ++static inline int pnfs_write_end(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, unsigned copied, ++ struct pnfs_layout_segment *lseg) ++{ ++ struct inode *inode = filp->f_dentry->d_inode; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ ++ if (nfss->pnfs_curr_ld && nfss->pnfs_curr_ld->write_end) ++ return nfss->pnfs_curr_ld->write_end(inode, page, pos, len, ++ copied, lseg); ++ else ++ return 0; ++} ++ ++static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata) ++{ ++ struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode); ++ ++ if (fsdata && nfss->pnfs_curr_ld) { ++ if (nfss->pnfs_curr_ld->write_end_cleanup) ++ nfss->pnfs_curr_ld->write_end_cleanup(filp, fsdata); ++ if (nfss->pnfs_curr_ld->write_begin) ++ pnfs_free_fsdata(fsdata); ++ } ++} ++ ++static inline int pnfs_return_layout(struct inode *ino, ++ struct pnfs_layout_range *range, ++ bool wait) ++{ ++ struct nfs_inode *nfsi = NFS_I(ino); ++ struct nfs_server *nfss = NFS_SERVER(ino); ++ ++ if (pnfs_enabled_sb(nfss) && has_layout(nfsi)) ++ return _pnfs_return_layout(ino, range, wait); ++ ++ return 0; ++} ++ ++static inline bool ++layoutcommit_needed(struct nfs_inode *nfsi) ++{ ++ return has_layout(nfsi) && ++ test_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->plh_flags); ++} ++ ++static inline int pnfs_get_write_status(struct nfs_write_data *data) ++{ ++ return data->pdata.pnfs_error; ++} ++ ++static inline int pnfs_get_read_status(struct nfs_read_data *data) ++{ ++ return 
data->pdata.pnfs_error; ++} ++ ++static inline struct pnfs_layout_segment * ++nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata) ++{ ++ if (fsdata) { ++ struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode); ++ ++ if (nfss->pnfs_curr_ld && nfss->pnfs_curr_ld->write_begin) ++ return ((struct pnfs_fsdata *) fsdata)->lseg; ++ return (struct pnfs_layout_segment *)fsdata; ++ } ++ return NULL; ++} ++ + #else /* CONFIG_NFS_V4_1 */ + + static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) +@@ -193,36 +457,90 @@ static inline void pnfs_destroy_layout(s + { + } + ++static inline void get_lseg(struct pnfs_layout_segment *lseg) ++{ ++} ++ ++static inline void put_lseg(struct pnfs_layout_segment *lseg) ++{ ++} ++ + static inline struct pnfs_layout_segment * + pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, +- enum pnfs_iomode access_type) ++ loff_t pos, u64 count, enum pnfs_iomode access_type) + { + return NULL; + } + + static inline bool +-pnfs_roc(struct inode *ino) ++has_layout(struct nfs_inode *nfsi) + { + return false; + } + +-static inline void +-pnfs_roc_release(struct inode *ino) ++static inline bool ++layoutcommit_needed(struct nfs_inode *nfsi) + { ++ return 0; + } + +-static inline void +-pnfs_roc_set_barrier(struct inode *ino, u32 barrier) ++static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata *fsdata) + { ++ return 1; ++} ++ ++static inline enum pnfs_try_status ++pnfs_try_to_read_data(struct nfs_read_data *data, ++ const struct rpc_call_ops *call_ops) ++{ ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++static inline enum pnfs_try_status ++pnfs_try_to_write_data(struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, int how) ++{ ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++static inline enum pnfs_try_status ++pnfs_try_to_commit(struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, int how) ++{ ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++static inline int 
pnfs_layoutcommit_inode(struct inode *inode, int sync) ++{ ++ return 0; + } + + static inline bool +-pnfs_roc_drain(struct inode *ino, u32 *barrier) ++pnfs_ld_layoutret_on_setattr(struct inode *inode) + { + return false; + } + +-static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id) ++static inline bool pnfs_use_rpc(struct nfs_server *nfss) ++{ ++ return true; ++} ++ ++static inline int ++pnfs_layout_roc_iomode(struct nfs_inode *nfsi) ++{ ++ return 0; ++} ++ ++static inline int pnfs_return_layout(struct inode *ino, ++ struct pnfs_layout_range *range, ++ bool wait) ++{ ++ return 0; ++} ++ ++static inline void set_pnfs_layoutdriver(struct nfs_server *s, const struct nfs_fh *mntfh, u32 id) + { + } + +@@ -230,6 +548,62 @@ static inline void unset_pnfs_layoutdriv + { + } + ++static inline void pnfs_set_ds_iosize(struct nfs_server *server) ++{ ++ server->ds_wsize = server->ds_rsize = -1; ++} ++ ++static inline int pnfs_write_begin(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ void **fsdata) ++{ ++ *fsdata = NULL; ++ return 0; ++} ++ ++static inline int pnfs_write_end(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, unsigned copied, ++ struct pnfs_layout_segment *lseg) ++{ ++ return 0; ++} ++ ++static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata) ++{ ++} ++ ++static inline int pnfs_get_write_status(struct nfs_write_data *data) ++{ ++ return 0; ++} ++ ++static inline int pnfs_get_read_status(struct nfs_read_data *data) ++{ ++ return 0; ++} ++ ++static inline void ++pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino, ++ struct nfs_open_context *ctx, struct list_head *pages, ++ size_t *rsize) ++{ ++ pgio->pg_lseg = NULL; ++} ++ ++static inline void ++pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino, ++ size_t *wsize) ++{ ++ pgio->pg_lseg = NULL; ++} ++ ++static inline struct pnfs_layout_segment * 
++nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata) ++{ ++ return NULL; ++} ++ + #endif /* CONFIG_NFS_V4_1 */ + + #endif /* FS_NFS_PNFS_H */ +diff -up linux-2.6.37.noarch/fs/nfs/read.c.orig linux-2.6.37.noarch/fs/nfs/read.c +--- linux-2.6.37.noarch/fs/nfs/read.c.orig 2011-01-04 19:50:19.000000000 -0500 ++++ linux-2.6.37.noarch/fs/nfs/read.c 2011-01-28 09:43:53.342771448 -0500 +@@ -18,14 +18,17 @@ + #include + #include + #include ++#include ++#include + + #include ++#include ++#include "pnfs.h" + + #include "nfs4_fs.h" + #include "internal.h" + #include "iostat.h" + #include "fscache.h" +-#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_PAGECACHE + +@@ -117,12 +120,16 @@ int nfs_readpage_async(struct nfs_open_c + LIST_HEAD(one_request); + struct nfs_page *new; + unsigned int len; ++ loff_t pgoffs; ++ struct pnfs_layout_segment *lseg; + + len = nfs_page_length(page); + if (len == 0) + return nfs_return_empty_page(page); +- pnfs_update_layout(inode, ctx, IOMODE_READ); +- new = nfs_create_request(ctx, inode, page, 0, len); ++ pgoffs = (loff_t)page->index << PAGE_CACHE_SHIFT; ++ lseg = pnfs_update_layout(inode, ctx, pgoffs, len, IOMODE_READ); ++ new = nfs_create_request(ctx, inode, page, 0, len, lseg); ++ put_lseg(lseg); + if (IS_ERR(new)) { + unlock_page(page); + return PTR_ERR(new); +@@ -155,24 +162,20 @@ static void nfs_readpage_release(struct + nfs_release_request(req); + } + +-/* +- * Set up the NFS read request struct +- */ +-static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, +- const struct rpc_call_ops *call_ops, +- unsigned int count, unsigned int offset) ++int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops) + { +- struct inode *inode = req->wb_context->path.dentry->d_inode; ++ struct inode *inode = data->inode; + int swap_flags = IS_SWAPFILE(inode) ? 
NFS_RPC_SWAPFLAGS : 0; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +- .rpc_cred = req->wb_context->cred, ++ .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, +- .rpc_client = NFS_CLIENT(inode), ++ .rpc_client = clnt, + .rpc_message = &msg, + .callback_ops = call_ops, + .callback_data = data, +@@ -180,9 +183,46 @@ static int nfs_read_rpcsetup(struct nfs_ + .flags = RPC_TASK_ASYNC | swap_flags, + }; + ++ /* Set up the initial task struct. */ ++ NFS_PROTO(inode)->read_setup(data, &msg); ++ ++ dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ rpc_put_task(task); ++ return 0; ++} ++EXPORT_SYMBOL(nfs_initiate_read); ++ ++int pnfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops) ++{ ++ if (data->req->wb_lseg && ++ (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED)) ++ return pnfs_get_read_status(data); ++ ++ return nfs_initiate_read(data, clnt, call_ops); ++} ++ ++/* ++ * Set up the NFS read request struct ++ */ ++static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, ++ const struct rpc_call_ops *call_ops, ++ unsigned int count, unsigned int offset) ++{ ++ struct inode *inode = req->wb_context->path.dentry->d_inode; ++ + data->req = req; + data->inode = inode; +- data->cred = msg.rpc_cred; ++ data->cred = req->wb_context->cred; + + data->args.fh = NFS_FH(inode); + data->args.offset = req_offset(req) + offset; +@@ -197,21 +237,7 @@ static int nfs_read_rpcsetup(struct nfs_ + data->res.eof = 0; + nfs_fattr_init(&data->fattr); + +- /* Set up the initial task struct. 
*/ +- NFS_PROTO(inode)->read_setup(data, &msg); +- +- dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- count, +- (unsigned long long)data->args.offset); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- return PTR_ERR(task); +- rpc_put_task(task); +- return 0; ++ return pnfs_initiate_read(data, NFS_CLIENT(inode), call_ops); + } + + static void +@@ -355,7 +381,14 @@ static void nfs_readpage_retry(struct rp + { + struct nfs_readargs *argp = &data->args; + struct nfs_readres *resp = &data->res; ++ struct nfs_client *clp = NFS_SERVER(data->inode)->nfs_client; + ++#ifdef CONFIG_NFS_V4_1 ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS read\n", __func__); ++ clp = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ + if (resp->eof || resp->count == argp->count) + return; + +@@ -369,7 +402,10 @@ static void nfs_readpage_retry(struct rp + argp->offset += resp->count; + argp->pgbase += resp->count; + argp->count -= resp->count; +- nfs_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client); ++#ifdef CONFIG_NFS_V4_1 ++ data->pdata.pnfs_error = -EAGAIN; ++#endif /* CONFIG_NFS_V4_1 */ ++ nfs_restart_rpc(task, clp); + } + + /* +@@ -410,13 +446,19 @@ static void nfs_readpage_release_partial + void nfs_read_prepare(struct rpc_task *task, void *calldata) + { + struct nfs_read_data *data = calldata; ++ struct nfs4_session *ds_session = NULL; + +- if (nfs4_setup_sequence(NFS_SERVER(data->inode), ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS read\n", __func__); ++ ds_session = data->fldata.ds_nfs_client->cl_session; ++ } ++ if (nfs4_setup_sequence(NFS_SERVER(data->inode), ds_session, + &data->args.seq_args, &data->res.seq_res, + 0, task)) + return; + rpc_call_start(task); + } ++EXPORT_SYMBOL(nfs_read_prepare); + #endif /* CONFIG_NFS_V4_1 */ + + static const struct rpc_call_ops nfs_read_partial_ops = { +@@ -569,7 +611,20 @@ 
readpage_async_filler(void *data, struct + if (len == 0) + return nfs_return_empty_page(page); + +- new = nfs_create_request(desc->ctx, inode, page, 0, len); ++ if (desc->pgio->pg_lseg) { ++ loff_t pgoff = (loff_t)page->index << PAGE_CACHE_SHIFT; ++ struct pnfs_layout_range *range = &desc->pgio->pg_lseg->range; ++ ++ /* retry later with the right lseg? */ ++ if (range->offset > pgoff + len || ++ range->offset + range->length < pgoff) { ++ new = ERR_PTR(-EAGAIN); ++ goto out_error; ++ } ++ } ++ ++ new = nfs_create_request(desc->ctx, inode, page, 0, len, ++ desc->pgio->pg_lseg); + if (IS_ERR(new)) + goto out_error; + +@@ -625,7 +680,7 @@ int nfs_readpages(struct file *filp, str + if (ret == 0) + goto read_complete; /* all pages were read */ + +- pnfs_update_layout(inode, desc.ctx, IOMODE_READ); ++ pnfs_pageio_init_read(&pgio, inode, desc.ctx, pages, &rsize); + if (rsize < PAGE_CACHE_SIZE) + nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); + else +@@ -634,6 +689,7 @@ int nfs_readpages(struct file *filp, str + ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); + + nfs_pageio_complete(&pgio); ++ put_lseg(pgio.pg_lseg); + npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + nfs_add_stats(inode, NFSIOS_READPAGES, npages); + read_complete: +diff -up linux-2.6.37.noarch/fs/nfs/super.c.orig linux-2.6.37.noarch/fs/nfs/super.c +--- linux-2.6.37.noarch/fs/nfs/super.c.orig 2011-01-28 09:37:32.551979635 -0500 ++++ linux-2.6.37.noarch/fs/nfs/super.c 2011-01-28 09:43:53.343771315 -0500 +@@ -63,6 +63,7 @@ + #include "iostat.h" + #include "internal.h" + #include "fscache.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_VFS + +@@ -725,6 +726,28 @@ static int nfs_show_options(struct seq_f + + return 0; + } ++#ifdef CONFIG_NFS_V4_1 ++void show_sessions(struct seq_file *m, struct nfs_server *server) ++{ ++ if (nfs4_has_session(server->nfs_client)) ++ seq_printf(m, ",sessions"); ++} ++#else ++void show_sessions(struct seq_file 
*m, struct nfs_server *server) {} ++#endif ++ ++#ifdef CONFIG_NFS_V4_1 ++void show_pnfs(struct seq_file *m, struct nfs_server *server) ++{ ++ seq_printf(m, ",pnfs="); ++ if (server->pnfs_curr_ld) ++ seq_printf(m, "%s", server->pnfs_curr_ld->name); ++ else ++ seq_printf(m, "not configured"); ++} ++#else /* CONFIG_NFS_V4_1 */ ++void show_pnfs(struct seq_file *m, struct nfs_server *server) {} ++#endif /* CONFIG_NFS_V4_1 */ + + /* + * Present statistical information for this VFS mountpoint +@@ -763,6 +786,8 @@ static int nfs_show_stats(struct seq_fil + seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); + seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); + seq_printf(m, ",acl=0x%x", nfss->acl_bitmask); ++ show_sessions(m, nfss); ++ show_pnfs(m, nfss); + } + #endif + +diff -up linux-2.6.37.noarch/fs/nfs/unlink.c.orig linux-2.6.37.noarch/fs/nfs/unlink.c +--- linux-2.6.37.noarch/fs/nfs/unlink.c.orig 2011-01-28 09:37:32.552979600 -0500 ++++ linux-2.6.37.noarch/fs/nfs/unlink.c 2011-01-28 09:43:53.344771185 -0500 +@@ -113,7 +113,7 @@ void nfs_unlink_prepare(struct rpc_task + struct nfs_unlinkdata *data = calldata; + struct nfs_server *server = NFS_SERVER(data->dir); + +- if (nfs4_setup_sequence(server, &data->args.seq_args, ++ if (nfs4_setup_sequence(server, NULL, &data->args.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); +@@ -388,7 +388,7 @@ static void nfs_rename_prepare(struct rp + struct nfs_renamedata *data = calldata; + struct nfs_server *server = NFS_SERVER(data->old_dir); + +- if (nfs4_setup_sequence(server, &data->args.seq_args, ++ if (nfs4_setup_sequence(server, NULL, &data->args.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); +diff -up linux-2.6.37.noarch/fs/nfs/write.c.orig linux-2.6.37.noarch/fs/nfs/write.c +--- linux-2.6.37.noarch/fs/nfs/write.c.orig 2011-01-04 19:50:19.000000000 -0500 ++++ linux-2.6.37.noarch/fs/nfs/write.c 2011-01-28 09:43:53.345771055 -0500 +@@ -28,6 +28,7 @@ + #include "iostat.h" + 
#include "nfs4_fs.h" + #include "fscache.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_PAGECACHE + +@@ -58,6 +59,7 @@ struct nfs_write_data *nfs_commitdata_al + } + return p; + } ++EXPORT_SYMBOL(nfs_commitdata_alloc); + + void nfs_commit_free(struct nfs_write_data *p) + { +@@ -426,6 +428,17 @@ static void nfs_inode_remove_request(str + spin_unlock(&inode->i_lock); + nfs_release_request(req); + } ++static void ++nfs_mark_request_nopnfs(struct nfs_page *req) ++{ ++ struct pnfs_layout_segment *lseg = req->wb_lseg; ++ ++ if (req->wb_lseg == NULL) ++ return; ++ req->wb_lseg = NULL; ++ put_lseg(lseg); ++ dprintk(" retry through MDS\n"); ++} + + static void + nfs_mark_request_dirty(struct nfs_page *req) +@@ -531,7 +544,7 @@ nfs_need_commit(struct nfs_inode *nfsi) + * The requests are *not* checked to ensure that they form a contiguous set. + */ + static int +-nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) ++nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages, int *use_pnfs) + { + struct nfs_inode *nfsi = NFS_I(inode); + int ret; +@@ -539,7 +552,8 @@ nfs_scan_commit(struct inode *inode, str + if (!nfs_need_commit(nfsi)) + return 0; + +- ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); ++ ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT, ++ use_pnfs); + if (ret > 0) + nfsi->ncommit -= ret; + if (nfs_need_commit(NFS_I(inode))) +@@ -568,7 +582,8 @@ static inline int nfs_scan_commit(struct + static struct nfs_page *nfs_try_to_update_request(struct inode *inode, + struct page *page, + unsigned int offset, +- unsigned int bytes) ++ unsigned int bytes, ++ struct pnfs_layout_segment *lseg) + { + struct nfs_page *req; + unsigned int rqend; +@@ -593,8 +608,8 @@ static struct nfs_page *nfs_try_to_updat + * Note: nfs_flush_incompatible() will already + * have flushed out requests having wrong owners. 
+ */ +- if (offset > rqend +- || end < req->wb_offset) ++ if (offset > rqend || end < req->wb_offset || ++ req->wb_lseg != lseg) + goto out_flushme; + + if (nfs_set_page_tag_locked(req)) +@@ -642,16 +657,17 @@ out_err: + * already called nfs_flush_incompatible() if necessary. + */ + static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, +- struct page *page, unsigned int offset, unsigned int bytes) ++ struct page *page, unsigned int offset, unsigned int bytes, ++ struct pnfs_layout_segment *lseg) + { + struct inode *inode = page->mapping->host; + struct nfs_page *req; + int error; + +- req = nfs_try_to_update_request(inode, page, offset, bytes); ++ req = nfs_try_to_update_request(inode, page, offset, bytes, lseg); + if (req != NULL) + goto out; +- req = nfs_create_request(ctx, inode, page, offset, bytes); ++ req = nfs_create_request(ctx, inode, page, offset, bytes, lseg); + if (IS_ERR(req)) + goto out; + error = nfs_inode_add_request(inode, req); +@@ -664,23 +680,27 @@ out: + } + + static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, +- unsigned int offset, unsigned int count) ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg, ++ void *fsdata) + { + struct nfs_page *req; + +- req = nfs_setup_write_request(ctx, page, offset, count); ++ req = nfs_setup_write_request(ctx, page, offset, count, lseg); + if (IS_ERR(req)) + return PTR_ERR(req); + nfs_mark_request_dirty(req); + /* Update file length */ +- nfs_grow_file(page, offset, count); ++ if (pnfs_grow_ok(lseg, fsdata)) ++ nfs_grow_file(page, offset, count); + nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); + nfs_mark_request_dirty(req); + nfs_clear_page_tag_locked(req); + return 0; + } + +-int nfs_flush_incompatible(struct file *file, struct page *page) ++int nfs_flush_incompatible(struct file *file, struct page *page, ++ struct pnfs_layout_segment *lseg) + { + struct nfs_open_context *ctx = nfs_file_open_context(file); + struct 
nfs_page *req; +@@ -699,7 +719,8 @@ int nfs_flush_incompatible(struct file * + return 0; + do_flush = req->wb_page != page || req->wb_context != ctx || + req->wb_lock_context->lockowner != current->files || +- req->wb_lock_context->pid != current->tgid; ++ req->wb_lock_context->pid != current->tgid || ++ req->wb_lseg != lseg; + nfs_release_request(req); + if (!do_flush) + return 0; +@@ -726,7 +747,8 @@ static int nfs_write_pageuptodate(struct + * things with a page scheduled for an RPC call (e.g. invalidate it). + */ + int nfs_updatepage(struct file *file, struct page *page, +- unsigned int offset, unsigned int count) ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg, void *fsdata) + { + struct nfs_open_context *ctx = nfs_file_open_context(file); + struct inode *inode = page->mapping->host; +@@ -751,7 +773,7 @@ int nfs_updatepage(struct file *file, st + offset = 0; + } + +- status = nfs_writepage_setup(ctx, page, offset, count); ++ status = nfs_writepage_setup(ctx, page, offset, count, lseg, fsdata); + if (status < 0) + nfs_set_pageerror(page); + +@@ -781,25 +803,21 @@ static int flush_task_priority(int how) + return RPC_PRIORITY_NORMAL; + } + +-/* +- * Set up the argument/result storage required for the RPC call. 
+- */ +-static int nfs_write_rpcsetup(struct nfs_page *req, +- struct nfs_write_data *data, +- const struct rpc_call_ops *call_ops, +- unsigned int count, unsigned int offset, +- int how) ++int nfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how) + { +- struct inode *inode = req->wb_context->path.dentry->d_inode; ++ struct inode *inode = data->inode; + int priority = flush_task_priority(how); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +- .rpc_cred = req->wb_context->cred, ++ .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { +- .rpc_client = NFS_CLIENT(inode), ++ .rpc_client = clnt, + .task = &data->task, + .rpc_message = &msg, + .callback_ops = call_ops, +@@ -810,12 +828,62 @@ static int nfs_write_rpcsetup(struct nfs + }; + int ret = 0; + ++ /* Set up the initial task struct. */ ++ NFS_PROTO(inode)->write_setup(data, &msg); ++ ++ dprintk("NFS: %5u initiated write call " ++ "(req %s/%lld, %u bytes @ offset %llu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) { ++ ret = PTR_ERR(task); ++ goto out; ++ } ++ if (how & FLUSH_SYNC) { ++ ret = rpc_wait_for_completion_task(task); ++ if (ret == 0) ++ ret = task->tk_status; ++ } ++ rpc_put_task(task); ++out: ++ return ret; ++} ++EXPORT_SYMBOL(nfs_initiate_write); ++ ++int pnfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how) ++{ ++ if (data->req->wb_lseg && ++ (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED)) ++ return pnfs_get_write_status(data); ++ ++ return nfs_initiate_write(data, clnt, call_ops, how); ++} ++ ++/* ++ * Set up the argument/result storage required for the RPC call. 
++ */ ++static int nfs_write_rpcsetup(struct nfs_page *req, ++ struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, ++ unsigned int count, unsigned int offset, ++ int how) ++{ ++ struct inode *inode = req->wb_context->path.dentry->d_inode; ++ + /* Set up the RPC argument and reply structs + * NB: take care not to mess about with data->commit et al. */ + + data->req = req; + data->inode = inode = req->wb_context->path.dentry->d_inode; +- data->cred = msg.rpc_cred; ++ data->cred = req->wb_context->cred; + + data->args.fh = NFS_FH(inode); + data->args.offset = req_offset(req) + offset; +@@ -836,30 +904,7 @@ static int nfs_write_rpcsetup(struct nfs + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + +- /* Set up the initial task struct. */ +- NFS_PROTO(inode)->write_setup(data, &msg); +- +- dprintk("NFS: %5u initiated write call " +- "(req %s/%lld, %u bytes @ offset %llu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- count, +- (unsigned long long)data->args.offset); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) { +- ret = PTR_ERR(task); +- goto out; +- } +- if (how & FLUSH_SYNC) { +- ret = rpc_wait_for_completion_task(task); +- if (ret == 0) +- ret = task->tk_status; +- } +- rpc_put_task(task); +-out: +- return ret; ++ return pnfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how); + } + + /* If a nfs_flush_* function fails, it should remove reqs from @head and +@@ -870,6 +915,7 @@ static void nfs_redirty_request(struct n + { + struct page *page = req->wb_page; + ++ nfs_mark_request_nopnfs(req); + nfs_mark_request_dirty(req); + nfs_clear_page_tag_locked(req); + nfs_end_page_writeback(page); +@@ -982,6 +1028,8 @@ static void nfs_pageio_init_write(struct + { + size_t wsize = NFS_SERVER(inode)->wsize; + ++ pnfs_pageio_init_write(pgio, inode, &wsize); ++ + if (wsize < PAGE_CACHE_SIZE) + nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); + else +@@ -1047,13 +1095,27 @@ 
out: + void nfs_write_prepare(struct rpc_task *task, void *calldata) + { + struct nfs_write_data *data = calldata; ++ struct nfs4_session *ds_session = NULL; + +- if (nfs4_setup_sequence(NFS_SERVER(data->inode), ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS write\n", __func__); ++ ds_session = data->fldata.ds_nfs_client->cl_session; ++ } else if (data->args.count > NFS_SERVER(data->inode)->wsize) { ++ /* retrying via MDS? */ ++ data->pdata.orig_count = data->args.count; ++ data->args.count = NFS_SERVER(data->inode)->wsize; ++ dprintk("%s: trimmed count %u to wsize %u\n", __func__, ++ data->pdata.orig_count, data->args.count); ++ } else ++ data->pdata.orig_count = 0; ++ ++ if (nfs4_setup_sequence(NFS_SERVER(data->inode), ds_session, + &data->args.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); + } ++EXPORT_SYMBOL(nfs_write_prepare); + #endif /* CONFIG_NFS_V4_1 */ + + static const struct rpc_call_ops nfs_write_partial_ops = { +@@ -1137,10 +1199,11 @@ int nfs_writeback_done(struct rpc_task * + struct nfs_writeargs *argp = &data->args; + struct nfs_writeres *resp = &data->res; + struct nfs_server *server = NFS_SERVER(data->inode); ++ struct nfs_client *clp = server->nfs_client; + int status; + +- dprintk("NFS: %5u nfs_writeback_done (status %d)\n", +- task->tk_pid, task->tk_status); ++ dprintk("NFS: %5u nfs_writeback_done (status %d count %u)\n", ++ task->tk_pid, task->tk_status, resp->count); + + /* + * ->write_done will attempt to use post-op attributes to detect +@@ -1153,6 +1216,13 @@ int nfs_writeback_done(struct rpc_task * + if (status != 0) + return status; + nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count); ++#ifdef CONFIG_NFS_V4_1 ++ /* Is this a DS session */ ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS write\n", __func__); ++ clp = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ + + #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) + if (resp->verf->committed < argp->stable && 
task->tk_status >= 0) { +@@ -1169,7 +1239,7 @@ int nfs_writeback_done(struct rpc_task * + if (time_before(complain, jiffies)) { + dprintk("NFS: faulty NFS server %s:" + " (committed = %d) != (stable = %d)\n", +- server->nfs_client->cl_hostname, ++ clp->cl_hostname, + resp->verf->committed, argp->stable); + complain = jiffies + 300 * HZ; + } +@@ -1195,6 +1265,9 @@ int nfs_writeback_done(struct rpc_task * + */ + argp->stable = NFS_FILE_SYNC; + } ++#ifdef CONFIG_NFS_V4_1 ++ data->pdata.pnfs_error = -EAGAIN; ++#endif /* CONFIG_NFS_V4_1 */ + nfs_restart_rpc(task, server->nfs_client); + return -EAGAIN; + } +@@ -1239,40 +1312,73 @@ static void nfs_commitdata_release(void + nfs_commit_free(wdata); + } + +-/* +- * Set up the argument/result storage required for the RPC call. +- */ +-static int nfs_commit_rpcsetup(struct list_head *head, +- struct nfs_write_data *data, +- int how) ++int nfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how) + { +- struct nfs_page *first = nfs_list_entry(head->next); +- struct inode *inode = first->wb_context->path.dentry->d_inode; ++ struct inode *inode = data->inode; + int priority = flush_task_priority(how); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +- .rpc_cred = first->wb_context->cred, ++ .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, +- .rpc_client = NFS_CLIENT(inode), ++ .rpc_client = clnt, + .rpc_message = &msg, +- .callback_ops = &nfs_commit_ops, ++ .callback_ops = call_ops, + .callback_data = data, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + .priority = priority, + }; + ++ /* Set up the initial task struct. 
*/ ++ NFS_PROTO(inode)->commit_setup(data, &msg); ++ ++ dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); ++ ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ rpc_put_task(task); ++ return 0; ++} ++EXPORT_SYMBOL(nfs_initiate_commit); ++ ++ ++int pnfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how, int pnfs) ++{ ++ if (pnfs && ++ (pnfs_try_to_commit(data, &nfs_commit_ops, how) == PNFS_ATTEMPTED)) ++ return pnfs_get_write_status(data); ++ ++ return nfs_initiate_commit(data, clnt, &nfs_commit_ops, how); ++} ++ ++/* ++ * Set up the argument/result storage required for the RPC call. ++ */ ++static int nfs_commit_rpcsetup(struct list_head *head, ++ struct nfs_write_data *data, ++ int how, int pnfs) ++{ ++ struct nfs_page *first = nfs_list_entry(head->next); ++ struct inode *inode = first->wb_context->path.dentry->d_inode; ++ + /* Set up the RPC argument and reply structs + * NB: take care not to mess about with data->commit et al. */ + + list_splice_init(head, &data->pages); + + data->inode = inode; +- data->cred = msg.rpc_cred; ++ data->cred = first->wb_context->cred; + + data->args.fh = NFS_FH(data->inode); + /* Note: we always request a commit of the entire inode */ +@@ -1283,45 +1389,47 @@ static int nfs_commit_rpcsetup(struct li + data->res.fattr = &data->fattr; + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); ++ kref_init(&data->refcount); ++ data->parent = NULL; ++ data->args.context = first->wb_context; /* used by commit done */ + +- /* Set up the initial task struct. 
*/ +- NFS_PROTO(inode)->commit_setup(data, &msg); ++ return pnfs_initiate_commit(data, NFS_CLIENT(inode), &nfs_commit_ops, ++ how, pnfs); ++} + +- dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); ++/* Handle memory error during commit */ ++void nfs_mark_list_commit(struct list_head *head) ++{ ++ struct nfs_page *req; + +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- return PTR_ERR(task); +- rpc_put_task(task); +- return 0; ++ while (!list_empty(head)) { ++ req = nfs_list_entry(head->next); ++ nfs_list_remove_request(req); ++ nfs_mark_request_commit(req); ++ dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); ++ dec_bdi_stat(req->wb_page->mapping->backing_dev_info, ++ BDI_RECLAIMABLE); ++ nfs_clear_page_tag_locked(req); ++ } + } ++EXPORT_SYMBOL(nfs_mark_list_commit); + + /* + * Commit dirty pages + */ + static int +-nfs_commit_list(struct inode *inode, struct list_head *head, int how) ++nfs_commit_list(struct inode *inode, struct list_head *head, int how, int pnfs) + { + struct nfs_write_data *data; +- struct nfs_page *req; + + data = nfs_commitdata_alloc(); +- + if (!data) + goto out_bad; + + /* Set up the argument struct */ +- return nfs_commit_rpcsetup(head, data, how); ++ return nfs_commit_rpcsetup(head, data, how, pnfs); + out_bad: +- while (!list_empty(head)) { +- req = nfs_list_entry(head->next); +- nfs_list_remove_request(req); +- nfs_mark_request_commit(req); +- dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); +- dec_bdi_stat(req->wb_page->mapping->backing_dev_info, +- BDI_RECLAIMABLE); +- nfs_clear_page_tag_locked(req); +- } ++ nfs_mark_list_commit(head); + nfs_commit_clear_lock(NFS_I(inode)); + return -ENOMEM; + } +@@ -1341,6 +1449,19 @@ static void nfs_commit_done(struct rpc_t + return; + } + ++static inline void nfs_commit_cleanup(struct kref *kref) ++{ ++ struct nfs_write_data *data; ++ ++ data = container_of(kref, struct nfs_write_data, refcount); ++ /* Clear lock only when all cloned commits are finished */ ++ if 
(data->parent) ++ kref_put(&data->parent->refcount, nfs_commit_cleanup); ++ else ++ nfs_commit_clear_lock(NFS_I(data->inode)); ++ nfs_commitdata_release(data); ++} ++ + static void nfs_commit_release(void *calldata) + { + struct nfs_write_data *data = calldata; +@@ -1358,6 +1479,11 @@ static void nfs_commit_release(void *cal + req->wb_bytes, + (long long)req_offset(req)); + if (status < 0) { ++ if (req->wb_lseg) { ++ nfs_mark_request_nopnfs(req); ++ nfs_mark_request_dirty(req); ++ goto next; ++ } + nfs_context_set_write_error(req->wb_context, status); + nfs_inode_remove_request(req); + dprintk(", error = %d\n", status); +@@ -1374,12 +1500,12 @@ static void nfs_commit_release(void *cal + } + /* We have a mismatch. Write the page again */ + dprintk(" mismatch\n"); ++ nfs_mark_request_nopnfs(req); + nfs_mark_request_dirty(req); + next: + nfs_clear_page_tag_locked(req); + } +- nfs_commit_clear_lock(NFS_I(data->inode)); +- nfs_commitdata_release(calldata); ++ kref_put(&data->refcount, nfs_commit_cleanup); + } + + static const struct rpc_call_ops nfs_commit_ops = { +@@ -1395,21 +1521,22 @@ int nfs_commit_inode(struct inode *inode + LIST_HEAD(head); + int may_wait = how & FLUSH_SYNC; + int res = 0; ++ int use_pnfs = 0; + + if (!nfs_commit_set_lock(NFS_I(inode), may_wait)) + goto out_mark_dirty; + spin_lock(&inode->i_lock); +- res = nfs_scan_commit(inode, &head, 0, 0); ++ res = nfs_scan_commit(inode, &head, 0, 0, &use_pnfs); + spin_unlock(&inode->i_lock); + if (res) { +- int error = nfs_commit_list(inode, &head, how); ++ int error = nfs_commit_list(inode, &head, how, use_pnfs); + if (error < 0) + return error; +- if (may_wait) ++ if (may_wait) { + wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT, + nfs_wait_bit_killable, + TASK_KILLABLE); +- else ++ } else + goto out_mark_dirty; + } else + nfs_commit_clear_lock(NFS_I(inode)); +@@ -1464,7 +1591,18 @@ static int nfs_commit_unstable_pages(str + + int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) + { +- 
return nfs_commit_unstable_pages(inode, wbc); ++ int ret; ++ ret = nfs_commit_unstable_pages(inode, wbc); ++ if (ret >= 0 && layoutcommit_needed(NFS_I(inode))) { ++ int err, sync = wbc->sync_mode; ++ ++ if (wbc->nonblocking || wbc->for_background) ++ sync = 0; ++ err = pnfs_layoutcommit_inode(inode, sync); ++ if (err < 0) ++ ret = err; ++ } ++ return ret; + } + + /* +diff -up linux-2.6.37.noarch/include/linux/exportfs.h.orig linux-2.6.37.noarch/include/linux/exportfs.h +--- linux-2.6.37.noarch/include/linux/exportfs.h.orig 2011-01-04 19:50:19.000000000 -0500 ++++ linux-2.6.37.noarch/include/linux/exportfs.h 2011-01-28 09:43:53.375767416 -0500 +@@ -2,6 +2,7 @@ + #define LINUX_EXPORTFS_H 1 + + #include ++#include + + struct dentry; + struct inode; +@@ -188,4 +189,62 @@ extern struct dentry *generic_fh_to_pare + struct fid *fid, int fh_len, int fh_type, + struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen)); + ++#if defined(CONFIG_EXPORTFS_FILE_LAYOUT) ++struct pnfs_filelayout_device; ++struct pnfs_filelayout_layout; ++ ++extern int filelayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct pnfs_filelayout_device *fdev); ++extern enum nfsstat4 filelayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct pnfs_filelayout_layout *flp); ++#endif /* defined(CONFIG_EXPORTFS_FILE_LAYOUT) */ ++ ++#if defined(CONFIG_EXPORTFS_FILE_LAYOUT) ++struct list_head; ++ ++extern int blocklayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct list_head *volumes); ++ ++extern enum nfsstat4 blocklayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct list_head *layouts); ++#endif /* defined(CONFIG_EXPORTFS_FILE_LAYOUT) */ ++ ++#if defined(CONFIG_PNFSD) ++#include ++ ++struct pnfsd_cb_operations; ++ ++struct pnfsd_cb_ctl { ++ spinlock_t lock; ++ struct module *module; ++ const struct pnfsd_cb_operations *cb_op; ++}; ++ ++/* in expfs.c so that file systems can depend on it */ ++extern struct pnfsd_cb_ctl pnfsd_cb_ctl; ++ ++static 
inline int ++pnfsd_get_cb_op(struct pnfsd_cb_ctl *ctl) ++{ ++ int ret = -ENOENT; ++ ++ spin_lock(&pnfsd_cb_ctl.lock); ++ if (!pnfsd_cb_ctl.cb_op) ++ goto out; ++ if (!try_module_get(pnfsd_cb_ctl.module)) ++ goto out; ++ ctl->cb_op = pnfsd_cb_ctl.cb_op; ++ ctl->module = pnfsd_cb_ctl.module; ++ ret = 0; ++out: ++ spin_unlock(&pnfsd_cb_ctl.lock); ++ return ret; ++} ++ ++static inline void ++pnfsd_put_cb_op(struct pnfsd_cb_ctl *ctl) ++{ ++ module_put(ctl->module); ++} ++#endif /* CONFIG_PNFSD */ + #endif /* LINUX_EXPORTFS_H */ +diff -up linux-2.6.37.noarch/include/linux/exp_xdr.h.orig linux-2.6.37.noarch/include/linux/exp_xdr.h +--- linux-2.6.37.noarch/include/linux/exp_xdr.h.orig 2011-01-28 09:43:53.373767706 -0500 ++++ linux-2.6.37.noarch/include/linux/exp_xdr.h 2011-01-28 09:43:53.374767561 -0500 +@@ -0,0 +1,141 @@ ++#ifndef _LINUX_EXP_XDR_H ++#define _LINUX_EXP_XDR_H ++ ++#include ++#include ++#include ++ ++struct exp_xdr_stream { ++ __be32 *p; ++ __be32 *end; ++}; ++ ++/** ++ * exp_xdr_qwords - Calculate the number of quad-words holding nbytes ++ * @nbytes: number of bytes to encode ++ */ ++static inline size_t ++exp_xdr_qwords(__u32 nbytes) ++{ ++ return DIV_ROUND_UP(nbytes, 4); ++} ++ ++/** ++ * exp_xdr_qbytes - Calculate the number of bytes holding qwords ++ * @qwords: number of quad-words to encode ++ */ ++static inline size_t ++exp_xdr_qbytes(size_t qwords) ++{ ++ return qwords << 2; ++} ++ ++/** ++ * exp_xdr_reserve_space - Reserve buffer space for sending ++ * @xdr: pointer to exp_xdr_stream ++ * @nbytes: number of bytes to reserve ++ * ++ * Checks that we have enough buffer space to encode 'nbytes' more ++ * bytes of data. If so, update the xdr stream. 
++ */ ++static inline __be32 * ++exp_xdr_reserve_space(struct exp_xdr_stream *xdr, size_t nbytes) ++{ ++ __be32 *p = xdr->p; ++ __be32 *q; ++ ++ /* align nbytes on the next 32-bit boundary */ ++ q = p + exp_xdr_qwords(nbytes); ++ if (unlikely(q > xdr->end || q < p)) ++ return NULL; ++ xdr->p = q; ++ return p; ++} ++ ++/** ++ * exp_xdr_reserve_qwords - Reserve buffer space for sending ++ * @xdr: pointer to exp_xdr_stream ++ * @qwords: number of quad words (u32's) to reserve ++ */ ++static inline __be32 * ++exp_xdr_reserve_qwords(struct exp_xdr_stream *xdr, size_t qwords) ++{ ++ return exp_xdr_reserve_space(xdr, exp_xdr_qbytes(qwords)); ++} ++ ++/** ++ * exp_xdr_encode_u32 - Encode an unsigned 32-bit value onto a xdr stream ++ * @p: pointer to encoding destination ++ * @val: value to encode ++ */ ++static inline __be32 * ++exp_xdr_encode_u32(__be32 *p, __u32 val) ++{ ++ *p = cpu_to_be32(val); ++ return p + 1; ++} ++ ++/** ++ * exp_xdr_encode_u64 - Encode an unsigned 64-bit value onto a xdr stream ++ * @p: pointer to encoding destination ++ * @val: value to encode ++ */ ++static inline __be32 * ++exp_xdr_encode_u64(__be32 *p, __u64 val) ++{ ++ put_unaligned_be64(val, p); ++ return p + 2; ++} ++ ++/** ++ * exp_xdr_encode_bytes - Encode an array of bytes onto a xdr stream ++ * @p: pointer to encoding destination ++ * @ptr: pointer to the array of bytes ++ * @nbytes: number of bytes to encode ++ */ ++static inline __be32 * ++exp_xdr_encode_bytes(__be32 *p, const void *ptr, __u32 nbytes) ++{ ++ if (likely(nbytes != 0)) { ++ unsigned int qwords = exp_xdr_qwords(nbytes); ++ unsigned int padding = exp_xdr_qbytes(qwords) - nbytes; ++ ++ memcpy(p, ptr, nbytes); ++ if (padding != 0) ++ memset((char *)p + nbytes, 0, padding); ++ p += qwords; ++ } ++ return p; ++} ++ ++/** ++ * exp_xdr_encode_opaque - Encode an opaque type onto a xdr stream ++ * @p: pointer to encoding destination ++ * @ptr: pointer to the opaque array ++ * @nbytes: number of bytes to encode ++ * ++ * Encodes the 
32-bit opaque size in bytes followed by the opaque value. ++ */ ++static inline __be32 * ++exp_xdr_encode_opaque(__be32 *p, const void *ptr, __u32 nbytes) ++{ ++ p = exp_xdr_encode_u32(p, nbytes); ++ return exp_xdr_encode_bytes(p, ptr, nbytes); ++} ++ ++/** ++ * exp_xdr_encode_opaque_len - Encode the opaque length onto a xdr stream ++ * @lenp: pointer to the opaque length destination ++ * @endp: pointer to the end of the opaque array ++ * ++ * Encodes the 32-bit opaque size in bytes given the start and end pointers ++ */ ++static inline __be32 * ++exp_xdr_encode_opaque_len(__be32 *lenp, const void *endp) ++{ ++ size_t nbytes = (char *)endp - (char *)(lenp + 1); ++ ++ exp_xdr_encode_u32(lenp, nbytes); ++ return lenp + 1 + exp_xdr_qwords(nbytes); ++} ++#endif /* _LINUX_EXP_XDR_H */ +diff -up linux-2.6.37.noarch/include/linux/fs.h.orig linux-2.6.37.noarch/include/linux/fs.h +--- linux-2.6.37.noarch/include/linux/fs.h.orig 2011-01-28 09:37:32.791971306 -0500 ++++ linux-2.6.37.noarch/include/linux/fs.h 2011-01-28 09:43:53.377767138 -0500 +@@ -399,6 +399,7 @@ struct inodes_stat_t { + #include + + struct export_operations; ++struct pnfs_export_operations; + struct hd_geometry; + struct iovec; + struct nameidata; +@@ -1367,6 +1368,7 @@ struct super_block { + const struct dquot_operations *dq_op; + const struct quotactl_ops *s_qcop; + const struct export_operations *s_export_op; ++ const struct pnfs_export_operations *s_pnfs_op; + unsigned long s_flags; + unsigned long s_magic; + struct dentry *s_root; +diff -up linux-2.6.37.noarch/include/linux/nfs4.h.orig linux-2.6.37.noarch/include/linux/nfs4.h +--- linux-2.6.37.noarch/include/linux/nfs4.h.orig 2011-01-28 09:37:32.862968843 -0500 ++++ linux-2.6.37.noarch/include/linux/nfs4.h 2011-01-28 09:43:53.379766874 -0500 +@@ -17,6 +17,7 @@ + + #define NFS4_BITMAP_SIZE 2 + #define NFS4_VERIFIER_SIZE 8 ++#define NFS4_CLIENTID_SIZE 8 + #define NFS4_STATEID_SEQID_SIZE 4 + #define NFS4_STATEID_OTHER_SIZE 12 + #define NFS4_STATEID_SIZE 
(NFS4_STATEID_SEQID_SIZE + NFS4_STATEID_OTHER_SIZE) +@@ -131,6 +132,13 @@ + #define EXCHGID4_FLAG_MASK_A 0x40070103 + #define EXCHGID4_FLAG_MASK_R 0x80070103 + ++static inline bool ++is_ds_only_session(u32 exchange_flags) ++{ ++ u32 mask = EXCHGID4_FLAG_USE_PNFS_DS | EXCHGID4_FLAG_USE_PNFS_MDS; ++ return (exchange_flags & mask) == EXCHGID4_FLAG_USE_PNFS_DS; ++} ++ + #define SEQ4_STATUS_CB_PATH_DOWN 0x00000001 + #define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRING 0x00000002 + #define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRED 0x00000004 +@@ -181,7 +189,13 @@ struct nfs4_acl { + struct nfs4_ace aces[0]; + }; + ++struct nfs4_fsid { ++ u64 major; ++ u64 minor; ++}; ++ + typedef struct { char data[NFS4_VERIFIER_SIZE]; } nfs4_verifier; ++typedef struct { char data[NFS4_CLIENTID_SIZE]; } nfs4_clientid; + + struct nfs41_stateid { + __be32 seqid; +@@ -559,7 +573,12 @@ enum { + NFSPROC4_CLNT_GET_LEASE_TIME, + NFSPROC4_CLNT_RECLAIM_COMPLETE, + NFSPROC4_CLNT_LAYOUTGET, ++ NFSPROC4_CLNT_LAYOUTCOMMIT, ++ NFSPROC4_CLNT_LAYOUTRETURN, ++ NFSPROC4_CLNT_GETDEVICELIST, + NFSPROC4_CLNT_GETDEVICEINFO, ++ NFSPROC4_CLNT_PNFS_WRITE, ++ NFSPROC4_CLNT_PNFS_COMMIT, + }; + + /* nfs41 types */ +@@ -582,6 +601,8 @@ enum pnfs_layouttype { + LAYOUT_NFSV4_1_FILES = 1, + LAYOUT_OSD2_OBJECTS = 2, + LAYOUT_BLOCK_VOLUME = 3, ++ ++ NFS4_PNFS_PRIVATE_LAYOUT = 0x80000000 + }; + + /* used for both layout return and recall */ +diff -up linux-2.6.37.noarch/include/linux/nfsd4_block.h.orig linux-2.6.37.noarch/include/linux/nfsd4_block.h +--- linux-2.6.37.noarch/include/linux/nfsd4_block.h.orig 2011-01-28 09:43:53.392765435 -0500 ++++ linux-2.6.37.noarch/include/linux/nfsd4_block.h 2011-01-28 09:43:53.392765435 -0500 +@@ -0,0 +1,101 @@ ++#ifndef NFSD4_BLOCK ++#define NFSD4_BLOCK ++ ++#include ++#include ++#include ++#include ++ ++#define PNFS_BLOCK_SUCCESS 1 ++#define PNFS_BLOCK_FAILURE 0 ++ ++#define PNFS_BLOCK_CTL_START 1 ++#define PNFS_BLOCK_CTL_STOP 2 ++#define PNFS_BLOCK_CTL_VERS 3 /* Allows daemon to request current 
++ * version from kernel via an upcall. ++ */ ++ ++#define PNFS_UPCALL_MSG_STOP 0 ++#define PNFS_UPCALL_MSG_GETSIG 1 ++#define PNFS_UPCALL_MSG_GETSLICE 2 ++#define PNFS_UPCALL_MSG_DMCHK 3 // See if dev_t is a DM volume ++#define PNFS_UPCALL_MSG_DMGET 4 ++#define PNFS_UPCALL_MSG_VERS 5 ++ ++#define PNFS_UPCALL_VERS 8 ++ ++typedef struct stripe_dev { ++ int major, ++ minor, ++ offset; ++} stripe_dev_t; ++ ++typedef struct bl_comm_res { ++ int res_status; ++ union { ++ struct { ++ long long start, ++ length; ++ } slice; ++ struct { ++ int num_stripes, ++ stripe_size; ++ stripe_dev_t devs[]; ++ } stripe; ++ struct { ++ long long sector; ++ int offset, ++ len; ++ char sig[]; ++ } sig; ++ int vers, ++ dm_vol; ++ } u; ++} bl_comm_res_t; ++ ++typedef struct bl_comm_msg { ++ int msg_type, ++ msg_status; ++ union { ++ dev_t msg_dev; ++ int msg_vers; ++ } u; ++ bl_comm_res_t *msg_res; ++} bl_comm_msg_t; ++ ++#ifdef __KERNEL__ ++ ++typedef struct bl_comm { ++ /* ---- protects access to this structure ---- */ ++ struct mutex lock; ++ /* ---- protects access to rpc pipe ---- */ ++ struct mutex pipe_lock; ++ struct dentry *pipe_dentry; ++ wait_queue_head_t pipe_wq; ++ bl_comm_msg_t msg; ++} bl_comm_t; ++ ++int pnfs_block_enabled(struct inode *, int); ++int bl_layout_type(struct super_block *sb); ++int bl_getdeviceiter(struct super_block *, u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *); ++int bl_getdeviceinfo(struct super_block *, struct exp_xdr_stream *, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *); ++enum nfsstat4 bl_layoutget(struct inode *, struct exp_xdr_stream *, ++ const struct nfsd4_pnfs_layoutget_arg *, ++ struct nfsd4_pnfs_layoutget_res *); ++int bl_layoutcommit(struct inode *, ++ const struct nfsd4_pnfs_layoutcommit_arg *, ++ struct nfsd4_pnfs_layoutcommit_res *); ++int bl_layoutreturn(struct inode *, ++ const struct nfsd4_pnfs_layoutreturn_arg *); ++int bl_layoutrecall(struct inode *inode, int type, u64 offset, u64 len); ++int bl_init_proc(void); 
++int bl_upcall(bl_comm_t *, bl_comm_msg_t *, bl_comm_res_t **); ++ ++extern bl_comm_t *bl_comm_global; // Ugly... ++#endif /* __KERNEL__ */ ++ ++#endif /* NFSD4_BLOCK */ ++ +diff -up linux-2.6.37.noarch/include/linux/nfsd4_spnfs.h.orig linux-2.6.37.noarch/include/linux/nfsd4_spnfs.h +--- linux-2.6.37.noarch/include/linux/nfsd4_spnfs.h.orig 2011-01-28 09:43:53.394765249 -0500 ++++ linux-2.6.37.noarch/include/linux/nfsd4_spnfs.h 2011-01-28 09:43:53.394765249 -0500 +@@ -0,0 +1,345 @@ ++/* ++ * include/linux/nfsd4_spnfs.h ++ * ++ * spNFS - simple pNFS implementation with userspace daemon ++ * ++ */ ++ ++/****************************************************************************** ++ ++(c) 2007 Network Appliance, Inc. All Rights Reserved. ++ ++Network Appliance provides this source code under the GPL v2 License. ++The GPL v2 license is available at ++http://opensource.org/licenses/gpl-license.php. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ ++******************************************************************************/ ++ ++#ifndef NFS_SPNFS_H ++#define NFS_SPNFS_H ++ ++ ++#ifdef __KERNEL__ ++#include "exportfs.h" ++#include "sunrpc/svc.h" ++#include "nfsd/nfsfh.h" ++#else ++#include ++#endif /* __KERNEL__ */ ++ ++#define SPNFS_STATUS_INVALIDMSG 0x01 ++#define SPNFS_STATUS_AGAIN 0x02 ++#define SPNFS_STATUS_FAIL 0x04 ++#define SPNFS_STATUS_SUCCESS 0x08 ++ ++#define SPNFS_TYPE_LAYOUTGET 0x01 ++#define SPNFS_TYPE_LAYOUTCOMMIT 0x02 ++#define SPNFS_TYPE_LAYOUTRETURN 0x03 ++#define SPNFS_TYPE_GETDEVICEITER 0x04 ++#define SPNFS_TYPE_GETDEVICEINFO 0x05 ++#define SPNFS_TYPE_SETATTR 0x06 ++#define SPNFS_TYPE_OPEN 0x07 ++#define SPNFS_TYPE_CLOSE 0x08 ++#define SPNFS_TYPE_CREATE 0x09 ++#define SPNFS_TYPE_REMOVE 0x0a ++#define SPNFS_TYPE_COMMIT 0x0b ++#define SPNFS_TYPE_READ 0x0c ++#define SPNFS_TYPE_WRITE 0x0d ++ ++#define SPNFS_MAX_DEVICES 1 ++#define SPNFS_MAX_DATA_SERVERS 16 ++#define SPNFS_MAX_IO 512 ++ ++/* layout */ ++struct spnfs_msg_layoutget_args { ++ unsigned long inode; ++ unsigned long generation; ++}; ++ ++struct spnfs_filelayout_list { ++ u_int32_t fh_len; ++ unsigned char fh_val[128]; /* DMXXX fix this const */ ++}; ++ ++struct spnfs_msg_layoutget_res { ++ int status; ++ u_int64_t devid; ++ u_int64_t stripe_size; ++ u_int32_t stripe_type; ++ u_int32_t stripe_count; ++ struct spnfs_filelayout_list flist[SPNFS_MAX_DATA_SERVERS]; ++}; ++ ++/* layoutcommit */ ++struct spnfs_msg_layoutcommit_args { ++ unsigned long inode; ++ unsigned long generation; ++ u_int64_t file_size; ++}; ++ ++struct spnfs_msg_layoutcommit_res { ++ int status; ++}; ++ ++/* layoutreturn */ ++/* No op for the daemon */ ++/* ++struct spnfs_msg_layoutreturn_args { ++}; ++ ++struct spnfs_msg_layoutreturn_res { ++}; ++*/ ++ ++/* getdeviceiter */ ++struct spnfs_msg_getdeviceiter_args { ++ unsigned long inode; ++ u_int64_t cookie; ++ u_int64_t verf; ++}; ++ ++struct spnfs_msg_getdeviceiter_res { ++ int status; ++ u_int64_t devid; ++ 
u_int64_t cookie; ++ u_int64_t verf; ++ u_int32_t eof; ++}; ++ ++/* getdeviceinfo */ ++struct spnfs_data_server { ++ u_int32_t dsid; ++ char netid[5]; ++ char addr[29]; ++}; ++ ++struct spnfs_device { ++ u_int64_t devid; ++ int dscount; ++ struct spnfs_data_server dslist[SPNFS_MAX_DATA_SERVERS]; ++}; ++ ++struct spnfs_msg_getdeviceinfo_args { ++ u_int64_t devid; ++}; ++ ++struct spnfs_msg_getdeviceinfo_res { ++ int status; ++ struct spnfs_device devinfo; ++}; ++ ++/* setattr */ ++struct spnfs_msg_setattr_args { ++ unsigned long inode; ++ unsigned long generation; ++ int file_size; ++}; ++ ++struct spnfs_msg_setattr_res { ++ int status; ++}; ++ ++/* open */ ++struct spnfs_msg_open_args { ++ unsigned long inode; ++ unsigned long generation; ++ int create; ++ int createmode; ++ int truncate; ++}; ++ ++struct spnfs_msg_open_res { ++ int status; ++}; ++ ++/* close */ ++/* No op for daemon */ ++struct spnfs_msg_close_args { ++ int x; ++}; ++ ++struct spnfs_msg_close_res { ++ int y; ++}; ++ ++/* create */ ++/* ++struct spnfs_msg_create_args { ++ int x; ++}; ++ ++struct spnfs_msg_create_res { ++ int y; ++}; ++*/ ++ ++/* remove */ ++struct spnfs_msg_remove_args { ++ unsigned long inode; ++ unsigned long generation; ++}; ++ ++struct spnfs_msg_remove_res { ++ int status; ++}; ++ ++/* commit */ ++/* ++struct spnfs_msg_commit_args { ++ int x; ++}; ++ ++struct spnfs_msg_commit_res { ++ int y; ++}; ++*/ ++ ++/* read */ ++struct spnfs_msg_read_args { ++ unsigned long inode; ++ unsigned long generation; ++ loff_t offset; ++ unsigned long len; ++}; ++ ++struct spnfs_msg_read_res { ++ int status; ++ char data[SPNFS_MAX_IO]; ++}; ++ ++/* write */ ++struct spnfs_msg_write_args { ++ unsigned long inode; ++ unsigned long generation; ++ loff_t offset; ++ unsigned long len; ++ char data[SPNFS_MAX_IO]; ++}; ++ ++struct spnfs_msg_write_res { ++ int status; ++}; ++ ++/* bundle args and responses */ ++union spnfs_msg_args { ++ struct spnfs_msg_layoutget_args layoutget_args; ++ struct 
spnfs_msg_layoutcommit_args layoutcommit_args; ++/* ++ struct spnfs_msg_layoutreturn_args layoutreturn_args; ++*/ ++ struct spnfs_msg_getdeviceiter_args getdeviceiter_args; ++ struct spnfs_msg_getdeviceinfo_args getdeviceinfo_args; ++ struct spnfs_msg_setattr_args setattr_args; ++ struct spnfs_msg_open_args open_args; ++ struct spnfs_msg_close_args close_args; ++/* ++ struct spnfs_msg_create_args create_args; ++*/ ++ struct spnfs_msg_remove_args remove_args; ++/* ++ struct spnfs_msg_commit_args commit_args; ++*/ ++ struct spnfs_msg_read_args read_args; ++ struct spnfs_msg_write_args write_args; ++}; ++ ++union spnfs_msg_res { ++ struct spnfs_msg_layoutget_res layoutget_res; ++ struct spnfs_msg_layoutcommit_res layoutcommit_res; ++/* ++ struct spnfs_msg_layoutreturn_res layoutreturn_res; ++*/ ++ struct spnfs_msg_getdeviceiter_res getdeviceiter_res; ++ struct spnfs_msg_getdeviceinfo_res getdeviceinfo_res; ++ struct spnfs_msg_setattr_res setattr_res; ++ struct spnfs_msg_open_res open_res; ++ struct spnfs_msg_close_res close_res; ++/* ++ struct spnfs_msg_create_res create_res; ++*/ ++ struct spnfs_msg_remove_res remove_res; ++/* ++ struct spnfs_msg_commit_res commit_res; ++*/ ++ struct spnfs_msg_read_res read_res; ++ struct spnfs_msg_write_res write_res; ++}; ++ ++/* a spnfs message, args and response */ ++struct spnfs_msg { ++ unsigned char im_type; ++ unsigned char im_status; ++ union spnfs_msg_args im_args; ++ union spnfs_msg_res im_res; ++}; ++ ++/* spnfs configuration info */ ++struct spnfs_config { ++ unsigned char dense_striping; ++ int stripe_size; ++ int num_ds; ++ char ds_dir[SPNFS_MAX_DATA_SERVERS][80]; /* XXX */ ++}; ++ ++#if defined(__KERNEL__) && defined(CONFIG_SPNFS) ++ ++#include ++ ++/* pipe mgmt structure. 
messages flow through here */ ++struct spnfs { ++ struct dentry *spnfs_dentry; /* dentry for pipe */ ++ wait_queue_head_t spnfs_wq; ++ struct spnfs_msg spnfs_im; /* spnfs message */ ++ struct mutex spnfs_lock; /* Serializes upcalls */ ++ struct mutex spnfs_plock; ++}; ++ ++struct nfsd4_open; ++ ++int spnfs_layout_type(struct super_block *); ++enum nfsstat4 spnfs_layoutget(struct inode *, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *, ++ struct nfsd4_pnfs_layoutget_res *); ++int spnfs_layoutcommit(void); ++int spnfs_layoutreturn(struct inode *, ++ const struct nfsd4_pnfs_layoutreturn_arg *); ++int spnfs_getdeviceiter(struct super_block *, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *); ++int spnfs_getdeviceinfo(struct super_block *, struct exp_xdr_stream *, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *); ++int spnfs_setattr(void); ++int spnfs_open(struct inode *, struct nfsd4_open *); ++int spnfs_get_state(struct inode *, struct knfsd_fh *, struct pnfs_get_state *); ++int spnfs_remove(unsigned long, unsigned long); ++__be32 spnfs_read(struct inode *, loff_t, unsigned long *, ++ int, struct svc_rqst *); ++__be32 spnfs_write(struct inode *, loff_t, size_t, int, struct svc_rqst *); ++int spnfs_getfh(int, struct nfs_fh *); ++int spnfs_test_layoutrecall(char *, u64, u64); ++int spnfs_layoutrecall(struct inode *, int, u64, u64); ++ ++int nfsd_spnfs_new(void); ++void nfsd_spnfs_delete(void); ++int spnfs_upcall(struct spnfs *, struct spnfs_msg *, union spnfs_msg_res *); ++int spnfs_enabled(void); ++int spnfs_init_proc(void); ++ ++extern struct spnfs_config *spnfs_config; ++ ++#endif /* __KERNEL__ && CONFIG_SPNFS */ ++ ++#endif /* NFS_SPNFS_H */ +diff -up linux-2.6.37.noarch/include/linux/nfsd/const.h.orig linux-2.6.37.noarch/include/linux/nfsd/const.h +--- linux-2.6.37.noarch/include/linux/nfsd/const.h.orig 2011-01-04 19:50:19.000000000 -0500 ++++ linux-2.6.37.noarch/include/linux/nfsd/const.h 2011-01-28 09:43:53.387765940 
-0500 +@@ -29,6 +29,7 @@ + #ifdef __KERNEL__ + + #include ++#include + + /* + * Largest number of bytes we need to allocate for an NFS +diff -up linux-2.6.37.noarch/include/linux/nfsd/debug.h.orig linux-2.6.37.noarch/include/linux/nfsd/debug.h +--- linux-2.6.37.noarch/include/linux/nfsd/debug.h.orig 2011-01-04 19:50:19.000000000 -0500 ++++ linux-2.6.37.noarch/include/linux/nfsd/debug.h 2011-01-28 09:43:53.388765835 -0500 +@@ -32,6 +32,8 @@ + #define NFSDDBG_REPCACHE 0x0080 + #define NFSDDBG_XDR 0x0100 + #define NFSDDBG_LOCKD 0x0200 ++#define NFSDDBG_PNFS 0x0400 ++#define NFSDDBG_FILELAYOUT 0x0800 + #define NFSDDBG_ALL 0x7FFF + #define NFSDDBG_NOCHANGE 0xFFFF + +diff -up linux-2.6.37.noarch/include/linux/nfsd/export.h.orig linux-2.6.37.noarch/include/linux/nfsd/export.h +--- linux-2.6.37.noarch/include/linux/nfsd/export.h.orig 2011-01-28 09:37:32.865968740 -0500 ++++ linux-2.6.37.noarch/include/linux/nfsd/export.h 2011-01-28 09:43:53.388765835 -0500 +@@ -79,6 +79,20 @@ struct nfsd4_fs_locations { + }; + + /* ++ * Callbacks ++ */ ++struct nfsd4_callback { ++ void *cb_op; ++ struct nfs4_client *cb_clp; ++ struct list_head cb_per_client; ++ u32 cb_minorversion; ++ struct rpc_message cb_msg; ++ const struct rpc_call_ops *cb_ops; ++ struct work_struct cb_work; ++ bool cb_done; ++}; ++ ++/* + * We keep an array of pseudoflavors with the export, in order from most + * to least preferred. 
For the forseeable future, we don't expect more + * than the eight pseudoflavors null, unix, krb5, krb5i, krb5p, skpm3, +@@ -100,6 +114,7 @@ struct svc_export { + uid_t ex_anon_uid; + gid_t ex_anon_gid; + int ex_fsid; ++ int ex_pnfs; + unsigned char * ex_uuid; /* 16 byte fsid */ + struct nfsd4_fs_locations ex_fslocs; + int ex_nflavors; +diff -up linux-2.6.37.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig linux-2.6.37.noarch/include/linux/nfsd/nfs4layoutxdr.h +--- linux-2.6.37.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig 2011-01-28 09:43:53.389765732 -0500 ++++ linux-2.6.37.noarch/include/linux/nfsd/nfs4layoutxdr.h 2011-01-28 09:43:53.389765732 -0500 +@@ -0,0 +1,132 @@ ++/* ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ */ ++ ++#ifndef NFSD_NFS4LAYOUTXDR_H ++#define NFSD_NFS4LAYOUTXDR_H ++ ++#include ++#include ++ ++/* the nfsd4_pnfs_devlist dev_addr for the file layout type */ ++struct pnfs_filelayout_devaddr { ++ struct xdr_netobj r_netid; ++ struct xdr_netobj r_addr; ++}; ++ ++/* list of multipath servers */ ++struct pnfs_filelayout_multipath { ++ u32 fl_multipath_length; ++ struct pnfs_filelayout_devaddr *fl_multipath_list; ++}; ++ ++struct pnfs_filelayout_device { ++ u32 fl_stripeindices_length; ++ u32 *fl_stripeindices_list; ++ u32 fl_device_length; ++ struct pnfs_filelayout_multipath *fl_device_list; ++}; ++ ++struct pnfs_filelayout_layout { ++ u32 lg_layout_type; /* response */ ++ u32 lg_stripe_type; /* response */ ++ u32 lg_commit_through_mds; /* response */ ++ u64 lg_stripe_unit; /* response */ ++ u64 lg_pattern_offset; /* response */ ++ u32 lg_first_stripe_index; /* response */ ++ struct nfsd4_pnfs_deviceid device_id; /* response */ ++ u32 lg_fh_length; /* response */ ++ struct knfsd_fh *lg_fh_list; /* response */ ++}; ++ ++enum stripetype4 { ++ STRIPE_SPARSE = 1, ++ STRIPE_DENSE = 2 ++}; ++ ++enum pnfs_block_extent_state4 { ++ PNFS_BLOCK_READWRITE_DATA = 0, ++ PNFS_BLOCK_READ_DATA = 1, ++ PNFS_BLOCK_INVALID_DATA = 2, ++ PNFS_BLOCK_NONE_DATA = 3 ++}; ++ ++enum pnfs_block_volume_type4 { ++ PNFS_BLOCK_VOLUME_SIMPLE = 0, ++ PNFS_BLOCK_VOLUME_SLICE = 1, ++ PNFS_BLOCK_VOLUME_CONCAT = 2, ++ PNFS_BLOCK_VOLUME_STRIPE = 3, ++}; ++typedef 
enum pnfs_block_volume_type4 pnfs_block_volume_type4; ++ ++enum bl_cache_state { ++ BLOCK_LAYOUT_NEW = 0, ++ BLOCK_LAYOUT_CACHE = 1, ++ BLOCK_LAYOUT_UPDATE = 2, ++}; ++ ++typedef struct pnfs_blocklayout_layout { ++ struct list_head bll_list; ++ struct nfsd4_pnfs_deviceid bll_vol_id; ++ u64 bll_foff; // file offset ++ u64 bll_len; ++ u64 bll_soff; // storage offset ++ int bll_recalled; ++ enum pnfs_block_extent_state4 bll_es; ++ enum bl_cache_state bll_cache_state; ++} pnfs_blocklayout_layout_t; ++ ++typedef struct pnfs_blocklayout_devinfo { ++ struct list_head bld_list; ++ pnfs_block_volume_type4 bld_type; ++ struct nfsd4_pnfs_deviceid bld_devid; ++ int bld_index_loc; ++ union { ++ struct { ++ u64 bld_offset; ++ u32 bld_sig_len, ++ *bld_sig; ++ } simple; ++ struct { ++ u64 bld_start, ++ bld_len; ++ u32 bld_index; /* Index of Simple Volume */ ++ } slice; ++ struct { ++ u32 bld_stripes; ++ u64 bld_chunk_size; ++ u32 *bld_stripe_indexs; ++ } stripe; ++ } u; ++} pnfs_blocklayout_devinfo_t; ++ ++#endif /* NFSD_NFS4LAYOUTXDR_H */ +diff -up linux-2.6.37.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig linux-2.6.37.noarch/include/linux/nfsd/nfs4pnfsdlm.h +--- linux-2.6.37.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig 2011-01-28 09:43:53.389765732 -0500 ++++ linux-2.6.37.noarch/include/linux/nfsd/nfs4pnfsdlm.h 2011-01-28 09:43:53.389765732 -0500 +@@ -0,0 +1,54 @@ ++/****************************************************************************** ++ * ++ * (c) 2007 Network Appliance, Inc. All Rights Reserved. ++ * (c) 2009 NetApp. All Rights Reserved. ++ * ++ * NetApp provides this source code under the GPL v2 License. ++ * The GPL v2 license is available at ++ * http://opensource.org/licenses/gpl-license.php. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ ******************************************************************************/ ++#include ++ ++/* ++ * Length of comma separated pnfs data server IPv4 addresses. Enough room for ++ * 32 addresses. ++ */ ++#define NFSD_DLM_DS_LIST_MAX 512 ++/* ++ * Length of colon separated pnfs dlm device of the form ++ * disk_name:comma separated data server IPv4 address ++ */ ++#define NFSD_PNFS_DLM_DEVICE_MAX (NFSD_DLM_DS_LIST_MAX + DISK_NAME_LEN + 1) ++ ++#ifdef CONFIG_PNFSD ++ ++/* For use by DLM cluster file systems exported by pNFSD */ ++extern const struct pnfs_export_operations pnfs_dlm_export_ops; ++ ++int nfsd4_set_pnfs_dlm_device(char *pnfs_dlm_device, int len); ++ ++void nfsd4_pnfs_dlm_shutdown(void); ++ ++ssize_t nfsd4_get_pnfs_dlm_device_list(char *buf, ssize_t buflen); ++ ++#else /* CONFIG_PNFSD */ ++ ++static inline void nfsd4_pnfs_dlm_shutdown(void) ++{ ++ return; ++} ++ ++#endif /* CONFIG_PNFSD */ +diff -up linux-2.6.37.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig linux-2.6.37.noarch/include/linux/nfsd/nfsd4_pnfs.h +--- linux-2.6.37.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig 2011-01-28 09:43:53.390765631 -0500 ++++ linux-2.6.37.noarch/include/linux/nfsd/nfsd4_pnfs.h 2011-01-28 09:43:53.390765631 -0500 +@@ -0,0 +1,273 @@ ++/* ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. 
++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ * ++ */ ++ ++#ifndef _LINUX_NFSD_NFSD4_PNFS_H ++#define _LINUX_NFSD_NFSD4_PNFS_H ++ ++#include ++#include ++#include ++#include ++ ++struct nfsd4_pnfs_deviceid { ++ u64 sbid; /* per-superblock unique ID */ ++ u64 devid; /* filesystem-wide unique device ID */ ++}; ++ ++struct nfsd4_pnfs_dev_iter_res { ++ u64 gd_cookie; /* request/response */ ++ u64 gd_verf; /* request/response */ ++ u64 gd_devid; /* response */ ++ u32 gd_eof; /* response */ ++}; ++ ++/* Arguments for set_device_notify */ ++struct pnfs_devnotify_arg { ++ struct nfsd4_pnfs_deviceid dn_devid; /* request */ ++ u32 dn_layout_type; /* request */ ++ u32 dn_notify_types; /* request/response */ ++}; ++ ++struct nfsd4_layout_seg { ++ u64 clientid; ++ u32 layout_type; ++ u32 iomode; ++ u64 offset; ++ u64 length; ++}; ++ ++/* Used by layout_get to encode layout (loc_body var in spec) ++ * Args: ++ * minlength - min number of accessible bytes given by layout ++ * fsid - Major part of struct pnfs_deviceid. File system uses this ++ * to build the deviceid returned in the layout. 
++ * fh - fs can modify the file handle for use on data servers ++ * seg - layout info requested and layout info returned ++ * xdr - xdr info ++ * return_on_close - true if layout to be returned on file close ++ */ ++ ++struct nfsd4_pnfs_layoutget_arg { ++ u64 lg_minlength; ++ u64 lg_sbid; ++ const struct knfsd_fh *lg_fh; ++}; ++ ++struct nfsd4_pnfs_layoutget_res { ++ struct nfsd4_layout_seg lg_seg; /* request/response */ ++ u32 lg_return_on_close; ++}; ++ ++struct nfsd4_pnfs_layoutcommit_arg { ++ struct nfsd4_layout_seg lc_seg; /* request */ ++ u32 lc_reclaim; /* request */ ++ u32 lc_newoffset; /* request */ ++ u64 lc_last_wr; /* request */ ++ struct nfstime4 lc_mtime; /* request */ ++ u32 lc_up_len; /* layout length */ ++ void *lc_up_layout; /* decoded by callback */ ++}; ++ ++struct nfsd4_pnfs_layoutcommit_res { ++ u32 lc_size_chg; /* boolean for response */ ++ u64 lc_newsize; /* response */ ++}; ++ ++#define PNFS_LAST_LAYOUT_NO_RECALLS ((void *)-1) /* used with lr_cookie below */ ++ ++struct nfsd4_pnfs_layoutreturn_arg { ++ u32 lr_return_type; /* request */ ++ struct nfsd4_layout_seg lr_seg; /* request */ ++ u32 lr_reclaim; /* request */ ++ u32 lrf_body_len; /* request */ ++ void *lrf_body; /* request */ ++ void *lr_cookie; /* fs private */ ++}; ++ ++/* pNFS Metadata to Data server state communication */ ++struct pnfs_get_state { ++ u32 dsid; /* request */ ++ u64 ino; /* request */ ++ nfs4_stateid stid; /* request;response */ ++ nfs4_clientid clid; /* response */ ++ u32 access; /* response */ ++ u32 stid_gen; /* response */ ++ u32 verifier[2]; /* response */ ++}; ++ ++/* ++ * pNFS export operations vector. ++ * ++ * The filesystem must implement the following methods: ++ * layout_type ++ * get_device_info ++ * layout_get ++ * ++ * All other methods are optional and can be set to NULL if not implemented. ++ */ ++struct pnfs_export_operations { ++ /* Returns the supported pnfs_layouttype4. 
*/ ++ int (*layout_type) (struct super_block *); ++ ++ /* Encode device info onto the xdr stream. */ ++ int (*get_device_info) (struct super_block *, ++ struct exp_xdr_stream *, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *); ++ ++ /* Retrieve all available devices via an iterator. ++ * arg->cookie == 0 indicates the beginning of the list, ++ * otherwise arg->verf is used to verify that the list hasn't changed ++ * while retrieved. ++ * ++ * On output, the filesystem sets the devid based on the current cookie ++ * and sets res->cookie and res->verf corresponding to the next entry. ++ * When the last entry in the list is retrieved, res->eof is set to 1. ++ */ ++ int (*get_device_iter) (struct super_block *, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *); ++ ++ int (*set_device_notify) (struct super_block *, ++ struct pnfs_devnotify_arg *); ++ ++ /* Retrieve and encode a layout for inode onto the xdr stream. ++ * arg->minlength is the minimum number of accessible bytes required ++ * by the client. ++ * The maximum number of bytes to encode the layout is given by ++ * the xdr stream end pointer. ++ * arg->fsid contains the major part of struct pnfs_deviceid. ++ * The file system uses this to build the deviceid returned ++ * in the layout. ++ * res->seg - layout segment requested and layout info returned. 
++ * res->fh can be modified the file handle for use on data servers ++ * res->return_on_close - true if layout to be returned on file close ++ * ++ * return one of the following nfs errors: ++ * NFS_OK Success ++ * NFS4ERR_ACCESS Permission error ++ * NFS4ERR_BADIOMODE Server does not support requested iomode ++ * NFS4ERR_BADLAYOUT No layout matching loga_minlength rules ++ * NFS4ERR_INVAL Parameter other than layout is invalid ++ * NFS4ERR_IO I/O error ++ * NFS4ERR_LAYOUTTRYLATER Layout may be retrieved later ++ * NFS4ERR_LAYOUTUNAVAILABLE Layout unavailable for this file ++ * NFS4ERR_LOCKED Lock conflict ++ * NFS4ERR_NOSPC Out-of-space error occurred ++ * NFS4ERR_RECALLCONFLICT Layout currently unavailable due to ++ * a conflicting CB_LAYOUTRECALL ++ * NFS4ERR_SERVERFAULT Server went berserk ++ * NFS4ERR_TOOSMALL loga_maxcount too small to fit layout ++ * NFS4ERR_WRONG_TYPE Wrong file type (not a regular file) ++ */ ++ enum nfsstat4 (*layout_get) (struct inode *, ++ struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *, ++ struct nfsd4_pnfs_layoutget_res *); ++ ++ /* Commit changes to layout */ ++ int (*layout_commit) (struct inode *, ++ const struct nfsd4_pnfs_layoutcommit_arg *, ++ struct nfsd4_pnfs_layoutcommit_res *); ++ ++ /* Returns the layout */ ++ int (*layout_return) (struct inode *, ++ const struct nfsd4_pnfs_layoutreturn_arg *); ++ ++ /* Can layout segments be merged for this layout type? 
*/ ++ int (*can_merge_layouts) (u32 layout_type); ++ ++ /* pNFS Files layout specific operations */ ++ ++ /* Get the write verifier for DS (called on MDS only) */ ++ void (*get_verifier) (struct super_block *, u32 *p); ++ /* Call fs on DS only */ ++ int (*get_state) (struct inode *, struct knfsd_fh *, ++ struct pnfs_get_state *); ++}; ++ ++struct nfsd4_pnfs_cb_layout { ++ u32 cbl_recall_type; /* request */ ++ struct nfsd4_layout_seg cbl_seg; /* request */ ++ u32 cbl_layoutchanged; /* request */ ++ nfs4_stateid cbl_sid; /* request */ ++ struct nfs4_fsid cbl_fsid; ++ void *cbl_cookie; /* fs private */ ++}; ++ ++/* layoutrecall request (from exported filesystem) */ ++struct nfs4_layoutrecall { ++ struct kref clr_ref; ++ struct nfsd4_pnfs_cb_layout cb; /* request */ ++ struct list_head clr_perclnt; /* on cl_layoutrecalls */ ++ struct nfs4_client *clr_client; ++ struct nfs4_file *clr_file; ++ struct timespec clr_time; /* last activity */ ++ struct super_block *clr_sb; /* We might not have a file */ ++ struct nfs4_layoutrecall *parent; /* The initiating recall */ ++ ++ /* nfsd internal */ ++ struct nfsd4_callback clr_recall; ++}; ++ ++struct nfsd4_pnfs_cb_dev_item { ++ u32 cbd_notify_type; /* request */ ++ u32 cbd_layout_type; /* request */ ++ struct nfsd4_pnfs_deviceid cbd_devid; /* request */ ++ u32 cbd_immediate; /* request */ ++}; ++ ++struct nfsd4_pnfs_cb_dev_list { ++ u32 cbd_len; /* request */ ++ struct nfsd4_pnfs_cb_dev_item *cbd_list; /* request */ ++}; ++ ++/* ++ * callbacks provided by the nfsd ++ */ ++struct pnfsd_cb_operations { ++ /* Generic callbacks */ ++ int (*cb_layout_recall) (struct super_block *, struct inode *, ++ struct nfsd4_pnfs_cb_layout *); ++ int (*cb_device_notify) (struct super_block *, ++ struct nfsd4_pnfs_cb_dev_list *); ++ ++ /* pNFS Files layout specific callbacks */ ++ ++ /* Callback from fs on MDS only */ ++ int (*cb_get_state) (struct super_block *, struct pnfs_get_state *); ++ /* Callback from fs on DS only */ ++ int 
(*cb_change_state) (struct pnfs_get_state *); ++}; ++ ++#endif /* _LINUX_NFSD_NFSD4_PNFS_H */ +diff -up linux-2.6.37.noarch/include/linux/nfsd/syscall.h.orig linux-2.6.37.noarch/include/linux/nfsd/syscall.h +--- linux-2.6.37.noarch/include/linux/nfsd/syscall.h.orig 2011-01-04 19:50:19.000000000 -0500 ++++ linux-2.6.37.noarch/include/linux/nfsd/syscall.h 2011-01-28 09:43:53.391765532 -0500 +@@ -29,6 +29,7 @@ + /*#define NFSCTL_GETFH 6 / * get an fh by ino DISCARDED */ + #define NFSCTL_GETFD 7 /* get an fh by path (used by mountd) */ + #define NFSCTL_GETFS 8 /* get an fh by path with max FH len */ ++#define NFSCTL_FD2FH 9 /* get a fh from a fd */ + + /* SVC */ + struct nfsctl_svc { +@@ -71,6 +72,11 @@ struct nfsctl_fsparm { + int gd_maxlen; + }; + ++/* FD2FH */ ++struct nfsctl_fd2fh { ++ int fd; ++}; ++ + /* + * This is the argument union. + */ +@@ -82,6 +88,7 @@ struct nfsctl_arg { + struct nfsctl_export u_export; + struct nfsctl_fdparm u_getfd; + struct nfsctl_fsparm u_getfs; ++ struct nfsctl_fd2fh u_fd2fh; + /* + * The following dummy member is needed to preserve binary compatibility + * on platforms where alignof(void*)>alignof(int). 
It's needed because +@@ -95,6 +102,7 @@ struct nfsctl_arg { + #define ca_export u.u_export + #define ca_getfd u.u_getfd + #define ca_getfs u.u_getfs ++#define ca_fd2fh u.u_fd2fh + }; + + union nfsctl_res { +diff -up linux-2.6.37.noarch/include/linux/nfs_fs.h.orig linux-2.6.37.noarch/include/linux/nfs_fs.h +--- linux-2.6.37.noarch/include/linux/nfs_fs.h.orig 2011-01-28 09:37:32.863968809 -0500 ++++ linux-2.6.37.noarch/include/linux/nfs_fs.h 2011-01-28 09:43:53.381766626 -0500 +@@ -190,6 +190,8 @@ struct nfs_inode { + struct rw_semaphore rwsem; + + /* pNFS layout information */ ++ struct rpc_wait_queue lo_rpcwaitq; ++ struct rpc_wait_queue lo_rpcwaitq_stateid; + struct pnfs_layout_hdr *layout; + #endif /* CONFIG_NFS_V4*/ + #ifdef CONFIG_NFS_FSCACHE +@@ -499,8 +501,12 @@ extern int nfs_sillyrename(struct inode + extern int nfs_congestion_kb; + extern int nfs_writepage(struct page *page, struct writeback_control *wbc); + extern int nfs_writepages(struct address_space *, struct writeback_control *); +-extern int nfs_flush_incompatible(struct file *file, struct page *page); +-extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int); ++struct pnfs_layout_segment; ++extern int nfs_flush_incompatible(struct file *file, struct page *page, ++ struct pnfs_layout_segment *lseg); ++extern int nfs_updatepage(struct file *, struct page *, ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg, void *fsdata); + extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *); + + /* +diff -up linux-2.6.37.noarch/include/linux/nfs_fs_sb.h.orig linux-2.6.37.noarch/include/linux/nfs_fs_sb.h +--- linux-2.6.37.noarch/include/linux/nfs_fs_sb.h.orig 2011-01-28 09:37:32.863968809 -0500 ++++ linux-2.6.37.noarch/include/linux/nfs_fs_sb.h 2011-01-28 09:43:53.382766505 -0500 +@@ -79,6 +79,12 @@ struct nfs_client { + u32 cl_exchange_flags; + struct nfs4_session *cl_session; /* sharred session */ + struct list_head cl_layouts; ++ 
atomic_t cl_recall_count; /* no. of lsegs in recall */ ++ struct list_head cl_layoutrecalls; ++ unsigned long cl_cb_lrecall_count; ++#define PNFS_MAX_CB_LRECALLS (64) ++ atomic_t *cl_drain_notification[PNFS_MAX_CB_LRECALLS]; ++ struct rpc_wait_queue cl_rpcwaitq_recall; + struct pnfs_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */ + #endif /* CONFIG_NFS_V4_1 */ + +@@ -87,6 +93,16 @@ struct nfs_client { + #endif + }; + ++static inline bool ++is_ds_only_client(struct nfs_client *clp) ++{ ++#ifdef CONFIG_NFS_V4_1 ++ return is_ds_only_session(clp->cl_exchange_flags); ++#else ++ return false; ++#endif ++} ++ + /* + * NFS client parameters stored in the superblock. + */ +@@ -132,7 +148,7 @@ struct nfs_server { + #endif + + #ifdef CONFIG_NFS_V4 +- u32 attr_bitmask[2];/* V4 bitmask representing the set ++ u32 attr_bitmask[3];/* V4 bitmask representing the set + of attributes supported on this + filesystem */ + u32 cache_consistency_bitmask[2]; +@@ -144,7 +160,10 @@ struct nfs_server { + that are supported on this + filesystem */ + struct pnfs_layoutdriver_type *pnfs_curr_ld; /* Active layout driver */ +- struct rpc_wait_queue roc_rpcwaitq; ++ void *pnfs_ld_data; /* Per-mount data */ ++ unsigned int ds_rsize; /* Data server read size */ ++ unsigned int ds_wsize; /* Data server write size */ ++ u32 pnfs_blksize; /* layout_blksize attr */ + + /* the following fields are protected by nfs_client->cl_lock */ + struct rb_root state_owners; +diff -up linux-2.6.37.noarch/include/linux/nfs_iostat.h.orig linux-2.6.37.noarch/include/linux/nfs_iostat.h +--- linux-2.6.37.noarch/include/linux/nfs_iostat.h.orig 2011-01-04 19:50:19.000000000 -0500 ++++ linux-2.6.37.noarch/include/linux/nfs_iostat.h 2011-01-28 09:43:53.384766270 -0500 +@@ -113,6 +113,9 @@ enum nfs_stat_eventcounters { + NFSIOS_SHORTREAD, + NFSIOS_SHORTWRITE, + NFSIOS_DELAY, ++ NFSIOS_PNFS_READ, ++ NFSIOS_PNFS_WRITE, ++ NFSIOS_PNFS_COMMIT, + __NFSIOS_COUNTSMAX, + }; + +diff -up 
linux-2.6.37.noarch/include/linux/nfs_page.h.orig linux-2.6.37.noarch/include/linux/nfs_page.h +--- linux-2.6.37.noarch/include/linux/nfs_page.h.orig 2011-01-04 19:50:19.000000000 -0500 ++++ linux-2.6.37.noarch/include/linux/nfs_page.h 2011-01-28 09:43:53.385766158 -0500 +@@ -49,6 +49,7 @@ struct nfs_page { + struct kref wb_kref; /* reference count */ + unsigned long wb_flags; + struct nfs_writeverf wb_verf; /* Commit cookie */ ++ struct pnfs_layout_segment *wb_lseg; /* Pnfs layout info */ + }; + + struct nfs_pageio_descriptor { +@@ -62,6 +63,11 @@ struct nfs_pageio_descriptor { + int (*pg_doio)(struct inode *, struct list_head *, unsigned int, size_t, int); + int pg_ioflags; + int pg_error; ++ struct pnfs_layout_segment *pg_lseg; ++#ifdef CONFIG_NFS_V4_1 ++ int pg_iswrite; ++ int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); ++#endif /* CONFIG_NFS_V4_1 */ + }; + + #define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags)) +@@ -70,13 +76,15 @@ extern struct nfs_page *nfs_create_reque + struct inode *inode, + struct page *page, + unsigned int offset, +- unsigned int count); ++ unsigned int count, ++ struct pnfs_layout_segment *lseg); + extern void nfs_clear_request(struct nfs_page *req); + extern void nfs_release_request(struct nfs_page *req); + + + extern int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *dst, +- pgoff_t idx_start, unsigned int npages, int tag); ++ pgoff_t idx_start, unsigned int npages, int tag, ++ int *use_pnfs); + extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc, + struct inode *inode, + int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int), +diff -up linux-2.6.37.noarch/include/linux/nfs_xdr.h.orig linux-2.6.37.noarch/include/linux/nfs_xdr.h +--- linux-2.6.37.noarch/include/linux/nfs_xdr.h.orig 2011-01-28 09:37:32.864968775 -0500 ++++ linux-2.6.37.noarch/include/linux/nfs_xdr.h 2011-01-28 09:43:53.387765940 -0500 +@@ -3,6 +3,8 @@ + + #include + #include 
++#include ++#include + + /* + * To change the maximum rsize and wsize supported by the NFS client, adjust +@@ -10,7 +12,7 @@ + * support a megabyte or more. The default is left at 4096 bytes, which is + * reasonable for NFS over UDP. + */ +-#define NFS_MAX_FILE_IO_SIZE (1048576U) ++#define NFS_MAX_FILE_IO_SIZE (4U * 1048576U) + #define NFS_DEF_FILE_IO_SIZE (4096U) + #define NFS_MIN_FILE_IO_SIZE (1024U) + +@@ -115,6 +117,7 @@ struct nfs_fsinfo { + struct timespec time_delta; /* server time granularity */ + __u32 lease_time; /* in seconds */ + __u32 layouttype; /* supported pnfs layout driver */ ++ __u32 blksize; /* preferred pnfs io block size */ + }; + + struct nfs_fsstat { +@@ -226,6 +229,73 @@ struct nfs4_layoutget { + struct pnfs_layout_segment **lsegpp; + }; + ++struct nfs4_layoutcommit_args { ++ nfs4_stateid stateid; ++ __u64 lastbytewritten; ++ __u32 time_modify_changed; ++ struct timespec time_modify; ++ const u32 *bitmask; ++ struct nfs_fh *fh; ++ struct inode *inode; ++ ++ /* Values set by layout driver */ ++ struct pnfs_layout_range range; ++ __u32 layout_type; ++ void *layoutdriver_data; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_layoutcommit_res { ++ __u32 sizechanged; ++ __u64 newsize; ++ struct nfs_fattr *fattr; ++ const struct nfs_server *server; ++ struct nfs4_sequence_res seq_res; ++ int status; ++}; ++ ++struct nfs4_layoutcommit_data { ++ struct rpc_task task; ++ struct rpc_cred *cred; ++ struct nfs_fattr fattr; ++ struct nfs4_layoutcommit_args args; ++ struct nfs4_layoutcommit_res res; ++}; ++ ++struct nfs4_layoutreturn_args { ++ __u32 reclaim; ++ __u32 layout_type; ++ __u32 return_type; ++ struct pnfs_layout_range range; ++ struct inode *inode; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_layoutreturn_res { ++ struct nfs4_sequence_res seq_res; ++ u32 lrs_present; ++ nfs4_stateid stateid; ++}; ++ ++struct nfs4_layoutreturn { ++ struct nfs4_layoutreturn_args args; ++ struct nfs4_layoutreturn_res res; ++ struct 
rpc_cred *cred; ++ struct nfs_client *clp; ++ int rpc_status; ++}; ++ ++struct nfs4_getdevicelist_args { ++ const struct nfs_fh *fh; ++ u32 layoutclass; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_getdevicelist_res { ++ struct pnfs_devicelist *devlist; ++ struct nfs4_sequence_res seq_res; ++}; ++ + struct nfs4_getdeviceinfo_args { + struct pnfs_device *pdev; + struct nfs4_sequence_args seq_args; +@@ -889,7 +959,7 @@ struct nfs4_server_caps_arg { + }; + + struct nfs4_server_caps_res { +- u32 attr_bitmask[2]; ++ u32 attr_bitmask[3]; + u32 acl_bitmask; + u32 has_links; + u32 has_symlinks; +@@ -1004,6 +1074,30 @@ struct nfs_page; + + #define NFS_PAGEVEC_SIZE (8U) + ++#if defined(CONFIG_NFS_V4_1) ++ ++/* pnfsflag values */ ++enum pnfs_flags { ++ PNFS_NO_RPC = 1 << 0, /* non rpc result callback switch */ ++}; ++ ++/* pnfs-specific data needed for read, write, and commit calls */ ++struct pnfs_call_data { ++ struct pnfs_layout_segment *lseg; ++ const struct rpc_call_ops *call_ops; ++ u32 orig_count; /* for retry via MDS */ ++ int pnfs_error; ++ u8 pnfsflags; ++ u8 how; /* for FLUSH_STABLE */ ++}; ++ ++/* files layout-type specific data for read, write, and commit */ ++struct pnfs_fl_call_data { ++ struct nfs_client *ds_nfs_client; ++ __u64 orig_offset; ++}; ++#endif /* CONFIG_NFS_V4_1 */ ++ + struct nfs_read_data { + int flags; + struct rpc_task task; +@@ -1019,10 +1113,16 @@ struct nfs_read_data { + #ifdef CONFIG_NFS_V4 + unsigned long timestamp; /* For lease renewal */ + #endif ++#if defined(CONFIG_NFS_V4_1) ++ struct pnfs_call_data pdata; ++ struct pnfs_fl_call_data fldata; ++#endif /* CONFIG_NFS_V4_1 */ + struct page *page_array[NFS_PAGEVEC_SIZE]; + }; + + struct nfs_write_data { ++ struct kref refcount; /* For pnfs commit splitting */ ++ struct nfs_write_data *parent; /* For pnfs commit splitting */ + int flags; + struct rpc_task task; + struct inode *inode; +@@ -1038,6 +1138,10 @@ struct nfs_write_data { + #ifdef CONFIG_NFS_V4 + unsigned long 
timestamp; /* For lease renewal */ + #endif ++#if defined(CONFIG_NFS_V4_1) ++ struct pnfs_call_data pdata; ++ struct pnfs_fl_call_data fldata; ++#endif /* CONFIG_NFS_V4_1 */ + struct page *page_array[NFS_PAGEVEC_SIZE]; + }; + +diff -up linux-2.6.37.noarch/include/linux/panfs_shim_api.h.orig linux-2.6.37.noarch/include/linux/panfs_shim_api.h +--- linux-2.6.37.noarch/include/linux/panfs_shim_api.h.orig 2011-01-28 09:43:53.395765159 -0500 ++++ linux-2.6.37.noarch/include/linux/panfs_shim_api.h 2011-01-28 09:43:53.395765159 -0500 +@@ -0,0 +1,57 @@ ++#ifndef _PANFS_SHIM_API_H ++#define _PANFS_SHIM_API_H ++ ++/* ++ * imported panfs functions ++ */ ++struct panfs_export_operations { ++ int (*convert_rc)(pan_status_t rc); ++ ++ int (*sm_sec_t_get_size_otw)( ++ pan_sm_sec_otw_t *var, ++ pan_size_t *core_sizep, ++ pan_size_t *wire_size, ++ void *buf_end); ++ ++ int (*sm_sec_t_unmarshall)( ++ pan_sm_sec_otw_t *in, ++ pan_sm_sec_t *out, ++ void *buf, ++ pan_size_t size, ++ pan_size_t *otw_consumed, ++ pan_size_t *in_core_consumed); ++ ++ int (*ucreds_get)(void **ucreds_pp); ++ ++ void (*ucreds_put)(void *ucreds); ++ ++ int (*sam_read)( ++ pan_sam_access_flags_t flags, ++ pan_sam_read_args_t *args_p, ++ pan_sam_obj_sec_t *obj_sec_p, ++ pan_sg_entry_t *data_p, ++ void *ucreds, ++ pan_sam_read_cb_t closure, ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_read_res_t *res_p); ++ ++ int (*sam_write)( ++ pan_sam_access_flags_t flags, ++ pan_sam_write_args_t *args_p, ++ pan_sam_obj_sec_t *obj_sec_p, ++ pan_sg_entry_t *data_p, ++ void *ucreds, ++ pan_sam_write_cb_t closure, ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_write_res_t *res_p); ++}; ++ ++extern int ++panfs_shim_register(struct panfs_export_operations *ops); ++ ++extern int ++panfs_shim_unregister(void); ++ ++#endif /* _PANFS_SHIM_API_H */ +diff -up linux-2.6.37.noarch/include/linux/pnfs_osd_xdr.h.orig linux-2.6.37.noarch/include/linux/pnfs_osd_xdr.h +--- linux-2.6.37.noarch/include/linux/pnfs_osd_xdr.h.orig 
2011-01-28 09:43:53.397764982 -0500 ++++ linux-2.6.37.noarch/include/linux/pnfs_osd_xdr.h 2011-01-28 09:43:53.397764982 -0500 +@@ -0,0 +1,439 @@ ++/* ++ * pnfs_osd_xdr.h ++ * ++ * pNFS-osd on-the-wire data structures ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++#ifndef __PNFS_OSD_XDR_H__ ++#define __PNFS_OSD_XDR_H__ ++ ++#include ++#include ++#include ++#include ++ ++#define PNFS_OSD_OSDNAME_MAXSIZE 256 ++ ++/* ++ * START OF "GENERIC" DECODE ROUTINES. ++ * These may look a little ugly since they are imported from a "generic" ++ * set of XDR encode/decode routines which are intended to be shared by ++ * all of our NFSv4 implementations (OpenBSD, MacOS X...). ++ * ++ * If the pain of reading these is too great, it should be a straightforward ++ * task to translate them into Linux-specific versions which are more ++ * consistent with the style used in NFSv2/v3... 
++ */
++#define READ32(x) ((x) = ntohl(*p++))
++#define READ64(x) do { \
++	(x) = (u64)ntohl(*p++) << 32; \
++	(x) |= ntohl(*p++); \
++} while (0)
++#define COPYMEM(x, nbytes) do { \
++	memcpy((x), p, (nbytes)); \
++	p += XDR_QUADLEN(nbytes); \
++} while (0)
++
++/*
++ * draft-ietf-nfsv4-minorversion-22
++ * draft-ietf-nfsv4-pnfs-obj-12
++ */
++
++/* Layout Structure */
++
++enum pnfs_osd_raid_algorithm4 {
++	PNFS_OSD_RAID_0 = 1,
++	PNFS_OSD_RAID_4 = 2,
++	PNFS_OSD_RAID_5 = 3,
++	PNFS_OSD_RAID_PQ = 4 /* Reed-Solomon P+Q */
++};
++
++/* struct pnfs_osd_data_map4 {
++ *     uint32_t odm_num_comps;
++ *     length4 odm_stripe_unit;
++ *     uint32_t odm_group_width;
++ *     uint32_t odm_group_depth;
++ *     uint32_t odm_mirror_cnt;
++ *     pnfs_osd_raid_algorithm4 odm_raid_algorithm;
++ * };
++ */
++struct pnfs_osd_data_map {
++	u32 odm_num_comps;
++	u64 odm_stripe_unit;
++	u32 odm_group_width;
++	u32 odm_group_depth;
++	u32 odm_mirror_cnt;
++	u32 odm_raid_algorithm;
++};
++
++static inline int
++pnfs_osd_data_map_xdr_sz(void)
++{
++	return 1 + 2 + 1 + 1 + 1 + 1;
++}
++
++static inline size_t
++pnfs_osd_data_map_incore_sz(void)
++{
++	return sizeof(struct pnfs_osd_data_map);
++}
++
++/* struct pnfs_osd_objid4 {
++ *     deviceid4 oid_device_id;
++ *     uint64_t oid_partition_id;
++ *     uint64_t oid_object_id;
++ * };
++ */
++struct pnfs_osd_objid {
++	struct nfs4_deviceid oid_device_id;
++	u64 oid_partition_id;
++	u64 oid_object_id;
++};
++
++/* For printout. I use "dev(%llx:%llx)", _DEVID_LO(), _DEVID_HI BE style */
++#define _DEVID_LO(oid_device_id) \
++	((unsigned long long)be64_to_cpup((__be64 *)(oid_device_id).data))
++
++#define _DEVID_HI(oid_device_id) \
++	((unsigned long long)be64_to_cpup(((__be64 *)(oid_device_id).data) + 1))
++
++static inline int
++pnfs_osd_objid_xdr_sz(void)
++{
++	return (NFS4_DEVICEID4_SIZE / 4) + 2 + 2;
++}
++
++static inline size_t
++pnfs_osd_objid_incore_sz(void)
++{
++	return sizeof(struct pnfs_osd_objid);
++}
++
++enum pnfs_osd_version {
++	PNFS_OSD_MISSING = 0,
++	PNFS_OSD_VERSION_1 = 1,
++	PNFS_OSD_VERSION_2 = 2
++};
++
++struct pnfs_osd_opaque_cred {
++	u32 cred_len;
++	u8 *cred;
++};
++
++static inline int
++pnfs_osd_opaque_cred_xdr_sz(u32 *p)
++{
++	u32 *start = p;
++	u32 n;
++
++	READ32(n);
++	p += XDR_QUADLEN(n);
++	return p - start;
++}
++
++static inline size_t
++pnfs_osd_opaque_cred_incore_sz(u32 *p)
++{
++	u32 n;
++
++	READ32(n);
++	return XDR_QUADLEN(n) * 4;
++}
++
++enum pnfs_osd_cap_key_sec {
++	PNFS_OSD_CAP_KEY_SEC_NONE = 0,
++	PNFS_OSD_CAP_KEY_SEC_SSV = 1,
++};
++
++/* struct pnfs_osd_object_cred4 {
++ *     pnfs_osd_objid4 oc_object_id;
++ *     pnfs_osd_version4 oc_osd_version;
++ *     pnfs_osd_cap_key_sec4 oc_cap_key_sec;
++ *     opaque oc_capability_key<>;
++ *     opaque oc_capability<>;
++ * };
++ */
++struct pnfs_osd_object_cred {
++	struct pnfs_osd_objid oc_object_id;
++	u32 oc_osd_version;
++	u32 oc_cap_key_sec;
++	struct pnfs_osd_opaque_cred oc_cap_key;
++	struct pnfs_osd_opaque_cred oc_cap;
++};
++
++static inline int
++pnfs_osd_object_cred_xdr_sz(u32 *p)
++{
++	u32 *start = p;
++
++	p += pnfs_osd_objid_xdr_sz() + 2;
++	p += pnfs_osd_opaque_cred_xdr_sz(p);
++	p += pnfs_osd_opaque_cred_xdr_sz(p);
++	return p - start;
++}
++
++static inline size_t
++pnfs_osd_object_cred_incore_sz(u32 *p)
++{
++	size_t sz = sizeof(struct pnfs_osd_object_cred);
++
++	p += pnfs_osd_objid_xdr_sz() + 2;
++	sz += pnfs_osd_opaque_cred_incore_sz(p);
++	p += pnfs_osd_opaque_cred_xdr_sz(p);
++	sz += pnfs_osd_opaque_cred_incore_sz(p);
++	return sz;
++}
++
++/* struct pnfs_osd_layout4 {
++ *     pnfs_osd_data_map4 olo_map;
++ *     uint32_t olo_comps_index;
++ *     pnfs_osd_object_cred4 olo_components<>;
++ * };
++ */
++struct pnfs_osd_layout {
++	struct pnfs_osd_data_map olo_map;
++	u32 olo_comps_index;
++	u32 olo_num_comps;
++	struct pnfs_osd_object_cred *olo_comps;
++};
++
++static inline int
++pnfs_osd_layout_xdr_sz(u32 *p)
++{
++	u32 *start = p;
++	u32 n;
++
++	p += pnfs_osd_data_map_xdr_sz() + 1;
++	READ32(n);
++	while ((int)(n--) > 0)
++		p += pnfs_osd_object_cred_xdr_sz(p);
++	return p - start;
++}
++
++static inline size_t
++pnfs_osd_layout_incore_sz(u32 *p)
++{
++	u32 n;
++	size_t sz;
++
++	p += pnfs_osd_data_map_xdr_sz() + 1;
++	READ32(n);
++	sz = sizeof(struct pnfs_osd_layout);
++	while ((int)(n--) > 0) {
++		sz += pnfs_osd_object_cred_incore_sz(p);
++		p += pnfs_osd_object_cred_xdr_sz(p);
++	}
++	return sz;
++}
++
++/* Device Address */
++
++enum pnfs_osd_targetid_type {
++	OBJ_TARGET_ANON = 1,
++	OBJ_TARGET_SCSI_NAME = 2,
++	OBJ_TARGET_SCSI_DEVICE_ID = 3,
++};
++
++/* union pnfs_osd_targetid4 switch (pnfs_osd_targetid_type4 oti_type) {
++ *     case OBJ_TARGET_SCSI_NAME:
++ *         string oti_scsi_name<>;
++ *
++ *     case OBJ_TARGET_SCSI_DEVICE_ID:
++ *         opaque oti_scsi_device_id<>;
++ *
++ *     default:
++ *         void;
++ * };
++ *
++ * union pnfs_osd_targetaddr4 switch (bool ota_available) {
++ *     case TRUE:
++ *         netaddr4 ota_netaddr;
++ *     case FALSE:
++ *         void;
++ * };
++ *
++ * struct pnfs_osd_deviceaddr4 {
++ *     pnfs_osd_targetid4 oda_targetid;
++ *     pnfs_osd_targetaddr4 oda_targetaddr;
++ *     uint64_t oda_lun;
++ *     opaque oda_systemid<>;
++ *     pnfs_osd_object_cred4 oda_root_obj_cred;
++ *     opaque oda_osdname<>;
++ * };
++ */
++struct pnfs_osd_targetid {
++	u32 oti_type;
++	struct nfs4_string oti_scsi_device_id;
++};
++
++enum { PNFS_OSD_TARGETID_MAX = 1 + PNFS_OSD_OSDNAME_MAXSIZE / 4 };
++
++/* struct netaddr4 {
++ *     // see struct rpcb in RFC1833
++ *     string r_netid<>; // network id
++ *     string r_addr<>; // universal address
++ * };
++ */
++struct pnfs_osd_net_addr {
++	struct nfs4_string r_netid;
++	struct nfs4_string r_addr;
++};
++
++struct pnfs_osd_targetaddr {
++	u32 ota_available;
++	struct pnfs_osd_net_addr ota_netaddr;
++};
++
++enum {
++	NETWORK_ID_MAX = 16 / 4,
++	UNIVERSAL_ADDRESS_MAX = 64 / 4,
++	PNFS_OSD_TARGETADDR_MAX = 3 + NETWORK_ID_MAX + UNIVERSAL_ADDRESS_MAX,
++};
++
++struct pnfs_osd_deviceaddr {
++	struct pnfs_osd_targetid oda_targetid;
++	struct pnfs_osd_targetaddr oda_targetaddr;
++	u8 oda_lun[8];
++	struct nfs4_string oda_systemid;
++	struct pnfs_osd_object_cred oda_root_obj_cred;
++	struct nfs4_string oda_osdname;
++};
++
++enum {
++	ODA_OSDNAME_MAX = PNFS_OSD_OSDNAME_MAXSIZE / 4,
++	PNFS_OSD_DEVICEADDR_MAX =
++		PNFS_OSD_TARGETID_MAX + PNFS_OSD_TARGETADDR_MAX +
++		2 /*oda_lun*/ +
++		1 + OSD_SYSTEMID_LEN +
++		1 + ODA_OSDNAME_MAX,
++};
++
++/* LAYOUTCOMMIT: layoutupdate */
++
++/* union pnfs_osd_deltaspaceused4 switch (bool dsu_valid) {
++ *     case TRUE:
++ *         int64_t dsu_delta;
++ *     case FALSE:
++ *         void;
++ * };
++ *
++ * struct pnfs_osd_layoutupdate4 {
++ *     pnfs_osd_deltaspaceused4 olu_delta_space_used;
++ *     bool olu_ioerr_flag;
++ * };
++ */
++struct pnfs_osd_layoutupdate {
++	u32 dsu_valid;
++	s64 dsu_delta;
++	u32 olu_ioerr_flag;
++};
++
++/* LAYOUTRETURN: I/O Error Report */
++
++enum pnfs_osd_errno {
++	PNFS_OSD_ERR_EIO = 1,
++	PNFS_OSD_ERR_NOT_FOUND = 2,
++	PNFS_OSD_ERR_NO_SPACE = 3,
++	PNFS_OSD_ERR_BAD_CRED = 4,
++	PNFS_OSD_ERR_NO_ACCESS = 5,
++	PNFS_OSD_ERR_UNREACHABLE = 6,
++	PNFS_OSD_ERR_RESOURCE = 7
++};
++
++/* struct pnfs_osd_ioerr4 {
++ *     pnfs_osd_objid4 oer_component;
++ *     length4 oer_comp_offset;
++ *     length4 oer_comp_length;
++ *     bool oer_iswrite;
++ *     pnfs_osd_errno4 oer_errno;
++ * };
++ */
++struct pnfs_osd_ioerr {
++	struct pnfs_osd_objid oer_component;
++	u64 oer_comp_offset;
++	u64 oer_comp_length;
++	u32 oer_iswrite;
++	u32 oer_errno;
++};
++
++static inline unsigned
++pnfs_osd_ioerr_xdr_sz(void)
++{ ++ return pnfs_osd_objid_xdr_sz() + 2 + 2 + 1 + 1; ++} ++ ++/* OSD XDR API */ ++ ++/* Layout helpers */ ++extern struct pnfs_osd_layout *pnfs_osd_xdr_decode_layout( ++ struct pnfs_osd_layout *layout, u32 *p); ++ ++extern int pnfs_osd_xdr_encode_layout( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_layout *layout); ++ ++/* Device Info helpers */ ++ ++/* First pass calculate total size for space needed */ ++extern size_t pnfs_osd_xdr_deviceaddr_incore_sz(u32 *p); ++ ++/* Note: some strings pointed to inside @deviceaddr might point ++ * to space inside @p. @p should stay valid while @deviceaddr ++ * is in use. ++ * It is assumed that @deviceaddr points to bigger memory of size ++ * calculated in first pass by pnfs_osd_xdr_deviceaddr_incore_sz() ++ */ ++extern void pnfs_osd_xdr_decode_deviceaddr( ++ struct pnfs_osd_deviceaddr *deviceaddr, u32 *p); ++ ++/* For Servers */ ++extern int pnfs_osd_xdr_encode_deviceaddr( ++ struct exp_xdr_stream *xdr, struct pnfs_osd_deviceaddr *devaddr); ++ ++/* layoutupdate (layout_commit) xdr helpers */ ++extern int ++pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr, ++ struct pnfs_osd_layoutupdate *lou); ++extern __be32 * ++pnfs_osd_xdr_decode_layoutupdate(struct pnfs_osd_layoutupdate *lou, __be32 *p); ++ ++/* osd_ioerror encoding/decoding (layout_return) */ ++extern int ++pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr, struct pnfs_osd_ioerr *ioerr); ++extern __be32 * ++pnfs_osd_xdr_decode_ioerr(struct pnfs_osd_ioerr *ioerr, __be32 *p); ++ ++#endif /* __PNFS_OSD_XDR_H__ */ +diff -up linux-2.6.37.noarch/include/linux/posix_acl.h.orig linux-2.6.37.noarch/include/linux/posix_acl.h +--- linux-2.6.37.noarch/include/linux/posix_acl.h.orig 2011-01-28 09:37:32.880968218 -0500 ++++ linux-2.6.37.noarch/include/linux/posix_acl.h 2011-01-28 09:43:53.398764895 -0500 +@@ -8,6 +8,7 @@ + #ifndef __LINUX_POSIX_ACL_H + #define __LINUX_POSIX_ACL_H + ++#include + #include + + #define ACL_UNDEFINED_ID (-1) +diff -up 
linux-2.6.37.noarch/include/linux/sunrpc/msg_prot.h.orig linux-2.6.37.noarch/include/linux/sunrpc/msg_prot.h +--- linux-2.6.37.noarch/include/linux/sunrpc/msg_prot.h.orig 2011-01-04 19:50:19.000000000 -0500 ++++ linux-2.6.37.noarch/include/linux/sunrpc/msg_prot.h 2011-01-28 09:43:53.399764812 -0500 +@@ -14,6 +14,8 @@ + /* size of an XDR encoding unit in bytes, i.e. 32bit */ + #define XDR_UNIT (4) + ++#include ++ + /* spec defines authentication flavor as an unsigned 32 bit integer */ + typedef u32 rpc_authflavor_t; + +diff -up linux-2.6.37.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig linux-2.6.37.noarch/include/linux/sunrpc/rpc_pipe_fs.h +--- linux-2.6.37.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig 2011-01-04 19:50:19.000000000 -0500 ++++ linux-2.6.37.noarch/include/linux/sunrpc/rpc_pipe_fs.h 2011-01-28 09:43:53.399764812 -0500 +@@ -3,6 +3,7 @@ + + #ifdef __KERNEL__ + ++#include + #include + + struct rpc_pipe_msg { +@@ -11,6 +12,10 @@ struct rpc_pipe_msg { + size_t len; + size_t copied; + int errno; ++#define PIPEFS_AUTOFREE_RPCMSG 0x01 /* frees rpc_pipe_msg */ ++#define PIPEFS_AUTOFREE_RPCMSG_DATA 0x02 /* frees rpc_pipe_msg->data */ ++#define PIPEFS_AUTOFREE_UPCALL_MSG PIPEFS_AUTOFREE_RPCMSG_DATA ++ u8 flags; + }; + + struct rpc_pipe_ops { +diff -up linux-2.6.37.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig linux-2.6.37.noarch/include/linux/sunrpc/simple_rpc_pipefs.h +--- linux-2.6.37.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig 2011-01-28 09:43:53.400764729 -0500 ++++ linux-2.6.37.noarch/include/linux/sunrpc/simple_rpc_pipefs.h 2011-01-28 09:43:53.400764729 -0500 +@@ -0,0 +1,105 @@ ++/* ++ * Copyright (c) 2008 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * David M. Richter ++ * ++ * Drawing on work done by Andy Adamson and ++ * Marius Eriksen . Thanks for the help over the ++ * years, guys. 
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ *    notice, this list of conditions and the following disclaimer.
++ * 2. Redistributions in binary form must reproduce the above copyright
++ *    notice, this list of conditions and the following disclaimer in the
++ *    documentation and/or other materials provided with the distribution.
++ * 3. Neither the name of the University nor the names of its
++ *    contributors may be used to endorse or promote products derived
++ *    from this software without specific prior written permission.
++ *
++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ *
++ * With thanks to CITI's project sponsor and partner, IBM.
++ */
++
++#ifndef _SIMPLE_RPC_PIPEFS_H_
++#define _SIMPLE_RPC_PIPEFS_H_
++
++#include <linux/sunrpc/rpc_pipe_fs.h>
++
++#define payload_of(headerp) ((void *)((headerp) + 1))
++
++/*
++ * struct pipefs_hdr -- the generic message format for simple_rpc_pipefs.
++ * Messages may simply be the header itself, although having an optional
++ * data payload follow the header allows much more flexibility.
++ *
++ * Messages are created using pipefs_alloc_init_msg() and
++ * pipefs_alloc_init_msg_padded(), both of which accept a pointer to an
++ * (optional) data payload.
++ *
++ * Given a struct pipefs_hdr *msg that has a struct foo payload, the data
++ * can be accessed using: struct foo *foop = payload_of(msg)
++ */
++struct pipefs_hdr {
++	u32 msgid;
++	u8 type;
++	u8 flags;
++	u16 totallen; /* length of entire message, including hdr itself */
++	u32 status;
++};
++
++/*
++ * struct pipefs_list -- a type of list used for tracking callers who've made an
++ * upcall and are blocked waiting for a reply.
++ *
++ * See pipefs_queue_upcall_waitreply() and pipefs_assign_upcall_reply().
++ */
++struct pipefs_list {
++	struct list_head list;
++	spinlock_t list_lock;
++};
++
++
++/* See net/sunrpc/simple_rpc_pipefs.c for more info on using these functions. */
++extern struct dentry *pipefs_mkpipe(const char *name,
++				    const struct rpc_pipe_ops *ops,
++				    int wait_for_open);
++extern void pipefs_closepipe(struct dentry *pipe);
++extern void pipefs_init_list(struct pipefs_list *list);
++extern struct pipefs_hdr *pipefs_alloc_init_msg(u32 msgid, u8 type, u8 flags,
++						void *data, u16 datalen);
++extern struct pipefs_hdr *pipefs_alloc_init_msg_padded(u32 msgid, u8 type,
++						       u8 flags, void *data,
++						       u16 datalen, u16 padlen);
++extern struct pipefs_hdr *pipefs_queue_upcall_waitreply(struct dentry *pipe,
++							struct pipefs_hdr *msg,
++							struct pipefs_list
++							*uplist, u8 upflags,
++							u32 timeout);
++extern int pipefs_queue_upcall_noreply(struct dentry *pipe,
++				       struct pipefs_hdr *msg, u8 upflags);
++extern int pipefs_assign_upcall_reply(struct pipefs_hdr *reply,
++				      struct pipefs_list *uplist);
++extern struct pipefs_hdr *pipefs_readmsg(struct file *filp,
++					 const char __user *src, size_t len);
++extern ssize_t pipefs_generic_upcall(struct file *filp,
++				     struct rpc_pipe_msg *rpcmsg,
++				     char __user *dst, size_t buflen);
++extern void pipefs_generic_destroy_msg(struct rpc_pipe_msg *rpcmsg);
++ ++#endif /* _SIMPLE_RPC_PIPEFS_H_ */ +diff -up linux-2.6.37.noarch/include/linux/sunrpc/svc_xprt.h.orig linux-2.6.37.noarch/include/linux/sunrpc/svc_xprt.h +--- linux-2.6.37.noarch/include/linux/sunrpc/svc_xprt.h.orig 2011-01-28 09:37:32.915967004 -0500 ++++ linux-2.6.37.noarch/include/linux/sunrpc/svc_xprt.h 2011-01-28 09:43:53.400764729 -0500 +@@ -205,4 +205,41 @@ static inline char *__svc_print_addr(con + + return buf; + } ++ ++/* ++ * Print a network address in a universal format (see rfc1833 and nfsv4.1) ++ */ ++static inline int __svc_print_netaddr(struct sockaddr *addr, ++ struct xdr_netobj *na) ++{ ++ u16 port; ++ ssize_t len; ++ ++ switch (addr->sa_family) { ++ case AF_INET: { ++ struct sockaddr_in *sin = (struct sockaddr_in *)addr; ++ port = ntohs(sin->sin_port); ++ ++ len = snprintf(na->data, na->len, "%pI4.%u.%u", ++ &sin->sin_addr, ++ port >> 8, port & 0xff); ++ break; ++ } ++ case AF_INET6: { ++ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; ++ port = ntohs(sin6->sin6_port); ++ ++ len = snprintf(na->data, na->len, "%pI6.%u.%u", ++ &sin6->sin6_addr, ++ port >> 8, port & 0xff); ++ break; ++ } ++ default: ++ snprintf(na->data, na->len, "unknown address type: %d", ++ addr->sa_family); ++ len = -EINVAL; ++ break; ++ } ++ return len; ++} + #endif /* SUNRPC_SVC_XPRT_H */ +diff -up linux-2.6.37.noarch/include/linux/sunrpc/xdr.h.orig linux-2.6.37.noarch/include/linux/sunrpc/xdr.h +--- linux-2.6.37.noarch/include/linux/sunrpc/xdr.h.orig 2011-01-28 09:37:32.916966969 -0500 ++++ linux-2.6.37.noarch/include/linux/sunrpc/xdr.h 2011-01-28 09:43:53.401764649 -0500 +@@ -213,6 +213,7 @@ typedef int (*kxdrdproc_t)(void *rqstp, + + extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); + extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes); ++extern __be32 *xdr_rewind_stream(struct xdr_stream *xdr, __be32 *q); + extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, + unsigned int base, 
unsigned int len); + extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); +diff -up linux-2.6.37.noarch/net/sunrpc/Makefile.orig linux-2.6.37.noarch/net/sunrpc/Makefile +--- linux-2.6.37.noarch/net/sunrpc/Makefile.orig 2011-01-04 19:50:19.000000000 -0500 ++++ linux-2.6.37.noarch/net/sunrpc/Makefile 2011-01-28 09:43:53.402764570 -0500 +@@ -12,7 +12,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprt + svc.o svcsock.o svcauth.o svcauth_unix.o \ + addr.o rpcb_clnt.o timer.o xdr.o \ + sunrpc_syms.o cache.o rpc_pipe.o \ +- svc_xprt.o ++ svc_xprt.o simple_rpc_pipefs.o + sunrpc-$(CONFIG_NFS_V4_1) += backchannel_rqst.o bc_svc.o + sunrpc-$(CONFIG_PROC_FS) += stats.o + sunrpc-$(CONFIG_SYSCTL) += sysctl.o +diff -up linux-2.6.37.noarch/net/sunrpc/simple_rpc_pipefs.c.orig linux-2.6.37.noarch/net/sunrpc/simple_rpc_pipefs.c +--- linux-2.6.37.noarch/net/sunrpc/simple_rpc_pipefs.c.orig 2011-01-28 09:43:53.403764492 -0500 ++++ linux-2.6.37.noarch/net/sunrpc/simple_rpc_pipefs.c 2011-01-28 09:43:53.403764492 -0500 +@@ -0,0 +1,423 @@ ++/* ++ * net/sunrpc/simple_rpc_pipefs.c ++ * ++ * Copyright (c) 2008 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * David M. Richter ++ * ++ * Drawing on work done by Andy Adamson and ++ * Marius Eriksen . Thanks for the help over the ++ * years, guys. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. 
Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * With thanks to CITI's project sponsor and partner, IBM. ++ */ ++ ++#include ++#include ++#include ++ ++ ++/* ++ * Make an rpc_pipefs pipe named @name at the root of the mounted rpc_pipefs ++ * filesystem. ++ * ++ * If @wait_for_open is non-zero and an upcall is later queued but the userland ++ * end of the pipe has not yet been opened, the upcall will remain queued until ++ * the pipe is opened; otherwise, the upcall queueing will return with -EPIPE. ++ */ ++struct dentry *pipefs_mkpipe(const char *name, const struct rpc_pipe_ops *ops, ++ int wait_for_open) ++{ ++ struct dentry *dir, *pipe; ++ struct vfsmount *mnt; ++ ++ mnt = rpc_get_mount(); ++ if (IS_ERR(mnt)) { ++ pipe = ERR_CAST(mnt); ++ goto out; ++ } ++ dir = mnt->mnt_root; ++ if (!dir) { ++ pipe = ERR_PTR(-ENOENT); ++ goto out; ++ } ++ pipe = rpc_mkpipe(dir, name, NULL, ops, ++ wait_for_open ? RPC_PIPE_WAIT_FOR_OPEN : 0); ++out: ++ return pipe; ++} ++EXPORT_SYMBOL(pipefs_mkpipe); ++ ++/* ++ * Shutdown a pipe made by pipefs_mkpipe(). 
++ * XXX: do we need to retain an extra reference on the mount? ++ */ ++void pipefs_closepipe(struct dentry *pipe) ++{ ++ rpc_unlink(pipe); ++ rpc_put_mount(); ++} ++EXPORT_SYMBOL(pipefs_closepipe); ++ ++/* ++ * Initialize a struct pipefs_list -- which are a way to keep track of callers ++ * who're blocked having made an upcall and are awaiting a reply. ++ * ++ * See pipefs_queue_upcall_waitreply() and pipefs_find_upcall_msgid() for how ++ * to use them. ++ */ ++inline void pipefs_init_list(struct pipefs_list *list) ++{ ++ INIT_LIST_HEAD(&list->list); ++ spin_lock_init(&list->list_lock); ++} ++EXPORT_SYMBOL(pipefs_init_list); ++ ++/* ++ * Alloc/init a generic pipefs message header and copy into its message body ++ * an arbitrary data payload. ++ * ++ * struct pipefs_hdr's are meant to serve as generic, general-purpose message ++ * headers for easy rpc_pipefs I/O. When an upcall is made, the ++ * struct pipefs_hdr is assigned to a struct rpc_pipe_msg and delivered ++ * therein. --And yes, the naming can seem a little confusing at first: ++ * ++ * When one thinks of an upcall "message", in simple_rpc_pipefs that's a ++ * struct pipefs_hdr (possibly with an attached message body). A ++ * struct rpc_pipe_msg is actually only the -vehicle- by which the "real" ++ * message is delivered and processed. 
++ */ ++struct pipefs_hdr *pipefs_alloc_init_msg_padded(u32 msgid, u8 type, u8 flags, ++ void *data, u16 datalen, u16 padlen) ++{ ++ u16 totallen; ++ struct pipefs_hdr *msg = NULL; ++ ++ totallen = sizeof(*msg) + datalen + padlen; ++ if (totallen > PAGE_SIZE) { ++ msg = ERR_PTR(-E2BIG); ++ goto out; ++ } ++ ++ msg = kzalloc(totallen, GFP_KERNEL); ++ if (!msg) { ++ msg = ERR_PTR(-ENOMEM); ++ goto out; ++ } ++ ++ msg->msgid = msgid; ++ msg->type = type; ++ msg->flags = flags; ++ msg->totallen = totallen; ++ memcpy(payload_of(msg), data, datalen); ++out: ++ return msg; ++} ++EXPORT_SYMBOL(pipefs_alloc_init_msg_padded); ++ ++/* ++ * See the description of pipefs_alloc_init_msg_padded(). ++ */ ++struct pipefs_hdr *pipefs_alloc_init_msg(u32 msgid, u8 type, u8 flags, ++ void *data, u16 datalen) ++{ ++ return pipefs_alloc_init_msg_padded(msgid, type, flags, data, ++ datalen, 0); ++} ++EXPORT_SYMBOL(pipefs_alloc_init_msg); ++ ++ ++static void pipefs_init_rpcmsg(struct rpc_pipe_msg *rpcmsg, ++ struct pipefs_hdr *msg, u8 upflags) ++{ ++ memset(rpcmsg, 0, sizeof(*rpcmsg)); ++ rpcmsg->data = msg; ++ rpcmsg->len = msg->totallen; ++ rpcmsg->flags = upflags; ++} ++ ++static struct rpc_pipe_msg *pipefs_alloc_init_rpcmsg(struct pipefs_hdr *msg, ++ u8 upflags) ++{ ++ struct rpc_pipe_msg *rpcmsg; ++ ++ rpcmsg = kmalloc(sizeof(*rpcmsg), GFP_KERNEL); ++ if (!rpcmsg) ++ return ERR_PTR(-ENOMEM); ++ ++ pipefs_init_rpcmsg(rpcmsg, msg, upflags); ++ return rpcmsg; ++} ++ ++ ++/* represents an upcall that'll block and wait for a reply */ ++struct pipefs_upcall { ++ u32 msgid; ++ struct rpc_pipe_msg rpcmsg; ++ struct list_head list; ++ wait_queue_head_t waitq; ++ struct pipefs_hdr *reply; ++}; ++ ++ ++static void pipefs_init_upcall_waitreply(struct pipefs_upcall *upcall, ++ struct pipefs_hdr *msg, u8 upflags) ++{ ++ upcall->reply = NULL; ++ upcall->msgid = msg->msgid; ++ INIT_LIST_HEAD(&upcall->list); ++ init_waitqueue_head(&upcall->waitq); ++ pipefs_init_rpcmsg(&upcall->rpcmsg, msg, upflags); 
++} ++ ++static int __pipefs_queue_upcall_waitreply(struct dentry *pipe, ++ struct pipefs_upcall *upcall, ++ struct pipefs_list *uplist, ++ u32 timeout) ++{ ++ int err = 0; ++ DECLARE_WAITQUEUE(wq, current); ++ ++ add_wait_queue(&upcall->waitq, &wq); ++ spin_lock(&uplist->list_lock); ++ list_add(&upcall->list, &uplist->list); ++ spin_unlock(&uplist->list_lock); ++ ++ err = rpc_queue_upcall(pipe->d_inode, &upcall->rpcmsg); ++ if (err < 0) ++ goto out; ++ ++ if (timeout) { ++ /* retval of 0 means timer expired */ ++ err = schedule_timeout_uninterruptible(timeout); ++ if (err == 0 && upcall->reply == NULL) ++ err = -ETIMEDOUT; ++ } else { ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule(); ++ __set_current_state(TASK_RUNNING); ++ } ++ ++out: ++ spin_lock(&uplist->list_lock); ++ list_del_init(&upcall->list); ++ spin_unlock(&uplist->list_lock); ++ remove_wait_queue(&upcall->waitq, &wq); ++ return err; ++} ++ ++/* ++ * Queue a pipefs msg for an upcall to userspace, place the calling thread ++ * on @uplist, and block the thread to wait for a reply. If @timeout is ++ * nonzero, the thread will be blocked for at most @timeout jiffies. ++ * ++ * (To convert time units into jiffies, consider the functions ++ * msecs_to_jiffies(), usecs_to_jiffies(), timeval_to_jiffies(), and ++ * timespec_to_jiffies().) ++ * ++ * Once a reply is received by your downcall handler, call ++ * pipefs_assign_upcall_reply() with @uplist to find the corresponding upcall, ++ * assign the reply, and wake the waiting thread. ++ * ++ * This function's return value pointer may be an error and should be checked ++ * with IS_ERR() before attempting to access the reply message. ++ * ++ * Callers are responsible for freeing @msg, unless pipefs_generic_destroy_msg() ++ * is used as the ->destroy_msg() callback and the PIPEFS_AUTOFREE_UPCALL_MSG ++ * flag is set in @upflags. See also rpc_pipe_fs.h. 
++ */
++struct pipefs_hdr *pipefs_queue_upcall_waitreply(struct dentry *pipe,
++					 struct pipefs_hdr *msg,
++					 struct pipefs_list *uplist,
++					 u8 upflags, u32 timeout)
++{
++	int err = 0;
++	/*
++	 * NOTE(review): the upcall, including the rpc_pipe_msg handed to
++	 * rpc_queue_upcall(), lives on this stack frame.  Nothing visible
++	 * here takes it back off the pipe when the wait times out, so the
++	 * pipe may be left holding a pointer into a dead frame -- confirm
++	 * how the rpc_pipefs side handles that.
++	 */
++	struct pipefs_upcall upcall;
++
++	pipefs_init_upcall_waitreply(&upcall, msg, upflags);
++	err = __pipefs_queue_upcall_waitreply(pipe, &upcall, uplist, timeout);
++	if (err < 0) {
++		/* a reply may have been assigned just as we gave up */
++		kfree(upcall.reply);
++		upcall.reply = ERR_PTR(err);
++	}
++
++	return upcall.reply;
++}
++EXPORT_SYMBOL(pipefs_queue_upcall_waitreply);
++
++/*
++ * Queue a pipefs msg for an upcall to userspace and immediately return (i.e.,
++ * no reply is expected).
++ *
++ * Returns 0 on success or a negative errno: -ENOMEM if the rpc_pipe_msg
++ * wrapper cannot be allocated, otherwise the error from rpc_queue_upcall().
++ *
++ * Callers are responsible for freeing @msg, unless pipefs_generic_destroy_msg()
++ * is used as the ->destroy_msg() callback and the PIPEFS_AUTOFREE_UPCALL_MSG
++ * flag is set in @upflags. See also rpc_pipe_fs.h.
++ */
++int pipefs_queue_upcall_noreply(struct dentry *pipe, struct pipefs_hdr *msg,
++				u8 upflags)
++{
++	int err;
++	struct rpc_pipe_msg *rpcmsg;
++
++	/* the wrapper is allocated here, so ask destroy_msg() to free it */
++	upflags |= PIPEFS_AUTOFREE_RPCMSG;
++	rpcmsg = pipefs_alloc_init_rpcmsg(msg, upflags);
++	if (IS_ERR(rpcmsg))
++		return PTR_ERR(rpcmsg);
++
++	err = rpc_queue_upcall(pipe->d_inode, rpcmsg);
++	/*
++	 * A message that was never queued is never passed to ->destroy_msg(),
++	 * and the caller has no handle on the wrapper, so release it here.
++	 * @msg itself is left to the caller on failure.
++	 */
++	if (err < 0)
++		kfree(rpcmsg);
++	return err;
++}
++EXPORT_SYMBOL(pipefs_queue_upcall_noreply);
++
++
++/*
++ * Return the upcall on @uplist whose msgid equals @msgid, or NULL if there
++ * is none.  The list is walked under uplist->list_lock, but the lock is
++ * dropped before returning.
++ *
++ * NOTE(review): the returned pointer is not protected once the lock is
++ * released; the waiter may unlink itself concurrently.
++ */
++static struct pipefs_upcall *pipefs_find_upcall_msgid(u32 msgid,
++					 struct pipefs_list *uplist)
++{
++	struct pipefs_upcall *upcall;
++	struct pipefs_upcall *found = NULL;
++
++	spin_lock(&uplist->list_lock);
++	list_for_each_entry(upcall, &uplist->list, list) {
++		if (upcall->msgid == msgid) {
++			found = upcall;
++			break;
++		}
++	}
++	spin_unlock(&uplist->list_lock);
++	return found;
++}
++
++/*
++ * In your rpc_pipe_ops->downcall() handler, once you've read in a downcall
++ * message and have determined that it is a reply to a waiting upcall,
++ * you can use this function to find the appropriate upcall, assign the result,
++ * and wake the upcall thread.
++ *
++ * The reply message must have the same msgid as the original upcall message's.
++ *
++ * See also pipefs_queue_upcall_waitreply() and pipefs_readmsg().
++ *
++ * Returns 0 when the reply was handed to a waiter, or -ENOENT when no upcall
++ * with a matching msgid is on @uplist (in which case @reply is not consumed).
++ */
++int pipefs_assign_upcall_reply(struct pipefs_hdr *reply,
++			       struct pipefs_list *uplist)
++{
++	struct pipefs_upcall *upcall;
++
++	upcall = pipefs_find_upcall_msgid(reply->msgid, uplist);
++	if (!upcall) {
++		/* msgid is handled as an unsigned 32-bit value: print with %u */
++		printk(KERN_ERR "%s: ERROR: have reply but no matching upcall "
++			"for msgid %u\n", __func__, reply->msgid);
++		return -ENOENT;
++	}
++
++	/*
++	 * NOTE(review): the lookup drops the list lock before returning, so
++	 * the waiter could time out and unlink itself between the lookup and
++	 * the two statements below -- confirm this window is acceptable.
++	 */
++	upcall->reply = reply;
++	wake_up(&upcall->waitq);
++	return 0;
++}
++EXPORT_SYMBOL(pipefs_assign_upcall_reply);
++
++/*
++ * Generic method to read-in and return a newly-allocated message which begins
++ * with a struct pipefs_hdr.
++ *
++ * Copies @len bytes from the user buffer @src into a kzalloc()ed buffer.
++ * Returns the buffer (to be released with kfree()) or an ERR_PTR():
++ * -EINVAL if @len is smaller than the header, -ENOMEM if the allocation
++ * fails, -EFAULT if the copy from userspace fails.
++ */
++struct pipefs_hdr *pipefs_readmsg(struct file *filp, const char __user *src,
++				  size_t len)
++{
++	int err = 0, hdrsize;
++	struct pipefs_hdr *msg = NULL;
++
++	hdrsize = sizeof(*msg);
++	if (len < hdrsize) {
++		printk(KERN_ERR "%s: ERROR: header is too short (%d vs %d)\n",
++		       __func__, (int) len, hdrsize);
++		err = -EINVAL;
++		goto out;
++	}
++
++	msg = kzalloc(len, GFP_KERNEL);
++	if (!msg) {
++		err = -ENOMEM;
++		goto out;
++	}
++	if (copy_from_user(msg, src, len))
++		err = -EFAULT;
++out:
++	if (err) {
++		/* msg is still NULL on the early exits; kfree(NULL) is a no-op */
++		kfree(msg);
++		msg = ERR_PTR(err);
++	}
++	return msg;
++}
++EXPORT_SYMBOL(pipefs_readmsg);
++
++/*
++ * Generic rpc_pipe_ops->upcall() handler implementation.
++ *
++ * Don't call this directly: to make an upcall, use
++ * pipefs_queue_upcall_waitreply() or pipefs_queue_upcall_noreply().
++ */
++ssize_t pipefs_generic_upcall(struct file *filp, struct rpc_pipe_msg *rpcmsg,
++			      char __user *dst, size_t buflen)
++{
++	char *data;
++	ssize_t len, left;
++
++	/* resume where the previous partial read stopped */
++	data = (char *)rpcmsg->data + rpcmsg->copied;
++	len = rpcmsg->len - rpcmsg->copied;
++	if (len > buflen)
++		len = buflen;
++
++	/*
++	 * copy_to_user() returns the number of bytes it could NOT copy, never
++	 * a negative errno.  If nothing at all could be copied, report the
++	 * fault; a partial copy is returned as a short read.
++	 */
++	left = copy_to_user(dst, data, len);
++	if (left && left == len) {
++		rpcmsg->errno = -EFAULT;
++		return -EFAULT;
++	}
++
++	len -= left;
++	rpcmsg->copied += len;
++	rpcmsg->errno = 0;
++	return len;
++}
++EXPORT_SYMBOL(pipefs_generic_upcall);
++
++/*
++ * Generic rpc_pipe_ops->destroy_msg() handler implementation.
++ *
++ * Items are only freed if @rpcmsg->flags has been set appropriately.
++ * See pipefs_queue_upcall_noreply() and rpc_pipe_fs.h.
++ */
++void pipefs_generic_destroy_msg(struct rpc_pipe_msg *rpcmsg)
++{
++	/* free the payload first: the second kfree() releases rpcmsg itself */
++	if (rpcmsg->flags & PIPEFS_AUTOFREE_UPCALL_MSG)
++		kfree(rpcmsg->data);
++	if (rpcmsg->flags & PIPEFS_AUTOFREE_RPCMSG)
++		kfree(rpcmsg);
++}
++EXPORT_SYMBOL(pipefs_generic_destroy_msg);
+diff -up linux-2.6.37.noarch/net/sunrpc/xdr.c.orig linux-2.6.37.noarch/net/sunrpc/xdr.c
+--- linux-2.6.37.noarch/net/sunrpc/xdr.c.orig	2011-01-28 09:37:33.428949202 -0500
++++ linux-2.6.37.noarch/net/sunrpc/xdr.c	2011-01-28 09:43:53.404764414 -0500
+@@ -518,6 +518,27 @@ __be32 * xdr_reserve_space(struct xdr_st
+ EXPORT_SYMBOL_GPL(xdr_reserve_space);
+ 
+ /**
++ * xdr_rewind_stream - rewind a stream back to some checkpoint
++ * @xdr: pointer to xdr_stream
++ * @q: some checkpoint at historical place of @xdr
++ *
++ * Restores an xdr stream to some historical point. @q must be
++ * a logical xdr point in the past that was sampled by @q = @xdr->p.
++ */
++__be32 *xdr_rewind_stream(struct xdr_stream *xdr, __be32 *q)
++{
++	size_t shrink = (xdr->p - q) << 2;	/* 32-bit words -> bytes */
++
++	BUG_ON(q > xdr->p);
++	BUG_ON(shrink > xdr->buf->len || shrink > xdr->iov->iov_len);
++	xdr->buf->len -= shrink;
++	xdr->iov->iov_len -= shrink;
++	xdr->p = q;
++	return xdr->p;
++}
++EXPORT_SYMBOL_GPL(xdr_rewind_stream);
++
++/**
+  * xdr_write_pages - Insert a list of pages into an XDR buffer for sending
+  * @xdr: pointer to xdr_stream
+  * @pages: list of pages