Commit 06886a5a authored by Boaz Harrosh's avatar Boaz Harrosh
Browse files

exofs: Move all operations to an io_engine

In anticipation of multi-device operations, we separate osd operations
into an abstract I/O API. Currently only one device is used but later
when adding more devices, we will drive all devices in parallel according
to a "data_map" that describes how data is arranged on multiple devices.
The file system level operates, like before, as if there is one object
(inode-number) and an i_size. The io engine will split this to the same
object-number but on multiple devices.

At first we introduce Mirror (raid 1) layout. But at the final outcome
we intend to fully implement the pNFS-Objects data-map, including
raid 0,4,5,6 over mirrored devices, over multiple device-groups. And
more. See: http://tools.ietf.org/html/draft-ietf-nfsv4-pnfs-obj-12



* Define an io_state based API for accessing osd storage devices
  in an abstract way.
  Usage:
	First a caller allocates an io state with:
		exofs_get_io_state(struct exofs_sb_info *sbi,
				   struct exofs_io_state** ios);

	Then calls one of:
		exofs_sbi_create(struct exofs_io_state *ios);
		exofs_sbi_remove(struct exofs_io_state *ios);
		exofs_sbi_write(struct exofs_io_state *ios);
		exofs_sbi_read(struct exofs_io_state *ios);
		exofs_oi_truncate(struct exofs_i_info *oi, u64 new_len);

	And when done
		exofs_put_io_state(struct exofs_io_state *ios);

* Convert all source files to use this new API
* Convert from bio_alloc to bio_kmalloc
* In io engine we make use of the now fixed osd_req_decode_sense

There are no functional changes or on disk additions after this patch.
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
parent 8ce9bdd1
......@@ -155,22 +155,4 @@ enum {
(((name_len) + offsetof(struct exofs_dir_entry, name) + \
EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
/*************************
* function declarations *
*************************/
/* osd.c */
void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
const struct osd_obj_id *obj);
int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid);
/*
 * Convenience wrapper around exofs_check_ok_resid() for callers that do not
 * need residual counts: both in_resid and out_resid are passed as NULL.
 * Returns whatever exofs_check_ok_resid() returns for @or.
 */
static inline int exofs_check_ok(struct osd_request *or)
{
return exofs_check_ok_resid(or, NULL, NULL);
}
int exofs_sync_op(struct osd_request *or, int timeout, u8 *cred);
int exofs_async_op(struct osd_request *or,
osd_req_done_fn *async_done, void *caller_context, u8 *cred);
int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr);
#endif /*ifndef __EXOFS_COM_H__*/
......@@ -30,14 +30,13 @@
* along with exofs; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef __EXOFS_H__
#define __EXOFS_H__
#include <linux/fs.h>
#include <linux/time.h>
#include "common.h"
#ifndef __EXOFS_H__
#define __EXOFS_H__
#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a)
#ifdef CONFIG_EXOFS_DEBUG
......@@ -56,6 +55,7 @@
*/
struct exofs_sb_info {
struct osd_dev *s_dev; /* returned by get_osd_dev */
struct exofs_fscb s_fscb; /* Written often, pre-allocate*/
osd_id s_pid; /* partition ID of file system*/
int s_timeout; /* timeout for OSD operations */
uint64_t s_nextid; /* highest object ID used */
......@@ -79,6 +79,50 @@ struct exofs_i_info {
struct inode vfs_inode; /* normal in-memory inode */
};
/*
 * Return the OSD object id that backs the inode embedded in @oi: the VFS
 * inode number offset by EXOFS_OBJ_OFF.
 */
static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
{
return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF;
}
struct exofs_io_state;
/*
 * Completion callback type. It is stored in exofs_io_state.done and invoked
 * as done(ios, ios->private) once the last reference on ios->kref is dropped.
 */
typedef void (*exofs_io_done_fn)(struct exofs_io_state *or, void *private);
/*
 * State of one I/O operation handed to the io engine (ios.c).
 * The caller fills in the object, credential and data/attribute fields;
 * the exofs_sbi_xxx() functions then create one osd_request per device in
 * per_dev[] and execute them.
 */
struct exofs_io_state {
/* Counts the outstanding per-device requests during execution */
struct kref kref;
/* Opaque argument passed back to @done */
void *private;
/* Completion callback; when NULL the operation is executed synchronously */
exofs_io_done_fn done;
/* Owning super-block info; supplies the osd device */
struct exofs_sb_info *sbi;
/* Target object (partition is preset by exofs_get_io_state) */
struct osd_obj_id obj;
/* Credential used when finalizing each request */
u8 *cred;
/* Global read/write IO*/
loff_t offset;
unsigned long length;
/* Kernel buffer for the data; only consulted when @bio is NULL */
void *kern_buff;
/* Data as a bio; takes precedence over @kern_buff */
struct bio *bio;
/* Attributes */
/* in_attr: attributes to get; added to the request as a get-attr list */
unsigned in_attr_len;
struct osd_attr *in_attr;
/* out_attr: attributes to set; added to the request as a set-attr list */
unsigned out_attr_len;
struct osd_attr *out_attr;
/* Variable array of size numdevs */
/* numdevs counts the per_dev[] entries that have been set up so far */
unsigned numdevs;
struct exofs_per_dev_state {
/* Request issued to this device; ended by exofs_put_io_state() */
struct osd_request *or;
/* Per-device bio, if any; released by exofs_put_io_state() */
struct bio *bio;
} per_dev[];
};
/*
 * Number of bytes needed for an exofs_io_state whose trailing per_dev[]
 * array has room for @numdevs entries.
 */
static inline unsigned exofs_io_state_size(unsigned numdevs)
{
	unsigned total = sizeof(struct exofs_io_state);

	total += sizeof(struct exofs_per_dev_state) * numdevs;
	return total;
}
/*
* our inode flags
*/
......@@ -130,6 +174,42 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode)
/*************************
* function declarations *
*************************/
/* ios.c */
/* Initialise the credential buffer @cred_a for accessing @obj. */
void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
const struct osd_obj_id *obj);
/*
 * Read @length bytes at @offset of @obj on device @od into the kernel
 * buffer @p, using @cred. Returns 0 on success or a negative error code.
 */
int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
u64 offset, void *p, unsigned length);
/*
 * Allocate a zeroed io state bound to @sbi and store it in *ios.
 * Returns 0, or -ENOMEM with *ios set to NULL.
 */
int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** ios);
/* Release an io state and its per-device requests and bios; NULL is accepted. */
void exofs_put_io_state(struct exofs_io_state *ios);
/*
 * Decode the result of every per-device request of @ios and return the
 * accumulated Linux error code (0 if none). When @resid is not NULL it is
 * set to 0 on success and to ios->length otherwise.
 */
int exofs_check_io(struct exofs_io_state *ios, u64 *resid);
/*
 * The exofs_sbi_xxx() calls build the request(s) described by @ios and
 * execute them; execution is synchronous when ios->done is NULL.
 */
int exofs_sbi_create(struct exofs_io_state *ios);
int exofs_sbi_remove(struct exofs_io_state *ios);
int exofs_sbi_write(struct exofs_io_state *ios);
int exofs_sbi_read(struct exofs_io_state *ios);
/*
 * Look up @attr (matched by attr_page and attr_id) in the get-attributes
 * reply of the first per-device request of @ios.
 * NOTE(review): the full body is not visible in this hunk - confirm the
 * exact return values against ios.c.
 */
int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr);
/* Set the g_attr_logical_length attribute of the object behind @oi to @new_len. */
int exofs_oi_truncate(struct exofs_i_info *oi, u64 new_len);
/*
 * Write on behalf of an inode: aim @ios at the object that backs @oi, attach
 * the inode's credential, then hand the request to exofs_sbi_write().
 * Returns the result of exofs_sbi_write().
 */
static inline int exofs_oi_write(struct exofs_i_info *oi,
				 struct exofs_io_state *ios)
{
	ios->cred = oi->i_cred;
	ios->obj.id = exofs_oi_objno(oi);

	return exofs_sbi_write(ios);
}
/*
 * Read on behalf of an inode: aim @ios at the object that backs @oi, attach
 * the inode's credential, then hand the request to exofs_sbi_read().
 * Returns the result of exofs_sbi_read().
 */
static inline int exofs_oi_read(struct exofs_i_info *oi,
				struct exofs_io_state *ios)
{
	ios->cred = oi->i_cred;
	ios->obj.id = exofs_oi_objno(oi);

	return exofs_sbi_read(ios);
}
/* inode.c */
void exofs_truncate(struct inode *inode);
int exofs_setattr(struct dentry *, struct iattr *);
......@@ -169,6 +249,7 @@ extern const struct file_operations exofs_file_operations;
/* inode.c */
extern const struct address_space_operations exofs_aops;
extern const struct osd_attr g_attr_logical_length;
/* namei.c */
extern const struct inode_operations exofs_dir_inode_operations;
......
This diff is collapsed.
......@@ -23,88 +23,327 @@
*/
#include <scsi/scsi_device.h>
#include <scsi/osd_sense.h>
#include "exofs.h"
int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid)
void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
{
struct osd_sense_info osi;
int ret = osd_req_decode_sense(or, &osi);
if (ret) { /* translate to Linux codes */
if (osi.additional_code == scsi_invalid_field_in_cdb) {
if (osi.cdb_field_offset == OSD_CFO_STARTING_BYTE)
ret = -EFAULT;
if (osi.cdb_field_offset == OSD_CFO_OBJECT_ID)
ret = -ENOENT;
else
ret = -EINVAL;
} else if (osi.additional_code == osd_quota_error)
ret = -ENOSPC;
else
ret = -EIO;
osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
}
int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
u64 offset, void *p, unsigned length)
{
struct osd_request *or = osd_start_request(od, GFP_KERNEL);
/* struct osd_sense_info osi = {.key = 0};*/
int ret;
if (unlikely(!or)) {
EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__);
return -ENOMEM;
}
ret = osd_req_read_kern(or, obj, offset, p, length);
if (unlikely(ret)) {
EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__);
goto out;
}
/* FIXME: should be include in osd_sense_info */
if (in_resid)
*in_resid = or->in.req ? or->in.req->resid_len : 0;
ret = osd_finalize_request(or, 0, cred, NULL);
if (unlikely(ret)) {
EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
goto out;
}
if (out_resid)
*out_resid = or->out.req ? or->out.req->resid_len : 0;
ret = osd_execute_request(or);
if (unlikely(ret))
EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
/* osd_req_decode_sense(or, ret); */
out:
osd_end_request(or);
return ret;
}
void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** pios)
{
osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
struct exofs_io_state *ios;
/*TODO: Maybe use kmem_cach per sbi of size
* exofs_io_state_size(sbi->s_numdevs)
*/
ios = kzalloc(exofs_io_state_size(1), GFP_KERNEL);
if (unlikely(!ios)) {
*pios = NULL;
return -ENOMEM;
}
ios->sbi = sbi;
ios->obj.partition = sbi->s_pid;
*pios = ios;
return 0;
}
/*
* Perform a synchronous OSD operation.
*/
int exofs_sync_op(struct osd_request *or, int timeout, uint8_t *credential)
void exofs_put_io_state(struct exofs_io_state *ios)
{
int ret;
if (ios) {
unsigned i;
or->timeout = timeout;
ret = osd_finalize_request(or, 0, credential, NULL);
if (ret) {
EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
return ret;
for (i = 0; i < ios->numdevs; i++) {
struct exofs_per_dev_state *per_dev = &ios->per_dev[i];
if (per_dev->or)
osd_end_request(per_dev->or);
if (per_dev->bio)
bio_put(per_dev->bio);
}
kfree(ios);
}
}
ret = osd_execute_request(or);
static void _sync_done(struct exofs_io_state *ios, void *p)
{
struct completion *waiting = p;
if (ret)
EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
/* osd_req_decode_sense(or, ret); */
complete(waiting);
}
/*
 * kref release function of an io state: runs when the last reference taken
 * in exofs_io_execute() is dropped and reports completion through the
 * io state's done callback.
 */
static void _last_io(struct kref *kref)
{
	struct exofs_io_state *ios;

	ios = container_of(kref, struct exofs_io_state, kref);
	ios->done(ios, ios->private);
}
/*
 * Per-request completion callback passed to osd_execute_request_async().
 * @p is the owning io state; drop the reference that was taken for this
 * request, which triggers _last_io() if it was the last one.
 */
static void _done_io(struct osd_request *or, void *p)
{
	struct exofs_io_state *owner = p;

	kref_put(&owner->kref, _last_io);
}
/*
 * Finalize and submit every request that was set up in ios->per_dev[].
 *
 * When ios->done is NULL the call is synchronous: a completion on this
 * function's stack is installed as the done callback, the function waits for
 * it and returns the result of exofs_check_io(). Otherwise the caller's done
 * callback is invoked once all requests have completed and 0 is returned
 * here. A failure of osd_finalize_request() is returned immediately, before
 * any request has been submitted.
 */
static int exofs_io_execute(struct exofs_io_state *ios)
{
DECLARE_COMPLETION_ONSTACK(wait);
bool sync = (ios->done == NULL);
int i, ret;
/* No caller callback: complete the on-stack completion instead */
if (sync) {
ios->done = _sync_done;
ios->private = &wait;
}
/* Pass 1: finalize all requests, so an error aborts before submission */
for (i = 0; i < ios->numdevs; i++) {
struct osd_request *or = ios->per_dev[i].or;
if (unlikely(!or))
continue;
ret = osd_finalize_request(or, 0, ios->cred, NULL);
if (unlikely(ret)) {
EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n",
ret);
return ret;
}
}
/*
 * Pass 2: submit. The reference from kref_init() is held by this function
 * until the loop is over, so the done callback cannot run while requests
 * are still being submitted; every submitted request holds one more.
 */
kref_init(&ios->kref);
for (i = 0; i < ios->numdevs; i++) {
struct osd_request *or = ios->per_dev[i].or;
if (unlikely(!or))
continue;
kref_get(&ios->kref);
/* NOTE(review): the return value of the async submit is not checked */
osd_execute_request_async(or, _done_io, ios);
}
/* Drop our own reference; _last_io() runs when the last request is done */
kref_put(&ios->kref, _last_io);
ret = 0;
if (sync) {
wait_for_completion(&wait);
ret = exofs_check_io(ios, NULL);
}
return ret;
}
/*
* Perform an asynchronous OSD operation.
*/
int exofs_async_op(struct osd_request *or, osd_req_done_fn *async_done,
void *caller_context, u8 *cred)
int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
{
int ret;
enum osd_err_priority acumulated_osd_err = 0;
int acumulated_lin_err = 0;
int i;
ret = osd_finalize_request(or, 0, cred, NULL);
if (ret) {
EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
return ret;
for (i = 0; i < ios->numdevs; i++) {
struct osd_sense_info osi;
int ret = osd_req_decode_sense(ios->per_dev[i].or, &osi);
if (likely(!ret))
continue;
if (unlikely(ret == -EFAULT)) {
EXOFS_DBGMSG("%s: EFAULT Need page clear\n", __func__);
/*FIXME: All the pages in this device range should:
* clear_highpage(page);
*/
}
if (osi.osd_err_pri >= acumulated_osd_err) {
acumulated_osd_err = osi.osd_err_pri;
acumulated_lin_err = ret;
}
}
/* TODO: raid specific residual calculations */
if (resid) {
if (likely(!acumulated_lin_err))
*resid = 0;
else
*resid = ios->length;
}
return acumulated_lin_err;
}
/*
 * Create the object described by ios->obj.
 *
 * Starts one request per device (a single device for now), records it in
 * ios->per_dev[] and runs the lot through exofs_io_execute().
 * Returns -ENOMEM if a request cannot be started, otherwise the result of
 * exofs_io_execute(). Requests recorded in @ios are released later by
 * exofs_put_io_state().
 */
int exofs_sbi_create(struct exofs_io_state *ios)
{
	int dev;

	for (dev = 0; dev < 1; dev++) {
		struct osd_request *req;

		req = osd_start_request(ios->sbi->s_dev, GFP_KERNEL);
		if (unlikely(!req)) {
			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
			return -ENOMEM;
		}

		ios->per_dev[dev].or = req;
		ios->numdevs++;

		osd_req_create_object(req, &ios->obj);
	}

	return exofs_io_execute(ios);
}
/*
 * Remove the object described by ios->obj.
 *
 * Starts one request per device (a single device for now), records it in
 * ios->per_dev[] and runs the lot through exofs_io_execute().
 * Returns -ENOMEM if a request cannot be started, otherwise the result of
 * exofs_io_execute(). Requests recorded in @ios are released later by
 * exofs_put_io_state().
 */
int exofs_sbi_remove(struct exofs_io_state *ios)
{
	int dev;

	for (dev = 0; dev < 1; dev++) {
		struct osd_request *req;

		req = osd_start_request(ios->sbi->s_dev, GFP_KERNEL);
		if (unlikely(!req)) {
			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
			return -ENOMEM;
		}

		ios->per_dev[dev].or = req;
		ios->numdevs++;

		osd_req_remove_object(req, &ios->obj);
	}

	return exofs_io_execute(ios);
}
/*
 * Write data and/or attributes to the object described by ios->obj.
 *
 * For each device (a single device for now) a request is started and
 * recorded in ios->per_dev[]. The kind of request depends on what the
 * caller filled in:
 *   - ios->bio set:       write ios->length bytes from the bio at ios->offset
 *   - ios->kern_buff set: write ios->length bytes from the kernel buffer
 *   - neither:            a set-attributes-only request
 * ios->out_attr (attributes to set) and ios->in_attr (attributes to get)
 * are added to the request when present.
 *
 * Returns -ENOMEM if a request cannot be started, the error returned by
 * osd_req_write_kern() if the kernel buffer cannot be attached, otherwise
 * the result of exofs_io_execute(). Requests already recorded in @ios are
 * released later by exofs_put_io_state().
 */
int exofs_sbi_write(struct exofs_io_state *ios)
{
	int i, ret;

	for (i = 0; i < 1; i++) {
		struct osd_request *or;

		or = osd_start_request(ios->sbi->s_dev, GFP_KERNEL);
		if (unlikely(!or)) {
			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
			ret = -ENOMEM;
			goto out;
		}
		ios->per_dev[i].or = or;
		ios->numdevs++;

		if (ios->bio) {
			osd_req_write(or, &ios->obj, ios->offset, ios->bio,
				      ios->length);
		} else if (ios->kern_buff) {
			/*
			 * osd_req_write_kern() can fail; do not go on to
			 * execute a request that carries no data.
			 */
			ret = osd_req_write_kern(or, &ios->obj, ios->offset,
						 ios->kern_buff, ios->length);
			if (unlikely(ret)) {
				EXOFS_DBGMSG("%s: osd_req_write_kern failed.\n",
					     __func__);
				goto out;
			}
		} else {
			osd_req_set_attributes(or, &ios->obj);
		}

		if (ios->out_attr)
			osd_req_add_set_attr_list(or, ios->out_attr,
						  ios->out_attr_len);

		if (ios->in_attr)
			osd_req_add_get_attr_list(or, ios->in_attr,
						  ios->in_attr_len);
	}

	ret = exofs_io_execute(ios);
out:
	return ret;
}
int exofs_sbi_read(struct exofs_io_state *ios)
{
int i, ret;
for (i = 0; i < 1; i++) {
struct osd_request *or;
or = osd_start_request(ios->sbi->s_dev, GFP_KERNEL);
if (unlikely(!or)) {
EXOFS_ERR("%s: osd_start_request failed\n", __func__);
ret = -ENOMEM;
goto out;
}
ios->per_dev[i].or = or;
ios->numdevs++;
if (ios->bio) {
osd_req_read(or, &ios->obj, ios->offset, ios->bio,
ios->length);
/* EXOFS_DBGMSG("read sync=%d\n", sync);*/
} else if (ios->kern_buff) {
osd_req_read_kern(or, &ios->obj, ios->offset,
ios->kern_buff, ios->length);
/* EXOFS_DBGMSG("read_kern sync=%d\n", sync);*/
} else {
osd_req_get_attributes(or, &ios->obj);
/* EXOFS_DBGMSG("get_attributes sync=%d\n", sync);*/
}
if (ios->out_attr)
osd_req_add_set_attr_list(or, ios->out_attr,
ios->out_attr_len);
ret = osd_execute_request_async(or, async_done, caller_context);
if (ios->in_attr)
osd_req_add_get_attr_list(or, ios->in_attr,
ios->in_attr_len);
}
ret = exofs_io_execute(ios);
if (ret)
EXOFS_DBGMSG("osd_execute_request_async() => %d\n", ret);
out:
return ret;
}
int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr)
{
struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
void *iter = NULL;
......@@ -112,7 +351,8 @@ int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
do {
nelem = 1;
osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter);
osd_req_decode_get_attr_list(ios->per_dev[0].or,
&cur_attr, &nelem, &iter);
if ((cur_attr.attr_page == attr->attr_page) &&
(cur_attr.attr_id == attr->attr_id)) {
attr->len = cur_attr.len;
......@@ -123,3 +363,43 @@ int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
return -EIO;
}
/*
 * Set the logical-length attribute of the object backing @oi to @size.
 *
 * A private io state is allocated, one set-attributes request per device
 * (a single device for now) is built carrying g_attr_logical_length with the
 * new size in big-endian form, and the requests are executed synchronously.
 * Returns -ENOMEM on allocation failure, otherwise the result of
 * exofs_io_execute(). The io state is always released before returning.
 */
int exofs_oi_truncate(struct exofs_i_info *oi, u64 size)
{
	struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info;
	struct exofs_io_state *ios;
	struct osd_attr len_attr;
	__be64 be_size;
	int dev, err;

	if (exofs_get_io_state(sbi, &ios))
		return -ENOMEM;

	ios->obj.id = exofs_oi_objno(oi);
	ios->cred = oi->i_cred;

	/* The attribute value must stay valid until the request has run */
	be_size = cpu_to_be64(size);
	len_attr = g_attr_logical_length;
	len_attr.val_ptr = &be_size;

	for (dev = 0; dev < 1; dev++) {
		struct osd_request *req;

		req = osd_start_request(sbi->s_dev, GFP_KERNEL);
		if (unlikely(!req)) {
			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
			err = -ENOMEM;
			goto release;
		}

		ios->per_dev[dev].or = req;
		ios->numdevs++;

		osd_req_set_attributes(req, &ios->obj);
		osd_req_add_set_attr_list(req, &len_attr, 1);
	}

	err = exofs_io_execute(ios);

release:
	exofs_put_io_state(ios);
	return err;
}
......@@ -203,49 +203,40 @@ int exofs_sync_fs(struct super_block *sb, int wait)
{
struct exofs_sb_info *sbi;
struct exofs_fscb *fscb;
struct osd_request *or;
struct osd_obj_id obj;
struct exofs_io_state *ios;
int ret = -ENOMEM;
fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL);
if (!fscb) {
EXOFS_ERR("exofs_write_super: memory allocation failed.\n");
return -ENOMEM;
}
lock_super(sb);
sbi = sb->s_fs_info;
fscb = &sbi->s_fscb;
ret = exofs_get_io_state(sbi, &ios);
if (ret)
goto out;
ios->length = sizeof(*fscb);
memset(fscb, 0, ios->length);
fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles);
fscb->s_magic = cpu_to_le16(sb->s_magic);
fscb->s_newfs = 0;
or = osd_start_request(sbi->s_dev, GFP_KERNEL);
if (unlikely(!or)) {
EXOFS_ERR("exofs_write_super: osd_start_request failed.\n");
goto out;
}
obj.partition = sbi->s_pid;
obj.id = EXOFS_SUPER_ID;
ret = osd_req_write_kern(or, &obj, 0, fscb, sizeof(*fscb));
if (unlikely(ret)) {
EXOFS_ERR("exofs_write_super: osd_req_write_kern failed.\n");
goto out;
}
ios->obj.id = EXOFS_SUPER_ID;
ios->offset = 0;