summaryrefslogtreecommitdiffstats
path: root/fs/orangefs/dir.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--fs/orangefs/dir.c404
1 files changed, 404 insertions, 0 deletions
diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c
new file mode 100644
index 000000000..9cacce5d5
--- /dev/null
+++ b/fs/orangefs/dir.c
@@ -0,0 +1,404 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2017 Omnibond Systems, L.L.C.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+#include "orangefs-bufmap.h"
+
+struct orangefs_dir_part { /* list header written over the start of each readdir trailer buffer */
+ struct orangefs_dir_part *next; /* following part; NULL on the most recently received part */
+ size_t len; /* trailer size minus sizeof(struct orangefs_readdir_response_s) */
+};
+
+struct orangefs_dir { /* per-open directory state kept in file->private_data */
+ __u64 token; /* sent with each readdir upcall and replaced by the token in the reply */
+ struct orangefs_dir_part *part; /* head of the list of received parts; NULL when nothing is cached */
+ loff_t end; /* first position past the parts received so far; 1 << PART_SHIFT when empty */
+ int error; /* result of the latest fetch; while nonzero, iterate returns it immediately */
+};
+
+/*
+ * A directory position is split at bit PART_SHIFT: the high bits select
+ * the part and the low bits are the byte offset inside that part.
+ * PART_SIZE and PART_MASK are both derived from PART_SHIFT so the three
+ * cannot drift apart.
+ */
+#define PART_SHIFT (24)
+#define PART_SIZE (1 << PART_SHIFT)
+#define PART_MASK (~(PART_SIZE - 1))
+
+/*
+ * There can be up to 512 directory entries. Each entry is encoded as
+ * follows:
+ * 4 bytes: string size (n)
+ * n bytes: string
+ * 1 byte: trailing zero
+ * padding to 8 bytes
+ * 16 bytes: khandle
+ * padding to 8 bytes
+ *
+ * The trailer_buf starts with a struct orangefs_readdir_response_s
+ * which must be skipped to get to the directory data.
+ *
+ * The data which is received from the userspace daemon is termed a
+ * part and is stored in a linked list in case more than one part is
+ * needed for a large directory.
+ *
+ * The position pointer (ctx->pos) encodes the part and offset on which
+ * to begin reading at. Bits above PART_SHIFT encode the part and bits
+ * below PART_SHIFT encode the offset. Parts are stored in a linked
+ * list which grows as data is received from the server. The overhead
+ * associated with managing the list is presumed to be small compared to
+ * the overhead of communicating with the server.
+ *
+ * As data is received from the server, it is placed at the end of the
+ * part list. Data is parsed from the current position as it is needed.
+ * When data is determined to be corrupt, it is either because the
+ * userspace component has sent back corrupt data or because the file
+ * pointer has been moved to an invalid location. Since the two cannot
+ * be differentiated, return EIO.
+ *
+ * Part zero is synthesized to contain `.' and `..'. Part one is the
+ * first part of the part list.
+ */
+
+/*
+ * Issue one readdir upcall and leave the reply in
+ * op->downcall.trailer_buf.
+ *
+ * @oi:     supplies the object reference (refn) placed in the upcall
+ * @od:     per-open state; od->token is sent as the resume token and is
+ *          replaced by the token found in the reply
+ * @dentry: its inode selects the interruptible flag for the operation
+ * @op:     readdir op allocated by the caller
+ *
+ * Returns 0 on success; the trailer buffer is then left in
+ * op->downcall.trailer_buf for the caller to take over.  On failure the
+ * trailer buffer is freed, and the error (a negative value from
+ * service_operation(), the nonzero downcall status, or -EIO for a
+ * malformed reply) is stored in od->error and returned.
+ */
+static int do_readdir(struct orangefs_inode_s *oi,
+    struct orangefs_dir *od, struct dentry *dentry,
+    struct orangefs_kernel_op_s *op)
+{
+	struct orangefs_readdir_response_s *resp;
+	int bufi, r;
+
+	/*
+	 * Despite the badly named field, readdir does not use shared
+	 * memory. However, there are a limited number of readdir
+	 * slots, which must be allocated here. This flag simply tells
+	 * the op scheduler to return the op here for retry.
+	 */
+	op->uses_shared_memory = 1;
+	op->upcall.req.readdir.refn = oi->refn;
+	op->upcall.req.readdir.token = od->token;
+	op->upcall.req.readdir.max_dirent_count =
+	    ORANGEFS_MAX_DIRENT_COUNT_READDIR;
+
+again:
+	bufi = orangefs_readdir_index_get();
+	if (bufi < 0) {
+		od->error = bufi;
+		return bufi;
+	}
+
+	op->upcall.req.readdir.buf_index = bufi;
+
+	r = service_operation(op, "orangefs_readdir",
+	    get_interruptible_flag(dentry->d_inode));
+
+	orangefs_readdir_index_put(bufi);
+
+	if (op_state_purged(op) && r == -EAGAIN) {
+		/*
+		 * Drop whatever the purged attempt left behind and clear
+		 * the pointer so that a failure on the retry cannot free
+		 * the same buffer a second time.
+		 */
+		vfree(op->downcall.trailer_buf);
+		op->downcall.trailer_buf = NULL;
+		goto again;
+	}
+
+	/* A transport error wins; otherwise use the status in the reply. */
+	if (r >= 0)
+		r = op->downcall.status;
+
+	/*
+	 * The maximum size is size per entry times the 512 entries plus
+	 * the header. This is well under the limit.
+	 *
+	 * The reply comes from userspace, so also insist that it is
+	 * present and large enough to hold the response header: the
+	 * token is read from it below and the header size is later
+	 * subtracted from trailer_size to get the length of the part.
+	 * The cast keeps the lower-bound comparison signed.
+	 */
+	if (!r && (!op->downcall.trailer_buf ||
+	    op->downcall.trailer_size < (loff_t)sizeof(*resp) ||
+	    op->downcall.trailer_size > PART_SIZE))
+		r = -EIO;
+
+	if (r) {
+		vfree(op->downcall.trailer_buf);
+		op->downcall.trailer_buf = NULL;
+		od->error = r;
+		return r;
+	}
+
+	resp = (struct orangefs_readdir_response_s *)
+	    op->downcall.trailer_buf;
+	od->token = resp->token;
+	return 0;
+}
+
+/*
+ * Append the trailer buffer held by @op to the end of od's part list.
+ *
+ * The start of the trailer buffer is overwritten with a struct
+ * orangefs_dir_part header; the part length is the trailer size minus
+ * the size of struct orangefs_readdir_response_s.  od->end is advanced
+ * to the first position beyond the new part.
+ *
+ * Always returns 0.
+ */
+static int parse_readdir(struct orangefs_dir *od,
+    struct orangefs_kernel_op_s *op)
+{
+	struct orangefs_dir_part **link, *new;
+	/*
+	 * Counted in loff_t so that the shift below is done in the width
+	 * of od->end; a size_t would wrap on 32-bit systems once the list
+	 * reaches 256 parts.  Starts at one for the synthesized part zero.
+	 */
+	loff_t count = 1;
+
+	/* Find the NULL link at the tail, counting existing parts. */
+	for (link = &od->part; *link; link = &(*link)->next)
+		count++;
+
+	new = (void *)op->downcall.trailer_buf;
+	new->next = NULL;
+	new->len = op->downcall.trailer_size -
+	    sizeof(struct orangefs_readdir_response_s);
+	*link = new;
+
+	/* Account for the part that was just linked in. */
+	count++;
+	od->end = count << PART_SHIFT;
+
+	return 0;
+}
+
+/*
+ * Fetch one more part from the server and link it into od's part list.
+ *
+ * The outcome (0 or an error) is recorded in od->error and returned.
+ */
+static int orangefs_dir_more(struct orangefs_inode_s *oi,
+    struct orangefs_dir *od, struct dentry *dentry)
+{
+	struct orangefs_kernel_op_s *op = op_alloc(ORANGEFS_VFS_OP_READDIR);
+	int ret;
+
+	if (!op) {
+		od->error = -ENOMEM;
+		return -ENOMEM;
+	}
+
+	/* Only parse the reply when the upcall itself succeeded. */
+	ret = do_readdir(oi, od, dentry, op);
+	if (!ret)
+		ret = parse_readdir(od, op);
+
+	od->error = ret;
+	op_release(op);
+	return ret;
+}
+
+/*
+ * Emit directory entries from @part, starting at the offset encoded in
+ * the low bits of ctx->pos.
+ *
+ * ctx->pos is advanced past every entry that was emitted.  Records that
+ * do not look like a valid entry are skipped eight bytes at a time.
+ *
+ * Returns 0 when dir_emit() refuses an entry (the caller's buffer is
+ * full) and 1 when the part has no more entries to offer.
+ */
+static int fill_from_part(struct orangefs_dir_part *part,
+    struct dir_context *ctx)
+{
+	const int offset = sizeof(struct orangefs_readdir_response_s);
+	struct orangefs_khandle *khandle;
+	__u32 *len, padlen;
+	size_t avail;
+	loff_t i;
+	char *s;
+
+	i = ctx->pos & ~PART_MASK;
+
+	/* The file offset from userspace is too large. */
+	if (i > part->len)
+		return 1;
+
+	/*
+	 * If the seek pointer is positioned just before an entry it
+	 * should find the next entry.
+	 */
+	if (i % 8)
+		i = i + (8 - i%8)%8;
+
+	while (i < part->len) {
+		if (part->len < i + sizeof *len)
+			break;
+		len = (void *)part + offset + i;
+		/*
+		 * The length comes from the part data and is not
+		 * trustworthy.  Reject it before doing any arithmetic
+		 * when the string plus its trailing zero cannot fit in
+		 * what is left of the part; otherwise a huge value could
+		 * wrap padlen, slip past the size check below and make
+		 * s[*len] read far outside the part.
+		 */
+		avail = part->len - i - sizeof *len;
+		if (*len >= avail)
+			goto next;
+		/*
+		 * len is the size of the string itself. padlen is the
+		 * total size of the encoded string.
+		 */
+		padlen = (sizeof *len + *len + 1) +
+		    (8 - (sizeof *len + *len + 1)%8)%8;
+		if (part->len < i + padlen + sizeof *khandle)
+			goto next;
+		s = (void *)part + offset + i + sizeof *len;
+		if (s[*len] != 0)
+			goto next;
+		khandle = (void *)part + offset + i + padlen;
+		if (!dir_emit(ctx, s, *len,
+		    orangefs_khandle_to_ino(khandle),
+		    DT_UNKNOWN))
+			return 0;
+		i += padlen + sizeof *khandle;
+		i = i + (8 - i%8)%8;
+		BUG_ON(i > part->len);
+		ctx->pos = (ctx->pos & PART_MASK) | i;
+		continue;
+next:
+		i += 8;
+	}
+	return 1;
+}
+
+/*
+ * Emit entries from the cached parts, beginning with the part selected
+ * by the high bits of ctx->pos.
+ *
+ * Returns 0 when the cached data is used up or the caller's buffer is
+ * full.  Returns -EIO, also recorded in od->error, when the position
+ * names a part beyond the end of the list; a negative result from
+ * fill_from_part() is recorded and returned the same way.
+ */
+static int orangefs_dir_fill(struct orangefs_inode_s *oi,
+    struct orangefs_dir *od, struct dentry *dentry,
+    struct dir_context *ctx)
+{
+	struct orangefs_dir_part *cur = od->part;
+	/* Part one is the head of the list, so skip (part number - 1). */
+	size_t skip = ((ctx->pos & PART_MASK) >> PART_SHIFT) - 1;
+
+	for (; cur->next && skip; skip--)
+		cur = cur->next;
+
+	/* This means the userspace file offset is invalid. */
+	if (skip) {
+		od->error = -EIO;
+		return -EIO;
+	}
+
+	while (cur && cur->len) {
+		int r = fill_from_part(cur, ctx);
+
+		if (r < 0) {
+			od->error = r;
+			return r;
+		}
+		/* Userspace buffer is full. */
+		if (r == 0)
+			break;
+		/* The part ran out of data. Move to the next part. */
+		ctx->pos = (ctx->pos & PART_MASK) + (1 << PART_SHIFT);
+		cur = cur->next;
+	}
+	return 0;
+}
+
+/*
+ * Seek within an open directory.
+ *
+ * A seek with whence 0 to an offset below od->end throws away every
+ * cached part and resets the token, so that the next read fetches the
+ * directory again and userspace sees new entries.  The seek itself is
+ * carried out by default_llseek().
+ */
+static loff_t orangefs_dir_llseek(struct file *file, loff_t offset,
+    int whence)
+{
+	struct orangefs_dir *od = file->private_data;
+	struct orangefs_dir_part *cur, *following;
+
+	if (whence == 0 && offset < od->end) {
+		for (cur = od->part; cur; cur = following) {
+			following = cur->next;
+			vfree(cur);
+		}
+		od->part = NULL;
+		od->token = ORANGEFS_ITERATE_START;
+		od->end = 1 << PART_SHIFT;
+	}
+	return default_llseek(file, offset, whence);
+}
+
+/*
+ * Directory iteration entry point.
+ *
+ * Positions 0 and 1 produce `.' and `..'; after that the position
+ * jumps to the start of part one.  Parts are fetched from the server as
+ * needed and their entries emitted until the caller's buffer is full or
+ * the data runs out.
+ *
+ * Returns 0 on success, od->error if an earlier fetch failed, -EIO for
+ * a position that is not valid, or the error from fetching or filling.
+ */
+static int orangefs_dir_iterate(struct file *file,
+    struct dir_context *ctx)
+{
+	struct dentry *dentry = file->f_path.dentry;
+	struct orangefs_inode_s *oi = ORANGEFS_I(dentry->d_inode);
+	struct orangefs_dir *od = file->private_data;
+	int ret = 0;
+
+	if (od->error)
+		return od->error;
+
+	if (ctx->pos == 0) {
+		if (!dir_emit_dot(file, ctx))
+			return 0;
+		ctx->pos++;
+	}
+	if (ctx->pos == 1) {
+		if (!dir_emit_dotdot(file, ctx))
+			return 0;
+		ctx->pos = 1 << PART_SHIFT;
+	}
+
+	/*
+	 * The seek position is in the first synthesized part but is not
+	 * valid.
+	 */
+	if (!(ctx->pos & PART_MASK))
+		return -EIO;
+
+	/*
+	 * Must read more if the user has sought past what has been read
+	 * so far.
+	 */
+	while (od->token != ORANGEFS_ITERATE_END && ctx->pos > od->end) {
+		ret = orangefs_dir_more(oi, od, dentry);
+		if (ret)
+			return ret;
+	}
+
+	/*
+	 * Stop a user who has sought past the end.  The loop above only
+	 * leaves the position beyond od->end once the token has reached
+	 * ORANGEFS_ITERATE_END.
+	 */
+	if (ctx->pos > od->end)
+		return -EIO;
+
+	/* Then try to fill if there's any left in the buffer. */
+	if (ctx->pos < od->end) {
+		ret = orangefs_dir_fill(oi, od, dentry, ctx);
+		if (ret)
+			return ret;
+	}
+
+	/* Nothing further to fetch. */
+	if (od->token == ORANGEFS_ITERATE_END)
+		return ret;
+
+	/* Finally get some more and try to fill. */
+	ret = orangefs_dir_more(oi, od, dentry);
+	if (ret)
+		return ret;
+	return orangefs_dir_fill(oi, od, dentry, ctx);
+}
+
+/*
+ * Allocate the per-open directory state and store it in
+ * file->private_data.
+ *
+ * The state starts with no cached parts, the token at
+ * ORANGEFS_ITERATE_START and no error.  Returns 0, or -ENOMEM when the
+ * allocation fails.
+ */
+static int orangefs_dir_open(struct inode *inode, struct file *file)
+{
+	struct orangefs_dir *od = kmalloc(sizeof(*od), GFP_KERNEL);
+
+	file->private_data = od;
+	if (!od)
+		return -ENOMEM;
+
+	od->error = 0;
+	od->part = NULL;
+	od->token = ORANGEFS_ITERATE_START;
+	od->end = 1 << PART_SHIFT;
+	return 0;
+}
+
+/*
+ * Free every cached part and then the per-open directory state itself.
+ * Always returns 0.
+ */
+static int orangefs_dir_release(struct inode *inode, struct file *file)
+{
+	struct orangefs_dir *od = file->private_data;
+	struct orangefs_dir_part *cur, *following;
+
+	/* Read the link before freeing the node that holds it. */
+	for (cur = od->part; cur; cur = following) {
+		following = cur->next;
+		vfree(cur);
+	}
+	kfree(od);
+	return 0;
+}
+
+/* File operations installed for directories. */
+const struct file_operations orangefs_dir_operations = {
+	.open = orangefs_dir_open,
+	.release = orangefs_dir_release,
+	.llseek = orangefs_dir_llseek,
+	.read = generic_read_dir,
+	.iterate_shared = orangefs_dir_iterate,
+};