Signed-off-by: Joern Engel <joern@logfs.org>

 fs/Kconfig            |   27 
 fs/Makefile           |    1 
 fs/logfs/Locking      |   48 +
 fs/logfs/Makefile     |   15 
 fs/logfs/compr.c      |   95 ++
 fs/logfs/dev_bdev.c   |  151 ++++
 fs/logfs/dev_mtd.c    |  410 ++++++++++++
 fs/logfs/dir.c        |  765 +++++++++++++++++++++++
 fs/logfs/file.c       |  236 +++++++
 fs/logfs/gc.c         |  730 ++++++++++++++++++++++
 fs/logfs/inode.c      |  589 ++++++++++++++++++
 fs/logfs/journal.c    |  805 ++++++++++++++++++++++++
 fs/logfs/logfs.h      |  587 ++++++++++++++++++
 fs/logfs/logfs_abi.h  |  523 ++++++++++++++++
 fs/logfs/memtree.c    |  402 ++++++++++++
 fs/logfs/progs/fsck.c |  347 ++++++++++
 fs/logfs/readwrite.c  | 1618 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/logfs/segment.c    |  595 ++++++++++++++++++
 fs/logfs/super.c      |  381 +++++++++++
 19 files changed, 8325 insertions(+)

--- git/fs/Kconfig~logfs	2008-04-07 11:39:32.989960448 +0200
+++ git/fs/Kconfig	2008-04-07 11:53:20.906568828 +0200
@@ -1347,6 +1347,33 @@ config JFFS2_CMODE_FAVOURLZO
 
 endchoice
 
+config LOGFS
+	bool "Log Filesystem (EXPERIMENTAL)"
+	depends on (MTD || BLOCK) && EXPERIMENTAL
+	select ZLIB_INFLATE
+	select ZLIB_DEFLATE
+	select CRC32
+	help
+	  Flash filesystem aimed to scale efficiently to large devices.
+	  In comparison to JFFS2 it offers significantly faster mount
+	  times and potentially less RAM usage, although the latter has
+	  not been measured yet.
+
+	  In its current state it is still very experimental and should
+	  not be used for anything other than testing purposes.
+
+	  If unsure, say N.
+
+config LOGFS_FSCK
+	bool "Run LogFS fsck at mount time"
+	depends on LOGFS
+	help
+	  Run a full filesystem check on every mount.  If any errors are
+	  found, mounting the filesystem will fail.  This is a debug option
+	  for developers.
+
+	  If unsure, say N.
+
 config CRAMFS
 	tristate "Compressed ROM file system support (cramfs)"
 	depends on BLOCK
--- git/fs/Makefile~logfs	2008-04-07 11:39:32.989960448 +0200
+++ git/fs/Makefile	2008-04-07 11:53:20.906568828 +0200
@@ -100,6 +100,7 @@ obj-$(CONFIG_NTFS_FS)		+= ntfs/
 obj-$(CONFIG_UFS_FS)		+= ufs/
 obj-$(CONFIG_EFS_FS)		+= efs/
 obj-$(CONFIG_JFFS2_FS)		+= jffs2/
+obj-$(CONFIG_LOGFS)		+= logfs/
 obj-$(CONFIG_AFFS_FS)		+= affs/
 obj-$(CONFIG_ROMFS_FS)		+= romfs/
 obj-$(CONFIG_QNX4FS_FS)		+= qnx4/
--- /dev/null	2008-04-02 16:29:12.813336657 +0200
+++ git/fs/logfs/Makefile	2008-04-07 11:53:20.909877890 +0200
@@ -0,0 +1,15 @@
+obj-$(CONFIG_LOGFS)	+= logfs.o
+
+logfs-y	+= compr.o
+logfs-y	+= dir.o
+logfs-y	+= file.o
+logfs-y	+= gc.o
+logfs-y	+= inode.o
+logfs-y	+= journal.o
+logfs-y	+= memtree.o
+logfs-y	+= readwrite.o
+logfs-y	+= segment.o
+logfs-y	+= super.o
+logfs-$(CONFIG_BLOCK)	+= dev_bdev.o
+logfs-$(CONFIG_MTD)	+= dev_mtd.o
+logfs-$(CONFIG_LOGFS_FSCK)	+= progs/fsck.o
--- /dev/null	2008-04-02 16:29:12.813336657 +0200
+++ git/fs/logfs/logfs.h	2008-04-07 12:50:55.359552665 +0200
@@ -0,0 +1,587 @@
+/*
+ * fs/logfs/logfs.h
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2007 Joern Engel <joern@logfs.org>
+ *
+ * Private header for logfs.
+ */
+#ifndef fs_logfs_logfs_h
+#define fs_logfs_logfs_h
+
+#define __CHECK_ENDIAN__
+
+#include <linux/crc32.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/mempool.h>
+#include <linux/pagemap.h>
+#include <linux/mtd/mtd.h>
+#include "logfs_abi.h"
+
+#define PG_zero			PG_owner_priv_1
+#define PageZero(page)		test_bit(PG_zero, &(page)->flags)
+#define SetPageZero(page)	set_bit(PG_zero, &(page)->flags)
+#define ClearPageZero(page)	clear_bit(PG_zero, &(page)->flags)
+
+/*
+ * There is no generic kernel btree library yet.  When such a thing gets
+ * introduced, this definition and the corresponding source file should
+ * get removed.
+ */
+struct btree_head {
+	struct btree_node *node;
+	int height;
+	void *null_ptr;
+};
+
+static inline void build_bug_on_needs_a_function(void)
+{
+	BUILD_BUG_ON(sizeof(struct logfs_object_header) != LOGFS_HEADERSIZE);
+	BUILD_BUG_ON(sizeof(struct logfs_segment_header)
+			!= LOGFS_SEGMENT_HEADERSIZE);
+}
+
+/* FIXME: This should really be somewhere in the 64bit area. */
+#define LOGFS_LINK_MAX		(1<<30)
+
+/*
+ * Private errno for accesses beyond end-of-file.  Only used internally to
+ * logfs.  If this ever gets exposed to userspace or even other parts of the
+ * kernel, it is a bug.  256 was chosen as a number sufficiently above all
+ * used errno #defines, yet below the kernel-private ones (ERESTARTSYS = 512).
+ *
+ * It can be argued that this is a hack and should be replaced with something
+ * else.  My last attempt to do this failed spectacularly and there are more
+ * urgent problems that users actually care about.  This will remain for the
+ * moment.  Patches are welcome, of course.
+ */
+#define EOF			(256)
+
+/* Read-only filesystem */
+#define LOGFS_SB_FLAG_RO	1
+
+/* Write Control Flags */
+#define WF_LOCK		0x01	/* take write lock */
+#define WF_WRITE	0x02	/* write block */
+#define WF_DELETE	0x04	/* delete old block */
+#define WF_SYNC		0x08	/* sync every indirect block */
+#define WF_GC		0x10	/* GC write, move to GC list */
+
+/**
+ * struct logfs_area - area management information
+ *
+ * @a_sb:			the superblock this area belongs to
+ * @a_is_open:			1 if the area is currently open, else 0
+ * @a_segno:			segment number of area
+ * @a_used_bytes:		number of used bytes
+ * @a_ops:			area operations (either journal or ostore)
+ * @a_wbuf:			write buffer
+ * @a_erase_count:		erase count
+ * @a_level:			GC level
+ */
+struct logfs_area { /* a segment open for writing */
+	struct super_block *a_sb;
+	int	a_is_open;
+	u32	a_segno;
+	u32	a_used_bytes;
+	const struct logfs_area_ops *a_ops;
+	void			*a_wbuf;
+	u32	a_erase_count;
+	u8	a_level;
+};
+
+/**
+ * struct logfs_area_ops - area operations
+ *
+ * @get_free_segment:		select a free segment (presumably sets a_segno)
+ * @get_erase_count:		fill area->a_erase_count (presumably needs a_segno)
+ * @erase_segment:		erase and setup segment
+ * @finish_area:		flush buffers, etc.
+ */
+struct logfs_area_ops {
+	void	(*get_free_segment)(struct logfs_area *area);
+	void	(*get_erase_count)(struct logfs_area *area);
+	int	(*erase_segment)(struct logfs_area *area);
+	void	(*finish_area)(struct logfs_area *area);
+};
+
+/**
+ * struct logfs_device_ops - device access operations
+ *
+ * @read:			read from the device
+ * @write:			write to the device
+ * @erase:			erase part of the device
+ */
+struct logfs_device_ops {
+	s64 (*find_sb)(struct super_block *sb);
+	int (*read)(struct super_block *sb, loff_t ofs, size_t len, void *buf);
+	int (*write)(struct super_block *sb, loff_t ofs, size_t len, void *buf);
+	int (*erase)(struct super_block *sb, loff_t ofs, size_t len);
+	void (*sync)(struct super_block *sb);
+};
+
+/**
+ * struct gc_candidate - "candidate" segment to be garbage collected next
+ *
+ * @list:			list (either free or low)
+ * @segno:			segment number
+ * @valid:			number of valid bytes
+ * @erase_count:		erase count of segment
+ * @dist:			distance from tree root
+ *
+ * Candidates can be on two lists.  The free list contains electees rather
+ * than candidates - segments that no longer contain any valid data.  The
+ * low list contains candidates to be picked for GC.  It should be kept
+ * short.  It is not required to always pick a perfect candidate.  In the
+ * worst case GC will have to move more data than absolutely necessary.
+ */
+struct gc_candidate {
+	struct list_head list;
+	u64	gec;
+	u32	segno;
+	u32	valid;
+	u32	erase_count;
+	u8	dist;
+};
+
+/**
+ * struct candidate_list - list of similar candidates
+ */
+struct candidate_list {
+	struct list_head list;
+	int count;
+	int maxcount;
+	int sort_by_ec;
+};
+
+/**
+ * struct logfs_journal_entry - temporary structure used during journal scan
+ *
+ * @used:
+ * @version:			normalized version
+ * @len:			length
+ * @offset:			offset
+ */
+struct logfs_journal_entry {
+	int used;
+	s16 version;
+	u16 len;
+	u16 datalen;
+	u64 offset;
+};
+
+enum transaction_state {
+	CREATE_1 = 1,
+	CREATE_2,
+	UNLINK_1,
+	UNLINK_2,
+	CROSS_RENAME_1,
+	CROSS_RENAME_2,
+	TARGET_RENAME_1,
+	TARGET_RENAME_2,
+	TARGET_RENAME_3
+};
+
+/**
+ * struct logfs_transaction - essential fields to support atomic dirops
+ *
+ * @ino:			target inode
+ * @dir:			inode of directory containing dentry
+ * @pos:			pos of dentry in directory
+ */
+struct logfs_transaction {
+	enum transaction_state state;
+	u64	 ino;
+	u64	 dir;
+	u64	 pos;
+};
+
+/**
+ * struct logfs_shadow - old block in the shadow of a not-yet-committed new one
+ * @old_ofs:			offset of old block on medium
+ * @new_ofs:			offset of new block on medium
+ * @ino:			inode number
+ * @bix:			block index
+ * @old_len:			size of old block, including header
+ * @new_len:			size of new block, including header
+ * @level:			block level
+ */
+struct logfs_shadow {
+	u64 old_ofs;
+	u64 new_ofs;
+	u64 ino;
+	u64 bix;
+	int old_len;
+	int new_len;
+	u8 level;
+};
+
+/**
+ * struct shadow_tree
+ * @new:			shadows where old_ofs==0, indexed by new_ofs
+ * @old:			shadows where old_ofs!=0, indexed by old_ofs
+ */
+struct shadow_tree {
+	struct btree_head new;
+	struct btree_head old;
+};
+
+/**
+ * struct logfs_block - contains any block state
+ * @shadow_tree:		btrees of shadows, indexed by old_ofs and new_ofs
+ */
+struct logfs_block {
+	struct list_head dirty_list;
+	struct shadow_tree shadow_tree;
+	struct page *page;
+	struct logfs_transaction *ta;
+};
+
+struct mtd_inode {
+	struct mtd_info *mtd;
+	long openers;
+	struct inode vfs_inode;
+};
+
+struct logfs_super {
+	struct mtd_inode *s_mtd;		/* underlying device */
+	struct block_device *s_bdev;		/* underlying device */
+	int	 s_sync;			/* sync on next io? */
+	const struct logfs_device_ops *s_devops;/* device access */
+	struct inode	*s_master_inode;	/* ifile */
+	long	 s_flags;
+	/* dir.c fields */
+	struct mutex s_dirop_mutex;		/* for creat/unlink/rename */
+	u64	 s_victim_ino;			/* used for atomic dir-ops */
+	u64	 s_rename_dir;			/* source directory ino */
+	u64	 s_rename_pos;			/* position of source dd */
+	/* gc.c fields */
+	long	 s_segsize;			/* size of a segment */
+	int	 s_segshift;			/* log2 of segment size */
+	long	 s_no_segs;			/* segments on device */
+	long	 s_no_blocks;			/* blocks per segment */
+	long	 s_writesize;			/* minimum write size */
+	int	 s_writeshift;			/* log2 of write size */
+	u64	 s_size;			/* filesystem size */
+	struct logfs_area *s_area[LOGFS_NO_AREAS];	/* open segment array */
+	u64	 s_gec;				/* global erase count */
+	u64	 s_sweeper;			/* current sweeper pos */
+	u8	 s_ifile_levels;		/* max level of ifile */
+	u8	 s_iblock_levels;		/* max level of regular files */
+	u8	 s_data_levels;			/* # of segments to leaf block*/
+	u8	 s_total_levels;		/* sum of above three */
+	struct candidate_list s_free_list;	/* 100% free segments */
+	struct candidate_list s_low_list[LOGFS_NO_AREAS];/* good candidates */
+	struct candidate_list s_ec_list;	/* wear level candidates */
+	struct btree_head s_reserved_segments;	/* sb, journal, bad, etc. */
+	struct list_head s_dirty_list;		/* list of dirty blocks */
+	struct list_head s_gc_dirty_list[LOGFS_NO_AREAS];/* blocks dirtied during GC */
+	/* inode.c fields */
+	spinlock_t s_ino_lock;			/* lock s_last_ino on 32bit */
+	u64	 s_last_ino;			/* highest ino used */
+	struct list_head s_freeing_list;	/* inodes being freed */
+	/* journal.c fields */
+	struct mutex s_journal_mutex;
+	void	*s_je;				/* journal entry to compress */
+	void	*s_compressed_je;		/* block to write to journal */
+	u64	 s_journal_seg[LOGFS_JOURNAL_SEGS]; /* journal segments */
+	u32	 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */
+	u64	 s_last_version;
+	struct logfs_area *s_journal_area;	/* open journal segment */
+	struct logfs_journal_entry s_retired[JE_LAST+1]; /* for journal scan */
+	struct logfs_journal_entry s_speculative[JE_LAST+1]; /* dito */
+	struct logfs_journal_entry s_first;		/* dito */
+	int	 s_sum_index;			/* for the 12 summaries */
+	__be32	*s_bb_array;			/* bad segments */
+	/* readwrite.c fields */
+	struct mutex s_w_mutex;
+	struct page *s_write_page;		/* page under writeback now */
+	mempool_t *s_block_pool;		/* struct logfs_block pool */
+	mempool_t *s_shadow_pool;		/* struct logfs_shadow pool */
+	/*
+	 * Space accounting in LogFS:
+	 * - s_used_bytes specifies space used to store valid data objects.
+	 * - s_dirty_used_bytes is space used to store non-committed data
+	 *   objects.  Those objects have already been written themselves,
+	 *   but they don't become valid until all indirect blocks up to the
+	 *   journal have been written as well.
+	 * - s_dirty_free_bytes is space used to store the old copy of a
+	 *   replaced object, as long as the replacement is non-committed.
+	 *   In other words, it is the amount of space freed when all dirty
+	 *   blocks are written back.
+	 * - s_free_bytes is the amount of free space available for any
+	 *   purpose.
+	 * - s_root_reserve is the amount of free space available only to
+	 *   the root user.
+	 * - s_gc_reserve is currently a mess.
+	 */
+	u64	 s_free_bytes;			/* free space */
+	u64	 s_used_bytes;			/* used (valid) data */
+	u64	 s_dirty_free_bytes;		/* space freed on commit */
+	u64	 s_dirty_used_bytes;		/* space used on commit */
+	u64	 s_gc_reserve;			/* space reserved for GC */
+	u64	 s_root_reserve;	/* FIXME: currently unused */
+	u32	 s_bad_segments;		/* number of bad segments */
+};
+
+/**
+ * struct logfs_inode - in-memory inode
+ *
+ * @vfs_inode:			struct inode
+ * @li_data:			data pointers
+ * @li_used_bytes:		number of used bytes
+ * @li_freeing_list:		used to track inodes currently being freed
+ * @li_flags:			inode flags
+ */
+struct logfs_inode {
+	struct inode vfs_inode;
+	u64	li_data[LOGFS_EMBEDDED_FIELDS];
+	u64	li_used_bytes;
+	struct list_head li_freeing_list;
+	struct logfs_transaction *li_transaction;
+	struct shadow_tree li_shadow_tree;
+	u32	li_flags;
+	u8	li_height;
+};
+
+#define journal_for_each(__i) for (__i = 0; __i < LOGFS_JOURNAL_SEGS; __i++)
+#define for_each_area(__i) for (__i = 0; __i < LOGFS_NO_AREAS; __i++)
+
+/* compr.c */
+int logfs_compress(void *in, void *out, size_t inlen, size_t outlen);
+int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen);
+int __init logfs_compr_init(void);
+void logfs_compr_exit(void);
+
+/* dev_bdev.c */
+#ifdef CONFIG_BLOCK
+int logfs_get_sb_bdev(struct file_system_type *type, int flags,
+		const char *devname, struct vfsmount *mnt);
+
+static inline void logfs_put_bdev(struct block_device *bdev)
+{
+	if (bdev)
+		close_bdev_excl(bdev);
+}
+#else
+static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags,
+		const char *devname, struct vfsmount *mnt)
+{
+	return -ENODEV;
+}
+
+static inline void logfs_put_bdev(struct block_device *bdev)
+{
+}
+#endif
+
+/* dev_mtd.c */
+#ifdef CONFIG_MTD
+int logfs_get_sb_mtd(struct file_system_type *type, int flags,
+		int mtdnr, struct vfsmount *mnt);
+void logfs_put_mtd(struct mtd_inode *mi);
+#else
+static inline int logfs_get_sb_mtd(struct file_system_type *type, int flags,
+		int mtdnr, struct vfsmount *mnt)
+{
+	return -ENODEV;
+}
+
+static inline void logfs_put_mtd(struct mtd_inode *mi)
+{
+}
+#endif
+
+/* dir.c */
+extern const struct inode_operations logfs_symlink_iops;
+extern const struct inode_operations logfs_dir_iops;
+extern const struct file_operations logfs_dir_fops;
+int logfs_replay_journal(struct super_block *sb);
+
+/* file.c */
+extern const struct inode_operations logfs_reg_iops;
+extern const struct file_operations logfs_reg_fops;
+extern const struct address_space_operations logfs_reg_aops;
+int logfs_readpage(struct file *file, struct page *page);
+int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
+		unsigned long arg);
+int logfs_fsync(struct file *file, struct dentry *dentry, int datasync);
+
+/* gc.c */
+int logfs_safe_to_write_block(struct super_block *sb, u8 level);
+struct gc_candidate *get_best_cand(struct candidate_list *list);
+int add_free_segments_from_journal(struct super_block *sb,
+		struct logfs_je_free_segments *segs, int count);
+void logfs_dirty_for_gc(struct super_block *sb, struct logfs_block *block);
+void logfs_gc_pass(struct super_block *sb);
+int logfs_check_areas(struct super_block *sb);
+int logfs_init_gc(struct logfs_super *super);
+void logfs_cleanup_gc(struct logfs_super *super);
+
+/* inode.c */
+extern const struct super_operations logfs_super_operations;
+struct inode *logfs_iget(struct super_block *sb, ino_t ino, int *cookie);
+void logfs_iput(struct inode *inode, int cookie);
+struct inode *logfs_new_inode(struct inode *dir, int mode);
+struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino);
+int logfs_init_inode_cache(void);
+void logfs_destroy_inode_cache(void);
+int __logfs_write_inode(struct inode *inode, long flags);
+void __logfs_destroy_inode(struct inode *inode);
+void logfs_set_blocks(struct inode *inode, u64 no);
+struct inode *dhowells_iget(struct super_block *sb, ino_t ino);
+
+/* journal.c */
+int logfs_write_anchor(struct inode *inode);
+int logfs_init_journal(struct super_block *sb);
+void logfs_cleanup_journal(struct super_block *sb);
+
+/* memtree.c */
+void btree_init(struct btree_head *head);
+void *btree_lookup(struct btree_head *head, u64 val);
+int btree_insert(struct btree_head *head, u64 val, void *ptr);
+void *btree_remove(struct btree_head *head, u64 val);
+int btree_merge(struct btree_head *target, struct btree_head *victim);
+void btree_visitor(struct btree_head *head, long opaque,
+		void (*func)(void *elem, long opaque, u64 val));
+void btree_grim_visitor(struct btree_head *head, long opaque,
+		void (*func)(void *elem, long opaque, u64 val));
+
+/* readwrite.c */
+void logfs_unpack_index(pgoff_t index, u64 *bix, u8 *level);
+void logfs_flush_dirty(struct super_block *sb, int sync);
+int logfs_inode_read(struct inode *inode, void *buf, size_t n, loff_t _pos);
+int logfs_inode_write(struct inode *inode, const void *buf, size_t count,
+		loff_t bix, long flags, struct logfs_transaction *ta,
+		struct shadow_tree *shadow_tree);
+int logfs_readpage_nolock(struct page *page);
+int logfs_write_buf(struct inode *inode, struct page *page,
+		struct logfs_transaction *ta, long flags);
+int logfs_delete(struct inode *inode, pgoff_t index,
+		struct shadow_tree *shadow_tree, struct logfs_transaction *ta);
+int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs, int level,
+		long flags);
+int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 pos,
+		u8 level);
+int logfs_truncate(struct inode *inode, u64 size);
+u64 logfs_seek_hole(struct inode *inode, u64 bix);
+u64 logfs_seek_data(struct inode *inode, u64 bix);
+int logfs_init_rw(struct logfs_super *super);
+void logfs_cleanup_rw(struct logfs_super *super);
+
+/* segment.c */
+int logfs_erase_segment(struct super_block *sb, u32 ofs);
+int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf);
+int logfs_segment_read(struct inode *inode, struct page *page, u64 ofs, u64 bix,
+		u8 level);
+int logfs_segment_write(struct inode *inode, struct page *page,
+		struct logfs_shadow *shadow);
+int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow);
+void logfs_buf_write(struct logfs_area *area, u64 ofs, void *data, size_t len);
+
+/* area handling */
+int logfs_init_areas(struct super_block *sb);
+void logfs_cleanup_areas(struct logfs_super *super);
+int logfs_open_area(struct logfs_area *area);
+void logfs_close_area(struct logfs_area *area);
+
+/* super.c */
+void logfs_crash_dump(struct super_block *sb);
+void *memchr_inv(const void *s, int c, size_t n);
+int logfs_statfs(struct dentry *dentry, struct kstatfs *stats);
+int logfs_get_sb_device(struct file_system_type *type, int flags,
+		struct mtd_inode *mtd, struct block_device *bdev,
+		const struct logfs_device_ops *devops, struct vfsmount *mnt);
+
+/* progs/fsck.c */
+#ifdef CONFIG_LOGFS_FSCK
+int logfs_fsck(struct super_block *sb);
+#else
+static inline int logfs_fsck(struct super_block *sb)
+{
+	return 0;
+}
+#endif
+
+static inline struct logfs_super *logfs_super(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+static inline struct logfs_inode *logfs_inode(struct inode *inode)
+{
+	return container_of(inode, struct logfs_inode, vfs_inode);
+}
+
+static inline void logfs_set_ro(struct super_block *sb)
+{
+	logfs_super(sb)->s_flags |= LOGFS_SB_FLAG_RO;
+}
+
+#define LOGFS_BUG(sb) do {					\
+	struct super_block *__sb = sb;				\
+	logfs_crash_dump(__sb);					\
+	logfs_super(__sb)->s_flags |= LOGFS_SB_FLAG_RO;		\
+	BUG();							\
+} while (0)
+
+#define LOGFS_BUG_ON(condition, sb) \
+	do { if (unlikely(condition)) LOGFS_BUG((sb)); } while (0)
+
+static inline __be32 logfs_crc32(void *data, size_t len, size_t skip)
+{
+	return cpu_to_be32(crc32(~0, data+skip, len-skip));
+}
+
+static inline u8 logfs_type(struct inode *inode)
+{
+	return (inode->i_mode & S_IFMT) >> 12;
+}
+
+static inline pgoff_t logfs_index(struct super_block *sb, u64 pos)
+{
+	return pos >> sb->s_blocksize_bits;
+}
+
+static inline u64 dev_ofs(struct super_block *sb, u32 segno, u32 ofs)
+{
+	return ((u64)segno << logfs_super(sb)->s_segshift) + ofs;
+}
+
+static inline int device_read(struct super_block *sb, u32 segno, u32 ofs,
+		size_t len, void *buf)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	return super->s_devops->read(sb, dev_ofs(sb, segno, ofs), len, buf);
+}
+
+static inline struct logfs_block *logfs_block(struct page *page)
+{
+	return (void *)page->private;
+}
+
+/* Compat mess */
+
+#include <linux/version.h>
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 22)
+#define DTOR /* nothing */
+#else
+#define DTOR , NULL
+#endif
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 24)
+#define	zero_user_segment(page, start, end)	\
+	zero_user_page((page), (start), (end) - (start), KM_USER0)
+static inline void iget_failed(struct inode *inode)
+{
+	make_bad_inode(inode);
+	unlock_new_inode(inode);
+	iput(inode);
+}
+#endif /* LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 24) */
+
+#endif
--- /dev/null	2008-04-02 16:29:12.813336657 +0200
+++ git/fs/logfs/compr.c	2008-04-07 11:53:20.909877890 +0200
@@ -0,0 +1,95 @@
+/*
+ * fs/logfs/compr.c	- compression routines
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2007 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+#include <linux/vmalloc.h>
+#include <linux/zlib.h>
+
+#define COMPR_LEVEL 3
+
+static DEFINE_MUTEX(compr_mutex);
+static struct z_stream_s stream;
+
+int logfs_compress(void *in, void *out, size_t inlen, size_t outlen)
+{
+	int zerr, ret = -EIO;
+
+	/* One global stream and workspace, serialized by compr_mutex. */
+	mutex_lock(&compr_mutex);
+	zerr = zlib_deflateInit(&stream, COMPR_LEVEL);
+	if (zerr != Z_OK)
+		goto unlock;
+
+	stream.next_in = in;
+	stream.next_out = out;
+	stream.avail_in = inlen;
+	stream.avail_out = outlen;
+	stream.total_in = 0;
+	stream.total_out = 0;
+
+	zerr = zlib_deflate(&stream, Z_FINISH);
+	if (zerr != Z_STREAM_END)
+		goto unlock;
+
+	zerr = zlib_deflateEnd(&stream);
+	if (zerr != Z_OK)
+		goto unlock;
+
+	/* Output that is not smaller than the input counts as failure. */
+	if (stream.total_out < stream.total_in)
+		ret = stream.total_out;
+
+unlock:
+	mutex_unlock(&compr_mutex);
+	return ret;
+}
+
+int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen)
+{
+	int zerr, ret = -EIO;
+
+	mutex_lock(&compr_mutex);
+	zerr = zlib_inflateInit(&stream);
+	if (zerr != Z_OK)
+		goto unlock;
+
+	stream.next_in = in;
+	stream.next_out = out;
+	stream.avail_in = inlen;
+	stream.avail_out = outlen;
+	stream.total_in = 0;
+	stream.total_out = 0;
+
+	/* The whole object must inflate in a single Z_FINISH pass. */
+	zerr = zlib_inflate(&stream, Z_FINISH);
+	if (zerr != Z_STREAM_END)
+		goto unlock;
+
+	zerr = zlib_inflateEnd(&stream);
+	if (zerr != Z_OK)
+		goto unlock;
+
+	ret = 0;
+unlock:
+	mutex_unlock(&compr_mutex);
+	return ret;
+}
+
+int __init logfs_compr_init(void)
+{
+	size_t size = max(zlib_deflate_workspacesize(),
+			zlib_inflate_workspacesize());
+	stream.workspace = vmalloc(size);
+	if (!stream.workspace)
+		return -ENOMEM;
+	return 0;
+}
+
+void logfs_compr_exit(void)
+{
+	vfree(stream.workspace);
+}
--- /dev/null	2008-04-02 16:29:12.813336657 +0200
+++ git/fs/logfs/dir.c	2008-04-07 12:14:21.845470394 +0200
@@ -0,0 +1,765 @@
+/*
+ * fs/logfs/dir.c	- directory-related code
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2007 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+
+
+/*
+ * Atomic dir operations
+ *
+ * Directory operations are by default not atomic.  Dentries and Inodes are
+ * created/removed/altered in separate operations.  Therefore we need to do
+ * a small amount of journaling.
+ *
+ * Create, link, mkdir, mknod and symlink all share the same function to do
+ * the work: __logfs_create.  This function works in two atomic steps:
+ * 1. allocate inode (remember in journal)
+ * 2. allocate dentry (clear journal)
+ *
+ * As we can only get interrupted between the two, the inode we just
+ * created is simply stored in the anchor.  On next mount, if we were
+ * interrupted, we delete the inode.  From a user's point of view the
+ * operation never happened.
+ *
+ * Unlink and rmdir also share the same function: unlink.  Again, this
+ * function works in two atomic steps
+ * 1. remove dentry (remember inode in journal)
+ * 2. unlink inode (clear journal)
+ *
+ * And again, on the next mount, if we were interrupted, we delete the inode.
+ * From a users point of view the operation succeeded.
+ *
+ * Rename is the real pain to deal with, harder than all the other methods
+ * combined.  Depending on the circumstances we can run into three cases.
+ * A "target rename" where the target dentry already existed, a "local
+ * rename" where both parent directories are identical or a "cross-directory
+ * rename" in the remaining case.
+ *
+ * Local rename is atomic, as the old dentry is simply rewritten with a new
+ * name.
+ *
+ * Cross-directory rename works in two steps, similar to __logfs_create and
+ * logfs_unlink:
+ * 1. Write new dentry (remember old dentry in journal)
+ * 2. Remove old dentry (clear journal)
+ *
+ * Here we remember a dentry instead of an inode.  On next mount, if we were
+ * interrupted, we delete the dentry.  From a users point of view, the
+ * operation succeeded.
+ *
+ * Target rename works in three atomic steps:
+ * 1. Attach old inode to new dentry (remember old dentry and new inode)
+ * 2. Remove old dentry (still remember the new inode)
+ * 3. Remove victim inode
+ *
+ * Here we remember both an inode and a dentry.  If we get interrupted
+ * between steps 1 and 2, we delete both the dentry and the inode.  If
+ * we get interrupted between steps 2 and 3, we delete just the inode.
+ * In either case, the remaining objects are deleted on next mount.  From
+ * a user's point of view, the operation succeeded.
+ */
+
+typedef int (*dir_callback)(struct inode *dir, struct dentry *dentry,
+		struct logfs_disk_dentry *dd, loff_t pos, void *arg);
+
+static inline void logfs_inc_count(struct inode *inode)
+{
+	inode->i_nlink++;
+	mark_inode_dirty_sync(inode);
+}
+
+static inline void logfs_dec_count(struct inode *inode)
+{
+	inode->i_nlink--;
+	mark_inode_dirty_sync(inode);
+}
+
+static int read_dir(struct inode *dir, struct logfs_disk_dentry *dd, loff_t pos)
+{
+	return logfs_inode_read(dir, dd, sizeof(*dd), pos);
+}
+
+static int write_dir(struct inode *dir, struct logfs_disk_dentry *dd,
+		loff_t pos, struct logfs_transaction *ta)
+{
+	return logfs_inode_write(dir, dd, sizeof(*dd), pos,
+			WF_SYNC|WF_LOCK, ta, NULL);
+}
+
+static int write_inode(struct inode *inode)
+{
+	return __logfs_write_inode(inode, WF_LOCK|WF_SYNC);
+}
+
+static s64 dir_seek_data(struct inode *inode, s64 pos)
+{
+	s64 prev = (s64)logfs_seek_data(inode, pos) - 1;
+
+	return prev > pos ? prev : pos;
+}
+
+static int __logfs_dir_walk(struct inode *dir, struct dentry *dentry,
+		dir_callback handler, void *arg,
+		struct logfs_disk_dentry *dd, loff_t *pos)
+{
+	struct qstr *name = dentry ? &dentry->d_name : NULL;
+	int ret;
+
+	for (; ; (*pos)++) {
+		ret = read_dir(dir, dd, *pos);
+		if (ret == -EOF)
+			return 0;
+		if (ret == -ENODATA) {
+			/* deleted dentry */
+			*pos = dir_seek_data(dir, *pos);
+			continue;
+		}
+		if (ret)
+			return ret;
+		BUG_ON(dd->namelen == 0);
+
+		if (name) {
+			if (name->len != be16_to_cpu(dd->namelen))
+				continue;
+			if (memcmp(name->name, dd->name, name->len))
+				continue;
+		}
+
+		return handler(dir, dentry, dd, *pos, arg);
+	}
+	return ret;
+}
+
+static int logfs_dir_walk(struct inode *dir, struct dentry *dentry,
+		dir_callback handler, void *arg)
+{
+	struct logfs_disk_dentry dd;
+	loff_t pos = 0;
+
+	return __logfs_dir_walk(dir, dentry, handler, arg, &dd, &pos);
+}
+
+static int logfs_lookup_handler(struct inode *dir, struct dentry *dentry,
+		struct logfs_disk_dentry *dd, loff_t pos, void *arg)
+{
+	struct inode *inode;
+
+	inode = dhowells_iget(dir->i_sb, be64_to_cpu(dd->ino));
+	if (!inode)
+		return -EIO;
+	return PTR_ERR(d_splice_alias(inode, dentry));
+}
+
+static struct dentry *logfs_lookup(struct inode *dir, struct dentry *dentry,
+		struct nameidata *nd)
+{
+	return ERR_PTR(logfs_dir_walk(dir, dentry, logfs_lookup_handler, NULL));
+}
+
+static int logfs_unlink_handler(struct inode *dir, struct dentry *dentry,
+		struct logfs_disk_dentry *dd, loff_t pos, void *ta)
+{
+	return logfs_delete(dir, pos, NULL, ta);
+}
+
+static int logfs_remove_inode(struct inode *inode)
+{
+	int err;
+
+	/* Directories drop two links at once, everything else drops one. */
+	inode->i_nlink -= S_ISDIR(inode->i_mode) ? 2 : 1;
+
+	err = write_inode(inode);
+	LOGFS_BUG_ON(err, inode->i_sb);
+	return err;
+}
+
+static void abort_transaction(struct inode *inode, struct logfs_transaction *ta)
+{
+	logfs_inode(inode)->li_transaction = NULL;
+	kfree(ta);
+}
+
+static int logfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct logfs_super *super = logfs_super(dir->i_sb);
+	struct inode *inode = dentry->d_inode;
+	struct logfs_transaction *ta;
+	int ret;
+
+	ta = kzalloc(sizeof(*ta), GFP_KERNEL);
+	if (!ta)
+		return -ENOMEM;
+
+	ta->state = UNLINK_1;
+	ta->ino = inode->i_ino;
+
+	if (S_ISDIR(inode->i_mode))
+		dir->i_nlink--;
+	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+	mutex_lock(&super->s_dirop_mutex);
+	ret = logfs_dir_walk(dir, dentry, logfs_unlink_handler, ta);
+	if (!ret)
+		ret = write_inode(dir);
+
+	if (ret) {
+		super->s_victim_ino = 0;
+		abort_transaction(dir, ta);
+		printk(KERN_ERR"LOGFS: unable to delete inode\n");
+		if (S_ISDIR(inode->i_mode))
+			logfs_inc_count(dir);
+		goto out;
+	}
+
+	ta->state = UNLINK_2;
+	logfs_inode(inode)->li_transaction = ta;
+	ret = logfs_remove_inode(inode);
+out:
+	mutex_unlock(&super->s_dirop_mutex);
+	return ret;
+}
+
+static int logfs_empty_handler(struct inode *dir, struct dentry *dentry,
+		struct logfs_disk_dentry *dd, loff_t pos, void *arg)
+{
+	return -ENOTEMPTY;
+}
+
+static inline int logfs_empty_dir(struct inode *dir)
+{
+	return !logfs_dir_walk(dir, NULL, logfs_empty_handler, NULL);
+}
+
+static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+
+	if (!logfs_empty_dir(inode))
+		return -ENOTEMPTY;
+
+	return logfs_unlink(dir, dentry);
+}
+
+/* FIXME: readdir currently has its own dir_walk code.  I don't see a good
+ * way to combine the two copies */
+#define IMPLICIT_NODES 2
+static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)
+{
+	struct logfs_disk_dentry dd;
+	struct inode *dir = file->f_dentry->d_inode;
+	loff_t pos = file->f_pos - IMPLICIT_NODES;
+	int err;
+
+	BUG_ON(pos < 0);
+	for (;; pos++) {
+		err = read_dir(dir, &dd, pos);
+		if (err == -EOF)
+			break;
+		if (err == -ENODATA) {
+			/* deleted dentry */
+			pos = dir_seek_data(dir, pos);
+			continue;
+		}
+		if (err)
+			return err;
+		BUG_ON(dd.namelen == 0);
+
+		if (filldir(buf, dd.name, be16_to_cpu(dd.namelen), pos,
+					be64_to_cpu(dd.ino), dd.type))
+			break;
+	}
+
+	file->f_pos = pos + IMPLICIT_NODES;
+	return 0;
+}
+
+/* Emit the implicit "." and ".." entries at f_pos 0 and 1, then hand
+ * over to __logfs_readdir() for the entries stored in the directory. */
+static int logfs_readdir(struct file *file, void *buf, filldir_t filldir)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	ino_t parent = parent_ino(file->f_dentry);
+
+	if (file->f_pos < 0)
+		return -EINVAL;
+
+	if (file->f_pos == 0) {
+		if (filldir(buf, ".", 1, 1, inode->i_ino, DT_DIR) < 0)
+			return 0;
+		file->f_pos++;
+	}
+	if (file->f_pos == 1) {
+		if (filldir(buf, "..", 2, 2, parent, DT_DIR) < 0)
+			return 0;
+		file->f_pos++;
+	}
+
+	return __logfs_readdir(file, buf, filldir);
+}
+
+static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name)
+{
+	dd->namelen = cpu_to_be16(name->len);	/* length stored big-endian */
+	memcpy(dd->name, name->name, name->len);	/* no length check here */
+}
+
+/*
+ * Build a disk dentry for @inode named after @dentry and write it to @dir
+ * at the position returned by logfs_seek_hole(dir, 0), as part of @ta.
+ */
+static int logfs_write_dir(struct inode *dir, struct dentry *dentry,
+		struct inode *inode, struct logfs_transaction *ta)
+{
+	struct logfs_disk_dentry dd;
+
+	if (dentry->d_name.len > LOGFS_MAX_NAMELEN)
+		return -ENAMETOOLONG;
+
+	memset(&dd, 0, sizeof(dd));
+	dd.ino = cpu_to_be64(inode->i_ino);
+	dd.type = logfs_type(inode);
+	logfs_set_name(&dd, &dentry->d_name);
+
+	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+	/*
+	 * FIXME: the file size should actually get aligned when writing,
+	 * not when reading.
+	 */
+	return write_dir(dir, &dd, logfs_seek_hole(dir, 0), ta);
+}
+
+static int __logfs_create(struct inode *dir, struct dentry *dentry,
+		struct inode *inode, const char *dest, long destlen)
+{
+	struct logfs_super *super = logfs_super(dir->i_sb);
+	struct logfs_inode *li = logfs_inode(inode);
+	struct logfs_transaction *ta;
+	int ret;
+
+	ta = kzalloc(sizeof(*ta), GFP_KERNEL);
+	if (!ta)
+		return -ENOMEM;
+
+	ta->state = CREATE_1;	/* phase 1: write the new inode itself */
+	mutex_lock(&super->s_dirop_mutex);
+	ta->ino = inode->i_ino;
+	if (S_ISDIR(inode->i_mode))
+		inode->i_nlink++;
+
+	if (dest) {
+		/* symlink */
+		ret = logfs_inode_write(inode, dest, destlen, 0,
+				WF_SYNC|WF_LOCK, ta, NULL);
+		if (!ret)
+			ret = write_inode(inode);
+	} else {
+		/* creat/mkdir/mknod */
+		super->s_victim_ino = inode->i_ino;
+		ret = write_inode(inode);
+	}
+	if (ret) {
+		super->s_victim_ino = 0;
+		abort_transaction(inode, ta);
+		li->li_flags |= LOGFS_IF_STILLBORN;
+		/* FIXME: truncate symlink */
+		inode->i_nlink--;	/* NOTE(review): unconditional, the ++ was not */
+		iput(inode);
+		goto out;
+	}
+
+	ta->state = CREATE_2;	/* phase 2: link the inode into @dir */
+	if (S_ISDIR(inode->i_mode))
+		dir->i_nlink++;
+	ret = logfs_write_dir(dir, dentry, inode, ta);
+	/* sync directory */
+	if (!ret)
+		ret = write_inode(dir);
+
+	if (ret) {
+		super->s_victim_ino = 0;
+		abort_transaction(dir, ta);
+		if (S_ISDIR(inode->i_mode))
+			dir->i_nlink--;	/* undo the bump made for phase 2 */
+		logfs_remove_inode(inode);
+		iput(inode);
+		goto out;
+	}
+	d_instantiate(dentry, inode);	/* success: dentry now names inode */
+out:
+	mutex_unlock(&super->s_dirop_mutex);
+	return ret;
+}
+
+static int logfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	struct inode *inode;
+
+	if (dir->i_nlink >= LOGFS_LINK_MAX)	/* __logfs_create() bumps it */
+		return -EMLINK;
+
+	/*
+	 * FIXME: why do we have to fill in S_IFDIR, while the mode is
+	 * correct for mknod, creat, etc.?  Smells like the vfs *should*
+	 * do it for us but for some reason fails to do so.
+	 */
+	inode = logfs_new_inode(dir, S_IFDIR | mode);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	inode->i_op = &logfs_dir_iops;
+	inode->i_fop = &logfs_dir_fops;
+
+	return __logfs_create(dir, dentry, inode, NULL, 0);	/* no symlink data */
+}
+
+static int logfs_create(struct inode *dir, struct dentry *dentry, int mode,
+		struct nameidata *nd)	/* @nd is not used */
+{
+	struct inode *inode;
+
+	inode = logfs_new_inode(dir, mode);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	inode->i_op = &logfs_reg_iops;	/* regular-file operation tables */
+	inode->i_fop = &logfs_reg_fops;
+	inode->i_mapping->a_ops = &logfs_reg_aops;
+
+	return __logfs_create(dir, dentry, inode, NULL, 0);	/* no symlink data */
+}
+
+static int logfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
+		dev_t rdev)
+{
+	struct inode *inode;
+
+	if (dentry->d_name.len > LOGFS_MAX_NAMELEN)	/* before allocating */
+		return -ENAMETOOLONG;
+
+	inode = logfs_new_inode(dir, mode);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	init_special_inode(inode, mode, rdev);
+
+	return __logfs_create(dir, dentry, inode, NULL, 0);	/* no symlink data */
+}
+
+static int logfs_symlink(struct inode *dir, struct dentry *dentry,
+		const char *target)
+{
+	struct inode *inode;
+	size_t destlen = strlen(target) + 1;	/* includes the trailing NUL */
+
+	if (destlen > dir->i_sb->s_blocksize)	/* must fit into one block */
+		return -ENAMETOOLONG;
+
+	inode = logfs_new_inode(dir, S_IFLNK | S_IRWXUGO);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	inode->i_op = &logfs_symlink_iops;
+	inode->i_mapping->a_ops = &logfs_reg_aops;
+
+	return __logfs_create(dir, dentry, inode, target, destlen);
+}
+
+static int logfs_permission(struct inode *inode, int mask, struct nameidata *nd)
+{
+	return generic_permission(inode, mask, NULL);	/* @nd is not used */
+}
+
+static int logfs_link(struct dentry *old_dentry, struct inode *dir,
+		struct dentry *dentry)
+{
+	struct inode *inode = old_dentry->d_inode;
+
+	if (inode->i_nlink >= LOGFS_LINK_MAX)
+		return -EMLINK;
+
+	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+	atomic_inc(&inode->i_count);	/* reference handed to __logfs_create() */
+	logfs_inc_count(inode);	/* presumably bumps i_nlink - TODO confirm */
+
+	return __logfs_create(dir, dentry, inode, NULL, 0);
+}
+
+static int logfs_nop_handler(struct inode *dir, struct dentry *dentry,
+		struct logfs_disk_dentry *dd, loff_t pos, void *arg)
+{
+	return 0;	/* does nothing; used by logfs_get_dd() as its callback */
+}
+
+static inline int logfs_get_dd(struct inode *dir, struct dentry *dentry,
+		struct logfs_disk_dentry *dd, loff_t *pos)
+{
+	*pos = 0;	/* @dd and @pos are handed on to the walk as out-params */
+	return __logfs_dir_walk(dir, dentry, logfs_nop_handler, NULL, dd, pos);
+}
+
+/* Local rename, target doesn't exist: just change the name in the old dd.
+ * The new name is length-checked first, as logfs_write_dir() does. */
+static int logfs_rename_local(struct inode *dir, struct dentry *old_dentry,
+		struct dentry *new_dentry)
+{
+	struct logfs_disk_dentry dd;
+	loff_t pos;
+	int err;
+
+	if (new_dentry->d_name.len > LOGFS_MAX_NAMELEN)
+		return -ENAMETOOLONG;
+	err = logfs_get_dd(dir, old_dentry, &dd, &pos);
+	if (err)
+		return err;
+	logfs_set_name(&dd, &new_dentry->d_name);
+	return write_dir(dir, &dd, pos, NULL);
+}
+
+static int logfs_delete_dd(struct inode *dir, struct logfs_disk_dentry *dd,
+		loff_t pos, struct logfs_transaction *ta)
+{
+	int err;
+
+	err = read_dir(dir, dd, pos);	/* fills *dd; its type is used below */
+
+	/*
+	 * Getting called with pos somewhere beyond eof is either a goofup
+	 * within this file or means someone maliciously edited the
+	 * (crc-protected) journal.
+	 */
+	LOGFS_BUG_ON(err == -EOF, dir->i_sb);
+	if (err)
+		return err;
+
+	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+	if (dd->type == DT_DIR)
+		dir->i_nlink--;	/* presumably the removed subdir's ".." link */
+	return logfs_delete(dir, pos, NULL, ta);
+}
+
+/*
+ * Cross-directory rename, target does not exist.  Create a new dentry in
+ * the target dir, then remove the old dentry, all the while remembering
+ * the operation in the journal.  A failed step 2 rolls i_nlink back.
+ */
+static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry,
+			      struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct logfs_super *super = logfs_super(old_dir->i_sb);
+	struct logfs_disk_dentry dd;
+	struct logfs_transaction *ta;
+	loff_t pos;
+	int err;
+
+	/* 1. locate source dd */
+	err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
+	if (err)
+		return err;
+
+	ta = kzalloc(sizeof(*ta), GFP_KERNEL);
+	if (!ta)
+		return -ENOMEM;
+	ta->state = CROSS_RENAME_1;
+	ta->dir = old_dir->i_ino;
+	ta->pos = pos;
+
+	/* 2. write target dd */
+	if (dd.type == DT_DIR)
+		new_dir->i_nlink++;
+	mutex_lock(&super->s_dirop_mutex);
+	err = logfs_write_dir(new_dir, new_dentry, old_dentry->d_inode, ta);
+	if (!err)
+		err = write_inode(new_dir);
+	if (err) {
+		super->s_rename_dir = 0;
+		super->s_rename_pos = 0;
+		abort_transaction(new_dir, ta);
+		if (dd.type == DT_DIR)
+			new_dir->i_nlink--;
+		goto out;
+	}
+
+	ta->state = CROSS_RENAME_2;
+	/* 3. remove source dd */
+	err = logfs_delete_dd(old_dir, &dd, pos, ta);
+	if (!err)
+		err = write_inode(old_dir);
+	LOGFS_BUG_ON(err, old_dir->i_sb);
+out:
+	mutex_unlock(&super->s_dirop_mutex);
+	return err;
+}
+
+/* Repoint the dd found for @dentry in @dir at @inode, then sync @dir. */
+static int logfs_replace_inode(struct inode *dir, struct dentry *dentry,
+		struct logfs_disk_dentry *dd, struct inode *inode,
+		struct logfs_transaction *ta)
+{
+	loff_t pos;
+	int err = logfs_get_dd(dir, dentry, dd, &pos);
+
+	if (err)
+		return err;
+	dd->ino = cpu_to_be64(inode->i_ino);
+	dd->type = logfs_type(inode);
+
+	err = write_dir(dir, dd, pos, ta);
+	if (!err)
+		err = write_inode(dir);
+	return err;
+}
+
+/* Target dentry exists - the worst case.  We need to attach the source
+ * inode to the target dentry, then remove the orphaned target inode and
+ * source dentry.
+ */
+static int logfs_rename_target(struct inode *old_dir, struct dentry *old_dentry,
+			       struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct logfs_super *super = logfs_super(old_dir->i_sb);
+	struct inode *old_inode = old_dentry->d_inode;
+	struct inode *new_inode = new_dentry->d_inode;
+	int isdir = S_ISDIR(old_inode->i_mode);
+	struct logfs_disk_dentry dd;
+	struct logfs_transaction *ta;
+	loff_t pos;
+	int err;
+
+	BUG_ON(isdir != S_ISDIR(new_inode->i_mode));	/* both dirs or neither */
+	if (isdir) {
+		if (!logfs_empty_dir(new_inode))
+			return -ENOTEMPTY;
+	}
+
+	/* 1. locate source dd */
+	err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
+	if (err)
+		return err;
+
+	ta = kzalloc(sizeof(*ta), GFP_KERNEL);
+	if (!ta)
+		return -ENOMEM;
+
+	ta->state = TARGET_RENAME_1;
+	ta->dir = old_dir->i_ino;	/* where the source dd lives ... */
+	ta->pos = pos;			/* ... and at which position */
+	ta->ino = new_inode->i_ino;	/* the inode removed in step 4 */
+
+	/* 2. attach source inode to target dd */
+	mutex_lock(&super->s_dirop_mutex);
+	err = logfs_replace_inode(new_dir, new_dentry, &dd, old_inode, ta);
+	if (err) {
+		super->s_rename_dir = 0;
+		super->s_rename_pos = 0;
+		super->s_victim_ino = 0;
+		abort_transaction(new_dir, ta);
+		goto out;
+	}
+
+	/* 3. remove source dd */
+	ta->state = TARGET_RENAME_2;
+	err = logfs_delete_dd(old_dir, &dd, pos, ta);
+	if (!err)
+		err = write_inode(old_dir);
+	LOGFS_BUG_ON(err, old_dir->i_sb);	/* no rollback after step 2 */
+
+	/* 4. remove target inode */
+	ta->state = TARGET_RENAME_3;
+	logfs_inode(new_inode)->li_transaction = ta;
+	err = logfs_remove_inode(new_inode);
+
+out:
+	mutex_unlock(&super->s_dirop_mutex);
+	return err;
+}
+
+static int logfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+			struct inode *new_dir, struct dentry *new_dentry)
+{
+	if (new_dentry->d_inode)	/* target exists, same dir or not */
+		return logfs_rename_target(old_dir, old_dentry,
+					   new_dir, new_dentry);
+	else if (old_dir == new_dir)	/* no target, same directory */
+		return logfs_rename_local(old_dir, old_dentry, new_dentry);
+	return logfs_rename_cross(old_dir, old_dentry, new_dir, new_dentry);
+}
+
+/* No locking done here, as this is called before .get_sb() returns. */
+int logfs_replay_journal(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_disk_dentry dd;
+	struct inode *inode;
+	u64 ino, pos;
+	int err;
+
+	if (super->s_victim_ino) {
+		/* delete victim inode */
+		ino = super->s_victim_ino;
+		printk(KERN_INFO"LogFS: delete unmapped inode #%llx\n", ino);
+		inode = dhowells_iget(sb, ino);
+		if (!inode)
+			goto fail;
+
+		LOGFS_BUG_ON(i_size_read(inode) > 0, sb);
+		super->s_victim_ino = 0;	/* cleared first, restored on error */
+		err = logfs_remove_inode(inode);
+		iput(inode);
+		if (err) {
+			super->s_victim_ino = ino;
+			goto fail;
+		}
+	}
+	if (super->s_rename_dir) {
+		/* delete old dd from rename */
+		ino = super->s_rename_dir;
+		pos = super->s_rename_pos;
+		printk(KERN_INFO"LogFS: delete unbacked dentry (%llx, %llx)\n",
+				ino, pos);
+		inode = dhowells_iget(sb, ino);
+		if (!inode)
+			goto fail;
+
+		super->s_rename_dir = 0;	/* cleared first, restored on error */
+		super->s_rename_pos = 0;
+		err = logfs_delete_dd(inode, &dd, pos, NULL);
+		iput(inode);
+		if (err) {
+			super->s_rename_dir = ino;
+			super->s_rename_pos = pos;
+			goto fail;
+		}
+	}
+	return 0;
+fail:
+	LOGFS_BUG(sb);	/* every failure above lands here */
+	return -EIO;
+}
+
+const struct inode_operations logfs_symlink_iops = { /* set by logfs_symlink() */
+	.readlink	= generic_readlink,
+	.follow_link	= page_follow_link_light,
+};
+
+const struct inode_operations logfs_dir_iops = {	/* set by logfs_mkdir() */
+	.create		= logfs_create,
+	.link		= logfs_link,
+	.lookup		= logfs_lookup,
+	.mkdir		= logfs_mkdir,
+	.mknod		= logfs_mknod,
+	.rename		= logfs_rename,
+	.rmdir		= logfs_rmdir,
+	.permission	= logfs_permission,
+	.symlink	= logfs_symlink,
+	.unlink		= logfs_unlink,
+};
+const struct file_operations logfs_dir_fops = {	/* set by logfs_mkdir() */
+	.fsync		= logfs_fsync,
+	.ioctl		= logfs_ioctl,
+	.readdir	= logfs_readdir,
+	.read		= generic_read_dir,
+};
--- /dev/null	2008-04-02 16:29:12.813336657 +0200
+++ git/fs/logfs/file.c	2008-04-07 12:47:30.156219529 +0200
@@ -0,0 +1,236 @@
+/*
+ * fs/logfs/file.c	- prepare_write, commit_write and friends
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2007 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+#include <linux/sched.h>
+#include <linux/writeback.h>
+
+static int logfs_prepare_write(struct file *file, struct page *page,
+		unsigned start, unsigned end)
+{
+	if (PageUptodate(page))
+		return 0;	/* page contents are already valid */
+
+	if ((start == 0) && (end == PAGE_CACHE_SIZE))
+		return 0;	/* the write covers the whole page */
+
+	return logfs_readpage_nolock(page);	/* partial write: read it first */
+}
+
+/* Grow i_size if the write ended beyond it, then write the page out. */
+static int logfs_commit_write(struct file *file, struct page *page,
+		unsigned start, unsigned end)
+{
+	struct inode *inode = page->mapping->host;
+	/* widen before shifting; pgoff_t arithmetic can wrap on 32 bit */
+	loff_t end_pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + end;
+
+	BUG_ON(PAGE_CACHE_SIZE != inode->i_sb->s_blocksize);
+	BUG_ON(page->index > I3_BLOCKS);
+
+	if (start == end)
+		return 0; /* FIXME: do we need to update inode? */
+
+	if (i_size_read(inode) < end_pos) {
+		i_size_write(inode, end_pos);
+		mark_inode_dirty_sync(inode);
+	}
+
+	ClearPageZero(page);
+	return logfs_write_buf(inode, page, NULL, WF_LOCK);
+}
+
+/* ->readpage: fill @page via logfs_readpage_nolock(), then unlock it. */
+int logfs_readpage(struct file *file, struct page *page)
+{
+	int err = logfs_readpage_nolock(page);
+
+	unlock_page(page);
+	return err;
+}
+
+/* Clear the page's dirty flag in the radix tree. */
+/* TODO: mucking with PageWriteback is silly.  Add a generic function to clear
+ * the dirty bit from the radix tree for filesystems that don't have to wait
+ * for page writeback to finish (i.e. any compressing filesystem).
+ */
+static void clear_radix_tree_dirty(struct page *page)
+{
+	BUG_ON(PagePrivate(page) || page->private);
+	set_page_writeback(page);	/* writeback is started ... */
+	end_page_writeback(page);	/* ... and ended again right away */
+}
+
+/* Write @page out; redirty it on failure.  Unlocks the page either way. */
+static int __logfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	int err;
+
+	ClearPageZero(page);
+	err = logfs_write_buf(page->mapping->host, page, NULL, WF_LOCK);
+	if (!err)
+		clear_radix_tree_dirty(page);
+	else
+		set_page_dirty(page);
+	unlock_page(page);
+	return err;
+}
+
+static int logfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	loff_t i_size = i_size_read(inode);
+	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;	/* page holding EOF */
+	unsigned offset;
+	u64 bix;
+	u8 level;
+
+	pr_debug("logfs_writepage(%lx, %lx, %p)\n", inode->i_ino, page->index,
+			page);
+
+	logfs_unpack_index(page->index, &bix, &level);
+
+	/* Indirect blocks are never truncated */
+	if (level > 0)
+		return __logfs_writepage(page, wbc);
+
+	/*
+	 * TODO: everything below is a near-verbatim copy of nobh_writepage().
+	 * The relevant bits should be factored out after logfs is merged.
+	 */
+
+	/* Is the page fully inside i_size? */
+	if (bix < end_index)
+		return __logfs_writepage(page, wbc);
+
+	 /* Is the page fully outside i_size? (truncate in progress) */
+	offset = i_size & (PAGE_CACHE_SIZE-1);	/* bytes of the last page in use */
+	if (bix > end_index || offset == 0) {
+		unlock_page(page);
+		return 0; /* don't care */
+	}
+
+	/*
+	 * The page straddles i_size.  It must be zeroed out on each and every
+	 * writepage invocation because it may be mmapped.  "A file is mapped
+	 * in multiples of the page size.  For a file that is not a multiple of
+	 * the page size, the remaining memory is zeroed when mapped, and
+	 * writes to that region are not written out to the file."
+	 */
+	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+	return __logfs_writepage(page, wbc);
+}
+
+static void logfs_invalidatepage(struct page *page, unsigned long offset)
+{
+	u64 bix;
+	u8 level;
+
+	/* Gets called for dirty indirect blocks at umount time */
+	logfs_unpack_index(page->index, &bix, &level);
+	BUG_ON(level == 0);	/* data pages are not expected here */
+	BUG_ON(PageZero(page));
+	logfs_write_buf(page->mapping->host, page, NULL, WF_LOCK); /* result ignored */
+}
+
+static int logfs_releasepage(struct page *page, gfp_t only_xfs_uses_this)
+{
+	return 0; /* always 0: none of these are easy to release */
+}
+
+
+int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
+		unsigned long arg)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	unsigned int oldflags, flags;
+	int err;
+
+	switch (cmd) {
+	case FS_IOC_GETFLAGS:
+		flags = li->li_flags & LOGFS_FL_USER_VISIBLE;
+		return put_user(flags, (int __user *)arg);
+	case FS_IOC_SETFLAGS:
+		if (IS_RDONLY(inode))
+			return -EROFS;
+
+		/* only the owner or a CAP_FOWNER holder may change flags */
+		if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+			return -EACCES;
+
+		err = get_user(flags, (int __user *)arg);
+		if (err)
+			return err;
+
+		mutex_lock(&inode->i_mutex);
+		oldflags = li->li_flags;
+		flags &= LOGFS_FL_USER_MODIFIABLE;	/* user-settable bits only */
+		flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE;
+		li->li_flags = flags;
+		mutex_unlock(&inode->i_mutex);
+
+		inode->i_ctime = CURRENT_TIME;
+		mark_inode_dirty_sync(inode);
+		return 0;
+
+	default:
+		return -ENOTTY;	/* every other command is unsupported */
+	}
+}
+
+int logfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+	struct super_block *sb = dentry->d_inode->i_sb;
+	struct logfs_super *super = logfs_super(sb);
+
+	logfs_flush_dirty(sb, 1);	/* acts on the whole sb, not one file */
+
+	super->s_devops->sync(sb);
+	return 0;	/* NOTE(review): no error is propagated from above */
+}
+
+/* ->setattr: validate @attr, truncate if ATTR_SIZE is set, apply the rest. */
+static int logfs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	int err;
+
+	/* check first, so that a rejected request cannot truncate the file */
+	err = inode_change_ok(inode, attr);
+	if (!err && (attr->ia_valid & ATTR_SIZE))
+		err = logfs_truncate(inode, attr->ia_size);
+	attr->ia_valid &= ~ATTR_SIZE;
+	if (!err)
+		err = inode_setattr(inode, attr);
+	return err;
+}
+
+const struct inode_operations logfs_reg_iops = {	/* regular files */
+	.setattr	= logfs_setattr,
+};
+
+const struct file_operations logfs_reg_fops = {	/* regular files */
+	.aio_read	= generic_file_aio_read,
+	.aio_write	= generic_file_aio_write,
+	.fsync		= logfs_fsync,
+	.ioctl		= logfs_ioctl,
+	.llseek		= generic_file_llseek,
+	.mmap		= generic_file_readonly_mmap,	/* read-only mmap helper */
+	.open		= generic_file_open,
+	.read		= do_sync_read,
+	.write		= do_sync_write,
+};
+
+const struct address_space_operations logfs_reg_aops = { /* files + symlinks */
+	.commit_write	= logfs_commit_write,
+	.invalidatepage	= logfs_invalidatepage,
+	.prepare_write	= logfs_prepare_write,
+	.readpage	= logfs_readpage,
+	.releasepage	= logfs_releasepage,
+	.set_page_dirty	= __set_page_dirty_nobuffers,
+	.writepage	= logfs_writepage,
+	.writepages	= generic_writepages,
+};
--- /dev/null	2008-04-02 16:29:12.813336657 +0200
+++ git/fs/logfs/gc.c	2008-04-07 11:53:20.913211255 +0200
@@ -0,0 +1,730 @@
+/*
+ * fs/logfs/gc.c	- garbage collection code
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2007 Joern Engel <joern@logfs.org>
+ *
+ * GC design as it should be (and isn't, as of 15.3.07):
+ * 1. Pick a good candidate for GC, constrained by the number of currently
+ *    free segments.
+ * 2. Move all valid blocks in this segment until
+ *   a) they are all gone, or
+ *   b) the number of currently free segments drops too low.
+ * 3. Mark the segment as GC-pending or so, because not all indirect blocks
+ *    have been written yet.
+ * 4. Either
+ *   a) goto 1. or
+ *   b) write dirty indirect blocks directly
+ *
+ * Sooner or later 4b) should be taken, causing a number of segments to be
+ * freed.  2b) will consume free segments until this point is reached.  Overall
+ * progress will be made, even though less than a full segment may be gained.
+ *
+ * Crucial question is when to choose 4b) over 4a.
+ */
+#include "logfs.h"
+#include <linux/sched.h>
+
+#define SCAN_RATIO 16	/* number of scanned segments per gc'd segment */
+#define LIST_SIZE 16	/* base size of candidate lists */
+#define SCAN_ROUNDS 128	/* maximum number of complete medium scans */
+#define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */
+
+static void __logfs_gc_pass(struct super_block *sb, int target);
+
+/* journal has distance -1, top-most ifile layer distance 0 */
+static u8 root_distance(struct super_block *sb, u8 level)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	switch (level) {
+	case 0: /* fall through */
+	case 1: /* fall through */
+	case 2: /* fall through */
+	case 3:
+		/* file data or indirect blocks */
+		return super->s_ifile_levels + super->s_iblock_levels - level;
+	case 6: /* fall through */
+	case 7: /* fall through */
+	case 8: /* fall through */
+	case 9:
+		/* inode file data or indirect blocks */
+		return super->s_ifile_levels - (level-6);
+	default:
+		printk(KERN_ERR"LOGFS: segment of unknown level %x found\n",
+				level);
+		WARN_ON(1);
+		/* warn, then answer with the same value as for level 0 */
+		return super->s_ifile_levels + super->s_iblock_levels;
+	}
+}
+
+int logfs_safe_to_write_block(struct super_block *sb, u8 level)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	return root_distance(sb, level) <= super->s_free_list.count; /* 0 or 1 */
+}
+
+static int segment_is_reserved(struct super_block *sb, u32 segno)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_area *area;
+	struct gc_candidate *cand;
+	void *reserved;
+	int i;
+
+	/* Some segments are reserved.  Just pretend they were all valid */
+	reserved = btree_lookup(&super->s_reserved_segments, segno);
+	if (reserved)
+		return 1;
+
+	/* Currently open segments */
+	for_each_area(i) {
+		area = super->s_area[i];
+		if (area->a_is_open && area->a_segno == segno)
+			return 1;
+	}
+
+	/* On the free list */
+	list_for_each_entry(cand, &super->s_free_list.list, list)
+		if (cand->segno == segno)
+			return 1;
+
+	return 0;	/* none of the three checks above matched */
+}
+
+static void logfs_mark_segment_bad(struct super_block *sb, u32 segno)
+{
+	BUG();	/* placeholder: no bad-segment handling is implemented here */
+}
+
+/*
+ * Count the bytes consumed by valid objects in this segment.  Object headers
+ * are counted, the segment header is not.
+ */
+static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec,
+		u8 *level, u64 *segment_gec)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_segment_header sh;
+	struct logfs_object_header oh;
+	u64 ofs, ino, bix;
+	u32 seg_ofs, valid, size;
+	int err;
+
+	err = device_read(sb, segno, 0, sizeof(sh), &sh);
+	BUG_ON(err);
+	if (!memchr_inv(&sh, 0xff, sizeof(sh)))
+		return 0;	/* header is all 0xff; out-params stay untouched */
+
+	*level = sh.level;
+	*ec = be32_to_cpu(sh.ec);
+	*segment_gec = be64_to_cpu(sh.gec);
+	if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) {
+		logfs_mark_segment_bad(sb, segno);
+		return super->s_segsize - 1;
+	}
+
+	valid = 0;
+	for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE;
+			seg_ofs + sizeof(oh) < super->s_segsize;) {
+		err = device_read(sb, segno, seg_ofs, sizeof(oh), &oh);
+		BUG_ON(err);
+		if (!memchr_inv(&oh, 0xff, sizeof(oh)))
+			break;	/* all-0xff object header ends the scan */
+
+		if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) {
+			logfs_mark_segment_bad(sb, segno);
+			return super->s_segsize - 1;
+		}
+
+		ofs = dev_ofs(sb, segno, seg_ofs);
+		ino = be64_to_cpu(oh.ino);
+		bix = be64_to_cpu(oh.bix);
+		size = (u32)be16_to_cpu(oh.len) + sizeof(oh);
+
+		if (logfs_is_valid_block(sb, ofs, ino, bix, *level))
+			valid += size;
+		seg_ofs += size;
+	}
+	pr_debug("LOGFS valid(%x) = %x\n", segno, valid); /* level: pr_debug's */
+	return valid;
+}
+
+/* FIXME: check for off-by-one on max_dist */
+/*
+ * GC distance N:
+ * while (valid blocks in segment) {
+ * 	rewrite one block
+ * 	if (fewer than N free segments)
+ * 		GC distance N-1
+ * }
+ * While (N > 1) {
+ * 	N--;
+ * 	flush dirty list N
+ * }
+ * Exit criterion: free segments >= N
+ *
+ * flush dirty list N:
+ * while (dirty blocks on level) {
+ * 	write block
+ * 	if (fewer than N free segments)
+ * 		GC distance N-1
+ * }
+ */
+
+/*
+ * Move all blocks from the gc_dirty lists to a private one, then recursively
+ * call into GC again to free some segments on higher layers.
+ */
+static void recursive_backoff(struct super_block *sb, int max_dist)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_block *block, *tmp;
+	int i;
+	LIST_HEAD(backoff_list);
+
+	for_each_area(i) {
+		if (i > max_dist)
+			break;	/* lists beyond max_dist are left alone */
+		list_for_each_entry_safe(block, tmp, &super->s_gc_dirty_list[i], dirty_list) {
+			list_move_tail(&block->dirty_list, &backoff_list);
+		}
+	}
+	__logfs_gc_pass(sb, max_dist);
+	list_for_each_entry_safe(block, tmp, &backoff_list, dirty_list)
+		logfs_dirty_for_gc(sb, block);	/* back onto a gc_dirty list */
+}
+
+/* Write back any indirect/inode blocks dirtied during the GC run. */
+static void flush_gc_dirty_list(struct super_block *sb, int dist)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_block *block;
+	struct page *page;
+	struct inode *inode;
+	long flags;
+	int err;
+
+	for (;;) {
+		if (dist < 0)
+			break;
+		if (list_empty(&super->s_gc_dirty_list[dist])) {
+			dist--;
+			continue;
+		}
+
+		block = list_entry(super->s_gc_dirty_list[dist].next,
+				struct logfs_block, dirty_list);
+		page = block->page;
+		inode = page->mapping->host;
+		if (super->s_free_list.count < dist) {
+			/* We ran out of room on this level. */
+			/* Interestingly enough, this case is possible in
+			 * theory, but never triggered in practice.
+			 */
+			recursive_backoff(sb, dist);
+			continue;
+		}
+
+		flags = WF_GC;
+		if (super->s_free_bytes < dist * LOGFS_MAX_OBJECTSIZE
+				+ super->s_gc_reserve
+				+ super->s_dirty_free_bytes)
+			flags |= WF_SYNC;
+		err = logfs_write_buf(inode, page, NULL, flags); /* incl. WF_SYNC */
+		BUG_ON(err);
+	}
+}
+
+void logfs_dirty_for_gc(struct super_block *sb, struct logfs_block *block)
+{
+	struct page *page;
+	struct inode *inode;
+	u64 bix;
+	u8 level, dist;
+
+	page = block->page;
+	inode = page->mapping->host;
+	logfs_unpack_index(block->page->index, &bix, &level);
+	if (inode->i_ino == LOGFS_INO_MASTER)
+		level += LOGFS_MAX_LEVELS;	/* presumably the inode-file levels */
+
+	dist = root_distance(sb, level);	/* picks the per-distance list */
+	list_move_tail(&block->dirty_list, &logfs_super(sb)->s_gc_dirty_list[dist]);
+}
+
+static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino,
+		u64 bix, int level, long flags)
+{
+	struct inode *inode;
+	int err, cookie;
+
+	inode = logfs_iget(sb, ino, &cookie);	/* cookie goes to logfs_iput() */
+	BUG_ON(!inode);
+	err = logfs_rewrite_block(inode, bix, ofs, level, flags);
+	BUG_ON(err);	/* a failed rewrite is treated as fatal */
+	logfs_iput(inode, cookie);
+}
+
+static u32 logfs_gc_segment(struct super_block *sb, u32 segno, u8 dist)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_segment_header sh;
+	struct logfs_object_header oh;
+	u64 ofs, ino, bix;
+	u32 seg_ofs, cleaned = 0;	/* cleaned: bytes of rewritten objects */
+	int level, err, len, valid;
+	long flags;
+
+	LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb);
+
+	/* reserved for the duration of the run; removed again at "out" */
+	btree_insert(&super->s_reserved_segments, segno, (void *)1);
+	err = device_read(sb, segno, 0, sizeof(sh), &sh);
+	BUG_ON(err);
+	level = sh.level;
+	if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) {
+		logfs_mark_segment_bad(sb, segno);
+		cleaned = -1;	/* NOTE(review): s_segsize - 1 is used below */
+		goto out;
+	}
+
+	for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE;
+			seg_ofs + sizeof(oh) < super->s_segsize; ) {
+		ofs = dev_ofs(sb, segno, seg_ofs);
+		err = device_read(sb, segno, seg_ofs, sizeof(oh), &oh);
+		BUG_ON(err);
+
+		if (!memchr_inv(&oh, 0xff, sizeof(oh)))
+			break;	/* all-0xff object header ends the scan */
+
+		if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) {
+			logfs_mark_segment_bad(sb, segno);
+			cleaned = super->s_segsize - 1;
+			goto out;
+		}
+
+		ino = be64_to_cpu(oh.ino);
+		bix = be64_to_cpu(oh.bix);
+		len = sizeof(oh) + be16_to_cpu(oh.len);
+		valid = logfs_is_valid_block(sb, ofs, ino, bix, level);
+		if (valid) {
+			/* Garbage collection may consume further free segments.
+			 * If space gets too tight, abort and continue with a
+			 * higher level segment for the moment. */
+			if (super->s_free_list.count < dist)
+				recursive_backoff(sb, dist);
+
+			flags = WF_GC;
+			if (super->s_free_bytes < dist * LOGFS_MAX_OBJECTSIZE
+					+ super->s_gc_reserve
+					+ super->s_dirty_free_bytes)
+				flags |= WF_SYNC;
+			logfs_cleanse_block(sb, ofs, ino, bix, level, flags);
+			cleaned += len;
+		}
+		seg_ofs += len;
+	}
+
+	flush_gc_dirty_list(sb, dist - 1);
+out:
+	btree_remove(&super->s_reserved_segments, segno);
+	return cleaned;
+}
+
+static struct gc_candidate *add_list(struct gc_candidate *cand,
+		struct candidate_list *list)
+{
+	struct gc_candidate *cur, *removed = NULL;
+	int cont;
+
+	/* insert sorted, ascending: in front of the first not-smaller entry */
+	list_for_each_entry(cur, &list->list, list) {
+		if (list->sort_by_ec)
+			cont = cur->erase_count < cand->erase_count;
+		else
+			cont = cur->valid < cand->valid;
+		if (cont)
+			continue;
+		list_add_tail(&cand->list, &cur->list);
+		cand = NULL;
+		break;
+	}
+	/* if list is empty or candidate is worse than entire list */
+	if (cand)
+		list_add_tail(&cand->list, &list->list);
+
+	/* remove worst entry if list is full */
+	if (list->count >= list->maxcount) {
+		removed = list_entry(list->list.prev,
+				struct gc_candidate, list);
+		list_del(&removed->list);
+	} else
+		list->count++;
+
+	return removed;	/* evicted entry (possibly the new one) or NULL */
+}
+
+struct gc_candidate *get_best_cand(struct candidate_list *list)
+{
+	struct gc_candidate *cand;
+
+	if (list->count == 0)
+		return NULL;
+
+	cand = list_entry(list->list.next, struct gc_candidate, list);
+	list_del(&cand->list);	/* head entry is unlinked and handed out */
+	list->count--;
+	return cand;
+}
+
+/*
+ * We try to stuff the candidate in several lists in order.  Any list may
+ * either reject it or evict another candidate when full.  Anything left after
+ * trying the last list gets freed.
+ */
+static void __add_candidate(struct super_block *sb, struct gc_candidate *cand)
+{
+	struct logfs_super *super = logfs_super(sb);
+	u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE;
+
+	/* 100% free segments */
+	if (cand->valid == 0)
+		cand = add_list(cand, &super->s_free_list);
+	/* good candidates for Garbage Collection */
+	if (cand && cand->valid < full)
+		cand = add_list(cand, &super->s_low_list[cand->dist]);
+	/* good candidates for wear leveling, recently written segments are
+	 * meant to be ignored - NOTE(review): confirm this comparison does */
+	if (cand && cand->gec < super->s_gec + 2*super->s_no_segs)
+		cand = add_list(cand, &super->s_ec_list);
+	/* cand is now whatever no list kept, possibly NULL */
+	kfree(cand);
+}
+
+static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec,
+		u8 dist, u64 segment_gec)
+{
+	struct gc_candidate *cand;
+
+	cand = kmalloc(sizeof(*cand), GFP_KERNEL);
+	if (!cand)
+		return -ENOMEM;
+
+	cand->segno = segno;
+	cand->valid = valid;
+	cand->erase_count = ec;
+	cand->dist = dist;
+	cand->gec = segment_gec;
+
+	/* FIXME: add to btree */
+	__add_candidate(sb, cand);	/* lists keep cand or it gets freed */
+	return 0;
+}
+
+int add_free_segments_from_journal(struct super_block *sb,
+		struct logfs_je_free_segments *segs, int count)
+{
+	int i, err;
+
+	for (i = 0; i < count; i++) {
+		u32 segno = be32_to_cpu(segs[i].segno);
+		u32 ec = be32_to_cpu(segs[i].ec);
+		err = add_candidate(sb, segno, 0, ec, 0, 0); /* valid=dist=gec=0 */
+		if (err)
+			return err;	/* earlier entries stay added */
+	}
+	return 0;
+}
+
+static void __del_segment(struct candidate_list *list, u32 segno)
+{
+	struct gc_candidate *cand;
+
+	list_for_each_entry(cand, &list->list, list)
+		if (cand->segno == segno) {
+			list_del(&cand->list);
+			list->count -= 1;
+			kfree(cand);
+			return;	/* only the first match is removed */
+		}
+}
+
+static void del_segment(struct super_block *sb, u32 segno)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int i;
+
+	__del_segment(&super->s_free_list, segno);	/* purge from every list */
+	for_each_area(i)
+		__del_segment(&super->s_low_list[i], segno);
+	__del_segment(&super->s_ec_list, segno);
+}
+
+static void scan_segment(struct super_block *sb, u32 segno)
+{
+	u64 segment_gec = 0;
+	u32 valid, ec = 0;
+	u8 dist, level = 0;
+
+	if (segment_is_reserved(sb, segno))
+		return;
+	/* rescan; NOTE(review): add_candidate()'s -ENOMEM is ignored below */
+	del_segment(sb, segno);
+	valid = logfs_valid_bytes(sb, segno, &ec, &level, &segment_gec);
+	dist = root_distance(sb, level);
+	add_candidate(sb, segno, valid, ec, dist, segment_gec);
+}
+
+static struct gc_candidate *first_in_list(struct candidate_list *list)
+{
+	if (list->count == 0)
+		return NULL;
+	return list_entry(list->list.next, struct gc_candidate, list); /* peek */
+}
+
+static struct gc_candidate *get_wl_candidate(struct super_block *sb,
+		int max_dist)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct gc_candidate *cand;
+	u64 cand_gec;
+
+	if (super->s_gec & 0xf)	/* only when s_gec is a multiple of 16 */
+		return NULL;
+
+	cand = first_in_list(&super->s_ec_list);
+	if (!cand)
+		return NULL;
+	if (cand->dist > max_dist)
+		return NULL;
+
+	/* instead of comparing candidate erasecount to average erasecount,
+	 * which would involve a 64bit division we multiply candidate erasecount
+	 * with the number of segments.  In effect, the comparison is:
+	 * if (cand->erase_count + 20 > average_erasecount)
+	 */
+	cand_gec = cand->erase_count;
+	cand_gec += 20; /* FIXME: should be a superblock variable */
+	cand_gec *= super->s_no_segs;
+	if (cand_gec > super->s_gec)
+		return NULL;
+
+	list_del(&cand->list);	/* caller gets the unlinked candidate */
+	super->s_ec_list.count--;
+	return cand;
+}
+
+/*
+ * Choose the next segment to garbage collect and unlink it from its list.
+ *
+ * A wear leveling candidate from get_wl_candidate() takes precedence.
+ * Otherwise the head entries of the low lists up to max_dist are compared
+ * and the one with the fewest valid bytes wins; on a tie the lower list
+ * index wins.
+ *
+ * Returns the candidate, now owned by the caller, or NULL if none exists.
+ */
+static struct gc_candidate *get_candidate(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int i, max_dist;
+	struct gc_candidate *cand = NULL, *this;
+
+	max_dist = min(super->s_free_list.count, LOGFS_NO_AREAS);
+
+	cand = get_wl_candidate(sb, max_dist);
+	if (cand)
+		return cand;
+
+	/*
+	 * max_dist may be as large as LOGFS_NO_AREAS, but logfs_init_gc()
+	 * only sets up the s_low_list[] entries that for_each_area() visits.
+	 * Presumably those are 0..LOGFS_NO_AREAS-1, so stop before touching
+	 * an entry one past them.
+	 */
+	for (i = 0; i <= max_dist && i < LOGFS_NO_AREAS; i++) {
+		this = first_in_list(&super->s_low_list[i]);
+		if (!this)
+			continue;
+		if (!cand)
+			cand = this;
+		if (this->valid < cand->valid)
+			cand = this;
+	}
+	if (cand) {
+		list_del(&cand->list);
+		super->s_low_list[cand->dist].count--;
+	}
+	return cand;
+}
+
+/*
+ * Run one garbage collection step: take the candidate picked by
+ * get_candidate(), clean its segment with logfs_gc_segment() and put the
+ * segment back on the candidate lists with its valid count reduced by the
+ * amount logfs_gc_segment() returned.
+ *
+ * Returns 1 if a segment was processed, 0 if no candidate was available.
+ */
+static int logfs_gc_once(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct gc_candidate *cand;
+	u64 segment_gec;
+	u32 cleaned, valid, segno, ec;
+	u8 dist;
+
+	cand = get_candidate(sb);
+	if (!cand)
+		return 0;
+
+	/* copy out everything that is needed before freeing the candidate */
+	valid = cand->valid;
+	segno = cand->segno;
+	dist = cand->dist;
+	ec = cand->erase_count;
+	segment_gec = cand->gec;
+	kfree(cand);
+	pr_debug("GC segment #%02x at %x, %x required, %x free, %x valid, %llx free, %llx reserve\n",
+			segno, segno << super->s_segshift,
+			dist, super->s_free_list.count, valid,
+			super->s_free_bytes, super->s_gc_reserve);
+	cleaned = logfs_gc_segment(sb, segno, dist);
+	pr_debug("GC segment #%02x complete\n", segno);
+	/* NOTE(review): a failure of add_candidate() is ignored here */
+	add_candidate(sb, segno, valid - cleaned, ec, dist, segment_gec);
+	return 1;
+}
+
+/*
+ * Scan the next SCAN_RATIO segments after the sweeper position, wrapping
+ * to segment 0 at the end of the device, and remember the new position in
+ * s_sweeper.
+ *
+ * Returns 1 if a wrap occurred, 0 otherwise.
+ */
+static int logfs_scan_some(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	u32 segno = super->s_sweeper;
+	int wrapped = 0;
+	int remaining;
+
+	for (remaining = SCAN_RATIO; remaining > 0; remaining--) {
+		if (++segno >= super->s_no_segs) {
+			segno = 0;
+			wrapped = 1;
+		}
+		scan_segment(sb, segno);
+	}
+	super->s_sweeper = segno;
+	return wrapped;
+}
+
+/*
+ * In principle, this function should loop forever, looking for GC candidates
+ * and moving data.  LogFS is designed in such a way that this loop is
+ * guaranteed to terminate.
+ *
+ * Limiting the loop to some iterations serves purely to catch cases when
+ * these guarantees have failed.  An actual endless loop is an obvious bug
+ * and should be reported as such.
+ *
+ * The function returns as soon as the free list holds at least @target
+ * entries.  "round" only advances when logfs_scan_some() reports a wrap.
+ * If SCAN_ROUNDS wraps pass, or more than two wraps pass without
+ * logfs_gc_once() making progress, logfs_fsck() is run and LOGFS_BUG() is
+ * raised.
+ */
+static void __logfs_gc_pass(struct super_block *sb, int target)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int round, progress, last_progress = 0;
+
+	pr_debug("__logfs_gc_pass(%x)\n", target);
+	for (round = 0; round < SCAN_ROUNDS; ) {
+		if (super->s_free_list.count >= target)
+			return;
+		/* refresh candidate information; counts completed sweeps */
+		round += logfs_scan_some(sb);
+		if (super->s_free_list.count >= target)
+			return;
+		progress = logfs_gc_once(sb);
+		if (progress)
+			last_progress = round;
+		else if (round - last_progress > 2)
+			break;
+	}
+	/* only reached when the target could not be met */
+	logfs_fsck(sb);
+	LOGFS_BUG(sb);
+}
+
+/*
+ * Garbage collect until the free list holds at least s_total_levels
+ * entries.
+ */
+void logfs_gc_pass(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	__logfs_gc_pass(sb, super->s_total_levels);
+}
+
+/*
+ * Check area @i for object headers beyond the recorded a_used_bytes.
+ *
+ * Areas that are not open are skipped.  Starting at a_used_bytes, object
+ * headers are read from the area's segment and skipped over by their length
+ * until a header consisting only of 0xff bytes is found or the end of the
+ * segment is reached.  A header with a wrong CRC ends the check without
+ * changing the area.
+ *
+ * If the scan ends within LOGFS_MAX_OBJECTSIZE of the segment end, the area
+ * is closed.  Otherwise, if the scan moved past a_used_bytes, a_used_bytes
+ * is advanced to where the scan stopped.
+ *
+ * Returns 0, or the error from device_read().
+ */
+static int check_area(struct super_block *sb, int i)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_area *area = super->s_area[i];
+	struct logfs_object_header h;
+	u32 segno = area->a_segno;
+	u32 ofs = area->a_used_bytes;
+	__be32 crc;
+	int err;
+
+	if (!area->a_is_open)
+		return 0;
+
+	/* h is filled by the loop body before the increment reads h.len */
+	for (ofs = area->a_used_bytes;
+	    ofs <= super->s_segsize - sizeof(h);
+	    ofs += (u32)be16_to_cpu(h.len) + sizeof(h)) {
+		err = device_read(sb, segno, ofs, sizeof(h), &h);
+		if (err)
+			return err;
+
+		/* header is all 0xff: nothing was written here */
+		if (!memchr_inv(&h, 0xff, sizeof(h)))
+			break;
+
+		crc = logfs_crc32(&h, sizeof(h) - 4, 4);
+		if (crc != h.crc) {
+			printk(KERN_INFO "interrupted header at %llx\n",
+					dev_ofs(sb, segno, ofs));
+			return 0;
+		}
+	}
+	if (ofs > super->s_segsize - LOGFS_MAX_OBJECTSIZE) {
+		printk(KERN_INFO "%x bytes unaccounted data found at %llx - closing it\n",
+				ofs - area->a_used_bytes,
+				dev_ofs(sb, segno, ofs));
+		area->a_segno = 0;
+		area->a_is_open = 0;
+	} else if (ofs != area->a_used_bytes) {
+		printk(KERN_INFO "%x bytes unaccounted data found at %llx\n",
+				ofs - area->a_used_bytes,
+				dev_ofs(sb, segno, ofs));
+		area->a_used_bytes = ofs;
+	}
+	return 0;
+}
+
+/*
+ * Run check_area() on every area.  Returns 0 if all checks succeeded,
+ * otherwise the first error encountered.
+ */
+int logfs_check_areas(struct super_block *sb)
+{
+	int area, err = 0;
+
+	for_each_area(area) {
+		err = check_area(sb, area);
+		if (err)
+			break;
+	}
+	return err;
+}
+
+/*
+ * Set up @list as an empty candidate list with the given @maxcount and
+ * @sort_by_ec settings.
+ */
+static void logfs_init_candlist(struct candidate_list *list, int maxcount,
+		int sort_by_ec)
+{
+	INIT_LIST_HEAD(&list->list);
+	list->sort_by_ec = sort_by_ec;
+	list->maxcount = maxcount;
+	list->count = 0;
+}
+
+/*
+ * Initialise all candidate lists of @super: the free list (four times
+ * LIST_SIZE, sorted by erase count), one low list per area and the erase
+ * count list.  Always returns 0.
+ */
+int logfs_init_gc(struct logfs_super *super)
+{
+	int area;
+
+	logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1);
+	logfs_init_candlist(&super->s_free_list, 4*LIST_SIZE, 1);
+	for_each_area(area)
+		logfs_init_candlist(&super->s_low_list[area], LIST_SIZE, 0);
+	return 0;
+}
+
+/*
+ * Unlink and free every candidate on @list, leaving it empty.
+ */
+static void logfs_cleanup_list(struct candidate_list *list)
+{
+	struct gc_candidate *cand, *next;
+
+	list_for_each_entry_safe(cand, next, &list->list, list) {
+		list_del(&cand->list);
+		kfree(cand);
+	}
+	/* keep the bookkeeping in sync: first_in_list() trusts count */
+	list->count = 0;
+}
+
+/*
+ * Free the candidates on all lists of @super.
+ *
+ * Nothing is done if the free list's head has a NULL next pointer, which
+ * presumably means logfs_init_gc() never ran on this superblock.
+ */
+void logfs_cleanup_gc(struct logfs_super *super)
+{
+	int area;
+
+	if (super->s_free_list.list.next == NULL)
+		return;
+
+	logfs_cleanup_list(&super->s_free_list);
+	for_each_area(area) {
+		logfs_cleanup_list(&super->s_low_list[area]);
+	}
+	logfs_cleanup_list(&super->s_ec_list);
+}
--- /dev/null	2008-04-02 16:29:12.813336657 +0200
+++ git/fs/logfs/inode.c	2008-04-07 12:13:48.193106183 +0200
@@ -0,0 +1,589 @@
+/*
+ * fs/logfs/inode.c	- inode handling code
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2007 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+
+/*
+ * We need to include <linux/writeback.h> - a header we normally shouldn't
+ * be mucking with.  If life only were that easy!
+ *
+ * As it is, LogFS' requirement to read inodes for garbage collection keeps
+ * breaking Linux assumptions.  In particular, an inode can be in
+ * I_DELETING state when being written out.  Logfs then notices that it
+ * needs some space, does a little GC and tries to read just the inode in
+ * I_DELETING state.  So the code is waiting for itself to finish, lovely.
+ *
+ * Our strategy to solve this problem is to overload the generic drop_inode()
+ * and destroy_inode() methods.  Writeback happens between those two calls,
+ * so add the inode to a list in drop_inode() and remove it again in
+ * destroy_inode().  Any iget() in the GC path is replaced with logfs_iget(),
+ * which will check the list and only call the blocking iget() if the inode
+ * in question cannot deadlock.
+ *
+ * And of course this would be racy if we didn't take inode_lock in a few
+ * key moments.
+ */
+static struct kmem_cache *logfs_inode_cache;
+
+static int __logfs_read_inode(struct inode *inode);
+
+/*
+ * Look up inode @ino with iget_locked() and, if the inode is new (I_NEW),
+ * fill it from disk with __logfs_read_inode().
+ *
+ * Returns the inode, or NULL if iget_locked() returned NULL or reading the
+ * inode failed.
+ */
+static struct inode *__logfs_iget(struct super_block *sb, unsigned long ino)
+{
+	struct inode *inode = iget_locked(sb, ino);
+	int err;
+
+	if (inode && (inode->i_state & I_NEW)) {
+		err = __logfs_read_inode(inode);
+		/*
+		 * NOTE(review): the inode is unlocked before err is checked,
+		 * so it looks like a concurrent lookup could obtain it before
+		 * it is marked ZOMBIE below - confirm.
+		 */
+		unlock_new_inode(inode);
+		if (err) {
+			/* set i_nlink to 0 to prevent caching */
+			inode->i_nlink = 0;
+			/* ZOMBIE makes logfs_delete_inode() skip the delete */
+			logfs_inode(inode)->li_flags |= LOGFS_IF_ZOMBIE;
+			iput(inode);
+			return NULL;
+		}
+		BUG_ON(inode->i_nlink == 0);
+	}
+
+	return inode;
+}
+
+/*
+ * Get inode @ino without blocking on inodes that are being freed.
+ *
+ * The master inode is returned directly; *is_cached is left untouched in
+ * that case.  An inode found on s_freeing_list is handed out as is, with
+ * *is_cached set to 1.  Everything else goes through __logfs_iget() with
+ * *is_cached set to 0.  logfs_iput() uses is_cached to do the right thing
+ * later.
+ */
+struct inode *logfs_iget(struct super_block *sb, ino_t ino, int *is_cached)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_inode *li;
+	struct inode *freeing = NULL;
+
+	if (ino == LOGFS_INO_MASTER)
+		return super->s_master_inode;
+
+	spin_lock(&inode_lock);
+	list_for_each_entry(li, &super->s_freeing_list, li_freeing_list) {
+		if (li->vfs_inode.i_ino == ino) {
+			freeing = &li->vfs_inode;
+			break;
+		}
+	}
+	spin_unlock(&inode_lock);
+
+	if (freeing) {
+		*is_cached = 1;
+		return freeing;
+	}
+
+	*is_cached = 0;
+	return __logfs_iget(sb, ino);
+}
+
+/*
+ * Counterpart of logfs_iget().  Only inodes that are neither the master
+ * inode nor were handed out as cached get an iput().
+ */
+void logfs_iput(struct inode *inode, int is_cached)
+{
+	if (inode->i_ino != LOGFS_INO_MASTER && !is_cached)
+		iput(inode);
+}
+
+/*
+ * Reset @inode and its logfs part to the state of a fresh, empty inode:
+ * valid flag only, no used bytes, no transaction, root ownership, size 0,
+ * one link, current ctime/mtime, empty shadow trees and zeroed embedded
+ * data.
+ *
+ * NOTE(review): li_height is not touched here - confirm it is set elsewhere.
+ */
+static void logfs_init_inode(struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	int field;
+
+	/* generic inode fields */
+	inode->i_uid	= 0;
+	inode->i_gid	= 0;
+	inode->i_size	= 0;
+	inode->i_blocks	= 0;
+	inode->i_ctime	= CURRENT_TIME;
+	inode->i_mtime	= CURRENT_TIME;
+	inode->i_nlink	= 1;
+
+	/* logfs specific fields */
+	li->li_flags	= LOGFS_IF_VALID;
+	li->li_used_bytes = 0;
+	li->li_transaction = NULL;
+	INIT_LIST_HEAD(&li->li_freeing_list);
+	btree_init(&li->li_shadow_tree.new);
+	btree_init(&li->li_shadow_tree.old);
+	for (field = 0; field < LOGFS_EMBEDDED_FIELDS; field++)
+		li->li_data[field] = 0;
+}
+
+/*
+ * Allocate a logfs inode from the inode cache and initialise it with
+ * logfs_init_inode().  Returns the embedded VFS inode or NULL if the
+ * allocation failed.
+ */
+static struct inode *logfs_alloc_inode(struct super_block *sb)
+{
+	struct logfs_inode *li = kmem_cache_alloc(logfs_inode_cache, GFP_KERNEL);
+	struct inode *inode;
+
+	if (li == NULL)
+		return NULL;
+
+	inode = &li->vfs_inode;
+	logfs_init_inode(inode);
+	return inode;
+}
+
+/*
+ * In logfs inodes are written to an inode file.  The inode file, like any
+ * other file, is managed with an inode.  The inode file's inode, aka master
+ * inode, requires special handling in several respects.  First, it cannot be
+ * written to the inode file, so it is stored in the journal instead.
+ *
+ * Secondly, this inode cannot be written back and destroyed before all other
+ * inodes have been written.  The ordering is important.  Linux' VFS is happily
+ * unaware of the ordering constraint and would ordinarily destroy the master
+ * inode at umount time while other inodes are still in use and dirty.  Not
+ * good.
+ *
+ * So logfs makes sure the master inode is not written until all other inodes
+ * have been destroyed.  Sadly, this method has another side-effect.  The VFS
+ * will notice one remaining inode and print a frightening warning message.
+ * Worse, it is impossible to judge whether such a warning was caused by the
+ * master inode or any other inodes have leaked as well.
+ *
+ * Our attempt of solving this is with logfs_new_meta_inode() below.  Its
+ * purpose is to create a new inode that will not trigger the warning if such
+ * an inode is still in use.  An ugly hack, no doubt.  Suggestions for
+ * improvement are welcome.
+ */
+/*
+ * Create an inode with number @ino directly through logfs_alloc_inode()
+ * and set up its address space by hand.
+ *
+ * Returns the inode or ERR_PTR(-ENOMEM).
+ */
+struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino)
+{
+	struct address_space *mapping;
+	struct inode *inode = logfs_alloc_inode(sb);
+
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	inode->i_mode = 0;
+	inode->i_ino = ino;
+	inode->i_sb = sb;
+
+	/* This is a blatant copy of alloc_inode code.  We'd need alloc_inode
+	 * to be nonstatic, alas. */
+	mapping = &inode->i_data;
+	mapping->a_ops = &logfs_reg_aops;
+	mapping->host = inode;
+	mapping->flags = 0;
+	mapping_set_gfp_mask(mapping, GFP_HIGHUSER);
+	mapping->assoc_mapping = NULL;
+	mapping->backing_dev_info = &default_backing_dev_info;
+	inode->i_mapping = mapping;
+
+	return inode;
+}
+
+/*
+ * Time is stored as nanoseconds since the epoch.  Convert such a
+ * big-endian on-medium value to a struct timespec.
+ */
+static struct timespec be64_to_timespec(__be64 betime)
+{
+	u64 nsec = be64_to_cpu(betime);
+
+	return ns_to_timespec(nsec);
+}
+
+/*
+ * Convert a struct timespec to big-endian nanoseconds, the inverse of
+ * be64_to_timespec().
+ */
+static __be64 timespec_to_be64(struct timespec tsp)
+{
+	u64 nsec = (u64)tsp.tv_sec * NSEC_PER_SEC;
+
+	nsec += tsp.tv_nsec;
+	return cpu_to_be64(nsec);
+}
+
+/*
+ * Fill @inode and its logfs part from the on-medium inode @di, converting
+ * all fields from big-endian.
+ *
+ * For character, block and FIFO inodes di_data[0] is taken as i_rdev.  For
+ * all other types di_data is copied to li_data: as raw bytes if the inode
+ * has LOGFS_IF_EMBEDDED set, converted from big-endian otherwise.
+ */
+static void logfs_disk_to_inode(struct logfs_disk_inode *di, struct inode*inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	int i;
+
+	inode->i_mode	= be16_to_cpu(di->di_mode);
+	li->li_height	= di->di_height;
+	li->li_flags	= be32_to_cpu(di->di_flags);
+	inode->i_uid	= be32_to_cpu(di->di_uid);
+	inode->i_gid	= be32_to_cpu(di->di_gid);
+	inode->i_size	= be64_to_cpu(di->di_size);
+	/* sets li_used_bytes and derives i_blocks from it */
+	logfs_set_blocks(inode, be64_to_cpu(di->di_used_bytes));
+	inode->i_ctime	= be64_to_timespec(di->di_ctime);
+	inode->i_mtime	= be64_to_timespec(di->di_mtime);
+	inode->i_nlink	= be32_to_cpu(di->di_refcount);
+	inode->i_generation = be32_to_cpu(di->di_generation);
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFCHR: /* fall through */
+	case S_IFBLK: /* fall through */
+	case S_IFIFO:
+		inode->i_rdev = be64_to_cpu(di->di_data[0]);
+		break;
+	default:
+		/* li_flags must be read before this */
+		if (li->li_flags & LOGFS_IF_EMBEDDED)
+			memcpy(li->li_data, di->di_data,
+					LOGFS_EMBEDDED_FIELDS * sizeof(u64));
+		else
+			for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
+				li->li_data[i] = be64_to_cpu(di->di_data[i]);
+		break;
+	}
+}
+
+/*
+ * Fill the on-medium inode @di from @inode and its logfs part, converting
+ * all fields to big-endian.
+ *
+ * For character, block and FIFO inodes only di_data[0] carries information
+ * (i_rdev).  For all other types li_data is copied to di_data: as raw bytes
+ * if the inode has LOGFS_IF_EMBEDDED set, converted to big-endian otherwise.
+ */
+static void logfs_inode_to_disk(struct inode *inode, struct logfs_disk_inode*di)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	int i;
+
+	/*
+	 * Callers pass an uninitialised on-stack structure.  Clear it so that
+	 * anything not assigned below, e.g. di_data[1..] of device inodes,
+	 * is written as zeroes rather than as stale stack contents.
+	 */
+	memset(di, 0, sizeof(*di));
+
+	di->di_mode	= cpu_to_be16(inode->i_mode);
+	di->di_height	= li->li_height;
+	di->di_pad	= 0;
+	di->di_flags	= cpu_to_be32(li->li_flags);
+	di->di_uid	= cpu_to_be32(inode->i_uid);
+	di->di_gid	= cpu_to_be32(inode->i_gid);
+	di->di_size	= cpu_to_be64(i_size_read(inode));
+	di->di_used_bytes = cpu_to_be64(li->li_used_bytes);
+	di->di_ctime	= timespec_to_be64(inode->i_ctime);
+	di->di_mtime	= timespec_to_be64(inode->i_mtime);
+	di->di_refcount	= cpu_to_be32(inode->i_nlink);
+	di->di_generation = cpu_to_be32(inode->i_generation);
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFCHR: /* fall through */
+	case S_IFBLK: /* fall through */
+	case S_IFIFO:
+		di->di_data[0] = cpu_to_be64(inode->i_rdev);
+		break;
+	default:
+		if (li->li_flags & LOGFS_IF_EMBEDDED)
+			memcpy(di->di_data, li->li_data,
+					LOGFS_EMBEDDED_FIELDS * sizeof(u64));
+		else
+			for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
+				di->di_data[i] = cpu_to_be64(li->li_data[i]);
+		break;
+	}
+}
+
+#define VALID_MASK (LOGFS_IF_VALID | LOGFS_IF_INVALID)
+/*
+ * Read the on-medium inode of @inode from the master inode's file into
+ * @di and verify that it is marked valid and not invalid.
+ *
+ * Returns 0 on success, the error from logfs_inode_read(), or -EIO if the
+ * flags do not check out.
+ */
+static int logfs_read_disk_inode(struct logfs_disk_inode *di,
+		struct inode *inode)
+{
+	struct logfs_super *super = logfs_super(inode->i_sb);
+	ino_t ino = inode->i_ino;
+	u32 flags;
+	int err;
+
+	BUG_ON(!super->s_master_inode);
+	err = logfs_inode_read(super->s_master_inode, di, sizeof(*di), ino);
+	if (err)
+		return err;
+
+	flags = be32_to_cpu(di->di_flags);
+	if ((flags & VALID_MASK) == LOGFS_IF_VALID)
+		return 0;
+
+	/*
+	 * We read wrong data, someone scribbled over it or we
+	 * have a bug.  Worth mentioning in the logs.
+	 */
+	printk(KERN_WARNING"LOGFS: read corrupt inode #%lx\n", ino);
+	WARN_ON(1);
+	return -EIO;
+}
+
+/*
+ * Install the operation tables matching the file type of @inode.
+ * Directories and regular files get inode and file operations, symlinks
+ * only inode operations; all three use logfs_reg_aops for their mapping.
+ * Other file types are left alone.
+ */
+static void logfs_inode_setops(struct inode *inode)
+{
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFDIR:
+		inode->i_op = &logfs_dir_iops;
+		inode->i_fop = &logfs_dir_fops;
+		break;
+	case S_IFREG:
+		inode->i_op = &logfs_reg_iops;
+		inode->i_fop = &logfs_reg_fops;
+		break;
+	case S_IFLNK:
+		inode->i_op = &logfs_symlink_iops;
+		break;
+	default:
+		return;
+	}
+	inode->i_mapping->a_ops = &logfs_reg_aops;
+}
+
+/*
+ * Read @inode from the medium, convert it to its in-memory form and set
+ * up its operations.  Returns 0 or the error from logfs_read_disk_inode().
+ */
+static int __logfs_read_inode(struct inode *inode)
+{
+	struct logfs_disk_inode di;
+	int err = logfs_read_disk_inode(&di, inode);
+
+	if (!err) {
+		logfs_disk_to_inode(&di, inode);
+		logfs_inode_setops(inode);
+	}
+	return err;
+}
+
+/*
+ * Get inode @ino, reading it from the medium if it is not cached yet.
+ * Must not be used for the master inode.
+ *
+ * Returns the inode, ERR_PTR(-ENOMEM) if iget_locked() fails, or an
+ * ERR_PTR() carrying the error from __logfs_read_inode().
+ */
+struct inode *dhowells_iget(struct super_block *sb, ino_t ino)
+{
+	struct inode *inode;
+	int err;
+
+	BUG_ON(ino == LOGFS_INO_MASTER);
+
+	inode = iget_locked(sb, ino);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	if (inode->i_state & I_NEW) {
+		err = __logfs_read_inode(inode);
+		if (err) {
+			iget_failed(inode);
+			return ERR_PTR(err);
+		}
+		unlock_new_inode(inode);
+	}
+	return inode;
+}
+
+/*
+ * Clear I_DIRTY_SYNC in the inode's state under inode_lock.
+ *
+ * This function should move to fs/fs-writeback.c or similar.
+ */
+static void clear_inode_dirty_sync(struct inode *inode)
+{
+	spin_lock(&inode_lock);
+	inode->i_state &= ~I_DIRTY_SYNC;
+	spin_unlock(&inode_lock);
+}
+
+/*
+ * Derive i_blocks from li_used_bytes: the used bytes rounded up to a
+ * multiple of 512 and divided by 512, or ULONG_MAX if the range check
+ * fails.
+ *
+ * NOTE(review): the range check shifts by s_blocksize_bits while the
+ * value stored is in units of 512 bytes - confirm this is intended.
+ */
+static void __logfs_set_blocks(struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	struct super_block *sb = inode->i_sb;
+
+	if (li->li_used_bytes >> sb->s_blocksize_bits < ULONG_MAX)
+		inode->i_blocks = ALIGN(li->li_used_bytes, 512) >> 9;
+	else
+		inode->i_blocks = ULONG_MAX;
+}
+
+/*
+ * Record @bytes as the inode's used bytes and update i_blocks to match.
+ */
+void logfs_set_blocks(struct inode *inode, u64 bytes)
+{
+	logfs_inode(inode)->li_used_bytes = bytes;
+	__logfs_set_blocks(inode);
+}
+
+/*
+ * btree_visitor() callback: adjust the used bytes of the inode passed in
+ * @_inode by the difference between the shadow's new and old length.
+ * The third argument is unused.
+ */
+static void account_shadow(void *_shadow, long _inode, u64 ignore)
+{
+	struct logfs_inode *li = logfs_inode((struct inode *)_inode);
+	struct logfs_shadow *shadow = _shadow;
+
+	li->li_used_bytes += shadow->new_len - shadow->old_len;
+}
+
+/*
+ * Write @inode to the master inode's file, using @di as scratch space for
+ * the on-medium representation.
+ *
+ * The inode's pending transaction is detached and passed along with the
+ * write.  Before converting, the shadow trees are walked to bring
+ * li_used_bytes and i_blocks up to date.  I_DIRTY_SYNC is cleared before
+ * the write is issued.
+ *
+ * Returns the result of logfs_inode_write().
+ */
+static int logfs_write_disk_inode(struct logfs_disk_inode *di,
+		struct inode *inode, long flags)
+{
+	struct logfs_super *super = logfs_super(inode->i_sb);
+	struct logfs_inode *li = logfs_inode(inode);
+	struct logfs_transaction *ta = li->li_transaction;
+
+	/* FIXME: should we take inode->i_mutex? */
+	li->li_transaction = NULL;
+
+	btree_visitor(&li->li_shadow_tree.new, (long)inode, account_shadow);
+	btree_visitor(&li->li_shadow_tree.old, (long)inode, account_shadow);
+	__logfs_set_blocks(inode);
+	BUG_ON((s64)li->li_used_bytes < 0);
+
+	logfs_inode_to_disk(inode, di);
+	clear_inode_dirty_sync(inode);
+	return logfs_inode_write(super->s_master_inode, di, sizeof(*di),
+			inode->i_ino, flags, ta, &li->li_shadow_tree);
+}
+
+/*
+ * Write @inode to the medium with the given write @flags.  Must not be
+ * called for the master inode.  Returns the result of
+ * logfs_write_disk_inode().
+ */
+int __logfs_write_inode(struct inode *inode, long flags)
+{
+	/* scratch buffer, filled by logfs_inode_to_disk() */
+	struct logfs_disk_inode di;
+
+	BUG_ON(inode->i_ino == LOGFS_INO_MASTER);
+	return logfs_write_disk_inode(&di, inode, flags);
+}
+
+/*
+ * ->write_inode() method.  Writes the inode with WF_LOCK, plus WF_SYNC if
+ * @do_sync is set.  A failed write triggers LOGFS_BUG_ON().
+ *
+ * Returns 0 for stillborn inodes, otherwise the result of
+ * __logfs_write_inode().
+ */
+static int logfs_write_inode(struct inode *inode, int do_sync)
+{
+	long flags;
+	int err;
+
+	/* Can only happen if creat() failed.  Safe to skip. */
+	if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN)
+		return 0;
+
+	flags = WF_LOCK;
+	if (do_sync)
+		flags |= WF_SYNC;
+
+	err = __logfs_write_inode(inode, flags);
+	LOGFS_BUG_ON(err, inode->i_sb);
+	return err;
+}
+
+/*
+ * Truncate @inode to size 0 with logfs_truncate() and then drop all of
+ * its pages from the page cache.
+ */
+static void logfs_truncate_inode(struct inode *inode)
+{
+	logfs_truncate(inode, 0);
+	truncate_inode_pages(&inode->i_data, 0);
+}
+
+/*
+ * ->delete_inode() method.
+ *
+ * ZOMBIE inodes have already been deleted before and should remain dead,
+ * if it weren't for valid checking.  No need to kill them again here.
+ * Any other inode is marked ZOMBIE, truncated if it has a size and then
+ * deleted from the master inode's file.  clear_inode() is called in
+ * either case.
+ */
+static void logfs_delete_inode(struct inode *inode)
+{
+	struct logfs_super *super = logfs_super(inode->i_sb);
+	struct logfs_inode *li = logfs_inode(inode);
+
+	if (li->li_flags & LOGFS_IF_ZOMBIE)
+		goto out;
+
+	li->li_flags |= LOGFS_IF_ZOMBIE;
+	if (i_size_read(inode) > 0)
+		logfs_truncate_inode(inode);
+	logfs_delete(super->s_master_inode, inode->i_ino,
+			&li->li_shadow_tree, NULL);
+out:
+	clear_inode(inode);
+}
+
+/*
+ * Return the logfs inode containing @inode to the inode cache.  A NULL
+ * @inode is tolerated.
+ */
+void __logfs_destroy_inode(struct inode *inode)
+{
+	if (unlikely(!inode))
+		return;
+
+	kmem_cache_free(logfs_inode_cache, logfs_inode(inode));
+}
+
+/*
+ * We need to remember which inodes are currently being dropped.  They
+ * would deadlock the cleaner, if it were to iget() them.  So
+ * logfs_drop_inode() adds them to super->s_freeing_list,
+ * logfs_destroy_inode() removes them again and logfs_iget() checks the
+ * list.
+ *
+ * This is the ->destroy_inode() method: it takes the inode off the freeing
+ * list under inode_lock and returns it to the inode cache.  The inode is
+ * expected to be on a list; an empty li_freeing_list is a BUG.
+ */
+static void logfs_destroy_inode(struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	/* NOTE(review): this check is made without holding inode_lock */
+	BUG_ON(list_empty(&li->li_freeing_list));
+	spin_lock(&inode_lock);
+	list_del(&li->li_freeing_list);
+	spin_unlock(&inode_lock);
+	kmem_cache_free(logfs_inode_cache, li);
+}
+
+/*
+ * ->drop_inode() method.  Puts the inode on super->s_freeing_list, where
+ * logfs_iget() can find it, and then continues with generic_drop_inode().
+ *
+ * called with inode_lock held
+ */
+static void logfs_drop_inode(struct inode *inode)
+{
+	struct logfs_super *super = logfs_super(inode->i_sb);
+	struct logfs_inode *li = logfs_inode(inode);
+
+	list_move(&li->li_freeing_list, &super->s_freeing_list);
+	generic_drop_inode(inode);
+}
+
+/*
+ * Hand out the next inode number: the current value of s_last_ino, which
+ * is then incremented.  Serialised by s_ino_lock.
+ */
+static u64 logfs_get_ino(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	u64 next_ino;
+
+	/*
+	 * FIXME: ino allocation should work in two modes:
+	 * o nonsparse - ifile is mostly occupied, just append
+	 * o sparse - ifile has lots of holes, fill them up
+	 *
+	 * SEEK_HOLE would obviously help a lot here.
+	 */
+	spin_lock(&super->s_ino_lock);
+	next_ino = super->s_last_ino++;
+	spin_unlock(&super->s_ino_lock);
+
+	return next_ino;
+}
+
+/*
+ * Create a new inode with file mode @mode inside directory @dir.
+ *
+ * The inode gets a fresh inode number, inherits the LOGFS_FL_INHERITED
+ * flags of @dir and is owned by the current fsuid.  Its group is the
+ * current fsgid, unless @dir is setgid: then the group of @dir is used
+ * and new directories become setgid as well.  The inode is hashed before
+ * it is returned.
+ *
+ * Returns the inode or ERR_PTR(-ENOMEM).
+ */
+struct inode *logfs_new_inode(struct inode *dir, int mode)
+{
+	struct super_block *sb = dir->i_sb;
+	struct inode *inode = new_inode(sb);
+
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	logfs_init_inode(inode);
+
+	/* inherit parent flags */
+	logfs_inode(inode)->li_flags |=
+		logfs_inode(dir)->li_flags & LOGFS_FL_INHERITED;
+
+	inode->i_mode = mode;
+	inode->i_ino = logfs_get_ino(sb);
+	inode->i_uid = current->fsuid;
+
+	if (dir->i_mode & S_ISGID) {
+		inode->i_gid = dir->i_gid;
+		if (S_ISDIR(mode))
+			inode->i_mode |= S_ISGID;
+	} else {
+		inode->i_gid = current->fsgid;
+	}
+
+	logfs_inode_setops(inode);
+	insert_inode_hash(inode);
+
+	return inode;
+}
+
+/*
+ * Constructor for objects in logfs_inode_cache.  Clears the logfs flags,
+ * used bytes and embedded data and runs inode_init_once() on the embedded
+ * VFS inode.  The argument list depends on the kernel version.
+ */
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 23)
+static void logfs_init_once(struct kmem_cache *cachep, void *_li)
+#else
+static void logfs_init_once(void *_li, struct kmem_cache *cachep,
+		unsigned long flags)
+#endif
+{
+	struct logfs_inode *li = _li;
+	int i;
+
+	li->li_flags = 0;
+	li->li_used_bytes = 0;
+	for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
+		li->li_data[i] = 0;
+	inode_init_once(&li->vfs_inode);
+}
+
+/*
+ * ->sync_fs() method.  Flushes dirty data with logfs_flush_dirty() and
+ * then calls the device's sync operation.  @wait is not looked at and the
+ * function always returns 0.
+ */
+static int logfs_sync_fs(struct super_block *sb, int wait)
+{
+	logfs_flush_dirty(sb, 1);
+	logfs_super(sb)->s_devops->sync(sb);
+	return 0;
+}
+
+/*
+ * Superblock operations of logfs.  drop_inode and destroy_inode are
+ * overloaded to maintain s_freeing_list, see logfs_drop_inode() and
+ * logfs_destroy_inode().
+ */
+const struct super_operations logfs_super_operations = {
+	.alloc_inode	= logfs_alloc_inode,
+	.delete_inode	= logfs_delete_inode,
+	.destroy_inode	= logfs_destroy_inode,
+	.drop_inode	= logfs_drop_inode,
+	.write_inode	= logfs_write_inode,
+	.statfs		= logfs_statfs,
+	.sync_fs	= logfs_sync_fs,
+};
+
+/*
+ * Create the slab cache for logfs inodes with logfs_init_once() as
+ * constructor.  Returns 0 on success, -ENOMEM otherwise.
+ */
+int logfs_init_inode_cache(void)
+{
+	logfs_inode_cache = kmem_cache_create("logfs_inode_cache",
+			sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT,
+			logfs_init_once DTOR);
+
+	return logfs_inode_cache ? 0 : -ENOMEM;
+}
+
+/*
+ * Destroy the slab cache created by logfs_init_inode_cache().
+ */
+void logfs_destroy_inode_cache(void)
+{
+	kmem_cache_destroy(logfs_inode_cache);
+}
--- /dev/null	2008-04-02 16:29:12.813336657 +0200
+++ git/fs/logfs/journal.c	2008-04-07 11:53:20.913211255 +0200
@@ -0,0 +1,805 @@
+/*
+ * fs/logfs/journal.c	- journal handling code
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2007 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+
+/*
+ * Mark all retired journal entries with a type below JE_LAST, and the
+ * record of the first entry seen, as unused.
+ */
+static void clear_retired(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int type;
+
+	super->s_first.used = 0;
+	for (type = 0; type < JE_LAST; type++)
+		super->s_retired[type].used = 0;
+}
+
+/*
+ * Mark all speculative journal entries with a type below JE_LAST as
+ * unused.
+ */
+static void clear_speculatives(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int type;
+
+	for (type = 0; type < JE_LAST; type++) {
+		struct logfs_journal_entry *spec = &super->s_speculative[type];
+
+		spec->used = 0;
+	}
+}
+
+/*
+ * Promote speculative journal entries to retired ones.
+ *
+ * A used speculative entry replaces the retired entry of the same type
+ * unless a retired entry with the same or a higher version already
+ * exists.  Afterwards all speculative entries are cleared.
+ */
+static void retire_speculatives(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int type;
+
+	for (type = 0; type < JE_LAST; type++) {
+		struct logfs_journal_entry *spec = &super->s_speculative[type];
+		struct logfs_journal_entry *retired = &super->s_retired[type];
+
+		if (spec->used &&
+		    !(retired->used && (spec->version <= retired->version))) {
+			retired->used = 1;
+			retired->version = spec->version;
+			retired->offset = spec->offset;
+			retired->len = spec->len;
+			retired->datalen = spec->datalen;
+		}
+	}
+	clear_speculatives(sb);
+}
+
+/*
+ * Journal entries are versioned and highest version always wins.  To save
+ * some bytes, the version is only be16 instead of be64.  This means versions
+ * can and regularly will wrap.  However, all versions should be in a strict
+ * sequence and the total number of entries significantly lower than 2^16.
+ *
+ * So we read the first entry, store its version and subtract that from
+ * any version read to normalize them.  Normalized versions should all be
+ * fairly close to zero and we can again easily judge which is the highest
+ * number.
+ */
+/*
+ * Scan journal segment @segno for journal entries.
+ *
+ * The segment is walked in steps of one journal header.  Headers with an
+ * implausible length or type, or with a wrong CRC, are stepped over.  For
+ * every valid entry other than JE_COMMIT the one with the highest
+ * normalized version per type is remembered as speculative.  A JE_COMMIT
+ * entry newer than the retired one retires all speculative entries and
+ * makes this segment the journal area's segment.
+ *
+ * Returns 0, a read error from the device, or -EIO if a normalized version
+ * is further than 1<<14 away from zero.
+ */
+static int scan_segment(struct super_block *sb, u32 segno)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_area *area = super->s_journal_area;
+	struct logfs_journal_header *h = super->s_compressed_je;
+	struct logfs_journal_entry *spec, *retired;
+	u64 ofs, seg_ofs = dev_ofs(sb, segno, 0);
+	u32 h_ofs;
+	s16 len, datalen, type, version;
+	int err;
+
+	for (h_ofs = 0; h_ofs < super->s_segsize; h_ofs += sizeof(*h)) {
+		ofs = seg_ofs + h_ofs;
+		err = super->s_devops->read(sb, ofs, sizeof(*h), h);
+		if (err)
+			return err;
+		/* stop scanning if all 0xff */
+		/* the leading "0 &&" currently disables this early exit */
+		if (0 && !memchr_inv(h, 0xff, sizeof(*h))) /* FIXME */
+			break;
+
+		len = be16_to_cpu(h->h_len);
+		datalen = be16_to_cpu(h->h_datalen);
+		type = be16_to_cpu(h->h_type);
+		version = be16_to_cpu(h->h_version);
+
+		if ((len < 16) || (len > sb->s_blocksize))
+			continue;
+		/*
+		 * NOTE(review): JE_LAST itself is accepted here, while the
+		 * clear/retire loops stop before JE_LAST - confirm the
+		 * arrays indexed by type below have room for it.
+		 */
+		if ((type < JE_FIRST) || (type > JE_LAST))
+			continue;
+
+		/* read the entry again, this time together with its payload */
+		err = super->s_devops->read(sb, ofs, len + sizeof(*h), h);
+		if (err)
+			return err;
+
+		if (h->h_crc != logfs_crc32(h, len, 4))
+			continue;
+
+		/* the first valid entry defines the base for all versions */
+		if (!super->s_first.used) {
+			super->s_first.used = 1;
+			super->s_first.version = version;
+		}
+		version -= super->s_first.version;
+
+		if (abs(version) > 1<<14)
+			return -EIO;
+
+		/* together with the loop increment this advances by len */
+		h_ofs += len - sizeof(*h);
+		spec = &super->s_speculative[type];
+		retired = &super->s_retired[type];
+		switch (type) {
+		default:
+			if (spec->used && (version <= spec->version))
+				break;
+			/* store speculative entry */
+			spec->used = 1;
+			spec->version = version;
+			spec->offset = ofs;
+			spec->len = len;
+			spec->datalen = datalen;
+			break;
+		case JE_COMMIT:
+			if (retired->used && (version <= retired->version))
+				break;
+			/* retire speculative entries */
+			retired->used = 1;
+			retired->version = version;
+			retired->offset = ofs;
+			retired->len = len;
+			retired->datalen = datalen;
+			retire_speculatives(sb);
+			/* and set up journal area */
+			area->a_segno = segno;
+			/*
+			 * On every mount we switch to a new segment instead
+			 * of writing further in the current one.  While safe
+			 * this method is quite wasteful and may get changed
+			 * sooner or later.
+			 */
+			area->a_is_open = 0;
+			break;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Reset the speculative and retired journal entries and scan every
+ * journal segment that is in use (non-zero segment number).
+ *
+ * Returns 0 or the first error returned by scan_segment().
+ */
+static int logfs_scan_journal(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int slot, err;
+
+	clear_speculatives(sb);
+	clear_retired(sb);
+	journal_for_each(slot) {
+		u32 segno = super->s_journal_seg[slot];
+
+		if (segno) {
+			err = scan_segment(sb, segno);
+			if (err)
+				return err;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Take the version of the commit entry @h as the last journal version
+ * written.
+ */
+static void read_commit(struct logfs_super *super,
+		struct logfs_journal_header *h)
+{
+	super->s_last_version = be16_to_cpu(h->h_version);
+}
+
+/*
+ * Compute s_free_bytes.
+ *
+ * Of all segments, the superblock segment, the bad segments and the
+ * journal segments in use are not available.  Every remaining segment
+ * contributes its size minus LOGFS_SEGMENT_RESERVE.  From that total the
+ * bytes currently in use are subtracted.
+ */
+static void logfs_calc_free(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	u64 no_segs = super->s_no_segs;
+	s64 free;
+	int i;
+
+	/* superblock segment */
+	no_segs -= 1;
+	/* bad blocks */
+	no_segs -= super->s_bad_segments;
+	/* journal */
+	journal_for_each(i)
+		if (super->s_journal_seg[i])
+			no_segs--;
+
+	free = no_segs * (super->s_segsize - LOGFS_SEGMENT_RESERVE);
+	free -= super->s_used_bytes;
+
+	/* the extra reserve below is compiled out */
+#if 0
+	/* reserve some extra to speed up GC for full filesystems */
+	free -= 10 * (super->s_size >> 10);
+	/* in case this reserve exceeds currently free space */
+	free = max(free, 0LL);
+#endif
+	super->s_free_bytes = free;
+}
+
+/*
+ * Enter segment 0 and every journal segment in use into the tree of
+ * reserved segments.  A failing insert is a BUG.
+ */
+static void reserve_sb_and_journal(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct btree_head *reserved = &super->s_reserved_segments;
+	int slot, err;
+
+	err = btree_insert(reserved, 0, (void *)1);
+	BUG_ON(err);
+
+	journal_for_each(slot) {
+		if (super->s_journal_seg[slot]) {
+			err = btree_insert(reserved,
+					super->s_journal_seg[slot], (void *)1);
+			BUG_ON(err);
+		}
+	}
+}
+
+/*
+ * Copy the fields of the dynsb journal entry @dynsb into the in-memory
+ * superblock, converting them from big-endian.
+ */
+static void read_dynsb(struct super_block *sb,
+		struct logfs_je_dynsb *dynsb)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	super->s_gec		= be64_to_cpu(dynsb->ds_gec);
+	super->s_sweeper	= be64_to_cpu(dynsb->ds_sweeper);
+	super->s_victim_ino	= be64_to_cpu(dynsb->ds_victim_ino);
+	super->s_rename_dir	= be64_to_cpu(dynsb->ds_rename_dir);
+	super->s_rename_pos	= be64_to_cpu(dynsb->ds_rename_pos);
+	super->s_used_bytes	= be64_to_cpu(dynsb->ds_used_bytes);
+}
+
+/*
+ * Set up the master inode from the anchor journal entry @da: size, used
+ * bytes and embedded data are taken from the entry, the flags are set to
+ * LOGFS_IF_VALID.  The last inode number handed out is restored as well.
+ */
+static void read_anchor(struct super_block *sb,
+		struct logfs_je_anchor *da)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct inode *inode = super->s_master_inode;
+	struct logfs_inode *li = logfs_inode(inode);
+	int i;
+
+	super->s_last_ino = be64_to_cpu(da->da_last_ino);
+	li->li_flags	= LOGFS_IF_VALID;
+	i_size_write(inode, be64_to_cpu(da->da_size));
+	/* NOTE(review): i_blocks is not updated to match here - confirm */
+	li->li_used_bytes = be64_to_cpu(da->da_used_bytes);
+
+	for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
+		li->li_data[i] = be64_to_cpu(da->da_data[i]);
+}
+
+/*
+ * Restore the erase counts of the journal segments from the journal
+ * entry @ec.
+ */
+static void read_erasecount(struct super_block *sb,
+		struct logfs_je_journal_ec *ec)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int slot;
+
+	journal_for_each(slot) {
+		super->s_journal_ec[slot] = be32_to_cpu(ec->ec[slot]);
+	}
+}
+
+/*
+ * Walk the bad block array, one block of big-endian 32bit segment numbers,
+ * and enter every non-zero entry into the tree of reserved segments.
+ * s_bad_segments ends up as the number of entries found.  A failing
+ * insert is a BUG.
+ */
+static void read_badsegments(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct btree_head *reserved = &super->s_reserved_segments;
+	__be32 *bad = super->s_bb_array;
+	unsigned long idx, entries = sb->s_blocksize >> 2;
+	int err;
+
+	super->s_bad_segments = 0;
+	for (idx = 0; idx < entries; idx++) {
+		if (bad[idx] == 0)
+			continue;
+		err = btree_insert(reserved, be32_to_cpu(bad[idx]), (void *)1);
+		BUG_ON(err);
+		super->s_bad_segments++;
+	}
+}
+
+/*
+ * Restore used bytes and segment number of every area from the journal
+ * entry @a.  Areas with a non-zero segment number are marked open.
+ */
+static void read_areas(struct super_block *sb, struct logfs_je_areas *a)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int idx;
+
+	for_each_area(idx) {
+		struct logfs_area *area = super->s_area[idx];
+
+		area->a_used_bytes = be32_to_cpu(a->used_bytes[idx]);
+		area->a_segno = be32_to_cpu(a->segno[idx]);
+		if (area->a_segno)
+			area->a_is_open = 1;
+	}
+}
+
+/*
+ * Add the free segments recorded in the journal entry @f to the candidate
+ * lists.  @len is the size of the entry's data in bytes; it is divided by
+ * the size of one record to get the number of records.
+ */
+static void read_free_segments(struct super_block *sb,
+		struct logfs_je_free_segments *f, u16 len)
+{
+	u32 count = len / sizeof(struct logfs_je_free_segments);
+
+	/* NOTE(review): an error return (e.g. -ENOMEM) is dropped here */
+	add_free_segments_from_journal(sb, f, count);
+}
+
+/*
+ * Return the payload of the journal entry at @from.
+ *
+ * Uncompressed entries are returned in place, i.e. a pointer just behind
+ * the header in @from.  Compressed entries are uncompressed into @to and
+ * @to is returned.  A failing decompression is a BUG.
+ */
+static void *unpack(void *from, void *to)
+{
+	struct logfs_journal_header *h = from;
+	void *data = from + sizeof(struct logfs_journal_header);
+	int err;
+	size_t inlen, outlen;
+
+	if (h->h_compr == COMPR_NONE)
+		return data;
+
+	/* h_len includes the header, h_datalen is the uncompressed size */
+	inlen = be16_to_cpu(h->h_len) - sizeof(*h);
+	outlen = be16_to_cpu(h->h_datalen);
+	err = logfs_uncompress(data, to, inlen, outlen);
+	/* NOTE(review): this turns corrupt on-medium data into a BUG */
+	BUG_ON(err);
+	return to;
+}
+
+/*
+ * Journal entries come in groups of 16.  The first group contains unique
+ * entries, the second group contains the write buffers for all levels.
+ * As of now, there are only two groups.
+ * The outer switch statement deals with groups (high nibble), the inner
+ * one with unique entries
+ *
+ * Every retired entry found by the journal scan is read from the device
+ * and applied.  The commit, dynsb and anchor entries are mandatory; all
+ * others are skipped when missing.
+ *
+ * Returns 0, -EIO for a missing mandatory entry, a write buffer entry on
+ * a device with a writesize of at most 1 or an entry of an unknown group,
+ * or the error from reading the device.
+ */
+/* FIXME: make sure there are enough per-area objects in journal */
+static int logfs_read_journal(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	void *block = super->s_compressed_je;
+	void *scratch = super->s_je;
+	int i, err, level;
+	struct logfs_area *area;
+
+	for (i = 0; i < JE_LAST; i++) {
+		struct logfs_journal_entry *je = super->s_retired + i;
+		if (!super->s_retired[i].used) {
+			switch (i) {
+			case JE_COMMIT:
+			case JE_DYNSB:
+			case JE_ANCHOR:
+				printk(KERN_WARNING "LogFS: Missing journal "
+						"entry %x?\n", i);
+				return -EIO;
+			default:
+				continue;
+			}
+		}
+		/* always reads a full block, whatever the entry's length */
+		err = super->s_devops->read(sb, je->offset, sb->s_blocksize, block);
+		if (err)
+			return err;
+
+		switch (i & ~0xf) {
+		case JEG_BASE:
+			switch (i) {
+			case JE_COMMIT:
+				/* just reads the latest version number */
+				read_commit(super, block);
+				break;
+			case JE_DYNSB:
+				read_dynsb(sb, unpack(block, scratch));
+				break;
+			case JE_ANCHOR:
+				read_anchor(sb, unpack(block, scratch));
+				break;
+			case JE_ERASECOUNT:
+				read_erasecount(sb, unpack(block, scratch));
+				break;
+			case JE_BADSEGMENTS:
+				/*
+				 * NOTE(review): an uncompressed entry is
+				 * returned in place by unpack() and never
+				 * copied to s_bb_array - confirm this entry
+				 * is always written compressed.
+				 */
+				unpack(block, super->s_bb_array);
+				read_badsegments(sb);
+				break;
+			case JE_AREAS:
+				read_areas(sb, unpack(block, scratch));
+				break;
+			case JE_FREESEGS:
+				read_free_segments(sb, unpack(block, scratch),
+						je->datalen);
+				break;
+			default:
+				/*
+				 * Any unknown entries in this group are
+				 * considered optional.
+				 */
+				break;
+			}
+			break;
+		case JEG_WBUF:
+			if (super->s_writesize <= 1)
+				return -EIO;
+			/* low nibble selects the area the buffer belongs to */
+			level = i & 0xf;
+			area = super->s_area[level];
+			unpack(block, area->a_wbuf);
+			break;
+		default:
+			LOGFS_BUG(sb);
+			return -EIO;
+		}
+
+	}
+	return 0;
+}
+
+/*
+ * Advance the journal area to its next segment: locate the slot holding the
+ * current segment, then step (with wrap-around) to the next non-zero slot.
+ */
+static void journal_get_free_segment(struct logfs_area *area)
+{
+	struct logfs_super *super = logfs_super(area->a_sb);
+	int i, next;
+
+	journal_for_each(i) {
+		if (super->s_journal_seg[i] != area->a_segno)
+			continue;
+
+		next = i;
+		do {
+			if (++next == LOGFS_JOURNAL_SEGS)
+				next = 0;
+		} while (super->s_journal_seg[next] == 0);
+
+		super->s_journal_ec[next]++;
+		area->a_segno = super->s_journal_seg[next];
+		return;
+	}
+	BUG();	/* the current segment is not in s_journal_seg[] */
+}
+
+static void journal_get_erase_count(struct logfs_area *area)
+{
+	/* Intentionally empty: journal erase counts live in s_journal_ec[]
+	 * and are incremented in journal_get_free_segment(). */
+}
+
+static int journal_erase_segment(struct logfs_area *area)
+{	/* erase the segment the area currently points at */
+	return logfs_erase_segment(area->a_sb, area->a_segno);
+}
+
+static void journal_finish_area(struct logfs_area *area)
+{	/* mark the area closed and reset its fill level */
+	area->a_used_bytes = 0;
+	area->a_is_open = 0;
+}
+
+/* Fill in the journal header at @h (bumping s_last_version) and return @len.
+ * The crc is written last, once every other header field is in place. */
+static size_t __logfs_write_header(struct logfs_super *super,
+		struct logfs_journal_header *h, size_t len, size_t datalen,
+		u16 type, u8 compr)
+{
+	memcpy(h->h_pad, "HAT", 3);
+	h->h_compr = compr;
+	h->h_datalen = cpu_to_be16(datalen);
+	h->h_version = cpu_to_be16(++super->s_last_version);
+	h->h_type = cpu_to_be16(type);
+	h->h_len = cpu_to_be16(len);
+	h->h_crc = logfs_crc32(h, len, 4);
+	return len;
+}
+
+/* Header for an uncompressed entry carrying @datalen payload bytes. */
+static size_t logfs_write_header(struct logfs_super *super,
+		struct logfs_journal_header *h, size_t datalen, u16 type)
+{
+	return __logfs_write_header(super, h, datalen + sizeof(*h), datalen,
+			type, COMPR_NONE);
+}
+
+static void *logfs_write_bb(struct super_block *sb, void *h,
+		u16 *type, size_t *len)
+{	/* journal writer: one block of s_bb_array; scratch @h is unused */
+	*len = sb->s_blocksize;
+	*type = JE_BADSEGMENTS;
+	return logfs_super(sb)->s_bb_array;
+}
+
+static inline size_t logfs_journal_erasecount_size(struct logfs_super *super)
+{	/* one __be32 per journal segment; @super is not used */
+	return LOGFS_JOURNAL_SEGS * sizeof(__be32);
+}
+
+static void *logfs_write_erasecount(struct super_block *sb, void *_ec,
+		u16 *type, size_t *len)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_je_journal_ec *out = _ec;
+	int seg;
+	/* journal writer: big-endian copy of the s_journal_ec[] counters */
+	journal_for_each(seg)
+		out->ec[seg] = cpu_to_be32(super->s_journal_ec[seg]);
+	*len = logfs_journal_erasecount_size(super);
+	*type = JE_ERASECOUNT;
+	return out;
+}
+
+/* Journal writer: the write buffer of the area selected by s_sum_index. */
+static void *logfs_write_wbuf(struct super_block *sb, void *h,
+		u16 *type, size_t *len)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	*len = super->s_writesize;
+	*type = JEG_WBUF + super->s_sum_index;
+	return super->s_area[super->s_sum_index]->a_wbuf;
+}
+
+/* btree visitor: account one shadow entry, then return it to s_block_pool.
+ * @_sb is the super_block pointer passed through the visitor's long. */
+static void account_shadow(void *_shadow, long _sb, u64 ignore)
+{
+	struct super_block *sb = (void *)_sb;
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_inode *master = logfs_inode(super->s_master_inode);
+	struct logfs_shadow *shadow = _shadow;
+
+	/* the new copy consumes space */
+	super->s_dirty_used_bytes -= shadow->new_len;
+	super->s_used_bytes	  += shadow->new_len;
+	super->s_free_bytes	  -= shadow->new_len;
+	/* the old copy is released */
+	super->s_dirty_free_bytes -= shadow->old_len;
+	super->s_used_bytes	  -= shadow->old_len;
+	super->s_free_bytes	  += shadow->old_len;
+	if (shadow->ino == LOGFS_INO_MASTER)
+		master->li_used_bytes += shadow->new_len - shadow->old_len;
+	mempool_free(shadow, super->s_block_pool);
+}
+
+static void *__logfs_write_anchor(struct super_block *sb, void *_da,
+		u16 *type, size_t *len)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct inode *master = super->s_master_inode;
+	struct logfs_inode *li = logfs_inode(master);
+	struct logfs_je_anchor *anchor = _da;
+	int n;
+
+	/* account (and empty) the master inode's shadow trees, then snapshot */
+	btree_grim_visitor(&li->li_shadow_tree.new, (long)sb, account_shadow);
+	btree_grim_visitor(&li->li_shadow_tree.old, (long)sb, account_shadow);
+	BUG_ON((s64)li->li_used_bytes < 0);
+	*len = sizeof(*anchor);
+	*type = JE_ANCHOR;
+	for (n = 0; n < LOGFS_EMBEDDED_FIELDS; n++)
+		anchor->da_data[n] = cpu_to_be64(li->li_data[n]);
+	anchor->da_used_bytes = cpu_to_be64(li->li_used_bytes);
+	anchor->da_size = cpu_to_be64(i_size_read(master));
+	anchor->da_last_ino = cpu_to_be64(super->s_last_ino);
+	return anchor;
+}
+
+static void *logfs_write_dynsb(struct super_block *sb, void *_dynsb,
+		u16 *type, size_t *len)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_je_dynsb *ds = _dynsb;
+	/* journal writer: big-endian snapshot of six superblock counters */
+	*len = sizeof(*ds);
+	*type = JE_DYNSB;
+	ds->ds_used_bytes = cpu_to_be64(super->s_used_bytes);
+	ds->ds_rename_pos = cpu_to_be64(super->s_rename_pos);
+	ds->ds_rename_dir = cpu_to_be64(super->s_rename_dir);
+	ds->ds_victim_ino = cpu_to_be64(super->s_victim_ino);
+	ds->ds_sweeper = cpu_to_be64(super->s_sweeper);
+	ds->ds_gec = cpu_to_be64(super->s_gec);
+	return ds;
+}
+
+static void *logfs_write_areas(struct super_block *sb, void *_a,
+		u16 *type, size_t *len)
+{
+	struct logfs_je_areas *areas = _a;
+	struct logfs_area *area;
+	int i;
+
+	/* FIXME: have all 16 areas */
+	for (i = 0; i < 16; i++) {
+		areas->segno[i] = 0;
+		areas->used_bytes[i] = 0;
+	}
+	for_each_area(i) {
+		area = logfs_super(sb)->s_area[i];
+		areas->segno[i] = cpu_to_be32(area->a_segno);
+		areas->used_bytes[i] = cpu_to_be32(area->a_used_bytes);
+	}
+	*len = sizeof(*areas);
+	*type = JE_AREAS;
+	return areas;
+}
+
+/* Journal writer: at most MAX_CACHED_SEGS (segno, ec) records of the free list */
+static void *logfs_write_free_segments(struct super_block *sb, void *_f,
+		u16 *type, size_t *len)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_je_free_segments *f = _f;
+	struct gc_candidate *cand;
+	int i = 0;
+
+	list_for_each_entry(cand, &super->s_free_list.list, list) {
+		if (i >= MAX_CACHED_SEGS)	/* checked before storing */
+			break;
+		f[i].segno = cpu_to_be32(cand->segno);
+		f[i].ec = cpu_to_be32(cand->erase_count);
+		i++;
+	}
+	*type = JE_FREESEGS;
+	*len = i * sizeof(struct logfs_je_free_segments);
+	return f;
+}
+
+static void *logfs_write_commit(struct super_block *sb, void *h,
+		u16 *type, size_t *len)
+{	/* journal writer for the commit entry: a type with no payload */
+	*len = 0;
+	*type = JE_COMMIT;
+	return NULL;
+}
+
+/* Build the journal entry produced by @write at s_compressed_je + @jpos:
+ * header, (compressed) payload, zero padding to 16 bytes.  Returns its size. */
+static size_t __logfs_write_je(struct super_block *sb, size_t jpos,
+		void* (*write)(struct super_block *sb, void *scratch,
+			u16 *type, size_t *len))
+{
+	struct logfs_super *super = logfs_super(sb);
+	void *scratch = super->s_je;
+	void *header = super->s_compressed_je + jpos;
+	void *data = header + sizeof(struct logfs_journal_header);
+	ssize_t max, compr_len, pad_len, full_len;
+	size_t len;
+	u16 type;
+	u8 compr = COMPR_ZLIB;
+
+	scratch = write(sb, scratch, &type, &len);
+	if (len == 0)
+		return logfs_write_header(super, header, 0, type);
+	/* the payload starts behind the header, which uses up block space too */
+	max = sb->s_blocksize - jpos - sizeof(struct logfs_journal_header);
+	compr_len = logfs_compress(scratch, data, len, max);
+	if (compr_len < 0 || type == JE_ANCHOR) {
+		BUG_ON(len > max);
+		memcpy(data, scratch, len);
+		compr_len = len;
+		compr = COMPR_NONE;
+	}
+	pad_len = ALIGN(compr_len, 16);
+	memset(data + compr_len, 0, pad_len - compr_len);
+	full_len = pad_len + sizeof(struct logfs_journal_header);
+	return __logfs_write_header(super, header, full_len, len, type, compr);
+}
+
+/* Reserve *@bytes in @area (padded to the write size if @must_pad); returns
+ * the device offset or -EAGAIN after closing an area that is too full. */
+static s64 logfs_get_free_bytes(struct logfs_area *area, size_t *bytes,
+		int must_pad)
+{
+	struct logfs_super *super = logfs_super(area->a_sb);
+	u32 writesize = super->s_writesize;
+	s32 start;
+	int err;
+
+	err = logfs_open_area(area);
+	BUG_ON(err);
+	start = area->a_used_bytes;
+	area->a_used_bytes += *bytes;
+	if (area->a_used_bytes >= super->s_segsize) {
+		logfs_close_area(area);
+		return -EAGAIN;
+	}
+	if (must_pad) {
+		area->a_used_bytes = ALIGN(area->a_used_bytes, writesize);
+		*bytes = area->a_used_bytes - start;
+	}
+	return dev_ofs(area->a_sb, area->a_segno, start);
+}
+
+/*
+ * Assemble the entry produced by @write in s_compressed_je and append it to
+ * the journal area; only a commit entry is padded to the write size.
+ */
+static int logfs_write_je(struct super_block *sb,
+		void* (*write)(struct super_block *sb, void *scratch,
+			u16 *type, size_t *len))
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_area *area = super->s_journal_area;
+	struct logfs_journal_header *hdr = super->s_compressed_je;
+	size_t len = __logfs_write_je(sb, 0, write);
+	int is_commit = (hdr->h_type == cpu_to_be16(JE_COMMIT));
+	s64 ofs;
+
+	ofs = logfs_get_free_bytes(area, &len, is_commit);
+	if (ofs < 0)
+		return ofs;
+	logfs_buf_write(area, ofs, super->s_compressed_je, len);
+	return 0;
+}
+
+/*
+ * Write all journal entries, the commit last; any failure restarts the whole
+ * sequence, so every new segment receives a full set of entries.  Ugly and
+ * potentially a bit wasteful, but robust: we can *always* erase all journal
+ * segments except the one containing the most recent commit.
+ */
+int logfs_write_anchor(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct logfs_super *super = logfs_super(sb);
+	int i, err;
+
+	mutex_lock(&super->s_journal_mutex);
+
+again:	/* NOTE(review): the number of retries is unbounded */
+	if (super->s_writesize > 1)
+		for_each_area(i) {
+			super->s_sum_index = i;
+			err = logfs_write_je(sb, logfs_write_wbuf);
+			if (err)
+				goto again;
+		}
+	err = logfs_write_je(sb, logfs_write_bb);
+	if (err)
+		goto again;
+	err = logfs_write_je(sb, logfs_write_erasecount);
+	if (err)
+		goto again;
+	err = logfs_write_je(sb, __logfs_write_anchor);
+	if (err)
+		goto again;
+	err = logfs_write_je(sb, logfs_write_dynsb);
+	if (err)
+		goto again;
+	err = logfs_write_je(sb, logfs_write_areas);
+	if (err)
+		goto again;
+	err = logfs_write_je(sb, logfs_write_free_segments);
+	if (err)
+		goto again;
+	super->s_devops->sync(sb);	/* sync before committing */
+	err = logfs_write_je(sb, logfs_write_commit);
+	if (err)
+		goto again;
+
+	mutex_unlock(&super->s_journal_mutex);
+	return 0;	/* only reached once every entry was written */
+}
+
+static const struct logfs_area_ops journal_area_ops = {
+	.finish_area		= journal_finish_area,
+	.erase_segment		= journal_erase_segment,
+	.get_erase_count	= journal_get_erase_count,
+	.get_free_segment	= journal_get_free_segment,
+};
+
+/*
+ * Allocate the journal buffers and the master inode, then scan and read
+ * the journal.  Returns 0 or a negative errno.
+ * NOTE(review): nothing allocated here is released on the error paths;
+ * presumably the caller unwinds via logfs_cleanup_journal() - confirm.
+ */
+int logfs_init_journal(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int err;
+
+	mutex_init(&super->s_journal_mutex);
+
+	super->s_je = kzalloc(sb->s_blocksize, GFP_KERNEL);
+	if (!super->s_je)
+		return -ENOMEM;
+	super->s_compressed_je = kzalloc(sb->s_blocksize, GFP_KERNEL);
+	if (!super->s_compressed_je)
+		return -ENOMEM;
+	super->s_bb_array = kzalloc(sb->s_blocksize, GFP_KERNEL);
+	if (!super->s_bb_array)
+		return -ENOMEM;
+	super->s_master_inode = logfs_new_meta_inode(sb, LOGFS_INO_MASTER);
+	if (!super->s_master_inode)
+		return -ENOMEM;
+	/* make sure noone tries to evict this inode */
+	super->s_master_inode->i_nlink = 1;
+
+	/* logfs_scan_journal() is looking for the latest journal entries, but
+	 * doesn't copy them into data structures yet.  logfs_read_journal()
+	 * then re-reads those entries and copies their contents over. */
+	err = logfs_scan_journal(sb);
+	if (!err)
+		err = logfs_read_journal(sb);
+	if (err)
+		return err;
+
+	reserve_sb_and_journal(sb);
+	logfs_calc_free(sb);
+	super->s_journal_area->a_ops = &journal_area_ops;
+	return 0;
+}
+
+/* Destroy the master inode and free the three journal buffers. */
+void logfs_cleanup_journal(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	__logfs_destroy_inode(super->s_master_inode);
+	super->s_master_inode = NULL;
+	kfree(super->s_je);
+	kfree(super->s_compressed_je);
+	kfree(super->s_bb_array);
+}
--- /dev/null	2008-04-02 16:29:12.813336657 +0200
+++ git/fs/logfs/memtree.c	2008-04-07 11:53:20.916544201 +0200
@@ -0,0 +1,402 @@
+/*
+ * fs/logfs/memtree.c	- Simple In-memory B+Tree
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2007 Joern Engel <joern@logfs.org>
+ *
+ *
+ * This could possibly get moved to lib/.
+ *
+ * A relatively simple B+Tree implementation.  I have written it as a learning
+ * exercise to understand how B+Trees work.  Turned out to be useful as well.
+ *
+ * B+Trees can be used similar to Linux radix trees (which don't have anything
+ * in common with textbook radix trees, beware).  Prerequisite for them working
+ * well is that access to a random tree node is much slower than a large number
+ * of operations within each node.
+ *
+ * Disks have fulfilled the prerequisite for a long time.  More recently DRAM
+ * has gained similar properties, as memory access times, when measured in cpu
+ * cycles, have increased.  Cacheline sizes have increased as well, which also
+ * helps B+Trees.
+ *
+ * Compared to radix trees, B+Trees are more efficient when dealing with a
+ * sparsely populated address space.  Between 25% and 50% of the memory is
+ * occupied with valid pointers.  When densely populated, radix trees contain
+ * ~98% pointers - hard to beat.  Very sparse radix trees contain only ~2%
+ * pointers.
+ *
+ * This particular implementation stores pointers identified by a long value.
+ * Storing NULL pointers is illegal, lookup will return NULL when no entry
+ * was found.
+ *
+ * Two tricks were used that are not commonly found in textbooks.  First, the
+ * lowest values are to the right, not to the left.  All used slots within a
+ * node are on the left, all unused slots contain NUL values.  Most operations
+ * simply loop once over all slots and terminate on the first NUL.
+ *
+ * Second trick is to special-case the key "0" or NUL.  As seen above, this
+ * value indicates an unused slot, so such a value should not be stored in the
+ * tree itself.  Instead it is stored in the null_ptr field in the btree_head.
+ */
+/* FIXME: use mempool for allocations */
+#include "logfs.h"
+
+/*
+ * Prerequisite of B+Trees performing well is that node lookup is much slower
+ * than a large number of operations within a node.  That can be true if the
+ * node size is identical to cacheline size.  All that is highly
+ * machine-dependent, just like the #define below is not.
+ *
+ * Patches to do something smarter are welcome.  Just beware that nodes that
+ * are too small, with fewer than 8 slots, have a bad fan-out and won't
+ * perform well either.
+ */
+#if BITS_PER_LONG == 32
+#define BTREE_NODES 20	/* slots per node; 32bit, 240 byte nodes */
+#else
+#define BTREE_NODES 16	/* slots per node; 64bit, 256 byte nodes */
+#endif
+
+struct btree_node {
+	u64 key;		/* 0 marks an unused slot */
+	struct btree_node *node; /* child node, or the stored value in a leaf */
+};
+
+void btree_init(struct btree_head *head)
+{	/* an empty tree: no root, height 0, nothing stored under key 0 */
+	head->null_ptr = NULL;
+	head->height = 0;
+	head->node = NULL;
+}
+
+#if 0	/* debugging helpers, compiled out */
+static void __dump_tree(struct btree_node *node, int height)
+{
+	int i;
+
+	if (!height)
+		return;
+
+	printk(KERN_DEBUG"%p ", node);
+	for (i = 0; i < BTREE_NODES; i++)
+		printk("(%llx,%p) ", node[i].key, node[i].node);
+	printk("\n");
+
+	for (i = 0; i < BTREE_NODES; i++)
+		if (node[i].key)
+			__dump_tree(node[i].node, height-1);
+}
+
+static void dump_tree(struct btree_head *head)
+{
+	printk(KERN_DEBUG"%p\n", head->null_ptr);
+	__dump_tree(head->node, head->height);
+}
+#endif
+
+/* Key in slot 0 of the left-most leaf, or 0 for a tree without nodes.  As
+ * the lowest keys are kept to the right, this is the largest key stored. */
+static u64 btree_last(struct btree_head *head)
+{
+	struct btree_node *node = head->node;
+	int level;
+
+	if (!head->height)
+		return 0;
+	for (level = head->height; level > 1; level--)
+		node = node[0].node;
+	return node[0].key;
+}
+
+/* Return the pointer stored under @key (key 0 lives in head->null_ptr), or
+ * NULL.  A full interior node whose keys are all > @key ends the search. */
+void *btree_lookup(struct btree_head *head, u64 key)
+{
+	int i, height = head->height;
+	struct btree_node *node = head->node;
+
+	if (key == 0)
+		return head->null_ptr;
+	if (height == 0)
+		return NULL;
+	for ( ; height > 1; height--) {
+		for (i = 0; i < BTREE_NODES; i++)
+			if (node[i].key <= key)
+				break;
+		if (i == BTREE_NODES)	/* no slot matched: don't read node[i] */
+			return NULL;
+		node = node[i].node;
+		if (!node)
+			return NULL;
+	}
+	if (!node)
+		return NULL;
+
+	for (i = 0; i < BTREE_NODES; i++)
+		if (node[i].key == key)
+			return node[i].node;
+	return NULL;
+}
+
+/*
+ * Returns two values:
+ * pos - index of the first slot whose key is equal or less than @key
+ * fill - index of the first unused slot at or after pos
+ * Either one is BTREE_NODES if no such slot exists.
+ */
+static void find_pos(struct btree_node *node, u64 key, int *pos, int *fill)
+{
+	int slot = 0;
+
+	while (slot < BTREE_NODES && node[slot].key > key)
+		slot++;
+	*pos = slot;
+
+	while (slot < BTREE_NODES && node[slot].key != 0)
+		slot++;
+	*fill = slot;
+}
+
+/*
+ * Locate the node at @level on the path to @key.  Where all used keys of a
+ * node on the way are larger than @key, its right-most used key is lowered.
+ */
+static struct btree_node *find_level(struct btree_head *head, u64 key,
+		int level)
+{
+	struct btree_node *node = head->node;
+	int i, height;
+
+	for (height = head->height; height > level; height--) {
+		for (i = 0; i < BTREE_NODES; i++)
+			if (node[i].key <= key)
+				break;
+		if ((i == BTREE_NODES) || !node[i].key) {
+			/* right-most key is too large, update it */
+			i--;
+			BUG_ON(i < 0);	/* empty node: trap before writing */
+			node[i].key = key;
+		}
+		node = node[i].node;
+	}
+	BUG_ON(!node);
+	return node;
+}
+
+static int btree_grow(struct btree_head *head)
+{	/* add a level: the old root, if any, hangs off slot 0 of the new one */
+	struct btree_node *root;
+
+	root = kcalloc(BTREE_NODES, sizeof(*root), GFP_KERNEL);
+	if (!root)
+		return -ENOMEM;
+	if (head->node) {
+		root[0].node = head->node;
+		root[0].key = head->node[BTREE_NODES-1].key;
+	}
+	head->height++;
+	head->node = root;
+	return 0;
+}
+
+/* Insert @ptr under @key at @level (1 == leaf), growing the tree and
+ * splitting full nodes on the way.  Returns 0 or -ENOMEM. */
+static int btree_insert_level(struct btree_head *head, u64 key, void *ptr,
+		int level)
+{
+	struct btree_node *node;
+	int i, pos, fill, err;
+
+	BUG_ON(!ptr);
+	if (key == 0) {
+		/* 0 identifies empty slots, so special-case this */
+		BUG_ON(level != 1);
+		head->null_ptr = ptr;
+		return 0;
+	}
+
+	if (head->height < level) {
+		err = btree_grow(head);
+		if (err)
+			return err;
+	}
+retry:
+	node = find_level(head, key, level);
+	find_pos(node, key, &pos, &fill);
+	/* pos == BTREE_NODES means no slot matched: node[pos] is off limits */
+	BUG_ON(pos < BTREE_NODES && node[pos].key == key);
+	if (fill == BTREE_NODES) {
+		/* need to split node */
+		struct btree_node *new;
+
+		new = kcalloc(BTREE_NODES, sizeof(*node), GFP_KERNEL);
+		if (!new)
+			return -ENOMEM;
+		err = btree_insert_level(head, node[BTREE_NODES/2 - 1].key, new,
+				level+1);
+		if (err) {
+			kfree(new);
+			return err;
+		}
+		for (i = 0; i < BTREE_NODES / 2; i++) {
+			new[i].key = node[i].key;
+			new[i].node = node[i].node;
+			node[i].key = node[i + BTREE_NODES/2].key;
+			node[i].node = node[i + BTREE_NODES/2].node;
+			node[i + BTREE_NODES/2].key = 0;
+			node[i + BTREE_NODES/2].node = NULL;
+		}
+		goto retry;
+	}
+	BUG_ON(fill >= BTREE_NODES);
+
+	/* shift and insert */
+	for (i = fill; i > pos; i--) {
+		node[i].key = node[i-1].key;
+		node[i].node = node[i-1].node;
+	}
+	node[pos].key = key;
+	node[pos].node = ptr;
+	return 0;
+}
+
+int btree_insert(struct btree_head *head, u64 key, void *ptr)
+{	/* insertion starts at level 1; returns 0 or -ENOMEM */
+	return btree_insert_level(head, key, ptr, 1);
+}
+
+/* Remove the slot for @key from the node at @level; nodes that become empty
+ * are freed and dropped from their parent.  Returns the slot's pointer. */
+static void *btree_remove_level(struct btree_head *head, u64 key, int level)
+{
+	struct btree_node *node;
+	int i, pos, fill;
+	void *ret;
+
+	if (level > head->height) {
+		/* we recursed all the way up */
+		head->height = 0;
+		head->node = NULL;
+		return NULL;
+	}
+	node = find_level(head, key, level);
+	find_pos(node, key, &pos, &fill);
+	/* pos == BTREE_NODES: a full node without any key <= @key, no match */
+	if (pos == BTREE_NODES || ((level == 1) && (node[pos].key != key)))
+		return NULL;
+	ret = node[pos].node;
+	/* remove and shift */
+	for (i = pos; i < fill-1; i++) {
+		node[i].key = node[i+1].key;
+		node[i].node = node[i+1].node;
+	}
+	node[fill-1].key = 0;
+	node[fill-1].node = NULL;
+
+	if (fill-1 < BTREE_NODES/2) {
+		/*
+		 * At this point there *should* be code to either merge with
+		 * a neighboring node or steal some entries from it to preserve
+		 * the btree invariant of only having nodes with n/2..n
+		 * elements.
+		 *
+		 * As you can see, that code is left as an exercise to the
+		 * reader or anyone noticing severe performance problems in
+		 * very rare cases.
+		 *
+		 * As-is this code "implements" a method called lazy deletion,
+		 * which according to text books is relatively common in
+		 * databases and usually works quite well.
+		 * Not so usually, the btree can degrade into very long lists
+		 * of 1-element nodes and perform accordingly.
+		 */
+	}
+	if (fill-1 == 0) {
+		btree_remove_level(head, key, level+1);
+		kfree(node);
+	}
+	return ret;
+}
+
+/*
+ * Remove @key and return the pointer that was stored under it, or NULL.
+ */
+void *btree_remove(struct btree_head *head, u64 key)
+{
+	void *old;
+
+	if (key != 0)
+		return head->height ? btree_remove_level(head, key, 1) : NULL;
+
+	/* 0 identifies empty slots, so special-case this */
+	old = head->null_ptr;
+	head->null_ptr = NULL;
+	return old;
+}
+
+/* Move all entries of @victim into @target.  NOTE(review): when
+ * btree_insert() fails, the entry just removed from @victim is in neither
+ * tree any more. */
+int btree_merge(struct btree_head *target, struct btree_head *victim)
+{
+	void *elem;
+	u64 key;
+	int err;
+
+	BUG_ON(target == victim);
+	if (!target->node && !target->null_ptr) {
+		/* target is empty, just copy fields over */
+		target->height = victim->height;
+		target->node = victim->node;
+		target->null_ptr = victim->null_ptr;
+		btree_init(victim);
+		return 0;
+	}
+	for (;;) {
+		key = btree_last(victim);
+		elem = btree_remove(victim, key);
+		if (!elem)
+			return 0;
+		err = btree_insert(target, key, elem);
+		if (err)
+			return err;
+	}
+}
+
+static void __btree_for_each(struct btree_node *node, long opaque,
+		void (*func)(void *elem, long opaque, u64 key),  int reap,
+		int height)
+{
+	int slot;
+	/* visit the used slots in order; with @reap the node is freed after */
+	for (slot = 0; slot < BTREE_NODES && node[slot].key; slot++) {
+		if (height <= 1)
+			func(node[slot].node, opaque, node[slot].key);
+		else
+			__btree_for_each(node[slot].node, opaque, func, reap,
+					height-1);
+	}
+	if (reap)
+		kfree(node);
+}
+
+void btree_visitor(struct btree_head *head, long opaque,
+		void (*func)(void *elem, long opaque, u64 key))
+{	/* calls @func on every element; the one stored under key 0 comes last */
+	if (head->node)
+		__btree_for_each(head->node, opaque, func, 0, head->height);
+	if (head->null_ptr)
+		func(head->null_ptr, opaque, 0);
+}
+
+void btree_grim_visitor(struct btree_head *head, long opaque,
+		void (*func)(void *elem, long opaque, u64 key))
+{	/* visits every element, frees all nodes and leaves @head empty */
+	if (head->node)
+		__btree_for_each(head->node, opaque, func, 1, head->height);
+	if (head->null_ptr)
+		func(head->null_ptr, opaque, 0);
+	btree_init(head);
+}
--- /dev/null	2008-04-02 16:29:12.813336657 +0200
+++ git/fs/logfs/readwrite.c	2008-04-07 12:24:48.276946628 +0200
@@ -0,0 +1,1618 @@
+/*
+ * fs/logfs/readwrite.c
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2007 Joern Engel <joern@logfs.org>
+ *
+ *
+ * Actually contains eight sets of very similar functions:
+ * read		read blocks from a file
+ * seek_hole	find next hole
+ * seek_data	find next data block
+ * valid	check whether a block still belongs to a file
+ * write	write blocks to a file
+ * delete	delete a block (for directories and ifile)
+ * rewrite	move existing blocks of a file to a new location (gc helper)
+ * truncate	truncate a file
+ */
+#include "logfs.h"
+#include <linux/sched.h>
+
+static int adjust_level(int level)
+{	/* levels >= LOGFS_MAX_LEVELS are reduced by LOGFS_MAX_LEVELS, once */
+	if (level >= LOGFS_MAX_LEVELS)
+		level -= LOGFS_MAX_LEVELS;
+	WARN_ON(level >= LOGFS_MAX_LEVELS);
+	return level;
+}
+
+/* Clamp @bix from below to I0_BLOCKS, I1_BLOCKS or I2_BLOCKS (level 1-3). */
+static u64 adjust_bix(u64 bix, u8 level)
+{
+	u64 lowest;
+
+	switch (adjust_level(level)) {
+	case 0:	return bix;
+	case 1:	lowest = I0_BLOCKS; break;
+	case 2:	lowest = I1_BLOCKS; break;
+	case 3:	lowest = I2_BLOCKS; break;
+	default:
+		WARN_ON(1);
+		return bix;
+	}
+	return max_t(u64, bix, lowest);
+}
+
+/**
+ * The inode address space is cut in two halves.  Lower half belongs to data
+ * pages, upper half to indirect blocks.  If the high bit (INDIRECT_BIT) is
+ * set, the actual block index (bix) and level can be derived from the page
+ * index.
+ *
+ * The lowest three bits of the block index are set to 0 after packing and
+ * unpacking.  Since the lowest n bits (9 for 4KiB blocksize) are ignored
+ * anyway this is harmless.
+ */
+#define ARCH_SHIFT	(BITS_PER_LONG - 32)	/* 0 on 32bit, 32 on 64bit */
+#define INDIRECT_BIT	(0x80000000UL << ARCH_SHIFT)	/* top bit of a long */
+#define LEVEL_SHIFT	(28 + ARCH_SHIFT)	/* level: the 3 bits below it */
+/* Map (block index, level) to the page index used in the inode's mapping. */
+static pgoff_t logfs_pack_index(u64 bix, u8 level)
+{
+	BUG_ON(bix >= INDIRECT_BIT);
+	BUG_ON(level > 7);
+
+	/* level 0 pages are indexed by their block index directly */
+	if (!level)
+		return bix;
+
+	/* other levels: marker bit, level, scaled-down block index */
+	return INDIRECT_BIT | ((long)level << LEVEL_SHIFT)
+			| (bix >> (level*LOGFS_BLOCK_BITS));
+}
+
+void logfs_unpack_index(pgoff_t index, u64 *bix, u8 *level)
+{
+	u64 blk = index;
+	u8 lvl = 0;
+	/* only indices with INDIRECT_BIT set carry a level */
+	if (index & INDIRECT_BIT) {
+		lvl = (index & ~INDIRECT_BIT) >> LEVEL_SHIFT;
+		blk = (index << (lvl*LOGFS_BLOCK_BITS)) & ~INDIRECT_BIT;
+		blk = adjust_bix(blk, lvl);
+	}
+	*level = lvl;
+	*bix = blk;
+}
+#undef ARCH_SHIFT
+#undef INDIRECT_BIT
+#undef LEVEL_SHIFT
+
+/**
+ * logfs_flush_dirty - flush dirty blocks
+ * @sb:		filesystem superblock
+ * @sync:	if 0, only flush enough to continue writing,
+ *		if 1, completely flush list
+ */
+void logfs_flush_dirty(struct super_block *sb, int sync)
+{
+	struct logfs_super *super = logfs_super(sb);
+	u64 bytes = LOGFS_MAX_LEVELS * LOGFS_MAX_OBJECTSIZE; /* !sync headroom */
+	struct logfs_block *block;
+	struct page *page;
+	struct inode *inode;
+	int ret;
+
+	while (super->s_dirty_free_bytes || super->s_dirty_used_bytes) {
+		if (!sync && (super->s_free_bytes >= bytes + super->s_gc_reserve
+				+ super->s_dirty_used_bytes))
+			break;
+
+		BUG_ON(list_empty(&super->s_dirty_list));
+		block = list_entry(super->s_dirty_list.next, struct logfs_block,
+				dirty_list);
+		page = block->page;
+		inode = page->mapping->host;
+		ret = logfs_write_buf(inode, page, NULL, 0);
+		BUG_ON(ret);	/* a failed write is fatal here */
+		/* We may need to GC some more after writing a page */
+		logfs_gc_pass(sb);
+	}
+}
+
+/*
+ * Logfs is prone to an AB-BA deadlock where one task tries to acquire
+ * s_w_mutex with a locked page and GC tries to get that page while holding
+ * s_w_mutex.
+ * To solve this issue logfs will ignore the page lock iff the page in question
+ * is waiting for s_w_mutex.  We annotate this fact by setting PG_pre_locked
+ * in addition to PG_locked.
+ *
+ * FIXME: Logfs already uses PG_owner_priv_1 for other purposes and there is
+ * no PG_owner_priv_2.  Currently we abuse the (hopefully) free flag 18.  But
+ * that flag may get reused any minute.  In fact, Christoph Lameter recently
+ * sent a patchset to reshuffle page flags.  Highly dangerous.
+ */
+#define PG_pre_locked		18	/* raw bit number in page->flags */
+#define PagePreLocked(page)	test_bit(PG_pre_locked, &(page)->flags)
+#define SetPagePreLocked(page)	set_bit(PG_pre_locked, &(page)->flags)
+#define ClearPagePreLocked(page) clear_bit(PG_pre_locked, &(page)->flags)
+static void logfs_get_wblocks(struct super_block *sb, struct page *page,
+		int lock)
+{
+	if (!lock)
+		return;
+
+	/* mark the page before possibly sleeping on s_w_mutex */
+	if (page)
+		SetPagePreLocked(page);
+	mutex_lock(&logfs_super(sb)->s_w_mutex);
+	logfs_super(sb)->s_write_page = page;
+	logfs_gc_pass(sb);
+	/* FIXME: We also have to check for shadowed space
+	 * and mempool fill grade */
+	logfs_flush_dirty(sb, 0);
+}
+
+static void logfs_put_wblocks(struct super_block *sb, struct page *page,
+		int lock)
+{
+	if (!lock)
+		return;
+	logfs_super(sb)->s_write_page = NULL;
+	/* Order matters - we must clear PG_pre_locked before releasing
+	 * s_w_mutex or we could race against another task. */
+	if (page)
+		ClearPagePreLocked(page);
+	mutex_unlock(&logfs_super(sb)->s_w_mutex);
+}
+
+static struct page *logfs_get_read_page(struct inode *inode, u64 bix, u8 level)
+{	/* find or create the page-cache page for (bix, level) */
+	return find_or_create_page(inode->i_mapping,
+			logfs_pack_index(bix, level), GFP_NOFS);
+}
+
+static void logfs_put_read_page(struct page *page)
+{	/* unlock the page and drop one page reference */
+	unlock_page(page);
+	page_cache_release(page);
+}
+
+static struct page *logfs_get_page(struct inode *inode, u64 bix, u8 level)
+{
+	struct address_space *mapping = inode->i_mapping;
+	pgoff_t index = logfs_pack_index(bix, level);
+	struct page *page;
+	int err;
+	int loop = 0;	/* failed lock attempts so far */
+
+repeat:
+	page = find_get_page(mapping, index);
+	if (!page) {
+		page = __page_cache_alloc(GFP_NOFS);
+		if (!page)
+			return NULL;
+		err = add_to_page_cache_lru(page, mapping, index, GFP_NOFS);
+		if (unlikely(err)) {
+			page_cache_release(page);
+			if (err == -EEXIST)	/* look the page up again */
+				goto repeat;
+			return NULL;
+		}
+	} else while (unlikely(TestSetPageLocked(page))) {
+		if (PagePreLocked(page)) {
+			/* Holder of page lock is waiting for us, it
+			 * is safe to use this page. */
+			return page;
+		}
+		if (loop++ > 0x1000) {
+			/* Has been observed once so far... */
+			printk(KERN_ERR "stack at %p\n", &loop);
+			BUG();
+		}
+		/* Some other process has this page locked and has
+		 * nothing to do with us.  Wait for it to finish.
+		 */
+		schedule();	/* yield, then retry the lock */
+	}
+	return page;
+}
+
+static void logfs_put_page(struct inode *inode, struct page *page)
+{	/* a pre-locked page is left locked; @inode is not used */
+	if (likely(!PagePreLocked(page)))
+		unlock_page(page);
+	page_cache_release(page);
+}
+
+static struct page *logfs_get_write_page(struct inode *inode, u64 bix, u8 level)
+{
+	struct page *wpage = logfs_super(inode->i_sb)->s_write_page;
+
+	/* asking for s_write_page itself: hand it back as is */
+	if (wpage && wpage->mapping == inode->i_mapping
+			&& wpage->index == logfs_pack_index(bix, level))
+		return wpage;
+
+	return logfs_get_page(inode, bix, level);
+}
+
+static void logfs_put_write_page(struct inode *inode, struct page *page)
+{
+	/* s_write_page is returned by logfs_get_write_page() untouched */
+	if (page == logfs_super(inode->i_sb)->s_write_page)
+		return;
+	logfs_put_page(inode, page);
+}
+
+/* Extract the @skip'th group of @no bits from @val, counted from bit 0. */
+static unsigned long __get_bits(u64 val, int skip, int no)
+{
+	u64 bits = val >> (skip * no);
+
+	bits <<= 64 - no;
+	bits >>= 64 - no;
+	return bits;
+}
+
+static unsigned long get_bits(u64 val, int skip)
+{	/* the @skip'th LOGFS_BLOCK_BITS wide group of @val */
+	return __get_bits(val, skip, LOGFS_BLOCK_BITS);
+}
+
+/*
+ * Returns:
+ * 0 if all pointers are NUL
+ * -1 if all pointers have LOGFS_FULLY_POPULATED set
+ * 1 if at least one pointer is non-NUL and
+ *      at least one has LOGFS_FULLY_POPULATED cleared
+ */
+enum blockstate {
+	bs_fully_populated	= -1,	/* all have LOGFS_FULLY_POPULATED */
+	bs_all_zero		= 0,	/* all pointers are NUL */
+	bs_mixed		= 1,	/* anything in between */
+};
+
+static int block_state(__be64 *block)
+{
+	int i;
+	if (block[0]) {
+		/* start at 0: block[0] has to carry the flag as well */
+		for (i = 0; i < LOGFS_BLOCK_FACTOR; i++)
+			if (!(block[i] & cpu_to_be64(LOGFS_FULLY_POPULATED)))
+				return bs_mixed;
+		return bs_fully_populated;
+	} else {
+		for (i = 1; i < LOGFS_BLOCK_FACTOR; i++)
+			if (block[i])
+				return bs_mixed;
+		return bs_all_zero;
+	}
+}
+
+/* block_state() of the pointer block held in @page, which is mapped just
+ * for the duration of the check. */
+static int page_state(struct page *page)
+{
+	__be64 *ptrs = kmap_atomic(page, KM_USER0);
+	int state = block_state(ptrs);
+
+	kunmap_atomic(ptrs, KM_USER0);
+	return state;
+}
+
+/* Attach a logfs_block from s_block_pool to @page unless PagePrivate is set */
+static void alloc_block(struct page *page)
+{
+	struct logfs_super *super = logfs_super(page->mapping->host->i_sb);
+	struct logfs_block *blk;
+
+	if (PagePrivate(page))
+		return;
+	blk = mempool_alloc(super->s_block_pool, GFP_KERNEL);
+	blk->page = page;
+	INIT_LIST_HEAD(&blk->dirty_list);
+	SetPagePrivate(page);
+	page->private = (unsigned long)blk;
+}
+
+static struct shadow_tree *logfs_page_to_tree(struct page *page)
+{	/* gives the page a logfs_block first if it has none yet */
+	alloc_block(page);
+	return &logfs_block(page)->shadow_tree;
+}
+
+static void block_set_pointer(struct page *page, int index, u64 ptr)
+{
+	__be64 *ptrs = kmap_atomic(page, KM_USER0);
+
+	/* pointers are kept big-endian inside the block */
+	ptrs[index] = cpu_to_be64(ptr);
+	flush_dcache_page(page);
+	kunmap_atomic(ptrs, KM_USER0);
+	SetPageUptodate(page);
+}
+
+/* Read pointer @index of the block in @page, converted to cpu endianness. */
+static u64 block_get_pointer(struct page *page, int index)
+{
+	__be64 *ptrs = kmap_atomic(page, KM_USER0);
+	u64 value = be64_to_cpu(ptrs[index]);
+
+	/* the value was copied out, so the mapping can go again */
+	kunmap_atomic(ptrs, KM_USER0);
+	return value;
+}
+
+static int logfs_read_empty(struct page *page)
+{	/* zero-fill the page and flag it with SetPageZero(); returns 0 */
+	zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+	SetPageZero(page);
+	return 0;
+}
+
+/* Fill @page with the LOGFS_EMBEDDED_SIZE bytes held in the inode's li_data
+ * and zero the remainder of the page.  Always returns 0. */
+static int logfs_read_embedded(struct page *page, struct inode *inode)
+{
+	void *dst = kmap_atomic(page, KM_USER0);
+
+	memcpy(dst, logfs_inode(inode)->li_data, LOGFS_EMBEDDED_SIZE);
+	memset(dst + LOGFS_EMBEDDED_SIZE, 0,
+			PAGE_CACHE_SIZE - LOGFS_EMBEDDED_SIZE);
+	flush_dcache_page(page);
+	kunmap_atomic(dst, KM_USER0);
+	return 0;
+}
+
+/*
+ * Read a block whose address is held directly in li_data[page->index].
+ */
+static int logfs_read_direct(struct inode *inode, struct page *page)
+{
+	pgoff_t index = page->index;
+	u64 ofs = logfs_inode(inode)->li_data[index];
+
+	if (ofs)
+		return logfs_segment_read(inode, page, ofs, index, 0);
+	return logfs_read_empty(page);
+}
+
+static int logfs_read_loop(struct inode *inode, struct page *page, int count)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	u64 ofs = li->li_data[I1_INDEX + count];
+	pgoff_t bix = page->index;
+	struct page *ipage;
+	int level, err;
+
+	/*
+	 * Walk down from the topmost indirect block.  The loop stops early
+	 * as soon as a pointer on the path turns out to be zero.
+	 */
+	for (level = count + 1; ofs && level > 0; level--) {
+		ipage = logfs_get_read_page(inode, bix, level);
+		if (!ipage)
+			return -ENOMEM;
+
+		err = logfs_segment_read(inode, ipage, ofs, bix, level);
+		if (!err)
+			ofs = block_get_pointer(ipage, get_bits(bix, level-1));
+		logfs_put_read_page(ipage);
+		if (err)
+			return err;
+	}
+
+	/* zero pointer on the path: hand back an empty page */
+	if (!ofs)
+		return logfs_read_empty(page);
+	return logfs_segment_read(inode, page, ofs, bix, 0);
+}
+
+static int logfs_read_block(struct inode *inode, struct page *page)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	pgoff_t index = page->index;
+
+	if (li->li_flags & LOGFS_IF_EMBEDDED) {
+		/* only block 0 of an embedded file carries data */
+		if (index == 0)
+			return logfs_read_embedded(page, inode);
+		return logfs_read_empty(page);
+	}
+	if (index < I0_BLOCKS)
+		return logfs_read_direct(inode, page);
+	if (index < I1_BLOCKS)
+		return logfs_read_loop(inode, page, 0);
+	if (index < I2_BLOCKS)
+		return logfs_read_loop(inode, page, 1);
+	if (index < I3_BLOCKS)
+		return logfs_read_loop(inode, page, 2);
+	BUG();
+	return -EIO;
+}
+
+static u64 seek_holedata_direct(struct inode *inode, u64 bix, int data)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	/* skip slots until a data block (@data set) or a hole (@data clear) */
+	while (bix < I0_BLOCKS && !(data ^ (li->li_data[bix] == 0)))
+		bix++;
+	return bix < I0_BLOCKS ? bix : I0_BLOCKS;
+}
+
+static u64 seek_holedata_loop(struct inode *inode, u64 bix, int count, int data)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	__be64 *rblock;
+	u64 step, bofs = li->li_data[I1_INDEX + count];
+	int level, ret, slot;
+	struct page *page;
+
+	BUG_ON(!bofs);
+
+	for (level = count + 1; level > 0; level--) {
+		page = logfs_get_read_page(inode, bix, level);
+		if (!page)
+			return bix;
+
+		ret = logfs_segment_read(inode, page, bofs, bix, level);
+		if (ret) {
+			logfs_put_read_page(page);
+			return bix;
+		}
+
+		/* number of block indices covered by one slot at this level */
+		step = 1ULL << (LOGFS_BLOCK_BITS * (level-1));
+		slot = get_bits(bix, level-1);
+		rblock = kmap_atomic(page, KM_USER0);
+		while (slot < LOGFS_BLOCK_FACTOR) {
+			if (data && (rblock[slot] != 0))
+				break;
+			if (!data && !(be64_to_cpu(rblock[slot]) & LOGFS_FULLY_POPULATED))
+				break;
+			slot++;
+			/* next slot: continue at the first block it covers */
+			bix = (bix + step) & ~(step - 1);
+		}
+		bofs = slot < LOGFS_BLOCK_FACTOR ? be64_to_cpu(rblock[slot]) : 0;
+		kunmap_atomic(rblock, KM_USER0);
+		logfs_put_read_page(page);
+		if (slot >= LOGFS_BLOCK_FACTOR)
+			return bix;
+		if (!bofs) {
+			BUG_ON(data);
+			return bix;
+		}
+	}
+	return bix;
+}
+
+/**
+ * logfs_seek_hole - find next hole starting at a given block index
+ * @inode:		inode to search in
+ * @bix:		block index to start searching
+ *
+ * Returns next hole.  If the file doesn't contain any further holes, the
+ * block address next to eof is returned instead.
+ */
+u64 logfs_seek_hole(struct inode *inode, u64 bix)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	if (li->li_flags & LOGFS_IF_EMBEDDED)
+		return 1;	/* fixed answer for embedded inodes, whatever @bix is */
+
+	if (bix < I0_BLOCKS) {	/* direct pointers come first */
+		bix = seek_holedata_direct(inode, bix, 0);
+		if (bix < I0_BLOCKS)
+			return bix;
+	}
+
+#define SEEK_HOLE_LOOP_WRAPPER(index, blocks, count) do {		\
+	if (bix < blocks) {						\
+		if (!li->li_data[index])				\
+			return bix;					\
+		else if (li->li_data[index] & LOGFS_FULLY_POPULATED)	\
+			bix = blocks;					\
+		else {							\
+			bix = seek_holedata_loop(inode, bix, count, 0);	\
+			if (bix < blocks)				\
+				return bix;				\
+			/* LOGFS_FULLY_POPULATED should have been set */\
+			WARN_ON_ONCE(bix == blocks);			\
+		}							\
+	}								\
+} while (0)
+	SEEK_HOLE_LOOP_WRAPPER(I1_INDEX, I1_BLOCKS, 0);	/* tree with one level */
+	SEEK_HOLE_LOOP_WRAPPER(I2_INDEX, I2_BLOCKS, 1);	/* two levels */
+	SEEK_HOLE_LOOP_WRAPPER(I3_INDEX, I3_BLOCKS, 2);	/* three levels */
+#undef SEEK_HOLE_LOOP_WRAPPER
+
+	return bix;
+}
+
+static u64 __logfs_seek_data(struct inode *inode, u64 bix)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	if (li->li_flags & LOGFS_IF_EMBEDDED)
+		return bix;	/* embedded inodes: @bix is handed back unchanged */
+
+	if (bix < I0_BLOCKS) {	/* direct pointers come first */
+		bix = seek_holedata_direct(inode, bix, 1);
+		if (bix < I0_BLOCKS)
+			return bix;
+	}
+
+#define SEEK_DATA_LOOP_WRAPPER(index, blocks, count) do {		\
+	if (bix < blocks) {						\
+		if (!li->li_data[index])				\
+			bix = blocks;					\
+		else							\
+			return seek_holedata_loop(inode, bix, count, 1);\
+	}								\
+} while (0)
+	SEEK_DATA_LOOP_WRAPPER(I1_INDEX, I1_BLOCKS, 0);	/* tree with one level */
+	SEEK_DATA_LOOP_WRAPPER(I2_INDEX, I2_BLOCKS, 1);	/* two levels */
+	SEEK_DATA_LOOP_WRAPPER(I3_INDEX, I3_BLOCKS, 2);	/* three levels */
+#undef SEEK_DATA_LOOP_WRAPPER
+
+	return bix;
+}
+
+/**
+ * logfs_seek_data - find next data block after a given block index
+ * @inode:		inode to search in
+ * @bix:		block index to start searching
+ *
+ * Returns the next data block.  A result at or beyond i_size (in blocks) is
+ * replaced by the larger of @bix and that block count.
+ */
+u64 logfs_seek_data(struct inode *inode, u64 bix)
+{
+	struct super_block *sb = inode->i_sb;
+	u64 found = __logfs_seek_data(inode, bix);
+	u64 eof_bix = i_size_read(inode) >> sb->s_blocksize_bits;
+
+	/* anything below the end of file is a genuine hit */
+	if (found < eof_bix)
+		return found;
+	return max(bix, eof_bix);
+}
+
+static int logfs_is_valid_direct(struct logfs_inode *li, pgoff_t index, u64 ofs)
+{
+	return pure_ofs(li->li_data[index]) == ofs; /* valid iff the direct pointer matches */
+}
+
+static int logfs_is_valid_shadow(struct page *page, u64 ofs)
+{
+	return PagePrivate(page) &&	/* no logfs_block attached, no shadow tree */
+		btree_lookup(&logfs_page_to_tree(page)->old, ofs);
+}
+
+static int __logfs_is_valid_loop(struct inode *inode, u64 bix, int count,
+		u64 ofs, u64 bofs)
+{
+	int level, ret;
+	struct page *page;
+
+	for (level = count + 1; level > 0; level--) {	/* top level first */
+		page = logfs_get_write_page(inode, bix, level);
+		BUG_ON(!page);
+
+		if (logfs_is_valid_shadow(page, ofs)) {	/* in this page's "old" tree */
+			logfs_put_write_page(inode, page);
+			return 1;
+		}
+
+		ret = logfs_segment_read(inode, page, bofs, bix, level);
+		if (ret) {
+			logfs_put_write_page(inode, page);
+			return 0;	/* a read error is reported as "not valid" */
+		}
+
+		bofs = block_get_pointer(page, get_bits(bix, level-1));
+		logfs_put_write_page(inode, page);
+		if (!bofs)
+			return 0;	/* path ends in a zero pointer */
+
+		if (pure_ofs(bofs) == ofs)	/* pointer on the path refers to ofs */
+			return 1;
+	}
+	return 0;
+}
+
+static int logfs_is_valid_loop(struct inode *inode, pgoff_t index,
+		int count, u64 ofs)
+{
+	u64 root = logfs_inode(inode)->li_data[I1_INDEX + count];
+
+	/* no indirect tree at all: nothing below it can be valid */
+	if (!root)
+		return 0;
+	/* the root indirect block itself may be the block in question */
+	if (pure_ofs(root) == ofs)
+		return 1;
+
+	return __logfs_is_valid_loop(inode, index, count, ofs, root);
+}
+
+static int __logfs_is_valid_block(struct inode *inode, pgoff_t index, u64 ofs)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	/* block is still valid on medium */
+	if (btree_lookup(&li->li_shadow_tree.old, ofs))
+		return 1;
+
+	if (inode->i_nlink == 0 && atomic_read(&inode->i_count) == 1)
+		return 0;
+
+	/* inodes with embedded data fail the check at this point */
+	if (li->li_flags & LOGFS_IF_EMBEDDED)
+		return 0;
+
+	if (index < I0_BLOCKS)
+		return logfs_is_valid_direct(li, index, ofs);
+	if (index < I1_BLOCKS)
+		return logfs_is_valid_loop(inode, index, 0, ofs);
+	if (index < I2_BLOCKS)
+		return logfs_is_valid_loop(inode, index, 1, ofs);
+	if (index < I3_BLOCKS)
+		return logfs_is_valid_loop(inode, index, 2, ofs);
+
+	BUG();
+	return 0;
+}
+
+/**
+ * logfs_is_valid_block - check whether this block is still valid
+ *
+ * @sb:		superblock
+ * @ofs:	block physical offset
+ * @ino:	block inode number
+ * @bix:	block index
+ * @level:	block level (not referenced by the function body)
+ *
+ * Returns 0 if block is invalid, 1 if it is valid.
+ */
+int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix,
+		u8 level)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct inode *inode;
+	struct page *page;
+	int ret, cookie;
+
+	/* Umount closes a segment with free blocks remaining.  Those
+	 * blocks are by definition invalid. */
+	if (ino == -1)
+		return 0;
+
+	LOGFS_BUG_ON((u64)(u_long)ino != ino, sb);	/* ino must fit a u_long */
+
+	inode = logfs_iget(sb, ino, &cookie);
+	if (!inode)
+		return 0;	/* inode cannot be obtained: block is not valid */
+
+	ret = __logfs_is_valid_block(inode, bix, ofs);
+	logfs_iput(inode, cookie);
+	if (ret)
+		return ret;
+
+	/* Block may sit in the shadow of a dirty ifile block, so check again
+	 * in the ifile, with properly forged parameters */
+	ret = __logfs_is_valid_block(super->s_master_inode, ino, ofs);
+	if (ret)
+		return ret;
+
+	/* Another check - the leaf blocks are usually ignored */
+	page = logfs_get_write_page(super->s_master_inode, ino, 0);
+	if (!page)
+		return 0;
+	ret = logfs_is_valid_shadow(page, ofs);
+	logfs_put_write_page(super->s_master_inode, page);
+	return ret;
+}
+
+int logfs_readpage_nolock(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	int err = logfs_read_block(inode, page);
+
+	/* Mirror the outcome of the read in the page flags: Uptodate on
+	 * success, Error on failure. */
+	if (!err) {
+		SetPageUptodate(page);
+		ClearPageError(page);
+	} else {
+		ClearPageUptodate(page);
+		SetPageError(page);
+	}
+	flush_dcache_page(page);
+	return err;
+}
+
+static int logfs_reserve_bytes(struct inode *inode, int bytes)
+{
+	struct logfs_super *super = logfs_super(inode->i_sb);
+
+	/* asking for nothing always succeeds */
+	if (bytes == 0)
+		return 0;
+	/* the request must fit on top of s_gc_reserve */
+	if (super->s_free_bytes >= bytes + super->s_gc_reserve)
+		return 0;
+	return -ENOSPC;
+}
+
+/*
+ * Not strictly a reservation, but rather a check that we still have enough
+ * space for @blocks objects of LOGFS_MAX_OBJECTSIZE bytes each.
+ */
+static int logfs_reserve_blocks(struct inode *inode, int blocks)
+{
+	return logfs_reserve_bytes(inode, blocks * LOGFS_MAX_OBJECTSIZE);
+}
+
+static int logfs_write_inode_now(struct inode *inode, long flags)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	/* inodes without LOGFS_IF_DIRTY are left alone */
+	if (!(li->li_flags & LOGFS_IF_DIRTY))
+		return 0;
+	li->li_flags &= ~LOGFS_IF_DIRTY;
+	/* the master inode goes out via logfs_write_anchor() */
+	if (inode->i_ino != LOGFS_INO_MASTER)
+		return __logfs_write_inode(inode, flags);
+	return logfs_write_anchor(inode);
+}
+
+static int logfs_write_embedded(struct page *page, struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	void *src;
+
+	/* copy i_size bytes from the page into the inode's data area */
+	src = kmap_atomic(page, KM_USER0);
+	memcpy(li->li_data, src, i_size_read(inode));
+	flush_dcache_page(page);
+	kunmap_atomic(src, KM_USER0);
+
+	li->li_flags |= LOGFS_IF_EMBEDDED | LOGFS_IF_DIRTY;
+	return 0;
+}
+
+struct write_control {
+	struct shadow_tree *shadow_tree;	/* tree that receives the shadow */
+	u64 ofs;	/* in: current offset of the block, out: new offset */
+	long flags;	/* WF_* flags */
+};
+
+static int adj_level(u64 ino, int level)
+{
+	BUG_ON(level >= LOGFS_MAX_LEVELS);
+
+	/* ifile has separate areas */
+	if (ino == LOGFS_INO_MASTER)
+		return level + LOGFS_MAX_LEVELS;
+
+	return level;
+}
+
+static struct logfs_shadow *alloc_shadow(struct inode *inode, u64 bix, u8 level,
+		u64 old_ofs)
+{
+	struct logfs_shadow *shadow;
+
+	shadow = mempool_alloc(logfs_super(inode->i_sb)->s_shadow_pool, GFP_KERNEL);
+	memset(shadow, 0, sizeof(*shadow)); /* recycled pool elements are stale */
+	shadow->ino = inode->i_ino;
+	shadow->bix = bix;
+	shadow->level = adj_level(inode->i_ino, level);
+	shadow->old_ofs = old_ofs & ~LOGFS_FULLY_POPULATED;
+	return shadow;
+}
+
+static void free_shadow(struct inode *inode, struct logfs_shadow *shadow)
+{
+	struct logfs_super *super = logfs_super(inode->i_sb);
+
+	mempool_free(shadow, super->s_shadow_pool); /* the pool alloc_shadow() uses */
+}
+
+static void shadow_tree_merge(struct shadow_tree *target,
+		struct shadow_tree *victim)
+{
+	btree_merge(&target->new, &victim->new);	/* "new" into "new" */
+	btree_merge(&target->old, &victim->old);	/* "old" into "old" */
+}
+
+static void add_shadow_tree_to_page(struct page *page,
+		struct shadow_tree *shadow_tree)
+{
+	/* a missing or completely empty tree leaves the page untouched */
+	if (!shadow_tree)
+		return;
+	if (!shadow_tree->old.height && !shadow_tree->new.height)
+		return;
+	shadow_tree_merge(logfs_page_to_tree(page), shadow_tree);
+}
+
+static void fill_shadow_tree(struct shadow_tree *tree, struct page *page,
+		struct logfs_shadow *shadow)
+{
+	struct logfs_super *super = logfs_super(page->mapping->host->i_sb);
+
+	if (PagePrivate(page)) {	/* page carries a logfs_block: absorb it */
+		shadow_tree_merge(tree, logfs_page_to_tree(page));
+		list_del(&logfs_block(page)->dirty_list);
+		ClearPagePrivate(page);
+		mempool_free(logfs_block(page), super->s_block_pool);
+		page->private = 0;
+	}
+	if (shadow->old_ofs)	/* shadows with an old offset are keyed by it */
+		btree_insert(&tree->old, shadow->old_ofs, shadow);
+	else
+		btree_insert(&tree->new, shadow->new_ofs, shadow);
+
+	super->s_dirty_used_bytes += shadow->new_len;	/* per-sb accounting */
+	super->s_dirty_free_bytes += shadow->old_len;
+}
+
+/*
+ * File is too large for embedded data when called.  Move data to first
+ * block and clear embedded area.
+ */
+static int logfs_move_embedded(struct inode *inode, struct page *page)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	struct logfs_shadow *shadow;
+	void *buf;
+	int err, i;
+	pgoff_t index = page->index;
+
+	if (!(li->li_flags & LOGFS_IF_EMBEDDED))
+		return 0;
+
+	if (logfs_reserve_blocks(inode, 1))
+		return -ENOSPC;
+
+	if (index == 0) {
+		/* No need to write the page twice */
+		li->li_data[0] = 0;
+	} else {
+		page = logfs_get_read_page(inode, 0, 0);
+		if (!page)
+			return -ENOMEM;
+
+		buf = kmap_atomic(page, KM_USER0);
+		memcpy(buf, li->li_data, LOGFS_EMBEDDED_SIZE);
+		flush_dcache_page(page);
+		kunmap_atomic(buf, KM_USER0);
+
+		shadow = alloc_shadow(inode, 0, 0, 0);
+		err = logfs_segment_write(inode, page, shadow);
+		if (err) {
+			logfs_put_read_page(page);
+			free_shadow(inode, shadow);
+			return err;
+		}
+		/* fill_shadow_tree() still looks at the page: put it afterwards */
+		fill_shadow_tree(&li->li_shadow_tree, page, shadow);
+		logfs_put_read_page(page);
+		li->li_data[0] = shadow->new_ofs | LOGFS_FULLY_POPULATED;
+	}
+
+	li->li_flags &= ~LOGFS_IF_EMBEDDED;
+	li->li_flags |= LOGFS_IF_DIRTY;
+	for (i = 1; i < LOGFS_EMBEDDED_FIELDS; i++)
+		li->li_data[i] = 0;
+
+	return 0;
+}
+
+static int logfs_write_i0(struct inode *inode, struct page *page,
+		struct write_control *wc)
+{
+	struct logfs_shadow *shadow;
+	u64 bix;
+	u8 level;
+	int err = 0;
+
+	logfs_unpack_index(page->index, &bix, &level);
+	if (wc->ofs == 0)	/* block has no offset yet: check for free space */
+		if (logfs_reserve_blocks(inode, 1))
+			return -ENOSPC;
+
+	shadow = alloc_shadow(inode, bix, level, wc->ofs);
+	if (wc->flags & WF_WRITE)
+		err = logfs_segment_write(inode, page, shadow);
+	if (wc->flags & WF_DELETE)	/* NOTE(review): also runs after a failed write */
+		logfs_segment_delete(inode, shadow);
+	if (err) {
+		free_shadow(inode, shadow);
+		return err;
+	}
+
+	fill_shadow_tree(wc->shadow_tree, page, shadow);
+	wc->ofs = shadow->new_ofs;	/* hand the new offset back to the caller */
+	if (wc->ofs && ((level == 0) || (page_state(page) == bs_fully_populated)))
+		wc->ofs |= LOGFS_FULLY_POPULATED;
+	return 0;
+}
+
+static int logfs_write_direct(struct inode *inode, struct page *page,
+		long flags)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	pgoff_t slot = page->index;
+	struct write_control wc = {
+		.flags = flags,
+		.shadow_tree = &li->li_shadow_tree,
+		.ofs = li->li_data[slot],
+	};
+	int err = logfs_write_i0(inode, page, &wc);
+
+	if (err)
+		return err;
+	/* remember the new location in the inode's direct pointer */
+	li->li_data[slot] = wc.ofs;
+	li->li_flags |= LOGFS_IF_DIRTY;
+	return 0;
+}
+
+static void logfs_dirty_page(struct inode *inode, struct page *page, long flags)
+{
+	struct logfs_block *block;
+	struct logfs_super *super = logfs_super(inode->i_sb);
+
+	/* The assertion below is nicer to debug than random corruption due
+	 * to buggerhead being the default. */
+	BUG_ON(!page_mapping(page)->a_ops->set_page_dirty);
+
+	alloc_block(page);	/* the page needs a logfs_block for the dirty list */
+	block = logfs_block(page);
+	mark_inode_dirty(inode);
+	set_page_dirty(page);
+	if (flags & WF_GC)	/* pages dirtied with WF_GC are tracked separately */
+		logfs_dirty_for_gc(inode->i_sb, block);
+	else
+		list_move_tail(&block->dirty_list, &super->s_dirty_list);
+}
+
+static int __logfs_write_rec(struct inode *inode, struct page *page,
+		struct write_control *this_wc,
+		pgoff_t bix, int target_level, int level)
+{
+	int ret;
+	struct page *ipage;
+	struct write_control child_wc = {
+		.flags = this_wc->flags,
+	};
+
+	ipage = logfs_get_write_page(inode, bix, level);	/* indirect block */
+	if (!ipage)
+		return -ENOMEM;
+
+	if (this_wc->ofs) {	/* indirect block has an offset: read it */
+		ret = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level);
+		if (ret)
+			goto out;
+	} else {	/* no offset yet: start from a zeroed indirect block */
+		if (PageZero(ipage))
+			ClearPageZero(ipage);
+		else if (!PageUptodate(ipage))
+			zero_user_segment(ipage, 0, PAGE_SIZE);
+	}
+	child_wc.shadow_tree = logfs_page_to_tree(ipage);
+	child_wc.ofs = block_get_pointer(ipage, get_bits(bix, level-1));
+
+	if (level-1 > target_level)	/* not at the target yet: descend further */
+		ret = __logfs_write_rec(inode, page, &child_wc, bix,
+				target_level, level-1);
+	else
+		ret = logfs_write_i0(inode, page, &child_wc);
+
+	if (ret)
+		goto out;
+
+	/* TODO: both operations use kmap_atomic, combine them */
+	block_set_pointer(ipage, get_bits(bix, level-1), child_wc.ofs);
+	if (child_wc.ofs || page_state(ipage) != bs_all_zero)
+		this_wc->flags |= WF_WRITE;
+	/* TODO: use write-back caching for ifile as well */
+	/* the condition on this_wc->ofs ensures that we won't consume extra
+	 * space for indirect blocks in the future, which we cannot reserve */
+	if ((this_wc->flags & WF_SYNC) || !this_wc->ofs)
+		ret = logfs_write_i0(inode, ipage, this_wc);
+	else
+		logfs_dirty_page(inode, ipage, this_wc->flags);
+out:
+	logfs_put_write_page(inode, ipage);	/* common exit, also for errors */
+	return ret;
+}
+
+static int logfs_write_rec(struct inode *inode, struct page *page,
+		pgoff_t bix, int count, int target_level, long flags)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	int slot = I1_INDEX + count;
+	struct write_control wc = {
+		.flags = flags,
+		.shadow_tree = &li->li_shadow_tree,
+		.ofs = li->li_data[slot],
+	};
+	int ret;
+
+	if (count+1 <= target_level)
+		ret = logfs_write_i0(inode, page, &wc);
+	else
+		ret = __logfs_write_rec(inode, page, &wc, bix, target_level,
+				count+1);
+	/* the inode only becomes dirty when its root pointer has moved */
+	if (!ret && li->li_data[slot] != wc.ofs) {
+		li->li_flags |= LOGFS_IF_DIRTY;
+		li->li_data[slot] = wc.ofs;
+	}
+	return ret;
+}
+
+/*
+ * We are protected by write lock.  Push victims up to superblock level
+ * and release transaction when appropriate.  logfs_write_inode_now(inode)
+ * will then finish the transaction when writing the master inode to the
+ * journal.
+ */
+static void logfs_handle_transaction(struct inode *inode,
+		struct logfs_transaction *ta)
+{
+	struct logfs_super *super = logfs_super(inode->i_sb);
+
+	if (!ta)	/* write is not part of a transaction */
+		return;
+
+	if (inode->i_ino != LOGFS_INO_MASTER) {
+		/* just remember the transaction until inode is written */
+		BUG_ON(logfs_inode(inode)->li_transaction);
+		logfs_inode(inode)->li_transaction = ta;
+		logfs_inode(inode)->li_flags |= LOGFS_IF_DIRTY;
+		return;
+	}
+
+	switch (ta->state) {	/* master inode: mirror the state in the superblock */
+	case CREATE_1: /* fall through */
+	case UNLINK_1:
+		BUG_ON(super->s_victim_ino);
+		super->s_victim_ino = ta->ino;
+		break;
+	case CREATE_2: /* fall through */
+	case UNLINK_2:
+		BUG_ON(super->s_victim_ino != ta->ino);
+		super->s_victim_ino = 0;
+		/* transaction ends here - free it */
+		kfree(ta);
+		break;
+	case CROSS_RENAME_1:
+		BUG_ON(super->s_rename_dir);
+		BUG_ON(super->s_rename_pos);
+		super->s_rename_dir = ta->dir;
+		super->s_rename_pos = ta->pos;
+		break;
+	case CROSS_RENAME_2:
+		BUG_ON(super->s_rename_dir != ta->dir);
+		BUG_ON(super->s_rename_pos != ta->pos);
+		super->s_rename_dir = 0;
+		super->s_rename_pos = 0;
+		kfree(ta);
+		break;
+	case TARGET_RENAME_1:
+		BUG_ON(super->s_rename_dir);
+		BUG_ON(super->s_rename_pos);
+		BUG_ON(super->s_victim_ino);
+		super->s_rename_dir = ta->dir;
+		super->s_rename_pos = ta->pos;
+		super->s_victim_ino = ta->ino;
+		break;
+	case TARGET_RENAME_2:	/* ta is not freed here, unlike in the final states */
+		BUG_ON(super->s_rename_dir != ta->dir);
+		BUG_ON(super->s_rename_pos != ta->pos);
+		BUG_ON(super->s_victim_ino != ta->ino);
+		super->s_rename_dir = 0;
+		super->s_rename_pos = 0;
+		break;
+	case TARGET_RENAME_3:
+		BUG_ON(super->s_rename_dir);
+		BUG_ON(super->s_rename_pos);
+		BUG_ON(super->s_victim_ino != ta->ino);
+		super->s_victim_ino = 0;
+		kfree(ta);
+		break;
+	default:
+		BUG();
+	}
+}
+
+static int __logfs_write_buf(struct inode *inode, struct page *page,
+		struct logfs_transaction *ta, long flags)
+{
+	u64 size = i_size_read(inode);
+	pgoff_t index = page->index;
+	int err;
+	u64 bix;
+	u8 level;
+
+	flags |= WF_WRITE | WF_DELETE;	/* write the new copy, delete the old */
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+	logfs_handle_transaction(inode, ta);
+
+	if (size <= LOGFS_EMBEDDED_SIZE)	/* small enough to live in the inode */
+		return logfs_write_embedded(page, inode);
+
+	err = logfs_move_embedded(inode, page);	/* no-op unless still embedded */
+	if (err)
+		return err;
+
+	if (index < I0_BLOCKS)
+		return logfs_write_direct(inode, page, flags);
+
+	logfs_unpack_index(index, &bix, &level);
+	bix = adjust_bix(bix, level);
+	if (bix < I1_BLOCKS)
+		return logfs_write_rec(inode, page, bix, 0, level, flags);
+	if (bix < I2_BLOCKS)
+		return logfs_write_rec(inode, page, bix, 1, level, flags);
+	if (bix < I3_BLOCKS)
+		return logfs_write_rec(inode, page, bix, 2, level, flags);
+
+	BUG();	/* block index beyond I3_BLOCKS */
+	return -EIO;
+}
+
+int logfs_write_buf(struct inode *inode, struct page *page,
+		struct logfs_transaction *ta, long flags)
+{
+	struct super_block *sb = inode->i_sb;
+	int ret;
+
+	logfs_get_wblocks(sb, page, flags & WF_LOCK);
+
+	ret = __logfs_write_buf(inode, page, ta, flags);
+	BUG_ON(PagePrivate(page));	/* the page must not keep a logfs_block */
+	if (!ret)	/* write the inode as well, with WF_LOCK masked out */
+		ret = logfs_write_inode_now(inode, flags & ~WF_LOCK);
+	logfs_put_wblocks(sb, page, flags & WF_LOCK);
+	return ret;
+}
+
+static int __logfs_delete(struct inode *inode, struct page *page)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	long flags = WF_DELETE | WF_SYNC;	/* delete only, no WF_WRITE */
+
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+	if (li->li_flags & LOGFS_IF_EMBEDDED) {	/* embedded: just set i_size to 0 */
+		i_size_write(inode, 0);
+		li->li_flags |= LOGFS_IF_DIRTY;
+		return 0;
+	}
+
+	if (page->index < I0_BLOCKS)
+		return logfs_write_direct(inode, page, flags);
+	if (page->index < I1_BLOCKS)
+		return logfs_write_rec(inode, page, page->index, 0, 0, flags);
+	if (page->index < I2_BLOCKS)
+		return logfs_write_rec(inode, page, page->index, 1, 0, flags);
+	if (page->index < I3_BLOCKS)
+		return logfs_write_rec(inode, page, page->index, 2, 0, flags);
+	return 0;	/* indices beyond I3_BLOCKS are silently accepted */
+}
+
+int logfs_delete(struct inode *inode, pgoff_t index,
+		struct shadow_tree *shadow_tree, struct logfs_transaction *ta)
+{
+	struct super_block *sb = inode->i_sb;
+	struct page *page;
+	int ret;
+
+	page = logfs_get_read_page(inode, index, 0);
+	if (!page)
+		return -ENOMEM;
+
+	add_shadow_tree_to_page(page, shadow_tree);	/* no-op for NULL/empty trees */
+	logfs_get_wblocks(sb, page, 1);
+	logfs_handle_transaction(inode, ta);
+	ret = __logfs_delete(inode, page);
+	if (!ret)
+		ret = logfs_write_inode_now(inode, WF_SYNC);
+	logfs_put_wblocks(sb, page, 1);
+
+	SetPageZero(page);	/* set regardless of the value of ret */
+	logfs_put_read_page(page);
+
+	return ret;
+}
+
+/* Rewrite cannot mark the inode dirty but has to write it immediately. */
+int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs, int level,
+		long flags)
+{
+	struct page *page;
+	int err;
+
+	level = adjust_level(level);
+	page = logfs_get_write_page(inode, bix, level);
+	if (!page)
+		return -ENOMEM;
+
+	err = logfs_segment_read(inode, page, ofs, bix, level);	/* current copy */
+	if (!err)
+		err = logfs_write_buf(inode, page, NULL, flags);	/* no transaction */
+	logfs_put_write_page(inode, page);
+	return err;
+}
+
+static int truncate_data_block(struct inode *inode, struct page *page,
+		u64 ofs, struct logfs_shadow *shadow)
+{
+	loff_t size = i_size_read(inode); /* NOTE(review): i_size, not an argument */
+	/* widen before shifting: pgoff_t is only 32 bits on 32-bit machines */
+	loff_t pageofs = (loff_t)page->index << inode->i_sb->s_blocksize_bits;
+	u64 bix;
+	u8 level;
+	int err;
+
+	/* Does truncation happen within this page? */
+	if (size <= pageofs || size - pageofs >= PAGE_SIZE)
+		return 0;
+
+	logfs_unpack_index(page->index, &bix, &level);
+	BUG_ON(level > 0);
+	err = logfs_segment_read(inode, page, ofs, bix, level);
+	if (err)
+		return err;
+
+	zero_user_segment(page, size - pageofs, PAGE_CACHE_SIZE);
+	return logfs_segment_write(inode, page, shadow);
+}
+
+static int __logfs_truncate_i0(struct inode *inode, struct page *page,
+		struct write_control *wc)
+{
+	struct logfs_shadow *shadow;
+	u64 bix;
+	u8 level;
+	int err = 0;
+
+	logfs_unpack_index(page->index, &bix, &level);
+	shadow = alloc_shadow(inode, bix, level, wc->ofs);
+
+	if (level == 0)	/* data block: may need a partial rewrite */
+		err = truncate_data_block(inode, page, wc->ofs, shadow);
+	/* Indirect blocks can get removed completely */
+	if (err) {
+		free_shadow(inode, shadow);
+		return err;
+	}
+
+	logfs_segment_delete(inode, shadow);
+	fill_shadow_tree(wc->shadow_tree, page, shadow);
+	wc->ofs = shadow->new_ofs;	/* hand the new offset back to the caller */
+	return 0;
+}
+
+/*
+ * Truncate the blocks behind the direct pointers, highest slot first.
+ */
+static int logfs_truncate_direct(struct inode *inode, u64 size)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	struct write_control wc = {
+		.shadow_tree =  &li->li_shadow_tree,
+	};
+	struct page *page;
+	int e;
+	int err;
+
+	for (e = I1_INDEX - 1; e >= 0; e--) {
+		/*
+		 * Block e ends at (e+1) * LOGFS_BLOCKSIZE.  With @size right at
+		 * that boundary the block is still fully in use and must stay.
+		 */
+		if (size >= (e+1) * LOGFS_BLOCKSIZE)
+			break;
+
+		wc.ofs = li->li_data[e];
+		if (!wc.ofs)
+			continue;
+
+		page = logfs_get_write_page(inode, e, 0);
+		if (!page)
+			return -ENOMEM;
+		err = __logfs_truncate_i0(inode, page, &wc);
+		logfs_put_write_page(inode, page);
+		if (err)
+			return err;
+
+		li->li_data[e] = wc.ofs;
+		li->li_flags |= LOGFS_IF_DIRTY;
+	}
+	return 0;
+}
+
+/* FIXME: these need to become per-sb once we support different blocksizes */
+static u64 logfs_factor[] = {	/* compared against byte sizes when truncating */
+	LOGFS_BLOCKSIZE,
+	LOGFS_I1_SIZE,
+	LOGFS_I2_SIZE,
+	LOGFS_I3_SIZE
+};
+
+static u64 logfs_foo[] = {	/* block index stride per slot, by level-1 */
+	1,
+	I1_BLOCKS,
+	I2_BLOCKS,
+	I3_BLOCKS,
+};
+
+static u64 logfs_start_index[] = {	/* read by logfs_unpack_raw_index() */
+	I0_BLOCKS,
+	I1_BLOCKS,
+	I2_BLOCKS,
+	I3_BLOCKS
+};
+
+static void logfs_unpack_raw_index(pgoff_t index, u64 *bix, u8 *level)
+{
+	logfs_unpack_index(index, bix, level);
+	if (*bix <= logfs_start_index[*level-1])	/* NOTE(review): needs *level >= 1 */
+		*bix = 0;
+}
+
+static int __logfs_truncate_rec(struct inode *inode, struct page *ipage,
+		struct write_control *this_wc, u64 size)
+{
+	int truncate_happened = 0;
+	int e;
+	int err = 0;
+	u64 bix, child_bix;
+	u8 level;
+	struct page *page;
+	struct write_control child_wc = {
+		.shadow_tree = logfs_page_to_tree(ipage),
+	};
+
+	logfs_unpack_raw_index(ipage->index, &bix, &level);
+	err = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level);
+	if (err)
+		return err;
+
+	for (e = LOGFS_BLOCK_FACTOR - 1; e >= 0; e--) {	/* last slot first */
+		child_bix = bix + e*logfs_foo[level-1];
+		if (size > (e+1) * logfs_factor[level-1]) {	/* NOTE(review): bix is not part of this bound */
+			if (truncate_happened)
+				BUG(); /* FIXME: Write out truncated block */
+			return 0;
+		}
+
+		child_wc.ofs = pure_ofs(block_get_pointer(ipage, e));
+		if (!child_wc.ofs)	/* empty slot, nothing to truncate */
+			continue;
+
+		truncate_happened = 1;
+		page = logfs_get_write_page(inode, child_bix, level-1);
+		if (!page)
+			return -ENOMEM;
+
+		if (level > 1)	/* child is an indirect block itself */
+			err = __logfs_truncate_rec(inode, page, &child_wc, size);
+		else
+			err = __logfs_truncate_i0(inode, page, &child_wc);
+		logfs_put_write_page(inode, page);
+		if (err)
+			return err;
+
+		block_set_pointer(ipage, e, child_wc.ofs);
+	}
+	/* Complete block can get removed if we get here */
+	return __logfs_truncate_i0(inode, ipage, this_wc);
+}
+
+static int logfs_truncate_rec(struct inode *inode, u64 size, int level)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	int slot = I1_INDEX + level-1;
+	struct write_control wc = {
+		.shadow_tree = &li->li_shadow_tree,
+		.ofs = li->li_data[slot],
+	};
+	struct page *page;
+	int err;
+
+	/* without a root block there is nothing to truncate */
+	if (!wc.ofs)
+		return 0;
+
+	page = logfs_get_write_page(inode, 0, level);
+	if (!page)
+		return -ENOMEM;
+
+	err = __logfs_truncate_rec(inode, page, &wc, size);
+	logfs_put_write_page(inode, page);
+	/* store the root pointer only if the truncate actually moved it */
+	if (!err && li->li_data[slot] != wc.ofs) {
+		li->li_data[slot] = wc.ofs;
+		li->li_flags |= LOGFS_IF_DIRTY;
+	}
+	return err;
+}
+
+static int logfs_truncate_embedded(struct inode *inode, u64 size)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	/* compute pointer and length only once @size is known to be in range */
+	if (size < LOGFS_EMBEDDED_SIZE)
+		memset((void *)li->li_data + size, 0,
+				LOGFS_EMBEDDED_SIZE - size);
+	li->li_flags |= LOGFS_IF_DIRTY;
+	return 0;
+}
+
+/*
+ * Truncate the block tree of @inode to @size bytes.  Embedded inodes are
+ * handled separately.  Otherwise the indirect trees are truncated from
+ * level 3 down to level 1, followed by the direct pointers.  The walk ends
+ * early, leaving the remaining levels alone, as soon as @size is not below
+ * logfs_factor[level].
+ */
+static int __logfs_truncate(struct inode *inode, u64 size)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	int level, ret;
+
+	/* embedded data is cleared in place */
+	if (li->li_flags & LOGFS_IF_EMBEDDED)
+		return logfs_truncate_embedded(inode, size);
+
+	for (level = 3; level >= 1; level--) {
+		/* nothing left to cut at this or any lower level */
+		if (size >= logfs_factor[level])
+			return 0;
+
+		ret = logfs_truncate_rec(inode, size, level);
+		if (ret)
+			return ret;
+	}
+
+	/* finally the blocks referenced directly from the inode */
+	return logfs_truncate_direct(inode, size);
+}
+
+int logfs_truncate(struct inode *inode, u64 size)
+{
+	struct super_block *sb = inode->i_sb;
+	int err;
+
+	logfs_get_wblocks(sb, NULL, 1);
+	err = __logfs_truncate(inode, size);
+	if (!err)
+		err = logfs_write_inode_now(inode, 0);
+	logfs_put_wblocks(sb, NULL, 1);
+
+	if (!err)	/* block tree is done, now the generic vmtruncate() */
+		err = vmtruncate(inode, size);
+
+	if (!err && size == 0)	/* an empty file goes back to embedded mode */
+		logfs_inode(inode)->li_flags |= LOGFS_IF_EMBEDDED;
+
+	return err;
+}
+
+int logfs_inode_read(struct inode *inode, void *buf, size_t n, loff_t bix)
+{
+	loff_t pos = bix << inode->i_sb->s_blocksize_bits;
+	struct page *page;
+	void *pagebuf;
+	int err = -ENODATA;	/* reported for pages flagged PageZero */
+
+	if (pos >= i_size_read(inode))
+		return -EOF;
+	page = read_cache_page(inode->i_mapping, bix,
+			(filler_t *)logfs_readpage, NULL);
+	if (IS_ERR(page))
+		return PTR_ERR(page);
+	if (!PageZero(page)) {
+		pagebuf = kmap_atomic(page, KM_USER0);
+		memcpy(buf, pagebuf, n);
+		kunmap_atomic(pagebuf, KM_USER0);
+		err = 0;
+	}
+	page_cache_release(page); /* drop the read_cache_page() reference */
+	return err;
+}
+
+/**
+ * logfs_inode_write - write inode or dentry objects
+ *
+ * @inode:		parent inode (ifile or directory)
+ * @buf:		object to write (inode or dentry)
+ * @count:		object size
+ * @bix:		object number (file position in blocks/objects)
+ * @flags:		write flags; without WF_SYNC the page is only marked
+ *			dirty and no write happens here
+ * @ta:			transaction this write is part of or NULL
+ * @shadow_tree:	shadow below this inode
+ */
+int logfs_inode_write(struct inode *inode, const void *buf, size_t count,
+		loff_t bix, long flags, struct logfs_transaction *ta,
+		struct shadow_tree *shadow_tree)
+{
+	loff_t pos = bix << inode->i_sb->s_blocksize_bits;
+	int err;
+	struct page *page;
+	void *pagebuf;
+
+	BUG_ON(pos & (LOGFS_BLOCKSIZE-1));
+	BUG_ON(count > LOGFS_BLOCKSIZE);
+	page = logfs_get_read_page(inode, bix, 0);
+	if (!page)
+		return -ENOMEM;
+
+	pagebuf = kmap_atomic(page, KM_USER0);
+	memcpy(pagebuf, buf, count);
+	memset(pagebuf+count, 0, LOGFS_BLOCKSIZE-count);	/* zero-pad the block */
+	flush_dcache_page(page);
+	kunmap_atomic(pagebuf, KM_USER0);
+	ClearPageZero(page);
+	add_shadow_tree_to_page(page, shadow_tree);
+
+	if (!(flags & WF_SYNC)) {	/* NOTE(review): i_size is not updated on this path */
+		logfs_dirty_page(inode, page, flags);
+		logfs_put_read_page(page);
+		return 0;
+	}
+	/*
+	 * Drop the page lock, but keep a reference on the page until
+	 * logfs_write_buf returns.  This allows GC to move this page while
+	 * ensuring the page doesn't get assigned elsewhere under memory
+	 * pressure.
+	 */
+	unlock_page(page);
+
+	if (i_size_read(inode) < pos + LOGFS_BLOCKSIZE)	/* grow the file if needed */
+		i_size_write(inode, pos + LOGFS_BLOCKSIZE);
+
+	err = logfs_write_buf(inode, page, ta, flags);
+	page_cache_release(page);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+int logfs_init_rw(struct logfs_super *super)
+{
+	int min_fill = 3 * super->s_no_blocks;	/* minimum element count of each pool */
+
+	mutex_init(&super->s_w_mutex);
+	super->s_block_pool = mempool_create_kzalloc_pool(min_fill,
+			sizeof(struct logfs_block));
+	super->s_shadow_pool = mempool_create_kzalloc_pool(min_fill,
+			sizeof(struct logfs_shadow));
+	return 0;	/* NOTE(review): the pool creation results are not checked */
+}
+
+void logfs_cleanup_rw(struct logfs_super *super)
+{
+	mempool_destroy(super->s_block_pool);	/* pools set up by logfs_init_rw() */
+	mempool_destroy(super->s_shadow_pool);
+}
--- /dev/null	2008-04-02 16:29:12.813336657 +0200
+++ git/fs/logfs/segment.c	2008-04-07 11:53:20.919877985 +0200
@@ -0,0 +1,595 @@
+/*
+ * fs/logfs/segment.c	- Handling the Object Store
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2007 Joern Engel <joern@logfs.org>
+ *
+ * Object store or ostore makes up the complete device with exception of
+ * the superblock and journal areas.  Apart from its own metadata it stores
+ * three kinds of objects: inodes, dentries and blocks, both data and indirect.
+ */
+#include "logfs.h"
+
+/*
+ * Erase the segment with the given index.  s_gec is incremented and the
+ * device synced before s_segsize bytes are erased at the segment's start
+ * offset.  Returns the result of the device's erase method.
+ */
+int logfs_erase_segment(struct super_block *sb, u32 index)
+{
+	struct logfs_super *super = logfs_super(sb);
+	u64 seg_start;
+
+	super->s_gec++;
+	super->s_devops->sync(sb);
+
+	seg_start = (u64)index << super->s_segshift;
+	return super->s_devops->erase(sb, seg_start, super->s_segsize);
+}
+
+/*
+ * Reserve @bytes in @area, opening the area first if necessary.
+ *
+ * Returns the device offset of the reserved range, or the negative error
+ * from logfs_open_area().  Filling the segment completely is a BUG.
+ */
+static s64 logfs_get_free_bytes(struct logfs_area *area, size_t bytes)
+{
+	s32 start;
+	int err;
+
+	err = logfs_open_area(area);
+	BUG_ON(err > 0);
+	if (err)
+		return err;
+
+	/* remember where the reservation begins, then advance the fill mark */
+	start = area->a_used_bytes;
+	area->a_used_bytes += bytes;
+	BUG_ON(area->a_used_bytes >= logfs_super(area->a_sb)->s_segsize);
+
+	return dev_ofs(area->a_sb, area->a_segno, start);
+}
+
+/*
+ * Write @len bytes from @data at device offset @ofs, going through the
+ * area's write buffer (a_wbuf).
+ *
+ * Data reaches the device in multiples of s_writesize.  A partially filled
+ * buffer is topped up first and written once it is full, complete
+ * s_writesize hunks are written straight from @data, and whatever is left
+ * over is copied to the start of a_wbuf.
+ *
+ * NOTE(review): the return values of s_devops->write() are not checked here.
+ */
+void logfs_buf_write(struct logfs_area *area, u64 ofs, void *data, size_t len)
+{
+	struct super_block *sb = area->a_sb;
+	struct logfs_super *super = logfs_super(sb);
+	long write_mask = super->s_writesize - 1;
+	u64 buf_start;
+	size_t space, buf_ofs;
+
+	/* position of @ofs within its s_writesize-sized hunk */
+	buf_ofs = (long)ofs & write_mask;
+	if (buf_ofs) {
+		/* buf already used - fill it */
+		space = super->s_writesize - buf_ofs;
+		if (len < space) {
+			/* not enough to fill it - just copy */
+			memcpy(area->a_wbuf + buf_ofs, data, len);
+			return;
+		}
+		/* enough data to fill and flush the buffer */
+		memcpy(area->a_wbuf + buf_ofs, data, space);
+		buf_start = ofs & ~write_mask;
+		super->s_devops->write(sb, buf_start, super->s_writesize, area->a_wbuf);
+		ofs += space;
+		data += space;
+		len -= space;
+	}
+
+	/* write complete hunks */
+	space = len & ~write_mask;
+	if (space) {
+		super->s_devops->write(sb, ofs, space, data);
+		ofs += space;
+		data += space;
+		len -= space;
+	}
+
+	/* store anything remaining in wbuf */
+	if (len)
+		memcpy(area->a_wbuf, data, len);
+}
+
+/* Return the area stored in s_area[] for the given level. */
+static struct logfs_area *get_area(struct super_block *sb, int level)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	return super->s_area[level];
+}
+
+/*
+ * Write one block object (object header followed by @len bytes of @buf)
+ * into the area selected by shadow->level.
+ *
+ * The device offset of the new object is stored in shadow->new_ofs.
+ * shadow->new_len is the accounted size: @len plus header for level 0, a
+ * full block plus header for every other level.
+ *
+ * Always returns 0; a non-positive offset from logfs_get_free_bytes() trips
+ * LOGFS_BUG_ON().
+ */
+static int __logfs_segment_write(struct inode *inode, void *buf,
+		struct logfs_shadow *shadow, int len, int compr)
+{
+	struct logfs_area *area;
+	struct super_block *sb = inode->i_sb;
+	s64 ofs;
+	struct logfs_object_header h;
+	int acc_len = (shadow->level == 0) ? len : sb->s_blocksize;
+
+	/*
+	 * NOTE(review): h is not zeroed first; presumably every field of
+	 * struct logfs_object_header is assigned below - verify.
+	 */
+	h.len	= cpu_to_be16(len);
+	h.type	= OBJ_BLOCK;
+	h.compr	= compr;
+	h.ino	= cpu_to_be64(inode->i_ino);
+	h.bix	= cpu_to_be64(shadow->bix);
+	/* the header crc is computed before data_crc is filled in */
+	h.crc	= logfs_crc32(&h, sizeof(h) - 4, 4);
+	h.data_crc = logfs_crc32(buf, len, 0);
+
+	area = get_area(sb, shadow->level);
+	ofs = logfs_get_free_bytes(area, len + LOGFS_HEADERSIZE);
+	LOGFS_BUG_ON(ofs <= 0, sb);
+
+	/* header first, payload directly behind it */
+	logfs_buf_write(area, ofs, &h, sizeof(h));
+	logfs_buf_write(area, ofs + LOGFS_HEADERSIZE, buf, len);
+
+	shadow->new_ofs = ofs;
+	shadow->new_len = acc_len + LOGFS_HEADERSIZE;
+
+	pr_debug("%2x %2x\n", area->a_level, area->a_segno);
+	/* FIXME merge with open_area */
+	logfs_close_area(area);
+
+	return 0;
+}
+
+/*
+ * Compress one block into s_compressed_je and write the result.  If
+ * logfs_compress() reports failure (negative length) the block is written
+ * uncompressed instead.  s_journal_mutex is held across compression and
+ * the write.
+ */
+static s64 logfs_segment_write_compress(struct inode *inode, void *buf,
+		struct logfs_shadow *shadow)
+{
+	struct super_block *sb = inode->i_sb;
+	struct logfs_super *super = logfs_super(sb);
+	void *zbuf = super->s_compressed_je;
+	int blocksize = sb->s_blocksize;
+	ssize_t zlen;
+	int err;
+
+	mutex_lock(&super->s_journal_mutex);
+	zlen = logfs_compress(buf, zbuf, blocksize, blocksize);
+	if (zlen < 0)
+		err = __logfs_segment_write(inode, buf, shadow, blocksize,
+				COMPR_NONE);
+	else
+		err = __logfs_segment_write(inode, zbuf, shadow, zlen,
+				COMPR_ZLIB);
+	mutex_unlock(&super->s_journal_mutex);
+
+	return err;
+}
+
+/**
+ * logfs_segment_write - write data block to object store
+ * @inode:		inode containing data
+ * @page:		page holding the block to write
+ * @shadow:		shadow entry; supplies bix and level and receives the
+ *			new offset and length
+ *
+ * Blocks of inodes with LOGFS_IF_COMPRESSED set are compressed, except for
+ * indirect blocks (level != 0).
+ *
+ * Returns the result of the underlying write helper.  The physical offset
+ * of the data written is stored in shadow->new_ofs.
+ */
+int logfs_segment_write(struct inode *inode, struct page *page,
+		struct logfs_shadow *shadow)
+{
+	struct super_block *sb = inode->i_sb;
+	int bs = sb->s_blocksize;
+	int do_compress;
+	int ret;
+	void *buf;
+
+	do_compress = logfs_inode(inode)->li_flags & LOGFS_IF_COMPRESSED;
+	if (shadow->level != 0) {
+		/* temporarily disable compression for indirect blocks */
+		do_compress = 0;
+	}
+
+	buf = kmap(page);
+	if (do_compress)
+		ret = logfs_segment_write_compress(inode, buf, shadow);
+	else
+		ret = __logfs_segment_write(inode, buf, shadow, bs, COMPR_NONE);
+	kunmap(page);
+
+	/* this BUG_ON did catch a locking bug.  useful */
+	BUG_ON(!(shadow->new_ofs & (logfs_super(sb)->s_segsize - 1)));
+	return ret;
+}
+
+/* FIXME: all this mess should get replaced by using the page cache */
+/*
+ * Patch a buffer that was read from the device with the data still held in
+ * the area's write buffer.
+ *
+ * @read, @ofs and @readlen describe the read.  The write buffer covers the
+ * segment-relative range from a_used_bytes rounded down to s_writesize up to
+ * a_used_bytes.  Reads that start at or beyond a_used_bytes are filled with
+ * 0xff, as is any part of the read that extends past the buffered data.
+ */
+static void fixup_from_wbuf(struct super_block *sb, struct logfs_area *area,
+		void *read, u64 ofs, size_t readlen)
+{
+	struct logfs_super *super = logfs_super(sb);
+	/* all positions below are offsets within the segment */
+	u32 read_start = ofs & (super->s_segsize - 1);
+	u32 read_end = read_start + readlen;
+	u32 writemask = super->s_writesize - 1;
+	u32 buf_start = area->a_used_bytes & ~writemask;
+	u32 buf_end = area->a_used_bytes;
+	void *buf = area->a_wbuf;
+	size_t buflen = buf_end - buf_start;
+
+	/* no write buffer, or the read ends before the buffered range */
+	if (!buf || read_end < buf_start)
+		return;
+	if ((ofs & (super->s_segsize - 1)) >= area->a_used_bytes) {
+		memset(read, 0xff, readlen);
+		return;
+	}
+
+	/* advance whichever side starts first so both begin at the same offset */
+	if (buf_start > read_start) {
+		read += buf_start - read_start;
+		readlen -= buf_start - read_start;
+	} else {
+		buf += read_start - buf_start;
+		buflen -= read_start - buf_start;
+	}
+	memcpy(read, buf, min(readlen, buflen));
+	if (buflen < readlen)
+		memset(read + buflen, 0xff, readlen - buflen);
+}
+
+/*
+ * Read @len bytes at device offset @ofs into @buf.  After the device read,
+ * the first area whose a_segno matches the segment of @ofs gets to patch
+ * the result through fixup_from_wbuf().
+ *
+ * Returns 0 on success or the error from the device read.
+ */
+int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf)
+{
+	struct logfs_super *super = logfs_super(sb);
+	u32 segno = ofs >> super->s_segshift;
+	int i, ret;
+
+	ret = super->s_devops->read(sb, ofs, len, buf);
+	if (ret)
+		return ret;
+
+	for_each_area(i) {
+		struct logfs_area *area = super->s_area[i];
+
+		if (area->a_segno != segno)
+			continue;
+		fixup_from_wbuf(sb, area, buf, ofs, len);
+		break;
+	}
+	return 0;
+}
+
+/*
+ * Per-level mask applied to block positions before they are compared, see
+ * check_pos().  Level 0 keeps every bit; levels 1-3 clear the low bits
+ * below I1_BLOCKS, I2_BLOCKS and I3_BLOCKS respectively.
+ */
+static u64 logfs_block_mask[] = {
+	~0,
+	~(I1_BLOCKS-1),
+	~(I2_BLOCKS-1),
+	~(I3_BLOCKS-1)
+};
+
+/*
+ * An indirect block has no single "position": any data block located
+ * behind it may be used to name it.  Positions are therefore compared only
+ * after masking with logfs_block_mask[level].
+ *
+ * Returns non-zero if the two positions differ, 0 if they match.
+ */
+static int check_pos(struct super_block *sb, u64 pos1, u64 pos2, int level)
+{
+	u64 mask = logfs_block_mask[level];
+
+	return ((pos1 ^ pos2) & mask) != 0;
+}
+
+/*
+ * Read one block object from device offset @ofs into @buf.
+ *
+ * The object header is read first and validated: header crc, then inode
+ * number and (level-normalized) block index against @inode and @bix, then
+ * the stored length against the block size.  The payload is read next,
+ * checked against the data crc and, for COMPR_ZLIB objects, uncompressed
+ * into @buf via s_compressed_je under s_journal_mutex.
+ *
+ * Returns 0 on success.  Every failure, including an unknown compression
+ * type, takes the out_err path, which sets the filesystem read-only and
+ * calls BUG().
+ */
+static int __logfs_segment_read(struct inode *inode, void *buf,
+		u64 ofs, u64 bix, u8 level)
+{
+	struct super_block *sb = inode->i_sb;
+	void *compressor_buf = logfs_super(sb)->s_compressed_je;
+	struct logfs_object_header h;
+	__be32 crc;
+	u16 len;
+	int err, bs = sb->s_blocksize;
+
+	err = wbuf_read(sb, ofs, LOGFS_HEADERSIZE, &h);
+	if (err)
+		goto out_err;
+	/* every validation failure below is reported as -EIO */
+	err = -EIO;
+	crc = logfs_crc32(&h, sizeof(h) - 4, 4);
+	if (crc != h.crc) {
+		printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, "
+				"got %x\n", ofs, be32_to_cpu(h.crc),
+				be32_to_cpu(crc));
+		goto out_err;
+	}
+
+	if (be64_to_cpu(h.ino) != inode->i_ino
+			|| check_pos(sb, be64_to_cpu(h.bix), bix, level)) {
+		printk(KERN_ERR"LOGFS: (ino, bix) don't match at %llx: "
+				"expected (%lx, %llx), got (%llx, %llx)\n",
+				ofs, inode->i_ino, bix,
+				be64_to_cpu(h.ino), be64_to_cpu(h.bix));
+		goto out_err;
+	}
+
+	len = be16_to_cpu(h.len);
+	/*
+	 * Objects are written with at most one block of payload, and the
+	 * payload is read into block-sized buffers.  A larger stored length
+	 * would overrun them, so treat it as corruption.
+	 */
+	if (len > bs) {
+		printk(KERN_ERR"LOGFS: bad object length %x at %llx\n",
+				len, ofs);
+		goto out_err;
+	}
+
+	switch (h.compr) {
+	case COMPR_NONE:
+		err = wbuf_read(sb, ofs + LOGFS_HEADERSIZE, len, buf);
+		if (err)
+			goto out_err;
+		err = -EIO;
+		crc = logfs_crc32(buf, len, 0);
+		if (crc != h.data_crc) {
+			printk(KERN_ERR"LOGFS: uncompressed data crc error at "
+					"%llx: expected %x, got %x\n", ofs,
+					be32_to_cpu(h.data_crc),
+					be32_to_cpu(crc));
+			goto out_err;
+		}
+		break;
+	case COMPR_ZLIB:
+		mutex_lock(&logfs_super(sb)->s_journal_mutex);
+		err = wbuf_read(sb, ofs + LOGFS_HEADERSIZE, len, compressor_buf);
+		if (err) {
+			mutex_unlock(&logfs_super(sb)->s_journal_mutex);
+			goto out_err;
+		}
+		err = -EIO;
+		crc = logfs_crc32(compressor_buf, len, 0);
+		if (crc != h.data_crc) {
+			printk(KERN_ERR"LOGFS: compressed data crc error at "
+					"%llx: expected %x, got %x\n", ofs,
+					be32_to_cpu(h.data_crc),
+					be32_to_cpu(crc));
+			mutex_unlock(&logfs_super(sb)->s_journal_mutex);
+			goto out_err;
+		}
+		err = logfs_uncompress(compressor_buf, buf, len, bs);
+		mutex_unlock(&logfs_super(sb)->s_journal_mutex);
+		if (err) {
+			printk(KERN_ERR"LOGFS: uncompress error at %llx\n", ofs);
+			goto out_err;
+		}
+		break;
+	default:
+		/* unknown compression type: never report success */
+		LOGFS_BUG(sb);
+		err = -EIO;
+		goto out_err;
+	}
+	return 0;
+
+out_err:
+	logfs_set_ro(sb);
+	printk(KERN_ERR"LOGFS: device is read-only now\n");
+	BUG();
+	return err;
+}
+
+/**
+ * logfs_segment_read - read data block from object store
+ * @inode:		inode containing data
+ * @page:		page receiving the block
+ * @ofs:		physical data offset; the LOGFS_FULLY_POPULATED bit
+ *			is masked off before use
+ * @bix:		block index
+ * @level:		block level
+ *
+ * Pages that are already uptodate are left alone.  On a successful read
+ * the page is marked uptodate.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int logfs_segment_read(struct inode *inode, struct page *page,
+		u64 ofs, u64 bix, u8 level)
+{
+	void *kaddr;
+	int ret;
+
+	if (PageUptodate(page))
+		return 0;
+
+	kaddr = kmap(page);
+	ret = __logfs_segment_read(inode, kaddr, ofs & ~LOGFS_FULLY_POPULATED,
+			bix, level);
+	kunmap(page);
+	if (ret)
+		return ret;
+
+	SetPageUptodate(page);
+	return 0;
+}
+
+/*
+ * Work out the accounted length of the object at shadow->old_ofs and store
+ * it in shadow->old_len.  Nothing is done when old_ofs is 0.
+ *
+ * The object header is re-read and must pass the crc, inode number and
+ * position checks, otherwise LOGFS_BUG_ON() fires.  Always returns 0.
+ */
+int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow)
+{
+	struct super_block *sb = inode->i_sb;
+	struct logfs_object_header h;
+	u16 payload;
+	int err;
+
+	BUG_ON(shadow->old_ofs & LOGFS_FULLY_POPULATED);
+	if (shadow->old_ofs == 0)
+		return 0;
+
+	err = wbuf_read(sb, shadow->old_ofs, sizeof(h), &h);
+	LOGFS_BUG_ON(err, sb);
+	LOGFS_BUG_ON(h.crc != logfs_crc32(&h, sizeof(h)-4, 4), sb);
+	LOGFS_BUG_ON(be64_to_cpu(h.ino) != inode->i_ino, sb);
+	LOGFS_BUG_ON(check_pos(sb, shadow->bix, be64_to_cpu(h.bix),
+				shadow->level), sb);
+
+	/* level 0 counts the stored length, other levels a full block */
+	if (shadow->level == 0)
+		payload = be16_to_cpu(h.len);
+	else
+		payload = sb->s_blocksize;
+	shadow->old_len = payload + sizeof(h);
+	return 0;
+}
+
+/*
+ * Record @segno in the s_reserved_segments btree and count it as a bad
+ * segment.  Returns 0 on success or the error from btree_insert().
+ */
+static int logfs_mark_segment_bad(struct super_block *sb, u32 segno)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int err;
+
+	err = btree_insert(&super->s_reserved_segments, segno, (void *)1);
+	if (!err) {
+		super->s_bad_segments++;
+		/* FIXME: write to journal */
+	}
+	return err;
+}
+
+/*
+ * Make sure @area has an open segment to write to.
+ *
+ * An area that is already open is left untouched.  Otherwise a segment is
+ * picked and erased through the area's ops, with a_used_bytes and the write
+ * buffer reset first.  If erasing fails the segment is marked bad and the
+ * whole sequence is retried with another segment.
+ *
+ * Returns 0.
+ */
+int logfs_open_area(struct logfs_area *area)
+{
+	size_t writesize = logfs_super(area->a_sb)->s_writesize;
+	int err;
+
+	if (area->a_is_open)
+		return 0;
+
+again:
+	area->a_ops->get_free_segment(area);
+	area->a_used_bytes = 0;
+	area->a_ops->get_erase_count(area);
+
+	if (area->a_wbuf)
+		memset(area->a_wbuf, 0, writesize);
+	area->a_is_open = 1;
+
+	err = area->a_ops->erase_segment(area);
+	if (unlikely(err)) {
+		printk(KERN_WARNING "LogFS: Error erasing segment %x\n",
+				area->a_segno);
+		/* NOTE(review): the result of logfs_mark_segment_bad() is ignored */
+		logfs_mark_segment_bad(area->a_sb, area->a_segno);
+		goto again;
+	}
+	return 0;
+}
+
+/* Give an open area's finish_area op the chance to close it. */
+void logfs_close_area(struct logfs_area *area)
+{
+	if (area->a_is_open)
+		area->a_ops->finish_area(area);
+}
+
+/*
+ * Choose the segment this area writes to next.  The best entry of the
+ * free list is taken and its candidate structure freed, so it stops being
+ * a candidate.  Running out of free segments is fatal.
+ */
+static void ostore_get_free_segment(struct logfs_area *area)
+{
+	struct logfs_super *super = logfs_super(area->a_sb);
+	struct gc_candidate *best;
+
+	if (list_empty(&super->s_free_list.list)) {
+		printk(KERN_ERR"LOGFS: ran out of free segments\n");
+		BUG();
+	}
+
+	best = get_best_cand(&super->s_free_list);
+	area->a_segno = best->segno;
+	kfree(best);
+
+	/* only worth a debug message once free segments get scarce */
+	if (super->s_free_list.count >= 5)
+		return;
+	pr_debug("use segment #%02x, level %x, %2x remaining\n",
+			area->a_segno, area->a_level,
+			super->s_free_list.count);
+}
+
+/*
+ * Read the current segment header of the area's segment and set
+ * a_erase_count to the stored erase count plus one.  A read error is a BUG.
+ */
+static void ostore_get_erase_count(struct logfs_area *area)
+{
+	struct logfs_segment_header sh;
+	int err;
+
+	err = device_read(area->a_sb, area->a_segno, 0, sizeof(sh), &sh);
+	BUG_ON(err);
+
+	area->a_erase_count = 1 + be32_to_cpu(sh.ec);
+}
+
+/*
+ * Erase the area's segment and write a fresh segment header at its start.
+ *
+ * The header records the area's level, segment number, erase count and the
+ * superblock's s_gec.  a_used_bytes is set to the header size so that
+ * later objects are placed behind it.
+ *
+ * Returns 0 on success or the error from logfs_erase_segment().
+ */
+static int ostore_erase_segment(struct logfs_area *area)
+{
+	struct logfs_segment_header h;
+	u64 ofs;
+	int err;
+
+	err = logfs_erase_segment(area->a_sb, area->a_segno);
+	if (err)
+		return err;
+
+	h.pad = 0;
+	h.type = OBJ_OSTORE;
+	h.level = area->a_level;
+	h.segno = cpu_to_be32(area->a_segno);
+	h.ec = cpu_to_be32(area->a_erase_count);
+	h.gec = cpu_to_be64(logfs_super(area->a_sb)->s_gec);
+	h.crc = logfs_crc32(&h, sizeof(h), 4);
+
+	ofs = dev_ofs(area->a_sb, area->a_segno, 0);
+	/* account for the header before handing it to the write buffer */
+	area->a_used_bytes = sizeof(h);
+	logfs_buf_write(area, ofs, &h, sizeof(h));
+	return 0;
+}
+
+/*
+ * Write out a partially filled write buffer.
+ *
+ * The unused tail of the buffer is padded with 0xff and the whole
+ * s_writesize-sized buffer is written at the device offset of the hunk that
+ * contains a_used_bytes.  Nothing is written when a_used_bytes sits on a
+ * hunk boundary.  A write error triggers LOGFS_BUG_ON().
+ */
+static void flush_buf(struct logfs_area *area)
+{
+	struct super_block *sb = area->a_sb;
+	struct logfs_super *super = logfs_super(sb);
+	u32 used, free;
+	u64 ofs;
+	u32 writemask = super->s_writesize - 1;
+	int err;
+
+	used = area->a_used_bytes & writemask;
+	if (used == 0)
+		return;
+
+	ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
+	/*
+	 * Widen the mask before inverting it.  Inverting the u32 and then
+	 * promoting it would also clear the upper 32 bits of the device
+	 * offset.
+	 */
+	ofs &= ~(u64)writemask;
+	free = super->s_writesize - used;
+
+	memset(area->a_wbuf + used, 0xff, free);
+	err = super->s_devops->write(sb, ofs, super->s_writesize, area->a_wbuf);
+	LOGFS_BUG_ON(err, sb);
+}
+
+/*
+ * Close the area once fewer than LOGFS_MAX_OBJECTSIZE bytes remain in its
+ * segment: flush the write buffer and mark the area closed.  With more
+ * room left the area stays open.
+ */
+static void ostore_finish_area(struct logfs_area *area)
+{
+	struct logfs_super *super = logfs_super(area->a_sb);
+	u32 space_left = super->s_segsize - area->a_used_bytes;
+
+	if (space_left < LOGFS_MAX_OBJECTSIZE) {
+		flush_buf(area);
+
+		area->a_segno = 0;
+		area->a_is_open = 0;
+	}
+}
+
+/*
+ * Area operations for the object store; logfs_init_areas() installs them
+ * on every entry of s_area[].
+ */
+static const struct logfs_area_ops ostore_area_ops = {
+	.get_free_segment	= ostore_get_free_segment,
+	.get_erase_count	= ostore_get_erase_count,
+	.erase_segment		= ostore_erase_segment,
+	.finish_area		= ostore_finish_area,
+};
+
+/* Free an area together with its write buffer.  NULL is accepted. */
+static void free_area(struct logfs_area *area)
+{
+	if (!area)
+		return;
+
+	kfree(area->a_wbuf);
+	kfree(area);
+}
+
+/*
+ * Allocate a zeroed area for @sb.  A write buffer of s_writesize bytes is
+ * attached only when s_writesize is larger than 1.
+ *
+ * Returns the new area or NULL if memory ran out.
+ */
+static struct logfs_area *alloc_area(struct super_block *sb)
+{
+	size_t writesize = logfs_super(sb)->s_writesize;
+	struct logfs_area *area = kzalloc(sizeof(*area), GFP_KERNEL);
+
+	if (!area)
+		return NULL;
+	area->a_sb = sb;
+
+	if (writesize <= 1)
+		return area;
+
+	area->a_wbuf = kmalloc(writesize, GFP_KERNEL);
+	if (area->a_wbuf)
+		return area;
+
+	kfree(area);
+	return NULL;
+}
+
+/**
+ * logfs_init_areas - allocate the journal area and all object store areas
+ * @sb:			superblock to set up
+ *
+ * Every object store area gets its level and ostore_area_ops assigned.
+ *
+ * Returns 0 on success or -ENOMEM.  On failure everything allocated here is
+ * freed again and the pointers in the superblock are reset to NULL, so a
+ * later logfs_cleanup_areas() does not free them a second time.
+ */
+int logfs_init_areas(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int i;
+
+	super->s_journal_area = alloc_area(sb);
+	if (!super->s_journal_area)
+		return -ENOMEM;
+
+	for_each_area(i) {
+		super->s_area[i] = alloc_area(sb);
+		if (!super->s_area[i])
+			goto err;
+		super->s_area[i]->a_level = i;
+		super->s_area[i]->a_ops = &ostore_area_ops;
+	}
+	return 0;
+
+err:
+	/* s_area[i] is the one that failed; unwind the ones before it */
+	for (i--; i >= 0; i--) {
+		free_area(super->s_area[i]);
+		super->s_area[i] = NULL;
+	}
+	free_area(super->s_journal_area);
+	super->s_journal_area = NULL;
+	return -ENOMEM;
+}
+
+/**
+ * logfs_cleanup_areas - free all areas of a superblock
+ * @super:		logfs superblock being torn down
+ *
+ * The journal area comes from alloc_area() like the others and may carry a
+ * write buffer, so it is released through free_area() as well.  Pointers
+ * are reset so that a repeated call is harmless.
+ */
+void logfs_cleanup_areas(struct logfs_super *super)
+{
+	int i;
+
+	for_each_area(i) {
+		free_area(super->s_area[i]);
+		super->s_area[i] = NULL;
+	}
+	free_area(super->s_journal_area);
+	super->s_journal_area = NULL;
+}
--- /dev/null	2008-04-02 16:29:12.813336657 +0200
+++ git/fs/logfs/super.c	2008-04-07 12:25:08.360030590 +0200
@@ -0,0 +1,381 @@
+/*
+ * fs/logfs/super.c
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2007 Joern Engel <joern@logfs.org>
+ *
+ * Generally contains mount/umount code and also serves as a dump area for
+ * any functions that don't fit elsewhere and neither justify a file of their
+ * own.
+ */
+#include "logfs.h"
+#include <linux/bio.h>
+#include <linux/mtd/mtd.h>
+#include <linux/statfs.h>
+#include <linux/buffer_head.h>
+
+/*
+ * Write one block of debug data at block number @blockno.  Blocks at or
+ * beyond s_segsize are silently dropped.
+ */
+static void dump_write(struct super_block *sb, int blockno, void *buf)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int byte_ofs = blockno << sb->s_blocksize_bits;
+
+	if (byte_ofs < super->s_segsize)
+		super->s_devops->write(sb, byte_ofs, sb->s_blocksize, buf);
+}
+
+/*
+ * logfs_crash_dump - dump debug information to device
+ *
+ * The LogFS superblock only occupies part of a segment.  This function will
+ * write as much debug information as it can gather into the spare space.
+ *
+ * Written, in order: the write buffers of all areas (only if s_writesize is
+ * larger than 1), a block holding copies of the logfs and VFS superblock
+ * structures, and two blocks taken from the current task's stack area.
+ */
+void logfs_crash_dump(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int i, blockno = 2, bs = sb->s_blocksize;
+	static char scratch[4096];
+	/* NOTE(review): the mask assumes the stack is 8KiB-aligned - confirm */
+	void *stack = (void *) ((ulong)current & ~0x1fffUL);
+
+	/* all wbufs */
+	if (super->s_writesize > 1)
+		for (i = 0; i < LOGFS_NO_AREAS; i++) {
+			void *wbuf = super->s_area[i]->a_wbuf;
+			u64 ofs = sb->s_blocksize + i*super->s_writesize;
+			super->s_devops->write(sb, ofs, super->s_writesize,
+					wbuf);
+		}
+	/* both superblocks */
+	/*
+	 * NOTE(review): scratch is 4096 bytes; nothing here checks that bs
+	 * or the two structures plus the 32 byte gap fit into it.
+	 */
+	memset(scratch, 0, bs);
+	memcpy(scratch, super, sizeof(*super));
+	memcpy(scratch + sizeof(*super) + 32, sb, sizeof(*sb));
+	dump_write(sb, blockno++, scratch);
+	/* process stack */
+	dump_write(sb, blockno++, stack);
+	dump_write(sb, blockno++, stack + 0x1000);
+}
+
+/*
+ * TODO: move to lib/string.c
+ */
+/**
+ * memchr_inv - Find the first byte in an area of memory that differs from
+ *		a given value.
+ * @s: The memory area
+ * @c: The byte every position is compared against
+ * @n: The size of the area.
+ *
+ * returns the address of the first character other than @c, or %NULL
+ * if the whole buffer contains just @c.
+ */
+void *memchr_inv(const void *s, int c, size_t n)
+{
+	const unsigned char *p = s;
+	while (n-- != 0)
+		if ((unsigned char)c != *p++)
+			return (void *)(p - 1);	/* p was already advanced */
+
+	return NULL;
+}
+
+/*
+ * FIXME: There should be a reserve for root, similar to ext2.
+ */
+/*
+ * Fill in filesystem statistics for statfs(2).  Free and available blocks
+ * are both derived from s_free_bytes; file counts are always reported as 0.
+ *
+ * NOTE(review): f_blocks shifts s_size by LOGFS_BLOCK_BITS and then by a
+ * further 3 bits, while f_bfree/f_bavail shift by sb->s_blocksize_bits -
+ * confirm that the extra shift is intended.
+ */
+int logfs_statfs(struct dentry *dentry, struct kstatfs *stats)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct logfs_super *super = logfs_super(sb);
+
+	stats->f_type		= LOGFS_MAGIC_U32;
+	stats->f_bsize		= sb->s_blocksize;
+	stats->f_blocks		= super->s_size >> LOGFS_BLOCK_BITS >> 3;
+	stats->f_bfree		= super->s_free_bytes >> sb->s_blocksize_bits;
+	stats->f_bavail		= super->s_free_bytes >> sb->s_blocksize_bits;
+	stats->f_files		= 0;
+	stats->f_ffree		= 0;
+	stats->f_namelen	= LOGFS_MAX_NAMELEN;
+	return 0;
+}
+
+/*
+ * sget() "set" callback: attach the logfs superblock to a new VFS
+ * superblock and copy over the backing device pointers.
+ */
+static int logfs_sb_set(struct super_block *sb, void *_super)
+{
+	struct logfs_super *super = _super;
+
+	sb->s_fs_info = super;
+	if (super->s_mtd)
+		sb->s_mtd = super->s_mtd->mtd;
+	else
+		sb->s_mtd = NULL;
+	sb->s_bdev = super->s_bdev;
+	return 0;
+}
+
+/*
+ * sget() "test" callback: returns 1 if @sb sits on the same mtd or block
+ * device as the logfs superblock passed in, 0 otherwise.
+ */
+static int logfs_sb_test(struct super_block *sb, void *_super)
+{
+	struct logfs_super *super = _super;
+	struct mtd_info *mtd = NULL;
+
+	if (super->s_mtd)
+		mtd = super->s_mtd->mtd;
+
+	if (mtd && sb->s_mtd == mtd)
+		return 1;
+	return super->s_bdev && sb->s_bdev == super->s_bdev;
+}
+
+/*
+ * Prepare a freshly read filesystem for writing.  Returns 0 on success or
+ * the error from the area check or the journal replay.
+ */
+static int logfs_make_writeable(struct super_block *sb)
+{
+	/* Check areas for trailing unaccounted data */
+	int err = logfs_check_areas(sb);
+
+	if (err)
+		return err;
+
+	/* Do one GC pass before any data gets dirtied */
+	logfs_gc_pass(sb);
+
+	/* after all initializations are done, replay the journal
+	 * for rw-mounts, if necessary */
+	return logfs_replay_journal(sb);
+}
+
+/*
+ * Final mount steps: get the root directory inode and dentry, run fsck,
+ * make the filesystem writeable and attach the superblock to @mnt.
+ *
+ * Returns the result of simple_set_mnt() on success.  Every failure drops
+ * the master inode and returns -EIO.
+ */
+static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
+{
+	struct inode *rootdir;
+	int err;
+
+	/* root dir */
+	rootdir = dhowells_iget(sb, LOGFS_INO_ROOT);
+	if (!rootdir)
+		goto fail;
+
+	sb->s_root = d_alloc_root(rootdir);
+	if (!sb->s_root) {
+		/* no dentry took over the inode reference - drop it here */
+		iput(rootdir);
+		goto fail;
+	}
+
+	err = logfs_fsck(sb);
+	if (err) {
+		printk(KERN_ERR"LOGFS: fsck failed, refusing to mount\n");
+		goto fail;
+	}
+
+	/* FIXME: check for read-only mounts */
+	err = logfs_make_writeable(sb);
+	if (err)
+		goto fail;
+
+	return simple_set_mnt(mnt, sb);
+
+fail:
+	iput(logfs_super(sb)->s_master_inode);
+	return -EIO;
+}
+
+/*
+ * Locate and read the on-medium superblock, copy its geometry into the
+ * in-memory superblocks and initialize the rw, area, gc and journal
+ * subsystems.
+ *
+ * Returns 0 on success, -EIO if no superblock is found or its magic does
+ * not match, or the error of the first step that failed.
+ *
+ * NOTE(review): the shift values taken from the medium are used without
+ * range checks.
+ */
+static int logfs_read_sb(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_disk_super ds;
+	s64 ofs;
+	int i, ret;
+
+	ofs = super->s_devops->find_sb(sb);
+	if (ofs < 0)
+		return -EIO;
+	ret = super->s_devops->read(sb, ofs, sizeof(ds), &ds);
+	if (ret)
+		return ret;
+
+	if (be64_to_cpu(ds.ds_magic) != LOGFS_MAGIC)
+		return -EIO;
+
+	/* sizes are stored as shifts; keep both the shift and the size */
+	super->s_size = be64_to_cpu(ds.ds_filesystem_size);
+	super->s_root_reserve = be64_to_cpu(ds.ds_root_reserve);
+	super->s_segsize = 1 << ds.ds_segment_shift;
+	super->s_segshift = ds.ds_segment_shift;
+	sb->s_blocksize = 1 << ds.ds_block_shift;
+	sb->s_blocksize_bits = ds.ds_block_shift;
+	super->s_writesize = 1 << ds.ds_write_shift;
+	super->s_writeshift = ds.ds_write_shift;
+	super->s_no_segs = super->s_size >> super->s_segshift;
+	super->s_no_blocks = super->s_segsize >> sb->s_blocksize_bits;
+
+	journal_for_each(i)
+		super->s_journal_seg[i] = be64_to_cpu(ds.ds_journal_seg[i]);
+
+	super->s_ifile_levels = ds.ds_ifile_levels;
+	super->s_iblock_levels = ds.ds_iblock_levels;
+	super->s_data_levels = ds.ds_data_levels;
+	super->s_total_levels = super->s_ifile_levels + super->s_iblock_levels
+		+ super->s_data_levels;
+	/* reserve (2 * s_no_blocks - 1) blocks per level, converted to bytes */
+	super->s_gc_reserve = super->s_total_levels *
+		(2 * super->s_no_blocks - 1);
+	super->s_gc_reserve <<= sb->s_blocksize_bits;
+
+	mutex_init(&super->s_dirop_mutex);
+	spin_lock_init(&super->s_ino_lock);
+	INIT_LIST_HEAD(&super->s_freeing_list);
+
+	ret = logfs_init_rw(super);
+	if (ret)
+		return ret;
+
+	ret = logfs_init_areas(sb);
+	if (ret)
+		return ret;
+
+	ret = logfs_init_gc(super);
+	if (ret)
+		return ret;
+
+	ret = logfs_init_journal(sb);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+/*
+ * ->kill_sb method: shut the superblock down, flush what is still dirty,
+ * tear down the gc, journal, area and rw state, release the backing device
+ * and free the logfs superblock.
+ */
+static void logfs_kill_sb(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	generic_shutdown_super(sb);
+
+	/* inode file is special and needs manual flushing */
+	logfs_flush_dirty(sb, 1);
+	BUG_ON(super->s_dirty_used_bytes || super->s_dirty_free_bytes);
+
+	logfs_cleanup_gc(super);
+	logfs_cleanup_journal(sb);
+	logfs_cleanup_areas(super);
+	logfs_cleanup_rw(super);
+	logfs_put_mtd(super->s_mtd);
+	logfs_put_bdev(super->s_bdev);
+	kfree(super);
+}
+
+/**
+ * logfs_get_sb_device - get or create the superblock for a device
+ * @type:		filesystem type
+ * @flags:		mount flags
+ * @mtd:		mtd device, or NULL
+ * @bdev:		block device, or NULL
+ * @devops:		device access methods
+ * @mnt:		mount to attach the superblock to
+ *
+ * If the device already carries a mounted superblock, that superblock is
+ * attached to @mnt and the device references passed in are dropped again.
+ * Otherwise a new superblock is read from the device and set up.
+ *
+ * Returns 0 on success or a negative errno.  A failing sget() is reported
+ * with the error it returned.
+ */
+int logfs_get_sb_device(struct file_system_type *type, int flags,
+		struct mtd_inode *mtd, struct block_device *bdev,
+		const struct logfs_device_ops *devops, struct vfsmount *mnt)
+{
+	struct logfs_super *super;
+	struct super_block *sb;
+	int i, err = -ENOMEM;
+
+	super = kzalloc(sizeof(*super), GFP_KERNEL);
+	if (!super)
+		goto err0;
+
+	super->s_mtd	= mtd;
+	super->s_bdev	= bdev;
+	sb = sget(type, logfs_sb_test, logfs_sb_set, super);
+	if (IS_ERR(sb)) {
+		err = PTR_ERR(sb);
+		goto err0;
+	}
+
+	if (sb->s_root) {
+		/* Device is already in use */
+		err = simple_set_mnt(mnt, sb);
+		goto err0;
+	}
+
+	super->s_devops = devops;
+	INIT_LIST_HEAD(&super->s_dirty_list);
+	for_each_area(i)
+		INIT_LIST_HEAD(&super->s_gc_dirty_list[i]);
+
+	/*
+	 * Careful here.  In principle s_maxbytes could easily be bumped up to
+	 * LOGFS_I5_SIZE.  Most of the code just needs a thorough audit and
+	 * a couple of changes to allow for 4x and 5x indirect blocks.
+	 *
+	 * There is one detail requiring a little more care, though:
+	 * As we use the upper half of each file's address space for metadata,
+	 * s_maxbytes must remain below LONG_MAX.  Which means a different
+	 * limit on 64bit and 32bit systems and potentially files created on
+	 * one system that cannot be fully read on another.
+	 *
+	 * LOGFS_I3_SIZE is below 8TB, which is a safe choice for now.
+	 */
+	sb->s_maxbytes	= LOGFS_I3_SIZE;
+	sb->s_op	= &logfs_super_operations;
+	sb->s_flags	= flags | MS_NOATIME;
+
+	err = logfs_read_sb(sb);
+	if (err)
+		goto err1;
+
+	sb->s_flags |= MS_ACTIVE;
+	err = logfs_get_sb_final(sb, mnt);
+	if (err)
+		goto err1;
+	return 0;
+
+err1:
+	/* the superblock owns super and the device from here on */
+	up_write(&sb->s_umount);
+	deactivate_super(sb);
+	return err;
+err0:
+	kfree(super);
+	logfs_put_mtd(mtd);
+	logfs_put_bdev(bdev);
+	return err;
+}
+
+/*
+ * ->get_sb method.  Device names of the form "mtd<number>" select an mtd
+ * device; everything else, including a missing name, goes to the block
+ * device path.  Trailing characters after the number yield -EINVAL.
+ */
+static int logfs_get_sb(struct file_system_type *type, int flags,
+		const char *devname, void *data, struct vfsmount *mnt)
+{
+	char *end;
+	ulong mtdnr;
+
+	if (!devname || strncmp(devname, "mtd", 3) != 0)
+		return logfs_get_sb_bdev(type, flags, devname, mnt);
+
+	mtdnr = simple_strtoul(devname+3, &end, 0);
+	if (*end)
+		return -EINVAL;
+
+	return logfs_get_sb_mtd(type, flags, mtdnr, mnt);
+}
+
+/* Filesystem type registered by logfs_init() under the name "logfs". */
+static struct file_system_type logfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "logfs",
+	.get_sb		= logfs_get_sb,
+	.kill_sb	= logfs_kill_sb,
+};
+
+/*
+ * Module init: set up the compressor and the inode cache, then register
+ * the filesystem.  Whatever was set up is torn down again, in reverse
+ * order, if a later step fails.
+ *
+ * Returns 0 on success or the error of the failing step.
+ */
+static int __init logfs_init(void)
+{
+	int ret;
+
+	ret = logfs_compr_init();
+	if (ret)
+		return ret;
+
+	ret = logfs_init_inode_cache();
+	if (ret)
+		goto out_compr;
+
+	ret = register_filesystem(&logfs_fs_type);
+	if (ret)
+		goto out_cache;
+
+	return 0;
+
+out_cache:
+	logfs_destroy_inode_cache();
+out_compr:
+	logfs_compr_exit();
+	return ret;
+}
+
+/* Module exit: undo logfs_init() in reverse order. */
+static void __exit logfs_exit(void)
+{
+	unregister_filesystem(&logfs_fs_type);
+	logfs_destroy_inode_cache();
+	logfs_compr_exit();
+}
+
+module_init(logfs_init);
+module_exit(logfs_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Joern Engel <joern@logfs.org>");
+MODULE_DESCRIPTION("scalable flash filesystem");
--- /dev/null	2008-04-02 16:29:12.813336657 +0200
+++ git/fs/logfs/progs/fsck.c	2008-04-07 11:53:20.919877985 +0200
@@ -0,0 +1,347 @@
+/*
+ * fs/logfs/prog/fsck.c	- filesystem check
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2007 Joern Engel <joern@logfs.org>
+ *
+ * In principle this could get moved to userspace.  However it might still
+ * make some sense to keep it in the kernel.  It is a pure checker and will
+ * only report problems, not attempt to repair them.
+ */
+#include "../logfs.h"
+
+/* Byte totals accumulated during one fsck run; reset by logfs_fsck() */
+static u64 used_bytes;
+static u64 free_bytes;
+/* Copy of s_last_ino taken at the start of the run */
+static u64 last_ino;
+/* Per-inode counters indexed by inode number, allocated by logfs_fsck() */
+static u64 *inode_bytes;
+static u64 *inode_links;
+
+/*
+ * Pass 1: blocks
+ */
+
+/*
+ * Count the free bytes of one segment and account every valid object in it
+ * to its inode in inode_bytes[].
+ *
+ * @level receives the level from the segment header, or 0xf if the segment
+ * is reserved, erased or has a bad header crc.
+ *
+ * Returns 0 for reserved segments, the full segment size for erased
+ * segments and segments with a bad header crc, and otherwise the segment
+ * size minus the accounted size of all valid objects.  Read errors are a
+ * BUG.
+ */
+static u32 logfs_free_bytes(struct super_block *sb, u32 segno, u8 *level)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_segment_header sh;
+	struct logfs_object_header oh;
+	struct inode *inode;
+	int cookie;
+	u64 ofs, ino, bix;
+	u32 seg_ofs, free, size;
+	u16 len;
+	int err;
+	void *reserved;
+
+	*level = 0xf;
+	/* Some segments are reserved.  Just pretend they were all valid */
+	reserved = btree_lookup(&super->s_reserved_segments, segno);
+	if (reserved)
+		return 0;
+
+	err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
+	BUG_ON(err);
+	/* a header of all 0xff means the segment is erased */
+	if (!memchr_inv(&sh, 0xff, sizeof(sh)))
+		return super->s_segsize;
+
+	free = super->s_segsize;
+	if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4))
+		return free;
+	*level = sh.level;
+
+	for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE;
+			seg_ofs + sizeof(oh) < super->s_segsize; ) {
+		/* keep the result, otherwise BUG_ON tests a stale value */
+		err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh),
+				&oh);
+		BUG_ON(err);
+		if (!memchr_inv(&oh, 0xff, sizeof(oh)))
+			break;
+
+		if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4))
+			break;
+
+		ofs = dev_ofs(sb, segno, seg_ofs);
+		ino = be64_to_cpu(oh.ino);
+		bix = be64_to_cpu(oh.bix);
+		len = be16_to_cpu(oh.len);
+		/* on-medium size; decides where the next object starts */
+		size = (u32)be16_to_cpu(oh.len) + sizeof(oh);
+		if (logfs_is_valid_block(sb, ofs, ino, bix, *level)) {
+			/* indirect blocks are accounted as full blocks */
+			if (sh.level != 0)
+				len = sb->s_blocksize;
+			inode_bytes[ino] += len + sizeof(oh);
+			free -= len + sizeof(oh);
+			inode = logfs_iget(sb, ino, &cookie);
+			if (!inode) {
+				/* valid block whose inode cannot be read */
+				WARN_ON(1);
+			} else {
+				WARN_ON(bix > inode->i_size >>
+						sb->s_blocksize_bits);
+				logfs_iput(inode, cookie);
+			}
+		}
+		seg_ofs += size;
+	}
+	return free;
+}
+
+/*
+ * Pass 1 driver: walk all segments, add their free bytes to free_bytes and
+ * print free bytes and level for each, eight segments per line.
+ */
+static void logfsck_blocks(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int segno, free;
+	u8 level;
+
+	printk(KERN_INFO);
+	for (segno = 0; segno < super->s_no_segs; segno++) {
+		free = logfs_free_bytes(sb, segno, &level);
+		free_bytes += free;
+		printk(KERN_CONT" %5x %x", free, level);
+		/* start a new output line after every eighth segment */
+		if ((segno & 7) == 7)
+			printk(KERN_CONT"\n" KERN_INFO);
+	}
+	printk(KERN_CONT"\n");
+}
+
+/*
+ * Pass 2: directories
+ */
+
+/*
+ * Read the dentry at block index @bix of @dir and hand back its inode
+ * number and type.  Returns 0 on success or the error from
+ * logfs_inode_read(), in which case @ino and @type are left untouched.
+ */
+static noinline int read_one_dd(struct inode *dir, loff_t bix, u64 *ino,
+		u8 *type)
+{
+	struct logfs_disk_dentry dd;
+	int err;
+
+	err = logfs_inode_read(dir, &dd, sizeof(dd), bix);
+	if (!err) {
+		*ino = be64_to_cpu(dd.ino);
+		*type = dd.type;
+	}
+	return err;
+}
+
+/*
+ * Returns the result of logfs_seek_data() minus one, but never less than
+ * @bix.  Callers increment the result in their loop header.
+ */
+static s64 dir_seek_data(struct inode *inode, s64 bix)
+{
+	s64 found = logfs_seek_data(inode, bix);
+	s64 candidate = found - 1;
+
+	if (candidate > bix)
+		return candidate;
+	return bix;
+}
+
+/*
+ * Walk all dentries of @dir, counting links in inode_links[] and recursing
+ * into subdirectories.
+ *
+ * A dentry is reported as bad if its inode number lies beyond last_ino, its
+ * inode cannot be read, or its type does not match the inode.  The walk
+ * carries on after an error so that every problem gets reported.
+ *
+ * Returns 0 if the whole subtree checked out, otherwise the first error
+ * encountered.
+ */
+static int __logfsck_dirs(struct inode *dir)
+{
+	struct inode *inode;
+	loff_t bix;
+	u64 ino;
+	u8 type;
+	int cookie, err, ret = 0;
+
+	for (bix = 0; ; bix++) {
+		err = read_one_dd(dir, bix, &ino, &type);
+		if (err == -ENODATA) {
+			/* dentry was deleted */
+			bix = dir_seek_data(dir, bix);
+			continue;
+		}
+		if (err == -EOF)
+			break;
+		if (err)
+			goto error0;
+
+		err = -EIO;
+		if (ino > last_ino) {
+			printk(KERN_INFO "ino %llx > last_ino %llx\n",
+					ino, last_ino);
+			goto error0;
+		}
+		inode = logfs_iget(dir->i_sb, ino, &cookie);
+		if (!inode) {
+			printk(KERN_INFO"Could not find inode #%llx in dentry"
+					"(%lx, %llx)\n", ino, dir->i_ino, bix);
+			goto error0;
+		}
+		if (type != logfs_type(inode)) {
+			printk(KERN_INFO "dd type %x != inode type %x\n",
+					type, logfs_type(inode));
+			goto error1;
+		}
+		inode_links[ino]++;
+		err = 0;
+		if (type == DT_DIR) {
+			/* ".." links back to dir, "." links to itself */
+			inode_links[dir->i_ino]++;
+			inode_links[ino]++;
+			err = __logfsck_dirs(inode);
+		}
+error1:
+		logfs_iput(inode, cookie);
+error0:
+		/* remember the first error, keep checking the rest */
+		if (!ret)
+			ret = err;
+		continue;
+	}
+	return ret;
+}
+
+/*
+ * Pass 2 driver: check the directory tree starting at the root directory.
+ *
+ * Returns 1 if the tree is consistent, 0 if the root directory cannot be
+ * read or any dentry failed its check.
+ */
+static int logfsck_dirs(struct super_block *sb)
+{
+	struct inode *dir;
+	int cookie, err;
+
+	dir = logfs_iget(sb, LOGFS_INO_ROOT, &cookie);
+	if (!dir)
+		return 0;
+
+	inode_links[LOGFS_INO_MASTER] += 1;
+	inode_links[LOGFS_INO_ROOT] += 2;
+	err = __logfsck_dirs(dir);
+
+	logfs_iput(dir, cookie);
+	return !err;
+}
+
+/*
+ * Pass 3: inodes
+ */
+
+/*
+ * Compare the byte and link counts stored in the inode with the values
+ * collected by the earlier passes and add the collected bytes to
+ * used_bytes.  Mismatches are printed, as is the inode whose number equals
+ * s_last_ino.
+ *
+ * Returns 1 if both counts match, 0 otherwise.
+ */
+static int logfs_check_inode(struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	struct logfs_super *super = logfs_super(inode->i_sb);
+
+	/* "0" values come from the inode, "1" values were counted by fsck */
+	u64 bytes0 = li->li_used_bytes;
+	u64 bytes1 = inode_bytes[inode->i_ino];
+	u64 links0 = inode->i_nlink;
+	u64 links1 = inode_links[inode->i_ino];
+
+	/* handle unfinished journal replay */
+	if (inode->i_ino == super->s_victim_ino)
+		links0--;
+
+	if (bytes0 != bytes1 || links0 != links1
+			|| inode->i_ino == logfs_super(inode->i_sb)->s_last_ino)
+		printk(KERN_INFO "%lx: %llx(%llx) bytes, %llx(%llx) links\n",
+				inode->i_ino, bytes0, bytes1, links0, links1);
+	used_bytes += bytes1;
+	return (bytes0 == bytes1) && (links0 == links1);
+}
+
+/*
+ * Check the inode with number @ino.  An inode that cannot be read counts
+ * as passing.  Returns 1 on success, 0 on a failed check.
+ */
+static int logfs_check_ino(struct super_block *sb, u64 ino)
+{
+	int cookie, ok;
+	struct inode *inode = logfs_iget(sb, ino, &cookie);
+
+	if (!inode)
+		return 1;
+
+	ok = logfs_check_inode(inode);
+	logfs_iput(inode, cookie);
+	return ok;
+}
+
+/*
+ * Pass 3 driver: check the master inode, the root directory and every
+ * inode from number 16 up to s_last_ino, skipping ahead through
+ * dir_seek_data() on the master inode.
+ *
+ * Returns 1 if all checks passed, 0 otherwise.
+ */
+static int logfsck_inodes(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int ok = 1;
+	s64 ino;
+
+	if (!logfs_check_ino(sb, LOGFS_INO_MASTER))
+		ok = 0;
+	if (!logfs_check_ino(sb, LOGFS_INO_ROOT))
+		ok = 0;
+
+	for (ino = 16; ino < super->s_last_ino; ino++) {
+		ino = dir_seek_data(super->s_master_inode, ino);
+		if (!logfs_check_ino(sb, ino))
+			ok = 0;
+	}
+	return ok;
+}
+
+/*
+ * Pass 4: Total blocks
+ */
+
+/*
+ * Compare the byte totals gathered by the earlier passes with the size of
+ * the object store and with the superblock's s_used_bytes.
+ *
+ * The expected total is the size of all segments that are neither the
+ * superblock segment, a journal segment nor a bad segment.
+ *
+ * Returns 1 if free plus used bytes equal the expected total and the used
+ * bytes match s_used_bytes, 0 otherwise.
+ */
+static int logfsck_stats(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	u64 ostore_segs, total, expected;
+	int i, reserved_segs;
+
+	/* one for the superblock */
+	reserved_segs = 1;
+	journal_for_each(i)
+		if (super->s_journal_seg[i])
+			reserved_segs++;
+	reserved_segs += super->s_bad_segments;
+
+	ostore_segs = super->s_no_segs - reserved_segs;
+	expected = ostore_segs << super->s_segshift;
+	total = free_bytes + used_bytes;
+
+	/* print the totals followed by the signed deviation, if any */
+	printk(KERN_INFO "free:%8llx, used:%8llx, total:%8llx",
+			free_bytes, used_bytes, expected);
+	if (total > expected)
+		printk(KERN_CONT" + %llx\n", total - expected);
+	else if (total < expected)
+		printk(KERN_CONT" - %llx\n", expected - total);
+	else
+		printk(KERN_CONT"\n");
+
+	if (used_bytes != super->s_used_bytes) {
+		/*
+		 * NOTE(review): the two difference printks below carry no
+		 * KERN_CONT, unlike the rest of this message.
+		 */
+		printk(KERN_CONT"expected %llx (", super->s_used_bytes);
+		if (super->s_used_bytes < used_bytes)
+			printk("-%llx", used_bytes - super->s_used_bytes);
+		else
+			printk("+%llx", super->s_used_bytes - used_bytes);
+		printk(KERN_CONT") bytes used\n");
+		return 0;
+	}
+
+	return total == expected;
+}
+
+/*
+ * Run all four fsck passes.  Every pass is run even if an earlier one
+ * failed.  Returns 0 if passes 2 to 4 succeeded, -EIO otherwise.
+ */
+static int __logfs_fsck(struct super_block *sb)
+{
+	int err = 0;
+
+	/* pass 1: check blocks */
+	logfsck_blocks(sb);
+
+	/* pass 2: check directories */
+	if (!logfsck_dirs(sb)) {
+		printk(KERN_ERR "Pass 2: directory check failed\n");
+		err = -EIO;
+	}
+
+	/* pass 3: check inodes */
+	if (!logfsck_inodes(sb)) {
+		printk(KERN_ERR "Pass 3: inode check failed\n");
+		err = -EIO;
+	}
+
+	/* Pass 4: Total blocks */
+	if (!logfsck_stats(sb)) {
+		printk(KERN_ERR "Pass 4: statistic check failed\n");
+		err = -EIO;
+	}
+
+	return err;
+}
+
+/**
+ * logfs_fsck - run a full consistency check
+ * @sb:			superblock of the filesystem to check
+ *
+ * Resets the fsck counters, allocates the per-inode tables and runs all
+ * passes.  The tables are indexed by inode number and the directory pass
+ * accepts inode numbers up to and including s_last_ino, so they hold
+ * s_last_ino + 1 entries.
+ *
+ * Returns 0 if the filesystem is consistent, -EIO if a pass failed or
+ * -ENOMEM if the tables could not be allocated.
+ */
+int logfs_fsck(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int ret = -ENOMEM;
+
+	used_bytes = 0;
+	free_bytes = 0;
+	last_ino = super->s_last_ino;
+	/* kcalloc zeroes the tables and checks the size multiplication */
+	inode_bytes = kcalloc(last_ino + 1, sizeof(u64), GFP_KERNEL);
+	if (!inode_bytes)
+		return ret;
+	inode_links = kcalloc(last_ino + 1, sizeof(u64), GFP_KERNEL);
+	if (!inode_links)
+		goto err;
+
+	ret = __logfs_fsck(sb);
+
+	kfree(inode_links);
+	inode_links = NULL;
+err:
+	kfree(inode_bytes);
+	inode_bytes = NULL;
+	return ret;
+}
--- /dev/null	2008-04-02 16:29:12.813336657 +0200
+++ git/fs/logfs/Locking	2008-04-07 11:53:20.919877985 +0200
@@ -0,0 +1,48 @@
+Locks:
+
+s_victim_mutex
+Protects victim inode for create, unlink, mkdir, rmdir, mknod, link,
+symlink and one variant of rename.  Only one victim inode may exist at
+a time.  In case of unclean unmount, victim inode has to be deleted
+before next read-writable mount.
+
+s_rename_mutex
+Protects victim dd for rename.  Only one victim dd may exist at a
+time.  In case of unclean unmount, victim dd has to be deleted before
+next read-writable mount.
+
+s_write_inode_mutex
+Taken when writing an inode.  Deleted inodes can be locked, preventing
+further iget operations during writeout.  Logfs may need to iget the
+inode for garbage collection, so the inode in question needs to be
+stored in the superblock and used directly without calling iget.
+
+s_journal_sem
+Used for allocating space in journal.
+Also protects super->s_je and super->s_compressed_je.  Those two
+buffers are used unprotected in the mount path, the only valid
+exception.
+
+s_r_sem
+Protects the memory required for reads from the filesystem.
+
+s_w_sem
+Protects the memory required for writes to the filesystem.
+
+s_ino_lock
+Protects s_last_ino.
+
+
+Lock order:
+s_rename_mutex --> s_victim_mutex
+s_rename_mutex --> s_write_inode_mutex
+s_rename_mutex --> s_w_sem
+
+s_victim_mutex --> s_write_inode_mutex
+s_victim_mutex --> s_w_sem
+s_victim_mutex --> s_ino_lock
+
+s_write_inode_mutex --> s_w_sem
+
+s_w_sem --> s_log_sem
+s_w_sem --> s_r_sem
--- /dev/null	2008-04-02 16:29:12.813336657 +0200
+++ git/fs/logfs/dev_bdev.c	2008-04-07 11:53:20.919877985 +0200
@@ -0,0 +1,151 @@
+/*
+ * fs/logfs/dev_bdev.c	- Device access methods for block devices
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2007 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+#include <linux/buffer_head.h>
+
+#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
+
+/*
+ * Copy @len bytes at device offset @from into @buf via the bdev page cache.
+ */
+static int bdev_read(struct super_block *sb, loff_t from, size_t len, void *buf)
+{
+	struct block_device *bdev = logfs_super(sb)->s_bdev;
+	struct address_space *mapping = bdev->bd_inode->i_mapping;
+	long index = from >> PAGE_SHIFT;
+	long offset = PAGE_OFS(from);
+
+	/* Walk the range page by page; only the first page may start mid-page. */
+	for (; len; index++, offset = 0) {
+		long copylen = min((ulong)len, PAGE_SIZE - offset);
+		struct page *page = read_cache_page(mapping, index,
+				(filler_t *)mapping->a_ops->readpage, NULL);
+
+		if (!page)
+			return -ENOMEM;
+		if (IS_ERR(page))
+			return PTR_ERR(page);
+
+		memcpy(buf, page_address(page) + offset, copylen);
+		page_cache_release(page);
+
+		buf += copylen;
+		len -= copylen;
+	}
+	return 0;
+}
+
+/*
+ * Copy @len bytes from @buf into the bdev page cache at device offset @to and
+ * mark the touched pages dirty.  Fails with -EROFS on a read-only superblock.
+ */
+static int bdev_write(struct super_block *sb, loff_t to, size_t len, void *buf)
+{
+	struct block_device *bdev = logfs_super(sb)->s_bdev;
+	struct address_space *mapping = bdev->bd_inode->i_mapping;
+	long index = to >> PAGE_SHIFT;
+	long offset = PAGE_OFS(to);
+
+	if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO)
+		return -EROFS;
+
+	for (; len; index++, offset = 0) {
+		long copylen = min((ulong)len, PAGE_SIZE - offset);
+		struct page *page = read_cache_page(mapping, index,
+				(filler_t *)mapping->a_ops->readpage, NULL);
+
+		if (!page)
+			return -ENOMEM;
+		if (IS_ERR(page))
+			return PTR_ERR(page);
+		lock_page(page);
+		memcpy(page_address(page) + offset, buf, copylen);
+		set_page_dirty(page);
+		unlock_page(page);
+		page_cache_release(page);
+
+		buf += copylen;
+		len -= copylen;
+	}
+	return 0;
+}
+
+/*
+ * Erase for block devices: fill @len bytes at device offset @to with 0xFF in
+ * the bdev page cache and mark the pages dirty.  -EROFS if mounted read-only.
+ */
+static int bdev_erase(struct super_block *sb, loff_t to, size_t len)
+{
+	struct block_device *bdev = logfs_super(sb)->s_bdev;
+	struct address_space *mapping = bdev->bd_inode->i_mapping;
+	long index = to >> PAGE_SHIFT;
+	long offset = PAGE_OFS(to);
+
+	if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO)
+		return -EROFS;
+
+	for (; len; index++, offset = 0) {
+		long copylen = min((ulong)len, PAGE_SIZE - offset);
+		struct page *page = read_cache_page(mapping, index,
+				(filler_t *)mapping->a_ops->readpage, NULL);
+
+		if (!page)
+			return -ENOMEM;
+		if (IS_ERR(page))
+			return PTR_ERR(page);
+		lock_page(page);
+		memset(page_address(page) + offset, 0xFF, copylen);
+		set_page_dirty(page);
+		unlock_page(page);
+		page_cache_release(page);
+
+		len -= copylen;
+	}
+	return 0;
+}
+
+static void bdev_sync(struct super_block *sb)
+{
+	sync_blockdev(logfs_super(sb)->s_bdev); /* any result is dropped */
+}
+
+static s64 bdev_find_sb(struct super_block *sb)
+{
+	struct block_device *bdev = logfs_super(sb)->s_bdev;
+	struct address_space *mapping = &bdev->bd_inode->i_data;
+
+	/* Clear __GFP_FS so page allocations for bdev cannot recurse into fs */
+	mapping_set_gfp_mask(mapping, mapping_gfp_mask(mapping) & ~__GFP_FS);
+	return 0;	/* this backend always reports offset 0 */
+}
+
+static const struct logfs_device_ops bd_devops = {
+	.find_sb	= bdev_find_sb,	/* always reports offset 0 */
+	.read		= bdev_read,
+	.write		= bdev_write,
+	.erase		= bdev_erase,	/* fills the range with 0xFF */
+	.sync		= bdev_sync,
+};
+
+/* Mount from @devname; mtdblock devices are redirected to the MTD backend. */
+int logfs_get_sb_bdev(struct file_system_type *type, int flags,
+		const char *devname, struct vfsmount *mnt)
+{
+	struct block_device *bdev = open_bdev_excl(devname, O_RDWR, NULL);
+	int mtdnr;
+
+	if (IS_ERR(bdev))
+		return PTR_ERR(bdev);
+
+	if (MAJOR(bdev->bd_dev) != MTD_BLOCK_MAJOR)
+		return logfs_get_sb_device(type, flags, NULL, bdev,
+				&bd_devops, mnt);
+	mtdnr = MINOR(bdev->bd_dev);
+	close_bdev_excl(bdev);
+	return logfs_get_sb_mtd(type, flags, mtdnr, mnt);
+}
--- /dev/null	2008-04-02 16:29:12.813336657 +0200
+++ git/fs/logfs/dev_mtd.c	2008-04-07 11:53:20.919877985 +0200
@@ -0,0 +1,410 @@
+/*
+ * fs/logfs/dev_mtd.c	- Device access methods for MTD
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2007 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+#include <linux/completion.h>
+#include <linux/mount.h>
+
+#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
+
+static struct vfsmount *mtd_mount __read_mostly;
+static struct kmem_cache *mtd_cache __read_mostly;
+
+static inline struct mtd_inode *mtd_inode(struct inode *inode)
+{	/* @inode must be the vfs_inode member embedded in a struct mtd_inode */
+	return container_of(inode, struct mtd_inode, vfs_inode);
+}
+
+/* Read @len bytes at MTD offset @ofs into @buf; a short read is -EIO. */
+static int mtd_read(struct super_block *sb, loff_t ofs, size_t len, void *buf)
+{
+	struct mtd_info *mtd = logfs_super(sb)->s_mtd->mtd;
+	size_t retlen;
+	int ret;
+
+	ret = mtd->read(mtd, ofs, len, &retlen, buf);
+	BUG_ON(ret == -EINVAL);
+	/* -EUCLEAN: bitflips were corrected by ECC, the data in @buf is valid */
+	if (ret && ret != -EUCLEAN)
+		return ret;
+
+	/* Not sure if we should loop instead. */
+	if (retlen != len)
+		return -EIO;
+	return 0;
+}
+
+/*
+ * Write @len bytes from @buf at MTD offset @ofs.  Returns -EROFS on a read-only
+ * superblock and -EIO on a failed or short write; bad ranges hit BUG_ON().
+ */
+static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct mtd_info *mtd = super->s_mtd->mtd;
+	size_t retlen;
+	int ret;
+
+	if (super->s_flags & LOGFS_SB_FLAG_RO)
+		return -EROFS;
+
+	BUG_ON((ofs >= mtd->size) || (len > mtd->size - ofs));
+	BUG_ON(ofs != (ofs >> super->s_writeshift) << super->s_writeshift);
+	BUG_ON(len > PAGE_CACHE_SIZE);
+	ret = mtd->write(mtd, ofs, len, &retlen, buf);
+	if (ret || (retlen != len))
+		return -EIO;
+
+	return 0;
+}
+
+/*
+ * For as long as I can remember (since about 2001) mtd->erase has been an
+ * asynchronous interface lacking the first driver to actually use the
+ * asynchronous properties.  So just to prevent the first implementor of such
+ * a thing from breaking logfs in 2350, we do the usual pointless dance to
+ * declare a completion variable and wait for completion before returning
+ * from mtd_erase().  What an exercise in futility!
+ */
+static void logfs_erase_callback(struct erase_info *ei)
+{	/* ei->priv carries the completion that mtd_erase() is waiting on */
+	complete((struct completion *)ei->priv);
+}
+
+/*
+ * Erase @len bytes (a multiple of the erase size) at MTD offset @ofs and wait
+ * for the erase to complete.  Returns 0, -EROFS or -EIO.
+ */
+static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len)
+{
+	struct mtd_info *mtd = logfs_super(sb)->s_mtd->mtd;
+	DECLARE_COMPLETION_ONSTACK(erase_done);
+	struct erase_info ei;
+
+	BUG_ON(len % mtd->erasesize);
+
+	if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO)
+		return -EROFS;
+
+	memset(&ei, 0, sizeof(ei));
+	ei.mtd = mtd;
+	ei.addr = ofs;
+	ei.len = len;
+	ei.callback = logfs_erase_callback;
+	ei.priv = (long)&erase_done;
+	if (mtd->erase(mtd, &ei))
+		return -EIO;
+
+	/* logfs_erase_callback() signals erase_done through ei.priv */
+	wait_for_completion(&erase_done);
+	return ei.state == MTD_ERASE_DONE ? 0 : -EIO;
+}
+
+static void mtd_sync(struct super_block *sb)
+{
+	struct mtd_info *mtd = logfs_super(sb)->s_mtd->mtd;
+
+	/* Only call ->sync when the driver provides one */
+	if (mtd->sync)
+		mtd->sync(mtd);
+}
+
+/* Return the offset of the first good eraseblock, or -EIO if all are bad. */
+static s64 mtd_find_sb(struct super_block *sb)
+{
+	struct mtd_info *mtd = logfs_super(sb)->s_mtd->mtd;
+	s64 ofs = 0;
+
+	if (!mtd->block_isbad)
+		return 0;
+	while (mtd->block_isbad(mtd, ofs)) {
+		ofs += mtd->erasesize;
+		/* ofs == mtd->size is already past the last eraseblock */
+		if (ofs >= mtd->size)
+			return -EIO;
+	}
+	return ofs;
+}
+
+/* Read @len bytes at MTD offset @ofs into @buf via the mtd inode page cache. */
+static int map_read(struct super_block *sb, loff_t ofs, size_t len, void *buf)
+{
+	struct inode *inode = &logfs_super(sb)->s_mtd->vfs_inode;
+	struct page *page;
+	void *buf0;
+	unsigned long page_ofs, cplen;
+	int err;
+
+	while (len) {
+		page_ofs = PAGE_OFS(ofs);
+		cplen = min(PAGE_SIZE - page_ofs, (unsigned long)len);
+		page = find_or_create_page(inode->i_mapping, ofs>>PAGE_SHIFT,
+				GFP_NOIO);
+		if (!page)
+			return -ENOMEM;
+
+		if (!PageUptodate(page)) {
+			buf0 = kmap(page);
+			/* ofs - page_ofs keeps all 64 bits, unlike ofs&PAGE_MASK */
+			err = mtd_read(sb, ofs - page_ofs, PAGE_SIZE, buf0);
+			kunmap(page);
+			if (err) {
+				unlock_page(page);
+				page_cache_release(page);
+				return err;
+			}
+			SetPageUptodate(page);
+		}
+
+		buf0 = kmap_atomic(page, KM_USER0);
+		memcpy(buf, buf0 + page_ofs, cplen);
+		kunmap_atomic(buf0, KM_USER0);
+		unlock_page(page);
+		page_cache_release(page);
+
+		ofs += cplen;
+		buf += cplen;
+		len -= cplen;
+	}
+	return 0;
+}
+
+#ifdef CACHE_WRITES
+/*
+ * Write-through variant: update the cached page, then write that data to the
+ * MTD.  This variant is about 4% slower than the write-invalidate variant.
+ */
+static int map_write(struct super_block *sb, loff_t ofs, size_t len, void *buf)
+{
+	struct inode *inode = &logfs_super(sb)->s_mtd->vfs_inode;
+	struct page *page;
+	void *buf0;
+	unsigned long page_ofs, cplen;
+	int err;
+
+	while (len) {
+		page_ofs = PAGE_OFS(ofs);
+		cplen = min(PAGE_SIZE - page_ofs, (unsigned long)len);
+		page = find_or_create_page(inode->i_mapping, ofs>>PAGE_SHIFT,
+				GFP_NOIO);
+		if (!page)
+			return -ENOMEM;
+
+		/* A partial overwrite needs the rest of the page from the device */
+		if (!PageUptodate(page) && (page_ofs || (len < PAGE_SIZE))) {
+			buf0 = kmap(page);
+			/* ofs - page_ofs keeps all 64 bits, unlike ofs&PAGE_MASK */
+			err = mtd_read(sb, ofs - page_ofs, PAGE_SIZE, buf0);
+			kunmap(page);
+			if (err) {
+				unlock_page(page);
+				page_cache_release(page);
+				return err;
+			}
+			SetPageUptodate(page);
+		}
+
+		/* One sleeping mapping serves both the copy and the write */
+		buf0 = kmap(page);
+		memcpy(buf0 + page_ofs, buf, cplen);
+		err = mtd_write(sb, ofs, cplen, buf0 + page_ofs);
+		kunmap(page);
+		unlock_page(page);
+		page_cache_release(page);
+		if (err)
+			return err;
+
+		ofs += cplen;
+		buf += cplen;
+		len -= cplen;
+	}
+	return 0;
+}
+#else
+/*
+ * Write-invalidate variant of map_write(): the data goes straight to the MTD
+ * and every cached page overlapping the written range loses its uptodate bit,
+ * so the next map_read() refetches it from the device.
+ */
+static int map_write(struct super_block *sb, loff_t ofs, size_t len, void *buf)
+{
+	struct inode *inode = &logfs_super(sb)->s_mtd->vfs_inode;
+	struct page *page;
+	unsigned long cplen;
+	int err;
+
+	err = mtd_write(sb, ofs, len, buf);
+	if (err)
+		return err;
+
+	/* Step through the range one page-sized piece at a time */
+	for (; len; ofs += cplen, len -= cplen) {
+		cplen = min(PAGE_SIZE - PAGE_OFS(ofs), (unsigned long)len);
+
+		page = find_get_page(inode->i_mapping, ofs>>PAGE_SHIFT);
+		if (!page)
+			continue;
+		ClearPageUptodate(page);
+		page_cache_release(page);
+	}
+	return 0;
+}
+#endif
+
+/*
+ * Erase [@ofs, @ofs + @len) on the MTD, then clear the uptodate bit on every
+ * cached page in that range.  Both @ofs and @len must be page aligned.
+ */
+static int map_erase(struct super_block *sb, loff_t ofs, size_t len)
+{
+	struct inode *inode = &logfs_super(sb)->s_mtd->vfs_inode;
+	struct page *page;
+	int err;
+
+	BUG_ON(PAGE_OFS(ofs) || PAGE_OFS(len));
+
+	err = mtd_erase(sb, ofs, len);
+	if (err)
+		return err;
+
+	for (; len; ofs += PAGE_SIZE, len -= PAGE_SIZE) {
+		page = find_get_page(inode->i_mapping, ofs>>PAGE_SHIFT);
+		if (!page)
+			continue;
+		ClearPageUptodate(page);
+		page_cache_release(page);
+	}
+	return 0;
+}
+
+static const struct logfs_device_ops mtd_devops = {
+	.find_sb	= mtd_find_sb,
+	.read		= map_read,	/* map_*: page cache in front of mtd_* */
+	.write		= map_write,
+	.erase		= map_erase,
+	.sync		= mtd_sync,
+};
+
+/* Mount logfs on MTD @mtdnr; get_mtd_device() may fail with ERR_PTR or NULL. */
+int logfs_get_sb_mtd(struct file_system_type *type, int flags,
+		int mtdnr, struct vfsmount *mnt)
+{
+	struct inode *inode = iget_locked(mtd_mount->mnt_sb, mtdnr);
+	struct mtd_info *mtd;
+
+	if (!inode)
+		return -ENOMEM;
+	if (inode->i_state & I_NEW) {
+		inode->i_mode = S_IFCHR;
+		inode->i_rdev = MKDEV(MTD_CHAR_MAJOR, mtdnr);
+		mtd = get_mtd_device(NULL, mtdnr);
+		mtd_inode(inode)->mtd = IS_ERR(mtd) ? NULL : mtd;
+		if (!mtd_inode(inode)->mtd) {
+			make_bad_inode(inode);
+			unlock_new_inode(inode);
+			iput(inode);
+			return -EINVAL;
+		}
+		unlock_new_inode(inode);
+	}
+
+	mtd_inode(inode)->openers++;
+	return logfs_get_sb_device(type, flags, mtd_inode(inode), NULL,
+			&mtd_devops, mnt);
+}
+
+void logfs_put_mtd(struct mtd_inode *mi)
+{
+	if (!mi)
+		return;
+	if (--mi->openers == 0)	/* last opener: drop all cached pages */
+		truncate_inode_pages(mi->vfs_inode.i_mapping, 0);
+	iput(&mi->vfs_inode);
+}
+
+static struct inode *mtd_alloc_inode(struct super_block *sb)
+{
+	struct mtd_inode *mi;
+
+	/* mtd_cache objects are set up by the init_once() constructor */
+	mi = kmem_cache_alloc(mtd_cache, GFP_KERNEL);
+	return mi ? &mi->vfs_inode : NULL;
+}
+
+static void mtd_destroy_inode(struct inode *inode)
+{
+	/* ->mtd holds no reference if get_mtd_device() failed for this inode */
+	if (mtd_inode(inode)->mtd && !IS_ERR(mtd_inode(inode)->mtd))
+		put_mtd_device(mtd_inode(inode)->mtd);
+	kmem_cache_free(mtd_cache, mtd_inode(inode));
+}
+
+static const struct super_operations mtd_sops = {
+	.alloc_inode	= mtd_alloc_inode,	/* allocates from mtd_cache */
+	.destroy_inode	= mtd_destroy_inode,
+};
+
+static int mtd_get_sb(struct file_system_type *fs_type, int flags,
+		const char *dev_name, void *data, struct vfsmount *mnt)
+{	/* 0x6D746400 is the ASCII string "mtd" followed by a NUL byte */
+	return get_sb_pseudo(fs_type, "mtd:", NULL, 0x6D746400, mnt);
+}
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 23)
+static void init_once(struct kmem_cache *cache, void *_mi)
+#else
+static void init_once(void *_mi, struct kmem_cache *cache, unsigned long flags)
+#endif
+{	/* constructor handed to kmem_cache_create() for mtd_cache */
+	struct mtd_inode *mi = _mi;
+
+	mi->mtd = NULL;
+	mi->openers = 0;
+	inode_init_once(&mi->vfs_inode);
+}
+
+static struct file_system_type mtd_fs_type = {
+	.name		= "mtd",
+	.get_sb		= mtd_get_sb,	/* builds a get_sb_pseudo() superblock */
+	.kill_sb	= kill_anon_super,
+};
+
+/*
+ * Create the inode cache and mount the internal "mtd" pseudo filesystem.
+ */
+static int __init logfs_mtd_init(void)
+{
+	int err;
+
+	mtd_cache = kmem_cache_create("mtd_cache", sizeof(struct mtd_inode), 0,
+			(SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
+			 SLAB_MEM_SPREAD|SLAB_PANIC),
+			init_once DTOR);
+	if (!mtd_cache)
+		return -ENOMEM;
+
+	err = register_filesystem(&mtd_fs_type);
+	if (err)
+		goto out_cache;
+
+	mtd_mount = kern_mount(&mtd_fs_type);
+	if (!IS_ERR(mtd_mount))
+		return 0;
+	err = PTR_ERR(mtd_mount);
+	unregister_filesystem(&mtd_fs_type);
+out_cache:
+	kmem_cache_destroy(mtd_cache);
+	return err;
+}
+
+static void __exit logfs_mtd_exit(void)
+{	/* NOTE(review): mtd_mount from kern_mount() is not released here */
+	unregister_filesystem(&mtd_fs_type);
+	kmem_cache_destroy(mtd_cache);
+}
+
+fs_initcall(logfs_mtd_init); /* FIXME: remove */
--- /dev/null	2008-04-02 16:29:12.813336657 +0200
+++ git/fs/logfs/logfs_abi.h	2008-04-07 11:53:20.923211070 +0200
@@ -0,0 +1,523 @@
+/*
+ * fs/logfs/logfs_abi.h
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2007 Joern Engel <joern@logfs.org>
+ *
+ * Public header for logfs.
+ */
+#ifndef linux_logfs_h
+#define linux_logfs_h
+
+
+/*
+ * Throughout the logfs code, we're constantly dealing with blocks at
+ * various positions or offsets.  To remove confusion, we strictly
+ * distinguish between a "position" - the logical position within a
+ * file and an "offset" - the physical location within the device.
+ *
+ * Any usage of the term offset for a logical location or position for
+ * a physical one is a bug and should get fixed.
+ */
+
+/*
+ * Blocks are allocated in one of several segments depending on their
+ * level.  The following levels are used:
+ *  0	- regular data block
+ *  1	- i1 indirect blocks
+ *  2	- i2 indirect blocks
+ *  3	- i3 indirect blocks
+ *  4	- i4 indirect blocks
+ *  5	- i5 indirect blocks
+ *  6	- ifile data blocks
+ *  7	- ifile i1 indirect blocks
+ *  8	- ifile i2 indirect blocks
+ *  9	- ifile i3 indirect blocks
+ * 10	- ifile i4 indirect blocks
+ * 11	- ifile i5 indirect blocks
+ * Potential levels to be used in the future:
+ * 12	- gc recycled blocks, long-lived data
+ * 13	- replacement blocks, short-lived data
+ *
+ * Levels 1-11 are necessary for robust gc operations and help separate
+ * short-lived metadata from longer-lived file data.  In the future,
+ * file data should get separated into several segments based on simple
+ * heuristics.  Old data recycled during gc operation is expected to be
+ * long-lived.  New data is of uncertain life expectancy.  New data
+ * used to replace older blocks in existing files is expected to be
+ * short-lived.
+ */
+
+
+/* Magic numbers.  64bit for superblock, 32bit for statfs f_type */
+#define LOGFS_MAGIC		0xb21f205ac97e8168ull
+#define LOGFS_MAGIC_U32		0xc97e8168u
+
+/*
+ * Various blocksize related macros.  Blocksize is currently fixed at 4KiB.
+ * Sooner or later that should become configurable and the macros replaced
+ * by something superblock-dependent.  Pointers in indirect blocks are and
+ * will remain 64bit.
+ *
+ * LOGFS_BLOCKSIZE	- self-explaining
+ * LOGFS_BLOCK_FACTOR	- number of pointers per indirect block
+ * LOGFS_BLOCK_BITS	- log2 of LOGFS_BLOCK_FACTOR, used for shifts
+ */
+#define LOGFS_BLOCKSIZE		(4096ull)
+#define LOGFS_BLOCK_FACTOR	(LOGFS_BLOCKSIZE / sizeof(u64))
+#define LOGFS_BLOCK_BITS	(9)
+
+/*
+ * Number of blocks at various levels of indirection.  Each inode originally
+ * had 9 block pointers.  Later the inode size was doubled and there are now
+ * 9+16 pointers - the notation is just historical.
+ *
+ * I0_BLOCKS is the number of direct block pointers in each inode.  The
+ * remaining five pointers are for indirect pointers, up to 5x indirect.
+ * Only 3x is tested and supported at the moment.  5x would allow for truly
+ * humongous files if the need ever arises.
+ * I1_BLOCKS is the number of blocks behind a 1x indirect block,
+ * I2_BLOCKS is the number of blocks behind a 2x indirect block, not counting
+ * the 1x indirect blocks.  etc.
+ */
+#define I0_BLOCKS		(4+16)
+#define I1_BLOCKS		LOGFS_BLOCK_FACTOR
+#define I2_BLOCKS		(LOGFS_BLOCK_FACTOR * I1_BLOCKS)
+#define I3_BLOCKS		(LOGFS_BLOCK_FACTOR * I2_BLOCKS)
+#define I4_BLOCKS		(LOGFS_BLOCK_FACTOR * I3_BLOCKS)
+#define I5_BLOCKS		(LOGFS_BLOCK_FACTOR * I4_BLOCKS)
+
+/* The indices in the block array where the Nx indirect block pointers reside */
+#define I1_INDEX		(4+16)
+#define I2_INDEX		(5+16)
+#define I3_INDEX		(6+16)
+#define I4_INDEX		(7+16)
+#define I5_INDEX		(8+16)
+
+/* The total number of block pointers in each inode */
+#define LOGFS_EMBEDDED_FIELDS	(9+16)
+
+/*
+ * Sizes at which files require another level of indirection.  Files smaller
+ * than LOGFS_EMBEDDED_SIZE can be completely stored in the inode itself,
+ * similar like ext2 fast symlinks.
+ *
+ * Data at a position smaller than LOGFS_I0_SIZE is accessed through the
+ * direct pointers, else through the 1x indirect pointer and so forth.
+ */
+#define LOGFS_EMBEDDED_SIZE	(LOGFS_EMBEDDED_FIELDS * sizeof(u64))
+#define LOGFS_I0_SIZE		(I0_BLOCKS * LOGFS_BLOCKSIZE)
+#define LOGFS_I1_SIZE		(I1_BLOCKS * LOGFS_BLOCKSIZE)
+#define LOGFS_I2_SIZE		(I2_BLOCKS * LOGFS_BLOCKSIZE)
+#define LOGFS_I3_SIZE		(I3_BLOCKS * LOGFS_BLOCKSIZE)
+#define LOGFS_I4_SIZE		(I4_BLOCKS * LOGFS_BLOCKSIZE)
+#define LOGFS_I5_SIZE		(I5_BLOCKS * LOGFS_BLOCKSIZE)
+
+/*
+ * Each indirect block pointer must have this flag set, if all block pointers
+ * behind it are set, i.e. there is no hole hidden in the shadow of this
+ * indirect block pointer.
+ */
+#define LOGFS_FULLY_POPULATED (1ULL << 63)
+#define pure_ofs(ofs) ((ofs) & ~LOGFS_FULLY_POPULATED) /* offset sans flag */
+
+/*
+ * LogFS needs to separate data into levels.  Each level is defined as the
+ * maximal possible distance from the master inode (inode of the inode file).
+ * Data blocks reside on level 0, 1x indirect block on level 1, etc.
+ * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11.
+ * This effort is necessary to guarantee garbage collection to always make
+ * progress.
+ *
+ * LOGFS_MAX_INDIRECT is the maximal indirection through indirect blocks,
+ * LOGFS_MAX_LEVELS is one more for the actual data level of a file.  It is
+ * the maximal number of levels for one file.
+ * LOGFS_NO_AREAS is twice that, as the inode file and regular files are
+ * effectively stacked on top of each other.
+ */
+#define LOGFS_MAX_INDIRECT	(5)
+#define LOGFS_MAX_LEVELS	(LOGFS_MAX_INDIRECT + 1)
+#define LOGFS_NO_AREAS		(2 * LOGFS_MAX_LEVELS)
+
+/* Maximum size of filenames */
+#define LOGFS_MAX_NAMELEN	(255)
+
+/* Number of segments in the primary journal. */
+#define LOGFS_JOURNAL_SEGS	(4)
+
+/* Maximum number of free/erased/etc. segments in journal entries */
+#define MAX_CACHED_SEGS		(64)
+
+
+/*
+ * LOGFS_HEADERSIZE is the size of a single header in the object store,
+ * LOGFS_MAX_OBJECTSIZE the size of the largest possible object, including
+ * its header,
+ * LOGFS_SEGMENT_RESERVE is the amount of space reserved for each segment for
+ * its segment header and the padded space at the end when no further objects
+ * fit.
+ */
+#define LOGFS_HEADERSIZE	(0x1c)
+#define LOGFS_SEGMENT_HEADERSIZE (0x18)
+#define LOGFS_MAX_OBJECTSIZE	(LOGFS_HEADERSIZE + LOGFS_BLOCKSIZE)
+#define LOGFS_SEGMENT_RESERVE	(LOGFS_HEADERSIZE + LOGFS_MAX_OBJECTSIZE - 1)
+
+
+/**
+ * struct logfs_disk_super - on-medium superblock
+ *
+ * @ds_magic:			magic number, must equal LOGFS_MAGIC
+ * @ds_crc:			crc32 of structure starting with the next field
+ * @ds_ifile_levels:		maximum number of levels for ifile
+ * @ds_iblock_levels:		maximum number of levels for regular files
+ * @ds_data_levels:		number of separate levels for data
+ * @pad0:			reserved, must be 0
+ * @ds_feature_incompat:	incompatible filesystem features
+ * @ds_feature_ro_compat:	read-only compatible filesystem features
+ * @ds_feature_compat:		compatible filesystem features
+ * @ds_flags:			flags
+ * @ds_segment_shift:		log2 of segment size
+ * @ds_block_shift:		log2 of block size
+ * @ds_write_shift:		log2 of write size
+ * @pad1:			reserved, must be 0
+ * @ds_journal_seg:		segments used by primary journal
+ * @ds_root_reserve:		bytes reserved for the superuser
+ * @pad2:			reserved, must be 0
+ *
+ * Contains only read-only fields.  Read-write fields like the amount of used
+ * space is tracked in the dynamic superblock, which is stored in the journal.
+ */
+struct logfs_disk_super {
+	__be64	ds_magic;
+	__be32	ds_crc;
+	__u8	ds_ifile_levels;
+	__u8	ds_iblock_levels;
+	__u8	ds_data_levels;
+	__u8	pad0;
+
+	__be64	ds_feature_incompat;
+	__be64	ds_feature_ro_compat;
+
+	__be64	ds_feature_compat;
+	__be64	ds_flags;
+
+	__be64	ds_filesystem_size; /* NOTE(review): missing from kernel-doc */
+	__u8	ds_segment_shift;
+	__u8	ds_block_shift;
+	__u8	ds_write_shift;
+	__u8	pad1[5];
+
+	__be64	ds_journal_seg[LOGFS_JOURNAL_SEGS];
+
+	__be64	ds_root_reserve;
+
+	__be64	pad2[19];
+} __attribute__((packed));
+
+
+/*
+ * Inode flags.  High bits should never be written to the medium.  Used either
+ * to catch obviously corrupt data (all 0xff) or for flags that are used
+ * in-memory only.
+ *
+ * LOGFS_IF_VALID	Inode is valid, must be 1 (catch all 0x00 case)
+ * LOGFS_IF_EMBEDDED	Inode is a fast inode (data embedded in pointers)
+ *
+ * LOGFS_IF_DIRTY	Inode must be written back
+ * LOGFS_IF_ZOMBIE	Inode has been deleted
+ * LOGFS_IF_STILLBORN	-ENOSPC happened when creating inode
+ * LOGFS_IF_INVALID	Inode is invalid, must be 0 (catch all 0xff case)
+ */
+#define LOGFS_IF_VALID		0x00000001
+#define LOGFS_IF_EMBEDDED	0x00000002
+#define LOGFS_IF_COMPRESSED	0x00000004 /* == FS_COMPR_FL */
+#define LOGFS_IF_DIRTY		0x10000000
+#define LOGFS_IF_ZOMBIE		0x20000000
+#define LOGFS_IF_STILLBORN	0x40000000
+#define LOGFS_IF_INVALID	0x80000000
+
+
+/* Flags available to chattr */
+#define LOGFS_FL_USER_VISIBLE	(LOGFS_IF_COMPRESSED)
+#define LOGFS_FL_USER_MODIFIABLE (LOGFS_IF_COMPRESSED)
+/* Flags inherited from parent directory on file/directory creation */
+#define LOGFS_FL_INHERITED	(LOGFS_IF_COMPRESSED)
+
+
+/**
+ * struct logfs_disk_inode - on-medium inode
+ *
+ * @di_mode:			file mode
+ * @di_pad:			reserved, must be 0
+ * @di_flags:			inode flags, see above
+ * @di_uid:			user id
+ * @di_gid:			group id
+ * @di_ctime:			change time
+ * @di_mtime:			modify time
+ * @di_refcount:		reference count (aka nlink or link count)
+ * @di_generation:		inode generation, for nfs
+ * @di_used_bytes:		number of bytes used
+ * @di_size:			file size
+ * @di_data:			data pointers
+ */
+struct logfs_disk_inode {
+	__be16	di_mode;
+	__u8	di_height;	/* NOTE(review): missing from kernel-doc above */
+	__u8	di_pad;
+	__be32	di_flags;
+	__be32	di_uid;
+	__be32	di_gid;
+
+	__be64	di_ctime;
+	__be64	di_mtime;
+
+	__be32	di_refcount;
+	__be32	di_generation;
+	__be64	di_used_bytes;
+
+	__be64	di_size;
+	__be64	di_data[LOGFS_EMBEDDED_FIELDS];
+} __attribute__((packed));
+
+
+/**
+ * struct logfs_disk_dentry - on-medium dentry structure
+ *
+ * @ino:			inode number
+ * @namelen:			length of file name
+ * @type:			file type, identical to bits 12..15 of mode
+ * @name:			file name
+ */
+struct logfs_disk_dentry {
+	__be64	ino;
+	__be16	namelen;
+	__u8	type;
+	__u8	name[LOGFS_MAX_NAMELEN];
+} __attribute__((packed));
+
+
+#define OBJ_TOP_JOURNAL	1	/* segment header for master journal */
+#define OBJ_JOURNAL	2	/* segment header for journal */
+#define OBJ_OSTORE	3	/* segment header for ostore */
+#define OBJ_BLOCK	4	/* data block */
+#define OBJ_INODE	5	/* inode */
+#define OBJ_DENTRY	6	/* dentry */
+
+
+/**
+ * struct logfs_object_header - per-object header in the ostore
+ *
+ * @crc:			crc32 of header, excluding data_crc
+ * @len:			length of data
+ * @type:			object type, see above
+ * @compr:			compression type
+ * @ino:			inode number
+ * @bix:			block index
+ * @data_crc:			crc32 of payload
+ */
+struct logfs_object_header {
+	__be32	crc;
+	__be16	len;
+	__u8	type;
+	__u8	compr;
+	__be64	ino;
+	__be64	bix;
+	__be32	data_crc;
+} __attribute__((packed));
+
+
+/**
+ * struct logfs_segment_header - per-segment header in the ostore
+ *
+ * @crc:			crc32 of header (there is no data)
+ * @pad:			unused, must be 0
+ * @type:			object type, see above
+ * @level:			GC level for all objects in this segment
+ * @segno:			segment number
+ * @ec:				erase count for this segment
+ * @gec:			global erase count at time of writing
+ */
+struct logfs_segment_header {
+	__be32	crc;
+	__be16	pad;
+	__u8	type;
+	__u8	level;
+	__be32	segno;
+	__be32	ec;
+	__be64	gec;
+} __attribute__((packed));
+
+
+/**
+ * struct logfs_journal_header - header for journal entries (JEs)
+ *
+ * @h_crc:			crc32 of journal entry
+ * @h_len:			length of compressed journal entry
+ * @h_datalen:			length of uncompressed data
+ * @h_type:			JE type
+ * @h_version:			unnormalized version of journal entry
+ * @h_compr:			compression type
+ * @h_pad:			reserved
+ */
+struct logfs_journal_header {
+	__be32	h_crc;
+	__be16	h_len;
+	__be16	h_datalen;
+	__be16	h_type;
+	__be16	h_version;
+	__u8	h_compr;
+	__u8	h_pad[3];
+} __attribute__((packed));
+
+
+/**
+ * struct logfs_je_dynsb - dynamic superblock
+ *
+ * @ds_gec:			global erase count
+ * @ds_sweeper:			current position of GC "sweeper"
+ * @ds_rename_dir:		source directory ino (see dir.c documentation)
+ * @ds_rename_pos:		position of source dd (see dir.c documentation)
+ * @ds_victim_ino:		victims of incomplete dir operation (see dir.c)
+ * @ds_used_bytes:		number of used bytes
+ */
+struct logfs_je_dynsb {
+	__be64	ds_gec;
+	__be64	ds_sweeper;
+
+	__be64	ds_rename_dir;
+	__be64	ds_rename_pos;
+
+	__be64	ds_victim_ino;
+	__be64	ds_used_bytes;
+};
+
+
+/**
+ * struct logfs_je_anchor - anchor of filesystem tree, aka master inode
+ *
+ * @da_size:			size of inode file
+ * @da_last_ino:		last created inode
+ * @da_used_bytes:		number of bytes used
+ * @da_data:			data pointers
+ */
+struct logfs_je_anchor {
+	__be64	da_size;
+	__be64	da_last_ino;
+
+	__be64	da_used_bytes;
+	__be64	da_data[LOGFS_EMBEDDED_FIELDS];
+} __attribute__((packed));
+
+
+/**
+ * struct logfs_je_spillout - spillout entry (from 1st to 2nd journal)
+ *
+ * @so_segment:			segments used for 2nd journal
+ *
+ * Length of the array is given by h_len field in the header.
+ */
+struct logfs_je_spillout {
+	__be64	so_segment[0];
+} __attribute__((packed));
+
+
+/**
+ * struct logfs_je_journal_ec - erase counts for all journal segments
+ *
+ * @ec:				erase count
+ *
+ * Length of the array is given by h_len field in the header.
+ */
+struct logfs_je_journal_ec {
+	__be32	ec[0];
+} __attribute__((packed));
+
+
+/**
+ * struct logfs_je_free_segments - list of free segments with erase count
+ */
+struct logfs_je_free_segments {
+	__be32	segno;
+	__be32	ec;
+} __attribute__((packed));
+
+
+/**
+ * struct logfs_je_areas - management information for current areas
+ *
+ * @used_bytes:			number of bytes already used
+ * @segno:			segment number of area
+ *
+ * "Areas" are segments currently being used for writing.  There is one area
+ * per GC level.  Each area also has a write buffer that is stored in the
+ * journal, in entries 0x10..0x1f.
+ */
+struct logfs_je_areas {
+	__be32	used_bytes[16];
+	__be32	segno[16];
+} __attribute__((packed));
+
+
+enum {
+	COMPR_NONE	= 0,
+	COMPR_ZLIB	= 1,
+};
+
+
+/*
+ * Journal entries come in groups of 16.  First group contains unique
+ * entries, next groups contain one entry per level
+ *
+ * JE_FIRST	- smallest possible journal entry number
+ *
+ * JEG_BASE	- base group, containing unique entries
+ * JE_COMMIT	- commit entry, validates all previous entries
+ * JE_ABORT	- abort entry, invalidates all previous non-committed entries
+ * JE_DYNSB	- dynamic superblock, anything that ought to be in the
+ *		  superblock but cannot because it is read-write data
+ * JE_ANCHOR	- anchor aka master inode aka inode file's inode
+ * JE_ERASECOUNT  erasecounts for all journal segments
+ * JE_SPILLOUT	- unused
+ * JE_BADSEGMENTS bad segments
+ * JE_AREAS	- area description sans wbuf
+ * JE_FREESEGS	- free segments that can get deleted immediately
+ * JE_LOWSEGS	- segments with low population, good candidates for GC
+ * JE_OLDSEGS	- segments with low erasecount, may need to get moved
+ *
+ * JEG_WBUF	- wbuf group, one entry per area
+ *
+ * JE_LAST	- largest possible journal entry number
+ */
+enum {
+	JE_FIRST	= 0x01,
+
+	JEG_BASE	= 0x00,
+	JE_COMMIT	= 0x01,
+	JE_ABORT	= 0x02,
+	JE_DYNSB	= 0x03,
+	JE_ANCHOR	= 0x04,
+	JE_ERASECOUNT	= 0x05,
+	JE_SPILLOUT	= 0x06,
+	JE_BADSEGMENTS	= 0x08,
+	JE_AREAS	= 0x09,
+	JE_FREESEGS	= 0x0a,
+	JE_LOWSEGS	= 0x0b,
+	JE_OLDSEGS	= 0x0c,
+
+	JEG_WBUF	= 0x10,
+
+	JE_LAST		= 0x1f,
+};
+
+
+			/*	0	reserved for gc markers */
+#define LOGFS_INO_MASTER	1	/* inode file */
+#define LOGFS_INO_ROOT		2	/* root directory */
+#define LOGFS_INO_ATIME		4	/* atime for all inodes */
+#define LOGFS_INO_BAD_BLOCKS	5	/* bad blocks */
+#define LOGFS_INO_OBSOLETE	6	/* obsolete block count */
+#define LOGFS_INO_ERASE_COUNT	7	/* erase count */
+#define LOGFS_RESERVED_INOS	16
+
+#endif
