Signed-off-by: Joern Engel <joern@logfs.org>

 fs/Kconfig            |   27 
 fs/Makefile           |    1 
 fs/logfs/Locking      |   48 +
 fs/logfs/Makefile     |   15 
 fs/logfs/compr.c      |   95 ++
 fs/logfs/dev_bdev.c   |  151 ++++
 fs/logfs/dev_mtd.c    |  410 ++++++++++++
 fs/logfs/dir.c        |  765 +++++++++++++++++++++++
 fs/logfs/file.c       |  236 +++++++
 fs/logfs/gc.c         |  730 ++++++++++++++++++++++
 fs/logfs/inode.c      |  589 ++++++++++++++++++
 fs/logfs/journal.c    |  805 ++++++++++++++++++++++++
 fs/logfs/logfs.h      |  587 ++++++++++++++++++
 fs/logfs/logfs_abi.h  |  523 ++++++++++++++++
 fs/logfs/memtree.c    |  402 ++++++++++++
 fs/logfs/progs/fsck.c |  347 ++++++++++
 fs/logfs/readwrite.c  | 1618 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/logfs/segment.c    |  595 ++++++++++++++++++
 fs/logfs/super.c      |  381 +++++++++++
 19 files changed, 8325 insertions(+)

--- git/fs/Kconfig~logfs	2008-04-07 11:39:32.989960448 +0200
+++ git/fs/Kconfig	2008-04-07 11:53:20.906568828 +0200
@@ -1347,6 +1347,33 @@ config JFFS2_CMODE_FAVOURLZO
 
 endchoice
 
+config LOGFS
+	bool "Log Filesystem (EXPERIMENTAL)"
+	depends on (MTD || BLOCK) && EXPERIMENTAL
+	select ZLIB_INFLATE
+	select ZLIB_DEFLATE
+	select CRC32
+	help
+	  Flash filesystem aimed to scale efficiently to large devices.
+	  In comparison to JFFS2 it offers significantly faster mount
+	  times and potentially less RAM usage, although the latter has
+	  not been measured yet.
+
+	  In its current state it is still very experimental and should
+	  not be used for other than testing purposes.
+
+	  If unsure, say N.
+
+config LOGFS_FSCK
+	bool "Run LogFS fsck at mount time"
+	depends on LOGFS
+	help
+	  Run a full filesystem check on every mount.  If any errors are
+	  found, mounting the filesystem will fail.  This is a debug option
+	  for developers.
+
+	  If unsure, say N.
+
 config CRAMFS
 	tristate "Compressed ROM file system support (cramfs)"
 	depends on BLOCK
--- git/fs/Makefile~logfs	2008-04-07 11:39:32.989960448 +0200
+++ git/fs/Makefile	2008-04-07 11:53:20.906568828 +0200
@@ -100,6 +100,7 @@ obj-$(CONFIG_NTFS_FS)		+= ntfs/
 obj-$(CONFIG_UFS_FS)		+= ufs/
 obj-$(CONFIG_EFS_FS)		+= efs/
 obj-$(CONFIG_JFFS2_FS)		+= jffs2/
+obj-$(CONFIG_LOGFS)		+= logfs/
 obj-$(CONFIG_AFFS_FS)		+= affs/
 obj-$(CONFIG_ROMFS_FS)		+= romfs/
 obj-$(CONFIG_QNX4FS_FS)		+= qnx4/
--- /dev/null	2008-04-02 16:29:12.813336657 +0200
+++ git/fs/logfs/Makefile	2008-04-07 11:53:20.909877890 +0200
@@ -0,0 +1,15 @@
+obj-$(CONFIG_LOGFS)	+= logfs.o
+
+logfs-y	+= compr.o
+logfs-y	+= dir.o
+logfs-y	+= file.o
+logfs-y	+= gc.o
+logfs-y	+= inode.o
+logfs-y	+= journal.o
+logfs-y	+= memtree.o
+logfs-y	+= readwrite.o
+logfs-y	+= segment.o
+logfs-y	+= super.o
+logfs-$(CONFIG_BLOCK)	+= dev_bdev.o
+logfs-$(CONFIG_MTD)	+= dev_mtd.o
+logfs-$(CONFIG_LOGFS_FSCK)	+= progs/fsck.o
--- /dev/null	2008-04-02 16:29:12.813336657 +0200
+++ git/fs/logfs/logfs.h	2008-04-07 12:50:55.359552665 +0200
@@ -0,0 +1,587 @@
+/*
+ * fs/logfs/logfs.h
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2007 Joern Engel <joern@logfs.org>
+ *
+ * Private header for logfs.
+ */
+#ifndef fs_logfs_logfs_h
+#define fs_logfs_logfs_h
+
+#define __CHECK_ENDIAN__
+
+#include <linux/crc32.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/mempool.h>
+#include <linux/pagemap.h>
+#include <linux/mtd/mtd.h>
+#include "logfs_abi.h"
+
+#define PG_zero			PG_owner_priv_1
+#define PageZero(page)		test_bit(PG_zero, &(page)->flags)
+#define SetPageZero(page)	set_bit(PG_zero, &(page)->flags)
+#define ClearPageZero(page)	clear_bit(PG_zero, &(page)->flags)
+
+/*
+ * There is no generic kernel btree library yet.  When such a thing gets
+ * introduced, this definition and the corresponding source file should
+ * get removed.
+ */
+struct btree_head {
+	struct btree_node *node;
+	int height;
+	void *null_ptr;
+};
+
+static inline void build_bug_on_needs_a_function(void)
+{
+	BUILD_BUG_ON(sizeof(struct logfs_object_header) != LOGFS_HEADERSIZE);
+	BUILD_BUG_ON(sizeof(struct logfs_segment_header)
+			!= LOGFS_SEGMENT_HEADERSIZE);
+}
+
+/* FIXME: This should really be somewhere in the 64bit area. */
+#define LOGFS_LINK_MAX		(1<<30)
+
+/*
+ * Private errno for accessed beyond end-of-file.  Only used internally to
+ * logfs.  If this ever gets exposed to userspace or even other parts of the
+ * kernel, it is a bug.  256 was chosen as a number sufficiently above all
+ * used errno #defines.
+ *
+ * It can be argued that this is a hack and should be replaced with something
+ * else.  My last attempt to do this failed spectacularly and there are more
+ * urgent problems that users actually care about.  This will remain for the
+ * moment.  Patches are welcome, of course.
+ */
+#define EOF			(512)
+
+/* Read-only filesystem */
+#define LOGFS_SB_FLAG_RO	1
+
+/* Write Control Flags */
+#define WF_LOCK		0x01	/* take write lock */
+#define WF_WRITE	0x02	/* write block */
+#define WF_DELETE	0x04	/* delete old block */
+#define WF_SYNC		0x08	/* sync every indirect block */
+#define WF_GC		0x10	/* GC write, move to GC list */
+
+/**
+ * struct logfs_area - area management information
+ *
+ * @a_sb:			the superblock this area belongs to
+ * @a_is_open:			1 if the area is currently open, else 0
+ * @a_segno:			segment number of area
+ * @a_used_bytes:		number of used bytes
+ * @a_ops:			area operations (either journal or ostore)
+ * @a_wbuf:			write buffer
+ * @a_erase_count:		erase count
+ * @a_level:			GC level
+ */
+struct logfs_area { /* a segment open for writing */
+	struct super_block *a_sb;
+	int	a_is_open;
+	u32	a_segno;
+	u32	a_used_bytes;
+	const struct logfs_area_ops *a_ops;
+	void			*a_wbuf;
+	u32	a_erase_count;
+	u8	a_level;
+};
+
+/**
+ * struct logfs_area_ops - area operations
+ *
+ * @get_free_segment:		fill area->ofs with the offset of a free segment
+ * @get_erase_count:		fill area->erase_count (needs area->ofs)
+ * @erase_segment:		erase and setup segment
+ * @finish_area:		flush buffers, etc.
+ */
+struct logfs_area_ops {
+	void	(*get_free_segment)(struct logfs_area *area);
+	void	(*get_erase_count)(struct logfs_area *area);
+	int	(*erase_segment)(struct logfs_area *area);
+	void	(*finish_area)(struct logfs_area *area);
+};
+
+/**
+ * struct logfs_device_ops - device access operations
+ *
+ * @read:			read from the device
+ * @write:			write to the device
+ * @erase:			erase part of the device
+ */
+struct logfs_device_ops {
+	s64 (*find_sb)(struct super_block *sb);
+	int (*read)(struct super_block *sb, loff_t ofs, size_t len, void *buf);
+	int (*write)(struct super_block *sb, loff_t ofs, size_t len, void *buf);
+	int (*erase)(struct super_block *sb, loff_t ofs, size_t len);
+	void (*sync)(struct super_block *sb);
+};
+
+/**
+ * struct gc_candidate - "candidate" segment to be garbage collected next
+ *
+ * @list:			list (either free of low)
+ * @segno:			segment number
+ * @valid:			number of valid bytes
+ * @erase_count:		erase count of segment
+ * @dist:			distance from tree root
+ *
+ * Candidates can be on two lists.  The free list contains electees rather
+ * than candidates - segments that no longer contain any valid data.  The
+ * low list contains candidates to be picked for GC.  It should be kept
+ * short.  It is not required to always pick a perfect candidate.  In the
+ * worst case GC will have to move more data than absolutely necessary.
+ */
+struct gc_candidate {
+	struct list_head list;
+	u64	gec;
+	u32	segno;
+	u32	valid;
+	u32	erase_count;
+	u8	dist;
+};
+
+/**
+ * struct candidate_list - list of similar candidates
+ */
+struct candidate_list {
+	struct list_head list;
+	int count;
+	int maxcount;
+	int sort_by_ec;
+};
+
+/**
+ * struct logfs_journal_entry - temporary structure used during journal scan
+ *
+ * @used:
+ * @version:			normalized version
+ * @len:			length
+ * @offset:			offset
+ */
+struct logfs_journal_entry {
+	int used;
+	s16 version;
+	u16 len;
+	u16 datalen;
+	u64 offset;
+};
+
+enum transaction_state {
+	CREATE_1 = 1,
+	CREATE_2,
+	UNLINK_1,
+	UNLINK_2,
+	CROSS_RENAME_1,
+	CROSS_RENAME_2,
+	TARGET_RENAME_1,
+	TARGET_RENAME_2,
+	TARGET_RENAME_3
+};
+
+/**
+ * struct logfs_transaction - essential fields to support atomic dirops
+ *
+ * @ino:			target inode
+ * @dir:			inode of directory containing dentry
+ * @pos:			pos of dentry in directory
+ */
+struct logfs_transaction {
+	enum transaction_state state;
+	u64	 ino;
+	u64	 dir;
+	u64	 pos;
+};
+
+/**
+ * struct logfs_shadow - old block in the shadow of a not-yet-committed new one
+ * @old_ofs:			offset of old block on medium
+ * @new_ofs:			offset of new block on medium
+ * @ino:			inode number
+ * @bix:			block index
+ * @old_len:			size of old block, including header
+ * @new_len:			size of new block, including header
+ * @level:			block level
+ */
+struct logfs_shadow {
+	u64 old_ofs;
+	u64 new_ofs;
+	u64 ino;
+	u64 bix;
+	int old_len;
+	int new_len;
+	u8 level;
+};
+
+/**
+ * struct shadow_tree
+ * @new:			shadows where old_ofs==0, indexed by new_ofs
+ * @old:			shadows where old_ofs!=0, indexed by old_ofs
+ */
+struct shadow_tree {
+	struct btree_head new;
+	struct btree_head old;
+};
+
+/**
+ * struct logfs_block - contains any block state
+ * @tree:			btree of shadows, indexed by old_ofs
+ */
+struct logfs_block {
+	struct list_head dirty_list;
+	struct shadow_tree shadow_tree;
+	struct page *page;
+	struct logfs_transaction *ta;
+};
+
+struct mtd_inode {
+	struct mtd_info *mtd;
+	long openers;
+	struct inode vfs_inode;
+};
+
+struct logfs_super {
+	struct mtd_inode *s_mtd;		/* underlying device */
+	struct block_device *s_bdev;		/* underlying device */
+	int	 s_sync;			/* sync on next io? */
+	const struct logfs_device_ops *s_devops;/* device access */
+	struct inode	*s_master_inode;	/* ifile */
+	long	 s_flags;
+	/* dir.c fields */
+	struct mutex s_dirop_mutex;		/* for creat/unlink/rename */
+	u64	 s_victim_ino;			/* used for atomic dir-ops */
+	u64	 s_rename_dir;			/* source directory ino */
+	u64	 s_rename_pos;			/* position of source dd */
+	/* gc.c fields */
+	long	 s_segsize;			/* size of a segment */
+	int	 s_segshift;			/* log2 of segment size */
+	long	 s_no_segs;			/* segments on device */
+	long	 s_no_blocks;			/* blocks per segment */
+	long	 s_writesize;			/* minimum write size */
+	int	 s_writeshift;			/* log2 of write size */
+	u64	 s_size;			/* filesystem size */
+	struct logfs_area *s_area[LOGFS_NO_AREAS];	/* open segment array */
+	u64	 s_gec;				/* global erase count */
+	u64	 s_sweeper;			/* current sweeper pos */
+	u8	 s_ifile_levels;		/* max level of ifile */
+	u8	 s_iblock_levels;		/* max level of regular files */
+	u8	 s_data_levels;			/* # of segments to leaf block*/
+	u8	 s_total_levels;		/* sum of above three */
+	struct candidate_list s_free_list;	/* 100% free segments */
+	struct candidate_list s_low_list[LOGFS_NO_AREAS];/* good candidates */
+	struct candidate_list s_ec_list;	/* wear level candidates */
+	struct btree_head s_reserved_segments;	/* sb, journal, bad, etc. */
+	struct list_head s_dirty_list;		/* list of dirty blocks */
+	struct list_head s_gc_dirty_list[LOGFS_NO_AREAS];/* blocks dirtied during GC */
+	/* inode.c fields */
+	spinlock_t s_ino_lock;			/* lock s_last_ino on 32bit */
+	u64	 s_last_ino;			/* highest ino used */
+	struct list_head s_freeing_list;	/* inodes being freed */
+	/* journal.c fields */
+	struct mutex s_journal_mutex;
+	void	*s_je;				/* journal entry to compress */
+	void	*s_compressed_je;		/* block to write to journal */
+	u64	 s_journal_seg[LOGFS_JOURNAL_SEGS]; /* journal segments */
+	u32	 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */
+	u64	 s_last_version;
+	struct logfs_area *s_journal_area;	/* open journal segment */
+	struct logfs_journal_entry s_retired[JE_LAST+1]; /* for journal scan */
+	struct logfs_journal_entry s_speculative[JE_LAST+1]; /* dito */
+	struct logfs_journal_entry s_first;		/* dito */
+	int	 s_sum_index;			/* for the 12 summaries */
+	__be32	*s_bb_array;			/* bad segments */
+	/* readwrite.c fields */
+	struct mutex s_w_mutex;
+	struct page *s_write_page;		/* page under writeback now */
+	mempool_t *s_block_pool;		/* struct logfs_block pool */
+	mempool_t *s_shadow_pool;		/* struct logfs_shadow pool */
+	/*
+	 * Space accounting in LogFS:
+	 * - s_used_bytes specifies space used to store valid data objects.
+	 * - s_dirty_used_bytes is space used to store non-committed data
+	 *   objects.  Those objects have already been written themselves,
+	 *   but they don't become valid until all indirect blocks up to the
+	 *   journal have been written as well.
+	 * - s_dirty_free_bytes is space used to store the old copy of a
+	 *   replaced object, as long as the replacement is non-committed.
+	 *   In other words, it is the amount of space freed when all dirty
+	 *   blocks are written back.
+	 * - s_free_bytes is the amount of free space available for any
+	 *   purpose.
+	 * - s_root_reserve is the amount of free space available only to
+	 *   the root user.
+	 * - s_gc_reserve is currently a mess.
+	 */
+	u64	 s_free_bytes;			/* free space */
+	u64	 s_used_bytes;			/* used (valid) data */
+	u64	 s_dirty_free_bytes;		/* space freed on commit */
+	u64	 s_dirty_used_bytes;		/* space used on commit */
+	u64	 s_gc_reserve;			/* space reserved for GC */
+	u64	 s_root_reserve;	/* FIXME: currently unused */
+	u32	 s_bad_segments;		/* number of bad segments */
+};
+
+/**
+ * struct logfs_inode - in-memory inode
+ *
+ * @vfs_inode:			struct inode
+ * @li_data:			data pointers
+ * @li_used_bytes:		number of used bytes
+ * @li_freeing_list:		used to track inodes currently being freed
+ * @li_flags:			inode flags
+ */
+struct logfs_inode {
+	struct inode vfs_inode;
+	u64	li_data[LOGFS_EMBEDDED_FIELDS];
+	u64	li_used_bytes;
+	struct list_head li_freeing_list;
+	struct logfs_transaction *li_transaction;
+	struct shadow_tree li_shadow_tree;
+	u32	li_flags;
+	u8	li_height;
+};
+
+#define journal_for_each(__i) for (__i = 0; __i < LOGFS_JOURNAL_SEGS; __i++)
+#define for_each_area(__i) for (__i = 0; __i < LOGFS_NO_AREAS; __i++)
+
+/* compr.c */
+int logfs_compress(void *in, void *out, size_t inlen, size_t outlen);
+int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen);
+int __init logfs_compr_init(void);
+void logfs_compr_exit(void);
+
+/* dev_bdev.c */
+#ifdef CONFIG_BLOCK
+int logfs_get_sb_bdev(struct file_system_type *type, int flags,
+		const char *devname, struct vfsmount *mnt);
+
+static inline void logfs_put_bdev(struct block_device *bdev)
+{
+	if (bdev)
+		close_bdev_excl(bdev);
+}
+#else
+static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags,
+		const char *devname, struct vfsmount *mnt)
+{
+	return -ENODEV;
+}
+
+static inline void logfs_put_bdev(struct block_device *bdev)
+{
+}
+#endif
+
+/* dev_mtd.c */
+#ifdef CONFIG_MTD
+int logfs_get_sb_mtd(struct file_system_type *type, int flags,
+		int mtdnr, struct vfsmount *mnt);
+void logfs_put_mtd(struct mtd_inode *mi);
+#else
+static inline int logfs_get_sb_mtd(struct file_system_type *type, int flags,
+		int mtdnr, struct vfsmount *mnt)
+{
+	return -ENODEV;
+}
+
+static inline void logfs_put_mtd(struct mtd_inode *mi)
+{
+}
+#endif
+
+/* dir.c */
+extern const struct inode_operations logfs_symlink_iops;
+extern const struct inode_operations logfs_dir_iops;
+extern const struct file_operations logfs_dir_fops;
+int logfs_replay_journal(struct super_block *sb);
+
+/* file.c */
+extern const struct inode_operations logfs_reg_iops;
+extern const struct file_operations logfs_reg_fops;
+extern const struct address_space_operations logfs_reg_aops;
+int logfs_readpage(struct file *file, struct page *page);
+int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
+		unsigned long arg);
+int logfs_fsync(struct file *file, struct dentry *dentry, int datasync);
+
+/* gc.c */
+int logfs_safe_to_write_block(struct super_block *sb, u8 level);
+struct gc_candidate *get_best_cand(struct candidate_list *list);
+int add_free_segments_from_journal(struct super_block *sb,
+		struct logfs_je_free_segments *segs, int count);
+void logfs_dirty_for_gc(struct super_block *sb, struct logfs_block *block);
+void logfs_gc_pass(struct super_block *sb);
+int logfs_check_areas(struct super_block *sb);
+int logfs_init_gc(struct logfs_super *super);
+void logfs_cleanup_gc(struct logfs_super *super);
+
+/* inode.c */
+extern const struct super_operations logfs_super_operations;
+struct inode *logfs_iget(struct super_block *sb, ino_t ino, int *cookie);
+void logfs_iput(struct inode *inode, int cookie);
+struct inode *logfs_new_inode(struct inode *dir, int mode);
+struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino);
+int logfs_init_inode_cache(void);
+void logfs_destroy_inode_cache(void);
+int __logfs_write_inode(struct inode *inode, long flags);
+void __logfs_destroy_inode(struct inode *inode);
+void logfs_set_blocks(struct inode *inode, u64 no);
+struct inode *dhowells_iget(struct super_block *sb, ino_t ino);
+
+/* journal.c */
+int logfs_write_anchor(struct inode *inode);
+int logfs_init_journal(struct super_block *sb);
+void logfs_cleanup_journal(struct super_block *sb);
+
+/* memtree.c */
+void btree_init(struct btree_head *head);
+void *btree_lookup(struct btree_head *head, u64 val);
+int btree_insert(struct btree_head *head, u64 val, void *ptr);
+void *btree_remove(struct btree_head *head, u64 val);
+int btree_merge(struct btree_head *target, struct btree_head *victim);
+void btree_visitor(struct btree_head *head, long opaque,
+		void (*func)(void *elem, long opaque, u64 val));
+void btree_grim_visitor(struct btree_head *head, long opaque,
+		void (*func)(void *elem, long opaque, u64 val));
+
+/* readwrite.c */
+void logfs_unpack_index(pgoff_t index, u64 *bix, u8 *level);
+void logfs_flush_dirty(struct super_block *sb, int sync);
+int logfs_inode_read(struct inode *inode, void *buf, size_t n, loff_t _pos);
+int logfs_inode_write(struct inode *inode, const void *buf, size_t count,
+		loff_t bix, long flags, struct logfs_transaction *ta,
+		struct shadow_tree *shadow_tree);
+int logfs_readpage_nolock(struct page *page);
+int logfs_write_buf(struct inode *inode, struct page *page,
+		struct logfs_transaction *ta, long flags);
+int logfs_delete(struct inode *inode, pgoff_t index,
+		struct shadow_tree *shadow_tree, struct logfs_transaction *ta);
+int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs, int level,
+		long flags);
+int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 pos,
+		u8 level);
+int logfs_truncate(struct inode *inode, u64 size);
+u64 logfs_seek_hole(struct inode *inode, u64 bix);
+u64 logfs_seek_data(struct inode *inode, u64 bix);
+int logfs_init_rw(struct logfs_super *super);
+void logfs_cleanup_rw(struct logfs_super *super);
+
+/* segment.c */
+int logfs_erase_segment(struct super_block *sb, u32 ofs);
+int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf);
+int logfs_segment_read(struct inode *inode, struct page *page, u64 ofs, u64 bix,
+		u8 level);
+int logfs_segment_write(struct inode *inode, struct page *page,
+		struct logfs_shadow *shadow);
+int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow);
+void logfs_buf_write(struct logfs_area *area, u64 ofs, void *data, size_t len);
+
+/* area handling */
+int logfs_init_areas(struct super_block *sb);
+void logfs_cleanup_areas(struct logfs_super *super);
+int logfs_open_area(struct logfs_area *area);
+void logfs_close_area(struct logfs_area *area);
+
+/* super.c */
+void logfs_crash_dump(struct super_block *sb);
+void *memchr_inv(const void *s, int c, size_t n);
+int logfs_statfs(struct dentry *dentry, struct kstatfs *stats);
+int logfs_get_sb_device(struct file_system_type *type, int flags,
+		struct mtd_inode *mtd, struct block_device *bdev,
+		const struct logfs_device_ops *devops, struct vfsmount *mnt);
+
+/* progs/fsck.c */
+#ifdef CONFIG_LOGFS_FSCK
+int logfs_fsck(struct super_block *sb);
+#else
+static inline int logfs_fsck(struct super_block *sb)
+{
+	return 0;
+}
+#endif
+
+static inline struct logfs_super *logfs_super(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+static inline struct logfs_inode *logfs_inode(struct inode *inode)
+{
+	return container_of(inode, struct logfs_inode, vfs_inode);
+}
+
+static inline void logfs_set_ro(struct super_block *sb)
+{
+	logfs_super(sb)->s_flags |= LOGFS_SB_FLAG_RO;
+}
+
+#define LOGFS_BUG(sb) do {					\
+	struct super_block *__sb = sb;				\
+	logfs_crash_dump(__sb);					\
+	logfs_super(__sb)->s_flags |= LOGFS_SB_FLAG_RO;		\
+	BUG();							\
+} while (0)
+
+#define LOGFS_BUG_ON(condition, sb) \
+	do { if (unlikely(condition)) LOGFS_BUG((sb)); } while (0)
+
+static inline __be32 logfs_crc32(void *data, size_t len, size_t skip)
+{
+	return cpu_to_be32(crc32(~0, data+skip, len-skip));
+}
+
+static inline u8 logfs_type(struct inode *inode)
+{
+	return (inode->i_mode >> 12) & 15;
+}
+
+static inline pgoff_t logfs_index(struct super_block *sb, u64 pos)
+{
+	return pos >> sb->s_blocksize_bits;
+}
+
+static inline u64 dev_ofs(struct super_block *sb, u32 segno, u32 ofs)
+{
+	return ((u64)segno << logfs_super(sb)->s_segshift) + ofs;
+}
+
+static inline int device_read(struct super_block *sb, u32 segno, u32 ofs,
+		size_t len, void *buf)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	return super->s_devops->read(sb, dev_ofs(sb, segno, ofs), len, buf);
+}
+
+static inline struct logfs_block *logfs_block(struct page *page)
+{
+	return (void *)page->private;
+}
+
+/* Compat mess */
+
+#include <linux/version.h>
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 22)
+#define DTOR /* nothing */
+#else
+#define DTOR , NULL
+#endif
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 24)
+#define	zero_user_segment(page, start, end)	\
+	zero_user_page((page), (start), (end) - (start), KM_USER0);
+static inline void iget_failed(struct inode *inode)
+{
+	make_bad_inode(inode);
+	unlock_new_inode(inode);
+	iput(inode);
+}
+#endif
+
+#endif
--- /dev/null	2008-04-02 16:29:12.813336657 +0200
+++ git/fs/logfs/compr.c	2008-04-07 11:53:20.909877890 +0200
@@ -0,0 +1,95 @@
+/*
+ * fs/logfs/compr.c	- compression routines
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2007 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+#include <linux/vmalloc.h>
+#include <linux/zlib.h>
+
+#define COMPR_LEVEL 3
+
+static DEFINE_MUTEX(compr_mutex);
+static struct z_stream_s stream;
+
+int logfs_compress(void *in, void *out, size_t inlen, size_t outlen)
+{
+	int err, ret;
+
+	ret = -EIO;
+	mutex_lock(&compr_mutex);
+	err = zlib_deflateInit(&stream, COMPR_LEVEL);
+	if (err != Z_OK)
+		goto error;
+
+	stream.next_in = in;
+	stream.avail_in = inlen;
+	stream.total_in = 0;
+	stream.next_out = out;
+	stream.avail_out = outlen;
+	stream.total_out = 0;
+
+	err = zlib_deflate(&stream, Z_FINISH);
+	if (err != Z_STREAM_END)
+		goto error;
+
+	err = zlib_deflateEnd(&stream);
+	if (err != Z_OK)
+		goto error;
+
+	if (stream.total_out >= stream.total_in)
+		goto error;
+
+	ret = stream.total_out;
+error:
+	mutex_unlock(&compr_mutex);
+	return ret;
+}
+
+int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen)
+{
+	int err, ret;
+
+	ret = -EIO;
+	mutex_lock(&compr_mutex);
+	err = zlib_inflateInit(&stream);
+	if (err != Z_OK)
+		goto error;
+
+	stream.next_in = in;
+	stream.avail_in = inlen;
+	stream.total_in = 0;
+	stream.next_out = out;
+	stream.avail_out = outlen;
+	stream.total_out = 0;
+
+	err = zlib_inflate(&stream, Z_FINISH);
+	if (err != Z_STREAM_END)
+		goto error;
+
+	err = zlib_inflateEnd(&stream);
+	if (err != Z_OK)
+		goto error;
+
+	ret = 0;
+error:
+	mutex_unlock(&compr_mutex);
+	return ret;
+}
+
+int __init logfs_compr_init(void)
+{
+	size_t size = max(zlib_deflate_workspacesize(),
+			zlib_inflate_workspacesize());
+	stream.workspace = vmalloc(size);
+	if (!stream.workspace)
+		return -ENOMEM;
+	return 0;
+}
+
+void logfs_compr_exit(void)
+{
+	vfree(stream.workspace);
+}
--- /dev/null	2008-04-02 16:29:12.813336657 +0200
+++ git/fs/logfs/dir.c	2008-04-07 12:14:21.845470394 +0200
@@ -0,0 +1,765 @@
+/*
+ * fs/logfs/dir.c	- directory-related code
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2007 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+
+
+/*
+ * Atomic dir operations
+ *
+ * Directory operations are by default not atomic.  Dentries and Inodes are
+ * created/removed/altered in seperate operations.  Therefore we need to do
+ * a small amount of journaling.
+ *
+ * Create, link, mkdir, mknod and symlink all share the same function to do
+ * the work: __logfs_create.  This function works in two atomic steps:
+ * 1. allocate inode (remember in journal)
+ * 2. allocate dentry (clear journal)
+ *
+ * As we can only get interrupted between the two, when the inode we just
+ * created is simply stored in the anchor.  On next mount, if we were
+ * interrupted, we delete the inode.  From a users point of view the
+ * operation never happened.
+ *
+ * Unlink and rmdir also share the same function: unlink.  Again, this
+ * function works in two atomic steps
+ * 1. remove dentry (remember inode in journal)
+ * 2. unlink inode (clear journal)
+ *
+ * And again, on the next mount, if we were interrupted, we delete the inode.
+ * From a users point of view the operation succeeded.
+ *
+ * Rename is the real pain to deal with, harder than all the other methods
+ * combined.  Depending on the circumstances we can run into three cases.
+ * A "target rename" where the target dentry already existed, a "local
+ * rename" where both parent directories are identical or a "cross-directory
+ * rename" in the remaining case.
+ *
+ * Local rename is atomic, as the old dentry is simply rewritten with a new
+ * name.
+ *
+ * Cross-directory rename works in two steps, similar to __logfs_create and
+ * logfs_unlink:
+ * 1. Write new dentry (remember old dentry in journal)
+ * 2. Remove old dentry (clear journal)
+ *
+ * Here we remember a dentry instead of an inode.  On next mount, if we were
+ * interrupted, we delete the dentry.  From a users point of view, the
+ * operation succeeded.
+ *
+ * Target rename works in three atomic steps:
+ * 1. Attach old inode to new dentry (remember old dentry and new inode)
+ * 2. Remove old dentry (still remember the new inode)
+ * 3. Remove victim inode
+ *
+ * Here we remember both an inode an a dentry.  If we get interrupted
+ * between steps 1 and 2, we delete both the dentry and the inode.  If
+ * we get interrupted between steps 2 and 3, we delete just the inode.
+ * In either case, the remaining objects are deleted on next mount.  From
+ * a users point of view, the operation succeeded.
+ */
+
+typedef int (*dir_callback)(struct inode *dir, struct dentry *dentry,
+		struct logfs_disk_dentry *dd, loff_t pos, void *arg);
+
+static inline void logfs_inc_count(struct inode *inode)
+{
+	inode->i_nlink++;
+	mark_inode_dirty_sync(inode);
+}
+
+static inline void logfs_dec_count(struct inode *inode)
+{
+	inode->i_nlink--;
+	mark_inode_dirty_sync(inode);
+}
+
+static int read_dir(struct inode *dir, struct logfs_disk_dentry *dd, loff_t pos)
+{
+	return logfs_inode_read(dir, dd, sizeof(*dd), pos);
+}
+
+static int write_dir(struct inode *dir, struct logfs_disk_dentry *dd,
+		loff_t pos, struct logfs_transaction *ta)
+{
+	return logfs_inode_write(dir, dd, sizeof(*dd), pos,
+			WF_SYNC|WF_LOCK, ta, NULL);
+}
+
+static int write_inode(struct inode *inode)
+{
+	return __logfs_write_inode(inode, WF_LOCK|WF_SYNC);
+}
+
+static s64 dir_seek_data(struct inode *inode, s64 pos)
+{
+	s64 new_pos = logfs_seek_data(inode, pos);
+
+	return max(pos, new_pos - 1);
+}
+
+static int __logfs_dir_walk(struct inode *dir, struct dentry *dentry,
+		dir_callback handler, void *arg,
+		struct logfs_disk_dentry *dd, loff_t *pos)
+{
+	struct qstr *name = dentry ? &dentry->d_name : NULL;
+	int ret;
+
+	for (; ; (*pos)++) {
+		ret = read_dir(dir, dd, *pos);
+		if (ret == -EOF)
+			return 0;
+		if (ret == -ENODATA) {
+			/* deleted dentry */
+			*pos = dir_seek_data(dir, *pos);
+			continue;
+		}
+		if (ret)
+			return ret;
+		BUG_ON(dd->namelen == 0);
+
+		if (name) {
+			if (name->len != be16_to_cpu(dd->namelen))
+				continue;
+			if (memcmp(name->name, dd->name, name->len))
+				continue;
+		}
+
+		return handler(dir, dentry, dd, *pos, arg);
+	}
+	return ret;
+}
+
+static int logfs_dir_walk(struct inode *dir, struct dentry *dentry,
+		dir_callback handler, void *arg)
+{
+	struct logfs_disk_dentry dd;
+	loff_t pos = 0;
+
+	return __logfs_dir_walk(dir, dentry, handler, arg, &dd, &pos);
+}
+
+static int logfs_lookup_handler(struct inode *dir, struct dentry *dentry,
+		struct logfs_disk_dentry *dd, loff_t pos, void *arg)
+{
+	struct inode *inode;
+
+	inode = dhowells_iget(dir->i_sb, be64_to_cpu(dd->ino));
+	if (!inode)
+		return -EIO;
+	return PTR_ERR(d_splice_alias(inode, dentry));
+}
+
+static struct dentry *logfs_lookup(struct inode *dir, struct dentry *dentry,
+		struct nameidata *nd)
+{
+	return ERR_PTR(logfs_dir_walk(dir, dentry, logfs_lookup_handler, NULL));
+}
+
+static int logfs_unlink_handler(struct inode *dir, struct dentry *dentry,
+		struct logfs_disk_dentry *dd, loff_t pos, void *ta)
+{
+	return logfs_delete(dir, pos, NULL, ta);
+}
+
+static int logfs_remove_inode(struct inode *inode)
+{
+	int ret;
+
+	inode->i_nlink--;
+	if (S_ISDIR(inode->i_mode))
+		inode->i_nlink--;
+	ret = write_inode(inode);
+	LOGFS_BUG_ON(ret, inode->i_sb);
+	return ret;
+}
+
+static void abort_transaction(struct inode *inode, struct logfs_transaction *ta)
+{
+	logfs_inode(inode)->li_transaction = NULL;
+	kfree(ta);
+}
+
+static int logfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct logfs_super *super = logfs_super(dir->i_sb);
+	struct inode *inode = dentry->d_inode;
+	struct logfs_transaction *ta;
+	int ret;
+
+	ta = kzalloc(sizeof(*ta), GFP_KERNEL);
+	if (!ta)
+		return -ENOMEM;
+
+	ta->state = UNLINK_1;
+	ta->ino = inode->i_ino;
+
+	if (S_ISDIR(inode->i_mode))
+		dir->i_nlink--;
+	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+	mutex_lock(&super->s_dirop_mutex);
+	ret = logfs_dir_walk(dir, dentry, logfs_unlink_handler, ta);
+	if (!ret)
+		ret = write_inode(dir);
+
+	if (ret) {
+		super->s_victim_ino = 0;
+		abort_transaction(dir, ta);
+		printk(KERN_ERR"LOGFS: unable to delete inode\n");
+		if (S_ISDIR(inode->i_mode))
+			logfs_inc_count(dir);
+		goto out;
+	}
+
+	ta->state = UNLINK_2;
+	logfs_inode(inode)->li_transaction = ta;
+	ret = logfs_remove_inode(inode);
+out:
+	mutex_unlock(&super->s_dirop_mutex);
+	return ret;
+}
+
+static int logfs_empty_handler(struct inode *dir, struct dentry *dentry,
+		struct logfs_disk_dentry *dd, loff_t pos, void *arg)
+{
+	return -ENOTEMPTY;
+}
+
+static inline int logfs_empty_dir(struct inode *dir)
+{
+	return logfs_dir_walk(dir, NULL, logfs_empty_handler, NULL) == 0;
+}
+
+static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+
+	if (!logfs_empty_dir(inode))
+		return -ENOTEMPTY;
+
+	return logfs_unlink(dir, dentry);
+}
+
+/* FIXME: readdir currently has it's own dir_walk code.  I don't see a good
+ * way to combine the two copies */
+#define IMPLICIT_NODES 2
+static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)
+{
+	struct logfs_disk_dentry dd;
+	struct inode *dir = file->f_dentry->d_inode;
+	loff_t pos = file->f_pos - IMPLICIT_NODES;
+	int err;
+
+	BUG_ON(pos < 0);
+	for (;; pos++) {
+		err = read_dir(dir, &dd, pos);
+		if (err == -EOF)
+			break;
+		if (err == -ENODATA) {
+			/* deleted dentry */
+			pos = dir_seek_data(dir, pos);
+			continue;
+		}
+		if (err)
+			return err;
+		BUG_ON(dd.namelen == 0);
+
+		if (filldir(buf, dd.name, be16_to_cpu(dd.namelen), pos,
+					be64_to_cpu(dd.ino), dd.type))
+			break;
+	}
+
+	file->f_pos = pos + IMPLICIT_NODES;
+	return 0;
+}
+
+static int logfs_readdir(struct file *file, void *buf, filldir_t filldir)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	ino_t pino = parent_ino(file->f_dentry);
+	int err;
+
+	if (file->f_pos < 0)
+		return -EINVAL;
+
+	if (file->f_pos == 0) {
+		if (filldir(buf, ".", 1, 1, inode->i_ino, DT_DIR) < 0)
+			return 0;
+		file->f_pos++;
+	}
+	if (file->f_pos == 1) {
+		if (filldir(buf, "..", 2, 2, pino, DT_DIR) < 0)
+			return 0;
+		file->f_pos++;
+	}
+
+	err = __logfs_readdir(file, buf, filldir);
+	return err;
+}
+
+static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name)
+{
+	dd->namelen = cpu_to_be16(name->len);
+	memcpy(dd->name, name->name, name->len);
+}
+
+static int logfs_write_dir(struct inode *dir, struct dentry *dentry,
+		struct inode *inode, struct logfs_transaction *ta)
+{
+	struct logfs_disk_dentry dd;
+	int err;
+
+	if (dentry->d_name.len > LOGFS_MAX_NAMELEN)
+		return -ENAMETOOLONG;
+
+	memset(&dd, 0, sizeof(dd));
+	dd.ino = cpu_to_be64(inode->i_ino);
+	dd.type = logfs_type(inode);
+	logfs_set_name(&dd, &dentry->d_name);
+
+	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+	/*
+	 * FIXME: the file size should actually get aligned when writing,
+	 * not when reading.
+	 */
+	err = write_dir(dir, &dd, logfs_seek_hole(dir, 0), ta);
+	if (err)
+		return err;
+	return 0;
+}
+
+static int __logfs_create(struct inode *dir, struct dentry *dentry,
+		struct inode *inode, const char *dest, long destlen)
+{
+	struct logfs_super *super = logfs_super(dir->i_sb);
+	struct logfs_inode *li = logfs_inode(inode);
+	struct logfs_transaction *ta;
+	int ret;
+
+	ta = kzalloc(sizeof(*ta), GFP_KERNEL);
+	if (!ta)
+		return -ENOMEM;
+
+	ta->state = CREATE_1;
+	mutex_lock(&super->s_dirop_mutex);
+	ta->ino = inode->i_ino;
+	if (S_ISDIR(inode->i_mode))
+		inode->i_nlink++;
+
+	if (dest) {
+		/* symlink */
+		ret = logfs_inode_write(inode, dest, destlen, 0,
+				WF_SYNC|WF_LOCK, ta, NULL);
+		if (!ret)
+			ret = write_inode(inode);
+	} else {
+		/* creat/mkdir/mknod */
+		super->s_victim_ino = inode->i_ino;
+		ret = write_inode(inode);
+	}
+	if (ret) {
+		super->s_victim_ino = 0;
+		abort_transaction(inode, ta);
+		li->li_flags |= LOGFS_IF_STILLBORN;
+		/* FIXME: truncate symlink */
+		inode->i_nlink--;
+		iput(inode);
+		goto out;
+	}
+
+	ta->state = CREATE_2;
+	if (S_ISDIR(inode->i_mode))
+		dir->i_nlink++;
+	ret = logfs_write_dir(dir, dentry, inode, ta);
+	/* sync directory */
+	if (!ret)
+		ret = write_inode(dir);
+
+	if (ret) {
+		super->s_victim_ino = 0;
+		abort_transaction(dir, ta);
+		if (S_ISDIR(inode->i_mode))
+			dir->i_nlink--;
+		logfs_remove_inode(inode);
+		iput(inode);
+		goto out;
+	}
+	d_instantiate(dentry, inode);
+out:
+	mutex_unlock(&super->s_dirop_mutex);
+	return ret;
+}
+
+static int logfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	struct inode *inode;
+
+	if (dir->i_nlink >= LOGFS_LINK_MAX)
+		return -EMLINK;
+
+	/*
+	 * FIXME: why do we have to fill in S_IFDIR, while the mode is
+	 * correct for mknod, creat, etc.?  Smells like the vfs *should*
+	 * do it for us but for some reason fails to do so.
+	 */
+	inode = logfs_new_inode(dir, S_IFDIR | mode);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	inode->i_op = &logfs_dir_iops;
+	inode->i_fop = &logfs_dir_fops;
+
+	return __logfs_create(dir, dentry, inode, NULL, 0);
+}
+
+static int logfs_create(struct inode *dir, struct dentry *dentry, int mode,
+		struct nameidata *nd)
+{
+	struct inode *inode;
+
+	inode = logfs_new_inode(dir, mode);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	inode->i_op = &logfs_reg_iops;
+	inode->i_fop = &logfs_reg_fops;
+	inode->i_mapping->a_ops = &logfs_reg_aops;
+
+	return __logfs_create(dir, dentry, inode, NULL, 0);
+}
+
+static int logfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
+		dev_t rdev)
+{
+	struct inode *inode;
+
+	if (dentry->d_name.len > LOGFS_MAX_NAMELEN)
+		return -ENAMETOOLONG;
+
+	inode = logfs_new_inode(dir, mode);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	init_special_inode(inode, mode, rdev);
+
+	return __logfs_create(dir, dentry, inode, NULL, 0);
+}
+
+static int logfs_symlink(struct inode *dir, struct dentry *dentry,
+		const char *target)
+{
+	struct inode *inode;
+	size_t destlen = strlen(target) + 1;
+
+	if (destlen > dir->i_sb->s_blocksize)
+		return -ENAMETOOLONG;
+
+	inode = logfs_new_inode(dir, S_IFLNK | S_IRWXUGO);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	inode->i_op = &logfs_symlink_iops;
+	inode->i_mapping->a_ops = &logfs_reg_aops;
+
+	return __logfs_create(dir, dentry, inode, target, destlen);
+}
+
+static int logfs_permission(struct inode *inode, int mask, struct nameidata *nd)
+{
+	return generic_permission(inode, mask, NULL);
+}
+
+static int logfs_link(struct dentry *old_dentry, struct inode *dir,
+		struct dentry *dentry)
+{
+	struct inode *inode = old_dentry->d_inode;
+
+	if (inode->i_nlink >= LOGFS_LINK_MAX)
+		return -EMLINK;
+
+	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+	atomic_inc(&inode->i_count);
+	logfs_inc_count(inode);
+
+	return __logfs_create(dir, dentry, inode, NULL, 0);
+}
+
+static int logfs_nop_handler(struct inode *dir, struct dentry *dentry,
+		struct logfs_disk_dentry *dd, loff_t pos, void *arg)
+{
+	return 0;
+}
+
+static inline int logfs_get_dd(struct inode *dir, struct dentry *dentry,
+		struct logfs_disk_dentry *dd, loff_t *pos)
+{
+	*pos = 0;
+	return __logfs_dir_walk(dir, dentry, logfs_nop_handler, NULL, dd, pos);
+}
+
+/* Easiest case, a local rename and the target doesn't exist.  Just change
+ * the name in the old dd.
+ */
+static int logfs_rename_local(struct inode *dir, struct dentry *old_dentry,
+		struct dentry *new_dentry)
+{
+	struct logfs_disk_dentry dd;
+	loff_t pos;
+	int err;
+
+	err = logfs_get_dd(dir, old_dentry, &dd, &pos);
+	if (err)
+		return err;
+
+	logfs_set_name(&dd, &new_dentry->d_name);
+	return write_dir(dir, &dd, pos, NULL);
+}
+
+static int logfs_delete_dd(struct inode *dir, struct logfs_disk_dentry *dd,
+		loff_t pos, struct logfs_transaction *ta)
+{
+	int err;
+
+	err = read_dir(dir, dd, pos);
+
+	/*
+	 * Getting called with pos somewhere beyond eof is either a goofup
+	 * within this file or means someone maliciously edited the
+	 * (crc-protected) journal.
+	 */
+	LOGFS_BUG_ON(err == -EOF, dir->i_sb);
+	if (err)
+		return err;
+
+	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+	if (dd->type == DT_DIR)
+		dir->i_nlink--;
+	return logfs_delete(dir, pos, NULL, ta);
+}
+
+/*
+ * Cross-directory rename, target does not exist.  Just a little nasty.
+ * Create a new dentry in the target dir, then remove the old dentry,
+ * all the while taking care to remember our operation in the journal.
+ */
+static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry,
+			      struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct logfs_super *super = logfs_super(old_dir->i_sb);
+	struct logfs_disk_dentry dd;
+	struct logfs_transaction *ta;
+	loff_t pos;
+	int err;
+
+	/* 1. locate source dd */
+	err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
+	if (err)
+		return err;
+
+	ta = kzalloc(sizeof(*ta), GFP_KERNEL);
+	if (!ta)
+		return -ENOMEM;
+
+	ta->state = CROSS_RENAME_1;
+	ta->dir = old_dir->i_ino;
+	ta->pos = pos;
+
+	/* 2. write target dd */
+	if (dd.type == DT_DIR)
+		new_dir->i_nlink++;
+	mutex_lock(&super->s_dirop_mutex);
+	err = logfs_write_dir(new_dir, new_dentry, old_dentry->d_inode, ta);
+	if (!err)
+		err = write_inode(new_dir);
+
+	if (err) {
+		super->s_rename_dir = 0;
+		super->s_rename_pos = 0;
+		abort_transaction(new_dir, ta);
+		goto out;
+	}
+
+	ta->state = CROSS_RENAME_2;
+	/* 3. remove source dd */
+	err = logfs_delete_dd(old_dir, &dd, pos, ta);
+	if (!err)
+		err = write_inode(old_dir);
+	LOGFS_BUG_ON(err, old_dir->i_sb);
+out:
+	mutex_unlock(&super->s_dirop_mutex);
+	return err;
+}
+
+static int logfs_replace_inode(struct inode *dir, struct dentry *dentry,
+		struct logfs_disk_dentry *dd, struct inode *inode,
+		struct logfs_transaction *ta)
+{
+	loff_t pos;
+	int err;
+
+	err = logfs_get_dd(dir, dentry, dd, &pos);
+	if (err)
+		return err;
+	dd->ino = cpu_to_be64(inode->i_ino);
+	dd->type = logfs_type(inode);
+
+	err = write_dir(dir, dd, pos, ta);
+	if (err)
+		return err;
+	return write_inode(dir);
+}
+
+/* Target dentry exists - the worst case.  We need to attach the source
+ * inode to the target dentry, then remove the orphaned target inode and
+ * source dentry.
+ */
+static int logfs_rename_target(struct inode *old_dir, struct dentry *old_dentry,
+			       struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct logfs_super *super = logfs_super(old_dir->i_sb);
+	struct inode *old_inode = old_dentry->d_inode;
+	struct inode *new_inode = new_dentry->d_inode;
+	int isdir = S_ISDIR(old_inode->i_mode);
+	struct logfs_disk_dentry dd;
+	struct logfs_transaction *ta;
+	loff_t pos;
+	int err;
+
+	BUG_ON(isdir != S_ISDIR(new_inode->i_mode));
+	if (isdir) {
+		if (!logfs_empty_dir(new_inode))
+			return -ENOTEMPTY;
+	}
+
+	/* 1. locate source dd */
+	err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
+	if (err)
+		return err;
+
+	ta = kzalloc(sizeof(*ta), GFP_KERNEL);
+	if (!ta)
+		return -ENOMEM;
+
+	ta->state = TARGET_RENAME_1;
+	ta->dir = old_dir->i_ino;
+	ta->pos = pos;
+	ta->ino = new_inode->i_ino;
+
+	/* 2. attach source inode to target dd */
+	mutex_lock(&super->s_dirop_mutex);
+	err = logfs_replace_inode(new_dir, new_dentry, &dd, old_inode, ta);
+	if (err) {
+		super->s_rename_dir = 0;
+		super->s_rename_pos = 0;
+		super->s_victim_ino = 0;
+		abort_transaction(new_dir, ta);
+		goto out;
+	}
+
+	/* 3. remove source dd */
+	ta->state = TARGET_RENAME_2;
+	err = logfs_delete_dd(old_dir, &dd, pos, ta);
+	if (!err)
+		err = write_inode(old_dir);
+	LOGFS_BUG_ON(err, old_dir->i_sb);
+
+	/* 4. remove target inode */
+	ta->state = TARGET_RENAME_3;
+	logfs_inode(new_inode)->li_transaction = ta;
+	err = logfs_remove_inode(new_inode);
+
+out:
+	mutex_unlock(&super->s_dirop_mutex);
+	return err;
+}
+
+static int logfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+			struct inode *new_dir, struct dentry *new_dentry)
+{
+	if (new_dentry->d_inode)
+		return logfs_rename_target(old_dir, old_dentry,
+					   new_dir, new_dentry);
+	else if (old_dir == new_dir)
+		return logfs_rename_local(old_dir, old_dentry, new_dentry);
+	return logfs_rename_cross(old_dir, old_dentry, new_dir, new_dentry);
+}
+
+/* No locking done here, as this is called before .get_sb() returns. */
+int logfs_replay_journal(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_disk_dentry dd;
+	struct inode *inode;
+	u64 ino, pos;
+	int err;
+
+	if (super->s_victim_ino) {
+		/* delete victim inode */
+		ino = super->s_victim_ino;
+		printk(KERN_INFO"LogFS: delete unmapped inode #%llx\n", ino);
+		inode = dhowells_iget(sb, ino);
+		if (!inode)
+			goto fail;
+
+		LOGFS_BUG_ON(i_size_read(inode) > 0, sb);
+		super->s_victim_ino = 0;
+		err = logfs_remove_inode(inode);
+		iput(inode);
+		if (err) {
+			super->s_victim_ino = ino;
+			goto fail;
+		}
+	}
+	if (super->s_rename_dir) {
+		/* delete old dd from rename */
+		ino = super->s_rename_dir;
+		pos = super->s_rename_pos;
+		printk(KERN_INFO"LogFS: delete unbacked dentry (%llx, %llx)\n",
+				ino, pos);
+		inode = dhowells_iget(sb, ino);
+		if (!inode)
+			goto fail;
+
+		super->s_rename_dir = 0;
+		super->s_rename_pos = 0;
+		err = logfs_delete_dd(inode, &dd, pos, NULL);
+		iput(inode);
+		if (err) {
+			super->s_rename_dir = ino;
+			super->s_rename_pos = pos;
+			goto fail;
+		}
+	}
+	return 0;
+fail:
+	LOGFS_BUG(sb);
+	return -EIO;
+}
+
+const struct inode_operations logfs_symlink_iops = {
+	.readlink	= generic_readlink,
+	.follow_link	= page_follow_link_light,
+};
+
+const struct inode_operations logfs_dir_iops = {
+	.create		= logfs_create,
+	.link		= logfs_link,
+	.lookup		= logfs_lookup,
+	.mkdir		= logfs_mkdir,
+	.mknod		= logfs_mknod,
+	.rename		= logfs_rename,
+	.rmdir		= logfs_rmdir,
+	.permission	= logfs_permission,
+	.symlink	= logfs_symlink,
+	.unlink		= logfs_unlink,
+};
+const struct file_operations logfs_dir_fops = {
+	.fsync		= logfs_fsync,
+	.ioctl		= logfs_ioctl,
+	.readdir	= logfs_readdir,
+	.read		= generic_read_dir,
+};
--- /dev/null	2008-04-02 16:29:12.813336657 +0200
+++ git/fs/logfs/file.c	2008-04-07 12:47:30.156219529 +0200
@@ -0,0 +1,236 @@
+/*
+ * fs/logfs/file.c	- prepare_write, commit_write and friends
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2007 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+#include <linux/sched.h>
+#include <linux/writeback.h>
+
+static int logfs_prepare_write(struct file *file, struct page *page,
+		unsigned start, unsigned end)
+{
+	if (PageUptodate(page))
+		return 0;
+
+	if ((start == 0) && (end == PAGE_CACHE_SIZE))
+		return 0;
+
+	return logfs_readpage_nolock(page);
+}
+
+static int logfs_commit_write(struct file *file, struct page *page,
+		unsigned start, unsigned end)
+{
+	struct inode *inode = page->mapping->host;
+	pgoff_t index = page->index;
+	int ret;
+
+	BUG_ON(PAGE_CACHE_SIZE != inode->i_sb->s_blocksize);
+	BUG_ON(page->index > I3_BLOCKS);
+
+	if (start == end)
+		return 0; /* FIXME: do we need to update inode? */
+
+	if (i_size_read(inode) < (index << PAGE_CACHE_SHIFT) + end) {
+		i_size_write(inode, (index << PAGE_CACHE_SHIFT) + end);
+		mark_inode_dirty_sync(inode);
+	}
+
+	ClearPageZero(page);
+	ret = logfs_write_buf(inode, page, NULL, WF_LOCK);
+	return ret;
+}
+
+int logfs_readpage(struct file *file, struct page *page)
+{
+	int ret;
+
+	ret = logfs_readpage_nolock(page);
+	unlock_page(page);
+	return ret;
+}
+
+/* Clear the page's dirty flag in the radix tree. */
+/* TODO: mucking with PageWriteback is silly.  Add a generic function to clear
+ * the dirty bit from the radix tree for filesystems that don't have to wait
+ * for page writeback to finish (i.e. any compressing filesystem).
+ */
+static void clear_radix_tree_dirty(struct page *page)
+{
+	BUG_ON(PagePrivate(page) || page->private);
+	set_page_writeback(page);
+	end_page_writeback(page);
+}
+
+static int __logfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	int err;
+
+	ClearPageZero(page);
+	err = logfs_write_buf(inode, page, NULL, WF_LOCK);
+	if (err)
+		set_page_dirty(page);
+	else
+		clear_radix_tree_dirty(page);
+	unlock_page(page);
+	return err;
+}
+
+static int logfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	loff_t i_size = i_size_read(inode);
+	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+	unsigned offset;
+	u64 bix;
+	u8 level;
+
+	pr_debug("logfs_writepage(%lx, %lx, %p)\n", inode->i_ino, page->index,
+			page);
+
+	logfs_unpack_index(page->index, &bix, &level);
+
+	/* Indirect blocks are never truncated */
+	if (level > 0)
+		return __logfs_writepage(page, wbc);
+
+	/*
+	 * TODO: everything below is a near-verbatim copy of nobh_writepage().
+	 * The relevant bits should be factored out after logfs is merged.
+	 */
+
+	/* Is the page fully inside i_size? */
+	if (bix < end_index)
+		return __logfs_writepage(page, wbc);
+
+	 /* Is the page fully outside i_size? (truncate in progress) */
+	offset = i_size & (PAGE_CACHE_SIZE-1);
+	if (bix > end_index || offset == 0) {
+		unlock_page(page);
+		return 0; /* don't care */
+	}
+
+	/*
+	 * The page straddles i_size.  It must be zeroed out on each and every
+	 * writepage invokation because it may be mmapped.  "A file is mapped
+	 * in multiples of the page size.  For a file that is not a multiple of
+	 * the  page size, the remaining memory is zeroed when mapped, and
+	 * writes to that region are not written out to the file."
+	 */
+	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+	return __logfs_writepage(page, wbc);
+}
+
+static void logfs_invalidatepage(struct page *page, unsigned long offset)
+{
+	u64 bix;
+	u8 level;
+
+	/* Gets called for dirty indirect blocks at umount time */
+	logfs_unpack_index(page->index, &bix, &level);
+	BUG_ON(level == 0);
+	BUG_ON(PageZero(page));
+	logfs_write_buf(page->mapping->host, page, NULL, WF_LOCK);
+}
+
+static int logfs_releasepage(struct page *page, gfp_t only_xfs_uses_this)
+{
+	return 0; /* None of these are easy to release */
+}
+
+
+int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
+		unsigned long arg)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	unsigned int oldflags, flags;
+	int err;
+
+	switch (cmd) {
+	case FS_IOC_GETFLAGS:
+		flags = li->li_flags & LOGFS_FL_USER_VISIBLE;
+		return put_user(flags, (int __user *)arg);
+	case FS_IOC_SETFLAGS:
+		if (IS_RDONLY(inode))
+			return -EROFS;
+
+		if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+			return -EACCES;
+
+		err = get_user(flags, (int __user *)arg);
+		if (err)
+			return err;
+
+		mutex_lock(&inode->i_mutex);
+		oldflags = li->li_flags;
+		flags &= LOGFS_FL_USER_MODIFIABLE;
+		flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE;
+		li->li_flags = flags;
+		mutex_unlock(&inode->i_mutex);
+
+		inode->i_ctime = CURRENT_TIME;
+		mark_inode_dirty_sync(inode);
+		return 0;
+
+	default:
+		return -ENOTTY;
+	}
+}
+
+int logfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+	struct super_block *sb = dentry->d_inode->i_sb;
+	struct logfs_super *super = logfs_super(sb);
+
+	logfs_flush_dirty(sb, 1);
+
+	super->s_devops->sync(sb);
+	return 0;
+}
+
+static int logfs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	int err = 0;
+
+	if (attr->ia_valid & ATTR_SIZE)
+		err = logfs_truncate(inode, attr->ia_size);
+	attr->ia_valid &= ~ATTR_SIZE;
+
+	if (!err)
+		err = inode_change_ok(inode, attr);
+	if (!err)
+		err = inode_setattr(inode, attr);
+	return err;
+}
+
+const struct inode_operations logfs_reg_iops = {
+	.setattr	= logfs_setattr,
+};
+
+const struct file_operations logfs_reg_fops = {
+	.aio_read	= generic_file_aio_read,
+	.aio_write	= generic_file_aio_write,
+	.fsync		= logfs_fsync,
+	.ioctl		= logfs_ioctl,
+	.llseek		= generic_file_llseek,
+	.mmap		= generic_file_readonly_mmap,
+	.open		= generic_file_open,
+	.read		= do_sync_read,
+	.write		= do_sync_write,
+};
+
+const struct address_space_operations logfs_reg_aops = {
+	.commit_write	= logfs_commit_write,
+	.invalidatepage	= logfs_invalidatepage,
+	.prepare_write	= logfs_prepare_write,
+	.readpage	= logfs_readpage,
+	.releasepage	= logfs_releasepage,
+	.set_page_dirty	= __set_page_dirty_nobuffers,
+	.writepage	= logfs_writepage,
+	.writepages	= generic_writepages,
+};
--- /dev/null	2008-04-02 16:29:12.813336657 +0200
+++ git/fs/logfs/gc.c	2008-04-07 11:53:20.913211255 +0200
@@ -0,0 +1,730 @@
+/*
+ * fs/logfs/gc.c	- garbage collection code
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2007 Joern Engel <joern@logfs.org>
+ *
+ * GC design as it should be (and isn't, as of 15.3.07):
+ * 1. Pick a good candidate for GC, constrained by the number of currently
+ *    free segments.
+ * 2. Move all valid blocks in this segment until
+ *   a) they are all gone, or
+ *   b) the number of currently free segments drops too low.
+ * 3. Mark the segment as GC-pending or so, because not all indirect blocks
+ *    have been written yet.
+ * 4. Either
+ *   a) goto 1. or
+ *   b) write dirty indirect blocks directly
+ *
+ * Sooner or later 4b) should be taken, causing a number of segments to be
+ * freed.  2b) will consume free segments until this point is reached.  Overall
+ * progress will be made, even though less than a full segment may be gained.
+ *
+ * Crucial question is when to choose 4b) over 4a.
+ */
+#include "logfs.h"
+#include <linux/sched.h>
+
+#define SCAN_RATIO 16	/* number of scanned segments per gc'd segment */
+#define LIST_SIZE 16	/* base size of candidate lists */
+#define SCAN_ROUNDS 128	/* maximum number of complete medium scans */
+#define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */
+
+static void __logfs_gc_pass(struct super_block *sb, int target);
+
+/* journal has distance -1, top-most ifile layer distance 0 */
+static u8 root_distance(struct super_block *sb, u8 level)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	switch (level) {
+	case 0: /* fall through */
+	case 1: /* fall through */
+	case 2: /* fall through */
+	case 3:
+		/* file data or indirect blocks */
+		return super->s_ifile_levels + super->s_iblock_levels - level;
+	case 6: /* fall through */
+	case 7: /* fall through */
+	case 8: /* fall through */
+	case 9:
+		/* inode file data or indirect blocks */
+		return super->s_ifile_levels - (level-6);
+	default:
+		printk(KERN_ERR"LOGFS: segment of unknown level %x found\n",
+				level);
+		WARN_ON(1);
+		return super->s_ifile_levels + super->s_iblock_levels;
+	}
+}
+
+int logfs_safe_to_write_block(struct super_block *sb, u8 level)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	return root_distance(sb, level) <= super->s_free_list.count;
+}
+
+static int segment_is_reserved(struct super_block *sb, u32 segno)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_area *area;
+	struct gc_candidate *cand;
+	void *reserved;
+	int i;
+
+	/* Some segments are reserved.  Just pretend they were all valid */
+	reserved = btree_lookup(&super->s_reserved_segments, segno);
+	if (reserved)
+		return 1;
+
+	/* Currently open segments */
+	for_each_area(i) {
+		area = super->s_area[i];
+		if (area->a_is_open && area->a_segno == segno)
+			return 1;
+	}
+
+	/* On the free list */
+	list_for_each_entry(cand, &super->s_free_list.list, list)
+		if (cand->segno == segno)
+			return 1;
+
+	return 0;
+}
+
+static void logfs_mark_segment_bad(struct super_block *sb, u32 segno)
+{
+	BUG();
+}
+
+/*
+ * Count the bytes consumed by valid objects in this segment.  Object headers
+ * are counted, the segment header is not.
+ */
+static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec,
+		u8 *level, u64 *segment_gec)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_segment_header sh;
+	struct logfs_object_header oh;
+	u64 ofs, ino, bix;
+	u32 seg_ofs, valid, size;
+	int err;
+
+	err = device_read(sb, segno, 0, sizeof(sh), &sh);
+	BUG_ON(err);
+	if (!memchr_inv(&sh, 0xff, sizeof(sh)))
+		return 0;
+
+	*level = sh.level;
+	*ec = be32_to_cpu(sh.ec);
+	*segment_gec = be64_to_cpu(sh.gec);
+	if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) {
+		logfs_mark_segment_bad(sb, segno);
+		return super->s_segsize - 1;
+	}
+
+	valid = 0;
+	for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE;
+			seg_ofs + sizeof(oh) < super->s_segsize;) {
+		err = device_read(sb, segno, seg_ofs, sizeof(oh), &oh);
+		BUG_ON(err);
+		if (!memchr_inv(&oh, 0xff, sizeof(oh)))
+			break;
+
+		if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) {
+			logfs_mark_segment_bad(sb, segno);
+			return super->s_segsize - 1;
+		}
+
+		ofs = dev_ofs(sb, segno, seg_ofs);
+		ino = be64_to_cpu(oh.ino);
+		bix = be64_to_cpu(oh.bix);
+		size = (u32)be16_to_cpu(oh.len) + sizeof(oh);
+
+		if (logfs_is_valid_block(sb, ofs, ino, bix, *level))
+			valid += size;
+		seg_ofs += size;
+	}
+	pr_debug(KERN_INFO "LOGFS valid(%x) = %x\n", segno, valid);
+	return valid;
+}
+
+/* FIXME: check for off-by-one on max_dist */
+/*
+ * GC distance N:
+ * while (valid blocks in segment) {
+ * 	rewrite one block
+ * 	if (fewer than N free segments)
+ * 		GC distance N-1
+ * }
+ * While (N > 1) {
+ * 	N--;
+ * 	flush dirty list N
+ * }
+ * Exit criterium: free segments >= N
+ *
+ * flush dirty list N:
+ * while (dirty blocks on level) {
+ * 	write block
+ * 	if (fewer then N free segments)
+ * 		GC distance N-1
+ * }
+ */
+
+/*
+ * Move all block from the gc_dirty lists to a private one, then recursively
+ * call into GC again to free some segments on higher layers.
+ */
+static void recursive_backoff(struct super_block *sb, int max_dist)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_block *block, *tmp;
+	int i;
+	LIST_HEAD(backoff_list);
+
+	for_each_area(i) {
+		if (i > max_dist)
+			break;
+		list_for_each_entry_safe(block, tmp, &super->s_gc_dirty_list[i], dirty_list) {
+			list_move_tail(&block->dirty_list, &backoff_list);
+		}
+	}
+	__logfs_gc_pass(sb, max_dist);
+	list_for_each_entry_safe(block, tmp, &backoff_list, dirty_list)
+		logfs_dirty_for_gc(sb, block);
+}
+
+/* Write back any indirect/inode blocks dirtied during the GC run. */
+static void flush_gc_dirty_list(struct super_block *sb, int dist)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_block *block;
+	struct page *page;
+	struct inode *inode;
+	long flags;
+	int err;
+
+	for (;;) {
+		if (dist < 0)
+			break;
+		if (list_empty(&super->s_gc_dirty_list[dist])) {
+			dist--;
+			continue;
+		}
+
+		block = list_entry(super->s_gc_dirty_list[dist].next,
+				struct logfs_block, dirty_list);
+		page = block->page;
+		inode = page->mapping->host;
+		if (super->s_free_list.count < dist) {
+			/* We ran out of room on this level. */
+			/* Interestingly enough, this case is possible in
+			 * theory, but never triggered in practice.
+			 */
+			recursive_backoff(sb, dist);
+			continue;
+		}
+
+		flags = WF_GC;
+		if (super->s_free_bytes < dist * LOGFS_MAX_OBJECTSIZE
+				+ super->s_gc_reserve
+				+ super->s_dirty_free_bytes)
+			flags |= WF_SYNC;
+		err = logfs_write_buf(inode, page, NULL, WF_GC);
+		BUG_ON(err);
+	}
+}
+
+void logfs_dirty_for_gc(struct super_block *sb, struct logfs_block *block)
+{
+	struct page *page;
+	struct inode *inode;
+	u64 bix;
+	u8 level, dist;
+
+	page = block->page;
+	inode = page->mapping->host;
+	logfs_unpack_index(block->page->index, &bix, &level);
+	if (inode->i_ino == LOGFS_INO_MASTER)
+		level += LOGFS_MAX_LEVELS;
+
+	dist = root_distance(sb, level);
+	list_move_tail(&block->dirty_list, &logfs_super(sb)->s_gc_dirty_list[dist]);
+}
+
+static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino,
+		u64 bix, int level, long flags)
+{
+	struct inode *inode;
+	int err, cookie;
+
+	inode = logfs_iget(sb, ino, &cookie);
+	BUG_ON(!inode);
+	err = logfs_rewrite_block(inode, bix, ofs, level, flags);
+	BUG_ON(err);
+	logfs_iput(inode, cookie);
+}
+
+static u32 logfs_gc_segment(struct super_block *sb, u32 segno, u8 dist)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_segment_header sh;
+	struct logfs_object_header oh;
+	u64 ofs, ino, bix;
+	u32 seg_ofs, cleaned = 0;
+	int level, err, len, valid;
+	long flags;
+
+	LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb);
+
+	btree_insert(&super->s_reserved_segments, segno, (void *)1);
+	err = device_read(sb, segno, 0, sizeof(sh), &sh);
+	BUG_ON(err);
+	level = sh.level;
+	if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) {
+		logfs_mark_segment_bad(sb, segno);
+		cleaned = -1;
+		goto out;
+	}
+
+	for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE;
+			seg_ofs + sizeof(oh) < super->s_segsize; ) {
+		ofs = dev_ofs(sb, segno, seg_ofs);
+		err = device_read(sb, segno, seg_ofs, sizeof(oh), &oh);
+		BUG_ON(err);
+
+		if (!memchr_inv(&oh, 0xff, sizeof(oh)))
+			break;
+
+		if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) {
+			logfs_mark_segment_bad(sb, segno);
+			cleaned = super->s_segsize - 1;
+			goto out;
+		}
+
+		ino = be64_to_cpu(oh.ino);
+		bix = be64_to_cpu(oh.bix);
+		len = sizeof(oh) + be16_to_cpu(oh.len);
+		valid = logfs_is_valid_block(sb, ofs, ino, bix, level);
+		if (valid) {
+			/* Garbage collection may consume further free segments.
+			 * If space gets too tight, abort and continue with a
+			 * higher level segment for the moment. */
+			if (super->s_free_list.count < dist)
+				recursive_backoff(sb, dist);
+
+			flags = WF_GC;
+			if (super->s_free_bytes < dist * LOGFS_MAX_OBJECTSIZE
+					+ super->s_gc_reserve
+					+ super->s_dirty_free_bytes)
+				flags |= WF_SYNC;
+			logfs_cleanse_block(sb, ofs, ino, bix, level, flags);
+			cleaned += len;
+		}
+		seg_ofs += len;
+	}
+
+	flush_gc_dirty_list(sb, dist - 1);
+out:
+	btree_remove(&super->s_reserved_segments, segno);
+	return cleaned;
+}
+
+static struct gc_candidate *add_list(struct gc_candidate *cand,
+		struct candidate_list *list)
+{
+	struct gc_candidate *cur, *removed = NULL;
+	int cont;
+
+	/* insert sorted */
+	list_for_each_entry(cur, &list->list, list) {
+		if (list->sort_by_ec)
+			cont = cur->erase_count < cand->erase_count;
+		else
+			cont = cur->valid < cand->valid;
+		if (cont)
+			continue;
+		list_add(&cand->list, &cur->list);
+		cand = NULL;
+		break;
+	}
+	/* if list is empty or candidate is worse than entire list */
+	if (cand)
+		list_add_tail(&cand->list, &list->list);
+
+	/* remove worst entry if list is full */
+	if (list->count >= list->maxcount) {
+		removed = list_entry(list->list.prev,
+				struct gc_candidate, list);
+		list_del(&removed->list);
+	} else
+		list->count++;
+
+	return removed;
+}
+
+struct gc_candidate *get_best_cand(struct candidate_list *list)
+{
+	struct gc_candidate *cand;
+
+	if (list->count == 0)
+		return NULL;
+
+	cand = list_entry(list->list.next, struct gc_candidate, list);
+	list_del(&cand->list);
+	list->count--;
+	return cand;
+}
+
+/*
+ * We try to stuff the candidate in several lists in order.  Any list may
+ * either reject it or evict another candidate when full.  Anything left after
+ * trying the last list gets freed.
+ */
+static void __add_candidate(struct super_block *sb, struct gc_candidate *cand)
+{
+	struct logfs_super *super = logfs_super(sb);
+	u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE;
+
+	/* 100% free segments */
+	if (cand->valid == 0)
+		cand = add_list(cand, &super->s_free_list);
+	/* good candidates for Garbage Collection */
+	if (cand && cand->valid < full)
+		cand = add_list(cand, &super->s_low_list[cand->dist]);
+	/* good candidates for wear leveling,
+	 * segments that were recently written get ignored */
+	if (cand && cand->gec < super->s_gec + 2*super->s_no_segs)
+		cand = add_list(cand, &super->s_ec_list);
+
+	kfree(cand);
+}
+
+static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec,
+		u8 dist, u64 segment_gec)
+{
+	struct gc_candidate *cand;
+
+	cand = kmalloc(sizeof(*cand), GFP_KERNEL);
+	if (!cand)
+		return -ENOMEM;
+
+	cand->segno = segno;
+	cand->valid = valid;
+	cand->erase_count = ec;
+	cand->dist = dist;
+	cand->gec = segment_gec;
+
+	/* FIXME: add to btree */
+	__add_candidate(sb, cand);
+	return 0;
+}
+
+int add_free_segments_from_journal(struct super_block *sb,
+		struct logfs_je_free_segments *segs, int count)
+{
+	int i, err;
+
+	for (i = 0; i < count; i++) {
+		u32 segno = be32_to_cpu(segs[i].segno);
+		u32 ec = be32_to_cpu(segs[i].ec);
+		err = add_candidate(sb, segno, 0, ec, 0, 0);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static void __del_segment(struct candidate_list *list, u32 segno)
+{
+	struct gc_candidate *cand;
+
+	list_for_each_entry(cand, &list->list, list)
+		if (cand->segno == segno) {
+			list_del(&cand->list);
+			list->count -= 1;
+			kfree(cand);
+			return;
+		}
+}
+
+static void del_segment(struct super_block *sb, u32 segno)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int i;
+
+	__del_segment(&super->s_free_list, segno);
+	for_each_area(i)
+		__del_segment(&super->s_low_list[i], segno);
+	__del_segment(&super->s_ec_list, segno);
+}
+
+static void scan_segment(struct super_block *sb, u32 segno)
+{
+	u64 segment_gec = 0;
+	u32 valid, ec = 0;
+	u8 dist, level = 0;
+
+	if (segment_is_reserved(sb, segno))
+		return;
+
+	del_segment(sb, segno);
+	valid = logfs_valid_bytes(sb, segno, &ec, &level, &segment_gec);
+	dist = root_distance(sb, level);
+	add_candidate(sb, segno, valid, ec, dist, segment_gec);
+}
+
+static struct gc_candidate *first_in_list(struct candidate_list *list)
+{
+	if (list->count == 0)
+		return NULL;
+	return list_entry(list->list.next, struct gc_candidate, list);
+}
+
+static struct gc_candidate *get_wl_candidate(struct super_block *sb,
+		int max_dist)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct gc_candidate *cand;
+	u64 cand_gec;
+
+	if (super->s_gec & 0xf)
+		return NULL;
+
+	cand = first_in_list(&super->s_ec_list);
+	if (!cand)
+		return NULL;
+	if (cand->dist > max_dist)
+		return NULL;
+
+	/* instead of comparing candidate erasecount to average erasecount,
+	 * which would involve a 64bit division we multiply candidate erasecount
+	 * with the number of segment..  In effect, the comparison is:
+	 * if (cand->erase_count + 20 > average_erasecount)
+	 */
+	cand_gec = cand->erase_count;
+	cand_gec += 20; /* FIXME: should be a superblock variable */
+	cand_gec *= super->s_no_segs;
+	if (cand_gec > super->s_gec)
+		return NULL;
+
+	list_del(&cand->list);
+	super->s_ec_list.count--;
+	return cand;
+}
+
+static struct gc_candidate *get_candidate(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int i, max_dist;
+	struct gc_candidate *cand = NULL, *this;
+
+	max_dist = min(super->s_free_list.count, LOGFS_NO_AREAS);
+
+	cand = get_wl_candidate(sb, max_dist);
+	if (cand)
+		return cand;
+
+	for (i = 0; i <= max_dist; i++) {
+		this = first_in_list(&super->s_low_list[i]);
+		if (!this)
+			continue;
+		if (!cand)
+			cand = this;
+		if (this->valid < cand->valid)
+			cand = this;
+	}
+	if (cand) {
+		list_del(&cand->list);
+		super->s_low_list[cand->dist].count--;
+	}
+	return cand;
+}
+
+static int logfs_gc_once(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct gc_candidate *cand;
+	u64 segment_gec;
+	u32 cleaned, valid, segno, ec;
+	u8 dist;
+
+	cand = get_candidate(sb);
+	if (!cand)
+		return 0;
+
+	valid = cand->valid;
+	segno = cand->segno;
+	dist = cand->dist;
+	ec = cand->erase_count;
+	segment_gec = cand->gec;
+	kfree(cand);
+	pr_debug("GC segment #%02x at %x, %x required, %x free, %x valid, %llx free, %llx reserve\n",
+			segno, segno << super->s_segshift,
+			dist, super->s_free_list.count, valid,
+			super->s_free_bytes, super->s_gc_reserve);
+	cleaned = logfs_gc_segment(sb, segno, dist);
+	pr_debug("GC segment #%02x complete\n", segno);
+	add_candidate(sb, segno, valid - cleaned, ec, dist, segment_gec);
+	return 1;
+}
+
+/* returns 1 if a wrap occurs, 0 otherwise */
+static int logfs_scan_some(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	u32 segno;
+	int i, ret = 0;
+
+	segno = super->s_sweeper;
+	for (i = SCAN_RATIO; i > 0; i--) {
+		segno++;
+		if (segno >= super->s_no_segs) {
+			segno = 0;
+			ret = 1;
+		}
+
+		scan_segment(sb, segno);
+	}
+	super->s_sweeper = segno;
+	return ret;
+}
+
+/*
+ * In principle, this function should loop forever, looking for GC candidates
+ * and moving data.  LogFS is designed in such a way that this loop is
+ * guaranteed to terminate.
+ *
+ * Limiting the loop to some iterations serves purely to catch cases when
+ * these guarantees have failed.  An actual endless loop is an obvious bug
+ * and should be reported as such.
+ */
+static void __logfs_gc_pass(struct super_block *sb, int target)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int round, progress, last_progress = 0;
+
+	pr_debug("__logfs_gc_pass(%x)\n", target);
+	for (round = 0; round < SCAN_ROUNDS; ) {
+		if (super->s_free_list.count >= target)
+			return;
+		round += logfs_scan_some(sb);
+		if (super->s_free_list.count >= target)
+			return;
+		progress = logfs_gc_once(sb);
+		if (progress)
+			last_progress = round;
+		else if (round - last_progress > 2)
+			break;
+	}
+	logfs_fsck(sb);
+	LOGFS_BUG(sb);
+}
+
+void logfs_gc_pass(struct super_block *sb)
+{
+	__logfs_gc_pass(sb, logfs_super(sb)->s_total_levels);
+}
+
+static int check_area(struct super_block *sb, int i)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_area *area = super->s_area[i];
+	struct logfs_object_header h;
+	u32 segno = area->a_segno;
+	u32 ofs = area->a_used_bytes;
+	__be32 crc;
+	int err;
+
+	if (!area->a_is_open)
+		return 0;
+
+	for (ofs = area->a_used_bytes;
+	    ofs <= super->s_segsize - sizeof(h);
+	    ofs += (u32)be16_to_cpu(h.len) + sizeof(h)) {
+		err = device_read(sb, segno, ofs, sizeof(h), &h);
+		if (err)
+			return err;
+
+		if (!memchr_inv(&h, 0xff, sizeof(h)))
+			break;
+
+		crc = logfs_crc32(&h, sizeof(h) - 4, 4);
+		if (crc != h.crc) {
+			printk(KERN_INFO "interrupted header at %llx\n",
+					dev_ofs(sb, segno, ofs));
+			return 0;
+		}
+	}
+	if (ofs > super->s_segsize - LOGFS_MAX_OBJECTSIZE) {
+		printk(KERN_INFO "%x bytes unaccounted data found at %llx - closing it\n",
+				ofs - area->a_used_bytes,
+				dev_ofs(sb, segno, ofs));
+		area->a_segno = 0;
+		area->a_is_open = 0;
+	} else if (ofs != area->a_used_bytes) {
+		printk(KERN_INFO "%x bytes unaccounted data found at %llx\n",
+				ofs - area->a_used_bytes,
+				dev_ofs(sb, segno, ofs));
+		area->a_used_bytes = ofs;
+	}
+	return 0;
+}
+
+int logfs_check_areas(struct super_block *sb)
+{
+	int i, err;
+
+	for_each_area(i) {
+		err = check_area(sb, i);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static void logfs_init_candlist(struct candidate_list *list, int maxcount,
+		int sort_by_ec)
+{
+	list->count = 0;
+	list->maxcount = maxcount;
+	list->sort_by_ec = sort_by_ec;
+	INIT_LIST_HEAD(&list->list);
+}
+
+int logfs_init_gc(struct logfs_super *super)
+{
+	int i;
+
+	logfs_init_candlist(&super->s_free_list, 4*LIST_SIZE, 1);
+	for_each_area(i)
+		logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0);
+	logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1);
+	return 0;
+}
+
+static void logfs_cleanup_list(struct candidate_list *list)
+{
+	struct gc_candidate *cand, *next;
+
+	list_for_each_entry_safe(cand, next, &list->list, list) {
+		list_del(&cand->list);
+		kfree(cand);
+	}
+}
+
+void logfs_cleanup_gc(struct logfs_super *super)
+{
+	int i;
+
+	if (!super->s_free_list.list.next)
+		return;
+
+	logfs_cleanup_list(&super->s_free_list);
+	for_each_area(i)
+		logfs_cleanup_list(&super->s_low_list[i]);
+	logfs_cleanup_list(&super->s_ec_list);
+}
--- /dev/null	2008-04-02 16:29:12.813336657 +0200
+++ git/fs/logfs/inode.c	2008-04-07 12:13:48.193106183 +0200
@@ -0,0 +1,589 @@
+/*
+ * fs/logfs/inode.c	- inode handling code
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2007 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+
+/*
+ * We need to include <linux/writeback.h> - a header we normally shouldn't
+ * be mucking with.  If life only were that easy!
+ *
+ * As it is, LogFS' requirement to read inodes for garbage collection keeps
+ * breaking Linux assumptions.  In particular, an inode can get be in
+ * I_DELETING state when being written out.  Logfs then notices that it
+ * needs some space, does a little GC and tries to read just the inode in
+ * I_DELETING state.  So the code is waiting for itself to finish, lovely.
+ *
+ * Our strategy to solve this problem is to overload the generic drop_inode()
+ * and destroy_inode() methods.  Writeback happens between those two calls,
+ * so add the inode to a list in drop_inode() and remove it again in
+ * destroy_inode().  Any iget() in the GC path is replaced with logfs_iget(),
+ * which will check the list and only call the blocking iget() if the inode
+ * in question cannot deadlock.
+ *
+ * And of course this would be racy if we didn't take inode_lock in a few
+ * key moments.
+ */
+static struct kmem_cache *logfs_inode_cache;
+
+static int __logfs_read_inode(struct inode *inode);
+
+static struct inode *__logfs_iget(struct super_block *sb, unsigned long ino)
+{
+	struct inode *inode = iget_locked(sb, ino);
+	int err;
+
+	if (inode && (inode->i_state & I_NEW)) {
+		err = __logfs_read_inode(inode);
+		unlock_new_inode(inode);
+		if (err) {
+			/* set i_nlink to 0 to prevent caching */
+			inode->i_nlink = 0;
+			logfs_inode(inode)->li_flags |= LOGFS_IF_ZOMBIE;
+			iput(inode);
+			return NULL;
+		}
+		BUG_ON(inode->i_nlink == 0);
+	}
+
+	return inode;
+}
+
+/*
+ * is_cached is set to 1 if we hand out a cached inode, 0 otherwise.
+ * this allows logfs_iput to do the right thing later
+ */
+struct inode *logfs_iget(struct super_block *sb, ino_t ino, int *is_cached)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_inode *li;
+
+	if (ino == LOGFS_INO_MASTER)
+		return super->s_master_inode;
+
+	spin_lock(&inode_lock);
+	list_for_each_entry(li, &super->s_freeing_list, li_freeing_list)
+		if (li->vfs_inode.i_ino == ino) {
+			spin_unlock(&inode_lock);
+			*is_cached = 1;
+			return &li->vfs_inode;
+		}
+	spin_unlock(&inode_lock);
+
+	*is_cached = 0;
+	return __logfs_iget(sb, ino);
+}
+
+void logfs_iput(struct inode *inode, int is_cached)
+{
+	if (inode->i_ino == LOGFS_INO_MASTER)
+		return;
+
+	if (is_cached)
+		return;
+
+	iput(inode);
+}
+
+static void logfs_init_inode(struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	int i;
+
+	li->li_flags	= LOGFS_IF_VALID;
+	li->li_used_bytes = 0;
+	li->li_transaction = NULL;
+	inode->i_uid	= 0;
+	inode->i_gid	= 0;
+	inode->i_size	= 0;
+	inode->i_blocks	= 0;
+	inode->i_ctime	= CURRENT_TIME;
+	inode->i_mtime	= CURRENT_TIME;
+	inode->i_nlink	= 1;
+	INIT_LIST_HEAD(&li->li_freeing_list);
+	btree_init(&li->li_shadow_tree.new);
+	btree_init(&li->li_shadow_tree.old);
+
+	for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
+		li->li_data[i] = 0;
+
+	return;
+}
+
+static struct inode *logfs_alloc_inode(struct super_block *sb)
+{
+	struct logfs_inode *li;
+
+	li = kmem_cache_alloc(logfs_inode_cache, GFP_KERNEL);
+	if (!li)
+		return NULL;
+	logfs_init_inode(&li->vfs_inode);
+	return &li->vfs_inode;
+}
+
+/*
+ * In logfs inodes are written to an inode file.  The inode file, like any
+ * other file, is managed with a inode.  The inode file's inode, aka master
+ * inode, requires special handling in several respects.  First, it cannot be
+ * written to the inode file, so it is stored in the journal instead.
+ *
+ * Secondly, this inode cannot be written back and destroyed before all other
+ * inodes have been written.  The ordering is important.  Linux' VFS is happily
+ * unaware of the ordering constraint and would ordinarily destroy the master
+ * inode at umount time while other inodes are still in use and dirty.  Not
+ * good.
+ *
+ * So logfs makes sure the master inode is not written until all other inodes
+ * have been destroyed.  Sadly, this method has another side-effect.  The VFS
+ * will notice one remaining inode and print a frightening warning message.
+ * Worse, it is impossible to judge whether such a warning was caused by the
+ * master inode or any other inodes have leaked as well.
+ *
+ * Our attempt of solving this is with logfs_new_meta_inode() below.  Its
+ * purpose is to create a new inode that will not trigger the warning if such
+ * an inode is still in use.  An ugly hack, no doubt.  Suggections for
+ * improvement are welcome.
+ */
+struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino)
+{
+	struct inode *inode;
+
+	inode = logfs_alloc_inode(sb);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	inode->i_mode = 0;
+	inode->i_ino = ino;
+	inode->i_sb = sb;
+
+	/* This is a blatant copy of alloc_inode code.  We'd need alloc_inode
+	 * to be nonstatic, alas. */
+	{
+		struct address_space * const mapping = &inode->i_data;
+
+		mapping->a_ops = &logfs_reg_aops;
+		mapping->host = inode;
+		mapping->flags = 0;
+		mapping_set_gfp_mask(mapping, GFP_HIGHUSER);
+		mapping->assoc_mapping = NULL;
+		mapping->backing_dev_info = &default_backing_dev_info;
+		inode->i_mapping = mapping;
+	}
+
+	return inode;
+}
+
+/*
+ * Time is stored as nanoseconds since the epoch.
+ */
+static struct timespec be64_to_timespec(__be64 betime)
+{
+	return ns_to_timespec(be64_to_cpu(betime));
+}
+
+static __be64 timespec_to_be64(struct timespec tsp)
+{
+	return cpu_to_be64((u64)tsp.tv_sec * NSEC_PER_SEC + tsp.tv_nsec);
+}
+
+static void logfs_disk_to_inode(struct logfs_disk_inode *di, struct inode*inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	int i;
+
+	inode->i_mode	= be16_to_cpu(di->di_mode);
+	li->li_height	= di->di_height;
+	li->li_flags	= be32_to_cpu(di->di_flags);
+	inode->i_uid	= be32_to_cpu(di->di_uid);
+	inode->i_gid	= be32_to_cpu(di->di_gid);
+	inode->i_size	= be64_to_cpu(di->di_size);
+	logfs_set_blocks(inode, be64_to_cpu(di->di_used_bytes));
+	inode->i_ctime	= be64_to_timespec(di->di_ctime);
+	inode->i_mtime	= be64_to_timespec(di->di_mtime);
+	inode->i_nlink	= be32_to_cpu(di->di_refcount);
+	inode->i_generation = be32_to_cpu(di->di_generation);
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFCHR: /* fall through */
+	case S_IFBLK: /* fall through */
+	case S_IFIFO:
+		inode->i_rdev = be64_to_cpu(di->di_data[0]);
+		break;
+	default:
+		/* li_flags must be read before this */
+		if (li->li_flags & LOGFS_IF_EMBEDDED)
+			memcpy(li->li_data, di->di_data,
+					LOGFS_EMBEDDED_FIELDS * sizeof(u64));
+		else
+			for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
+				li->li_data[i] = be64_to_cpu(di->di_data[i]);
+		break;
+	}
+}
+
+static void logfs_inode_to_disk(struct inode *inode, struct logfs_disk_inode*di)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	int i;
+
+	di->di_mode	= cpu_to_be16(inode->i_mode);
+	di->di_height	= li->li_height;
+	di->di_pad	= 0;
+	di->di_flags	= cpu_to_be32(li->li_flags);
+	di->di_uid	= cpu_to_be32(inode->i_uid);
+	di->di_gid	= cpu_to_be32(inode->i_gid);
+	di->di_size	= cpu_to_be64(i_size_read(inode));
+	di->di_used_bytes = cpu_to_be64(li->li_used_bytes);
+	di->di_ctime	= timespec_to_be64(inode->i_ctime);
+	di->di_mtime	= timespec_to_be64(inode->i_mtime);
+	di->di_refcount	= cpu_to_be32(inode->i_nlink);
+	di->di_generation = cpu_to_be32(inode->i_generation);
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFCHR: /* fall through */
+	case S_IFBLK: /* fall through */
+	case S_IFIFO:
+		di->di_data[0] = cpu_to_be64(inode->i_rdev);
+		break;
+	default:
+		if (li->li_flags & LOGFS_IF_EMBEDDED)
+			memcpy(di->di_data, li->li_data,
+					LOGFS_EMBEDDED_FIELDS * sizeof(u64));
+		else
+			for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
+				di->di_data[i] = cpu_to_be64(li->li_data[i]);
+		break;
+	}
+}
+
+#define VALID_MASK (LOGFS_IF_VALID | LOGFS_IF_INVALID)
+static int logfs_read_disk_inode(struct logfs_disk_inode *di,
+		struct inode *inode)
+{
+	struct logfs_super *super = logfs_super(inode->i_sb);
+	ino_t ino = inode->i_ino;
+	int ret;
+
+	BUG_ON(!super->s_master_inode);
+	ret = logfs_inode_read(super->s_master_inode, di, sizeof(*di), ino);
+	if (ret)
+		return ret;
+
+	if ((be32_to_cpu(di->di_flags) & VALID_MASK) != LOGFS_IF_VALID) {
+		/*
+		 * We read wrong data, someone scribbled over it or we
+		 * have a bug.  Worth mentioning in the logs.
+		 */
+		printk(KERN_WARNING"LOGFS: read corrupt inode #%lx\n", ino);
+		WARN_ON(1);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static void logfs_inode_setops(struct inode *inode)
+{
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFDIR:
+		inode->i_op = &logfs_dir_iops;
+		inode->i_fop = &logfs_dir_fops;
+		inode->i_mapping->a_ops = &logfs_reg_aops;
+		break;
+	case S_IFREG:
+		inode->i_op = &logfs_reg_iops;
+		inode->i_fop = &logfs_reg_fops;
+		inode->i_mapping->a_ops = &logfs_reg_aops;
+		break;
+	case S_IFLNK:
+		inode->i_op = &logfs_symlink_iops;
+		inode->i_mapping->a_ops = &logfs_reg_aops;
+		break;
+	default:
+		;
+	}
+}
+
+static int __logfs_read_inode(struct inode *inode)
+{
+	struct logfs_disk_inode di;
+	int ret;
+
+	ret = logfs_read_disk_inode(&di, inode);
+	if (ret)
+		return ret;
+	logfs_disk_to_inode(&di, inode);
+
+	logfs_inode_setops(inode);
+	return 0;
+}
+
+struct inode *dhowells_iget(struct super_block *sb, ino_t ino)
+{
+	struct inode *inode;
+	int ret;
+
+	BUG_ON(ino == LOGFS_INO_MASTER);
+
+	inode = iget_locked(sb, ino);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	ret = __logfs_read_inode(inode);
+	if (ret) {
+		iget_failed(inode);
+		return ERR_PTR(ret);
+	}
+	unlock_new_inode(inode);
+	return inode;
+}
+
+/* This function should move to fs/fs-writeback.c or similar. */
+static void clear_inode_dirty_sync(struct inode *inode)
+{
+	spin_lock(&inode_lock);
+	inode->i_state &= ~I_DIRTY_SYNC;
+	spin_unlock(&inode_lock);
+}
+
+static void __logfs_set_blocks(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct logfs_inode *li = logfs_inode(inode);
+
+	inode->i_blocks = ULONG_MAX;
+	if (li->li_used_bytes >> sb->s_blocksize_bits < ULONG_MAX)
+		inode->i_blocks = ALIGN(li->li_used_bytes, 512) >> 9;
+}
+
+void logfs_set_blocks(struct inode *inode, u64 bytes)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	li->li_used_bytes = bytes;
+	__logfs_set_blocks(inode);
+}
+
+static void account_shadow(void *_shadow, long _inode, u64 ignore)
+{
+	struct logfs_shadow *shadow = _shadow;
+	struct inode *inode = (void *)_inode;
+	struct logfs_inode *li = logfs_inode(inode);
+
+	li->li_used_bytes += shadow->new_len - shadow->old_len;
+}
+
+static int logfs_write_disk_inode(struct logfs_disk_inode *di,
+		struct inode *inode, long flags)
+{
+	struct logfs_super *super = logfs_super(inode->i_sb);
+	struct logfs_inode *li = logfs_inode(inode);
+	struct logfs_transaction *ta;
+	int ret;
+
+	/* FIXME: should we take inode->i_mutex? */
+	ta = logfs_inode(inode)->li_transaction;
+	logfs_inode(inode)->li_transaction = NULL;
+
+	btree_visitor(&li->li_shadow_tree.new, (long)inode, account_shadow);
+	btree_visitor(&li->li_shadow_tree.old, (long)inode, account_shadow);
+	__logfs_set_blocks(inode);
+	BUG_ON((s64)li->li_used_bytes < 0);
+
+	logfs_inode_to_disk(inode, di);
+	clear_inode_dirty_sync(inode);
+	ret = logfs_inode_write(super->s_master_inode, di, sizeof(*di),
+			inode->i_ino, flags, ta, &li->li_shadow_tree);
+	return ret;
+}
+
+int __logfs_write_inode(struct inode *inode, long flags)
+{
+	struct logfs_disk_inode di;
+
+	BUG_ON(inode->i_ino == LOGFS_INO_MASTER);
+	return logfs_write_disk_inode(&di, inode, flags);
+}
+
+static int logfs_write_inode(struct inode *inode, int do_sync)
+{
+	int ret;
+	long flags = WF_LOCK;
+
+	if (do_sync)
+		flags |= WF_SYNC;
+
+	/* Can only happen if creat() failed.  Safe to skip. */
+	if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN)
+		return 0;
+
+	ret = __logfs_write_inode(inode, flags);
+	LOGFS_BUG_ON(ret, inode->i_sb);
+	return ret;
+}
+
+static void logfs_truncate_inode(struct inode *inode)
+{
+	logfs_truncate(inode, 0);
+	truncate_inode_pages(&inode->i_data, 0);
+}
+
+/*
+ * ZOMBIE inodes have already been deleted before and should remain dead,
+ * if it weren't for valid checking.  No need to kill them again here.
+ */
+static void logfs_delete_inode(struct inode *inode)
+{
+	struct logfs_super *super = logfs_super(inode->i_sb);
+	struct logfs_inode *li = logfs_inode(inode);
+
+	if (!(li->li_flags & LOGFS_IF_ZOMBIE)) {
+		li->li_flags |= LOGFS_IF_ZOMBIE;
+		if (i_size_read(inode) > 0)
+			logfs_truncate_inode(inode);
+		logfs_delete(super->s_master_inode, inode->i_ino,
+				&li->li_shadow_tree, NULL);
+	}
+	clear_inode(inode);
+}
+
+void __logfs_destroy_inode(struct inode *inode)
+{
+	if (likely(inode))
+		kmem_cache_free(logfs_inode_cache, logfs_inode(inode));
+}
+
+/*
+ * We need to remember which inodes are currently being dropped.  They
+ * would deadlock the cleaner, if it were to iget() them.  So
+ * logfs_drop_inode() adds them to super->s_freeing_list,
+ * logfs_destroy_inode() removes them again and logfs_iget() checks the
+ * list.
+ */
+static void logfs_destroy_inode(struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	BUG_ON(list_empty(&li->li_freeing_list));
+	spin_lock(&inode_lock);
+	list_del(&li->li_freeing_list);
+	spin_unlock(&inode_lock);
+	kmem_cache_free(logfs_inode_cache, li);
+}
+
+/* called with inode_lock held */
+static void logfs_drop_inode(struct inode *inode)
+{
+	struct logfs_super *super = logfs_super(inode->i_sb);
+	struct logfs_inode *li = logfs_inode(inode);
+
+	list_move(&li->li_freeing_list, &super->s_freeing_list);
+	generic_drop_inode(inode);
+}
+
+static u64 logfs_get_ino(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	u64 ino;
+
+	/*
+	 * FIXME: ino allocation should work in two modes:
+	 * o nonsparse - ifile is mostly occupied, just append
+	 * o sparse - ifile has lots of holes, fill them up
+	 *
+	 * SEEK_HOLE would obviously help a lot here.
+	 */
+	spin_lock(&super->s_ino_lock);
+	ino = super->s_last_ino;
+	super->s_last_ino++;
+	spin_unlock(&super->s_ino_lock);
+	return ino;
+}
+
+struct inode *logfs_new_inode(struct inode *dir, int mode)
+{
+	struct super_block *sb = dir->i_sb;
+	struct inode *inode;
+
+	inode = new_inode(sb);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	logfs_init_inode(inode);
+
+	/* inherit parent flags */
+	logfs_inode(inode)->li_flags |=
+		logfs_inode(dir)->li_flags & LOGFS_FL_INHERITED;
+
+	inode->i_mode = mode;
+	inode->i_ino = logfs_get_ino(sb);
+
+	inode->i_uid = current->fsuid;
+	inode->i_gid = current->fsgid;
+	if (dir->i_mode & S_ISGID) {
+		inode->i_gid = dir->i_gid;
+		if (S_ISDIR(mode))
+			inode->i_mode |= S_ISGID;
+	}
+
+	logfs_inode_setops(inode);
+	insert_inode_hash(inode);
+
+	return inode;
+}
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 23)
+static void logfs_init_once(struct kmem_cache *cachep, void *_li)
+#else
+static void logfs_init_once(void *_li, struct kmem_cache *cachep,
+		unsigned long flags)
+#endif
+{
+	struct logfs_inode *li = _li;
+	int i;
+
+	li->li_flags = 0;
+	li->li_used_bytes = 0;
+	for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
+		li->li_data[i] = 0;
+	inode_init_once(&li->vfs_inode);
+}
+
+static int logfs_sync_fs(struct super_block *sb, int wait)
+{
+	logfs_flush_dirty(sb, 1);
+	logfs_super(sb)->s_devops->sync(sb);
+	return 0;
+}
+
+const struct super_operations logfs_super_operations = {
+	.alloc_inode	= logfs_alloc_inode,
+	.delete_inode	= logfs_delete_inode,
+	.destroy_inode	= logfs_destroy_inode,
+	.drop_inode	= logfs_drop_inode,
+	.write_inode	= logfs_write_inode,
+	.statfs		= logfs_statfs,
+	.sync_fs	= logfs_sync_fs,
+};
+
+int logfs_init_inode_cache(void)
+{
+	logfs_inode_cache = kmem_cache_create("logfs_inode_cache",
+			sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT,
+			logfs_init_once DTOR);
+	if (!logfs_inode_cache)
+		return -ENOMEM;
+	return 0;
+}
+
+void logfs_destroy_inode_cache(void)
+{
+	kmem_cache_destroy(logfs_inode_cache);
+}
--- /dev/null	2008-04-02 16:29:12.813336657 +0200
+++ git/fs/logfs/journal.c	2008-04-07 11:53:20.913211255 +0200
@@ -0,0 +1,805 @@
+/*
+ * fs/logfs/journal.c	- journal handling code
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2007 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+
+static void clear_retired(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int i;
+
+	for (i = 0; i < JE_LAST; i++)
+		super->s_retired[i].used = 0;
+	super->s_first.used = 0;
+}
+
+static void clear_speculatives(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int i;
+
+	for (i = 0; i < JE_LAST; i++)
+		super->s_speculative[i].used = 0;
+}
+
+static void retire_speculatives(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_journal_entry *spec, *retired;
+	int i;
+
+	for (i = 0; i < JE_LAST; i++) {
+		spec = super->s_speculative + i;
+		retired = super->s_retired + i;
+		if (!spec->used)
+			continue;
+		if (retired->used && (spec->version <= retired->version))
+			continue;
+		retired->used = 1;
+		retired->version = spec->version;
+		retired->offset = spec->offset;
+		retired->len = spec->len;
+		retired->datalen = spec->datalen;
+	}
+	clear_speculatives(sb);
+}
+
+/*
+ * Journal entries are versioned and highest version always wins.  To save
+ * some bytes, the version is only be16 instead of be64.  This means versions
+ * can and regularly will wrap.  However, all versions should be in a strict
+ * sequence and the total number of entries significantly lower than 2^16.
+ *
+ * So we read the first entry, store its version and substract that from
+ * any version read to normalize them.  Normalized versions should all be
+ * fairly close to zero and we can again easily judge which is the highest
+ * number.
+ */
+static int scan_segment(struct super_block *sb, u32 segno)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_area *area = super->s_journal_area;
+	struct logfs_journal_header *h = super->s_compressed_je;
+	struct logfs_journal_entry *spec, *retired;
+	u64 ofs, seg_ofs = dev_ofs(sb, segno, 0);
+	u32 h_ofs;
+	s16 len, datalen, type, version;
+	int err;
+
+	for (h_ofs = 0; h_ofs < super->s_segsize; h_ofs += sizeof(*h)) {
+		ofs = seg_ofs + h_ofs;
+		err = super->s_devops->read(sb, ofs, sizeof(*h), h);
+		if (err)
+			return err;
+		/* stop scanning if all 0xff */
+		if (0 && !memchr_inv(h, 0xff, sizeof(*h))) /* FIXME */
+			break;
+
+		len = be16_to_cpu(h->h_len);
+		datalen = be16_to_cpu(h->h_datalen);
+		type = be16_to_cpu(h->h_type);
+		version = be16_to_cpu(h->h_version);
+
+		if ((len < 16) || (len > sb->s_blocksize))
+			continue;
+		if ((type < JE_FIRST) || (type > JE_LAST))
+			continue;
+
+		err = super->s_devops->read(sb, ofs, len + sizeof(*h), h);
+		if (err)
+			return err;
+
+		if (h->h_crc != logfs_crc32(h, len, 4))
+			continue;
+
+		if (!super->s_first.used) {
+			super->s_first.used = 1;
+			super->s_first.version = version;
+		}
+		version -= super->s_first.version;
+
+		if (abs(version) > 1<<14)
+			return -EIO;
+
+		h_ofs += len - sizeof(*h);
+		spec = &super->s_speculative[type];
+		retired = &super->s_retired[type];
+		switch (type) {
+		default:
+			if (spec->used && (version <= spec->version))
+				break;
+			/* store speculative entry */
+			spec->used = 1;
+			spec->version = version;
+			spec->offset = ofs;
+			spec->len = len;
+			spec->datalen = datalen;
+			break;
+		case JE_COMMIT:
+			if (retired->used && (version <= retired->version))
+				break;
+			/* retire speculative entries */
+			retired->used = 1;
+			retired->version = version;
+			retired->offset = ofs;
+			retired->len = len;
+			retired->datalen = datalen;
+			retire_speculatives(sb);
+			/* and set up journal area */
+			area->a_segno = segno;
+			/*
+			 * On every mount we switch to a new segment instead
+			 * of writing further in the current one.  While safe
+			 * this method is quite wasteful and may get changed
+			 * sooner or later.
+			 */
+			area->a_is_open = 0;
+			break;
+		}
+	}
+	return 0;
+}
+
+static int logfs_scan_journal(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	u32 segno;
+	int i, err;
+
+	clear_speculatives(sb);
+	clear_retired(sb);
+	journal_for_each(i) {
+		segno = super->s_journal_seg[i];
+		if (!segno)
+			continue;
+		err = scan_segment(sb, segno);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static void read_commit(struct logfs_super *super,
+		struct logfs_journal_header *h)
+{
+	super->s_last_version = be16_to_cpu(h->h_version);
+}
+
+static void logfs_calc_free(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	u64 no_segs = super->s_no_segs;
+	s64 free;
+	int i;
+
+	/* superblock segment */
+	no_segs -= 1;
+	/* bad blocks */
+	no_segs -= super->s_bad_segments;
+	/* journal */
+	journal_for_each(i)
+		if (super->s_journal_seg[i])
+			no_segs--;
+
+	free = no_segs * (super->s_segsize - LOGFS_SEGMENT_RESERVE);
+	free -= super->s_used_bytes;
+
+#if 0
+	/* reserve some extra to speed up GC for full filesystems */
+	free -= 10 * (super->s_size >> 10);
+	/* in case this reserve exceeds currently free space */
+	free = max(free, 0LL);
+#endif
+	super->s_free_bytes = free;
+}
+
+static void reserve_sb_and_journal(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct btree_head *head = &super->s_reserved_segments;
+	int i, err;
+
+	err = btree_insert(head, 0, (void *)1);
+	BUG_ON(err);
+
+	journal_for_each(i) {
+		if (!super->s_journal_seg[i])
+			continue;
+		err = btree_insert(head, super->s_journal_seg[i], (void *)1);
+		BUG_ON(err);
+	}
+}
+
+static void read_dynsb(struct super_block *sb,
+		struct logfs_je_dynsb *dynsb)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	super->s_gec		= be64_to_cpu(dynsb->ds_gec);
+	super->s_sweeper	= be64_to_cpu(dynsb->ds_sweeper);
+	super->s_victim_ino	= be64_to_cpu(dynsb->ds_victim_ino);
+	super->s_rename_dir	= be64_to_cpu(dynsb->ds_rename_dir);
+	super->s_rename_pos	= be64_to_cpu(dynsb->ds_rename_pos);
+	super->s_used_bytes	= be64_to_cpu(dynsb->ds_used_bytes);
+}
+
+static void read_anchor(struct super_block *sb,
+		struct logfs_je_anchor *da)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct inode *inode = super->s_master_inode;
+	struct logfs_inode *li = logfs_inode(inode);
+	int i;
+
+	super->s_last_ino = be64_to_cpu(da->da_last_ino);
+	li->li_flags	= LOGFS_IF_VALID;
+	i_size_write(inode, be64_to_cpu(da->da_size));
+	li->li_used_bytes = be64_to_cpu(da->da_used_bytes);
+
+	for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
+		li->li_data[i] = be64_to_cpu(da->da_data[i]);
+}
+
+static void read_erasecount(struct super_block *sb,
+		struct logfs_je_journal_ec *ec)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int i;
+
+	journal_for_each(i)
+		super->s_journal_ec[i] = be32_to_cpu(ec->ec[i]);
+}
+
+static void read_badsegments(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct btree_head *head = &super->s_reserved_segments;
+	__be32 *seg, *bad = super->s_bb_array;
+	int err;
+
+	super->s_bad_segments = 0;
+	for (seg = bad; seg - bad < sb->s_blocksize >> 2; seg++) {
+		if (*seg == 0)
+			continue;
+		err = btree_insert(head, be32_to_cpu(*seg), (void *)1);
+		BUG_ON(err);
+		super->s_bad_segments++;
+	}
+}
+
+static void read_areas(struct super_block *sb, struct logfs_je_areas *a)
+{
+	struct logfs_area *area;
+	int i;
+
+	for_each_area(i) {
+		area = logfs_super(sb)->s_area[i];
+		area->a_used_bytes = be32_to_cpu(a->used_bytes[i]);
+		area->a_segno = be32_to_cpu(a->segno[i]);
+		if (area->a_segno)
+			area->a_is_open = 1;
+	}
+}
+
+static void read_free_segments(struct super_block *sb,
+		struct logfs_je_free_segments *f, u16 len)
+{
+	u32 count = len / sizeof(struct logfs_je_free_segments);
+
+	add_free_segments_from_journal(sb, f, count);
+}
+
+static void *unpack(void *from, void *to)
+{
+	struct logfs_journal_header *h = from;
+	void *data = from + sizeof(struct logfs_journal_header);
+	int err;
+	size_t inlen, outlen;
+
+	if (h->h_compr == COMPR_NONE)
+		return data;
+
+	inlen = be16_to_cpu(h->h_len) - sizeof(*h);
+	outlen = be16_to_cpu(h->h_datalen);
+	err = logfs_uncompress(data, to, inlen, outlen);
+	BUG_ON(err);
+	return to;
+}
+
+/*
+ * Journal entries come in groups of 16.  The first group contains unique
+ * entries, the second group contains the write buffers for all levels.
+ * As of now, there are only two groups.
+ * The outer switch statement deals with groups (high nibble), the inner
+ * one with unique entries
+ */
+/* FIXME: make sure there are enough per-area objects in journal */
+static int logfs_read_journal(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	void *block = super->s_compressed_je;
+	void *scratch = super->s_je;
+	int i, err, level;
+	struct logfs_area *area;
+
+	for (i = 0; i < JE_LAST; i++) {
+		struct logfs_journal_entry *je = super->s_retired + i;
+		if (!super->s_retired[i].used) {
+			switch (i) {
+			case JE_COMMIT:
+			case JE_DYNSB:
+			case JE_ANCHOR:
+				printk(KERN_WARNING "LogFS: Missing journal "
+						"entry %x?\n", i);
+				return -EIO;
+			default:
+				continue;
+			}
+		}
+		err = super->s_devops->read(sb, je->offset, sb->s_blocksize, block);
+		if (err)
+			return err;
+
+		switch (i & ~0xf) {
+		case JEG_BASE:
+			switch (i) {
+			case JE_COMMIT:
+				/* just reads the latest version number */
+				read_commit(super, block);
+				break;
+			case JE_DYNSB:
+				read_dynsb(sb, unpack(block, scratch));
+				break;
+			case JE_ANCHOR:
+				read_anchor(sb, unpack(block, scratch));
+				break;
+			case JE_ERASECOUNT:
+				read_erasecount(sb, unpack(block, scratch));
+				break;
+			case JE_BADSEGMENTS:
+				unpack(block, super->s_bb_array);
+				read_badsegments(sb);
+				break;
+			case JE_AREAS:
+				read_areas(sb, unpack(block, scratch));
+				break;
+			case JE_FREESEGS:
+				read_free_segments(sb, unpack(block, scratch),
+						je->datalen);
+				break;
+			default:
+				/*
+				 * Any unknown entries in this group are
+				 * considered optional.
+				 */
+				break;
+			}
+			break;
+		case JEG_WBUF:
+			if (super->s_writesize <= 1)
+				return -EIO;
+			level = i & 0xf;
+			area = super->s_area[level];
+			unpack(block, area->a_wbuf);
+			break;
+		default:
+			LOGFS_BUG(sb);
+			return -EIO;
+		}
+
+	}
+	return 0;
+}
+
+/*
+ * First search the current segment (outer loop), then pick the next segment
+ * in the array, skipping any zero entries (inner loop).
+ */
+static void journal_get_free_segment(struct logfs_area *area)
+{
+	struct logfs_super *super = logfs_super(area->a_sb);
+	int i;
+
+	journal_for_each(i) {
+		if (area->a_segno != super->s_journal_seg[i])
+			continue;
+
+		do {
+			i++;
+			if (i == LOGFS_JOURNAL_SEGS)
+				i = 0;
+		} while (!super->s_journal_seg[i]);
+
+		area->a_segno = super->s_journal_seg[i];
+		++(super->s_journal_ec[i]);
+		return;
+	}
+	BUG();
+}
+
+static void journal_get_erase_count(struct logfs_area *area)
+{
+	/* erase count is stored globally and incremented in
+	 * journal_get_free_segment() - nothing to do here */
+}
+
+static int journal_erase_segment(struct logfs_area *area)
+{
+	return logfs_erase_segment(area->a_sb, area->a_segno);
+}
+
+static void journal_finish_area(struct logfs_area *area)
+{
+	area->a_is_open = 0;
+	area->a_used_bytes = 0;
+}
+
+static size_t __logfs_write_header(struct logfs_super *super,
+		struct logfs_journal_header *h, size_t len, size_t datalen,
+		u16 type, u8 compr)
+{
+	h->h_len	= cpu_to_be16(len);
+	h->h_type	= cpu_to_be16(type);
+	h->h_version	= cpu_to_be16(++super->s_last_version);
+	h->h_datalen	= cpu_to_be16(datalen);
+	h->h_compr	= compr;
+	h->h_pad[0]	= 'H';
+	h->h_pad[1]	= 'A';
+	h->h_pad[2]	= 'T';
+	h->h_crc	= logfs_crc32(h, len, 4);
+	return len;
+}
+
+static size_t logfs_write_header(struct logfs_super *super,
+		struct logfs_journal_header *h, size_t datalen, u16 type)
+{
+	size_t len = datalen + sizeof(*h);
+
+	return __logfs_write_header(super, h, len, datalen, type, COMPR_NONE);
+}
+
+static void *logfs_write_bb(struct super_block *sb, void *h,
+		u16 *type, size_t *len)
+{
+	*type = JE_BADSEGMENTS;
+	*len = sb->s_blocksize;
+	return logfs_super(sb)->s_bb_array;
+}
+
+static inline size_t logfs_journal_erasecount_size(struct logfs_super *super)
+{
+	return LOGFS_JOURNAL_SEGS * sizeof(__be32);
+}
+
+static void *logfs_write_erasecount(struct super_block *sb, void *_ec,
+		u16 *type, size_t *len)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_je_journal_ec *ec = _ec;
+	int i;
+
+	journal_for_each(i)
+		ec->ec[i] = cpu_to_be32(super->s_journal_ec[i]);
+	*type = JE_ERASECOUNT;
+	*len = logfs_journal_erasecount_size(super);
+	return ec;
+}
+
+static void *logfs_write_wbuf(struct super_block *sb, void *h,
+		u16 *type, size_t *len)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_area *area = super->s_area[super->s_sum_index];
+
+	*type = JEG_WBUF + super->s_sum_index;
+	*len = super->s_writesize;
+	return area->a_wbuf;
+}
+
+static void account_shadow(void *_shadow, long _sb, u64 ignore)
+{
+	struct logfs_shadow *shadow = _shadow;
+	struct super_block *sb = (void *)_sb;
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_inode *li = logfs_inode(super->s_master_inode);
+
+	/* consume new space */
+	super->s_free_bytes	  -= shadow->new_len;
+	super->s_used_bytes	  += shadow->new_len;
+	super->s_dirty_used_bytes -= shadow->new_len;
+
+	/* free up old space */
+	super->s_free_bytes	  += shadow->old_len;
+	super->s_used_bytes	  -= shadow->old_len;
+	super->s_dirty_free_bytes -= shadow->old_len;
+
+	if (shadow->ino == LOGFS_INO_MASTER)
+		li->li_used_bytes += shadow->new_len - shadow->old_len;
+	mempool_free(shadow, super->s_block_pool);
+}
+
+static void *__logfs_write_anchor(struct super_block *sb, void *_da,
+		u16 *type, size_t *len)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_je_anchor *da = _da;
+	struct inode *inode = super->s_master_inode;
+	struct logfs_inode *li = logfs_inode(inode);
+	int i;
+
+	btree_grim_visitor(&li->li_shadow_tree.new, (long)sb, account_shadow);
+	btree_grim_visitor(&li->li_shadow_tree.old, (long)sb, account_shadow);
+	BUG_ON((s64)li->li_used_bytes < 0);
+
+	da->da_last_ino = cpu_to_be64(super->s_last_ino);
+	da->da_size	= cpu_to_be64(i_size_read(inode));
+	da->da_used_bytes = cpu_to_be64(li->li_used_bytes);
+	for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
+		da->da_data[i] = cpu_to_be64(li->li_data[i]);
+	*type = JE_ANCHOR;
+	*len = sizeof(*da);
+	return da;
+}
+
+static void *logfs_write_dynsb(struct super_block *sb, void *_dynsb,
+		u16 *type, size_t *len)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_je_dynsb *dynsb = _dynsb;
+
+	dynsb->ds_gec		= cpu_to_be64(super->s_gec);
+	dynsb->ds_sweeper	= cpu_to_be64(super->s_sweeper);
+	dynsb->ds_victim_ino	= cpu_to_be64(super->s_victim_ino);
+	dynsb->ds_rename_dir	= cpu_to_be64(super->s_rename_dir);
+	dynsb->ds_rename_pos	= cpu_to_be64(super->s_rename_pos);
+	dynsb->ds_used_bytes	= cpu_to_be64(super->s_used_bytes);
+	*type = JE_DYNSB;
+	*len = sizeof(*dynsb);
+	return dynsb;
+}
+
+static void *logfs_write_areas(struct super_block *sb, void *_a,
+		u16 *type, size_t *len)
+{
+	struct logfs_area *area;
+	struct logfs_je_areas *a = _a;
+	int i;
+
+	for (i = 0; i < 16; i++) {
+		/* FIXME: have all 16 areas */
+		a->used_bytes[i] = 0;
+		a->segno[i] = 0;
+	}
+	for_each_area(i) {
+		area = logfs_super(sb)->s_area[i];
+		a->used_bytes[i] = cpu_to_be32(area->a_used_bytes);
+		a->segno[i] = cpu_to_be32(area->a_segno);
+	}
+	*type = JE_AREAS;
+	*len = sizeof(*a);
+	return a;
+}
+
+static void *logfs_write_free_segments(struct super_block *sb, void *_f,
+		u16 *type, size_t *len)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_je_free_segments *f = _f;
+	struct gc_candidate *cand;
+	int i = 0;
+
+	list_for_each_entry(cand, &super->s_free_list.list, list) {
+		f[i].segno = cpu_to_be32(cand->segno);
+		f[i].ec = cpu_to_be32(cand->erase_count);
+		i++;
+		if (i > MAX_CACHED_SEGS)
+			break;
+	}
+
+	*type = JE_FREESEGS;
+	*len = i * sizeof(struct logfs_je_free_segments);
+	return f;
+}
+
+static void *logfs_write_commit(struct super_block *sb, void *h,
+		u16 *type, size_t *len)
+{
+	*type = JE_COMMIT;
+	*len = 0;
+	return NULL;
+}
+
+static size_t __logfs_write_je(struct super_block *sb, size_t jpos,
+		void* (*write)(struct super_block *sb, void *scratch,
+			u16 *type, size_t *len))
+{
+	struct logfs_super *super = logfs_super(sb);
+	void *scratch = super->s_je;
+	void *header = super->s_compressed_je + jpos;
+	void *data = header + sizeof(struct logfs_journal_header);
+	ssize_t max, compr_len, pad_len, full_len;
+	size_t len;
+	u16 type;
+	u8 compr = COMPR_ZLIB;
+
+	scratch = write(sb, scratch, &type, &len);
+	if (len == 0)
+		return logfs_write_header(super, header, 0, type);
+
+	max = sb->s_blocksize - jpos;
+	compr_len = logfs_compress(scratch, data, len, max);
+	if (compr_len < 0 || type == JE_ANCHOR) {
+		BUG_ON(len > max);
+		memcpy(data, scratch, len);
+		compr_len = len;
+		compr = COMPR_NONE;
+	}
+
+	pad_len = ALIGN(compr_len, 16);
+	memset(data + compr_len, 0, pad_len - compr_len);
+	full_len = pad_len + sizeof(struct logfs_journal_header);
+
+	return __logfs_write_header(super, header, full_len, len, type, compr);
+}
+
+static s64 logfs_get_free_bytes(struct logfs_area *area, size_t *bytes,
+		int must_pad)
+{
+	u32 writesize = logfs_super(area->a_sb)->s_writesize;
+	s32 ofs;
+	int ret;
+
+	ret = logfs_open_area(area);
+	BUG_ON(ret);
+
+	ofs = area->a_used_bytes;
+	area->a_used_bytes += *bytes;
+
+	if (area->a_used_bytes >= logfs_super(area->a_sb)->s_segsize) {
+		logfs_close_area(area);
+		return -EAGAIN;
+	}
+	if (must_pad) {
+		area->a_used_bytes = ALIGN(area->a_used_bytes, writesize);
+		*bytes = area->a_used_bytes - ofs;
+	}
+
+	return dev_ofs(area->a_sb, area->a_segno, ofs);
+}
+
+static int logfs_write_je(struct super_block *sb,
+		void* (*write)(struct super_block *sb, void *scratch,
+			u16 *type, size_t *len))
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_area *area = super->s_journal_area;
+	struct logfs_journal_header *h = super->s_compressed_je;
+	size_t len;
+	int must_pad = 0;
+	s64 ofs;
+
+	len = __logfs_write_je(sb, 0, write);
+	if (h->h_type == cpu_to_be16(JE_COMMIT))
+		must_pad = 1;
+
+	ofs = logfs_get_free_bytes(area, &len, must_pad);
+	if (ofs < 0)
+		return ofs;
+	logfs_buf_write(area, ofs, super->s_compressed_je, len);
+	return 0;
+}
+
+/*
+ * Write all journal entries.  The goto logic ensures that all journal entries
+ * are written whenever a new segment is used.  It is ugly and potentially a
+ * bit wasteful, but robustness is more important.  With this we can *always*
+ * erase all journal segments except the one containing the most recent commit.
+ */
+int logfs_write_anchor(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct logfs_super *super = logfs_super(sb);
+	int i, err;
+
+	mutex_lock(&super->s_journal_mutex);
+
+again:
+	if (super->s_writesize > 1)
+		for_each_area(i) {
+			super->s_sum_index = i;
+			err = logfs_write_je(sb, logfs_write_wbuf);
+			if (err)
+				goto again;
+		}
+	err = logfs_write_je(sb, logfs_write_bb);
+	if (err)
+		goto again;
+	err = logfs_write_je(sb, logfs_write_erasecount);
+	if (err)
+		goto again;
+	err = logfs_write_je(sb, __logfs_write_anchor);
+	if (err)
+		goto again;
+	err = logfs_write_je(sb, logfs_write_dynsb);
+	if (err)
+		goto again;
+	err = logfs_write_je(sb, logfs_write_areas);
+	if (err)
+		goto again;
+	err = logfs_write_je(sb, logfs_write_free_segments);
+	if (err)
+		goto again;
+	super->s_devops->sync(sb);
+	err = logfs_write_je(sb, logfs_write_commit);
+	if (err)
+		goto again;
+
+	mutex_unlock(&super->s_journal_mutex);
+	return 0;
+}
+
+static const struct logfs_area_ops journal_area_ops = {
+	.get_free_segment	= journal_get_free_segment,
+	.get_erase_count	= journal_get_erase_count,
+	.erase_segment		= journal_erase_segment,
+	.finish_area		= journal_finish_area,
+};
+
+int logfs_init_journal(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int ret = -ENOMEM;
+
+	mutex_init(&super->s_journal_mutex);
+
+	super->s_je = kzalloc(sb->s_blocksize, GFP_KERNEL);
+	if (!super->s_je)
+		return ret;
+
+	super->s_compressed_je = kzalloc(sb->s_blocksize, GFP_KERNEL);
+	if (!super->s_compressed_je)
+		return ret;
+
+	super->s_bb_array = kzalloc(sb->s_blocksize, GFP_KERNEL);
+	if (!super->s_bb_array)
+		return ret;
+
+	super->s_master_inode = logfs_new_meta_inode(sb, LOGFS_INO_MASTER);
+	if (!super->s_master_inode)
+		return ret;
+
+	/* make sure noone tries to evict this inode */
+	super->s_master_inode->i_nlink = 1;
+
+	/* logfs_scan_journal() is looking for the latest journal entries, but
+	 * doesn't copy them into data structures yet.  logfs_read_journal()
+	 * then re-reads those entries and copies their contents over. */
+	ret = logfs_scan_journal(sb);
+	if (ret)
+		return ret;
+	ret = logfs_read_journal(sb);
+	if (ret)
+		return ret;
+
+	reserve_sb_and_journal(sb);
+	logfs_calc_free(sb);
+
+	super->s_journal_area->a_ops = &journal_area_ops;
+	return 0;
+}
+
+void logfs_cleanup_journal(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	__logfs_destroy_inode(super->s_master_inode);
+	super->s_master_inode = NULL;
+
+	kfree(super->s_bb_array);
+	kfree(super->s_compressed_je);
+	kfree(super->s_je);
+}
--- /dev/null	2008-04-02 16:29:12.813336657 +0200
+++ git/fs/logfs/memtree.c	2008-04-07 11:53:20.916544201 +0200
@@ -0,0 +1,402 @@
+/*
+ * fs/logfs/memtree.c	- Simple In-memory B+Tree
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2007 Joern Engel <joern@logfs.org>
+ *
+ *
+ * This could possibly get moved to lib/.
+ *
+ * A relatively simple B+Tree implementation.  I have written it as a learning
+ * excercise to understand how B+Trees work.  Turned out to be useful as well.
+ *
+ * B+Trees can be used similar to Linux radix trees (which don't have anything
+ * in common with textbook radix trees, beware).  Prerequisite for them working
+ * well is that access to a random tree node is much faster than a large number
+ * of operations within each node.
+ *
+ * Disks have fulfilled the prerequite for a long time.  More recently DRAM
+ * has gained similar properties, as memory access times, when measured in cpu
+ * cycles, have increased.  Cacheline sizes have increased as well, which also
+ * helps B+Trees.
+ *
+ * Compared to radix trees, B+Trees are more efficient when dealing with a
+ * sparsely populated address space.  Between 25% and 50% of the memory is
+ * occupied with valid pointers.  When densely populated, radix trees contain
+ * ~98% pointers - hard to beat.  Very sparse radix trees contain only ~2%
+ * pointers.
+ *
+ * This particular implementation stores pointers identified by a long value.
+ * Storing NULL pointers is illegal, lookup will return NULL when no entry
+ * was found.
+ *
+ * Two tricks were used that are not commonly found in textbooks.  First, the
+ * lowest values are to the right, not to the left.  All used slots within a
+ * node are on the left, all unused slots contain NUL values.  Most operations
+ * simply loop once over all slots and terminate on the first NUL.
+ *
+ * Second trick is to special-case the key "0" or NUL.  As seen above, this
+ * value indicates an unused slot, so such a value should not be stored in the
+ * tree itself.  Instead it is stored in the null_ptr field in the btree_head.
+ */
+/* FIXME: use mempool for allocations */
+#include "logfs.h"
+
+/*
+ * Prerequisite of B+Trees performing well is that node lookup is much slower
+ * than a large number of operations within a node.  That can be true if the
+ * node size is identical to cacheline size.  All that is highly
+ * machine-dependent, just like the #define below is not.
+ *
+ * Patches to do something smarter are welcome.  Just beware that too small
+ * node with less than 8 slots have a bad fan-out and won't perform well
+ * either.
+ */
+#if BITS_PER_LONG == 32
+#define BTREE_NODES 20	/* 32bit, 240 byte nodes */
+#else
+#define BTREE_NODES 16	/* 64bit, 256 byte nodes */
+#endif
+
+struct btree_node {
+	u64 key;
+	struct btree_node *node;
+};
+
+void btree_init(struct btree_head *head)
+{
+	head->node = NULL;
+	head->height = 0;
+	head->null_ptr = NULL;
+}
+
+#if 0
+static void __dump_tree(struct btree_node *node, int height)
+{
+	int i;
+
+	if (!height)
+		return;
+
+	printk(KERN_DEBUG"%p ", node);
+	for (i = 0; i < BTREE_NODES; i++)
+		printk("(%llx,%p) ", node[i].key, node[i].node);
+	printk("\n");
+
+	for (i = 0; i < BTREE_NODES; i++)
+		if (node[i].key)
+			__dump_tree(node[i].node, height-1);
+}
+
+static void dump_tree(struct btree_head *head)
+{
+	printk(KERN_DEBUG"%p\n", head->null_ptr);
+	__dump_tree(head->node, head->height);
+}
+#endif
+
+static u64 btree_last(struct btree_head *head)
+{
+	int height = head->height;
+	struct btree_node *node = head->node;
+
+	if (height == 0)
+		return 0;
+
+	for ( ; height > 1; height--)
+		node = node[0].node;
+
+	return node[0].key;
+}
+
+void *btree_lookup(struct btree_head *head, u64 key)
+{
+	int i, height = head->height;
+	struct btree_node *node = head->node;
+
+	if (key == 0)
+		return head->null_ptr;
+
+	if (height == 0)
+		return NULL;
+
+	for ( ; height > 1; height--) {
+		for (i = 0; i < BTREE_NODES; i++)
+			if (node[i].key <= key)
+				break;
+		node = node[i].node;
+		if (!node)
+			return NULL;
+	}
+
+	if (!node)
+		return NULL;
+
+	for (i = 0; i < BTREE_NODES; i++)
+		if (node[i].key == key)
+			return node[i].node;
+
+	return NULL;
+}
+
+/*
+ * Returns two values:
+ * pos - the position of the first slot equal or less than key
+ * fill - the number of positions filled with any value
+ */
+static void find_pos(struct btree_node *node, u64 key, int *pos, int *fill)
+{
+	int i;
+
+	for (i = 0; i < BTREE_NODES; i++)
+		if (node[i].key <= key)
+			break;
+	*pos = i;
+	for (i = *pos; i < BTREE_NODES; i++)
+		if (node[i].key == 0)
+			break;
+	*fill = i;
+}
+
+/*
+ * locate the correct leaf node in the btree
+ */
+static struct btree_node *find_level(struct btree_head *head, u64 key,
+		int level)
+{
+	struct btree_node *node = head->node;
+	int i, height;
+
+	for (height = head->height; height > level; height--) {
+		for (i = 0; i < BTREE_NODES; i++)
+			if (node[i].key <= key)
+				break;
+
+		if ((i == BTREE_NODES) || !node[i].key) {
+			/* right-most key is too large, update it */
+			i--;
+			node[i].key = key;
+		}
+		BUG_ON(i < 0);
+		node = node[i].node;
+	}
+	BUG_ON(!node);
+	return node;
+}
+
+static int btree_grow(struct btree_head *head)
+{
+	struct btree_node *node;
+
+	node = kcalloc(BTREE_NODES, sizeof(*node), GFP_KERNEL);
+	if (!node)
+		return -ENOMEM;
+	if (head->node) {
+		node->key = head->node[BTREE_NODES-1].key;
+		node->node = head->node;
+	}
+	head->node = node;
+	head->height++;
+	return 0;
+}
+
+static int btree_insert_level(struct btree_head *head, u64 key, void *ptr,
+		int level)
+{
+	struct btree_node *node;
+	int i, pos, fill, err;
+
+	BUG_ON(!ptr);
+	if (key == 0) {
+		/* 0 identifies empty slots, so special-case this */
+		BUG_ON(level != 1);
+		head->null_ptr = ptr;
+		return 0;
+	}
+
+	if (head->height < level) {
+		err = btree_grow(head);
+		if (err)
+			return err;
+	}
+
+retry:
+	node = find_level(head, key, level);
+	find_pos(node, key, &pos, &fill);
+	BUG_ON(node[pos].key == key);
+
+	if (fill == BTREE_NODES) {
+		/* need to split node */
+		struct btree_node *new;
+
+		new = kcalloc(BTREE_NODES, sizeof(*node), GFP_KERNEL);
+		if (!new)
+			return -ENOMEM;
+		err = btree_insert_level(head, node[BTREE_NODES/2 - 1].key, new,
+				level+1);
+		if (err) {
+			kfree(new);
+			return err;
+		}
+		for (i = 0; i < BTREE_NODES / 2; i++) {
+			new[i].key = node[i].key;
+			new[i].node = node[i].node;
+			node[i].key = node[i + BTREE_NODES/2].key;
+			node[i].node = node[i + BTREE_NODES/2].node;
+			node[i + BTREE_NODES/2].key = 0;
+			node[i + BTREE_NODES/2].node = NULL;
+		}
+		goto retry;
+	}
+	BUG_ON(fill >= BTREE_NODES);
+
+	/* shift and insert */
+	for (i = fill; i > pos; i--) {
+		node[i].key = node[i-1].key;
+		node[i].node = node[i-1].node;
+	}
+	node[pos].key = key;
+	node[pos].node = ptr;
+
+	return 0;
+}
+
+int btree_insert(struct btree_head *head, u64 key, void *ptr)
+{
+	return btree_insert_level(head, key, ptr, 1);
+}
+
+static void *btree_remove_level(struct btree_head *head, u64 key, int level)
+{
+	struct btree_node *node;
+	int i, pos, fill;
+	void *ret;
+
+	if (level > head->height) {
+		/* we recursed all the way up */
+		head->height = 0;
+		head->node = NULL;
+		return NULL;
+	}
+
+	node = find_level(head, key, level);
+	find_pos(node, key, &pos, &fill);
+	if ((level == 1) && (node[pos].key != key))
+		return NULL;
+	ret = node[pos].node;
+
+	/* remove and shift */
+	for (i = pos; i < fill-1; i++) {
+		node[i].key = node[i+1].key;
+		node[i].node = node[i+1].node;
+	}
+	node[fill-1].key = 0;
+	node[fill-1].node = NULL;
+
+	if (fill-1 < BTREE_NODES/2) {
+		/*
+		 * At this point there *should* be code to either merge with
+		 * a neighboring node or steal some entries from it to preserve
+		 * the btree invariant of only having nodes with n/2..n
+		 * elements.
+		 *
+		 * As you can see, that code is left as an excercise to the
+		 * reader or anyone noticing severe performance problems in
+		 * very rare cases.
+		 *
+		 * As-is this code "implements" a method called lazy deletion,
+		 * which according to text books is relatively common in
+		 * databases and usually works quite well.
+		 * Not so usually, the btree can degrade into very long lists
+		 * of 1-element nodes and perform accordingly.
+		 */
+	}
+	if (fill-1 == 0) {
+		btree_remove_level(head, key, level+1);
+		kfree(node);
+	}
+
+	return ret;
+}
+
+void *btree_remove(struct btree_head *head, u64 key)
+{
+	void *ret;
+
+	if (key == 0) {
+		/* 0 identifies empty slots, so special-case this */
+		ret = head->null_ptr;
+		head->null_ptr = NULL;
+		return ret;
+	}
+	if (head->height == 0)
+		return NULL;
+
+	return btree_remove_level(head, key, 1);
+}
+
+int btree_merge(struct btree_head *target, struct btree_head *victim)
+{
+	struct btree_node *node;
+	u64 key;
+	int err;
+
+	BUG_ON(target == victim);
+
+	if (!(target->node || target->null_ptr)) {
+		/* target is empty, just copy fields over */
+		target->null_ptr = victim->null_ptr;
+		target->node = victim->node;
+		target->height = victim->height;
+		btree_init(victim);
+		return 0;
+	}
+
+	for (;;) {
+		key = btree_last(victim);
+		node = btree_remove(victim, key);
+		if (!node)
+			break;
+		err = btree_insert(target, key, node);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static void __btree_for_each(struct btree_node *node, long opaque,
+		void (*func)(void *elem, long opaque, u64 key),  int reap,
+		int height)
+{
+	int i;
+
+	for (i = 0; i < BTREE_NODES && node[i].key; i++) {
+		if (height > 1)
+			__btree_for_each(node[i].node, opaque, func, reap,
+					height-1);
+		else
+			func(node[i].node, opaque, node[i].key);
+	}
+	if (reap)
+		kfree(node);
+}
+
+void btree_visitor(struct btree_head *head, long opaque,
+		void (*func)(void *elem, long opaque, u64 key))
+{
+	if (head->node)
+		__btree_for_each(head->node, opaque, func, 0, head->height);
+	if (head->null_ptr)
+		func(head->null_ptr, opaque, 0);
+}
+
+void btree_grim_visitor(struct btree_head *head, long opaque,
+		void (*func)(void *elem, long opaque, u64 key))
+{
+	if (head->node)
+		__btree_for_each(head->node, opaque, func, 1, head->height);
+	if (head->null_ptr)
+		func(head->null_ptr, opaque, 0);
+	btree_init(head);
+}
--- /dev/null	2008-04-02 16:29:12.813336657 +0200
+++ git/fs/logfs/readwrite.c	2008-04-07 12:24:48.276946628 +0200
@@ -0,0 +1,1618 @@
+/*
+ * fs/logfs/readwrite.c
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2007 Joern Engel <joern@logfs.org>
+ *
+ *
+ * Actually contains five sets of very similar functions:
+ * read		read blocks from a file
+ * seek_hole	find next hole
+ * seek_data	find next data block
+ * valid	check whether a block still belongs to a file
+ * write	write blocks to a file
+ * delete	delete a block (for directories and ifile)
+ * rewrite	move existing blocks of a file to a new location (gc helper)
+ * truncate	truncate a file
+ */
+#include "logfs.h"
+#include <linux/sched.h>
+
+static int adjust_level(int level)
+{
+	if (level >= LOGFS_MAX_LEVELS)
+		level -= LOGFS_MAX_LEVELS;
+	WARN_ON(level >= LOGFS_MAX_LEVELS);
+	return level;
+}
+
+static u64 adjust_bix(u64 bix, u8 level)
+{
+	switch (adjust_level(level)) {
+	case 0:
+		return bix;
+	case 1:
+		return max_t(u64, bix, I0_BLOCKS);
+	case 2:
+		return max_t(u64, bix, I1_BLOCKS);
+	case 3:
+		return max_t(u64, bix, I2_BLOCKS);
+	default:
+		WARN_ON(1);
+		return bix;
+	}
+}
+
+/**
+ * The inode address space is cut in two halves.  Lower half belongs to data
+ * pages, upper half to indirect blocks.  If the high bit (INDIRECT_BIT) is
+ * set, the actual block index (bix) and level can be derived from the page
+ * index.
+ *
+ * The lowest three bits of the block index are set to 0 after packing and
+ * unpacking.  Since the lowest n bits (9 for 4KiB blocksize) are ignored
+ * anyway this is harmless.
+ */
+#define ARCH_SHIFT	(BITS_PER_LONG - 32)
+#define INDIRECT_BIT	(0x80000000UL << ARCH_SHIFT)
+#define LEVEL_SHIFT	(28 + ARCH_SHIFT)
+static pgoff_t logfs_pack_index(u64 bix, u8 level)
+{
+	pgoff_t index;
+
+	BUG_ON(bix >= INDIRECT_BIT);
+	BUG_ON(level > 7);
+	if (level == 0)
+		return bix;
+
+	index  = INDIRECT_BIT;
+	index |= (long)level << LEVEL_SHIFT;
+	index |= bix >> (level*LOGFS_BLOCK_BITS);
+	return index;
+}
+
+void logfs_unpack_index(pgoff_t index, u64 *bix, u8 *level)
+{
+	if (!(index & INDIRECT_BIT)) {
+		*bix = index;
+		*level = 0;
+		return;
+	}
+
+	*level = (index & ~INDIRECT_BIT) >> LEVEL_SHIFT;
+	*bix = (index << (*level*LOGFS_BLOCK_BITS)) & ~INDIRECT_BIT;
+	*bix = adjust_bix(*bix, *level);
+	return;
+}
+#undef ARCH_SHIFT
+#undef INDIRECT_BIT
+#undef LEVEL_SHIFT
+
+/**
+ * logfs_flush_dirty - flush dirty blocks
+ * @sb:		filesystem superblock
+ * @sync:	if 0, only flush enough to continue writing,
+ *		if 1, completely flush list
+ */
+void logfs_flush_dirty(struct super_block *sb, int sync)
+{
+	struct logfs_super *super = logfs_super(sb);
+	u64 bytes = LOGFS_MAX_LEVELS * LOGFS_MAX_OBJECTSIZE;
+	struct logfs_block *block;
+	struct page *page;
+	struct inode *inode;
+	int ret;
+
+	while (super->s_dirty_free_bytes || super->s_dirty_used_bytes) {
+		if (!sync && (super->s_free_bytes >= bytes + super->s_gc_reserve
+				+ super->s_dirty_used_bytes))
+			break;
+
+		BUG_ON(list_empty(&super->s_dirty_list));
+		block = list_entry(super->s_dirty_list.next, struct logfs_block,
+				dirty_list);
+		page = block->page;
+		inode = page->mapping->host;
+		ret = logfs_write_buf(inode, page, NULL, 0);
+		BUG_ON(ret);
+		/* We may need to GC some more after writing a page */
+		logfs_gc_pass(sb);
+	}
+}
+
+/*
+ * Logfs is prone to an AB-BA deadlock where one task tries to acquire
+ * s_w_mutex with a locked page and GC tries to get that page while holding
+ * s_w_mutex.
+ * To solve this issue logfs will ignore the page lock iff the page in question
+ * is waiting for s_w_mutex.  We annotate this fact by setting PG_pre_locked
+ * in addition to PG_locked.
+ *
+ * FIXME: Logfs already uses PG_owner_priv_1 for other purposes and there is
+ * no PG_owner_priv_2.  Currently we abuse the (hopefully) free flag 18.  But
+ * that flag may get reused any minute.  In fact, Christoph Lameter recently
+ * sent a patchset to reshuffle page flags.  Highly dangerous.
+ */
+#define PG_pre_locked		18
+#define PagePreLocked(page)	test_bit(PG_pre_locked, &(page)->flags)
+#define SetPagePreLocked(page)	set_bit(PG_pre_locked, &(page)->flags)
+#define ClearPagePreLocked(page) clear_bit(PG_pre_locked, &(page)->flags)
+static void logfs_get_wblocks(struct super_block *sb, struct page *page,
+		int lock)
+{
+	if (lock) {
+		struct logfs_super *super = logfs_super(sb);
+
+		if (page)
+			SetPagePreLocked(page);
+		mutex_lock(&super->s_w_mutex);
+		super->s_write_page = page;
+		logfs_gc_pass(sb);
+		/* FIXME: We also have to check for shadowed space
+		 * and mempool fill grade */
+		logfs_flush_dirty(sb, 0);
+	}
+}
+
+static void logfs_put_wblocks(struct super_block *sb, struct page *page,
+		int lock)
+{
+	if (lock) {
+		logfs_super(sb)->s_write_page = NULL;
+		/* Order matters - we must clear PG_pre_locked before releasing
+		 * s_w_mutex or we could race against another task. */
+		if (page)
+			ClearPagePreLocked(page);
+		mutex_unlock(&logfs_super(sb)->s_w_mutex);
+	}
+}
+
+static struct page *logfs_get_read_page(struct inode *inode, u64 bix, u8 level)
+{
+	return find_or_create_page(inode->i_mapping,
+			logfs_pack_index(bix, level), GFP_NOFS);
+}
+
+static void logfs_put_read_page(struct page *page)
+{
+	unlock_page(page);
+	page_cache_release(page);
+}
+
+static struct page *logfs_get_page(struct inode *inode, u64 bix, u8 level)
+{
+	struct address_space *mapping = inode->i_mapping;
+	pgoff_t index = logfs_pack_index(bix, level);
+	struct page *page;
+	int err;
+	int loop = 0;
+
+repeat:
+	page = find_get_page(mapping, index);
+	if (!page) {
+		page = __page_cache_alloc(GFP_NOFS);
+		if (!page)
+			return NULL;
+		err = add_to_page_cache_lru(page, mapping, index, GFP_NOFS);
+		if (unlikely(err)) {
+			page_cache_release(page);
+			if (err == -EEXIST)
+				goto repeat;
+			return NULL;
+		}
+	} else while (unlikely(TestSetPageLocked(page))) {
+		if (PagePreLocked(page)) {
+			/* Holder of page lock is waiting for us, it
+			 * is safe to use this page. */
+			return page;
+		}
+		if (loop++ > 0x1000) {
+			/* Has been observed once so far... */
+			printk(KERN_ERR "stack at %p\n", &loop);
+			BUG();
+		}
+		/* Some other process has this page locked and has
+		 * nothing to do with us.  Wait for it to finish.
+		 */
+		schedule();
+	}
+	return page;
+}
+
+static void logfs_put_page(struct inode *inode, struct page *page)
+{
+	if (likely(!PagePreLocked(page)))
+		unlock_page(page);
+	page_cache_release(page);
+}
+
+static struct page *logfs_get_write_page(struct inode *inode, u64 bix, u8 level)
+{
+	struct page *write_page = logfs_super(inode->i_sb)->s_write_page;
+	pgoff_t index = logfs_pack_index(bix, level);
+
+	if (write_page && (inode->i_mapping == write_page->mapping)
+			&& (index == write_page->index))
+		return write_page;
+	else
+		return logfs_get_page(inode, bix, level);
+}
+
+static void logfs_put_write_page(struct inode *inode, struct page *page)
+{
+	struct page *write_page = logfs_super(inode->i_sb)->s_write_page;
+
+	if (page != write_page)
+		logfs_put_page(inode, page);
+}
+
+static unsigned long __get_bits(u64 val, int skip, int no)
+{
+	u64 ret = val;
+
+	ret >>= skip * no;
+	ret <<= 64 - no;
+	ret >>= 64 - no;
+	return ret;
+}
+
+static unsigned long get_bits(u64 val, int skip)
+{
+	return __get_bits(val, skip, LOGFS_BLOCK_BITS);
+}
+
+/*
+ * Returns:
+ * 0 if all pointers are NUL
+ * -1 if all pointers have LOGFS_FULLY_POPULATED set
+ * 1 if at least one pointer is non-NUL and
+ *      at least one has LOGFS_FULLY_POPULATED cleared
+ */
+enum blockstate {
+	bs_fully_populated	= -1,
+	bs_all_zero		= 0,
+	bs_mixed		= 1,
+};
+
+static int block_state(__be64 *block)
+{
+	int i;
+
+	if (block[0]) {
+		for (i = 1; i < LOGFS_BLOCK_FACTOR; i++)
+			if (!(block[i] & cpu_to_be64(LOGFS_FULLY_POPULATED)))
+				return bs_mixed;
+		return bs_fully_populated;
+	} else {
+		for (i = 1; i < LOGFS_BLOCK_FACTOR; i++)
+			if (block[i])
+				return bs_mixed;
+		return bs_all_zero;
+	}
+}
+
+static int page_state(struct page *page)
+{
+	__be64 *block;
+	int ret;
+
+	block = kmap_atomic(page, KM_USER0);
+	ret = block_state(block);
+	kunmap_atomic(block, KM_USER0);
+	return ret;
+}
+
+static void alloc_block(struct page *page)
+{
+	struct logfs_super *super = logfs_super(page->mapping->host->i_sb);
+	struct logfs_block *block;
+
+	if (PagePrivate(page))
+		return;
+
+	block = mempool_alloc(super->s_block_pool, GFP_KERNEL);
+	INIT_LIST_HEAD(&block->dirty_list);
+	block->page = page;
+	SetPagePrivate(page);
+	page->private = (unsigned long)block;
+}
+
+static struct shadow_tree *logfs_page_to_tree(struct page *page)
+{
+	alloc_block(page);
+	return &logfs_block(page)->shadow_tree;
+}
+
+static void block_set_pointer(struct page *page, int index, u64 ptr)
+{
+	__be64 *block;
+
+	block = kmap_atomic(page, KM_USER0);
+	block[index] = cpu_to_be64(ptr);
+	flush_dcache_page(page);
+	kunmap_atomic(block, KM_USER0);
+	SetPageUptodate(page);
+}
+
+static u64 block_get_pointer(struct page *page, int index)
+{
+	__be64 *block;
+	u64 ptr;
+
+	block = kmap_atomic(page, KM_USER0);
+	ptr = be64_to_cpu(block[index]);
+	kunmap_atomic(block, KM_USER0);
+	return ptr;
+}
+
+static int logfs_read_empty(struct page *page)
+{
+	zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+	SetPageZero(page);
+	return 0;
+}
+
+static int logfs_read_embedded(struct page *page, struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	void *buf;
+
+	buf = kmap_atomic(page, KM_USER0);
+	memcpy(buf, li->li_data, LOGFS_EMBEDDED_SIZE);
+	memset(buf + LOGFS_EMBEDDED_SIZE, 0,
+			PAGE_CACHE_SIZE - LOGFS_EMBEDDED_SIZE);
+	flush_dcache_page(page);
+	kunmap_atomic(buf, KM_USER0);
+	return 0;
+}
+
+static int logfs_read_direct(struct inode *inode, struct page *page)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	pgoff_t index = page->index;
+	u64 block;
+
+	block = li->li_data[index];
+	if (!block)
+		return logfs_read_empty(page);
+
+	return logfs_segment_read(inode, page, block, index, 0);
+}
+
+static int logfs_read_loop(struct inode *inode, struct page *page, int count)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	u64 bofs = li->li_data[I1_INDEX + count];
+	pgoff_t bix = page->index;
+	int level, ret;
+	struct page *ipage;
+
+	if (!bofs)
+		return logfs_read_empty(page);
+
+	for (level = count + 1; level > 0; level--) {
+		ipage = logfs_get_read_page(inode, bix, level);
+		if (!ipage)
+			return -ENOMEM;
+
+		ret = logfs_segment_read(inode, ipage, bofs, bix, level);
+		if (ret) {
+			logfs_put_read_page(ipage);
+			return ret;
+		}
+
+		bofs = block_get_pointer(ipage, get_bits(bix, level-1));
+		logfs_put_read_page(ipage);
+		if (!bofs)
+			return logfs_read_empty(page);
+	}
+
+	return logfs_segment_read(inode, page, bofs, bix, 0);
+}
+
+static int logfs_read_block(struct inode *inode, struct page *page)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	pgoff_t index = page->index;
+
+	if (li->li_flags & LOGFS_IF_EMBEDDED) {
+		if (index != 0)
+			return logfs_read_empty(page);
+		else
+			return logfs_read_embedded(page, inode);
+	} else if (index < I0_BLOCKS)
+		return logfs_read_direct(inode, page);
+	else if (index < I1_BLOCKS)
+		return logfs_read_loop(inode, page, 0);
+	else if (index < I2_BLOCKS)
+		return logfs_read_loop(inode, page, 1);
+	else if (index < I3_BLOCKS)
+		return logfs_read_loop(inode, page, 2);
+
+	BUG();
+	return -EIO;
+}
+
+static u64 seek_holedata_direct(struct inode *inode, u64 bix, int data)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	for (; bix < I0_BLOCKS; bix++)
+		if (data ^ (li->li_data[bix] == 0))
+			return bix;
+	return I0_BLOCKS;
+}
+
+static u64 seek_holedata_loop(struct inode *inode, u64 bix, int count, int data)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	__be64 *rblock;
+	u64 bofs = li->li_data[I1_INDEX + count];
+	int level, ret, slot;
+	struct page *page;
+
+	BUG_ON(!bofs);
+
+	for (level = count + 1; level > 0; level--) {
+		page = logfs_get_read_page(inode, bix, level);
+		if (!page)
+			return bix;
+
+		ret = logfs_segment_read(inode, page, bofs, bix, level);
+		if (ret) {
+			logfs_put_read_page(page);
+			return bix;
+		}
+
+		slot = get_bits(bix, level-1);
+		rblock = kmap_atomic(page, KM_USER0);
+		while (slot < LOGFS_BLOCK_FACTOR) {
+			if (data && (rblock[slot] != 0))
+				break;
+			if (!data && !(be64_to_cpu(rblock[slot]) & LOGFS_FULLY_POPULATED))
+				break;
+			slot++;
+			bix += 1 << (LOGFS_BLOCK_BITS * (level-1));
+		}
+		if (slot >= LOGFS_BLOCK_FACTOR) {
+			kunmap_atomic(rblock, KM_USER0);
+			logfs_put_read_page(page);
+			return bix;
+		}
+		bofs = be64_to