[PATCH v2 04/19] read-cache: Re-read index if index file changed

2013-07-12 Thread Thomas Gummerer
Add the possibility of re-reading the index file, if it changed
while reading.

The index file might change during the read, causing outdated
information to be displayed. We check if the index file changed
by using its stat data as a heuristic.

Helped-by: Ramsay Jones ram...@ramsay1.demon.co.uk
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 read-cache.c | 91 +---
 1 file changed, 57 insertions(+), 34 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index 1e7ffc2..3e3a0e2 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1275,11 +1275,31 @@ int read_index(struct index_state *istate)
return read_index_from(istate, get_index_file());
 }
 
+static int index_changed(struct stat *st_old, struct stat *st_new)
+{
+   if (st_old-st_mtime != st_new-st_mtime ||
+#if !defined (__CYGWIN__)
+   st_old-st_uid   != st_new-st_uid ||
+   st_old-st_gid   != st_new-st_gid ||
+   st_old-st_ino   != st_new-st_ino ||
+#endif
+#if USE_NSEC
+   ST_MTIME_NSEC(*st_old) != ST_MTIME_NSEC(*st_new) ||
+#endif
+#if USE_STDEV
+   st_old-st_dev != st_new-st_dev ||
+#endif
+   st_old-st_size != st_new-st_size)
+   return 1;
+
+   return 0;
+}
+
 /* remember to discard_cache() before reading a different cache! */
 int read_index_from(struct index_state *istate, const char *path)
 {
-   int fd;
-   struct stat st;
+   int fd, err, i;
+   struct stat st_old, st_new;
struct cache_version_header *hdr;
void *mmap;
size_t mmap_size;
@@ -1291,41 +1311,44 @@ int read_index_from(struct index_state *istate, const 
char *path)
errno = ENOENT;
istate-timestamp.sec = 0;
istate-timestamp.nsec = 0;
+   for (i = 0; i  50; i++) {
+   err = 0;
+   fd = open(path, O_RDONLY);
+   if (fd  0) {
+   if (errno == ENOENT)
+   return 0;
+   die_errno(index file open failed);
+   }
 
-   fd = open(path, O_RDONLY);
-   if (fd  0) {
-   if (errno == ENOENT)
-   return 0;
-   die_errno(index file open failed);
+   if (fstat(fd, st_old))
+   die_errno(cannot stat the open index);
+
+   errno = EINVAL;
+   mmap_size = xsize_t(st_old.st_size);
+   mmap = xmmap(NULL, mmap_size, PROT_READ | PROT_WRITE, 
MAP_PRIVATE, fd, 0);
+   close(fd);
+   if (mmap == MAP_FAILED)
+   die_errno(unable to map index file);
+
+   hdr = mmap;
+   if (verify_hdr_version(istate, hdr, mmap_size)  0)
+   err = 1;
+
+   if (istate-ops-verify_hdr(mmap, mmap_size)  0)
+   err = 1;
+
+   if (istate-ops-read_index(istate, mmap, mmap_size)  0)
+   err = 1;
+   istate-timestamp.sec = st_old.st_mtime;
+   istate-timestamp.nsec = ST_MTIME_NSEC(st_old);
+   if (lstat(path, st_new))
+   die_errno(cannot stat the open index);
+
+   munmap(mmap, mmap_size);
+   if (!index_changed(st_old, st_new)  !err)
+   return istate-cache_nr;
}
 
-   if (fstat(fd, st))
-   die_errno(cannot stat the open index);
-
-   errno = EINVAL;
-   mmap_size = xsize_t(st.st_size);
-   mmap = xmmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 
0);
-   close(fd);
-   if (mmap == MAP_FAILED)
-   die_errno(unable to map index file);
-
-   hdr = mmap;
-   if (verify_hdr_version(istate, hdr, mmap_size)  0)
-   goto unmap;
-
-   if (istate-ops-verify_hdr(mmap, mmap_size)  0)
-   goto unmap;
-
-   if (istate-ops-read_index(istate, mmap, mmap_size)  0)
-   goto unmap;
-   istate-timestamp.sec = st.st_mtime;
-   istate-timestamp.nsec = ST_MTIME_NSEC(st);
-
-   munmap(mmap, mmap_size);
-   return istate-cache_nr;
-
-unmap:
-   munmap(mmap, mmap_size);
die(index file corrupt);
 }
 
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 03/19] read-cache: move index v2 specific functions to their own file

2013-07-12 Thread Thomas Gummerer
Move index version 2 specific functions to their own file. The non-index
specific functions will be in read-cache.c, while the index version 2
specific functions will be in read-cache-v2.c.

Helped-by: Nguyen Thai Ngoc Duy pclo...@gmail.com
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 Makefile |   2 +
 cache.h  |  16 +-
 read-cache-v2.c  | 556 +
 read-cache.c | 575 ---
 read-cache.h |  57 +
 test-index-version.c |   5 +
 6 files changed, 661 insertions(+), 550 deletions(-)
 create mode 100644 read-cache-v2.c
 create mode 100644 read-cache.h

diff --git a/Makefile b/Makefile
index 5a68fe5..73369ae 100644
--- a/Makefile
+++ b/Makefile
@@ -711,6 +711,7 @@ LIB_H += progress.h
 LIB_H += prompt.h
 LIB_H += quote.h
 LIB_H += reachable.h
+LIB_H += read-cache.h
 LIB_H += reflog-walk.h
 LIB_H += refs.h
 LIB_H += remote.h
@@ -854,6 +855,7 @@ LIB_OBJS += prompt.o
 LIB_OBJS += quote.o
 LIB_OBJS += reachable.o
 LIB_OBJS += read-cache.o
+LIB_OBJS += read-cache-v2.o
 LIB_OBJS += reflog-walk.o
 LIB_OBJS += refs.o
 LIB_OBJS += remote.o
diff --git a/cache.h b/cache.h
index 7af853b..5082b34 100644
--- a/cache.h
+++ b/cache.h
@@ -95,19 +95,8 @@ unsigned long git_deflate_bound(git_zstream *, unsigned 
long);
  */
 #define DEFAULT_GIT_PORT 9418
 
-/*
- * Basic data structures for the directory cache
- */
 
 #define CACHE_SIGNATURE 0x44495243 /* DIRC */
-struct cache_version_header {
-   unsigned int hdr_signature;
-   unsigned int hdr_version;
-};
-
-struct cache_header {
-   unsigned int hdr_entries;
-};
 
 #define INDEX_FORMAT_LB 2
 #define INDEX_FORMAT_UB 4
@@ -280,6 +269,7 @@ struct index_state {
 initialized : 1;
struct hash_table name_hash;
struct hash_table dir_hash;
+   struct index_ops *ops;
 };
 
 extern struct index_state the_index;
@@ -489,8 +479,8 @@ extern void *read_blob_data_from_index(struct index_state 
*, const char *, unsig
 #define CE_MATCH_RACY_IS_DIRTY 02
 /* do stat comparison even if CE_SKIP_WORKTREE is true */
 #define CE_MATCH_IGNORE_SKIP_WORKTREE  04
-extern int ie_match_stat(const struct index_state *, const struct cache_entry 
*, struct stat *, unsigned int);
-extern int ie_modified(const struct index_state *, const struct cache_entry *, 
struct stat *, unsigned int);
+extern int ie_match_stat(struct index_state *, const struct cache_entry *, 
struct stat *, unsigned int);
+extern int ie_modified(struct index_state *, const struct cache_entry *, 
struct stat *, unsigned int);
 
 #define PATHSPEC_ONESTAR 1 /* the pathspec pattern sastisfies GFNM_ONESTAR 
*/
 
diff --git a/read-cache-v2.c b/read-cache-v2.c
new file mode 100644
index 000..a6883c3
--- /dev/null
+++ b/read-cache-v2.c
@@ -0,0 +1,556 @@
+#include cache.h
+#include read-cache.h
+#include resolve-undo.h
+#include cache-tree.h
+#include varint.h
+
+/* Mask for the name length in ce_flags in the on-disk index */
+#define CE_NAMEMASK  (0x0fff)
+
+struct cache_header {
+   unsigned int hdr_entries;
+};
+
+/*
+ * Index File I/O
+ */
+
+/*
+ * dev/ino/uid/gid/size are also just tracked to the low 32 bits
+ * Again - this is just a (very strong in practice) heuristic that
+ * the inode hasn't changed.
+ *
+ * We save the fields in big-endian order to allow using the
+ * index file over NFS transparently.
+ */
+struct ondisk_cache_entry {
+   struct cache_time ctime;
+   struct cache_time mtime;
+   unsigned int dev;
+   unsigned int ino;
+   unsigned int mode;
+   unsigned int uid;
+   unsigned int gid;
+   unsigned int size;
+   unsigned char sha1[20];
+   unsigned short flags;
+   char name[FLEX_ARRAY]; /* more */
+};
+
+/*
+ * This struct is used when CE_EXTENDED bit is 1
+ * The struct must match ondisk_cache_entry exactly from
+ * ctime till flags
+ */
+struct ondisk_cache_entry_extended {
+   struct cache_time ctime;
+   struct cache_time mtime;
+   unsigned int dev;
+   unsigned int ino;
+   unsigned int mode;
+   unsigned int uid;
+   unsigned int gid;
+   unsigned int size;
+   unsigned char sha1[20];
+   unsigned short flags;
+   unsigned short flags2;
+   char name[FLEX_ARRAY]; /* more */
+};
+
+/* These are only used for v3 or lower */
+#define align_flex_name(STRUCT,len) ((offsetof(struct STRUCT,name) + (len) + 
8)  ~7)
+#define ondisk_cache_entry_size(len) align_flex_name(ondisk_cache_entry,len)
+#define ondisk_cache_entry_extended_size(len) 
align_flex_name(ondisk_cache_entry_extended,len)
+#define ondisk_ce_size(ce) (((ce)-ce_flags  CE_EXTENDED) ? \
+   ondisk_cache_entry_extended_size(ce_namelen(ce)) : \
+   ondisk_cache_entry_size(ce_namelen

[PATCH v2 05/19] Add documentation for the index api

2013-07-12 Thread Thomas Gummerer
Add documentation for the index reading api.  This also includes
documentation for the new api functions introduced in the next patch.

Helped-by: Nguyễn Thái Ngọc Duy pclo...@gmail.com
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 Documentation/technical/api-in-core-index.txt | 54 +--
 1 file changed, 50 insertions(+), 4 deletions(-)

diff --git a/Documentation/technical/api-in-core-index.txt 
b/Documentation/technical/api-in-core-index.txt
index adbdbf5..9b8c37c 100644
--- a/Documentation/technical/api-in-core-index.txt
+++ b/Documentation/technical/api-in-core-index.txt
@@ -1,14 +1,60 @@
 in-core index API
 =
 
+Reading API
+---
+
+`cache`::
+
+   An array of cache entries.  This is used to access the cache
+   entries directly.  Use `index_name_pos` to search for the
+   index of a specific cache entry.
+
+`read_index_filtered`::
+
+   Read a part of the index, filtered by the pathspec given in
+   the opts.  The function may load more than necessary, so the
+   caller is still responsible for applying filters appropriately.  The
+   filtering is only done for performance reasons, as it's
+   possible to only read part of the index when the on-disk
+   format is index-v5.
+
+   To iterate only over the entries that match the pathspec, use
+   the for_each_index_entry function.
+
+`read_index`::
+
+   Read the whole index file from disk.
+
+`index_name_pos`::
+
+   Find a cache_entry with name in the index.  Returns pos if an
+   entry is matched exactly and -1-pos if an entry is matched
+   partially.
+   e.g.
+   index:
+   file1
+   file2
+   path/file1
+   zzz
+
+   index_name_pos("path/file1", 10) returns 2, while
+   index_name_pos("path", 4) returns -3
+
+`for_each_index_entry`::
+
+   Iterates over all cache_entries in the index filtered by
+   filter_opts in the index_state.  For each cache entry fn is
+   executed with cb_data as callback data.  From within the loop
+   do `return 0` to continue, or `return 1` to break the loop.
+
+TODO
+
 Talk about read-cache.c and cache-tree.c, things like:
 
-* cache - the_index macros
-* read_index()
 * write_index()
 * ie_match_stat() and ie_modified(); how they are different and when to
   use which.
-* index_name_pos()
 * remove_index_entry_at()
 * remove_file_from_index()
 * add_file_to_index()
@@ -18,4 +64,4 @@ Talk about read-cache.c and cache-tree.c, things like:
 * cache_tree_invalidate_path()
 * cache_tree_update()
 
-(JC, Linus)
+(JC, Linus, Thomas Gummerer)
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 06/19] read-cache: add index reading api

2013-07-12 Thread Thomas Gummerer
Add an api for access to the index file.  Currently there is only a very
basic api for accessing the index file, which only allows a full read of
the index, and lets the users of the data filter it.  The new index api
gives the users the possibility to use only part of the index and
provides functions for iterating over and accessing cache entries.

This simplifies future improvements to the in-memory format, as changes
will be concentrated on one file, instead of the whole git source code.

Helped-by: Nguyễn Thái Ngọc Duy pclo...@gmail.com
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 cache.h | 42 +-
 read-cache-v2.c | 35 +--
 read-cache.c| 44 
 read-cache.h|  8 +++-
 4 files changed, 121 insertions(+), 8 deletions(-)

diff --git a/cache.h b/cache.h
index 5082b34..d305d21 100644
--- a/cache.h
+++ b/cache.h
@@ -127,7 +127,7 @@ struct cache_entry {
unsigned int ce_flags;
unsigned int ce_namelen;
unsigned char sha1[20];
-   struct cache_entry *next;
+   struct cache_entry *next; /* used by name_hash */
char name[FLEX_ARRAY]; /* more */
 };
 
@@ -258,6 +258,29 @@ static inline unsigned int canon_mode(unsigned int mode)
 
 #define cache_entry_size(len) (offsetof(struct cache_entry,name) + (len) + 1)
 
+/*
+ * Options by which the index should be filtered when read partially.
+ *
+ * pathspec: The pathspec which the index entries have to match
+ * seen: Used to return the seen parameter from match_pathspec()
+ * max_prefix_len: The common prefix length of the pathspecs
+ *
+ * read_staged: used to indicate if the conflicted entries (entries
+ * with a stage) should be included
+ * read_cache_tree: used to indicate if the cache-tree should be read
+ * read_resolve_undo: used to indicate if the resolve undo data should
+ * be read
+ */
+struct filter_opts {
+   const struct pathspec *pathspec;
+   char *seen;
+   int max_prefix_len;
+
+   int read_staged;
+   int read_cache_tree;
+   int read_resolve_undo;
+};
+
 struct index_state {
struct cache_entry **cache;
unsigned int version;
@@ -270,6 +293,8 @@ struct index_state {
struct hash_table name_hash;
struct hash_table dir_hash;
struct index_ops *ops;
+   struct internal_ops *internal_ops;
+   struct filter_opts *filter_opts;
 };
 
 extern struct index_state the_index;
@@ -311,6 +336,12 @@ extern void free_name_hash(struct index_state *istate);
 #define unmerge_cache_entry_at(at) unmerge_index_entry_at(the_index, at)
 #define unmerge_cache(pathspec) unmerge_index(the_index, pathspec)
 #define read_blob_data_from_cache(path, sz) 
read_blob_data_from_index(the_index, (path), (sz))
+
+/* index api */
+#define read_cache_filtered(opts) read_index_filtered(the_index, (opts))
+#define read_cache_filtered_from(path, opts) 
read_index_filtered_from(the_index, (path), (opts))
+#define for_each_cache_entry(fn, cb_data) \
+   for_each_index_entry(the_index, (fn), (cb_data))
 #endif
 
 enum object_type {
@@ -438,6 +469,15 @@ extern int init_db(const char *template_dir, unsigned int 
flags);
} \
} while (0)
 
+/* index api */
+extern int read_index_filtered(struct index_state *, struct filter_opts *opts);
+extern int read_index_filtered_from(struct index_state *, const char *path, 
struct filter_opts *opts);
+
+typedef int each_cache_entry_fn(struct cache_entry *ce, void *);
+extern int for_each_index_entry(struct index_state *istate,
+   each_cache_entry_fn, void *);
+
+
 /* Initialize and use the cache information */
 extern int read_index(struct index_state *);
 extern int read_index_preload(struct index_state *, const char **pathspec);
diff --git a/read-cache-v2.c b/read-cache-v2.c
index a6883c3..51b618f 100644
--- a/read-cache-v2.c
+++ b/read-cache-v2.c
@@ -3,6 +3,7 @@
 #include resolve-undo.h
 #include cache-tree.h
 #include varint.h
+#include dir.h
 
 /* Mask for the name length in ce_flags in the on-disk index */
 #define CE_NAMEMASK  (0x0fff)
@@ -207,8 +208,14 @@ static int read_index_extension(struct index_state *istate,
return 0;
 }
 
+/*
+ * The performance is the same if we read the whole index or only
+ * part of it, therefore we always read the whole index to avoid
+ * having to re-read it later.  The filter_opts will determine
+ * what part of the index is used when retrieving the cache-entries.
+ */
 static int read_index_v2(struct index_state *istate, void *mmap,
-unsigned long mmap_size)
+unsigned long mmap_size, struct filter_opts *opts)
 {
int i;
unsigned long src_offset;
@@ -238,7 +245,6 @@ static int read_index_v2(struct index_state *istate, void 
*mmap,
disk_ce = (struct ondisk_cache_entry *)((char *)mmap + 
src_offset);
ce

[PATCH v2 07/19] make sure partially read index is not changed

2013-07-12 Thread Thomas Gummerer
A partially read index file currently cannot be written to disk.  Make
sure that never happens, by erroring out when a caller tries to change a
partially read index.  The caller is responsible for reading the whole
index when it's trying to change it later.

Forcing the caller to load the right part of the index file up front,
instead of re-reading it when changing it, gives a small performance
advantage by avoiding reading parts of the index twice.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 builtin/update-index.c |  4 
 cache.h|  1 +
 read-cache-v2.c|  2 ++
 read-cache.c   | 10 ++
 4 files changed, 17 insertions(+)

diff --git a/builtin/update-index.c b/builtin/update-index.c
index 5c7762e..4c6e3a6 100644
--- a/builtin/update-index.c
+++ b/builtin/update-index.c
@@ -49,6 +49,8 @@ static int mark_ce_flags(const char *path, int flag, int mark)
int namelen = strlen(path);
int pos = cache_name_pos(path, namelen);
if (0 = pos) {
+   if (active_cache_partially_read)
+   die(BUG: Can't change a partially read index);
if (mark)
active_cache[pos]-ce_flags |= flag;
else
@@ -253,6 +255,8 @@ static void chmod_path(int flip, const char *path)
pos = cache_name_pos(path, strlen(path));
if (pos  0)
goto fail;
+   if (active_cache_partially_read)
+   die(BUG: Can't change a partially read index);
ce = active_cache[pos];
mode = ce-ce_mode;
if (!S_ISREG(mode))
diff --git a/cache.h b/cache.h
index d305d21..455b772 100644
--- a/cache.h
+++ b/cache.h
@@ -311,6 +311,7 @@ extern void free_name_hash(struct index_state *istate);
 #define active_alloc (the_index.cache_alloc)
 #define active_cache_changed (the_index.cache_changed)
 #define active_cache_tree (the_index.cache_tree)
+#define active_cache_partially_read (the_index.filter_opts)
 
 #define read_cache() read_index(the_index)
 #define read_cache_from(path) read_index_from(the_index, (path))
diff --git a/read-cache-v2.c b/read-cache-v2.c
index 51b618f..f3c0685 100644
--- a/read-cache-v2.c
+++ b/read-cache-v2.c
@@ -479,6 +479,8 @@ static int write_index_v2(struct index_state *istate, int 
newfd)
struct stat st;
struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 
+   if (istate-filter_opts)
+   die(BUG: index: cannot write a partially read index);
for (i = removed = extended = 0; i  entries; i++) {
if (cache[i]-ce_flags  CE_REMOVE)
removed++;
diff --git a/read-cache.c b/read-cache.c
index 9053d43..ab716ed 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -30,6 +30,8 @@ static void replace_index_entry(struct index_state *istate, 
int nr, struct cache
 {
struct cache_entry *old = istate-cache[nr];
 
+   if (istate-filter_opts)
+   die(BUG: Can't change a partially read index);
remove_name_hash(istate, old);
set_index_entry(istate, nr, ce);
istate-cache_changed = 1;
@@ -467,6 +469,8 @@ int remove_index_entry_at(struct index_state *istate, int 
pos)
 {
struct cache_entry *ce = istate-cache[pos];
 
+   if (istate-filter_opts)
+   die(BUG: Can't change a partially read index);
record_resolve_undo(istate, ce);
remove_name_hash(istate, ce);
istate-cache_changed = 1;
@@ -973,6 +977,8 @@ int add_index_entry(struct index_state *istate, struct 
cache_entry *ce, int opti
 {
int pos;
 
+   if (istate-filter_opts)
+   die(BUG: Can't change a partially read index);
if (option  ADD_CACHE_JUST_APPEND)
pos = istate-cache_nr;
else {
@@ -1173,6 +1179,8 @@ int refresh_index(struct index_state *istate, unsigned 
int flags, const char **p
/* If we are doing --really-refresh that
 * means the index is not valid anymore.
 */
+   if (istate-filter_opts)
+   die(BUG: Can't change a partially read 
index);
ce-ce_flags = ~CE_VALID;
istate-cache_changed = 1;
}
@@ -1331,6 +1339,8 @@ int read_index_filtered_from(struct index_state *istate, 
const char *path,
void *mmap;
size_t mmap_size;
 
+   if (istate-filter_opts)
+   die(BUG: Can't re-read partially read index);
errno = EBUSY;
if (istate-initialized)
return istate-cache_nr;
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 08/19] grep.c: Use index api

2013-07-12 Thread Thomas Gummerer
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 builtin/grep.c | 71 ++
 1 file changed, 37 insertions(+), 34 deletions(-)

diff --git a/builtin/grep.c b/builtin/grep.c
index a419cda..8b02644 100644
--- a/builtin/grep.c
+++ b/builtin/grep.c
@@ -368,41 +368,33 @@ static void run_pager(struct grep_opt *opt, const char 
*prefix)
free(argv);
 }
 
-static int grep_cache(struct grep_opt *opt, const struct pathspec *pathspec, 
int cached)
+struct grep_opts {
+   struct grep_opt *opt;
+   const struct pathspec *pathspec;
+   int cached;
+   int hit;
+};
+
+static int grep_cache(struct cache_entry *ce, void *cb_data)
 {
-   int hit = 0;
-   int nr;
-   read_cache();
+   struct grep_opts *opts = cb_data;
 
-   for (nr = 0; nr  active_nr; nr++) {
-   struct cache_entry *ce = active_cache[nr];
-   if (!S_ISREG(ce-ce_mode))
-   continue;
-   if (!match_pathspec_depth(pathspec, ce-name, ce_namelen(ce), 
0, NULL))
-   continue;
-   /*
-* If CE_VALID is on, we assume worktree file and its cache 
entry
-* are identical, even if worktree file has been modified, so 
use
-* cache version instead
-*/
-   if (cached || (ce-ce_flags  CE_VALID) || 
ce_skip_worktree(ce)) {
-   if (ce_stage(ce))
-   continue;
-   hit |= grep_sha1(opt, ce-sha1, ce-name, 0, ce-name);
-   }
-   else
-   hit |= grep_file(opt, ce-name);
-   if (ce_stage(ce)) {
-   do {
-   nr++;
-   } while (nr  active_nr 
-!strcmp(ce-name, active_cache[nr]-name));
-   nr--; /* compensate for loop control */
-   }
-   if (hit  opt-status_only)
-   break;
-   }
-   return hit;
+   if (!S_ISREG(ce-ce_mode))
+   return 0;
+   if (!match_pathspec_depth(opts-pathspec, ce-name, ce_namelen(ce), 0, 
NULL))
+   return 0;
+   /*
+* If CE_VALID is on, we assume worktree file and its cache entry
+* are identical, even if worktree file has been modified, so use
+* cache version instead
+*/
+   if (opts-cached || (ce-ce_flags  CE_VALID) || ce_skip_worktree(ce))
+   opts-hit |= grep_sha1(opts-opt, ce-sha1, ce-name, 0, 
ce-name);
+   else
+   opts-hit |= grep_file(opts-opt, ce-name);
+   if (opts-hit  opts-opt-status_only)
+   return 1;
+   return 0;
 }
 
 static int grep_tree(struct grep_opt *opt, const struct pathspec *pathspec,
@@ -895,10 +887,21 @@ int cmd_grep(int argc, const char **argv, const char 
*prefix)
} else if (0 = opt_exclude) {
die(_(--[no-]exclude-standard cannot be used for tracked 
contents.));
} else if (!list.nr) {
+   struct grep_opts opts;
+   struct filter_opts *filter_opts = xmalloc(sizeof(*filter_opts));
+
if (!cached)
setup_work_tree();
 
-   hit = grep_cache(opt, pathspec, cached);
+   memset(filter_opts, 0, sizeof(*filter_opts));
+   filter_opts-pathspec = pathspec;
+   opts.opt = opt;
+   opts.pathspec = pathspec;
+   opts.cached = cached;
+   opts.hit = 0;
+   read_cache_filtered(filter_opts);
+   for_each_cache_entry(grep_cache, opts);
+   hit = opts.hit;
} else {
if (cached)
die(_(both --cached and trees are given.));
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 09/19] ls-files.c: use index api

2013-07-12 Thread Thomas Gummerer
Use the index api to read only part of the index, if the on-disk version
of the index is index-v5.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 builtin/ls-files.c | 31 ---
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/builtin/ls-files.c b/builtin/ls-files.c
index 08d9786..80cc398 100644
--- a/builtin/ls-files.c
+++ b/builtin/ls-files.c
@@ -31,6 +31,7 @@ static const char *prefix;
 static int max_prefix_len;
 static int prefix_len;
 static const char **pathspec;
+static struct pathspec pathspec_struct;
 static int error_unmatch;
 static char *ps_matched;
 static const char *with_tree;
@@ -457,6 +458,7 @@ int cmd_ls_files(int argc, const char **argv, const char 
*cmd_prefix)
struct dir_struct dir;
struct exclude_list *el;
struct string_list exclude_list = STRING_LIST_INIT_NODUP;
+   struct filter_opts *opts = xmalloc(sizeof(*opts));
struct option builtin_ls_files_options[] = {
{ OPTION_CALLBACK, 'z', NULL, NULL, NULL,
N_(paths are separated with NUL character),
@@ -522,9 +524,6 @@ int cmd_ls_files(int argc, const char **argv, const char 
*cmd_prefix)
prefix_len = strlen(prefix);
git_config(git_default_config, NULL);
 
-   if (read_cache()  0)
-   die(index file corrupt);
-
argc = parse_options(argc, argv, prefix, builtin_ls_files_options,
ls_files_usage, 0);
el = add_exclude_list(dir, EXC_CMDL, --exclude option);
@@ -556,14 +555,7 @@ int cmd_ls_files(int argc, const char **argv, const char 
*cmd_prefix)
setup_work_tree();
 
pathspec = get_pathspec(prefix, argv);
-
-   /* be nice with submodule paths ending in a slash */
-   if (pathspec)
-   strip_trailing_slash_from_submodules();
-
-   /* Find common prefix for all pathspec's */
-   max_prefix = common_prefix(pathspec);
-   max_prefix_len = max_prefix ? strlen(max_prefix) : 0;
+   init_pathspec(pathspec_struct, pathspec);
 
/* Treat unmatching pathspec elements as errors */
if (pathspec  error_unmatch) {
@@ -573,6 +565,23 @@ int cmd_ls_files(int argc, const char **argv, const char 
*cmd_prefix)
ps_matched = xcalloc(1, num);
}
 
+   if (!with_tree) {
+   memset(opts, 0, sizeof(*opts));
+   opts-pathspec = pathspec_struct;
+   opts-read_staged = 1;
+   if (show_resolve_undo)
+   opts-read_resolve_undo = 1;
+   read_cache_filtered(opts);
+   } else {
+   read_cache();
+   }
+   /* be nice with submodule paths ending in a slash */
+   if (pathspec)
+   strip_trailing_slash_from_submodules();
+
+   max_prefix = common_prefix(pathspec);
+   max_prefix_len = max_prefix ? strlen(max_prefix) : 0;
+
if ((dir.flags  DIR_SHOW_IGNORED)  !exc_given)
die(ls-files --ignored needs some exclude pattern);
 
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 11/19] read-cache: make in-memory format aware of stat_crc

2013-07-12 Thread Thomas Gummerer
Make the in-memory format aware of the stat_crc used by index-v5.
It is simply ignored by index versions prior to v5.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 cache.h  |  1 +
 read-cache.c | 25 +
 2 files changed, 26 insertions(+)

diff --git a/cache.h b/cache.h
index 455b772..2097105 100644
--- a/cache.h
+++ b/cache.h
@@ -127,6 +127,7 @@ struct cache_entry {
unsigned int ce_flags;
unsigned int ce_namelen;
unsigned char sha1[20];
+   uint32_t ce_stat_crc;
struct cache_entry *next; /* used by name_hash */
char name[FLEX_ARRAY]; /* more */
 };
diff --git a/read-cache.c b/read-cache.c
index ab716ed..9bfbb4f 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -108,6 +108,29 @@ int match_stat_data(const struct stat_data *sd, struct 
stat *st)
return changed;
 }
 
+static uint32_t calculate_stat_crc(struct cache_entry *ce)
+{
+   unsigned int ctimens = 0;
+   uint32_t stat, stat_crc;
+
+   stat = htonl(ce-ce_stat_data.sd_ctime.sec);
+   stat_crc = crc32(0, (Bytef*)stat, 4);
+#ifdef USE_NSEC
+   ctimens = ce-ce_stat_data.sd_ctime.nsec;
+#endif
+   stat = htonl(ctimens);
+   stat_crc = crc32(stat_crc, (Bytef*)stat, 4);
+   stat = htonl(ce-ce_stat_data.sd_ino);
+   stat_crc = crc32(stat_crc, (Bytef*)stat, 4);
+   stat = htonl(ce-ce_stat_data.sd_dev);
+   stat_crc = crc32(stat_crc, (Bytef*)stat, 4);
+   stat = htonl(ce-ce_stat_data.sd_uid);
+   stat_crc = crc32(stat_crc, (Bytef*)stat, 4);
+   stat = htonl(ce-ce_stat_data.sd_gid);
+   stat_crc = crc32(stat_crc, (Bytef*)stat, 4);
+   return stat_crc;
+}
+
 /*
  * This only updates the non-critical parts of the directory
  * cache, ie the parts that aren't tracked by GIT, and only used
@@ -122,6 +145,8 @@ void fill_stat_cache_info(struct cache_entry *ce, struct 
stat *st)
 
if (S_ISREG(st-st_mode))
ce_mark_uptodate(ce);
+
+   ce-ce_stat_crc = calculate_stat_crc(ce);
 }
 
 static int ce_compare_data(const struct cache_entry *ce, struct stat *st)
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 10/19] documentation: add documentation of the index-v5 file format

2013-07-12 Thread Thomas Gummerer
Add documentation of the index file format version 5 to
Documentation/technical.

Helped-by: Michael Haggerty mhag...@alum.mit.edu
Helped-by: Junio C Hamano gits...@pobox.com
Helped-by: Thomas Rast tr...@student.ethz.ch
Helped-by: Nguyen Thai Ngoc Duy pclo...@gmail.com
Helped-by: Robin Rosenberg robin.rosenb...@dewire.com
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 Documentation/technical/index-file-format-v5.txt | 296 +++
 1 file changed, 296 insertions(+)
 create mode 100644 Documentation/technical/index-file-format-v5.txt

diff --git a/Documentation/technical/index-file-format-v5.txt 
b/Documentation/technical/index-file-format-v5.txt
new file mode 100644
index 000..4213087
--- /dev/null
+++ b/Documentation/technical/index-file-format-v5.txt
@@ -0,0 +1,296 @@
+GIT index format
+
+
+== The git index
+
+   The git index file (.git/index) documents the status of the files
+ in the git staging area.
+
+   The staging area is used for preparing commits, merging, etc.
+
+== The git index file format
+
+   All binary numbers are in network byte order. Version 5 is described
+ here. The index file consists of various sections. They appear in
+ the following order in the file.
+
+   - header: the description of the index format, including its signature,
+ version and various other fields that are used internally.
+
+   - diroffsets (ndir entries of directory offset): A 4-byte offset
+   relative to the beginning of the direntries block (see below)
+   for each of the ndir directories in the index, sorted by pathname
+   (of the directory it's pointing to). [1]
+
+   - direntries (ndir entries of directory offset): A directory entry
+   for each of the ndir directories in the index, sorted by pathname
+   (see below). [2]
+
+   - fileoffsets (nfile entries of file offset): A 4-byte offset
+   relative to the beginning of the fileentries block (see below)
+   for each of the nfile files in the index. [1]
+
+   - fileentries (nfile entries of file entry): A file entry for
+   each of the nfile files in the index (see below).
+
+   - crdata: A number of entries for conflicted data/resolved conflicts
+   (see below).
+
+   - Extensions (Currently none, see below in the future)
+
+ Extensions are identified by signature. Optional extensions can
+ be ignored if GIT does not understand them.
+
+ GIT supports an arbitrary number of extensions, but currently none
+ is implemented. [3]
+
+ extsig (32-bits): extension signature. If the first byte is 'A'..'Z'
+ the extension is optional and can be ignored.
+
+ extsize (32-bits): size of the extension, excluding the header
+   (extsig, extsize, extchecksum).
+
+ extchecksum (32-bits): crc32 checksum of the extension signature
+   and size.
+
+- Extension data.
+
+== Header
+   sig (32-bits): Signature:
+ The signature is { 'D', 'I', 'R', 'C' } (stands for dircache)
+
+   vnr (32-bits): Version number:
+ The current supported versions are 2, 3, 4 and 5.
+
+   ndir (32-bits): number of directories in the index.
+
+   nfile (32-bits): number of file entries in the index.
+
+   fblockoffset (32-bits): offset to the file block, relative to the
+ beginning of the file.
+
+   - Offset to the extensions.
+
+ nextensions (32-bits): number of extensions.
+
+ extoffset (32-bits): offset to the extension. (Possibly none, as
+   many as indicated in the 4-byte number of extensions)
+
+   headercrc (32-bits): crc checksum including the header and the
+ offsets to the extensions.
+
+
+== Directory offsets (diroffsets)
+
+  diroffset (32-bits): offset to the directory relative to the beginning
+of the index file. There are ndir + 1 offsets in the diroffset table,
+the last of which points to the end of the last direntry. With this last
+entry, we are able to replace the strlen call when reading the directory
+name by calculating the length from diroffset[n+1]-diroffset[n]-61.  61 is
+the size of the directory data, which follows each directory + the
+crc sum + the NUL byte.
+
+  This part is needed for making the directory entries bisectable and
+thus allowing a binary search.
+
+== Directory entry (direntries)
+
+  Directory entries are sorted in lexicographic order by the name
+of their path starting with the root.
+
+  pathname (variable length, nul terminated): relative to top level
+directory (without the leading slash). '/' is used as path
+separator. A string of length 0 ('') indicates the root directory.
+The special path components "." and ".." (without quotes) are
+disallowed. The path also includes a trailing slash. [9]
+
+  foffset (32-bits): offset to the lexicographically first file in
+the file offsets (fileoffsets), relative to the beginning of
+the fileoffset block.
+
+  cr (32-bits): offset to conflicted/resolved data at the end of the
+index. 0

[PATCH v2 14/19] read-cache: read cache-tree in index-v5

2013-07-12 Thread Thomas Gummerer
Since the cache-tree data is saved as part of the directory data,
we already read it at the beginning of the index. The cache-tree
is only converted from this directory data.

The cache-tree data is arranged in a tree, with the children sorted by
pathlen at each node, while the ondisk format is sorted lexically.
So we have to rebuild this format from the on-disk directory list.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 cache-tree.c|   2 +-
 cache-tree.h|   6 
 read-cache-v5.c | 100 
 3 files changed, 107 insertions(+), 1 deletion(-)

diff --git a/cache-tree.c b/cache-tree.c
index 37e4d00..f4b0917 100644
--- a/cache-tree.c
+++ b/cache-tree.c
@@ -31,7 +31,7 @@ void cache_tree_free(struct cache_tree **it_p)
*it_p = NULL;
 }
 
-static int subtree_name_cmp(const char *one, int onelen,
+int subtree_name_cmp(const char *one, int onelen,
const char *two, int twolen)
 {
if (onelen  twolen)
diff --git a/cache-tree.h b/cache-tree.h
index 55d0f59..9aac493 100644
--- a/cache-tree.h
+++ b/cache-tree.h
@@ -21,10 +21,16 @@ struct cache_tree {
struct cache_tree_sub **down;
 };
 
+struct directory_queue {
+   struct directory_queue *down;
+   struct directory_entry *de;
+};
+
 struct cache_tree *cache_tree(void);
 void cache_tree_free(struct cache_tree **);
 void cache_tree_invalidate_path(struct cache_tree *, const char *);
 struct cache_tree_sub *cache_tree_sub(struct cache_tree *, const char *);
+int subtree_name_cmp(const char *, int, const char *, int);
 
 void cache_tree_write(struct strbuf *, struct cache_tree *root);
 struct cache_tree *cache_tree_read(const char *buffer, unsigned long size);
diff --git a/read-cache-v5.c b/read-cache-v5.c
index 853b97d..0b9c320 100644
--- a/read-cache-v5.c
+++ b/read-cache-v5.c
@@ -448,6 +448,103 @@ static int read_conflicts(struct conflict_entry **head,
return 0;
 }
 
+static struct cache_tree *convert_one(struct directory_queue *queue, int dirnr)
+{
+   int i, subtree_nr;
+   struct cache_tree *it;
+   struct directory_queue *down;
+
+   it = cache_tree();
+   it-entry_count = queue[dirnr].de-de_nentries;
+   subtree_nr = queue[dirnr].de-de_nsubtrees;
+   if (0 = it-entry_count)
+   hashcpy(it-sha1, queue[dirnr].de-sha1);
+
+   /*
+* Just a heuristic -- we do not add directories that often but
+* we do not want to have to extend it immediately when we do,
+* hence +2.
+*/
+   it-subtree_alloc = subtree_nr + 2;
+   it-down = xcalloc(it-subtree_alloc, sizeof(struct cache_tree_sub *));
+   down = queue[dirnr].down;
+   for (i = 0; i  subtree_nr; i++) {
+   struct cache_tree *sub;
+   struct cache_tree_sub *subtree;
+   char *buf, *name;
+
+   name = ;
+   buf = strtok(down[i].de-pathname, /);
+   while (buf) {
+   name = buf;
+   buf = strtok(NULL, /);
+   }
+   sub = convert_one(down, i);
+   if(!sub)
+   goto free_return;
+   subtree = cache_tree_sub(it, name);
+   subtree-cache_tree = sub;
+   }
+   if (subtree_nr != it-subtree_nr)
+   die(cache-tree: internal error);
+   return it;
+ free_return:
+   cache_tree_free(it);
+   return NULL;
+}
+
+static int compare_cache_tree_elements(const void *a, const void *b)
+{
+   const struct directory_entry *de1, *de2;
+
+   de1 = ((const struct directory_queue *)a)-de;
+   de2 = ((const struct directory_queue *)b)-de;
+   return subtree_name_cmp(de1-pathname, de1-de_pathlen,
+   de2-pathname, de2-de_pathlen);
+}
+
+static struct directory_entry *sort_directories(struct directory_entry *de,
+   struct directory_queue *queue)
+{
+   int i, nsubtrees;
+
+   nsubtrees = de-de_nsubtrees;
+   for (i = 0; i  nsubtrees; i++) {
+   struct directory_entry *new_de;
+   de = de-next;
+   new_de = xmalloc(directory_entry_size(de-de_pathlen));
+   memcpy(new_de, de, directory_entry_size(de-de_pathlen));
+   queue[i].de = new_de;
+   if (de-de_nsubtrees) {
+   queue[i].down = xcalloc(de-de_nsubtrees,
+   sizeof(struct directory_queue));
+   de = sort_directories(de,
+   queue[i].down);
+   }
+   }
+   qsort(queue, nsubtrees, sizeof(struct directory_queue),
+   compare_cache_tree_elements);
+   return de;
+}
+
+/*
+ * This function modifies the directory argument that is given to it.
+ * Don't use it if the directory entries are still needed after.
+ */
+static struct

[PATCH v2 13/19] read-cache: read resolve-undo data

2013-07-12 Thread Thomas Gummerer
Make git read the resolve-undo data from the index.

Since the resolve-undo data is joined with the conflicts in
the ondisk format of the index file version 5, conflicts and
resolved data are read at the same time, and the resolve-undo
data is then converted to the in-memory format.

Helped-by: Thomas Rast tr...@student.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 read-cache-v5.c | 39 +++
 1 file changed, 39 insertions(+)

diff --git a/read-cache-v5.c b/read-cache-v5.c
index 00112ea..853b97d 100644
--- a/read-cache-v5.c
+++ b/read-cache-v5.c
@@ -1,5 +1,6 @@
 #include cache.h
 #include read-cache.h
+#include string-list.h
 #include resolve-undo.h
 #include cache-tree.h
 #include dir.h
@@ -447,6 +448,43 @@ static int read_conflicts(struct conflict_entry **head,
return 0;
 }
 
+static void resolve_undo_convert_v5(struct index_state *istate,
+   struct conflict_entry *conflict)
+{
+   int i;
+
+   while (conflict) {
+   struct string_list_item *lost;
+   struct resolve_undo_info *ui;
+   struct conflict_part *cp;
+
+   if (conflict-entries 
+   (conflict-entries-flags  CONFLICT_CONFLICTED) != 0) {
+   conflict = conflict-next;
+   continue;
+   }
+   if (!istate-resolve_undo) {
+   istate-resolve_undo = xcalloc(1, sizeof(struct 
string_list));
+   istate-resolve_undo-strdup_strings = 1;
+   }
+
+   lost = string_list_insert(istate-resolve_undo, conflict-name);
+   if (!lost-util)
+   lost-util = xcalloc(1, sizeof(*ui));
+   ui = lost-util;
+
+   cp = conflict-entries;
+   for (i = 0; i  3; i++)
+   ui-mode[i] = 0;
+   while (cp) {
+   ui-mode[conflict_stage(cp) - 1] = cp-entry_mode;
+   hashcpy(ui-sha1[conflict_stage(cp) - 1], cp-sha1);
+   cp = cp-next;
+   }
+   conflict = conflict-next;
+   }
+}
+
 static int read_entries(struct index_state *istate, struct directory_entry 
**de,
unsigned int *entry_offset, void **mmap,
unsigned long mmap_size, unsigned int *nr,
@@ -460,6 +498,7 @@ static int read_entries(struct index_state *istate, struct 
directory_entry **de,
conflict_queue = NULL;
if (read_conflicts(conflict_queue, *de, mmap, mmap_size)  0)
return -1;
+   resolve_undo_convert_v5(istate, conflict_queue);
for (i = 0; i  (*de)-de_nfiles; i++) {
if (read_entry(ce,
   *de,
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 12/19] read-cache: read index-v5

2013-07-12 Thread Thomas Gummerer
Make git read the index file version 5 without complaining.

This version of the reader reads neither the cache-tree
nor the resolve-undo data, but doesn't choke on an index that
includes such data.

Helped-by: Junio C Hamano gits...@pobox.com
Helped-by: Nguyen Thai Ngoc Duy pclo...@gmail.com
Helped-by: Thomas Rast tr...@student.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 Makefile|   1 +
 cache.h |  75 ++-
 read-cache-v5.c | 638 
 read-cache.h|   1 +
 4 files changed, 714 insertions(+), 1 deletion(-)
 create mode 100644 read-cache-v5.c

diff --git a/Makefile b/Makefile
index 73369ae..80e35f5 100644
--- a/Makefile
+++ b/Makefile
@@ -856,6 +856,7 @@ LIB_OBJS += quote.o
 LIB_OBJS += reachable.o
 LIB_OBJS += read-cache.o
 LIB_OBJS += read-cache-v2.o
+LIB_OBJS += read-cache-v5.o
 LIB_OBJS += reflog-walk.o
 LIB_OBJS += refs.o
 LIB_OBJS += remote.o
diff --git a/cache.h b/cache.h
index 2097105..1e5cc77 100644
--- a/cache.h
+++ b/cache.h
@@ -99,7 +99,7 @@ unsigned long git_deflate_bound(git_zstream *, unsigned long);
 #define CACHE_SIGNATURE 0x44495243 /* DIRC */
 
 #define INDEX_FORMAT_LB 2
-#define INDEX_FORMAT_UB 4
+#define INDEX_FORMAT_UB 5
 
 /*
  * The cache_time is just the low 32 bits of the
@@ -121,6 +121,15 @@ struct stat_data {
unsigned int sd_size;
 };
 
+/*
+ * The *next pointer is used in read_entries_v5 for holding
+ * all the elements of a directory, and points to the next
+ * cache_entry in a directory.
+ *
+ * It is reset by the add_name_hash call in set_index_entry
+ * to set it to point to the next cache_entry in the
+ * correct in-memory format ordering.
+ */
 struct cache_entry {
struct stat_data ce_stat_data;
unsigned int ce_mode;
@@ -132,11 +141,59 @@ struct cache_entry {
char name[FLEX_ARRAY]; /* more */
 };
 
+struct directory_entry {
+   struct directory_entry *next;
+   struct directory_entry *next_hash;
+   struct cache_entry *ce;
+   struct cache_entry *ce_last;
+   struct conflict_entry *conflict;
+   struct conflict_entry *conflict_last;
+   unsigned int conflict_size;
+   unsigned int de_foffset;
+   unsigned int de_cr;
+   unsigned int de_ncr;
+   unsigned int de_nsubtrees;
+   unsigned int de_nfiles;
+   unsigned int de_nentries;
+   unsigned char sha1[20];
+   unsigned short de_flags;
+   unsigned int de_pathlen;
+   char pathname[FLEX_ARRAY];
+};
+
+struct conflict_part {
+   struct conflict_part *next;
+   unsigned short flags;
+   unsigned short entry_mode;
+   unsigned char sha1[20];
+};
+
+struct conflict_entry {
+   struct conflict_entry *next;
+   unsigned int nfileconflicts;
+   struct conflict_part *entries;
+   unsigned int namelen;
+   unsigned int pathlen;
+   char name[FLEX_ARRAY];
+};
+
+struct ondisk_conflict_part {
+   unsigned short flags;
+   unsigned short entry_mode;
+   unsigned char sha1[20];
+};
+
+#define CE_NAMEMASK  (0x0fff)
 #define CE_STAGEMASK (0x3000)
 #define CE_EXTENDED  (0x4000)
 #define CE_VALID (0x8000)
+#define CE_SMUDGED   (0x0400) /* index v5 only flag */
 #define CE_STAGESHIFT 12
 
+#define CONFLICT_CONFLICTED (0x8000)
+#define CONFLICT_STAGESHIFT 13
+#define CONFLICT_STAGEMASK (0x6000)
+
 /*
  * Range 0x in ce_flags is divided into
  * two parts: in-memory flags and on-disk ones.
@@ -173,6 +230,18 @@ struct cache_entry {
 #define CE_EXTENDED_FLAGS (CE_INTENT_TO_ADD | CE_SKIP_WORKTREE)
 
 /*
+ * Representation of the extended on-disk flags in the v5 format.
+ * They must not collide with the ordinary on-disk flags, and need to
+ * fit in 16 bits.  Note however that v5 does not save the name
+ * length.
+ */
+#define CE_INTENT_TO_ADD_V5  (0x4000)
+#define CE_SKIP_WORKTREE_V5  (0x0800)
+#if (CE_VALID|CE_STAGEMASK)  (CE_INTENTTOADD_V5|CE_SKIPWORKTREE_V5)
+#error v5 on-disk flags collide with ordinary on-disk flags
+#endif
+
+/*
  * Safeguard to avoid saving wrong flags:
  *  - CE_EXTENDED2 won't get saved until its semantic is known
  *  - Bits in 0x have been saved in ce_flags already
@@ -211,6 +280,8 @@ static inline unsigned create_ce_flags(unsigned stage)
 #define ce_skip_worktree(ce) ((ce)-ce_flags  CE_SKIP_WORKTREE)
 #define ce_mark_uptodate(ce) ((ce)-ce_flags |= CE_UPTODATE)
 
+#define conflict_stage(c) ((CONFLICT_STAGEMASK  (c)-flags)  
CONFLICT_STAGESHIFT)
+
 #define ce_permissions(mode) (((mode)  0100) ? 0755 : 0644)
 static inline unsigned int create_ce_mode(unsigned int mode)
 {
@@ -258,6 +329,8 @@ static inline unsigned int canon_mode(unsigned int mode)
 }
 
 #define cache_entry_size(len) (offsetof(struct cache_entry,name) + (len) + 1)
+#define directory_entry_size(len) (offsetof(struct directory_entry,pathname) + 
(len) + 1)
+#define conflict_entry_size(len) (offsetof(struct conflict_entry,name) + (len) 
+ 1)
 
 /*
  * Options by which the index

[PATCH v2 16/19] read-cache: write index-v5 cache-tree data

2013-07-12 Thread Thomas Gummerer
Write the cache-tree data for the index version 5 file format. The
in-memory cache-tree data is converted to the ondisk format, by adding
it to the directory entries, that were compiled from the cache-entries
in the step before.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 read-cache-v5.c | 53 +
 1 file changed, 53 insertions(+)

diff --git a/read-cache-v5.c b/read-cache-v5.c
index 33667d7..cd819b4 100644
--- a/read-cache-v5.c
+++ b/read-cache-v5.c
@@ -941,6 +941,57 @@ static struct conflict_entry 
*create_conflict_entry_from_ce(struct cache_entry *
return create_new_conflict(ce-name, ce_namelen(ce), pathlen);
 }
 
+static void convert_one_to_ondisk_v5(struct hash_table *table, struct 
cache_tree *it,
+   const char *path, int pathlen, uint32_t crc)
+{
+   int i;
+   struct directory_entry *found, *search;
+
+   crc = crc32(crc, (Bytef*)path, pathlen);
+   found = lookup_hash(crc, table);
+   search = found;
+   while (search  strcmp(path, search-pathname + search-de_pathlen - 
strlen(path)) != 0)
+   search = search-next_hash;
+   if (!search)
+   return;
+   /*
+* The number of subtrees is already calculated by
+* compile_directory_data, therefore we only need to
+* add the entry_count
+*/
+   search-de_nentries = it-entry_count;
+   if (0 = it-entry_count)
+   hashcpy(search-sha1, it-sha1);
+   if (strcmp(path, ) != 0)
+   crc = crc32(crc, (Bytef*)/, 1);
+
+#if DEBUG
+   if (0 = it-entry_count)
+   fprintf(stderr, cache-tree %.*s (%d ent, %d subtree) %s\n,
+   pathlen, path, it-entry_count, it-subtree_nr,
+   sha1_to_hex(it-sha1));
+   else
+   fprintf(stderr, cache-tree %.*s (%d subtree) invalid\n,
+   pathlen, path, it-subtree_nr);
+#endif
+
+   for (i = 0; i  it-subtree_nr; i++) {
+   struct cache_tree_sub *down = it-down[i];
+   if (i) {
+   struct cache_tree_sub *prev = it-down[i-1];
+   if (subtree_name_cmp(down-name, down-namelen,
+prev-name, prev-namelen) = 0)
+   die(fatal - unsorted cache subtree);
+   }
+   convert_one_to_ondisk_v5(table, down-cache_tree, down-name, 
down-namelen, crc);
+   }
+}
+
+static void cache_tree_to_ondisk_v5(struct hash_table *table, struct 
cache_tree *root)
+{
+   convert_one_to_ondisk_v5(table, root, , 0, 0);
+}
+
 static struct directory_entry *compile_directory_data(struct index_state 
*istate,
int nfile,
unsigned int *ndir,
@@ -1046,6 +1097,8 @@ static struct directory_entry 
*compile_directory_data(struct index_state *istate
previous_entry-next = no_subtrees;
}
}
+   if (istate-cache_tree)
+   cache_tree_to_ondisk_v5(table, istate-cache_tree);
return de;
 }
 
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 18/19] update-index.c: rewrite index when index-version is given

2013-07-12 Thread Thomas Gummerer
Make update-index always rewrite the index when an index-version
is given, even if the index already has the right version.
This option is used for performance testing the writer and
reader.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 builtin/update-index.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/builtin/update-index.c b/builtin/update-index.c
index 4c6e3a6..7e723c0 100644
--- a/builtin/update-index.c
+++ b/builtin/update-index.c
@@ -6,6 +6,7 @@
 #include cache.h
 #include quote.h
 #include cache-tree.h
+#include read-cache.h
 #include tree-walk.h
 #include builtin.h
 #include refs.h
@@ -863,8 +864,7 @@ int cmd_update_index(int argc, const char **argv, const 
char *prefix)
preferred_index_format,
INDEX_FORMAT_LB, INDEX_FORMAT_UB);
 
-   if (the_index.version != preferred_index_format)
-   active_cache_changed = 1;
+   active_cache_changed = 1;
the_index.version = preferred_index_format;
}
 
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 17/19] read-cache: write resolve-undo data for index-v5

2013-07-12 Thread Thomas Gummerer
Make git write the resolve-undo data to the index.

Since the resolve-undo data is joined with the conflicts in
the ondisk format of the index file version 5, conflicts and
resolved data are written at the same time, and the in-memory
resolve-undo data is converted to the ondisk format first.

Helped-by: Thomas Rast tr...@student.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 read-cache-v5.c | 94 +
 1 file changed, 94 insertions(+)

diff --git a/read-cache-v5.c b/read-cache-v5.c
index cd819b4..093ee1a 100644
--- a/read-cache-v5.c
+++ b/read-cache-v5.c
@@ -992,6 +992,99 @@ static void cache_tree_to_ondisk_v5(struct hash_table 
*table, struct cache_tree
convert_one_to_ondisk_v5(table, root, , 0, 0);
 }
 
+static void resolve_undo_to_ondisk_v5(struct hash_table *table,
+ struct string_list *resolve_undo,
+ unsigned int *ndir, int *total_dir_len,
+ struct directory_entry *de)
+{
+   struct string_list_item *item;
+   struct directory_entry *search;
+
+   if (!resolve_undo)
+   return;
+   for_each_string_list_item(item, resolve_undo) {
+   struct conflict_entry *conflict_entry;
+   struct resolve_undo_info *ui = item-util;
+   char *super;
+   int i, dir_len, len;
+   uint32_t crc;
+   struct directory_entry *found, *current, *new_tree;
+
+   if (!ui)
+   continue;
+
+   super = super_directory(item-string);
+   if (!super)
+   dir_len = 0;
+   else
+   dir_len = strlen(super);
+   crc = crc32(0, (Bytef*)super, dir_len);
+   found = lookup_hash(crc, table);
+   current = NULL;
+   new_tree = NULL;
+
+   while (!found) {
+   struct directory_entry *new;
+
+   new = init_directory_entry(super, dir_len);
+   if (!current)
+   current = new;
+   insert_directory_entry(new, table, total_dir_len, ndir, 
crc);
+   if (new_tree != NULL)
+   new-de_nsubtrees = 1;
+   new-next = new_tree;
+   new_tree = new;
+   super = super_directory(super);
+   if (!super)
+   dir_len = 0;
+   else
+   dir_len = strlen(super);
+   crc = crc32(0, (Bytef*)super, dir_len);
+   found = lookup_hash(crc, table);
+   }
+   search = found;
+   while (search-next_hash  strcmp(super, search-pathname) != 
0)
+   search = search-next_hash;
+   if (search  !current)
+   current = search;
+   if (!search  !current)
+   current = new_tree;
+   if (!super  new_tree) {
+   new_tree-next = de-next;
+   de-next = new_tree;
+   de-de_nsubtrees++;
+   } else if (new_tree) {
+   struct directory_entry *temp;
+
+   search = de-next;
+   while (strcmp(super, search-pathname))
+   search = search-next;
+   temp = new_tree;
+   while (temp-next)
+   temp = temp-next;
+   search-de_nsubtrees++;
+   temp-next = search-next;
+   search-next = new_tree;
+   }
+
+   len = strlen(item-string);
+   conflict_entry = create_new_conflict(item-string, len, 
current-de_pathlen);
+   add_conflict_to_directory_entry(current, conflict_entry);
+   for (i = 0; i  3; i++) {
+   if (ui-mode[i]) {
+   struct conflict_part *cp;
+
+   cp = xmalloc(sizeof(struct conflict_part));
+   cp-flags = (i + 1)  CONFLICT_STAGESHIFT;
+   cp-entry_mode = ui-mode[i];
+   cp-next = NULL;
+   hashcpy(cp-sha1, ui-sha1[i]);
+   add_part_to_conflict_entry(current, 
conflict_entry, cp);
+   }
+   }
+   }
+}
+
 static struct directory_entry *compile_directory_data(struct index_state 
*istate,
int nfile,
unsigned int *ndir,
@@ -1099,6 +1192,7 @@ static struct directory_entry 
*compile_directory_data(struct

[PATCH v2 15/19] read-cache: write index-v5

2013-07-12 Thread Thomas Gummerer
Write the index version 5 file format to disk. This version doesn't
write the cache-tree data and resolve-undo data to the file.

The main work is done when filtering out the directories from the
current in-memory format, where in the same turn also the conflicts
and the file data is calculated.

Helped-by: Nguyen Thai Ngoc Duy pclo...@gmail.com
Helped-by: Thomas Rast tr...@student.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 cache.h |   8 +
 read-cache-v5.c | 594 +++-
 read-cache.c|  11 +-
 read-cache.h|   1 +
 4 files changed, 611 insertions(+), 3 deletions(-)

diff --git a/cache.h b/cache.h
index 1e5cc77..f3685f8 100644
--- a/cache.h
+++ b/cache.h
@@ -565,6 +565,7 @@ extern int unmerged_index(const struct index_state *);
 extern int verify_path(const char *path);
 extern struct cache_entry *index_name_exists(struct index_state *istate, const 
char *name, int namelen, int igncase);
 extern int index_name_pos(const struct index_state *, const char *name, int 
namelen);
+extern struct directory_entry *init_directory_entry(char *pathname, int len);
 #define ADD_CACHE_OK_TO_ADD 1  /* Ok to add */
 #define ADD_CACHE_OK_TO_REPLACE 2  /* Ok to replace file/directory */
 #define ADD_CACHE_SKIP_DFCHECK 4   /* Ok to skip DF conflict checks */
@@ -1363,6 +1364,13 @@ static inline ssize_t write_str_in_full(int fd, const 
char *str)
return write_in_full(fd, str, strlen(str));
 }
 
+/* index-v5 helper functions */
+extern char *super_directory(const char *filename);
+extern void insert_directory_entry(struct directory_entry *, struct hash_table 
*, int *, unsigned int *, uint32_t);
+extern void add_conflict_to_directory_entry(struct directory_entry *, struct 
conflict_entry *);
+extern void add_part_to_conflict_entry(struct directory_entry *, struct 
conflict_entry *, struct conflict_part *);
+extern struct conflict_entry *create_new_conflict(char *, int, int);
+
 /* pager.c */
 extern void setup_pager(void);
 extern const char *pager_program;
diff --git a/read-cache-v5.c b/read-cache-v5.c
index 0b9c320..33667d7 100644
--- a/read-cache-v5.c
+++ b/read-cache-v5.c
@@ -769,9 +769,601 @@ static int read_index_v5(struct index_state *istate, void 
*mmap,
return 0;
 }
 
+#define WRITE_BUFFER_SIZE 8192
+static unsigned char write_buffer[WRITE_BUFFER_SIZE];
+static unsigned long write_buffer_len;
+
+static int ce_write_flush(int fd)
+{
+   unsigned int buffered = write_buffer_len;
+   if (buffered) {
+   if (write_in_full(fd, write_buffer, buffered) != buffered)
+   return -1;
+   write_buffer_len = 0;
+   }
+   return 0;
+}
+
+static int ce_write(uint32_t *crc, int fd, void *data, unsigned int len)
+{
+   if (crc)
+   *crc = crc32(*crc, (Bytef*)data, len);
+   while (len) {
+   unsigned int buffered = write_buffer_len;
+   unsigned int partial = WRITE_BUFFER_SIZE - buffered;
+   if (partial  len)
+   partial = len;
+   memcpy(write_buffer + buffered, data, partial);
+   buffered += partial;
+   if (buffered == WRITE_BUFFER_SIZE) {
+   write_buffer_len = buffered;
+   if (ce_write_flush(fd))
+   return -1;
+   buffered = 0;
+   }
+   write_buffer_len = buffered;
+   len -= partial;
+   data = (char *) data + partial;
+   }
+   return 0;
+}
+
+static int ce_flush(int fd)
+{
+   unsigned int left = write_buffer_len;
+
+   if (left)
+   write_buffer_len = 0;
+
+   if (write_in_full(fd, write_buffer, left) != left)
+   return -1;
+
+   return 0;
+}
+
+static void ce_smudge_racily_clean_entry(struct cache_entry *ce)
+{
+   /*
+* This method shall only be called if the timestamp of ce
+* is racy (check with is_racy_timestamp). If the timestamp
+* is racy, the writer will set the CE_SMUDGED flag.
+*
+* The reader (match_stat_basic) will then take care
+* of checking if the entry is really changed or not, by
+* taking into account the size and the stat_crc and if
+* that hasn't changed checking the sha1.
+*/
+   ce-ce_flags |= CE_SMUDGED;
+}
+
+char *super_directory(const char *filename)
+{
+   char *slash;
+
+   slash = strrchr(filename, '/');
+   if (slash)
+   return xmemdupz(filename, slash-filename);
+   return NULL;
+}
+
+struct directory_entry *init_directory_entry(char *pathname, int len)
+{
+   struct directory_entry *de = xmalloc(directory_entry_size(len));
+
+   memcpy(de-pathname, pathname, len);
+   de-pathname[len] = '\0';
+   de-de_flags  = 0;
+   de-de_foffset= 0;
+   de-de_cr = 0;
+   de-de_ncr

[PATCH v2 19/19] p0003-index.sh: add perf test for the index formats

2013-07-12 Thread Thomas Gummerer
From: Thomas Rast tr...@inf.ethz.ch

Add a performance test for index version [23]/4/5 by using
git update-index --index-version=x, thus testing both the reader
and the writer speed of all index formats.

Signed-off-by: Thomas Rast tr...@inf.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 t/perf/p0003-index.sh | 59 +++
 1 file changed, 59 insertions(+)
 create mode 100755 t/perf/p0003-index.sh

diff --git a/t/perf/p0003-index.sh b/t/perf/p0003-index.sh
new file mode 100755
index 000..3e02868
--- /dev/null
+++ b/t/perf/p0003-index.sh
@@ -0,0 +1,59 @@
+#!/bin/sh
+
+test_description=Tests index versions [23]/4/5
+
+. ./perf-lib.sh
+
+test_perf_large_repo
+
+test_expect_success convert to v3 
+   git update-index --index-version=2
+
+
+test_perf v[23]: update-index 
+   git update-index --index-version=2 /dev/null
+
+
+subdir=$(git ls-files | sed 's#/[^/]*$##' | grep -v '^$' | uniq | tail -n 30 | 
head -1)
+
+test_perf v[23]: grep nonexistent -- subdir 
+   test_must_fail git grep nonexistent -- $subdir /dev/null
+
+
+test_perf v[23]: ls-files -- subdir 
+   git ls-files $subdir /dev/null
+
+
+test_expect_success convert to v4 
+   git update-index --index-version=4
+
+
+test_perf v4: update-index 
+   git update-index --index-version=4 /dev/null
+
+
+test_perf v4: grep nonexistent -- subdir 
+   test_must_fail git grep nonexistent -- $subdir /dev/null
+
+
+test_perf v4: ls-files -- subdir 
+   git ls-files $subdir /dev/null
+
+
+test_expect_success convert to v5 
+   git update-index --index-version=5
+
+
+test_perf v5: update-index 
+   git update-index --index-version=5 /dev/null
+
+
+test_perf v5: grep nonexistent -- subdir 
+   test_must_fail git grep nonexistent -- $subdir /dev/null
+
+
+test_perf v5: ls-files -- subdir 
+   git ls-files $subdir /dev/null
+
+
+test_done
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5.5/22] Add documentation for the index api

2013-07-11 Thread Thomas Gummerer
Duy Nguyen pclo...@gmail.com writes:

 On Wed, Jul 10, 2013 at 3:10 AM, Thomas Gummerer t.gumme...@gmail.com wrote:
 If you happen to know that certain entries match the given pathspec,
 you could help the caller avoid match_pathspec'ing again by setting a bit
 in ce_flags.

 I currently don't know which entries do match the pathspec from just
 reading the index file, additional calls would be needed.  I don't think
 that would be worth the overhead.

 Yeah I now see that you select what to load in v5 with the adjusted
 pathspec, not the input pathspec. Originally I thought you match the
 input pathspec against every file entry in the index :P Your adjusted
 pathspec looks like what common_prefix is for. It's cheaper than
 creating adjusted_pathspec from match_pathspec and reduces loading in
 major cases, where glob is not used.

 Still, creating an adjusted pathspec this way looks iffy. You need to
 understand pathspec in order to strip the filename part out to match
 the directory match only. An alternative is use
 tree_entry_interesting. It goes along well with tree traversal and can
 be used to match directories with original pathspec. Once you see it
 matches an entry in a directory, you could skip matching the rest of
 the files and load the whole directory. read_index_filtered_v5 and
 read_entries may need some tweaking though. I'll try it and post a
 patch later if I succeed.

Hrm, I played around a bit with this idea, but I couldn't figure out how
to make it work.  For it to work we would still have to load some
entries in a directory at least?  Or is there a way to match the
directories, which I just haven't figured out yet?

 To know which entry exists in the index and which is
 new, use another flag. Most reader code won't change if we do it this
 way, all match_pathspec() remain where they are.

 Hrm you mean to know which cache entries are added (or changed) in the
 in-memory index and will have to be written later?  I'm not sure I
 understand correctly what you mean here.

 Oh.. The "to know.." sentence was nonsense. We probably don't need to
 know. We may track changed entries for partial writing, but let's
 leave that out for now.

Ok, makes sense.

 +`index_change_filter_opts(opts)`::
 +   This function again has a slightly different functionality for
 +   index-v2 and index-v5.
 +
 +   For index-v2 it simply changes the filter_opts, so
 +   for_each_index_entry uses the changed index_opts, to iterate
 +   over a different set of cache entries.
 +
 +   For index-v5 it refreshes the index if the filter_opts have
 +   changed and sets the new filter_opts in the index state, again
 +   to iterate over a different set of cache entries as with
 +   index-v2.
 +
 +   This has some optimization potential, in the case that the
 +   opts get stricter (less of the index should be read) it
 +   doesn't have to reload anything, but currently does.

 The only use case I see so far is converting a partial index_state
 back to a full one. Apart from doing so in order to write the new
 index, I think some operation (like rename tracking in diff or
 unpack-trees) may expect full index. I think we should support that. I
 doubt we need to change pathspec to something different than the one
 we used to load the index. When a user passes a pathspec to a command,
 the user expects the command to operate on that set only, not outside.

 One application was in ls-files, where we strip the trailing slash from
 the pathspecs for submodules.  But when we let the caller filter the
 rest out it's not needed anymore.  We load all entries without the
 trailing slash anyway.

 That submodule trailing slash stripping code will be moved away soon
 (I've been working on it for some time now). There's similar code in
 pathspec.c. I hope by the time this series becomes a candidate for
 'next', that pathspec manipulation is already gone. For
 strip_trailing_slash_from_submodules, peeking in index file for a few
 entries is probably ok. For check_path_for_gitlink, full index is
 loaded until we figure out a clever way.

Ah great, for now I'll just not use the for_each_index_entry function in
ls-files, and then change the code later once the stripping code is
moved away.

 Some thoughts about the writing api.

 I think we should avoid automatically converting partial index into a
 full one before writing. Push that back to the caller and die() when
 asked to update partial index. They know at what point the index may
 be updated and even what part of it may be updated. I think all
 commands fall into two categories, tree-wide updates (merge,
 checkout...) and limited by the user-given pathspec. what part to be
 updated is not so hard to determine.

 Hrm this is only true if index entries are added or removed, not if they
 are only changed.  If they are only changed we can write a partially
 read index once we have partial writing.

 Yep. We can detect if changes are updates only

Re: [PATCH 13/22] documentation: add documentation of the index-v5 file format

2013-07-11 Thread Thomas Gummerer
Duy Nguyen pclo...@gmail.com writes:

 On Sun, Jul 7, 2013 at 3:11 PM, Thomas Gummerer t.gumme...@gmail.com wrote:
 +== File entry (fileentries)
 +
 +  File entries are sorted in ascending order on the name field, after the
 +  respective offset given by the directory entries. All file names are
 +  prefix compressed, meaning the file name is relative to the directory.
 +
 +  filename (variable length, nul terminated). The exact encoding is
 +undefined, but the filename cannot contain a NUL byte (iow, the same
 +encoding as a UNIX pathname).
 +
 +  flags (16-bits): 'flags' field split into (high to low bits)
 +
 +assumevalid (1-bit): assume-valid flag
 +
 +intenttoadd (1-bit): intent-to-add flag, used by git add -N.
 +  Extended flag in index v3.
 +
 +stage (2-bit): stage of the file during merge
 +
 +skipworktree (1-bit): skip-worktree flag, used by sparse checkout.
 +  Extended flag in index v3.
 +
 +smudged (1-bit): indicates if the file is racily smudged.
 +
 +10-bit unused, must be zero [6]
 +
 +  mode (16-bits): file mode, split into (high to low bits)
 +
 +objtype (4-bits): object type
 +  valid values in binary are 1000 (regular file), 1010 (symbolic
 +  link) and 1110 (gitlink)
 +
 +3-bit unused
 +
 +permission (9-bits): unix permission. Only 0755 and 0644 are valid
 +  for regular files. Symbolic links and gitlinks have value 0 in
 +  this field.
 +
 +  mtimes (32-bits): mtime seconds, the last time a file's data changed
 +this is stat(2) data
 +
 +  mtimens (32-bits): mtime nanosecond fractions
 +this is stat(2) data
 +
 +  file size (32-bits): The on-disk size, truncated to 32-bit.
 +this is stat(2) data
 +
 +  statcrc (32-bits): crc32 checksum over ctime seconds, ctime
 +nanoseconds, ino, dev, uid, gid (All stat(2) data
 +except mtime and file size). If the statcrc is 0 it will
 +be ignored. [7]
 +
 +  objhash (160-bits): SHA-1 for the represented object
 +
 +  entrycrc (32-bits): crc32 checksum for the file entry. The crc code
 +includes the offset to the offset to the file, relative to the
 +beginning of the file.

 Question about the possibility of updating index file directly. If git
 updates a few fields of an entry (but not entrycrc yet) and crashes,
 the entry would become corrupt because its entrycrc does not match the
 content. What do we do? Do we need to save a copy of the entry
 somewhere in the index file (maybe in the conflict data section), so
 that the reader can recover the index? Losing the index because of
 bugs is big deal in my opinion. pre-v5 never faces this because we
 keep the original copy til the end.

 Maybe entrycrc should not cover stat fields and statcrc. It would make
 refreshing safer. If the above happens during refresh, only statcrc is
 corrupt and we can just refresh the entry. entrycrc still says the
 other fields are good (and they are).

The original idea was to change the lock-file for partial writing to
make it work for this case.  The exact structure of the file still has
to be defined, but generally it would be done in the following steps:

  1. Write the changed entry to the lock-file
  2. Change the entry in the index
  3. If we succeed delete the lock-file (commit the transaction)

If git crashes, and leaves the index corrupted, we can recover the
information from the lock-file and write the new information to the
index file and then delete the lock-file.
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 13/22] documentation: add documentation of the index-v5 file format

2013-07-11 Thread Thomas Gummerer
Duy Nguyen pclo...@gmail.com writes:

 On Thu, Jul 11, 2013 at 6:39 PM, Thomas Gummerer t.gumme...@gmail.com wrote:
 Question about the possibility of updating index file directly. If git
 updates a few fields of an entry (but not entrycrc yet) and crashes,
 the entry would become corrupt because its entrycrc does not match the
 content. What do we do? Do we need to save a copy of the entry
 somewhere in the index file (maybe in the conflict data section), so
 that the reader can recover the index? Losing the index because of
 bugs is big deal in my opinion. pre-v5 never faces this because we
 keep the original copy til the end.

 Maybe entrycrc should not cover stat fields and statcrc. It would make
 refreshing safer. If the above happens during refresh, only statcrc is
 corrupt and we can just refresh the entry. entrycrc still says the
 other fields are good (and they are).

 The original idea was to change the lock-file for partial writing to
 make it work for this case.  The exact structure of the file still has
 to be defined, but generally it would be done in the following steps:

   1. Write the changed entry to the lock-file
   2. Change the entry in the index
   3. If we succeed delete the lock-file (commit the transaction)

 If git crashes, and leaves the index corrupted, we can recover the
 information from the lock-file and write the new information to the
 index file and then delete the lock-file.

 Ah makes sense. Still concerned about refreshing though. Updated files
 are usually few while refreshed files could be a lot more, increasing
 the cost at #1.

Any idea how common refreshing a big part of the cache is?  If it's not
too common, I'd prefer to leave the stat data and stat crc in the
entrycrc, as we can inform the user if something is wrong with the
index, be it from git failing, or from disk corruption.

On the other hand if refresh_cache is relatively common and usually
changes a big part of the index we should leave them out, as git can
still run correctly with incorrect stat data, but takes a little longer,
because it may have to check the file contents.  That will be trade-off
to make here.
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5.5/22] Add documentation for the index api

2013-07-09 Thread Thomas Gummerer
Duy Nguyen pclo...@gmail.com writes:

 On Tue, Jul 9, 2013 at 3:54 AM, Thomas Gummerer t.gumme...@gmail.com wrote:
 As promised, a draft for a documentation for the index api as it is in
 this series.

 First of all, it may be a good idea to acknowledge
 index_state-cache[] as part of the API for now. Not hiding it
 simplifies a few things (no need for new next_ce field, no worries
 about rewinding in unpack-trees..). Supporting partial loading (and
 maybe partial update in some cases) with this API and
 index_state-cache[] part of the API are good enough for me. We can do
 another tree-based API or something update later when it's formed (I
 looked at your index-v5api branch but I don't think a tree-based api
 was there, my concern is how much extra overhead pre-v5 has to pay to use
 tree-based api).

Yes, I think you're right, that simplifies everything a lot, while we
still can have partial loading.  Hiding index_state->cache[] was mainly
thought for future changes for the in-memory format, but I think that
will not be happening for a while.

 +`read_index_filtered(opts)`::
 +   This method behaves differently for index-v2 and index-v5.
 +
 +   For index-v2 it simply reads the whole index as read_index()
 +   does, so we are sure we don't have to reload anything if the
 +   user wants a different filter.  It also sets the filter_opts
 +   in the index_state, which is used to limit the results when
 +   iterating over the index with for_each_index_entry().
 +
 +   The whole index is read to avoid the need to eventually
 +   re-read the index later, because the performance is no
 +   different when reading it partially.
 +
 +   For index-v5 it creates an adjusted_pathspec to filter the
 +   reading.  First all the directory entries are read and then
 +   the cache_entries in the directories that match the adjusted
 +   pathspec are read.  The filter_opts in the index_state are set
 +   to filter out the rest of the cache_entries that are matched
 +   by the adjusted pathspec but not by the pathspec given.  The
 +   rest of the index entries are filtered out when iterating over
 +   the cache with for_each_index_entry().

 You can state in the API that the input pathspec is used as a hint to
 load only a portion of the index. read_index_filtered may load _more_
 than necessary. It's the caller's responsibility to verify again which
 is matched and which is not. That's how read_directory is done. I
 think it gives you more liberty in loading strategy. It's already true
 for v2 because full index is loaded regardless of the given pathspec.
 In the end, we have a linear list (from public view) of cache entries,
 accessible via index_state-cache[].

Yes, and it's also partly true for index-v5, as the full content of a
directory is loaded even if only some files in it match the pathspec
that's given.

 If you happen to know that certain entries match the given pathspec,
 you could help the caller avoid match_pathspec'ing again by setting a bit
 in ce_flags.

I currently don't know which entries do match the pathspec from just
reading the index file, additional calls would be needed.  I don't think
that would be worth the overhead.

 To know which entry exists in the index and which is
 new, use another flag. Most reader code won't change if we do it this
 way, all match_pathspec() remain where they are.

Hrm you mean to know which cache entries are added (or changed) in the
in-memory index and will have to be written later?  I'm not sure I
understand correctly what you mean here.

 +`for_each_index_entry(fn, cb_data)`::
 +   Iterates over all cache_entries in the index filtered by
 +   filter_opts in the index_state.  For each cache entry fn is
 +   executed with cb_data as callback data.  From within the loop
 +   do `return 0` to continue, or `return 1` to break the loop.

 Because we don't attempt to hide index_state-cache[], this one may be
 for convenience, the user is not required to convert to it. Actually I
 think this may be slower because of the cost of calling function
 pointer.

Yes right, I think you're right.  In fact I just tested it, and it's
slightly slower.

I still think it would make sense to keep it around, for the callers
that want the cache filtered exactly by the filter_opts, for convenience
as you said.

 +`next_index_entry(ce)`::
 +   Returns the cache_entry that follows after ce

 next_ce field and this method may be gone too, just access 
 index_state-cache[]

Yes, this makes no sense when we're not hiding index_state->cache[].
The same goes for the get_index_entry_by_name function, which is
essentially the same as using index_name_pos and then getting the cache
entry from index_state->cache[].

 +`index_change_filter_opts(opts)`::
 +   This function again has a slightly different functionality for
 +   index-v2 and index-v5.
 +
 +   For index-v2 it simply changes the filter_opts, so

Re: [PATCH 05/22] read-cache: add index reading api

2013-07-09 Thread Thomas Gummerer
Junio C Hamano gits...@pobox.com writes:

 Thomas Gummerer t.gumme...@gmail.com writes:

 The reader often needs to rewind the read-pointer partially while
 walking the index (e.g. next_cache_entry() in unpack-trees.c and how
 the o-cache_bottom position is used throughout the subsystem).  I
 am not sure if this singly-linked list is a good way to go.

 I'm not very familiar with the unpack-trees code, but from a quick look
 the pointer (or position in the cache) is always only moved forward.

 I am more worried about o-cache_bottom processing, where it
 currently is an index into an array.

 With your ce-next_in_list_of_read_entries change, a natural rewrite
 would be to point at the ce with o-cache_bottom, but then that
 would mean you cannot in-place replace the entries like we used to
 be able to in an array based implementation.

 But your series does not seem to touch unpack-trees yet, so I may be
 worried too much before it becomes necessary.

Yes, you're right, as Duy mentioned in the other email I just responded
to it makes sense to keep the index around for now.

I looked at the unpack-trees code a bit, and adding a new api and hiding
index_state->cache[] will probably be a bit harder to do than I
originally thought, so it's best to keep that around for now, as we're
still able to get the benefits from partial loading even if it's not
hidden.
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 05/22] read-cache: add index reading api

2013-07-08 Thread Thomas Gummerer
Duy Nguyen pclo...@gmail.com writes:

 On Sun, Jul 7, 2013 at 3:11 PM, Thomas Gummerer t.gumme...@gmail.com wrote:
 +/*
 + * Options by which the index should be filtered when read partially.
 + *
 + * pathspec: The pathspec which the index entries have to match
 + * seen: Used to return the seen parameter from match_pathspec()
 + * max_prefix, max_prefix_len: These variables are set to the longest
 + * common prefix, the length of the longest common prefix of the
 + * given pathspec
 + *
 + * read_staged: used to indicate if the conflicted entries (entries
 + * with a stage) should be included
 + * read_cache_tree: used to indicate if the cache-tree should be read
 + * read_resolve_undo: used to indicate if the resolve undo data should
 + * be read
 + */
 +struct filter_opts {
 +   const char **pathspec;
 +   char *seen;
 +   char *max_prefix;
 +   int max_prefix_len;
 +
 +   int read_staged;
 +   int read_cache_tree;
 +   int read_resolve_undo;
 +};
 +
  struct index_state {
 struct cache_entry **cache;
 unsigned int version;
 @@ -270,6 +297,8 @@ struct index_state {
 struct hash_table name_hash;
 struct hash_table dir_hash;
 struct index_ops *ops;
 +   struct internal_ops *internal_ops;
 +   struct filter_opts *filter_opts;
  };

 ...

 -/* remember to discard_cache() before reading a different cache! */
 -int read_index_from(struct index_state *istate, const char *path)
 +
 +int read_index_filtered_from(struct index_state *istate, const char *path,
 +struct filter_opts *opts)
  {
 int fd, err, i;
 struct stat st_old, st_new;
 @@ -1337,7 +1425,7 @@ int read_index_from(struct index_state *istate, const 
 char *path)
 if (istate-ops-verify_hdr(mmap, mmap_size)  0)
 err = 1;

 -   if (istate-ops-read_index(istate, mmap, mmap_size)  0)
 +   if (istate-ops-read_index(istate, mmap, mmap_size, opts)  
 0)
 err = 1;
 istate-timestamp.sec = st_old.st_mtime;
 istate-timestamp.nsec = ST_MTIME_NSEC(st_old);
 @@ -1345,6 +1433,7 @@ int read_index_from(struct index_state *istate, const 
 char *path)
 die_errno(cannot stat the open index);

 munmap(mmap, mmap_size);
 +   istate-filter_opts = opts;
 if (!index_changed(st_old, st_new)  !err)
 return istate-cache_nr;
 }

 Putting filter_opts in index_state feels like a bad design. Iterator
 information should be separated from the iterated object, so that two
 callers can walk through the same index without stepping on each other
 (I'm not talking about multithreading, a caller may walk a bit, then
 the other caller starts walking, then the former caller resumes
 walking again in a call chain).

Yes, you're right.  We need the filter_opts to see what part of the
index has been loaded [1] and which part has been skipped, but it
shouldn't be used for filtering in the for_each_index_entry function.

I think there should be two versions of the for_each_index_entry
function then, where the for_each_index_entry function would behave the
same way as the for_each_index_entry_filtered function with the
filter_opts parameter set to NULL:
for_each_index_entry_filtered(struct index_state *, each_cache_entry_fn, void 
*cb_data, struct filter_opts *)
for_each_index_entry(struct index_state *, each_cache_entry_fn, void *cb_data)

Both of them then should call index_change_filter_opts to make sure all
the entries that are needed are loaded in the in-memory format.

Does that make sense?

[1] That is only important for the new index-v5 file format, which can
be loaded partially.  The filter_opts could always be set to NULL,
as the whole index is always loaded anyway.
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 05/22] read-cache: add index reading api

2013-07-08 Thread Thomas Gummerer
Duy Nguyen pclo...@gmail.com writes:

 On Sun, Jul 7, 2013 at 3:11 PM, Thomas Gummerer t.gumme...@gmail.com wrote:
 Add an api for access to the index file.  Currently there is only a very
 basic api for accessing the index file, which only allows a full read of
 the index, and lets the users of the data filter it.  The new index api
 gives the users the possibility to use only part of the index and
 provides functions for iterating over and accessing cache entries.

 This simplifies future improvements to the in-memory format, as changes
 will be concentrated on one file, instead of the whole git source code.

 Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
 ---
  cache.h |  57 +-
  read-cache-v2.c |  96 +++--
  read-cache.c| 108 
 
  read-cache.h|  12 ++-
  4 files changed, 263 insertions(+), 10 deletions(-)

 diff --git a/cache.h b/cache.h
 index 5082b34..d38dfbd 100644
 --- a/cache.h
 +++ b/cache.h
 @@ -127,7 +127,8 @@ struct cache_entry {
 unsigned int ce_flags;
 unsigned int ce_namelen;
 unsigned char sha1[20];
 -   struct cache_entry *next;
 +   struct cache_entry *next; /* used by name_hash */
 +   struct cache_entry *next_ce; /* used to keep a list of cache entries 
 */
 char name[FLEX_ARRAY]; /* more */
  };

 From what I read, doing

 ce = start;
 while (ce) { do(something); ce = next_cache_entry(ce); }

 is the same as

 i = start_index;
 while (i  active_nr) { ce = active_cache[i]; do(something); i++; }

 What's the advantage of using the former over the latter? Do you plan
 to eliminate the latter loop (by hiding struct cache_entry **cache;
 from public index_state structure?

Yes, I wanted to eliminate the latter loop, because it depends on the
in-memory format of the index.  By moving all direct accesses of
index_state->cache to an api it gets easier to change the in-memory
format.  I played a bit with a tree-based in-memory format [1], which
represents the on-disk format of index-v5 more closely, making
modifications and partial-loading simpler.

I've tried switching all those loops to api calls, but that would make
the api too bloated because there are a lot of those loops.  I found it
more sensible to do it this way, leaving the loops how they are, while
making future changes to the in-memory format a lot simpler.

[1] https://github.com/tgummerer/git/blob/index-v5api/read-cache-v5.c#L17
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 15/22] read-cache: read index-v5

2013-07-08 Thread Thomas Gummerer
Eric Sunshine sunsh...@sunshineco.com writes:

 On Sun, Jul 7, 2013 at 4:11 AM, Thomas Gummerer t.gumme...@gmail.com wrote:
 Make git read the index file version 5 without complaining.

 This version of the reader reads neither the cache-tree
 nor the resolve undo data, but doesn't choke on an index that
 includes such data.
 ---
 diff --git a/read-cache-v5.c b/read-cache-v5.c
 new file mode 100644
 index 000..e319f30
 --- /dev/null
 +++ b/read-cache-v5.c
 @@ -0,0 +1,658 @@
 +static struct directory_entry *read_directories(unsigned int *dir_offset,
 +   unsigned int *dir_table_offset,
 +   void *mmap,
 +   int mmap_size)
 +{
 +   int i, ondisk_directory_size;
 +   uint32_t *filecrc, *beginning, *end;
 +   struct directory_entry *current = NULL;
 +   struct ondisk_directory_entry *disk_de;
 +   struct directory_entry *de;
 +   unsigned int data_len, len;
 +   char *name;
 +
 +   /* Length of pathname + nul byte for termination + size of
 +* members of ondisk_directory_entry. (Just using the size
 +* of the stuct doesn't work, because there may be padding

 s/stuct/struct/

 +* bytes for the struct)
 +*/

 Also:

   /*
* Format multi-line comment
* like this.
*/

 Remaining multi-line comments appear to be formatted correctly.

Thanks for catching this and the other typos.
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 05/22] read-cache: add index reading api

2013-07-08 Thread Thomas Gummerer
Duy Nguyen pclo...@gmail.com writes:

 On Mon, Jul 8, 2013 at 6:20 PM, Thomas Gummerer t.gumme...@gmail.com wrote:
 Duy Nguyen pclo...@gmail.com writes:
 Putting filter_opts in index_state feels like a bad design. Iterator
 information should be separated from the iterated object, so that two
 callers can walk through the same index without stepping on each other
 (I'm not talking about multithreading, a caller may walk a bit, then
 the other caller starts walking, then the former caller resumes
 walking again in a call chain).

 Yes, you're right.  We need the filter_opts to see what part of the
 index has been loaded [1] and which part has been skipped, but it
 shouldn't be used for filtering in the for_each_index_entry function.

 I think there should be two versions of the for_each_index_entry
 function then, where the for_each_index_entry function would behave the
 same way as the for_each_index_entry_filtered function with the
 filter_opts parameter set to NULL:
 for_each_index_entry_filtered(struct index_state *, each_cache_entry_fn, 
 void *cb_data, struct filter_opts *)
 for_each_index_entry(struct index_state *, each_cache_entry_fn, void 
 *cb_data)

 Both of them then should call index_change_filter_opts to make sure all
 the entries that are needed are loaded in the in-memory format.

 Does that make sense?

 Hmm.. I was confused actually (documentation on the api would help
 greatly). If you already filter at load time, I don't think you need
 to match again. The caller asked for filter and it should know what's
 in there so for_each_index_entry just goes through all entries without
 extra match_pathspec. Or is that what next_index_entry for?
 match_pathspec function could be expensive when glob is involved. If
 the caller wants extra matching, it could do inside the callback
 function.

Yes, documentation would be good.  I'll try to write something better
later today, when I have some more time.  In the meantime I'll just
outline what the functions do here shortly:

read_index_filtered(opts): This method behaves differently for index-v2
  and index-v5.
  For index-v2 it simply reads the whole index as read_cache() does, so
  we are sure we don't have to reload anything if the user wants a
  different filter.
  For index-v5 it creates an adjusted pathspec and reads all
  directories that are matched by it.

get_index_entry_by_name(name, namelen, ce): Returns a cache_entry
  matched by name via the ce parameter.  If a cache_entry is matched
  exactly 1 is returned.
  Name may also be a path, in which case it returns 0 and the first
  cache_entry in that path. e.g. we have:
  ...
  path/file1
  
in the index and name is path, then it returns 0 and the path/file1
cache_entry.  If name is path/file1 on the other hand it returns 1
and the path/file1 cache_entry.

for_each_index_entry(fn, cb_data):  Iterates over all cache_entries in
  the index filtered by filter_opts in the index_state, and executes fn
  for each of them with the cb_data as callback data.

next_index_entry(ce): Returns the cache_entry that follows after ce

index_change_filter_opts(opts): For index-v2 it simply changes the
  filter_opts, so for_each_index_entry uses the changed index_opts.
  For index-v5 it refreshes the index if the filter_opts have changed.
  This has some optimization potential, in the case that the opts get
  stricter (less of the index should be read) it doesn't have to reload
  anything.

I'm not sure what's in the cache, because the whole index is in the
cache if the on-disk format is index-v2 and the index is filtered by the
adjusted_pathspec if the on-disk format is index-v5.  That's what I need
the extra match_pathspec for. But yes, that could also be left to the
caller.

Hope that makes it a little clearer.

 It seems you could change the filter with index_change_filter_opts. In
 v5 the index will be reloaded. What happens when some index entries
 are already modified? Do we start to have read-only index views and
 one read-write view? If partial views are always read-only, perhaps we
 just allow the user to create a new index_state (or view) with new
 filter and destroy the old one. We don't have to care about changing
 or separating filter in that case because the view is the iterator.

The read-write part is mostly covered by the next patch (6/22).  Before
changing the index, the filter_opts always have to be set to NULL, using
index_change_filter_opts and therefore use the whole index.  This is
currently hard to improve, because we always need the whole index when
we write it.  Changing this only makes sense once we have partial
writing too.

So in principle the index_change_filter_opts function implements those
views.

Even with partial writing we have to distinguish if a cache_entry has
been added/removed, in which case a full rewrite is necessary or if a
cache_entry has simply been modified (its content changed), in which
case we could replace it in place

Re: [PATCH 06/22] make sure partially read index is not changed

2013-07-08 Thread Thomas Gummerer
Junio C Hamano gits...@pobox.com writes:

 Thomas Gummerer t.gumme...@gmail.com writes:

 A partially read index file currently cannot be written to disk.  Make
 sure that never happens, by re-reading the index file if the index file
 wasn't read completely before changing the in-memory index.

 I am not quite sure what you are trying to do.

 In operations that modify the index (replace_index_entry(),
 remove_index_entry_at(), etc.)  you lift the filter_ops and keep
 partially_read flag still on.  In the write-out codepath, you have
 an assert to make sure the caller has cleared the partially_read
 flag.  A natural way to clear the flag is to re-read the index from
 the file, but then you can easily lose the modifications.

 Also shouldn't the flag be cleared upon discard_index()?  If it is
 done there, you probably would not need to clear it in read_index().

Hrm, maybe the code isn't quite clear enough here, or maybe the patch
should come directly before (16/22) read-cache: read index-v5 to be more
clear.

The flag is always set to 0 in read_index_v2, as the whole index is
always read and therefore it never needs to be reset.  With
read_index_v5 on the other hand the flag is set when the filter_opts are
different than NULL.

But thinking about it, the flag is actually not necessary at all.  The
filter_opts should simply be checked for NULL for the assert and they
should also be set to NULL on discard_index.  Will fix this in the next
version.  Thanks.

 Should
 there be another safety that says calling read_index() with the
 partially_read flag on is a bug or something?

I'm not sure.  I think it doesn't hurt, as we discard the index when
we change the index_ops.  At the moment I can't think of a case where
calling read_index() could be used when it's partially read
without discarding the cache first.  I'll add it in the next version.


 Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
 ---
  builtin/update-index.c | 4 
  cache.h| 4 +++-
  read-cache-v2.c| 3 +++
  read-cache.c   | 8 
  4 files changed, 18 insertions(+), 1 deletion(-)

 diff --git a/builtin/update-index.c b/builtin/update-index.c
 index 5c7762e..03f6426 100644
 --- a/builtin/update-index.c
 +++ b/builtin/update-index.c
 @@ -49,6 +49,8 @@ static int mark_ce_flags(const char *path, int flag, int 
 mark)
  int namelen = strlen(path);
  int pos = cache_name_pos(path, namelen);
  if (0 = pos) {
 +if (active_cache_partially_read)
 +cache_change_filter_opts(NULL);
  if (mark)
  active_cache[pos]-ce_flags |= flag;
  else
 @@ -253,6 +255,8 @@ static void chmod_path(int flip, const char *path)
  pos = cache_name_pos(path, strlen(path));
  if (pos  0)
  goto fail;
 +if (active_cache_partially_read)
 +cache_change_filter_opts(NULL);
  ce = active_cache[pos];
  mode = ce-ce_mode;
  if (!S_ISREG(mode))
 diff --git a/cache.h b/cache.h
 index d38dfbd..f6c3407 100644
 --- a/cache.h
 +++ b/cache.h
 @@ -293,7 +293,8 @@ struct index_state {
  struct cache_tree *cache_tree;
  struct cache_time timestamp;
  unsigned name_hash_initialized : 1,
 - initialized : 1;
 + initialized : 1,
 + partially_read : 1;
  struct hash_table name_hash;
  struct hash_table dir_hash;
  struct index_ops *ops;
 @@ -315,6 +316,7 @@ extern void free_name_hash(struct index_state *istate);
  #define active_alloc (the_index.cache_alloc)
  #define active_cache_changed (the_index.cache_changed)
  #define active_cache_tree (the_index.cache_tree)
 +#define active_cache_partially_read (the_index.partially_read)

  #define read_cache() read_index(the_index)
  #define read_cache_from(path) read_index_from(the_index, (path))
 diff --git a/read-cache-v2.c b/read-cache-v2.c
 index 1ed640d..2cc792d 100644
 --- a/read-cache-v2.c
 +++ b/read-cache-v2.c
 @@ -273,6 +273,7 @@ static int read_index_v2(struct index_state *istate, 
 void *mmap,
  src_offset += 8;
  src_offset += extsize;
  }
 +istate-partially_read = 0;
  return 0;
  unmap:
  munmap(mmap, mmap_size);
 @@ -495,6 +496,8 @@ static int write_index_v2(struct index_state *istate, 
 int newfd)
  struct stat st;
  struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;

 +if (istate-partially_read)
 +die(BUG: index: cannot write a partially read index);
  for (i = removed = extended = 0; i  entries; i++) {
  if (cache[i]-ce_flags  CE_REMOVE)
  removed++;
 diff --git a/read-cache.c b/read-cache.c
 index b30ee75..4529fab 100644
 --- a/read-cache.c
 +++ b/read-cache.c
 @@ -30,6 +30,8 @@ static void replace_index_entry(struct index_state 
 *istate, int nr, struct cache
  {
  struct cache_entry *old = istate-cache[nr];

 +if (istate-partially_read)
 +index_change_filter_opts

Re: [PATCH 05/22] read-cache: add index reading api

2013-07-08 Thread Thomas Gummerer
Junio C Hamano gits...@pobox.com writes:

 Thomas Gummerer t.gumme...@gmail.com writes:

 Add an api for access to the index file.  Currently there is only a very
 basic api for accessing the index file, which only allows a full read of
 the index, and lets the users of the data filter it.  The new index api
 gives the users the possibility to use only part of the index and
 provides functions for iterating over and accessing cache entries.

 This simplifies future improvements to the in-memory format, as changes
 will be concentrated on one file, instead of the whole git source code.

 Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
 ---
  cache.h |  57 +-
  read-cache-v2.c |  96 +++--
  read-cache.c| 108 
 
  read-cache.h|  12 ++-
  4 files changed, 263 insertions(+), 10 deletions(-)

 diff --git a/cache.h b/cache.h
 index 5082b34..d38dfbd 100644
 --- a/cache.h
 +++ b/cache.h
 @@ -127,7 +127,8 @@ struct cache_entry {
  unsigned int ce_flags;
  unsigned int ce_namelen;
  unsigned char sha1[20];
 -struct cache_entry *next;
 +struct cache_entry *next; /* used by name_hash */
 +struct cache_entry *next_ce; /* used to keep a list of cache entries */

 The reader often needs to rewind the read-pointer partially while
 walking the index (e.g. next_cache_entry() in unpack-trees.c and how
 the o-cache_bottom position is used throughout the subsystem).  I
 am not sure if this singly-linked list is a good way to go.

I'm not very familiar with the unpack-trees code, but from a quick look
the pointer (or position in the cache) is always only moved forward.  A
problem I do see though is skipping a number of entries at once.  An
example for that below:
int matches;
matches = 
cache_tree_matches_traversal(o-src_index-cache_tree,
   names, info);
/*
 * Everything under the name matches; skip the
 * entire hierarchy.  diff_index_cached codepath
 * special cases D/F conflicts in such a way that
 * it does not do any look-ahead, so this is safe.
 */
if (matches) {
o-cache_bottom += matches;
return mask;
}

This could probably be transformed into something like
skip_cache_tree_matches(cache-tree, names, info);

I'll take some time to familiarize myself with the unpack-trees code to
see if I can find a better solution than this, and if there are more
pitfalls.

 +/*
 + * Options by which the index should be filtered when read partially.
 + *
 + * pathspec: The pathspec which the index entries have to match
 + * seen: Used to return the seen parameter from match_pathspec()
 + * max_prefix, max_prefix_len: These variables are set to the longest
 + * common prefix, the length of the longest common prefix of the
 + * given pathspec

 These probably should use struct pathspec abstraction, not just the
 array of raw strings, no?

Yes, thanks, that's probably a good idea.
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 5.5/22] Add documentation for the index api

2013-07-08 Thread Thomas Gummerer
Document the new index api and add examples of how it should be used
instead of the old functions directly accessing the index.

Helped-by: Nguyễn Thái Ngọc Duy pclo...@gmail.com
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---

Duy Nguyen pclo...@gmail.com writes:

 Hmm.. I was confused actually (documentation on the api would help
 greatly).

As promised, a draft for a documentation for the index api as it is in
this series.

Documentation/technical/api-in-core-index.txt | 108 +-
 1 file changed, 106 insertions(+), 2 deletions(-)

diff --git a/Documentation/technical/api-in-core-index.txt 
b/Documentation/technical/api-in-core-index.txt
index adbdbf5..5269bb1 100644
--- a/Documentation/technical/api-in-core-index.txt
+++ b/Documentation/technical/api-in-core-index.txt
@@ -1,14 +1,116 @@
 in-core index API
 =

+Reading API
+---
+
+`read_index()`::
+   Read the whole index file from disk.
+
+`index_name_pos(name, namelen)`::
+   Find a cache_entry with name in the index.  Returns pos if an
+   entry is matched exactly and -pos-1 if an entry is matched
+   partially.
+   e.g.
+   index:
+   file1
+   file2
+   path/file1
+   zzz
+
+   index_name_pos(path/file1, 10) returns 2, while
+   index_name_pos(path, 4) returns -1
+
+`read_index_filtered(opts)`::
+   This method behaves differently for index-v2 and index-v5.
+
+   For index-v2 it simply reads the whole index as read_index()
+   does, so we are sure we don't have to reload anything if the
+   user wants a different filter.  It also sets the filter_opts
+   in the index_state, which is used to limit the results when
+   iterating over the index with for_each_index_entry().
+
+   The whole index is read to avoid the need to eventually
+   re-read the index later, because the performance is no
+   different when reading it partially.
+
+   For index-v5 it creates an adjusted_pathspec to filter the
+   reading.  First all the directory entries are read and then
+   the cache_entries in the directories that match the adjusted
+   pathspec are read.  The filter_opts in the index_state are set
+   to filter out the rest of the cache_entries that are matched
+   by the adjusted pathspec but not by the pathspec given.  The
+   rest of the index entries are filtered out when iterating over
+   the cache with for_each_index_entry().
+
+`get_index_entry_by_name(name, namelen, ce)`::
+   Returns a cache_entry matched by the name, returned via the
+   ce parameter.  If a cache entry is matched exactly, 1 is
+   returned, otherwise 0.  For an example see index_name_pos().
+   This function should be used instead of the index_name_pos()
+   function to retrieve cache entries.
+
+`for_each_index_entry(fn, cb_data)`::
+   Iterates over all cache_entries in the index filtered by
+   filter_opts in the index_state.  For each cache entry fn is
+   executed with cb_data as callback data.  From within the loop
+   do `return 0` to continue, or `return 1` to break the loop.
+
+`next_index_entry(ce)`::
+   Returns the cache_entry that follows after ce
+
+`index_change_filter_opts(opts)`::
+   This function again has a slightly different functionality for
+   index-v2 and index-v5.
+
+   For index-v2 it simply changes the filter_opts, so
+   for_each_index_entry() uses the changed filter_opts to iterate
+   over a different set of cache entries.
+
+   For index-v5 it refreshes the index if the filter_opts have
+   changed and sets the new filter_opts in the index state, again
+   to iterate over a different set of cache entries as with
+   index-v2.
+
+   This has some optimization potential, in the case that the
+   opts get stricter (less of the index should be read) it
+   doesn't have to reload anything, but currently does.
+
+Using the new index api
+---
+
+Currently, loops over a specific set of index entries are written as:
+  i = start_index;
+  while (i  active_nr) { ce = active_cache[i]; do(something); i++; }
+
+they should be rewritten to:
+  ce = start;
+  while (ce) { do(something); ce = next_cache_entry(ce); }
+
+which is the equivalent operation but hides the in-memory format of
+the index from the user.
+
+For getting a cache entry get_cache_entry_by_name() should be used
+instead of cache_name_pos(). e.g.:
+  int pos = cache_name_pos(name, namelen);
+  struct cache_entry *ce = active_cache[pos];
+  if (pos  0) { do(something) }
+  else { do(somethingelse) }
+
+should be written as:
+  struct cache_entry *ce;
+  int ret = get_cache_entry_by_name(name, namelen, ce);
+  if (!ret) { do(something) }
+  else { do(somethingelse) }
+
+TODO
+
 Talk about read-cache.c and cache-tree.c, things like:

 * cache - the_index macros
-* read_index()
 * write_index()
 * ie_match_stat() and ie_modified

[PATCH 06/22] make sure partially read index is not changed

2013-07-07 Thread Thomas Gummerer
A partially read index file currently cannot be written to disk.  Make
sure that never happens, by re-reading the index file if the index file
wasn't read completely before changing the in-memory index.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 builtin/update-index.c | 4 
 cache.h| 4 +++-
 read-cache-v2.c| 3 +++
 read-cache.c   | 8 
 4 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/builtin/update-index.c b/builtin/update-index.c
index 5c7762e..03f6426 100644
--- a/builtin/update-index.c
+++ b/builtin/update-index.c
@@ -49,6 +49,8 @@ static int mark_ce_flags(const char *path, int flag, int mark)
int namelen = strlen(path);
int pos = cache_name_pos(path, namelen);
if (0 = pos) {
+   if (active_cache_partially_read)
+   cache_change_filter_opts(NULL);
if (mark)
active_cache[pos]-ce_flags |= flag;
else
@@ -253,6 +255,8 @@ static void chmod_path(int flip, const char *path)
pos = cache_name_pos(path, strlen(path));
if (pos  0)
goto fail;
+   if (active_cache_partially_read)
+   cache_change_filter_opts(NULL);
ce = active_cache[pos];
mode = ce-ce_mode;
if (!S_ISREG(mode))
diff --git a/cache.h b/cache.h
index d38dfbd..f6c3407 100644
--- a/cache.h
+++ b/cache.h
@@ -293,7 +293,8 @@ struct index_state {
struct cache_tree *cache_tree;
struct cache_time timestamp;
unsigned name_hash_initialized : 1,
-initialized : 1;
+initialized : 1,
+partially_read : 1;
struct hash_table name_hash;
struct hash_table dir_hash;
struct index_ops *ops;
@@ -315,6 +316,7 @@ extern void free_name_hash(struct index_state *istate);
 #define active_alloc (the_index.cache_alloc)
 #define active_cache_changed (the_index.cache_changed)
 #define active_cache_tree (the_index.cache_tree)
+#define active_cache_partially_read (the_index.partially_read)
 
 #define read_cache() read_index(the_index)
 #define read_cache_from(path) read_index_from(the_index, (path))
diff --git a/read-cache-v2.c b/read-cache-v2.c
index 1ed640d..2cc792d 100644
--- a/read-cache-v2.c
+++ b/read-cache-v2.c
@@ -273,6 +273,7 @@ static int read_index_v2(struct index_state *istate, void 
*mmap,
src_offset += 8;
src_offset += extsize;
}
+   istate-partially_read = 0;
return 0;
 unmap:
munmap(mmap, mmap_size);
@@ -495,6 +496,8 @@ static int write_index_v2(struct index_state *istate, int 
newfd)
struct stat st;
struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 
+   if (istate-partially_read)
+   die(BUG: index: cannot write a partially read index);
for (i = removed = extended = 0; i  entries; i++) {
if (cache[i]-ce_flags  CE_REMOVE)
removed++;
diff --git a/read-cache.c b/read-cache.c
index b30ee75..4529fab 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -30,6 +30,8 @@ static void replace_index_entry(struct index_state *istate, 
int nr, struct cache
 {
struct cache_entry *old = istate-cache[nr];
 
+   if (istate-partially_read)
+   index_change_filter_opts(istate, NULL);
remove_name_hash(istate, old);
set_index_entry(istate, nr, ce);
istate-cache_changed = 1;
@@ -467,6 +469,8 @@ int remove_index_entry_at(struct index_state *istate, int 
pos)
 {
struct cache_entry *ce = istate-cache[pos];
 
+   if (istate-partially_read)
+   index_change_filter_opts(istate, NULL);
record_resolve_undo(istate, ce);
remove_name_hash(istate, ce);
istate-cache_changed = 1;
@@ -978,6 +982,8 @@ int add_index_entry(struct index_state *istate, struct 
cache_entry *ce, int opti
 {
int pos;
 
+   if (istate-partially_read)
+   index_change_filter_opts(istate, NULL);
if (option  ADD_CACHE_JUST_APPEND)
pos = istate-cache_nr;
else {
@@ -1184,6 +1190,8 @@ int refresh_index(struct index_state *istate, unsigned 
int flags, const char **p
/* If we are doing --really-refresh that
 * means the index is not valid anymore.
 */
+   if (istate-partially_read)
+   index_change_filter_opts(istate, NULL);
ce-ce_flags = ~CE_VALID;
istate-cache_changed = 1;
}
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 04/22] read-cache: Re-read index if index file changed

2013-07-07 Thread Thomas Gummerer
Add the possibility of re-reading the index file, if it changed
while reading.

The index file might change during the read, causing outdated
information to be displayed. We check if the index file changed
by using its stat data as heuristic.

Helped-by: Ramsay Jones ram...@ramsay1.demon.co.uk
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 read-cache.c | 91 +---
 1 file changed, 57 insertions(+), 34 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index 1e7ffc2..3e3a0e2 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1275,11 +1275,31 @@ int read_index(struct index_state *istate)
return read_index_from(istate, get_index_file());
 }
 
+static int index_changed(struct stat *st_old, struct stat *st_new)
+{
+   if (st_old-st_mtime != st_new-st_mtime ||
+#if !defined (__CYGWIN__)
+   st_old-st_uid   != st_new-st_uid ||
+   st_old-st_gid   != st_new-st_gid ||
+   st_old-st_ino   != st_new-st_ino ||
+#endif
+#if USE_NSEC
+   ST_MTIME_NSEC(*st_old) != ST_MTIME_NSEC(*st_new) ||
+#endif
+#if USE_STDEV
+   st_old-st_dev != st_new-st_dev ||
+#endif
+   st_old-st_size != st_new-st_size)
+   return 1;
+
+   return 0;
+}
+
 /* remember to discard_cache() before reading a different cache! */
 int read_index_from(struct index_state *istate, const char *path)
 {
-   int fd;
-   struct stat st;
+   int fd, err, i;
+   struct stat st_old, st_new;
struct cache_version_header *hdr;
void *mmap;
size_t mmap_size;
@@ -1291,41 +1311,44 @@ int read_index_from(struct index_state *istate, const 
char *path)
errno = ENOENT;
istate-timestamp.sec = 0;
istate-timestamp.nsec = 0;
+   for (i = 0; i  50; i++) {
+   err = 0;
+   fd = open(path, O_RDONLY);
+   if (fd  0) {
+   if (errno == ENOENT)
+   return 0;
+   die_errno(index file open failed);
+   }
 
-   fd = open(path, O_RDONLY);
-   if (fd  0) {
-   if (errno == ENOENT)
-   return 0;
-   die_errno(index file open failed);
+   if (fstat(fd, st_old))
+   die_errno(cannot stat the open index);
+
+   errno = EINVAL;
+   mmap_size = xsize_t(st_old.st_size);
+   mmap = xmmap(NULL, mmap_size, PROT_READ | PROT_WRITE, 
MAP_PRIVATE, fd, 0);
+   close(fd);
+   if (mmap == MAP_FAILED)
+   die_errno(unable to map index file);
+
+   hdr = mmap;
+   if (verify_hdr_version(istate, hdr, mmap_size)  0)
+   err = 1;
+
+   if (istate-ops-verify_hdr(mmap, mmap_size)  0)
+   err = 1;
+
+   if (istate-ops-read_index(istate, mmap, mmap_size)  0)
+   err = 1;
+   istate-timestamp.sec = st_old.st_mtime;
+   istate-timestamp.nsec = ST_MTIME_NSEC(st_old);
+   if (lstat(path, st_new))
+   die_errno(cannot stat the open index);
+
+   munmap(mmap, mmap_size);
+   if (!index_changed(st_old, st_new)  !err)
+   return istate-cache_nr;
}
 
-   if (fstat(fd, st))
-   die_errno(cannot stat the open index);
-
-   errno = EINVAL;
-   mmap_size = xsize_t(st.st_size);
-   mmap = xmmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 
0);
-   close(fd);
-   if (mmap == MAP_FAILED)
-   die_errno(unable to map index file);
-
-   hdr = mmap;
-   if (verify_hdr_version(istate, hdr, mmap_size)  0)
-   goto unmap;
-
-   if (istate-ops-verify_hdr(mmap, mmap_size)  0)
-   goto unmap;
-
-   if (istate-ops-read_index(istate, mmap, mmap_size)  0)
-   goto unmap;
-   istate-timestamp.sec = st.st_mtime;
-   istate-timestamp.nsec = ST_MTIME_NSEC(st);
-
-   munmap(mmap, mmap_size);
-   return istate-cache_nr;
-
-unmap:
-   munmap(mmap, mmap_size);
die(index file corrupt);
 }
 
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 01/22] t2104: Don't fail for index versions other than [23]

2013-07-07 Thread Thomas Gummerer
t2104 currently checks for the exact index version 2 or 3,
depending if there is a skip-worktree flag or not. Other
index versions do not use extended flags and thus cannot
be tested for version changes.

Make this test update the index to version 2 at the beginning
of the test. Testing the skip-worktree flags for the default
index format is still covered by t7011 and t7012.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 t/t2104-update-index-skip-worktree.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/t/t2104-update-index-skip-worktree.sh 
b/t/t2104-update-index-skip-worktree.sh
index 1d0879b..bd9644f 100755
--- a/t/t2104-update-index-skip-worktree.sh
+++ b/t/t2104-update-index-skip-worktree.sh
@@ -22,6 +22,7 @@ H sub/2
 EOF
 
 test_expect_success 'setup' '
+   git update-index --index-version=2 
mkdir sub 
touch ./1 ./2 sub/1 sub/2 
git add 1 2 sub/1 sub/2 
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 00/22] Index v5

2013-07-07 Thread Thomas Gummerer
Hi,

This is a follow-up to last year's Google Summer of Code (late, I know
:-) ), which wasn't merged back then.  The previous rounds of the
series are at $gmane/202752, $gmane/202923, $gmane/203088 and
$gmane/203517.

Since then I added an index reading api, which allows certain parts of
Git to take advantage of the partial reading capability of the new
index file format now.  In this series the grep and the ls-files and
the code-paths used by them are switched to the new api.

Another goal for the api is to hide the open coded loops and accesses
to the in-memory format, to make it simpler to change the in-memory
format to a version that fits the new on-disk format better.

Except for the new patches, mostly the read-cache: read index-v5
patch changed, as the possibility to read the index partially was
added.

The first patch for t2104 makes sense without the rest of the series,
as it fixes running the test-suite with index-v4 as the default index
format.

Below are the timings for the WebKit repository.  c4b2d88 is the
revision before adding anything, while HEAD are the times at the last
patch in the series.  The slower times in update-index come from the
update-index patch so they are no problem (in c4b2d88 the index is
only read, while in HEAD it's read and written).  The increase in time
in the ls-files test comes from not having the prune_cache function
in the index api.

I have not added this function as it only seems of use in ls-files,
but it can still be added if this increase is a problem.

Test                                       c4b2d88           HEAD
------------------------------------------------------------------------------------
0003.2: v[23]: update-index                0.11(0.06+0.04)   0.22(0.15+0.05) +100.0%
0003.3: v[23]: grep nonexistent -- subdir  0.12(0.08+0.03)   0.12(0.09+0.02) +0.0%
0003.4: v[23]: ls-files -- subdir          0.11(0.08+0.01)   0.12(0.08+0.03) +9.1%
0003.6: v4: update-index                   0.09(0.06+0.02)   0.18(0.14+0.03) +100.0%
0003.7: v4: grep nonexistent -- subdir     0.10(0.08+0.02)   0.10(0.07+0.02) +0.0%
0003.8: v4: ls-files -- subdir             0.09(0.07+0.01)   0.10(0.08+0.01) +11.1%
0003.10: v5: update-index                  missing           0.15(0.10+0.03)
0003.11: v5: grep nonexistent -- subdir    missing           0.01(0.00+0.00)
0003.12: v5: ls-files -- subdir            missing           0.01(0.01+0.00)

And for reference the times for a synthetic repository with a 470MB
index file, just to demonstrate the improvements in large repositories.

Test                                       c4b2d88           HEAD
------------------------------------------------------------------------------------
0003.2: v[23]: update-index                1.50(1.18+0.30)   3.18(2.55+0.60) +112.0%
0003.3: v[23]: grep nonexistent -- subdir  1.62(1.28+0.32)   1.66(1.28+0.36) +2.5%
0003.4: v[23]: ls-files -- subdir          1.49(1.21+0.26)   1.62(1.28+0.32) +8.7%
0003.6: v4: update-index                   1.18(0.89+0.28)   2.68(2.22+0.44) +127.1%
0003.7: v4: grep nonexistent -- subdir     1.29(1.00+0.28)   1.30(1.04+0.24) +0.8%
0003.8: v4: ls-files -- subdir             1.20(0.95+0.23)   1.30(0.98+0.30) +8.3%
0003.10: v5: update-index                  missing           2.12(1.63+0.48)
0003.11: v5: grep nonexistent -- subdir    missing           0.08(0.04+0.02)
0003.12: v5: ls-files -- subdir            missing           0.07(0.05+0.01)


Thomas Gummerer (21):
  t2104: Don't fail for index versions other than [23]
  read-cache: split index file version specific functionality
  read-cache: move index v2 specific functions to their own file
  read-cache: Re-read index if index file changed
  read-cache: add index reading api
  make sure partially read index is not changed
  dir.c: use index api
  tree.c: use index api
  name-hash.c: use index api
  grep.c: Use index api
  ls-files.c: use the index api
  read-cache: make read_blob_data_from_index use index api
  documentation: add documentation of the index-v5 file format
  read-cache: make in-memory format aware of stat_crc
  read-cache: read index-v5
  read-cache: read resolve-undo data
  read-cache: read cache-tree in index-v5
  read-cache: write index-v5
  read-cache: write index-v5 cache-tree data
  read-cache: write resolve-undo data for index-v5
  update-index.c: rewrite index when index-version is given

Thomas Rast (1):
  p0003-index.sh: add perf test for the index formats

 Documentation/technical/index-file-format-v5.txt |  296 +
 Makefile |3 +
 builtin/grep.c   |   71 +-
 builtin/ls-files.c   |  213 ++-
 builtin/update-index.c   |8 +-
 cache-tree.c |2 +-
 cache-tree.h

[PATCH 02/22] read-cache: split index file version specific functionality

2013-07-07 Thread Thomas Gummerer
Split index file version specific functionality to their own functions,
to prepare for moving the index file version specific parts to their own
file.  This makes it easier to add a new index file format later.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 cache.h  |   5 +-
 read-cache.c | 130 +--
 test-index-version.c |   2 +-
 3 files changed, 90 insertions(+), 47 deletions(-)

diff --git a/cache.h b/cache.h
index c288678..7af853b 100644
--- a/cache.h
+++ b/cache.h
@@ -100,9 +100,12 @@ unsigned long git_deflate_bound(git_zstream *, unsigned 
long);
  */
 
 #define CACHE_SIGNATURE 0x44495243 /* DIRC */
-struct cache_header {
+struct cache_version_header {
unsigned int hdr_signature;
unsigned int hdr_version;
+};
+
+struct cache_header {
unsigned int hdr_entries;
 };
 
diff --git a/read-cache.c b/read-cache.c
index d5201f9..93947bf 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1268,10 +1268,8 @@ struct ondisk_cache_entry_extended {
ondisk_cache_entry_extended_size(ce_namelen(ce)) : \
ondisk_cache_entry_size(ce_namelen(ce)))
 
-static int verify_hdr(struct cache_header *hdr, unsigned long size)
+static int verify_hdr_version(struct cache_version_header *hdr, unsigned long 
size)
 {
-   git_SHA_CTX c;
-   unsigned char sha1[20];
int hdr_version;
 
if (hdr-hdr_signature != htonl(CACHE_SIGNATURE))
@@ -1279,10 +1277,22 @@ static int verify_hdr(struct cache_header *hdr, 
unsigned long size)
hdr_version = ntohl(hdr-hdr_version);
if (hdr_version  INDEX_FORMAT_LB || INDEX_FORMAT_UB  hdr_version)
return error(bad index version %d, hdr_version);
+   return 0;
+}
+
+static int verify_hdr(void *mmap, unsigned long size)
+{
+   git_SHA_CTX c;
+   unsigned char sha1[20];
+
+   if (size  sizeof(struct cache_version_header)
+   + sizeof(struct cache_header) + 20)
+   die(index file smaller than expected);
+
git_SHA1_Init(c);
-   git_SHA1_Update(c, hdr, size - 20);
+   git_SHA1_Update(c, mmap, size - 20);
git_SHA1_Final(sha1, c);
-   if (hashcmp(sha1, (unsigned char *)hdr + size - 20))
+   if (hashcmp(sha1, (unsigned char *)mmap + size - 20))
return error(bad index file sha1 signature);
return 0;
 }
@@ -1424,47 +1434,19 @@ static struct cache_entry *create_from_disk(struct 
ondisk_cache_entry *ondisk,
return ce;
 }
 
-/* remember to discard_cache() before reading a different cache! */
-int read_index_from(struct index_state *istate, const char *path)
+static int read_index_v2(struct index_state *istate, void *mmap, unsigned long 
mmap_size)
 {
-   int fd, i;
-   struct stat st;
+   int i;
unsigned long src_offset;
-   struct cache_header *hdr;
-   void *mmap;
-   size_t mmap_size;
+   struct cache_version_header *hdr;
+   struct cache_header *hdr_v2;
struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 
-   if (istate-initialized)
-   return istate-cache_nr;
-
-   istate-timestamp.sec = 0;
-   istate-timestamp.nsec = 0;
-   fd = open(path, O_RDONLY);
-   if (fd  0) {
-   if (errno == ENOENT)
-   return 0;
-   die_errno(index file open failed);
-   }
-
-   if (fstat(fd, st))
-   die_errno(cannot stat the open index);
-
-   mmap_size = xsize_t(st.st_size);
-   if (mmap_size  sizeof(struct cache_header) + 20)
-   die(index file smaller than expected);
-
-   mmap = xmmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 
0);
-   if (mmap == MAP_FAILED)
-   die_errno(unable to map index file);
-   close(fd);
-
hdr = mmap;
-   if (verify_hdr(hdr, mmap_size)  0)
-   goto unmap;
+   hdr_v2 = (struct cache_header *)((char *)mmap + sizeof(*hdr));
 
istate-version = ntohl(hdr-hdr_version);
-   istate-cache_nr = ntohl(hdr-hdr_entries);
+   istate-cache_nr = ntohl(hdr_v2-hdr_entries);
istate-cache_alloc = alloc_nr(istate-cache_nr);
istate-cache = xcalloc(istate-cache_alloc, sizeof(*istate-cache));
istate-initialized = 1;
@@ -1474,7 +1456,7 @@ int read_index_from(struct index_state *istate, const 
char *path)
else
previous_name = NULL;
 
-   src_offset = sizeof(*hdr);
+   src_offset = sizeof(*hdr) + sizeof(*hdr_v2);
for (i = 0; i  istate-cache_nr; i++) {
struct ondisk_cache_entry *disk_ce;
struct cache_entry *ce;
@@ -1487,8 +1469,6 @@ int read_index_from(struct index_state *istate, const 
char *path)
src_offset += consumed;
}
strbuf_release(previous_name_buf);
-   istate-timestamp.sec = st.st_mtime;
-   istate-timestamp.nsec = ST_MTIME_NSEC

[PATCH 05/22] read-cache: add index reading api

2013-07-07 Thread Thomas Gummerer
Add an api for access to the index file.  Currently there is only a very
basic api for accessing the index file, which only allows a full read of
the index, and lets the users of the data filter it.  The new index api
gives the users the possibility to use only part of the index and
provides functions for iterating over and accessing cache entries.

This simplifies future improvements to the in-memory format, as changes
will be concentrated on one file, instead of the whole git source code.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 cache.h |  57 +-
 read-cache-v2.c |  96 +++--
 read-cache.c| 108 
 read-cache.h|  12 ++-
 4 files changed, 263 insertions(+), 10 deletions(-)

diff --git a/cache.h b/cache.h
index 5082b34..d38dfbd 100644
--- a/cache.h
+++ b/cache.h
@@ -127,7 +127,8 @@ struct cache_entry {
unsigned int ce_flags;
unsigned int ce_namelen;
unsigned char sha1[20];
-   struct cache_entry *next;
+   struct cache_entry *next; /* used by name_hash */
+   struct cache_entry *next_ce; /* used to keep a list of cache entries */
char name[FLEX_ARRAY]; /* more */
 };
 
@@ -258,6 +259,32 @@ static inline unsigned int canon_mode(unsigned int mode)
 
 #define cache_entry_size(len) (offsetof(struct cache_entry,name) + (len) + 1)
 
+/*
+ * Options by which the index should be filtered when read partially.
+ *
+ * pathspec: The pathspec which the index entries have to match
+ * seen: Used to return the seen parameter from match_pathspec()
+ * max_prefix, max_prefix_len: These variables are set to the longest
+ * common prefix, the length of the longest common prefix of the
+ * given pathspec
+ *
+ * read_staged: used to indicate if the conflicted entries (entries
+ * with a stage) should be included
+ * read_cache_tree: used to indicate if the cache-tree should be read
+ * read_resolve_undo: used to indicate if the resolve undo data should
+ * be read
+ */
+struct filter_opts {
+   const char **pathspec;
+   char *seen;
+   char *max_prefix;
+   int max_prefix_len;
+
+   int read_staged;
+   int read_cache_tree;
+   int read_resolve_undo;
+};
+
 struct index_state {
struct cache_entry **cache;
unsigned int version;
@@ -270,6 +297,8 @@ struct index_state {
struct hash_table name_hash;
struct hash_table dir_hash;
struct index_ops *ops;
+   struct internal_ops *internal_ops;
+   struct filter_opts *filter_opts;
 };
 
 extern struct index_state the_index;
@@ -311,6 +340,17 @@ extern void free_name_hash(struct index_state *istate);
 #define unmerge_cache_entry_at(at) unmerge_index_entry_at(the_index, at)
 #define unmerge_cache(pathspec) unmerge_index(the_index, pathspec)
 #define read_blob_data_from_cache(path, sz) 
read_blob_data_from_index(the_index, (path), (sz))
+
+/* index api */
+#define read_cache_filtered(opts) read_index_filtered(the_index, (opts))
+#define read_cache_filtered_from(path, opts) 
read_index_filtered_from(the_index, (path), (opts))
+#define get_cache_entry_by_name(name, namelen, ce) \
+   get_index_entry_by_name(the_index, (name), (namelen), (ce))
+#define for_each_cache_entry(fn, cb_data) \
+   for_each_index_entry(the_index, (fn), (cb_data))
+#define next_cache_entry(ce) next_index_entry(ce)
+#define cache_change_filter_opts(opts) index_change_filter_opts(the_index, 
(opts))
+#define sort_cache() sort_index(the_index)
 #endif
 
 enum object_type {
@@ -438,6 +478,21 @@ extern int init_db(const char *template_dir, unsigned int 
flags);
} \
} while (0)
 
+/* index api */
+extern int read_index_filtered(struct index_state *, struct filter_opts *opts);
+extern int read_index_filtered_from(struct index_state *, const char *path, 
struct filter_opts *opts);
+extern int get_index_entry_by_name(struct index_state *, const char *name, int 
namelen,
+  struct cache_entry **ce);
+extern struct cache_entry *next_index_entry(struct cache_entry *ce);
+void index_change_filter_opts(struct index_state *istate, struct filter_opts 
*opts);
+void sort_index(struct index_state *istate);
+
+typedef int each_cache_entry_fn(struct cache_entry *ce, void *);
+
+extern int for_each_index_entry(struct index_state *istate,
+   each_cache_entry_fn, void *);
+
+
 /* Initialize and use the cache information */
 extern int read_index(struct index_state *);
 extern int read_index_preload(struct index_state *, const char **pathspec);
diff --git a/read-cache-v2.c b/read-cache-v2.c
index a6883c3..1ed640d 100644
--- a/read-cache-v2.c
+++ b/read-cache-v2.c
@@ -3,6 +3,7 @@
 #include resolve-undo.h
 #include cache-tree.h
 #include varint.h
+#include dir.h
 
 /* Mask for the name length in ce_flags in the on-disk index */
 #define CE_NAMEMASK  (0x0fff

[PATCH 07/22] dir.c: use index api

2013-07-07 Thread Thomas Gummerer
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 dir.c | 33 +++--
 1 file changed, 15 insertions(+), 18 deletions(-)

diff --git a/dir.c b/dir.c
index 897c874..f4919ba 100644
--- a/dir.c
+++ b/dir.c
@@ -468,19 +468,19 @@ void add_exclude(const char *string, const char *base,
 
 static void *read_skip_worktree_file_from_index(const char *path, size_t *size)
 {
-   int pos, len;
+   int len;
unsigned long sz;
enum object_type type;
void *data;
struct index_state *istate = the_index;
+   struct cache_entry *ce;
 
len = strlen(path);
-   pos = index_name_pos(istate, path, len);
-   if (pos  0)
+   if (!get_index_entry_by_name(istate, path, len, ce))
return NULL;
-   if (!ce_skip_worktree(istate-cache[pos]))
+   if (!ce_skip_worktree(ce))
return NULL;
-   data = read_sha1_file(istate-cache[pos]-sha1, type, sz);
+   data = read_sha1_file(ce-sha1, type, sz);
if (!data || type != OBJ_BLOB) {
free(data);
return NULL;
@@ -968,16 +968,13 @@ static enum exist_status 
directory_exists_in_index_icase(const char *dirname, in
  */
 static enum exist_status directory_exists_in_index(const char *dirname, int 
len)
 {
-   int pos;
+   struct cache_entry *ce;
 
if (ignore_case)
return directory_exists_in_index_icase(dirname, len);
 
-   pos = cache_name_pos(dirname, len);
-   if (pos  0)
-   pos = -pos-1;
-   while (pos  active_nr) {
-   struct cache_entry *ce = active_cache[pos++];
+   get_cache_entry_by_name(dirname, len, ce);
+   while (ce) {
unsigned char endchar;
 
if (strncmp(ce-name, dirname, len))
@@ -989,6 +986,7 @@ static enum exist_status directory_exists_in_index(const 
char *dirname, int len)
return index_directory;
if (!endchar  S_ISGITLINK(ce-ce_mode))
return index_gitdir;
+   ce = next_cache_entry(ce);
}
return index_nonexistent;
 }
@@ -1114,7 +1112,6 @@ static int exclude_matches_pathspec(const char *path, int 
len,
 
 static int get_index_dtype(const char *path, int len)
 {
-   int pos;
struct cache_entry *ce;
 
ce = cache_name_exists(path, len, 0);
@@ -1131,18 +1128,18 @@ static int get_index_dtype(const char *path, int len)
}
 
/* Try to look it up as a directory */
-   pos = cache_name_pos(path, len);
-   if (pos = 0)
+   if (get_cache_entry_by_name(path, len, ce));
return DT_UNKNOWN;
-   pos = -pos-1;
-   while (pos  active_nr) {
-   ce = active_cache[pos++];
+
+   while (ce) {
if (strncmp(ce-name, path, len))
break;
if (ce-name[len]  '/')
break;
-   if (ce-name[len]  '/')
+   if (ce-name[len]  '/') {
+   ce = next_cache_entry(ce);
continue;
+   }
if (!ce_uptodate(ce))
break;  /* continue? */
return DT_DIR;
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 08/22] tree.c: use index api

2013-07-07 Thread Thomas Gummerer
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 tree.c | 38 --
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/tree.c b/tree.c
index 62fed63..5cd43f4 100644
--- a/tree.c
+++ b/tree.c
@@ -128,20 +128,28 @@ int read_tree_recursive(struct tree *tree,
return ret;
 }
 
-static int cmp_cache_name_compare(const void *a_, const void *b_)
+
+struct read_tree_data {
+   read_tree_fn_t fn;
+   int stage;
+};
+
+int get_read_tree_fn(struct cache_entry *ce, void *cb_data)
 {
-   const struct cache_entry *ce1, *ce2;
+   struct read_tree_data *data = cb_data;
 
-   ce1 = *((const struct cache_entry **)a_);
-   ce2 = *((const struct cache_entry **)b_);
-   return cache_name_stage_compare(ce1-name, ce1-ce_namelen, 
ce_stage(ce1),
- ce2-name, ce2-ce_namelen, ce_stage(ce2));
+   if (ce_stage(ce) == data-stage) {
+   data-fn = read_one_entry;
+   return 0;
+   }
+   return 1;
 }
 
 int read_tree(struct tree *tree, int stage, struct pathspec *match)
 {
read_tree_fn_t fn = NULL;
-   int i, err;
+   int err;
+   struct read_tree_data rtd;
 
/*
 * Currently the only existing callers of this function all
@@ -158,11 +166,10 @@ int read_tree(struct tree *tree, int stage, struct 
pathspec *match)
 * do it the original slow way, otherwise, append and then
 * sort at the end.
 */
-   for (i = 0; !fn  i  active_nr; i++) {
-   struct cache_entry *ce = active_cache[i];
-   if (ce_stage(ce) == stage)
-   fn = read_one_entry;
-   }
+   rtd.fn = fn;
+   rtd.stage = stage;
+   for_each_cache_entry(get_read_tree_fn, rtd);
+   fn = rtd.fn;
 
if (!fn)
fn = read_one_entry_quick;
@@ -170,12 +177,7 @@ int read_tree(struct tree *tree, int stage, struct 
pathspec *match)
if (fn == read_one_entry || err)
return err;
 
-   /*
-* Sort the cache entry -- we need to nuke the cache tree, though.
-*/
-   cache_tree_free(active_cache_tree);
-   qsort(active_cache, active_nr, sizeof(active_cache[0]),
- cmp_cache_name_compare);
+   sort_cache();
return 0;
 }
 
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 10/22] grep.c: Use index api

2013-07-07 Thread Thomas Gummerer
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 builtin/grep.c | 71 ++
 1 file changed, 37 insertions(+), 34 deletions(-)

diff --git a/builtin/grep.c b/builtin/grep.c
index a419cda..2a1c8f4 100644
--- a/builtin/grep.c
+++ b/builtin/grep.c
@@ -368,41 +368,33 @@ static void run_pager(struct grep_opt *opt, const char 
*prefix)
free(argv);
 }
 
-static int grep_cache(struct grep_opt *opt, const struct pathspec *pathspec, 
int cached)
+struct grep_opts {
+   struct grep_opt *opt;
+   const struct pathspec *pathspec;
+   int cached;
+   int hit;
+};
+
+static int grep_cache(struct cache_entry *ce, void *cb_data)
 {
-   int hit = 0;
-   int nr;
-   read_cache();
+   struct grep_opts *opts = cb_data;
 
-   for (nr = 0; nr  active_nr; nr++) {
-   struct cache_entry *ce = active_cache[nr];
-   if (!S_ISREG(ce-ce_mode))
-   continue;
-   if (!match_pathspec_depth(pathspec, ce-name, ce_namelen(ce), 
0, NULL))
-   continue;
-   /*
-* If CE_VALID is on, we assume worktree file and its cache 
entry
-* are identical, even if worktree file has been modified, so 
use
-* cache version instead
-*/
-   if (cached || (ce-ce_flags  CE_VALID) || 
ce_skip_worktree(ce)) {
-   if (ce_stage(ce))
-   continue;
-   hit |= grep_sha1(opt, ce-sha1, ce-name, 0, ce-name);
-   }
-   else
-   hit |= grep_file(opt, ce-name);
-   if (ce_stage(ce)) {
-   do {
-   nr++;
-   } while (nr  active_nr 
-!strcmp(ce-name, active_cache[nr]-name));
-   nr--; /* compensate for loop control */
-   }
-   if (hit  opt-status_only)
-   break;
-   }
-   return hit;
+   if (!S_ISREG(ce-ce_mode))
+   return 0;
+   if (!match_pathspec_depth(opts-pathspec, ce-name, ce_namelen(ce), 0, 
NULL))
+   return 0;
+   /*
+* If CE_VALID is on, we assume worktree file and its cache entry
+* are identical, even if worktree file has been modified, so use
+* cache version instead
+*/
+   if (opts-cached || (ce-ce_flags  CE_VALID) || ce_skip_worktree(ce))
+   opts-hit |= grep_sha1(opts-opt, ce-sha1, ce-name, 0, 
ce-name);
+   else
+   opts-hit |= grep_file(opts-opt, ce-name);
+   if (opts-hit  opts-opt-status_only)
+   return 1;
+   return 0;
 }
 
 static int grep_tree(struct grep_opt *opt, const struct pathspec *pathspec,
@@ -895,10 +887,21 @@ int cmd_grep(int argc, const char **argv, const char 
*prefix)
} else if (0 = opt_exclude) {
die(_(--[no-]exclude-standard cannot be used for tracked 
contents.));
} else if (!list.nr) {
+   struct grep_opts opts;
+   struct filter_opts *filter_opts = xmalloc(sizeof(*filter_opts));
+
if (!cached)
setup_work_tree();
 
-   hit = grep_cache(opt, pathspec, cached);
+   memset(filter_opts, 0, sizeof(*filter_opts));
+   filter_opts-pathspec = pathspec.raw;
+   opts.opt = opt;
+   opts.pathspec = pathspec;
+   opts.cached = cached;
+   opts.hit = 0;
+   read_cache_filtered(filter_opts);
+   for_each_cache_entry(grep_cache, opts);
+   hit = opts.hit;
} else {
if (cached)
die(_(both --cached and trees are given.));
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 12/22] read-cache: make read_blob_data_from_index use index api

2013-07-07 Thread Thomas Gummerer
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 read-cache.c | 22 ++
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index 4529fab..c81e643 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1588,29 +1588,27 @@ int index_name_is_other(const struct index_state 
*istate, const char *name,
 
 void *read_blob_data_from_index(struct index_state *istate, const char *path, 
unsigned long *size)
 {
-   int pos, len;
+   int ret, len;
unsigned long sz;
enum object_type type;
void *data;
+   struct cache_entry *ce;
 
len = strlen(path);
-   pos = index_name_pos(istate, path, len);
-   if (pos  0) {
+   ret = get_index_entry_by_name(istate, path, len, ce);
+   if (!ret) {
/*
 * We might be in the middle of a merge, in which
 * case we would read stage #2 (ours).
 */
-   int i;
-   for (i = -pos - 1;
-(pos  0  i  istate-cache_nr 
- !strcmp(istate-cache[i]-name, path));
-i++)
-   if (ce_stage(istate-cache[i]) == 2)
-   pos = i;
+   for (; !ret  ce  !strcmp(ce-name, path); ce = 
next_index_entry(ce))
+   if (ce_stage(ce) == 2)
+   ret = 1;
+
}
-   if (pos  0)
+   if (!ret)
return NULL;
-   data = read_sha1_file(istate-cache[pos]-sha1, type, sz);
+   data = read_sha1_file(ce-sha1, type, sz);
if (!data || type != OBJ_BLOB) {
free(data);
return NULL;
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 11/22] ls-files.c: use the index api

2013-07-07 Thread Thomas Gummerer
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 builtin/ls-files.c | 213 +
 1 file changed, 100 insertions(+), 113 deletions(-)

diff --git a/builtin/ls-files.c b/builtin/ls-files.c
index 08d9786..82857d4 100644
--- a/builtin/ls-files.c
+++ b/builtin/ls-files.c
@@ -88,36 +88,35 @@ static void show_killed_files(struct dir_struct *dir)
for (i = 0; i  dir-nr; i++) {
struct dir_entry *ent = dir-entries[i];
char *cp, *sp;
-   int pos, len, killed = 0;
+   int len, killed = 0;
 
for (cp = ent-name; cp - ent-name  ent-len; cp = sp + 1) {
+   struct cache_entry *ce;
+
sp = strchr(cp, '/');
if (!sp) {
/* If ent-name is prefix of an entry in the
 * cache, it will be killed.
 */
-   pos = cache_name_pos(ent-name, ent-len);
-   if (0 = pos)
+   if (get_cache_entry_by_name(ent-name, 
ent-len, ce))
die(bug in show-killed-files);
-   pos = -pos - 1;
-   while (pos  active_nr 
-  ce_stage(active_cache[pos]))
-   pos++; /* skip unmerged */
-   if (active_nr = pos)
+   while (ce  ce_stage(ce))
+   ce = next_cache_entry(ce);
+   if (!ce)
break;
/* pos points at a name immediately after
 * ent-name in the cache.  Does it expect
 * ent-name to be a directory?
 */
-   len = ce_namelen(active_cache[pos]);
+   len = ce_namelen(ce);
if ((ent-len  len) 
-   !strncmp(active_cache[pos]-name,
+   !strncmp(ce-name,
 ent-name, ent-len) 
-   active_cache[pos]-name[ent-len] == '/')
+   ce-name[ent-len] == '/')
killed = 1;
break;
}
-   if (0 = cache_name_pos(ent-name, sp - ent-name)) {
+   if (get_cache_entry_by_name(ent-name, sp - ent-name, 
ce)) {
/* If any of the leading directories in
 * ent-name is registered in the cache,
 * ent-name will be killed.
@@ -213,10 +212,43 @@ static int ce_excluded(struct dir_struct *dir, struct 
cache_entry *ce)
return is_excluded(dir, ce-name, dtype);
 }
 
-static void show_files(struct dir_struct *dir)
+static int show_cached_stage(struct cache_entry *ce, void *cb_data)
 {
-   int i;
+   struct dir_struct *dir = cb_data;
+
+   if ((dir-flags  DIR_SHOW_IGNORED)  !ce_excluded(dir, ce))
+   return 0;
+   if (show_unmerged  !ce_stage(ce))
+   return 0;
+   if (ce-ce_flags  CE_UPDATE)
+   return 0;
+   show_ce_entry(ce_stage(ce) ? tag_unmerged :
+   (ce_skip_worktree(ce) ? tag_skip_worktree : tag_cached), ce);
+   return 0;
+}
 
+static int show_deleted_modified(struct cache_entry *ce, void *cb_data)
+{
+   struct stat st;
+   int err;
+   struct dir_struct *dir = cb_data;
+
+   if ((dir-flags  DIR_SHOW_IGNORED)  !ce_excluded(dir, ce))
+   return 0;
+   if (ce-ce_flags  CE_UPDATE)
+   return 0;
+   if (ce_skip_worktree(ce))
+   return 0;
+   err = lstat(ce-name, st);
+   if (show_deleted  err)
+   show_ce_entry(tag_removed, ce);
+   if (show_modified  ce_modified(ce, st, 0))
+   show_ce_entry(tag_modified, ce);
+   return 0;
+}
+
+static void show_files(struct dir_struct *dir)
+{
/* For cached/deleted files we don't need to even do the readdir */
if (show_others || show_killed) {
fill_directory(dir, pathspec);
@@ -225,66 +257,18 @@ static void show_files(struct dir_struct *dir)
if (show_killed)
show_killed_files(dir);
}
-   if (show_cached || show_stage) {
-   for (i = 0; i  active_nr; i++) {
-   struct cache_entry *ce = active_cache[i];
-   if ((dir-flags  DIR_SHOW_IGNORED) 
-   !ce_excluded(dir, ce))
-   continue

[PATCH 09/22] name-hash.c: use index api

2013-07-07 Thread Thomas Gummerer
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 name-hash.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/name-hash.c b/name-hash.c
index 617c86c..6551849 100644
--- a/name-hash.c
+++ b/name-hash.c
@@ -144,16 +144,19 @@ static void hash_index_entry(struct index_state *istate, 
struct cache_entry *ce)
add_dir_entry(istate, ce);
 }
 
-static void lazy_init_name_hash(struct index_state *istate)
+static int hash_entry(struct cache_entry *ce, void *istate)
 {
-   int nr;
+   hash_index_entry((struct index_state *)istate, ce);
+   return 0;
+}
 
+static void lazy_init_name_hash(struct index_state *istate)
+{
if (istate-name_hash_initialized)
return;
if (istate-cache_nr)
preallocate_hash(istate-name_hash, istate-cache_nr);
-   for (nr = 0; nr  istate-cache_nr; nr++)
-   hash_index_entry(istate, istate-cache[nr]);
+   for_each_index_entry(istate, hash_entry, istate);
istate-name_hash_initialized = 1;
 }
 
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 16/22] read-cache: read resolve-undo data

2013-07-07 Thread Thomas Gummerer
Make git read the resolve-undo data from the index.

Since the resolve-undo data is joined with the conflicts in
the ondisk format of the index file version 5, conflicts and
resolved data is read at the same time, and the resolve-undo
data is then converted to the in-memory format.

Helped-by: Thomas Rast tr...@student.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 read-cache-v5.c | 39 +++
 1 file changed, 39 insertions(+)

diff --git a/read-cache-v5.c b/read-cache-v5.c
index e319f30..193970a 100644
--- a/read-cache-v5.c
+++ b/read-cache-v5.c
@@ -1,5 +1,6 @@
 #include cache.h
 #include read-cache.h
+#include string-list.h
 #include resolve-undo.h
 #include cache-tree.h
 #include dir.h
@@ -447,6 +448,43 @@ static int read_conflicts(struct conflict_entry **head,
return 0;
 }
 
+static void resolve_undo_convert_v5(struct index_state *istate,
+   struct conflict_entry *conflict)
+{
+   int i;
+
+   while (conflict) {
+   struct string_list_item *lost;
+   struct resolve_undo_info *ui;
+   struct conflict_part *cp;
+
+   if (conflict-entries 
+   (conflict-entries-flags  CONFLICT_CONFLICTED) != 0) {
+   conflict = conflict-next;
+   continue;
+   }
+   if (!istate-resolve_undo) {
+   istate-resolve_undo = xcalloc(1, sizeof(struct 
string_list));
+   istate-resolve_undo-strdup_strings = 1;
+   }
+
+   lost = string_list_insert(istate-resolve_undo, conflict-name);
+   if (!lost-util)
+   lost-util = xcalloc(1, sizeof(*ui));
+   ui = lost-util;
+
+   cp = conflict-entries;
+   for (i = 0; i  3; i++)
+   ui-mode[i] = 0;
+   while (cp) {
+   ui-mode[conflict_stage(cp) - 1] = cp-entry_mode;
+   hashcpy(ui-sha1[conflict_stage(cp) - 1], cp-sha1);
+   cp = cp-next;
+   }
+   conflict = conflict-next;
+   }
+}
+
 static int read_entries(struct index_state *istate, struct directory_entry 
**de,
unsigned int *entry_offset, void **mmap,
unsigned long mmap_size, unsigned int *nr,
@@ -460,6 +498,7 @@ static int read_entries(struct index_state *istate, struct 
directory_entry **de,
conflict_queue = NULL;
if (read_conflicts(conflict_queue, *de, mmap, mmap_size)  0)
return -1;
+   resolve_undo_convert_v5(istate, conflict_queue);
for (i = 0; i  (*de)-de_nfiles; i++) {
if (read_entry(ce,
   *de,
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 20/22] read-cache: write resolve-undo data for index-v5

2013-07-07 Thread Thomas Gummerer
Make git write the resolve-undo data to the index.

Since the resolve-undo data is joined with the conflicts in
the ondisk format of the index file version 5, the in-memory
resolve-undo data is converted to conflict entries, which are
added to the directory entries and written with the conflicts.

Helped-by: Thomas Rast tr...@student.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 read-cache-v5.c | 94 +
 1 file changed, 94 insertions(+)

diff --git a/read-cache-v5.c b/read-cache-v5.c
index 306de30..412db53 100644
--- a/read-cache-v5.c
+++ b/read-cache-v5.c
@@ -1011,6 +1011,99 @@ static void cache_tree_to_ondisk_v5(struct hash_table 
*table, struct cache_tree
convert_one_to_ondisk_v5(table, root, , 0, 0);
 }
 
+static void resolve_undo_to_ondisk_v5(struct hash_table *table,
+ struct string_list *resolve_undo,
+ unsigned int *ndir, int *total_dir_len,
+ struct directory_entry *de)
+{
+   struct string_list_item *item;
+   struct directory_entry *search;
+
+   if (!resolve_undo)
+   return;
+   for_each_string_list_item(item, resolve_undo) {
+   struct conflict_entry *conflict_entry;
+   struct resolve_undo_info *ui = item-util;
+   char *super;
+   int i, dir_len, len;
+   uint32_t crc;
+   struct directory_entry *found, *current, *new_tree;
+
+   if (!ui)
+   continue;
+
+   super = super_directory(item-string);
+   if (!super)
+   dir_len = 0;
+   else
+   dir_len = strlen(super);
+   crc = crc32(0, (Bytef*)super, dir_len);
+   found = lookup_hash(crc, table);
+   current = NULL;
+   new_tree = NULL;
+
+   while (!found) {
+   struct directory_entry *new;
+
+   new = init_directory_entry(super, dir_len);
+   if (!current)
+   current = new;
+   insert_directory_entry(new, table, total_dir_len, ndir, 
crc);
+   if (new_tree != NULL)
+   new-de_nsubtrees = 1;
+   new-next = new_tree;
+   new_tree = new;
+   super = super_directory(super);
+   if (!super)
+   dir_len = 0;
+   else
+   dir_len = strlen(super);
+   crc = crc32(0, (Bytef*)super, dir_len);
+   found = lookup_hash(crc, table);
+   }
+   search = found;
+   while (search-next_hash  strcmp(super, search-pathname) != 
0)
+   search = search-next_hash;
+   if (search  !current)
+   current = search;
+   if (!search  !current)
+   current = new_tree;
+   if (!super  new_tree) {
+   new_tree-next = de-next;
+   de-next = new_tree;
+   de-de_nsubtrees++;
+   } else if (new_tree) {
+   struct directory_entry *temp;
+
+   search = de-next;
+   while (strcmp(super, search-pathname))
+   search = search-next;
+   temp = new_tree;
+   while (temp-next)
+   temp = temp-next;
+   search-de_nsubtrees++;
+   temp-next = search-next;
+   search-next = new_tree;
+   }
+
+   len = strlen(item-string);
+   conflict_entry = create_new_conflict(item-string, len, 
current-de_pathlen);
+   add_conflict_to_directory_entry(current, conflict_entry);
+   for (i = 0; i  3; i++) {
+   if (ui-mode[i]) {
+   struct conflict_part *cp;
+
+   cp = xmalloc(sizeof(struct conflict_part));
+   cp-flags = (i + 1)  CONFLICT_STAGESHIFT;
+   cp-entry_mode = ui-mode[i];
+   cp-next = NULL;
+   hashcpy(cp-sha1, ui-sha1[i]);
+   add_part_to_conflict_entry(current, 
conflict_entry, cp);
+   }
+   }
+   }
+}
+
 static struct directory_entry *compile_directory_data(struct index_state 
*istate,
int nfile,
unsigned int *ndir,
@@ -1118,6 +1211,7 @@ static struct directory_entry 
*compile_directory_data

[PATCH 15/22] read-cache: read index-v5

2013-07-07 Thread Thomas Gummerer
Make git read the index file version 5 without complaining.

This version of the reader reads neither the cache-tree
nor the resolve undo data, but doesn't choke on an index that
includes such data.

Helped-by: Junio C Hamano gits...@pobox.com
Helped-by: Nguyen Thai Ngoc Duy pclo...@gmail.com
Helped-by: Thomas Rast tr...@student.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 Makefile|   1 +
 cache.h |  75 ++-
 read-cache-v5.c | 658 
 read-cache.h|   1 +
 4 files changed, 734 insertions(+), 1 deletion(-)
 create mode 100644 read-cache-v5.c

diff --git a/Makefile b/Makefile
index 73369ae..80e35f5 100644
--- a/Makefile
+++ b/Makefile
@@ -856,6 +856,7 @@ LIB_OBJS += quote.o
 LIB_OBJS += reachable.o
 LIB_OBJS += read-cache.o
 LIB_OBJS += read-cache-v2.o
+LIB_OBJS += read-cache-v5.o
 LIB_OBJS += reflog-walk.o
 LIB_OBJS += refs.o
 LIB_OBJS += remote.o
diff --git a/cache.h b/cache.h
index d77af5e..e110ec8 100644
--- a/cache.h
+++ b/cache.h
@@ -99,7 +99,7 @@ unsigned long git_deflate_bound(git_zstream *, unsigned long);
 #define CACHE_SIGNATURE 0x44495243 /* DIRC */
 
 #define INDEX_FORMAT_LB 2
-#define INDEX_FORMAT_UB 4
+#define INDEX_FORMAT_UB 5
 
 /*
  * The cache_time is just the low 32 bits of the
@@ -121,6 +121,15 @@ struct stat_data {
unsigned int sd_size;
 };
 
+/*
+ * The *next pointer is used in read_entries_v5 for holding
+ * all the elements of a directory, and points to the next
+ * cache_entry in a directory.
+ *
+ * It is reset by the add_name_hash call in set_index_entry
+ * to set it to point to the next cache_entry in the
+ * correct in-memory format ordering.
+ */
 struct cache_entry {
struct stat_data ce_stat_data;
unsigned int ce_mode;
@@ -133,11 +142,59 @@ struct cache_entry {
char name[FLEX_ARRAY]; /* more */
 };
 
+struct directory_entry {
+   struct directory_entry *next;
+   struct directory_entry *next_hash;
+   struct cache_entry *ce;
+   struct cache_entry *ce_last;
+   struct conflict_entry *conflict;
+   struct conflict_entry *conflict_last;
+   unsigned int conflict_size;
+   unsigned int de_foffset;
+   unsigned int de_cr;
+   unsigned int de_ncr;
+   unsigned int de_nsubtrees;
+   unsigned int de_nfiles;
+   unsigned int de_nentries;
+   unsigned char sha1[20];
+   unsigned short de_flags;
+   unsigned int de_pathlen;
+   char pathname[FLEX_ARRAY];
+};
+
+struct conflict_part {
+   struct conflict_part *next;
+   unsigned short flags;
+   unsigned short entry_mode;
+   unsigned char sha1[20];
+};
+
+struct conflict_entry {
+   struct conflict_entry *next;
+   unsigned int nfileconflicts;
+   struct conflict_part *entries;
+   unsigned int namelen;
+   unsigned int pathlen;
+   char name[FLEX_ARRAY];
+};
+
+struct ondisk_conflict_part {
+   unsigned short flags;
+   unsigned short entry_mode;
+   unsigned char sha1[20];
+};
+
+#define CE_NAMEMASK  (0x0fff)
 #define CE_STAGEMASK (0x3000)
 #define CE_EXTENDED  (0x4000)
 #define CE_VALID (0x8000)
+#define CE_SMUDGED   (0x0400) /* index v5 only flag */
 #define CE_STAGESHIFT 12
 
+#define CONFLICT_CONFLICTED (0x8000)
+#define CONFLICT_STAGESHIFT 13
+#define CONFLICT_STAGEMASK (0x6000)
+
 /*
  * Range 0x in ce_flags is divided into
  * two parts: in-memory flags and on-disk ones.
@@ -174,6 +231,18 @@ struct cache_entry {
 #define CE_EXTENDED_FLAGS (CE_INTENT_TO_ADD | CE_SKIP_WORKTREE)
 
 /*
+ * Representation of the extended on-disk flags in the v5 format.
+ * They must not collide with the ordinary on-disk flags, and need to
+ * fit in 16 bits.  Note however that v5 does not save the name
+ * length.
+ */
+#define CE_INTENT_TO_ADD_V5  (0x4000)
+#define CE_SKIP_WORKTREE_V5  (0x0800)
+#if (CE_VALID|CE_STAGEMASK)  (CE_INTENTTOADD_V5|CE_SKIPWORKTREE_V5)
+#error v5 on-disk flags collide with ordinary on-disk flags
+#endif
+
+/*
  * Safeguard to avoid saving wrong flags:
  *  - CE_EXTENDED2 won't get saved until its semantic is known
  *  - Bits in 0x have been saved in ce_flags already
@@ -212,6 +281,8 @@ static inline unsigned create_ce_flags(unsigned stage)
 #define ce_skip_worktree(ce) ((ce)-ce_flags  CE_SKIP_WORKTREE)
 #define ce_mark_uptodate(ce) ((ce)-ce_flags |= CE_UPTODATE)
 
+#define conflict_stage(c) ((CONFLICT_STAGEMASK  (c)-flags)  
CONFLICT_STAGESHIFT)
+
 #define ce_permissions(mode) (((mode)  0100) ? 0755 : 0644)
 static inline unsigned int create_ce_mode(unsigned int mode)
 {
@@ -259,6 +330,8 @@ static inline unsigned int canon_mode(unsigned int mode)
 }
 
 #define cache_entry_size(len) (offsetof(struct cache_entry,name) + (len) + 1)
+#define directory_entry_size(len) (offsetof(struct directory_entry,pathname) + 
(len) + 1)
+#define conflict_entry_size(len) (offsetof(struct conflict_entry,name) + (len) 
+ 1)
 
 /*
  * Options by which the index

[PATCH 17/22] read-cache: read cache-tree in index-v5

2013-07-07 Thread Thomas Gummerer
Since the cache-tree data is saved as part of the directory data,
we already read it at the beginning of the index. The cache-tree
is only converted from this directory data.

The cache-tree data is arranged in a tree, with the children sorted by
pathlen at each node, while the ondisk format is sorted lexically.
So we have to rebuild this format from the on-disk directory list.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 cache-tree.c|   2 +-
 cache-tree.h|   6 
 read-cache-v5.c | 100 
 3 files changed, 107 insertions(+), 1 deletion(-)

diff --git a/cache-tree.c b/cache-tree.c
index 37e4d00..f4b0917 100644
--- a/cache-tree.c
+++ b/cache-tree.c
@@ -31,7 +31,7 @@ void cache_tree_free(struct cache_tree **it_p)
*it_p = NULL;
 }
 
-static int subtree_name_cmp(const char *one, int onelen,
+int subtree_name_cmp(const char *one, int onelen,
const char *two, int twolen)
 {
if (onelen  twolen)
diff --git a/cache-tree.h b/cache-tree.h
index 55d0f59..9aac493 100644
--- a/cache-tree.h
+++ b/cache-tree.h
@@ -21,10 +21,16 @@ struct cache_tree {
struct cache_tree_sub **down;
 };
 
+struct directory_queue {
+   struct directory_queue *down;
+   struct directory_entry *de;
+};
+
 struct cache_tree *cache_tree(void);
 void cache_tree_free(struct cache_tree **);
 void cache_tree_invalidate_path(struct cache_tree *, const char *);
 struct cache_tree_sub *cache_tree_sub(struct cache_tree *, const char *);
+int subtree_name_cmp(const char *, int, const char *, int);
 
 void cache_tree_write(struct strbuf *, struct cache_tree *root);
 struct cache_tree *cache_tree_read(const char *buffer, unsigned long size);
diff --git a/read-cache-v5.c b/read-cache-v5.c
index 193970a..f1ad132 100644
--- a/read-cache-v5.c
+++ b/read-cache-v5.c
@@ -448,6 +448,103 @@ static int read_conflicts(struct conflict_entry **head,
return 0;
 }
 
+static struct cache_tree *convert_one(struct directory_queue *queue, int dirnr)
+{
+   int i, subtree_nr;
+   struct cache_tree *it;
+   struct directory_queue *down;
+
+   it = cache_tree();
+   it-entry_count = queue[dirnr].de-de_nentries;
+   subtree_nr = queue[dirnr].de-de_nsubtrees;
+   if (0 = it-entry_count)
+   hashcpy(it-sha1, queue[dirnr].de-sha1);
+
+   /*
+* Just a heuristic -- we do not add directories that often but
+* we do not want to have to extend it immediately when we do,
+* hence +2.
+*/
+   it-subtree_alloc = subtree_nr + 2;
+   it-down = xcalloc(it-subtree_alloc, sizeof(struct cache_tree_sub *));
+   down = queue[dirnr].down;
+   for (i = 0; i  subtree_nr; i++) {
+   struct cache_tree *sub;
+   struct cache_tree_sub *subtree;
+   char *buf, *name;
+
+   name = ;
+   buf = strtok(down[i].de-pathname, /);
+   while (buf) {
+   name = buf;
+   buf = strtok(NULL, /);
+   }
+   sub = convert_one(down, i);
+   if(!sub)
+   goto free_return;
+   subtree = cache_tree_sub(it, name);
+   subtree-cache_tree = sub;
+   }
+   if (subtree_nr != it-subtree_nr)
+   die(cache-tree: internal error);
+   return it;
+ free_return:
+   cache_tree_free(it);
+   return NULL;
+}
+
+static int compare_cache_tree_elements(const void *a, const void *b)
+{
+   const struct directory_entry *de1, *de2;
+
+   de1 = ((const struct directory_queue *)a)-de;
+   de2 = ((const struct directory_queue *)b)-de;
+   return subtree_name_cmp(de1-pathname, de1-de_pathlen,
+   de2-pathname, de2-de_pathlen);
+}
+
+static struct directory_entry *sort_directories(struct directory_entry *de,
+   struct directory_queue *queue)
+{
+   int i, nsubtrees;
+
+   nsubtrees = de-de_nsubtrees;
+   for (i = 0; i  nsubtrees; i++) {
+   struct directory_entry *new_de;
+   de = de-next;
+   new_de = xmalloc(directory_entry_size(de-de_pathlen));
+   memcpy(new_de, de, directory_entry_size(de-de_pathlen));
+   queue[i].de = new_de;
+   if (de-de_nsubtrees) {
+   queue[i].down = xcalloc(de-de_nsubtrees,
+   sizeof(struct directory_queue));
+   de = sort_directories(de,
+   queue[i].down);
+   }
+   }
+   qsort(queue, nsubtrees, sizeof(struct directory_queue),
+   compare_cache_tree_elements);
+   return de;
+}
+
+/*
+ * This function modifies the directory argument that is given to it.
+ * Don't use it if the directory entries are still needed after.
+ */
+static struct

[PATCH 21/22] update-index.c: rewrite index when index-version is given

2013-07-07 Thread Thomas Gummerer
Make update-index always rewrite the index when an index-version
is given, even if the index already has the right version.
This option is used for performance testing the writer and
reader.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 builtin/update-index.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/builtin/update-index.c b/builtin/update-index.c
index 03f6426..7954ddb 100644
--- a/builtin/update-index.c
+++ b/builtin/update-index.c
@@ -6,6 +6,7 @@
 #include cache.h
 #include quote.h
 #include cache-tree.h
+#include read-cache.h
 #include tree-walk.h
 #include builtin.h
 #include refs.h
@@ -863,8 +864,7 @@ int cmd_update_index(int argc, const char **argv, const 
char *prefix)
preferred_index_format,
INDEX_FORMAT_LB, INDEX_FORMAT_UB);
 
-   if (the_index.version != preferred_index_format)
-   active_cache_changed = 1;
+   active_cache_changed = 1;
the_index.version = preferred_index_format;
}
 
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 03/22] read-cache: move index v2 specific functions to their own file

2013-07-07 Thread Thomas Gummerer
Move index version 2 specific functions to their own file. The non-index
specific functions will be in read-cache.c, while the index version 2
specific functions will be in read-cache-v2.c.

Helped-by: Nguyen Thai Ngoc Duy pclo...@gmail.com
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 Makefile |   2 +
 cache.h  |  16 +-
 read-cache-v2.c  | 556 +
 read-cache.c | 575 ---
 read-cache.h |  57 +
 test-index-version.c |   5 +
 6 files changed, 661 insertions(+), 550 deletions(-)
 create mode 100644 read-cache-v2.c
 create mode 100644 read-cache.h

diff --git a/Makefile b/Makefile
index 5a68fe5..73369ae 100644
--- a/Makefile
+++ b/Makefile
@@ -711,6 +711,7 @@ LIB_H += progress.h
 LIB_H += prompt.h
 LIB_H += quote.h
 LIB_H += reachable.h
+LIB_H += read-cache.h
 LIB_H += reflog-walk.h
 LIB_H += refs.h
 LIB_H += remote.h
@@ -854,6 +855,7 @@ LIB_OBJS += prompt.o
 LIB_OBJS += quote.o
 LIB_OBJS += reachable.o
 LIB_OBJS += read-cache.o
+LIB_OBJS += read-cache-v2.o
 LIB_OBJS += reflog-walk.o
 LIB_OBJS += refs.o
 LIB_OBJS += remote.o
diff --git a/cache.h b/cache.h
index 7af853b..5082b34 100644
--- a/cache.h
+++ b/cache.h
@@ -95,19 +95,8 @@ unsigned long git_deflate_bound(git_zstream *, unsigned 
long);
  */
 #define DEFAULT_GIT_PORT 9418
 
-/*
- * Basic data structures for the directory cache
- */
 
 #define CACHE_SIGNATURE 0x44495243 /* DIRC */
-struct cache_version_header {
-   unsigned int hdr_signature;
-   unsigned int hdr_version;
-};
-
-struct cache_header {
-   unsigned int hdr_entries;
-};
 
 #define INDEX_FORMAT_LB 2
 #define INDEX_FORMAT_UB 4
@@ -280,6 +269,7 @@ struct index_state {
 initialized : 1;
struct hash_table name_hash;
struct hash_table dir_hash;
+   struct index_ops *ops;
 };
 
 extern struct index_state the_index;
@@ -489,8 +479,8 @@ extern void *read_blob_data_from_index(struct index_state 
*, const char *, unsig
 #define CE_MATCH_RACY_IS_DIRTY 02
 /* do stat comparison even if CE_SKIP_WORKTREE is true */
 #define CE_MATCH_IGNORE_SKIP_WORKTREE  04
-extern int ie_match_stat(const struct index_state *, const struct cache_entry 
*, struct stat *, unsigned int);
-extern int ie_modified(const struct index_state *, const struct cache_entry *, 
struct stat *, unsigned int);
+extern int ie_match_stat(struct index_state *, const struct cache_entry *, 
struct stat *, unsigned int);
+extern int ie_modified(struct index_state *, const struct cache_entry *, 
struct stat *, unsigned int);
 
 #define PATHSPEC_ONESTAR 1 /* the pathspec pattern sastisfies GFNM_ONESTAR 
*/
 
diff --git a/read-cache-v2.c b/read-cache-v2.c
new file mode 100644
index 000..a6883c3
--- /dev/null
+++ b/read-cache-v2.c
@@ -0,0 +1,556 @@
+#include cache.h
+#include read-cache.h
+#include resolve-undo.h
+#include cache-tree.h
+#include varint.h
+
+/* Mask for the name length in ce_flags in the on-disk index */
+#define CE_NAMEMASK  (0x0fff)
+
+struct cache_header {
+   unsigned int hdr_entries;
+};
+
+/*
+ * Index File I/O
+ */
+
+/*
+ * dev/ino/uid/gid/size are also just tracked to the low 32 bits
+ * Again - this is just a (very strong in practice) heuristic that
+ * the inode hasn't changed.
+ *
+ * We save the fields in big-endian order to allow using the
+ * index file over NFS transparently.
+ */
+struct ondisk_cache_entry {
+   struct cache_time ctime;
+   struct cache_time mtime;
+   unsigned int dev;
+   unsigned int ino;
+   unsigned int mode;
+   unsigned int uid;
+   unsigned int gid;
+   unsigned int size;
+   unsigned char sha1[20];
+   unsigned short flags;
+   char name[FLEX_ARRAY]; /* more */
+};
+
+/*
+ * This struct is used when CE_EXTENDED bit is 1
+ * The struct must match ondisk_cache_entry exactly from
+ * ctime till flags
+ */
+struct ondisk_cache_entry_extended {
+   struct cache_time ctime;
+   struct cache_time mtime;
+   unsigned int dev;
+   unsigned int ino;
+   unsigned int mode;
+   unsigned int uid;
+   unsigned int gid;
+   unsigned int size;
+   unsigned char sha1[20];
+   unsigned short flags;
+   unsigned short flags2;
+   char name[FLEX_ARRAY]; /* more */
+};
+
+/* These are only used for v3 or lower */
+#define align_flex_name(STRUCT,len) ((offsetof(struct STRUCT,name) + (len) + 
8)  ~7)
+#define ondisk_cache_entry_size(len) align_flex_name(ondisk_cache_entry,len)
+#define ondisk_cache_entry_extended_size(len) 
align_flex_name(ondisk_cache_entry_extended,len)
+#define ondisk_ce_size(ce) (((ce)-ce_flags  CE_EXTENDED) ? \
+   ondisk_cache_entry_extended_size(ce_namelen(ce)) : \
+   ondisk_cache_entry_size(ce_namelen

[PATCH 19/22] read-cache: write index-v5 cache-tree data

2013-07-07 Thread Thomas Gummerer
Write the cache-tree data for the index version 5 file format. The
in-memory cache-tree data is converted to the ondisk format by adding
it to the directory entries that were compiled from the cache-entries
in the previous step.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 read-cache-v5.c | 53 +
 1 file changed, 53 insertions(+)

diff --git a/read-cache-v5.c b/read-cache-v5.c
index f056f6b..306de30 100644
--- a/read-cache-v5.c
+++ b/read-cache-v5.c
@@ -960,6 +960,57 @@ static struct conflict_entry 
*create_conflict_entry_from_ce(struct cache_entry *
return create_new_conflict(ce-name, ce_namelen(ce), pathlen);
 }
 
+static void convert_one_to_ondisk_v5(struct hash_table *table, struct 
cache_tree *it,
+   const char *path, int pathlen, uint32_t crc)
+{
+   int i;
+   struct directory_entry *found, *search;
+
+   crc = crc32(crc, (Bytef*)path, pathlen);
+   found = lookup_hash(crc, table);
+   search = found;
+   while (search  strcmp(path, search-pathname + search-de_pathlen - 
strlen(path)) != 0)
+   search = search-next_hash;
+   if (!search)
+   return;
+   /*
+* The number of subtrees is already calculated by
+* compile_directory_data, therefore we only need to
+* add the entry_count
+*/
+   search-de_nentries = it-entry_count;
+   if (0 = it-entry_count)
+   hashcpy(search-sha1, it-sha1);
+   if (strcmp(path, ) != 0)
+   crc = crc32(crc, (Bytef*)/, 1);
+
+#if DEBUG
+   if (0 = it-entry_count)
+   fprintf(stderr, cache-tree %.*s (%d ent, %d subtree) %s\n,
+   pathlen, path, it-entry_count, it-subtree_nr,
+   sha1_to_hex(it-sha1));
+   else
+   fprintf(stderr, cache-tree %.*s (%d subtree) invalid\n,
+   pathlen, path, it-subtree_nr);
+#endif
+
+   for (i = 0; i  it-subtree_nr; i++) {
+   struct cache_tree_sub *down = it-down[i];
+   if (i) {
+   struct cache_tree_sub *prev = it-down[i-1];
+   if (subtree_name_cmp(down-name, down-namelen,
+prev-name, prev-namelen) = 0)
+   die(fatal - unsorted cache subtree);
+   }
+   convert_one_to_ondisk_v5(table, down-cache_tree, down-name, 
down-namelen, crc);
+   }
+}
+
+static void cache_tree_to_ondisk_v5(struct hash_table *table, struct 
cache_tree *root)
+{
+   convert_one_to_ondisk_v5(table, root, , 0, 0);
+}
+
 static struct directory_entry *compile_directory_data(struct index_state 
*istate,
int nfile,
unsigned int *ndir,
@@ -1065,6 +1116,8 @@ static struct directory_entry 
*compile_directory_data(struct index_state *istate
previous_entry-next = no_subtrees;
}
}
+   if (istate-cache_tree)
+   cache_tree_to_ondisk_v5(table, istate-cache_tree);
return de;
 }
 
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 18/22] read-cache: write index-v5

2013-07-07 Thread Thomas Gummerer
Write the index version 5 file format to disk. This version doesn't
write the cache-tree data and resolve-undo data to the file.

The main work is done when filtering out the directories from the
current in-memory format, where in the same turn also the conflicts
and the file data is calculated.

Helped-by: Nguyen Thai Ngoc Duy pclo...@gmail.com
Helped-by: Thomas Rast tr...@student.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 cache.h |   8 +
 read-cache-v5.c | 594 +++-
 read-cache.c|  11 +-
 read-cache.h|   1 +
 4 files changed, 611 insertions(+), 3 deletions(-)

diff --git a/cache.h b/cache.h
index e110ec8..a92b490 100644
--- a/cache.h
+++ b/cache.h
@@ -581,6 +581,7 @@ extern int unmerged_index(const struct index_state *);
 extern int verify_path(const char *path);
 extern struct cache_entry *index_name_exists(struct index_state *istate, const 
char *name, int namelen, int igncase);
 extern int index_name_pos(const struct index_state *, const char *name, int 
namelen);
+extern struct directory_entry *init_directory_entry(char *pathname, int len);
 #define ADD_CACHE_OK_TO_ADD 1  /* Ok to add */
 #define ADD_CACHE_OK_TO_REPLACE 2  /* Ok to replace file/directory */
 #define ADD_CACHE_SKIP_DFCHECK 4   /* Ok to skip DF conflict checks */
@@ -1379,6 +1380,13 @@ static inline ssize_t write_str_in_full(int fd, const 
char *str)
return write_in_full(fd, str, strlen(str));
 }
 
+/* index-v5 helper functions */
+extern char *super_directory(const char *filename);
+extern void insert_directory_entry(struct directory_entry *, struct hash_table 
*, int *, unsigned int *, uint32_t);
+extern void add_conflict_to_directory_entry(struct directory_entry *, struct 
conflict_entry *);
+extern void add_part_to_conflict_entry(struct directory_entry *, struct 
conflict_entry *, struct conflict_part *);
+extern struct conflict_entry *create_new_conflict(char *, int, int);
+
 /* pager.c */
 extern void setup_pager(void);
 extern const char *pager_program;
diff --git a/read-cache-v5.c b/read-cache-v5.c
index f1ad132..f056f6b 100644
--- a/read-cache-v5.c
+++ b/read-cache-v5.c
@@ -788,10 +788,602 @@ static void index_change_filter_opts_v5(struct 
index_state *istate, struct filte
read_index_filtered(istate, opts);
 }
 
+#define WRITE_BUFFER_SIZE 8192
+static unsigned char write_buffer[WRITE_BUFFER_SIZE];
+static unsigned long write_buffer_len;
+
+static int ce_write_flush(int fd)
+{
+   unsigned int buffered = write_buffer_len;
+   if (buffered) {
+   if (write_in_full(fd, write_buffer, buffered) != buffered)
+   return -1;
+   write_buffer_len = 0;
+   }
+   return 0;
+}
+
+static int ce_write(uint32_t *crc, int fd, void *data, unsigned int len)
+{
+   if (crc)
+   *crc = crc32(*crc, (Bytef*)data, len);
+   while (len) {
+   unsigned int buffered = write_buffer_len;
+   unsigned int partial = WRITE_BUFFER_SIZE - buffered;
+   if (partial  len)
+   partial = len;
+   memcpy(write_buffer + buffered, data, partial);
+   buffered += partial;
+   if (buffered == WRITE_BUFFER_SIZE) {
+   write_buffer_len = buffered;
+   if (ce_write_flush(fd))
+   return -1;
+   buffered = 0;
+   }
+   write_buffer_len = buffered;
+   len -= partial;
+   data = (char *) data + partial;
+   }
+   return 0;
+}
+
+static int ce_flush(int fd)
+{
+   unsigned int left = write_buffer_len;
+
+   if (left)
+   write_buffer_len = 0;
+
+   if (write_in_full(fd, write_buffer, left) != left)
+   return -1;
+
+   return 0;
+}
+
+static void ce_smudge_racily_clean_entry(struct cache_entry *ce)
+{
+   /*
+* This method shall only be called if the timestamp of ce
+* is racy (check with is_racy_timestamp). If the timestamp
+* is racy, the writer will set the CE_SMUDGED flag.
+*
+* The reader (match_stat_basic) will then take care
+* of checking if the entry is really changed or not, by
+* taking into account the size and the stat_crc and if
+* that hasn't changed checking the sha1.
+*/
+   ce-ce_flags |= CE_SMUDGED;
+}
+
+char *super_directory(const char *filename)
+{
+   char *slash;
+
+   slash = strrchr(filename, '/');
+   if (slash)
+   return xmemdupz(filename, slash-filename);
+   return NULL;
+}
+
+struct directory_entry *init_directory_entry(char *pathname, int len)
+{
+   struct directory_entry *de = xmalloc(directory_entry_size(len));
+
+   memcpy(de-pathname, pathname, len);
+   de-pathname[len] = '\0';
+   de-de_flags  = 0;
+   de-de_foffset= 0;
+   de

[PATCH 13/22] documentation: add documentation of the index-v5 file format

2013-07-07 Thread Thomas Gummerer
Add a documentation of the index file format version 5 to
Documentation/technical.

Helped-by: Michael Haggerty mhag...@alum.mit.edu
Helped-by: Junio C Hamano gits...@pobox.com
Helped-by: Thomas Rast tr...@student.ethz.ch
Helped-by: Nguyen Thai Ngoc Duy pclo...@gmail.com
Helped-by: Robin Rosenberg robin.rosenb...@dewire.com
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 Documentation/technical/index-file-format-v5.txt | 296 +++
 1 file changed, 296 insertions(+)
 create mode 100644 Documentation/technical/index-file-format-v5.txt

diff --git a/Documentation/technical/index-file-format-v5.txt 
b/Documentation/technical/index-file-format-v5.txt
new file mode 100644
index 000..4213087
--- /dev/null
+++ b/Documentation/technical/index-file-format-v5.txt
@@ -0,0 +1,296 @@
+GIT index format
+
+
+== The git index
+
+   The git index file (.git/index) documents the status of the files
+ in the git staging area.
+
+   The staging area is used for preparing commits, merging, etc.
+
+== The git index file format
+
+   All binary numbers are in network byte order. Version 5 is described
+ here. The index file consists of various sections. They appear in
+ the following order in the file.
+
+   - header: the description of the index format, including its signature,
+ version and various other fields that are used internally.
+
+   - diroffsets (ndir entries of directory offset): A 4-byte offset
+   relative to the beginning of the direntries block (see below)
+   for each of the ndir directories in the index, sorted by pathname
+   (of the directory it's pointing to). [1]
+
+   - direntries (ndir entries of directory entry): A directory entry
+   for each of the ndir directories in the index, sorted by pathname
+   (see below). [2]
+
+   - fileoffsets (nfile entries of file offset): A 4-byte offset
+   relative to the beginning of the fileentries block (see below)
+   for each of the nfile files in the index. [1]
+
+   - fileentries (nfile entries of file entry): A file entry for
+   each of the nfile files in the index (see below).
+
+   - crdata: A number of entries for conflicted data/resolved conflicts
+   (see below).
+
+   - Extensions (Currently none, see below in the future)
+
+ Extensions are identified by signature. Optional extensions can
+ be ignored if GIT does not understand them.
+
+ GIT supports an arbitrary number of extensions, but currently none
+ is implemented. [3]
+
+ extsig (32-bits): extension signature. If the first byte is 'A'..'Z'
+ the extension is optional and can be ignored.
+
+ extsize (32-bits): size of the extension, excluding the header
+   (extsig, extsize, extchecksum).
+
+ extchecksum (32-bits): crc32 checksum of the extension signature
+   and size.
+
+- Extension data.
+
+== Header
+   sig (32-bits): Signature:
+ The signature is { 'D', 'I', 'R', 'C' } (stands for dircache)
+
+   vnr (32-bits): Version number:
+ The current supported versions are 2, 3, 4 and 5.
+
+   ndir (32-bits): number of directories in the index.
+
+   nfile (32-bits): number of file entries in the index.
+
+   fblockoffset (32-bits): offset to the file block, relative to the
+ beginning of the file.
+
+   - Offset to the extensions.
+
+ nextensions (32-bits): number of extensions.
+
+ extoffset (32-bits): offset to the extension. (Possibly none, as
+   many as indicated in the 4-byte number of extensions)
+
+   headercrc (32-bits): crc checksum including the header and the
+ offsets to the extensions.
+
+
+== Directory offsets (diroffsets)
+
+  diroffset (32-bits): offset to the directory relative to the beginning
+of the index file. There are ndir + 1 offsets in the diroffset table,
+the last is pointing to the end of the last direntry. With this last
+entry, we are able to replace the strlen call when reading the directory
+name, by calculating it from diroffset[n+1]-diroffset[n]-61.  61 is the
+size of the directory data, which follows each directory + the
+crc sum + the NUL byte.
+
+  This part is needed for making the directory entries bisectable and
+thus allowing a binary search.
+
+== Directory entry (direntries)
+
+  Directory entries are sorted in lexicographic order by the name
+of their path starting with the root.
+
+  pathname (variable length, nul terminated): relative to top level
+directory (without the leading slash). '/' is used as path
+separator. A string of length 0 ('') indicates the root directory.
+The special path components ".", and ".." (without quotes) are
+disallowed. The path also includes a trailing slash. [9]
+
+  foffset (32-bits): offset to the lexicographically first file in
+the file offsets (fileoffsets), relative to the beginning of
+the fileoffset block.
+
+  cr (32-bits): offset to conflicted/resolved data at the end of the
+index. 0

[PATCH 14/22] read-cache: make in-memory format aware of stat_crc

2013-07-07 Thread Thomas Gummerer
Make the in-memory format aware of the stat_crc used by index-v5.
It is simply ignored by index version prior to v5.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 cache.h  |  1 +
 read-cache.c | 25 +
 2 files changed, 26 insertions(+)

diff --git a/cache.h b/cache.h
index f6c3407..d77af5e 100644
--- a/cache.h
+++ b/cache.h
@@ -127,6 +127,7 @@ struct cache_entry {
unsigned int ce_flags;
unsigned int ce_namelen;
unsigned char sha1[20];
+   uint32_t ce_stat_crc;
struct cache_entry *next; /* used by name_hash */
struct cache_entry *next_ce; /* used to keep a list of cache entries */
char name[FLEX_ARRAY]; /* more */
diff --git a/read-cache.c b/read-cache.c
index c81e643..5ec0222 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -108,6 +108,29 @@ int match_stat_data(const struct stat_data *sd, struct 
stat *st)
return changed;
 }
 
+static uint32_t calculate_stat_crc(struct cache_entry *ce)
+{
+   unsigned int ctimens = 0;
+   uint32_t stat, stat_crc;
+
+   stat = htonl(ce-ce_stat_data.sd_ctime.sec);
+   stat_crc = crc32(0, (Bytef*)stat, 4);
+#ifdef USE_NSEC
+   ctimens = ce-ce_stat_data.sd_ctime.nsec;
+#endif
+   stat = htonl(ctimens);
+   stat_crc = crc32(stat_crc, (Bytef*)stat, 4);
+   stat = htonl(ce-ce_stat_data.sd_ino);
+   stat_crc = crc32(stat_crc, (Bytef*)stat, 4);
+   stat = htonl(ce-ce_stat_data.sd_dev);
+   stat_crc = crc32(stat_crc, (Bytef*)stat, 4);
+   stat = htonl(ce-ce_stat_data.sd_uid);
+   stat_crc = crc32(stat_crc, (Bytef*)stat, 4);
+   stat = htonl(ce-ce_stat_data.sd_gid);
+   stat_crc = crc32(stat_crc, (Bytef*)stat, 4);
+   return stat_crc;
+}
+
 /*
  * This only updates the non-critical parts of the directory
  * cache, ie the parts that aren't tracked by GIT, and only used
@@ -122,6 +145,8 @@ void fill_stat_cache_info(struct cache_entry *ce, struct 
stat *st)
 
if (S_ISREG(st-st_mode))
ce_mark_uptodate(ce);
+
+   ce-ce_stat_crc = calculate_stat_crc(ce);
 }
 
 static int ce_compare_data(const struct cache_entry *ce, struct stat *st)
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 22/22] p0003-index.sh: add perf test for the index formats

2013-07-07 Thread Thomas Gummerer
From: Thomas Rast tr...@inf.ethz.ch

Add a performance test for index version [23]/4/5 by using
git update-index --index-version=x, thus testing both the reader
and the writer speed of all index formats.

Signed-off-by: Thomas Rast tr...@inf.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 t/perf/p0003-index.sh | 59 +++
 1 file changed, 59 insertions(+)
 create mode 100755 t/perf/p0003-index.sh

diff --git a/t/perf/p0003-index.sh b/t/perf/p0003-index.sh
new file mode 100755
index 000..3e02868
--- /dev/null
+++ b/t/perf/p0003-index.sh
@@ -0,0 +1,59 @@
+#!/bin/sh
+
+test_description=Tests index versions [23]/4/5
+
+. ./perf-lib.sh
+
+test_perf_large_repo
+
+test_expect_success convert to v3 
+   git update-index --index-version=2
+
+
+test_perf v[23]: update-index 
+   git update-index --index-version=2 /dev/null
+
+
+subdir=$(git ls-files | sed 's#/[^/]*$##' | grep -v '^$' | uniq | tail -n 30 | 
head -1)
+
+test_perf v[23]: grep nonexistent -- subdir 
+   test_must_fail git grep nonexistent -- $subdir /dev/null
+
+
+test_perf v[23]: ls-files -- subdir 
+   git ls-files $subdir /dev/null
+
+
+test_expect_success convert to v4 
+   git update-index --index-version=4
+
+
+test_perf v4: update-index 
+   git update-index --index-version=4 /dev/null
+
+
+test_perf v4: grep nonexistent -- subdir 
+   test_must_fail git grep nonexistent -- $subdir /dev/null
+
+
+test_perf v4: ls-files -- subdir 
+   git ls-files $subdir /dev/null
+
+
+test_expect_success convert to v5 
+   git update-index --index-version=5
+
+
+test_perf v5: update-index 
+   git update-index --index-version=5 /dev/null
+
+
+test_perf v5: grep nonexistent -- subdir 
+   test_must_fail git grep nonexistent -- $subdir /dev/null
+
+
+test_perf v5: ls-files -- subdir 
+   git ls-files $subdir /dev/null
+
+
+test_done
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] perf-lib: fix start/stop of perf tests

2013-06-29 Thread Thomas Gummerer
ae75342 test-lib: rearrange start/end of test_expect_* and test_skip
changed the way tests are started/stopped, but did not update the perf
tests.  They were therefore giving the wrong output, because of the
wrong test count.  Fix this by starting and stopping the tests
correctly.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 t/perf/perf-lib.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/t/perf/perf-lib.sh b/t/perf/perf-lib.sh
index a816fbc..c61d535 100644
--- a/t/perf/perf-lib.sh
+++ b/t/perf/perf-lib.sh
@@ -150,6 +150,7 @@ exit $ret' 3 24
 
 
 test_perf () {
+   test_start_
test $# = 3  { test_prereq=$1; shift; } || test_prereq=
test $# = 2 ||
error bug in the test script: not 2 or 3 parameters to 
test-expect-success
@@ -187,7 +188,7 @@ test_perf () {
base=$perf_results_dir/$perf_results_prefix$(basename $0 
.sh).$test_count
$TEST_DIRECTORY/perf/min_time.perl test_time.* $base.times
fi
-   echo 3 
+   test_finish_
 }
 
 # We extend test_done to print timings at the end (./run disables this
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] prompt: fix show upstream with svn and zsh

2013-05-22 Thread Thomas Gummerer
SZEDER Gábor sze...@ira.uka.de writes:

 Hi,


 On Tue, May 21, 2013 at 10:54:27PM +0200, Thomas Gummerer wrote:
 Currently the __git_ps1 git prompt gives the following error with a
 repository converted by git-svn, when used with zsh:

 __git_ps1_show_upstream:19: bad pattern: svn_remote[

 This was introduced by 6d158cba (bash completion: Support divergence
 from upstream messages in __git_ps1), when the script was for bash
 only.  Make it compatible with zsh.

 What is the actual cause of this problem/incompatibility and how/why do
 these changes fix it?

 -svn_remote[ $((${#svn_remote[@]} + 1)) ]=$value
 +svn_remote[$((${#svn_remote[@]} + 1))]=$value

 I mean, did zsh really complain because of the space after the '[' ?!

Yes, removing the spaces after the '[' fixes the problem.  I'm not very
proficient in shell scripting, so I can't tell if there is another
cause.

 @@ -146,8 +146,8 @@ __git_ps1_show_upstream ()
  svn*)
  # get the upstream from the git-svn-id: ... in a commit 
 message
  # (git-svn uses essentially the same procedure internally)
 -local svn_upstream=($(git log --first-parent -1 \
 ---grep=^git-svn-id: 
 \(${svn_url_pattern#??}\) 2/dev/null))
 +set -a svn_upstream $(git log --first-parent -1 \
 +--grep=^git-svn-id: 
 \(${svn_url_pattern#??}\) 2/dev/null)
  if [[ 0 -ne ${#svn_upstream[@]} ]]; then
  svn_upstream=${svn_upstream[ ${#svn_upstream[@]} - 2 ]}

 If so, then what about this one?

You're right, this line gives an error too, the code just wasn't
following that path before.  I'll fix it in the re-roll.  Other than
those two I couldn't spot any other occurrence of this pattern.

 Best,
 Gábor

Thanks,
Thomas
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] prompt: fix show upstream with svn and zsh

2013-05-22 Thread Thomas Gummerer
Felipe Contreras felipe.contre...@gmail.com writes:

 On Tue, May 21, 2013 at 3:54 PM, Thomas Gummerer t.gumme...@gmail.com wrote:
 Currently the __git_ps1 git prompt gives the following error with a
 repository converted by git-svn, when used with zsh:

__git_ps1_show_upstream:19: bad pattern: svn_remote[

 This was introduced by 6d158cba (bash completion: Support divergence
 from upstream messages in __git_ps1), when the script was for bash
 only.  Make it compatible with zsh.

 Signed-off-by: Thomas Gummerer t.gumme...@gmail.com

 This patch is fine by me. I would like to see an example of how to
 trigger the issue with a standalone command in the commit message, but
 it's not necessary. It would also make sense to address the comment
 from Szeder that does raise questions about other places in the code
 where 'array[ $foo ]' is used, maybe there's a caveat we are not
 considering, or maybe your use-case did not execute that code.

Yes, the code was not executed, it will be fixed in the re-roll.

 And finally, I don't recall seeing 'set -a' used elsewhere in the code.
 If memory serves well, we have replaced 'local -a foo=value' with
 'local -a foo\nfoo=value' before to fix zsh issues, I think that would
 be safer.

Yes, thanks for the suggestion, that works.

 But this patch is needed regardless, that, or the patch that broke
 things should be reverted for v1.8.3.

I don't think anything should or can be reverted, as this code was
introduced in 2010, but nobody triggered it until now.

 Thomas, if you don't have time, let me know and I can take a look.

 Cheers.

 --
 Felipe Contreras

Thanks,
Thomas
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2] prompt: fix show upstream with svn and zsh

2013-05-22 Thread Thomas Gummerer
Currently the __git_ps1 git prompt gives the following error with a
repository converted by git-svn, when used with zsh:

   __git_ps1_show_upstream:19: bad pattern: svn_remote[
   __git_ps1_show_upstream:45: bad substitution

To reproduce the problem, the __git_ps1_show_upstream function can be
executed in a repository converted with git-svn.  Both those errors are
triggered by spaces after the '['.

Zsh also doesn't support initializing an array with `local var=(...)`.
This triggers the following error:

   __git_ps1_show_upstream:41: bad pattern: svn_upstream=(commit

Use
   local -a
   var=(...)
instead to make it compatible.

This was introduced by 6d158cba (bash completion: Support divergence
from upstream messages in __git_ps1), when the script was for bash
only.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 contrib/completion/git-prompt.sh | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/contrib/completion/git-prompt.sh b/contrib/completion/git-prompt.sh
index eaf5c36..b6b1534 100644
--- a/contrib/completion/git-prompt.sh
+++ b/contrib/completion/git-prompt.sh
@@ -124,7 +124,7 @@ __git_ps1_show_upstream ()
fi
;;
svn-remote.*.url)
-   svn_remote[ $((${#svn_remote[@]} + 1)) ]=$value
+   svn_remote[$((${#svn_remote[@]} + 1))]=$value
svn_url_pattern+=\\|$value
upstream=svn+git # default upstream is SVN if 
available, else git
;;
@@ -146,10 +146,11 @@ __git_ps1_show_upstream ()
svn*)
# get the upstream from the git-svn-id: ... in a commit 
message
# (git-svn uses essentially the same procedure internally)
-   local svn_upstream=($(git log --first-parent -1 \
+   local -a svn_upstream
+   svn_upstream=($(git log --first-parent -1 \
--grep=^git-svn-id: 
\(${svn_url_pattern#??}\) 2/dev/null))
if [[ 0 -ne ${#svn_upstream[@]} ]]; then
-   svn_upstream=${svn_upstream[ ${#svn_upstream[@]} - 2 ]}
+   svn_upstream=${svn_upstream[${#svn_upstream[@]} - 2]}
svn_upstream=${svn_upstream%@*}
local n_stop=${#svn_remote[@]}
for ((n=1; n = n_stop; n++)); do
-- 
1.8.3.rc2.359.g2fb82f5

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: What's cooking in git.git (Sep 2012, #01; Tue, 4)

2012-09-07 Thread Thomas Gummerer
On 09/04, Junio C Hamano wrote:
 * tg/index-v5 (2012-08-17) 13 commits
  . p0002-index.sh: add perf test for the index formats
  . update-index.c: rewrite index when index-version is given
  . Write resolve-undo data for index-v5
  . Write index-v5 cache-tree data
  . Write index-v5
  . Read cache-tree in index-v5
  . Read resolve-undo data
  . Read index-v5
  . Make in-memory format aware of stat_crc
  . Add documentation of the index-v5 file format
  . t2104: Don't fail for index versions other than [23]
  . read-cache.c: Re-read index if index file changed
  . Move index v2 specific functions to their own file
 
 A GSoC project.  Was waiting for comments from mentors and
 stakeholders, but nothing seems to be happening, other than breakage
 fixes on Cygwin.  May discard.

I was planning on continuing to work on this topic as part of my Bachelor
Thesis.  I had a brief discussion with Thomas Rast on IRC about this
today.  Because I am planning to implement an api for partial loading
we decided it's probably best to hold off until that's implemented,
because parts of this series may change and it's going to take me a while
to implement the api.

As for the actual look of the api, I think something along the lines of
what was discussed at [1] would fit well.

The commands would then learn to use this api. (First the commands that
just read the index and later the commands that read and write the index,
but for that the api will have to support writing the index)

[1] http://thread.gmane.org/gmane.comp.version-control.git/198283/focus=198739 
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Prefix shell test output messages with test id

2012-09-06 Thread Thomas Gummerer
On 09/06, Jan-Marek Glogowski wrote:
 This adds the test ID (t) prefix to the test result message of
 all shell tests.  This is especially useful when doing a parallel
 check run, where it's currently quite hard to identify the actual
 failing test case.
 
 Signed-off-by: Jan-Marek Glogowski glo...@fbihome.de

This breaks the TAP output format of the tests, which is needed to run
them with prove.  To identify the failing tests more easily when running
the tests in parallel, you may want to add GIT_TEST_TARGET = prove to
your config.mak.

If this change is really needed, I think you should add the test-id after
the message.
 ---
  t/t-basic.sh| 28 ++--
  t/test-lib-functions.sh | 11 +++
  t/test-lib.sh   | 10 ++
  3 Dateien geändert, 27 Zeilen hinzugefügt(+), 22 Zeilen entfernt(-)
 
 diff --git a/t/t-basic.sh b/t/t-basic.sh
 index ccb5435..1bbf5b8 100755
 --- a/t/t-basic.sh
 +++ b/t/t-basic.sh
 @@ -58,7 +58,7 @@ test_expect_failure 'pretend we have a known breakage' '
  test_expect_success 'pretend we have fixed a known breakage (run in sub 
 test-lib)' 
   mkdir passing-todo 
   (cd passing-todo 
 - cat passing-todo.sh -EOF 
 + cat t05-passing-todo.sh -EOF 
   #!$SHELL_PATH
  
   test_description='A passing TODO test
 @@ -77,14 +77,14 @@ test_expect_success 'pretend we have fixed a known 
 breakage (run in sub test-lib
  
   test_done
   EOF
 - chmod +x passing-todo.sh 
 - ./passing-todo.sh out 2err 
 + chmod +x t05-passing-todo.sh 
 + ./t05-passing-todo.sh out 2err 
   ! test -s err 
   sed -e 's/^ //' expect -\\EOF 
 -  ok 1 - pretend we have fixed a known breakage # TODO known breakage
 -  # fixed 1 known breakage(s)
 -  # passed all 1 test(s)
 -  1..1
 +  t05: ok 1 - pretend we have fixed a known breakage # TODO known 
 breakage
 +  t05: # fixed 1 known breakage(s)
 +  t05: # passed all 1 test(s)
 +  t05: 1..1
   EOF
   test_cmp expect out)
  
 @@ -141,7 +141,7 @@ test_expect_success 'tests clean up even on failures' 
   (
   cd failing-cleanup 
  
 - cat failing-cleanup.sh -EOF 
 + cat t12-failing-cleanup.sh -EOF 
   #!$SHELL_PATH
  
   test_description='Failing tests with cleanup commands'
 @@ -162,23 +162,23 @@ test_expect_success 'tests clean up even on failures' 
  
   EOF
  
 - chmod +x failing-cleanup.sh 
 - test_must_fail ./failing-cleanup.sh out 2err 
 + chmod +x t12-failing-cleanup.sh 
 + test_must_fail ./t12-failing-cleanup.sh out 2err 
   ! test -s err 
   ! test -f \trash directory.failing-cleanup/clean-after-failure\ 
   sed -e 's/Z$//' -e 's/^ //' expect -\\EOF 
 -  not ok - 1 tests clean up even after a failure
 +  t12: not ok 1 - tests clean up even after a failure
# Z
# touch clean-after-failure 
# test_when_finished rm clean-after-failure 
# (exit 1)
# Z
 -  not ok - 2 failure to clean up causes the test to fail
 +  t12: not ok 2 - failure to clean up causes the test to fail
# Z
# test_when_finished \(exit 2)\
# Z
 -  # failed 2 among 2 test(s)
 -  1..2
 +  t12: # failed 2 among 2 test(s)
 +  t12: 1..2
   EOF
   test_cmp expect out
   )
 diff --git a/t/test-lib-functions.sh b/t/test-lib-functions.sh
 index 9bc57d2..c81ad7f 100644
 --- a/t/test-lib-functions.sh
 +++ b/t/test-lib-functions.sh
 @@ -24,6 +24,9 @@
  #
  # In particular, quoting isn't enough, as the path may contain the same quote
  # that we're using.
 +
 +TID=$(basename ${0%%-*})
 +
  test_set_editor () {
   FAKE_EDITOR=$1
   export FAKE_EDITOR
 @@ -434,7 +437,7 @@ test_external_without_stderr () {
  test_path_is_file () {
   if ! [ -f $1 ]
   then
 - echo File $1 doesn't exist. $*
 + echo $TID: File $1 doesn't exist. $*
   false
   fi
  }
 @@ -442,7 +445,7 @@ test_path_is_file () {
  test_path_is_dir () {
   if ! [ -d $1 ]
   then
 - echo Directory $1 doesn't exist. $*
 + echo $TID: Directory $1 doesn't exist. $*
   false
   fi
  }
 @@ -450,7 +453,7 @@ test_path_is_dir () {
  test_path_is_missing () {
   if [ -e $1 ]
   then
 - echo Path exists:
 + echo $TID: Path exists:
   ls -ld $1
   if [ $# -ge 1 ]; then
   echo $*
 @@ -476,7 +479,7 @@ test_line_count () {
   error bug in the test script: not 3 parameters to 
 test_line_count
   elif ! test $(wc -l $3) $1 $2
   then
 - echo test_line_count: line count for $3 !$1 $2
 + echo $TID: test_line_count: line count for $3 !$1 $2
   cat $3
   return 1
   fi
 diff --git a/t/test-lib.sh b/t/test-lib.sh
 index 

Re: [PATCH/RFC v4 02/13] read-cache.c: Re-read index if index file changed

2012-08-27 Thread Thomas Gummerer
On 08/25, Joachim Schmitz wrote:
 Thomas Gummerer t.gumme...@gmail.com schrieb im Newsbeitrag 
 news:134529-6925-3-git-send-email-t.gumme...@gmail.com...
  [...]
  +   usleep(10*1000);
 
 usleep() is not available to everybody, e.g. it is not in HP NonStop (not in 
 every case at least)
 
 Bye, Jojo
 
Thanks for noticing, will be fixed in the re-roll.
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: read-cache-v5.c: compiler warnings

2012-08-20 Thread Thomas Gummerer
On 08/19, Ramsay Jones wrote:
 Hi Thomas,
 
 Just an FYI; two out of the three gcc compilers I use (Linux, cygwin
 but not MingW), issue the following warnings:
 
 CC read-cache-v5.o
 read-cache-v5.c: In function `write_index_v5':
 read-cache-v5.c:1011: warning: 'dir' might be used uninitialized in this 
 function
 read-cache-v5.c:1012: warning: 'search' might be used uninitialized in 
 this function
 read-cache-v5.c:1012: warning: 'found' might be used uninitialized in 
 this function
 
 [commit d2537158 (Write index-v5, 16-08-2012)]
 
 Normally I would take a look and offer a patch, but I haven't found time
 to do so in the last 10 days. So, I'm just giving you a heads up ... ;-P

Thanks for noticing.  These warnings only seem to be issued by older
versions of gcc, which is probably why two compilers give the warnings
while one doesn't.

The variables can never be uninitialized, because they are guarded by
ifs and older versions don't seem to recognize that.  I'll fix them
in a re-roll though (or a separate patch if that's preferred), just
waiting for some feedback for v4 of my series.

 
 HTH
 
 ATB,
 Ramsay Jones
 

Thomas
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH/RFC v4 02/13] read-cache.c: Re-read index if index file changed

2012-08-16 Thread Thomas Gummerer
Add the possibility of re-reading the index file, if it changed
while reading.

The index file might change during the read, causing outdated
information to be displayed. We check if the index file changed
by using its stat data as heuristic.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 read-cache.c | 87 +---
 1 file changed, 60 insertions(+), 27 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index 6a8b4b1..cdd8480 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1170,11 +1170,34 @@ int read_index(struct index_state *istate)
return read_index_from(istate, get_index_file());
 }
 
+static int index_changed(struct stat st_old, struct stat st_new)
+{
+   int changed = 0;
+
+   if (st_old.st_mtime != st_new.st_mtime ||
+   st_old.st_uid   != st_new.st_uid ||
+   st_old.st_gid   != st_new.st_gid ||
+   st_old.st_ino   != st_new.st_ino ||
+   st_old.st_size  != st_new.st_size)
+   changed = 1;
+#ifdef USE_NSEC
+   if (ST_MTIME_NSEC(st_old) != ST_MTIME_NSEC(st_new))
+   changed = 1;
+#endif
+
+#ifdef USE_STDEV
+   if (st_old.st_dev != st_new.st_dev)
+   changed = 1;
+#endif
+
+   return changed;
+}
+
 /* remember to discard_cache() before reading a different cache! */
 int read_index_from(struct index_state *istate, const char *path)
 {
-   int fd;
-   struct stat st;
+   int fd, err, i = 0;
+   struct stat st_old, st_new;
struct cache_version_header *hdr;
void *mmap;
size_t mmap_size;
@@ -1186,38 +1209,48 @@ int read_index_from(struct index_state *istate, const 
char *path)
errno = ENOENT;
istate-timestamp.sec = 0;
istate-timestamp.nsec = 0;
-   fd = open(path, O_RDONLY);
-   if (fd  0) {
-   if (errno == ENOENT)
-   return 0;
-   die_errno(index file open failed);
-   }
+   do {
+   err = 0;
+   fd = open(path, O_RDONLY);
+   if (fd  0) {
+   if (errno == ENOENT)
+   return 0;
+   die_errno(index file open failed);
+   }
 
-   if (fstat(fd, st))
-   die_errno(cannot stat the open index);
+   if (fstat(fd, st_old))
+   die_errno(cannot stat the open index);
 
-   errno = EINVAL;
-   mmap_size = xsize_t(st.st_size);
-   mmap = xmmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 
0);
-   close(fd);
-   if (mmap == MAP_FAILED)
-   die_errno(unable to map index file);
+   errno = EINVAL;
+   mmap_size = xsize_t(st_old.st_size);
+   mmap = xmmap(NULL, mmap_size, PROT_READ | PROT_WRITE, 
MAP_PRIVATE, fd, 0);
+   close(fd);
+   if (mmap == MAP_FAILED)
+   die_errno(unable to map index file);
 
-   hdr = mmap;
-   if (verify_hdr_version(istate, hdr, mmap_size)  0)
-   goto unmap;
+   hdr = mmap;
+   if (verify_hdr_version(istate, hdr, mmap_size)  0)
+   err = 1;
 
-   if (istate-ops-verify_hdr(mmap, mmap_size)  0)
-   goto unmap;
+   if (istate-ops-verify_hdr(mmap, mmap_size)  0)
+   err = 1;
 
-   istate-ops-read_index(istate, mmap, mmap_size);
-   istate-timestamp.sec = st.st_mtime;
-   istate-timestamp.nsec = ST_MTIME_NSEC(st);
+   if (istate-ops-read_index(istate, mmap, mmap_size)  0)
+   err = 1;
+   istate-timestamp.sec = st_old.st_mtime;
+   istate-timestamp.nsec = ST_MTIME_NSEC(st_old);
+   if (lstat(path, st_new))
+   die_errno(cannot stat the open index);
 
-   munmap(mmap, mmap_size);
-   return istate-cache_nr;
+   munmap(mmap, mmap_size);
+
+   if (!index_changed(st_old, st_new)  !err)
+   return istate-cache_nr;
+
+   usleep(10*1000);
+   i++;
+   } while ((err || index_changed(st_old, st_new))  i  50);
 
-unmap:
munmap(mmap, mmap_size);
die(index file corrupt);
 }
-- 
1.7.11.2

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH/RFC v4 03/13] t2104: Don't fail for index versions other than [23]

2012-08-16 Thread Thomas Gummerer
t2104 currently checks for the exact index version 2 or 3,
depending on whether there is a skip-worktree flag or not. Other
index versions do not use extended flags and thus cannot
be tested for version changes.

Make this test update the index to version 2 at the beginning
of the test. Testing the skip-worktree flags for the default
index format is still covered by t7011 and t7012.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 t/t2104-update-index-skip-worktree.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/t/t2104-update-index-skip-worktree.sh 
b/t/t2104-update-index-skip-worktree.sh
index 1d0879b..bd9644f 100755
--- a/t/t2104-update-index-skip-worktree.sh
+++ b/t/t2104-update-index-skip-worktree.sh
@@ -22,6 +22,7 @@ H sub/2
 EOF
 
 test_expect_success 'setup' '
+   git update-index --index-version=2 
mkdir sub 
touch ./1 ./2 sub/1 sub/2 
git add 1 2 sub/1 sub/2 
-- 
1.7.11.2

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH/RFC v4 12/13] update-index.c: rewrite index when index-version is given

2012-08-16 Thread Thomas Gummerer
Make update-index always rewrite the index when an index-version
is given, even if the index already has the right version.
This option is used for performance testing the writer and
reader.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 builtin/update-index.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/builtin/update-index.c b/builtin/update-index.c
index 4ce341c..c31d176 100644
--- a/builtin/update-index.c
+++ b/builtin/update-index.c
@@ -6,6 +6,7 @@
 #include cache.h
 #include quote.h
 #include cache-tree.h
+#include read-cache.h
 #include tree-walk.h
 #include builtin.h
 #include refs.h
@@ -861,6 +862,7 @@ int cmd_update_index(int argc, const char **argv, const 
char *prefix)
if (the_index.version != preferred_index_format)
active_cache_changed = 1;
the_index.version = preferred_index_format;
+   set_istate_ops(the_index);
}
 
if (read_from_stdin) {
@@ -886,7 +888,7 @@ int cmd_update_index(int argc, const char **argv, const 
char *prefix)
strbuf_release(buf);
}
 
-   if (active_cache_changed) {
+   if (active_cache_changed || preferred_index_format) {
if (newfd  0) {
if (refresh_args.flags  REFRESH_QUIET)
exit(128);
-- 
1.7.11.2

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH/RFC v4 13/13] p0002-index.sh: add perf test for the index formats

2012-08-16 Thread Thomas Gummerer
From: Thomas Rast tr...@student.ethz.ch

Add a performance test for index version [23]/4/5 by using
git update-index --index-version=x, thus testing both the reader
and the writer speed of all index formats.

Signed-off-by: Thomas Rast tr...@student.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 t/perf/p0002-index.sh | 33 +
 1 file changed, 33 insertions(+)
 create mode 100755 t/perf/p0002-index.sh

diff --git a/t/perf/p0002-index.sh b/t/perf/p0002-index.sh
new file mode 100755
index 000..140c7a0
--- /dev/null
+++ b/t/perf/p0002-index.sh
@@ -0,0 +1,33 @@
+#!/bin/sh
+
+test_description=Tests index versions [23]/4/5
+
+. ./perf-lib.sh
+
+test_perf_large_repo
+
+test_expect_success 'convert to v3' '
+   git update-index --index-version=3
+'
+
+test_perf 'v[23]: update-index' '
+   git update-index --index-version=3 /dev/null
+'
+
+test_expect_success 'convert to v4' '
+   git update-index --index-version=4
+'
+
+test_perf 'v4: update-index' '
+   git update-index --index-version=4 /dev/null
+'
+
+test_expect_success 'convert to v5' '
+   git update-index --index-version=5
+'
+
+test_perf 'v5: update-index' '
+   git update-index --index-version=5 /dev/null
+'
+
+test_done
-- 
1.7.11.2

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH/RFC v4 11/13] Write resolve-undo data for index-v5

2012-08-16 Thread Thomas Gummerer
Make git write the resolve-undo data to the index.

Since the resolve-undo data is joined with the conflicts in
the ondisk format of the index file version 5, the in-memory
resolve-undo data is converted to conflict entries, which are
attached to the matching directory entries.

Helped-by: Thomas Rast tr...@student.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 read-cache-v5.c | 96 +
 1 file changed, 96 insertions(+)

diff --git a/read-cache-v5.c b/read-cache-v5.c
index d740d0b..ce2375a 100644
--- a/read-cache-v5.c
+++ b/read-cache-v5.c
@@ -907,6 +907,99 @@ static void cache_tree_to_ondisk_v5(struct hash_table 
*table, struct cache_tree
convert_one_to_ondisk_v5(table, root, , 0, 0);
 }
 
+static void resolve_undo_to_ondisk_v5(struct hash_table *table,
+ struct string_list *resolve_undo,
+ unsigned int *ndir, int *total_dir_len,
+ struct directory_entry *de)
+{
+   struct string_list_item *item;
+   struct directory_entry *search;
+
+   if (!resolve_undo)
+   return;
+   for_each_string_list_item(item, resolve_undo) {
+   struct conflict_entry *conflict_entry;
+   struct resolve_undo_info *ui = item-util;
+   char *super;
+   int i, dir_len, len;
+   uint32_t crc;
+   struct directory_entry *found, *current, *new_tree;
+
+   if (!ui)
+   continue;
+
+   super = super_directory(item-string);
+   if (!super)
+   dir_len = 0;
+   else
+   dir_len = strlen(super);
+   crc = crc32(0, (Bytef*)super, dir_len);
+   found = lookup_hash(crc, table);
+   current = NULL;
+   new_tree = NULL;
+
+   while (!found) {
+   struct directory_entry *new;
+
+   new = init_directory_entry(super, dir_len);
+   if (!current)
+   current = new;
+   insert_directory_entry(new, table, total_dir_len, ndir, 
crc);
+   if (new_tree != NULL)
+   new-de_nsubtrees = 1;
+   new-next = new_tree;
+   new_tree = new;
+   super = super_directory(super);
+   if (!super)
+   dir_len = 0;
+   else
+   dir_len = strlen(super);
+   crc = crc32(0, (Bytef*)super, dir_len);
+   found = lookup_hash(crc, table);
+   }
+   search = found;
+   while (search-next_hash  strcmp(super, search-pathname) != 
0)
+   search = search-next_hash;
+   if (search  !current)
+   current = search;
+   if (!search  !current)
+   current = new_tree;
+   if (!super  new_tree) {
+   new_tree-next = de-next;
+   de-next = new_tree;
+   de-de_nsubtrees++;
+   } else if (new_tree) {
+   struct directory_entry *temp;
+
+   search = de-next;
+   while (strcmp(super, search-pathname))
+   search = search-next;
+   temp = new_tree;
+   while (temp-next)
+   temp = temp-next;
+   search-de_nsubtrees++;
+   temp-next = search-next;
+   search-next = new_tree;
+   }
+
+   len = strlen(item-string);
+   conflict_entry = create_new_conflict(item-string, len, 
current-de_pathlen);
+   add_conflict_to_directory_entry(current, conflict_entry);
+   for (i = 0; i  3; i++) {
+   if (ui-mode[i]) {
+   struct conflict_part *cp;
+
+   cp = xmalloc(sizeof(struct conflict_part));
+   cp-flags = (i + 1)  CONFLICT_STAGESHIFT;
+   cp-entry_mode = ui-mode[i];
+   cp-next = NULL;
+   hashcpy(cp-sha1, ui-sha1[i]);
+   add_part_to_conflict_entry(current, 
conflict_entry, cp);
+   }
+   }
+   }
+}
+
 static struct directory_entry *compile_directory_data(struct index_state 
*istate,
int nfile,
unsigned int *ndir,
@@ -1012,6 +1105,9 @@ static struct directory_entry 
*compile_directory_data(struct

[PATCH/RFC v4 08/13] Read cache-tree in index-v5

2012-08-16 Thread Thomas Gummerer
Since the cache-tree data is saved as part of the directory data,
we already read it at the beginning of the index. The cache-tree
is only converted from this directory data.

The cache-tree data is arranged in a tree, with the children sorted by
pathlen at each node, while the ondisk format is sorted lexically.
So we have to rebuild this format from the on-disk directory list.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 cache-tree.c|  2 +-
 cache-tree.h|  6 
 read-cache-v5.c | 98 +
 3 files changed, 105 insertions(+), 1 deletion(-)

diff --git a/cache-tree.c b/cache-tree.c
index 28ed657..61544d8 100644
--- a/cache-tree.c
+++ b/cache-tree.c
@@ -31,7 +31,7 @@ void cache_tree_free(struct cache_tree **it_p)
*it_p = NULL;
 }
 
-static int subtree_name_cmp(const char *one, int onelen,
+int subtree_name_cmp(const char *one, int onelen,
const char *two, int twolen)
 {
if (onelen  twolen)
diff --git a/cache-tree.h b/cache-tree.h
index d8cb2e9..7416007 100644
--- a/cache-tree.h
+++ b/cache-tree.h
@@ -20,10 +20,16 @@ struct cache_tree {
struct cache_tree_sub **down;
 };
 
+struct directory_queue {
+   struct directory_queue *down;
+   struct directory_entry *de;
+};
+
 struct cache_tree *cache_tree(void);
 void cache_tree_free(struct cache_tree **);
 void cache_tree_invalidate_path(struct cache_tree *, const char *);
 struct cache_tree_sub *cache_tree_sub(struct cache_tree *, const char *);
+int subtree_name_cmp(const char *, int, const char *, int);
 
 void cache_tree_write(struct strbuf *, struct cache_tree *root);
 struct cache_tree *cache_tree_read(const char *buffer, unsigned long size);
diff --git a/read-cache-v5.c b/read-cache-v5.c
index fb549de..b497726 100644
--- a/read-cache-v5.c
+++ b/read-cache-v5.c
@@ -448,6 +448,103 @@ static int read_conflicts(struct conflict_entry **head,
return 0;
 }
 
+static struct cache_tree *convert_one(struct directory_queue *queue, int dirnr)
+{
+   int i, subtree_nr;
+   struct cache_tree *it;
+   struct directory_queue *down;
+
+   it = cache_tree();
+   it-entry_count = queue[dirnr].de-de_nentries;
+   subtree_nr = queue[dirnr].de-de_nsubtrees;
+   if (0 = it-entry_count)
+   hashcpy(it-sha1, queue[dirnr].de-sha1);
+
+   /*
+   * Just a heuristic -- we do not add directories that often but
+   * we do not want to have to extend it immediately when we do,
+   * hence +2.
+   */
+   it-subtree_alloc = subtree_nr + 2;
+   it-down = xcalloc(it-subtree_alloc, sizeof(struct cache_tree_sub *));
+   down = queue[dirnr].down;
+   for (i = 0; i  subtree_nr; i++) {
+   struct cache_tree *sub;
+   struct cache_tree_sub *subtree;
+   char *buf, *name;
+
+   name = ;
+   buf = strtok(down[i].de-pathname, /);
+   while (buf) {
+   name = buf;
+   buf = strtok(NULL, /);
+   }
+   sub = convert_one(down, i);
+   if(!sub)
+   goto free_return;
+   subtree = cache_tree_sub(it, name);
+   subtree-cache_tree = sub;
+   }
+   if (subtree_nr != it-subtree_nr)
+   die(cache-tree: internal error);
+   return it;
+ free_return:
+   cache_tree_free(it);
+   return NULL;
+}
+
+static int compare_cache_tree_elements(const void *a, const void *b)
+{
+   const struct directory_entry *de1, *de2;
+
+   de1 = ((const struct directory_queue *)a)-de;
+   de2 = ((const struct directory_queue *)b)-de;
+   return subtree_name_cmp(de1-pathname, de1-de_pathlen,
+   de2-pathname, de2-de_pathlen);
+}
+
+static struct directory_entry *sort_directories(struct directory_entry *de,
+   struct directory_queue *queue)
+{
+   int i, nsubtrees;
+
+   nsubtrees = de-de_nsubtrees;
+   for (i = 0; i  nsubtrees; i++) {
+   struct directory_entry *new_de;
+   de = de-next;
+   new_de = xmalloc(directory_entry_size(de-de_pathlen));
+   memcpy(new_de, de, directory_entry_size(de-de_pathlen));
+   queue[i].de = new_de;
+   if (de-de_nsubtrees) {
+   queue[i].down = xcalloc(de-de_nsubtrees,
+   sizeof(struct directory_queue));
+   de = sort_directories(de,
+   queue[i].down);
+   }
+   }
+   qsort(queue, nsubtrees, sizeof(struct directory_queue),
+   compare_cache_tree_elements);
+   return de;
+}
+
+/*
+ * This function modifies the directory argument that is given to it.
+ * Don't use it if the directory entries are still needed afterwards.
+ */
+static struct cache_tree

[PATCH/RFC v4 10/13] Write index-v5 cache-tree data

2012-08-16 Thread Thomas Gummerer
Write the cache-tree data for the index version 5 file format. The
in-memory cache-tree data is converted to the ondisk format by adding
it to the directory entries that were compiled from the cache-entries
in the previous step.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 read-cache-v5.c | 51 +++
 1 file changed, 51 insertions(+)

diff --git a/read-cache-v5.c b/read-cache-v5.c
index a2afc94..d740d0b 100644
--- a/read-cache-v5.c
+++ b/read-cache-v5.c
@@ -856,6 +856,57 @@ static struct conflict_entry 
*create_conflict_entry_from_ce(struct cache_entry *
return create_new_conflict(ce-name, ce_namelen(ce), pathlen);
 }
 
+static void convert_one_to_ondisk_v5(struct hash_table *table, struct 
cache_tree *it,
+   const char *path, int pathlen, uint32_t crc)
+{
+   int i;
+   struct directory_entry *found, *search;
+
+   crc = crc32(crc, (Bytef*)path, pathlen);
+   found = lookup_hash(crc, table);
+   search = found;
+   while (search  strcmp(path, search-pathname + search-de_pathlen - 
strlen(path)) != 0)
+   search = search-next_hash;
+   if (!search)
+   return;
+   /*
+* The number of subtrees is already calculated by
+* compile_directory_data, therefore we only need to
+* add the entry_count
+*/
+   search-de_nentries = it-entry_count;
+   if (0 = it-entry_count)
+   hashcpy(search-sha1, it-sha1);
+   if (strcmp(path, ) != 0)
+   crc = crc32(crc, (Bytef*)/, 1);
+
+#if DEBUG
+   if (0 = it-entry_count)
+   fprintf(stderr, cache-tree %.*s (%d ent, %d subtree) %s\n,
+   pathlen, path, it-entry_count, it-subtree_nr,
+   sha1_to_hex(it-sha1));
+   else
+   fprintf(stderr, cache-tree %.*s (%d subtree) invalid\n,
+   pathlen, path, it-subtree_nr);
+#endif
+
+   for (i = 0; i  it-subtree_nr; i++) {
+   struct cache_tree_sub *down = it-down[i];
+   if (i) {
+   struct cache_tree_sub *prev = it-down[i-1];
+   if (subtree_name_cmp(down-name, down-namelen,
+prev-name, prev-namelen) = 0)
+   die(fatal - unsorted cache subtree);
+   }
+   convert_one_to_ondisk_v5(table, down-cache_tree, down-name, 
down-namelen, crc);
+   }
+}
+
+static void cache_tree_to_ondisk_v5(struct hash_table *table, struct 
cache_tree *root)
+{
+   convert_one_to_ondisk_v5(table, root, , 0, 0);
+}
+
 static struct directory_entry *compile_directory_data(struct index_state 
*istate,
int nfile,
unsigned int *ndir,
-- 
1.7.11.2

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH/RFC v4 07/13] Read resolve-undo data

2012-08-16 Thread Thomas Gummerer
Make git read the resolve-undo data from the index.

Since the resolve-undo data is joined with the conflicts in
the ondisk format of the index file version 5, conflicts and
resolved data are read at the same time, and the resolve-undo
data is then converted to the in-memory format.

Helped-by: Thomas Rast tr...@student.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 read-cache-v5.c | 38 ++
 1 file changed, 38 insertions(+)

diff --git a/read-cache-v5.c b/read-cache-v5.c
index 2031969..fb549de 100644
--- a/read-cache-v5.c
+++ b/read-cache-v5.c
@@ -1,5 +1,6 @@
 #include cache.h
 #include read-cache.h
+#include string-list.h
 #include resolve-undo.h
 #include cache-tree.h
 
@@ -447,6 +448,42 @@ static int read_conflicts(struct conflict_entry **head,
return 0;
 }
 
+static void resolve_undo_convert_v5(struct index_state *istate,
+   struct conflict_entry *ce)
+{
+   int i;
+
+   while (ce) {
+   struct string_list_item *lost;
+   struct resolve_undo_info *ui;
+   struct conflict_part *cp;
+
+   if (ce-entries  (ce-entries-flags  CONFLICT_CONFLICTED) 
!= 0) {
+   ce = ce-next;
+   continue;
+   }
+   if (!istate-resolve_undo) {
+   istate-resolve_undo = xcalloc(1, sizeof(struct 
string_list));
+   istate-resolve_undo-strdup_strings = 1;
+   }
+
+   lost = string_list_insert(istate-resolve_undo, ce-name);
+   if (!lost-util)
+   lost-util = xcalloc(1, sizeof(*ui));
+   ui = lost-util;
+
+   cp = ce-entries;
+   for (i = 0; i  3; i++)
+   ui-mode[i] = 0;
+   while (cp) {
+   ui-mode[conflict_stage(cp) - 1] = cp-entry_mode;
+   hashcpy(ui-sha1[conflict_stage(cp) - 1], cp-sha1);
+   cp = cp-next;
+   }
+   ce = ce-next;
+   }
+}
+
 static int read_entries(struct index_state *istate, struct directory_entry 
**de,
unsigned long *entry_offset, void **mmap,
unsigned long mmap_size, int *nr,
@@ -460,6 +497,7 @@ static int read_entries(struct index_state *istate, struct 
directory_entry **de,
conflict_queue = NULL;
if (read_conflicts(conflict_queue, *de, mmap, mmap_size)  0)
return -1;
+   resolve_undo_convert_v5(istate, conflict_queue);
for (i = 0; i  (*de)-de_nfiles; i++) {
if (read_entry(ce,
*de,
-- 
1.7.11.2

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH/RFC v4 06/13] Read index-v5

2012-08-16 Thread Thomas Gummerer
Make git read the index file version 5 without complaining.

This version of the reader reads neither the cache-tree
nor the resolve-undo data, but doesn't choke on an index that
includes such data.

Helped-by: Junio C Hamano gits...@pobox.com
Helped-by: Nguyen Thai Ngoc Duy pclo...@gmail.com
Helped-by: Thomas Rast tr...@student.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 Makefile|   1 +
 cache.h |  73 
 read-cache-v5.c | 556 
 3 files changed, 630 insertions(+)
 create mode 100644 read-cache-v5.c

diff --git a/Makefile b/Makefile
index 3ccd3a8..ef55509 100644
--- a/Makefile
+++ b/Makefile
@@ -770,6 +770,7 @@ LIB_OBJS += quote.o
 LIB_OBJS += reachable.o
 LIB_OBJS += read-cache.o
 LIB_OBJS += read-cache-v2.o
+LIB_OBJS += read-cache-v5.o
 LIB_OBJS += reflog-walk.o
 LIB_OBJS += refs.o
 LIB_OBJS += remote.o
diff --git a/cache.h b/cache.h
index bfe3099..bec9402 100644
--- a/cache.h
+++ b/cache.h
@@ -110,6 +110,15 @@ struct cache_time {
unsigned int nsec;
 };
 
+/*
+ * The *next pointer is used in read_entries_v5 for holding
+ * all the elements of a directory, and points to the next
+ * cache_entry in a directory.
+ *
+ * It is reset by the add_name_hash call in set_index_entry
+ * to set it to point to the next cache_entry in the
+ * correct in-memory format ordering.
+ */
 struct cache_entry {
struct cache_time ce_ctime;
struct cache_time ce_mtime;
@@ -128,11 +137,59 @@ struct cache_entry {
char name[FLEX_ARRAY]; /* more */
 };
 
+struct directory_entry {
+   struct directory_entry *next;
+   struct directory_entry *next_hash;
+   struct cache_entry *ce;
+   struct cache_entry *ce_last;
+   struct conflict_entry *conflict;
+   struct conflict_entry *conflict_last;
+   unsigned int conflict_size;
+   unsigned int de_foffset;
+   unsigned int de_cr;
+   unsigned int de_ncr;
+   unsigned int de_nsubtrees;
+   unsigned int de_nfiles;
+   unsigned int de_nentries;
+   unsigned char sha1[20];
+   unsigned short de_flags;
+   unsigned int de_pathlen;
+   char pathname[FLEX_ARRAY];
+};
+
+struct conflict_part {
+   struct conflict_part *next;
+   unsigned short flags;
+   unsigned short entry_mode;
+   unsigned char sha1[20];
+};
+
+struct conflict_entry {
+   struct conflict_entry *next;
+   unsigned int nfileconflicts;
+   struct conflict_part *entries;
+   unsigned int namelen;
+   unsigned int pathlen;
+   char name[FLEX_ARRAY];
+};
+
+struct ondisk_conflict_part {
+   unsigned short flags;
+   unsigned short entry_mode;
+   unsigned char sha1[20];
+};
+
+#define CE_NAMEMASK  (0x0fff)
 #define CE_STAGEMASK (0x3000)
 #define CE_EXTENDED  (0x4000)
 #define CE_VALID (0x8000)
+#define CE_SMUDGED   (0x0400) /* index v5 only flag */
 #define CE_STAGESHIFT 12
 
+#define CONFLICT_CONFLICTED (0x8000)
+#define CONFLICT_STAGESHIFT 13
+#define CONFLICT_STAGEMASK (0x6000)
+
 /*
  * Range 0x in ce_flags is divided into
  * two parts: in-memory flags and on-disk ones.
@@ -166,6 +223,18 @@ struct cache_entry {
 #define CE_EXTENDED_FLAGS (CE_INTENT_TO_ADD | CE_SKIP_WORKTREE)
 
 /*
+ * Representation of the extended on-disk flags in the v5 format.
+ * They must not collide with the ordinary on-disk flags, and need to
+ * fit in 16 bits.  Note however that v5 does not save the name
+ * length.
+ */
+#define CE_INTENT_TO_ADD_V5  (0x4000)
+#define CE_SKIP_WORKTREE_V5  (0x0800)
+#if (CE_VALID|CE_STAGEMASK)  (CE_INTENTTOADD_V5|CE_SKIPWORKTREE_V5)
+#error v5 on-disk flags collide with ordinary on-disk flags
+#endif
+
+/*
  * Safeguard to avoid saving wrong flags:
  *  - CE_EXTENDED2 won't get saved until its semantic is known
  *  - Bits in 0x have been saved in ce_flags already
@@ -203,6 +272,8 @@ static inline unsigned create_ce_flags(unsigned stage)
 #define ce_skip_worktree(ce) ((ce)-ce_flags  CE_SKIP_WORKTREE)
 #define ce_mark_uptodate(ce) ((ce)-ce_flags |= CE_UPTODATE)
 
+#define conflict_stage(c) ((CONFLICT_STAGEMASK  (c)-flags)  
CONFLICT_STAGESHIFT)
+
 #define ce_permissions(mode) (((mode)  0100) ? 0755 : 0644)
 static inline unsigned int create_ce_mode(unsigned int mode)
 {
@@ -249,6 +320,8 @@ static inline unsigned int canon_mode(unsigned int mode)
 }
 
 #define cache_entry_size(len) (offsetof(struct cache_entry,name) + (len) + 1)
+#define directory_entry_size(len) (offsetof(struct directory_entry,pathname) + 
(len) + 1)
+#define conflict_entry_size(len) (offsetof(struct conflict_entry,name) + (len) 
+ 1)
 
 struct index_state {
struct cache_entry **cache;
diff --git a/read-cache-v5.c b/read-cache-v5.c
new file mode 100644
index 000..2031969
--- /dev/null
+++ b/read-cache-v5.c
@@ -0,0 +1,556 @@
+#include cache.h
+#include read-cache.h
+#include resolve-undo.h
+#include cache-tree.h
+
+#define ptr_add(x,y) ((void *)(((char *)(x)) + (y

[PATCH/RFC v4 05/13] Make in-memory format aware of stat_crc

2012-08-16 Thread Thomas Gummerer
Make the in-memory format aware of the stat_crc used by index-v5.
It is simply ignored by index versions prior to v5.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 cache.h  |  1 +
 read-cache.c | 25 +
 2 files changed, 26 insertions(+)

diff --git a/cache.h b/cache.h
index c77cdbe..bfe3099 100644
--- a/cache.h
+++ b/cache.h
@@ -122,6 +122,7 @@ struct cache_entry {
unsigned int ce_flags;
unsigned int ce_namelen;
unsigned char sha1[20];
+   uint32_t ce_stat_crc;
struct cache_entry *next;
struct cache_entry *dir_next;
char name[FLEX_ARRAY]; /* more */
diff --git a/read-cache.c b/read-cache.c
index cdd8480..9d2bd62 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -51,6 +51,29 @@ void rename_index_entry_at(struct index_state *istate, int 
nr, const char *new_n
add_index_entry(istate, new, 
ADD_CACHE_OK_TO_ADD|ADD_CACHE_OK_TO_REPLACE);
 }
 
+static uint32_t calculate_stat_crc(struct cache_entry *ce)
+{
+   unsigned int ctimens = 0;
+   uint32_t stat, stat_crc;
+
+   stat = htonl(ce-ce_ctime.sec);
+   stat_crc = crc32(0, (Bytef*)stat, 4);
+#ifdef USE_NSEC
+   ctimens = ce-ce_ctime.nsec;
+#endif
+   stat = htonl(ctimens);
+   stat_crc = crc32(stat_crc, (Bytef*)stat, 4);
+   stat = htonl(ce-ce_ino);
+   stat_crc = crc32(stat_crc, (Bytef*)stat, 4);
+   stat = htonl(ce-ce_dev);
+   stat_crc = crc32(stat_crc, (Bytef*)stat, 4);
+   stat = htonl(ce-ce_uid);
+   stat_crc = crc32(stat_crc, (Bytef*)stat, 4);
+   stat = htonl(ce-ce_gid);
+   stat_crc = crc32(stat_crc, (Bytef*)stat, 4);
+   return stat_crc;
+}
+
 /*
  * This only updates the non-critical parts of the directory
  * cache, ie the parts that aren't tracked by GIT, and only used
@@ -73,6 +96,8 @@ void fill_stat_cache_info(struct cache_entry *ce, struct stat 
*st)
 
if (S_ISREG(st-st_mode))
ce_mark_uptodate(ce);
+
+   ce-ce_stat_crc = calculate_stat_crc(ce);
 }
 
 static int ce_compare_data(struct cache_entry *ce, struct stat *st)
-- 
1.7.11.2

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH/RFC v4 01/13] Move index v2 specific functions to their own file

2012-08-16 Thread Thomas Gummerer
Move index version 2 specific functions to their own file,
to prepare for the addition of a new index file format. With
the split into two files we have the non-index specific
functions in read-cache.c and the index-v2 specific functions
in read-cache-v2.c

Helped-by: Nguyen Thai Ngoc Duy pclo...@gmail.com
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 Makefile |   2 +
 cache.h  |  13 +-
 read-cache-v2.c  | 581 
 read-cache.c | 612 +++
 read-cache.h |  57 +
 test-index-version.c |   7 +-
 6 files changed, 682 insertions(+), 590 deletions(-)
 create mode 100644 read-cache-v2.c
 create mode 100644 read-cache.h

diff --git a/Makefile b/Makefile
index 6b0c961..3ccd3a8 100644
--- a/Makefile
+++ b/Makefile
@@ -645,6 +645,7 @@ LIB_H += progress.h
 LIB_H += prompt.h
 LIB_H += quote.h
 LIB_H += reachable.h
+LIB_H += read-cache.h
 LIB_H += reflog-walk.h
 LIB_H += refs.h
 LIB_H += remote.h
@@ -768,6 +769,7 @@ LIB_OBJS += prompt.o
 LIB_OBJS += quote.o
 LIB_OBJS += reachable.o
 LIB_OBJS += read-cache.o
+LIB_OBJS += read-cache-v2.o
 LIB_OBJS += reflog-walk.o
 LIB_OBJS += refs.o
 LIB_OBJS += remote.o
diff --git a/cache.h b/cache.h
index 67f28b4..c77cdbe 100644
--- a/cache.h
+++ b/cache.h
@@ -94,16 +94,8 @@ unsigned long git_deflate_bound(git_zstream *, unsigned 
long);
  */
 #define DEFAULT_GIT_PORT 9418
 
-/*
- * Basic data structures for the directory cache
- */
 
 #define CACHE_SIGNATURE 0x44495243 /* DIRC */
-struct cache_header {
-   unsigned int hdr_signature;
-   unsigned int hdr_version;
-   unsigned int hdr_entries;
-};
 
 #define INDEX_FORMAT_LB 2
 #define INDEX_FORMAT_UB 4
@@ -267,6 +259,7 @@ struct index_state {
unsigned name_hash_initialized : 1,
 initialized : 1;
struct hash_table name_hash;
+   struct index_ops *ops;
 };
 
 extern struct index_state the_index;
@@ -471,8 +464,8 @@ extern int index_name_is_other(const struct index_state *, 
const char *, int);
 #define CE_MATCH_RACY_IS_DIRTY 02
 /* do stat comparison even if CE_SKIP_WORKTREE is true */
 #define CE_MATCH_IGNORE_SKIP_WORKTREE  04
-extern int ie_match_stat(const struct index_state *, struct cache_entry *, 
struct stat *, unsigned int);
-extern int ie_modified(const struct index_state *, struct cache_entry *, 
struct stat *, unsigned int);
+extern int ie_match_stat(struct index_state *, struct cache_entry *, struct 
stat *, unsigned int);
+extern int ie_modified(struct index_state *, struct cache_entry *, struct stat 
*, unsigned int);
 
 struct pathspec {
const char **raw; /* get_pathspec() result, not freed by 
free_pathspec() */
diff --git a/read-cache-v2.c b/read-cache-v2.c
new file mode 100644
index 000..2c5e78b
--- /dev/null
+++ b/read-cache-v2.c
@@ -0,0 +1,581 @@
+#include cache.h
+#include read-cache.h
+#include resolve-undo.h
+#include cache-tree.h
+#include varint.h
+
+/* Mask for the name length in ce_flags in the on-disk index */
+#define CE_NAMEMASK  (0x0fff)
+
+struct cache_header {
+   unsigned int hdr_entries;
+};
+
+/*
+ * Index File I/O
+ */
+
+/*
+ * dev/ino/uid/gid/size are also just tracked to the low 32 bits
+ * Again - this is just a (very strong in practice) heuristic that
+ * the inode hasn't changed.
+ *
+ * We save the fields in big-endian order to allow using the
+ * index file over NFS transparently.
+ */
+struct ondisk_cache_entry {
+   struct cache_time ctime;
+   struct cache_time mtime;
+   unsigned int dev;
+   unsigned int ino;
+   unsigned int mode;
+   unsigned int uid;
+   unsigned int gid;
+   unsigned int size;
+   unsigned char sha1[20];
+   unsigned short flags;
+   char name[FLEX_ARRAY]; /* more */
+};
+
+/*
+ * This struct is used when CE_EXTENDED bit is 1
+ * The struct must match ondisk_cache_entry exactly from
+ * ctime till flags
+ */
+struct ondisk_cache_entry_extended {
+   struct cache_time ctime;
+   struct cache_time mtime;
+   unsigned int dev;
+   unsigned int ino;
+   unsigned int mode;
+   unsigned int uid;
+   unsigned int gid;
+   unsigned int size;
+   unsigned char sha1[20];
+   unsigned short flags;
+   unsigned short flags2;
+   char name[FLEX_ARRAY]; /* more */
+};
+
+/* These are only used for v3 or lower */
+#define align_flex_name(STRUCT,len) ((offsetof(struct STRUCT,name) + (len) + 
8)  ~7)
+#define ondisk_cache_entry_size(len) align_flex_name(ondisk_cache_entry,len)
+#define ondisk_cache_entry_extended_size(len) 
align_flex_name(ondisk_cache_entry_extended,len)
+#define ondisk_ce_size(ce) (((ce)-ce_flags  CE_EXTENDED) ? \
+   ondisk_cache_entry_extended_size(ce_namelen(ce

[PATCH/RFC v4 04/13] Add documentation of the index-v5 file format

2012-08-16 Thread Thomas Gummerer
Add a documentation of the index file format version 5 to
Documentation/technical.

Helped-by: Michael Haggerty mhag...@alum.mit.edu
Helped-by: Junio C Hamano gits...@pobox.com
Helped-by: Thomas Rast tr...@student.ethz.ch
Helped-by: Nguyen Thai Ngoc Duy pclo...@gmail.com
Helped-by: Robin Rosenberg robin.rosenb...@dewire.com
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 Documentation/technical/index-file-format-v5.txt | 296 +++
 1 file changed, 296 insertions(+)
 create mode 100644 Documentation/technical/index-file-format-v5.txt

diff --git a/Documentation/technical/index-file-format-v5.txt 
b/Documentation/technical/index-file-format-v5.txt
new file mode 100644
index 000..563d830
--- /dev/null
+++ b/Documentation/technical/index-file-format-v5.txt
@@ -0,0 +1,296 @@
+GIT index format
+
+
+== The git index
+
+   The git index file (.git/index) documents the status of the files
+ in the git staging area.
+
+   The staging area is used for preparing commits, merging, etc.
+
+== The git index file format
+
+   All binary numbers are in network byte order. Version 5 is described
+ here. The index file consists of various sections. They appear in
+ the following order in the file.
+
+   - header: the description of the index format, including its signature,
+ version and various other fields that are used internally.
+
+   - diroffsets (ndir entries of directory offset): A 4-byte offset
+   relative to the beginning of the direntries block (see below)
+   for each of the ndir directories in the index, sorted by pathname
+   (of the directory it's pointing to). [1]
+
+   - direntries (ndir entries of directory entry): A directory entry
+   for each of the ndir directories in the index, sorted by pathname
+   (see below). [2]
+
+   - fileoffsets (nfile entries of file offset): A 4-byte offset
+   relative to the beginning of the fileentries block (see below)
+   for each of the nfile files in the index. [1]
+
+   - fileentries (nfile entries of file entry): A file entry for
+   each of the nfile files in the index (see below).
+
+   - crdata: A number of entries for conflicted data/resolved conflicts
+   (see below).
+
+   - Extensions (Currently none, see below in the future)
+
+ Extensions are identified by signature. Optional extensions can
+ be ignored if GIT does not understand them.
+
+ GIT supports an arbitrary number of extensions, but currently none
+ is implemented. [3]
+
+ extsig (32-bits): extension signature. If the first byte is 'A'..'Z'
+ the extension is optional and can be ignored.
+
+ extsize (32-bits): size of the extension, excluding the header
+   (extsig, extsize, extchecksum).
+
+ extchecksum (32-bits): crc32 checksum of the extension signature
+   and size.
+
+- Extension data.
+
+== Header
+   sig (32-bits): Signature:
+ The signature is { 'D', 'I', 'R', 'C' } (stands for dircache)
+
+   vnr (32-bits): Version number:
+ The current supported versions are 2, 3, 4 and 5.
+
+   ndir (32-bits): number of directories in the index.
+
+   nfile (32-bits): number of file entries in the index.
+
+   fblockoffset (32-bits): offset to the file block, relative to the
+ beginning of the file.
+
+   - Offset to the extensions.
+
+ nextensions (32-bits): number of extensions.
+
+ extoffset (32-bits): offset to the extension. (Possibly none, as
+   many as indicated in the 4-byte number of extensions)
+
+   headercrc (32-bits): crc checksum including the header and the
+ offsets to the extensions.
+
+
+== Directory offsets (diroffsets)
+
+  diroffset (32-bits): offset to the directory relative to the beginning
+of the index file. There are ndir + 1 offsets in the diroffset table,
+the last is pointing to the end of the last direntry. With this last
+entry, we are able to replace the strlen when reading the directory
+name, by calculating it from diroffset[n+1]-diroffset[n]-61.  61 is the
+size of the directory data, which follows each directory + the
+crc sum + the NUL byte.
+
+  This part is needed for making the directory entries bisectable and
+thus allowing a binary search.
+
+== Directory entry (direntries)
+  
+  Directory entries are sorted in lexicographic order by the name 
+of their path starting with the root.
+  
+  pathname (variable length, nul terminated): relative to top level
+directory (without the leading slash). '/' is used as path
+separator. A string of length 0 ('') indicates the root directory.
+The special path components "." and ".." (without quotes) are
+disallowed. The path also includes a trailing slash. [9]
+
+  foffset (32-bits): offset to the lexicographically first file in 
+the file offsets (fileoffsets), relative to the beginning of
+the fileoffset block.
+
+  cr (32-bits): offset to conflicted/resolved data at the end of the
+index

Re: [PATCH/RFC v3 07/13] Read resolve-undo data

2012-08-10 Thread Thomas Gummerer
On 08/09, Junio C Hamano wrote:
 Thomas Gummerer t.gumme...@gmail.com writes:
 
  On 08/09, Junio C Hamano wrote:
  Thomas Gummerer t.gumme...@gmail.com writes:
  
   Make git read the resolve-undo data from the index.
  
   Since the resolve-undo data is joined with the conflicts in
   the ondisk format of the index file version 5, conflicts and
   resolved data is read at the same time, and the resolve-undo
   data is then converted to the in-memory format.
  
  This, and the next one, are both about reading extension data from
  the v2 formatted index, no?
 
  Yes, exactly.
 
  Again, mild NAK.
  
  I think it is a lot more logical for the v5 code to read data stored
  in the resolve-undo and cache-tree extensions using the public API
  just like other users of these data do, and write out whatever in a
  way that is specific to the v5 index format.
  
  If the v5 codepath needs some information that is not exposed to
  other users of istate-resolve_undo and istate-cache_tree, then the
  story is different, but I do not think that is the case.
 
  Sorry it's not clear to me what you mean with using the public API here.
  Do you mean using resolve_undo_write() and resolve_undo_read()?
 
 The code that reads from istate-resolve_undo is fine to do the v5
 specific conversion, but it does not belong to resolve-undo.c file
 which is about the resolve-undo extension.  Moving it to v5 specific
 file you added for this topic, read-cache-v5.c, and everything looks
 more logical.  When we taught ls-files to show the paths with
 resolve-undo data, we didn't add any function to resolve-undo.c that
 does ls-files's work for it.  Instead, ls-files just uses the public
 API (the data structure you find at the_index.resolve_undo is part
 of the API) to find what it needs to learn, and I think v5 code can
 do the same.
 
 "then the story is different" comment refers to a possibility that
 v5 code might need something more than callers outside resolve-undo.c
 can find from its public interface, but I do not think it is the
 case.

Ok, thanks for the clarification, will change it for the re-roll.
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH/RFC v3 01/13] Move index v2 specific functions to their own file

2012-08-10 Thread Thomas Gummerer
On 08/10, Junio C Hamano wrote:
 Thomas Rast tr...@student.ethz.ch writes:
 
  But I think the idea always was that any write that changes the basic
  layout of the file (so that you would read something wrong) will need a
  full rewrite.  Otherwise we're too far in DB land.
 
  Most updates will be
  of the update the stat and/or sha1 of a file kind, anyway.
 
 Yes, I agree the v5 format documented in the series does not let you
 do anything other than the kind of updates without rewriting [*1*]
 
 But that does not fundamentally change the story that a new format
 and a new way to access the index to cope with larger projects would
 want to come up with a solution to address the competing read/write
 issue, or at least help to make it easier to solve the issue in the
 future.
 
 "That problem is not new" is not an answer when the question is "We
 still have the problem".
 
 
 [Footnote]
 
 *1* While my gut feeling matches your guess that the kind of updates
 would be the majority, I do not think anybody did numbers to
 substantiate it, which we may want to see happen.

Hrm, another way to solve this may be the following. The idea would be
to just check if the index_file changed basically using the same
heuristic we already use to detect file changes.  (use the stat data,
mtime, size, etc.)

With this code we do not rely on the crc sums to check if the index
needs to be re-read anymore and don't have a problem if part of the
index changes, after we read it (we know the index changed from its
mtime and can just re-read it).  Another thing that would have to
change is that we can't use die if a crc is wrong, but some return
code, but that shouldn't be a problem.  I'm not sure I'm not missing
something here though.

do {
fd = open()
fstat(fd, st_old);
mmap = xmmap(fd);
verify_various_fields(mmap);
istate-ops-read_index(istate,
mmap,
mmap_size));
fstat(fd, st_new);
close(fd);
} while (stat_data_doesnt_match(st_old, st_new));


--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH/RFC v2 06/16] t3700: sleep for 1 second, to avoid interfering with the racy code

2012-08-09 Thread Thomas Gummerer
On 08/08, Junio C Hamano wrote:
 Junio C Hamano gits...@pobox.com writes:
 
  So whether done with sleep or test-chmtime, avoiding a racily
  clean situation sounds like sweeping a bug in the v5 code in racy
  situation under the rug to me (unless I am misunderstanding what
  you are doing with this change and in your explanation, or the test
  was checking a wrong thing, that is).
 
  Even more confused
 
 OK, after staring at this test for a long time, and going back to
 3d1f148 (refresh_index: do not show unmerged path that is outside
 pathspec, 2012-02-17), I give up.
 
 Let me ask the same question in a more direct way.  Which part of
 this test break with your series?
 
 test_expect_success 'git add --refresh with pathspec' '
 git reset --hard 
 echo foo  echo bar  echo baz 
 git add foo bar baz  H=$(git rev-parse :foo)  git rm -f 
 foo 
 echo 100644 $H 3 foo | git update-index --index-info 
   # sleep 1  in the update here ...
 test-chmtime -60 bar baz 
 expect 
 git add --refresh bar actual 
 test_cmp expect actual 
 
 git diff-files --name-only actual 
 ! grep bar actual
 grep baz actual
 '
 
 We prepare an index with a bunch of paths, we make foo unmerged, we
 smudge bar and baz stat-dirty, so that diff-files would report
 them, even though their contents match what is recorded in the
 index.

After getting confused a bit myself, I now think here is the problem.
The v5 code smudges baz when doing git add --refresh bar.  Therefore
baz isn't considered stat-dirty by the code, but a racily smudged entry
and therefore its content gets checked, thus not showing up in
git diff-files.  The mtime doesn't get checked anymore as it is used
as smudge marker and thus 0.  Adding sleep just avoids smudging the
entry.

The alternative would be to use the size or the crc as smudge marker
but I don't think they are good candidates, as they can still be used
by the reader to avoid checking the filesystem.

Another alternative would be to introduce a CE_SMUDGED flag as it was
suggested by Thomas on irc IIRC, but we chose to use the mtime as
smudge marker instead.

 Then we say git add --refresh bar.  As far as I know, the output
 from git add --refresh pathspec is limited to foo: needs merge
 if and only if foo is covered by pathspec and foo is unmerged.
 
   Side note: If --verbose is given to the same command, we
   also give Unstaged changes after refreshing the index:
   followed by M foo or U foo if foo does not match the
   index but not unmerged, or if foo is unmerged, again if
   and only if foo is covered by pathspec.  But that is not
   how we invoke git add --refresh in this test.
 
 So if you are getting a test failure from the test_cmp, wouldn't it
 mean that your series broke what 3d1f148 did (namely, make sure we
 report only on paths that are covered by pathspec, in this case
 bar), as the contents of bar in the working tree matches what is
 recorded in the index?
 
 If the failure you are seeing is that bar appears in the output of
 git diff-files --name-only, it means that diff-files noticed
 that bar is stat-dirty after git add --refresh bar.  Wouldn't it
 mean that the series broke git add --refresh bar in such a way
 that it does not refresh what it was told to refresh?
 
 Another test that could fail after the point you added sleep 1 is
 that the output from git diff-files --name-only fails to list
 baz in its output, but with test-chmtime -60 bar baz, we made
 sure that bar and baz are stat-dirty, and we only refreshed
 bar and not baz.  If that is the case, then would it mean that
 the series broke git add --refresh bar in such a way that it
 refreshes something other than what it was told to refresh?

 In any case, having to change this test in any way smells like there
 is some breakage in the series; it is not immediately obvious to me
 that the current test is checking anything wrong as I suspected in
 the earlier message.
 
 So,... I dunno.
 
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH/RFC v2 06/16] t3700: sleep for 1 second, to avoid interfering with the racy code

2012-08-09 Thread Thomas Gummerer
On 08/09, Junio C Hamano wrote:
 Thomas Gummerer t.gumme...@gmail.com writes:
 
  On 08/08, Junio C Hamano wrote:
  ...
  Let me ask the same question in a more direct way.  Which part of
  this test break with your series?
  
  test_expect_success 'git add --refresh with pathspec' '
  git reset --hard 
  echo foo  echo bar  echo baz 
  git add foo bar baz  H=$(git rev-parse :foo)  git rm 
  -f foo 
  echo 100644 $H 3  foo | git update-index --index-info 
 # sleep 1  in the update here ...
  test-chmtime -60 bar baz 
  expect 
  git add --refresh bar actual 
  test_cmp expect actual 
  
  git diff-files --name-only actual 
  ! grep bar actual
  grep baz actual
  '
  
  We prepare an index with a bunch of paths, we make foo unmerged, we
  smudge bar and baz stat-dirty, so that diff-files would report
  them, even though their contents match what is recorded in the
  index.
 
  After getting confused a bit myself, I now think here is the problem.
  The v5 code smudges baz when doing git add --refresh bar.  Therefore
  baz isn't considered stat-dirty by the code, but a racily smudged entry
  and therefore its content gets checked, thus not showing up in
  git diff-files.
 
 So in short, the breakage is the last one among the three choices I
 gave you in my message you are responding to.  The user asked to
 refresh bar so that later diff-files won't report a false change
 on it, but baz effectively ends up getting refreshed at the same
 time and a false change is not reported.

Exactly.

 That breakage is, from the correctness point of view, not a
 breakage.  As the primary purpose of refreshing is to support
 commands that want to rely on a quick ce_modified() call to tell
 files that are modified in the working tree since it was last added
 to the index---you refresh once, and then you call such commands
 many times without having to worry about having to compare the
 contents between the indexed objects and the working tree files.
 
 But from the performance point of view, which is the whole point of
 refresh, the behaviour of the new code is dubious.  If the user is
 working in a large working tree (which automatically means large
 index, the primary reason we are doing this v5 experiment), the user
 often is working in a deep and narrow subdirectory of it, and a path
 limited refresh (the test names a specific file bar, but imagine
 it were . to limit it to the directory the user is working in) may
 be a cheap way not to bother even checking outside the area the user
 currently is working in.

That's true, but once we have the partial reader/writer, we do not
bother checking outside the area the user is currently working in
anyway.

Also and probably more importantly, this will only affect a *very*
small number of entries, because timestamps outside of the directory
in which the user is working are rarely updated recently and
thus racy.

 Also, smudging more entries than necessary
 to be checked by ce_modified_check_fs() later at runtime may mean
 that it defeats the refresh once and then compare cheaply many
 times pattern that is employed by existing scripts.

The new racy code also calls ce_modified_check_fs() only if the size
and the stat_crc are not changed.  It's true that ce_modified_check_fs()
can be called multiple times, when match_stat_crc() is called, but that
could be solved by adding an additional flag CE_IS_MODIFIED, which
indicates that ce_modified_check_fs() was already run.

 Is the root cause really where the racily-clean so smudge to tell
 later runtime to check contents bit goes?  I am hoping that the
 issue is not coming from the difference between the current code and
 your code when they decide to smudge, what entries they decide to
 smudge and based on what condition.

I just gave it a try using a CE_SMUDGED flag, instead of the mtime
as smudge marker, with which this test works without any problems.
It doesn't work the other way round, as the test doesn't
break when using mtime as smudge marker in v2, because we do the
ce_modified_check_fs() test earlier.
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH/RFC v3 01/13] Move index v2 specific functions to their own file

2012-08-09 Thread Thomas Gummerer
On 08/09, Junio C Hamano wrote:
 Thomas Gummerer t.gumme...@gmail.com writes:
 
   /* remember to discard_cache() before reading a different cache! */
   int read_index_from(struct index_state *istate, const char *path)
   {
  ...
  mmap = xmmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 
  0);
  -   close(fd);
  if (mmap == MAP_FAILED)
  die_errno(unable to map index file);
   
  hdr = mmap;
  -   if (verify_hdr(hdr, mmap_size)  0)
  +   if (verify_hdr_version(istate, hdr, mmap_size)  0)
  goto unmap;
   ...
  +   if (istate-ops-verify_hdr(mmap, mmap_size)  0)
  +   goto unmap;
   
  +   istate-ops-read_index(istate, mmap, mmap_size, fd);
  ...
  +   close(fd);
 
 This looks utterly wrong.
 
 You already have mapped the whole thing, so there is nothing to be
 read from fd.  You have everything in-core.  Leaving fd open and
 pass it around looks like it is asking for trouble and confusion.
 
 If you found that an entry you read halfway has an inconsistent crc,
 and if you suspect that is because somebody else was writing to the
 same index, it is a _sure_ sign that you are not alone, and all the
 entries you read so far to the core, even if they weren't touched by
 that somebody else when you read them, may be stale, and worse yet,
 what you are going to read may be inconsistent with what you've read
 and have in-core (e.g. you may have read f before somebody else
 that is racing with you has turned it into a directory, and your
 next read may find f/d in the index without crc error).
 
 One sane way to avoid reading such an inconsistent state may be to
 redo this whole function, starting from the part that calls mmap().
 IOW,
 
   do {
   fd = open()
   mmap = xmmap(fd);
   close(fd);
 verify_various_fields(mmap);
 status = istate-ops-read_index(istate, mmap, mmap_size));
   } while (status == READ_AGAIN);
 
 I do not think the pass fd around so that we can redo the mapping
 deep inside the callchain is either a good idea or necessary.

Thanks, that looks better.  I'll change it for the re-roll.
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH/RFC v3 07/13] Read resolve-undo data

2012-08-09 Thread Thomas Gummerer
On 08/09, Junio C Hamano wrote:
 Thomas Gummerer t.gumme...@gmail.com writes:
 
  Make git read the resolve-undo data from the index.
 
  Since the resolve-undo data is joined with the conflicts in
  the ondisk format of the index file version 5, conflicts and
  resolved data is read at the same time, and the resolve-undo
  data is then converted to the in-memory format.
 
 This, and the next one, are both about reading extension data from
 the v2 formatted index, no?

Yes, exactly.

 Again, mild NAK.
 
 I think it is a lot more logical for the v5 code to read data stored
 in the resolve-undo and cache-tree extensions using the public API
 just like other users of these data do, and write out whatever in a
 way that is specific to the v5 index format.
 
 If the v5 codepath needs some information that is not exposed to
 other users of istate-resolve_undo and istate-cache_tree, then the
 story is different, but I do not think that is the case.

Sorry it's not clear to me what you mean with using the public API here.
Do you mean using resolve_undo_write() and resolve_undo_read()? I
wouldn't think those two methods would be really useful, as they expect
the data mangled in to a char* or return it as struct strbuf*.  And I
don't see the other methods doing something more useful.  Or I could
use the resolve-undo string_list directly, and just move the function
to read-cache-v5.c, or am I missing something here?

 
  Helped-by: Thomas Rast tr...@student.ethz.ch
  Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
  ---
   read-cache-v5.c |1 +
   resolve-undo.c  |   36 
   resolve-undo.h  |2 ++
   3 files changed, 39 insertions(+)
 
  diff --git a/read-cache-v5.c b/read-cache-v5.c
  index ec1201d..b47398d 100644
  --- a/read-cache-v5.c
  +++ b/read-cache-v5.c
  @@ -494,6 +494,7 @@ static struct directory_entry *read_entries(struct 
  index_state *istate,
  int i;
   
  conflict_queue = read_conflicts(de, mmap, mmap_size, fd);
  +   resolve_undo_convert_v5(istate, conflict_queue);
  for (i = 0; i  de-de_nfiles; i++) {
  ce = read_entry(de,
  entry_offset,
  diff --git a/resolve-undo.c b/resolve-undo.c
  index 72b4612..f96c6ba 100644
  --- a/resolve-undo.c
  +++ b/resolve-undo.c
  @@ -170,3 +170,39 @@ void unmerge_index(struct index_state *istate, const 
  char **pathspec)
  i = unmerge_index_entry_at(istate, i);
  }
   }
  +
  +void resolve_undo_convert_v5(struct index_state *istate,
  +   struct conflict_entry *ce)
  +{
  +   int i;
  +
  +   while (ce) {
  +   struct string_list_item *lost;
  +   struct resolve_undo_info *ui;
  +   struct conflict_part *cp;
  +
  +   if (ce-entries  (ce-entries-flags  CONFLICT_CONFLICTED) 
  != 0) {
  +   ce = ce-next;
  +   continue;
  +   }
  +   if (!istate-resolve_undo) {
  +   istate-resolve_undo = xcalloc(1, sizeof(struct 
  string_list));
  +   istate-resolve_undo-strdup_strings = 1;
  +   }
  +
  +   lost = string_list_insert(istate-resolve_undo, ce-name);
  +   if (!lost-util)
  +   lost-util = xcalloc(1, sizeof(*ui));
  +   ui = lost-util;
  +
  +   cp = ce-entries;
  +   for (i = 0; i  3; i++)
  +   ui-mode[i] = 0;
  +   while (cp) {
  +   ui-mode[conflict_stage(cp) - 1] = cp-entry_mode;
  +   hashcpy(ui-sha1[conflict_stage(cp) - 1], cp-sha1);
  +   cp = cp-next;
  +   }
  +   ce = ce-next;
  +   }
  +}
  diff --git a/resolve-undo.h b/resolve-undo.h
  index 8458769..ab660a6 100644
  --- a/resolve-undo.h
  +++ b/resolve-undo.h
  @@ -13,4 +13,6 @@ extern void resolve_undo_clear_index(struct index_state 
  *);
   extern int unmerge_index_entry_at(struct index_state *, int);
   extern void unmerge_index(struct index_state *, const char **);
   
  +extern void resolve_undo_convert_v5(struct index_state *, struct 
  conflict_entry *);
  +
   #endif
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH/RFC v2 15/16] update-index.c: add a force-rewrite option

2012-08-08 Thread Thomas Gummerer
On 08/05, Junio C Hamano wrote:
 Thomas Gummerer t.gumme...@gmail.com writes:
 
  Add a force-rewrite option to update-index, which allows the user
  to rewrite the index, even if there are no changes. This can be used
  to do performance tests of both the reader and the writer.
 
  Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
  ---
   builtin/update-index.c |5 -
   1 file changed, 4 insertions(+), 1 deletion(-)
 
 I do not think this is wrong per-se, but is a new command that needs
 to be documented?  If it is only for benchmarking and debugging, it
 might be sufficient to make --index-version n always rewrite the
 index.

The command is only for benchmarking, I don't see another case where
it makes sense for anyone to rewrite the whole index, without changing
anything. I've made --index-version rewrite the index for the re-roll.

  diff --git a/builtin/update-index.c b/builtin/update-index.c
  index 4ce341c..7fedc8f 100644
  --- a/builtin/update-index.c
  +++ b/builtin/update-index.c
  @@ -24,6 +24,7 @@ static int allow_remove;
   static int allow_replace;
   static int info_only;
   static int force_remove;
  +static int force_rewrite;
   static int verbose;
   static int mark_valid_only;
   static int mark_skip_worktree_only;
  @@ -728,6 +729,8 @@ int cmd_update_index(int argc, const char **argv, const 
  char *prefix)
  OPT_BIT(0, unmerged, refresh_args.flags,
  refresh even if index contains unmerged entries,
  REFRESH_UNMERGED),
  +   OPT_SET_INT(0, force-rewrite, force_rewrite,
  +   force a index rewrite even if there is no change, 1),
  {OPTION_CALLBACK, 0, refresh, refresh_args, NULL,
  refresh stat information,
  PARSE_OPT_NOARG | PARSE_OPT_NONEG,
  @@ -886,7 +889,7 @@ int cmd_update_index(int argc, const char **argv, const 
  char *prefix)
  strbuf_release(buf);
  }
   
  -   if (active_cache_changed) {
  +   if (active_cache_changed || force_rewrite) {
  if (newfd  0) {
  if (refresh_args.flags  REFRESH_QUIET)
  exit(128);
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH/RFC v2 09/16] Read index-v5

2012-08-08 Thread Thomas Gummerer
On 08/05, Junio C Hamano wrote:
 Thomas Gummerer t.gumme...@gmail.com writes:
 
  +static struct directory_entry *read_directories_v5(unsigned int 
  *dir_offset,
  +   unsigned int *dir_table_offset,
  +   void *mmap,
  +   int mmap_size)
  +{
  +   int i, ondisk_directory_size;
  +   uint32_t *filecrc, *beginning, *end;
  +   struct directory_entry *current = NULL;
  +   struct ondisk_directory_entry *disk_de;
  +   struct directory_entry *de;
  +   unsigned int data_len, len;
  +   char *name;
  +
  +   ondisk_directory_size = sizeof(disk_de-flags)
  +   + sizeof(disk_de-foffset)
  +   + sizeof(disk_de-cr)
  +   + sizeof(disk_de-ncr)
  +   + sizeof(disk_de-nsubtrees)
  +   + sizeof(disk_de-nfiles)
  +   + sizeof(disk_de-nentries)
  +   + sizeof(disk_de-sha1);
  +   name = (char *)mmap + *dir_offset;
  +   beginning = mmap + *dir_table_offset;
 
 Notice how you computed name with pointer arithmetic by first
 casting mmap (which is void *) and when computing beginning, you
 forgot to cast mmap and attempted pointer arithmetic with void *.
 The latter does not work and breaks compilation.
 
 The pointer-arith with void * is not limited to this function.

Sorry for not noticing this, it always compiled fine for me. Guess
I should use -pedantic more often ;-)

 Please check the a band-aid (I wouldn't call it a fix-up) patch I
 added on top of the series before queuing the topic to 'pu'; it is
 primarily to illustrate the places I noticed that have this issue.
 
 I do not necessarily suggest that the way the band-aid patch makes
 it compile is the best approach.  It might be cleaner to use a saner
 type like char * (or perhaps const char *) as the type to point
 at a piece of memory you read from the disk.  I haven't formed an
 opinion.
 
 Thanks.

I've used the type of the respective assignment for now, e.g. I have
struct cache_header *hdr, so I'm using
hdr = (struct cache_header *)mmap + x;

read-cache-v5.c compiles with -pedantic without warnings.
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] Add index-v5

2012-08-08 Thread Thomas Gummerer


On 08/07, Robin Rosenberg wrote:
 Nguyễn Thái Ngọc Duy skrev 2012-08-06 16.36:
 
 +++ b/read-cache-v5.c
 @@ -0,0 +1,1170 @@
 +#include cache.h
 +#include read-cache.h
 +#include resolve-undo.h
 +#include cache-tree.h
 +
 +struct cache_header_v5 {
 +unsigned int hdr_ndir;
 +unsigned int hdr_nfile;
 +unsigned int hdr_fblockoffset;
 +unsigned int hdr_nextension;
 +};
 +
 +struct ondisk_cache_entry_v5 {
 +unsigned short flags;
 +unsigned short mode;
 +struct cache_time mtime;
 +int stat_crc;
 +unsigned char sha1[20];
 +};
 
 I mentioned this before in another thread, but for JGit I'd like
 to see size as a separate attribute. The rest of stat_crc is not
 available to Java so when this index gets its way into JGit,
 stat_crc will be zero and will never be checked.
 

I'm sorry for forgetting to add this, it will be included in the
re-roll.  The stat_crc will be ignored if it is 0 in the ondisk
index.
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH/RFC v3 05/13] Make in-memory format aware of stat_crc

2012-08-08 Thread Thomas Gummerer
Make the in-memory format aware of the stat_crc used by index-v5.
It is simply ignored by index versions prior to v5.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 cache.h  |1 +
 read-cache.c |   25 +
 2 files changed, 26 insertions(+)

diff --git a/cache.h b/cache.h
index c77cdbe..bfe3099 100644
--- a/cache.h
+++ b/cache.h
@@ -122,6 +122,7 @@ struct cache_entry {
unsigned int ce_flags;
unsigned int ce_namelen;
unsigned char sha1[20];
+   uint32_t ce_stat_crc;
struct cache_entry *next;
struct cache_entry *dir_next;
char name[FLEX_ARRAY]; /* more */
diff --git a/read-cache.c b/read-cache.c
index 125e6a0..d8f8b74 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -51,6 +51,29 @@ void rename_index_entry_at(struct index_state *istate, int 
nr, const char *new_n
add_index_entry(istate, new, 
ADD_CACHE_OK_TO_ADD|ADD_CACHE_OK_TO_REPLACE);
 }
 
+static uint32_t calculate_stat_crc(struct cache_entry *ce)
+{
+   unsigned int ctimens = 0;
+   uint32_t stat, stat_crc;
+
+   stat = htonl(ce-ce_ctime.sec);
+   stat_crc = crc32(0, (Bytef*)stat, 4);
+#ifdef USE_NSEC
+   ctimens = ce-ce_ctime.nsec;
+#endif
+   stat = htonl(ctimens);
+   stat_crc = crc32(stat_crc, (Bytef*)stat, 4);
+   stat = htonl(ce-ce_ino);
+   stat_crc = crc32(stat_crc, (Bytef*)stat, 4);
+   stat = htonl(ce-ce_dev);
+   stat_crc = crc32(stat_crc, (Bytef*)stat, 4);
+   stat = htonl(ce-ce_uid);
+   stat_crc = crc32(stat_crc, (Bytef*)stat, 4);
+   stat = htonl(ce-ce_gid);
+   stat_crc = crc32(stat_crc, (Bytef*)stat, 4);
+   return stat_crc;
+}
+
 /*
  * This only updates the non-critical parts of the directory
  * cache, ie the parts that aren't tracked by GIT, and only used
@@ -73,6 +96,8 @@ void fill_stat_cache_info(struct cache_entry *ce, struct stat 
*st)
 
if (S_ISREG(st-st_mode))
ce_mark_uptodate(ce);
+
+   ce-ce_stat_crc = calculate_stat_crc(ce);
 }
 
 static int ce_compare_data(struct cache_entry *ce, struct stat *st)
-- 
1.7.10.GIT

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH/RFC v3 02/13] t2104: Don't fail for index versions other than [23]

2012-08-08 Thread Thomas Gummerer
t2104 currently checks for the exact index version 2 or 3,
depending if there is a skip-worktree flag or not. Other
index versions do not use extended flags and thus cannot
be tested for version changes.

Make this test update the index to version 2 at the beginning
of the test. Testing the skip-worktree flags for the default
index format is still covered by t7011 and t7012.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 t/t2104-update-index-skip-worktree.sh |1 +
 1 file changed, 1 insertion(+)

diff --git a/t/t2104-update-index-skip-worktree.sh 
b/t/t2104-update-index-skip-worktree.sh
index 1d0879b..bd9644f 100755
--- a/t/t2104-update-index-skip-worktree.sh
+++ b/t/t2104-update-index-skip-worktree.sh
@@ -22,6 +22,7 @@ H sub/2
 EOF
 
 test_expect_success 'setup' '
+   git update-index --index-version=2 
mkdir sub 
touch ./1 ./2 sub/1 sub/2 
git add 1 2 sub/1 sub/2 
-- 
1.7.10.GIT

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH/RFC v3 13/13] p0002-index.sh: add perf test for the index formats

2012-08-08 Thread Thomas Gummerer
From: Thomas Rast tr...@student.ethz.ch

Add a performance test for index version [23]/4/5 by using
git update-index --index-version=[345], thus testing both the reader
and the writer speed of all index formats.

Signed-off-by: Thomas Rast tr...@student.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 t/perf/p0002-index.sh |   33 +
 1 file changed, 33 insertions(+)
 create mode 100755 t/perf/p0002-index.sh

diff --git a/t/perf/p0002-index.sh b/t/perf/p0002-index.sh
new file mode 100755
index 000..140c7a0
--- /dev/null
+++ b/t/perf/p0002-index.sh
@@ -0,0 +1,33 @@
+#!/bin/sh
+#
+# Perf test: for each on-disk index format, convert the index to that
+# version and then time "git update-index --index-version=<n>".
+
+test_description="Tests index versions [23]/4/5"
+
+. ./perf-lib.sh
+
+test_perf_large_repo
+
+# The conversion runs outside the timed section, so the timed command
+# starts from an index that is already in the requested version.
+test_expect_success 'convert to v3' '
+	git update-index --index-version=3
+'
+
+test_perf 'v[23]: update-index' '
+	git update-index --index-version=3 >/dev/null
+'
+
+test_expect_success 'convert to v4' '
+	git update-index --index-version=4
+'
+
+test_perf 'v4: update-index' '
+	git update-index --index-version=4 >/dev/null
+'
+
+test_expect_success 'convert to v5' '
+	git update-index --index-version=5
+'
+
+test_perf 'v5: update-index' '
+	git update-index --index-version=5 >/dev/null
+'
+
+test_done
-- 
1.7.10.GIT

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH/RFC v3 06/13] Read index-v5

2012-08-08 Thread Thomas Gummerer
Make git read the index file version 5 without complaining.

This version of the reader reads neither the cache-tree
nor the resolve undo data, but doesn't choke on an index that
includes such data.

Helped-by: Nguyen Thai Ngoc Duy pclo...@gmail.com
Helped-by: Thomas Rast tr...@student.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 Makefile|1 +
 cache.h |   72 +++
 read-cache-v5.c |  589 +++
 read-cache.c|1 -
 4 files changed, 662 insertions(+), 1 deletion(-)
 create mode 100644 read-cache-v5.c

diff --git a/Makefile b/Makefile
index b4a7c73..77be175 100644
--- a/Makefile
+++ b/Makefile
@@ -770,6 +770,7 @@ LIB_OBJS += quote.o
 LIB_OBJS += reachable.o
 LIB_OBJS += read-cache.o
 LIB_OBJS += read-cache-v2.o
+LIB_OBJS += read-cache-v5.o
 LIB_OBJS += reflog-walk.o
 LIB_OBJS += refs.o
 LIB_OBJS += remote.o
diff --git a/cache.h b/cache.h
index bfe3099..a0a1781 100644
--- a/cache.h
+++ b/cache.h
@@ -110,6 +110,15 @@ struct cache_time {
unsigned int nsec;
 };
 
+/*
+ * The *next pointer is used in read_entries_v5 for holding
+ * all the elements of a directory, and points to the next
+ * cache_entry in a directory.
+ *
+ * It is reset by the add_name_hash call in set_index_entry
+ * to set it to point to the next cache_entry in the
+ * correct in-memory format ordering.
+ */
 struct cache_entry {
struct cache_time ce_ctime;
struct cache_time ce_mtime;
@@ -128,11 +137,58 @@ struct cache_entry {
char name[FLEX_ARRAY]; /* more */
 };
 
+/*
+ * One directory of a version 5 index, as kept in memory.
+ *
+ * pathname is a flexible array member; directory_entry_size(len) gives
+ * the allocation size for a path of length len.
+ */
+struct directory_entry {
+   struct directory_entry *next; /* next directory in the list of directories */
+   struct directory_entry *next_hash; /* next entry in the same hash chain */
+   struct cache_entry *ce; /* presumably the first cache entry of this directory -- confirm */
+   struct cache_entry *ce_last; /* presumably the last cache entry of this directory -- confirm */
+   struct conflict_entry *conflict; /* presumably the head of this directory's conflict list -- confirm */
+   struct conflict_entry *conflict_last; /* presumably the tail of that list -- confirm */
+   unsigned int conflict_size;
+   unsigned int de_foffset; /* "foffset" field of the v5 directory entry */
+   unsigned int de_cr; /* "cr" field: offset to conflicted/resolved data */
+   unsigned int de_ncr; /* "ncr" field: number of conflicted/resolved entries */
+   unsigned int de_nsubtrees; /* number of subdirectories of this directory */
+   unsigned int de_nfiles; /* number of files in this directory */
+   unsigned int de_nentries; /* copied to/from cache_tree.entry_count */
+   unsigned char sha1[20]; /* copied to/from cache_tree.sha1 */
+   unsigned short de_flags;
+   unsigned int de_pathlen; /* length of pathname */
+   char pathname[FLEX_ARRAY];
+};
+
+/*
+ * One part of a conflict entry, kept in a singly linked list.
+ * The stage is read from flags with conflict_stage().
+ */
+struct conflict_part {
+   struct conflict_part *next; /* next part of the same conflict entry */
+   unsigned short flags; /* CONFLICT_CONFLICTED and the stage bits */
+   unsigned short entry_mode;
+   unsigned char sha1[20];
+};
+
+/*
+ * A named conflict/resolve-undo record with its list of parts.
+ * name is a flexible array member; conflict_entry_size(len) gives the
+ * allocation size for a name of length len.
+ */
+struct conflict_entry {
+   struct conflict_entry *next; /* next conflict entry in the list */
+   unsigned int nfileconflicts; /* presumably the number of parts in entries -- confirm */
+   struct conflict_part *entries; /* head of the list of parts */
+   unsigned int namelen;
+   unsigned int pathlen;
+   char name[FLEX_ARRAY];
+};
+
+/*
+ * Same members as struct conflict_part, minus the list pointer.
+ * NOTE(review): presumably mirrors the on-disk layout of a part -- confirm.
+ */
+struct ondisk_conflict_part {
+   unsigned short flags;
+   unsigned short entry_mode;
+   unsigned char sha1[20];
+};
+
+#define CE_NAMEMASK  (0x0fff)
 #define CE_STAGEMASK (0x3000)
 #define CE_EXTENDED  (0x4000)
 #define CE_VALID (0x8000)
 #define CE_STAGESHIFT 12
 
+/*
+ * Bits of conflict_part.flags: the stage is stored in the bits selected
+ * by CONFLICT_STAGEMASK and extracted with conflict_stage().
+ */
+#define CONFLICT_CONFLICTED (0x8000)
+#define CONFLICT_STAGESHIFT 13
+#define CONFLICT_STAGEMASK (0x6000)
+
 /*
  * Range 0xFFFF0000 in ce_flags is divided into
  * two parts: in-memory flags and on-disk ones.
@@ -166,6 +222,18 @@ struct cache_entry {
 #define CE_EXTENDED_FLAGS (CE_INTENT_TO_ADD | CE_SKIP_WORKTREE)
 
 /*
+ * Representation of the extended on-disk flags in the v5 format.
+ * They must not collide with the ordinary on-disk flags, and need to
+ * fit in 16 bits.  Note however that v5 does not save the name
+ * length.
+ */
+#define CE_INTENT_TO_ADD_V5  (0x4000)
+#define CE_SKIP_WORKTREE_V5  (0x0800)
+/*
+ * Build-time check.  The macro names here must match the definitions
+ * above exactly: an undefined identifier evaluates to 0 inside #if,
+ * which would silently disable the check.
+ */
+#if (CE_VALID|CE_STAGEMASK) & (CE_INTENT_TO_ADD_V5|CE_SKIP_WORKTREE_V5)
+#error v5 on-disk flags collide with ordinary on-disk flags
+#endif
+
+/*
  * Safeguard to avoid saving wrong flags:
  *  - CE_EXTENDED2 won't get saved until its semantic is known
  *  - Bits in 0x0000FFFF have been saved in ce_flags already
@@ -203,6 +271,8 @@ static inline unsigned create_ce_flags(unsigned stage)
 #define ce_skip_worktree(ce) ((ce)-ce_flags  CE_SKIP_WORKTREE)
 #define ce_mark_uptodate(ce) ((ce)-ce_flags |= CE_UPTODATE)
 
+/* Stage number stored in the flags of a struct conflict_part. */
+#define conflict_stage(c) ((CONFLICT_STAGEMASK & (c)->flags) >> CONFLICT_STAGESHIFT)
+
 #define ce_permissions(mode) (((mode)  0100) ? 0755 : 0644)
 static inline unsigned int create_ce_mode(unsigned int mode)
 {
@@ -249,6 +319,8 @@ static inline unsigned int canon_mode(unsigned int mode)
 }
 
 #define cache_entry_size(len) (offsetof(struct cache_entry,name) + (len) + 1)
+/* Allocation sizes: fixed part of the struct, len bytes of name, one extra byte. */
+#define directory_entry_size(len) (offsetof(struct directory_entry,pathname) + (len) + 1)
+#define conflict_entry_size(len) (offsetof(struct conflict_entry,name) + (len) + 1)
 
 struct index_state {
struct cache_entry **cache;
diff --git a/read-cache-v5.c b/read-cache-v5.c
new file mode 100644
index 000..ec1201d
--- /dev/null
+++ b/read-cache-v5.c
@@ -0,0 +1,589 @@
+#include cache.h
+#include read-cache.h
+#include resolve-undo.h
+#include cache-tree.h
+
+struct cache_header {
+   unsigned int hdr_ndir;
+   unsigned int hdr_nfile;
+   unsigned int

[PATCH/RFC v3 08/13] Read cache-tree in index-v5

2012-08-08 Thread Thomas Gummerer
Since the cache-tree data is saved as part of the directory data,
we already read it at the beginning of the index. The cache-tree
is only converted from this directory data.

The cache-tree data is arranged in a tree, with the children sorted by
pathlen at each node, while the ondisk format is sorted lexically.
So we have to rebuild this format from the on-disk directory list.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 cache-tree.c|   93 +++
 cache-tree.h|   10 ++
 read-cache-v5.c |1 +
 3 files changed, 104 insertions(+)

diff --git a/cache-tree.c b/cache-tree.c
index 28ed657..440cd04 100644
--- a/cache-tree.c
+++ b/cache-tree.c
@@ -519,6 +519,99 @@ struct cache_tree *cache_tree_read(const char *buffer, 
unsigned long size)
return read_one(buffer, size);
 }
 
+/*
+ * Build the cache_tree node for queue[dirnr] and, recursively, for all
+ * of its subdirectories in queue[dirnr].down.
+ *
+ * The last '/'-separated component of each subdirectory's pathname is
+ * used as the subtree name.  strtok() overwrites the separators in that
+ * pathname, so the directory entries referenced by the queue are
+ * modified.
+ *
+ * Returns the new node, or NULL if converting a subtree failed.
+ */
+static struct cache_tree *convert_one(struct directory_queue *queue, int dirnr)
+{
+	int i, subtree_nr;
+	struct cache_tree *it;
+	struct directory_queue *down;
+
+	it = cache_tree();
+	it->entry_count = queue[dirnr].de->de_nentries;
+	subtree_nr = queue[dirnr].de->de_nsubtrees;
+	if (0 <= it->entry_count)
+		hashcpy(it->sha1, queue[dirnr].de->sha1);
+
+	/*
+	 * Just a heuristic -- we do not add directories that often but
+	 * we do not want to have to extend it immediately when we do,
+	 * hence +2.
+	 */
+	it->subtree_alloc = subtree_nr + 2;
+	it->down = xcalloc(it->subtree_alloc, sizeof(struct cache_tree_sub *));
+	down = queue[dirnr].down;
+	for (i = 0; i < subtree_nr; i++) {
+		struct cache_tree *sub;
+		struct cache_tree_sub *subtree;
+		char *buf, *name;
+
+		/* Walk all path components and keep the last one. */
+		name = "";
+		buf = strtok(down[i].de->pathname, "/");
+		while (buf) {
+			name = buf;
+			buf = strtok(NULL, "/");
+		}
+		sub = convert_one(down, i);
+		if (!sub)
+			goto free_return;
+		subtree = cache_tree_sub(it, name);
+		subtree->cache_tree = sub;
+	}
+	if (subtree_nr != it->subtree_nr)
+		die("cache-tree: internal error");
+	return it;
+ free_return:
+	/* cache_tree_free() takes the address of the pointer */
+	cache_tree_free(&it);
+	return NULL;
+}
+
+/*
+ * qsort() comparator for arrays of struct directory_queue: orders the
+ * elements by subtree_name_cmp() on their directory pathnames.
+ */
+static int compare_cache_tree_elements(const void *a, const void *b)
+{
+	const struct directory_entry *de1, *de2;
+
+	de1 = ((const struct directory_queue *)a)->de;
+	de2 = ((const struct directory_queue *)b)->de;
+	return subtree_name_cmp(de1->pathname, de1->de_pathlen,
+				de2->pathname, de2->de_pathlen);
+}
+
+/*
+ * Fill queue[0 .. de->de_nsubtrees - 1] with private copies of the
+ * subdirectories of de, taken from the directory list that follows de
+ * (linked through ->next), then sort the queue with
+ * compare_cache_tree_elements().  Subdirectories that have subtrees of
+ * their own get a nested queue in ->down, filled recursively.
+ *
+ * Returns the last directory entry consumed from the list, so that the
+ * caller can continue with the entry after it.
+ */
+static struct directory_entry *sort_directories(struct directory_entry *de,
+						struct directory_queue *queue)
+{
+	int i, nsubtrees;
+
+	nsubtrees = de->de_nsubtrees;
+	for (i = 0; i < nsubtrees; i++) {
+		struct directory_entry *new_de;
+		de = de->next;
+		new_de = xmalloc(directory_entry_size(de->de_pathlen));
+		memcpy(new_de, de, directory_entry_size(de->de_pathlen));
+		queue[i].de = new_de;
+		if (de->de_nsubtrees) {
+			queue[i].down = xcalloc(de->de_nsubtrees,
+					sizeof(struct directory_queue));
+			de = sort_directories(de,
+					queue[i].down);
+		}
+	}
+	qsort(queue, nsubtrees, sizeof(struct directory_queue),
+			compare_cache_tree_elements);
+	return de;
+}
+
+/*
+ * Release a queue filled by sort_directories(): each of the nr elements
+ * owns a private copy of its directory entry and, if that directory has
+ * subtrees, a nested queue.
+ */
+static void free_directory_queue(struct directory_queue *queue, int nr)
+{
+	int i;
+
+	if (!queue)
+		return;
+	for (i = 0; i < nr; i++) {
+		free_directory_queue(queue[i].down, queue[i].de->de_nsubtrees);
+		free(queue[i].de);
+	}
+	free(queue);
+}
+
+/*
+ * Convert the v5 directory list starting at the root entry de into an
+ * in-memory cache_tree.  Returns NULL when the root has no entries.
+ */
+struct cache_tree *cache_tree_convert_v5(struct directory_entry *de)
+{
+	struct directory_queue *queue;
+	struct cache_tree *it;
+
+	if (!de->de_nentries)
+		return NULL;
+	queue = xcalloc(1, sizeof(struct directory_queue));
+	queue[0].de = de;
+	queue[0].down = xcalloc(de->de_nsubtrees, sizeof(struct directory_queue));
+
+	sort_directories(de, queue[0].down);
+	it = convert_one(queue, 0);
+
+	/*
+	 * The temporary queue is no longer needed.  The root element
+	 * points at the caller's de, so only its sub-queue and the
+	 * wrapper array are released here.
+	 */
+	free_directory_queue(queue[0].down, de->de_nsubtrees);
+	free(queue);
+	return it;
+}
+
 static struct cache_tree *cache_tree_find(struct cache_tree *it, const char 
*path)
 {
if (!it)
diff --git a/cache-tree.h b/cache-tree.h
index d8cb2e9..7f29d26 100644
--- a/cache-tree.h
+++ b/cache-tree.h
@@ -20,6 +20,11 @@ struct cache_tree {
struct cache_tree_sub **down;
 };
 
+/*
+ * Node of the temporary tree used to convert v5 directory entries into
+ * a cache_tree: one directory plus an array of queue elements for its
+ * subdirectories.
+ */
+struct directory_queue {
+   struct directory_queue *down; /* array with one element per subdirectory */
+   struct directory_entry *de; /* the directory this element stands for */
+};
+
 struct cache_tree *cache_tree(void);
 void cache_tree_free(struct cache_tree **);
 void cache_tree_invalidate_path(struct cache_tree *, const char *);
@@ -27,6 +32,11 @@ struct cache_tree_sub *cache_tree_sub(struct cache_tree *, 
const char *);
 
 void cache_tree_write(struct strbuf *, struct cache_tree *root);
 struct cache_tree *cache_tree_read(const char *buffer, unsigned long size);
+/*
+ * This function modifies the directory argument

[PATCH/RFC v3 10/13] Write index-v5 cache-tree data

2012-08-08 Thread Thomas Gummerer
Write the cache-tree data for the index version 5 file format. The
in-memory cache-tree data is converted to the ondisk format, by adding
it to the directory entries, that were compiled from the cache-entries
in the step before.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 cache-tree.c |   52 
 cache-tree.h |1 +
 read-cache.c |1 +
 3 files changed, 54 insertions(+)

diff --git a/cache-tree.c b/cache-tree.c
index 440cd04..e167b61 100644
--- a/cache-tree.c
+++ b/cache-tree.c
@@ -612,6 +612,58 @@ struct cache_tree *cache_tree_convert_v5(struct 
directory_entry *de)
return convert_one(queue, 0);
 }
 
+
+/*
+ * Copy the cache-tree data of "it", and recursively of its subtrees,
+ * into the matching directory entries stored in "table".
+ *
+ * table   - hash table of directory entries, looked up by crc32
+ * it      - cache-tree node to convert
+ * path    - name of this tree (the empty string for the root)
+ * pathlen - length of path
+ * crc     - running crc32 of the path leading up to this tree
+ *
+ * A tree without a matching directory entry is skipped together with
+ * everything below it.
+ */
+static void convert_one_to_ondisk_v5(struct hash_table *table, struct cache_tree *it,
+		const char *path, int pathlen, uint32_t crc)
+{
+	int i;
+	struct directory_entry *search;
+
+	crc = crc32(crc, (Bytef*)path, pathlen);
+	/* Follow the hash chain until the tail of the pathname matches. */
+	search = lookup_hash(crc, table);
+	while (search && strcmp(path, search->pathname + search->de_pathlen - strlen(path)) != 0)
+		search = search->next_hash;
+	if (!search)
+		return;
+	/*
+	 * The number of subtrees is already calculated by
+	 * compile_directory_data, therefore we only need to
+	 * add the entry_count
+	 */
+	search->de_nentries = it->entry_count;
+	if (0 <= it->entry_count)
+		hashcpy(search->sha1, it->sha1);
+	/* Children are hashed as "<path>/<name>"; the root adds no separator. */
+	if (strcmp(path, "") != 0)
+		crc = crc32(crc, (Bytef*)"/", 1);
+
+#if DEBUG
+	if (0 <= it->entry_count)
+		fprintf(stderr, "cache-tree <%.*s> (%d ent, %d subtree) %s\n",
+			pathlen, path, it->entry_count, it->subtree_nr,
+			sha1_to_hex(it->sha1));
+	else
+		fprintf(stderr, "cache-tree <%.*s> (%d subtree) invalid\n",
+			pathlen, path, it->subtree_nr);
+#endif
+
+	for (i = 0; i < it->subtree_nr; i++) {
+		struct cache_tree_sub *down = it->down[i];
+		if (i) {
+			struct cache_tree_sub *prev = it->down[i-1];
+			if (subtree_name_cmp(down->name, down->namelen,
+					     prev->name, prev->namelen) <= 0)
+				die("fatal - unsorted cache subtree");
+		}
+		convert_one_to_ondisk_v5(table, down->cache_tree, down->name, down->namelen, crc);
+	}
+}
+
+/*
+ * Copy the cache-tree rooted at "root" into the directory entries in
+ * "table", starting at the root directory (empty path, crc 0).
+ */
+void cache_tree_to_ondisk_v5(struct hash_table *table, struct cache_tree *root)
+{
+	convert_one_to_ondisk_v5(table, root, "", 0, 0);
+}
+
 static struct cache_tree *cache_tree_find(struct cache_tree *it, const char 
*path)
 {
if (!it)
diff --git a/cache-tree.h b/cache-tree.h
index 7f29d26..e08bc31 100644
--- a/cache-tree.h
+++ b/cache-tree.h
@@ -37,6 +37,7 @@ struct cache_tree *cache_tree_read(const char *buffer, 
unsigned long size);
  * Don't use it if the directory entries are still needed after.
  */
 struct cache_tree *cache_tree_convert_v5(struct directory_entry *de);
+void cache_tree_to_ondisk_v5(struct hash_table *table, struct cache_tree 
*root);
 
 int cache_tree_fully_valid(struct cache_tree *);
 int cache_tree_update(struct cache_tree *, struct cache_entry **, int, int);
diff --git a/read-cache.c b/read-cache.c
index 199ba75..962d6a2 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1310,6 +1310,7 @@ void update_index_if_able(struct index_state *istate, 
struct lock_file *lockfile
else
rollback_lock_file(lockfile);
 }
+
 int write_index(struct index_state *istate, int newfd)
 {
set_istate_ops(istate);
-- 
1.7.10.GIT

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH/RFC v3 04/13] Add documentation of the index-v5 file format

2012-08-08 Thread Thomas Gummerer
Add documentation of the index file format version 5 to
Documentation/technical.

Helped-by: Michael Haggerty mhag...@alum.mit.edu
Helped-by: Junio C Hamano gits...@pobox.com
Helped-by: Thomas Rast tr...@student.ethz.ch
Helped-by: Nguyen Thai Ngoc Duy pclo...@gmail.com
Helped-by: Robin Rosenberg robin.rosenb...@dewire.com
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 Documentation/technical/index-file-format-v5.txt |  285 ++
 1 file changed, 285 insertions(+)
 create mode 100644 Documentation/technical/index-file-format-v5.txt

diff --git a/Documentation/technical/index-file-format-v5.txt 
b/Documentation/technical/index-file-format-v5.txt
new file mode 100644
index 000..6707f06
--- /dev/null
+++ b/Documentation/technical/index-file-format-v5.txt
@@ -0,0 +1,285 @@
+GIT index format
+
+
+== The git index file format
+
+   The git index file (.git/index) documents the status of the files
+ in the git staging area.
+
+   The staging area is used for preparing commits, merging, etc.
+
+   All binary numbers are in network byte order. Version 5 is described
+ here.
+
+   - A 20-byte header consisting of
+
+ sig (32-bits): Signature:
+   The signature is { 'D', 'I', 'R', 'C' } (stands for dircache)
+
+ vnr (32-bits): Version number:
+   The current supported versions are 2, 3, 4 and 5.
+
+ ndir (32-bits): number of directories in the index.
+
+ nfile (32-bits): number of file entries in the index.
+
+ fblockoffset (32-bits): offset to the file block, relative to the
+   beginning of the file.
+
+   - Offset to the extensions.
+
+ nextensions (32-bits): number of extensions.
+
+ extoffset (32-bits): offset to the extension. (Possibly none, as
+   many as indicated in the 4-byte number of extensions)
+
+ headercrc (32-bits): crc checksum for the header and extension
+   offsets
+
+   - diroffsets (ndir * directory offsets): A directory offset for each
+   of the ndir directories in the index, sorted by pathname (of the
+   directory it's pointing to) (see below). The diroffsets are relative
+   to the beginning of the direntries block. [1]
+
+   - direntries (ndir * directory entries): A directory entry for each
+   of the ndir directories in the index, sorted by pathname (see
+   below). [2]
+
+   - fileoffsets (nfile * file offsets): A file offset for each of the
+   nfile files in the index (see below). The file offsets are relative
+   to the beginning of the fileentries block. [1]
+
+   - fileentries (nfile * file entries): A file entry for each of the
+   nfile files in the index (see below).
+
+   - crdata: A number of entries for conflicted data/resolved conflicts
+   (see below).
+
+   - Extensions (Currently none, see below in the future)
+
+ Extensions are identified by signature. Optional extensions can
+ be ignored if GIT does not understand them.
+
+ GIT supports an arbitrary number of extensions, but currently none
+ is implemented. [3]
+
+ extsig (32-bits): extension signature. If the first byte is 'A'..'Z'
+ the extension is optional and can be ignored.
+
+ extsize (32-bits): size of the extension, excluding the header
+   (extsig, extsize, extchecksum).
+
+ extchecksum (32-bits): crc32 checksum of the extension signature
+   and size.
+
+- Extension data.
+
+
+== Directory offsets (diroffsets)
+
+  diroffset (32-bits): offset to the directory relative to the beginning
+of the index file. There are ndir + 1 offsets in the diroffset table,
+the last is pointing to the end of the last direntry. With this last
+entry, we can replace the strlen when reading each filename, by
+calculating its length with the offsets.
+
+  This part is needed for making the directory entries bisectable and
+thus allowing a binary search.
+
+== Directory entry (direntries)
+  
+  Directory entries are sorted in lexicographic order by the name 
+of their path starting with the root.
+  
+  pathname (variable length, nul terminated): relative to top level
+directory (without the leading slash). '/' is used as path
+separator. A string of length 0 ('') indicates the root directory.
+The special path components "." and ".." (without quotes) are
+disallowed. The path also includes a trailing slash. [9]
+
+  foffset (32-bits): offset to the lexicographically first file in 
+the file offsets (fileoffsets), relative to the beginning of
+the fileoffset block.
+
+  cr (32-bits): offset to conflicted/resolved data at the end of the
+index. 0 if there is no such data. [4]
+
+  ncr (32-bits): number of conflicted/resolved data entries at the
+end of the index if the offset is non 0. If cr is 0, ncr is
+also 0.
+
+  nsubtrees (32-bits): number of subtrees this tree has in the index.
+
+  nfiles (32-bits): number of files in the directory, that are in
+the index.
+
+  nentries (32-bits

[PATCH/RFC v3 03/13] t3700: Avoid interfering with the racy code

2012-08-08 Thread Thomas Gummerer
The new git racy code uses the mtime of cache-entries as smudge
marker for racily clean entries. The work of checking the file-system
if the entry really changed is offloaded to the reader. This interferes
with this test, because the entry is racily smudged and thus has
mtime 0.

To avoid interfering with the racy code, we use a time relative
to the time returned by time(3), instead of a time relative to
the mtime of the cache entries.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 t/t3700-add.sh |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/t/t3700-add.sh b/t/t3700-add.sh
index 874b3a6..829d36d 100755
--- a/t/t3700-add.sh
+++ b/t/t3700-add.sh
@@ -184,7 +184,7 @@ test_expect_success 'git add --refresh with pathspec' '
echo foo  echo bar  echo baz 
git add foo bar baz  H=$(git rev-parse :foo)  git rm -f foo 
echo 100644 $H 3   foo | git update-index --index-info 
-   test-chmtime -60 bar baz 
+   test-chmtime =-60 bar baz 
expect 
git add --refresh bar actual 
test_cmp expect actual 
-- 
1.7.10.GIT

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH/RFC v3 01/13] Move index v2 specific functions to their own file

2012-08-08 Thread Thomas Gummerer
Move index version 2 specific functions to their own file,
to prepare for the addition of a new index file format. With
the split into two files we have the non-index specific
functions in read-cache.c and the index-v2 specific functions
in read-cache-v2.c

Helped-by: Nguyen Thai Ngoc Duy pclo...@gmail.com
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 Makefile |2 +
 cache.h  |   13 +-
 read-cache-v2.c  |  581 +++
 read-cache.c |  613 +++---
 read-cache.h |   57 +
 test-index-version.c |7 +-
 6 files changed, 683 insertions(+), 590 deletions(-)
 create mode 100644 read-cache-v2.c
 create mode 100644 read-cache.h

diff --git a/Makefile b/Makefile
index 4b58b91..b4a7c73 100644
--- a/Makefile
+++ b/Makefile
@@ -645,6 +645,7 @@ LIB_H += progress.h
 LIB_H += prompt.h
 LIB_H += quote.h
 LIB_H += reachable.h
+LIB_H += read-cache.h
 LIB_H += reflog-walk.h
 LIB_H += refs.h
 LIB_H += remote.h
@@ -768,6 +769,7 @@ LIB_OBJS += prompt.o
 LIB_OBJS += quote.o
 LIB_OBJS += reachable.o
 LIB_OBJS += read-cache.o
+LIB_OBJS += read-cache-v2.o
 LIB_OBJS += reflog-walk.o
 LIB_OBJS += refs.o
 LIB_OBJS += remote.o
diff --git a/cache.h b/cache.h
index 67f28b4..c77cdbe 100644
--- a/cache.h
+++ b/cache.h
@@ -94,16 +94,8 @@ unsigned long git_deflate_bound(git_zstream *, unsigned 
long);
  */
 #define DEFAULT_GIT_PORT 9418
 
-/*
- * Basic data structures for the directory cache
- */
 
 #define CACHE_SIGNATURE 0x44495243 /* DIRC */
-struct cache_header {
-   unsigned int hdr_signature;
-   unsigned int hdr_version;
-   unsigned int hdr_entries;
-};
 
 #define INDEX_FORMAT_LB 2
 #define INDEX_FORMAT_UB 4
@@ -267,6 +259,7 @@ struct index_state {
unsigned name_hash_initialized : 1,
 initialized : 1;
struct hash_table name_hash;
+   struct index_ops *ops;
 };
 
 extern struct index_state the_index;
@@ -471,8 +464,8 @@ extern int index_name_is_other(const struct index_state *, 
const char *, int);
 #define CE_MATCH_RACY_IS_DIRTY 02
 /* do stat comparison even if CE_SKIP_WORKTREE is true */
 #define CE_MATCH_IGNORE_SKIP_WORKTREE  04
-extern int ie_match_stat(const struct index_state *, struct cache_entry *, 
struct stat *, unsigned int);
-extern int ie_modified(const struct index_state *, struct cache_entry *, 
struct stat *, unsigned int);
+extern int ie_match_stat(struct index_state *, struct cache_entry *, struct 
stat *, unsigned int);
+extern int ie_modified(struct index_state *, struct cache_entry *, struct stat 
*, unsigned int);
 
 struct pathspec {
const char **raw; /* get_pathspec() result, not freed by 
free_pathspec() */
diff --git a/read-cache-v2.c b/read-cache-v2.c
new file mode 100644
index 000..38f1791
--- /dev/null
+++ b/read-cache-v2.c
@@ -0,0 +1,581 @@
+#include cache.h
+#include read-cache.h
+#include resolve-undo.h
+#include cache-tree.h
+#include varint.h
+
+/* Mask for the name length in ce_flags in the on-disk index */
+#define CE_NAMEMASK  (0x0fff)
+
+/*
+ * Version 2 specific part of the index header.  Unlike the definition
+ * removed from cache.h, it has no signature or version member.
+ */
+struct cache_header {
+   unsigned int hdr_entries;
+};
+
+/*
+ * Index File I/O
+ */
+
+/*
+ * dev/ino/uid/gid/size are also just tracked to the low 32 bits
+ * Again - this is just a (very strong in practice) heuristic that
+ * the inode hasn't changed.
+ *
+ * We save the fields in big-endian order to allow using the
+ * index file over NFS transparently.
+ */
+struct ondisk_cache_entry {
+   struct cache_time ctime;
+   struct cache_time mtime;
+   unsigned int dev;
+   unsigned int ino;
+   unsigned int mode;
+   unsigned int uid;
+   unsigned int gid;
+   unsigned int size;
+   unsigned char sha1[20];
+   unsigned short flags; /* when CE_EXTENDED is set, ondisk_cache_entry_extended is used instead */
+   char name[FLEX_ARRAY]; /* more */
+};
+
+/*
+ * This struct is used when CE_EXTENDED bit is 1
+ * The struct must match ondisk_cache_entry exactly from
+ * ctime till flags
+ */
+struct ondisk_cache_entry_extended {
+   struct cache_time ctime;
+   struct cache_time mtime;
+   unsigned int dev;
+   unsigned int ino;
+   unsigned int mode;
+   unsigned int uid;
+   unsigned int gid;
+   unsigned int size;
+   unsigned char sha1[20];
+   unsigned short flags;
+   unsigned short flags2; /* the only member ondisk_cache_entry does not have */
+   char name[FLEX_ARRAY]; /* more */
+};
+
+/* These are only used for v3 or lower */
+/* Offset of name, plus len, plus one byte, rounded up to a multiple of 8. */
+#define align_flex_name(STRUCT,len) ((offsetof(struct STRUCT,name) + (len) + 8) & ~7)
+#define ondisk_cache_entry_size(len) align_flex_name(ondisk_cache_entry,len)
+#define ondisk_cache_entry_extended_size(len) align_flex_name(ondisk_cache_entry_extended,len)
+#define ondisk_ce_size(ce) (((ce)-ce_flags  CE_EXTENDED) ? \
+   ondisk_cache_entry_extended_size(ce_namelen(ce

[PATCH/RFC v3 07/13] Read resolve-undo data

2012-08-08 Thread Thomas Gummerer
Make git read the resolve-undo data from the index.

Since the resolve-undo data is joined with the conflicts in
the ondisk format of the index file version 5, conflicts and
resolved data are read at the same time, and the resolve-undo
data is then converted to the in-memory format.

Helped-by: Thomas Rast tr...@student.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 read-cache-v5.c |1 +
 resolve-undo.c  |   36 
 resolve-undo.h  |2 ++
 3 files changed, 39 insertions(+)

diff --git a/read-cache-v5.c b/read-cache-v5.c
index ec1201d..b47398d 100644
--- a/read-cache-v5.c
+++ b/read-cache-v5.c
@@ -494,6 +494,7 @@ static struct directory_entry *read_entries(struct 
index_state *istate,
int i;
 
conflict_queue = read_conflicts(de, mmap, mmap_size, fd);
+   resolve_undo_convert_v5(istate, conflict_queue);
for (i = 0; i  de-de_nfiles; i++) {
ce = read_entry(de,
entry_offset,
diff --git a/resolve-undo.c b/resolve-undo.c
index 72b4612..f96c6ba 100644
--- a/resolve-undo.c
+++ b/resolve-undo.c
@@ -170,3 +170,39 @@ void unmerge_index(struct index_state *istate, const char 
**pathspec)
i = unmerge_index_entry_at(istate, i);
}
 }
+
+/*
+ * Convert the v5 conflict list "ce" into istate->resolve_undo.
+ *
+ * Entries whose first part has CONFLICT_CONFLICTED set are skipped.
+ * Every other entry gets a resolve_undo_info record keyed by its name;
+ * mode and sha1 of each part are stored in the slot of the part's stage
+ * (stages 1..3 map to indices 0..2).
+ * istate->resolve_undo is allocated when the first such entry is found.
+ */
+void resolve_undo_convert_v5(struct index_state *istate,
+		struct conflict_entry *ce)
+{
+	int i;
+
+	while (ce) {
+		struct string_list_item *lost;
+		struct resolve_undo_info *ui;
+		struct conflict_part *cp;
+
+		if (ce->entries && (ce->entries->flags & CONFLICT_CONFLICTED) != 0) {
+			ce = ce->next;
+			continue;
+		}
+		if (!istate->resolve_undo) {
+			istate->resolve_undo = xcalloc(1, sizeof(struct string_list));
+			istate->resolve_undo->strdup_strings = 1;
+		}
+
+		lost = string_list_insert(istate->resolve_undo, ce->name);
+		if (!lost->util)
+			lost->util = xcalloc(1, sizeof(*ui));
+		ui = lost->util;
+
+		for (i = 0; i < 3; i++)
+			ui->mode[i] = 0;
+		for (cp = ce->entries; cp; cp = cp->next) {
+			int stage = conflict_stage(cp);
+
+			/*
+			 * Only stages 1..3 have a slot; a part with stage 0
+			 * would otherwise be written in front of the arrays.
+			 */
+			if (stage < 1 || stage > 3)
+				continue;
+			ui->mode[stage - 1] = cp->entry_mode;
+			hashcpy(ui->sha1[stage - 1], cp->sha1);
+		}
+		ce = ce->next;
+	}
+}
diff --git a/resolve-undo.h b/resolve-undo.h
index 8458769..ab660a6 100644
--- a/resolve-undo.h
+++ b/resolve-undo.h
@@ -13,4 +13,6 @@ extern void resolve_undo_clear_index(struct index_state *);
 extern int unmerge_index_entry_at(struct index_state *, int);
 extern void unmerge_index(struct index_state *, const char **);
 
+extern void resolve_undo_convert_v5(struct index_state *, struct 
conflict_entry *);
+
 #endif
-- 
1.7.10.GIT

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH/RFC v3 11/13] Write resolve-undo data for index-v5

2012-08-08 Thread Thomas Gummerer
Write the resolve undo data to the ondisk format, by joining the data
in the resolve-undo string-list with the already existing conflicts
that were compiled before when searching the directories, and adding
them to the corresponding directory entries.

Helped-by: Thomas Rast tr...@student.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 read-cache-v5.c |3 ++
 resolve-undo.c  |   93 +++
 resolve-undo.h  |1 +
 3 files changed, 97 insertions(+)

diff --git a/read-cache-v5.c b/read-cache-v5.c
index 45f7acd..3d03111 100644
--- a/read-cache-v5.c
+++ b/read-cache-v5.c
@@ -861,6 +861,9 @@ static struct directory_entry 
*compile_directory_data(struct index_state *istate
previous_entry-next = no_subtrees;
}
}
+   if (istate-cache_tree)
+   cache_tree_to_ondisk_v5(table, istate-cache_tree);
+   resolve_undo_to_ondisk_v5(table, istate-resolve_undo, ndir, 
total_dir_len, de);
return de;
 }
 
diff --git a/resolve-undo.c b/resolve-undo.c
index f96c6ba..4568dcc 100644
--- a/resolve-undo.c
+++ b/resolve-undo.c
@@ -206,3 +206,96 @@ void resolve_undo_convert_v5(struct index_state *istate,
ce = ce-next;
}
 }
+
+void resolve_undo_to_ondisk_v5(struct hash_table *table,
+   struct string_list *resolve_undo,
+   unsigned int *ndir, int *total_dir_len,
+   struct directory_entry *de)
+{
+   struct string_list_item *item;
+   struct directory_entry *search;
+
+   if (!resolve_undo)
+   return;
+   for_each_string_list_item(item, resolve_undo) {
+   struct conflict_entry *conflict_entry;
+   struct resolve_undo_info *ui = item-util;
+   char *super;
+   int i, dir_len, len;
+   uint32_t crc;
+   struct directory_entry *found, *current, *new_tree;
+
+   if (!ui)
+   continue;
+
+   super = super_directory(item-string);
+   if (!super)
+   dir_len = 0;
+   else
+   dir_len = strlen(super);
+   crc = crc32(0, (Bytef*)super, dir_len);
+   found = lookup_hash(crc, table);
+   current = NULL;
+   new_tree = NULL;
+   
+   while (!found) {
+   struct directory_entry *new;
+
+   new = init_directory_entry(super, dir_len);
+   if (!current)
+   current = new;
+   insert_directory_entry(new, table, total_dir_len, ndir, 
crc);
+   if (new_tree != NULL)
+   new-de_nsubtrees = 1;
+   new-next = new_tree;
+   new_tree = new;
+   super = super_directory(super);
+   if (!super)
+   dir_len = 0;
+   else
+   dir_len = strlen(super);
+   crc = crc32(0, (Bytef*)super, dir_len);
+   found = lookup_hash(crc, table);
+   }
+   search = found;
+   while (search-next_hash  strcmp(super, search-pathname) != 
0)
+   search = search-next_hash;
+   if (search  !current)
+   current = search;
+   if (!search  !current)
+   current = new_tree;
+   if (!super  new_tree) {
+   new_tree-next = de-next;
+   de-next = new_tree;
+   de-de_nsubtrees++;
+   } else if (new_tree) {
+   struct directory_entry *temp;
+
+   search = de-next;
+   while (strcmp(super, search-pathname))
+   search = search-next;
+   temp = new_tree;
+   while (temp-next)
+   temp = temp-next;
+   search-de_nsubtrees++;
+   temp-next = search-next;
+   search-next = new_tree;
+   }
+
+   len = strlen(item-string);
+   conflict_entry = create_new_conflict(item-string, len, 
current-de_pathlen);
+   add_conflict_to_directory_entry(current, conflict_entry);
+   for (i = 0; i  3; i++) {
+   if (ui-mode[i]) {
+   struct conflict_part *cp;
+
+   cp = xmalloc(sizeof(struct conflict_part));
+   cp-flags = (i + 1)  CONFLICT_STAGESHIFT;
+   cp-entry_mode = ui-mode[i];
+   cp-next = NULL

[PATCH/RFC v3 12/13] update-index.c: always rewrite the index when index-version is given

2012-08-08 Thread Thomas Gummerer
Make git update-index always rewrite the index if an index-version
is given. This is used for performance testing, to have a reader
and writer for the whole index.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 builtin/update-index.c |4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/builtin/update-index.c b/builtin/update-index.c
index 4ce341c..c31d176 100644
--- a/builtin/update-index.c
+++ b/builtin/update-index.c
@@ -6,6 +6,7 @@
 #include cache.h
 #include quote.h
 #include cache-tree.h
+#include read-cache.h
 #include tree-walk.h
 #include builtin.h
 #include refs.h
@@ -861,6 +862,7 @@ int cmd_update_index(int argc, const char **argv, const 
char *prefix)
if (the_index.version != preferred_index_format)
active_cache_changed = 1;
the_index.version = preferred_index_format;
+   set_istate_ops(the_index);
}
 
if (read_from_stdin) {
@@ -886,7 +888,7 @@ int cmd_update_index(int argc, const char **argv, const 
char *prefix)
strbuf_release(buf);
}
 
-   if (active_cache_changed) {
+   if (active_cache_changed || preferred_index_format) {
if (newfd  0) {
if (refresh_args.flags  REFRESH_QUIET)
exit(128);
-- 
1.7.10.GIT

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH/RFC v3 09/13] Write index-v5

2012-08-08 Thread Thomas Gummerer
Write the index version 5 file format to disk. This version doesn't
write the cache-tree data and resolve-undo data to the file.

The main work is done when filtering out the directories from the
current in-memory format; in the same pass, the conflicts and the
file data are calculated as well.

Helped-by: Nguyen Thai Ngoc Duy pclo...@gmail.com
Helped-by: Thomas Rast tr...@student.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 cache.h |   10 +-
 read-cache-v5.c |  589 ++-
 read-cache.c|   19 +-
 read-cache.h|3 +
 4 files changed, 611 insertions(+), 10 deletions(-)

diff --git a/cache.h b/cache.h
index a0a1781..3fa348d 100644
--- a/cache.h
+++ b/cache.h
@@ -98,7 +98,7 @@ unsigned long git_deflate_bound(git_zstream *, unsigned long);
 #define CACHE_SIGNATURE 0x44495243 /* DIRC */
 
 #define INDEX_FORMAT_LB 2
-#define INDEX_FORMAT_UB 4
+#define INDEX_FORMAT_UB 5
 
 /*
  * The cache_time is just the low 32 bits of the
@@ -510,6 +510,7 @@ extern int verify_path(const char *path);
 extern struct cache_entry *index_name_exists(struct index_state *istate, const 
char *name, int namelen, int igncase);
 extern int index_name_stage_pos(const struct index_state *, const char *name, 
int namelen, int stage);
 extern int index_name_pos(const struct index_state *, const char *name, int 
namelen);
+extern struct directory_entry *init_directory_entry(char *pathname, int len);
 #define ADD_CACHE_OK_TO_ADD 1  /* Ok to add */
 #define ADD_CACHE_OK_TO_REPLACE 2  /* Ok to replace file/directory */
 #define ADD_CACHE_SKIP_DFCHECK 4   /* Ok to skip DF conflict checks */
@@ -1244,6 +1245,13 @@ static inline ssize_t write_str_in_full(int fd, const 
char *str)
return write_in_full(fd, str, strlen(str));
 }
 
+/* index-v5 helper functions */
+extern char *super_directory(const char *filename);
+extern void insert_directory_entry(struct directory_entry *, struct hash_table 
*, int *, unsigned int *, uint32_t);
+extern void add_conflict_to_directory_entry(struct directory_entry *, struct 
conflict_entry *);
+extern void add_part_to_conflict_entry(struct directory_entry *, struct 
conflict_entry *, struct conflict_part *);
+extern struct conflict_entry *create_new_conflict(char *, int, int);
+
 /* pager.c */
 extern void setup_pager(void);
 extern const char *pager_program;
diff --git a/read-cache-v5.c b/read-cache-v5.c
index 57d0fb5..45f7acd 100644
--- a/read-cache-v5.c
+++ b/read-cache-v5.c
@@ -583,9 +583,596 @@ static void read_index_v5(struct index_state *istate, 
void *mmap, int mmap_size,
istate-cache_tree = cache_tree_convert_v5(root_directory);
 }
 
+#define WRITE_BUFFER_SIZE 8192
+static unsigned char write_buffer[WRITE_BUFFER_SIZE];
+static unsigned long write_buffer_len;
+
+static int ce_write_flush(int fd)
+{
+   unsigned int buffered = write_buffer_len;
+   if (buffered) {
+   if (write_in_full(fd, write_buffer, buffered) != buffered)
+   return -1;
+   write_buffer_len = 0;
+   }
+   return 0;
+}
+
+static int ce_write(uint32_t *crc, int fd, void *data, unsigned int len)
+{
+   if (crc)
+   *crc = crc32(*crc, (Bytef*)data, len);
+   while (len) {
+   unsigned int buffered = write_buffer_len;
+   unsigned int partial = WRITE_BUFFER_SIZE - buffered;
+   if (partial  len)
+   partial = len;
+   memcpy(write_buffer + buffered, data, partial);
+   buffered += partial;
+   if (buffered == WRITE_BUFFER_SIZE) {
+   write_buffer_len = buffered;
+   if (ce_write_flush(fd))
+   return -1;
+   buffered = 0;
+   }
+   write_buffer_len = buffered;
+   len -= partial;
+   data = (char *) data + partial;
+   }
+   return 0;
+}
+
+static int ce_flush(int fd)
+{
+   unsigned int left = write_buffer_len;
+
+   if (left)
+   write_buffer_len = 0;
+
+   if (write_in_full(fd, write_buffer, left) != left)
+   return -1;
+
+   return 0;
+}
+
+static void ce_smudge_racily_clean_entry(struct cache_entry *ce)
+{
+   /*
+* This method shall only be called if the timestamp of ce
+* is racy (check with is_racy_timestamp). If the timestamp
+* is racy, the writer will just set the time to 0.
+*
+* The reader (match_stat_basic) will then take care
+* of checking if the entry is really changed or not, by
+* taking into account the stat_crc and if that hasn't changed
+* checking the sha1.
+*/
+   ce-ce_mtime.sec = 0;
+   ce-ce_mtime.nsec = 0;
+}
+
+char *super_directory(const char *filename)
+{
+   char *slash;
+
+   slash = strrchr(filename, '/');
+   if (slash)
+   return

Re: [PATCH/RFC v2 09/16] Read index-v5

2012-08-08 Thread Thomas Gummerer


On 08/08, Junio C Hamano wrote:
 Thomas Gummerer t.gumme...@gmail.com writes:
 
   +name = (char *)mmap + *dir_offset;
   +beginning = mmap + *dir_table_offset;
  
  Notice how you computed name with pointer arithmetic by first
  casting mmap (which is void *) and when computing beginning, you
  forgot to cast mmap and attempted pointer arithmetic with void *.
  The latter does not work and breaks compilation.
  
  The pointer-arith with void * is not limited to this function.
  ...
  I've used the type of the respective assignment for now. e.g. i have
  struct cache_header *hdr, so I'm using
  hdr = (struct cache_header *)mmap + x;
 
 You need to be careful when rewriting the above to choose the right
 value for 'x' if you go that route (which I wouldn't recommend).
 
 With
 
 hdr = ptr_add(mmap, x);
 
 you are making hdr point at x BYTES beyond mmap, but
 
 hdr = (struct cache_header *)mmap + x;
 
 means something entirely different, no?  hdr points at x entries
 of struct cache_header beyond mmap (in other words, if mmap[] were
 defined as "struct cache_header mmap[]", the above is saying the
 same as "hdr = &mmap[x]").
 
 I think the way you casted to compute the value for the name
 pointer is the (second) right thing to do.  The cast (char *)
 applied to mmap is about mmap is a typeless blob of memory I want
 to count bytes in.  Give me *dir_offset bytes into that blob.  It
 is not tied to the type of LHS (i.e. name) at all.  The result
 then needs to be casted to the type of LHS (i.e. name), and in
 this case the types happen to be the same, so you do not have to
 cast the result of the addition but that is mere luck.
 
 The next line is not so lucky and you would need to say something
 like:
 
 beginning = (uint32_t *)((char *)mmap + *dir_table_offset);
 
 Again, inner cast is about mmap is a blob counted in bytes, the
 outer cast is about type mismatch between a byte-address and LHS of
 the assignment.

This is what I tried in v3 of the series, but it didn't seem quite
right.

 If mmap variable in this function were not void * but something
 more sane like const char *, you wouldn't have to have the inner
 cast to begin with, and that is why I said the way you did name is
 the second right thing.  Then you can write them like
 
 name = mmap + *dir_offset;
 beginning = (uint32_t *)(mmap + *dir_offset);
 
 After thinking about this, the ptr_add() macro might be the best
 solution, even though I originally called it as a band-aid.  We know
 mmap is a blob of memory, byte-offset of each component of which we
 know about, so we can say
 
 name = ptr_add(mmap, *dir_offset);
 beginning = ptr_add(mmap, *dir_offset);
 
 Hmmm..

I start to think so too. Casting the mmap variable to const char *
in the method call doesn't feel right to me, even though it would work.
Unless there are any objections I'll use ptr_add in the next version.
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH/RFC v2 01/16] Modify cache_header to prepare for other index formats

2012-08-07 Thread Thomas Gummerer
On 08/05, Junio C Hamano wrote:
 Thomas Gummerer t.gumme...@gmail.com writes:
 
  diff --git a/read-cache.c b/read-cache.c
  index 2f8159f..5d61d92 100644
  --- a/read-cache.c
  +++ b/read-cache.c
  @@ -1433,7 +1446,7 @@ int read_index_from(struct index_state *istate, const 
  char *path)
   
  errno = EINVAL;
  mmap_size = xsize_t(st.st_size);
  -   if (mmap_size  sizeof(struct cache_header) + 20)
  +   if (mmap_size  sizeof(struct cache_version_header) + 20)
  die(index file smaller than expected);
 
 At the design level, I have a large problem with this change.  I
 understand that you wanted to make sure that some versions can lack
 the num-entries word in the header, but then what is the point of
 keeping that +20 here?  Are all versions of the file format still
 required to have the 20-byte trailing SHA-1 sum over the whole file?

No, index-v5 doesn't have the trailing SHA-1 over the whole file.

   Side note: I am actually fine with that sum at the end
   requirement, but then it needs to be documented what are
   assumed to be unomittable and why.
 
 I also do not see why v5 *needs* to drop the num-entries
 word from the header in the first place.

v5 still has the num-entries word, but at a different position.
The +20 however would still be wrong, because of the missing
SHA-1 over the file.

 At the practical level, we used to error out, upon seeing a file
 that claims to be v2 in the header but is too small to hold the
 version header, the number of entries word and the trailing SHA-1
 sum.  We no longer do this and happily call verify_hdr() in the
 following code even when the file is too small, no?

This part is called even before we know what version of the index
we will read, and before the file is mmaped.  The best solution,
I think, is to drop the check and just call verify_hdr, since it will
check the checksum anyway and detect the error, while not having
a big cost on an index file that is very small.

  @@ -1442,11 +1455,13 @@ int read_index_from(struct index_state *istate, 
  const char *path)
  die_errno(unable to map index file);
   
  hdr = mmap;
  +   hdr_v2 =  mmap + sizeof(*hdr);
  if (verify_hdr(hdr, mmap_size)  0)
  goto unmap;
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH/RFC v2 04/16] Modify write functions to prepare for other index formats

2012-08-07 Thread Thomas Gummerer


On 08/05, Junio C Hamano wrote:
 Thomas Gummerer t.gumme...@gmail.com writes:
 
  -static int ce_write(git_SHA_CTX *context, int fd, void *data, unsigned int 
  len)
  +static int ce_write_v2(git_SHA_CTX *context, int fd, void *data, unsigned 
  int len)
   {
 
 Mild NAK to name this function with any hint that it is for v2 only.
 The type of data is not struct ondisk_index_entry_v2 and this is
 just a way to stream data to fd while hashing, which is similar in
 spirit to what csum-file.c sha1file'API does.  Perhaps we may want
 to update ce_write() interface to build on top of sha1file API?
 
 At this step in the series, is it too early to split read-cache.c
 into two files, move all the v2 specific part to read-cache-v2.c,
 and keep static function names like write_index_ext_header() as they
 are?  After all, the main dispatch would become
 
  +int write_index(struct index_state *istate, int newfd)
  +{
  +   if (!istate-version)
  +   istate-version = INDEX_FORMAT_DEFAULT;
  +
  +   return write_index_v2(istate, newfd);
  +}
 
 so read-cache-v2.c would need to export write_index_v2() but the
 functions to implement it like ce_write_entry() do not have to be
 exposed outside the file, no?

No, I think it makes sense to split them at this point. I'll do it along
the lines of what Duy suggested with his patch. [1]

[1] http://thread.gmane.org/gmane.comp.version-control.git/202923/focus=202964
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH/RFC v2 06/16] t3700: sleep for 1 second, to avoid interfering with the racy code

2012-08-07 Thread Thomas Gummerer
On 08/05, Junio C Hamano wrote:
 Thomas Gummerer t.gumme...@gmail.com writes:
 
  The new git racy code uses the mtime of cache-entries to smudge
  a racy clean entry, and loads the work, of checking the file-system
 
 -ECANTPARSE.

The git racy code for index-v5 uses the mtime of the cache-entries as
smudge markers. The work of checking the file-system is offloaded to
the reader.

  if the entry has really changed, off to the reader. This interferes
  with this test, because the entry is racily smudged and thus has
  mtime 0. We wait 1 second to avoid smudging the entry and getting
  correct test results.
 
 Mild NAK, especially it is totally unclear why you even need to muck
 with racy-git check in the current format of the index in the first
 place, and even if it were necessary, it is unclear why this cannot
 be done with test-chmtime.

The racy-git code needs to be changed, to avoid problems when implementing
the partial writing for index-v5. Otherwise it could cause problems, when
we have entries that should be smudged, but are not due to the different
racy algorithms.

I'll do it with test-chmtime in the reroll though.

  Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
  ---
   t/t3700-add.sh |1 +
   1 file changed, 1 insertion(+)
 
  diff --git a/t/t3700-add.sh b/t/t3700-add.sh
  index 874b3a6..4d70805 100755
  --- a/t/t3700-add.sh
  +++ b/t/t3700-add.sh
  @@ -184,6 +184,7 @@ test_expect_success 'git add --refresh with pathspec' '
  echo foo  echo bar  echo baz 
  git add foo bar baz  H=$(git rev-parse :foo)  git rm -f foo 
  echo 100644 $H 3   foo | git update-index --index-info 
  +   sleep 1 
  test-chmtime -60 bar baz 
  expect 
  git add --refresh bar actual 
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH/RFC v2 10/16] Read resolve-undo data

2012-08-07 Thread Thomas Gummerer


On 08/05, Junio C Hamano wrote:
 Thomas Gummerer t.gumme...@gmail.com writes:
 
  Make git read the resolve-undo data from the index.
 
  Since the resolve-undo data is joined with the conflicts in
  the ondisk format of the index file version 5, conflicts and
  resolved data is read at the same time, and the resolve-undo
  data is then converted to the in-memory format.
 
  Helped-by: Thomas Rast tr...@student.ethz.ch
  Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
  ---
   read-cache.c   |1 +
   resolve-undo.c |   36 
   resolve-undo.h |2 ++
   3 files changed, 39 insertions(+)
 
  diff --git a/read-cache.c b/read-cache.c
  index 70334f9..03370f9 100644
  --- a/read-cache.c
  +++ b/read-cache.c
  @@ -1942,6 +1942,7 @@ static struct directory_entry *read_entries_v5(struct 
  index_state *istate,
  int i;
   
  conflict_queue = read_conflicts_v5(de, mmap, mmap_size, fd);
  +   resolve_undo_convert_v5(istate, conflict_queue);
  for (i = 0; i  de-de_nfiles; i++) {
  ce = read_entry_v5(de,
  entry_offset,
  diff --git a/resolve-undo.c b/resolve-undo.c
  index 72b4612..f96c6ba 100644
  --- a/resolve-undo.c
  +++ b/resolve-undo.c
  @@ -170,3 +170,39 @@ void unmerge_index(struct index_state *istate, const 
  char **pathspec)
  i = unmerge_index_entry_at(istate, i);
  }
   }
  +
  +void resolve_undo_convert_v5(struct index_state *istate,
  +   struct conflict_entry *ce)
  +{
 
 It is unclear why this needs to be part of resolve-undo.c and
 exported from it.  Shouldn't it (and bulk of the previous few
 patches) be part of a read-cache-v5.c file (with v2/3/4 specific
 part separated out from read-cache.c to form read-cache-v2.c)?

I thought this should be part of resolve-undo.c, to keep everything
that has to do with resolve-undo in the same file, taking
resolve_undo_read and resolve_undo_write as a model.  But I don't care
too deeply about it; it can easily be moved to read-cache-v5.c.

  +   int i;
  +
  +   while (ce) {
  +   struct string_list_item *lost;
  +   struct resolve_undo_info *ui;
  +   struct conflict_part *cp;
  +
  +   if (ce-entries  (ce-entries-flags  CONFLICT_CONFLICTED) 
  != 0) {
  +   ce = ce-next;
  +   continue;
  +   }
  +   if (!istate-resolve_undo) {
  +   istate-resolve_undo = xcalloc(1, sizeof(struct 
  string_list));
  +   istate-resolve_undo-strdup_strings = 1;
  +   }
  +
  +   lost = string_list_insert(istate-resolve_undo, ce-name);
  +   if (!lost-util)
  +   lost-util = xcalloc(1, sizeof(*ui));
  +   ui = lost-util;
  +
  +   cp = ce-entries;
  +   for (i = 0; i  3; i++)
  +   ui-mode[i] = 0;
  +   while (cp) {
  +   ui-mode[conflict_stage(cp) - 1] = cp-entry_mode;
  +   hashcpy(ui-sha1[conflict_stage(cp) - 1], cp-sha1);
  +   cp = cp-next;
  +   }
  +   ce = ce-next;
  +   }
  +}
  diff --git a/resolve-undo.h b/resolve-undo.h
  index 8458769..ab660a6 100644
  --- a/resolve-undo.h
  +++ b/resolve-undo.h
  @@ -13,4 +13,6 @@ extern void resolve_undo_clear_index(struct index_state 
  *);
   extern int unmerge_index_entry_at(struct index_state *, int);
   extern void unmerge_index(struct index_state *, const char **);
   
  +extern void resolve_undo_convert_v5(struct index_state *, struct 
  conflict_entry *);
  +
   #endif
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH/RFC v2 0/16] Introduce index file format version 5

2012-08-06 Thread Thomas Gummerer
On 08/06, Junio C Hamano wrote:
 Nguyễn Thái Ngọc Duy  pclo...@gmail.com writes:
 
  These mails are about cosmetics only. But I think it helps maintenance
  in long term. I notice in your series we have many functions with _v2
  and _v5 mixed together. Worse, some functions that are _v2 only are
  not suffixed with _v2. I still think separating v2/v5 changes is a
  good idea. So I played a bit, see how it might become.
 
  The next two emails demonstrate how we take v2-specific code out to
  read-cache-v2.c, then add v5 code in the next patch. Notice there's very
  little change in read-cache.c in the second patch. I wanted to see how
  v5 changes affects v2 users and the second patch shows it.
 
 I like the splitting of the backend into two files; it is a good
 direction to go, but I really prefer to see it done way before in
 the series, so that many symbols in read-cache-v2.c do not have to
 be contaminated with foo_v2 suffix, and similarly _v5 suffix for
 symbols in read-cache-v5.c when they are added.

I agree. I planned to make those changes in the re-roll of this series,
basically squashing patches 1/2/3/4 of this series into one commit, moving
it to read-cache-v2.c and building read-cache-v5.c along the commits in
this series. The re-roll should be out by tomorrow.
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


<    3   4   5   6   7   8   9   >