Re: Subject: Something like cat-file for the index?

2014-01-10 Thread Thomas Gummerer

Hi,

Enno Weichert enno.weich...@gmail.com writes:
 Hi,

 I'd like to have a more technical look into the index file and what/how it
 stores data; call it educational spelunking.

 I know the index-format.txt but I'd really like to save me the work to
 implement a pretty-printed output based on it.
 I know ls-files but that's obviously not the whole thing.

 So: is there something like cat-file, that basically gives me a readable
 version of the information (version number and all...) in the index already
 implemented or did nobody care until now?

You can use `git ls-files --debug` and `git ls-files --stage` to get all
the information about the files in the index.  The meaning of the flags
is the only thing that's not shown by the command, and I don't think
there is a tool yet to examine them.

The undocumented --resolve-undo flag to git ls-files shows you the
resolve undo data that is stored in the index.

If you build git yourself, the `test-dump-cache-tree` helper can be used
to show all information about the cache-tree that is stored in the
index.

The you can get the version of the index either by using
`test-index-version` when you build git yourself, or by using `file
.git/index`, which in addition will give you the number of entries that
are in the index.

--
Thomas
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH/WIP v2 02/14] read-cache: new extension to mark what file is watched

2014-01-17 Thread Thomas Gummerer
Nguyễn Thái Ngọc Duy pclo...@gmail.com writes:

 If an entry is watched, git lets an external program decide if the
 entry is modified or not. It's more like --assume-unchanged, but
 designed to be controlled by machine.

 We are running out of on-disk ce_flags, so instead of extending
 on-disk entry format again, watched flags are in-core only and
 stored as extension instead.

As you said yourself in 
http://thread.gmane.org/gmane.comp.version-control.git/240339/focus=240385
this is not quite true.  As for your explanation there,

 Anyway using extended flags means 2 extra bytes per entry for
 almost every entry in this case (and for index v5 it means redoing
 crc32 for almost every entry too when the bit is updated) so it may
 still be a good idea to keep the new flag separate.

I don't think adding 2 extra bytes would be too bad, since we are
already using 62 bytes plus the bytes for the filename for each index
entry, so it would be a less than 3% increase in the index file size.
(And the extended flags may be used anyway in some cases)

As for index-v5 (if that's ever going to happen), it depends mostly on
how often the CE_WATCHED is going to be updated, to decide whether it
makes sense to store this as extension.

That said, I don't care too deeply if it's stored one way or another,
but I think it would be good to update the commit message with a better
rationale for the choice.

 Signed-off-by: Nguyễn Thái Ngọc Duy pclo...@gmail.com
 ---
  cache.h  |  2 ++
  read-cache.c | 41 -
  2 files changed, 42 insertions(+), 1 deletion(-)

 diff --git a/cache.h b/cache.h
 index a09d622..069dce7 100644
 --- a/cache.h
 +++ b/cache.h
 @@ -168,6 +168,8 @@ struct cache_entry {
  /* used to temporarily mark paths matched by pathspecs */
  #define CE_MATCHED   (1  26)

 +#define CE_WATCHED   (1  27)
 +
  /*
   * Extended on-disk flags
   */
 diff --git a/read-cache.c b/read-cache.c
 index fe1d153..6f21e3f 100644
 --- a/read-cache.c
 +++ b/read-cache.c
 @@ -33,6 +33,7 @@ static struct cache_entry *refresh_cache_entry(struct 
 cache_entry *ce, int reall
  #define CACHE_EXT(s) ( (s[0]24)|(s[1]16)|(s[2]8)|(s[3]) )
  #define CACHE_EXT_TREE 0x54524545/* TREE */
  #define CACHE_EXT_RESOLVE_UNDO 0x52455543 /* REUC */
 +#define CACHE_EXT_WATCH 0x57415443 /* WATC */

  struct index_state the_index;

 @@ -1293,6 +1294,19 @@ static int verify_hdr(struct cache_header *hdr,
   return 0;
  }

 +static void read_watch_extension(struct index_state *istate, uint8_t *data,
 +  unsigned long sz)
 +{
 + int i;
 + if ((istate-cache_nr + 7) / 8 != sz) {
 + error(invalid 'WATC' extension);
 + return;
 + }
 + for (i = 0; i  istate-cache_nr; i++)
 + if (data[i / 8]  (1  (i % 8)))
 + istate-cache[i]-ce_flags |= CE_WATCHED;
 +}
 +
  static int read_index_extension(struct index_state *istate,
   const char *ext, void *data, unsigned long sz)
  {
 @@ -1303,6 +1317,9 @@ static int read_index_extension(struct index_state 
 *istate,
   case CACHE_EXT_RESOLVE_UNDO:
   istate-resolve_undo = resolve_undo_read(data, sz);
   break;
 + case CACHE_EXT_WATCH:
 + read_watch_extension(istate, data, sz);
 + break;
   default:
   if (*ext  'A' || 'Z'  *ext)
   return error(index uses %.4s extension, which we do 
 not understand,
 @@ -1781,7 +1798,7 @@ int write_index(struct index_state *istate, int newfd)
  {
   git_SHA_CTX c;
   struct cache_header hdr;
 - int i, err, removed, extended, hdr_version;
 + int i, err, removed, extended, hdr_version, has_watches = 0;
   struct cache_entry **cache = istate-cache;
   int entries = istate-cache_nr;
   struct stat st;
 @@ -1790,6 +1807,8 @@ int write_index(struct index_state *istate, int newfd)
   for (i = removed = extended = 0; i  entries; i++) {
   if (cache[i]-ce_flags  CE_REMOVE)
   removed++;
 + else if (cache[i]-ce_flags  CE_WATCHED)
 + has_watches++;

   /* reduce extended entries if possible */
   cache[i]-ce_flags = ~CE_EXTENDED;
 @@ -1861,6 +1880,26 @@ int write_index(struct index_state *istate, int newfd)
   if (err)
   return -1;
   }
 + if (has_watches) {
 + int id, sz = (entries - removed + 7) / 8;
 + uint8_t *data = xmalloc(sz);
 + memset(data, 0, sz);
 + for (i = 0, id = 0; i  entries  has_watches; i++) {
 + struct cache_entry *ce = cache[i];
 + if (ce-ce_flags  CE_REMOVE)
 + continue;
 + if (ce-ce_flags  CE_WATCHED) {
 + data[id / 8] |= 1  (id % 8);
 + has_watches--;
 

[PATCH 0/3] Wider exposure for index-v4

2014-02-15 Thread Thomas Gummerer
Hi,

since index-v5 didn't seem to generate enough interest to be merged, I
have a few patches that give users users easier access to index-v4.
Until now users have to go into the source code and compile git
themselves to use index-v4 by default, or use git-update-index to
change the index file to the new version.

With this patches it's possible to set the default index file format
either in gitconfig or in an environment variable.  It also simplifies
testing index-v4 by adding a Makefile knob to use it for running the
test suite.  For safety, existing repositories are not changed when
the environment or the config variables are set.

I'm not sure about the precedence in patch 3, right now the environment
variable has precedence, but it should be easy to give the config
option precedence over that.

Thomas Gummerer (3):
introduce GIT_INDEX_VERSION environment variable
test-lib: allow setting the index format version
read-cache: add index.version config variable

Documentation/config.txt  |  4 +++
Documentation/git.txt |  5 
Makefile  |  7 +
read-cache.c  | 36 +++-
t/t1600-index.sh  | 52 +++
t/t2104-update-index-skip-worktree.sh |  2 ++
t/test-lib-functions.sh   |  5 
t/test-lib.sh |  3 ++
8 files changed, 113 insertions(+), 1 deletion(-)
create mode 100755 t/t1600-index.sh

--
1.8.3.2

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/3] test-lib: allow setting the index format version

2014-02-15 Thread Thomas Gummerer
Allow adding a TEST_GIT_INDEX_VERSION variable to config.mak to set the
index version with which the test suite should be run.

If it isn't set, the default version given in the source code is
used (currently version 3).

To avoid breakages with index versions other than [23], also set the
index version under which t2104 is run to 3.  This test only tests
functionality specific to version 2 and 3 of the index file and would
fail if the test suite is run with any other version.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 Makefile  | 7 +++
 t/t2104-update-index-skip-worktree.sh | 2 ++
 t/test-lib-functions.sh   | 5 +
 t/test-lib.sh | 3 +++
 4 files changed, 17 insertions(+)

diff --git a/Makefile b/Makefile
index 287e6f8..c98d28f 100644
--- a/Makefile
+++ b/Makefile
@@ -342,6 +342,10 @@ all::
 # Define DEFAULT_HELP_FORMAT to man, info or html
 # (defaults to man) if you want to have a different default when
 # git help is called without a parameter specifying the format.
+#
+# Define TEST_GIT_INDEX_FORMAT to 2, 3 or 4 to run the test suite
+# with a different indexfile format.  If it isn't set the index file
+# format used is index-v[23].
 
 GIT-VERSION-FILE: FORCE
@$(SHELL_PATH) ./GIT-VERSION-GEN
@@ -2223,6 +2227,9 @@ endif
 ifdef GIT_PERF_MAKE_OPTS
@echo GIT_PERF_MAKE_OPTS=\''$(subst ','\'',$(subst 
','\'',$(GIT_PERF_MAKE_OPTS)))'\' $@
 endif
+ifdef TEST_GIT_INDEX_VERSION
+   @echo TEST_GIT_INDEX_VERSION=\''$(subst ','\'',$(subst 
','\'',$(TEST_GIT_INDEX_VERSION)))'\' $@
+endif
 
 ### Detect Python interpreter path changes
 ifndef NO_PYTHON
diff --git a/t/t2104-update-index-skip-worktree.sh 
b/t/t2104-update-index-skip-worktree.sh
index 1d0879b..29c1fb1 100755
--- a/t/t2104-update-index-skip-worktree.sh
+++ b/t/t2104-update-index-skip-worktree.sh
@@ -7,6 +7,8 @@ test_description='skip-worktree bit test'
 
 . ./test-lib.sh
 
+test_set_index_version 3
+
 cat expect.full EOF
 H 1
 H 2
diff --git a/t/test-lib-functions.sh b/t/test-lib-functions.sh
index aeae3ca..0bf1e63 100644
--- a/t/test-lib-functions.sh
+++ b/t/test-lib-functions.sh
@@ -32,6 +32,11 @@ test_set_editor () {
export EDITOR
 }
 
+test_set_index_version () {
+GIT_INDEX_VERSION=$1
+export GIT_INDEX_VERSION
+}
+
 test_decode_color () {
awk '
function name(n) {
diff --git a/t/test-lib.sh b/t/test-lib.sh
index 1cf78d5..e6cf5b0 100644
--- a/t/test-lib.sh
+++ b/t/test-lib.sh
@@ -108,6 +108,9 @@ export GIT_AUTHOR_EMAIL GIT_AUTHOR_NAME
 export GIT_COMMITTER_EMAIL GIT_COMMITTER_NAME
 export EDITOR
 
+GIT_INDEX_VERSION=$TEST_GIT_INDEX_VERSION
+export GIT_INDEX_VERSION
+
 # Add libc MALLOC and MALLOC_PERTURB test
 # only if we are not executing the test with valgrind
 if expr  $GIT_TEST_OPTS  : .* --valgrind  /dev/null ||
-- 
1.8.5.2.300.ge613be6.dirty

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/3] introduce GIT_INDEX_VERSION environment variable

2014-02-15 Thread Thomas Gummerer
Respect a GIT_INDEX_VERSION environment variable, when a new index is
initialized.  Setting the environment variable will not cause existing
index files to be converted to another format, but will only affect
newly written index files.  This can be used to initialize repositories
with index-v4.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 Documentation/git.txt |  5 +
 read-cache.c  | 18 +-
 t/t1600-index.sh  | 24 
 3 files changed, 46 insertions(+), 1 deletion(-)
 create mode 100755 t/t1600-index.sh

diff --git a/Documentation/git.txt b/Documentation/git.txt
index aec3726..bc9eeea 100644
--- a/Documentation/git.txt
+++ b/Documentation/git.txt
@@ -712,6 +712,11 @@ Git so take care if using Cogito etc.
index file. If not specified, the default of `$GIT_DIR/index`
is used.
 
+'GIT_INDEX_VERSION'::
+   This environment variable allows the specification of an index
+   version for new repositories.  It won't affect existing index
+   files.  By default index file version 3 is used.
+
 'GIT_OBJECT_DIRECTORY'::
If the object storage directory is specified via this
environment variable then the sha1 directories are created
diff --git a/read-cache.c b/read-cache.c
index 3f735f3..3993e12 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1223,6 +1223,22 @@ static struct cache_entry *refresh_cache_entry(struct 
cache_entry *ce, int reall
 
 #define INDEX_FORMAT_DEFAULT 3
 
+static unsigned int get_index_format_default()
+{
+   char *envversion = getenv(GIT_INDEX_VERSION);
+   if (!envversion) {
+   return INDEX_FORMAT_DEFAULT;
+   } else {
+   unsigned int version = strtol(envversion, NULL, 10);
+   if (version  INDEX_FORMAT_LB || version  INDEX_FORMAT_UB) {
+   warning(_(GIT_INDEX_VERSION set, but the value is 
invalid.\n
+ Using version %i), INDEX_FORMAT_DEFAULT);
+   version = INDEX_FORMAT_DEFAULT;
+   }
+   return version;
+   }
+}
+
 /*
  * dev/ino/uid/gid/size are also just tracked to the low 32 bits
  * Again - this is just a (very strong in practice) heuristic that
@@ -1799,7 +1815,7 @@ int write_index(struct index_state *istate, int newfd)
}
 
if (!istate-version)
-   istate-version = INDEX_FORMAT_DEFAULT;
+   istate-version = get_index_format_default();
 
/* demote version 3 to version 2 when the latter suffices */
if (istate-version == 3 || istate-version == 2)
diff --git a/t/t1600-index.sh b/t/t1600-index.sh
new file mode 100755
index 000..37fd84d
--- /dev/null
+++ b/t/t1600-index.sh
@@ -0,0 +1,24 @@
+#!/bin/sh
+
+test_description='index file specific tests'
+
+. ./test-lib.sh
+
+test_expect_success 'setup' '
+   echo 1 a
+'
+
+test_expect_success 'out of bounds GIT_INDEX_VERSION issues warning' '
+   (
+   GIT_INDEX_VERSION=1 
+   export GIT_INDEX_VERSION 
+   git add a 21 | sed s/[0-9]// actual.err 
+   sed -e s/ Z$/ / -\EOF expect.err 
+   warning: GIT_INDEX_VERSION set, but the value is 
invalid.
+   Using version Z
+   EOF
+   test_i18ncmp expect.err actual.err
+   )
+'
+
+test_done
-- 
1.8.5.2.300.ge613be6.dirty

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/3] read-cache: add index.version config variable

2014-02-15 Thread Thomas Gummerer
Add a config variable that allows setting the default index version when
initializing a new index file.  Similar to the GIT_INDEX_VERSION
environment variable this only affects new index files.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 Documentation/config.txt |  4 
 read-cache.c | 27 ++-
 t/t1600-index.sh | 27 +++
 3 files changed, 53 insertions(+), 5 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index 1655455..033939a 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -1591,6 +1591,10 @@ imap::
The configuration variables in the 'imap' section are described
in linkgit:git-imap-send[1].
 
+index.version::
+   Specify the version with which new index files should be
+   initialized.  This does not affect existing repositories.
+
 init.templatedir::
Specify the directory from which templates will be copied.
(See the TEMPLATE DIRECTORY section of linkgit:git-init[1].)
diff --git a/read-cache.c b/read-cache.c
index 3993e12..ca9b68c 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1223,20 +1223,37 @@ static struct cache_entry *refresh_cache_entry(struct 
cache_entry *ce, int reall
 
 #define INDEX_FORMAT_DEFAULT 3
 
+static int index_format_config(const char *var, const char *value, void *cb)
+{
+   unsigned int *version = cb;
+   if (!strcmp(var, index.version)) {
+   *version = git_config_int(var, value);
+   return 0;
+   }
+   return 1;
+}
+
+
 static unsigned int get_index_format_default()
 {
char *envversion = getenv(GIT_INDEX_VERSION);
+   unsigned int version = INDEX_FORMAT_DEFAULT;
if (!envversion) {
-   return INDEX_FORMAT_DEFAULT;
-   } else {
-   unsigned int version = strtol(envversion, NULL, 10);
+   git_config(index_format_config, version);
if (version  INDEX_FORMAT_LB || version  INDEX_FORMAT_UB) {
-   warning(_(GIT_INDEX_VERSION set, but the value is 
invalid.\n
+   warning(_(index.version set, but the value is 
invalid.\n
  Using version %i), INDEX_FORMAT_DEFAULT);
-   version = INDEX_FORMAT_DEFAULT;
+   return INDEX_FORMAT_DEFAULT;
}
return version;
}
+   version = strtol(envversion, NULL, 10);
+   if (version  INDEX_FORMAT_LB || version  INDEX_FORMAT_UB) {
+   warning(_(GIT_INDEX_VERSION set, but the value is invalid.\n
+ Using version %i), INDEX_FORMAT_DEFAULT);
+   version = INDEX_FORMAT_DEFAULT;
+   }
+   return version;
 }
 
 /*
diff --git a/t/t1600-index.sh b/t/t1600-index.sh
index 37fd84d..bf34985 100755
--- a/t/t1600-index.sh
+++ b/t/t1600-index.sh
@@ -21,4 +21,31 @@ test_expect_success 'out of bounds GIT_INDEX_VERSION issues 
warning' '
)
 '
 
+test_expect_success 'out of bounds index.version issuses warning' '
+   (
+   unset GIT_INDEX_VERSION 
+   rm .git/index 
+   git config --add index.version 1 
+   git add a 21 | sed s/[0-9]// actual.err 
+   sed -e s/ Z$/ / -\EOF expect.err 
+   warning: index.version set, but the value is invalid.
+   Using version Z
+   EOF
+   test_i18ncmp expect.err actual.err
+   )
+'
+
+test_expect_success 'GIT_INDEX_VERSION takes precedence over config' '
+   (
+   rm .git/index 
+   GIT_INDEX_VERSION=4 
+   export GIT_INDEX_VERSION 
+   git config --add index.version 2 
+   git add a 21 
+   echo 4 expect 
+   test-index-version .git/index actual 
+   test_cmp expect actual
+   )
+'
+
 test_done
-- 
1.8.5.2.300.ge613be6.dirty

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/3] Wider exposure for index-v4

2014-02-16 Thread Thomas Gummerer
Duy Nguyen pclo...@gmail.com writes:

 On Sun, Feb 16, 2014 at 2:23 AM, Thomas Gummerer t.gumme...@gmail.com wrote:
 Hi,

 since index-v5 didn't seem to generate enough interest to be merged, I

 I thought there were some comments last time that you were going to
 address and resubmit?

Yes, there were some comments to the last round, which I already fixed
locally, I'd just have to rebase it to make sure it stell applies
cleanly.  No responses from Junio to [1] and [2] gave me the impression
that it's not going to be applied.  I would be happy to rebase and
submit if there is a chance for it getting in.

[1] http://thread.gmane.org/gmane.comp.version-control.git/238414/focus=239065
[2] http://thread.gmane.org/gmane.comp.version-control.git/232488/focus=233504

 have a few patches that give users users easier access to index-v4.
 Until now users have to go into the source code and compile git
 themselves to use index-v4 by default, or use git-update-index to
 change the index file to the new version.

 Not objecting this, but I think something like [1] would give v4 more
 exposure. Reading the patch again, I think putting that detection code
 in unpack_trees() or git-merge may make more sense because people will
 be advised about upgrading to v4 at the next fast-forward.

Thanks, I forgot about this patch.  I still think at least the first two
patches of this series make sense in addition to your patch, allowing
developers to easily run the test suite with index-v4.

 [1] http://article.gmane.org/gmane.comp.version-control.git/216307

 With this patches it's possible to set the default index file format
 either in gitconfig or in an environment variable.  It also simplifies
 testing index-v4 by adding a Makefile knob to use it for running the
 test suite.  For safety, existing repositories are not changed when
 the environment or the config variables are set.

 I'm not sure about the precedence in patch 3, right now the environment
 variable has precedence, but it should be easy to give the config
 option precedence over that.

 Thomas Gummerer (3):
 introduce GIT_INDEX_VERSION environment variable
 test-lib: allow setting the index format version
 read-cache: add index.version config variable

 Documentation/config.txt  |  4 +++
 Documentation/git.txt |  5 
 Makefile  |  7 +
 read-cache.c  | 36 +++-
 t/t1600-index.sh  | 52 
 +++
 t/t2104-update-index-skip-worktree.sh |  2 ++
 t/test-lib-functions.sh   |  5 
 t/test-lib.sh |  3 ++
 8 files changed, 113 insertions(+), 1 deletion(-)
 create mode 100755 t/t1600-index.sh

 --
 1.8.3.2

 --
 To unsubscribe from this list: send the line unsubscribe git in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html



 --
 Duy
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/3] introduce GIT_INDEX_VERSION environment variable

2014-02-21 Thread Thomas Gummerer
Junio C Hamano gits...@pobox.com writes:

 Thomas Gummerer t.gumme...@gmail.com writes:

 diff --git a/Documentation/git.txt b/Documentation/git.txt
 index aec3726..bc9eeea 100644
 --- a/Documentation/git.txt
 +++ b/Documentation/git.txt
 @@ -712,6 +712,11 @@ Git so take care if using Cogito etc.
  index file. If not specified, the default of `$GIT_DIR/index`
  is used.
  
 +'GIT_INDEX_VERSION'::
 +This environment variable allows the specification of an index
 +version for new repositories.  It won't affect existing index
 +files.  By default index file version 3 is used.
 +

 This is half-correct, isn't it?  In-code variable may say version 3
 but we demote it to version 2 unless we absolutely need to use the
 version 3 ugliness.

Yes, you're right, to be correct we should say [23] instead of 3 here
maybe?

 diff --git a/read-cache.c b/read-cache.c
 index 3f735f3..3993e12 100644
 --- a/read-cache.c
 +++ b/read-cache.c
 @@ -1223,6 +1223,22 @@ static struct cache_entry *refresh_cache_entry(struct 
 cache_entry *ce, int reall
  
  #define INDEX_FORMAT_DEFAULT 3
  
 +static unsigned int get_index_format_default()
 +{
 +char *envversion = getenv(GIT_INDEX_VERSION);
 +if (!envversion) {
 +return INDEX_FORMAT_DEFAULT;
 +} else {
 +unsigned int version = strtol(envversion, NULL, 10);

 If we are using strtol() to parse it carefully, we should make sure
 it parses to the end by giving a non-NULL second argument and
 checking where the parsing stopped.

Thanks, makes sense, will fix it in the re-roll.

 diff --git a/t/t1600-index.sh b/t/t1600-index.sh
 new file mode 100755
 index 000..37fd84d
 --- /dev/null
 +++ b/t/t1600-index.sh
 @@ -0,0 +1,24 @@
 +#!/bin/sh
 +
 +test_description='index file specific tests'
 +
 +. ./test-lib.sh
 +
 +test_expect_success 'setup' '
 +echo 1 a
 +'
 +
 +test_expect_success 'out of bounds GIT_INDEX_VERSION issues warning' '
 +(
 +GIT_INDEX_VERSION=1 
 +export GIT_INDEX_VERSION 
 +git add a 21 | sed s/[0-9]// actual.err 
 +sed -e s/ Z$/ / -\EOF expect.err 
 +warning: GIT_INDEX_VERSION set, but the value is 
 invalid.
 +Using version Z
 +EOF
 +test_i18ncmp expect.err actual.err
 +)
 +'

 If we already have an index in version N when this test starts, what
 should happen?

I think we shouldn't print anything, since we won't change the file
format.  I'll add another test for that.

 Will queue on 'pu', with some suggested fixups.

Thanks, I think both fixups in 'pu' make sense, so I'll include them in
the re-roll.

 Thanks.
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 0/3] Easier access to index-v4

2014-02-23 Thread Thomas Gummerer
Hi,

previous round was at $gmane/242198.

Thanks to Junio, Eric and Duy for comments on the previous round.

Since then I've squashed the fixes suggested by Junio, added a test
showing what should happen if an index file is present and
GIT_INDEX_VERSION is set and fixed the typo found by Eric.

Thomas Gummerer (3):
  introduce GIT_INDEX_VERSION environment variable
  test-lib: allow setting the index format version
  read-cache: add index.version config variable

 Documentation/config.txt  |  4 ++
 Documentation/git.txt |  5 +++
 Makefile  |  7 
 read-cache.c  | 38 +-
 t/t1600-index.sh  | 76 +++
 t/t2104-update-index-skip-worktree.sh |  2 +
 t/test-lib-functions.sh   |  5 +++
 t/test-lib.sh |  3 ++
 8 files changed, 139 insertions(+), 1 deletion(-)
 create mode 100755 t/t1600-index.sh

-- 
1.9.0.1.ge0caaa8.dirty

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 3/3] read-cache: add index.version config variable

2014-02-23 Thread Thomas Gummerer
Add a config variable that allows setting the default index version when
initializing a new index file.  Similar to the GIT_INDEX_VERSION
environment variable this only affects new index files.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 Documentation/config.txt |  4 
 read-cache.c | 35 ++-
 t/t1600-index.sh | 27 +++
 3 files changed, 57 insertions(+), 9 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index 5f4d793..98200aa 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -1601,6 +1601,10 @@ imap::
The configuration variables in the 'imap' section are described
in linkgit:git-imap-send[1].
 
+index.version::
+   Specify the version with which new index files should be
+   initialized.  This does not affect existing repositories.
+
 init.templatedir::
Specify the directory from which templates will be copied.
(See the TEMPLATE DIRECTORY section of linkgit:git-init[1].)
diff --git a/read-cache.c b/read-cache.c
index efc4aae..6bc9724 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1219,23 +1219,40 @@ static struct cache_entry *refresh_cache_entry(struct 
cache_entry *ce, int reall
 
 #define INDEX_FORMAT_DEFAULT 3
 
+static int index_format_config(const char *var, const char *value, void *cb)
+{
+   unsigned int *version = cb;
+   if (!strcmp(var, index.version)) {
+   *version = git_config_int(var, value);
+   return 0;
+   }
+   return 1;
+}
+
 static unsigned int get_index_format_default(void)
 {
char *envversion = getenv(GIT_INDEX_VERSION);
-   if (!envversion) {
-   return INDEX_FORMAT_DEFAULT;
-   } else {
-   char *endp;
-   unsigned int version = strtoul(envversion, endp, 10);
+   char *endp;
+   unsigned int version = INDEX_FORMAT_DEFAULT;
 
-   if (*endp ||
-   version  INDEX_FORMAT_LB || INDEX_FORMAT_UB  version) {
-   warning(_(GIT_INDEX_VERSION set, but the value is 
invalid.\n
+   if (!envversion) {
+   git_config(index_format_config, version);
+   if (version  INDEX_FORMAT_LB || INDEX_FORMAT_UB  version) {
+   warning(_(index.version set, but the value is 
invalid.\n
  Using version %i), INDEX_FORMAT_DEFAULT);
-   version = INDEX_FORMAT_DEFAULT;
+   return INDEX_FORMAT_DEFAULT;
}
return version;
}
+
+   version = strtoul(envversion, endp, 10);
+   if (*endp ||
+   version  INDEX_FORMAT_LB || INDEX_FORMAT_UB  version) {
+   warning(_(GIT_INDEX_VERSION set, but the value is invalid.\n
+ Using version %i), INDEX_FORMAT_DEFAULT);
+   version = INDEX_FORMAT_DEFAULT;
+   }
+   return version;
 }
 
 /*
diff --git a/t/t1600-index.sh b/t/t1600-index.sh
index 6195c55..079d241 100755
--- a/t/t1600-index.sh
+++ b/t/t1600-index.sh
@@ -46,4 +46,31 @@ test_expect_success 'no warning with bogus GIT_INDEX_VERSION 
and existing index'
)
 '
 
+test_expect_success 'out of bounds index.version issues warning' '
+   (
+   sane_unset GIT_INDEX_VERSION 
+   rm -f .git/index 
+   git config --add index.version 1 
+   git add a 21 | sed s/[0-9]// actual.err 
+   sed -e s/ Z$/ / -\EOF expect.err 
+   warning: index.version set, but the value is invalid.
+   Using version Z
+   EOF
+   test_i18ncmp expect.err actual.err
+   )
+'
+
+test_expect_success 'GIT_INDEX_VERSION takes precedence over config' '
+   (
+   rm -f .git/index 
+   GIT_INDEX_VERSION=4 
+   export GIT_INDEX_VERSION 
+   git config --add index.version 2 
+   git add a 21 
+   echo 4 expect 
+   test-index-version .git/index actual 
+   test_cmp expect actual
+   )
+'
+
 test_done
-- 
1.9.0.1.ge0caaa8.dirty

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 2/3] test-lib: allow setting the index format version

2014-02-23 Thread Thomas Gummerer
Allow adding a TEST_GIT_INDEX_VERSION variable to config.mak to set the
index version with which the test suite should be run.

If it isn't set, the default version given in the source code is
used (currently version 3).

To avoid breakages with index versions other than [23], also set the
index version under which t2104 is run to 3.  This test only tests
functionality specific to version 2 and 3 of the index file and would
fail if the test suite is run with any other version.

Helped-by: Junio C Hamano gits...@pobox.com
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 Makefile  | 7 +++
 t/t2104-update-index-skip-worktree.sh | 2 ++
 t/test-lib-functions.sh   | 5 +
 t/test-lib.sh | 3 +++
 4 files changed, 17 insertions(+)

diff --git a/Makefile b/Makefile
index dddaf4f..5caa3b2 100644
--- a/Makefile
+++ b/Makefile
@@ -342,6 +342,10 @@ all::
 # Define DEFAULT_HELP_FORMAT to man, info or html
 # (defaults to man) if you want to have a different default when
 # git help is called without a parameter specifying the format.
+#
+# Define TEST_GIT_INDEX_VERSION to 2, 3 or 4 to run the test suite
+# with a different indexfile format version.  If it isn't set the index
+# file format used is index-v[23].
 
 GIT-VERSION-FILE: FORCE
@$(SHELL_PATH) ./GIT-VERSION-GEN
@@ -,6 +2226,9 @@ endif
 ifdef GIT_PERF_MAKE_OPTS
@echo GIT_PERF_MAKE_OPTS=\''$(subst ','\'',$(subst 
','\'',$(GIT_PERF_MAKE_OPTS)))'\' $@
 endif
+ifdef TEST_GIT_INDEX_VERSION
+   @echo TEST_GIT_INDEX_VERSION=\''$(subst ','\'',$(subst 
','\'',$(TEST_GIT_INDEX_VERSION)))'\' $@
+endif
 
 ### Detect Python interpreter path changes
 ifndef NO_PYTHON
diff --git a/t/t2104-update-index-skip-worktree.sh 
b/t/t2104-update-index-skip-worktree.sh
index 1d0879b..29c1fb1 100755
--- a/t/t2104-update-index-skip-worktree.sh
+++ b/t/t2104-update-index-skip-worktree.sh
@@ -7,6 +7,8 @@ test_description='skip-worktree bit test'
 
 . ./test-lib.sh
 
+test_set_index_version 3
+
 cat expect.full EOF
 H 1
 H 2
diff --git a/t/test-lib-functions.sh b/t/test-lib-functions.sh
index aeae3ca..0bf1e63 100644
--- a/t/test-lib-functions.sh
+++ b/t/test-lib-functions.sh
@@ -32,6 +32,11 @@ test_set_editor () {
export EDITOR
 }
 
+test_set_index_version () {
+GIT_INDEX_VERSION=$1
+export GIT_INDEX_VERSION
+}
+
 test_decode_color () {
awk '
function name(n) {
diff --git a/t/test-lib.sh b/t/test-lib.sh
index 1531c24..492f81f 100644
--- a/t/test-lib.sh
+++ b/t/test-lib.sh
@@ -108,6 +108,9 @@ export GIT_AUTHOR_EMAIL GIT_AUTHOR_NAME
 export GIT_COMMITTER_EMAIL GIT_COMMITTER_NAME
 export EDITOR
 
+GIT_INDEX_VERSION=$TEST_GIT_INDEX_VERSION
+export GIT_INDEX_VERSION
+
 # Add libc MALLOC and MALLOC_PERTURB test
 # only if we are not executing the test with valgrind
 if expr  $GIT_TEST_OPTS  : .* --valgrind  /dev/null ||
-- 
1.9.0.1.ge0caaa8.dirty

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 1/3] introduce GIT_INDEX_VERSION environment variable

2014-02-23 Thread Thomas Gummerer
Respect a GIT_INDEX_VERSION environment variable, when a new index is
initialized.  Setting the environment variable will not cause existing
index files to be converted to another format, but will only affect
newly written index files.  This can be used to initialize repositories
with index-v4.

Helped-by: Junio C Hamano gits...@pobox.com
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 Documentation/git.txt |  5 +
 read-cache.c  | 21 -
 t/t1600-index.sh  | 49 +
 3 files changed, 74 insertions(+), 1 deletion(-)
 create mode 100755 t/t1600-index.sh

diff --git a/Documentation/git.txt b/Documentation/git.txt
index 02bbc08..27a199c 100644
--- a/Documentation/git.txt
+++ b/Documentation/git.txt
@@ -720,6 +720,11 @@ Git so take care if using Cogito etc.
index file. If not specified, the default of `$GIT_DIR/index`
is used.
 
+'GIT_INDEX_VERSION'::
+   This environment variable allows the specification of an index
+   version for new repositories.  It won't affect existing index
+   files.  By default index file version [23] is used.
+
 'GIT_OBJECT_DIRECTORY'::
If the object storage directory is specified via this
environment variable then the sha1 directories are created
diff --git a/read-cache.c b/read-cache.c
index 33dd676..efc4aae 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1219,6 +1219,25 @@ static struct cache_entry *refresh_cache_entry(struct 
cache_entry *ce, int reall
 
 #define INDEX_FORMAT_DEFAULT 3
 
+static unsigned int get_index_format_default(void)
+{
+   char *envversion = getenv(GIT_INDEX_VERSION);
+   if (!envversion) {
+   return INDEX_FORMAT_DEFAULT;
+   } else {
+   char *endp;
+   unsigned int version = strtoul(envversion, endp, 10);
+
+   if (*endp ||
+   version  INDEX_FORMAT_LB || INDEX_FORMAT_UB  version) {
+   warning(_(GIT_INDEX_VERSION set, but the value is 
invalid.\n
+ Using version %i), INDEX_FORMAT_DEFAULT);
+   version = INDEX_FORMAT_DEFAULT;
+   }
+   return version;
+   }
+}
+
 /*
  * dev/ino/uid/gid/size are also just tracked to the low 32 bits
  * Again - this is just a (very strong in practice) heuristic that
@@ -1795,7 +1814,7 @@ int write_index(struct index_state *istate, int newfd)
}
 
if (!istate-version)
-   istate-version = INDEX_FORMAT_DEFAULT;
+   istate-version = get_index_format_default();
 
/* demote version 3 to version 2 when the latter suffices */
if (istate-version == 3 || istate-version == 2)
diff --git a/t/t1600-index.sh b/t/t1600-index.sh
new file mode 100755
index 000..6195c55
--- /dev/null
+++ b/t/t1600-index.sh
@@ -0,0 +1,49 @@
+#!/bin/sh
+
+test_description='index file specific tests'
+
+. ./test-lib.sh
+
+test_expect_success 'setup' '
+   echo 1 a
+'
+
+test_expect_success 'bogus GIT_INDEX_VERSION issues warning' '
+   (
+   rm -f .git/index 
+   GIT_INDEX_VERSION=2bogus 
+   export GIT_INDEX_VERSION 
+   git add a 21 | sed s/[0-9]// actual.err 
+   sed -e s/ Z$/ / -\EOF expect.err 
+   warning: GIT_INDEX_VERSION set, but the value is 
invalid.
+   Using version Z
+   EOF
+   test_i18ncmp expect.err actual.err
+   )
+'
+
+test_expect_success 'out of bounds GIT_INDEX_VERSION issues warning' '
+   (
+   rm -f .git/index 
+   GIT_INDEX_VERSION=1 
+   export GIT_INDEX_VERSION 
+   git add a 21 | sed s/[0-9]// actual.err 
+   sed -e s/ Z$/ / -\EOF expect.err 
+   warning: GIT_INDEX_VERSION set, but the value is 
invalid.
+   Using version Z
+   EOF
+   test_i18ncmp expect.err actual.err
+   )
+'
+
+test_expect_success 'no warning with bogus GIT_INDEX_VERSION and existing 
index' '
+   (
+   GIT_INDEX_VERSION=1 
+   export GIT_INDEX_VERSION 
+   git add a 2actual.err 
+   expect.err 
+   test_i18ncmp expect.err actual.err
+   )
+'
+
+test_done
-- 
1.9.0.1.ge0caaa8.dirty

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 0/3] Easier access to index-v4

2014-02-24 Thread Thomas Gummerer
Junio C Hamano gits...@pobox.com writes:

 Junio C Hamano gits...@pobox.com writes:

 Thomas Gummerer t.gumme...@gmail.com writes:

 previous round was at $gmane/242198.

 Since then I've squashed the fixes suggested by Junio, added a test
 showing what should happen if an index file is present and
 GIT_INDEX_VERSION is set and fixed the typo found by Eric.

 Looks good; thanks.

 Tests, seem to leak these unnecessary diag (not limited to t0010).

 sh t0010-racy-git.sh 
 warning: GIT_INDEX_VERSION set, but the value is invalid.
 Using version 3
 ok 1 - Racy GIT trial #0 part A
 ok 2 - Racy GIT trial #0 part B
 warning: GIT_INDEX_VERSION set, but the value is invalid.
 Using version 3
 ...
 # passed all 10 test(s)
 1..10


 The same thing under prove.

 *** prove ***
 t0010-racy-git.sh .. warning: GIT_INDEX_VERSION set, but the value is 
 invalid.
 Using version 3
 t0010-racy-git.sh .. 2/? warning: GIT_INDEX_VERSION set, but the value is 
 invalid.
 Using version 3
 t0010-racy-git.sh .. 4/? warning: GIT_INDEX_VERSION set, but the value is 
 invalid.
 Using version 3
 t0010-racy-git.sh .. 6/? warning: GIT_INDEX_VERSION set, but the value is 
 invalid.
 Using version 3
 t0010-racy-git.sh .. 8/? warning: GIT_INDEX_VERSION set, but the value is 
 invalid.
 Using version 3
 t0010-racy-git.sh .. ok
 All tests successful.


 I suspect the real culprit is the early part in test-lib.sh that
 sets GIT_INDEX_VERSION explicitly from TEST_GIT_INDEX_VERSION when
 the latter is not even specified.

 Something along this line, perhaps?

Sorry about this, I didn't run the test suite without
TEST_GIT_INDEX_VERSION in config.mak which I obviously should have.

Yes, this looks good to me, thanks!

  t/test-lib.sh | 7 +--
  1 file changed, 5 insertions(+), 2 deletions(-)

 diff --git a/t/test-lib.sh b/t/test-lib.sh
 index 492f81f..01a98cb 100644
 --- a/t/test-lib.sh
 +++ b/t/test-lib.sh
 @@ -108,8 +108,11 @@ export GIT_AUTHOR_EMAIL GIT_AUTHOR_NAME
  export GIT_COMMITTER_EMAIL GIT_COMMITTER_NAME
  export EDITOR
  
 -GIT_INDEX_VERSION=$TEST_GIT_INDEX_VERSION
 -export GIT_INDEX_VERSION
 +if test -n ${TEST_GIT_INDEX_VERSION+isset}
 +then
 + GIT_INDEX_VERSION=$TEST_GIT_INDEX_VERSION
 + export GIT_INDEX_VERSION
 +fi
  
  # Add libc MALLOC and MALLOC_PERTURB test
  # only if we are not executing the test with valgrind
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5.5/22] Add documentation for the index api

2013-07-11 Thread Thomas Gummerer
Duy Nguyen pclo...@gmail.com writes:

 On Wed, Jul 10, 2013 at 3:10 AM, Thomas Gummerer t.gumme...@gmail.com wrote:
 If you happen to know that certain entries match the given pathspec,
 you could help the caller avoid match_pathspec'ing again by set a bit
 in ce_flags.

 I currently don't know which entries do match the pathspec from just
 reading the index file, additional calls would be needed.  I don't think
 that would be worth the overhead.

 Yeah I now see that you select what to load in v5 with the adjusted
 pathspec, not the input pathspec. Originally I thought you match the
 input pathspec against every file entry in the index :P Your adjusted
 pathspec looks like what common_prefix is for. It's cheaper than
 creating adjusted_pathspec from match_pathspec and reduces loading in
 major cases, where glob is not used.

 Still, creating an adjusted pathspec this way looks iffy. You need to
 understand pathspec in order to strip the filename part out to match
 the directory match only. An alternative is use
 tree_entry_interesting. It goes along well with tree traversal and can
 be used to match directories with original pathspec. Once you see it
 matches an entry in a directory, you could skip matching the rest of
 the files and load the whole directory. read_index_filtered_v5 and
 read_entries may need some tweaking though. I'll try it and post a
 patch later if I succeed.

Hrm, I played around a bit with this idea, but I couldn't figure out how
to make it work.  For it to work we would still have to load some
entries in a directory at least?  Or is there a way to match the
directories, which I just haven't figured out yet?

 To know which entry exists in the index and which is
 new, use another flag. Most reader code won't change if we do it this
 way, all match_pathspec() remain where they are.

 Hrm you mean to know which cache entries are added (or changed) in the
 in-memory index and will have to be written later?  I'm not sure I
 understand correctly what you mean here.

 Oh.. The to know.. sentence was nonsense. We probably don't need to
 know. We may track changed entries for partial writing, but let's
 leave that out for now.

Ok, makes sense.

 +`index_change_filter_opts(opts)`::
 +   This function again has a slightly different functionality for
 +   index-v2 and index-v5.
 +
 +   For index-v2 it simply changes the filter_opts, so
 +   for_each_index_entry uses the changed index_opts, to iterate
 +   over a different set of cache entries.
 +
 +   For index-v5 it refreshes the index if the filter_opts have
 +   changed and sets the new filter_opts in the index state, again
 +   to iterate over a different set of cache entries as with
 +   index-v2.
 +
 +   This has some optimization potential, in the case that the
 +   opts get stricter (less of the index should be read) it
 +   doesn't have to reload anything, but currently does.

 The only use case I see so far is converting a partial index_state
 back to a full one. Apart from doing so in order to write the new
 index, I think some operation (like rename tracking in diff or
 unpack-trees) may expect full index. I think we should support that. I
 doubt we need to change pathspec to something different than the one
 we used to load the index. When a user passes a pathspec to a command,
 the user expects the command to operate on that set only, not outside.

 One application was in ls-files, where we strip the trailing slash from
 the pathspecs for submodules.  But when we let the caller filter the
 rest out it's not needed anymore.  We load all entries without the
 trailing slash anyway.

 That submodule trailing slash stripping code will be moved away soon
 (I've been working on it for some time now). There's similar code in
 pathspec.c. I hope by the time this series becomes a candidate for
 'next', those pathspec manipulation is already gone. For
 strip_trailing_slash_from_submodules, peeking in index file for a few
 entries is probably ok. For check_path_for_gitlink, full index is
 loaded until we figure out a clever way.

Ah great, for now I'll just not use the for_each_index_entry function in
ls-files, and then change the code later once the stripping code is
moved away.

 Some thoughts about the writing api.

 In think we should avoid automatically converting partial index into a
 full one before writing. Push that back to the caller and die() when
 asked to update partial index. They know at what point the index may
 be updated and even what part of it may be updated. I think all
 commands fall into two categories, tree-wide updates (merge,
 checkout...) and limited by the user-given pathspec. what part to be
 updated is not so hard to determine.

 Hrm this is only true if index entries are added or removed, not if they
 are only changed.  If they are only changed we can write a partially
 read index once we have partial writing.

 Yep. We can detect if changes are updates only

Re: [PATCH 13/22] documentation: add documentation of the index-v5 file format

2013-07-11 Thread Thomas Gummerer
Duy Nguyen pclo...@gmail.com writes:

 On Sun, Jul 7, 2013 at 3:11 PM, Thomas Gummerer t.gumme...@gmail.com wrote:
 +== File entry (fileentries)
 +
 +  File entries are sorted in ascending order on the name field, after the
 +  respective offset given by the directory entries. All file names are
 +  prefix compressed, meaning the file name is relative to the directory.
 +
 +  filename (variable length, nul terminated). The exact encoding is
 +undefined, but the filename cannot contain a NUL byte (iow, the same
 +encoding as a UNIX pathname).
 +
 +  flags (16-bits): 'flags' field split into (high to low bits)
 +
 +assumevalid (1-bit): assume-valid flag
 +
 +intenttoadd (1-bit): intent-to-add flag, used by git add -N.
 +  Extended flag in index v3.
 +
 +stage (2-bit): stage of the file during merge
 +
 +skipworktree (1-bit): skip-worktree flag, used by sparse checkout.
 +  Extended flag in index v3.
 +
 +smudged (1-bit): indicates if the file is racily smudged.
 +
 +10-bit unused, must be zero [6]
 +
 +  mode (16-bits): file mode, split into (high to low bits)
 +
 +objtype (4-bits): object type
 +  valid values in binary are 1000 (regular file), 1010 (symbolic
 +  link) and 1110 (gitlink)
 +
 +3-bit unused
 +
 +permission (9-bits): unix permission. Only 0755 and 0644 are valid
 +  for regular files. Symbolic links and gitlinks have value 0 in
 +  this field.
 +
 +  mtimes (32-bits): mtime seconds, the last time a file's data changed
 +this is stat(2) data
 +
 +  mtimens (32-bits): mtime nanosecond fractions
 +this is stat(2) data
 +
 +  file size (32-bits): The on-disk size, trucated to 32-bit.
 +this is stat(2) data
 +
 +  statcrc (32-bits): crc32 checksum over ctime seconds, ctime
 +nanoseconds, ino, dev, uid, gid (All stat(2) data
 +except mtime and file size). If the statcrc is 0 it will
 +be ignored. [7]
 +
 +  objhash (160-bits): SHA-1 for the represented object
 +
 +  entrycrc (32-bits): crc32 checksum for the file entry. The crc code
 +includes the offset to the offset to the file, relative to the
 +beginning of the file.

 Question about the possibility of updating index file directly. If git
 updates a few fields of an entry (but not entrycrc yet) and crashes,
 the entry would become corrupt because its entrycrc does not match the
 content. What do we do? Do we need to save a copy of the entry
 somewhere in the index file (maybe in the conflict data section), so
 that the reader can recover the index? Losing the index because of
 bugs is big deal in my opinion. pre-v5 never faces this because we
 keep the original copy til the end.

 Maybe entrycrc should not cover stat fields and statcrc. It would make
 refreshing safer. If the above happens during refresh, only statcrc is
 corrupt and we can just refresh the entry. entrycrc still says the
 other fields are good (and they are).

The original idea was to change the lock-file for partial writing to
make it work for this case.  The exact structure of the file still has
to be defined, but generally it would be done in the following steps:

  1. Write the changed entry to the lock-file
  2. Change the entry in the index
  3. If we succeed delete the lock-file (commit the transaction)

If git crashes, and leaves the index corrupted, we can recover the
information from the lock-file and write the new information to the
index file and then delete the lock-file.
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 13/22] documentation: add documentation of the index-v5 file format

2013-07-11 Thread Thomas Gummerer
Duy Nguyen pclo...@gmail.com writes:

 On Thu, Jul 11, 2013 at 6:39 PM, Thomas Gummerer t.gumme...@gmail.com wrote:
 Question about the possibility of updating index file directly. If git
 updates a few fields of an entry (but not entrycrc yet) and crashes,
 the entry would become corrupt because its entrycrc does not match the
 content. What do we do? Do we need to save a copy of the entry
 somewhere in the index file (maybe in the conflict data section), so
 that the reader can recover the index? Losing the index because of
 bugs is big deal in my opinion. pre-v5 never faces this because we
 keep the original copy til the end.

 Maybe entrycrc should not cover stat fields and statcrc. It would make
 refreshing safer. If the above happens during refresh, only statcrc is
 corrupt and we can just refresh the entry. entrycrc still says the
 other fields are good (and they are).

 The original idea was to change the lock-file for partial writing to
 make it work for this case.  The exact structure of the file still has
 to be defined, but generally it would be done in the following steps:

   1. Write the changed entry to the lock-file
   2. Change the entry in the index
   3. If we succeed delete the lock-file (commit the transaction)

 If git crashes, and leaves the index corrupted, we can recover the
 information from the lock-file and write the new information to the
 index file and then delete the lock-file.

 Ah makes sense. Still concerned about refreshing though. Updated files
 are usually few while refreshed files could be a lot more, increasing
 the cost at #1.

Any idea how common refreshing a big part of the cache is?  If it's not
to common, I'd prefer to leave the stat data and stat crc in the
entrycrc, as we can inform the user if something is wrong with the
index, be it from git failing, or from disk corruption.

On the other hand if refresh_cache is relatively common and usually
changes a big part of the index we should leave them out, as git can
still run correctly with incorrect stat data, but takes a little longer,
because it may have to check the file contents.  That will be trade-off
to make here.
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 01/19] t2104: Don't fail for index versions other than [23]

2013-07-12 Thread Thomas Gummerer
t2104 currently checks for the exact index version 2 or 3,
depending if there is a skip-worktree flag or not. Other
index versions do not use extended flags and thus cannot
be tested for version changes.

Make this test update the index to version 2 at the beginning
of the test. Testing the skip-worktree flags for the default
index format is still covered by t7011 and t7012.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 t/t2104-update-index-skip-worktree.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/t/t2104-update-index-skip-worktree.sh 
b/t/t2104-update-index-skip-worktree.sh
index 1d0879b..bd9644f 100755
--- a/t/t2104-update-index-skip-worktree.sh
+++ b/t/t2104-update-index-skip-worktree.sh
@@ -22,6 +22,7 @@ H sub/2
 EOF
 
 test_expect_success 'setup' '
+   git update-index --index-version=2 
mkdir sub 
touch ./1 ./2 sub/1 sub/2 
git add 1 2 sub/1 sub/2 
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 04/19] read-cache: Re-read index if index file changed

2013-07-12 Thread Thomas Gummerer
Add the possibility of re-reading the index file, if it changed
while reading.

The index file might change during the read, causing outdated
information to be displayed. We check if the index file changed
by using its stat data as heuristic.

Helped-by: Ramsay Jones ram...@ramsay1.demon.co.uk
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 read-cache.c | 91 +---
 1 file changed, 57 insertions(+), 34 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index 1e7ffc2..3e3a0e2 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1275,11 +1275,31 @@ int read_index(struct index_state *istate)
return read_index_from(istate, get_index_file());
 }
 
+static int index_changed(struct stat *st_old, struct stat *st_new)
+{
+   if (st_old-st_mtime != st_new-st_mtime ||
+#if !defined (__CYGWIN__)
+   st_old-st_uid   != st_new-st_uid ||
+   st_old-st_gid   != st_new-st_gid ||
+   st_old-st_ino   != st_new-st_ino ||
+#endif
+#if USE_NSEC
+   ST_MTIME_NSEC(*st_old) != ST_MTIME_NSEC(*st_new) ||
+#endif
+#if USE_STDEV
+   st_old-st_dev != st_new-st_dev ||
+#endif
+   st_old-st_size != st_new-st_size)
+   return 1;
+
+   return 0;
+}
+
 /* remember to discard_cache() before reading a different cache! */
 int read_index_from(struct index_state *istate, const char *path)
 {
-   int fd;
-   struct stat st;
+   int fd, err, i;
+   struct stat st_old, st_new;
struct cache_version_header *hdr;
void *mmap;
size_t mmap_size;
@@ -1291,41 +1311,44 @@ int read_index_from(struct index_state *istate, const 
char *path)
errno = ENOENT;
istate-timestamp.sec = 0;
istate-timestamp.nsec = 0;
+   for (i = 0; i  50; i++) {
+   err = 0;
+   fd = open(path, O_RDONLY);
+   if (fd  0) {
+   if (errno == ENOENT)
+   return 0;
+   die_errno(index file open failed);
+   }
 
-   fd = open(path, O_RDONLY);
-   if (fd  0) {
-   if (errno == ENOENT)
-   return 0;
-   die_errno(index file open failed);
+   if (fstat(fd, st_old))
+   die_errno(cannot stat the open index);
+
+   errno = EINVAL;
+   mmap_size = xsize_t(st_old.st_size);
+   mmap = xmmap(NULL, mmap_size, PROT_READ | PROT_WRITE, 
MAP_PRIVATE, fd, 0);
+   close(fd);
+   if (mmap == MAP_FAILED)
+   die_errno(unable to map index file);
+
+   hdr = mmap;
+   if (verify_hdr_version(istate, hdr, mmap_size)  0)
+   err = 1;
+
+   if (istate-ops-verify_hdr(mmap, mmap_size)  0)
+   err = 1;
+
+   if (istate-ops-read_index(istate, mmap, mmap_size)  0)
+   err = 1;
+   istate-timestamp.sec = st_old.st_mtime;
+   istate-timestamp.nsec = ST_MTIME_NSEC(st_old);
+   if (lstat(path, st_new))
+   die_errno(cannot stat the open index);
+
+   munmap(mmap, mmap_size);
+   if (!index_changed(st_old, st_new)  !err)
+   return istate-cache_nr;
}
 
-   if (fstat(fd, st))
-   die_errno(cannot stat the open index);
-
-   errno = EINVAL;
-   mmap_size = xsize_t(st.st_size);
-   mmap = xmmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 
0);
-   close(fd);
-   if (mmap == MAP_FAILED)
-   die_errno(unable to map index file);
-
-   hdr = mmap;
-   if (verify_hdr_version(istate, hdr, mmap_size)  0)
-   goto unmap;
-
-   if (istate-ops-verify_hdr(mmap, mmap_size)  0)
-   goto unmap;
-
-   if (istate-ops-read_index(istate, mmap, mmap_size)  0)
-   goto unmap;
-   istate-timestamp.sec = st.st_mtime;
-   istate-timestamp.nsec = ST_MTIME_NSEC(st);
-
-   munmap(mmap, mmap_size);
-   return istate-cache_nr;
-
-unmap:
-   munmap(mmap, mmap_size);
die(index file corrupt);
 }
 
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 03/19] read-cache: move index v2 specific functions to their own file

2013-07-12 Thread Thomas Gummerer
Move index version 2 specific functions to their own file. The non-index
specific functions will be in read-cache.c, while the index version 2
specific functions will be in read-cache-v2.c.

Helped-by: Nguyen Thai Ngoc Duy pclo...@gmail.com
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 Makefile |   2 +
 cache.h  |  16 +-
 read-cache-v2.c  | 556 +
 read-cache.c | 575 ---
 read-cache.h |  57 +
 test-index-version.c |   5 +
 6 files changed, 661 insertions(+), 550 deletions(-)
 create mode 100644 read-cache-v2.c
 create mode 100644 read-cache.h

diff --git a/Makefile b/Makefile
index 5a68fe5..73369ae 100644
--- a/Makefile
+++ b/Makefile
@@ -711,6 +711,7 @@ LIB_H += progress.h
 LIB_H += prompt.h
 LIB_H += quote.h
 LIB_H += reachable.h
+LIB_H += read-cache.h
 LIB_H += reflog-walk.h
 LIB_H += refs.h
 LIB_H += remote.h
@@ -854,6 +855,7 @@ LIB_OBJS += prompt.o
 LIB_OBJS += quote.o
 LIB_OBJS += reachable.o
 LIB_OBJS += read-cache.o
+LIB_OBJS += read-cache-v2.o
 LIB_OBJS += reflog-walk.o
 LIB_OBJS += refs.o
 LIB_OBJS += remote.o
diff --git a/cache.h b/cache.h
index 7af853b..5082b34 100644
--- a/cache.h
+++ b/cache.h
@@ -95,19 +95,8 @@ unsigned long git_deflate_bound(git_zstream *, unsigned 
long);
  */
 #define DEFAULT_GIT_PORT 9418
 
-/*
- * Basic data structures for the directory cache
- */
 
 #define CACHE_SIGNATURE 0x44495243 /* DIRC */
-struct cache_version_header {
-   unsigned int hdr_signature;
-   unsigned int hdr_version;
-};
-
-struct cache_header {
-   unsigned int hdr_entries;
-};
 
 #define INDEX_FORMAT_LB 2
 #define INDEX_FORMAT_UB 4
@@ -280,6 +269,7 @@ struct index_state {
 initialized : 1;
struct hash_table name_hash;
struct hash_table dir_hash;
+   struct index_ops *ops;
 };
 
 extern struct index_state the_index;
@@ -489,8 +479,8 @@ extern void *read_blob_data_from_index(struct index_state 
*, const char *, unsig
 #define CE_MATCH_RACY_IS_DIRTY 02
 /* do stat comparison even if CE_SKIP_WORKTREE is true */
 #define CE_MATCH_IGNORE_SKIP_WORKTREE  04
-extern int ie_match_stat(const struct index_state *, const struct cache_entry 
*, struct stat *, unsigned int);
-extern int ie_modified(const struct index_state *, const struct cache_entry *, 
struct stat *, unsigned int);
+extern int ie_match_stat(struct index_state *, const struct cache_entry *, 
struct stat *, unsigned int);
+extern int ie_modified(struct index_state *, const struct cache_entry *, 
struct stat *, unsigned int);
 
 #define PATHSPEC_ONESTAR 1 /* the pathspec pattern sastisfies GFNM_ONESTAR 
*/
 
diff --git a/read-cache-v2.c b/read-cache-v2.c
new file mode 100644
index 000..a6883c3
--- /dev/null
+++ b/read-cache-v2.c
@@ -0,0 +1,556 @@
+#include cache.h
+#include read-cache.h
+#include resolve-undo.h
+#include cache-tree.h
+#include varint.h
+
+/* Mask for the name length in ce_flags in the on-disk index */
+#define CE_NAMEMASK  (0x0fff)
+
+struct cache_header {
+   unsigned int hdr_entries;
+};
+
+/*
+ * Index File I/O
+ */
+
+/*
+ * dev/ino/uid/gid/size are also just tracked to the low 32 bits
+ * Again - this is just a (very strong in practice) heuristic that
+ * the inode hasn't changed.
+ *
+ * We save the fields in big-endian order to allow using the
+ * index file over NFS transparently.
+ */
+struct ondisk_cache_entry {
+   struct cache_time ctime;
+   struct cache_time mtime;
+   unsigned int dev;
+   unsigned int ino;
+   unsigned int mode;
+   unsigned int uid;
+   unsigned int gid;
+   unsigned int size;
+   unsigned char sha1[20];
+   unsigned short flags;
+   char name[FLEX_ARRAY]; /* more */
+};
+
+/*
+ * This struct is used when CE_EXTENDED bit is 1
+ * The struct must match ondisk_cache_entry exactly from
+ * ctime till flags
+ */
+struct ondisk_cache_entry_extended {
+   struct cache_time ctime;
+   struct cache_time mtime;
+   unsigned int dev;
+   unsigned int ino;
+   unsigned int mode;
+   unsigned int uid;
+   unsigned int gid;
+   unsigned int size;
+   unsigned char sha1[20];
+   unsigned short flags;
+   unsigned short flags2;
+   char name[FLEX_ARRAY]; /* more */
+};
+
+/* These are only used for v3 or lower */
+#define align_flex_name(STRUCT,len) ((offsetof(struct STRUCT,name) + (len) + 
8)  ~7)
+#define ondisk_cache_entry_size(len) align_flex_name(ondisk_cache_entry,len)
+#define ondisk_cache_entry_extended_size(len) 
align_flex_name(ondisk_cache_entry_extended,len)
+#define ondisk_ce_size(ce) (((ce)-ce_flags  CE_EXTENDED) ? \
+   ondisk_cache_entry_extended_size(ce_namelen(ce)) : \
+   ondisk_cache_entry_size(ce_namelen

[PATCH v2 05/19] Add documentation for the index api

2013-07-12 Thread Thomas Gummerer
Add documentation for the index reading api.  This also includes
documentation for the new api functions introduced in the next patch.

Helped-by: Nguyễn Thái Ngọc Duy pclo...@gmail.com
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 Documentation/technical/api-in-core-index.txt | 54 +--
 1 file changed, 50 insertions(+), 4 deletions(-)

diff --git a/Documentation/technical/api-in-core-index.txt 
b/Documentation/technical/api-in-core-index.txt
index adbdbf5..9b8c37c 100644
--- a/Documentation/technical/api-in-core-index.txt
+++ b/Documentation/technical/api-in-core-index.txt
@@ -1,14 +1,60 @@
 in-core index API
 =
 
+Reading API
+---
+
+`cache`::
+
+   An array of cache entries.  This is used to access the cache
+   entries directly.  Use `index_name_pos` to search for the
+   index of a specific cache entry.
+
+`read_index_filtered`::
+
+   Read a part of the index, filtered by the pathspec given in
+   the opts.  The function may load more than necessary, so the
+   caller still responsible to apply filters appropriately.  The
+   filtering is only done for performance reasons, as it's
+   possible to only read part of the index when the on-disk
+   format is index-v5.
+
+   To iterate only over the entries that match the pathspec, use
+   the for_each_index_entry function.
+
+`read_index`::
+
+   Read the whole index file from disk.
+
+`index_name_pos`::
+
+   Find a cache_entry with name in the index.  Returns pos if an
+   entry is matched exactly and -1-pos if an entry is matched
+   partially.
+   e.g.
+   index:
+   file1
+   file2
+   path/file1
+   zzz
+
+   index_name_pos(path/file1, 10) returns 2, while
+   index_name_pos(path, 4) returns -3
+
+`for_each_index_entry`::
+
+   Iterates over all cache_entries in the index filtered by
+   filter_opts in the index_state.  For each cache entry fn is
+   executed with cb_data as callback data.  From within the loop
+   do `return 0` to continue, or `return 1` to break the loop.
+
+TODO
+
 Talk about read-cache.c and cache-tree.c, things like:
 
-* cache - the_index macros
-* read_index()
 * write_index()
 * ie_match_stat() and ie_modified(); how they are different and when to
   use which.
-* index_name_pos()
 * remove_index_entry_at()
 * remove_file_from_index()
 * add_file_to_index()
@@ -18,4 +64,4 @@ Talk about read-cache.c and cache-tree.c, things like:
 * cache_tree_invalidate_path()
 * cache_tree_update()
 
-(JC, Linus)
+(JC, Linus, Thomas Gummerer)
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 06/19] read-cache: add index reading api

2013-07-12 Thread Thomas Gummerer
Add an api for access to the index file.  Currently there is only a very
basic api for accessing the index file, which only allows a full read of
the index, and lets the users of the data filter it.  The new index api
gives the users the possibility to use only part of the index and
provides functions for iterating over and accessing cache entries.

This simplifies future improvements to the in-memory format, as changes
will be concentrated on one file, instead of the whole git source code.

Helped-by: Nguyễn Thái Ngọc Duy pclo...@gmail.com
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 cache.h | 42 +-
 read-cache-v2.c | 35 +--
 read-cache.c| 44 
 read-cache.h|  8 +++-
 4 files changed, 121 insertions(+), 8 deletions(-)

diff --git a/cache.h b/cache.h
index 5082b34..d305d21 100644
--- a/cache.h
+++ b/cache.h
@@ -127,7 +127,7 @@ struct cache_entry {
unsigned int ce_flags;
unsigned int ce_namelen;
unsigned char sha1[20];
-   struct cache_entry *next;
+   struct cache_entry *next; /* used by name_hash */
char name[FLEX_ARRAY]; /* more */
 };
 
@@ -258,6 +258,29 @@ static inline unsigned int canon_mode(unsigned int mode)
 
 #define cache_entry_size(len) (offsetof(struct cache_entry,name) + (len) + 1)
 
+/*
+ * Options by which the index should be filtered when read partially.
+ *
+ * pathspec: The pathspec which the index entries have to match
+ * seen: Used to return the seen parameter from match_pathspec()
+ * max_prefix_len: The common prefix length of the pathspecs
+ *
+ * read_staged: used to indicate if the conflicted entries (entries
+ * with a stage) should be included
+ * read_cache_tree: used to indicate if the cache-tree should be read
+ * read_resolve_undo: used to indicate if the resolve undo data should
+ * be read
+ */
+struct filter_opts {
+   const struct pathspec *pathspec;
+   char *seen;
+   int max_prefix_len;
+
+   int read_staged;
+   int read_cache_tree;
+   int read_resolve_undo;
+};
+
 struct index_state {
struct cache_entry **cache;
unsigned int version;
@@ -270,6 +293,8 @@ struct index_state {
struct hash_table name_hash;
struct hash_table dir_hash;
struct index_ops *ops;
+   struct internal_ops *internal_ops;
+   struct filter_opts *filter_opts;
 };
 
 extern struct index_state the_index;
@@ -311,6 +336,12 @@ extern void free_name_hash(struct index_state *istate);
 #define unmerge_cache_entry_at(at) unmerge_index_entry_at(the_index, at)
 #define unmerge_cache(pathspec) unmerge_index(the_index, pathspec)
 #define read_blob_data_from_cache(path, sz) 
read_blob_data_from_index(the_index, (path), (sz))
+
+/* index api */
+#define read_cache_filtered(opts) read_index_filtered(the_index, (opts))
+#define read_cache_filtered_from(path, opts) 
read_index_filtered_from(the_index, (path), (opts))
+#define for_each_cache_entry(fn, cb_data) \
+   for_each_index_entry(the_index, (fn), (cb_data))
 #endif
 
 enum object_type {
@@ -438,6 +469,15 @@ extern int init_db(const char *template_dir, unsigned int 
flags);
} \
} while (0)
 
+/* index api */
+extern int read_index_filtered(struct index_state *, struct filter_opts *opts);
+extern int read_index_filtered_from(struct index_state *, const char *path, 
struct filter_opts *opts);
+
+typedef int each_cache_entry_fn(struct cache_entry *ce, void *);
+extern int for_each_index_entry(struct index_state *istate,
+   each_cache_entry_fn, void *);
+
+
 /* Initialize and use the cache information */
 extern int read_index(struct index_state *);
 extern int read_index_preload(struct index_state *, const char **pathspec);
diff --git a/read-cache-v2.c b/read-cache-v2.c
index a6883c3..51b618f 100644
--- a/read-cache-v2.c
+++ b/read-cache-v2.c
@@ -3,6 +3,7 @@
 #include resolve-undo.h
 #include cache-tree.h
 #include varint.h
+#include dir.h
 
 /* Mask for the name length in ce_flags in the on-disk index */
 #define CE_NAMEMASK  (0x0fff)
@@ -207,8 +208,14 @@ static int read_index_extension(struct index_state *istate,
return 0;
 }
 
+/*
+ * The performance is the same if we read the whole index or only
+ * part of it, therefore we always read the whole index to avoid
+ * having to re-read it later.  The filter_opts will determine
+ * what part of the index is used when retrieving the cache-entries.
+ */
 static int read_index_v2(struct index_state *istate, void *mmap,
-unsigned long mmap_size)
+unsigned long mmap_size, struct filter_opts *opts)
 {
int i;
unsigned long src_offset;
@@ -238,7 +245,6 @@ static int read_index_v2(struct index_state *istate, void 
*mmap,
disk_ce = (struct ondisk_cache_entry *)((char *)mmap + 
src_offset);
ce

[PATCH v2 07/19] make sure partially read index is not changed

2013-07-12 Thread Thomas Gummerer
A partially read index file currently cannot be written to disk.  Make
sure that never happens, by erroring out when a caller tries to change a
partially read index.  The caller is responsible for reading the whole
index when it's trying to change it later.

Forcing the caller to load the right part of the index file instead of
re-reading it when changing it, gives a bit of a performance advantage,
by avoiding to read parts of the index twice.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 builtin/update-index.c |  4 
 cache.h|  1 +
 read-cache-v2.c|  2 ++
 read-cache.c   | 10 ++
 4 files changed, 17 insertions(+)

diff --git a/builtin/update-index.c b/builtin/update-index.c
index 5c7762e..4c6e3a6 100644
--- a/builtin/update-index.c
+++ b/builtin/update-index.c
@@ -49,6 +49,8 @@ static int mark_ce_flags(const char *path, int flag, int mark)
int namelen = strlen(path);
int pos = cache_name_pos(path, namelen);
if (0 = pos) {
+   if (active_cache_partially_read)
+   die(BUG: Can't change a partially read index);
if (mark)
active_cache[pos]-ce_flags |= flag;
else
@@ -253,6 +255,8 @@ static void chmod_path(int flip, const char *path)
pos = cache_name_pos(path, strlen(path));
if (pos  0)
goto fail;
+   if (active_cache_partially_read)
+   die(BUG: Can't change a partially read index);
ce = active_cache[pos];
mode = ce-ce_mode;
if (!S_ISREG(mode))
diff --git a/cache.h b/cache.h
index d305d21..455b772 100644
--- a/cache.h
+++ b/cache.h
@@ -311,6 +311,7 @@ extern void free_name_hash(struct index_state *istate);
 #define active_alloc (the_index.cache_alloc)
 #define active_cache_changed (the_index.cache_changed)
 #define active_cache_tree (the_index.cache_tree)
+#define active_cache_partially_read (the_index.filter_opts)
 
 #define read_cache() read_index(the_index)
 #define read_cache_from(path) read_index_from(the_index, (path))
diff --git a/read-cache-v2.c b/read-cache-v2.c
index 51b618f..f3c0685 100644
--- a/read-cache-v2.c
+++ b/read-cache-v2.c
@@ -479,6 +479,8 @@ static int write_index_v2(struct index_state *istate, int 
newfd)
struct stat st;
struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 
+   if (istate-filter_opts)
+   die(BUG: index: cannot write a partially read index);
for (i = removed = extended = 0; i  entries; i++) {
if (cache[i]-ce_flags  CE_REMOVE)
removed++;
diff --git a/read-cache.c b/read-cache.c
index 9053d43..ab716ed 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -30,6 +30,8 @@ static void replace_index_entry(struct index_state *istate, 
int nr, struct cache
 {
struct cache_entry *old = istate-cache[nr];
 
+   if (istate-filter_opts)
+   die(BUG: Can't change a partially read index);
remove_name_hash(istate, old);
set_index_entry(istate, nr, ce);
istate-cache_changed = 1;
@@ -467,6 +469,8 @@ int remove_index_entry_at(struct index_state *istate, int 
pos)
 {
struct cache_entry *ce = istate-cache[pos];
 
+   if (istate-filter_opts)
+   die(BUG: Can't change a partially read index);
record_resolve_undo(istate, ce);
remove_name_hash(istate, ce);
istate-cache_changed = 1;
@@ -973,6 +977,8 @@ int add_index_entry(struct index_state *istate, struct 
cache_entry *ce, int opti
 {
int pos;
 
+   if (istate-filter_opts)
+   die(BUG: Can't change a partially read index);
if (option  ADD_CACHE_JUST_APPEND)
pos = istate-cache_nr;
else {
@@ -1173,6 +1179,8 @@ int refresh_index(struct index_state *istate, unsigned 
int flags, const char **p
/* If we are doing --really-refresh that
 * means the index is not valid anymore.
 */
+   if (istate-filter_opts)
+   die(BUG: Can't change a partially read 
index);
ce-ce_flags = ~CE_VALID;
istate-cache_changed = 1;
}
@@ -1331,6 +1339,8 @@ int read_index_filtered_from(struct index_state *istate, 
const char *path,
void *mmap;
size_t mmap_size;
 
+   if (istate-filter_opts)
+   die(BUG: Can't re-read partially read index);
errno = EBUSY;
if (istate-initialized)
return istate-cache_nr;
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 08/19] grep.c: Use index api

2013-07-12 Thread Thomas Gummerer
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 builtin/grep.c | 71 ++
 1 file changed, 37 insertions(+), 34 deletions(-)

diff --git a/builtin/grep.c b/builtin/grep.c
index a419cda..8b02644 100644
--- a/builtin/grep.c
+++ b/builtin/grep.c
@@ -368,41 +368,33 @@ static void run_pager(struct grep_opt *opt, const char 
*prefix)
free(argv);
 }
 
-static int grep_cache(struct grep_opt *opt, const struct pathspec *pathspec, 
int cached)
+struct grep_opts {
+   struct grep_opt *opt;
+   const struct pathspec *pathspec;
+   int cached;
+   int hit;
+};
+
+static int grep_cache(struct cache_entry *ce, void *cb_data)
 {
-   int hit = 0;
-   int nr;
-   read_cache();
+   struct grep_opts *opts = cb_data;
 
-   for (nr = 0; nr  active_nr; nr++) {
-   struct cache_entry *ce = active_cache[nr];
-   if (!S_ISREG(ce-ce_mode))
-   continue;
-   if (!match_pathspec_depth(pathspec, ce-name, ce_namelen(ce), 
0, NULL))
-   continue;
-   /*
-* If CE_VALID is on, we assume worktree file and its cache 
entry
-* are identical, even if worktree file has been modified, so 
use
-* cache version instead
-*/
-   if (cached || (ce-ce_flags  CE_VALID) || 
ce_skip_worktree(ce)) {
-   if (ce_stage(ce))
-   continue;
-   hit |= grep_sha1(opt, ce-sha1, ce-name, 0, ce-name);
-   }
-   else
-   hit |= grep_file(opt, ce-name);
-   if (ce_stage(ce)) {
-   do {
-   nr++;
-   } while (nr  active_nr 
-!strcmp(ce-name, active_cache[nr]-name));
-   nr--; /* compensate for loop control */
-   }
-   if (hit  opt-status_only)
-   break;
-   }
-   return hit;
+   if (!S_ISREG(ce-ce_mode))
+   return 0;
+   if (!match_pathspec_depth(opts-pathspec, ce-name, ce_namelen(ce), 0, 
NULL))
+   return 0;
+   /*
+* If CE_VALID is on, we assume worktree file and its cache entry
+* are identical, even if worktree file has been modified, so use
+* cache version instead
+*/
+   if (opts-cached || (ce-ce_flags  CE_VALID) || ce_skip_worktree(ce))
+   opts-hit |= grep_sha1(opts-opt, ce-sha1, ce-name, 0, 
ce-name);
+   else
+   opts-hit |= grep_file(opts-opt, ce-name);
+   if (opts-hit  opts-opt-status_only)
+   return 1;
+   return 0;
 }
 
 static int grep_tree(struct grep_opt *opt, const struct pathspec *pathspec,
@@ -895,10 +887,21 @@ int cmd_grep(int argc, const char **argv, const char 
*prefix)
} else if (0 = opt_exclude) {
die(_(--[no-]exclude-standard cannot be used for tracked 
contents.));
} else if (!list.nr) {
+   struct grep_opts opts;
+   struct filter_opts *filter_opts = xmalloc(sizeof(*filter_opts));
+
if (!cached)
setup_work_tree();
 
-   hit = grep_cache(opt, pathspec, cached);
+   memset(filter_opts, 0, sizeof(*filter_opts));
+   filter_opts-pathspec = pathspec;
+   opts.opt = opt;
+   opts.pathspec = pathspec;
+   opts.cached = cached;
+   opts.hit = 0;
+   read_cache_filtered(filter_opts);
+   for_each_cache_entry(grep_cache, opts);
+   hit = opts.hit;
} else {
if (cached)
die(_(both --cached and trees are given.));
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 09/19] ls-files.c: use index api

2013-07-12 Thread Thomas Gummerer
Use the index api to read only part of the index, if the on-disk version
of the index is index-v5.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 builtin/ls-files.c | 31 ---
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/builtin/ls-files.c b/builtin/ls-files.c
index 08d9786..80cc398 100644
--- a/builtin/ls-files.c
+++ b/builtin/ls-files.c
@@ -31,6 +31,7 @@ static const char *prefix;
 static int max_prefix_len;
 static int prefix_len;
 static const char **pathspec;
+static struct pathspec pathspec_struct;
 static int error_unmatch;
 static char *ps_matched;
 static const char *with_tree;
@@ -457,6 +458,7 @@ int cmd_ls_files(int argc, const char **argv, const char 
*cmd_prefix)
struct dir_struct dir;
struct exclude_list *el;
struct string_list exclude_list = STRING_LIST_INIT_NODUP;
+   struct filter_opts *opts = xmalloc(sizeof(*opts));
struct option builtin_ls_files_options[] = {
{ OPTION_CALLBACK, 'z', NULL, NULL, NULL,
N_(paths are separated with NUL character),
@@ -522,9 +524,6 @@ int cmd_ls_files(int argc, const char **argv, const char 
*cmd_prefix)
prefix_len = strlen(prefix);
git_config(git_default_config, NULL);
 
-   if (read_cache()  0)
-   die(index file corrupt);
-
argc = parse_options(argc, argv, prefix, builtin_ls_files_options,
ls_files_usage, 0);
el = add_exclude_list(dir, EXC_CMDL, --exclude option);
@@ -556,14 +555,7 @@ int cmd_ls_files(int argc, const char **argv, const char 
*cmd_prefix)
setup_work_tree();
 
pathspec = get_pathspec(prefix, argv);
-
-   /* be nice with submodule paths ending in a slash */
-   if (pathspec)
-   strip_trailing_slash_from_submodules();
-
-   /* Find common prefix for all pathspec's */
-   max_prefix = common_prefix(pathspec);
-   max_prefix_len = max_prefix ? strlen(max_prefix) : 0;
+   init_pathspec(pathspec_struct, pathspec);
 
/* Treat unmatching pathspec elements as errors */
if (pathspec  error_unmatch) {
@@ -573,6 +565,23 @@ int cmd_ls_files(int argc, const char **argv, const char 
*cmd_prefix)
ps_matched = xcalloc(1, num);
}
 
+   if (!with_tree) {
+   memset(opts, 0, sizeof(*opts));
+   opts-pathspec = pathspec_struct;
+   opts-read_staged = 1;
+   if (show_resolve_undo)
+   opts-read_resolve_undo = 1;
+   read_cache_filtered(opts);
+   } else {
+   read_cache();
+   }
+   /* be nice with submodule paths ending in a slash */
+   if (pathspec)
+   strip_trailing_slash_from_submodules();
+
+   max_prefix = common_prefix(pathspec);
+   max_prefix_len = max_prefix ? strlen(max_prefix) : 0;
+
if ((dir.flags  DIR_SHOW_IGNORED)  !exc_given)
die(ls-files --ignored needs some exclude pattern);
 
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 11/19] read-cache: make in-memory format aware of stat_crc

2013-07-12 Thread Thomas Gummerer
Make the in-memory format aware of the stat_crc used by index-v5.
It is simply ignored by index version prior to v5.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 cache.h  |  1 +
 read-cache.c | 25 +
 2 files changed, 26 insertions(+)

diff --git a/cache.h b/cache.h
index 455b772..2097105 100644
--- a/cache.h
+++ b/cache.h
@@ -127,6 +127,7 @@ struct cache_entry {
unsigned int ce_flags;
unsigned int ce_namelen;
unsigned char sha1[20];
+   uint32_t ce_stat_crc;
struct cache_entry *next; /* used by name_hash */
char name[FLEX_ARRAY]; /* more */
 };
diff --git a/read-cache.c b/read-cache.c
index ab716ed..9bfbb4f 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -108,6 +108,29 @@ int match_stat_data(const struct stat_data *sd, struct 
stat *st)
return changed;
 }
 
+static uint32_t calculate_stat_crc(struct cache_entry *ce)
+{
+   unsigned int ctimens = 0;
+   uint32_t stat, stat_crc;
+
+   stat = htonl(ce-ce_stat_data.sd_ctime.sec);
+   stat_crc = crc32(0, (Bytef*)stat, 4);
+#ifdef USE_NSEC
+   ctimens = ce-ce_stat_data.sd_ctime.nsec;
+#endif
+   stat = htonl(ctimens);
+   stat_crc = crc32(stat_crc, (Bytef*)stat, 4);
+   stat = htonl(ce-ce_stat_data.sd_ino);
+   stat_crc = crc32(stat_crc, (Bytef*)stat, 4);
+   stat = htonl(ce-ce_stat_data.sd_dev);
+   stat_crc = crc32(stat_crc, (Bytef*)stat, 4);
+   stat = htonl(ce-ce_stat_data.sd_uid);
+   stat_crc = crc32(stat_crc, (Bytef*)stat, 4);
+   stat = htonl(ce-ce_stat_data.sd_gid);
+   stat_crc = crc32(stat_crc, (Bytef*)stat, 4);
+   return stat_crc;
+}
+
 /*
  * This only updates the non-critical parts of the directory
  * cache, ie the parts that aren't tracked by GIT, and only used
@@ -122,6 +145,8 @@ void fill_stat_cache_info(struct cache_entry *ce, struct 
stat *st)
 
if (S_ISREG(st-st_mode))
ce_mark_uptodate(ce);
+
+   ce-ce_stat_crc = calculate_stat_crc(ce);
 }
 
 static int ce_compare_data(const struct cache_entry *ce, struct stat *st)
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 10/19] documentation: add documentation of the index-v5 file format

2013-07-12 Thread Thomas Gummerer
Add a documentation of the index file format version 5 to
Documentation/technical.

Helped-by: Michael Haggerty mhag...@alum.mit.edu
Helped-by: Junio C Hamano gits...@pobox.com
Helped-by: Thomas Rast tr...@student.ethz.ch
Helped-by: Nguyen Thai Ngoc Duy pclo...@gmail.com
Helped-by: Robin Rosenberg robin.rosenb...@dewire.com
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 Documentation/technical/index-file-format-v5.txt | 296 +++
 1 file changed, 296 insertions(+)
 create mode 100644 Documentation/technical/index-file-format-v5.txt

diff --git a/Documentation/technical/index-file-format-v5.txt 
b/Documentation/technical/index-file-format-v5.txt
new file mode 100644
index 000..4213087
--- /dev/null
+++ b/Documentation/technical/index-file-format-v5.txt
@@ -0,0 +1,296 @@
+GIT index format
+
+
+== The git index
+
+   The git index file (.git/index) documents the status of the files
+ in the git staging area.
+
+   The staging area is used for preparing commits, merging, etc.
+
+== The git index file format
+
+   All binary numbers are in network byte order. Version 5 is described
+ here. The index file consists of various sections. They appear in
+ the following order in the file.
+
+   - header: the description of the index format, including it's signature,
+ version and various other fields that are used internally.
+
+   - diroffsets (ndir entries of direcotry offset): A 4-byte offset
+   relative to the beginning of the direntries block (see below)
+   for each of the ndir directories in the index, sorted by pathname
+   (of the directory it's pointing to). [1]
+
+   - direntries (ndir entries of directory offset): A directory entry
+   for each of the ndir directories in the index, sorted by pathname
+   (see below). [2]
+
+   - fileoffsets (nfile entries of file offset): A 4-byte offset
+   relative to the beginning of the fileentries block (see below)
+   for each of the nfile files in the index. [1]
+
+   - fileentries (nfile entries of file entry): A file entry for
+   each of the nfile files in the index (see below).
+
+   - crdata: A number of entries for conflicted data/resolved conflicts
+   (see below).
+
+   - Extensions (Currently none, see below in the future)
+
+ Extensions are identified by signature. Optional extensions can
+ be ignored if GIT does not understand them.
+
+ GIT supports an arbitrary number of extension, but currently none
+ is implemented. [3]
+
+ extsig (32-bits): extension signature. If the first byte is 'A'..'Z'
+ the extension is optional and can be ignored.
+
+ extsize (32-bits): size of the extension, excluding the header
+   (extsig, extsize, extchecksum).
+
+ extchecksum (32-bits): crc32 checksum of the extension signature
+   and size.
+
+- Extension data.
+
+== Header
+   sig (32-bits): Signature:
+ The signature is { 'D', 'I', 'R', 'C' } (stands for dircache)
+
+   vnr (32-bits): Version number:
+ The current supported versions are 2, 3, 4 and 5.
+
+   ndir (32-bits): number of directories in the index.
+
+   nfile (32-bits): number of file entries in the index.
+
+   fblockoffset (32-bits): offset to the file block, relative to the
+ beginning of the file.
+
+   - Offset to the extensions.
+
+ nextensions (32-bits): number of extensions.
+
+ extoffset (32-bits): offset to the extension. (Possibly none, as
+   many as indicated in the 4-byte number of extensions)
+
+   headercrc (32-bits): crc checksum including the header and the
+ offsets to the extensions.
+
+
+== Directory offsets (diroffsets)
+
+  diroffset (32-bits): offset to the directory relative to the beginning
+of the index file. There are ndir + 1 offsets in the diroffset table,
+the last is pointing to the end of the last direntry. With this last
+entry, we are able to replace the strlen of when reading the directory
+name, by calculating it from diroffset[n+1]-diroffset[n]-61.  61 is the
+size of the directory data, which follows each each directory + the
+crc sum + the NUL byte.
+
+  This part is needed for making the directory entries bisectable and
+thus allowing a binary search.
+
+== Directory entry (direntries)
+
+  Directory entries are sorted in lexicographic order by the name
+of their path starting with the root.
+
+  pathname (variable length, nul terminated): relative to top level
+directory (without the leading slash). '/' is used as path
+separator. A string of length 0 ('') indicates the root directory.
+The special path components ., and .. (without quotes) are
+disallowed. The path also includes a trailing slash. [9]
+
+  foffset (32-bits): offset to the lexicographically first file in
+the file offsets (fileoffsets), relative to the beginning of
+the fileoffset block.
+
+  cr (32-bits): offset to conflicted/resolved data at the end of the
+index. 0

[PATCH v2 14/19] read-cache: read cache-tree in index-v5

2013-07-12 Thread Thomas Gummerer
Since the cache-tree data is saved as part of the directory data,
we already read it at the beginning of the index. The cache-tree
is only converted from this directory data.

The cache-tree data is arranged in a tree, with the children sorted by
pathlen at each node, while the ondisk format is sorted lexically.
So we have to rebuild this format from the on-disk directory list.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 cache-tree.c|   2 +-
 cache-tree.h|   6 
 read-cache-v5.c | 100 
 3 files changed, 107 insertions(+), 1 deletion(-)

diff --git a/cache-tree.c b/cache-tree.c
index 37e4d00..f4b0917 100644
--- a/cache-tree.c
+++ b/cache-tree.c
@@ -31,7 +31,7 @@ void cache_tree_free(struct cache_tree **it_p)
*it_p = NULL;
 }
 
-static int subtree_name_cmp(const char *one, int onelen,
+int subtree_name_cmp(const char *one, int onelen,
const char *two, int twolen)
 {
if (onelen  twolen)
diff --git a/cache-tree.h b/cache-tree.h
index 55d0f59..9aac493 100644
--- a/cache-tree.h
+++ b/cache-tree.h
@@ -21,10 +21,16 @@ struct cache_tree {
struct cache_tree_sub **down;
 };
 
+struct directory_queue {
+   struct directory_queue *down;
+   struct directory_entry *de;
+};
+
 struct cache_tree *cache_tree(void);
 void cache_tree_free(struct cache_tree **);
 void cache_tree_invalidate_path(struct cache_tree *, const char *);
 struct cache_tree_sub *cache_tree_sub(struct cache_tree *, const char *);
+int subtree_name_cmp(const char *, int, const char *, int);
 
 void cache_tree_write(struct strbuf *, struct cache_tree *root);
 struct cache_tree *cache_tree_read(const char *buffer, unsigned long size);
diff --git a/read-cache-v5.c b/read-cache-v5.c
index 853b97d..0b9c320 100644
--- a/read-cache-v5.c
+++ b/read-cache-v5.c
@@ -448,6 +448,103 @@ static int read_conflicts(struct conflict_entry **head,
return 0;
 }
 
+static struct cache_tree *convert_one(struct directory_queue *queue, int dirnr)
+{
+   int i, subtree_nr;
+   struct cache_tree *it;
+   struct directory_queue *down;
+
+   it = cache_tree();
+   it-entry_count = queue[dirnr].de-de_nentries;
+   subtree_nr = queue[dirnr].de-de_nsubtrees;
+   if (0 = it-entry_count)
+   hashcpy(it-sha1, queue[dirnr].de-sha1);
+
+   /*
+* Just a heuristic -- we do not add directories that often but
+* we do not want to have to extend it immediately when we do,
+* hence +2.
+*/
+   it-subtree_alloc = subtree_nr + 2;
+   it-down = xcalloc(it-subtree_alloc, sizeof(struct cache_tree_sub *));
+   down = queue[dirnr].down;
+   for (i = 0; i  subtree_nr; i++) {
+   struct cache_tree *sub;
+   struct cache_tree_sub *subtree;
+   char *buf, *name;
+
+   name = ;
+   buf = strtok(down[i].de-pathname, /);
+   while (buf) {
+   name = buf;
+   buf = strtok(NULL, /);
+   }
+   sub = convert_one(down, i);
+   if(!sub)
+   goto free_return;
+   subtree = cache_tree_sub(it, name);
+   subtree-cache_tree = sub;
+   }
+   if (subtree_nr != it-subtree_nr)
+   die(cache-tree: internal error);
+   return it;
+ free_return:
+   cache_tree_free(it);
+   return NULL;
+}
+
+static int compare_cache_tree_elements(const void *a, const void *b)
+{
+   const struct directory_entry *de1, *de2;
+
+   de1 = ((const struct directory_queue *)a)-de;
+   de2 = ((const struct directory_queue *)b)-de;
+   return subtree_name_cmp(de1-pathname, de1-de_pathlen,
+   de2-pathname, de2-de_pathlen);
+}
+
+static struct directory_entry *sort_directories(struct directory_entry *de,
+   struct directory_queue *queue)
+{
+   int i, nsubtrees;
+
+   nsubtrees = de-de_nsubtrees;
+   for (i = 0; i  nsubtrees; i++) {
+   struct directory_entry *new_de;
+   de = de-next;
+   new_de = xmalloc(directory_entry_size(de-de_pathlen));
+   memcpy(new_de, de, directory_entry_size(de-de_pathlen));
+   queue[i].de = new_de;
+   if (de-de_nsubtrees) {
+   queue[i].down = xcalloc(de-de_nsubtrees,
+   sizeof(struct directory_queue));
+   de = sort_directories(de,
+   queue[i].down);
+   }
+   }
+   qsort(queue, nsubtrees, sizeof(struct directory_queue),
+   compare_cache_tree_elements);
+   return de;
+}
+
+/*
+ * This function modifies the directory argument that is given to it.
+ * Don't use it if the directory entries are still needed after.
+ */
+static struct

[PATCH v2 13/19] read-cache: read resolve-undo data

2013-07-12 Thread Thomas Gummerer
Make git read the resolve-undo data from the index.

Since the resolve-undo data is joined with the conflicts in
the ondisk format of the index file version 5, conflicts and
resolved data is read at the same time, and the resolve-undo
data is then converted to the in-memory format.

Helped-by: Thomas Rast tr...@student.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 read-cache-v5.c | 39 +++
 1 file changed, 39 insertions(+)

diff --git a/read-cache-v5.c b/read-cache-v5.c
index 00112ea..853b97d 100644
--- a/read-cache-v5.c
+++ b/read-cache-v5.c
@@ -1,5 +1,6 @@
 #include cache.h
 #include read-cache.h
+#include string-list.h
 #include resolve-undo.h
 #include cache-tree.h
 #include dir.h
@@ -447,6 +448,43 @@ static int read_conflicts(struct conflict_entry **head,
return 0;
 }
 
+static void resolve_undo_convert_v5(struct index_state *istate,
+   struct conflict_entry *conflict)
+{
+   int i;
+
+   while (conflict) {
+   struct string_list_item *lost;
+   struct resolve_undo_info *ui;
+   struct conflict_part *cp;
+
+   if (conflict-entries 
+   (conflict-entries-flags  CONFLICT_CONFLICTED) != 0) {
+   conflict = conflict-next;
+   continue;
+   }
+   if (!istate-resolve_undo) {
+   istate-resolve_undo = xcalloc(1, sizeof(struct 
string_list));
+   istate-resolve_undo-strdup_strings = 1;
+   }
+
+   lost = string_list_insert(istate-resolve_undo, conflict-name);
+   if (!lost-util)
+   lost-util = xcalloc(1, sizeof(*ui));
+   ui = lost-util;
+
+   cp = conflict-entries;
+   for (i = 0; i  3; i++)
+   ui-mode[i] = 0;
+   while (cp) {
+   ui-mode[conflict_stage(cp) - 1] = cp-entry_mode;
+   hashcpy(ui-sha1[conflict_stage(cp) - 1], cp-sha1);
+   cp = cp-next;
+   }
+   conflict = conflict-next;
+   }
+}
+
 static int read_entries(struct index_state *istate, struct directory_entry 
**de,
unsigned int *entry_offset, void **mmap,
unsigned long mmap_size, unsigned int *nr,
@@ -460,6 +498,7 @@ static int read_entries(struct index_state *istate, struct 
directory_entry **de,
conflict_queue = NULL;
if (read_conflicts(conflict_queue, *de, mmap, mmap_size)  0)
return -1;
+   resolve_undo_convert_v5(istate, conflict_queue);
for (i = 0; i  (*de)-de_nfiles; i++) {
if (read_entry(ce,
   *de,
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 12/19] read-cache: read index-v5

2013-07-12 Thread Thomas Gummerer
Make git read the index file version 5 without complaining.

This version of the reader doesn't read neither the cache-tree
nor the resolve undo data, but doesn't choke on an index that
includes such data.

Helped-by: Junio C Hamano gits...@pobox.com
Helped-by: Nguyen Thai Ngoc Duy pclo...@gmail.com
Helped-by: Thomas Rast tr...@student.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 Makefile|   1 +
 cache.h |  75 ++-
 read-cache-v5.c | 638 
 read-cache.h|   1 +
 4 files changed, 714 insertions(+), 1 deletion(-)
 create mode 100644 read-cache-v5.c

diff --git a/Makefile b/Makefile
index 73369ae..80e35f5 100644
--- a/Makefile
+++ b/Makefile
@@ -856,6 +856,7 @@ LIB_OBJS += quote.o
 LIB_OBJS += reachable.o
 LIB_OBJS += read-cache.o
 LIB_OBJS += read-cache-v2.o
+LIB_OBJS += read-cache-v5.o
 LIB_OBJS += reflog-walk.o
 LIB_OBJS += refs.o
 LIB_OBJS += remote.o
diff --git a/cache.h b/cache.h
index 2097105..1e5cc77 100644
--- a/cache.h
+++ b/cache.h
@@ -99,7 +99,7 @@ unsigned long git_deflate_bound(git_zstream *, unsigned long);
 #define CACHE_SIGNATURE 0x44495243 /* DIRC */
 
 #define INDEX_FORMAT_LB 2
-#define INDEX_FORMAT_UB 4
+#define INDEX_FORMAT_UB 5
 
 /*
  * The cache_time is just the low 32 bits of the
@@ -121,6 +121,15 @@ struct stat_data {
unsigned int sd_size;
 };
 
+/*
+ * The *next pointer is used in read_entries_v5 for holding
+ * all the elements of a directory, and points to the next
+ * cache_entry in a directory.
+ *
+ * It is reset by the add_name_hash call in set_index_entry
+ * to set it to point to the next cache_entry in the
+ * correct in-memory format ordering.
+ */
 struct cache_entry {
struct stat_data ce_stat_data;
unsigned int ce_mode;
@@ -132,11 +141,59 @@ struct cache_entry {
char name[FLEX_ARRAY]; /* more */
 };
 
+struct directory_entry {
+   struct directory_entry *next;
+   struct directory_entry *next_hash;
+   struct cache_entry *ce;
+   struct cache_entry *ce_last;
+   struct conflict_entry *conflict;
+   struct conflict_entry *conflict_last;
+   unsigned int conflict_size;
+   unsigned int de_foffset;
+   unsigned int de_cr;
+   unsigned int de_ncr;
+   unsigned int de_nsubtrees;
+   unsigned int de_nfiles;
+   unsigned int de_nentries;
+   unsigned char sha1[20];
+   unsigned short de_flags;
+   unsigned int de_pathlen;
+   char pathname[FLEX_ARRAY];
+};
+
+struct conflict_part {
+   struct conflict_part *next;
+   unsigned short flags;
+   unsigned short entry_mode;
+   unsigned char sha1[20];
+};
+
+struct conflict_entry {
+   struct conflict_entry *next;
+   unsigned int nfileconflicts;
+   struct conflict_part *entries;
+   unsigned int namelen;
+   unsigned int pathlen;
+   char name[FLEX_ARRAY];
+};
+
+struct ondisk_conflict_part {
+   unsigned short flags;
+   unsigned short entry_mode;
+   unsigned char sha1[20];
+};
+
+#define CE_NAMEMASK  (0x0fff)
 #define CE_STAGEMASK (0x3000)
 #define CE_EXTENDED  (0x4000)
 #define CE_VALID (0x8000)
+#define CE_SMUDGED   (0x0400) /* index v5 only flag */
 #define CE_STAGESHIFT 12
 
+#define CONFLICT_CONFLICTED (0x8000)
+#define CONFLICT_STAGESHIFT 13
+#define CONFLICT_STAGEMASK (0x6000)
+
 /*
  * Range 0x in ce_flags is divided into
  * two parts: in-memory flags and on-disk ones.
@@ -173,6 +230,18 @@ struct cache_entry {
 #define CE_EXTENDED_FLAGS (CE_INTENT_TO_ADD | CE_SKIP_WORKTREE)
 
 /*
+ * Representation of the extended on-disk flags in the v5 format.
+ * They must not collide with the ordinary on-disk flags, and need to
+ * fit in 16 bits.  Note however that v5 does not save the name
+ * length.
+ */
+#define CE_INTENT_TO_ADD_V5  (0x4000)
+#define CE_SKIP_WORKTREE_V5  (0x0800)
+#if (CE_VALID|CE_STAGEMASK)  (CE_INTENTTOADD_V5|CE_SKIPWORKTREE_V5)
+#error v5 on-disk flags collide with ordinary on-disk flags
+#endif
+
+/*
  * Safeguard to avoid saving wrong flags:
  *  - CE_EXTENDED2 won't get saved until its semantic is known
  *  - Bits in 0x have been saved in ce_flags already
@@ -211,6 +280,8 @@ static inline unsigned create_ce_flags(unsigned stage)
 #define ce_skip_worktree(ce) ((ce)-ce_flags  CE_SKIP_WORKTREE)
 #define ce_mark_uptodate(ce) ((ce)-ce_flags |= CE_UPTODATE)
 
+#define conflict_stage(c) ((CONFLICT_STAGEMASK  (c)-flags)  
CONFLICT_STAGESHIFT)
+
 #define ce_permissions(mode) (((mode)  0100) ? 0755 : 0644)
 static inline unsigned int create_ce_mode(unsigned int mode)
 {
@@ -258,6 +329,8 @@ static inline unsigned int canon_mode(unsigned int mode)
 }
 
 #define cache_entry_size(len) (offsetof(struct cache_entry,name) + (len) + 1)
+#define directory_entry_size(len) (offsetof(struct directory_entry,pathname) + 
(len) + 1)
+#define conflict_entry_size(len) (offsetof(struct conflict_entry,name) + (len) 
+ 1)
 
 /*
  * Options by which the index

[PATCH v2 16/19] read-cache: write index-v5 cache-tree data

2013-07-12 Thread Thomas Gummerer
Write the cache-tree data for the index version 5 file format. The
in-memory cache-tree data is converted to the ondisk format, by adding
it to the directory entries, that were compiled from the cache-entries
in the step before.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 read-cache-v5.c | 53 +
 1 file changed, 53 insertions(+)

diff --git a/read-cache-v5.c b/read-cache-v5.c
index 33667d7..cd819b4 100644
--- a/read-cache-v5.c
+++ b/read-cache-v5.c
@@ -941,6 +941,57 @@ static struct conflict_entry 
*create_conflict_entry_from_ce(struct cache_entry *
return create_new_conflict(ce-name, ce_namelen(ce), pathlen);
 }
 
+static void convert_one_to_ondisk_v5(struct hash_table *table, struct 
cache_tree *it,
+   const char *path, int pathlen, uint32_t crc)
+{
+   int i;
+   struct directory_entry *found, *search;
+
+   crc = crc32(crc, (Bytef*)path, pathlen);
+   found = lookup_hash(crc, table);
+   search = found;
+   while (search  strcmp(path, search-pathname + search-de_pathlen - 
strlen(path)) != 0)
+   search = search-next_hash;
+   if (!search)
+   return;
+   /*
+* The number of subtrees is already calculated by
+* compile_directory_data, therefore we only need to
+* add the entry_count
+*/
+   search-de_nentries = it-entry_count;
+   if (0 = it-entry_count)
+   hashcpy(search-sha1, it-sha1);
+   if (strcmp(path, ) != 0)
+   crc = crc32(crc, (Bytef*)/, 1);
+
+#if DEBUG
+   if (0 = it-entry_count)
+   fprintf(stderr, cache-tree %.*s (%d ent, %d subtree) %s\n,
+   pathlen, path, it-entry_count, it-subtree_nr,
+   sha1_to_hex(it-sha1));
+   else
+   fprintf(stderr, cache-tree %.*s (%d subtree) invalid\n,
+   pathlen, path, it-subtree_nr);
+#endif
+
+   for (i = 0; i  it-subtree_nr; i++) {
+   struct cache_tree_sub *down = it-down[i];
+   if (i) {
+   struct cache_tree_sub *prev = it-down[i-1];
+   if (subtree_name_cmp(down-name, down-namelen,
+prev-name, prev-namelen) = 0)
+   die(fatal - unsorted cache subtree);
+   }
+   convert_one_to_ondisk_v5(table, down-cache_tree, down-name, 
down-namelen, crc);
+   }
+}
+
+static void cache_tree_to_ondisk_v5(struct hash_table *table, struct 
cache_tree *root)
+{
+   convert_one_to_ondisk_v5(table, root, , 0, 0);
+}
+
 static struct directory_entry *compile_directory_data(struct index_state 
*istate,
int nfile,
unsigned int *ndir,
@@ -1046,6 +1097,8 @@ static struct directory_entry 
*compile_directory_data(struct index_state *istate
previous_entry-next = no_subtrees;
}
}
+   if (istate-cache_tree)
+   cache_tree_to_ondisk_v5(table, istate-cache_tree);
return de;
 }
 
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 18/19] update-index.c: rewrite index when index-version is given

2013-07-12 Thread Thomas Gummerer
Make update-index always rewrite the index when a index-version
is given, even if the index already has the right version.
This option is used for performance testing the writer and
reader.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 builtin/update-index.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/builtin/update-index.c b/builtin/update-index.c
index 4c6e3a6..7e723c0 100644
--- a/builtin/update-index.c
+++ b/builtin/update-index.c
@@ -6,6 +6,7 @@
 #include cache.h
 #include quote.h
 #include cache-tree.h
+#include read-cache.h
 #include tree-walk.h
 #include builtin.h
 #include refs.h
@@ -863,8 +864,7 @@ int cmd_update_index(int argc, const char **argv, const 
char *prefix)
preferred_index_format,
INDEX_FORMAT_LB, INDEX_FORMAT_UB);
 
-   if (the_index.version != preferred_index_format)
-   active_cache_changed = 1;
+   active_cache_changed = 1;
the_index.version = preferred_index_format;
}
 
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 17/19] read-cache: write resolve-undo data for index-v5

2013-07-12 Thread Thomas Gummerer
Make git read the resolve-undo data from the index.

Since the resolve-undo data is joined with the conflicts in
the ondisk format of the index file version 5, conflicts and
resolved data is read at the same time, and the resolve-undo
data is then converted to the in-memory format.

Helped-by: Thomas Rast tr...@student.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 read-cache-v5.c | 94 +
 1 file changed, 94 insertions(+)

diff --git a/read-cache-v5.c b/read-cache-v5.c
index cd819b4..093ee1a 100644
--- a/read-cache-v5.c
+++ b/read-cache-v5.c
@@ -992,6 +992,99 @@ static void cache_tree_to_ondisk_v5(struct hash_table 
*table, struct cache_tree
convert_one_to_ondisk_v5(table, root, , 0, 0);
 }
 
+static void resolve_undo_to_ondisk_v5(struct hash_table *table,
+ struct string_list *resolve_undo,
+ unsigned int *ndir, int *total_dir_len,
+ struct directory_entry *de)
+{
+   struct string_list_item *item;
+   struct directory_entry *search;
+
+   if (!resolve_undo)
+   return;
+   for_each_string_list_item(item, resolve_undo) {
+   struct conflict_entry *conflict_entry;
+   struct resolve_undo_info *ui = item-util;
+   char *super;
+   int i, dir_len, len;
+   uint32_t crc;
+   struct directory_entry *found, *current, *new_tree;
+
+   if (!ui)
+   continue;
+
+   super = super_directory(item-string);
+   if (!super)
+   dir_len = 0;
+   else
+   dir_len = strlen(super);
+   crc = crc32(0, (Bytef*)super, dir_len);
+   found = lookup_hash(crc, table);
+   current = NULL;
+   new_tree = NULL;
+
+   while (!found) {
+   struct directory_entry *new;
+
+   new = init_directory_entry(super, dir_len);
+   if (!current)
+   current = new;
+   insert_directory_entry(new, table, total_dir_len, ndir, 
crc);
+   if (new_tree != NULL)
+   new-de_nsubtrees = 1;
+   new-next = new_tree;
+   new_tree = new;
+   super = super_directory(super);
+   if (!super)
+   dir_len = 0;
+   else
+   dir_len = strlen(super);
+   crc = crc32(0, (Bytef*)super, dir_len);
+   found = lookup_hash(crc, table);
+   }
+   search = found;
+   while (search-next_hash  strcmp(super, search-pathname) != 
0)
+   search = search-next_hash;
+   if (search  !current)
+   current = search;
+   if (!search  !current)
+   current = new_tree;
+   if (!super  new_tree) {
+   new_tree-next = de-next;
+   de-next = new_tree;
+   de-de_nsubtrees++;
+   } else if (new_tree) {
+   struct directory_entry *temp;
+
+   search = de-next;
+   while (strcmp(super, search-pathname))
+   search = search-next;
+   temp = new_tree;
+   while (temp-next)
+   temp = temp-next;
+   search-de_nsubtrees++;
+   temp-next = search-next;
+   search-next = new_tree;
+   }
+
+   len = strlen(item-string);
+   conflict_entry = create_new_conflict(item-string, len, 
current-de_pathlen);
+   add_conflict_to_directory_entry(current, conflict_entry);
+   for (i = 0; i  3; i++) {
+   if (ui-mode[i]) {
+   struct conflict_part *cp;
+
+   cp = xmalloc(sizeof(struct conflict_part));
+   cp-flags = (i + 1)  CONFLICT_STAGESHIFT;
+   cp-entry_mode = ui-mode[i];
+   cp-next = NULL;
+   hashcpy(cp-sha1, ui-sha1[i]);
+   add_part_to_conflict_entry(current, 
conflict_entry, cp);
+   }
+   }
+   }
+}
+
 static struct directory_entry *compile_directory_data(struct index_state 
*istate,
int nfile,
unsigned int *ndir,
@@ -1099,6 +1192,7 @@ static struct directory_entry 
*compile_directory_data(struct

[PATCH v2 15/19] read-cache: write index-v5

2013-07-12 Thread Thomas Gummerer
Write the index version 5 file format to disk. This version doesn't
write the cache-tree data and resolve-undo data to the file.

The main work is done when filtering out the directories from the
current in-memory format, where in the same turn also the conflicts
and the file data is calculated.

Helped-by: Nguyen Thai Ngoc Duy pclo...@gmail.com
Helped-by: Thomas Rast tr...@student.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 cache.h |   8 +
 read-cache-v5.c | 594 +++-
 read-cache.c|  11 +-
 read-cache.h|   1 +
 4 files changed, 611 insertions(+), 3 deletions(-)

diff --git a/cache.h b/cache.h
index 1e5cc77..f3685f8 100644
--- a/cache.h
+++ b/cache.h
@@ -565,6 +565,7 @@ extern int unmerged_index(const struct index_state *);
 extern int verify_path(const char *path);
 extern struct cache_entry *index_name_exists(struct index_state *istate, const 
char *name, int namelen, int igncase);
 extern int index_name_pos(const struct index_state *, const char *name, int 
namelen);
+extern struct directory_entry *init_directory_entry(char *pathname, int len);
 #define ADD_CACHE_OK_TO_ADD 1  /* Ok to add */
 #define ADD_CACHE_OK_TO_REPLACE 2  /* Ok to replace file/directory */
 #define ADD_CACHE_SKIP_DFCHECK 4   /* Ok to skip DF conflict checks */
@@ -1363,6 +1364,13 @@ static inline ssize_t write_str_in_full(int fd, const 
char *str)
return write_in_full(fd, str, strlen(str));
 }
 
+/* index-v5 helper functions */
+extern char *super_directory(const char *filename);
+extern void insert_directory_entry(struct directory_entry *, struct hash_table 
*, int *, unsigned int *, uint32_t);
+extern void add_conflict_to_directory_entry(struct directory_entry *, struct 
conflict_entry *);
+extern void add_part_to_conflict_entry(struct directory_entry *, struct 
conflict_entry *, struct conflict_part *);
+extern struct conflict_entry *create_new_conflict(char *, int, int);
+
 /* pager.c */
 extern void setup_pager(void);
 extern const char *pager_program;
diff --git a/read-cache-v5.c b/read-cache-v5.c
index 0b9c320..33667d7 100644
--- a/read-cache-v5.c
+++ b/read-cache-v5.c
@@ -769,9 +769,601 @@ static int read_index_v5(struct index_state *istate, void 
*mmap,
return 0;
 }
 
+#define WRITE_BUFFER_SIZE 8192
+static unsigned char write_buffer[WRITE_BUFFER_SIZE];
+static unsigned long write_buffer_len;
+
+static int ce_write_flush(int fd)
+{
+   unsigned int buffered = write_buffer_len;
+   if (buffered) {
+   if (write_in_full(fd, write_buffer, buffered) != buffered)
+   return -1;
+   write_buffer_len = 0;
+   }
+   return 0;
+}
+
+static int ce_write(uint32_t *crc, int fd, void *data, unsigned int len)
+{
+   if (crc)
+   *crc = crc32(*crc, (Bytef*)data, len);
+   while (len) {
+   unsigned int buffered = write_buffer_len;
+   unsigned int partial = WRITE_BUFFER_SIZE - buffered;
+   if (partial  len)
+   partial = len;
+   memcpy(write_buffer + buffered, data, partial);
+   buffered += partial;
+   if (buffered == WRITE_BUFFER_SIZE) {
+   write_buffer_len = buffered;
+   if (ce_write_flush(fd))
+   return -1;
+   buffered = 0;
+   }
+   write_buffer_len = buffered;
+   len -= partial;
+   data = (char *) data + partial;
+   }
+   return 0;
+}
+
+static int ce_flush(int fd)
+{
+   unsigned int left = write_buffer_len;
+
+   if (left)
+   write_buffer_len = 0;
+
+   if (write_in_full(fd, write_buffer, left) != left)
+   return -1;
+
+   return 0;
+}
+
+static void ce_smudge_racily_clean_entry(struct cache_entry *ce)
+{
+   /*
+* This method shall only be called if the timestamp of ce
+* is racy (check with is_racy_timestamp). If the timestamp
+* is racy, the writer will set the CE_SMUDGED flag.
+*
+* The reader (match_stat_basic) will then take care
+* of checking if the entry is really changed or not, by
+* taking into account the size and the stat_crc and if
+* that hasn't changed checking the sha1.
+*/
+   ce-ce_flags |= CE_SMUDGED;
+}
+
+char *super_directory(const char *filename)
+{
+   char *slash;
+
+   slash = strrchr(filename, '/');
+   if (slash)
+   return xmemdupz(filename, slash-filename);
+   return NULL;
+}
+
+struct directory_entry *init_directory_entry(char *pathname, int len)
+{
+   struct directory_entry *de = xmalloc(directory_entry_size(len));
+
+   memcpy(de-pathname, pathname, len);
+   de-pathname[len] = '\0';
+   de-de_flags  = 0;
+   de-de_foffset= 0;
+   de-de_cr = 0;
+   de-de_ncr

[PATCH v2 19/19] p0003-index.sh: add perf test for the index formats

2013-07-12 Thread Thomas Gummerer
From: Thomas Rast tr...@inf.ethz.ch

Add a performance test for index version [23]/4/5 by using
git update-index --index-version=x, thus testing both the reader
and the writer speed of all index formats.

Signed-off-by: Thomas Rast tr...@inf.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 t/perf/p0003-index.sh | 59 +++
 1 file changed, 59 insertions(+)
 create mode 100755 t/perf/p0003-index.sh

diff --git a/t/perf/p0003-index.sh b/t/perf/p0003-index.sh
new file mode 100755
index 000..3e02868
--- /dev/null
+++ b/t/perf/p0003-index.sh
@@ -0,0 +1,59 @@
+#!/bin/sh
+
+test_description=Tests index versions [23]/4/5
+
+. ./perf-lib.sh
+
+test_perf_large_repo
+
+test_expect_success convert to v3 
+   git update-index --index-version=2
+
+
+test_perf v[23]: update-index 
+   git update-index --index-version=2 /dev/null
+
+
+subdir=$(git ls-files | sed 's#/[^/]*$##' | grep -v '^$' | uniq | tail -n 30 | 
head -1)
+
+test_perf v[23]: grep nonexistent -- subdir 
+   test_must_fail git grep nonexistent -- $subdir /dev/null
+
+
+test_perf v[23]: ls-files -- subdir 
+   git ls-files $subdir /dev/null
+
+
+test_expect_success convert to v4 
+   git update-index --index-version=4
+
+
+test_perf v4: update-index 
+   git update-index --index-version=4 /dev/null
+
+
+test_perf v4: grep nonexistent -- subdir 
+   test_must_fail git grep nonexistent -- $subdir /dev/null
+
+
+test_perf v4: ls-files -- subdir 
+   git ls-files $subdir /dev/null
+
+
+test_expect_success convert to v5 
+   git update-index --index-version=5
+
+
+test_perf v5: update-index 
+   git update-index --index-version=5 /dev/null
+
+
+test_perf v5: grep nonexistent -- subdir 
+   test_must_fail git grep nonexistent -- $subdir /dev/null
+
+
+test_perf v5: ls-files -- subdir 
+   git ls-files $subdir /dev/null
+
+
+test_done
-- 
1.8.3.453.g1dfc63d

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 00/19] Index-v5

2013-07-15 Thread Thomas Gummerer
Duy Nguyen pclo...@gmail.com writes:

 On Sat, Jul 13, 2013 at 12:26 AM, Thomas Gummerer t.gumme...@gmail.com 
 wrote:
  t/perf/p0003-index.sh|   59 +
  t/t2104-update-index-skip-worktree.sh|1 +

 For such a big code addition, the test part seems modest. Speaking
 from my experience, I rarely run perf tests and make test does not
 activate v5 code at all. A few more tests would be nice. The good news
 is I changed default index version to 5 and ran make test, all
 passed.

I was using the test suite with index version 5 as default index version
for testing of the new index file format.   I think that's the best way
to test the index, as it covers all aspects.  Maybe we should add a test
that covers the basic functionality, just to make sure nothing obvious
is broken when running the test suite with index-v2?  Something like
this maybe:

---8---

From c476b521c94f1a9b0b4fcfe92d63321442d79c9a Mon Sep 17 00:00:00 2001
From: Thomas Gummerer t.gumme...@gmail.com
Date: Mon, 15 Jul 2013 11:21:06 +0200
Subject: [PATCH] t1600: add basic test for index-v5

Add a test that checks the index-v5 file format when running the
test-suite with index-v2 as default index format.  When making changes
to the index, the test suite still should be run with both index v2 and
index v5 as default index format, for better coverage of all aspects of
the index.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 t/t1600-index-v5.sh | 133 
 1 file changed, 133 insertions(+)
 create mode 100755 t/t1600-index-v5.sh

diff --git a/t/t1600-index-v5.sh b/t/t1600-index-v5.sh
new file mode 100755
index 000..528c17e
--- /dev/null
+++ b/t/t1600-index-v5.sh
@@ -0,0 +1,133 @@
+#!/bin/sh
+#
+# Copyright (c) 2013 Thomas Gummerer
+
+test_description='Test basic functionaltiy of index-v5.
+
+This test just covers the basics, to make sure normal runs of the test
+suite cover this version of the index file format too.  For better
+testing of the index-v5 format, the default index version should be
+changed to 5 and the test suite should be re-run'
+
+. ./test-lib.sh
+
+check_resolve_undo () {
+   msg=$1
+   shift
+   while case $# in
+   0)  break ;;
+   1|2|3)  die Bug in check-resolve-undo test ;;
+   esac
+   do
+   path=$1
+   shift
+   for stage in 1 2 3
+   do
+   sha1=$1
+   shift
+   case $sha1 in
+   '') continue ;;
+   esac
+   sha1=$(git rev-parse --verify $sha1)
+   printf 100644 %s %s\t%s\n $sha1 $stage $path
+   done
+   done $msg.expect 
+   git ls-files --resolve-undo $msg.actual 
+   test_cmp $msg.expect $msg.actual
+}
+
+prime_resolve_undo () {
+   git reset --hard 
+   git checkout second^0 
+   test_tick 
+   test_must_fail git merge third^0 
+   echo merge does not leave anything 
+   check_resolve_undo empty 
+   echo different fi/le 
+   git add fi/le 
+   echo resolving records 
+   check_resolve_undo recorded fi/le initial:fi/le second:fi/le third:fi/le
+}
+
+test_expect_success 'setup' '
+   git update-index --index-version=5 
+   echo file1 file1 
+   echo file2 file2 
+   mkdir -p top/sub 
+   echo x top/x 
+   echo xy top/xy 
+   echo y top/y 
+   echo yx top/yx 
+   echo sub1 top/sub/sub1 
+   git add . 
+   git commit -m initial import
+'
+
+test_expect_success 'ls-files shows all files' '
+   cat expected -EOF 
+   100644 e2129701f1a4d54dc44f03c93bca0a2aec7c5449 0   file1
+   100644 6c493ff740f9380390d5c9ddef4af18697ac9375 0   file2
+   100644 48df0cb83fee5d667537343f60a6057a63dd3c9b 0   top/sub/sub1
+   100644 587be6b4c3f93f93c489c0111bba5596147a26cb 0   top/x
+   100644 5aad9376af82d7b98a34f95fb0f298a162f52656 0   top/xy
+   100644 975fbec8256d3e8a3797e7a3611380f27c49f4ac 0   top/y
+   100644 ba1575927fa5b1f4bce72ad0c349566f1b02508e 0   top/yx
+   EOF
+   git ls-files --stage actual 
+   test_cmp expected actual
+'
+
+test_expect_success 'ls-files with pathspec in subdir' '
+   cd top/sub 
+   cat expected -EOF 
+   ../x
+   ../xy
+   EOF
+   git ls-files --error-unmatch ../x* actual 
+   test_cmp expected actual 
+   cd ../..
+'
+
+test_expect_success 'read-tree HEAD establishes cache-tree' '
+   git read-tree HEAD 
+   cat expected -EOF 
+   84e73410ea7864ccada24d897462e8ce1e1b872b  (7 entries, 1 subtrees)
+   602482536bd3852e8ac2977ed1a9913a8c244aa0 top/ (5 entries, 1 subtrees)
+   20bb0109200f37a7e19283b4abc4a672be3f0adb top/sub/ (1 entries, 0 
subtrees)
+   EOF
+   test-dump-cache-tree actual 
+   test_cmp expected actual
+'
+
+test_expect_success 'setup

Re: [PATCH v2 08/19] grep.c: Use index api

2013-07-15 Thread Thomas Gummerer
Duy Nguyen pclo...@gmail.com writes:

 On Sat, Jul 13, 2013 at 12:26 AM, Thomas Gummerer t.gumme...@gmail.com 
 wrote:
 +static int grep_cache(struct cache_entry *ce, void *cb_data)
  {
 -   int hit = 0;
 -   int nr;
 -   read_cache();
 +   struct grep_opts *opts = cb_data;

 -   for (nr = 0; nr  active_nr; nr++) {
 -   struct cache_entry *ce = active_cache[nr];
 -   if (!S_ISREG(ce-ce_mode))
 -   continue;
 -   if (!match_pathspec_depth(pathspec, ce-name, 
 ce_namelen(ce), 0, NULL))
 -   continue;
 -   /*
 -* If CE_VALID is on, we assume worktree file and its cache 
 entry
 -* are identical, even if worktree file has been modified, 
 so use
 -* cache version instead
 -*/
 -   if (cached || (ce-ce_flags  CE_VALID) || 
 ce_skip_worktree(ce)) {
 -   if (ce_stage(ce))
 -   continue;
 -   hit |= grep_sha1(opt, ce-sha1, ce-name, 0, 
 ce-name);
 -   }
 -   else
 -   hit |= grep_file(opt, ce-name);
 -   if (ce_stage(ce)) {
 -   do {
 -   nr++;
 -   } while (nr  active_nr 
 -!strcmp(ce-name, active_cache[nr]-name));
 -   nr--; /* compensate for loop control */
 -   }
 -   if (hit  opt-status_only)
 -   break;
 -   }
 -   return hit;
 +   if (!S_ISREG(ce-ce_mode))
 +   return 0;
 +   if (!match_pathspec_depth(opts-pathspec, ce-name, ce_namelen(ce), 
 0, NULL))
 +   return 0;

 You do a match_pathspec_depth here..

 @@ -895,10 +887,21 @@ int cmd_grep(int argc, const char **argv, const char 
 *prefix)
 } else if (0 = opt_exclude) {
 die(_(--[no-]exclude-standard cannot be used for tracked 
 contents.));
 } else if (!list.nr) {
 +   struct grep_opts opts;
 +   struct filter_opts *filter_opts = 
 xmalloc(sizeof(*filter_opts));
 +
 if (!cached)
 setup_work_tree();

 -   hit = grep_cache(opt, pathspec, cached);
 +   memset(filter_opts, 0, sizeof(*filter_opts));
 +   filter_opts-pathspec = pathspec;
 +   opts.opt = opt;
 +   opts.pathspec = pathspec;
 +   opts.cached = cached;
 +   opts.hit = 0;
 +   read_cache_filtered(filter_opts);
 +   for_each_cache_entry(grep_cache, opts);

 And here again inside for_each_cache_entry. In the worst case that
 could turn into 2 expensive fnmatch instead of one. Is this conversion
 worth it? Note that match_pathspec is just a deprecated version of
 match_pathspec_depth. They basically do the same thing.

Right, the match_pathspec_depth should in builtin/grep.c should be
removed, it's unnecessary when using for_each_index_entry.  Thanks for
spotting it.  Other than that I still think the change makes sense.
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 00/19] Index-v5

2013-07-17 Thread Thomas Gummerer
Ramsay Jones ram...@ramsay1.demon.co.uk writes:

 Thomas Gummerer wrote:
 Hi,

 previous rounds (without api) are at $gmane/202752, $gmane/202923,
 $gmane/203088 and $gmane/203517, the previous round with api was at
 $gmane/229732.  Thanks to Junio, Duy and Eric for their comments on
 the previous round.

 If I remember correctly, the original version of this series had the
 same problem as Michael's Fix some reference-related races series
 (now in master). In particular, you had introduced an 'index_changed()'
 function which does essentially the same job as 'stat_validity_check()'
 in the new reference handling API. I seem to remember advising you
 not to compare st_uid, st_gid and st_ino on __CYGWIN__.

Yes, you provided a patch that simply wrapped those checks in a #if
!defined (__CYGWIN__), which is included in the new series too.


 I haven't had time to look at this version of your series yet, but it
 may be worth taking a look at stat_validity_check(). (although that is
 causing failures on cygwin at the moment! ;-)

I took a quick look, that function makes sense I think.  I'll use it in
the re-roll.  It makes probably sense to wrap the uid, gid and ino
fields as in the index_changed function.


 Also, I can't recall if I mentioned it to you at the time, but your
 index reading code was (unnecessarily) calling munmap() twice on the
 same buffer (without an intervening mmap()). This causes problems for
 systems that have the NO_MMAP build variable set. In particular, the
 compat/mmap.c code will attempt to free() the allocated memory block
 twice, with unpredictable results.

 I wrote a patch to address this at the time (Hmm, seems to be built
 on v1.8.1), but didn't submit it since your patch didn't progress. :-D
 I have included the patch below.

I can't recall this either.  From a quick check I don't call munmap() on
a already unmapped mmap, so I think this is fine as it is and your patch
is independent from it.  Not sure if it makes sense as safeguard for
future changes.

 -- 8 --
 From: Ramsay Jones ram...@ramsay1.demon.co.uk
 Date: Sun, 9 Sep 2012 20:50:32 +0100
 Subject: [PATCH] mmap.c: Keep log of mmap() blocks to avoid double-delete bug

 When compiling with the NO_MMAP build variable set, the built-in
 'git_mmap()' and 'git_munmap()' compatibility routines use simple
 memory allocation and file I/O to emulate the required behaviour.
 The current implementation is vulnerable to the double-delete bug
 (where the pointer returned by malloc() is passed to free() two or
 more times), should the mapped memory block address be passed to
 munmap() multiple times.

 In order to guard the implementation from such a calling sequence,
 we keep a list of mmap-block descriptors, which we then consult to
 determine the validity of the input pointer to munmap(). This then
 allows 'git_munmap()' to return -1 on error, as required, with
 errno set to EINVAL.

 Using a list in the log of mmap-ed blocks, along with the resulting
 linear search, means that the performance of the code is directly
 proportional to the number of concurrently active memory mapped
 file regions. The number of such regions is not expected to be
 excessive.

 Signed-off-by: Ramsay Jones ram...@ramsay1.demon.co.uk
 ---
  compat/mmap.c | 57 -
  1 file changed, 56 insertions(+), 1 deletion(-)

 diff --git a/compat/mmap.c b/compat/mmap.c
 index c9d46d1..400e034 100644
 --- a/compat/mmap.c
 +++ b/compat/mmap.c
 @@ -1,14 +1,61 @@
  #include ../git-compat-util.h

 +struct mmbd {  /* memory mapped block descriptor */
 + struct mmbd *next;  /* next in list */
 + void   *start;  /* pointer to memory mapped block */
 + size_t length;  /* length of memory mapped block */
 +};
 +
 +static struct mmbd *head;  /* head of mmb descriptor list */
 +
 +
 +static void add_desc(struct mmbd *desc, void *start, size_t length)
 +{
 + desc-start = start;
 + desc-length = length;
 + desc-next = head;
 + head = desc;
 +}
 +
 +static void free_desc(struct mmbd *desc)
 +{
 + if (head == desc)
 + head = head-next;
 + else {
 + struct mmbd *d = head;
 + for (; d; d = d-next) {
 + if (d-next == desc) {
 + d-next = desc-next;
 + break;
 + }
 + }
 + }
 + free(desc);
 +}
 +
 +static struct mmbd *find_desc(void *start)
 +{
 + struct mmbd *d = head;
 + for (; d; d = d-next) {
 + if (d-start == start)
 + return d;
 + }
 + return NULL;
 +}
 +
  void *git_mmap(void *start, size_t length, int prot, int flags, int fd, 
 off_t offset)
  {
   size_t n = 0;
 + struct mmbd *desc = NULL;

   if (start != NULL || !(flags  MAP_PRIVATE))
   die(Invalid usage of mmap when built with NO_MMAP);

   start = xmalloc(length);
 - if (start == NULL) {
 + desc

Re: [PATCH v2 09/19] ls-files.c: use index api

2013-07-17 Thread Thomas Gummerer
Duy Nguyen pclo...@gmail.com writes:

 On Sat, Jul 13, 2013 at 12:26 AM, Thomas Gummerer t.gumme...@gmail.com 
 wrote:
 +   if (!with_tree) {
 +   memset(opts, 0, sizeof(*opts));
 +   opts-pathspec = pathspec_struct;
 +   opts-read_staged = 1;
 +   if (show_resolve_undo)
 +   opts-read_resolve_undo = 1;
 +   read_cache_filtered(opts);

 So you load partial index here.

 +   } else {
 +   read_cache();
 +   }
 +   /* be nice with submodule paths ending in a slash */
 +   if (pathspec)
 +   strip_trailing_slash_from_submodules();

 Then strip_trailing_slash_from_submodules will attempt to convert
 pathspec foo/ to foo if foo exists in the index and is a
 gitlink. But becaues foo/ is used to load the partial index, foo
 is not loaded (is it?) and this could become incorrect no-op. I
 suggest you go through the pathspec once checking for ones ending with
 '/'. If so strip_trailing_... may potentially update pathspec, just
 load full index. If no pathspec ends with '/', strip_trail... is no-op
 and we can do partial loading safely.

It was loaded, because the adjusted_pathspec algorithm stripped the
trailing slash and then everything until the next slash.  That's
overkill except when the trailing slash had to be stripped.

I'll make the adjusted_pathspec algorithm more restrictive, so the last
trailing slash is no longer stripped.  If a pathspec contains a trailing
slash I'll load the whole index, as you suggested.
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 10/19] documentation: add documentation of the index-v5 file format

2013-07-17 Thread Thomas Gummerer
Duy Nguyen pclo...@gmail.com writes:

 On Sat, Jul 13, 2013 at 12:26 AM, Thomas Gummerer t.gumme...@gmail.com 
 wrote:
 +== Directory offsets (diroffsets)
 +
 +  diroffset (32-bits): offset to the directory relative to the beginning
 +of the index file. There are ndir + 1 offsets in the diroffset table,
 +the last is pointing to the end of the last direntry. With this last
 +entry, we are able to replace the strlen of when reading the directory

 strlen of what?

Of the dirname.  Will fix in the re-roll, thanks for noticing.

 +name, by calculating it from diroffset[n+1]-diroffset[n]-61.  61 is the
 +size of the directory data, which follows each each directory + the
 +crc sum + the NUL byte.
 +
 +  This part is needed for making the directory entries bisectable and
 +thus allowing a binary search.

 Just thinking out loud. Maybe this section and fileoffsets should be
 made optional extensions. So far I see no use for them. It's nice for
 a program to occasionally look at a single entry, but such programs do
 not exist (yet). For inotify monitor that may want to update a single
 file's stat, it could regenerate the index with {dir,file}offsets
 extensions the first time it attempts to update the index, then it
 could do bsearch.

There is a use already, namely saving the strlen for the filename.
Previously we had the length of the filename in the lower bits of the
flags, but decided to remove it to avoid having extended flags.  That in
addition to the use case in the future warrants keeping them in the
index, I think.
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 12/19] read-cache: read index-v5

2013-07-17 Thread Thomas Gummerer
Duy Nguyen pclo...@gmail.com writes:

[..]

 +static int read_entries(struct index_state *istate, struct directory_entry 
 **de,
 +   unsigned int *entry_offset, void **mmap,
 +   unsigned long mmap_size, unsigned int *nr,
 +   unsigned int *foffsetblock)
 +{
 +   struct cache_entry *head = NULL, *tail = NULL;
 +   struct conflict_entry *conflict_queue;
 +   struct cache_entry *ce;
 +   int i;
 +
 +   conflict_queue = NULL;
 +   if (read_conflicts(conflict_queue, *de, mmap, mmap_size)  0)
 +   return -1;
 +   for (i = 0; i  (*de)-de_nfiles; i++) {
 +   if (read_entry(ce,
 +  *de,
 +  entry_offset,
 +  mmap,
 +  mmap_size,
 +  foffsetblock)  0)
 +   return -1;
 +   ce_queue_push(head, tail, ce);
 +   *foffsetblock += 4;
 +
 +   /*
 +* Add the conflicted entries at the end of the index file
 +* to the in memory format
 +*/
 +   if (conflict_queue 
 +   (conflict_queue-entries-flags  CONFLICT_CONFLICTED) 
 != 0 
 +   !cache_name_compare(conflict_queue-name, 
 conflict_queue-namelen,
 +   ce-name, ce_namelen(ce))) {
 +   struct conflict_part *cp;
 +   cp = conflict_queue-entries;
 +   cp = cp-next;
 +   while (cp) {
 +   ce = convert_conflict_part(cp,
 +   conflict_queue-name,
 +   conflict_queue-namelen);
 +   ce_queue_push(head, tail, ce);
 +   conflict_part_head_remove(cp);
 +   }
 +   conflict_entry_head_remove(conflict_queue);
 +   }

 I start to wonder if separating staged entries is a good idea. It
 seems to make the code more complicated. The good point about conflict
 section at the end of the file is you can just truncate() it out.
 Another way is putting staged entries in fileentries, sorted
 alphabetically then by stage number, and a flag indicating if the
 entry is valid. When you remove resolve an entry, just set the flag to
 invalid (partial write), so that read code will skip it.

 I think this approach is reasonably cheap (unless there are a lot of
 conflicts) and it simplifies this piece of code. truncate() may be
 overrated anyway. In my experience, I git add path as soon as I
 resolve path (so that git diff shrinks). One entry at a time, one
 index write at a time. I don't think I ever resolve everything then
 git add -u ., which is where truncate() shines because staged
 entries are removed all at once. We should optimize for one file
 resolution at a time, imo.

Thanks for your comments.  I'll address the other ones once we decided
to do with the conflicts.

It does make the code quite a bit more complicated, but also has one
advantage that you overlooked.  We wouldn't truncate() when resolving
the conflicts.  The resolve undo data is stored with the conflicts and
therefore we could just flip a bit and set the stage of the cache-entry
in the main index to 0 (always once we got partial writing).  This can
be fast both in case we resolve one entry at a time and when we resolve
a lot of entries.  The advantage is even bigger when we resolve one
entry at a time, when we otherwise would have to re-write the index for
each conflict resolution.

[..]
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 00/19] Index-v5

2013-07-17 Thread Thomas Gummerer
Duy Nguyen pclo...@gmail.com writes:

 On Mon, Jul 15, 2013 at 4:30 PM, Thomas Gummerer t.gumme...@gmail.com wrote:
 Duy Nguyen pclo...@gmail.com writes:

 On Sat, Jul 13, 2013 at 12:26 AM, Thomas Gummerer t.gumme...@gmail.com 
 wrote:
  t/perf/p0003-index.sh|   59 +
  t/t2104-update-index-skip-worktree.sh|1 +

 For such a big code addition, the test part seems modest. Speaking
 from my experience, I rarely run perf tests and make test does not
 activate v5 code at all. A few more tests would be nice. The good news
 is I changed default index version to 5 and ran make test, all
 passed.

 I was using the test suite with index version 5 as default index version
 for testing of the new index file format.   I think that's the best way
 to test the index, as it covers all aspects.

 We need other people to run the test suite with v5 to catch bugs that
 only appear on other platforms. Perhaps an env variable to allow the
 test suite to override the binary's default index version and a new
 make target test-v5 maybe.

Ah ok, I understand.  I think it's best to add a GIT_INDEX_VERSION=x
config option to config.mak, where x is the index version that should be
tested.  This is also the way other test options are set
(e.g. NO_SVN_TESTS).  In addition this allows testing other versions of
the index file too.

 Maybe we should add a test
 that covers the basic functionality, just to make sure nothing obvious
 is broken when running the test suite with index-v2?  Something like
 this maybe:

 Yep. v5 specfic test cases can cover some corner cases that the rest
 of the test suite does not care. Haven't read your t1600 though.

Ok.  I can't think of any corner cases right now that would be in v5,
but not in other versions, but if I catch some I'll add tests.
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 07/19] make sure partially read index is not changed

2013-07-17 Thread Thomas Gummerer
Duy Nguyen pclo...@gmail.com writes:

 On Sat, Jul 13, 2013 at 12:26 AM, Thomas Gummerer t.gumme...@gmail.com 
 wrote:
 A partially read index file currently cannot be written to disk.  Make
 sure that never happens, by erroring out when a caller tries to change a
 partially read index.  The caller is responsible for reading the whole
 index when it's trying to change it later.

 Forcing the caller to load the right part of the index file instead of
 re-reading it when changing it, gives a bit of a performance advantage,
 by avoiding to read parts of the index twice.

 If you want to be strict about this, put similar check in
 fill_stat_cache_info (used by entry.c), cache_tree_invalidate_path and
 convert the code base (maybe except unpack-trees.c, just put a check
 in the beginning of unpack_trees()) to use wrapper function to change
 ce_flags (some still update ce_flags directly, grep them). Some flags
 are in memory only and should be allowed to change in partial index,
 some are to be written on disk and should not be allowed.

I'm not sure anymore we should even be this strict.  A partially read
index will never make it to the disk, because write_index checks if it's
fully read.   I think the check in write_index and read_index_filtered
would be enough.

The only disadvantage would be that it takes a little longer to catch an
error that should never happen in the first place.  If it occurs the
user has bigger problems than the time that not having caught the
error earlier adds to the execution of the command.

I also don't see a clean way to add the check to fill_stat_cache_info or
cache_tree_invalidate_path, because we would need an additional
parameter for the index_state, or at least for index_state-filter_opts,
which doesn't do anything but check if the index is only partially loaded.
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 03/19] read-cache: move index v2 specific functions to their own file

2013-07-19 Thread Thomas Gummerer
Duy Nguyen pclo...@gmail.com writes:

 On Sat, Jul 13, 2013 at 12:26 AM, Thomas Gummerer t.gumme...@gmail.com 
 wrote:
 @@ -489,8 +479,8 @@ extern void *read_blob_data_from_index(struct 
 index_state *, const char *, unsig
  #define CE_MATCH_RACY_IS_DIRTY 02
  /* do stat comparison even if CE_SKIP_WORKTREE is true */
  #define CE_MATCH_IGNORE_SKIP_WORKTREE  04
 -extern int ie_match_stat(const struct index_state *, const struct 
 cache_entry *, struct stat *, unsigned int);
 -extern int ie_modified(const struct index_state *, const struct cache_entry 
 *, struct stat *, unsigned int);
 +extern int ie_match_stat(struct index_state *, const struct cache_entry *, 
 struct stat *, unsigned int);
 +extern int ie_modified(struct index_state *, const struct cache_entry *, 
 struct stat *, unsigned int);


 I would rather we keep const struct index_state* if we could. I
 tried putting const back and found that ce_match_stat_basic() calls
 set_istate_ops(), which writes to struct index_state. Putting
 set_istate_ops() in ce_match_stat_basic() may seem convenient, but
 does not make much sense (why would a match_stat function update
 index_ops?). I think you could move it out and

  - read_index calls set_istate_ops
  - (bonus) discard_index probably should reset version field to zero
 and clear internal_ops
  - the callers that use index without read_index must call
 initialize_index() or something, which in turn calls set_istate_ops.
 initialize_index() may take the preferred index version
  - do not let update-index modifies version field directly when
 --index-version is given. Wrap it with set_index_version() or
 something, so we can do internal conversion from one version to
 another if needed
  - remove set_istate_ops in write_index(), we may need internal_ops
 long before writing. When write_index is called, internal_ops should
 be already initialized

Ok, this makes sense.  The only thing that I'm a little worried about is
catching all code paths that need to initialize the index.  I'll
implement these suggestions in the re-roll.
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 00/19] Index-v5

2013-07-19 Thread Thomas Gummerer
Junio C Hamano gits...@pobox.com writes:

 Thomas Gummerer t.gumme...@gmail.com writes:

 Ah ok, I understand.  I think it's best to add a GIT_INDEX_VERSION=x
 config option to config.mak, where x is the index version that should be
 tested.

 Whatever you do, please do not call it GIT_INDEX_VERSION _if_ it is
 only to be used while testing.  Have string TEST somewhere in the
 name, and make t/test-lib-functions.sh take notice.

 Currently, the way for the user to show the preference as to which
 index version to use is to explicitly set the version once, and then
 we will (try to) propagate it inside the repository.

 I would not mind if we add a mechanism to make write_index() notice
 the environment variable GIT_INDEX_VERSION and write the index in
 that version.  But that is conceptually very different from whatever
 you give make VARIABLE=blah from the command line when building
 Git (or set in config.mak which amounts to the same thing).

What I currently did is add a environment variable GIT_INDEX_VERSION
that is used only if there is no index yet, to make sure existing
repositories aren't affected and still have to be converted explicitly
by using git update-index.

For the tests I simply did export GIT_INDEX_VERSION in test-lib.sh to
allow the addition of GIT_INDEX_VERSION in config.mak.  Should I rename
that to GIT_INDEX_VERSION_TEST and do something like

set_index_version() {
export GIT_INDEX_VERSION=$GIT_INDEX_VERSION_TEST
}

in test-lib-functions.sh instead, does that make sense?
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 12/19] read-cache: read index-v5

2013-08-07 Thread Thomas Gummerer
Duy Nguyen pclo...@gmail.com writes:

 A little bit more..

 On Sat, Jul 13, 2013 at 12:26 AM, Thomas Gummerer t.gumme...@gmail.com 
 wrote:
 +static void ce_queue_push(struct cache_entry **head,
 +struct cache_entry **tail,
 +struct cache_entry *ce)
 +{
 +   if (!*head) {
 +   *head = *tail = ce;
 +   (*tail)-next = NULL;
 +   return;
 +   }
 +
 +   (*tail)-next = ce;
 +   ce-next = NULL;
 +   *tail = (*tail)-next;

 No no no. next is for name-hash.c don't reuse it here. And I don't
 think you really need to maintain a linked list of cache_entries by
 the way. More on read_entries comments below..

You're right, I don't need it for the reading code.  I do need to keep a
list of cache-entries for writing though, where a linked list is best
suited for the task.  I'll use a next_ce pointer for that.

 +}
 +
 +static struct cache_entry *ce_queue_pop(struct cache_entry **head)
 +{
 +   struct cache_entry *ce;
 +
 +   ce = *head;
 +   *head = (*head)-next;
 +   return ce;
 +}

 Same as ce_queue_push.

 +static int read_entries(struct index_state *istate, struct directory_entry 
 **de,
 +   unsigned int *entry_offset, void **mmap,
 +   unsigned long mmap_size, unsigned int *nr,
 +   unsigned int *foffsetblock)
 +{
 +   struct cache_entry *head = NULL, *tail = NULL;
 +   struct conflict_entry *conflict_queue;
 +   struct cache_entry *ce;
 +   int i;
 +
 +   conflict_queue = NULL;
 +   if (read_conflicts(conflict_queue, *de, mmap, mmap_size)  0)
 +   return -1;
 +   for (i = 0; i  (*de)-de_nfiles; i++) {
 +   if (read_entry(ce,
 +  *de,
 +  entry_offset,
 +  mmap,
 +  mmap_size,
 +  foffsetblock)  0)
 +   return -1;
 +   ce_queue_push(head, tail, ce);
 +   *foffsetblock += 4;
 +
 +   /*
 +* Add the conflicted entries at the end of the index file
 +* to the in memory format
 +*/
 +   if (conflict_queue 
 +   (conflict_queue-entries-flags  CONFLICT_CONFLICTED) 
 != 0 
 +   !cache_name_compare(conflict_queue-name, 
 conflict_queue-namelen,
 +   ce-name, ce_namelen(ce))) {
 +   struct conflict_part *cp;
 +   cp = conflict_queue-entries;
 +   cp = cp-next;
 +   while (cp) {
 +   ce = convert_conflict_part(cp,
 +   conflict_queue-name,
 +   conflict_queue-namelen);
 +   ce_queue_push(head, tail, ce);
 +   conflict_part_head_remove(cp);
 +   }
 +   conflict_entry_head_remove(conflict_queue);
 +   }

 I start to wonder if separating staged entries is a good idea. It
 seems to make the code more complicated. The good point about conflict
 section at the end of the file is you can just truncate() it out.
 Another way is putting staged entries in fileentries, sorted
 alphabetically then by stage number, and a flag indicating if the
 entry is valid. When you remove resolve an entry, just set the flag to
 invalid (partial write), so that read code will skip it.

 I think this approach is reasonably cheap (unless there are a lot of
 conflicts) and it simplifies this piece of code. truncate() may be
 overrated anyway. In my experience, I git add path as soon as I
 resolve path (so that git diff shrinks). One entry at a time, one
 index write at a time. I don't think I ever resolve everything then
 git add -u ., which is where truncate() shines because staged
 entries are removed all at once. We should optimize for one file
 resolution at a time, imo.

 +   }
 +
 +   *de = (*de)-next;
 +
 +   while (head) {
 +   if (*de != NULL
 +strcmp(head-name, (*de)-pathname)  0) {
 +   read_entries(istate,
 +de,
 +entry_offset,
 +mmap,
 +mmap_size,
 +nr,
 +foffsetblock);
 +   } else {
 +   ce = ce_queue_pop(head);
 +   set_index_entry(istate, *nr, ce);
 +   (*nr)++;
 +   ce-next = NULL;
 +   }

 In this loop, you do some sort of merge sort combining a list of
 sorted files and a list of sorted directories (which

Re: [PATCH v2 12/19] read-cache: read index-v5

2013-08-08 Thread Thomas Gummerer
Duy Nguyen pclo...@gmail.com writes:

 On Wed, Jul 17, 2013 at 3:11 PM, Thomas Gummerer t.gumme...@gmail.com wrote:
 Duy Nguyen pclo...@gmail.com writes:

 [..]

 +static int read_entries(struct index_state *istate, struct 
 directory_entry **de,
 +   unsigned int *entry_offset, void **mmap,
 +   unsigned long mmap_size, unsigned int *nr,
 +   unsigned int *foffsetblock)
 +{
 +   struct cache_entry *head = NULL, *tail = NULL;
 +   struct conflict_entry *conflict_queue;
 +   struct cache_entry *ce;
 +   int i;
 +
 +   conflict_queue = NULL;
 +   if (read_conflicts(conflict_queue, *de, mmap, mmap_size)  0)
 +   return -1;
 +   for (i = 0; i  (*de)-de_nfiles; i++) {
 +   if (read_entry(ce,
 +  *de,
 +  entry_offset,
 +  mmap,
 +  mmap_size,
 +  foffsetblock)  0)
 +   return -1;
 +   ce_queue_push(head, tail, ce);
 +   *foffsetblock += 4;
 +
 +   /*
 +* Add the conflicted entries at the end of the index file
 +* to the in memory format
 +*/
 +   if (conflict_queue 
 +   (conflict_queue-entries-flags  CONFLICT_CONFLICTED) 
 != 0 
 +   !cache_name_compare(conflict_queue-name, 
 conflict_queue-namelen,
 +   ce-name, ce_namelen(ce))) {
 +   struct conflict_part *cp;
 +   cp = conflict_queue-entries;
 +   cp = cp-next;
 +   while (cp) {
 +   ce = convert_conflict_part(cp,
 +   conflict_queue-name,
 +   conflict_queue-namelen);
 +   ce_queue_push(head, tail, ce);
 +   conflict_part_head_remove(cp);
 +   }
 +   conflict_entry_head_remove(conflict_queue);
 +   }

 I start to wonder if separating staged entries is a good idea. It
 seems to make the code more complicated. The good point about conflict
 section at the end of the file is you can just truncate() it out.
 Another way is putting staged entries in fileentries, sorted
 alphabetically then by stage number, and a flag indicating if the
 entry is valid. When you remove resolve an entry, just set the flag to
 invalid (partial write), so that read code will skip it.

 I think this approach is reasonably cheap (unless there are a lot of
 conflicts) and it simplifies this piece of code. truncate() may be
 overrated anyway. In my experience, I git add path as soon as I
 resolve path (so that git diff shrinks). One entry at a time, one
 index write at a time. I don't think I ever resolve everything then
 git add -u ., which is where truncate() shines because staged
 entries are removed all at once. We should optimize for one file
 resolution at a time, imo.

 Thanks for your comments.  I'll address the other ones once we decided
 to do with the conflicts.

 It does make the code quite a bit more complicated, but also has one
 advantage that you overlooked.

 I did overlook, although my goal is to keep the code simpler, not more
 comlicated. The thinking is if we can find everything in fileentries
 table, the code here is simplified, so..

 We wouldn't truncate() when resolving
 the conflicts.  The resolve undo data is stored with the conflicts and
 therefore we could just flip a bit and set the stage of the cache-entry
 in the main index to 0 (always once we got partial writing).  This can
 be fast both in case we resolve one entry at a time and when we resolve
 a lot of entries.  The advantage is even bigger when we resolve one
 entry at a time, when we otherwise would have to re-write the index for
 each conflict resolution.

 If I understand it correctly, filentries can only contain stage-0 or
 stage-1 entries, stage  0 entries are stored in conflict data. Once
 a conflict is solved, you update the stage-1 entry in fileentries,
 turning it to stage-0 and recalculate the entry checksum. Conflict
 data remains there to function as the old REUC extension. Correct?

Correct.

 First of all, if that's true, we only need 1 bit for stage in fileentries 
 table.

 Secondly, you may get away with looking up to conflict data in this
 function by storing all stages in fileentries (now we need 2-bit
 stage), replicated in conflict data for reuc function. When you
 resolve conflict, you flip stage-1 to stage-0, and flip (a new bit) to
 mark stage-2 entry invalid so the code knows to skip it. Next time the
 index is rewritten, invalid entries are removed, but we still have old
 stage entries in conflict data. The flipping business is pretty much
 what

Re: [PATCH v2 12/19] read-cache: read index-v5

2013-08-09 Thread Thomas Gummerer
Duy Nguyen pclo...@gmail.com writes:

 On Wed, Jul 17, 2013 at 3:11 PM, Thomas Gummerer t.gumme...@gmail.com wrote:
 Duy Nguyen pclo...@gmail.com writes:

 [..]

 +static int read_entries(struct index_state *istate, struct 
 directory_entry **de,
 +   unsigned int *entry_offset, void **mmap,
 +   unsigned long mmap_size, unsigned int *nr,
 +   unsigned int *foffsetblock)
 +{
 +   struct cache_entry *head = NULL, *tail = NULL;
 +   struct conflict_entry *conflict_queue;
 +   struct cache_entry *ce;
 +   int i;
 +
 +   conflict_queue = NULL;
 +   if (read_conflicts(conflict_queue, *de, mmap, mmap_size)  0)
 +   return -1;
 +   for (i = 0; i  (*de)-de_nfiles; i++) {
 +   if (read_entry(ce,
 +  *de,
 +  entry_offset,
 +  mmap,
 +  mmap_size,
 +  foffsetblock)  0)
 +   return -1;
 +   ce_queue_push(head, tail, ce);
 +   *foffsetblock += 4;
 +
 +   /*
 +* Add the conflicted entries at the end of the index file
 +* to the in memory format
 +*/
 +   if (conflict_queue 
 +   (conflict_queue-entries-flags  CONFLICT_CONFLICTED) 
 != 0 
 +   !cache_name_compare(conflict_queue-name, 
 conflict_queue-namelen,
 +   ce-name, ce_namelen(ce))) {
 +   struct conflict_part *cp;
 +   cp = conflict_queue-entries;
 +   cp = cp-next;
 +   while (cp) {
 +   ce = convert_conflict_part(cp,
 +   conflict_queue-name,
 +   conflict_queue-namelen);
 +   ce_queue_push(head, tail, ce);
 +   conflict_part_head_remove(cp);
 +   }
 +   conflict_entry_head_remove(conflict_queue);
 +   }

 I start to wonder if separating staged entries is a good idea. It
 seems to make the code more complicated. The good point about conflict
 section at the end of the file is you can just truncate() it out.
 Another way is putting staged entries in fileentries, sorted
 alphabetically then by stage number, and a flag indicating if the
 entry is valid. When you remove resolve an entry, just set the flag to
 invalid (partial write), so that read code will skip it.

 I think this approach is reasonably cheap (unless there are a lot of
 conflicts) and it simplifies this piece of code. truncate() may be
 overrated anyway. In my experience, I git add path as soon as I
 resolve path (so that git diff shrinks). One entry at a time, one
 index write at a time. I don't think I ever resolve everything then
 git add -u ., which is where truncate() shines because staged
 entries are removed all at once. We should optimize for one file
 resolution at a time, imo.

 Thanks for your comments.  I'll address the other ones once we decided
 to do with the conflicts.

 It does make the code quite a bit more complicated, but also has one
 advantage that you overlooked.

 I did overlook, although my goal is to keep the code simpler, not more
 comlicated. The thinking is if we can find everything in fileentries
 table, the code here is simplified, so..

 We wouldn't truncate() when resolving
 the conflicts.  The resolve undo data is stored with the conflicts and
 therefore we could just flip a bit and set the stage of the cache-entry
 in the main index to 0 (always once we got partial writing).  This can
 be fast both in case we resolve one entry at a time and when we resolve
 a lot of entries.  The advantage is even bigger when we resolve one
 entry at a time, when we otherwise would have to re-write the index for
 each conflict resolution.

 If I understand it correctly, filentries can only contain stage-0 or
 stage-1 entries, stage  0 entries are stored in conflict data. Once
 a conflict is solved, you update the stage-1 entry in fileentries,
 turning it to stage-0 and recalculate the entry checksum. Conflict
 data remains there to function as the old REUC extension. Correct?

 First of all, if that's true, we only need 1 bit for stage in fileentries 
 table.

 Secondly, you may get away with looking up to conflict data in this
 function by storing all stages in fileentries (now we need 2-bit
 stage), replicated in conflict data for reuc function. When you
 resolve conflict, you flip stage-1 to stage-0, and flip (a new bit) to
 mark stage-2 entry invalid so the code knows to skip it. Next time the
 index is rewritten, invalid entries are removed, but we still have old
 stage entries in conflict data. The flipping business is pretty much
 what you plan

[PATCH v3 00/24] Index-v5

2013-08-18 Thread Thomas Gummerer
Hi,

previous rounds (without api) are at $gmane/202752, $gmane/202923,
$gmane/203088 and $gmane/203517, the previous rounds with api were at
$gmane/229732 and $gmane/230210.  Thanks to Duy for reviewing the the
last round and Junio and Ramsay for additional comments.

Changes since the previous round:

read-cache: move index v2 specific functions to their own file
  - set istate-ops to NULL in discard_index

read-cache: add index reading api
  - style fixes
  - instead of using internal_ops struct, do for_each_index_entry in
read-cache.c

grep.c: use index api
  - remove duplicate call to match_pathspec_depth

ls-files.c: use index api
  - load the whole index if there is a trai

documentation: add documentation of the index-v5 file format
  - fix typo
  - change the position of nfile and ndir in the index file
  - document that the conflicts are also stored in the fileentries
block
  - document invalid flag

read-cache: read index-v5
  - restrict partial loading a bit more, by being more careful when
adjusting the pathspec
  - move the ondisk structs from cache.h to read-cache-v5.c
  - merge for and while loop in read_entries
  - keep a directory tree instead of a flat list when reading the
directories
  - ce_queue_push moved to read-cache: write index-v5 using a next_ce
pointer instead of the next pointer that's already used by
name-hash.
  - fix reading if there are extensions that are not yet supported
  - ignore entries that have the invalid flag set

read-cache: read cache-tree in index-v5
  - use the tree structure which is now used in read index-v5

read-cache: write index-v5
  - simplify compile_directory_data

changes to the index file format:
  - store the number of files before the number of directories in the
header, so that the file command still can recognize the number of
files in the repository correctly.
  - store all staged entries in the fileentries block. Doesn't hurt
the performance a lot but simplifies the code.
  - add an invalid flag for entries that should be ignored.  currently
unused but respected when reading.  will be used once the conflict
resolution is done by flipping a bit in the conflict entries at the
end of the index.
  
added commits:
  - read-cache: use fixed width integer types
  - read-cache: clear version in discard_index()
  - read-cache: Don't compare uid, gid and ino on cygwin
  - introduce GIT_INDEX_VERSION environment variable
  - test-lib: allow setting the index format version

Thomas Gummerer (23):
  t2104: Don't fail for index versions other than [23]
  read-cache: use fixed width integer types
  read-cache: split index file version specific functionality
  read-cache: clear version in discard_index()
  read-cache: move index v2 specific functions to their own file
  read-cache: Don't compare uid, gid and ino on cygwin
  read-cache: Re-read index if index file changed
  add documentation for the index api
  read-cache: add index reading api
  make sure partially read index is not changed
  grep.c: use index api
  ls-files.c: use index api
  documentation: add documentation of the index-v5 file format
  read-cache: make in-memory format aware of stat_crc
  read-cache: read index-v5
  read-cache: read resolve-undo data
  read-cache: read cache-tree in index-v5
  read-cache: write index-v5
  read-cache: write index-v5 cache-tree data
  read-cache: write resolve-undo data for index-v5
  update-index.c: rewrite index when index-version is given
  introduce GIT_INDEX_VERSION environment variable
  test-lib: allow setting the index format version

Thomas Rast (1):
  p0003-index.sh: add perf test for the index formats

 Documentation/technical/api-in-core-index.txt|   54 +-
 Documentation/technical/index-file-format-v5.txt |  301 +
 Makefile |   10 +
 builtin/apply.c  |2 +
 builtin/grep.c   |   69 +-
 builtin/ls-files.c   |   36 +-
 builtin/update-index.c   |6 +-
 cache-tree.c |2 +-
 cache-tree.h |1 +
 cache.h  |   93 +-
 read-cache-v2.c  |  550 +
 read-cache-v5.c  | 1417 ++
 read-cache.c |  685 +++
 read-cache.h |   61 +
 t/perf/p0003-index.sh|   63 +
 t/t2104-update-index-skip-worktree.sh|1 +
 t/test-lib-functions.sh  |5 +
 t/test-lib.sh|3 +
 test-index-version.c |6 +
 unpack-trees.c   |3 +-
 20 files changed, 2786 insertions(+), 582 deletions(-)
 create mode 100644 Documentation

[PATCH v3 06/24] read-cache: Don't compare uid, gid and ino on cygwin

2013-08-18 Thread Thomas Gummerer
Cygwin doesn't have uid, gid and ino stats fields.  Therefore we should
never check them in the match_stat_data when working on the CYGWIN
platform.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---

This patch was not tested on Cygwin yet.  I think it's needed though,
because the re-reading of the index if it changed will no longer use
it's own index_changed function, but use the stat_validity_check
function instead.  Would be great if someone running Cygwin could test
this.

 read-cache.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/read-cache.c b/read-cache.c
index 1f827de..aa17ce7 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -82,6 +82,7 @@ int match_stat_data(const struct stat_data *sd, struct stat 
*st)
changed |= CTIME_CHANGED;
 #endif
 
+#if !defined (__CYGWIN__)
if (check_stat) {
if (sd-sd_uid != (unsigned int) st-st_uid ||
sd-sd_gid != (unsigned int) st-st_gid)
@@ -89,6 +90,7 @@ int match_stat_data(const struct stat_data *sd, struct stat 
*st)
if (sd-sd_ino != (unsigned int) st-st_ino)
changed |= INODE_CHANGED;
}
+#endif
 
 #ifdef USE_STDEV
/*
-- 
1.8.3.4.1231.g9fbf354.dirty

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 05/24] read-cache: move index v2 specific functions to their own file

2013-08-18 Thread Thomas Gummerer
Move index version 2 specific functions to their own file. The non-index
specific functions will be in read-cache.c, while the index version 2
specific functions will be in read-cache-v2.c.

Helped-by: Nguyen Thai Ngoc Duy pclo...@gmail.com
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 Makefile   |   2 +
 builtin/apply.c|   2 +
 builtin/update-index.c |   2 +-
 cache.h|  13 +-
 read-cache-v2.c| 544 ++
 read-cache.c   | 576 +
 read-cache.h   |  58 +
 test-index-version.c   |   6 +
 unpack-trees.c |   3 +-
 9 files changed, 669 insertions(+), 537 deletions(-)
 create mode 100644 read-cache-v2.c
 create mode 100644 read-cache.h

diff --git a/Makefile b/Makefile
index 6b446e7..afae23e 100644
--- a/Makefile
+++ b/Makefile
@@ -712,6 +712,7 @@ LIB_H += progress.h
 LIB_H += prompt.h
 LIB_H += quote.h
 LIB_H += reachable.h
+LIB_H += read-cache.h
 LIB_H += reflog-walk.h
 LIB_H += refs.h
 LIB_H += remote.h
@@ -855,6 +856,7 @@ LIB_OBJS += prompt.o
 LIB_OBJS += quote.o
 LIB_OBJS += reachable.o
 LIB_OBJS += read-cache.o
+LIB_OBJS += read-cache-v2.o
 LIB_OBJS += reflog-walk.o
 LIB_OBJS += refs.o
 LIB_OBJS += remote.o
diff --git a/builtin/apply.c b/builtin/apply.c
index 50912c9..3d5a5dc 100644
--- a/builtin/apply.c
+++ b/builtin/apply.c
@@ -3682,6 +3682,8 @@ static void build_fake_ancestor(struct patch *list, const 
char *filename)
die (Could not add %s to temporary index, name);
}
 
+   if (!result.initialized)
+   initialize_index(result, 0);
fd = open(filename, O_WRONLY | O_CREAT, 0666);
if (fd  0 || write_index(result, fd) || close(fd))
die (Could not write temporary index to %s, filename);
diff --git a/builtin/update-index.c b/builtin/update-index.c
index e3a10d7..c5bb889 100644
--- a/builtin/update-index.c
+++ b/builtin/update-index.c
@@ -863,7 +863,7 @@ int cmd_update_index(int argc, const char **argv, const 
char *prefix)
 
if (the_index.version != preferred_index_format)
active_cache_changed = 1;
-   the_index.version = preferred_index_format;
+   change_cache_version(preferred_index_format);
}
 
if (read_from_stdin) {
diff --git a/cache.h b/cache.h
index 9ef778a..d4dae21 100644
--- a/cache.h
+++ b/cache.h
@@ -95,16 +95,8 @@ unsigned long git_deflate_bound(git_zstream *, unsigned 
long);
  */
 #define DEFAULT_GIT_PORT 9418
 
-/*
- * Basic data structures for the directory cache
- */
 
 #define CACHE_SIGNATURE 0x44495243 /* DIRC */
-struct cache_header {
-   uint32_t hdr_signature;
-   uint32_t hdr_version;
-   uint32_t hdr_entries;
-};
 
 #define INDEX_FORMAT_LB 2
 #define INDEX_FORMAT_UB 4
@@ -279,6 +271,7 @@ struct index_state {
 initialized : 1;
struct hash_table name_hash;
struct hash_table dir_hash;
+   struct index_ops *ops;
 };
 
 extern struct index_state the_index;
@@ -296,6 +289,8 @@ extern void free_name_hash(struct index_state *istate);
 #define active_cache_changed (the_index.cache_changed)
 #define active_cache_tree (the_index.cache_tree)
 
+#define initialize_cache() initialize_index(the_index, 0)
+#define change_cache_version(version) change_index_version(the_index, 
(version))
 #define read_cache() read_index(the_index)
 #define read_cache_from(path) read_index_from(the_index, (path))
 #define read_cache_preload(pathspec) read_index_preload(the_index, (pathspec))
@@ -454,6 +449,8 @@ extern void sanitize_stdfds(void);
} while (0)
 
 /* Initialize and use the cache information */
+extern void initialize_index(struct index_state *istate, int version);
+extern void change_index_version(struct index_state *istate, int version);
 extern int read_index(struct index_state *);
 extern int read_index_preload(struct index_state *, const struct pathspec 
*pathspec);
 extern int read_index_from(struct index_state *, const char *path);
diff --git a/read-cache-v2.c b/read-cache-v2.c
new file mode 100644
index 000..070d468
--- /dev/null
+++ b/read-cache-v2.c
@@ -0,0 +1,544 @@
+#include cache.h
+#include read-cache.h
+#include resolve-undo.h
+#include cache-tree.h
+#include varint.h
+
+/* Mask for the name length in ce_flags in the on-disk index */
+#define CE_NAMEMASK  (0x0fff)
+
+/*
+ * Index File I/O
+ */
+
+/*
+ * dev/ino/uid/gid/size are also just tracked to the low 32 bits
+ * Again - this is just a (very strong in practice) heuristic that
+ * the inode hasn't changed.
+ *
+ * We save the fields in big-endian order to allow using the
+ * index file over NFS transparently.
+ */
+struct ondisk_cache_entry {
+   struct cache_time ctime;
+   struct cache_time mtime;
+   uint32_t

[PATCH v3 03/24] read-cache: split index file version specific functionality

2013-08-18 Thread Thomas Gummerer
Split index file version specific functionality to their own functions,
to prepare for moving the index file version specific parts to their own
file.  This makes it easier to add a new index file format later.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 read-cache.c | 114 ++-
 1 file changed, 74 insertions(+), 40 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index 0df5b31..de0bbcd 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1269,10 +1269,8 @@ struct ondisk_cache_entry_extended {
ondisk_cache_entry_extended_size(ce_namelen(ce)) : \
ondisk_cache_entry_size(ce_namelen(ce)))
 
-static int verify_hdr(struct cache_header *hdr, unsigned long size)
+static int verify_hdr_version(struct cache_header *hdr, unsigned long size)
 {
-   git_SHA_CTX c;
-   unsigned char sha1[20];
int hdr_version;
 
if (hdr-hdr_signature != htonl(CACHE_SIGNATURE))
@@ -1280,10 +1278,21 @@ static int verify_hdr(struct cache_header *hdr, 
unsigned long size)
hdr_version = ntohl(hdr-hdr_version);
if (hdr_version  INDEX_FORMAT_LB || INDEX_FORMAT_UB  hdr_version)
return error(bad index version %d, hdr_version);
+   return 0;
+}
+
+static int verify_hdr(void *mmap, unsigned long size)
+{
+   git_SHA_CTX c;
+   unsigned char sha1[20];
+
+   if (size  sizeof(struct cache_header) + 20)
+   die(index file smaller than expected);
+
git_SHA1_Init(c);
-   git_SHA1_Update(c, hdr, size - 20);
+   git_SHA1_Update(c, mmap, size - 20);
git_SHA1_Final(sha1, c);
-   if (hashcmp(sha1, (unsigned char *)hdr + size - 20))
+   if (hashcmp(sha1, (unsigned char *)mmap + size - 20))
return error(bad index file sha1 signature);
return 0;
 }
@@ -1425,44 +1434,14 @@ static struct cache_entry *create_from_disk(struct 
ondisk_cache_entry *ondisk,
return ce;
 }
 
-/* remember to discard_cache() before reading a different cache! */
-int read_index_from(struct index_state *istate, const char *path)
+static int read_index_v2(struct index_state *istate, void *mmap, unsigned long 
mmap_size)
 {
-   int fd, i;
-   struct stat st;
+   int i;
unsigned long src_offset;
struct cache_header *hdr;
-   void *mmap;
-   size_t mmap_size;
struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 
-   if (istate-initialized)
-   return istate-cache_nr;
-
-   istate-timestamp.sec = 0;
-   istate-timestamp.nsec = 0;
-   fd = open(path, O_RDONLY);
-   if (fd  0) {
-   if (errno == ENOENT)
-   return 0;
-   die_errno(index file open failed);
-   }
-
-   if (fstat(fd, st))
-   die_errno(cannot stat the open index);
-
-   mmap_size = xsize_t(st.st_size);
-   if (mmap_size  sizeof(struct cache_header) + 20)
-   die(index file smaller than expected);
-
-   mmap = xmmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 
0);
-   if (mmap == MAP_FAILED)
-   die_errno(unable to map index file);
-   close(fd);
-
hdr = mmap;
-   if (verify_hdr(hdr, mmap_size)  0)
-   goto unmap;
 
istate-version = ntohl(hdr-hdr_version);
istate-cache_nr = ntohl(hdr-hdr_entries);
@@ -1488,8 +1467,6 @@ int read_index_from(struct index_state *istate, const 
char *path)
src_offset += consumed;
}
strbuf_release(previous_name_buf);
-   istate-timestamp.sec = st.st_mtime;
-   istate-timestamp.nsec = ST_MTIME_NSEC(st);
 
while (src_offset = mmap_size - 20 - 8) {
/* After an array of active_nr index entries,
@@ -1509,6 +1486,58 @@ int read_index_from(struct index_state *istate, const 
char *path)
src_offset += 8;
src_offset += extsize;
}
+   return 0;
+unmap:
+   munmap(mmap, mmap_size);
+   die(index file corrupt);
+}
+
+/* remember to discard_cache() before reading a different cache! */
+int read_index_from(struct index_state *istate, const char *path)
+{
+   int fd;
+   struct stat st;
+   struct cache_header *hdr;
+   void *mmap;
+   size_t mmap_size;
+
+   errno = EBUSY;
+   if (istate-initialized)
+   return istate-cache_nr;
+
+   errno = ENOENT;
+   istate-timestamp.sec = 0;
+   istate-timestamp.nsec = 0;
+   fd = open(path, O_RDONLY);
+   if (fd  0) {
+   if (errno == ENOENT)
+   return 0;
+   die_errno(index file open failed);
+   }
+
+   if (fstat(fd, st))
+   die_errno(cannot stat the open index);
+
+   errno = EINVAL;
+   mmap_size = xsize_t(st.st_size);
+   if (mmap_size  sizeof(struct cache_header) + 20)
+   die(index file smaller than

[PATCH v3 04/24] read-cache: clear version in discard_index()

2013-08-18 Thread Thomas Gummerer
All fields except index_state-version are reset in discard_index.
Reset the version too.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 read-cache.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/read-cache.c b/read-cache.c
index de0bbcd..1e22f6f 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1558,6 +1558,7 @@ int discard_index(struct index_state *istate)
for (i = 0; i  istate-cache_nr; i++)
free(istate-cache[i]);
resolve_undo_clear_index(istate);
+   istate-version = 0;
istate-cache_nr = 0;
istate-cache_changed = 0;
istate-timestamp.sec = 0;
-- 
1.8.3.4.1231.g9fbf354.dirty

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 01/24] t2104: Don't fail for index versions other than [23]

2013-08-18 Thread Thomas Gummerer
t2104 currently checks for the exact index version 2 or 3,
depending if there is a skip-worktree flag or not. Other
index versions do not use extended flags and thus cannot
be tested for version changes.

Make this test update the index to version 2 at the beginning
of the test. Testing the skip-worktree flags for the default
index format is still covered by t7011 and t7012.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 t/t2104-update-index-skip-worktree.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/t/t2104-update-index-skip-worktree.sh 
b/t/t2104-update-index-skip-worktree.sh
index 1d0879b..bd9644f 100755
--- a/t/t2104-update-index-skip-worktree.sh
+++ b/t/t2104-update-index-skip-worktree.sh
@@ -22,6 +22,7 @@ H sub/2
 EOF
 
 test_expect_success 'setup' '
+   git update-index --index-version=2 
mkdir sub 
touch ./1 ./2 sub/1 sub/2 
git add 1 2 sub/1 sub/2 
-- 
1.8.3.4.1231.g9fbf354.dirty

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 02/24] read-cache: use fixed width integer types

2013-08-18 Thread Thomas Gummerer
Use the fixed width integer types uint16_t and uint32_t for ondisk
structures, because unsigned short and unsigned int do not hae a
guaranteed size.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 cache.h  | 10 +-
 read-cache.c | 30 +++---
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/cache.h b/cache.h
index bd6fb9f..9ef778a 100644
--- a/cache.h
+++ b/cache.h
@@ -101,9 +101,9 @@ unsigned long git_deflate_bound(git_zstream *, unsigned 
long);
 
 #define CACHE_SIGNATURE 0x44495243 /* DIRC */
 struct cache_header {
-   unsigned int hdr_signature;
-   unsigned int hdr_version;
-   unsigned int hdr_entries;
+   uint32_t hdr_signature;
+   uint32_t hdr_version;
+   uint32_t hdr_entries;
 };
 
 #define INDEX_FORMAT_LB 2
@@ -115,8 +115,8 @@ struct cache_header {
  * check it for equality in the 32 bits we save.
  */
 struct cache_time {
-   unsigned int sec;
-   unsigned int nsec;
+   uint32_t sec;
+   uint32_t nsec;
 };
 
 struct stat_data {
diff --git a/read-cache.c b/read-cache.c
index ceaf207..0df5b31 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1230,14 +1230,14 @@ static struct cache_entry *refresh_cache_entry(struct 
cache_entry *ce, int reall
 struct ondisk_cache_entry {
struct cache_time ctime;
struct cache_time mtime;
-   unsigned int dev;
-   unsigned int ino;
-   unsigned int mode;
-   unsigned int uid;
-   unsigned int gid;
-   unsigned int size;
+   uint32_t dev;
+   uint32_t ino;
+   uint32_t mode;
+   uint32_t uid;
+   uint32_t gid;
+   uint32_t size;
unsigned char sha1[20];
-   unsigned short flags;
+   uint16_t flags;
char name[FLEX_ARRAY]; /* more */
 };
 
@@ -1249,15 +1249,15 @@ struct ondisk_cache_entry {
 struct ondisk_cache_entry_extended {
struct cache_time ctime;
struct cache_time mtime;
-   unsigned int dev;
-   unsigned int ino;
-   unsigned int mode;
-   unsigned int uid;
-   unsigned int gid;
-   unsigned int size;
+   uint32_t dev;
+   uint32_t ino;
+   uint32_t mode;
+   uint32_t uid;
+   uint32_t gid;
+   uint32_t size;
unsigned char sha1[20];
-   unsigned short flags;
-   unsigned short flags2;
+   uint16_t flags;
+   uint16_t flags2;
char name[FLEX_ARRAY]; /* more */
 };
 
-- 
1.8.3.4.1231.g9fbf354.dirty

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 23/24] introduce GIT_INDEX_VERSION environment variable

2013-08-18 Thread Thomas Gummerer
Respect a GIT_INDEX_VERSION environment variable, when a new index is
initialized.  Setting the environment variable will not cause existing
index files to be converted to another format for additional safety.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 read-cache.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index 1d9b615..f820d8a 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1235,8 +1235,13 @@ static struct cache_entry *refresh_cache_entry(struct 
cache_entry *ce, int reall
 void initialize_index(struct index_state *istate, int version)
 {
istate-initialized = 1;
-   if (!version)
-   version = INDEX_FORMAT_DEFAULT;
+   if (!version) {
+   char *envversion = getenv(GIT_INDEX_VERSION);
+   if (!envversion)
+   version = INDEX_FORMAT_DEFAULT;
+   else
+   version = atoi(envversion);
+   }
istate-version = version;
set_istate_ops(istate);
 }
-- 
1.8.3.4.1231.g9fbf354.dirty

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 18/24] read-cache: write index-v5

2013-08-18 Thread Thomas Gummerer
Write the index version 5 file format to disk. This version doesn't
write the cache-tree data and resolve-undo data to the file.

The main work is done when filtering out the directories from the
current in-memory format, where in the same turn also the conflicts
and the file data is calculated.

Helped-by: Nguyen Thai Ngoc Duy pclo...@gmail.com
Helped-by: Thomas Rast tr...@student.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 cache.h |   2 +
 read-cache-v5.c | 596 +++-
 read-cache.c|   4 +-
 read-cache.h|   1 +
 4 files changed, 601 insertions(+), 2 deletions(-)

diff --git a/cache.h b/cache.h
index 89f556b..a109f35 100644
--- a/cache.h
+++ b/cache.h
@@ -138,6 +138,7 @@ struct cache_entry {
unsigned char sha1[20];
uint32_t ce_stat_crc;
struct cache_entry *next; /* used by name_hash */
+   struct cache_entry *next_ce;
char name[FLEX_ARRAY]; /* more */
 };
 
@@ -532,6 +533,7 @@ extern int unmerged_index(const struct index_state *);
 extern int verify_path(const char *path);
 extern struct cache_entry *index_name_exists(struct index_state *istate, const 
char *name, int namelen, int igncase);
 extern int index_name_pos(const struct index_state *, const char *name, int 
namelen);
+extern struct directory_entry *init_directory_entry(char *pathname, int len);
 #define ADD_CACHE_OK_TO_ADD 1  /* Ok to add */
 #define ADD_CACHE_OK_TO_REPLACE 2  /* Ok to replace file/directory */
 #define ADD_CACHE_SKIP_DFCHECK 4   /* Ok to skip DF conflict checks */
diff --git a/read-cache-v5.c b/read-cache-v5.c
index b14505a..85b912b 100644
--- a/read-cache-v5.c
+++ b/read-cache-v5.c
@@ -673,9 +673,603 @@ static int read_index_v5(struct index_state *istate, void 
*mmap,
return 0;
 }
 
+#define WRITE_BUFFER_SIZE 8192
+static unsigned char write_buffer[WRITE_BUFFER_SIZE];
+static unsigned long write_buffer_len;
+
+static int ce_write_flush(int fd)
+{
+   unsigned int buffered = write_buffer_len;
+   if (buffered) {
+   if (write_in_full(fd, write_buffer, buffered) != buffered)
+   return -1;
+   write_buffer_len = 0;
+   }
+   return 0;
+}
+
+static int ce_write(uint32_t *crc, int fd, void *data, unsigned int len)
+{
+   if (crc)
+   *crc = crc32(*crc, (Bytef*)data, len);
+   while (len) {
+   unsigned int buffered = write_buffer_len;
+   unsigned int partial = WRITE_BUFFER_SIZE - buffered;
+   if (partial  len)
+   partial = len;
+   memcpy(write_buffer + buffered, data, partial);
+   buffered += partial;
+   if (buffered == WRITE_BUFFER_SIZE) {
+   write_buffer_len = buffered;
+   if (ce_write_flush(fd))
+   return -1;
+   buffered = 0;
+   }
+   write_buffer_len = buffered;
+   len -= partial;
+   data = (char *) data + partial;
+   }
+   return 0;
+}
+
+static int ce_flush(int fd)
+{
+   unsigned int left = write_buffer_len;
+
+   if (left)
+   write_buffer_len = 0;
+
+   if (write_in_full(fd, write_buffer, left) != left)
+   return -1;
+
+   return 0;
+}
+
+static void ce_smudge_racily_clean_entry(struct cache_entry *ce)
+{
+   /*
+* This method shall only be called if the timestamp of ce
+* is racy (check with is_racy_timestamp). If the timestamp
+* is racy, the writer will set the CE_SMUDGED flag.
+*
+* The reader (match_stat_basic) will then take care
+* of checking if the entry is really changed or not, by
+* taking into account the size and the stat_crc and if
+* that hasn't changed checking the sha1.
+*/
+   ce-ce_flags |= CE_SMUDGED;
+}
+
+char *super_directory(const char *filename)
+{
+   char *slash;
+
+   slash = strrchr(filename, '/');
+   if (slash)
+   return xmemdupz(filename, slash-filename);
+   return NULL;
+}
+
+struct directory_entry *init_directory_entry(char *pathname, int len)
+{
+   struct directory_entry *de = xmalloc(directory_entry_size(len));
+
+   memcpy(de-pathname, pathname, len);
+   de-pathname[len] = '\0';
+   de-de_flags  = 0;
+   de-de_foffset= 0;
+   de-de_cr = 0;
+   de-de_ncr= 0;
+   de-de_nsubtrees  = 0;
+   de-de_nfiles = 0;
+   de-de_nentries   = 0;
+   memset(de-sha1, 0, 20);
+   de-de_pathlen= len;
+   de-next  = NULL;
+   de-next_hash = NULL;
+   de-ce= NULL;
+   de-ce_last   = NULL;
+   de-conflict  = NULL;
+   de-conflict_last = NULL;
+   de-conflict_size = 0;
+   return de;
+}
+
+static void ondisk_from_directory_entry(struct

[PATCH v3 22/24] p0003-index.sh: add perf test for the index formats

2013-08-18 Thread Thomas Gummerer
From: Thomas Rast tr...@inf.ethz.ch

Add a performance test for index version [23]/4/5 by using
git update-index --index-version=x, thus testing both the reader
and the writer speed of all index formats.

Signed-off-by: Thomas Rast tr...@inf.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 t/perf/p0003-index.sh | 63 +++
 1 file changed, 63 insertions(+)
 create mode 100755 t/perf/p0003-index.sh

diff --git a/t/perf/p0003-index.sh b/t/perf/p0003-index.sh
new file mode 100755
index 000..5360175
--- /dev/null
+++ b/t/perf/p0003-index.sh
@@ -0,0 +1,63 @@
+#!/bin/sh
+
+test_description=Tests index versions [23]/4/5
+
+. ./perf-lib.sh
+
+test_perf_large_repo
+
+test_expect_success convert to v3 
+   git update-index --index-version=2
+
+
+test_perf v[23]: update-index 
+   git update-index --index-version=2 /dev/null
+
+
+subdir=$(git ls-files | sed 's#/[^/]*$##' | grep -v '^$' | uniq | tail -n 30 | 
head -1)
+
+test_perf v[23]: grep nonexistent -- subdir 
+   test_must_fail git grep nonexistent -- $subdir /dev/null
+
+
+test_perf v[23]: ls-files -- subdir 
+   git ls-files $subdir /dev/null
+
+
+test_expect_success convert to v4 
+   git update-index --index-version=4
+
+
+test_perf v4: update-index 
+   git update-index --index-version=4 /dev/null
+
+
+test_perf v4: grep nonexistent -- subdir 
+   test_must_fail git grep nonexistent -- $subdir /dev/null
+
+
+test_perf v4: ls-files -- subdir 
+   git ls-files $subdir /dev/null
+
+
+test_expect_success convert to v5 
+   git update-index --index-version=5
+
+
+test_perf v5: update-index 
+   git update-index --index-version=5 /dev/null
+
+
+test_perf v5: ls-files 
+   git ls-files /dev/null
+
+
+test_perf v5: grep nonexistent -- subdir 
+   test_must_fail git grep nonexistent -- $subdir /dev/null
+
+
+test_perf v5: ls-files -- subdir 
+   git ls-files $subdir /dev/null
+
+
+test_done
-- 
1.8.3.4.1231.g9fbf354.dirty

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 10/24] make sure partially read index is not changed

2013-08-18 Thread Thomas Gummerer
A partially read index file currently cannot be written to disk.  Make
sure that never happens, by erroring out when a caller tries to write a
partially read index.  Do the same when trying to re-read a partially
read index without having discarded it first to avoid loosing any
information.

Forcing the caller to load the right part of the index file instead of
re-reading it when changing it, gives a bit of a performance advantage,
by avoiding to read parts of the index twice.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 read-cache.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/read-cache.c b/read-cache.c
index 38b9a04..7a27f9b 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1332,6 +1332,8 @@ int read_index_filtered_from(struct index_state *istate, 
const char *path,
void *mmap;
size_t mmap_size;
 
+   if (istate-filter_opts)
+   die(BUG: Can't re-read partially read index);
errno = EBUSY;
if (istate-initialized)
return istate-cache_nr;
@@ -1455,6 +1457,8 @@ void update_index_if_able(struct index_state *istate, 
struct lock_file *lockfile
 
 int write_index(struct index_state *istate, int newfd)
 {
+   if (istate-filter_opts)
+   die(BUG: index: cannot write a partially read index);
return istate-ops-write_index(istate, newfd);
 }
 
-- 
1.8.3.4.1231.g9fbf354.dirty

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 07/24] read-cache: Re-read index if index file changed

2013-08-18 Thread Thomas Gummerer
Add the possibility of re-reading the index file, if it changed
while reading.

The index file might change during the read, causing outdated
information to be displayed. We check if the index file changed
by using its stat data as heuristic.

Helped-by: Ramsay Jones ram...@ramsay1.demon.co.uk
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 read-cache.c | 65 +++-
 1 file changed, 34 insertions(+), 31 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index aa17ce7..2d12601 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1296,8 +1296,8 @@ int read_index(struct index_state *istate)
 /* remember to discard_cache() before reading a different cache! */
 int read_index_from(struct index_state *istate, const char *path)
 {
-   int fd;
-   struct stat st;
+   int fd, err, i;
+   struct stat_validity sv;
struct cache_header *hdr;
void *mmap;
size_t mmap_size;
@@ -1309,43 +1309,46 @@ int read_index_from(struct index_state *istate, const 
char *path)
errno = ENOENT;
istate-timestamp.sec = 0;
istate-timestamp.nsec = 0;
-
-   fd = open(path, O_RDONLY);
-   if (fd  0) {
-   if (errno == ENOENT) {
-   initialize_index(istate, 0);
-   return 0;
+   sv.sd = NULL;
+   for (i = 0; i  50; i++) {
+   err = 0;
+   fd = open(path, O_RDONLY);
+   if (fd  0) {
+   if (errno == ENOENT) {
+   initialize_index(istate, 0);
+   return 0;
+   }
+   die_errno(index file open failed);
}
-   die_errno(index file open failed);
-   }
 
-   if (fstat(fd, st))
-   die_errno(cannot stat the open index);
+   stat_validity_update(sv, fd);
+   if (!sv.sd)
+   die_errno(cannot stat the open index);
 
-   errno = EINVAL;
-   mmap_size = xsize_t(st.st_size);
-   mmap = xmmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 
0);
-   close(fd);
-   if (mmap == MAP_FAILED)
-   die_errno(unable to map index file);
+   errno = EINVAL;
+   mmap_size = xsize_t(sv.sd-sd_size);
+   mmap = xmmap(NULL, mmap_size, PROT_READ | PROT_WRITE, 
MAP_PRIVATE, fd, 0);
+   close(fd);
+   if (mmap == MAP_FAILED)
+   die_errno(unable to map index file);
 
-   hdr = mmap;
-   if (verify_hdr_version(istate, hdr, mmap_size)  0)
-   goto unmap;
+   hdr = mmap;
+   if (verify_hdr_version(istate, hdr, mmap_size)  0)
+   err = 1;
 
-   if (istate-ops-verify_hdr(mmap, mmap_size)  0)
-   goto unmap;
+   if (istate-ops-verify_hdr(mmap, mmap_size)  0)
+   err = 1;
 
-   if (istate-ops-read_index(istate, mmap, mmap_size)  0)
-   goto unmap;
-   istate-timestamp.sec = st.st_mtime;
-   istate-timestamp.nsec = ST_MTIME_NSEC(st);
+   if (istate-ops-read_index(istate, mmap, mmap_size)  0)
+   err = 1;
+   istate-timestamp.sec = sv.sd-sd_mtime.sec;
+   istate-timestamp.nsec = sv.sd-sd_mtime.nsec;
 
-   munmap(mmap, mmap_size);
-   return istate-cache_nr;
+   munmap(mmap, mmap_size);
+   if (stat_validity_check(sv, path)  !err)
+   return istate-cache_nr;
+   }
 
-unmap:
-   munmap(mmap, mmap_size);
die(index file corrupt);
 }
 
-- 
1.8.3.4.1231.g9fbf354.dirty

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 13/24] documentation: add documentation of the index-v5 file format

2013-08-18 Thread Thomas Gummerer
Add a documentation of the index file format version 5 to
Documentation/technical.

Helped-by: Michael Haggerty mhag...@alum.mit.edu
Helped-by: Junio C Hamano gits...@pobox.com
Helped-by: Thomas Rast tr...@student.ethz.ch
Helped-by: Nguyen Thai Ngoc Duy pclo...@gmail.com
Helped-by: Robin Rosenberg robin.rosenb...@dewire.com
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 Documentation/technical/index-file-format-v5.txt | 301 +++
 1 file changed, 301 insertions(+)
 create mode 100644 Documentation/technical/index-file-format-v5.txt

diff --git a/Documentation/technical/index-file-format-v5.txt 
b/Documentation/technical/index-file-format-v5.txt
new file mode 100644
index 000..5209c02
--- /dev/null
+++ b/Documentation/technical/index-file-format-v5.txt
@@ -0,0 +1,301 @@
+GIT index format
+
+
+== The git index
+
+   The git index file (.git/index) documents the status of the files
+ in the git staging area.
+
+   The staging area is used for preparing commits, merging, etc.
+
+== The git index file format
+
+   All binary numbers are in network byte order. Version 5 is described
+ here. The index file consists of various sections. They appear in
+ the following order in the file.
+
+   - header: the description of the index format, including it's signature,
+ version and various other fields that are used internally.
+
+   - diroffsets (ndir entries of direcotry offset): A 4-byte offset
+   relative to the beginning of the direntries block (see below)
+   for each of the ndir directories in the index, sorted by pathname
+   (of the directory it's pointing to). [1]
+
+   - direntries (ndir entries of directory offset): A directory entry
+   for each of the ndir directories in the index, sorted by pathname
+   (see below). [2]
+
+   - fileoffsets (nfile entries of file offset): A 4-byte offset
+   relative to the beginning of the fileentries block (see below)
+   for each of the nfile files in the index. [1]
+
+   - fileentries (nfile entries of file entry): A file entry for
+   each of the nfile files in the index (see below).
+
+   - crdata: A number of entries for conflicted data/resolved conflicts
+   (see below).
+
+   - Extensions (Currently none, see below in the future)
+
+ Extensions are identified by signature. Optional extensions can
+ be ignored if GIT does not understand them.
+
+ GIT supports an arbitrary number of extension, but currently none
+ is implemented. [3]
+
+ extsig (32-bits): extension signature. If the first byte is 'A'..'Z'
+ the extension is optional and can be ignored.
+
+ extsize (32-bits): size of the extension, excluding the header
+   (extsig, extsize, extchecksum).
+
+ extchecksum (32-bits): crc32 checksum of the extension signature
+   and size.
+
+- Extension data.
+
+== Header
+   sig (32-bits): Signature:
+ The signature is { 'D', 'I', 'R', 'C' } (stands for dircache)
+
+   vnr (32-bits): Version number:
+ The current supported versions are 2, 3, 4 and 5.
+
+   nfile (32-bits): number of file entries in the index.
+
+   ndir (32-bits): number of directories in the index.
+
+   fblockoffset (32-bits): offset to the file block, relative to the
+ beginning of the file.
+
+   - Offset to the extensions.
+
+ nextensions (32-bits): number of extensions.
+
+ extoffset (32-bits): offset to the extension. (Possibly none, as
+   many as indicated in the 4-byte number of extensions)
+
+   headercrc (32-bits): crc checksum including the header and the
+ offsets to the extensions.
+
+
+== Directory offsets (diroffsets)
+
+  diroffset (32-bits): offset to the directory relative to the
+beginning of the index file. There are ndir + 1 offsets in the
+diroffset table, the last is pointing to the end of the last
+direntry. With this last entry, we are able to replace the strlen
+of the directory name when reading the directory name, by
+calculating it from diroffset[n+1]-diroffset[n]-61.  61 is the
+size of the directory data, which follows each each directory +
+the crc sum + the NUL byte.
+
+  This part is needed for making the directory entries bisectable and
+thus allowing a binary search.
+
+== Directory entry (direntries)
+
+  Directory entries are sorted in lexicographic order by the name
+of their path starting with the root.
+
+  pathname (variable length, nul terminated): relative to top level
+directory (without the leading slash). '/' is used as path
+separator. A string of length 0 ('') indicates the root directory.
+The special path components ., and .. (without quotes) are
+disallowed. The path also includes a trailing slash. [9]
+
+  foffset (32-bits): offset to the lexicographically first file in
+the file offsets (fileoffsets), relative to the beginning of
+the fileoffset block.
+
+  cr (32-bits): offset to conflicted/resolved data at the end

[PATCH v3 19/24] read-cache: write index-v5 cache-tree data

2013-08-18 Thread Thomas Gummerer
Write the cache-tree data for the index version 5 file format. The
in-memory cache-tree data is converted to the ondisk format, by adding
it to the directory entries, that were compiled from the cache-entries
in the step before.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 read-cache-v5.c | 53 +
 1 file changed, 53 insertions(+)

diff --git a/read-cache-v5.c b/read-cache-v5.c
index 85b912b..ed52b7c 100644
--- a/read-cache-v5.c
+++ b/read-cache-v5.c
@@ -891,6 +891,57 @@ static struct conflict_entry 
*create_conflict_entry_from_ce(struct cache_entry *
return create_new_conflict(ce-name, ce_namelen(ce), pathlen);
 }
 
+static void convert_one_to_ondisk_v5(struct hash_table *table, struct 
cache_tree *it,
+   const char *path, int pathlen, uint32_t crc)
+{
+   int i;
+   struct directory_entry *found, *search;
+
+   crc = crc32(crc, (Bytef*)path, pathlen);
+   found = lookup_hash(crc, table);
+   search = found;
+   while (search  strcmp(path, search-pathname + search-de_pathlen - 
strlen(path)) != 0)
+   search = search-next_hash;
+   if (!search)
+   return;
+   /*
+* The number of subtrees is already calculated by
+* compile_directory_data, therefore we only need to
+* add the entry_count
+*/
+   search-de_nentries = it-entry_count;
+   if (0 = it-entry_count)
+   hashcpy(search-sha1, it-sha1);
+   if (strcmp(path, ) != 0)
+   crc = crc32(crc, (Bytef*)/, 1);
+
+#if DEBUG
+   if (0 = it-entry_count)
+   fprintf(stderr, cache-tree %.*s (%d ent, %d subtree) %s\n,
+   pathlen, path, it-entry_count, it-subtree_nr,
+   sha1_to_hex(it-sha1));
+   else
+   fprintf(stderr, cache-tree %.*s (%d subtree) invalid\n,
+   pathlen, path, it-subtree_nr);
+#endif
+
+   for (i = 0; i  it-subtree_nr; i++) {
+   struct cache_tree_sub *down = it-down[i];
+   if (i) {
+   struct cache_tree_sub *prev = it-down[i-1];
+   if (subtree_name_cmp(down-name, down-namelen,
+prev-name, prev-namelen) = 0)
+   die(fatal - unsorted cache subtree);
+   }
+   convert_one_to_ondisk_v5(table, down-cache_tree, down-name, 
down-namelen, crc);
+   }
+}
+
+static void cache_tree_to_ondisk_v5(struct hash_table *table, struct 
cache_tree *root)
+{
+   convert_one_to_ondisk_v5(table, root, , 0, 0);
+}
+
 static void ce_queue_push(struct cache_entry **head,
  struct cache_entry **tail,
  struct cache_entry *ce)
@@ -961,6 +1012,8 @@ static struct directory_entry 
*compile_directory_data(struct index_state *istate
add_part_to_conflict_entry(search, conflict_entry, 
conflict_part);
}
}
+   if (istate-cache_tree)
+   cache_tree_to_ondisk_v5(table, istate-cache_tree);
return de;
 }
 
-- 
1.8.3.4.1231.g9fbf354.dirty

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 15/24] read-cache: read index-v5

2013-08-18 Thread Thomas Gummerer
Make git read the index file version 5 without complaining.

This version of the reader doesn't read neither the cache-tree
nor the resolve undo data, but doesn't choke on an index that
includes such data.

Helped-by: Junio C Hamano gits...@pobox.com
Helped-by: Nguyen Thai Ngoc Duy pclo...@gmail.com
Helped-by: Thomas Rast tr...@student.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 Makefile|   1 +
 cache.h |  32 +++-
 read-cache-v5.c | 473 
 read-cache.h|   1 +
 4 files changed, 506 insertions(+), 1 deletion(-)
 create mode 100644 read-cache-v5.c

diff --git a/Makefile b/Makefile
index afae23e..a55206d 100644
--- a/Makefile
+++ b/Makefile
@@ -857,6 +857,7 @@ LIB_OBJS += quote.o
 LIB_OBJS += reachable.o
 LIB_OBJS += read-cache.o
 LIB_OBJS += read-cache-v2.o
+LIB_OBJS += read-cache-v5.o
 LIB_OBJS += reflog-walk.o
 LIB_OBJS += refs.o
 LIB_OBJS += remote.o
diff --git a/cache.h b/cache.h
index 714a334..89f556b 100644
--- a/cache.h
+++ b/cache.h
@@ -99,7 +99,7 @@ unsigned long git_deflate_bound(git_zstream *, unsigned long);
 #define CACHE_SIGNATURE 0x44495243 /* DIRC */
 
 #define INDEX_FORMAT_LB 2
-#define INDEX_FORMAT_UB 4
+#define INDEX_FORMAT_UB 5
 
 /*
  * The cache_time is just the low 32 bits of the
@@ -121,6 +121,15 @@ struct stat_data {
unsigned int sd_size;
 };
 
+/*
+ * The *next pointer is used in read_entries_v5 for holding
+ * all the elements of a directory, and points to the next
+ * cache_entry in a directory.
+ *
+ * It is reset by the add_name_hash call in set_index_entry
+ * to set it to point to the next cache_entry in the
+ * correct in-memory format ordering.
+ */
 struct cache_entry {
struct stat_data ce_stat_data;
unsigned int ce_mode;
@@ -132,11 +141,17 @@ struct cache_entry {
char name[FLEX_ARRAY]; /* more */
 };
 
+#define CE_NAMEMASK  (0x0fff)
 #define CE_STAGEMASK (0x3000)
 #define CE_EXTENDED  (0x4000)
 #define CE_VALID (0x8000)
+#define CE_SMUDGED   (0x0400) /* index v5 only flag */
 #define CE_STAGESHIFT 12
 
+#define CONFLICT_CONFLICTED (0x8000)
+#define CONFLICT_STAGESHIFT 13
+#define CONFLICT_STAGEMASK (0x6000)
+
 /*
  * Range 0x in ce_flags is divided into
  * two parts: in-memory flags and on-disk ones.
@@ -173,6 +188,19 @@ struct cache_entry {
 #define CE_EXTENDED_FLAGS (CE_INTENT_TO_ADD | CE_SKIP_WORKTREE)
 
 /*
+ * Representation of the extended on-disk flags in the v5 format.
+ * They must not collide with the ordinary on-disk flags, and need to
+ * fit in 16 bits.  Note however that v5 does not save the name
+ * length.
+ */
+#define CE_INTENT_TO_ADD_V5  (0x4000)
+#define CE_SKIP_WORKTREE_V5  (0x0800)
+#define CE_INVALID_V5(0x0200)
+#if (CE_VALID|CE_STAGEMASK)  
(CE_INTENTTOADD_V5|CE_SKIPWORKTREE_V5|CE_INVALID_V5)
+#error v5 on-disk flags collide with ordinary on-disk flags
+#endif
+
+/*
  * Safeguard to avoid saving wrong flags:
  *  - CE_EXTENDED2 won't get saved until its semantic is known
  *  - Bits in 0x have been saved in ce_flags already
@@ -213,6 +241,8 @@ static inline unsigned create_ce_flags(unsigned stage)
 #define ce_skip_worktree(ce) ((ce)-ce_flags  CE_SKIP_WORKTREE)
 #define ce_mark_uptodate(ce) ((ce)-ce_flags |= CE_UPTODATE)
 
+#define conflict_stage(c) ((CONFLICT_STAGEMASK  (c)-flags)  
CONFLICT_STAGESHIFT)
+
 #define ce_permissions(mode) (((mode)  0100) ? 0755 : 0644)
 static inline unsigned int create_ce_mode(unsigned int mode)
 {
diff --git a/read-cache-v5.c b/read-cache-v5.c
new file mode 100644
index 000..799b8e7
--- /dev/null
+++ b/read-cache-v5.c
@@ -0,0 +1,473 @@
+#include cache.h
+#include read-cache.h
+#include resolve-undo.h
+#include cache-tree.h
+#include dir.h
+#include pathspec.h
+
+#define ptr_add(x,y) ((void *)(((char *)(x)) + (y)))
+
+struct cache_header_v5 {
+   uint32_t hdr_ndir;
+   uint32_t hdr_fblockoffset;
+   uint32_t hdr_nextension;
+};
+
+struct directory_entry {
+   struct directory_entry **sub;
+   struct directory_entry *next;
+   struct directory_entry *next_hash;
+   struct cache_entry *ce;
+   struct cache_entry *ce_last;
+   struct conflict_entry *conflict;
+   struct conflict_entry *conflict_last;
+   uint32_t conflict_size;
+   uint32_t de_foffset;
+   uint32_t de_cr;
+   uint32_t de_ncr;
+   uint32_t de_nsubtrees;
+   uint32_t de_nfiles;
+   uint32_t de_nentries;
+   unsigned char sha1[20];
+   uint16_t de_flags;
+   uint32_t de_pathlen;
+   char pathname[FLEX_ARRAY];
+};
+
+struct conflict_part {
+   struct conflict_part *next;
+   uint16_t flags;
+   uint16_t entry_mode;
+   unsigned char sha1[20];
+};
+
+struct conflict_entry {
+   struct conflict_entry *next;
+   uint32_t nfileconflicts;
+   struct conflict_part *entries;
+   uint32_t namelen;
+   uint32_t pathlen;
+   char name[FLEX_ARRAY];
+};
+
+#define directory_entry_size

[PATCH v3 20/24] read-cache: write resolve-undo data for index-v5

2013-08-18 Thread Thomas Gummerer
Make git read the resolve-undo data from the index.

Since the resolve-undo data is joined with the conflicts in
the ondisk format of the index file version 5, conflicts and
resolved data is read at the same time, and the resolve-undo
data is then converted to the in-memory format.

Helped-by: Thomas Rast tr...@student.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 read-cache-v5.c | 89 +
 1 file changed, 89 insertions(+)

diff --git a/read-cache-v5.c b/read-cache-v5.c
index ed52b7c..10960fd 100644
--- a/read-cache-v5.c
+++ b/read-cache-v5.c
@@ -942,6 +942,94 @@ static void cache_tree_to_ondisk_v5(struct hash_table 
*table, struct cache_tree
convert_one_to_ondisk_v5(table, root, , 0, 0);
 }
 
+static void resolve_undo_to_ondisk_v5(struct hash_table *table,
+ struct string_list *resolve_undo,
+ unsigned int *ndir,
+ unsigned int *total_dir_len,
+ struct directory_entry *de)
+{
+   struct string_list_item *item;
+   struct directory_entry *search;
+
+   if (!resolve_undo)
+   return;
+   for_each_string_list_item(item, resolve_undo) {
+   struct conflict_entry *conflict_entry;
+   struct resolve_undo_info *ui = item-util;
+   char *super;
+   int i, dir_len, len;
+   uint32_t crc;
+   struct directory_entry *found, *current, *new_tree;
+
+   if (!ui)
+   continue;
+
+   super = super_directory(item-string);
+   dir_len = super ? strlen(super) : 0;
+   crc = crc32(0, (Bytef*)super, dir_len);
+   found = lookup_hash(crc, table);
+   current = NULL;
+   new_tree = NULL;
+
+   while (!found) {
+   struct directory_entry *new;
+
+   new = init_directory_entry(super, dir_len);
+   if (!current)
+   current = new;
+   insert_directory_entry(new, table, total_dir_len, ndir, 
crc);
+   if (new_tree != NULL)
+   new-de_nsubtrees = 1;
+   new-next = new_tree;
+   new_tree = new;
+   super = super_directory(super);
+   dir_len = super ? strlen(super) : 0;
+   crc = crc32(0, (Bytef*)super, dir_len);
+   found = lookup_hash(crc, table);
+   }
+   search = found;
+   while (search-next_hash  strcmp(super, search-pathname) != 
0)
+   search = search-next_hash;
+   if (search  !current)
+   current = search;
+   if (!search  !current)
+   current = new_tree;
+   if (!super  new_tree) {
+   new_tree-next = de-next;
+   de-next = new_tree;
+   de-de_nsubtrees++;
+   } else if (new_tree) {
+   struct directory_entry *temp;
+
+   search = de-next;
+   while (strcmp(super, search-pathname))
+   search = search-next;
+   temp = new_tree;
+   while (temp-next)
+   temp = temp-next;
+   search-de_nsubtrees++;
+   temp-next = search-next;
+   search-next = new_tree;
+   }
+
+   len = strlen(item-string);
+   conflict_entry = create_new_conflict(item-string, len, 
current-de_pathlen);
+   add_conflict_to_directory_entry(current, conflict_entry);
+   for (i = 0; i  3; i++) {
+   if (ui-mode[i]) {
+   struct conflict_part *cp;
+
+   cp = xmalloc(sizeof(struct conflict_part));
+   cp-flags = (i + 1)  CONFLICT_STAGESHIFT;
+   cp-entry_mode = ui-mode[i];
+   cp-next = NULL;
+   hashcpy(cp-sha1, ui-sha1[i]);
+   add_part_to_conflict_entry(current, 
conflict_entry, cp);
+   }
+   }
+   }
+}
+
 static void ce_queue_push(struct cache_entry **head,
  struct cache_entry **tail,
  struct cache_entry *ce)
@@ -1014,6 +1102,7 @@ static struct directory_entry 
*compile_directory_data(struct index_state *istate
}
if (istate-cache_tree)
cache_tree_to_ondisk_v5(table, istate-cache_tree);
+   resolve_undo_to_ondisk_v5(table, istate-resolve_undo, ndir

[PATCH v3 08/24] add documentation for the index api

2013-08-18 Thread Thomas Gummerer
Add documentation for the index reading api.  This also includes
documentation for the new api functions introduced in the next patch.

Helped-by: Nguyễn Thái Ngọc Duy pclo...@gmail.com
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 Documentation/technical/api-in-core-index.txt | 54 +--
 1 file changed, 50 insertions(+), 4 deletions(-)

diff --git a/Documentation/technical/api-in-core-index.txt 
b/Documentation/technical/api-in-core-index.txt
index adbdbf5..9b8c37c 100644
--- a/Documentation/technical/api-in-core-index.txt
+++ b/Documentation/technical/api-in-core-index.txt
@@ -1,14 +1,60 @@
 in-core index API
 =
 
+Reading API
+---
+
+`cache`::
+
+   An array of cache entries.  This is used to access the cache
+   entries directly.  Use `index_name_pos` to search for the
+   index of a specific cache entry.
+
+`read_index_filtered`::
+
+   Read a part of the index, filtered by the pathspec given in
+   the opts.  The function may load more than necessary, so the
+   caller still responsible to apply filters appropriately.  The
+   filtering is only done for performance reasons, as it's
+   possible to only read part of the index when the on-disk
+   format is index-v5.
+
+   To iterate only over the entries that match the pathspec, use
+   the for_each_index_entry function.
+
+`read_index`::
+
+   Read the whole index file from disk.
+
+`index_name_pos`::
+
+   Find a cache_entry with name in the index.  Returns pos if an
+   entry is matched exactly and -1-pos if an entry is matched
+   partially.
+   e.g.
+   index:
+   file1
+   file2
+   path/file1
+   zzz
+
+   index_name_pos(path/file1, 10) returns 2, while
+   index_name_pos(path, 4) returns -3
+
+`for_each_index_entry`::
+
+   Iterates over all cache_entries in the index filtered by
+   filter_opts in the index_state.  For each cache entry fn is
+   executed with cb_data as callback data.  From within the loop
+   do `return 0` to continue, or `return 1` to break the loop.
+
+TODO
+
 Talk about read-cache.c and cache-tree.c, things like:
 
-* cache - the_index macros
-* read_index()
 * write_index()
 * ie_match_stat() and ie_modified(); how they are different and when to
   use which.
-* index_name_pos()
 * remove_index_entry_at()
 * remove_file_from_index()
 * add_file_to_index()
@@ -18,4 +64,4 @@ Talk about read-cache.c and cache-tree.c, things like:
 * cache_tree_invalidate_path()
 * cache_tree_update()
 
-(JC, Linus)
+(JC, Linus, Thomas Gummerer)
-- 
1.8.3.4.1231.g9fbf354.dirty

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 10/24] make sure partially read index is not changed

2013-08-20 Thread Thomas Gummerer
Eric Sunshine sunsh...@sunshineco.com writes:

 On Sun, Aug 18, 2013 at 3:41 PM, Thomas Gummerer t.gumme...@gmail.com wrote:
 A partially read index file currently cannot be written to disk.  Make
 sure that never happens, by erroring out when a caller tries to write a

 s/,//

 partially read index.  Do the same when trying to re-read a partially
 read index without having discarded it first to avoid loosing any

 s/loosing/losing/

 information.

 Forcing the caller to load the right part of the index file instead of
 re-reading it when changing it, gives a bit of a performance advantage,

 s/it,/it/  (or s/file instead/file, instead/)
 s/advantage,/advantage/

 by avoiding to read parts of the index twice.

 /to read/reading/

 More below...


 Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
 ---
  read-cache.c | 4 
  1 file changed, 4 insertions(+)

 diff --git a/read-cache.c b/read-cache.c
 index 38b9a04..7a27f9b 100644
 --- a/read-cache.c
 +++ b/read-cache.c
 @@ -1332,6 +1332,8 @@ int read_index_filtered_from(struct index_state 
 *istate, const char *path,
 void *mmap;
 size_t mmap_size;

 +   if (istate-filter_opts)
 +   die(BUG: Can't re-read partially read index);
 errno = EBUSY;
 if (istate-initialized)
 return istate-cache_nr;
 @@ -1455,6 +1457,8 @@ void update_index_if_able(struct index_state *istate, 
 struct lock_file *lockfile

  int write_index(struct index_state *istate, int newfd)
  {
 +   if (istate-filter_opts)
 +   die(BUG: index: cannot write a partially read index);

 Consistency nit:

 In the preceding hunk, the error message starts BUG: Can't..., but
 in this hunk we have BUG: index: cannot

 So, BUG: is the prefix of one, but BUG: index: is the prefix of the other.

 Spelling difference: Can't vs. cannot.

 Capitalization difference: Can't vs. cannot.

Thanks for catching this.  From quick grepping it seems the preferred
version seems to be the one with only BUG: as prefix and starting with
a lower case letter after this.  Both can't and cannot are used in the
codebase, but cannot seems to be used more often.  I'll use that.

Will fix this and the rest of the style/spelling/grammar fixes you
suggested.  Thanks.

 return istate-ops-write_index(istate, newfd);
  }

 --
 1.8.3.4.1231.g9fbf354.dirty

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 15/24] read-cache: read index-v5

2013-08-20 Thread Thomas Gummerer
Duy Nguyen pclo...@gmail.com writes:

 General comment: a short comment before each function describing what
 the function does would be helpful. This only applies for complex
 functions (read_* ones). Of course verify_hdr does not require extra
 explanantion.

Yes, makes sense, I'll do that in the re-roll.

  On Mon, Aug 19, 2013 at 2:42 AM, Thomas Gummerer t.gumme...@gmail.com 
 wrote:
 +static struct directory_entry *directory_entry_from_ondisk(struct 
 ondisk_directory_entry *ondisk,
 +  const char *name,
 +  size_t len)
 +{
 +   struct directory_entry *de = xmalloc(directory_entry_size(len));
 +
 +   memcpy(de-pathname, name, len);
 +   de-pathname[len] = '\0';
 +   de-de_flags  = ntoh_s(ondisk-flags);
 +   de-de_foffset= ntoh_l(ondisk-foffset);
 +   de-de_cr = ntoh_l(ondisk-cr);
 +   de-de_ncr= ntoh_l(ondisk-ncr);
 +   de-de_nsubtrees  = ntoh_l(ondisk-nsubtrees);
 +   de-de_nfiles = ntoh_l(ondisk-nfiles);
 +   de-de_nentries   = ntoh_l(ondisk-nentries);
 +   de-de_pathlen= len;
 +   hashcpy(de-sha1, ondisk-sha1);
 +   return de;
 +}

 This function leaves a lot of fields uninitialized..

 +static struct directory_entry *read_directories(unsigned int *dir_offset,
 +   unsigned int *dir_table_offset,
 +   void *mmap,
 +   int mmap_size)
 +{
 
 +   de = directory_entry_from_ondisk(disk_de, name, len);
 +   de-next = NULL;
 +   de-sub = NULL;

 ..and two of them are set to NULL here. Maybe
 directory_entry_from_ondisk() could be made to call
 init_directory_entry() instead so that we don't need to manually reset
 some fields here, which leaves me wondering why other fields are not
 important to reset. init_directory_entry() is introduced later in
 write index-v5 patch, you so may want to move it up a few patches.

The rest of the fields are only used for compiling the data that will be
written.  Using init_directory_entry() here makes sense anyway though,
thanks for the suggestion.

 +static int read_entry(struct cache_entry **ce, char *pathname, size_t 
 pathlen,
 + void *mmap, unsigned long mmap_size,
 + unsigned int first_entry_offset,
 + unsigned int foffsetblock)
 +{
 +   int len, offset_to_offset;
 +   char *name;
 +   uint32_t foffsetblockcrc, *filecrc, *beginning, *end, entry_offset;
 +   struct ondisk_cache_entry *disk_ce;
 +
 +   beginning = ptr_add(mmap, foffsetblock);
 +   end = ptr_add(mmap, foffsetblock + 4);
 +   len = ntoh_l(*end) - ntoh_l(*beginning) - sizeof(struct 
 ondisk_cache_entry) - 5;

 It took me a while to check and figure out  - 5 here means minus NUL
 and the crc. A short comment would help. I think there's also another
 -5 in read_directories(). Or maybe just rename len to namelen.

Will add a short comment.

 +struct conflict_entry *create_new_conflict(char *name, int len, int pathlen)
 +{
 +   struct conflict_entry *conflict_entry;
 +
 +   if (pathlen)
 +   pathlen++;
 +   conflict_entry = xmalloc(conflict_entry_size(len));
 +   conflict_entry-entries = NULL;
 +   conflict_entry-nfileconflicts = 0;
 +   conflict_entry-namelen = len;
 +   memcpy(conflict_entry-name, name, len);
 +   conflict_entry-name[len] = '\0';
 +   conflict_entry-pathlen = pathlen;
 +   conflict_entry-next = NULL;

 A memset followed by memcpy and conflict_entry-pathlen = pathlen
 would make this shorter and won't miss new fields added in future.

Makes sense, thanks.

 +static int read_entries(struct index_state *istate, struct directory_entry 
 *de,
 +   unsigned int first_entry_offset, void *mmap,
 +   unsigned long mmap_size, unsigned int *nr,
 +   unsigned int foffsetblock)
 +{
 +   struct cache_entry *ce;
 +   int i, subdir = 0;
 +
 +   for (i = 0; i  de-de_nfiles; i++) {
 +   unsigned int subdir_foffsetblock = de-de_foffset + 
 foffsetblock + (i * 4);
 +   if (read_entry(ce, de-pathname, de-de_pathlen, mmap, 
 mmap_size,
 +  first_entry_offset, subdir_foffsetblock)  0)
 +   return -1;

 You read one file entry, say abc/def...

You're not quite right here.  I'm reading def here, de is the root
directory and de-sub[subdir] is the first sub directory, named abc/

 +   while (subdir  de-de_nsubtrees 
 +  cache_name_compare(ce-name + de-de_pathlen,
 + ce_namelen(ce) - de-de_pathlen,
 + de-sub[subdir]-pathname + 
 de-de_pathlen,
 + de-sub[subdir]-de_pathlen - 
 de-de_pathlen)  0) {

 Oh right

Re: [PATCH v3 15/24] read-cache: read index-v5

2013-08-20 Thread Thomas Gummerer
Duy Nguyen pclo...@gmail.com writes:

 On Mon, Aug 19, 2013 at 2:42 AM, Thomas Gummerer t.gumme...@gmail.com wrote:
 +static int read_entry(struct cache_entry **ce, char *pathname, size_t 
 pathlen,
 + void *mmap, unsigned long mmap_size,
 + unsigned int first_entry_offset,
 + unsigned int foffsetblock)
 +{
 +   int len, offset_to_offset;
 +   char *name;
 +   uint32_t foffsetblockcrc, *filecrc, *beginning, *end, entry_offset;
 +   struct ondisk_cache_entry *disk_ce;
 +
 +   beginning = ptr_add(mmap, foffsetblock);
 +   end = ptr_add(mmap, foffsetblock + 4);
 +   len = ntoh_l(*end) - ntoh_l(*beginning) - sizeof(struct 
 ondisk_cache_entry) - 5;
 +   entry_offset = first_entry_offset + ntoh_l(*beginning);
 +   name = ptr_add(mmap, entry_offset);
 +   disk_ce = ptr_add(mmap, entry_offset + len + 1);
 +   *ce = cache_entry_from_ondisk(disk_ce, pathname, name, len, pathlen);
 +   filecrc = ptr_add(mmap, entry_offset + len + 1 + sizeof(*disk_ce));
 +   offset_to_offset = htonl(foffsetblock);
 +   foffsetblockcrc = crc32(0, (Bytef*)offset_to_offset, 4);
 +   if (!check_crc32(foffsetblockcrc,
 +   ptr_add(mmap, entry_offset), len + 1 + sizeof(*disk_ce),
 +   ntoh_l(*filecrc)))
 +   return -1;
 +
 +   return 0;
 +}

 Last thought before book+bed time. I wonder if moving the name part to
 the end of the entry (i.e. chaging on disk format) would simplify this
 code. The new ondisk_cache_entry would be something like this

 struct ondisk_cache_entry {
uint16_t flags;
uint16_t mode;
struct cache_time mtime;
uint32_t size;
int stat_crc;
unsigned char sha1[20];
char name[FLEX_ARRAY];
 };

I think it simplifies it a bit, but not too much, the only thing I see
avoiding the use of the name variable.  I think it will also simplify
the writing code a bit.  The only negative part would be for bisecting
the index, but that would still be possible, and only slightly more
complicated.  I'll give it a try tomorrow and check if it's worth it.
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 02/24] read-cache: use fixed width integer types

2013-08-20 Thread Thomas Gummerer
Junio C Hamano gits...@pobox.com writes:

 Thomas Gummerer t.gumme...@gmail.com writes:

 Use the fixed width integer types uint16_t and uint32_t for ondisk
 structures, because unsigned short and unsigned int do not hae a
 guaranteed size.

 This sounds like an independent fix to me.  I'd queue this early
 independent from the rest of the series.

 Thanks.

Sounds good to me.  Thanks.


 Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
 ---
  cache.h  | 10 +-
  read-cache.c | 30 +++---
  2 files changed, 20 insertions(+), 20 deletions(-)

 diff --git a/cache.h b/cache.h
 index bd6fb9f..9ef778a 100644
 --- a/cache.h
 +++ b/cache.h
 @@ -101,9 +101,9 @@ unsigned long git_deflate_bound(git_zstream *, unsigned 
 long);
  
  #define CACHE_SIGNATURE 0x44495243  /* DIRC */
  struct cache_header {
 -unsigned int hdr_signature;
 -unsigned int hdr_version;
 -unsigned int hdr_entries;
 +uint32_t hdr_signature;
 +uint32_t hdr_version;
 +uint32_t hdr_entries;
  };
  
  #define INDEX_FORMAT_LB 2
 @@ -115,8 +115,8 @@ struct cache_header {
   * check it for equality in the 32 bits we save.
   */
  struct cache_time {
 -unsigned int sec;
 -unsigned int nsec;
 +uint32_t sec;
 +uint32_t nsec;
  };
  
  struct stat_data {
 diff --git a/read-cache.c b/read-cache.c
 index ceaf207..0df5b31 100644
 --- a/read-cache.c
 +++ b/read-cache.c
 @@ -1230,14 +1230,14 @@ static struct cache_entry 
 *refresh_cache_entry(struct cache_entry *ce, int reall
  struct ondisk_cache_entry {
  struct cache_time ctime;
  struct cache_time mtime;
 -unsigned int dev;
 -unsigned int ino;
 -unsigned int mode;
 -unsigned int uid;
 -unsigned int gid;
 -unsigned int size;
 +uint32_t dev;
 +uint32_t ino;
 +uint32_t mode;
 +uint32_t uid;
 +uint32_t gid;
 +uint32_t size;
  unsigned char sha1[20];
 -unsigned short flags;
 +uint16_t flags;
  char name[FLEX_ARRAY]; /* more */
  };
  
 @@ -1249,15 +1249,15 @@ struct ondisk_cache_entry {
  struct ondisk_cache_entry_extended {
  struct cache_time ctime;
  struct cache_time mtime;
 -unsigned int dev;
 -unsigned int ino;
 -unsigned int mode;
 -unsigned int uid;
 -unsigned int gid;
 -unsigned int size;
 +uint32_t dev;
 +uint32_t ino;
 +uint32_t mode;
 +uint32_t uid;
 +uint32_t gid;
 +uint32_t size;
  unsigned char sha1[20];
 -unsigned short flags;
 -unsigned short flags2;
 +uint16_t flags;
 +uint16_t flags2;
  char name[FLEX_ARRAY]; /* more */
  };
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 04/24] read-cache: clear version in discard_index()

2013-08-20 Thread Thomas Gummerer
Junio C Hamano gits...@pobox.com writes:

 Thomas Gummerer t.gumme...@gmail.com writes:

 All fields except index_state-version are reset in discard_index.
 Reset the version too.

 What is the practical consequence of not clearing this field?  I
 somehow have a feeling that this was done deliberately, so that we
 can stick to the version of the index file format better, once the
 user said update-index --index-version $N to set it up.  I suspect
 that the patch would affect a codepath that does read_cache(), calls
 discard_index(), populates the index and then does write_cache().
 We stick to the version the user specified earlier in our current
 code, while the patched code will revert to whatever default built
 into your Git binary, no?

Yeah you're right, I missed that use-case.  I'll drop this patch from
the re-roll.  Sorry for the noise.


 Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
 ---
  read-cache.c | 1 +
  1 file changed, 1 insertion(+)

 diff --git a/read-cache.c b/read-cache.c
 index de0bbcd..1e22f6f 100644
 --- a/read-cache.c
 +++ b/read-cache.c
 @@ -1558,6 +1558,7 @@ int discard_index(struct index_state *istate)
  for (i = 0; i  istate-cache_nr; i++)
  free(istate-cache[i]);
  resolve_undo_clear_index(istate);
 +istate-version = 0;
  istate-cache_nr = 0;
  istate-cache_changed = 0;
  istate-timestamp.sec = 0;
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 23/24] introduce GIT_INDEX_VERSION environment variable

2013-08-20 Thread Thomas Gummerer
Duy Nguyen pclo...@gmail.com writes:

 On Mon, Aug 19, 2013 at 2:42 AM, Thomas Gummerer t.gumme...@gmail.com wrote:
 Respect a GIT_INDEX_VERSION environment variable, when a new index is
 initialized.  Setting the environment variable will not cause existing
 index files to be converted to another format for additional safety.

 Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
 ---
  read-cache.c | 9 +++--
  1 file changed, 7 insertions(+), 2 deletions(-)

 There should be a line or two about this variable in git.txt, section
 Environment variables. We could even have core.defaultIndexVersion
 for people who don't want to set environment variables (and set this
 key in ~/.gitconfig instead) but this is not important now.

Ok, I'll add it in git.txt.  I agree, core.defaultIndexVersion can still
be done in a follow-up patch, the environment variable is the important
thing now because it's used for testing.  Existing repositories have to
be converted with git-update-index anyway.
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 00/24] Index-v5

2013-08-30 Thread Thomas Gummerer
Junio C Hamano gits...@pobox.com writes:

 Duy Nguyen pclo...@gmail.com writes:

 On Mon, Aug 19, 2013 at 2:41 AM, Thomas Gummerer t.gumme...@gmail.com 
 wrote:

 I'm done reviewing this version (I neglected the extension writing
 patches because after spending hours on the main write patch I don't
 want to look at them anymore :p). Now that rc period is over, with a
 partial write proof-of-concept, I think it's enough to call Junio's
 attention on the series, see if we have any chance of merging it. The
 partial write POC is needed to make sure we don't overlook anything,
 just support update-index is enough.

 I've been following the review comment threads after looking at the
 patches myself when they were posted. I was hoping to see some API
 improvement over the current we (have to) have everything available
 in-core in a flat array model, which gives a lot of convenience and
 IO overhead at the same time, that would make me say yes, this
 operation, that we need to do very often, will certainly be helped
 by this new API, and in order to support that style of API better,
 the current file format is inadequate and we do need to go to the
 proposed tree like on-disk format for at least one, but
 unfortunately I haven't found any (yet).

 So...

I think the issue is a bit different.  The current API, with some small
additions (e.g. read_index_filtered()) works well as in-memory format,
even for partial reading/writing.  I will try to write a POC for partial
writing to show that the current in-memory format works for this too.
As Duy wrote in the other email, some API changes will be necessary to
allow that, but not a big API change moving from a flat array to a tree
based format.

I think it comes down to this operation will be helped by partial
loading/writing and we need this small API changes
(read_index_filtered() for now, more to follow) and the index format
change to be able to do that.

Does that make sense, with at least Duy's comments in the review
addressed and a POC for partial writing?
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH/RFC v2 15/16] update-index.c: add a force-rewrite option

2012-08-08 Thread Thomas Gummerer
On 08/05, Junio C Hamano wrote:
 Thomas Gummerer t.gumme...@gmail.com writes:
 
  Add a force-rewrite option to update-index, which allows the user
  to rewrite the index, even if there are no changes. This can be used
  to do performance tests of both the reader and the writer.
 
  Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
  ---
   builtin/update-index.c |5 -
   1 file changed, 4 insertions(+), 1 deletion(-)
 
 I do not think this is wrong per-se, but is a new command that needs
 to be documented?  If it is only for benchmarking and debugging, it
 might be sufficient to make --index-version n always rewrite the
 index.

The command is only for benchmarking, I don't see another case where
it makes sense for anyone to rewrite the whole index, without changing
anything. I've made --index-version rewrite the index for the re-roll.

  diff --git a/builtin/update-index.c b/builtin/update-index.c
  index 4ce341c..7fedc8f 100644
  --- a/builtin/update-index.c
  +++ b/builtin/update-index.c
  @@ -24,6 +24,7 @@ static int allow_remove;
   static int allow_replace;
   static int info_only;
   static int force_remove;
  +static int force_rewrite;
   static int verbose;
   static int mark_valid_only;
   static int mark_skip_worktree_only;
  @@ -728,6 +729,8 @@ int cmd_update_index(int argc, const char **argv, const 
  char *prefix)
  OPT_BIT(0, unmerged, refresh_args.flags,
  refresh even if index contains unmerged entries,
  REFRESH_UNMERGED),
  +   OPT_SET_INT(0, force-rewrite, force_rewrite,
  +   force a index rewrite even if there is no change, 1),
  {OPTION_CALLBACK, 0, refresh, refresh_args, NULL,
  refresh stat information,
  PARSE_OPT_NOARG | PARSE_OPT_NONEG,
  @@ -886,7 +889,7 @@ int cmd_update_index(int argc, const char **argv, const 
  char *prefix)
  strbuf_release(buf);
  }
   
  -   if (active_cache_changed) {
  +   if (active_cache_changed || force_rewrite) {
  if (newfd  0) {
  if (refresh_args.flags  REFRESH_QUIET)
  exit(128);
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH/RFC v2 09/16] Read index-v5

2012-08-08 Thread Thomas Gummerer
On 08/05, Junio C Hamano wrote:
 Thomas Gummerer t.gumme...@gmail.com writes:
 
  +static struct directory_entry *read_directories_v5(unsigned int 
  *dir_offset,
  +   unsigned int *dir_table_offset,
  +   void *mmap,
  +   int mmap_size)
  +{
  +   int i, ondisk_directory_size;
  +   uint32_t *filecrc, *beginning, *end;
  +   struct directory_entry *current = NULL;
  +   struct ondisk_directory_entry *disk_de;
  +   struct directory_entry *de;
  +   unsigned int data_len, len;
  +   char *name;
  +
  +   ondisk_directory_size = sizeof(disk_de-flags)
  +   + sizeof(disk_de-foffset)
  +   + sizeof(disk_de-cr)
  +   + sizeof(disk_de-ncr)
  +   + sizeof(disk_de-nsubtrees)
  +   + sizeof(disk_de-nfiles)
  +   + sizeof(disk_de-nentries)
  +   + sizeof(disk_de-sha1);
  +   name = (char *)mmap + *dir_offset;
  +   beginning = mmap + *dir_table_offset;
 
 Notice how you computed name with pointer arithmetic by first
 casting mmap (which is void *) and when computing beginning, you
 forgot to cast mmap and attempted pointer arithmetic with void *.
 The latter does not work and breaks compilation.
 
 The pointer-arith with void * is not limited to this function.

Sorry for not noticing this, it always compiled fine for me. Guess
I should use -pedantic more often ;-)

 Please check the a band-aid (I wouldn't call it a fix-up) patch I
 added on top of the series before queuing the topic to 'pu'; it is
 primarily to illustrate the places I noticed that have this issue.
 
 I do not necessarily suggest that the way the band-aid patch makes
 it compile is the best approach.  It might be cleaner to use a saner
 type like char * (or perhaps const char *) as the type to point
 at a piece of memory you read from the disk.  I haven't formed an
 opinion.
 
 Thanks.

I've used the type of the respective assignment for now. e.g. i have
struct cache_header *hdr, so I'm using
hdr = (struct cache_header *)mmap + x;

read-cache-v5.c compiles with -pedantic without warnings.
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] Add index-v5

2012-08-08 Thread Thomas Gummerer


On 08/07, Robin Rosenberg wrote:
 Nguyễn Thái Ngọc Duy skrev 2012-08-06 16.36:
 
 +++ b/read-cache-v5.c
 @@ -0,0 +1,1170 @@
 +#include cache.h
 +#include read-cache.h
 +#include resolve-undo.h
 +#include cache-tree.h
 +
 +struct cache_header_v5 {
 +unsigned int hdr_ndir;
 +unsigned int hdr_nfile;
 +unsigned int hdr_fblockoffset;
 +unsigned int hdr_nextension;
 +};
 +
 +struct ondisk_cache_entry_v5 {
 +unsigned short flags;
 +unsigned short mode;
 +struct cache_time mtime;
 +int stat_crc;
 +unsigned char sha1[20];
 +};
 
 I mentioned this before in another thread, but for JGit I'd like
 to see size as a separate attribute. The rest of stat_crc is not
 available to Java so when this index gets its way into JGit,
 stat_crc will be zero and will never be checked.
 

I'm sorry for forgetting to add this, it will be included in the
re-roll.  The stat_crc will be ignored if it is 0 in the ondisk
index.
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH/RFC v3 05/13] Make in-memory format aware of stat_crc

2012-08-08 Thread Thomas Gummerer
Make the in-memory format aware of the stat_crc used by index-v5.
It is simply ignored by index version prior to v5.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 cache.h  |1 +
 read-cache.c |   25 +
 2 files changed, 26 insertions(+)

diff --git a/cache.h b/cache.h
index c77cdbe..bfe3099 100644
--- a/cache.h
+++ b/cache.h
@@ -122,6 +122,7 @@ struct cache_entry {
unsigned int ce_flags;
unsigned int ce_namelen;
unsigned char sha1[20];
+   uint32_t ce_stat_crc;
struct cache_entry *next;
struct cache_entry *dir_next;
char name[FLEX_ARRAY]; /* more */
diff --git a/read-cache.c b/read-cache.c
index 125e6a0..d8f8b74 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -51,6 +51,29 @@ void rename_index_entry_at(struct index_state *istate, int 
nr, const char *new_n
add_index_entry(istate, new, 
ADD_CACHE_OK_TO_ADD|ADD_CACHE_OK_TO_REPLACE);
 }
 
+static uint32_t calculate_stat_crc(struct cache_entry *ce)
+{
+   unsigned int ctimens = 0;
+   uint32_t stat, stat_crc;
+
+   stat = htonl(ce-ce_ctime.sec);
+   stat_crc = crc32(0, (Bytef*)stat, 4);
+#ifdef USE_NSEC
+   ctimens = ce-ce_ctime.nsec;
+#endif
+   stat = htonl(ctimens);
+   stat_crc = crc32(stat_crc, (Bytef*)stat, 4);
+   stat = htonl(ce-ce_ino);
+   stat_crc = crc32(stat_crc, (Bytef*)stat, 4);
+   stat = htonl(ce-ce_dev);
+   stat_crc = crc32(stat_crc, (Bytef*)stat, 4);
+   stat = htonl(ce-ce_uid);
+   stat_crc = crc32(stat_crc, (Bytef*)stat, 4);
+   stat = htonl(ce-ce_gid);
+   stat_crc = crc32(stat_crc, (Bytef*)stat, 4);
+   return stat_crc;
+}
+
 /*
  * This only updates the non-critical parts of the directory
  * cache, ie the parts that aren't tracked by GIT, and only used
@@ -73,6 +96,8 @@ void fill_stat_cache_info(struct cache_entry *ce, struct stat 
*st)
 
if (S_ISREG(st-st_mode))
ce_mark_uptodate(ce);
+
+   ce-ce_stat_crc = calculate_stat_crc(ce);
 }
 
 static int ce_compare_data(struct cache_entry *ce, struct stat *st)
-- 
1.7.10.GIT

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH/RFC v3 02/13] t2104: Don't fail for index versions other than [23]

2012-08-08 Thread Thomas Gummerer
t2104 currently checks for the exact index version 2 or 3,
depending if there is a skip-worktree flag or not. Other
index versions do not use extended flags and thus cannot
be tested for version changes.

Make this test update the index to version 2 at the beginning
of the test. Testing the skip-worktree flags for the default
index format is still covered by t7011 and t7012.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 t/t2104-update-index-skip-worktree.sh |1 +
 1 file changed, 1 insertion(+)

diff --git a/t/t2104-update-index-skip-worktree.sh 
b/t/t2104-update-index-skip-worktree.sh
index 1d0879b..bd9644f 100755
--- a/t/t2104-update-index-skip-worktree.sh
+++ b/t/t2104-update-index-skip-worktree.sh
@@ -22,6 +22,7 @@ H sub/2
 EOF
 
 test_expect_success 'setup' '
+   git update-index --index-version=2 
mkdir sub 
touch ./1 ./2 sub/1 sub/2 
git add 1 2 sub/1 sub/2 
-- 
1.7.10.GIT

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH/RFC v3 13/13] p0002-index.sh: add perf test for the index formats

2012-08-08 Thread Thomas Gummerer
From: Thomas Rast tr...@student.ethz.ch

Add a performance test for index version [23]/4/5 by using
git update-index --update-index=[345], thus testing both the reader
and the writer speed of all index formats.

Signed-off-by: Thomas Rast tr...@student.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 t/perf/p0002-index.sh |   33 +
 1 file changed, 33 insertions(+)
 create mode 100755 t/perf/p0002-index.sh

diff --git a/t/perf/p0002-index.sh b/t/perf/p0002-index.sh
new file mode 100755
index 000..140c7a0
--- /dev/null
+++ b/t/perf/p0002-index.sh
@@ -0,0 +1,33 @@
+#!/bin/sh
+
+test_description=Tests index versions [23]/4/5
+
+. ./perf-lib.sh
+
+test_perf_large_repo
+
+test_expect_success 'convert to v3' '
+   git update-index --index-version=3
+'
+
+test_perf 'v[23]: update-index' '
+   git update-index --index-version=3 /dev/null
+'
+
+test_expect_success 'convert to v4' '
+   git update-index --index-version=4
+'
+
+test_perf 'v4: update-index' '
+   git update-index --index-version=4 /dev/null
+'
+
+test_expect_success 'convert to v5' '
+   git update-index --index-version=5
+'
+
+test_perf 'v5: update-index' '
+   git update-index --index-version=5 /dev/null
+'
+
+test_done
-- 
1.7.10.GIT

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH/RFC v3 06/13] Read index-v5

2012-08-08 Thread Thomas Gummerer
Make git read the index file version 5 without complaining.

This version of the reader doesn't read neither the cache-tree
nor the resolve undo data, but doesn't choke on an index that
includes such data.

Helped-by: Nguyen Thai Ngoc Duy pclo...@gmail.com
Helped-by: Thomas Rast tr...@student.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 Makefile|1 +
 cache.h |   72 +++
 read-cache-v5.c |  589 +++
 read-cache.c|1 -
 4 files changed, 662 insertions(+), 1 deletion(-)
 create mode 100644 read-cache-v5.c

diff --git a/Makefile b/Makefile
index b4a7c73..77be175 100644
--- a/Makefile
+++ b/Makefile
@@ -770,6 +770,7 @@ LIB_OBJS += quote.o
 LIB_OBJS += reachable.o
 LIB_OBJS += read-cache.o
 LIB_OBJS += read-cache-v2.o
+LIB_OBJS += read-cache-v5.o
 LIB_OBJS += reflog-walk.o
 LIB_OBJS += refs.o
 LIB_OBJS += remote.o
diff --git a/cache.h b/cache.h
index bfe3099..a0a1781 100644
--- a/cache.h
+++ b/cache.h
@@ -110,6 +110,15 @@ struct cache_time {
unsigned int nsec;
 };
 
+/*
+ * The *next pointer is used in read_entries_v5 for holding
+ * all the elements of a directory, and points to the next
+ * cache_entry in a directory.
+ *
+ * It is reset by the add_name_hash call in set_index_entry
+ * to set it to point to the next cache_entry in the
+ * correct in-memory format ordering.
+ */
 struct cache_entry {
struct cache_time ce_ctime;
struct cache_time ce_mtime;
@@ -128,11 +137,58 @@ struct cache_entry {
char name[FLEX_ARRAY]; /* more */
 };
 
+struct directory_entry {
+   struct directory_entry *next;
+   struct directory_entry *next_hash;
+   struct cache_entry *ce;
+   struct cache_entry *ce_last;
+   struct conflict_entry *conflict;
+   struct conflict_entry *conflict_last;
+   unsigned int conflict_size;
+   unsigned int de_foffset;
+   unsigned int de_cr;
+   unsigned int de_ncr;
+   unsigned int de_nsubtrees;
+   unsigned int de_nfiles;
+   unsigned int de_nentries;
+   unsigned char sha1[20];
+   unsigned short de_flags;
+   unsigned int de_pathlen;
+   char pathname[FLEX_ARRAY];
+};
+
+struct conflict_part {
+   struct conflict_part *next;
+   unsigned short flags;
+   unsigned short entry_mode;
+   unsigned char sha1[20];
+};
+
+struct conflict_entry {
+   struct conflict_entry *next;
+   unsigned int nfileconflicts;
+   struct conflict_part *entries;
+   unsigned int namelen;
+   unsigned int pathlen;
+   char name[FLEX_ARRAY];
+};
+
+struct ondisk_conflict_part {
+   unsigned short flags;
+   unsigned short entry_mode;
+   unsigned char sha1[20];
+};
+
+#define CE_NAMEMASK  (0x0fff)
 #define CE_STAGEMASK (0x3000)
 #define CE_EXTENDED  (0x4000)
 #define CE_VALID (0x8000)
 #define CE_STAGESHIFT 12
 
+#define CONFLICT_CONFLICTED (0x8000)
+#define CONFLICT_STAGESHIFT 13
+#define CONFLICT_STAGEMASK (0x6000)
+
 /*
  * Range 0x in ce_flags is divided into
  * two parts: in-memory flags and on-disk ones.
@@ -166,6 +222,18 @@ struct cache_entry {
 #define CE_EXTENDED_FLAGS (CE_INTENT_TO_ADD | CE_SKIP_WORKTREE)
 
 /*
+ * Representation of the extended on-disk flags in the v5 format.
+ * They must not collide with the ordinary on-disk flags, and need to
+ * fit in 16 bits.  Note however that v5 does not save the name
+ * length.
+ */
+#define CE_INTENT_TO_ADD_V5  (0x4000)
+#define CE_SKIP_WORKTREE_V5  (0x0800)
+#if (CE_VALID|CE_STAGEMASK)  (CE_INTENTTOADD_V5|CE_SKIPWORKTREE_V5)
+#error v5 on-disk flags collide with ordinary on-disk flags
+#endif
+
+/*
  * Safeguard to avoid saving wrong flags:
  *  - CE_EXTENDED2 won't get saved until its semantic is known
  *  - Bits in 0x have been saved in ce_flags already
@@ -203,6 +271,8 @@ static inline unsigned create_ce_flags(unsigned stage)
 #define ce_skip_worktree(ce) ((ce)-ce_flags  CE_SKIP_WORKTREE)
 #define ce_mark_uptodate(ce) ((ce)-ce_flags |= CE_UPTODATE)
 
+#define conflict_stage(c) ((CONFLICT_STAGEMASK  (c)-flags)  
CONFLICT_STAGESHIFT)
+
 #define ce_permissions(mode) (((mode)  0100) ? 0755 : 0644)
 static inline unsigned int create_ce_mode(unsigned int mode)
 {
@@ -249,6 +319,8 @@ static inline unsigned int canon_mode(unsigned int mode)
 }
 
 #define cache_entry_size(len) (offsetof(struct cache_entry,name) + (len) + 1)
+#define directory_entry_size(len) (offsetof(struct directory_entry,pathname) + 
(len) + 1)
+#define conflict_entry_size(len) (offsetof(struct conflict_entry,name) + (len) 
+ 1)
 
 struct index_state {
struct cache_entry **cache;
diff --git a/read-cache-v5.c b/read-cache-v5.c
new file mode 100644
index 000..ec1201d
--- /dev/null
+++ b/read-cache-v5.c
@@ -0,0 +1,589 @@
+#include cache.h
+#include read-cache.h
+#include resolve-undo.h
+#include cache-tree.h
+
+struct cache_header {
+   unsigned int hdr_ndir;
+   unsigned int hdr_nfile;
+   unsigned int

[PATCH/RFC v3 08/13] Read cache-tree in index-v5

2012-08-08 Thread Thomas Gummerer
Since the cache-tree data is saved as part of the directory data,
we already read it at the beginning of the index. The cache-tree
is only converted from this directory data.

The cache-tree data is arranged in a tree, with the children sorted by
pathlen at each node, while the ondisk format is sorted lexically.
So we have to rebuild this format from the on-disk directory list.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 cache-tree.c|   93 +++
 cache-tree.h|   10 ++
 read-cache-v5.c |1 +
 3 files changed, 104 insertions(+)

diff --git a/cache-tree.c b/cache-tree.c
index 28ed657..440cd04 100644
--- a/cache-tree.c
+++ b/cache-tree.c
@@ -519,6 +519,99 @@ struct cache_tree *cache_tree_read(const char *buffer, 
unsigned long size)
return read_one(buffer, size);
 }
 
+static struct cache_tree *convert_one(struct directory_queue *queue, int dirnr)
+{
+   int i, subtree_nr;
+   struct cache_tree *it;
+   struct directory_queue *down;
+
+   it = cache_tree();
+   it-entry_count = queue[dirnr].de-de_nentries;
+   subtree_nr = queue[dirnr].de-de_nsubtrees;
+   if (0 = it-entry_count)
+   hashcpy(it-sha1, queue[dirnr].de-sha1);
+
+   /*
+   * Just a heuristic -- we do not add directories that often but
+   * we do not want to have to extend it immediately when we do,
+   * hence +2.
+   */
+   it-subtree_alloc = subtree_nr + 2;
+   it-down = xcalloc(it-subtree_alloc, sizeof(struct cache_tree_sub *));
+   down = queue[dirnr].down;
+   for (i = 0; i  subtree_nr; i++) {
+   struct cache_tree *sub;
+   struct cache_tree_sub *subtree;
+   char *buf, *name;
+
+   name = ;
+   buf = strtok(down[i].de-pathname, /);
+   while (buf) {
+   name = buf;
+   buf = strtok(NULL, /);
+   }
+   sub = convert_one(down, i);
+   if(!sub)
+   goto free_return;
+   subtree = cache_tree_sub(it, name);
+   subtree-cache_tree = sub;
+   }
+   if (subtree_nr != it-subtree_nr)
+   die(cache-tree: internal error);
+   return it;
+ free_return:
+   cache_tree_free(it);
+   return NULL;
+}
+
+static int compare_cache_tree_elements(const void *a, const void *b)
+{
+   const struct directory_entry *de1, *de2;
+
+   de1 = ((const struct directory_queue *)a)-de;
+   de2 = ((const struct directory_queue *)b)-de;
+   return subtree_name_cmp(de1-pathname, de1-de_pathlen,
+   de2-pathname, de2-de_pathlen);
+}
+
+static struct directory_entry *sort_directories(struct directory_entry *de,
+   struct directory_queue *queue)
+{
+   int i, nsubtrees;
+
+   nsubtrees = de-de_nsubtrees;
+   for (i = 0; i  nsubtrees; i++) {
+   struct directory_entry *new_de;
+   de = de-next;
+   new_de = xmalloc(directory_entry_size(de-de_pathlen));
+   memcpy(new_de, de, directory_entry_size(de-de_pathlen));
+   queue[i].de = new_de;
+   if (de-de_nsubtrees) {
+   queue[i].down = xcalloc(de-de_nsubtrees,
+   sizeof(struct directory_queue));
+   de = sort_directories(de,
+   queue[i].down);
+   }
+   }
+   qsort(queue, nsubtrees, sizeof(struct directory_queue),
+   compare_cache_tree_elements);
+   return de;
+}
+
+struct cache_tree *cache_tree_convert_v5(struct directory_entry *de)
+{
+   struct directory_queue *queue;
+
+   if (!de-de_nentries)
+   return NULL;
+   queue = xcalloc(1, sizeof(struct directory_queue));
+   queue[0].de = de;
+   queue[0].down = xcalloc(de-de_nsubtrees, sizeof(struct 
directory_queue));
+
+   sort_directories(de, queue[0].down);
+   return convert_one(queue, 0);
+}
+
 static struct cache_tree *cache_tree_find(struct cache_tree *it, const char 
*path)
 {
if (!it)
diff --git a/cache-tree.h b/cache-tree.h
index d8cb2e9..7f29d26 100644
--- a/cache-tree.h
+++ b/cache-tree.h
@@ -20,6 +20,11 @@ struct cache_tree {
struct cache_tree_sub **down;
 };
 
+struct directory_queue {
+   struct directory_queue *down;
+   struct directory_entry *de;
+};
+
 struct cache_tree *cache_tree(void);
 void cache_tree_free(struct cache_tree **);
 void cache_tree_invalidate_path(struct cache_tree *, const char *);
@@ -27,6 +32,11 @@ struct cache_tree_sub *cache_tree_sub(struct cache_tree *, 
const char *);
 
 void cache_tree_write(struct strbuf *, struct cache_tree *root);
 struct cache_tree *cache_tree_read(const char *buffer, unsigned long size);
+/*
+ * This function modifys the directory argument

[PATCH/RFC v3 10/13] Write index-v5 cache-tree data

2012-08-08 Thread Thomas Gummerer
Write the cache-tree data for the index version 5 file format. The
in-memory cache-tree data is converted to the ondisk format, by adding
it to the directory entries, that were compiled from the cache-entries
in the step before.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 cache-tree.c |   52 
 cache-tree.h |1 +
 read-cache.c |1 +
 3 files changed, 54 insertions(+)

diff --git a/cache-tree.c b/cache-tree.c
index 440cd04..e167b61 100644
--- a/cache-tree.c
+++ b/cache-tree.c
@@ -612,6 +612,58 @@ struct cache_tree *cache_tree_convert_v5(struct 
directory_entry *de)
return convert_one(queue, 0);
 }
 
+
+static void convert_one_to_ondisk_v5(struct hash_table *table, struct 
cache_tree *it,
+   const char *path, int pathlen, uint32_t crc)
+{
+   int i;
+   struct directory_entry *found, *search;
+
+   crc = crc32(crc, (Bytef*)path, pathlen);
+   found = lookup_hash(crc, table);
+   search = found;
+   while (search  strcmp(path, search-pathname + search-de_pathlen - 
strlen(path)) != 0)
+   search = search-next_hash;
+   if (!search)
+   return;
+   /*
+* The number of subtrees is already calculated by
+* compile_directory_data, therefore we only need to
+* add the entry_count
+*/
+   search-de_nentries = it-entry_count;
+   if (0 = it-entry_count)
+   hashcpy(search-sha1, it-sha1);
+   if (strcmp(path, ) != 0)
+   crc = crc32(crc, (Bytef*)/, 1);
+
+#if DEBUG
+   if (0 = it-entry_count)
+   fprintf(stderr, cache-tree %.*s (%d ent, %d subtree) %s\n,
+   pathlen, path, it-entry_count, it-subtree_nr,
+   sha1_to_hex(it-sha1));
+   else
+   fprintf(stderr, cache-tree %.*s (%d subtree) invalid\n,
+   pathlen, path, it-subtree_nr);
+#endif
+
+   for (i = 0; i  it-subtree_nr; i++) {
+   struct cache_tree_sub *down = it-down[i];
+   if (i) {
+   struct cache_tree_sub *prev = it-down[i-1];
+   if (subtree_name_cmp(down-name, down-namelen,
+prev-name, prev-namelen) = 0)
+   die(fatal - unsorted cache subtree);
+   }
+   convert_one_to_ondisk_v5(table, down-cache_tree, down-name, 
down-namelen, crc);
+   }
+}
+
+void cache_tree_to_ondisk_v5(struct hash_table *table, struct cache_tree *root)
+{
+   convert_one_to_ondisk_v5(table, root, , 0, 0);
+}
+
 static struct cache_tree *cache_tree_find(struct cache_tree *it, const char 
*path)
 {
if (!it)
diff --git a/cache-tree.h b/cache-tree.h
index 7f29d26..e08bc31 100644
--- a/cache-tree.h
+++ b/cache-tree.h
@@ -37,6 +37,7 @@ struct cache_tree *cache_tree_read(const char *buffer, 
unsigned long size);
  * Don't use it if the directory entries are still needed after.
  */
 struct cache_tree *cache_tree_convert_v5(struct directory_entry *de);
+void cache_tree_to_ondisk_v5(struct hash_table *table, struct cache_tree 
*root);
 
 int cache_tree_fully_valid(struct cache_tree *);
 int cache_tree_update(struct cache_tree *, struct cache_entry **, int, int);
diff --git a/read-cache.c b/read-cache.c
index 199ba75..962d6a2 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1310,6 +1310,7 @@ void update_index_if_able(struct index_state *istate, 
struct lock_file *lockfile
else
rollback_lock_file(lockfile);
 }
+
 int write_index(struct index_state *istate, int newfd)
 {
set_istate_ops(istate);
-- 
1.7.10.GIT

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH/RFC v3 04/13] Add documentation of the index-v5 file format

2012-08-08 Thread Thomas Gummerer
Add a documentation of the index file format version 5 to
Documentation/technical.

Helped-by: Michael Haggerty mhag...@alum.mit.edu
Helped-by: Junio C Hamano gits...@pobox.com
Helped-by: Thomas Rast tr...@student.ethz.ch
Helped-by: Nguyen Thai Ngoc Duy pclo...@gmail.com
Helped-by: Robin Rosenberg robin.rosenb...@dewire.com
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 Documentation/technical/index-file-format-v5.txt |  285 ++
 1 file changed, 285 insertions(+)
 create mode 100644 Documentation/technical/index-file-format-v5.txt

diff --git a/Documentation/technical/index-file-format-v5.txt 
b/Documentation/technical/index-file-format-v5.txt
new file mode 100644
index 000..6707f06
--- /dev/null
+++ b/Documentation/technical/index-file-format-v5.txt
@@ -0,0 +1,285 @@
+GIT index format
+
+
+== The git index file format
+
+   The git index file (.git/index) documents the status of the files
+ in the git staging area.
+
+   The staging area is used for preparing commits, merging, etc.
+
+   All binary numbers are in network byte order. Version 5 is described
+ here.
+
+   - A 20-byte header consisting of
+
+ sig (32-bits): Signature:
+   The signature is { 'D', 'I', 'R', 'C' } (stands for dircache)
+
+ vnr (32-bits): Version number:
+   The current supported versions are 2, 3, 4 and 5.
+
+ ndir (32-bits): number of directories in the index.
+
+ nfile (32-bits): number of file entries in the index.
+
+ fblockoffset (32-bits): offset to the file block, relative to the
+   beginning of the file.
+
+   - Offset to the extensions.
+
+ nextensions (32-bits): number of extensions.
+
+ extoffset (32-bits): offset to the extension. (Possibly none, as
+   many as indicated in the 4-byte number of extensions)
+
+ headercrc (32-bits): crc checksum for the header and extension
+   offsets
+
+   - diroffsets (ndir * directory offsets): A directory offset for each
+   of the ndir directories in the index, sorted by pathname (of the
+   directory it's pointing to) (see below). The diroffsets are relative
+   to the beginning of the direntries block. [1]
+
+   - direntries (ndir * directory entries): A directory entry for each
+   of the ndir directories in the index, sorted by pathname (see
+   below). [2]
+
+   - fileoffsets (nfile * file offsets): A file offset for each of the
+   nfile files in the index (see below). The file offsets are relative
+   to the beginning of the fileentries block. [1]
+
+   - fileentries (nfile * file entries): A file entry for each of the
+   nfile files in the index (see below).
+
+   - crdata: A number of entries for conflicted data/resolved conflicts
+   (see below).
+
+   - Extensions (Currently none, see below in the future)
+
+ Extensions are identified by signature. Optional extensions can
+ be ignored if GIT does not understand them.
+
+ GIT supports an arbitrary number of extension, but currently none
+ is implemented. [3]
+
+ extsig (32-bits): extension signature. If the first byte is 'A'..'Z'
+ the extension is optional and can be ignored.
+
+ extsize (32-bits): size of the extension, excluding the header
+   (extsig, extsize, extchecksum).
+
+ extchecksum (32-bits): crc32 checksum of the extension signature
+   and size.
+
+- Extension data.
+
+
+== Directory offsets (diroffsets)
+
+  diroffset (32-bits): offset to the directory relative to the beginning
+of the index file. There are ndir + 1 offsets in the diroffset table,
+the last is pointing to the end of the last direntry. With this last
+entry, we can replace the strlen when reading each filename, by
+calculating its length with the offsets.
+
+  This part is needed for making the directory entries bisectable and
+thus allowing a binary search.
+
+== Directory entry (direntries)
+  
+  Directory entries are sorted in lexicographic order by the name 
+of their path starting with the root.
+  
+  pathname (variable length, nul terminated): relative to top level
+directory (without the leading slash). '/' is used as path
+separator. A string of length 0 ('') indicates the root directory.
+The special path components ., and .. (without quotes) are
+disallowed. The path also includes a trailing slash. [9]
+
+  foffset (32-bits): offset to the lexicographically first file in 
+the file offsets (fileoffsets), relative to the beginning of
+the fileoffset block.
+
+  cr (32-bits): offset to conflicted/resolved data at the end of the
+index. 0 if there is no such data. [4]
+
+  ncr (32-bits): number of conflicted/resolved data entries at the
+end of the index if the offset is non 0. If cr is 0, ncr is
+also 0.
+
+  nsubtrees (32-bits): number of subtrees this tree has in the index.
+
+  nfiles (32-bits): number of files in the directory, that are in
+the index.
+
+  nentries (32-bits

[PATCH/RFC v3 03/13] t3700: Avoid interfering with the racy code

2012-08-08 Thread Thomas Gummerer
The new git racy code uses the mtime of cache-entries as smudge
marker for racily clean entries. The work of checking the file-system
if the entry really changed is offloaded to the reader. This interferes
with this test, because the entry is racily smudged and thus has
mtime 0.

To avoid interfering with the racy code, we use a time relative
to the time returned by time(3), instead of a time relative to
the mtime of the cache entries.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 t/t3700-add.sh |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/t/t3700-add.sh b/t/t3700-add.sh
index 874b3a6..829d36d 100755
--- a/t/t3700-add.sh
+++ b/t/t3700-add.sh
@@ -184,7 +184,7 @@ test_expect_success 'git add --refresh with pathspec' '
echo foo  echo bar  echo baz 
git add foo bar baz  H=$(git rev-parse :foo)  git rm -f foo 
echo 100644 $H 3   foo | git update-index --index-info 
-   test-chmtime -60 bar baz 
+   test-chmtime =-60 bar baz 
expect 
git add --refresh bar actual 
test_cmp expect actual 
-- 
1.7.10.GIT

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH/RFC v3 01/13] Move index v2 specific functions to their own file

2012-08-08 Thread Thomas Gummerer
Move index version 2 specific functions to their own file,
to prepare for the addition of a new index file format. With
the split into two files we have the non-index specific
functions in read-cache.c and the index-v2 specific functions
in read-cache-v2.c

Helped-by: Nguyen Thai Ngoc Duy pclo...@gmail.com
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 Makefile |2 +
 cache.h  |   13 +-
 read-cache-v2.c  |  581 +++
 read-cache.c |  613 +++---
 read-cache.h |   57 +
 test-index-version.c |7 +-
 6 files changed, 683 insertions(+), 590 deletions(-)
 create mode 100644 read-cache-v2.c
 create mode 100644 read-cache.h

diff --git a/Makefile b/Makefile
index 4b58b91..b4a7c73 100644
--- a/Makefile
+++ b/Makefile
@@ -645,6 +645,7 @@ LIB_H += progress.h
 LIB_H += prompt.h
 LIB_H += quote.h
 LIB_H += reachable.h
+LIB_H += read-cache.h
 LIB_H += reflog-walk.h
 LIB_H += refs.h
 LIB_H += remote.h
@@ -768,6 +769,7 @@ LIB_OBJS += prompt.o
 LIB_OBJS += quote.o
 LIB_OBJS += reachable.o
 LIB_OBJS += read-cache.o
+LIB_OBJS += read-cache-v2.o
 LIB_OBJS += reflog-walk.o
 LIB_OBJS += refs.o
 LIB_OBJS += remote.o
diff --git a/cache.h b/cache.h
index 67f28b4..c77cdbe 100644
--- a/cache.h
+++ b/cache.h
@@ -94,16 +94,8 @@ unsigned long git_deflate_bound(git_zstream *, unsigned 
long);
  */
 #define DEFAULT_GIT_PORT 9418
 
-/*
- * Basic data structures for the directory cache
- */
 
 #define CACHE_SIGNATURE 0x44495243 /* DIRC */
-struct cache_header {
-   unsigned int hdr_signature;
-   unsigned int hdr_version;
-   unsigned int hdr_entries;
-};
 
 #define INDEX_FORMAT_LB 2
 #define INDEX_FORMAT_UB 4
@@ -267,6 +259,7 @@ struct index_state {
unsigned name_hash_initialized : 1,
 initialized : 1;
struct hash_table name_hash;
+   struct index_ops *ops;
 };
 
 extern struct index_state the_index;
@@ -471,8 +464,8 @@ extern int index_name_is_other(const struct index_state *, 
const char *, int);
 #define CE_MATCH_RACY_IS_DIRTY 02
 /* do stat comparison even if CE_SKIP_WORKTREE is true */
 #define CE_MATCH_IGNORE_SKIP_WORKTREE  04
-extern int ie_match_stat(const struct index_state *, struct cache_entry *, 
struct stat *, unsigned int);
-extern int ie_modified(const struct index_state *, struct cache_entry *, 
struct stat *, unsigned int);
+extern int ie_match_stat(struct index_state *, struct cache_entry *, struct 
stat *, unsigned int);
+extern int ie_modified(struct index_state *, struct cache_entry *, struct stat 
*, unsigned int);
 
 struct pathspec {
const char **raw; /* get_pathspec() result, not freed by 
free_pathspec() */
diff --git a/read-cache-v2.c b/read-cache-v2.c
new file mode 100644
index 000..38f1791
--- /dev/null
+++ b/read-cache-v2.c
@@ -0,0 +1,581 @@
+#include cache.h
+#include read-cache.h
+#include resolve-undo.h
+#include cache-tree.h
+#include varint.h
+
+/* Mask for the name length in ce_flags in the on-disk index */
+#define CE_NAMEMASK  (0x0fff)
+
+struct cache_header {
+   unsigned int hdr_entries;
+};
+
+/*
+ * Index File I/O
+ */
+
+/*
+ * dev/ino/uid/gid/size are also just tracked to the low 32 bits
+ * Again - this is just a (very strong in practice) heuristic that
+ * the inode hasn't changed.
+ *
+ * We save the fields in big-endian order to allow using the
+ * index file over NFS transparently.
+ */
+struct ondisk_cache_entry {
+   struct cache_time ctime;
+   struct cache_time mtime;
+   unsigned int dev;
+   unsigned int ino;
+   unsigned int mode;
+   unsigned int uid;
+   unsigned int gid;
+   unsigned int size;
+   unsigned char sha1[20];
+   unsigned short flags;
+   char name[FLEX_ARRAY]; /* more */
+};
+
+/*
+ * This struct is used when CE_EXTENDED bit is 1
+ * The struct must match ondisk_cache_entry exactly from
+ * ctime till flags
+ */
+struct ondisk_cache_entry_extended {
+   struct cache_time ctime;
+   struct cache_time mtime;
+   unsigned int dev;
+   unsigned int ino;
+   unsigned int mode;
+   unsigned int uid;
+   unsigned int gid;
+   unsigned int size;
+   unsigned char sha1[20];
+   unsigned short flags;
+   unsigned short flags2;
+   char name[FLEX_ARRAY]; /* more */
+};
+
+/* These are only used for v3 or lower */
+#define align_flex_name(STRUCT,len) ((offsetof(struct STRUCT,name) + (len) + 
8)  ~7)
+#define ondisk_cache_entry_size(len) align_flex_name(ondisk_cache_entry,len)
+#define ondisk_cache_entry_extended_size(len) 
align_flex_name(ondisk_cache_entry_extended,len)
+#define ondisk_ce_size(ce) (((ce)-ce_flags  CE_EXTENDED) ? \
+   ondisk_cache_entry_extended_size(ce_namelen(ce

[PATCH/RFC v3 07/13] Read resolve-undo data

2012-08-08 Thread Thomas Gummerer
Make git read the resolve-undo data from the index.

Since the resolve-undo data is joined with the conflicts in
the ondisk format of the index file version 5, conflicts and
resolved data is read at the same time, and the resolve-undo
data is then converted to the in-memory format.

Helped-by: Thomas Rast tr...@student.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 read-cache-v5.c |1 +
 resolve-undo.c  |   36 
 resolve-undo.h  |2 ++
 3 files changed, 39 insertions(+)

diff --git a/read-cache-v5.c b/read-cache-v5.c
index ec1201d..b47398d 100644
--- a/read-cache-v5.c
+++ b/read-cache-v5.c
@@ -494,6 +494,7 @@ static struct directory_entry *read_entries(struct 
index_state *istate,
int i;
 
conflict_queue = read_conflicts(de, mmap, mmap_size, fd);
+   resolve_undo_convert_v5(istate, conflict_queue);
for (i = 0; i  de-de_nfiles; i++) {
ce = read_entry(de,
entry_offset,
diff --git a/resolve-undo.c b/resolve-undo.c
index 72b4612..f96c6ba 100644
--- a/resolve-undo.c
+++ b/resolve-undo.c
@@ -170,3 +170,39 @@ void unmerge_index(struct index_state *istate, const char 
**pathspec)
i = unmerge_index_entry_at(istate, i);
}
 }
+
+void resolve_undo_convert_v5(struct index_state *istate,
+   struct conflict_entry *ce)
+{
+   int i;
+
+   while (ce) {
+   struct string_list_item *lost;
+   struct resolve_undo_info *ui;
+   struct conflict_part *cp;
+
+   if (ce-entries  (ce-entries-flags  CONFLICT_CONFLICTED) 
!= 0) {
+   ce = ce-next;
+   continue;
+   }
+   if (!istate-resolve_undo) {
+   istate-resolve_undo = xcalloc(1, sizeof(struct 
string_list));
+   istate-resolve_undo-strdup_strings = 1;
+   }
+
+   lost = string_list_insert(istate-resolve_undo, ce-name);
+   if (!lost-util)
+   lost-util = xcalloc(1, sizeof(*ui));
+   ui = lost-util;
+
+   cp = ce-entries;
+   for (i = 0; i  3; i++)
+   ui-mode[i] = 0;
+   while (cp) {
+   ui-mode[conflict_stage(cp) - 1] = cp-entry_mode;
+   hashcpy(ui-sha1[conflict_stage(cp) - 1], cp-sha1);
+   cp = cp-next;
+   }
+   ce = ce-next;
+   }
+}
diff --git a/resolve-undo.h b/resolve-undo.h
index 8458769..ab660a6 100644
--- a/resolve-undo.h
+++ b/resolve-undo.h
@@ -13,4 +13,6 @@ extern void resolve_undo_clear_index(struct index_state *);
 extern int unmerge_index_entry_at(struct index_state *, int);
 extern void unmerge_index(struct index_state *, const char **);
 
+extern void resolve_undo_convert_v5(struct index_state *, struct 
conflict_entry *);
+
 #endif
-- 
1.7.10.GIT

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH/RFC v3 11/13] Write resolve-undo data for index-v5

2012-08-08 Thread Thomas Gummerer
Write the resolve undo data to the ondisk format, by joining the data
in the resolve-undo string-list with the already existing conflicts
that were compiled before, when searching the directories and add
them to the corresponding directory entries.

Helped-by: Thomas Rast tr...@student.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 read-cache-v5.c |3 ++
 resolve-undo.c  |   93 +++
 resolve-undo.h  |1 +
 3 files changed, 97 insertions(+)

diff --git a/read-cache-v5.c b/read-cache-v5.c
index 45f7acd..3d03111 100644
--- a/read-cache-v5.c
+++ b/read-cache-v5.c
@@ -861,6 +861,9 @@ static struct directory_entry 
*compile_directory_data(struct index_state *istate
previous_entry-next = no_subtrees;
}
}
+   if (istate-cache_tree)
+   cache_tree_to_ondisk_v5(table, istate-cache_tree);
+   resolve_undo_to_ondisk_v5(table, istate-resolve_undo, ndir, 
total_dir_len, de);
return de;
 }
 
diff --git a/resolve-undo.c b/resolve-undo.c
index f96c6ba..4568dcc 100644
--- a/resolve-undo.c
+++ b/resolve-undo.c
@@ -206,3 +206,96 @@ void resolve_undo_convert_v5(struct index_state *istate,
ce = ce-next;
}
 }
+
+void resolve_undo_to_ondisk_v5(struct hash_table *table,
+   struct string_list *resolve_undo,
+   unsigned int *ndir, int *total_dir_len,
+   struct directory_entry *de)
+{
+   struct string_list_item *item;
+   struct directory_entry *search;
+
+   if (!resolve_undo)
+   return;
+   for_each_string_list_item(item, resolve_undo) {
+   struct conflict_entry *conflict_entry;
+   struct resolve_undo_info *ui = item-util;
+   char *super;
+   int i, dir_len, len;
+   uint32_t crc;
+   struct directory_entry *found, *current, *new_tree;
+
+   if (!ui)
+   continue;
+
+   super = super_directory(item-string);
+   if (!super)
+   dir_len = 0;
+   else
+   dir_len = strlen(super);
+   crc = crc32(0, (Bytef*)super, dir_len);
+   found = lookup_hash(crc, table);
+   current = NULL;
+   new_tree = NULL;
+   
+   while (!found) {
+   struct directory_entry *new;
+
+   new = init_directory_entry(super, dir_len);
+   if (!current)
+   current = new;
+   insert_directory_entry(new, table, total_dir_len, ndir, 
crc);
+   if (new_tree != NULL)
+   new-de_nsubtrees = 1;
+   new-next = new_tree;
+   new_tree = new;
+   super = super_directory(super);
+   if (!super)
+   dir_len = 0;
+   else
+   dir_len = strlen(super);
+   crc = crc32(0, (Bytef*)super, dir_len);
+   found = lookup_hash(crc, table);
+   }
+   search = found;
+   while (search-next_hash  strcmp(super, search-pathname) != 
0)
+   search = search-next_hash;
+   if (search  !current)
+   current = search;
+   if (!search  !current)
+   current = new_tree;
+   if (!super  new_tree) {
+   new_tree-next = de-next;
+   de-next = new_tree;
+   de-de_nsubtrees++;
+   } else if (new_tree) {
+   struct directory_entry *temp;
+
+   search = de-next;
+   while (strcmp(super, search-pathname))
+   search = search-next;
+   temp = new_tree;
+   while (temp-next)
+   temp = temp-next;
+   search-de_nsubtrees++;
+   temp-next = search-next;
+   search-next = new_tree;
+   }
+
+   len = strlen(item-string);
+   conflict_entry = create_new_conflict(item-string, len, 
current-de_pathlen);
+   add_conflict_to_directory_entry(current, conflict_entry);
+   for (i = 0; i  3; i++) {
+   if (ui-mode[i]) {
+   struct conflict_part *cp;
+
+   cp = xmalloc(sizeof(struct conflict_part));
+   cp-flags = (i + 1)  CONFLICT_STAGESHIFT;
+   cp-entry_mode = ui-mode[i];
+   cp-next = NULL

[PATCH/RFC v3 12/13] update-index.c: always rewrite the index when index-version is given

2012-08-08 Thread Thomas Gummerer
Make git update-index always rewrite the index, if a index-version
is given. This is used for performance testing, to have a reader
and writer for the whole index.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 builtin/update-index.c |4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/builtin/update-index.c b/builtin/update-index.c
index 4ce341c..c31d176 100644
--- a/builtin/update-index.c
+++ b/builtin/update-index.c
@@ -6,6 +6,7 @@
 #include cache.h
 #include quote.h
 #include cache-tree.h
+#include read-cache.h
 #include tree-walk.h
 #include builtin.h
 #include refs.h
@@ -861,6 +862,7 @@ int cmd_update_index(int argc, const char **argv, const 
char *prefix)
if (the_index.version != preferred_index_format)
active_cache_changed = 1;
the_index.version = preferred_index_format;
+   set_istate_ops(the_index);
}
 
if (read_from_stdin) {
@@ -886,7 +888,7 @@ int cmd_update_index(int argc, const char **argv, const 
char *prefix)
strbuf_release(buf);
}
 
-   if (active_cache_changed) {
+   if (active_cache_changed || preferred_index_format) {
if (newfd  0) {
if (refresh_args.flags  REFRESH_QUIET)
exit(128);
-- 
1.7.10.GIT

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH/RFC v3 09/13] Write index-v5

2012-08-08 Thread Thomas Gummerer
Write the index version 5 file format to disk. This version doesn't
write the cache-tree data and resolve-undo data to the file.

The main work is done when filtering out the directories from the
current in-memory format, where in the same turn also the conflicts
and the file data is calculated.

Helped-by: Nguyen Thai Ngoc Duy pclo...@gmail.com
Helped-by: Thomas Rast tr...@student.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 cache.h |   10 +-
 read-cache-v5.c |  589 ++-
 read-cache.c|   19 +-
 read-cache.h|3 +
 4 files changed, 611 insertions(+), 10 deletions(-)

diff --git a/cache.h b/cache.h
index a0a1781..3fa348d 100644
--- a/cache.h
+++ b/cache.h
@@ -98,7 +98,7 @@ unsigned long git_deflate_bound(git_zstream *, unsigned long);
 #define CACHE_SIGNATURE 0x44495243 /* DIRC */
 
 #define INDEX_FORMAT_LB 2
-#define INDEX_FORMAT_UB 4
+#define INDEX_FORMAT_UB 5
 
 /*
  * The cache_time is just the low 32 bits of the
@@ -510,6 +510,7 @@ extern int verify_path(const char *path);
 extern struct cache_entry *index_name_exists(struct index_state *istate, const 
char *name, int namelen, int igncase);
 extern int index_name_stage_pos(const struct index_state *, const char *name, 
int namelen, int stage);
 extern int index_name_pos(const struct index_state *, const char *name, int 
namelen);
+extern struct directory_entry *init_directory_entry(char *pathname, int len);
 #define ADD_CACHE_OK_TO_ADD 1  /* Ok to add */
 #define ADD_CACHE_OK_TO_REPLACE 2  /* Ok to replace file/directory */
 #define ADD_CACHE_SKIP_DFCHECK 4   /* Ok to skip DF conflict checks */
@@ -1244,6 +1245,13 @@ static inline ssize_t write_str_in_full(int fd, const 
char *str)
return write_in_full(fd, str, strlen(str));
 }
 
+/* index-v5 helper functions */
+extern char *super_directory(const char *filename);
+extern void insert_directory_entry(struct directory_entry *, struct hash_table 
*, int *, unsigned int *, uint32_t);
+extern void add_conflict_to_directory_entry(struct directory_entry *, struct 
conflict_entry *);
+extern void add_part_to_conflict_entry(struct directory_entry *, struct 
conflict_entry *, struct conflict_part *);
+extern struct conflict_entry *create_new_conflict(char *, int, int);
+
 /* pager.c */
 extern void setup_pager(void);
 extern const char *pager_program;
diff --git a/read-cache-v5.c b/read-cache-v5.c
index 57d0fb5..45f7acd 100644
--- a/read-cache-v5.c
+++ b/read-cache-v5.c
@@ -583,9 +583,596 @@ static void read_index_v5(struct index_state *istate, 
void *mmap, int mmap_size,
istate-cache_tree = cache_tree_convert_v5(root_directory);
 }
 
+#define WRITE_BUFFER_SIZE 8192
+static unsigned char write_buffer[WRITE_BUFFER_SIZE];
+static unsigned long write_buffer_len;
+
+static int ce_write_flush(int fd)
+{
+   unsigned int buffered = write_buffer_len;
+   if (buffered) {
+   if (write_in_full(fd, write_buffer, buffered) != buffered)
+   return -1;
+   write_buffer_len = 0;
+   }
+   return 0;
+}
+
+static int ce_write(uint32_t *crc, int fd, void *data, unsigned int len)
+{
+   if (crc)
+   *crc = crc32(*crc, (Bytef*)data, len);
+   while (len) {
+   unsigned int buffered = write_buffer_len;
+   unsigned int partial = WRITE_BUFFER_SIZE - buffered;
+   if (partial  len)
+   partial = len;
+   memcpy(write_buffer + buffered, data, partial);
+   buffered += partial;
+   if (buffered == WRITE_BUFFER_SIZE) {
+   write_buffer_len = buffered;
+   if (ce_write_flush(fd))
+   return -1;
+   buffered = 0;
+   }
+   write_buffer_len = buffered;
+   len -= partial;
+   data = (char *) data + partial;
+   }
+   return 0;
+}
+
+static int ce_flush(int fd)
+{
+   unsigned int left = write_buffer_len;
+
+   if (left)
+   write_buffer_len = 0;
+
+   if (write_in_full(fd, write_buffer, left) != left)
+   return -1;
+
+   return 0;
+}
+
+static void ce_smudge_racily_clean_entry(struct cache_entry *ce)
+{
+   /*
+* This method shall only be called if the timestamp of ce
+* is racy (check with is_racy_timestamp). If the timestamp
+* is racy, the writer will just set the time to 0.
+*
+* The reader (match_stat_basic) will then take care
+* of checking if the entry is really changed or not, by
+* taking into account the stat_crc and if that hasn't changed
+* checking the sha1.
+*/
+   ce-ce_mtime.sec = 0;
+   ce-ce_mtime.nsec = 0;
+}
+
+char *super_directory(const char *filename)
+{
+   char *slash;
+
+   slash = strrchr(filename, '/');
+   if (slash)
+   return

Re: [PATCH/RFC v2 09/16] Read index-v5

2012-08-08 Thread Thomas Gummerer


On 08/08, Junio C Hamano wrote:
 Thomas Gummerer t.gumme...@gmail.com writes:
 
   +name = (char *)mmap + *dir_offset;
   +beginning = mmap + *dir_table_offset;
  
  Notice how you computed name with pointer arithmetic by first
  casting mmap (which is void *) and when computing beginning, you
  forgot to cast mmap and attempted pointer arithmetic with void *.
  The latter does not work and breaks compilation.
  
  The pointer-arith with void * is not limited to this function.
  ...
  I've used the type of the respective assignment for now. e.g. i have
  struct cache_header *hdr, so I'm using
  hdr = (struct cache_header *)mmap + x;
 
 You need to be careful when rewriting the above to choose the right
 value for 'x' if you go that route (which I wouldn't recommend).
 
 With
 
 hdr = ptr_add(mmap, x);
 
 you are making hdr point at x BYTES beyond mmap, but
 
 hdr = (struct cache_header *)mmap + x;
 
 means something entirely different, no?  hdr points at x entries
 of struct cache_header beyond mmap (in other words, if mmap[] were
 defined as struct cache_header mmap[], the above is saying the
 same as hdr = mmap[x]).
 
 I think the way you casted to compute the value for the name
 pointer is the (second) right thing to do.  The cast (char *)
 applied to mmap is about mmap is a typeless blob of memory I want
 to count bytes in.  Give me *dir_offset bytes into that blob.  It
 is not tied to the type of LHS (i.e. name) at all.  The result
 then needs to be casted to the type of LHS (i.e. name), and in
 this case the types happen to be the same, so you do not have to
 cast the result of the addition but that is mere luck.
 
 The next line is not so lucky and you would need to say something
 like:
 
 beginning = (uint32_t *)((char *)mmap + *dir_table_offset);
 
 Again, inner cast is about mmap is a blob counted in bytes, the
 outer cast is about type mismatch between a byte-address and LHS of
 the assignment.

This is what I tried in v3 of the series, but it didn't seem quiet
right.

 If mmap variable in this function were not void * but something
 more sane like const char *, you wouldn't have to have the inner
 cast to begin with, and that is why I said the way you did name is
 the second right thing.  Then you can write them like
 
 name = mmap + *dir_offset;
 beginning = (uint32_t *)(mmap + *dir_offset);
 
 After thinking about this, the ptr_add() macro might be the best
 solution, even though I originally called it as a band-aid.  We know
 mmap is a blob of memory, byte-offset of each component of which we
 know about, so we can say
 
 name = ptr_add(mmap, *dir_offset);
 beginning = ptr_add(mmap, *dir_offset);
 
 Hmmm..

I start to think so too. Casting the mmap variable to const char *
in the method call doesn't feel right to me, even though it would work.
Unless there are any objections I'll use ptr_add in the next version.
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH/RFC v2 06/16] t3700: sleep for 1 second, to avoid interfering with the racy code

2012-08-09 Thread Thomas Gummerer
On 08/08, Junio C Hamano wrote:
 Junio C Hamano gits...@pobox.com writes:
 
  So whether done with sleep or test-chmtime, avoiding a racily
  clean situation sounds like sweeping a bug in the v5 code in racy
  situation under the rug to me (unless I am misunderstanding what
  you are doing with this change and in your explanation, or the test
  was checking a wrong thing, that is).
 
  Even more confused
 
 OK, after staring this test for a long time, and going back to
 3d1f148 (refresh_index: do not show unmerged path that is outside
 pathspec, 2012-02-17), I give up.
 
 Let me ask the same question in a more direct way.  Which part of
 this test break with your series?
 
 test_expect_success 'git add --refresh with pathspec' '
 git reset --hard 
 echo foo  echo bar  echo baz 
 git add foo bar baz  H=$(git rev-parse :foo)  git rm -f 
 foo 
 echo 100644 $H 3 foo | git update-index --index-info 
   # sleep 1  in the update here ...
 test-chmtime -60 bar baz 
 expect 
 git add --refresh bar actual 
 test_cmp expect actual 
 
 git diff-files --name-only actual 
 ! grep bar actual
 grep baz actual
 '
 
 We prepare an index with bunch of paths, we make foo unmerged, we
 smudge bar and baz stat-dirty, so that diff-files would report
 them, even though their contents match what is recorded in the
 index.

After getting confused a bit myself, I now think here is the problem.
The v5 code smudges baz when doing git add --refresh bar.  Therefore
baz isn't considered stat-dirty by the code, but a racily smudged entry
and therefore its content gets checked, thus not showing up in
git diff-files.  The mtime doesn't get checked anymore as it is used
as smudge marker and thus 0.  Adding sleep just avoids smudging the
entry.

The alternative would be to use the size or the crc as smudge marker
but I don't think they are good canidates, as they can still be used
by the reader to avoid checking the filesystem.

Another alternative would be to introduce a CE_SMUDGED flag as it was
suggested by Thomas on irc IIRC, but we chose to use the mtime as
smudge marker instead.

 Then we say git add --refresh bar.  As far as I know, the output
 from git add --refresh pathspec is limited to foo: needs merge
 if and only if foo is covered by pathspec and foo is unmerged.
 
   Side note: If --verbose is given to the same command, we
   also give Unstaged changes after refreshing the index:
   followed by M foo or U foo if foo does not match the
   index but not unmerged, or if foo is unmerged, again if
   and only if foo is covered by pathspec.  But that is not
   how we invoke git add --refresh in this test.
 
 So if you are getting a test failure from the test_cmp, wouldn't it
 mean that your series broke what 3d1f148 did (namely, make sure we
 report only on paths that are covered by pathspec, in this case
 bar), as the contents of bar in the working tree matches what is
 recorded in the index?
 
 If the failure you are seeing is that bar appears in the output of
 git diff-files --name-only, it means that diff-files noticed
 that bar is stat-dirty after git add --refresh bar.  Wouldn't it
 mean that the series broke git add --refresh bar in such a way
 that it does not to refresh what it was told to refresh?
 
 Another test that could fail after the point you added sleep 1 is
 that the output from git diff-files --name-only fails to list
 baz in its output, but with test-chmtime -60 bar baz, we made
 sure that bar and baz are stat-dirty, and we only refreshed
 bar and not baz.  If that is the case, then would it mean that
 the series broke git add --refresh bar in such a way that it
 refreshes something other than what it was told to refresh?

 In any case, having to change this test in any way smells like there
 is some breakage in the series; it is not immediately obvious to me
 that the current test is checking anything wrong as I suspected in
 the earlier message.
 
 So,... I dunno.
 
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH/RFC v2 06/16] t3700: sleep for 1 second, to avoid interfering with the racy code

2012-08-09 Thread Thomas Gummerer
On 08/09, Junio C Hamano wrote:
 Thomas Gummerer t.gumme...@gmail.com writes:
 
  On 08/08, Junio C Hamano wrote:
  ...
  Let me ask the same question in a more direct way.  Which part of
  this test break with your series?
  
  test_expect_success 'git add --refresh with pathspec' '
  git reset --hard 
  echo foo  echo bar  echo baz 
  git add foo bar baz  H=$(git rev-parse :foo)  git rm 
  -f foo 
  echo 100644 $H 3  foo | git update-index --index-info 
 # sleep 1  in the update here ...
  test-chmtime -60 bar baz 
  expect 
  git add --refresh bar actual 
  test_cmp expect actual 
  
  git diff-files --name-only actual 
  ! grep bar actual
  grep baz actual
  '
  
  We prepare an index with bunch of paths, we make foo unmerged, we
  smudge bar and baz stat-dirty, so that diff-files would report
  them, even though their contents match what is recorded in the
  index.
 
  After getting confused a bit myself, I now think here is the problem.
  The v5 code smudges baz when doing git add --refresh bar.  Therefore
  baz isn't considered stat-dirty by the code, but a racily smudged entry
  and therefore its content gets checked, thus not showing up in
  git diff-files.
 
 So in short, the breakage is the last one among the three choices I
 gave you in my message you are responding to.  The user asked to
 refresh bar so that later diff-files won't report a false change
 on it, but baz effectively ends up getting refreshed at the same
 time and a false change is not reported.

Exactly.

 That breakage is, from the correctness point of view, not a
 breakage.  As the primary purpose of refreshing is to support
 commands that want to rely on a quick ce_modified() call to tell
 files that are modified in the working tree since it was last added
 to the index---you refresh once, and then you call such commands
 many times without having to worry about having to compare the
 contents between the indexed objects and the working tree files.
 
 But from the performance point of view, which is the whole point of
 refresh, the behaviour of the new code is dubious.  If the user is
 working in a large working tree (which automatically means large
 index, the primary reason we are doing this v5 experiment), the user
 often is working in a deep and narrow subdirectory of it, and a path
 limited refresh (the test names a specific file bar, but imagine
 it were . to limit it to the directory the user is working in) may
 be a cheap way not to bother even checking outside the area the user
 currently is working in.

That's true, but once we have the partial reader/writer, we do not
bother checking outside the area the user is currently working in
anyway.

Also and probably more importantly, this will only affect a *very*
small number of entries, because timestamps outside of the directory
in which the user is working in are rarely updated recently and
thus racy.

 Also, smudging more entries than necessary
 to be checked by ce_modified_check_fs() later at runtime may mean
 that it defeats the refresh once and then compare cheaply many
 times pattern that is employed by existing scripts.

The new racy code also calls ce_modified_check_fs() only if the size
and the stat_crc are not changed.  It's true that ce_modified_check_fs()
can be called multiple times, when match_stat_crc() is called, but that
could be solved by adding an additional flag CE_IS_MODIFIED, which
indicates that ce_modified_check_fs() was already run.

 Is the root cause really where the racily-clean so smudge to tell
 later runtime to check contents bit goes?  I am hoping that the
 issue is not coming from the difference between the current code and
 your code when they decide to smudge, what entries they decide to
 smudge and based on what condition.

I just gave it a try using a CE_SMUDGED flag, instead of the mtime
as smudge marker, which which this test works without any problems.
It doesn't work the other way round, the test as the test doesn't
break when using mtime as smudge marker in v2, because we do the
ce_modified_check_fs() test earlier.
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH/RFC v3 01/13] Move index v2 specific functions to their own file

2012-08-09 Thread Thomas Gummerer
On 08/09, Junio C Hamano wrote:
 Thomas Gummerer t.gumme...@gmail.com writes:
 
   /* remember to discard_cache() before reading a different cache! */
   int read_index_from(struct index_state *istate, const char *path)
   {
  ...
  mmap = xmmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 
  0);
  -   close(fd);
  if (mmap == MAP_FAILED)
  die_errno(unable to map index file);
   
  hdr = mmap;
  -   if (verify_hdr(hdr, mmap_size)  0)
  +   if (verify_hdr_version(istate, hdr, mmap_size)  0)
  goto unmap;
   ...
  +   if (istate-ops-verify_hdr(mmap, mmap_size)  0)
  +   goto unmap;
   
  +   istate-ops-read_index(istate, mmap, mmap_size, fd);
  ...
  +   close(fd);
 
 This looks utterly wrong.
 
 You already have mapped the whole thing, so there is nothing to be
 read from fd.  You have everything in-core.  Leaving fd open and
 pass it around looks like it is asking for trouble and confusion.
 
 If you found that an entry you read halfway has an inconsistent crc,
 and if you suspect that is because somebody else was writing to the
 same index, it is a _sure_ sign that you are not alone, and all the
 entries you read so far to the core, even if they weren't touched by
 that sombody else when you read them, may be stale, and worse yet,
 what you are going to read may be inconsistent with what you've read
 and have in-core (e.g. you may have read f before somebody else
 that is racing with you have turned it into a directory, and your
 next read may find f/d in the index without crc error).
 
 One sane way to avoid reading such an inconsistent state may be to
 redo this whole function, starting from the part that calls mmap().
 IOW,
 
   do {
   fd = open()
   mmap = xmmap(fd);
   close(fd);
 verify_various_fields(mmap);
 status = istate-ops-read_index(istate, mmap, mmap_size));
   } while (status == READ_AGAIN);
 
 I do not think the pass fd around so that we can redo the mapping
 deep inside the callchain is either a good idea or necessary.

Thanks, that looks better.  I'll change it for the re-roll.
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH/RFC v3 07/13] Read resolve-undo data

2012-08-09 Thread Thomas Gummerer
On 08/09, Junio C Hamano wrote:
 Thomas Gummerer t.gumme...@gmail.com writes:
 
  Make git read the resolve-undo data from the index.
 
  Since the resolve-undo data is joined with the conflicts in
  the ondisk format of the index file version 5, conflicts and
  resolved data is read at the same time, and the resolve-undo
  data is then converted to the in-memory format.
 
 This, and the next one, are both about reading extension data from
 the v2 formatted index, no?

Yes, exactly.

 Again, mild NAK.
 
 I think it is a lot more logical for the v5 code to read data stored
 in the resolve-undo and cache-tree extensions using the public API
 just like other users of these data do, and write out whatever in a
 way that is specific to the v5 index format.
 
 If the v5 codepath needs some information that is not exposed to
 other users of istate-resolve_undo and istate-cache_tree, then the
 story is different, but I do not think that is the case.

Sorry it's not clear to me what you mean with using the public API here.
Do you mean using resolve_undo_write() and resolve_undo_read()? I
wouldn't think those two methods would be really useful, as they expect
the data mangled in to a char* or return it as struct strbuf*.  And I
don't see the other methods doing something more useful.  Or I could
use the resolve-undo string_list directly, and just move the function
to read-cache-v5.c, or am I missing something here?

 
  Helped-by: Thomas Rast tr...@student.ethz.ch
  Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
  ---
   read-cache-v5.c |1 +
   resolve-undo.c  |   36 
   resolve-undo.h  |2 ++
   3 files changed, 39 insertions(+)
 
  diff --git a/read-cache-v5.c b/read-cache-v5.c
  index ec1201d..b47398d 100644
  --- a/read-cache-v5.c
  +++ b/read-cache-v5.c
  @@ -494,6 +494,7 @@ static struct directory_entry *read_entries(struct 
  index_state *istate,
  int i;
   
  conflict_queue = read_conflicts(de, mmap, mmap_size, fd);
  +   resolve_undo_convert_v5(istate, conflict_queue);
  for (i = 0; i  de-de_nfiles; i++) {
  ce = read_entry(de,
  entry_offset,
  diff --git a/resolve-undo.c b/resolve-undo.c
  index 72b4612..f96c6ba 100644
  --- a/resolve-undo.c
  +++ b/resolve-undo.c
  @@ -170,3 +170,39 @@ void unmerge_index(struct index_state *istate, const 
  char **pathspec)
  i = unmerge_index_entry_at(istate, i);
  }
   }
  +
  +void resolve_undo_convert_v5(struct index_state *istate,
  +   struct conflict_entry *ce)
  +{
  +   int i;
  +
  +   while (ce) {
  +   struct string_list_item *lost;
  +   struct resolve_undo_info *ui;
  +   struct conflict_part *cp;
  +
  +   if (ce-entries  (ce-entries-flags  CONFLICT_CONFLICTED) 
  != 0) {
  +   ce = ce-next;
  +   continue;
  +   }
  +   if (!istate-resolve_undo) {
  +   istate-resolve_undo = xcalloc(1, sizeof(struct 
  string_list));
  +   istate-resolve_undo-strdup_strings = 1;
  +   }
  +
  +   lost = string_list_insert(istate-resolve_undo, ce-name);
  +   if (!lost-util)
  +   lost-util = xcalloc(1, sizeof(*ui));
  +   ui = lost-util;
  +
  +   cp = ce-entries;
  +   for (i = 0; i  3; i++)
  +   ui-mode[i] = 0;
  +   while (cp) {
  +   ui-mode[conflict_stage(cp) - 1] = cp-entry_mode;
  +   hashcpy(ui-sha1[conflict_stage(cp) - 1], cp-sha1);
  +   cp = cp-next;
  +   }
  +   ce = ce-next;
  +   }
  +}
  diff --git a/resolve-undo.h b/resolve-undo.h
  index 8458769..ab660a6 100644
  --- a/resolve-undo.h
  +++ b/resolve-undo.h
  @@ -13,4 +13,6 @@ extern void resolve_undo_clear_index(struct index_state 
  *);
   extern int unmerge_index_entry_at(struct index_state *, int);
   extern void unmerge_index(struct index_state *, const char **);
   
  +extern void resolve_undo_convert_v5(struct index_state *, struct 
  conflict_entry *);
  +
   #endif
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH/RFC v3 07/13] Read resolve-undo data

2012-08-10 Thread Thomas Gummerer
On 08/09, Junio C Hamano wrote:
 Thomas Gummerer t.gumme...@gmail.com writes:
 
  On 08/09, Junio C Hamano wrote:
  Thomas Gummerer t.gumme...@gmail.com writes:
  
   Make git read the resolve-undo data from the index.
  
   Since the resolve-undo data is joined with the conflicts in
   the ondisk format of the index file version 5, conflicts and
   resolved data is read at the same time, and the resolve-undo
   data is then converted to the in-memory format.
  
  This, and the next one, are both about reading extension data from
  the v2 formatted index, no?
 
  Yes, exactly.
 
  Again, mild NAK.
  
  I think it is a lot more logical for the v5 code to read data stored
  in the resolve-undo and cache-tree extensions using the public API
  just like other users of these data do, and write out whatever in a
  way that is specific to the v5 index format.
  
  If the v5 codepath needs some information that is not exposed to
  other users of istate-resolve_undo and istate-cache_tree, then the
  story is different, but I do not think that is the case.
 
  Sorry it's not clear to me what you mean with using the public API here.
  Do you mean using resolve_undo_write() and resolve_undo_read()?
 
 The code that reads from istate-resolve_undo is fine to do the v5
 specific conversion, but it does not belong to resolve-undo.c file
 which is about the resolve-undo extension.  Moving it to v5 specific
 file you added for this topic, read-cache-v5.c, and everything looks
 more logical.  When we taught ls-files to show the paths with
 resolve-undo data, we didn't add any function to resolve-undo.c that
 does ls-files's work for it.  Instead, ls-files just uses the public
 API (the data structure you find at the_index.resolve_undo is part
 of the API) to find what it needs to learn, and I think v5 code can
 do the same.
 
 then the story is different comment refers to a possibilty that
 v5 code might need something more than callers outside resolve-undo.c
 can find from its public interface, but I do not think it is the
 case.

Ok, thanks for the clarification, will change it for the re-roll.
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH/RFC v3 01/13] Move index v2 specific functions to their own file

2012-08-10 Thread Thomas Gummerer
On 08/10, Junio C Hamano wrote:
 Thomas Rast tr...@student.ethz.ch writes:
 
  But I think the idea always was that any write that changes the basic
  layout of the file (so that you would read something wrong) will need a
  full rewrite.  Otherwise we're too far in DB land.
 
  Most updates will be
  of the update the stat and/or sha1 of a file kind, anyway.
 
 Yes, I agree the v5 format documented in the series does not let you
 do anything other than the kind of updates without rewriting [*1*]
 
 But that does not fundamentally change the story that a new format
 and a new way to access the index to cope with larger projects would
 want to come up with a solution to address the competing read/write
 issue, or at least help to make it easier to solve the issue in the
 future.
 
 That problem is not new is not an answer when the question is We
 still have the problem.
 
 
 [Footnote]
 
 *1* While my gut feeling matches your guess that the kind of updates
 would be the majority, I do not think anybody did numbers to
 substanticate it, which we may want to see happen.

Hrm anther way to solve this may be the following. The idea would be
to just check if the index_file changed basically using the same
heuristic we already use to detect file changes.  (use the stat data,
mtime, size, etc.)

With this code we do not rely on the crc sums to check if the index
needs to be re-read anymore and don't have a problem if part of the
index changes, after we read it (we know the index changed from its
mtime and can just re-read it).  Another thing that would have to
change is that we can't use die if a crc is wrong, but some return
code, but that shouldn't be a problem.  I'm not sure I'm not missing
something here though.

do {
fd = open()
fstat(fd, st_old);
mmap = xmmap(fd);
verify_various_fields(mmap);
istate-ops-read_index(istate,
mmap,
mmap_size));
fstat(fd, st_new);
close(fd);
} while (stat_data_doesnt_match(st_old, st_new));


--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH/RFC v4 02/13] read-cache.c: Re-read index if index file changed

2012-08-16 Thread Thomas Gummerer
Add the possibility of re-reading the index file, if it changed
while reading.

The index file might change during the read, causing outdated
information to be displayed. We check if the index file changed
by using its stat data as heuristic.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 read-cache.c | 87 +---
 1 file changed, 60 insertions(+), 27 deletions(-)

diff --git a/read-cache.c b/read-cache.c
index 6a8b4b1..cdd8480 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1170,11 +1170,34 @@ int read_index(struct index_state *istate)
return read_index_from(istate, get_index_file());
 }
 
+static int index_changed(struct stat st_old, struct stat st_new)
+{
+   int changed = 0;
+
+   if (st_old.st_mtime != st_new.st_mtime ||
+   st_old.st_uid   != st_new.st_uid ||
+   st_old.st_gid   != st_new.st_gid ||
+   st_old.st_ino   != st_new.st_ino ||
+   st_old.st_size  != st_new.st_size)
+   changed = 1;
+#ifdef USE_NSEC
+   if (ST_MTIME_NSEC(st_old) != ST_MTIME_NSEC(st_new))
+   changed = 1;
+#endif
+
+#ifdef USE_STDEV
+   if (st_old.st_dev != st_new.st_dev)
+   changed = 1;
+#endif
+
+   return changed;
+}
+
 /* remember to discard_cache() before reading a different cache! */
 int read_index_from(struct index_state *istate, const char *path)
 {
-   int fd;
-   struct stat st;
+   int fd, err, i = 0;
+   struct stat st_old, st_new;
struct cache_version_header *hdr;
void *mmap;
size_t mmap_size;
@@ -1186,38 +1209,48 @@ int read_index_from(struct index_state *istate, const 
char *path)
errno = ENOENT;
istate-timestamp.sec = 0;
istate-timestamp.nsec = 0;
-   fd = open(path, O_RDONLY);
-   if (fd  0) {
-   if (errno == ENOENT)
-   return 0;
-   die_errno(index file open failed);
-   }
+   do {
+   err = 0;
+   fd = open(path, O_RDONLY);
+   if (fd  0) {
+   if (errno == ENOENT)
+   return 0;
+   die_errno(index file open failed);
+   }
 
-   if (fstat(fd, st))
-   die_errno(cannot stat the open index);
+   if (fstat(fd, st_old))
+   die_errno(cannot stat the open index);
 
-   errno = EINVAL;
-   mmap_size = xsize_t(st.st_size);
-   mmap = xmmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 
0);
-   close(fd);
-   if (mmap == MAP_FAILED)
-   die_errno(unable to map index file);
+   errno = EINVAL;
+   mmap_size = xsize_t(st_old.st_size);
+   mmap = xmmap(NULL, mmap_size, PROT_READ | PROT_WRITE, 
MAP_PRIVATE, fd, 0);
+   close(fd);
+   if (mmap == MAP_FAILED)
+   die_errno(unable to map index file);
 
-   hdr = mmap;
-   if (verify_hdr_version(istate, hdr, mmap_size)  0)
-   goto unmap;
+   hdr = mmap;
+   if (verify_hdr_version(istate, hdr, mmap_size)  0)
+   err = 1;
 
-   if (istate-ops-verify_hdr(mmap, mmap_size)  0)
-   goto unmap;
+   if (istate-ops-verify_hdr(mmap, mmap_size)  0)
+   err = 1;
 
-   istate-ops-read_index(istate, mmap, mmap_size);
-   istate-timestamp.sec = st.st_mtime;
-   istate-timestamp.nsec = ST_MTIME_NSEC(st);
+   if (istate-ops-read_index(istate, mmap, mmap_size)  0)
+   err = 1;
+   istate-timestamp.sec = st_old.st_mtime;
+   istate-timestamp.nsec = ST_MTIME_NSEC(st_old);
+   if (lstat(path, st_new))
+   die_errno(cannot stat the open index);
 
-   munmap(mmap, mmap_size);
-   return istate-cache_nr;
+   munmap(mmap, mmap_size);
+
+   if (!index_changed(st_old, st_new)  !err)
+   return istate-cache_nr;
+
+   usleep(10*1000);
+   i++;
+   } while ((err || index_changed(st_old, st_new))  i  50);
 
-unmap:
munmap(mmap, mmap_size);
die(index file corrupt);
 }
-- 
1.7.11.2

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH/RFC v4 03/13] t2104: Don't fail for index versions other than [23]

2012-08-16 Thread Thomas Gummerer
t2104 currently checks for the exact index version 2 or 3,
depending if there is a skip-worktree flag or not. Other
index versions do not use extended flags and thus cannot
be tested for version changes.

Make this test update the index to version 2 at the beginning
of the test. Testing the skip-worktree flags for the default
index format is still covered by t7011 and t7012.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 t/t2104-update-index-skip-worktree.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/t/t2104-update-index-skip-worktree.sh 
b/t/t2104-update-index-skip-worktree.sh
index 1d0879b..bd9644f 100755
--- a/t/t2104-update-index-skip-worktree.sh
+++ b/t/t2104-update-index-skip-worktree.sh
@@ -22,6 +22,7 @@ H sub/2
 EOF
 
 test_expect_success 'setup' '
+   git update-index --index-version=2 
mkdir sub 
touch ./1 ./2 sub/1 sub/2 
git add 1 2 sub/1 sub/2 
-- 
1.7.11.2

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH/RFC v4 12/13] update-index.c: rewrite index when index-version is given

2012-08-16 Thread Thomas Gummerer
Make update-index always rewrite the index when a index-version
is given, even if the index already has the right version.
This option is used for performance testing the writer and
reader.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 builtin/update-index.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/builtin/update-index.c b/builtin/update-index.c
index 4ce341c..c31d176 100644
--- a/builtin/update-index.c
+++ b/builtin/update-index.c
@@ -6,6 +6,7 @@
 #include cache.h
 #include quote.h
 #include cache-tree.h
+#include read-cache.h
 #include tree-walk.h
 #include builtin.h
 #include refs.h
@@ -861,6 +862,7 @@ int cmd_update_index(int argc, const char **argv, const 
char *prefix)
if (the_index.version != preferred_index_format)
active_cache_changed = 1;
the_index.version = preferred_index_format;
+   set_istate_ops(the_index);
}
 
if (read_from_stdin) {
@@ -886,7 +888,7 @@ int cmd_update_index(int argc, const char **argv, const 
char *prefix)
strbuf_release(buf);
}
 
-   if (active_cache_changed) {
+   if (active_cache_changed || preferred_index_format) {
if (newfd  0) {
if (refresh_args.flags  REFRESH_QUIET)
exit(128);
-- 
1.7.11.2

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH/RFC v4 13/13] p0002-index.sh: add perf test for the index formats

2012-08-16 Thread Thomas Gummerer
From: Thomas Rast tr...@student.ethz.ch

Add a performance test for index version [23]/4/5 by using
git update-index --index-version=x, thus testing both the reader
and the writer speed of all index formats.

Signed-off-by: Thomas Rast tr...@student.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 t/perf/p0002-index.sh | 33 +
 1 file changed, 33 insertions(+)
 create mode 100755 t/perf/p0002-index.sh

diff --git a/t/perf/p0002-index.sh b/t/perf/p0002-index.sh
new file mode 100755
index 000..140c7a0
--- /dev/null
+++ b/t/perf/p0002-index.sh
@@ -0,0 +1,33 @@
+#!/bin/sh
+
+test_description=Tests index versions [23]/4/5
+
+. ./perf-lib.sh
+
+test_perf_large_repo
+
+test_expect_success 'convert to v3' '
+   git update-index --index-version=3
+'
+
+test_perf 'v[23]: update-index' '
+   git update-index --index-version=3 /dev/null
+'
+
+test_expect_success 'convert to v4' '
+   git update-index --index-version=4
+'
+
+test_perf 'v4: update-index' '
+   git update-index --index-version=4 /dev/null
+'
+
+test_expect_success 'convert to v5' '
+   git update-index --index-version=5
+'
+
+test_perf 'v5: update-index' '
+   git update-index --index-version=5 /dev/null
+'
+
+test_done
-- 
1.7.11.2

--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH/RFC v4 11/13] Write resolve-undo data for index-v5

2012-08-16 Thread Thomas Gummerer
Make git read the resolve-undo data from the index.

Since the resolve-undo data is joined with the conflicts in
the ondisk format of the index file version 5, conflicts and
resolved data is read at the same time, and the resolve-undo
data is then converted to the in-memory format.

Helped-by: Thomas Rast tr...@student.ethz.ch
Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 read-cache-v5.c | 96 +
 1 file changed, 96 insertions(+)

diff --git a/read-cache-v5.c b/read-cache-v5.c
index d740d0b..ce2375a 100644
--- a/read-cache-v5.c
+++ b/read-cache-v5.c
@@ -907,6 +907,99 @@ static void cache_tree_to_ondisk_v5(struct hash_table 
*table, struct cache_tree
convert_one_to_ondisk_v5(table, root, , 0, 0);
 }
 
+static void resolve_undo_to_ondisk_v5(struct hash_table *table,
+ struct string_list *resolve_undo,
+ unsigned int *ndir, int *total_dir_len,
+ struct directory_entry *de)
+{
+   struct string_list_item *item;
+   struct directory_entry *search;
+
+   if (!resolve_undo)
+   return;
+   for_each_string_list_item(item, resolve_undo) {
+   struct conflict_entry *conflict_entry;
+   struct resolve_undo_info *ui = item-util;
+   char *super;
+   int i, dir_len, len;
+   uint32_t crc;
+   struct directory_entry *found, *current, *new_tree;
+
+   if (!ui)
+   continue;
+
+   super = super_directory(item-string);
+   if (!super)
+   dir_len = 0;
+   else
+   dir_len = strlen(super);
+   crc = crc32(0, (Bytef*)super, dir_len);
+   found = lookup_hash(crc, table);
+   current = NULL;
+   new_tree = NULL;
+
+   while (!found) {
+   struct directory_entry *new;
+
+   new = init_directory_entry(super, dir_len);
+   if (!current)
+   current = new;
+   insert_directory_entry(new, table, total_dir_len, ndir, 
crc);
+   if (new_tree != NULL)
+   new-de_nsubtrees = 1;
+   new-next = new_tree;
+   new_tree = new;
+   super = super_directory(super);
+   if (!super)
+   dir_len = 0;
+   else
+   dir_len = strlen(super);
+   crc = crc32(0, (Bytef*)super, dir_len);
+   found = lookup_hash(crc, table);
+   }
+   search = found;
+   while (search-next_hash  strcmp(super, search-pathname) != 
0)
+   search = search-next_hash;
+   if (search  !current)
+   current = search;
+   if (!search  !current)
+   current = new_tree;
+   if (!super  new_tree) {
+   new_tree-next = de-next;
+   de-next = new_tree;
+   de-de_nsubtrees++;
+   } else if (new_tree) {
+   struct directory_entry *temp;
+
+   search = de-next;
+   while (strcmp(super, search-pathname))
+   search = search-next;
+   temp = new_tree;
+   while (temp-next)
+   temp = temp-next;
+   search-de_nsubtrees++;
+   temp-next = search-next;
+   search-next = new_tree;
+   }
+
+   len = strlen(item-string);
+   conflict_entry = create_new_conflict(item-string, len, 
current-de_pathlen);
+   add_conflict_to_directory_entry(current, conflict_entry);
+   for (i = 0; i  3; i++) {
+   if (ui-mode[i]) {
+   struct conflict_part *cp;
+
+   cp = xmalloc(sizeof(struct conflict_part));
+   cp-flags = (i + 1)  CONFLICT_STAGESHIFT;
+   cp-entry_mode = ui-mode[i];
+   cp-next = NULL;
+   hashcpy(cp-sha1, ui-sha1[i]);
+   add_part_to_conflict_entry(current, 
conflict_entry, cp);
+   }
+   }
+   }
+}
+
 static struct directory_entry *compile_directory_data(struct index_state 
*istate,
int nfile,
unsigned int *ndir,
@@ -1012,6 +1105,9 @@ static struct directory_entry 
*compile_directory_data(struct

[PATCH/RFC v4 08/13] Read cache-tree in index-v5

2012-08-16 Thread Thomas Gummerer
Since the cache-tree data is saved as part of the directory data,
we already read it at the beginning of the index. The cache-tree
is only converted from this directory data.

The cache-tree data is arranged in a tree, with the children sorted by
pathlen at each node, while the ondisk format is sorted lexically.
So we have to rebuild this format from the on-disk directory list.

Signed-off-by: Thomas Gummerer t.gumme...@gmail.com
---
 cache-tree.c|  2 +-
 cache-tree.h|  6 
 read-cache-v5.c | 98 +
 3 files changed, 105 insertions(+), 1 deletion(-)

diff --git a/cache-tree.c b/cache-tree.c
index 28ed657..61544d8 100644
--- a/cache-tree.c
+++ b/cache-tree.c
@@ -31,7 +31,7 @@ void cache_tree_free(struct cache_tree **it_p)
*it_p = NULL;
 }
 
-static int subtree_name_cmp(const char *one, int onelen,
+int subtree_name_cmp(const char *one, int onelen,
const char *two, int twolen)
 {
if (onelen  twolen)
diff --git a/cache-tree.h b/cache-tree.h
index d8cb2e9..7416007 100644
--- a/cache-tree.h
+++ b/cache-tree.h
@@ -20,10 +20,16 @@ struct cache_tree {
struct cache_tree_sub **down;
 };
 
+struct directory_queue {
+   struct directory_queue *down;
+   struct directory_entry *de;
+};
+
 struct cache_tree *cache_tree(void);
 void cache_tree_free(struct cache_tree **);
 void cache_tree_invalidate_path(struct cache_tree *, const char *);
 struct cache_tree_sub *cache_tree_sub(struct cache_tree *, const char *);
+int subtree_name_cmp(const char *, int, const char *, int);
 
 void cache_tree_write(struct strbuf *, struct cache_tree *root);
 struct cache_tree *cache_tree_read(const char *buffer, unsigned long size);
diff --git a/read-cache-v5.c b/read-cache-v5.c
index fb549de..b497726 100644
--- a/read-cache-v5.c
+++ b/read-cache-v5.c
@@ -448,6 +448,103 @@ static int read_conflicts(struct conflict_entry **head,
return 0;
 }
 
+static struct cache_tree *convert_one(struct directory_queue *queue, int dirnr)
+{
+   int i, subtree_nr;
+   struct cache_tree *it;
+   struct directory_queue *down;
+
+   it = cache_tree();
+   it-entry_count = queue[dirnr].de-de_nentries;
+   subtree_nr = queue[dirnr].de-de_nsubtrees;
+   if (0 = it-entry_count)
+   hashcpy(it-sha1, queue[dirnr].de-sha1);
+
+   /*
+   * Just a heuristic -- we do not add directories that often but
+   * we do not want to have to extend it immediately when we do,
+   * hence +2.
+   */
+   it-subtree_alloc = subtree_nr + 2;
+   it-down = xcalloc(it-subtree_alloc, sizeof(struct cache_tree_sub *));
+   down = queue[dirnr].down;
+   for (i = 0; i  subtree_nr; i++) {
+   struct cache_tree *sub;
+   struct cache_tree_sub *subtree;
+   char *buf, *name;
+
+   name = ;
+   buf = strtok(down[i].de-pathname, /);
+   while (buf) {
+   name = buf;
+   buf = strtok(NULL, /);
+   }
+   sub = convert_one(down, i);
+   if(!sub)
+   goto free_return;
+   subtree = cache_tree_sub(it, name);
+   subtree-cache_tree = sub;
+   }
+   if (subtree_nr != it-subtree_nr)
+   die(cache-tree: internal error);
+   return it;
+ free_return:
+   cache_tree_free(it);
+   return NULL;
+}
+
+static int compare_cache_tree_elements(const void *a, const void *b)
+{
+   const struct directory_entry *de1, *de2;
+
+   de1 = ((const struct directory_queue *)a)-de;
+   de2 = ((const struct directory_queue *)b)-de;
+   return subtree_name_cmp(de1-pathname, de1-de_pathlen,
+   de2-pathname, de2-de_pathlen);
+}
+
+static struct directory_entry *sort_directories(struct directory_entry *de,
+   struct directory_queue *queue)
+{
+   int i, nsubtrees;
+
+   nsubtrees = de-de_nsubtrees;
+   for (i = 0; i  nsubtrees; i++) {
+   struct directory_entry *new_de;
+   de = de-next;
+   new_de = xmalloc(directory_entry_size(de-de_pathlen));
+   memcpy(new_de, de, directory_entry_size(de-de_pathlen));
+   queue[i].de = new_de;
+   if (de-de_nsubtrees) {
+   queue[i].down = xcalloc(de-de_nsubtrees,
+   sizeof(struct directory_queue));
+   de = sort_directories(de,
+   queue[i].down);
+   }
+   }
+   qsort(queue, nsubtrees, sizeof(struct directory_queue),
+   compare_cache_tree_elements);
+   return de;
+}
+
+/*
+ * This function modifys the directory argument that is given to it.
+ * Don't use it if the directory entries are still needed after.
+ */
+static struct cache_tree

  1   2   3   4   5   6   7   8   9   >