While stress testing `git filter-repo`, I noticed an issue with
encoding; further digging led to the fixes and features in this series.
See the individual commit messages for details.
Changes since v3 (full range-diff below):
* YES/NO changes suggested by Torsten
* more boolean synonyms as suggested by Junio
* check for the exact expected special bytes, in addition to the size
(Dscho pointed out that it was GitForWindows that munged bytes, not
Windows, so while I need to be careful in what I pass to git, printf
and grep can work directly with the special bytes)
* also checked on gitgitgadget that it passes on the major platforms
[1] https://github.com/gitgitgadget/git/pull/191
Elijah Newren (5):
t9350: fix encoding test to actually test reencoding
fast-import: support 'encoding' commit header
fast-export: avoid stripping encoding header if we cannot reencode
fast-export: differentiate between explicitly utf-8 and implicitly
utf-8
fast-export: do automatic reencoding of commit messages only if
requested
Documentation/git-fast-import.txt | 7 ++
builtin/fast-export.c | 44 +++++++++--
fast-import.c | 11 ++-
t/t9300-fast-import.sh | 20 +++++
t/t9350-fast-export.sh | 78 +++++++++++++++++---
t/t9350/broken-iso-8859-7-commit-message.txt | 1 +
t/t9350/simple-iso-8859-7-commit-message.txt | 1 +
7 files changed, 145 insertions(+), 17 deletions(-)
create mode 100644 t/t9350/broken-iso-8859-7-commit-message.txt
create mode 100644 t/t9350/simple-iso-8859-7-commit-message.txt
Range-diff:
1: 2d7bb64acf ! 1: 37a68a0ffd t9350: fix encoding test to actually test
reencoding
@@ -39,18 +39,16 @@
git fast-import &&
+ # The commit object, if not re-encoded, would be 240 bytes.
+ # Removing the "encoding iso-8859-7\n" header drops 20 bytes.
-+ # Re-encoding the Pi character from \xF0 in iso-8859-7 to
-+ # \xCF\x80 in utf-8 adds a byte. Grepping for specific bytes
-+ # would be nice, but Windows apparently munges user data
-+ # in the form of bytes on the command line to force them to
-+ # be characters instead, so we are limited for portability
-+ # reasons in subsequent similar tests in this file to check
-+ # for size rather than what bytes are present.
++ # Re-encoding the Pi character from \xF0 (\360) in iso-8859-7
++ # to \xCF\x80 (\317\200) in utf-8 adds a byte. Check for
++ # the expected size.
+ test 221 -eq "$(git cat-file -s i18n)" &&
-+ # Also make sure the commit does not have the "encoding" header
++ # ...and for the expected translation of bytes.
git cat-file commit i18n >actual &&
- grep "Áéí óú" actual)
-
++ grep $(printf "\317\200") actual &&
++ # Also make sure the commit does not have the "encoding" header
+ ! grep ^encoding actual)
'
+
2: 9fa5695017 = 2: 3d84f4613d fast-import: support 'encoding' commit header
3: dfc76573e9 ! 3: baa8394a3a fast-export: avoid stripping encoding header if
we cannot reencode
@@ -49,10 +49,14 @@
+ (cd new &&
+ git fast-import &&
+ git cat-file commit i18n-invalid >actual &&
++ # Make sure the commit still has the encoding header
+ grep ^encoding actual &&
-+ # Also verify that the commit has the expected size; i.e.
++ # Verify that the commit has the expected size; i.e.
+ # that no bytes were re-encoded to a different encoding.
-+ test 252 -eq "$(git cat-file -s i18n-invalid)")
++ test 252 -eq "$(git cat-file -s i18n-invalid)" &&
++ # ...and check for the original special bytes
++ grep $(printf "\360") actual &&
++ grep $(printf "\377") actual)
+'
+
test_expect_success 'import/export-marks' '
4: 83b3656b76 = 4: 49960164c6 fast-export: differentiate between explicitly
utf-8 and implicitly utf-8
5: 2063122293 ! 5: 571613a09e fast-export: do automatic reencoding of commit
messages only if requested
@@ -20,7 +20,7 @@
static int progress;
static enum { SIGNED_TAG_ABORT, VERBATIM, WARN, WARN_STRIP, STRIP }
signed_tag_mode = SIGNED_TAG_ABORT;
static enum { TAG_FILTERING_ABORT, DROP, REWRITE } tag_of_filtered_mode =
TAG_FILTERING_ABORT;
-+static enum { REENCODE_ABORT, REENCODE_PLEASE, REENCODE_NEVER }
reencode_mode = REENCODE_ABORT;
++static enum { REENCODE_ABORT, REENCODE_YES, REENCODE_NO } reencode_mode =
REENCODE_ABORT;
static int fake_missing_tagger;
static int use_done_feature;
static int no_data;
@@ -33,10 +33,10 @@
+{
+ if (unset || !strcmp(arg, "abort"))
+ reencode_mode = REENCODE_ABORT;
-+ else if (!strcmp(arg, "yes"))
-+ reencode_mode = REENCODE_PLEASE;
-+ else if (!strcmp(arg, "no"))
-+ reencode_mode = REENCODE_NEVER;
++ else if (!strcmp(arg, "yes") || !strcmp(arg, "true") || !strcmp(arg,
"on"))
++ reencode_mode = REENCODE_YES;
++ else if (!strcmp(arg, "no") || !strcmp(arg, "false") || !strcmp(arg,
"off"))
++ reencode_mode = REENCODE_NO;
+ else
+ return error("Unknown reencoding mode: %s", arg);
+ return 0;
@@ -56,14 +56,14 @@
- reencoded = reencode_string(message, "UTF-8", encoding);
+ } else if (encoding) {
+ switch(reencode_mode) {
-+ case REENCODE_PLEASE:
++ case REENCODE_YES:
+ reencoded = reencode_string(message, "UTF-8", encoding);
+ break;
-+ case REENCODE_NEVER:
++ case REENCODE_NO:
+ break;
+ case REENCODE_ABORT:
+ die("Encountered commit-specific encoding %s in commit "
-+ "%s; use --reencode=<mode> to handle it",
++ "%s; use --reencode=[yes|no] to handle it",
+ encoding, oid_to_hex(&commit->object.oid));
+ }
+ }
@@ -126,13 +126,14 @@
+ git fast-import &&
+ # The commit object, if not re-encoded, is 240 bytes.
+ # Removing the "encoding iso-8859-7\n" header would drops 20
-+ # bytes. Re-encoding the Pi character from \xF0 in
-+ # iso-8859-7 to \xCF\x80 in utf-8 would add a byte. I would
-+ # grep for the # specific bytes, but Windows lamely does not
-+ # allow that, so just search for the expected size.
++ # bytes. Re-encoding the Pi character from \xF0 (\360) in
++ # iso-8859-7 to \xCF\x80 (\317\200) in utf-8 adds a byte.
++ # Check for the expected size...
+ test 240 -eq "$(git cat-file -s i18n-no-recoding)" &&
-+ # Also make sure the commit has the "encoding" header
++ # ...as well as the expected byte.
+ git cat-file commit i18n-no-recoding >actual &&
++ grep $(printf "\360") actual &&
++ # Also make sure the commit has the "encoding" header
+ grep ^encoding actual)
+'
+
--
2.21.0.782.g571613a09e