[PATCH v12 08/10] convert: check for detectable errors in UTF encodings

lars . schneider Thu, 15 Mar 2018 15:59:25 -0700

From: Lars Schneider <larsxschnei...@gmail.com>

Check that new content is valid with respect to the user defined
'working-tree-encoding' attribute.


Signed-off-by: Lars Schneider <larsxschnei...@gmail.com>
---
 convert.c                        | 61 +++++++++++++++++++++++++++++++++++++++
 t/t0028-working-tree-encoding.sh | 62 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 123 insertions(+)

diff --git a/convert.c b/convert.c
index 85e49741af..3cab4fa907 100644
--- a/convert.c
+++ b/convert.c
@@ -266,6 +266,64 @@ static int will_convert_lf_to_crlf(size_t len, struct 
text_stat *stats,
 
 }
 
+static int validate_encoding(const char *path, const char *enc,
+                     const char *data, size_t len, int die_on_error)
+{
+       /* We only check for UTF here as UTF?? can be an alias for UTF-?? */
+       if (istarts_with(enc, "UTF")) {
+               /*
+                * Check for detectable errors in UTF encodings
+                */
+               if (has_prohibited_utf_bom(enc, data, len)) {
+                       const char *error_msg = _(
+                               "BOM is prohibited in '%s' if encoded as %s");
+                       /*
+                        * This advice is shown for UTF-??BE and UTF-??LE 
encodings.
+                        * We cut off the last two characters of the encoding 
name
+                        * to generate the encoding name suitable for BOMs.
+                        */
+                       const char *advise_msg = _(
+                               "The file '%s' contains a byte order "
+                               "mark (BOM). Please use UTF-%s as "
+                               "working-tree-encoding.");
+                       const char *stripped = NULL;
+                       char *upper = xstrdup_toupper(enc);
+                       upper[strlen(upper)-2] = '\0';
+                       if (!skip_prefix(upper, "UTF-", &stripped))
+                               skip_prefix(stripped, "UTF", &stripped);
+                       advise(advise_msg, path, stripped);
+                       free(upper);
+                       if (die_on_error)
+                               die(error_msg, path, enc);
+                       else {
+                               return error(error_msg, path, enc);
+                       }
+
+               } else if (is_missing_required_utf_bom(enc, data, len)) {
+                       const char *error_msg = _(
+                               "BOM is required in '%s' if encoded as %s");
+                       const char *advise_msg = _(
+                               "The file '%s' is missing a byte order "
+                               "mark (BOM). Please use UTF-%sBE or UTF-%sLE "
+                               "(depending on the byte order) as "
+                               "working-tree-encoding.");
+                       const char *stripped = NULL;
+                       char *upper = xstrdup_toupper(enc);
+                       if (!skip_prefix(upper, "UTF-", &stripped))
+                               skip_prefix(stripped, "UTF", &stripped);
+                       advise(advise_msg, path, stripped, stripped);
+                       free(upper);
+                       if (die_on_error)
+                               die(error_msg, path, enc);
+                       else {
+                               return error(error_msg, path, enc);
+                       }
+               }
+
+       }
+       return 0;
+}
+
 static const char *default_encoding = "UTF-8";
 
 static int encode_to_git(const char *path, const char *src, size_t src_len,
@@ -291,6 +349,9 @@ static int encode_to_git(const char *path, const char *src, 
size_t src_len,
        if (!buf && !src)
                return 1;
 
+       if (validate_encoding(path, enc, src, src_len, die_on_error))
+               return 0;
+
        dst = reencode_string_len(src, src_len, default_encoding, enc,
                                  &dst_len);
        if (!dst) {
diff --git a/t/t0028-working-tree-encoding.sh b/t/t0028-working-tree-encoding.sh
index d67dbde1d4..1bb528b339 100755
--- a/t/t0028-working-tree-encoding.sh
+++ b/t/t0028-working-tree-encoding.sh
@@ -62,6 +62,52 @@ test_expect_success 'check $GIT_DIR/info/attributes support' 
'
 
 for i in 16 32
 do
+       test_expect_success "check prohibited UTF-${i} BOM" '
+               test_when_finished "git reset --hard HEAD" &&
+
+               echo "*.utf${i}be text working-tree-encoding=utf-${i}be" 
>>.gitattributes &&
+               echo "*.utf${i}le text working-tree-encoding=utf-${i}LE" 
>>.gitattributes &&
+
+               # Here we add a UTF-16 (resp. UTF-32) files with BOM 
(big/little-endian)
+               # but we tell Git to treat it as UTF-16BE/UTF-16LE (resp. 
UTF-32).
+               # In these cases the BOM is prohibited.
+               cp bebom.utf${i}be.raw bebom.utf${i}be &&
+               test_must_fail git add bebom.utf${i}be 2>err.out &&
+               test_i18ngrep "fatal: BOM is prohibited .* utf-${i}be" err.out 
&&
+               test_i18ngrep "use UTF-${i} as working-tree-encoding" err.out &&
+
+               cp lebom.utf${i}le.raw lebom.utf${i}be &&
+               test_must_fail git add lebom.utf${i}be 2>err.out &&
+               test_i18ngrep "fatal: BOM is prohibited .* utf-${i}be" err.out 
&&
+               test_i18ngrep "use UTF-${i} as working-tree-encoding" err.out &&
+
+               cp bebom.utf${i}be.raw bebom.utf${i}le &&
+               test_must_fail git add bebom.utf${i}le 2>err.out &&
+               test_i18ngrep "fatal: BOM is prohibited .* utf-${i}LE" err.out 
&&
+               test_i18ngrep "use UTF-${i} as working-tree-encoding" err.out &&
+
+               cp lebom.utf${i}le.raw lebom.utf${i}le &&
+               test_must_fail git add lebom.utf${i}le 2>err.out &&
+               test_i18ngrep "fatal: BOM is prohibited .* utf-${i}LE" err.out 
&&
+               test_i18ngrep "use UTF-${i} as working-tree-encoding" err.out
+       '
+
+       test_expect_success "check required UTF-${i} BOM" '
+               test_when_finished "git reset --hard HEAD" &&
+
+               echo "*.utf${i} text working-tree-encoding=utf-${i}" 
>>.gitattributes &&
+
+               cp nobom.utf${i}be.raw nobom.utf${i} &&
+               test_must_fail git add nobom.utf${i} 2>err.out &&
+               test_i18ngrep "fatal: BOM is required .* utf-${i}" err.out &&
+               test_i18ngrep "use UTF-${i}BE or UTF-${i}LE" err.out &&
+
+               cp nobom.utf${i}le.raw nobom.utf${i} &&
+               test_must_fail git add nobom.utf${i} 2>err.out &&
+               test_i18ngrep "fatal: BOM is required .* utf-${i}" err.out &&
+               test_i18ngrep "use UTF-${i}BE or UTF-${i}LE" err.out
+       '
+
        test_expect_success "eol conversion for UTF-${i} encoded files on 
checkout" '
                test_when_finished "rm -f crlf.utf${i}.raw lf.utf${i}.raw" &&
                test_when_finished "git reset --hard HEAD^" &&
@@ -139,4 +185,20 @@ test_expect_success 'error if encoding round trip is not 
the same during refresh
        test_i18ngrep "error: .* overwritten by checkout:" err.out
 '
 
+test_expect_success 'error if encoding garbage is already in Git' '
+       BEFORE_STATE=$(git rev-parse HEAD) &&
+       test_when_finished "git reset --hard $BEFORE_STATE" &&
+
+       # Skip the UTF-16 filter for the added file
+       # This simulates a Git version that has no checkoutEncoding support
+       cp nobom.utf16be.raw nonsense.utf16 &&
+       TEST_HASH=$(git hash-object --no-filters -w nonsense.utf16) &&
+       git update-index --add --cacheinfo 100644 $TEST_HASH nonsense.utf16 &&
+       COMMIT=$(git commit-tree -p $(git rev-parse HEAD) -m "plain commit" 
$(git write-tree)) &&
+       git update-ref refs/heads/master $COMMIT &&
+
+       git diff 2>err.out &&
+       test_i18ngrep "error: BOM is required" err.out
+'
+
 test_done
-- 
2.16.2

[PATCH v12 08/10] convert: check for detectable errors in UTF encodings

Reply via email to