Re: [PATCH v11 07/10] convert: check for detectable errors in UTF encodings

2018-03-09 Thread Junio C Hamano
lars.schnei...@autodesk.com writes:

> + const char *advise_msg = _(
> + "The file '%s' contains a byte order "
> + "mark (BOM). Please use %.6s as "
> + "working-tree-encoding.");

I know that this will go away in a later step, but why ".6"?

> + advise(advise_msg, path, enc);


Re: [PATCH v11 07/10] convert: check for detectable errors in UTF encodings

2018-03-09 Thread Lars Schneider

> On 09 Mar 2018, at 20:00, Junio C Hamano  wrote:
> 
> lars.schnei...@autodesk.com writes:
> 
>> +const char *advise_msg = _(
>> +"The file '%s' contains a byte order "
>> +"mark (BOM). Please use %.6s as "
>> +"working-tree-encoding.");
> 
> I know that this will go away in a later step, but why ".6"?

I deleted the original comment in the rebase, sorry:

/*
 * This advice is shown for UTF-??BE and UTF-??LE
 * encodings. We truncate the encoding name to 6
 * chars with %.6s to cut off the last two "byte
 * order" characters.
 */

- Lars


Re: [PATCH v11 07/10] convert: check for detectable errors in UTF encodings

2018-03-09 Thread Junio C Hamano
lars.schnei...@autodesk.com writes:

> + const char *advise_msg = _(
> + "The file '%s' contains a byte order "
> + "mark (BOM). Please use %.6s as "
> + "working-tree-encoding.");

I know that this will go away in a later step, but why ".6"?

> + advise(advise_msg, path, enc);


[PATCH v11 07/10] convert: check for detectable errors in UTF encodings

2018-03-09 Thread lars . schneider
From: Lars Schneider 

Check that new content is valid with respect to the user defined
'working-tree-encoding' attribute.

Signed-off-by: Lars Schneider 
---
 convert.c| 48 ++
 t/t0028-working-tree-encoding.sh | 56 
 2 files changed, 104 insertions(+)

diff --git a/convert.c b/convert.c
index aa59ecfe49..b80d666a6b 100644
--- a/convert.c
+++ b/convert.c
@@ -266,6 +266,51 @@ static int will_convert_lf_to_crlf(size_t len, struct text_stat *stats,
 
 }
 
+static int validate_encoding(const char *path, const char *enc,
+ const char *data, size_t len, int die_on_error)
+{
+   /* We only check for UTF here as UTF?? can be an alias for UTF-?? */
+   if (istarts_with(enc, "UTF")) {
+   /*
+* Check for detectable errors in UTF encodings
+*/
+   if (has_prohibited_utf_bom(enc, data, len)) {
+   const char *error_msg = _(
+   "BOM is prohibited in '%s' if encoded as %s");
+   /*
+* This advice is shown for UTF-??BE and UTF-??LE encodings.
+*/
+   const char *advise_msg = _(
+   "The file '%s' contains a byte order "
+   "mark (BOM). Please use %.6s as "
+   "working-tree-encoding.");
+   advise(advise_msg, path, enc);
+   if (die_on_error)
+   die(error_msg, path, enc);
+   else {
+   return error(error_msg, path, enc);
+   }
+
+   } else if (is_missing_required_utf_bom(enc, data, len)) {
+   const char *error_msg = _(
+   "BOM is required in '%s' if encoded as %s");
+   const char *advise_msg = _(
+   "The file '%s' is missing a byte order "
+   "mark (BOM). Please use %sBE or %sLE "
+   "(depending on the byte order) as "
+   "working-tree-encoding.");
+   advise(advise_msg, path, enc, enc);
+   if (die_on_error)
+   die(error_msg, path, enc);
+   else {
+   return error(error_msg, path, enc);
+   }
+   }
+
+   }
+   return 0;
+}
+
 static const char *default_encoding = "UTF-8";
 
 static int encode_to_git(const char *path, const char *src, size_t src_len,
@@ -291,6 +336,9 @@ static int encode_to_git(const char *path, const char *src, size_t src_len,
if (!buf && !src)
return 1;
 
+   if (validate_encoding(path, enc, src, src_len, die_on_error))
+   return 0;
+
dst = reencode_string_len(src, src_len, default_encoding, enc,
  &dst_len);
if (!dst) {
diff --git a/t/t0028-working-tree-encoding.sh b/t/t0028-working-tree-encoding.sh
index e492945a01..e8408dfe5c 100755
--- a/t/t0028-working-tree-encoding.sh
+++ b/t/t0028-working-tree-encoding.sh
@@ -62,6 +62,46 @@ test_expect_success 'check $GIT_DIR/info/attributes support' 
'
 
 for i in 16 32
 do
+   test_expect_success "check prohibited UTF-${i} BOM" '
+   test_when_finished "git reset --hard HEAD" &&
+
+   echo "*.utf${i}be text working-tree-encoding=utf-${i}be" 
>>.gitattributes &&
+   echo "*.utf${i}le text working-tree-encoding=utf-${i}LE" 
>>.gitattributes &&
+
+   # Here we add a UTF-16 (resp. UTF-32) files with BOM (big/little-endian)
+   # but we tell Git to treat it as UTF-16BE/UTF-16LE (resp. UTF-32).
+   # In these cases the BOM is prohibited.
+   cp bebom.utf${i}be.raw bebom.utf${i}be &&
+   test_must_fail git add bebom.utf${i}be 2>err.out &&
+   test_i18ngrep "fatal: BOM is prohibited .* utf-${i}be" err.out 
&&
+
+   cp lebom.utf${i}le.raw lebom.utf${i}be &&
+   test_must_fail git add lebom.utf${i}be 2>err.out &&
+   test_i18ngrep "fatal: BOM is prohibited .* utf-${i}be" err.out 
&&
+
+   cp bebom.utf${i}be.raw bebom.utf${i}le &&
+   test_must_fail git add bebom.utf${i}le 2>err.out &&
+   test_i18ngrep "fatal: BOM is prohibited .* utf-${i}LE" err.out 
&&
+
+   cp lebom.utf${i}le.raw lebom.utf${i}le &&
+   test_must_fail git add lebom.utf${i}le 2>err.out &&
+   test_i18ngrep "fatal: BOM is prohibited .* utf-${i}LE" err.out
+   '
+
+   test_expect_success "check required UTF-${i} BOM" '
+   test_when_finished "git