# HG changeset patch
# User Manuel Jacob <m...@manueljacob.de>
# Date 1593494609 -7200
#      Tue Jun 30 07:23:29 2020 +0200
# Branch stable
# Node ID 9915fdff8d1732ce62b6df69b50106384d4ad4d1
# Parent  e1a4c7f23e804f37c3848fc408607af916d619d1
# EXP-Topic svn_encoding
convert: handle percent-encoded bytes in file URLs like Subversion

75b59d221aa3 added most of the code that gets removed by this patch. It helped
make progress on Python 3, but the reasoning was wrong in many ways. I tried
to retract it while it was queued, but it was too late.

Back then, I was assuming that what happened on Python 2 (preserving bytes) is
correct and my Python 3 change is a hack. However it turned out that Subversion
interprets percent-encoded bytes as UTF-8. Accepting the same format as
Subversion is a good idea.

Consistency with urlreq.pathname2url() (as described in the removed comment)
doesn’t matter because that function is only used for passing paths to urllib.

This is not a backwards-incompatible change because before 5c0d5b48e58c,
non-ASCII filenames didn’t work at all on Python 2.

When the locale encoding is ISO-8859-15, `svn` accepts `file:///tmp/a%E2%82%AC`
for `/tmp/a€`. Before this patch, this was the case for this extension on
Python 3, but not on Python 2. This patch makes it work like with `svn` on both
Python 2 and Python 3.

diff --git a/hgext/convert/subversion.py b/hgext/convert/subversion.py
--- a/hgext/convert/subversion.py
+++ b/hgext/convert/subversion.py
@@ -349,6 +349,32 @@
 }
 
 
+class NonUtf8PercentEncodedBytes(Exception):
+    pass
+
+
+# Subversion paths are Unicode. Since the percent-decoding is done on
+# UTF-8-encoded strings, percent-encoded bytes are interpreted as UTF-8.
+def url2pathname_like_subversion(unicodepath):
+    if pycompat.ispy3:
+        # On Python 3, we have to pass unicode to urlreq.url2pathname().
+        # Percent-decoded bytes get decoded using UTF-8 and the 'replace' error
+        # handler.
+        unicodepath = urlreq.url2pathname(unicodepath)
+        if u'\N{REPLACEMENT CHARACTER}' in unicodepath:
+            raise NonUtf8PercentEncodedBytes
+        else:
+            return unicodepath
+    else:
+        # If we passed unicode on Python 2, it would be converted using the
+        # latin-1 encoding. Therefore, we pass UTF-8-encoded bytes.
+        unicodepath = urlreq.url2pathname(unicodepath.encode('utf-8'))
+        try:
+            return unicodepath.decode('utf-8')
+        except UnicodeDecodeError:
+            raise NonUtf8PercentEncodedBytes
+
+
 def issvnurl(ui, url):
     try:
         proto, path = url.split(b'://', 1)
@@ -361,7 +387,7 @@
             ):
                 path = path[:2] + b':/' + path[6:]
             try:
-                path.decode(fsencoding)
+                unicodepath = path.decode(fsencoding)
             except UnicodeDecodeError:
                 ui.warn(
                     _(
@@ -371,28 +397,17 @@
                     % pycompat.sysbytes(fsencoding)
                 )
                 return False
-            # FIXME: The following reasoning and logic is wrong and will be
-            # fixed in a following changeset.
-            # pycompat.fsdecode() / pycompat.fsencode() are used so that bytes
-            # in the URL roundtrip correctly on Unix. urlreq.url2pathname() on
-            # py3 will decode percent-encoded bytes using the utf-8 encoding
-            # and the "replace" error handler. This means that it will not
-            # preserve non-UTF-8 bytes (https://bugs.python.org/issue40983).
-            # url.open() uses the reverse function (urlreq.pathname2url()) and
-            # has a similar problem
-            # (https://bz.mercurial-scm.org/show_bug.cgi?id=6357). It makes
-            # sense to solve both problems together and handle all file URLs
-            # consistently. For now, we warn.
-            unicodepath = urlreq.url2pathname(pycompat.fsdecode(path))
-            if pycompat.ispy3 and u'\N{REPLACEMENT CHARACTER}' in unicodepath:
+            try:
+                unicodepath = url2pathname_like_subversion(unicodepath)
+            except NonUtf8PercentEncodedBytes:
                 ui.warn(
                     _(
-                        b'on Python 3, we currently do not support non-UTF-8 '
-                        b'percent-encoded bytes in file URLs for Subversion '
-                        b'repositories\n'
+                        b'Subversion does not support non-UTF-8 '
+                        b'percent-encoded bytes in file URLs\n'
                     )
                 )
-            path = pycompat.fsencode(unicodepath)
+                return False
+            path = unicodepath.encode(fsencoding)
     except ValueError:
         proto = b'file'
         path = os.path.abspath(url)
diff --git a/tests/test-convert-svn-encoding.t b/tests/test-convert-svn-encoding.t
--- a/tests/test-convert-svn-encoding.t
+++ b/tests/test-convert-svn-encoding.t
@@ -197,13 +197,13 @@
   abort: file:/*/$TESTTMP/\xff: missing or unsupported repository (glob) (esc)
   [255]
 
-#if py3
-For now, on Python 3, we abort when encountering non-UTF-8 percent-encoded
-bytes in a filename.
+Subversion decodes percent-encoded bytes on the converted, UTF-8-encoded
+string. Therefore, if the percent-encoded bytes aren't valid UTF-8, Subversion
+would choke on them when converting them to the locale encoding.
 
   $ hg convert file://$TESTTMP/%FF test
   initializing destination test repository
-  on Python 3, we currently do not support non-UTF-8 percent-encoded bytes in file URLs for Subversion repositories
+  Subversion does not support non-UTF-8 percent-encoded bytes in file URLs
   file:/*/$TESTTMP/%FF does not look like a CVS checkout (glob)
   $TESTTMP/file:$TESTTMP/%FF does not look like a Git repository
   file:/*/$TESTTMP/%FF does not look like a Subversion repository (glob)
@@ -215,4 +215,3 @@
   file:/*/$TESTTMP/%FF does not look like a P4 repository (glob)
   abort: file:/*/$TESTTMP/%FF: missing or unsupported repository (glob)
   [255]
-#endif
_______________________________________________
Mercurial-devel mailing list
Mercurial-devel@mercurial-scm.org
https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel

Reply via email to