Hi there,

I'm urgently looking for somebody to review the latest patches for
fixing Unicode support in Tahoe command-line tools. It's our last chance
to get those into the upcoming Tahoe v1.7.

As a reviewer, your duty will be to look at these patches, see if the
tests are testing what they should, read through the code looking for
bugs, and then write back saying 'Didn't see any bugs!'.

Zooko, who has way too many things to do right now, plans to release a
beta in about 10 hours. So please, speak up now! :)

You'll find the two patches attached to this email, or on the following
two colorful web pages¹².

Please post your comments on this mailing-list and on trac issue #534³.

Thank you very much for your help!

François

¹http://tahoe-lafs.org/trac/tahoe-lafs/attachment/ticket/534/unicode-helper-functions-v2.diff
²http://tahoe-lafs.org/trac/tahoe-lafs/attachment/ticket/534/unicode-filenames-handling-v2.diff
³http://tahoe-lafs.org/trac/tahoe-lafs/ticket/534
Sun May 16 23:52:33 CEST 2010  Francois Deppierraz <franc...@ctrlaltdel.ch>
  * stringutils.py: Unicode helper functions + associated tests
  
  This file contains a bunch of helper functions which convert
  unicode strings to and from argv, filenames and stdout.
diff -rN -u old-tahoe-534/src/allmydata/test/test_stringutils.py new-tahoe-534/src/allmydata/test/test_stringutils.py
--- old-tahoe-534/src/allmydata/test/test_stringutils.py	1970-01-01 01:00:00.000000000 +0100
+++ new-tahoe-534/src/allmydata/test/test_stringutils.py	2010-05-17 09:57:38.766407368 +0200
@@ -0,0 +1,228 @@
+# coding=utf-8
+
+TEST_FILENAMES = (
+  u'Ärtonwall.mp3',
+  u'test_file',
+  u'Blah blah.txt',
+)
+
+# The following main helps to generate a test class for other operating
+# systems.
+
+if __name__ == "__main__":
+    import sys, os
+    import tempfile
+    import shutil
+    import platform
+    
+    if len(sys.argv) != 2:
+        print "Usage: %s lumière" % sys.argv[0]
+        sys.exit(1)
+    
+    print
+    print "class MyWeirdOS(StringUtils, unittest.TestCase):"
+    print "    uname = '%s'" % ' '.join(platform.uname())
+    print "    argv = %s" % repr(sys.argv[1])
+    print "    platform = '%s'" % sys.platform
+    print "    filesystemencoding = '%s'" % sys.getfilesystemencoding()
+    print "    stdoutencoding = '%s'" % sys.stdout.encoding
+
+    try:
+        tmpdir = tempfile.mkdtemp()
+        for fname in TEST_FILENAMES:
+            open(os.path.join(tmpdir, fname), 'w').close() 
+
+        # Use Unicode API under Windows or MacOS X
+        if sys.platform in ('win32', 'darwin'):
+            dirlist = os.listdir(unicode(tmpdir))
+        else:
+            dirlist = os.listdir(tmpdir)
+
+        print "    dirlist = %s" % repr(dirlist)
+    except:
+        print "    # Oops, I cannot write filenames containing non-ascii characters"
+    print
+
+    shutil.rmtree(tmpdir)
+    sys.exit(0)
+
+from twisted.trial import unittest
+from mock import patch
+import sys
+
+from allmydata.util.stringutils import argv_to_unicode, unicode_to_url, \
+    unicode_to_stdout, unicode_platform, listdir_unicode, open_unicode, \
+    FilenameEncodingError
+from twisted.python import usage
+
+class StringUtilsErrors(unittest.TestCase):
+    @patch('sys.stdout')
+    def test_argv_to_unicode(self, mock):
+        mock.encoding = 'utf-8'
+
+        self.failUnlessRaises(usage.UsageError,
+                              argv_to_unicode,
+                              u'lumière'.encode('latin1'))
+
+    def test_unicode_to_url(self):
+        pass
+
+    @patch('sys.stdout')
+    def test_unicode_to_stdout(self, mock):
+        # Encoding koi8-r cannot represent 'è'
+        mock.encoding = 'koi8-r'
+        self.failUnlessEqual(unicode_to_stdout(u'lumière'), 'lumi?re')
+
+    @patch('sys.getfilesystemencoding')
+    @patch('os.listdir')
+    def test_listdir_unicode(self, mock_listdir, mock_getfilesystemencoding):
+        # What happens if a latin1-encoded filename is encountered on a UTF-8
+        # filesystem?
+        mock_listdir.return_value = [
+            u'lumière'.encode('utf-8'),
+            u'lumière'.encode('latin1')]
+
+        mock_getfilesystemencoding.return_value = 'utf-8'
+       
+        self.failUnlessRaises(FilenameEncodingError,
+                              listdir_unicode,
+                              u'/dummy')
+        
+        # We're trying to list a directory whose name cannot be represented in
+        # the filesystem encoding.  This should fail.
+        mock_getfilesystemencoding.return_value = 'ascii'
+        self.failUnlessRaises(FilenameEncodingError,
+                              listdir_unicode,
+                              u'/lumière')
+
+    @patch('sys.getfilesystemencoding')
+    def test_open_unicode(self, mock):
+        mock.return_value = 'ascii'
+
+        self.failUnlessRaises(FilenameEncodingError,
+                              open_unicode,
+                              u'lumière')
+
+    @patch('os.listdir')
+    def test_unicode_normalization(self, mock):
+        # Pretend to run on an Unicode platform such as Windows
+        orig_platform = sys.platform
+        sys.platform = 'win32'
+
+        mock.return_value = [u'A\u0308rtonwall.mp3']
+        self.failUnlessEqual(listdir_unicode(u'/dummy'), [u'\xc4rtonwall.mp3'])
+
+        sys.platform = orig_platform
+
+class StringUtils():
+    def setUp(self):
+        # Mock sys.platform because unicode_platform() uses it
+        self.original_platform = sys.platform
+        sys.platform = self.platform
+
+    def tearDown(self):
+        sys.platform = self.original_platform
+
+    @patch('sys.stdout')
+    def test_argv_to_unicode(self, mock):
+        mock.encoding = self.stdoutencoding
+
+        argu = u'lumière'
+        argv = self.argv
+
+        self.failUnlessEqual(argv_to_unicode(argv), argu)
+
+    def test_unicode_to_url(self):
+        self.failUnless(unicode_to_url(u'lumière'), u'lumière'.encode('utf-8'))
+
+    @patch('sys.stdout')
+    def test_unicode_to_stdout(self, mock):
+        mock.encoding = self.stdoutencoding
+        self.failUnlessEqual(unicode_to_stdout(u'lumière'), self.argv)
+
+    def test_unicode_platform(self):
+        matrix = {
+          'linux2': False,
+          'win32':  True,
+          'darwin': True,
+        }
+
+        self.failUnlessEqual(unicode_platform(), matrix[self.platform])
+ 
+    @patch('sys.getfilesystemencoding')
+    @patch('os.listdir')
+    def test_listdir_unicode(self, mock_listdir, mock_getfilesystemencoding):
+
+        mock_listdir.return_value = self.dirlist
+        mock_getfilesystemencoding.return_value = self.filesystemencoding
+       
+        filenames = listdir_unicode(u'/dummy')
+
+        for fname in TEST_FILENAMES:
+            self.failUnless(isinstance(fname, unicode))
+
+            if fname not in filenames:
+                self.fail("Cannot find %r in %r" % (fname, filenames))
+
+    @patch('os.open')
+    def test_open_unicode(self, mock):
+
+        self.failUnlessRaises(IOError,
+                              open_unicode,
+                              u'/dummy_directory/lumière.txt')
+
+
+class UbuntuKarmicUTF8(StringUtils, unittest.TestCase):
+    uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
+    argv = 'lumi\xc3\xa8re'
+    platform = 'linux2'
+    filesystemencoding = 'UTF-8'
+    stdoutencoding = 'UTF-8'
+    dirlist = ['test_file', '\xc3\x84rtonwall.mp3', 'Blah blah.txt']
+
+
+class UbuntuKarmicLatin1(StringUtils, unittest.TestCase):
+    uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
+    argv = 'lumi\xe8re'
+    platform = 'linux2'
+    filesystemencoding = 'ISO-8859-1'
+    stdoutencoding = 'ISO-8859-1'
+    dirlist = ['test_file', 'Blah blah.txt', '\xc4rtonwall.mp3']
+
+class WindowsXP(StringUtils, unittest.TestCase):
+    uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD'
+    argv = 'lumi\xe8re'
+    platform = 'win32'
+    filesystemencoding = 'mbcs'
+    stdoutencoding = 'cp850'
+    dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']
+
+    todo = "Unicode arguments on the command-line is not yet supported under Windows, see bug #565."
+
+class WindowsXP_UTF8(StringUtils, unittest.TestCase):
+    uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD'
+    argv = 'lumi\xe8re'
+    platform = 'win32'
+    filesystemencoding = 'mbcs'
+    stdoutencoding = 'cp65001'
+    dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']
+
+    todo = "Unicode arguments on the command-line is not yet supported under Windows, see bug #565."
+
+class WindowsVista(StringUtils, unittest.TestCase):
+    uname = 'Windows Vista 6.0.6000 x86 x86 Family 6 Model 15 Stepping 11, GenuineIntel'
+    argv = 'lumi\xe8re'
+    platform = 'win32'
+    filesystemencoding = 'mbcs'
+    stdoutencoding = 'cp850'
+    dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']
+
+    todo = "Unicode arguments on the command-line is not yet supported under Windows, see bug #565."
+
+class MacOSXLeopard(StringUtils, unittest.TestCase):
+    uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
+    argv = 'lumi\xc3\xa8re'
+    platform = 'darwin'
+    filesystemencoding = 'utf-8'
+    stdoutencoding = 'UTF-8'
+    dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']
diff -rN -u old-tahoe-534/src/allmydata/util/stringutils.py new-tahoe-534/src/allmydata/util/stringutils.py
--- old-tahoe-534/src/allmydata/util/stringutils.py	1970-01-01 01:00:00.000000000 +0100
+++ new-tahoe-534/src/allmydata/util/stringutils.py	2010-05-17 09:57:38.796407608 +0200
@@ -0,0 +1,135 @@
+"""
+Functions used to convert inputs from whatever encoding used in the system to
+unicode and back.
+"""
+
+import sys
+import os
+import unicodedata
+from allmydata.util.assertutil import precondition
+from twisted.python import usage
+
+def get_term_encoding():
+    """
+    Returns expected encoding for writing to the terminal and reading
+    arguments from the command-line.
+    """
+
+    return sys.stdout.encoding
+
+def argv_to_unicode(s):
+    """
+    Decode given argv element to unicode.
+    """
+    # Try to decode the command-line argument with the encoding returned by
+    # get_term_encoding(), if this fails print an error message to the user.
+
+    precondition(isinstance(s, str), s)
+
+    try:
+        return unicode(s, get_term_encoding())
+    except UnicodeDecodeError:
+        raise usage.UsageError("Argument '%s' cannot be decoded as %s." %
+                               (s, get_term_encoding()))
+
+def unicode_to_url(s):
+    """
+    Encode an unicode object used in an URL.
+    """
+    # According to RFC 2718, non-ascii characters in url's must be UTF-8 encoded.
+
+    precondition(isinstance(s, unicode), s)
+    return s.encode('utf-8')
+
+def unicode_to_stdout(s):
+    """
+    Encode an unicode object for representation on stdout.
+    """
+
+    precondition(isinstance(s, unicode), s)
+
+    try:
+        return s.encode(get_term_encoding())
+
+    # LookupError means that the Python interpreter wasn't able to detect a
+    # potential encoding for stdout.  UnicodeEncodeError means that we're
+    # trying to display a character which cannot be represented by the
+    # encoding detected for stdout.  In both cases, we'll replace
+    # non-representable characters by '?'.
+    except (LookupError, UnicodeEncodeError):
+        return s.encode('ascii', 'replace')
+
+def unicode_platform():
+    """
+    Does the current platform handle Unicode filenames natively ?
+    """
+
+    return sys.platform in ('win32', 'darwin')
+
+class FilenameEncodingError(Exception):
+    """
+    Filename cannot be encoded using the current encoding of your filesystem
+    (%s). Please configure your locale correctly or rename this file.
+    """
+
+    pass
+
+def listdir_unicode_unix(path):
+    """
+    This function emulates an Unicode API under Unix similar to one available
+    under Windows or MacOS X.
+
+    If badly encoded filenames are encountered, an exception is raised.
+    """
+    precondition(isinstance(path, unicode), path)
+
+    encoding = sys.getfilesystemencoding()
+    try:
+        byte_path = path.encode(encoding)
+    except UnicodeEncodeError:
+        raise FilenameEncodingError(path)
+
+    try:
+        return [unicode(fn, encoding) for fn in os.listdir(byte_path)]
+    except UnicodeDecodeError:
+        raise FilenameEncodingError(fn)
+
+def listdir_unicode(path, encoding = None):
+    """
+    Wrapper around listdir() which provides safe access to the convenient
+    Unicode API even under Unix.
+    """
+
+    precondition(isinstance(path, unicode), path)
+
+    # On Windows and MacOS X, the Unicode API is used
+    if unicode_platform():
+        dirlist = os.listdir(path)
+
+    # On other platforms (ie. Unix systems), the byte-level API is used
+    else:
+        dirlist = listdir_unicode_unix(path)
+
+    # Normalize the resulting unicode filenames
+    #
+    # This prevents different OS from generating non-equal unicode strings for
+    # the same filename representation
+    return [unicodedata.normalize('NFC', fname) for fname in dirlist]
+
+def open_unicode(path, mode='r'):
+    """
+    Wrapper around open() which provides safe access to the convenient Unicode
+    API even under Unix.
+    """
+
+    precondition(isinstance(path, unicode), path)
+
+    if unicode_platform():
+        return open(path, mode)
+    else:
+        encoding = sys.getfilesystemencoding()
+
+        try:
+            return open(path.encode(encoding), mode)
+        except UnicodeEncodeError:
+            raise FilenameEncodingError(path)
Sun May 16 23:43:37 CEST 2010  Francois Deppierraz <franc...@ctrlaltdel.ch>
  * Fix handling of correctly encoded unicode filenames (#534)
  
  Tahoe CLI commands working on local files, for instance 'tahoe cp' or 'tahoe
  backup', have been improved to correctly handle filenames containing non-ASCII
  characters.
  
  In the case where Tahoe encounters a filename which cannot be decoded using the
  system encoding, an error will be returned and the operation will fail.  Under
  Linux, this typically happens when the filesystem contains filenames encoded
  with an encoding (for instance latin1) other than that of the system locale
  (for instance UTF-8).  In such a case, you'll need to fix your system with
  tools such as 'convmv' before using Tahoe CLI.
  
  All CLI commands have been improved to support non-ASCII parameters such as
  filenames and aliases on all supported Operating Systems except Windows as of
  now.
    ***END OF DESCRIPTION***
  
  Place the long patch description above the ***END OF DESCRIPTION*** marker.
  The first line of this file will be the patch name.
  
  
  This patch contains the following changes:
  
diff -rN -u old-tahoe-534/docs/frontends/CLI.txt new-tahoe-534/docs/frontends/CLI.txt
--- old-tahoe-534/docs/frontends/CLI.txt	2010-05-17 09:57:49.323906328 +0200
+++ new-tahoe-534/docs/frontends/CLI.txt	2010-05-17 09:57:49.396409282 +0200
@@ -123,13 +123,13 @@
 perspective on the graph of files and directories.
 
 Each tahoe node remembers a list of starting points, named "aliases",
-in a file named ~/.tahoe/private/aliases . These aliases are short
-strings that stand in for a directory read- or write- cap. If you use
-the command line "ls" without any "[STARTING_DIR]:" argument, then it
-will use the default alias, which is "tahoe", therefore "tahoe ls" has
-the same effect as "tahoe ls tahoe:".  The same goes for the other
-commands which can reasonably use a default alias: get, put, mkdir,
-mv, and rm.
+in a file named ~/.tahoe/private/aliases . These aliases are short UTF-8
+encoded strings that stand in for a directory read- or write- cap. If
+you use the command line "ls" without any "[STARTING_DIR]:" argument,
+then it will use the default alias, which is "tahoe", therefore "tahoe
+ls" has the same effect as "tahoe ls tahoe:".  The same goes for the
+other commands which can reasonably use a default alias: get, put,
+mkdir, mv, and rm.
 
 For backwards compatibility with Tahoe-1.0, if the "tahoe": alias is not
 found in ~/.tahoe/private/aliases, the CLI will use the contents of
diff -rN -u old-tahoe-534/NEWS new-tahoe-534/NEWS
--- old-tahoe-534/NEWS	2010-05-17 09:57:49.323906328 +0200
+++ new-tahoe-534/NEWS	2010-05-17 09:57:49.336410287 +0200
@@ -1,5 +1,26 @@
 User visible changes in Tahoe-LAFS.  -*- outline -*-
 
+* Release 1.7.0
+
+** Bugfixes
+
+*** Unicode filenames handling
+
+Tahoe CLI commands working on local files, for instance 'tahoe cp' or 'tahoe
+backup', have been improved to correctly handle filenames containing non-ASCII
+characters.
+
+In the case where Tahoe encounters a filename which cannot be decoded using the
+system encoding, an error will be returned and the operation will fail.  Under
+Linux, this typically happens when the filesystem contains filenames encoded
+with an encoding (for instance latin1) other than that of the system locale
+(for instance UTF-8).  In such a case, you'll need to fix your system with
+tools such as 'convmv' before using Tahoe CLI.
+
+All CLI commands have been improved to support non-ASCII parameters such as
+filenames and aliases on all supported Operating Systems except Windows as of
+now.
+
 * Release 1.6.1 (2010-02-27)
 
 ** Bugfixes
diff -rN -u old-tahoe-534/src/allmydata/scripts/cli.py new-tahoe-534/src/allmydata/scripts/cli.py
--- old-tahoe-534/src/allmydata/scripts/cli.py	2010-05-17 09:57:49.323906328 +0200
+++ new-tahoe-534/src/allmydata/scripts/cli.py	2010-05-17 09:57:49.636405126 +0200
@@ -1,6 +1,7 @@
 import os.path, re, sys, fnmatch
 from twisted.python import usage
 from allmydata.scripts.common import BaseOptions, get_aliases
+from allmydata.util.stringutils import argv_to_unicode
 
 NODEURL_RE=re.compile("http://([^:]*)(:([1-9][0-9]*))?")
 
@@ -49,12 +50,12 @@
 
 class MakeDirectoryOptions(VDriveOptions):
     def parseArgs(self, where=""):
-        self.where = where
+        self.where = argv_to_unicode(where)
     longdesc = """Create a new directory, either unlinked or as a subdirectory."""
 
 class AddAliasOptions(VDriveOptions):
     def parseArgs(self, alias, cap):
-        self.alias = alias
+        self.alias = argv_to_unicode(alias)
         self.cap = cap
 
     def getSynopsis(self):
@@ -64,7 +65,7 @@
 
 class CreateAliasOptions(VDriveOptions):
     def parseArgs(self, alias):
-        self.alias = alias
+        self.alias = argv_to_unicode(alias)
 
     def getSynopsis(self):
         return "%s create-alias ALIAS" % (os.path.basename(sys.argv[0]),)
@@ -83,7 +84,7 @@
         ("json", None, "Show the raw JSON output"),
         ]
     def parseArgs(self, where=""):
-        self.where = where
+        self.where = argv_to_unicode(where)
 
     longdesc = """
     List the contents of some portion of the grid.
@@ -118,8 +119,13 @@
         # tahoe get FOO bar              # write to local file
         # tahoe get tahoe:FOO bar        # same
 
-        self.from_file = arg1
-        self.to_file = arg2
+        self.from_file = argv_to_unicode(arg1)
+
+        if arg2:
+            self.to_file = argv_to_unicode(arg2)
+        else:
+            self.to_file = None
+
         if self.to_file == "-":
             self.to_file = None
 
@@ -151,15 +157,15 @@
         # see Examples below
 
         if arg1 is not None and arg2 is not None:
-            self.from_file = arg1
-            self.to_file = arg2
+            self.from_file = argv_to_unicode(arg1)
+            self.to_file =  argv_to_unicode(arg2)
         elif arg1 is not None and arg2 is None:
-            self.from_file = arg1 # might be "-"
+            self.from_file = argv_to_unicode(arg1) # might be "-"
             self.to_file = None
         else:
             self.from_file = None
             self.to_file = None
-        if self.from_file == "-":
+        if self.from_file == u"-":
             self.from_file = None
 
     def getSynopsis(self):
@@ -197,8 +203,8 @@
     def parseArgs(self, *args):
         if len(args) < 2:
             raise usage.UsageError("cp requires at least two arguments")
-        self.sources = args[:-1]
-        self.destination = args[-1]
+        self.sources = map(argv_to_unicode, args[:-1])
+        self.destination = argv_to_unicode(args[-1])
     def getSynopsis(self):
         return "Usage: tahoe [options] cp FROM.. TO"
     longdesc = """
@@ -228,15 +234,15 @@
 
 class RmOptions(VDriveOptions):
     def parseArgs(self, where):
-        self.where = where
+        self.where = argv_to_unicode(where)
 
     def getSynopsis(self):
         return "%s rm REMOTE_FILE" % (os.path.basename(sys.argv[0]),)
 
 class MvOptions(VDriveOptions):
     def parseArgs(self, frompath, topath):
-        self.from_file = frompath
-        self.to_file = topath
+        self.from_file = argv_to_unicode(frompath)
+        self.to_file = argv_to_unicode(topath)
 
     def getSynopsis(self):
         return "%s mv FROM TO" % (os.path.basename(sys.argv[0]),)
@@ -254,8 +260,8 @@
 
 class LnOptions(VDriveOptions):
     def parseArgs(self, frompath, topath):
-        self.from_file = frompath
-        self.to_file = topath
+        self.from_file = argv_to_unicode(frompath)
+        self.to_file = argv_to_unicode(topath)
 
     def getSynopsis(self):
         return "%s ln FROM TO" % (os.path.basename(sys.argv[0]),)
@@ -279,8 +285,8 @@
         self['exclude'] = set()
 
     def parseArgs(self, localdir, topath):
-        self.from_dir = localdir
-        self.to_dir = topath
+        self.from_dir = argv_to_unicode(localdir)
+        self.to_dir = argv_to_unicode(topath)
 
     def getSynopsis(Self):
         return "%s backup FROM ALIAS:TO" % os.path.basename(sys.argv[0])
@@ -334,7 +340,7 @@
 
 class WebopenOptions(VDriveOptions):
     def parseArgs(self, where=''):
-        self.where = where
+        self.where = argv_to_unicode(where)
 
     def getSynopsis(self):
         return "%s webopen [ALIAS:PATH]" % (os.path.basename(sys.argv[0]),)
@@ -350,7 +356,7 @@
         ("raw", "r", "Display raw JSON data instead of parsed"),
         ]
     def parseArgs(self, where=''):
-        self.where = where
+        self.where = argv_to_unicode(where)
 
     def getSynopsis(self):
         return "%s manifest [ALIAS:PATH]" % (os.path.basename(sys.argv[0]),)
@@ -363,7 +369,7 @@
         ("raw", "r", "Display raw JSON data instead of parsed"),
         ]
     def parseArgs(self, where=''):
-        self.where = where
+        self.where = argv_to_unicode(where)
 
     def getSynopsis(self):
         return "%s stats [ALIAS:PATH]" % (os.path.basename(sys.argv[0]),)
@@ -379,7 +385,7 @@
         ("add-lease", None, "Add/renew lease on all shares"),
         ]
     def parseArgs(self, where=''):
-        self.where = where
+        self.where = argv_to_unicode(where)
 
     def getSynopsis(self):
         return "%s check [ALIAS:PATH]" % (os.path.basename(sys.argv[0]),)
@@ -398,7 +404,7 @@
         ("verbose", "v", "Be noisy about what is happening."),
         ]
     def parseArgs(self, where=''):
-        self.where = where
+        self.where = argv_to_unicode(where)
 
     def getSynopsis(self):
         return "%s deep-check [ALIAS:PATH]" % (os.path.basename(sys.argv[0]),)
diff -rN -u old-tahoe-534/src/allmydata/scripts/common.py new-tahoe-534/src/allmydata/scripts/common.py
--- old-tahoe-534/src/allmydata/scripts/common.py	2010-05-17 09:57:49.313905408 +0200
+++ new-tahoe-534/src/allmydata/scripts/common.py	2010-05-17 09:57:49.636405126 +0200
@@ -1,7 +1,9 @@
 
 import os, sys, urllib
+import codecs
 from twisted.python import usage
-
+from allmydata.util.stringutils import unicode_to_url
+from allmydata.util.assertutil import precondition
 
 class BaseOptions:
     # unit tests can override these to point at StringIO instances
@@ -100,14 +102,14 @@
     except EnvironmentError:
         pass
     try:
-        f = open(aliasfile, "r")
+        f = codecs.open(aliasfile, "r", "utf-8")
         for line in f.readlines():
             line = line.strip()
             if line.startswith("#") or not line:
                 continue
             name, cap = line.split(":", 1)
             # normalize it: remove http: prefix, urldecode
-            cap = cap.strip()
+            cap = cap.strip().encode('ascii')
             aliases[name] = uri.from_string_dirnode(cap).to_string()
     except EnvironmentError:
         pass
@@ -138,7 +140,7 @@
     # and default is not found in aliases, an UnknownAliasError is
     # raised.
     path = path.strip()
-    if uri.has_uri_prefix(path):
+    if uri.has_uri_prefix(path.encode('ascii', 'ignore')):
         # We used to require "URI:blah:./foo" in order to get a subpath,
         # stripping out the ":./" sequence. We still allow that for compatibility,
         # but now also allow just "URI:blah/foo".
@@ -180,4 +182,4 @@
 
 def escape_path(path):
     segments = path.split("/")
-    return "/".join([urllib.quote(s) for s in segments])
+    return "/".join([urllib.quote(unicode_to_url(s)) for s in segments])
diff -rN -u old-tahoe-534/src/allmydata/scripts/tahoe_add_alias.py new-tahoe-534/src/allmydata/scripts/tahoe_add_alias.py
--- old-tahoe-534/src/allmydata/scripts/tahoe_add_alias.py	2010-05-17 09:57:49.313905408 +0200
+++ new-tahoe-534/src/allmydata/scripts/tahoe_add_alias.py	2010-05-17 09:57:49.646405047 +0200
@@ -1,16 +1,20 @@
 
 import os.path
+import codecs
+import sys
 from allmydata import uri
 from allmydata.scripts.common_http import do_http, check_http_error
 from allmydata.scripts.common import get_aliases
 from allmydata.util.fileutil import move_into_place
+from allmydata.util.stringutils import unicode_to_stdout
+
 
 def add_line_to_aliasfile(aliasfile, alias, cap):
     # we use os.path.exists, rather than catching EnvironmentError, to avoid
     # clobbering the valuable alias file in case of spurious or transient
     # filesystem errors.
     if os.path.exists(aliasfile):
-        f = open(aliasfile, "r")
+        f = codecs.open(aliasfile, "r", "utf-8")
         aliases = f.read()
         f.close()
         if not aliases.endswith("\n"):
@@ -18,7 +22,7 @@
     else:
         aliases = ""
     aliases += "%s: %s\n" % (alias, cap)
-    f = open(aliasfile+".tmp", "w")
+    f = codecs.open(aliasfile+".tmp", "w", "utf-8")
     f.write(aliases)
     f.close()
     move_into_place(aliasfile+".tmp", aliasfile)
@@ -41,7 +45,7 @@
 
     add_line_to_aliasfile(aliasfile, alias, cap)
 
-    print >>stdout, "Alias '%s' added" % (alias,)
+    print >>stdout, "Alias '%s' added" % (unicode_to_stdout(alias),)
     return 0
 
 def create_alias(options):
@@ -74,7 +78,7 @@
 
     add_line_to_aliasfile(aliasfile, alias, new_uri)
 
-    print >>stdout, "Alias '%s' created" % (alias,)
+    print >>stdout, "Alias '%s' created" % (unicode_to_stdout(alias),)
     return 0
 
 def list_aliases(options):
diff -rN -u old-tahoe-534/src/allmydata/scripts/tahoe_backup.py new-tahoe-534/src/allmydata/scripts/tahoe_backup.py
--- old-tahoe-534/src/allmydata/scripts/tahoe_backup.py	2010-05-17 09:57:49.313905408 +0200
+++ new-tahoe-534/src/allmydata/scripts/tahoe_backup.py	2010-05-17 09:57:49.646405047 +0200
@@ -9,6 +9,11 @@
 from allmydata.scripts.common_http import do_http
 from allmydata.util import time_format
 from allmydata.scripts import backupdb
+import sys
+from allmydata.util.stringutils import unicode_to_stdout, listdir_unicode, open_unicode
+from allmydata.util.assertutil import precondition
+from twisted.python import usage
+
 
 class HTTPError(Exception):
     pass
@@ -154,12 +159,16 @@
 
     def verboseprint(self, msg):
         if self.verbosity >= 2:
+            if isinstance(msg, unicode):
+                msg = unicode_to_stdout(msg)
+
             print >>self.options.stdout, msg
 
     def warn(self, msg):
         print >>self.options.stderr, msg
 
     def process(self, localpath):
+        precondition(isinstance(localpath, unicode), localpath)
         # returns newdircap
 
         self.verboseprint("processing %s" % localpath)
@@ -167,7 +176,7 @@
         compare_contents = {} # childname -> rocap
 
         try:
-            children = os.listdir(localpath)
+            children = listdir_unicode(localpath)
         except EnvironmentError:
             self.directories_skipped += 1
             self.warn("WARNING: permission denied on directory %s" % localpath)
@@ -283,6 +292,8 @@
 
     # This function will raise an IOError exception when called on an unreadable file
     def upload(self, childpath):
+        precondition(isinstance(childpath, unicode), childpath)
+
         #self.verboseprint("uploading %s.." % childpath)
         metadata = get_local_metadata(childpath)
 
@@ -291,7 +302,7 @@
 
         if must_upload:
             self.verboseprint("uploading %s.." % childpath)
-            infileobj = open(os.path.expanduser(childpath), "rb")
+            infileobj = open_unicode(os.path.expanduser(childpath), "rb")
             url = self.options['node-url'] + "uri"
             resp = do_http("PUT", url, infileobj)
             if resp.status not in (200, 201):
diff -rN -u old-tahoe-534/src/allmydata/scripts/tahoe_cp.py new-tahoe-534/src/allmydata/scripts/tahoe_cp.py
--- old-tahoe-534/src/allmydata/scripts/tahoe_cp.py	2010-05-17 09:57:49.313905408 +0200
+++ new-tahoe-534/src/allmydata/scripts/tahoe_cp.py	2010-05-17 09:57:49.646405047 +0200
@@ -2,12 +2,17 @@
 import os.path
 import urllib
 import simplejson
+import sys
 from cStringIO import StringIO
 from twisted.python.failure import Failure
 from allmydata.scripts.common import get_alias, escape_path, \
                                      DefaultAliasMarker, UnknownAliasError
 from allmydata.scripts.common_http import do_http
 from allmydata import uri
+from twisted.python import usage
+from allmydata.util.stringutils import unicode_to_url, listdir_unicode, open_unicode
+from allmydata.util.assertutil import precondition
+
 
 def ascii_or_none(s):
     if s is None:
@@ -70,6 +75,7 @@
 
 class LocalFileSource:
     def __init__(self, pathname):
+        precondition(isinstance(pathname, unicode), pathname)
         self.pathname = pathname
 
     def need_to_copy_bytes(self):
@@ -80,6 +86,7 @@
 
 class LocalFileTarget:
     def __init__(self, pathname):
+        precondition(isinstance(pathname, unicode), pathname)
         self.pathname = pathname
     def put_file(self, inf):
         outf = open(self.pathname, "wb")
@@ -92,6 +99,7 @@
 
 class LocalMissingTarget:
     def __init__(self, pathname):
+        precondition(isinstance(pathname, unicode), pathname)
         self.pathname = pathname
 
     def put_file(self, inf):
@@ -105,6 +113,8 @@
 
 class LocalDirectorySource:
     def __init__(self, progressfunc, pathname):
+        precondition(isinstance(pathname, unicode), pathname)
+
         self.progressfunc = progressfunc
         self.pathname = pathname
         self.children = None
@@ -113,7 +123,7 @@
         if self.children is not None:
             return
         self.children = {}
-        children = os.listdir(self.pathname)
+        children = listdir_unicode(self.pathname)
         for i,n in enumerate(children):
             self.progressfunc("examining %d of %d" % (i, len(children)))
             pn = os.path.join(self.pathname, n)
@@ -130,6 +140,8 @@
 
 class LocalDirectoryTarget:
     def __init__(self, progressfunc, pathname):
+        precondition(isinstance(pathname, unicode), pathname)
+
         self.progressfunc = progressfunc
         self.pathname = pathname
         self.children = None
@@ -138,7 +150,7 @@
         if self.children is not None:
             return
         self.children = {}
-        children = os.listdir(self.pathname)
+        children = listdir_unicode(self.pathname)
         for i,n in enumerate(children):
             self.progressfunc("examining %d of %d" % (i, len(children)))
             pn = os.path.join(self.pathname, n)
@@ -161,8 +173,9 @@
         return LocalDirectoryTarget(self.progressfunc, pathname)
 
     def put_file(self, name, inf):
+        precondition(isinstance(name, unicode), name)
         pathname = os.path.join(self.pathname, name)
-        outf = open(pathname, "wb")
+        outf = open_unicode(pathname, "wb")
         while True:
             data = inf.read(32768)
             if not data:
@@ -355,7 +368,7 @@
                 if self.writecap:
                     url = self.nodeurl + "/".join(["uri",
                                                    urllib.quote(self.writecap),
-                                                   urllib.quote(name.encode('utf-8'))])
+                                                   urllib.quote(unicode_to_url(name))])
                 self.children[name] = TahoeFileTarget(self.nodeurl, mutable,
                                                       writecap, readcap, url)
             elif data[0] == "dirnode":
diff -rN -u old-tahoe-534/src/allmydata/scripts/tahoe_ls.py new-tahoe-534/src/allmydata/scripts/tahoe_ls.py
--- old-tahoe-534/src/allmydata/scripts/tahoe_ls.py	2010-05-17 09:57:49.313905408 +0200
+++ new-tahoe-534/src/allmydata/scripts/tahoe_ls.py	2010-05-17 09:57:49.646405047 +0200
@@ -4,6 +4,7 @@
 from allmydata.scripts.common import get_alias, DEFAULT_ALIAS, escape_path, \
                                      UnknownAliasError
 from allmydata.scripts.common_http import do_http
+from allmydata.util.stringutils import unicode_to_stdout
 
 def list(options):
     nodeurl = options['node-url']
@@ -130,7 +131,7 @@
             line.append(ctime_s)
         if not options["classify"]:
             classify = ""
-        line.append(name + classify)
+        line.append(unicode_to_stdout(name) + classify)
         if options["uri"]:
             line.append(uri)
         if options["readonly-uri"]:
diff -rN -u old-tahoe-534/src/allmydata/scripts/tahoe_manifest.py new-tahoe-534/src/allmydata/scripts/tahoe_manifest.py
--- old-tahoe-534/src/allmydata/scripts/tahoe_manifest.py	2010-05-17 09:57:49.313905408 +0200
+++ new-tahoe-534/src/allmydata/scripts/tahoe_manifest.py	2010-05-17 09:57:49.646405047 +0200
@@ -85,7 +85,7 @@
                 try:
                     print >>stdout, d["cap"], "/".join(d["path"])
                 except UnicodeEncodeError:
-                    print >>stdout, d["cap"], "/".join([p.encode("utf-8")
+                    print >>stdout, d["cap"], "/".join([unicode_to_stdout(p)
                                                         for p in d["path"]])
 
 def manifest(options):
diff -rN -u old-tahoe-534/src/allmydata/scripts/tahoe_mkdir.py new-tahoe-534/src/allmydata/scripts/tahoe_mkdir.py
--- old-tahoe-534/src/allmydata/scripts/tahoe_mkdir.py	2010-05-17 09:57:49.313905408 +0200
+++ new-tahoe-534/src/allmydata/scripts/tahoe_mkdir.py	2010-05-17 09:57:49.646405047 +0200
@@ -2,6 +2,7 @@
 import urllib
 from allmydata.scripts.common_http import do_http, check_http_error
 from allmydata.scripts.common import get_alias, DEFAULT_ALIAS, UnknownAliasError
+from allmydata.util.stringutils import unicode_to_url
 
 def mkdir(options):
     nodeurl = options['node-url']
@@ -35,7 +36,7 @@
         path = path[:-1]
     # path (in argv) must be "/".join([s.encode("utf-8") for s in segments])
     url = nodeurl + "uri/%s/%s?t=mkdir" % (urllib.quote(rootcap),
-                                           urllib.quote(path))
+                                           urllib.quote(unicode_to_url(path)))
     resp = do_http("POST", url)
     check_http_error(resp, stderr)
     new_uri = resp.read().strip()
diff -rN -u old-tahoe-534/src/allmydata/test/test_cli.py new-tahoe-534/src/allmydata/test/test_cli.py
--- old-tahoe-534/src/allmydata/test/test_cli.py	2010-05-17 09:57:49.293904649 +0200
+++ new-tahoe-534/src/allmydata/test/test_cli.py	2010-05-17 09:57:49.676405528 +0200
@@ -6,6 +6,7 @@
 import urllib
 import re
 import simplejson
+import sys
 
 from allmydata.util import fileutil, hashutil, base32
 from allmydata import uri
@@ -26,6 +27,9 @@
 from twisted.internet import threads # CLI tests use deferToThread
 from twisted.python import usage
 
+from allmydata.util.stringutils import listdir_unicode, open_unicode, \
+     unicode_platform, FilenameEncodingError
+
 timeout = 480 # deep_check takes 360s on Zandr's linksys box, others take > 240s
 
 
@@ -279,7 +283,7 @@
                    "work": "WA",
                    "c": "CA"}
         def ga1(path):
-            return get_alias(aliases, path, "tahoe")
+            return get_alias(aliases, path, u"tahoe")
         uses_lettercolon = common.platform_uses_lettercolon_drivename()
         self.failUnlessEqual(ga1("bare"), ("TA", "bare"))
         self.failUnlessEqual(ga1("baredir/file"), ("TA", "baredir/file"))
@@ -374,7 +378,7 @@
         # default set to something that isn't in the aliases argument should
         # raise an UnknownAliasError.
         def ga4(path):
-            return get_alias(aliases, path, "badddefault:")
+            return get_alias(aliases, path, u"badddefault:")
         self.failUnlessRaises(common.UnknownAliasError, ga4, "afile")
         self.failUnlessRaises(common.UnknownAliasError, ga4, "a/dir/path/")
 
@@ -382,12 +386,44 @@
             old = common.pretend_platform_uses_lettercolon
             try:
                 common.pretend_platform_uses_lettercolon = True
-                retval = get_alias(aliases, path, "baddefault:")
+                retval = get_alias(aliases, path, u"baddefault:")
             finally:
                 common.pretend_platform_uses_lettercolon = old
             return retval
         self.failUnlessRaises(common.UnknownAliasError, ga5, "C:\\Windows")
 
+    def test_listdir_unicode_good(self):
+        basedir = u"cli/common/listdir_unicode_good"
+        fileutil.make_dirs(basedir)
+
+        files = (u'Lôzane', u'Bern', u'Genève')
+
+        for file in files:
+            open(os.path.join(basedir, file), "w").close()
+
+        for file in listdir_unicode(basedir):
+            self.failUnlessEqual(file in files, True)
+
+    def test_listdir_unicode_bad(self):
+        if unicode_platform():
+            raise unittest.SkipTest("This test doesn't make any sense on architecture which handle filenames natively as Unicode entities.")
+
+        basedir = u"cli/common/listdir_unicode_bad"
+        fileutil.make_dirs(basedir)
+
+        files = (u'Lôzane', u'Bern', u'Genève')
+
+        # We use a wrong encoding on purpose
+        if sys.getfilesystemencoding() == 'UTF-8':
+            encoding = 'latin1'
+        else:
+            encoding = 'UTF-8'
+
+        for file in files:
+            path = os.path.join(basedir, file).encode(encoding)
+            open(path, "w").close()
+
+        self.failUnlessRaises(FilenameEncodingError, listdir_unicode, basedir)
 
 class Help(unittest.TestCase):
 
@@ -582,6 +618,48 @@
             self.failUnless(aliases["un-corrupted2"].startswith("URI:DIR2:"))
         d.addCallback(_check_not_corrupted)
 
+        d.addCallback(lambda res: self.do_cli("create-alias", "études"))
+        def _check_create_unicode((rc,stdout,stderr)):
+            self.failUnlessEqual(rc, 0)
+            self.failIf(stderr)
+
+            # If stdout only supports ASCII, accented characters are
+            # replaced by '?'.
+            if sys.stdout.encoding == "ANSI_X3.4-1968":
+                self.failUnless("Alias '?tudes' created" in stdout)
+            else:
+                self.failUnless("Alias 'études' created" in stdout)
+
+            aliases = get_aliases(self.get_clientdir())
+            self.failUnless(aliases[u"études"].startswith("URI:DIR2:"))
+        d.addCallback(_check_create_unicode)
+
+        d.addCallback(lambda res: self.do_cli("ls", "études:"))
+        def _check_ls1((rc, stdout, stderr)):
+            self.failUnlessEqual(rc, 0)
+            self.failIf(stderr)
+
+            self.failUnlessEqual(stdout, "")
+        d.addCallback(_check_ls1)
+
+        d.addCallback(lambda res: self.do_cli("put", "-", "études:uploaded.txt",
+          stdin="Blah blah blah"))
+
+        d.addCallback(lambda res: self.do_cli("ls", "études:"))
+        def _check_ls2((rc, stdout, stderr)):
+            self.failUnlessEqual(rc, 0)
+            self.failIf(stderr)
+
+            self.failUnlessEqual(stdout, "uploaded.txt\n")
+        d.addCallback(_check_ls2)
+
+        d.addCallback(lambda res: self.do_cli("get", "études:uploaded.txt"))
+        def _check_get((rc, stdout, stderr)):
+            self.failUnlessEqual(rc, 0)
+            self.failIf(stderr)
+            self.failUnlessEqual(stdout, "Blah blah blah")
+        d.addCallback(_check_get)
+
         return d
 
 
@@ -855,6 +933,37 @@
         return d
 
 
+    def test_immutable_from_file_unicode(self):
+        # tahoe put file.txt "à trier.txt"
+        self.basedir = os.path.dirname(self.mktemp())
+        self.set_up_grid()
+
+        rel_fn = os.path.join(self.basedir, "DATAFILE")
+        abs_fn = os.path.abspath(rel_fn)
+        # we make the file small enough to fit in a LIT file, for speed
+        DATA = "short file"
+        f = open(rel_fn, "w")
+        f.write(DATA)
+        f.close()
+
+        d = self.do_cli("create-alias", "tahoe")
+
+        d.addCallback(lambda res:
+                      self.do_cli("put", rel_fn, "à trier.txt"))
+        def _uploaded((rc,stdout,stderr)):
+            readcap = stdout.strip()
+            self.failUnless(readcap.startswith("URI:LIT:"))
+            self.failUnless("201 Created" in stderr, stderr)
+            self.readcap = readcap
+        d.addCallback(_uploaded)
+
+        d.addCallback(lambda res:
+                      self.do_cli("get", "tahoe:à trier.txt"))
+        d.addCallback(lambda (rc,stdout,stderr):
+                      self.failUnlessEqual(stdout, DATA))
+
+        return d
+
 class List(GridTestMixin, CLITestMixin, unittest.TestCase):
     def test_list(self):
         self.basedir = "cli/List/list"
@@ -1138,30 +1247,37 @@
     def test_unicode_filename(self):
         self.basedir = "cli/Cp/unicode_filename"
         self.set_up_grid()
+        d = self.do_cli("create-alias", "tahoe")
+
+        # Use unicode strings when calling os functions
+        if sys.getfilesystemencoding() == "ANSI_X3.4-1968":
+            fn1 = os.path.join(self.basedir, u"Artonwall")
+        else:
+            fn1 = os.path.join(self.basedir, u"Ärtonwall")
 
-        fn1 = os.path.join(self.basedir, "Ärtonwall")
         DATA1 = "unicode file content"
         fileutil.write(fn1, DATA1)
+        d.addCallback(lambda res: self.do_cli("cp", fn1.encode('utf-8'), "tahoe:Ärtonwall"))
+
+        d.addCallback(lambda res: self.do_cli("get", "tahoe:Ärtonwall"))
+        d.addCallback(lambda (rc,out,err): self.failUnlessEqual(out, DATA1))
 
-        fn2 = os.path.join(self.basedir, "Metallica")
+
+        fn2 = os.path.join(self.basedir, u"Metallica")
         DATA2 = "non-unicode file content"
         fileutil.write(fn2, DATA2)
 
         # Bug #534
         # Assure that uploading a file whose name contains unicode character doesn't
         # prevent further uploads in the same directory
-        d = self.do_cli("create-alias", "tahoe")
-        d.addCallback(lambda res: self.do_cli("cp", fn1, "tahoe:"))
-        d.addCallback(lambda res: self.do_cli("cp", fn2, "tahoe:"))
-
-        d.addCallback(lambda res: self.do_cli("get", "tahoe:Ärtonwall"))
-        d.addCallback(lambda (rc,out,err): self.failUnlessEqual(out, DATA1))
+        d.addCallback(lambda res: self.do_cli("cp", fn2.encode('utf-8'), "tahoe:"))
 
         d.addCallback(lambda res: self.do_cli("get", "tahoe:Metallica"))
         d.addCallback(lambda (rc,out,err): self.failUnlessEqual(out, DATA2))
 
+        d.addCallback(lambda res: self.do_cli("ls", "tahoe:"))
+
         return d
-    test_unicode_filename.todo = "This behavior is not yet supported, although it does happen to work (for reasons that are ill-understood) on many platforms.  See issue ticket #534."
 
     def test_dangling_symlink_vs_recursion(self):
         if not hasattr(os, 'symlink'):
@@ -1268,6 +1384,17 @@
         return d
 
 
+class Mkdir(GridTestMixin, CLITestMixin, unittest.TestCase):
+    def test_unicode_mkdir(self):
+        self.basedir = os.path.dirname(self.mktemp())
+        self.set_up_grid()
+
+        d = self.do_cli("create-alias", "tahoe")
+        d.addCallback(lambda res: self.do_cli("mkdir", "tahoe:Motörhead"))
+
+        return d
+ 
+
 class Backup(GridTestMixin, CLITestMixin, StallMixin, unittest.TestCase):
 
     def writeto(self, path, data):
_______________________________________________
tahoe-dev mailing list
tahoe-dev@allmydata.org
http://allmydata.org/cgi-bin/mailman/listinfo/tahoe-dev

Reply via email to