On Wed, 12 Jun 2013 08:02:24 +0000, Νικόλαος Κούρας wrote: > # Collect directory and its filenames as bytes > path = b'/home/nikos/public_html/data/apps/' > files = os.listdir( path ) [snip code]
# I realised that the version I gave you earlier, or rather the modified
# version you came up with, was subject to a race condition. If somebody
# uploaded a file while the script was running, and that file name was not
# UTF-8 clean, the script would fail.
#
# This version may be more robust and should be resistant to race conditions
# when files are uploaded. (However, do not *delete* files while this script
# is running.)
#
# As before, I have not tested this. I recommend that you test it thoroughly
# before deploying it live.


def guess_encoding(bytestring):
    """Return the first candidate encoding that can decode *bytestring*.

    The candidates are tried in order: 'utf-8', 'iso-8859-7', 'latin-1'.
    Returns the encoding name as a str, or None if no candidate decodes
    the input without error.

    Note: 'latin-1' maps every possible byte value, so with the current
    candidate list the None result cannot actually occur. It is kept so
    callers stay correct if the list of candidates is changed.
    """
    for encoding in ('utf-8', 'iso-8859-7', 'latin-1'):
        try:
            bytestring.decode(encoding)
        except UnicodeDecodeError:
            # Decoding failed. Try the next one.
            continue
        # Decoding succeeded. This is our guess.
        return encoding
    # If we get here, none of the encodings worked. We cannot guess.
    return None


# Work with bytes throughout so that listdir never has to decode a
# possibly-broken file name on our behalf.
path = b'/home/nikos/public_html/data/apps/'
files = os.listdir(path)

clean_files = set()
for filename in files:
    # Compute 'path/to/filename'
    filepath_bytes = path + filename
    encoding = guess_encoding(filepath_bytes)
    if encoding == 'utf-8':
        # File name is valid UTF-8, so we can skip to the next file.
        clean_files.add(filepath_bytes)
        continue
    if encoding is None:
        # No idea what the encoding is. Hit it with a hammer until it
        # stops moving. 'replace' substitutes U+FFFD for undecodable bytes;
        # 'xmlcharrefreplace' is an encode-only handler and raises TypeError
        # when used for decoding.
        decoded_path = filepath_bytes.decode('utf-8', 'replace')
    else:
        decoded_path = filepath_bytes.decode(encoding)
    # Rename the file to something which ought to be UTF-8 clean.
    # NOTE(review): os.rename may silently replace an existing file that
    # already has the new name -- confirm that is acceptable here.
    newname_bytes = decoded_path.encode('utf-8')
    os.rename(filepath_bytes, newname_bytes)
    clean_files.add(newname_bytes)
    # Once we get here, the file ought to be UTF-8 clean,
    # and the Unicode name ought to exist:
    assert os.path.exists(newname_bytes.decode('utf-8'))

# Dump the old list of file names, it is no longer valid.
del files

# DO NOT CALL listdir again. Somebody might have uploaded a
# new file, with a broken file name. That will be fixed next
# time this script runs, but for now, we ignore the dirty file
# name and just use the set of clean file names we built above.
for name_as_bytes in sorted(clean_files):
    filename = name_as_bytes.decode('utf-8')
    # Check the presence of a file against the database
    # and insert if it doesn't exist.
    # `cur` is created outside this snippet (presumably a DB-API cursor
    # using the %s parameter style -- confirm). Query parameters are passed
    # as a sequence, hence the one-element tuple.
    cur.execute('SELECT url FROM files WHERE url = %s', (filename,))
    data = cur.fetchone()

# --
# Steven
# --
# http://mail.python.org/mailman/listinfo/python-list