On 2015-12-02 15:30:42, Kinney, Michael D wrote: > Jordan, > > Do the UTF-8 files generated include a BOM?
https://en.wikipedia.org/wiki/Byte_order_mark#UTF-8 "The UTF-8 representation of the BOM is the byte sequence 0xEF,0xBB,0xBF." ... "The Unicode Standard permits the BOM in UTF-8, but does not require or recommend its use." This conversion script does not add a BOM to the files. I think the BOM might be rarely used, so I wouldn't be surprised if there were some applications that might be confused by it. > Just in case someone depends on UNI files in UTF-16LE format, do you > think it makes sense for this script to support conversion from > UTF-8 to UTF-16LE too? Or add a second script to do that conversion? I don't think the BOM issue should prevent converting back to utf-16. (We can look for the lack of the UTF-16 BOM.) I considered if the script should handle both conversions, but I'm not too interested in the utf-8 => utf-16 conversion, so I figured I'd take the easy path. :) Do I need to do the other conversion to let this change proceed? Or can someone on your team handle that? (If it is actually needed...) I can also rename the script to ConvertUni.py and add a --utf-8 switch for now. Then we could later add --utf-16 to the same script. (Darn, now I've thought about it enough that I'll probably just add the other conversion. :) -Jordan > > > -----Original Message----- > > From: edk2-devel [mailto:edk2-devel-boun...@lists.01.org] On Behalf Of > > Jordan Justen > > Sent: Wednesday, December 2, 2015 2:17 PM > > To: edk2-devel@lists.01.org > > Cc: Carsey, Jaben <jaben.car...@intel.com>; Justen, Jordan L > > <jordan.l.jus...@intel.com>; Gao, Liming <liming....@intel.com> > > Subject: [edk2] [PATCH 1/3] BaseTools/Scripts: Add ConvertUtf16ToUtf8.py > > script > > > > This script uses python codecs to convert .uni string files from > > utf-16 to utf-8.
> > > > The advantages of utf-8 data: > > * Generally smaller files > > * More commonly supported by editors > > * Not treated as binary data in patch files > > > > Cc: Yonghong Zhu <yonghong....@intel.com> > > Cc: Liming Gao <liming....@intel.com> > > Cc: Jaben Carsey <jaben.car...@intel.com> > > Contributed-under: TianoCore Contribution Agreement 1.0 > > Signed-off-by: Jordan Justen <jordan.l.jus...@intel.com> > > --- > > BaseTools/Scripts/ConvertUtf16ToUtf8.py | 123 > > ++++++++++++++++++++++++++++++++ > > 1 file changed, 123 insertions(+) > > create mode 100755 BaseTools/Scripts/ConvertUtf16ToUtf8.py > > > > diff --git a/BaseTools/Scripts/ConvertUtf16ToUtf8.py > > b/BaseTools/Scripts/ConvertUtf16ToUtf8.py > > new file mode 100755 > > index 0000000..5f54603 > > --- /dev/null > > +++ b/BaseTools/Scripts/ConvertUtf16ToUtf8.py > > @@ -0,0 +1,123 @@ > > +## @file > > +# Check a patch for various format issues > > +# > > +# Copyright (c) 2015, Intel Corporation. All rights reserved.<BR> > > +# > > +# This program and the accompanying materials are licensed and made > > +# available under the terms and conditions of the BSD License which > > +# accompanies this distribution. The full text of the license may be > > +# found at http://opensource.org/licenses/bsd-license.php > > +# > > +# THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" > > +# BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER > > +# EXPRESS OR IMPLIED. > > +# > > + > > +from __future__ import print_function > > + > > +VersionNumber = '0.1' > > +__copyright__ = "Copyright (c) 2015, Intel Corporation All rights > > reserved." > > + > > +import argparse > > +import codecs > > +import os > > +import sys > > + > > +try: > > + from io import StringIO > > +except ImportError: > > + from StringIO import StringIO > > + > > +class ConvertOneArg: > > + """Converts utf-16 to utf-8 for one command line argument. > > + > > + This could be a single file, or a directory. 
> > + """ > > + > > + def __init__(self, source): > > + self.source = source > > + > > + self.ok = True > > + > > + if not os.path.exists(source): > > + self.ok = False > > + elif os.path.isdir(source): > > + for (root, dirs, files) in os.walk(source): > > + files = filter(lambda a: a.endswith('.uni'), files) > > + for filename in files: > > + path = os.path.join(root, filename) > > + self.ok &= self.convert_one_file(path) > > + if not self.ok: > > + break > > + > > + if not self.ok: > > + break > > + else: > > + self.ok &= self.convert_one_file(source) > > + > > + def convert_one_file(self, source): > > + # > > + # Read file > > + # > > + utf16_file = open(source, mode='rb') > > + file_content = utf16_file.read() > > + utf16_file.close() > > + > > + # > > + # Detect UTF-16 Byte Order Mark at beginning of file. > > + # > > + if not (file_content.startswith(codecs.BOM_UTF16_BE) or > > + file_content.startswith(codecs.BOM_UTF16_LE)): > > + print(source + ": already utf-8") > > + return True > > + > > + # > > + # Decode utf-16 string data > > + # > > + str_content = file_content.decode('utf-16', 'ignore') > > + > > + # > > + # Encode string data to utf-8 > > + # > > + utf8_content = str_content.encode('utf-8', 'ignore') > > + > > + # > > + # Write converted utf-8 data back to file > > + # > > + utf8_file = open(source, mode='wb') > > + utf8_file.write(utf8_content) > > + utf8_file.close() > > + > > + print(source + ": converted, size", len(file_content), '=>', > > len(utf8_content)) > > + return True > > + > > + > > +class ConvertUtf16ToUtf8App: > > + """Converts files to utf-8 based on the command line arguments.""" > > + > > + def __init__(self): > > + self.parse_options() > > + sources = self.args.source > > + > > + self.ok = True > > + for patch in sources: > > + self.process_one_arg(patch) > > + > > + if self.ok: > > + self.retval = 0 > > + else: > > + self.retval = -1 > > + > > + def process_one_arg(self, arg): > > + self.ok &= ConvertOneArg(arg).ok > > + > > + 
def parse_options(self): > > + parser = argparse.ArgumentParser(description=__copyright__) > > + parser.add_argument('--version', action='version', > > + version='%(prog)s ' + VersionNumber) > > + parser.add_argument('source', nargs='+', > > + help='[uni file | directory]') > > + self.args = parser.parse_args() > > + > > +if __name__ == "__main__": > > + sys.exit(ConvertUtf16ToUtf8App().retval) > > -- > > 2.6.2 > > > > _______________________________________________ > > edk2-devel mailing list > > edk2-devel@lists.01.org > > https://lists.01.org/mailman/listinfo/edk2-devel _______________________________________________ edk2-devel mailing list edk2-devel@lists.01.org https://lists.01.org/mailman/listinfo/edk2-devel