Jordan, do the generated UTF-8 files include a BOM?
Just in case someone depends on UNI files in UTF-16LE format, do you think it makes sense for this script to support conversion from UTF-8 to UTF-16LE too? Or to add a second script for that conversion? Thanks, Mike > -----Original Message----- > From: edk2-devel [mailto:edk2-devel-boun...@lists.01.org] On Behalf Of Jordan > Justen > Sent: Wednesday, December 2, 2015 2:17 PM > To: edk2-devel@lists.01.org > Cc: Carsey, Jaben <jaben.car...@intel.com>; Justen, Jordan L > <jordan.l.jus...@intel.com>; Gao, Liming <liming....@intel.com> > Subject: [edk2] [PATCH 1/3] BaseTools/Scripts: Add ConvertUtf16ToUtf8.py > script > > This script uses python codecs to convert .uni string files from > utf-16 to utf-8. > > The advantages of utf-8 data: > * Generally smaller files > * More commonly supported by editors > * Not treated as binary data in patch files > > Cc: Yonghong Zhu <yonghong....@intel.com> > Cc: Liming Gao <liming....@intel.com> > Cc: Jaben Carsey <jaben.car...@intel.com> > Contributed-under: TianoCore Contribution Agreement 1.0 > Signed-off-by: Jordan Justen <jordan.l.jus...@intel.com> > --- > BaseTools/Scripts/ConvertUtf16ToUtf8.py | 123 > ++++++++++++++++++++++++++++++++ > 1 file changed, 123 insertions(+) > create mode 100755 BaseTools/Scripts/ConvertUtf16ToUtf8.py > > diff --git a/BaseTools/Scripts/ConvertUtf16ToUtf8.py > b/BaseTools/Scripts/ConvertUtf16ToUtf8.py > new file mode 100755 > index 0000000..5f54603 > --- /dev/null > +++ b/BaseTools/Scripts/ConvertUtf16ToUtf8.py > @@ -0,0 +1,123 @@ > +## @file > +# Check a patch for various format issues > +# > +# Copyright (c) 2015, Intel Corporation. All rights reserved.<BR> > +# > +# This program and the accompanying materials are licensed and made > +# available under the terms and conditions of the BSD License which > +# accompanies this distribution. 
The full text of the license may be > +# found at http://opensource.org/licenses/bsd-license.php > +# > +# THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" > +# BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER > +# EXPRESS OR IMPLIED. > +# > + > +from __future__ import print_function > + > +VersionNumber = '0.1' > +__copyright__ = "Copyright (c) 2015, Intel Corporation All rights reserved." > + > +import argparse > +import codecs > +import os > +import sys > + > +try: > + from io import StringIO > +except ImportError: > + from StringIO import StringIO > + > +class ConvertOneArg: > + """Converts utf-16 to utf-8 for one command line argument. > + > + This could be a single file, or a directory. > + """ > + > + def __init__(self, source): > + self.source = source > + > + self.ok = True > + > + if not os.path.exists(source): > + self.ok = False > + elif os.path.isdir(source): > + for (root, dirs, files) in os.walk(source): > + files = filter(lambda a: a.endswith('.uni'), files) > + for filename in files: > + path = os.path.join(root, filename) > + self.ok &= self.convert_one_file(path) > + if not self.ok: > + break > + > + if not self.ok: > + break > + else: > + self.ok &= self.convert_one_file(source) > + > + def convert_one_file(self, source): > + # > + # Read file > + # > + utf16_file = open(source, mode='rb') > + file_content = utf16_file.read() > + utf16_file.close() > + > + # > + # Detect UTF-16 Byte Order Mark at beginning of file. 
> + # > + if not (file_content.startswith(codecs.BOM_UTF16_BE) or > + file_content.startswith(codecs.BOM_UTF16_LE)): > + print(source + ": already utf-8") > + return True > + > + # > + # Decode utf-16 string data > + # > + str_content = file_content.decode('utf-16', 'ignore') > + > + # > + # Encode string data to utf-8 > + # > + utf8_content = str_content.encode('utf-8', 'ignore') > + > + # > + # Write converted utf-8 data back to file > + # > + utf8_file = open(source, mode='wb') > + utf8_file.write(utf8_content) > + utf8_file.close() > + > + print(source + ": converted, size", len(file_content), '=>', > len(utf8_content)) > + return True > + > + > +class ConvertUtf16ToUtf8App: > + """Converts files to utf-8 based on the command line arguments.""" > + > + def __init__(self): > + self.parse_options() > + sources = self.args.source > + > + self.ok = True > + for patch in sources: > + self.process_one_arg(patch) > + > + if self.ok: > + self.retval = 0 > + else: > + self.retval = -1 > + > + def process_one_arg(self, arg): > + self.ok &= ConvertOneArg(arg).ok > + > + def parse_options(self): > + parser = argparse.ArgumentParser(description=__copyright__) > + parser.add_argument('--version', action='version', > + version='%(prog)s ' + VersionNumber) > + parser.add_argument('source', nargs='+', > + help='[uni file | directory]') > + self.args = parser.parse_args() > + > +if __name__ == "__main__": > + sys.exit(ConvertUtf16ToUtf8App().retval) > -- > 2.6.2 > > _______________________________________________ > edk2-devel mailing list > edk2-devel@lists.01.org > https://lists.01.org/mailman/listinfo/edk2-devel _______________________________________________ edk2-devel mailing list edk2-devel@lists.01.org https://lists.01.org/mailman/listinfo/edk2-devel