On 06/01/15 09:31, Jordan Justen wrote:
> Supplementary Plane characters can exist in UTF-16 files,
> but they are not valid UCS-2 characters.
>
> For example, refer to this python interpreter code:
>>>> import codecs
>>>> codecs.encode(u'\U00010300', 'utf-16')
> '\xff\xfe\x00\xd8\x00\xdf'
>
> Therefore the UCS-4 0x00010300 character is encoded as two
> 16-bit numbers (0xd800 0xdf00) in a little endian UTF-16
> file.
>
> For more information, see:
> http://en.wikipedia.org/wiki/UTF-16#U.2B10000_to_U.2B10FFFF
>
> This means that our current BaseTools code could be allowing
> unsupported UTF-16 characters be used. To fix this, we decode the file
> using python's utf-16 decode support. Then we verify that each
> character's code point is 0xffff or less.
>
> v3: Based on Mike Kinney's feedback, we now read the whole file and
>     verify up-front that it contains valid UCS-2 characters. Thanks
>     also to Laszlo Ersek for pointing out the Supplementary Plane
>     characters.
>
> Contributed-under: TianoCore Contribution Agreement 1.0
> Signed-off-by: Jordan Justen <jordan.l.jus...@intel.com>
> Cc: Yingke D Liu <yingke.d....@intel.com>
> Cc: Michael D Kinney <michael.d.kin...@intel.com>
> Cc: Laszlo Ersek <ler...@redhat.com>
> ---
>  BaseTools/Source/Python/AutoGen/UniClassObject.py | 84 ++++++++++++++++++++++
>  1 file changed, 82 insertions(+), 2 deletions(-)
>
> diff --git a/BaseTools/Source/Python/AutoGen/UniClassObject.py b/BaseTools/Source/Python/AutoGen/UniClassObject.py
> index aa54f4f..66fdbf0 100644
> --- a/BaseTools/Source/Python/AutoGen/UniClassObject.py
> +++ b/BaseTools/Source/Python/AutoGen/UniClassObject.py
> @@ -19,6 +19,7 @@
>  import Common.LongFilePathOs as os, codecs, re
>  import distutils.util
>  import Common.EdkLogger as EdkLogger
> +import StringIO
>  from Common.BuildToolError import *
>  from Common.String import GetLineNo
>  from Common.Misc import PathClass
> @@ -147,6 +148,33 @@ def GetLanguageCode(LangName, IsCompatibleMode, File):
>
>      EdkLogger.error("Unicode File Parser", FORMAT_INVALID, "Invalid RFC 4646 language code : %s" % LangName, File)
>
> +## Ucs2Codec
> +#
> +# This is only a partial codec implementation. It only supports
> +# encoding, and is primarily used to check that all the characters are
> +# valid for UCS-2.
> +#
> +class Ucs2Codec(codecs.Codec):
> +    def __init__(self):
> +        self.__utf16 = codecs.lookup('utf-16')
> +
> +    def encode(self, input, errors='strict'):
> +        for Char in input:
> +            if ord(Char) > 0xffff:
> +                raise ValueError("Code Point too large to encode in UCS-2")
> +        return self.__utf16.encode(input)
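A minimal, untested sketch of the kind of input that 0xffff check is meant
to reject (assuming a Python 2 wide-unicode build, where a
supplementary-plane character is a single code point):

  # Hypothetical input: mostly BMP text plus one supplementary-plane
  # character (U+10300). On a wide-unicode Python 2 build this is a
  # single character whose ord() exceeds 0xffff, so a check equivalent
  # to the quoted encode() loop flags it as not representable in UCS-2.
  Text = u'BMP text \U00010300'

  for Char in Text:
      if ord(Char) > 0xffff:
          print 'not representable in UCS-2: U+%06X' % ord(Char)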
I could be missing some context, but the wikipedia article referenced at
the top says that

  The official Unicode standard says that no UTF forms [...] can encode
  [U+D800 to U+DFFF] code points. However UCS-2, UTF-8 [...] can encode
  [U+D800 to U+DFFF] code points in trivial and obvious ways [...]

So basically it is possible to construct a UTF-8 file that (albeit
violating the unicode standard) will result in U+D800 to U+DFFF code
*points* when parsed. Then (I think) these would be turned into UCS2
code *units* with identical numeric values. Which, I think, would be
wrong.

Also, the same seems to apply to a UTF-16 source:

  It is possible to unambiguously encode them in UTF-16 by using a code
  unit equal to the code point, as long as no sequence of two code units
  can be interpreted as a legal surrogate pair (that is, as long as a
  high surrogate is never followed by a low surrogate). The majority of
  UTF-16 encoder and decoder implementations translate between encodings
  as though this were the case.

Shouldn't we accept code points *only* in "U+0000 to U+D7FF and U+E000 to
U+FFFF"? The article says

  Both UTF-16 and UCS-2 encode code points in this range as single
  16-bit code units that are numerically equal to the corresponding
  code points. These code points in the BMP are the /only/ code points
  that can be represented in UCS-2. Modern text almost exclusively
  consists of these code points.

For example,

  printf '%b' '\x01\xD8\x24\x00'

creates two UTF-16 code units (in LE representation):

  D801 0024

This is not a valid surrogate pair (there is no low surrogate). Does your
code (or the underlying python machinery) reject the output of the above
printf invocation?

For example, "iconv" rejects it:

  $ printf '%b' '\x01\xD8\x24\x00' | iconv -f UTF-16LE -t UCS-2
  iconv: illegal input sequence at position 0

Hm...
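One way to answer that from Python itself would be a quick check along
these lines (a rough, untested sketch; whether the exception is raised is
exactly the open question):

  # Rough sketch (Python 2): feed the byte sequence from the printf
  # example above to the utf-16 decoder and report whether it is
  # rejected. If it decodes silently to a lone surrogate, the
  # U+D800..U+DFFF range would have to be filtered in
  # Ucs2Codec.encode() instead.
  import codecs

  Data = '\x01\xd8\x24\x00'   # D801 not followed by a low surrogate

  try:
      Decoded = codecs.decode(Data, 'utf-16-le')
      print 'decoded without error: %r' % (Decoded,)
  except UnicodeDecodeError, Ex:
      print 'rejected: %s' % (Ex,)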
Let's see a bit lower down:

> +
> +TheUcs2Codec = Ucs2Codec()
> +def Ucs2Search(name):
> +    if name == 'ucs-2':
> +        return codecs.CodecInfo(
> +            name=name,
> +            encode=TheUcs2Codec.encode,
> +            decode=TheUcs2Codec.decode)
> +    else:
> +        return None
> +codecs.register(Ucs2Search)
> +
>  ## StringDefClassObject
>  #
>  # A structure for language definition
> @@ -209,7 +237,7 @@ class UniFileClassObject(object):
>          Lang = distutils.util.split_quoted((Line.split(u"//")[0]))
>          if len(Lang) != 3:
>              try:
> -                FileIn = codecs.open(LongFilePath(File.Path), mode='rb', encoding='utf-16').read()
> +                FileIn = self.OpenUniFile(LongFilePath(File.Path))
>              except UnicodeError, X:
>                  EdkLogger.error("build", FILE_READ_FAILURE, "File read failure: %s" % str(X), ExtraData=File);
>              except:
> @@ -253,6 +281,58 @@ class UniFileClassObject(object):
>              self.OrderedStringDict[LangName][Item.StringName] = len(self.OrderedStringList[LangName]) - 1
>          return True
>
> +    def OpenUniFile(self, FileName):
> +        #
> +        # Read file
> +        #
> +        try:
> +            UniFile = open(FileName, mode='rb')
> +            FileIn = UniFile.read()
> +            UniFile.close()
> +        except:
> +            EdkLogger.Error("build", FILE_OPEN_FAILURE, ExtraData=File)
> +
> +        #
> +        # We currently only support UTF-16
> +        #
> +        Encoding = 'utf-16'
> +
> +        self.VerifyUcs2Data(FileIn, FileName, Encoding)
> +
> +        UniFile = StringIO.StringIO(FileIn)
> +        Info = codecs.lookup(Encoding)
> +        (Reader, Writer) = (Info.streamreader, Info.streamwriter)
> +        return codecs.StreamReaderWriter(UniFile, Reader, Writer)
> +
> +    def VerifyUcs2Data(self, FileIn, FileName, Encoding):
> +        Ucs2Info = codecs.lookup('ucs-2')
> +        #
> +        # Convert to unicode
> +        #
> +        try:
> +            FileDecoded = codecs.decode(FileIn, Encoding)

If this method call is guaranteed to throw an exception when it encounters
a UTF-16LE (or UTF-8) sequence that would decode to a surrogate code point
(ie. in the U+D800 to U+DFFF range), then we should be fine.

Otherwise, such code points should be caught in Ucs2Codec.encode() (a
possible variant is sketched below, after the quoted diff).

Thanks
Laszlo

> +            Ucs2Info.encode(FileDecoded)
> +        except:
> +            UniFile = StringIO.StringIO(FileIn)
> +            Info = codecs.lookup(Encoding)
> +            (Reader, Writer) = (Info.streamreader, Info.streamwriter)
> +            File = codecs.StreamReaderWriter(UniFile, Reader, Writer)
> +            LineNumber = 0
> +            ErrMsg = lambda Encoding, LineNumber: \
> +                '%s contains invalid %s characters on line %d.' % \
> +                (FileName, Encoding, LineNumber)
> +            while True:
> +                LineNumber = LineNumber + 1
> +                try:
> +                    Line = File.readline()
> +                    if Line == '':
> +                        EdkLogger.error('Unicode File Parser', PARSER_ERROR,
> +                            ErrMsg(Encoding, LineNumber))
> +                    Ucs2Info.encode(Line)
> +                except:
> +                    EdkLogger.error('Unicode File Parser', PARSER_ERROR,
> +                        ErrMsg('UCS-2', LineNumber))
> +
>  #
>  # Get String name and value
>  #
> @@ -305,7 +385,7 @@ class UniFileClassObject(object):
>              EdkLogger.error("Unicode File Parser", FILE_NOT_FOUND, ExtraData=File.Path)
>
>          try:
> -            FileIn = codecs.open(LongFilePath(File.Path), mode='rb', encoding='utf-16')
> +            FileIn = self.OpenUniFile(LongFilePath(File.Path))
>          except UnicodeError, X:
>              EdkLogger.error("build", FILE_READ_FAILURE, "File read failure: %s" % str(X), ExtraData=File.Path);
>          except:
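Coming back to the encode() check quoted at the top: if the utf-16 decoder
turns out not to reject lone surrogates, a stricter variant that restricts
code points to U+0000..U+D7FF and U+E000..U+FFFF could look like this
untested sketch (the quoted class with one extra range check, not a tested
replacement):

  # Untested sketch: in addition to the existing 0xffff limit, reject
  # surrogate code points, which are not valid UCS-2 characters either.
  import codecs

  class Ucs2Codec(codecs.Codec):
      def __init__(self):
          self.__utf16 = codecs.lookup('utf-16')

      def encode(self, input, errors='strict'):
          for Char in input:
              CodePoint = ord(Char)
              if CodePoint > 0xffff:
                  raise ValueError("Code Point too large to encode in UCS-2")
              if 0xd800 <= CodePoint <= 0xdfff:
                  raise ValueError("Surrogate code point %#06x is not valid UCS-2" % CodePoint)
          return self.__utf16.encode(input)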