John Nagle wrote: > Here's some actual code, from "tokenizer.py". This is called once > for each character in an HTML document, when in "data" state (outside > a tag). It's straightforward code, but look at all those > dictionary lookups. > > def dataState(self): > data = self.stream.char() > > # Keep a charbuffer to handle the escapeFlag > if self.contentModelFlag in\ > (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]): > if len(self.lastFourChars) == 4: > self.lastFourChars.pop(0) > self.lastFourChars.append(data) > > # The rest of the logic > if data == "&" and self.contentModelFlag in\ > (contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]) and > not\ > self.escapeFlag: > self.state = self.states["entityData"] > elif data == "-" and self.contentModelFlag in\ > (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and > not\ > self.escapeFlag and "".join(self.lastFourChars) == "<!--": > self.escapeFlag = True > self.tokenQueue.append({"type": "Characters", "data":data}) > elif (data == "<" and (self.contentModelFlag == > contentModelFlags["PCDATA"] > or (self.contentModelFlag in > (contentModelFlags["CDATA"], > contentModelFlags["RCDATA"]) and > self.escapeFlag == False))): > self.state = self.states["tagOpen"] > elif data == ">" and self.contentModelFlag in\ > (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\ > self.escapeFlag and "".join(self.lastFourChars)[1:] == "-->": > self.escapeFlag = False > self.tokenQueue.append({"type": "Characters", "data":data}) > elif data == EOF: > # Tokenization ends. > return False > elif data in spaceCharacters: > # Directly after emitting a token you switch back to the "data > # state". At that point spaceCharacters are important so > they are > # emitted separately. 
> self.tokenQueue.append({"type": "SpaceCharacters", "data": > data + self.stream.charsUntil(spaceCharacters, True)}) > # No need to update lastFourChars here, since the first > space will > # have already broken any <!-- or --> sequences > else: > chars = self.stream.charsUntil(("&", "<", ">", "-")) > self.tokenQueue.append({"type": "Characters", "data": > data + chars}) > self.lastFourChars += chars[-4:] > self.lastFourChars = self.lastFourChars[-4:] > return True
Giving this some more thought, another thing I'd try is to split the huge if-elif-else block like this:

    if data in string_with_all_special_characters:
        if data == '&' ...:
            ...
    else:
        ...

So there are three things to improve:

- eliminate common subexpressions which you know are constant
- split the large conditional sequence as shown above
- use separate dataState() methods when inside and outside of CDATA/RCDATA blocks and (maybe) escaped blocks

Stefan
-- 
http://mail.python.org/mailman/listinfo/python-list