http://git-wip-us.apache.org/repos/asf/lucenenet/blob/198e5868/src/Lucene.Net.Benchmark/Support/TagSoup/HTMLSchema.tt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Benchmark/Support/TagSoup/HTMLSchema.tt b/src/Lucene.Net.Benchmark/Support/TagSoup/HTMLSchema.tt new file mode 100644 index 0000000..5f4a839 --- /dev/null +++ b/src/Lucene.Net.Benchmark/Support/TagSoup/HTMLSchema.tt @@ -0,0 +1,72 @@ +<#@ template debug="true" hostspecific="true" language="C#" #> +<#@ assembly name="System.Xml" #> +<#@ import namespace="System.IO" #> +<#@ import namespace="System.Xml.Xsl" #> +<#@ output extension=".Generated.cs" #> +<# /* +# ----------------------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ----------------------------------------------------------------------------------- +*/ #> +//------------------------------------------------------------------------------ +// <auto-generated> +// This code was generated by a tool. +// +// Changes to this file may cause incorrect behavior and will be lost if +// the code is regenerated. 
+// </auto-generated> +//------------------------------------------------------------------------------ +namespace TagSoup +{ + /// <summary> + /// This class provides a Schema that has been preinitialized with HTML + /// elements, attributes, and character entity declarations. All the declarations + /// normally provided with HTML 4.01 are given, plus some that are IE-specific + /// and NS4-specific. Attribute declarations of type CDATA with no default + /// value are not included. + /// </summary> + public class HTMLSchema : Schema + { + // HTMLModels begin + <# + /* Class-level members: apply tssl/tssl-models.xslt to definitions/html.tssl and write the transform result into the generated file at this point. Paths are resolved through the template host, which is why the template directive sets hostspecific="true". */ + XslCompiledTransform transform = new XslCompiledTransform(true); + transform.Load(this.Host.ResolvePath("tssl/tssl-models.xslt")); + using(StringWriter writer = new StringWriter()) + { + transform.Transform(this.Host.ResolvePath("definitions/html.tssl"), null, writer); + Write(writer.ToString()); + } + #> // HTMLModels end + + /// <summary> + /// Returns a newly constructed HTMLSchema object independent of + /// any existing ones. + /// </summary> + public HTMLSchema() + { + <# + /* Constructor body: the transform variable declared in the control block above is reused here, reloaded with tssl/tssl.xslt, and run over the same definitions/html.tssl input. */ + transform.Load(this.Host.ResolvePath("tssl/tssl.xslt")); + using(StringWriter writer = new StringWriter()) + { + transform.Transform(this.Host.ResolvePath("definitions/html.tssl"), null, writer); + Write(writer.ToString()); + } + #> + } + } +}
// http://git-wip-us.apache.org/repos/asf/lucenenet/blob/198e5868/src/Lucene.Net.Benchmark/Support/TagSoup/PYXScanner.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Benchmark/Support/TagSoup/PYXScanner.cs b/src/Lucene.Net.Benchmark/Support/TagSoup/PYXScanner.cs new file mode 100644 index 0000000..711d46a --- /dev/null +++ b/src/Lucene.Net.Benchmark/Support/TagSoup/PYXScanner.cs @@ -0,0 +1,138 @@
// This file is part of TagSoup and is Copyright 2002-2008 by John Cowan.
//
// TagSoup is licensed under the Apache License,
// Version 2.0. You may obtain a copy of this license at
// http://www.apache.org/licenses/LICENSE-2.0 . You may also have
// additional legal rights not granted by this license.
//
// TagSoup is distributed in the hope that it will be useful, but
// unless required by applicable law or agreed to in writing, TagSoup
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
// OF ANY KIND, either express or implied; not even the implied warranty
// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
//
// This file is part of TagSoup.
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version. You may also distribute
// and/or modify it under version 2.1 of the Academic Free License.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
//
// PYX Scanner

using System.IO;

namespace TagSoup
{
    /// <summary>
    /// A <see cref="IScanner"/> that accepts PYX format instead of HTML.
    /// Useful primarily for debugging.
    /// </summary>
    /// <remarks>
    /// Input is read one record per line; the first character of the line selects
    /// the record type: <c>(</c> start tag, <c>)</c> end tag, <c>A</c> attribute
    /// ("Aname value"), <c>-</c> character data, <c>?</c> processing instruction,
    /// <c>E</c> entity. Lines of any other type, and empty lines, are ignored.
    /// </remarks>
    public class PYXScanner : IScanner
    {
        /// <summary>
        /// Required by <see cref="IScanner"/>; does nothing because this scanner
        /// does not track document locations.
        /// </summary>
        public virtual void ResetDocumentLocator(string publicid, string systemid)
        {
            // Need this method for interface compatibility, but note
            // that PyxScanner does not implement Locator.
        }

        /// <summary>
        /// Reads PYX records from <paramref name="br"/> until end of input and
        /// reports each one to <paramref name="h"/>, finishing with a single
        /// <c>EOF</c> call.
        /// </summary>
        /// <param name="br">Source of PYX text, consumed line by line.</param>
        /// <param name="h">Handler that receives the scan events.</param>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="br"/> or <paramref name="h"/> is <c>null</c>.
        /// </exception>
        public virtual void Scan(TextReader br, IScanHandler h)
        {
            if (br == null)
            {
                throw new System.ArgumentNullException("br");
            }
            if (h == null)
            {
                throw new System.ArgumentNullException("h");
            }

            // Never null, so the final EOF call gets a valid (possibly empty) buffer
            // even when the input contains no records at all.
            char[] buff = new char[0];
            // True between a '(' record and the next record that ends the start tag.
            bool instag = false;
            string s;
            while ((s = br.ReadLine()) != null)
            {
                int size = s.Length;
                if (size == 0)
                {
                    // A blank line has no record-type character; indexing it would throw.
                    continue;
                }
                buff = s.ToCharArray();
                char recordType = buff[0];

                // Attribute records belong to the open start tag; every other known
                // record type terminates it first.
                if (instag && EndsStartTag(recordType))
                {
                    h.STagC(buff, 0, 0);
                    instag = false;
                }

                switch (recordType)
                {
                    case '(':
                        h.GI(buff, 1, size - 1);
                        instag = true;
                        break;
                    case ')':
                        h.ETag(buff, 1, size - 1);
                        break;
                    case '?':
                        h.PI(buff, 1, size - 1);
                        break;
                    case 'A':
                        int sp = s.IndexOf(' ');
                        if (sp < 0)
                        {
                            // No separator: the whole remainder is the name and the
                            // value is empty (avoids negative lengths from sp == -1).
                            h.Aname(buff, 1, size - 1);
                            h.Aval(buff, size, 0);
                        }
                        else
                        {
                            h.Aname(buff, 1, sp - 1);
                            h.Aval(buff, sp + 1, size - sp - 1);
                        }
                        break;
                    case '-':
                        // Decode \n, \t and \\ in place, then report the decoded span.
                        h.PCDATA(buff, 1, DecodeEscapes(buff, 1, size));
                        break;
                    case 'E':
                        h.Entity(buff, 1, size - 1);
                        break;
                    default:
                        // Unknown record type: ignored.
                        break;
                }
            }
            h.EOF(buff, 0, 0);
        }

        /// <summary>
        /// Part of the scanner contract; PYX input has no CDATA mode, so this is a no-op.
        /// </summary>
        public void StartCDATA()
        {
        }

        /// <summary>
        /// Tells whether a record of the given type closes a pending start tag.
        /// </summary>
        private static bool EndsStartTag(char recordType)
        {
            switch (recordType)
            {
                case '(':
                case ')':
                case '?':
                case '-':
                case 'E':
                    return true;
                default:
                    return false;
            }
        }

        /// <summary>
        /// Replaces the PYX escapes <c>\n</c>, <c>\t</c> and <c>\\</c> found in
        /// <c>buff[start..end)</c> with the characters they stand for, compacting
        /// the text in place. A backslash followed by anything else is kept as is.
        /// </summary>
        /// <returns>The number of decoded characters, stored from <paramref name="start"/>.</returns>
        private static int DecodeEscapes(char[] buff, int start, int end)
        {
            int dst = start;
            for (int src = start; src < end; src++)
            {
                char c = buff[src];
                if (c == '\\' && src + 1 < end)
                {
                    switch (buff[src + 1])
                    {
                        case 'n':
                            c = '\n';
                            src++;
                            break;
                        case 't':
                            c = '\t';
                            src++;
                            break;
                        case '\\':
                            src++; // c is already a single backslash
                            break;
                    }
                }
                buff[dst++] = c;
            }
            return dst - start;
        }

        //public static void main(string[] argv) {
        //  IScanner s = new PYXScanner();
        //  TextReader r = new StreamReader(System.Console.OpenStandardInput(), Encoding.UTF8);
        //  TextWriter w = new StreamWriter(System.Console.OpenStandardOutput(), Encoding.UTF8));
        //  s.Scan(r, new PYXWriter(w));
        //  }
    }
}
// http://git-wip-us.apache.org/repos/asf/lucenenet/blob/198e5868/src/Lucene.Net.Benchmark/Support/TagSoup/PYXWriter.cs
// ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Benchmark/Support/TagSoup/PYXWriter.cs b/src/Lucene.Net.Benchmark/Support/TagSoup/PYXWriter.cs new file mode 100644 index 0000000..ff47d0d --- /dev/null +++ b/src/Lucene.Net.Benchmark/Support/TagSoup/PYXWriter.cs @@ -0,0 +1,286 @@
// This file is part of TagSoup and is Copyright 2002-2008 by John Cowan.
//
// TagSoup is licensed under the Apache License,
// Version 2.0. You may obtain a copy of this license at
// http://www.apache.org/licenses/LICENSE-2.0 . You may also have
// additional legal rights not granted by this license.
//
// TagSoup is distributed in the hope that it will be useful, but
// unless required by applicable law or agreed to in writing, TagSoup
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
// OF ANY KIND, either express or implied; not even the implied warranty
// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
//
// PYX Writer
// FIXME: does not do escapes in attribute values
// FIXME: outputs entities as bare '&' character

using Sax;
using Sax.Ext;
using System.IO;

namespace TagSoup
{
    /// <summary>
    /// A <see cref="IContentHandler"/> that generates PYX format instead of XML.
    /// Primarily useful for debugging.
    /// </summary>
    /// <remarks>
    /// The same instance can be driven either by scanner events
    /// (<see cref="IScanHandler"/>) or by SAX events (<see cref="IContentHandler"/>,
    /// <see cref="ILexicalHandler"/>). Output is one record per line:
    /// <c>(name</c>, <c>)name</c>, <c>Aname value</c>, <c>-text</c>, <c>?target data</c>.
    /// </remarks>
    public class PYXWriter : IScanHandler, IContentHandler, ILexicalHandler
    {
        private readonly TextWriter theWriter; // where we write to
        private string attrName; // attribute name seen by Aname, pending its value

        /// <summary>
        /// Creates a writer that sends PYX records to <paramref name="w"/>.
        /// </summary>
        /// <param name="w">Destination; it is closed by <see cref="EOF"/> and <see cref="EndDocument"/>.</param>
        /// <exception cref="System.ArgumentNullException"><paramref name="w"/> is <c>null</c>.</exception>
        public PYXWriter(TextWriter w)
        {
            if (w == null)
            {
                throw new System.ArgumentNullException("w");
            }
            theWriter = w;
        }

        // ScanHandler implementation

        /// <summary>Attribute without a value: the saved name is written as the value.</summary>
        public void Adup(char[] buff, int offset, int length)
        {
            theWriter.WriteLine(attrName);
            attrName = null;
        }

        /// <summary>Starts an attribute record ("A" + name + space) and remembers the name.</summary>
        public void Aname(char[] buff, int offset, int length)
        {
            theWriter.Write('A');
            theWriter.Write(buff, offset, length);
            theWriter.Write(' ');
            attrName = new string(buff, offset, length);
        }

        /// <summary>Completes the attribute record begun by <see cref="Aname"/> with its value.</summary>
        public void Aval(char[] buff, int offset, int length)
        {
            theWriter.Write(buff, offset, length);
            theWriter.WriteLine();
            attrName = null;
        }

        /// <summary>Comments produce no output.</summary>
        public void Cmnt(char[] buff, int offset, int length)
        {
            // theWriter.Write('!');
            // theWriter.Write(buff, offset, length);
            // theWriter.WriteLine();
        }

        /// <summary>Entity references produce no output.</summary>
        public void Entity(char[] buff, int offset, int length)
        {
        }

        /// <summary>Always 0: this handler does not resolve entities.</summary>
        public int GetEntity()
        {
            return 0;
        }

        /// <summary>End of input: closes the underlying writer.</summary>
        public void EOF(char[] buff, int offset, int length)
        {
            theWriter.Close();
        }

        /// <summary>Writes an end-tag record: ")" + name.</summary>
        public void ETag(char[] buff, int offset, int length)
        {
            theWriter.Write(')');
            theWriter.Write(buff, offset, length);
            theWriter.WriteLine();
        }

        /// <summary>Declarations produce no output.</summary>
        public void Decl(char[] buff, int offset, int length)
        {
        }

        /// <summary>Writes a start-tag record: "(" + name.</summary>
        public void GI(char[] buff, int offset, int length)
        {
            theWriter.Write('(');
            theWriter.Write(buff, offset, length);
            theWriter.WriteLine();
        }

        /// <summary>CDATA sections are written exactly like ordinary character data.</summary>
        public void CDSect(char[] buff, int offset, int length)
        {
            PCDATA(buff, offset, length);
        }

        /// <summary>
        /// Writes character data as "-" records. Every newline in the input becomes
        /// its own <c>-\n</c> record; tabs and backslashes inside a record are
        /// written as <c>\t</c> and <c>\\</c>. Nothing is written when
        /// <paramref name="length"/> is 0.
        /// </summary>
        public void PCDATA(char[] buff, int offset, int length)
        {
            bool recordOpen = false; // a "-..." record has been started but not terminated
            int end = offset + length;
            for (int i = offset; i < end; i++)
            {
                char c = buff[i];
                if (c == '\n')
                {
                    if (recordOpen)
                    {
                        theWriter.WriteLine();
                        recordOpen = false;
                    }
                    theWriter.WriteLine("-\\n");
                    continue;
                }

                if (!recordOpen)
                {
                    theWriter.Write('-');
                    recordOpen = true;
                }
                switch (c)
                {
                    case '\t':
                        theWriter.Write("\\t");
                        break;
                    case '\\':
                        theWriter.Write("\\\\");
                        break;
                    default:
                        theWriter.Write(c);
                        break;
                }
            }
            if (recordOpen)
            {
                theWriter.WriteLine();
            }
        }

        /// <summary>Starts a processing-instruction record: "?" + target + space.</summary>
        public void PITarget(char[] buff, int offset, int length)
        {
            theWriter.Write('?');
            theWriter.Write(buff, offset, length);
            theWriter.Write(' ');
        }

        /// <summary>Writes the processing-instruction data and ends the record.</summary>
        public void PI(char[] buff, int offset, int length)
        {
            theWriter.Write(buff, offset, length);
            theWriter.WriteLine();
        }

        /// <summary>End of a start tag: produces no output.</summary>
        public void STagC(char[] buff, int offset, int length)
        {
            // theWriter.WriteLine("!"); // FIXME
        }

        /// <summary>End of an empty-element start tag: writes a "!" line.</summary>
        public void STagE(char[] buff, int offset, int length)
        {
            theWriter.WriteLine("!"); // FIXME
        }

        // SAX ContentHandler implementation

        /// <summary>SAX character data; written the same way as <see cref="PCDATA"/>.</summary>
        public void Characters(char[] buff, int offset, int length)
        {
            PCDATA(buff, offset, length);
        }

        /// <summary>End of the SAX document: closes the underlying writer.</summary>
        public void EndDocument()
        {
            theWriter.Close();
        }

        /// <summary>
        /// Writes an end-tag record, using <paramref name="qname"/> or, when that is
        /// null or empty, <paramref name="localname"/>.
        /// </summary>
        public void EndElement(string uri, string localname, string qname)
        {
            theWriter.Write(')');
            theWriter.WriteLine(string.IsNullOrEmpty(qname) ? localname : qname);
        }

        public void EndPrefixMapping(string prefix)
        {
        }

        /// <summary>Ignorable whitespace is written like any other character data.</summary>
        public void IgnorableWhitespace(char[] buff, int offset, int length)
        {
            Characters(buff, offset, length);
        }

        /// <summary>Writes a complete processing-instruction record: "?target data".</summary>
        public void ProcessingInstruction(string target, string data)
        {
            theWriter.Write('?');
            theWriter.Write(target);
            theWriter.Write(' ');
            theWriter.WriteLine(data);
        }

        public void SetDocumentLocator(ILocator locator)
        {
        }

        public void SkippedEntity(string name)
        {
        }

        public void StartDocument()
        {
        }

        /// <summary>
        /// Writes a start-tag record followed by one "A" record per attribute.
        /// For the element and for each attribute the qualified name is used,
        /// falling back to the local name when the qualified name is null or empty.
        /// </summary>
        public void StartElement(string uri, string localname, string qname, IAttributes atts)
        {
            theWriter.Write('(');
            theWriter.WriteLine(string.IsNullOrEmpty(qname) ? localname : qname);

            int count = atts.Length;
            for (int i = 0; i < count; i++)
            {
                string attName = atts.GetQName(i);
                if (string.IsNullOrEmpty(attName))
                {
                    attName = atts.GetLocalName(i);
                }
                theWriter.Write('A');
                // theWriter.Write(atts.getType(i)); // DEBUG
                theWriter.Write(attName);
                theWriter.Write(' ');
                theWriter.WriteLine(atts.GetValue(i));
            }
        }

        public void StartPrefixMapping(string prefix, string uri)
        {
        }

        // LexicalHandler implementation

        /// <summary>SAX comment event; forwarded to <see cref="Cmnt"/>, which writes nothing.</summary>
        public void Comment(char[] ch, int start, int length)
        {
            Cmnt(ch, start, length);
        }

        public void EndCDATA()
        {
        }

        public void EndDTD()
        {
        }

        public void EndEntity(string name)
        {
        }

        public void StartCDATA()
        {
        }

        public void StartDTD(string name, string publicId, string systemId)
        {
        }

        public void StartEntity(string name)
        {
        }
    }
}
// http://git-wip-us.apache.org/repos/asf/lucenenet/blob/198e5868/src/Lucene.Net.Benchmark/Support/TagSoup/Parser.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Benchmark/Support/TagSoup/Parser.cs b/src/Lucene.Net.Benchmark/Support/TagSoup/Parser.cs new file mode 100644 index 0000000..a0a5463 --- /dev/null +++ b/src/Lucene.Net.Benchmark/Support/TagSoup/Parser.cs @@ -0,0 +1,1484 @@ +// This file is part of TagSoup and is Copyright 2002-2008 by John Cowan. +// +// TagSoup is licensed under the Apache License, +// Version 2.0. You may obtain a copy of this license at +// http://www.apache.org/licenses/LICENSE-2.0 . You may also have +// additional legal rights not granted by this license. +// +// TagSoup is distributed in the hope that it will be useful, but +// unless required by applicable law or agreed to in writing, TagSoup +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS +// OF ANY KIND, either express or implied; not even the implied warranty +// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+// +// +// The TagSoup parser + +using Lucene.Net.Support; +using Sax; +using Sax.Ext; +using Sax.Helpers; +using System; +using System.Collections; +using System.Collections.Generic; +using System.IO; +using System.Text; + +namespace TagSoup +{ + /// <summary> + /// The SAX parser class. + /// </summary> + public class Parser : DefaultHandler, IScanHandler, IXMLReader, ILexicalHandler + { + // XMLReader implementation + + private IContentHandler theContentHandler; + private ILexicalHandler theLexicalHandler; + private IDTDHandler theDTDHandler; + private IErrorHandler theErrorHandler; + private IEntityResolver theEntityResolver; + private Schema theSchema; + private IScanner theScanner; + private IAutoDetector theAutoDetector; + + // Default values for feature flags + + private const bool DEFAULT_NAMESPACES = true; + private const bool DEFAULT_IGNORE_BOGONS = false; + private const bool DEFAULT_BOGONS_EMPTY = false; + private const bool DEFAULT_ROOT_BOGONS = true; + private const bool DEFAULT_DEFAULT_ATTRIBUTES = true; + private const bool DEFAULT_TRANSLATE_COLONS = false; + private const bool DEFAULT_RESTART_ELEMENTS = true; + private const bool DEFAULT_IGNORABLE_WHITESPACE = false; + private const bool DEFAULT_CDATA_ELEMENTS = true; + + // Feature flags. + + private bool namespaces = DEFAULT_NAMESPACES; + private bool ignoreBogons = DEFAULT_IGNORE_BOGONS; + private bool bogonsEmpty = DEFAULT_BOGONS_EMPTY; + private bool rootBogons = DEFAULT_ROOT_BOGONS; + private bool defaultAttributes = DEFAULT_DEFAULT_ATTRIBUTES; + private bool translateColons = DEFAULT_TRANSLATE_COLONS; + private bool restartElements = DEFAULT_RESTART_ELEMENTS; + private bool ignorableWhitespace = DEFAULT_IGNORABLE_WHITESPACE; + private bool cDataElements = DEFAULT_CDATA_ELEMENTS; + + /// <summary> + /// A value of "true" indicates namespace URIs and unprefixed local + /// names for element and attribute names will be available. 
+ /// </summary> + public const string NAMESPACES_FEATURE = "http://xml.org/sax/features/namespaces"; + + /// <summary> + /// A value of "true" indicates that XML qualified names (with prefixes) + /// and attributes (including xmlns* attributes) will be available. + /// We don't support this value. + /// </summary> + public const string NAMESPACE_PREFIXES_FEATURE = "http://xml.org/sax/features/namespace-prefixes"; + + /// <summary> + /// Reports whether this parser processes external general entities + /// (it doesn't). + /// </summary> + public const string EXTERNAL_GENERAL_ENTITIES_FEATURE = "http://xml.org/sax/features/external-general-entities"; + + /// <summary> + /// Reports whether this parser processes external parameter entities + /// (it doesn't). + /// </summary> + public const string EXTERNAL_PARAMETER_ENTITIES_FEATURE = "http://xml.org/sax/features/external-parameter-entities"; + + /// <summary> + /// May be examined only during a parse, after the startDocument() + /// callback has been completed; read-only. The value is true if + /// the document specified standalone="yes" in its XML declaration, + /// and otherwise is false. (It's always false.) + /// </summary> + public const string IS_STANDALONE_FEATURE = "http://xml.org/sax/features/is-standalone"; + + /// <summary> + /// A value of "true" indicates that the LexicalHandler will report + /// the beginning and end of parameter entities (it won't). + /// </summary> + public const string LEXICAL_HANDLER_PARAMETER_ENTITIES_FEATURE = + "http://xml.org/sax/features/lexical-handler/parameter-entities"; + + /// <summary> + /// A value of "true" indicates that system IDs in declarations will + /// be absolutized (relative to their base URIs) before reporting. + /// (This returns true but doesn't actually do anything.) 
+ /// </summary> + public const string RESOLVE_DTD_URIS_FEATURE = "http://xml.org/sax/features/resolve-dtd-uris"; + + /// <summary> + /// Has a value of "true" if all XML names (for elements, + /// prefixes, attributes, entities, notations, and local + /// names), as well as Namespace URIs, will have been interned + /// using <see cref="string.Intern" />. This supports fast testing of + /// equality/inequality against string constants, rather than forcing + /// slower calls to <see cref="string.Equals(object)" />. (We always intern.) + /// </summary> + public const string STRING_INTERNING_FEATURE = "http://xml.org/sax/features/string-interning"; + + /// <summary> + /// Returns "true" if the Attributes objects passed by this + /// parser in <see cref="IContentHandler.StartElement" /> implement the + /// <see cref="Sax.Net.Ext.IAttributes2" /> interface. (They don't.) + /// </summary> + public const string USE_ATTRIBUTES2_FEATURE = "http://xml.org/sax/features/use-attributes2"; + + /// <summary> + /// Returns "true" if the Locator objects passed by this + /// parser in <see cref="IContentHandler.SetDocumentLocator" /> implement the + /// <see cref="Sax.Net.Ext.ILocator2" /> interface. (They don't.) + /// </summary> + public const string USE_LOCATOR2_FEATURE = "http://xml.org/sax/features/use-locator2"; + /// <summary> + /// Returns "true" if, when setEntityResolver is given an object + /// implementing the <see cref="Sax.Net.Ext.IEntityResolver2" /> interface, + /// those new methods will be used. (They won't be.) + /// </summary> + public const string USE_ENTITY_RESOLVER2_FEATURE = "http://xml.org/sax/features/use-entity-resolver2"; + + /// <summary> + /// Controls whether the parser is reporting all validity errors + /// (We don't report any validity errors.) 
+ /// </summary> + public const string VALIDATION_FEATURE = "http://xml.org/sax/features/validation"; + + /// <summary> + /// Controls whether the parser reports Unicode normalization + /// errors as described in section 2.13 and Appendix B of the XML + /// 1.1 Recommendation. (We don't normalize.) + /// </summary> + public const string UNICODE_NORMALIZATION_CHECKING_FEATURE = + "http://xml.org/sax/features/unicode-normalization-checking"; + + /// <summary> + /// Controls whether, when the namespace-prefixes feature is set, + /// the parser treats namespace declaration attributes as being in + /// the http://www.w3.org/2000/xmlns/ namespace. (It doesn't.) + /// </summary> + public const string XMLNS_URIS_FEATURE = "http://xml.org/sax/features/xmlns-uris"; + + /// <summary> + /// Returns <c>true</c> if the parser supports both XML 1.1 and XML 1.0. + /// (Always <c>false</c>.) + /// </summary> + public const string XML11_FEATURE = "http://xml.org/sax/features/xml-1.1"; + + /// <summary> + /// A value of <c>true</c> indicates that the parser will ignore + /// unknown elements. + /// </summary> + public const string IGNORE_BOGONS_FEATURE = "http://www.ccil.org/~cowan/tagsoup/features/ignore-bogons"; + + /// <summary> + /// A value of <c>true</c> indicates that the parser will give unknown + /// elements a content model of EMPTY; a value of <c>false</c>, a + /// content model of ANY. + /// </summary> + public const string BOGONS_EMPTY_FEATURE = "http://www.ccil.org/~cowan/tagsoup/features/bogons-empty"; + + /// <summary> + /// A value of <c>true</c> indicates that the parser will allow unknown + /// elements to be the root element. + /// </summary> + public const string ROOT_BOGONS_FEATURE = "http://www.ccil.org/~cowan/tagsoup/features/root-bogons"; + + /// <summary> + /// A value of <c>true</c> indicates that the parser will return default + /// attribute values for missing attributes that have default values. 
+ /// </summary> + public const string DEFAULT_ATTRIBUTES_FEATURE = "http://www.ccil.org/~cowan/tagsoup/features/default-attributes"; + + /// <summary> + /// A value of <c>true</c> indicates that the parser will + /// translate colons into underscores in names. + /// </summary> + public const string TRANSLATE_COLONS_FEATURE = "http://www.ccil.org/~cowan/tagsoup/features/translate-colons"; + + /// <summary> + /// A value of <c>true</c> indicates that the parser will + /// attempt to restart the restartable elements. + /// </summary> + public const string RESTART_ELEMENTS_FEATURE = "http://www.ccil.org/~cowan/tagsoup/features/restart-elements"; + + /// <summary> + /// A value of "true" indicates that the parser will + /// transmit whitespace in element-only content via the SAX + /// ignorableWhitespace callback. Normally this is not done, + /// because HTML is an SGML application and SGML suppresses + /// such whitespace. + /// </summary> + public const string IGNORABLE_WHITESPACE_FEATURE = + "http://www.ccil.org/~cowan/tagsoup/features/ignorable-whitespace"; + + /// <summary> + /// A value of "true" indicates that the parser will treat CDATA + /// elements specially. Normally true, since the input is by + /// default HTML. + /// </summary> + public const string CDATA_ELEMENTS_FEATURE = "http://www.ccil.org/~cowan/tagsoup/features/cdata-elements"; + + /// <summary> + /// Used to see some syntax events that are essential in some + /// applications: comments, CDATA delimiters, selected general + /// entity inclusions, and the start and end of the DTD (and + /// declaration of document element name). The Object must implement + /// <see cref="ILexicalHandler" /> + /// </summary> + public const string LEXICAL_HANDLER_PROPERTY = "http://xml.org/sax/properties/lexical-handler"; + + /// <summary> + /// Specifies the Scanner object this Parser uses. 
+ /// </summary> + public const string SCANNER_PROPERTY = "http://www.ccil.org/~cowan/tagsoup/properties/scanner"; + + /// <summary> + /// Specifies the Schema object this Parser uses. + /// </summary> + public const string SCHEMA_PROPERTY = "http://www.ccil.org/~cowan/tagsoup/properties/schema"; + + /// <summary> + /// Specifies the AutoDetector (for encoding detection) this Parser uses. + /// </summary> + public const string AUTO_DETECTOR_PROPERTY = "http://www.ccil.org/~cowan/tagsoup/properties/auto-detector"; + + + // Due to sucky Java order of initialization issues, these + // entries are maintained separately from the initial values of + // the corresponding instance variables, but care must be taken + // to keep them in sync. + + private readonly Hashtable features = new Hashtable { + { NAMESPACES_FEATURE, DEFAULT_NAMESPACES }, + { NAMESPACE_PREFIXES_FEATURE, false }, + { EXTERNAL_GENERAL_ENTITIES_FEATURE, false }, + { EXTERNAL_PARAMETER_ENTITIES_FEATURE, false }, + { IS_STANDALONE_FEATURE, false }, + { LEXICAL_HANDLER_PARAMETER_ENTITIES_FEATURE, false }, + { RESOLVE_DTD_URIS_FEATURE, true }, + { STRING_INTERNING_FEATURE, true }, + { USE_ATTRIBUTES2_FEATURE, false }, + { USE_LOCATOR2_FEATURE, false }, + { USE_ENTITY_RESOLVER2_FEATURE, false }, + { VALIDATION_FEATURE, false }, + { XMLNS_URIS_FEATURE, false }, + { XML11_FEATURE, false }, + { IGNORE_BOGONS_FEATURE, DEFAULT_IGNORE_BOGONS }, + { BOGONS_EMPTY_FEATURE, DEFAULT_BOGONS_EMPTY }, + { ROOT_BOGONS_FEATURE, DEFAULT_ROOT_BOGONS }, + { DEFAULT_ATTRIBUTES_FEATURE, DEFAULT_DEFAULT_ATTRIBUTES }, + { TRANSLATE_COLONS_FEATURE, DEFAULT_TRANSLATE_COLONS }, + { RESTART_ELEMENTS_FEATURE, DEFAULT_RESTART_ELEMENTS }, + { IGNORABLE_WHITESPACE_FEATURE, DEFAULT_IGNORABLE_WHITESPACE }, + { CDATA_ELEMENTS_FEATURE, DEFAULT_CDATA_ELEMENTS }, + }; + + public virtual bool GetFeature(string name) + { + if (features.ContainsKey(name)) + { + return (bool)features[name]; + } + throw new SAXNotRecognizedException("Unknown feature " 
+ name); + } + + public virtual void SetFeature(string name, bool value) + { + if (false == features.ContainsKey(name)) + { + throw new SAXNotRecognizedException("Unknown feature " + name); + } + features[name] = value; + + if (name.Equals(NAMESPACES_FEATURE)) + { + namespaces = value; + } + else if (name.Equals(IGNORE_BOGONS_FEATURE)) + { + ignoreBogons = value; + } + else if (name.Equals(BOGONS_EMPTY_FEATURE)) + { + bogonsEmpty = value; + } + else if (name.Equals(ROOT_BOGONS_FEATURE)) + { + rootBogons = value; + } + else if (name.Equals(DEFAULT_ATTRIBUTES_FEATURE)) + { + defaultAttributes = value; + } + else if (name.Equals(TRANSLATE_COLONS_FEATURE)) + { + translateColons = value; + } + else if (name.Equals(RESTART_ELEMENTS_FEATURE)) + { + restartElements = value; + } + else if (name.Equals(IGNORABLE_WHITESPACE_FEATURE)) + { + ignorableWhitespace = value; + } + else if (name.Equals(CDATA_ELEMENTS_FEATURE)) + { + cDataElements = value; + } + } + + public virtual object GetProperty(string name) + { + if (name.Equals(LEXICAL_HANDLER_PROPERTY)) + { + return theLexicalHandler == this ? 
null : theLexicalHandler; + } + if (name.Equals(SCANNER_PROPERTY)) + { + return theScanner; + } + if (name.Equals(SCHEMA_PROPERTY)) + { + return theSchema; + } + if (name.Equals(AUTO_DETECTOR_PROPERTY)) + { + return theAutoDetector; + } + throw new SAXNotRecognizedException("Unknown property " + name); + } + + public virtual void SetProperty(string name, object value) + { + if (name.Equals(LEXICAL_HANDLER_PROPERTY)) + { + if (value == null) + { + theLexicalHandler = this; + } + else + { + var handler = value as ILexicalHandler; + if (handler != null) + { + theLexicalHandler = handler; + } + else + { + throw new SAXNotSupportedException("Your lexical handler is not a ILexicalHandler"); + } + } + } + else if (name.Equals(SCANNER_PROPERTY)) + { + var scanner = value as IScanner; + if (scanner != null) + { + theScanner = scanner; + } + else + { + throw new SAXNotSupportedException("Your scanner is not a IScanner"); + } + } + else if (name.Equals(SCHEMA_PROPERTY)) + { + var schema = value as Schema; + if (schema != null) + { + theSchema = schema; + } + else + { + throw new SAXNotSupportedException("Your schema is not a Schema"); + } + } + else if (name.Equals(AUTO_DETECTOR_PROPERTY)) + { + var detector = value as IAutoDetector; + if (detector != null) + { + theAutoDetector = detector; + } + else + { + throw new SAXNotSupportedException("Your auto-detector is not an IAutoDetector"); + } + } + else + { + throw new SAXNotRecognizedException("Unknown property " + name); + } + } + + public virtual IEntityResolver EntityResolver + { + get { return theEntityResolver == this ? null : theEntityResolver; } + set { theEntityResolver = value ?? this; } + } + + public virtual IDTDHandler DTDHandler + { + get { return theDTDHandler == this ? null : theDTDHandler; } + set { theDTDHandler = value ?? this; } + } + + public virtual IContentHandler ContentHandler + { + get { return theContentHandler == this ? null : theContentHandler; } + set { theContentHandler = value ?? 
this; } + } + + public virtual IErrorHandler ErrorHandler + { + get { return theErrorHandler == this ? null : theErrorHandler; } + set { theErrorHandler = value ?? this; } + } + + public virtual void Parse(InputSource input) + { + Setup(); + TextReader r = GetReader(input); + theContentHandler.StartDocument(); + theScanner.ResetDocumentLocator(input.PublicId, input.SystemId); + var locator = theScanner as ILocator; + if (locator != null) + { + theContentHandler.SetDocumentLocator(locator); + } + if (!(theSchema.Uri.Equals(""))) + { + theContentHandler.StartPrefixMapping(theSchema.Prefix, theSchema.Uri); + } + theScanner.Scan(r, this); + } + + public virtual void Parse(string systemid) + { + Parse(new InputSource(systemid)); + } + + // Sets up instance variables that haven't been set by setFeature + private void Setup() + { + if (theSchema == null) + { + theSchema = new HTMLSchema(); + } + if (theScanner == null) + { + theScanner = new HTMLScanner(); + } + if (theAutoDetector == null) + { + theAutoDetector = new AutoDetectorDelegate(stream => new StreamReader(stream)); + } + theStack = new Element(theSchema.GetElementType("<root>"), defaultAttributes); + thePCDATA = new Element(theSchema.GetElementType("<pcdata>"), defaultAttributes); + theNewElement = null; + theAttributeName = null; + thePITarget = null; + theSaved = null; + theEntity = 0; + virginStack = true; + theDoctypeName = theDoctypePublicId = theDoctypeSystemId = null; + } + + /// <summary> + /// Return a <see cref="TextReader"/> based on the contents of an <see cref="InputSource"/> + /// Buffer the Stream + /// </summary> + /// <param name="s"></param> + /// <returns></returns> + private TextReader GetReader(InputSource s) + { + TextReader r = s.TextReader; + Stream i = s.Stream; + Encoding encoding = s.Encoding; + string publicid = s.PublicId; + string systemid = s.SystemId; + if (r == null) + { + if (i == null) + { + i = GetInputStream(publicid, systemid); + } + if (!(i is BufferedStream)) + { + i = new 
BufferedStream(i); + } + if (encoding == null) + { + r = theAutoDetector.AutoDetectingReader(i); + } + else + { + //try { + //TODO: Safe? + r = new StreamReader(i, encoding); + // } + //catch (UnsupportedEncodingException e) { + // r = new StreamReader(i); + // } + } + } + // r = new BufferedReader(r); + return r; + } + + /// <summary> + /// Get an Stream based on a publicid and a systemid + /// We don't process publicids (who uses them anyhow?) + /// </summary> + /// <param name="publicid"></param> + /// <param name="systemid"></param> + /// <returns></returns> + private Stream GetInputStream(string publicid, string systemid) + { + var basis = new Uri("file://" + Environment.CurrentDirectory + Path.DirectorySeparatorChar); + var url = new Uri(basis, systemid); + return new FileStream(url.LocalPath, FileMode.Open, FileAccess.Read, FileShare.Read); + } + + // ScanHandler implementation + + private Element theNewElement; + private string theAttributeName; + private bool theDoctypeIsPresent; + private string theDoctypePublicId; + private string theDoctypeSystemId; + private string theDoctypeName; + private string thePITarget; + private Element theStack; + private Element theSaved; + private Element thePCDATA; + private int theEntity; // needs to support chars past U+FFFF + + + public virtual void Adup(char[] buff, int offset, int length) + { + if (theNewElement == null || theAttributeName == null) + { + return; + } + theNewElement.SetAttribute(theAttributeName, null, theAttributeName); + theAttributeName = null; + } + + public virtual void Aname(char[] buff, int offset, int length) + { + if (theNewElement == null) + { + return; + } + // Currently we don't rely on Schema to canonicalize + // attribute names. 
+ theAttributeName = MakeName(buff, offset, length).ToLowerInvariant(); + // System.err.println("%% Attribute name " + theAttributeName); + } + + public virtual void Aval(char[] buff, int offset, int length) + { + if (theNewElement == null || theAttributeName == null) + { + return; + } + var value = new string(buff, offset, length); + // System.err.println("%% Attribute value [" + value + "]"); + value = ExpandEntities(value); + theNewElement.SetAttribute(theAttributeName, null, value); + theAttributeName = null; + // System.err.println("%% Aval done"); + } + + /// <summary> + /// Expand entity references in attribute values selectively. + /// Currently we expand a reference iff it is properly terminated + /// with a semicolon. + /// </summary> + /// <param name="src"></param> + /// <returns></returns> + private string ExpandEntities(string src) + { + int refStart = -1; + int len = src.Length; + var dst = new char[len]; + int dstlen = 0; + for (int i = 0; i < len; i++) + { + char ch = src[i]; + dst[dstlen++] = ch; + // System.err.print("i = " + i + ", d = " + dstlen + ", ch = [" + ch + "] "); + if (ch == '&' && refStart == -1) + { + // start of a ref excluding & + refStart = dstlen; + // System.err.println("start of ref"); + } + else if (refStart == -1) + { + // not in a ref + // System.err.println("not in ref"); + } + else if (char.IsLetter(ch) || char.IsDigit(ch) || ch == '#') + { + // valid entity char + // System.err.println("valid"); + } + else if (ch == ';') + { + // properly terminated ref + // System.err.print("got [" + new string(dst, refStart, dstlen-refStart-1) + "]"); + int ent = LookupEntity(dst, refStart, dstlen - refStart - 1); + // System.err.println(" = " + ent); + if (ent > 0xFFFF) + { + ent -= 0x10000; + dst[refStart - 1] = (char)((ent >> 10) + 0xD800); + dst[refStart] = (char)((ent & 0x3FF) + 0xDC00); + dstlen = refStart + 1; + } + else if (ent != 0) + { + dst[refStart - 1] = (char)ent; + dstlen = refStart; + } + refStart = -1; + } + else + { + 
/// <summary>
/// Resolves the text of an entity or character reference (without the
/// leading '&amp;' and the trailing ';') to a character code.
/// Numeric references ("#123", "#x7B", "#X7B") are parsed here;
/// every other name is looked up in the schema.
/// </summary>
/// <param name="buff">Buffer containing the reference text.</param>
/// <param name="offset">Index of the first character of the reference.</param>
/// <param name="length">Number of characters in the reference.</param>
/// <returns>
/// The character code, or 0 when the reference is empty, is a malformed or
/// out-of-range number, or is not known to the schema.
/// </returns>
private int LookupEntity(char[] buff, int offset, int length)
{
    if (length < 1)
    {
        return 0;
    }
    if (buff[offset] != '#')
    {
        // Named entity: defer to the schema.
        return theSchema.GetEntity(new string(buff, offset, length));
    }

    bool isHex = length > 1 && (buff[offset + 1] == 'x' || buff[offset + 1] == 'X');
    if (isHex)
    {
        // Skip the "#x" / "#X" prefix.
        return ParseCharacterReference(new string(buff, offset + 2, length - 2), 16);
    }
    // Skip the "#" prefix.
    return ParseCharacterReference(new string(buff, offset + 1, length - 1), 10);
}

/// <summary>
/// Parses the digits of a numeric character reference, mapping every
/// parse failure to 0 so that malformed markup never aborts the parse.
/// </summary>
/// <param name="digits">The digits following "#" or "#x"; may be empty.</param>
/// <param name="radix">10 or 16.</param>
/// <returns>The parsed value, or 0 if <paramref name="digits"/> cannot be parsed.</returns>
private static int ParseCharacterReference(string digits, int radix)
{
    try
    {
        return Convert.ToInt32(digits, radix);
    }
    catch (FormatException)
    {
        // Non-digit characters, e.g. "&#12ab;".
        return 0;
    }
    catch (OverflowException)
    {
        // Value does not fit in an Int32, e.g. "&#99999999999;".
        return 0;
    }
    catch (ArgumentException)
    {
        // Convert.ToInt32(string, int) throws ArgumentOutOfRangeException
        // for an empty digit string, e.g. "&#;" or "&#x;".
        return 0;
    }
}
/// <summary>
/// Handles an end-tag by closing the matching open element on the stack.
/// An empty name (<paramref name="length"/> of 0) closes the element on top
/// of the stack. End-tags whose name is unknown to the schema, that match
/// nothing on the stack, or that match one of the two bottom-most stack
/// entries are ignored.
/// </summary>
/// <param name="buff">Buffer containing the end-tag name.</param>
/// <param name="offset">Index of the first character of the name.</param>
/// <param name="length">Number of characters in the name; 0 for an empty end-tag.</param>
public virtual void ETagBasic(char[] buff, int offset, int length)
{
    theNewElement = null;

    string targetName;
    if (length == 0)
    {
        targetName = theStack.Name;
    }
    else
    {
        // Let the schema canonicalize the case of the name.
        ElementType declared = theSchema.GetElementType(MakeName(buff, offset, length));
        if (declared == null)
        {
            return; // mysterious end-tag
        }
        targetName = declared.Name;
    }

    // Walk down the stack to the matching element, remembering whether an
    // F_NOFORCE element lies above it.
    bool crossedNoforce = false;
    Element match = theStack;
    while (match != null && !match.Name.Equals(targetName))
    {
        if ((match.Flags & Schema.F_NOFORCE) != 0)
        {
            crossedNoforce = true;
        }
        match = match.Next;
    }

    // No match, or the match is one of the two bottom-most entries: ignore.
    if (match == null || match.Next == null || match.Next.Next == null)
    {
        return;
    }

    if (crossedNoforce)
    {
        // Inside an F_NOFORCE element: only preclose the matching element.
        match.Preclose();
    }
    else
    {
        // Restartably pop everything above the match, then the match itself.
        while (theStack != match)
        {
            RestartablyPop();
        }
        Pop();
    }

    // Pop any preclosed elements that are now on top.
    while (theStack.IsPreclosed)
    {
        Pop();
    }
    Restart(null);
}
/// <summary>
/// Extracts the prefix of a qualified name: the text before the first ':'.
/// </summary>
/// <param name="name">The qualified name to inspect.</param>
/// <returns>The prefix, or the empty string when the name contains no colon.</returns>
private static string PrefixOf(string name)
{
    int colon = name.IndexOf(':');
    return colon == -1 ? "" : name.Substring(0, colon);
}
System.err.print("%% Testing " + prefix + " and " + namespace + " for foreignness -- "); + bool foreign = !(prefix.Equals("") || ns.Equals("") || ns.Equals(theSchema.Uri)); + // System.err.println(foreign); + return foreign; + } + + /// <summary> + /// Parsing the complete XML Document Type Definition is way too complex, + /// but for many simple cases we can extract something useful from it. + /// doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>' + /// DeclSep ::= PEReference | S + /// intSubset ::= (markupdecl | DeclSep)* + /// markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment + /// ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral + /// </summary> + /// <param name="buff"></param> + /// <param name="offset"></param> + /// <param name="length"></param> + public virtual void Decl(char[] buff, int offset, int length) + { + var s = new string(buff, offset, length); + string name = null; + string systemid = null; + string publicid = null; + string[] v = Split(s); + if (v.Length > 0 && "DOCTYPE".Equals(v[0], StringComparison.OrdinalIgnoreCase)) + { + if (theDoctypeIsPresent) + { + return; // one doctype only! 
/// <summary>
/// Strips one pair of matching surrounding quotes (single or double)
/// from <paramref name="value"/>.
/// </summary>
/// <param name="value">The possibly quoted string; may be null.</param>
/// <returns>
/// The text between the quotes when the first and last characters are the
/// same quote character; otherwise <paramref name="value"/> unchanged
/// (including null, the empty string and a lone quote character).
/// </returns>
private static string TrimQuotes(string value)
{
    // Fewer than two characters cannot form an opening and a closing quote.
    if (value == null || value.Length < 2)
    {
        return value;
    }

    char first = value[0];
    char last = value[value.Length - 1];
    if (first == last && (first == '\'' || first == '"'))
    {
        // Substring takes (startIndex, count): drop one character at each end.
        return value.Substring(1, value.Length - 2);
    }
    return value;
}
/// <summary>
/// Splits the supplied string into words or phrases separated by whitespace.
/// A phrase enclosed in single or double quotes is kept together, quotes
/// included. A quote preceded by a backslash does not open or close a phrase.
/// </summary>
/// <param name="val">The text to split; leading and trailing whitespace is ignored.</param>
/// <returns>The words and phrases in order; an empty array for blank input.</returns>
private static string[] Split(string val)
{
    val = val.Trim();
    if (val.Length == 0)
    {
        return new string[0];
    }

    var words = new List<string>();
    int wordStart = 0;       // -1 while between words
    bool inSingle = false;   // inside '...'
    bool inDouble = false;   // inside "..."
    char previous = (char)0;

    for (int pos = 0; pos < val.Length; pos++)
    {
        char current = val[pos];
        bool escaped = previous == '\\';

        if (current == '\'' && !inDouble && !escaped)
        {
            inSingle = !inSingle;
            if (wordStart < 0)
            {
                wordStart = pos;
            }
        }
        else if (current == '\"' && !inSingle && !escaped)
        {
            inDouble = !inDouble;
            if (wordStart < 0)
            {
                wordStart = pos;
            }
        }
        else if (!inSingle && !inDouble)
        {
            if (char.IsWhiteSpace(current))
            {
                // Whitespace outside quotes ends the current word, if any.
                if (wordStart >= 0)
                {
                    words.Add(val.Substring(wordStart, pos - wordStart));
                }
                wordStart = -1;
            }
            else if (wordStart < 0)
            {
                wordStart = pos;
            }
        }
        previous = current;
    }

    // The trimmed input ends inside a word, so flush it.
    words.Add(val.Substring(wordStart));
    return words.ToArray();
}
/// <summary>
/// Receives the data part of a processing instruction and forwards it,
/// together with the previously reported target, to the content handler.
/// The call is ignored while an element is being built or when no target
/// is pending; a target named "xml" (case-insensitive) is dropped.
/// </summary>
/// <param name="buff">Buffer containing the processing-instruction data.</param>
/// <param name="offset">Index of the first character of the data.</param>
/// <param name="length">Number of characters in the data.</param>
public virtual void PI(char[] buff, int offset, int length)
{
    if (theNewElement != null || thePITarget == null)
    {
        return;
    }
    if ("xml".Equals(thePITarget, StringComparison.OrdinalIgnoreCase))
    {
        return;
    }

    // Remove a trailing '?'. The last character of the data is at
    // offset + length - 1, not at length - 1.
    if (length > 0 && buff[offset + length - 1] == '?')
    {
        length--;
    }
    theContentHandler.ProcessingInstruction(thePITarget, new string(buff, offset, length));
    thePITarget = null;
}
/// <summary>
/// Builds a valid XML name from the given characters.
/// Letters and '_' are copied as is; digits, '-' and '.' are copied but
/// prefixed with '_' when they would start a name part; the first ':' is
/// kept (or replaced by '_' when colon translation is on) and starts a new
/// name part; every other character is dropped. An empty result, or one
/// ending in ':', gets a trailing '_'.
/// Case is not changed: the schema canonicalizes case.
/// </summary>
/// <param name="buff">Buffer containing the raw name.</param>
/// <param name="offset">Index of the first character of the raw name.</param>
/// <param name="length">Number of characters in the raw name.</param>
/// <returns>The interned, cleaned-up name.</returns>
private string MakeName(char[] buff, int offset, int length)
{
    var name = new StringBuilder(length + 2);
    bool colonUsed = false;
    bool atPartStart = true;
    int end = offset + length;

    for (int i = offset; i < end; i++)
    {
        char c = buff[i];
        if (char.IsLetter(c) || c == '_')
        {
            name.Append(c);
            atPartStart = false;
        }
        else if (char.IsDigit(c) || c == '-' || c == '.')
        {
            // Not allowed as the first character of a name part.
            if (atPartStart)
            {
                name.Append('_');
            }
            name.Append(c);
            atPartStart = false;
        }
        else if (c == ':' && !colonUsed)
        {
            // Only the first colon is honoured; an empty prefix becomes "_".
            if (atPartStart)
            {
                name.Append('_');
            }
            name.Append(translateColons ? '_' : c);
            colonUsed = true;
            atPartStart = true;
        }
    }

    if (name.Length == 0 || name[name.Length - 1] == ':')
    {
        name.Append('_');
    }
    return name.ToString().Intern();
}
/// <summary>
/// An interface that Scanners use to report events in the input stream.
/// </summary>
/// <remarks>
/// Every reporting method receives the reported text as a slice of a
/// character buffer: <c>buff</c> is the buffer, <c>offset</c> the index of
/// the first character of the text and <c>length</c> the number of characters.
/// </remarks>
public interface IScanHandler
{
    /// <summary>
    /// Reports an attribute name without a value.
    /// </summary>
    void Adup(char[] buff, int offset, int length);

    /// <summary>
    /// Reports an attribute name; a value will follow.
    /// </summary>
    void Aname(char[] buff, int offset, int length);

    /// <summary>
    /// Reports an attribute value.
    /// </summary>
    void Aval(char[] buff, int offset, int length);

    /// <summary>
    /// Reports the content of a CDATA section (not a CDATA element).
    /// </summary>
    void CDSect(char[] buff, int offset, int length);

    /// <summary>
    /// Reports a &lt;!....&gt; declaration - typically a DOCTYPE.
    /// </summary>
    void Decl(char[] buff, int offset, int length);

    /// <summary>
    /// Reports an entity reference or character reference.
    /// </summary>
    void Entity(char[] buff, int offset, int length);

    /// <summary>
    /// Reports EOF.
    /// </summary>
    void EOF(char[] buff, int offset, int length);

    /// <summary>
    /// Reports an end-tag.
    /// </summary>
    void ETag(char[] buff, int offset, int length);

    /// <summary>
    /// Reports the general identifier (element type name) of a start-tag.
    /// </summary>
    void GI(char[] buff, int offset, int length);

    /// <summary>
    /// Reports character content.
    /// </summary>
    void PCDATA(char[] buff, int offset, int length);

    /// <summary>
    /// Reports the data part of a processing instruction.
    /// </summary>
    void PI(char[] buff, int offset, int length);

    /// <summary>
    /// Reports the target part of a processing instruction.
    /// </summary>
    void PITarget(char[] buff, int offset, int length);

    /// <summary>
    /// Reports the close of a start-tag.
    /// </summary>
    void STagC(char[] buff, int offset, int length);

    /// <summary>
    /// Reports the close of an empty-tag.
    /// </summary>
    void STagE(char[] buff, int offset, int length);

    /// <summary>
    /// Reports a comment.
    /// </summary>
    void Cmnt(char[] buff, int offset, int length);

    /// <summary>
    /// Returns the value of the last entity or character reference reported.
    /// </summary>
    /// <returns>The value of the last entity or character reference reported.</returns>
    int GetEntity();
}
/// <summary>
/// Abstract class representing a TSSL schema.
/// Actual TSSL schemas are compiled into concrete subclasses of this class.
/// </summary>
/// <remarks>
/// Element type names are stored and looked up case-insensitively, using the
/// invariant culture so that lookups do not depend on the current culture
/// (for example "IMG" must match "img" under a Turkish culture as well).
/// Entity names are case-sensitive.
/// </remarks>
public abstract class Schema
{
    // Content-model bit vectors (see the model/memberOf parameters of ElementType).
    public const int M_ANY = -1;//0xFFFFFFFF; all bits set
    public const int M_EMPTY = 0;
    public const int M_PCDATA = 1 << 30;
    public const int M_ROOT = 1 << 31;

    // Element flags (see the flags parameter of ElementType).
    public const int F_RESTART = 1;
    public const int F_CDATA = 2;
    public const int F_NOFORCE = 4;

    private readonly Hashtable theEntities = new Hashtable(); // string -> boxed int
    private readonly Hashtable theElementTypes = new Hashtable(); // lower-cased string -> ElementType

    private string theURI = "";
    private string thePrefix = "";
    private ElementType theRoot;

    /// <summary>
    /// Add or replace an element type for this schema.
    /// An element whose <paramref name="memberOf"/> is exactly
    /// <see cref="M_ROOT"/> becomes the root element type.
    /// </summary>
    /// <param name="name">Name (Qname) of the element</param>
    /// <param name="model">Models of the element's content as a vector of bits</param>
    /// <param name="memberOf">Models the element is a member of as a vector of bits</param>
    /// <param name="flags">Flags for the element</param>
    public virtual void ElementType(string name, int model, int memberOf, int flags)
    {
        var e = new ElementType(name, model, memberOf, flags, this);
        theElementTypes[name.ToLowerInvariant()] = e;
        if (memberOf == M_ROOT)
        {
            theRoot = e;
        }
    }

    /// <summary>
    /// Gets the root element of this schema.
    /// </summary>
    public virtual ElementType RootElementType
    {
        get { return theRoot; }
    }

    /// <summary>
    /// Add or replace a default attribute for an element type in this schema.
    /// </summary>
    /// <param name="elemName">Name (Qname) of the element type</param>
    /// <param name="attrName">Name (Qname) of the attribute</param>
    /// <param name="type">Type of the attribute</param>
    /// <param name="value">Default value of the attribute; null if no default</param>
    /// <exception cref="ArgumentException">
    /// <paramref name="elemName"/> is not a known element type.
    /// </exception>
    public virtual void Attribute(string elemName, string attrName, string type, string value)
    {
        ElementType e = GetElementType(elemName);
        if (e == null)
        {
            throw new ArgumentException("Attribute " + attrName + " specified for unknown element type " + elemName);
        }
        e.SetAttribute(attrName, type, value);
    }

    /// <summary>
    /// Specify natural parent of an element in this schema.
    /// </summary>
    /// <param name="name">Name of the child element</param>
    /// <param name="parentName">Name of the parent element</param>
    /// <exception cref="ArgumentException">
    /// Either <paramref name="name"/> or <paramref name="parentName"/> is not
    /// a known element type.
    /// </exception>
    public virtual void Parent(string name, string parentName)
    {
        ElementType child = GetElementType(name);
        ElementType parent = GetElementType(parentName);
        if (child == null)
        {
            throw new ArgumentException("No child " + name + " for parent " + parentName);
        }
        if (parent == null)
        {
            throw new ArgumentException("No parent " + parentName + " for child " + name);
        }
        child.Parent = parent;
    }

    /// <summary>
    /// Add to or replace a character entity in this schema.
    /// </summary>
    /// <param name="name">Name of the entity</param>
    /// <param name="value">Value of the entity</param>
    public virtual void Entity(string name, int value)
    {
        theEntities[name] = value;
    }

    /// <summary>
    /// Get an <see cref="TagSoup.ElementType"/> by name (case-insensitive).
    /// </summary>
    /// <param name="name">Name (Qname) of the element type</param>
    /// <returns>The corresponding <see cref="TagSoup.ElementType"/>, or null if none</returns>
    public virtual ElementType GetElementType(string name)
    {
        return (ElementType)(theElementTypes[name.ToLowerInvariant()]);
    }

    /// <summary>
    /// Get an entity value by name.
    /// </summary>
    /// <param name="name">Name of the entity</param>
    /// <returns>The corresponding character, or 0 if none</returns>
    public virtual int GetEntity(string name)
    {
        // The Hashtable indexer yields null for a missing key, so a single
        // lookup is enough (stored values are boxed ints, never null).
        object value = theEntities[name];
        return value != null ? (int)value : 0;
    }

    /// <summary>
    /// Gets or sets the URI (namespace name) of this schema.
    /// </summary>
    public virtual string Uri
    {
        get { return theURI; }
        set { theURI = value; }
    }

    /// <summary>
    /// Gets or sets the prefix of this schema.
    /// </summary>
    public virtual string Prefix
    {
        get { return thePrefix; }
        set { thePrefix = value; }
    }
}