http://git-wip-us.apache.org/repos/asf/lucenenet/blob/198e5868/src/Lucene.Net.Benchmark/Support/Sax/SAXParseException.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/Support/Sax/SAXParseException.cs b/src/Lucene.Net.Benchmark/Support/Sax/SAXParseException.cs
new file mode 100644
index 0000000..b7cdf64
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/Support/Sax/SAXParseException.cs
@@ -0,0 +1,269 @@
+// SAX exception class.
+// http://www.saxproject.org
+// No warranty; no copyright -- use this as you will.
+// $Id: SAXParseException.java,v 1.11 2004/04/21 13:05:02 dmegginson Exp $
+
+using System;
+#if FEATURE_SERIALIZABLE
+using System.Runtime.Serialization;
+#endif
+
+namespace Sax
+{
+    /// <summary>
+    /// Encapsulate an XML parse error or warning.
+    /// </summary>
+    /// <remarks>
+    /// <em>This module, both source code and documentation, is in the
+    /// Public Domain, and comes with <strong>NO WARRANTY</strong>.</em>
+    /// See <a href='http://www.saxproject.org'>http://www.saxproject.org</a>
+    /// for further information.
+    /// <para/>
+    /// This exception may include information for locating the error
+    /// in the original XML document, as if it came from a <see cref="ILocator"/>
+    /// object. Note that although the application
+    /// will receive a SAXParseException as the argument to the handlers
+    /// in the <see cref="IErrorHandler"/> interface,
+    /// the application is not actually required to throw the exception;
+    /// instead, it can simply read the information in it and take a
+    /// different action.
+    /// <para/>
+    /// Since this exception is a subclass of <see cref="SAXException"/>,
+    /// it inherits the ability to wrap another exception.
+    /// </remarks>
+    /// <since>SAX 1.0</since>
+    /// <author>David Megginson</author>
+    /// <version>2.0.1 (sax2r2)</version>
+    /// <seealso cref="SAXException"/>
+    /// <seealso cref="ILocator"/>
+    /// <seealso cref="IErrorHandler"/>
+#if FEATURE_SERIALIZABLE
+    [Serializable]
+#endif
+    public class SAXParseException : SAXException
+    {
+#if FEATURE_SERIALIZABLE
+        // Names of the SerializationInfo entries that carry the location fields.
+        // They are prefixed with the type name so they cannot clash with entries
+        // added by the base exception classes.
+        private const string PublicIdKey = "SAXParseException.PublicId";
+        private const string SystemIdKey = "SAXParseException.SystemId";
+        private const string LineNumberKey = "SAXParseException.LineNumber";
+        private const string ColumnNumberKey = "SAXParseException.ColumnNumber";
+#endif
+
+        //////////////////////////////////////////////////////////////////////
+        // Constructors.
+        //////////////////////////////////////////////////////////////////////
+
+        /// <summary>
+        /// Construct a new exception with no message and no location information.
+        /// </summary>
+        // LUCENENET specific for serialization
+        public SAXParseException()
+            : base()
+        {
+            // No location is known, so report "not available" (null / -1) rather
+            // than the CLR defaults, which would read as line 0, column 0.
+            Init(null, null, -1, -1);
+        }
+
+        /// <summary>
+        /// Create a new <see cref="SAXParseException"/> from a message and a <see cref="ILocator"/>.
+        /// </summary>
+        /// <remarks>
+        /// This constructor is especially useful when an application is
+        /// creating its own exception from within a <see cref="IContentHandler"/>
+        /// callback.
+        /// </remarks>
+        /// <param name="message">The error or warning message.</param>
+        /// <param name="locator">The locator object for the error or warning (may be null).</param>
+        /// <seealso cref="ILocator"/>
+        public SAXParseException(string message, ILocator locator)
+            : base(message)
+        {
+            Init(locator);
+        }
+
+        /// <summary>
+        /// Wrap an existing exception in a SAXParseException.
+        /// </summary>
+        /// <remarks>
+        /// This constructor is especially useful when an application is
+        /// creating its own exception from within a <see cref="IContentHandler"/>
+        /// callback, and needs to wrap an existing exception that is not a
+        /// subclass of <see cref="SAXException"/>.
+        /// </remarks>
+        /// <param name="message">The error or warning message, or null to
+        /// use the message from the embedded exception.</param>
+        /// <param name="locator">The locator object for the error or warning (may be
+        /// null).</param>
+        /// <param name="e">Any exception.</param>
+        /// <seealso cref="ILocator"/>
+        public SAXParseException(string message, ILocator locator,
+            Exception e)
+            : base(message, e)
+        {
+            Init(locator);
+        }
+
+        /// <summary>
+        /// Create a new SAXParseException.
+        /// </summary>
+        /// <remarks>
+        /// This constructor is most useful for parser writers.
+        /// <para/>
+        /// All parameters except the message are as if
+        /// they were provided by a <see cref="ILocator"/>. For example, if the
+        /// system identifier is a URL (including relative filename), the
+        /// caller must resolve it fully before creating the exception.
+        /// </remarks>
+        /// <param name="message">The error or warning message.</param>
+        /// <param name="publicId">The public identifier of the entity that generated the error or warning.</param>
+        /// <param name="systemId">The system identifier of the entity that generated the error or warning.</param>
+        /// <param name="lineNumber">The line number of the end of the text that caused the error or warning.</param>
+        /// <param name="columnNumber">The column number of the end of the text that caused the error or warning.</param>
+        public SAXParseException(string message, string publicId, string systemId,
+            int lineNumber, int columnNumber)
+            : base(message)
+        {
+            Init(publicId, systemId, lineNumber, columnNumber);
+        }
+
+        /// <summary>
+        /// Create a new <see cref="SAXParseException"/> with an embedded exception.
+        /// </summary>
+        /// <remarks>
+        /// This constructor is most useful for parser writers who
+        /// need to wrap an exception that is not a subclass of
+        /// <see cref="SAXException"/>.
+        /// <para/>
+        /// All parameters except the message and exception are as if
+        /// they were provided by a <see cref="ILocator"/>. For example, if the
+        /// system identifier is a URL (including relative filename), the
+        /// caller must resolve it fully before creating the exception.
+        /// </remarks>
+        /// <param name="message">The error or warning message, or null to use the message from the embedded exception.</param>
+        /// <param name="publicId">The public identifier of the entity that generated the error or warning.</param>
+        /// <param name="systemId">The system identifier of the entity that generated the error or warning.</param>
+        /// <param name="lineNumber">The line number of the end of the text that caused the error or warning.</param>
+        /// <param name="columnNumber">The column number of the end of the text that caused the error or warning.</param>
+        /// <param name="e">Another exception to embed in this one.</param>
+        public SAXParseException(string message, string publicId, string systemId,
+            int lineNumber, int columnNumber, Exception e)
+            : base(message, e)
+        {
+            Init(publicId, systemId, lineNumber, columnNumber);
+        }
+
+#if FEATURE_SERIALIZABLE
+        /// <summary>
+        /// Initializes a new instance of this class with serialized data.
+        /// </summary>
+        /// <remarks>
+        /// The location fields are restored from the entries written by
+        /// <see cref="GetObjectData(SerializationInfo, StreamingContext)"/>. An entry
+        /// that is absent leaves the corresponding value at "not available"
+        /// (<c>null</c> or -1) instead of failing the deserialization.
+        /// </remarks>
+        /// <param name="info">The <see cref="SerializationInfo"/> that holds the serialized object data about the exception being thrown.</param>
+        /// <param name="context">The <see cref="StreamingContext"/> that contains contextual information about the source or destination.</param>
+        public SAXParseException(SerializationInfo info, StreamingContext context)
+            : base(info, context)
+        {
+            // Start from "not available", then overwrite whatever the payload carries.
+            Init(null, null, -1, -1);
+
+            // Enumerate the entries instead of calling GetString/GetInt32 blindly:
+            // those throw when the name is missing from the payload.
+            foreach (SerializationEntry entry in info)
+            {
+                switch (entry.Name)
+                {
+                    case PublicIdKey:
+                        this.publicId = info.GetString(PublicIdKey);
+                        break;
+                    case SystemIdKey:
+                        this.systemId = info.GetString(SystemIdKey);
+                        break;
+                    case LineNumberKey:
+                        this.lineNumber = info.GetInt32(LineNumberKey);
+                        break;
+                    case ColumnNumberKey:
+                        this.columnNumber = info.GetInt32(ColumnNumberKey);
+                        break;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Adds the location fields of this exception to the serialized data,
+        /// after the data written by the base class.
+        /// </summary>
+        /// <param name="info">The <see cref="SerializationInfo"/> to populate.</param>
+        /// <param name="context">The destination of the serialized data.</param>
+        [System.Security.SecurityCritical]
+        public override void GetObjectData(SerializationInfo info, StreamingContext context)
+        {
+            base.GetObjectData(info, context);
+            info.AddValue(PublicIdKey, this.publicId, typeof(string));
+            info.AddValue(SystemIdKey, this.systemId, typeof(string));
+            info.AddValue(LineNumberKey, this.lineNumber);
+            info.AddValue(ColumnNumberKey, this.columnNumber);
+        }
+#endif
+
+        /// <summary>
+        /// Internal initialization from an optional locator.
+        /// Copies the four location values from <paramref name="locator"/>, or
+        /// stores "not available" (null / -1) when it is null.
+        /// </summary>
+        /// <param name="locator">The locator to read the location from (may be null).</param>
+        private void Init(ILocator locator)
+        {
+            if (locator == null)
+            {
+                Init(null, null, -1, -1);
+                return;
+            }
+
+            Init(locator.PublicId, locator.SystemId,
+                locator.LineNumber, locator.ColumnNumber);
+        }
+
+        /// <summary>
+        /// Internal initialization method.
+        /// </summary>
+        /// <param name="publicId">The public identifier of the entity which generated the exception, or null.</param>
+        /// <param name="systemId">The system identifier of the entity which generated the exception, or null.</param>
+        /// <param name="lineNumber">The line number of the error, or -1.</param>
+        /// <param name="columnNumber">The column number of the error, or -1.</param>
+        private void Init(string publicId, string systemId,
+            int lineNumber, int columnNumber)
+        {
+            this.publicId = publicId;
+            this.systemId = systemId;
+            this.lineNumber = lineNumber;
+            this.columnNumber = columnNumber;
+        }
+
+        /// <summary>
+        /// Get the public identifier of the entity where the exception occurred.
+        /// Returns a string containing the public identifier, or null if none is available.
+        /// </summary>
+        /// <seealso cref="ILocator.PublicId"/>
+        public string PublicId
+        {
+            get { return this.publicId; }
+        }
+
+        /// <summary>
+        /// Get the system identifier of the entity where the exception occurred.
+        /// <para/>
+        /// If the system identifier is a URL, it will have been resolved fully.
+        /// <para/>
+        /// A string containing the system identifier, or null if none is available.
+        /// </summary>
+        /// <seealso cref="ILocator.SystemId"/>
+        public string SystemId
+        {
+            get { return this.systemId; }
+        }
+
+        /// <summary>
+        /// The line number of the end of the text where the exception occurred.
+        /// <para/>
+        /// The first line is line 1.
+        /// <para/>
+        /// An integer representing the line number, or -1 if none is available.
+        /// </summary>
+        /// <seealso cref="ILocator.LineNumber"/>
+        public int LineNumber
+        {
+            get { return this.lineNumber; }
+        }
+
+        /// <summary>
+        /// The column number of the end of the text where the exception occurred.
+        /// <para/>
+        /// The first column in a line is position 1.
+        /// <para/>
+        /// An integer representing the column number, or -1
+        /// if none is available.
+        /// </summary>
+        /// <seealso cref="ILocator.ColumnNumber"/>
+        public int ColumnNumber
+        {
+            get { return this.columnNumber; }
+        }
+
+
+        //////////////////////////////////////////////////////////////////////
+        // Internal state.
+        //////////////////////////////////////////////////////////////////////
+
+        /// <summary>
+        /// The public identifier, or null.
+        /// </summary>
+        /// <seealso cref="PublicId"/>
+        private string publicId;
+
+        /// <summary>
+        /// The system identifier, or null.
+        /// </summary>
+        /// <seealso cref="SystemId"/>
+        private string systemId;
+
+        /// <summary>
+        /// The line number, or -1.
+        /// </summary>
+        /// <seealso cref="LineNumber"/>
+        private int lineNumber;
+
+        /// <summary>
+        /// The column number, or -1.
+        /// </summary>
+        /// <seealso cref="ColumnNumber"/>
+        private int columnNumber;
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/198e5868/src/Lucene.Net.Benchmark/Support/Sax/XMLFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/Support/Sax/XMLFilter.cs b/src/Lucene.Net.Benchmark/Support/Sax/XMLFilter.cs
new file mode 100644
index 0000000..f9350d3
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/Support/Sax/XMLFilter.cs
@@ -0,0 +1,41 @@
+// XMLFilter.java - filter SAX2 events.
+// http://www.saxproject.org
+// Written by David Megginson
+// NO WARRANTY! This class is in the Public Domain.
+// $Id: XMLFilter.java,v 1.6 2002/01/30 21:13:48 dbrownell Exp $
+
+namespace Sax
+{
+    /// <summary>
+    /// Interface for an XML filter.
+    /// </summary>
+    /// <remarks>
+    /// <em>This module, both source code and documentation, is in the
+    /// Public Domain, and comes with <strong>NO WARRANTY</strong>.</em>
+    /// See <a href='http://www.saxproject.org'>http://www.saxproject.org</a>
+    /// for further information.
+    /// <para/>
+    /// An XML filter is like an XML reader, except that it obtains its
+    /// events from another XML reader rather than a primary source like
+    /// an XML document or database. Filters can modify a stream of
+    /// events as they pass on to the final application.
+    /// <para/>
+    /// The XMLFilterImpl helper class provides a convenient base
+    /// for creating SAX2 filters, by passing on all <see cref="IEntityResolver"/>, <see cref="IDTDHandler"/>,
+    /// <see cref="IContentHandler"/> and <see cref="IErrorHandler"/>
+    /// events automatically.
+    /// </remarks>
+    public interface IXMLFilter : IXMLReader
+    {
+        /// <summary>
+        /// Gets or sets the parent reader. Returns the parent reader, or null if none has been set.
+        /// </summary>
+        /// <remarks>
+        /// This property allows the application to link or query the parent
+        /// reader (which may be another filter). It is generally a
+        /// bad idea to perform any operations on the parent reader
+        /// directly: they should all pass through this filter.
+        /// </remarks>
+        IXMLReader Parent { get; set; }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/198e5868/src/Lucene.Net.Benchmark/Support/Sax/XMLReader.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/Support/Sax/XMLReader.cs b/src/Lucene.Net.Benchmark/Support/Sax/XMLReader.cs
new file mode 100644
index 0000000..71b690f
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/Support/Sax/XMLReader.cs
@@ -0,0 +1,305 @@
+// XMLFilter.java - filter SAX2 events.
+// http://www.saxproject.org
+// Written by David Megginson
+// NO WARRANTY! This class is in the Public Domain.
+// $Id: XMLFilter.java,v 1.6 2002/01/30 21:13:48 dbrownell Exp $
+// NOTE(review): the header above names XMLFilter.java although this file declares
+// IXMLReader; it looks copied from XMLFilter.cs -- confirm against the original
+// XMLReader.java header.
+
+namespace Sax
+{
+    /// <summary>
+    /// Interface for reading an XML document using callbacks.
+    /// </summary>
+    /// <remarks>
+    /// <em>This module, both source code and documentation, is in the
+    /// Public Domain, and comes with <strong>NO WARRANTY</strong>.</em>
+    /// See <a href='http://www.saxproject.org'>http://www.saxproject.org</a>
+    /// for further information.
+    /// <para/>
+    /// The members declared here let an application configure a reader through
+    /// feature flags and properties, register entity-resolver, DTD, content and
+    /// error handlers, and start a parse from an <see cref="InputSource"/> or
+    /// from a system identifier.
+    /// <para/>
+    /// An <see cref="IXMLFilter"/> is an <see cref="IXMLReader"/> that obtains its
+    /// events from another XML reader (its parent) rather than from a primary
+    /// source like an XML document or database.
+    /// </remarks>
+    /// <since>SAX 2.0</since>
+    /// <author>David Megginson</author>
+    /// <version>2.0.1 (sax2r2)</version>
+    /// <seealso cref="Helpers.XMLFilter"/>
+    public interface IXMLReader
+    {
+        ////////////////////////////////////////////////////////////////////
+        // Configuration.
+        ////////////////////////////////////////////////////////////////////
+
+
+        /// <summary>
+        /// Look up the value of a feature flag.
+        /// </summary>
+        /// <remarks>
+        /// The feature name is any fully-qualified URI. It is
+        /// possible for an <see cref="IXMLReader"/> to recognize a feature name but
+        /// temporarily be unable to return its value.
+        /// Some feature values may be available only in specific
+        /// contexts, such as before, during, or after a parse.
+        /// Also, some feature values may not be programmatically accessible.
+        /// (In the case of an adapter for a SAX1 <c>Parser</c>, there is no
+        /// implementation-independent way to expose whether the underlying
+        /// parser is performing validation, expanding external entities,
+        /// and so forth.)
+        /// <para/>All XMLReaders are required to recognize the
+        /// http://xml.org/sax/features/namespaces and the
+        /// http://xml.org/sax/features/namespace-prefixes feature names.
+        /// <para/>Typical usage is something like this:
+        /// <code>
+        /// IXMLReader r = new MySAXDriver();
+        /// // try to activate validation
+        /// try {
+        ///     r.SetFeature("http://xml.org/sax/features/validation", true);
+        /// } catch (SAXException e) {
+        ///     Console.Error.WriteLine("Cannot activate validation.");
+        /// }
+        /// // register event handlers
+        /// r.ContentHandler = new MyContentHandler();
+        /// r.ErrorHandler = new MyErrorHandler();
+        /// // parse the first document
+        /// try {
+        ///     r.Parse("http://www.foo.com/mydoc.xml");
+        /// } catch (IOException e) {
+        ///     Console.Error.WriteLine("I/O exception reading XML document");
+        /// } catch (SAXException e) {
+        ///     Console.Error.WriteLine("XML exception reading document.");
+        /// }
+        /// </code>
+        /// <para/>Implementors are free (and encouraged) to invent their own features,
+        /// using names built on their own URIs.
+        /// </remarks>
+        /// <param name="name">The feature name, which is a fully-qualified URI.</param>
+        /// <returns>The current value of the feature (true or false).</returns>
+        /// <exception cref="SAXNotRecognizedException">If the feature
+        /// value can't be assigned or retrieved.</exception>
+        /// <exception cref="SAXNotSupportedException">When the
+        /// <see cref="IXMLReader"/> recognizes the feature name but
+        /// cannot determine its value at this time.</exception>
+        /// <seealso cref="SetFeature(string, bool)"/>
+        bool GetFeature(string name);
+
+
+        /// <summary>
+        /// Set the value of a feature flag.
+        /// <para/>
+        /// The feature name is any fully-qualified URI. It is
+        /// possible for an <see cref="IXMLReader"/> to expose a feature value but
+        /// to be unable to change the current value.
+        /// Some feature values may be immutable or mutable only
+        /// in specific contexts, such as before, during, or after
+        /// a parse.
+        /// <para/>
+        /// All XMLReaders are required to support setting
+        /// http://xml.org/sax/features/namespaces to true and
+        /// http://xml.org/sax/features/namespace-prefixes to false.
+        /// </summary>
+        /// <param name="name">The feature name, which is a fully-qualified URI.</param>
+        /// <param name="value">The requested value of the feature (true or false).</param>
+        /// <exception cref="SAXNotRecognizedException">If the feature
+        /// value can't be assigned or retrieved.</exception>
+        /// <exception cref="SAXNotSupportedException">When the
+        /// <see cref="IXMLReader"/> recognizes the feature name but
+        /// cannot set the requested value.</exception>
+        /// <seealso cref="GetFeature(string)"/>
+        void SetFeature(string name, bool value);
+
+
+        /// <summary>
+        /// Look up the value of a property.
+        /// </summary>
+        /// <remarks>
+        /// The property name is any fully-qualified URI. It is
+        /// possible for an <see cref="IXMLReader"/> to recognize a property name but
+        /// temporarily be unable to return its value.
+        /// Some property values may be available only in specific
+        /// contexts, such as before, during, or after a parse.
+        /// <para/>
+        /// <see cref="IXMLReader"/>s are not required to recognize any specific
+        /// property names, though an initial core set is documented for
+        /// SAX2.
+        /// <para/>
+        /// Implementors are free (and encouraged) to invent their own properties,
+        /// using names built on their own URIs.
+        /// </remarks>
+        /// <param name="name">The property name, which is a fully-qualified URI.</param>
+        /// <returns>The current value of the property.</returns>
+        /// <exception cref="SAXNotRecognizedException">If the property
+        /// value can't be assigned or retrieved.</exception>
+        /// <exception cref="SAXNotSupportedException">When the
+        /// <see cref="IXMLReader"/> recognizes the property name but
+        /// cannot determine its value at this time.</exception>
+        /// <seealso cref="SetProperty(string, object)"/>
+        object GetProperty(string name);
+
+
+        /// <summary>
+        /// Set the value of a property.
+        /// </summary>
+        /// <remarks>
+        /// The property name is any fully-qualified URI. It is
+        /// possible for an <see cref="IXMLReader"/> to recognize a property name but
+        /// to be unable to change the current value.
+        /// Some property values may be immutable or mutable only
+        /// in specific contexts, such as before, during, or after
+        /// a parse.
+        /// <para/>
+        /// <see cref="IXMLReader"/>s are not required to recognize setting
+        /// any specific property names, though a core set is defined by
+        /// SAX2.
+        /// <para/>
+        /// This method is also the standard mechanism for setting
+        /// extended handlers.
+        /// </remarks>
+        /// <param name="name">The property name, which is a fully-qualified URI.</param>
+        /// <param name="value">The requested value for the property.</param>
+        /// <exception cref="SAXNotRecognizedException">If the property
+        /// value can't be assigned or retrieved.</exception>
+        /// <exception cref="SAXNotSupportedException">When the
+        /// <see cref="IXMLReader"/> recognizes the property name but
+        /// cannot set the requested value.</exception>
+        /// <seealso cref="GetProperty(string)"/>
+        void SetProperty(string name, object value);
+
+
+
+        ////////////////////////////////////////////////////////////////////
+        // Event handlers.
+        ////////////////////////////////////////////////////////////////////
+
+
+        /// <summary>
+        /// Gets or sets the entity resolver.
+        /// </summary>
+        /// <remarks>
+        /// If the application does not register an entity resolver,
+        /// the <see cref="IXMLReader"/> will perform its own default resolution.
+        /// <para/>
+        /// Applications may register a new or different resolver in the
+        /// middle of a parse, and the SAX parser must begin using the new
+        /// resolver immediately.
+        /// </remarks>
+        IEntityResolver EntityResolver { get; set; }
+
+        /// <summary>
+        /// Gets or sets the DTD event handler.
+        /// </summary>
+        /// <remarks>
+        /// If the application does not register a DTD handler, all DTD
+        /// events reported by the SAX parser will be silently ignored.
+        /// <para/>
+        /// Applications may register a new or different handler in the
+        /// middle of a parse, and the SAX parser must begin using the new
+        /// handler immediately.
+        /// </remarks>
+        IDTDHandler DTDHandler { get; set; }
+
+        /// <summary>
+        /// Gets or sets the content event handler.
+        /// </summary>
+        /// <remarks>
+        /// If the application does not register a content handler, all
+        /// content events reported by the SAX parser will be silently
+        /// ignored.
+        /// <para/>
+        /// Applications may register a new or different handler in the
+        /// middle of a parse, and the SAX parser must begin using the new
+        /// handler immediately.
+        /// </remarks>
+        IContentHandler ContentHandler { get; set; }
+
+
+        /// <summary>
+        /// Gets or sets the error event handler.
+        /// </summary>
+        /// <remarks>
+        /// If the application does not register an error handler, all
+        /// error events reported by the SAX parser will be silently
+        /// ignored; however, normal processing may not continue. It is
+        /// highly recommended that all SAX applications implement an
+        /// error handler to avoid unexpected bugs.
+        /// <para/>
+        /// Applications may register a new or different handler in the
+        /// middle of a parse, and the SAX parser must begin using the new
+        /// handler immediately.
+        /// </remarks>
+        IErrorHandler ErrorHandler { get; set; }
+
+
+        ////////////////////////////////////////////////////////////////////
+        // Parsing.
+        ////////////////////////////////////////////////////////////////////
+
+        /// <summary>
+        /// Parse an XML document.
+        /// </summary>
+        /// <remarks>
+        /// The application can use this method to instruct the XML
+        /// reader to begin parsing an XML document from any valid input
+        /// source (a character stream, a byte stream, or a URI).
+        /// <para/>
+        /// Applications may not invoke this method while a parse is in
+        /// progress (they should create a new XMLReader instead for each
+        /// nested XML document). Once a parse is complete, an
+        /// application may reuse the same XMLReader object, possibly with a
+        /// different input source.
+        /// Configuration of the <see cref="IXMLReader"/> object (such as handler bindings and
+        /// values established for feature flags and properties) is unchanged
+        /// by completion of a parse, unless the definition of that aspect of
+        /// the configuration explicitly specifies other behavior.
+        /// (For example, feature flags or properties exposing
+        /// characteristics of the document being parsed.)
+        /// <para/>
+        /// During the parse, the XMLReader will provide information
+        /// about the XML document through the registered event
+        /// handlers.
+        /// <para/>
+        /// This method is synchronous: it will not return until parsing
+        /// has ended. If a client application wants to terminate
+        /// parsing early, it should throw an exception.
+        /// </remarks>
+        /// <param name="input">The input source for the top-level of the
+        /// XML document.</param>
+        /// <exception cref="SAXException">Any SAX exception, possibly
+        /// wrapping another exception.</exception>
+        /// <exception cref="System.IO.IOException">An IO exception from the parser,
+        /// possibly from a byte stream or character stream
+        /// supplied by the application.</exception>
+        /// <seealso cref="InputSource"/>
+        /// <seealso cref="Parse(string)"/>
+        /// <seealso cref="EntityResolver"/>
+        /// <seealso cref="DTDHandler"/>
+        /// <seealso cref="ContentHandler"/>
+        /// <seealso cref="ErrorHandler"/>
+        void Parse(InputSource input);
+
+
+        /// <summary>
+        /// Parse an XML document from a system identifier (URI).
+        /// </summary>
+        /// <remarks>
+        /// This method is a shortcut for the common case of reading a
+        /// document from a system identifier. It is the exact
+        /// equivalent of the following:
+        /// <code>
+        /// Parse(new InputSource(systemId));
+        /// </code>
+        /// <para/>If the system identifier is a URL, it must be fully resolved
+        /// by the application before it is passed to the parser.
+        /// </remarks>
+        /// <param name="systemId">The system identifier (URI).</param>
+        /// <exception cref="SAXException">Any SAX exception, possibly
+        /// wrapping another exception.</exception>
+        /// <exception cref="System.IO.IOException">An IO exception from the parser,
+        /// possibly from a byte stream or character stream
+        /// supplied by the application.</exception>
+        /// <seealso cref="Parse(InputSource)"/>
+        void Parse(string systemId);
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/198e5868/src/Lucene.Net.Benchmark/Support/StringExtensions.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/Support/StringExtensions.cs b/src/Lucene.Net.Benchmark/Support/StringExtensions.cs
new file mode 100644
index 0000000..2104fdb
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/Support/StringExtensions.cs
@@ -0,0 +1,14 @@
+namespace Lucene.Net.Support
+{
+    /// <summary>
+    /// Extension methods for <see cref="string"/>.
+    /// </summary>
+    public static class StringExtensions
+    {
+        /// <summary>
+        /// Returns the interned instance of <paramref name="value"/> on builds
+        /// that intern, and <paramref name="value"/> itself otherwise.
+        /// </summary>
+        /// <remarks>
+        /// When the <c>NETSTANDARD</c> symbol is defined the argument is returned
+        /// unchanged; otherwise the call is forwarded to <c>string.Intern(string)</c>.
+        /// </remarks>
+        /// <param name="value">The string to intern.</param>
+        /// <returns>The interned string, or <paramref name="value"/> as passed in on <c>NETSTANDARD</c> builds.</returns>
+        // NOTE(review): string.Intern throws ArgumentNullException for a null argument,
+        // while the NETSTANDARD branch returns null -- the two builds differ for null
+        // input; confirm callers never pass null.
+        public static string Intern(this string value)
+        {
+#if NETSTANDARD
+            return value;
+#else
+            return string.Intern(value);
+#endif
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/198e5868/src/Lucene.Net.Benchmark/Support/TagSoup/AutoDetector.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/Support/TagSoup/AutoDetector.cs b/src/Lucene.Net.Benchmark/Support/TagSoup/AutoDetector.cs
new file mode 100644
index 0000000..6fcb578
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/Support/TagSoup/AutoDetector.cs
@@ -0,0 +1,41 @@
+// This file is part of TagSoup and is Copyright 2002-2008 by John Cowan.
+//
+// TagSoup is licensed under the Apache License,
+// Version 2.0. You may obtain a copy of this license at
+// http://www.apache.org/licenses/LICENSE-2.0 . You may also have
+// additional legal rights not granted by this license.
+//
+// TagSoup is distributed in the hope that it will be useful, but
+// unless required by applicable law or agreed to in writing, TagSoup
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+// OF ANY KIND, either express or implied; not even the implied warranty
+// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+//
+//
+// Interface to objects that translate InputStreams to Readers by auto-detection
+
+using System.IO;
+
+namespace TagSoup
+{
+    /// <summary>
+    /// Classes which accept a <see cref="Stream"/> and provide a <see cref="TextReader"/> which figures
+    /// out the encoding of the <see cref="Stream"/> and reads characters from it should
+    /// conform to this interface.
+    /// </summary>
+    /// <seealso cref="Stream" />
+    /// <seealso cref="TextReader" />
+    public interface IAutoDetector
+    {
+        /// <summary>
+        /// Given a <see cref="Stream"/>, return a suitable <see cref="TextReader"/> that understands
+        /// the presumed character encoding of that <see cref="Stream"/>.
+        /// If bytes are consumed from the <see cref="Stream"/> in the process, they
+        /// <i>must</i> be pushed back onto the <see cref="Stream"/> so that they can be
+        /// reinterpreted as characters.
+        /// </summary>
+        /// <param name="stream">The <see cref="Stream"/> to read from.</param>
+        /// <returns>A <see cref="TextReader"/> that reads from the <see cref="Stream"/>.</returns>
+        TextReader AutoDetectingReader(Stream stream);
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/198e5868/src/Lucene.Net.Benchmark/Support/TagSoup/Element.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/Support/TagSoup/Element.cs b/src/Lucene.Net.Benchmark/Support/TagSoup/Element.cs
new file mode 100644
index 0000000..dca7eed
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/Support/TagSoup/Element.cs
@@ -0,0 +1,215 @@
+// This file is part of TagSoup and is Copyright 2002-2008 by John Cowan.
+//
+// TagSoup is licensed under the Apache License,
+// Version 2.0. You may obtain a copy of this license at
+// http://www.apache.org/licenses/LICENSE-2.0 . You may also have
+// additional legal rights not granted by this license.
+//
+// TagSoup is distributed in the hope that it will be useful, but
+// unless required by applicable law or agreed to in writing, TagSoup
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+// OF ANY KIND, either express or implied; not even the implied warranty
+// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+using Sax.Helpers;
+
+namespace TagSoup
+{
+    /// <summary>
+    /// The internal representation of an actual element (not an element type).
+    /// An Element has an element type, attributes, and a successor Element
+    /// for use in constructing stacks and queues of Elements.
+    /// </summary>
+    /// <seealso cref="ElementType" />
+    /// <seealso cref="Sax.Helpers.Attributes" />
+    public class Element
+    {
+        private readonly Attributes _atts; // attributes of element
+        private readonly ElementType _type; // type of element
+        private bool _preclosed; // this element has been preclosed
+
+        /// <summary>
+        /// Return an Element from a specified ElementType.
+        /// </summary>
+        /// <remarks>
+        /// A new element has no successor (<see cref="Next"/> is null) and is
+        /// not preclosed.
+        /// </remarks>
+        /// <param name="type">
+        /// The element type of the newly constructed element
+        /// </param>
+        /// <param name="defaultAttributes">
+        /// True if default attributes are wanted: the attributes are then
+        /// initialized as a copy of the type's attributes, otherwise they start empty
+        /// </param>
+        public Element(ElementType type, bool defaultAttributes)
+        {
+            _type = type;
+            _atts = defaultAttributes
+                ? new Attributes(type.Attributes)
+                : new Attributes();
+
+            // Next and the preclosed flag keep their CLR defaults (null / false).
+            // They are deliberately not assigned here: Next is virtual, and calling
+            // an overridable setter from a constructor would run subclass code
+            // before the subclass constructor body has executed.
+        }
+
+        /// <summary>
+        /// Gets the element type.
+        /// </summary>
+        public virtual ElementType Type
+        {
+            get { return _type; }
+        }
+
+        /// <summary>
+        /// Gets the attributes as an Attributes object.
+        /// Returning an Attributes makes the attributes mutable.
+        /// </summary>
+        /// <seealso cref="Sax.Helpers.Attributes" />
+        public virtual Attributes Attributes
+        {
+            get { return _atts; }
+        }
+
+        /// <summary>
+        /// Gets or sets the next element in an element stack or queue.
+        /// </summary>
+        public virtual Element Next { get; set; }
+
+        /// <summary>
+        /// Gets the name of the element's type.
+        /// </summary>
+        public virtual string Name
+        {
+            get { return _type.Name; }
+        }
+
+        /// <summary>
+        /// Gets the namespace name of the element's type.
+        /// </summary>
+        public virtual string Namespace
+        {
+            get { return _type.Namespace; }
+        }
+
+        /// <summary>
+        /// Gets the local name of the element's type.
+        /// </summary>
+        public virtual string LocalName
+        {
+            get { return _type.LocalName; }
+        }
+
+        /// <summary>
+        /// Gets the content model vector of the element's type.
+        /// </summary>
+        public virtual int Model
+        {
+            get { return _type.Model; }
+        }
+
+        /// <summary>
+        /// Gets the member-of vector of the element's type.
+        /// </summary>
+        public virtual int MemberOf
+        {
+            get { return _type.MemberOf; }
+        }
+
+        /// <summary>
+        /// Gets the flags vector of the element's type.
+        /// </summary>
+        public virtual int Flags
+        {
+            get { return _type.Flags; }
+        }
+
+        /// <summary>
+        /// Gets the parent element type of the element's type.
+        /// </summary>
+        public virtual ElementType Parent
+        {
+            get { return _type.Parent; }
+        }
+
+        /// <summary>
+        /// Return true if this element has been preclosed.
+        /// </summary>
+        /// <seealso cref="Preclose" />
+        public virtual bool IsPreclosed
+        {
+            get { return _preclosed; }
+        }
+
+        /// <summary>
+        /// Return true if the type of this element can contain the type of
+        /// another element.
+        /// Convenience method.
+        /// </summary>
+        /// <param name="other">
+        /// The other element
+        /// </param>
+        public virtual bool CanContain(Element other)
+        {
+            return _type.CanContain(other._type);
+        }
+
+        /// <summary>
+        /// Set an attribute and its value into this element.
+        /// The work is delegated to the element's type, which is handed this
+        /// element's attributes.
+        /// </summary>
+        /// <param name="name">
+        /// The attribute name (Qname)
+        /// </param>
+        /// <param name="type">
+        /// The attribute type
+        /// </param>
+        /// <param name="value">
+        /// The attribute value
+        /// </param>
+        public virtual void SetAttribute(string name, string type, string value)
+        {
+            _type.SetAttribute(_atts, name, type, value);
+        }
+
+        /// <summary>
+        /// Make this element anonymous.
+        /// Remove every attribute whose type is <c>ID</c> or whose qualified
+        /// name is <c>name</c> from the element's attributes.
+        /// </summary>
+        public virtual void Anonymize()
+        {
+            // Walk backwards so that removing an entry does not shift the
+            // indexes that are still to be visited.
+            for (int i = _atts.Length - 1; i >= 0; i--)
+            {
+                if (_atts.GetType(i).Equals("ID") || _atts.GetQName(i).Equals("name"))
+                {
+                    _atts.RemoveAttribute(i);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Clean the attributes of this element.
+        /// Attributes with null or empty local name (the name was ill-formed)
+        /// or null value (the attribute was present in the element type but
+        /// not in this actual element) are removed.
+        /// </summary>
+        public virtual void Clean()
+        {
+            // Walk backwards so that removing an entry does not shift the
+            // indexes that are still to be visited.
+            for (int i = _atts.Length - 1; i >= 0; i--)
+            {
+                string name = _atts.GetLocalName(i);
+                if (_atts.GetValue(i) == null || string.IsNullOrEmpty(name))
+                {
+                    _atts.RemoveAttribute(i);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Force this element to preclosed status, meaning that an end-tag has
+        /// been seen but the element cannot yet be closed for structural reasons.
+        /// </summary>
+        /// <seealso cref="IsPreclosed" />
+        public virtual void Preclose()
+        {
+            _preclosed = true;
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/198e5868/src/Lucene.Net.Benchmark/Support/TagSoup/ElementType.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/Support/TagSoup/ElementType.cs b/src/Lucene.Net.Benchmark/Support/TagSoup/ElementType.cs
new file mode 100644
index 0000000..6d62a2f
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/Support/TagSoup/ElementType.cs
@@ -0,0 +1,269 @@
+// This file is part of TagSoup and is Copyright 2002-2008 by John Cowan.
//
// TagSoup is licensed under the Apache License,
// Version 2.0. You may obtain a copy of this license at
// http://www.apache.org/licenses/LICENSE-2.0 . You may also have
// additional legal rights not granted by this license.
//
// TagSoup is distributed in the hope that it will be useful, but
// unless required by applicable law or agreed to in writing, TagSoup
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
// OF ANY KIND, either express or implied; not even the implied warranty
// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

using Sax.Helpers;
using System;
using System.Text;

namespace TagSoup
{
    /// <summary>
    /// This class represents an element type in the schema.
    /// An element type has a name, a content model vector, a member-of vector,
    /// a flags vector, default attributes, and a schema to which it belongs.
    /// </summary>
    /// <seealso cref="Schema" />
    public class ElementType
    {
        private readonly Attributes atts; // default attributes
        private readonly string localName; // element type local name
        private readonly string name; // element type name (Qname)
        private readonly string @namespace; // element type namespace name
        private readonly Schema schema; // schema to which this belongs

        /// <summary>
        /// Construct an <see cref="ElementType"/>:
        /// but it's better to use <see cref="Schema.Element()"/> instead.
        /// The content model, member-of, and flags vectors are specified as ints.
        /// </summary>
        /// <param name="name">The element type name</param>
        /// <param name="model">ORed-together bits representing the content
        /// models allowed in the content of this element type</param>
        /// <param name="memberOf">ORed-together bits representing the content models
        /// to which this element type belongs</param>
        /// <param name="flags">ORed-together bits representing the flags associated
        /// with this element type</param>
        /// <param name="schema">
        /// The schema with which this element type will be associated
        /// </param>
        public ElementType(string name, int model, int memberOf, int flags, Schema schema)
        {
            this.name = name;
            Model = model;
            MemberOf = memberOf;
            Flags = flags;
            atts = new Attributes();
            // The schema must be stored before GetNamespace runs: an unprefixed
            // name resolves to the schema's default URI.
            this.schema = schema;
            @namespace = GetNamespace(name, false);
            localName = GetLocalName(name);
        }

        /// <summary>
        /// Gets the name of this element type.
        /// </summary>
        public virtual string Name
        {
            get { return name; }
        }

        /// <summary>
        /// Gets the namespace name of this element type.
        /// </summary>
        public virtual string Namespace
        {
            get { return @namespace; }
        }

        /// <summary>
        /// Gets the local name of this element type.
        /// </summary>
        public virtual string LocalName
        {
            get { return localName; }
        }

        /// <summary>
        /// Gets or sets the content models of this element type as a vector of bits
        /// </summary>
        public virtual int Model { get; set; }

        /// <summary>
        /// Gets or sets the content models to which this element type belongs as a vector of bits
        /// </summary>
        public virtual int MemberOf { get; set; }

        /// <summary>
        /// Gets or sets the flags associated with this element type as a vector of bits
        /// </summary>
        public virtual int Flags { get; set; }

        /// <summary>
        /// Returns the default attributes associated with this element type.
        /// Attributes of type CDATA that don't have default values are
        /// typically not included. Other attributes without default values
        /// have an internal value of <c>null</c>.
        /// The return value is an Attributes to allow the caller to mutate
        /// the attributes.
        /// </summary>
        public virtual Attributes Attributes
        {
            get { return atts; }
        }

        /// <summary>
        /// Gets or sets the parent element type of this element type.
        /// </summary>
        public virtual ElementType Parent { get; set; }

        /// <summary>
        /// Gets the schema which this element type is associated with.
        /// </summary>
        public virtual Schema Schema
        {
            get { return schema; }
        }

        /// <summary>
        /// Return a namespace name from a Qname.
        /// The attribute flag tells us whether to return an empty namespace
        /// name if there is no prefix, or use the schema default instead.
        /// </summary>
        /// <param name="name">The Qname</param>
        /// <param name="attribute">True if name is an attribute name</param>
        /// <returns>
        /// The namespace name: empty or the schema URI for an unprefixed name,
        /// the XML namespace for the <c>xml</c> prefix, otherwise
        /// <c>urn:x-prefix:</c> followed by the prefix.
        /// </returns>
        public virtual string GetNamespace(string name, bool attribute)
        {
            int colon = name.IndexOf(':');
            if (colon == -1)
            {
                return attribute ? "" : schema.Uri;
            }
            string prefix = name.Substring(0, colon);
            if (prefix.Equals("xml"))
            {
                return "http://www.w3.org/XML/1998/namespace";
            }
            return string.Intern("urn:x-prefix:" + prefix);
        }

        /// <summary>
        /// Return a local name from a Qname.
        /// </summary>
        /// <param name="name">The Qname</param>
        /// <returns>
        /// The part after the first colon, or the whole name when it has no colon.
        /// </returns>
        public virtual string GetLocalName(string name)
        {
            int colon = name.IndexOf(':');
            if (colon == -1)
            {
                return name;
            }
            return string.Intern(name.Substring(colon + 1));
        }

        /// <summary>
        /// Returns <c>true</c> if this element type can contain another element type.
        /// That is, if any of the models in this element's model vector
        /// match any of the models in the other element type's member-of
        /// vector.
        /// </summary>
        /// <param name="other">The other element type</param>
        public virtual bool CanContain(ElementType other)
        {
            return (Model & other.MemberOf) != 0;
        }

        /// <summary>
        /// Sets an attribute and its value into an <see cref="Sax.IAttributes"/> object.
        /// Attempts to set a namespace declaration are ignored.
        /// </summary>
        /// <param name="atts">The <see cref="Sax.Helpers.Attributes"/> object</param>
        /// <param name="name">The name (Qname) of the attribute</param>
        /// <param name="type">The type of the attribute; when <c>null</c>, a new
        /// attribute is given type <c>CDATA</c> and an existing one keeps its type</param>
        /// <param name="value">The value of the attribute; it is normalized with
        /// <see cref="Normalize(string)"/> unless the type is <c>CDATA</c></param>
        public virtual void SetAttribute(Attributes atts, string name, string type, string value)
        {
            // Attribute names are identifiers, not linguistic text, so they are
            // compared ordinally. The culture-sensitive StartsWith(string)
            // overload could otherwise give culture-dependent results.
            if (name.Equals("xmlns", StringComparison.Ordinal) || name.StartsWith("xmlns:", StringComparison.Ordinal))
            {
                return;
            }

            string ns = GetNamespace(name, true);
            string localName = GetLocalName(name);
            int i = atts.GetIndex(name);
            if (i == -1)
            {
                // New attribute: an unspecified type defaults to CDATA.
                name = string.Intern(name);
                if (type == null)
                {
                    type = "CDATA";
                }
                if (!type.Equals("CDATA"))
                {
                    value = Normalize(value);
                }
                atts.AddAttribute(ns, localName, name, type, value);
            }
            else
            {
                // Existing attribute: an unspecified type keeps the stored one.
                if (type == null)
                {
                    type = atts.GetType(i);
                }
                if (!type.Equals("CDATA"))
                {
                    value = Normalize(value);
                }
                atts.SetAttribute(i, ns, localName, name, type, value);
            }
        }

        /// <summary>
        /// Normalize an attribute value (ID-style).
        /// CDATA-style attribute normalization is already done.
        /// </summary>
        /// <param name="value">The value to normalize</param>
        /// <returns>
        /// <c>null</c> for a <c>null</c> input; otherwise the trimmed value with
        /// every run of spaces collapsed to a single space.
        /// </returns>
        public static string Normalize(string value)
        {
            if (value == null)
            {
                return null;
            }
            value = value.Trim();
            // Fast path: without two adjacent spaces there is nothing to
            // collapse, so the trimmed value is already the result.
            if (value.IndexOf("  ", StringComparison.Ordinal) == -1)
            {
                return value;
            }
            bool space = false;
            var b = new StringBuilder(value.Length);
            foreach (char v in value)
            {
                if (v == ' ')
                {
                    // Keep only the first space of each run.
                    if (!space)
                    {
                        b.Append(v);
                    }
                    space = true;
                }
                else
                {
                    b.Append(v);
                    space = false;
                }
            }
            return b.ToString();
        }

        /// <summary>
        /// Sets an attribute and its value into this element type.
        /// </summary>
        /// <param name="name">The name of the attribute</param>
        /// <param name="type">The type of the attribute</param>
        /// <param name="value">The value of the attribute</param>
        public virtual void SetAttribute(string name, string type, string value)
        {
            SetAttribute(atts, name, type, value);
        }
    }
}

// http://git-wip-us.apache.org/repos/asf/lucenenet/blob/198e5868/src/Lucene.Net.Benchmark/Support/TagSoup/HTMLScanner.cs
// ----------------------------------------------------------------------
// diff --git a/src/Lucene.Net.Benchmark/Support/TagSoup/HTMLScanner.cs b/src/Lucene.Net.Benchmark/Support/TagSoup/HTMLScanner.cs
// new file mode 100644
// index 0000000..ed41f84
// --- /dev/null
// +++ b/src/Lucene.Net.Benchmark/Support/TagSoup/HTMLScanner.cs
// @@ -0,0 +1,745 @@
// This file is part of TagSoup and is Copyright 2002-2008 by John Cowan.
//
// TagSoup is licensed under the Apache License,
// Version 2.0. You may obtain a copy of this license at
// http://www.apache.org/licenses/LICENSE-2.0 . You may also have
// additional legal rights not granted by this license.
//
// TagSoup is distributed in the hope that it will be useful, but
// unless required by applicable law or agreed to in writing, TagSoup
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
// OF ANY KIND, either express or implied; not even the implied warranty
// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
//

using Sax;
using System;
using System.IO;

namespace TagSoup
{
    /// <summary>
    /// This class implements a table-driven scanner for HTML, allowing for lots of
    /// defects. It implements the Scanner interface, which accepts a Reader
    /// object to fetch characters from and a ScanHandler object to report lexical
    /// events to.
    /// </summary>
    public class HTMLScanner : IScanner, ILocator
    {
        // Start of state table
        private const int S_ANAME = 1;
        private const int S_APOS = 2;
        private const int S_AVAL = 3;
        private const int S_BB = 4;
        private const int S_BBC = 5;
        private const int S_BBCD = 6;
        private const int S_BBCDA = 7;
        private const int S_BBCDAT = 8;
        private const int S_BBCDATA = 9;
        private const int S_CDATA = 10;
        private const int S_CDATA2 = 11;
        private const int S_CDSECT = 12;
        private const int S_CDSECT1 = 13;
        private const int S_CDSECT2 = 14;
        private const int S_COM = 15;
        private const int S_COM2 = 16;
        private const int S_COM3 = 17;
        private const int S_COM4 = 18;
        private const int S_DECL = 19;
        private const int S_DECL2 = 20;
        private const int S_DONE = 21;
        private const int S_EMPTYTAG = 22;
        private const int S_ENT = 23;
        private const int S_EQ = 24;
        private const int S_ETAG = 25;
        private const int S_GI = 26;
        private const int S_NCR = 27;
        private const int S_PCDATA = 28;
        private const int S_PI = 29;
        private const int S_PITARGET = 30;
        private const int S_QUOT = 31;
        private const int S_STAGC = 32;
        private const int S_TAG = 33;
        private const int S_TAGWS = 34;
        private const int S_XNCR = 35;
        private const int A_ADUP = 1;
        private const int A_ADUP_SAVE = 2;
        private const int A_ADUP_STAGC = 3;
        private const int A_ANAME = 4;
        private const int A_ANAME_ADUP = 5;
        private const int A_ANAME_ADUP_STAGC = 6;
        private const int A_AVAL = 7;
        private const int A_AVAL_STAGC = 8;
        private const int A_CDATA = 9;
        private const int A_CMNT = 10;
        private const int A_DECL = 11;
        private const int A_EMPTYTAG = 12;
        private const int A_ENTITY = 13;
        private const int A_ENTITY_START = 14;
        private const int A_ETAG = 15;
        private const int A_GI = 16;
        private const int A_GI_STAGC = 17;
        private const int A_LT = 18;
        private const int A_LT_PCDATA = 19;
        private const int A_MINUS = 20;
        private const int A_MINUS2 = 21;
        private const int A_MINUS3 = 22;
        private const int A_PCDATA = 23;
        private const int A_PI = 24;
        private const int A_PITARGET = 25;
        private const int A_PITARGET_PI = 26;
        private const int A_SAVE = 27;
        private const int A_SKIP = 28;
        private const int A_SP = 29;
        private const int A_STAGC = 30;
        private const int A_UNGET = 31;
        private const int A_UNSAVE_PCDATA = 32;

        // Rows of { current state, input character, action, next state }.
        // An input character of 0 is the "any other character" row of a state
        // and -1 is end of input.
        private static readonly int[] statetable = {
            S_ANAME, '/', A_ANAME_ADUP, S_EMPTYTAG,
            S_ANAME, '=', A_ANAME, S_AVAL,
            S_ANAME, '>', A_ANAME_ADUP_STAGC, S_PCDATA,
            S_ANAME, 0, A_SAVE, S_ANAME,
            S_ANAME, -1, A_ANAME_ADUP_STAGC, S_DONE,
            S_ANAME, ' ', A_ANAME, S_EQ,
            S_ANAME, '\n', A_ANAME, S_EQ,
            S_ANAME, '\t', A_ANAME, S_EQ,
            S_APOS, '\'', A_AVAL, S_TAGWS,
            S_APOS, 0, A_SAVE, S_APOS,
            S_APOS, -1, A_AVAL_STAGC, S_DONE,
            S_APOS, ' ', A_SP, S_APOS,
            S_APOS, '\n', A_SP, S_APOS,
            S_APOS, '\t', A_SP, S_APOS,
            S_AVAL, '\'', A_SKIP, S_APOS,
            S_AVAL, '"', A_SKIP, S_QUOT,
            S_AVAL, '>', A_AVAL_STAGC, S_PCDATA,
            S_AVAL, 0, A_SAVE, S_STAGC,
            S_AVAL, -1, A_AVAL_STAGC, S_DONE,
            S_AVAL, ' ', A_SKIP, S_AVAL,
            S_AVAL, '\n', A_SKIP, S_AVAL,
            S_AVAL, '\t', A_SKIP, S_AVAL,
            S_BB, 'C', A_SKIP, S_BBC,
            S_BB, 0, A_SKIP, S_DECL,
            S_BB, -1, A_SKIP, S_DONE,
            S_BBC, 'D', A_SKIP, S_BBCD,
            S_BBC, 0, A_SKIP, S_DECL,
            S_BBC, -1, A_SKIP, S_DONE,
            S_BBCD, 'A', A_SKIP, S_BBCDA,
            S_BBCD, 0, A_SKIP, S_DECL,
            S_BBCD, -1, A_SKIP, S_DONE,
            S_BBCDA, 'T', A_SKIP, S_BBCDAT,
            S_BBCDA, 0, A_SKIP, S_DECL,
            S_BBCDA, -1, A_SKIP, S_DONE,
            S_BBCDAT, 'A', A_SKIP, S_BBCDATA,
            S_BBCDAT, 0, A_SKIP, S_DECL,
            S_BBCDAT, -1, A_SKIP, S_DONE,
            S_BBCDATA, '[', A_SKIP, S_CDSECT,
            S_BBCDATA, 0, A_SKIP, S_DECL,
            S_BBCDATA, -1, A_SKIP, S_DONE,
            S_CDATA, '<', A_SAVE, S_CDATA2,
            S_CDATA, 0, A_SAVE, S_CDATA,
            S_CDATA, -1, A_PCDATA, S_DONE,
            S_CDATA2, '/', A_UNSAVE_PCDATA, S_ETAG,
            S_CDATA2, 0, A_SAVE, S_CDATA,
            S_CDATA2, -1, A_UNSAVE_PCDATA, S_DONE,
            S_CDSECT, ']', A_SAVE, S_CDSECT1,
            S_CDSECT, 0, A_SAVE, S_CDSECT,
            S_CDSECT, -1, A_SKIP, S_DONE,
            S_CDSECT1, ']', A_SAVE, S_CDSECT2,
            S_CDSECT1, 0, A_SAVE, S_CDSECT,
            S_CDSECT1, -1, A_SKIP, S_DONE,
            S_CDSECT2, '>', A_CDATA, S_PCDATA,
            S_CDSECT2, 0, A_SAVE, S_CDSECT,
            S_CDSECT2, -1, A_SKIP, S_DONE,
            S_COM, '-', A_SKIP, S_COM2,
            S_COM, 0, A_SAVE, S_COM2,
            S_COM, -1, A_CMNT, S_DONE,
            S_COM2, '-', A_SKIP, S_COM3,
            S_COM2, 0, A_SAVE, S_COM2,
            S_COM2, -1, A_CMNT, S_DONE,
            S_COM3, '-', A_SKIP, S_COM4,
            S_COM3, 0, A_MINUS, S_COM2,
            S_COM3, -1, A_CMNT, S_DONE,
            S_COM4, '-', A_MINUS3, S_COM4,
            S_COM4, '>', A_CMNT, S_PCDATA,
            S_COM4, 0, A_MINUS2, S_COM2,
            S_COM4, -1, A_CMNT, S_DONE,
            S_DECL, '-', A_SKIP, S_COM,
            S_DECL, '[', A_SKIP, S_BB,
            S_DECL, '>', A_SKIP, S_PCDATA,
            S_DECL, 0, A_SAVE, S_DECL2,
            S_DECL, -1, A_SKIP, S_DONE,
            S_DECL2, '>', A_DECL, S_PCDATA,
            S_DECL2, 0, A_SAVE, S_DECL2,
            S_DECL2, -1, A_SKIP, S_DONE,
            S_EMPTYTAG, '>', A_EMPTYTAG, S_PCDATA,
            S_EMPTYTAG, 0, A_SAVE, S_ANAME,
            S_EMPTYTAG, ' ', A_SKIP, S_TAGWS,
            S_EMPTYTAG, '\n', A_SKIP, S_TAGWS,
            S_EMPTYTAG, '\t', A_SKIP, S_TAGWS,
            S_ENT, 0, A_ENTITY, S_ENT,
            S_ENT, -1, A_ENTITY, S_DONE,
            S_EQ, '=', A_SKIP, S_AVAL,
            S_EQ, '>', A_ADUP_STAGC, S_PCDATA,
            S_EQ, 0, A_ADUP_SAVE, S_ANAME,
            S_EQ, -1, A_ADUP_STAGC, S_DONE,
            S_EQ, ' ', A_SKIP, S_EQ,
            S_EQ, '\n', A_SKIP, S_EQ,
            S_EQ, '\t', A_SKIP, S_EQ,
            S_ETAG, '>', A_ETAG, S_PCDATA,
            S_ETAG, 0, A_SAVE, S_ETAG,
            S_ETAG, -1, A_ETAG, S_DONE,
            S_ETAG, ' ', A_SKIP, S_ETAG,
            S_ETAG, '\n', A_SKIP, S_ETAG,
            S_ETAG, '\t', A_SKIP, S_ETAG,
            S_GI, '/', A_SKIP, S_EMPTYTAG,
            S_GI, '>', A_GI_STAGC, S_PCDATA,
            S_GI, 0, A_SAVE, S_GI,
            S_GI, -1, A_SKIP, S_DONE,
            S_GI, ' ', A_GI, S_TAGWS,
            S_GI, '\n', A_GI, S_TAGWS,
            S_GI, '\t', A_GI, S_TAGWS,
            S_NCR, 0, A_ENTITY, S_NCR,
            S_NCR, -1, A_ENTITY, S_DONE,
            S_PCDATA, '&', A_ENTITY_START, S_ENT,
            S_PCDATA, '<', A_PCDATA, S_TAG,
            S_PCDATA, 0, A_SAVE, S_PCDATA,
            S_PCDATA, -1, A_PCDATA, S_DONE,
            S_PI, '>', A_PI, S_PCDATA,
            S_PI, 0, A_SAVE, S_PI,
            S_PI, -1, A_PI, S_DONE,
            S_PITARGET, '>', A_PITARGET_PI, S_PCDATA,
            S_PITARGET, 0, A_SAVE, S_PITARGET,
            S_PITARGET, -1, A_PITARGET_PI, S_DONE,
            S_PITARGET, ' ', A_PITARGET, S_PI,
            S_PITARGET, '\n', A_PITARGET, S_PI,
            S_PITARGET, '\t', A_PITARGET, S_PI,
            S_QUOT, '"', A_AVAL, S_TAGWS,
            S_QUOT, 0, A_SAVE, S_QUOT,
            S_QUOT, -1, A_AVAL_STAGC, S_DONE,
            S_QUOT, ' ', A_SP, S_QUOT,
            S_QUOT, '\n', A_SP, S_QUOT,
            S_QUOT, '\t', A_SP, S_QUOT,
            S_STAGC, '>', A_AVAL_STAGC, S_PCDATA,
            S_STAGC, 0, A_SAVE, S_STAGC,
            S_STAGC, -1, A_AVAL_STAGC, S_DONE,
            S_STAGC, ' ', A_AVAL, S_TAGWS,
            S_STAGC, '\n', A_AVAL, S_TAGWS,
            S_STAGC, '\t', A_AVAL, S_TAGWS,
            S_TAG, '!', A_SKIP, S_DECL,
            S_TAG, '/', A_SKIP, S_ETAG,
            S_TAG, '?', A_SKIP, S_PITARGET,
            S_TAG, '<', A_SAVE, S_TAG,
            S_TAG, 0, A_SAVE, S_GI,
            S_TAG, -1, A_LT_PCDATA, S_DONE,
            S_TAG, ' ', A_LT, S_PCDATA,
            S_TAG, '\n', A_LT, S_PCDATA,
            S_TAG, '\t', A_LT, S_PCDATA,
            S_TAGWS, '/', A_SKIP, S_EMPTYTAG,
            S_TAGWS, '>', A_STAGC, S_PCDATA,
            S_TAGWS, 0, A_SAVE, S_ANAME,
            S_TAGWS, -1, A_STAGC, S_DONE,
            S_TAGWS, ' ', A_SKIP, S_TAGWS,
            S_TAGWS, '\n', A_SKIP, S_TAGWS,
            S_TAGWS, '\t', A_SKIP, S_TAGWS,
            S_XNCR, 0, A_ENTITY, S_XNCR,
            S_XNCR, -1, A_ENTITY, S_DONE,

        };
        private static readonly string[] debug_actionnames = { "", "A_ADUP", "A_ADUP_SAVE", "A_ADUP_STAGC", "A_ANAME", "A_ANAME_ADUP", "A_ANAME_ADUP_STAGC", "A_AVAL", "A_AVAL_STAGC", "A_CDATA", "A_CMNT", "A_DECL", "A_EMPTYTAG", "A_ENTITY", "A_ENTITY_START", "A_ETAG", "A_GI", "A_GI_STAGC", "A_LT", "A_LT_PCDATA", "A_MINUS", "A_MINUS2", "A_MINUS3", "A_PCDATA", "A_PI", "A_PITARGET", "A_PITARGET_PI", "A_SAVE", "A_SKIP", "A_SP", "A_STAGC", "A_UNGET", "A_UNSAVE_PCDATA" };
        private static readonly string[] debug_statenames = { "", "S_ANAME", "S_APOS", "S_AVAL", "S_BB", "S_BBC", "S_BBCD", "S_BBCDA", "S_BBCDAT", "S_BBCDATA", "S_CDATA", "S_CDATA2", "S_CDSECT", "S_CDSECT1", "S_CDSECT2", "S_COM", "S_COM2", "S_COM3", "S_COM4", "S_DECL", "S_DECL2", "S_DONE", "S_EMPTYTAG", "S_ENT", "S_EQ", "S_ETAG", "S_GI", "S_NCR", "S_PCDATA", "S_PI", "S_PITARGET", "S_QUOT", "S_STAGC", "S_TAG", "S_TAGWS", "S_XNCR" };

        // End of state table

        // Windows chars map: replacements for code points 0x80..0x9F.
        private static readonly int[] theWinMap = {
            0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
            0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD,
            0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
            0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178};

        /// <summary>
        /// Index into the state table for [state][input character + 2].
        /// The state table consists of 4-entry runs on the form
        /// { current state, input character, action, next state }.
        /// We precompute the index into the state table for all possible
        /// { current state, input character } and store the result in
        /// the statetableIndex array. Since only some input characters
        /// are present in the state table, we only do the computation for
        /// characters 0 to the highest character value in the state table.
        /// An input character of -2 is used to cover all other characters
        /// as -2 is guaranteed not to match any input character entry
        /// in the state table.
        /// <para/>
        /// When doing lookups, the input character should first be tested
        /// to be in the range [-1 (inclusive), statetableIndexMaxChar (exclusive)].
        /// If it isn't, use -2 as the input character.
        /// <para/>
        /// Finally, add 2 to the input character to cover for the fact that
        /// array indexes cannot be negative. Then look up
        /// the value in the statetableIndex. If the value is -1, then
        /// no action or next state was found for the { state, input } that
        /// you had. If it isn't -1, then action = statetable[value + 2] and
        /// next state = statetable[value + 3]. That is, the value points
        /// to the start of the answer 4-tuple in the statetable.
        /// </summary>
        private static readonly short[][] statetableIndex;

        /// <summary>
        /// The highest character value seen in the statetable, plus one.
        /// See the doc comment for statetableIndex to see how this
        /// is used.
        /// </summary>
        private static readonly int statetableIndexMaxChar;

        private string thePublicid; // Locator state
        private string theSystemid;
        private int theLastLine;
        private int theLastColumn;
        private int theCurrentLine;
        private int theCurrentColumn;

        private int theState; // Current state
        private int theNextState; // Next state
        private char[] theOutputBuffer = new char[200]; // Output buffer
        private int theSize; // Current buffer size

        /// <summary>
        /// Builds the shared lookup index once per process. The index is static
        /// data derived only from <c>statetable</c>, so it must not be rebuilt
        /// (and reassigned under concurrently running scanners) each time a
        /// scanner instance is created.
        /// </summary>
        static HTMLScanner()
        {
            int maxState = -1;
            int maxChar = -1;
            for (int i = 0; i < statetable.Length; i += 4)
            {
                if (statetable[i] > maxState)
                {
                    maxState = statetable[i];
                }
                if (statetable[i + 1] > maxChar)
                {
                    maxChar = statetable[i + 1];
                }
            }
            statetableIndexMaxChar = maxChar + 1;

            statetableIndex = new short[maxState + 1][];
            for (int i = 0; i <= maxState; i++)
            {
                // +3: slots for -2 (anything else), -1 (end of input), 0..maxChar.
                statetableIndex[i] = new short[maxChar + 3];
            }

            for (int state = 0; state <= maxState; ++state)
            {
                for (int ch = -2; ch <= maxChar; ++ch)
                {
                    int hit = -1;
                    int action = 0;
                    for (int i = 0; i < statetable.Length; i += 4)
                    {
                        if (state != statetable[i])
                        {
                            // Rows of one state are contiguous: once a row of
                            // this state has matched, leaving the run ends the search.
                            if (action != 0)
                            {
                                break;
                            }
                            continue;
                        }
                        if (statetable[i + 1] == 0)
                        {
                            // Default row; keep looking for an exact match.
                            hit = i;
                            action = statetable[i + 2];
                        }
                        else if (statetable[i + 1] == ch)
                        {
                            hit = i;
                            action = statetable[i + 2];
                            break;
                        }
                    }
                    statetableIndex[state][ch + 2] = (short)hit;
                }
            }
        }

        /// <summary>
        /// Creates a new scanner.
        /// </summary>
        public HTMLScanner()
        {
        }

        // Locator implementation

        /// <summary>
        /// Gets the line of the most recently marked scan position.
        /// </summary>
        public virtual int LineNumber
        {
            get { return theLastLine; }
        }

        /// <summary>
        /// Gets the column of the most recently marked scan position.
        /// </summary>
        public virtual int ColumnNumber
        {
            get { return theLastColumn; }
        }

        /// <summary>
        /// Gets the public id supplied to <see cref="ResetDocumentLocator(string, string)"/>.
        /// </summary>
        public virtual string PublicId
        {
            get { return thePublicid; }
        }

        /// <summary>
        /// Gets the system id supplied to <see cref="ResetDocumentLocator(string, string)"/>.
        /// </summary>
        public virtual string SystemId
        {
            get { return theSystemid; }
        }


        // Scanner implementation

        /// <summary>
        /// Reset document locator, supplying systemid and publicid.
        /// Line and column counters are reset to zero.
        /// </summary>
        /// <param name="publicid">Public id</param>
        /// <param name="systemid">System id</param>
        public virtual void ResetDocumentLocator(string publicid, string systemid)
        {
            thePublicid = publicid;
            theSystemid = systemid;
            theLastLine = theLastColumn = theCurrentLine = theCurrentColumn = 0;
        }

        /// <summary>
        /// Scan HTML source, reporting lexical events.
        /// </summary>
        /// <param name="r">Reader that provides characters</param>
        /// <param name="h">ScanHandler that accepts lexical events.</param>
        /// <exception cref="Exception">
        /// The state table has no entry for the current state and input
        /// character, or names an action this method does not implement.
        /// </exception>
        public virtual void Scan(TextReader r, IScanHandler h)
        {
            theState = S_PCDATA;

            int firstChar = r.Peek(); // Remove any leading BOM
            if (firstChar == '\uFEFF')
            {
                r.Read();
            }

            while (theState != S_DONE)
            {
                // The character is only peeked here and consumed at the bottom
                // of the loop, unless an action decides it has to be "unread"
                // and seen again in the next state.
                int ch = r.Peek();
                bool unread = false;

                // Process control characters
                if (ch >= 0x80 && ch <= 0x9F)
                {
                    ch = theWinMap[ch - 0x80];
                }

                if (ch == '\r')
                {
                    r.Read();
                    ch = r.Peek(); // expect LF next
                    if (ch != '\n')
                    {
                        // Lone CR: report a newline and leave the peeked
                        // character in the reader for the next iteration.
                        unread = true;
                        ch = '\n';
                    }
                }

                if (ch == '\n')
                {
                    theCurrentLine++;
                    theCurrentColumn = 0;
                }
                else
                {
                    theCurrentColumn++;
                }

                if (!(ch >= 0x20 || ch == '\n' || ch == '\t' || ch == -1))
                {
                    // Ignored control character. It has only been peeked so
                    // far, so it must be consumed here; otherwise the next
                    // iteration would peek the very same character forever.
                    r.Read();
                    continue;
                }

                // Search state table
                int adjCh = (ch >= -1 && ch < statetableIndexMaxChar) ? ch : -2;
                int statetableRow = statetableIndex[theState][adjCh + 2];
                int action = 0;
                if (statetableRow != -1)
                {
                    action = statetable[statetableRow + 2];
                    theNextState = statetable[statetableRow + 3];
                }

                // System.err.println("In " + debug_statenames[theState] + " got " + nicechar(ch) + " doing " + debug_actionnames[action] + " then " + debug_statenames[theNextState]);
                switch (action)
                {
                    case 0:
                        throw new Exception(
                            "HTMLScanner can't cope with " + ch + " in state " +
                            theState);
                    case A_ADUP:
                        h.Adup(theOutputBuffer, 0, theSize);
                        theSize = 0;
                        break;
                    case A_ADUP_SAVE:
                        h.Adup(theOutputBuffer, 0, theSize);
                        theSize = 0;
                        Save(ch, h);
                        break;
                    case A_ADUP_STAGC:
                        h.Adup(theOutputBuffer, 0, theSize);
                        theSize = 0;
                        h.STagC(theOutputBuffer, 0, theSize);
                        break;
                    case A_ANAME:
                        h.Aname(theOutputBuffer, 0, theSize);
                        theSize = 0;
                        break;
                    case A_ANAME_ADUP:
                        h.Aname(theOutputBuffer, 0, theSize);
                        theSize = 0;
                        h.Adup(theOutputBuffer, 0, theSize);
                        break;
                    case A_ANAME_ADUP_STAGC:
                        h.Aname(theOutputBuffer, 0, theSize);
                        theSize = 0;
                        h.Adup(theOutputBuffer, 0, theSize);
                        h.STagC(theOutputBuffer, 0, theSize);
                        break;
                    case A_AVAL:
                        h.Aval(theOutputBuffer, 0, theSize);
                        theSize = 0;
                        break;
                    case A_AVAL_STAGC:
                        h.Aval(theOutputBuffer, 0, theSize);
                        theSize = 0;
                        h.STagC(theOutputBuffer, 0, theSize);
                        break;
                    case A_CDATA:
                        Mark();
                        // suppress the final "]]" in the buffer
                        if (theSize > 1)
                        {
                            theSize -= 2;
                        }
                        h.PCDATA(theOutputBuffer, 0, theSize);
                        theSize = 0;
                        break;
                    case A_ENTITY_START:
                        h.PCDATA(theOutputBuffer, 0, theSize);
                        theSize = 0;
                        Save(ch, h);
                        break;
                    case A_ENTITY:
                        Mark();
                        char ch1 = (char)ch;
                        // System.out.println("Got " + ch1 + " in state " + ((theState == S_ENT) ? "S_ENT" : ((theState == S_NCR) ? "S_NCR" : "UNK")));
                        if (theState == S_ENT && ch1 == '#')
                        {
                            theNextState = S_NCR;
                            Save(ch, h);
                            break;
                        }
                        else if (theState == S_NCR && (ch1 == 'x' || ch1 == 'X'))
                        {
                            theNextState = S_XNCR;
                            Save(ch, h);
                            break;
                        }
                        else if (theState == S_ENT && char.IsLetterOrDigit(ch1))
                        {
                            Save(ch, h);
                            break;
                        }
                        else if (theState == S_NCR && char.IsDigit(ch1))
                        {
                            Save(ch, h);
                            break;
                        }
                        else if (theState == S_XNCR && (char.IsDigit(ch1) || "abcdefABCDEF".IndexOf(ch1) != -1))
                        {
                            Save(ch, h);
                            break;
                        }

                        // The whole entity reference has been collected
                        // System.err.println("%%" + new String(theOutputBuffer, 0, theSize));
                        h.Entity(theOutputBuffer, 1, theSize - 1);
                        int ent = h.GetEntity();
                        // System.err.println("%% value = " + ent);
                        if (ent != 0)
                        {
                            theSize = 0;
                            if (ent >= 0x80 && ent <= 0x9F)
                            {
                                ent = theWinMap[ent - 0x80];
                            }
                            if (ent < 0x20)
                            {
                                // Control becomes space
                                ent = 0x20;
                            }
                            else if (ent >= 0xD800 && ent <= 0xDFFF)
                            {
                                // Surrogates get dropped
                                ent = 0;
                            }
                            else if (ent <= 0xFFFF)
                            {
                                // BMP character
                                Save(ent, h);
                            }
                            else
                            {
                                // Astral converted to two surrogates
                                ent -= 0x10000;
                                Save((ent >> 10) + 0xD800, h);
                                Save((ent & 0x3FF) + 0xDC00, h);
                            }
                            if (ch != ';')
                            {
                                // The terminator is not part of the entity.
                                unread = true;
                                theCurrentColumn--;
                            }
                        }
                        else
                        {
                            unread = true;
                            theCurrentColumn--;
                        }
                        theNextState = S_PCDATA;
                        break;
                    case A_ETAG:
                        h.ETag(theOutputBuffer, 0, theSize);
                        theSize = 0;
                        break;
                    case A_DECL:
                        h.Decl(theOutputBuffer, 0, theSize);
                        theSize = 0;
                        break;
                    case A_GI:
                        h.GI(theOutputBuffer, 0, theSize);
                        theSize = 0;
                        break;
                    case A_GI_STAGC:
                        h.GI(theOutputBuffer, 0, theSize);
                        theSize = 0;
                        h.STagC(theOutputBuffer, 0, theSize);
                        break;
                    case A_LT:
                        Mark();
                        Save('<', h);
                        Save(ch, h);
                        break;
                    case A_LT_PCDATA:
                        Mark();
                        Save('<', h);
                        h.PCDATA(theOutputBuffer, 0, theSize);
                        theSize = 0;
                        break;
                    case A_PCDATA:
                        Mark();
                        h.PCDATA(theOutputBuffer, 0, theSize);
                        theSize = 0;
                        break;
                    case A_CMNT:
                        Mark();
                        h.Cmnt(theOutputBuffer, 0, theSize);
                        theSize = 0;
                        break;
                    case A_MINUS3:
                        Save('-', h);
                        Save(' ', h);
                        break;
                    case A_MINUS2:
                        Save('-', h);
                        Save(' ', h);
                        // C# has no implicit fall-through, so the A_MINUS
                        // actions are repeated here.
                        Save('-', h);
                        Save(ch, h);
                        break;
                    case A_MINUS:
                        Save('-', h);
                        Save(ch, h);
                        break;
                    case A_PI:
                        Mark();
                        h.PI(theOutputBuffer, 0, theSize);
                        theSize = 0;
                        break;
                    case A_PITARGET:
                        h.PITarget(theOutputBuffer, 0, theSize);
                        theSize = 0;
                        break;
                    case A_PITARGET_PI:
                        h.PITarget(theOutputBuffer, 0, theSize);
                        theSize = 0;
                        h.PI(theOutputBuffer, 0, theSize);
                        break;
                    case A_SAVE:
                        Save(ch, h);
                        break;
                    case A_SKIP:
                        break;
                    case A_SP:
                        Save(' ', h);
                        break;
                    case A_STAGC:
                        h.STagC(theOutputBuffer, 0, theSize);
                        theSize = 0;
                        break;
                    case A_EMPTYTAG:
                        Mark();
                        // System.err.println("%%% Empty tag seen");
                        if (theSize > 0)
                        {
                            h.GI(theOutputBuffer, 0, theSize);
                        }
                        theSize = 0;
                        h.STagE(theOutputBuffer, 0, theSize);
                        break;
                    case A_UNGET:
                        unread = true;
                        theCurrentColumn--;
                        break;
                    case A_UNSAVE_PCDATA:
                        if (theSize > 0)
                        {
                            theSize--;
                        }
                        h.PCDATA(theOutputBuffer, 0, theSize);
                        theSize = 0;
                        break;
                    default:
                        throw new Exception("Can't process state " + action);
                }
                if (!unread)
                {
                    // Consume the character that was peeked at the top of the loop.
                    r.Read();
                }
                theState = theNextState;
            }
            h.EOF(theOutputBuffer, 0, 0);
        }

        /// <summary>
        /// Mark the current scan position as a "point of interest" - start of a tag,
        /// cdata, processing instruction etc.
        /// </summary>
        private void Mark()
        {
            theLastColumn = theCurrentColumn;
            theLastLine = theCurrentLine;
        }

        /// <summary>
        /// A callback for the ScanHandler that allows it to force
        /// the lexer state to CDATA content (no markup is recognized except
        /// the end of element).
        /// </summary>
        public virtual void StartCDATA() { theNextState = S_CDATA; }

        /// <summary>
        /// Appends a character to the output buffer. When the buffer is within
        /// 20 characters of being full it is either flushed to the handler as a
        /// chunk of PCDATA (in the PCDATA and CDATA states) or doubled in size.
        /// </summary>
        /// <param name="ch">The character to append (truncated to a <see cref="char"/>)</param>
        /// <param name="h">The handler that receives flushed PCDATA</param>
        private void Save(int ch, IScanHandler h)
        {
            if (theSize >= theOutputBuffer.Length - 20)
            {
                if (theState == S_PCDATA || theState == S_CDATA)
                {
                    // Return a buffer-sized chunk of PCDATA
                    h.PCDATA(theOutputBuffer, 0, theSize);
                    theSize = 0;
                }
                else
                {
                    // Grow the buffer size, keeping the characters collected so far
                    Array.Resize(ref theOutputBuffer, theOutputBuffer.Length * 2);
                }
            }
            theOutputBuffer[theSize++] = (char)ch;
        }

        /**
        Test procedure. Reads HTML from the standard input and writes
        PYX to the standard output.
        */

        // public static void main(string[] argv) {
        //     IScanner s = new HTMLScanner();
        //     TextReader r = new StreamReader(System.in, "UTF-8");
        //     TextWriter w = new StreamWriter(System.out, "UTF-8");
        //     PYXWriter pw = new PYXWriter(w);
        //     s.scan(r, pw);
        //     w.close();
        // }


        /// <summary>
        /// Formats a character value for the debug trace: <c>\n</c> for a
        /// newline, hexadecimal for other values below 32, otherwise the quoted
        /// character.
        /// </summary>
        private static string NiceChar(int value)
        {
            if (value == '\n')
            {
                return "\\n";
            }
            if (value < 32)
            {
                return "0x" + value.ToString("X");
            }
            return "'" + ((char)value) + "'";
        }
    }
}
