[01/12] lucenenet git commit: Ported StreamTokenizer from Apache Harmony

nightowl888 Tue, 01 Aug 2017 21:19:42 -0700

Repository: lucenenet
Updated Branches:
  refs/heads/master db1f605cd -> ea879c611



Ported StreamTokenizer from Apache Harmony


Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/05c8a040
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/05c8a040
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/05c8a040

Branch: refs/heads/master
Commit: 05c8a040818010ed608791f71505d2a005f0ed48
Parents: db1f605
Author: Shad Storhaug <[email protected]>
Authored: Tue Jul 25 20:16:50 2017 +0700
Committer: Shad Storhaug <[email protected]>
Committed: Tue Jul 25 20:16:50 2017 +0700

----------------------------------------------------------------------
 src/Lucene.Net.Tests/Lucene.Net.Tests.csproj    |   1 +
 .../Support/IO/TestStreamTokenizer.cs           | 514 +++++++++++++
 src/Lucene.Net/Lucene.Net.csproj                |   1 +
 src/Lucene.Net/Support/IO/StreamTokenizer.cs    | 738 +++++++++++++++++++
 4 files changed, 1254 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucenenet/blob/05c8a040/src/Lucene.Net.Tests/Lucene.Net.Tests.csproj
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests/Lucene.Net.Tests.csproj 
b/src/Lucene.Net.Tests/Lucene.Net.Tests.csproj
index de94c52..7631329 100644
--- a/src/Lucene.Net.Tests/Lucene.Net.Tests.csproj
+++ b/src/Lucene.Net.Tests/Lucene.Net.Tests.csproj
@@ -520,6 +520,7 @@
     <Compile Include="Support\IO\TestByteBuffer.cs" />
     <Compile Include="Support\IO\TestHeapByteBuffer.cs" />
     <Compile Include="Support\IO\TestReadOnlyHeapByteBuffer.cs" />
+    <Compile Include="Support\IO\TestStreamTokenizer.cs" />
     <Compile Include="Support\SmallObject.cs" />
     <Compile Include="Support\TestPriorityQueue.cs" />
     <Compile Include="Support\Threading\TestCloseableThreadLocal.cs" />

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/05c8a040/src/Lucene.Net.Tests/Support/IO/TestStreamTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests/Support/IO/TestStreamTokenizer.cs 
b/src/Lucene.Net.Tests/Support/IO/TestStreamTokenizer.cs
new file mode 100644
index 0000000..5e50163
--- /dev/null
+++ b/src/Lucene.Net.Tests/Support/IO/TestStreamTokenizer.cs
@@ -0,0 +1,514 @@
+// This class was sourced from the Apache Harmony project
+// https://svn.apache.org/repos/asf/harmony/enhanced/java/trunk/
+
+using Lucene.Net.Util;
+using NUnit.Framework;
+using System;
+using System.IO;
+using System.Text;
+
+namespace Lucene.Net.Support.IO
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    [TestFixture]
+    public class TestStreamTokenizer : LuceneTestCase
+    {
+        StringReader r;
+
+        StreamTokenizer st;
+
+        String testString;
+
+        /**
+         * @tests java.io.StreamTokenizer#StreamTokenizer(java.io.InputStream)
+         */
+        [Test]
+        public void Test_ConstructorLSystem_IO_InputStream()
+        {
+#pragma warning disable 612, 618
+            st = new StreamTokenizer(new MemoryStream(
+#pragma warning restore 612, 618
+                Encoding.UTF8.GetBytes("/comments\n d 8 'h'")));
+
+
+            assertEquals("the next token returned should be the letter d",
+                     StreamTokenizer.TT_WORD, st.NextToken());
+
+            assertEquals("the next token returned should be the letter d",
+                     "d", st.StringValue);
+
+
+            assertEquals("the next token returned should be the digit 8",
+                     StreamTokenizer.TT_NUMBER, st.NextToken());
+
+            assertEquals("the next token returned should be the digit 8",
+                     8.0, st.NumberValue);
+
+
+            assertEquals("the next token returned should be the quote 
character",
+                     39, st.NextToken());
+
+            assertEquals("the next token returned should be the quote 
character",
+                     "h", st.StringValue);
+        }
+
+        /**
+         * @tests java.io.StreamTokenizer#StreamTokenizer(java.io.Reader)
+         */
+        [Test]
+        public void Test_ConstructorLSystem_IO_TextReader()
+        {
+            setTest("/testing\n d 8 'h' ");
+            assertEquals("the next token returned should be the letter d 
skipping the comments",
+                     StreamTokenizer.TT_WORD, st.NextToken());
+            assertEquals("the next token returned should be the letter d",
+                     "d", st.StringValue);
+
+            assertEquals("the next token returned should be the digit 8",
+                     StreamTokenizer.TT_NUMBER, st.NextToken());
+            assertEquals("the next token returned should be the digit 8",
+                     8.0, st.NumberValue);
+
+            assertEquals("the next token returned should be the quote 
character",
+                     39, st.NextToken());
+            assertEquals("the next token returned should be the quote 
character",
+                     "h", st.StringValue);
+        }
+
+        /**
+         * @tests java.io.StreamTokenizer#commentChar(int)
+         */
+        [Test]
+        public void Test_commentCharI()
+        {
+            setTest("*comment \n / 8 'h' ");
+            st.OrdinaryChar('/');
+            st.CommentChar('*');
+            assertEquals("nextToken() did not return the character / skiping 
the comments starting with *",
+                     47, st.NextToken());
+            assertTrue("the next token returned should be the digit 8", st
+                   .NextToken() == StreamTokenizer.TT_NUMBER
+                   && st.NumberValue == 8.0);
+            assertTrue("the next token returned should be the quote character",
+                   st.NextToken() == 39 && st.StringValue.equals("h"));
+        }
+
+        /**
+         * @tests java.io.StreamTokenizer#eolIsSignificant(boolean)
+         */
+        [Test]
+        public void Test_eolIsSignificantZ()
+        {
+            setTest("d 8\n");
+            // by default end of line characters are not significant
+            assertTrue("nextToken did not return d",
+                   st.NextToken() == StreamTokenizer.TT_WORD
+                   && st.StringValue.equals("d"));
+            assertTrue("nextToken did not return 8",
+                   st.NextToken() == StreamTokenizer.TT_NUMBER
+                   && st.NumberValue == 8.0);
+            assertTrue("nextToken should be the end of file",
+                   st.NextToken() == StreamTokenizer.TT_EOF);
+            setTest("d\n");
+            st.IsEOLSignificant = (true);
+            // end of line characters are significant
+            assertTrue("nextToken did not return d",
+                   st.NextToken() == StreamTokenizer.TT_WORD
+                   && st.StringValue.equals("d"));
+            assertTrue("nextToken is the end of line",
+                   st.NextToken() == StreamTokenizer.TT_EOL);
+        }
+
+        /**
+         * @tests java.io.StreamTokenizer#lineno()
+         */
+        [Test]
+        public void Test_lineno()
+        {
+            setTest("d\n 8\n");
+            assertEquals("the lineno should be 1", 1, st.LineNumber);
+            st.NextToken();
+            st.NextToken();
+            assertEquals("the lineno should be 2", 2, st.LineNumber);
+            st.NextToken();
+            assertEquals("the next line no should be 3", 3, st.LineNumber);
+        }
+
+        /**
+         * @tests java.io.StreamTokenizer#lowerCaseMode(boolean)
+         */
+        [Test]
+        public void Test_lowerCaseModeZ()
+        {
+            // SM.
+            setTest("HELLOWORLD");
+            st.LowerCaseMode = (true);
+
+            st.NextToken();
+            assertEquals("sval not converted to lowercase.", "helloworld", 
st.StringValue
+                     );
+        }
+
+        /**
+         * @tests java.io.StreamTokenizer#nextToken()
+         */
+        [Test]
+        public void Test_nextToken()
+        {
+            // SM.
+            // LUCENENET NOTE: The original test had \257 (which is octal)
+            // that is not supported in a .NET string, so we convert to 
decimal 175 here.
+            // This also changes the semantics of the test, because for 
whatever
+            // reason in Java it was expecting the octal number to register as 
a TT_WORD.
+            // So, we changed to expect a TT_NUMBER as a result of the above 
change.
+            // Also, we don't need to escape single quotes in .NET.
+            setTest("\r\n/* fje fje 43.4 f \r\n f g */  456.459 \r\n"
+                    + "Hello  /        \r\n \r\n \n \r 175 Hi 'Hello World'");
+            st.OrdinaryChar('/');
+            st.SlashStarComments = true;
+            st.NextToken();
+            assertTrue("Wrong Token type1: " + (char)st.TokenType,
+                   st.TokenType == StreamTokenizer.TT_NUMBER);
+            st.NextToken();
+            assertTrue("Wrong Token type2: " + st.TokenType,
+                   st.TokenType == StreamTokenizer.TT_WORD);
+            st.NextToken();
+            assertTrue("Wrong Token type3: " + st.TokenType, st.TokenType == 
'/');
+            st.NextToken();
+            assertTrue("Wrong Token type4: " + st.TokenType,
+                   st.TokenType == StreamTokenizer.TT_NUMBER);
+            st.NextToken();
+            assertTrue("Wrong Token type5: " + st.TokenType,
+                   st.TokenType == StreamTokenizer.TT_WORD);
+            st.NextToken();
+            assertTrue("Wrong Token type6: " + st.TokenType, st.TokenType == 
'\'');
+            assertTrue("Wrong Token type7: " + st.TokenType, st.StringValue
+                   .equals("Hello World"));
+            st.NextToken();
+            assertTrue("Wrong Token type8: " + st.TokenType, st.TokenType == 
-1);
+
+            using (var pin = new 
MemoryStream(Encoding.UTF8.GetBytes("hello\n\r\r")))
+            {
+#pragma warning disable 612, 618
+                StreamTokenizer s = new StreamTokenizer(pin);
+#pragma warning restore 612, 618
+                s.IsEOLSignificant = (true);
+
+                assertTrue("Wrong token 1,1",
+                       s.NextToken() == StreamTokenizer.TT_WORD
+                       && s.StringValue.equals("hello"));
+
+                assertTrue("Wrong token 1,2", s.NextToken() == '\n');
+
+                assertTrue("Wrong token 1,3", s.NextToken() == '\n');
+
+                assertTrue("Wrong token 1,4", s.NextToken() == '\n');
+
+
+                assertTrue("Wrong token 1,5",
+                       s.NextToken() == StreamTokenizer.TT_EOF);
+            }
+            StreamTokenizer tokenizer = new StreamTokenizer(
+                                    new StringReader("\n \r\n#"));
+            tokenizer.OrdinaryChar('\n'); // make \n ordinary
+            tokenizer.IsEOLSignificant = (true);
+
+            assertTrue("Wrong token 2,1", tokenizer.NextToken() == '\n');
+
+            assertTrue("Wrong token 2,2", tokenizer.NextToken() == '\n');
+
+            assertEquals("Wrong token 2,3", '#', tokenizer.NextToken());
+        }
+
+        /**
+         * @tests java.io.StreamTokenizer#ordinaryChar(int)
+         */
+        [Test]
+        public void Test_ordinaryCharI()
+        {
+            // SM.
+            setTest("Ffjein 893");
+            st.OrdinaryChar('F');
+            st.NextToken();
+            assertTrue("OrdinaryChar failed." + (char)st.TokenType,
+                       st.TokenType == 'F');
+        }
+
+        /**
+         * @tests java.io.StreamTokenizer#ordinaryChars(int, int)
+         */
+        [Test]
+        public void Test_ordinaryCharsII()
+        {
+            // SM.
+            setTest("azbc iof z 893");
+            st.OrdinaryChars('a', 'z');
+            assertEquals("OrdinaryChars failed.", 'a', st.NextToken());
+            assertEquals("OrdinaryChars failed.", 'z', st.NextToken());
+        }
+
+        /**
+         * @tests java.io.StreamTokenizer#parseNumbers()
+         */
+        [Test]
+        public void Test_parseNumbers()
+        {
+            // SM
+            setTest("9.9 678");
+            assertTrue("Base behavior failed.",
+                       st.NextToken() == StreamTokenizer.TT_NUMBER);
+            st.OrdinaryChars('0', '9');
+            assertEquals("setOrdinary failed.", '6', st.NextToken());
+            st.ParseNumbers();
+            assertTrue("parseNumbers failed.",
+                       st.NextToken() == StreamTokenizer.TT_NUMBER);
+        }
+
+        /**
+         * @tests java.io.StreamTokenizer#pushBack()
+         */
+        [Test]
+        public void Test_pushBack()
+        {
+            // SM.
+            setTest("Hello 897");
+            st.NextToken();
+            st.PushBack();
+            assertTrue("PushBack failed.",
+                       st.NextToken() == StreamTokenizer.TT_WORD);
+        }
+
+        /**
+         * @tests java.io.StreamTokenizer#quoteChar(int)
+         */
+        [Test]
+        public void Test_quoteCharI()
+        {
+            // SM
+            setTest("<Hello World<    HelloWorldH");
+            st.QuoteChar('<');
+            assertEquals("QuoteChar failed.", '<', st.NextToken());
+            assertEquals("QuoteChar failed.", "Hello World", st.StringValue);
+            st.QuoteChar('H');
+            st.NextToken();
+            assertEquals("QuoteChar failed for word.", "elloWorld", 
st.StringValue
+                         );
+        }
+
+        /**
+         * @tests java.io.StreamTokenizer#resetSyntax()
+         */
+        [Test]
+        public void Test_resetSyntax()
+        {
+            // SM
+            setTest("H 9\' ello World");
+            st.ResetSyntax();
+            assertTrue("resetSyntax failed1." + (char)st.TokenType,
+                       st.NextToken() == 'H');
+            assertTrue("resetSyntax failed1." + (char)st.TokenType,
+                       st.NextToken() == ' ');
+            assertTrue("resetSyntax failed2." + (char)st.TokenType,
+                       st.NextToken() == '9');
+            assertTrue("resetSyntax failed3." + (char)st.TokenType,
+                       st.NextToken() == '\'');
+        }
+
+        /**
+         * @tests java.io.StreamTokenizer#slashSlashComments(boolean)
+         */
+        [Test]
+        public void Test_slashSlashCommentsZ()
+        {
+            // SM.
+            setTest("// foo \r\n /fiji \r\n -456");
+            st.OrdinaryChar('/');
+            st.SlashSlashComments = (true);
+            assertEquals("Test failed.", '/', st.NextToken());
+            assertTrue("Test failed.",
+                       st.NextToken() == StreamTokenizer.TT_WORD);
+        }
+
+        /**
+         * @tests java.io.StreamTokenizer#slashSlashComments(boolean)
+         */
+        [Test]
+        public void Test_slashSlashComments_withSSOpen()
+        {
+            TextReader reader = new StringReader("t // t t t");
+
+            StreamTokenizer st = new StreamTokenizer(reader);
+            st.SlashSlashComments = (true);
+
+            assertEquals(StreamTokenizer.TT_WORD, st.NextToken());
+            assertEquals(StreamTokenizer.TT_EOF, st.NextToken());
+        }
+
+        /**
+         * @tests java.io.StreamTokenizer#slashSlashComments(boolean)
+         */
+        [Test]
+        public void Test_slashSlashComments_withSSOpen_NoComment()
+        {
+            TextReader reader = new StringReader("// t");
+
+            StreamTokenizer st = new StreamTokenizer(reader);
+            st.SlashSlashComments = (true);
+            st.OrdinaryChar('/');
+
+            assertEquals(StreamTokenizer.TT_EOF, st.NextToken());
+        }
+
+        /**
+         * @tests java.io.StreamTokenizer#slashSlashComments(boolean)
+         */
+        [Test]
+        public void Test_slashSlashComments_withSSClosed()
+        {
+            TextReader reader = new StringReader("// t");
+
+            StreamTokenizer st = new StreamTokenizer(reader);
+            st.SlashSlashComments = (false);
+            st.OrdinaryChar('/');
+
+            assertEquals('/', st.NextToken());
+            assertEquals('/', st.NextToken());
+            assertEquals(StreamTokenizer.TT_WORD, st.NextToken());
+        }
+
+        /**
+         * @tests java.io.StreamTokenizer#slashStarComments(boolean)
+         */
+        [Test]
+        public void Test_slashStarCommentsZ()
+        {
+            setTest("/* foo \r\n /fiji \r\n*/ -456");
+            st.OrdinaryChar('/');
+            st.SlashStarComments = (true);
+            assertTrue("Test failed.",
+                       st.NextToken() == StreamTokenizer.TT_NUMBER);
+        }
+
+        /**
+         * @tests java.io.StreamTokenizer#slashStarComments(boolean)
+         */
+        [Test]
+        public void Test_slashStarComments_withSTOpen()
+        {
+            TextReader reader = new StringReader("t /* t */ t");
+
+            StreamTokenizer st = new StreamTokenizer(reader);
+            st.SlashStarComments = (true);
+
+            assertEquals(StreamTokenizer.TT_WORD, st.NextToken());
+            assertEquals(StreamTokenizer.TT_WORD, st.NextToken());
+            assertEquals(StreamTokenizer.TT_EOF, st.NextToken());
+        }
+
+        /**
+         * @tests java.io.StreamTokenizer#slashStarComments(boolean)
+         */
+        [Test]
+        public void Test_slashStarComments_withSTClosed()
+        {
+            TextReader reader = new StringReader("t /* t */ t");
+
+            StreamTokenizer st = new StreamTokenizer(reader);
+            st.SlashStarComments = (false);
+
+            assertEquals(StreamTokenizer.TT_WORD, st.NextToken());
+            assertEquals(StreamTokenizer.TT_EOF, st.NextToken());
+        }
+
+        /**
+         * @tests java.io.StreamTokenizer#toString()
+         */
+        [Test]
+        public void Test_toString()
+        {
+            setTest("ABC Hello World");
+            st.NextToken();
+            assertTrue("toString failed." + st.toString(),
+                       st.toString().equals(
+                                "Token[ABC], line 1"));
+
+            // Regression test for HARMONY-4070
+            byte[] data = new byte[] { (byte)'-' };
+#pragma warning disable 612, 618
+            StreamTokenizer tokenizer = new StreamTokenizer(
+                    new MemoryStream(data));
+#pragma warning restore 612, 618
+            tokenizer.NextToken();
+            String result = tokenizer.toString();
+            assertEquals("Token['-'], line 1", result);
+        }
+
+        /**
+         * @tests java.io.StreamTokenizer#whitespaceChars(int, int)
+         */
+        [Test]
+        public void Test_whitespaceCharsII()
+        {
+            setTest("azbc iof z 893");
+            st.WhitespaceChars('a', 'z');
+            assertTrue("OrdinaryChar failed.",
+                       st.NextToken() == StreamTokenizer.TT_NUMBER);
+        }
+
+        /**
+         * @tests java.io.StreamTokenizer#wordChars(int, int)
+         */
+        [Test]
+        public void Test_wordCharsII()
+        {
+            setTest("A893 -9B87");
+            st.WordChars('0', '9');
+            assertTrue("WordChar failed1.",
+                       st.NextToken() == StreamTokenizer.TT_WORD);
+            assertEquals("WordChar failed2.", "A893", st.StringValue);
+            assertTrue("WordChar failed3.",
+                       st.NextToken() == StreamTokenizer.TT_NUMBER);
+            st.NextToken();
+            assertEquals("WordChar failed4.", "B87", st.StringValue);
+
+            setTest("    Hello World");
+            st.WordChars(' ', ' ');
+            st.NextToken();
+            assertEquals("WordChars failed for whitespace.", "Hello World", 
st.StringValue
+                         );
+
+            setTest("    Hello World\r\n  \'Hello World\' Hello\' World");
+            st.WordChars(' ', ' ');
+            st.WordChars('\'', '\'');
+            st.NextToken();
+            assertTrue("WordChars failed for whitespace: " + st.StringValue, 
st.StringValue
+                       .equals("Hello World"));
+            st.NextToken();
+            assertTrue("WordChars failed for quote1: " + st.StringValue, 
st.StringValue
+                       .equals("\'Hello World\' Hello\' World"));
+        }
+
+        private void setTest(string s)
+        {
+            testString = s;
+            r = new StringReader(testString);
+            st = new StreamTokenizer(r);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/05c8a040/src/Lucene.Net/Lucene.Net.csproj
----------------------------------------------------------------------
diff --git a/src/Lucene.Net/Lucene.Net.csproj b/src/Lucene.Net/Lucene.Net.csproj
index 3dc6d30..e29c82a 100644
--- a/src/Lucene.Net/Lucene.Net.csproj
+++ b/src/Lucene.Net/Lucene.Net.csproj
@@ -652,6 +652,7 @@
     <Compile Include="Support\IO\FileStreamExtensions.cs" />
     <Compile Include="Support\ICallable.cs" />
     <Compile Include="Support\ICharSequence.cs" />
+    <Compile Include="Support\IO\StreamTokenizer.cs" />
     <Compile Include="Support\RectangularArrays.cs" />
     <Compile Include="Support\Search\ReferenceContext.cs" />
     <Compile Include="Support\Search\ReferenceManagerExtensions.cs" />

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/05c8a040/src/Lucene.Net/Support/IO/StreamTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net/Support/IO/StreamTokenizer.cs 
b/src/Lucene.Net/Support/IO/StreamTokenizer.cs
new file mode 100644
index 0000000..ecfb5d5
--- /dev/null
+++ b/src/Lucene.Net/Support/IO/StreamTokenizer.cs
@@ -0,0 +1,738 @@
+// This class was sourced from the Apache Harmony project
+// https://svn.apache.org/repos/asf/harmony/enhanced/java/trunk/
+
+using System;
+using System.Globalization;
+using System.IO;
+using System.Text;
+
+namespace Lucene.Net.Support.IO
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Parses a stream into a set of defined tokens, one at a time. The 
different
+    /// types of tokens that can be found are numbers, identifiers, quoted 
strings,
+    /// and different comment styles. The class can be used for limited 
processing
+    /// of source code of programming languages like Java, although it is 
nowhere
+    /// near a full parser.
+    /// </summary>
+    public class StreamTokenizer
+    {
+        /// <summary>
+        /// Contains a number if the current token is a number 
+        /// (<see cref="TokenType"/> == <see cref="TT_NUMBER"/>).
+        /// </summary>
+        public double NumberValue { get; set; }
+
+        /// <summary>
+        /// Contains a string if the current token is a word 
+        /// (<see cref="TokenType"/> == <see cref="TT_WORD"/>).
+        /// </summary>
+        public string StringValue { get; set; }
+
+        /// <summary>
+        /// The constant representing the end of the stream.
+        /// </summary>
+        public const int TT_EOF = -1;
+
+        /// <summary>
+        /// The constant representing the end of the line.
+        /// </summary>
+        public const int TT_EOL = '\n';
+
+        /// <summary>
+        /// The constant representing a number token.
+        /// </summary>
+        public const int TT_NUMBER = -2;
+
+        /// <summary>
+        /// The constant representing a word token.
+        /// </summary>
+        public const int TT_WORD = -3;
+
+        /// <summary>
+        /// Internal representation of unknown state.
+        /// </summary>
+        private const int TT_UNKNOWN = -4;
+
+        /// <summary>
+        /// After calling <see cref="NextToken()"/>, this property contains the type 
of
+        /// token that has been read. When a single character is read, its 
value
+        /// converted to an integer is stored in this property. For a quoted 
string,
+        /// the value is the quote character. Otherwise, its value is one of 
the
+        /// following:
+        /// <list type="bullet">
+        ///     <item><description><see cref="TT_WORD"/> - the token is a 
word.</description></item>
+        ///     <item><description><see cref="TT_NUMBER"/> - the token is a 
number.</description></item>
+        ///     <item><description><see cref="TT_EOL"/> - the end of line has 
been reached. Depends on
+        ///     whether <see cref="IsEOLSignificant"/> is 
<c>true</c>.</description></item>
+        ///     <item><description><see cref="TT_EOF"/> - the end of the 
stream has been reached.</description></item>
+        /// </list>
+        /// </summary>
+        public int TokenType { get; private set; } = TT_UNKNOWN;
+
+        /// <summary>
+        /// Internal character meanings, 0 implies TOKEN_ORDINARY
+        /// </summary>
+        private byte[] tokenTypes = new byte[256];
+
+        private static readonly byte TOKEN_COMMENT = 1;
+        private static readonly byte TOKEN_QUOTE = 2;
+        private static readonly byte TOKEN_WHITE = 4;
+        private static readonly byte TOKEN_WORD = 8;
+        private static readonly byte TOKEN_DIGIT = 16;
+
+        private int lineNumber = 1;
+        private bool forceLowercase;
+        private bool isEOLSignificant;
+        private bool slashStarComments;
+        private bool slashSlashComments;
+        private bool pushBackToken;
+        private bool lastCr;
+
+        /// <summary>One of these will have the stream</summary>
+        private Stream inStream;
+        private TextReader inReader;
+        private int peekChar = -2;
+
+        /// <summary>
+        /// Private constructor to initialize the default values according to 
the
+        /// specification.
+        /// </summary>
+        private StreamTokenizer()
+        {
+            /*
+             * Initialize the default state per specification. All byte values 
'A'
+             * through 'Z', 'a' through 'z', and '\u00A0' through '\u00FF' are
+             * considered to be alphabetic.
+             */
+            WordChars('A', 'Z');
+            WordChars('a', 'z');
+            WordChars(160, 255);
+            /**
+             * All byte values '\u0000' through '\u0020' are considered to be 
white
+             * space.
+             */
+            WhitespaceChars(0, 32);
+            /**
+             * '/' is a comment character. Single quote '\'' and double quote 
'"'
+             * are string quote characters.
+             */
+            CommentChar('/');
+            QuoteChar('"');
+            QuoteChar('\'');
+            /**
+             * Numbers are parsed.
+             */
+            ParseNumbers();
+            /**
+             * Ends of lines are treated as white space, not as separate 
tokens.
+             * C-style and C++-style comments are not recognized. These are the
+             * defaults and are not needed in constructor.
+             */
+        }
+
+        /// <summary>
+        /// Constructs a new <see cref="StreamTokenizer"/> with <paramref 
name="input"/> as source input
+        /// stream. This constructor is deprecated; instead, the constructor 
that
+        /// takes a <see cref="TextReader"/> as an argument should be used.
+        /// </summary>
+        /// <param name="input">the source stream from which to parse 
tokens.</param>
+        /// <exception cref="ArgumentNullException">If <paramref 
name="input"/> is <c>null</c>.</exception>
+        [Obsolete("Use StreamTokenizer(TextReader)")]
+        public StreamTokenizer(Stream input)
+            : this() // Calls private constructor
+        {
+            if (input == null)
+            {
+                throw new ArgumentNullException("input");
+            }
+            inStream = input;
+        }
+
+        /// <summary>
+        /// Constructs a new <see cref="StreamTokenizer"/> with <paramref name="reader"/> as source 
reader.
+        /// The tokenizer's initial state is as follows:
+        /// <list type="bullet">
+        ///     <item><description>All byte values 'A' through 'Z', 'a' 
through 'z', and '&#92;u00A0' through '&#92;u00FF' are considered to be 
alphabetic.</description></item>
+        ///     <item><description>All byte values '&#92;u0000' through 
'&#92;u0020' are considered to be white space. '/' is a comment 
character.</description></item>
+        ///     <item><description>Single quote '\'' and double quote '"' are 
string quote characters.</description></item>
+        ///     <item><description>Numbers are parsed.</description></item>
+        ///     <item><description>End of lines are considered to be white 
space rather than separate tokens.</description></item>
+        ///     <item><description>C-style and C++-style comments are not 
recognized.</description></item>
+        /// </list>
+        /// </summary>
+        /// <param name="reader">The source text reader from which to parse 
tokens.</param>
+        public StreamTokenizer(TextReader reader)
+            : this() // Calls private constructor
+        {
+            if (reader == null)
+            {
+                throw new ArgumentNullException("reader");
+            }
+            inReader = reader;
+        }
+
+        /// <summary>
+        /// Specifies that the character <paramref name="ch"/> shall be 
treated as a comment
+        /// character.
+        /// </summary>
+        /// <param name="ch">The character to be considered a comment 
character.</param>
+        public virtual void CommentChar(int ch)
+        {
+            if (0 <= ch && ch < tokenTypes.Length)
+            {
+                tokenTypes[ch] = TOKEN_COMMENT;
+            }
+        }
+
+        /// <summary>
+        /// Specifies whether the end of a line is significant and should be 
returned
+        /// as <see cref="TT_EOL"/> in <see cref="TokenType"/> by this 
tokenizer.
+        /// <c>true</c> if EOL is significant, <c>false</c> otherwise.
+        /// </summary>
+        public virtual bool IsEOLSignificant
+        {
+            get { return isEOLSignificant; }
+            set { isEOLSignificant = value; }
+        }
+
+        /// <summary>
+        /// Gets the current line number. The counter is advanced by
+        /// <see cref="NextToken()"/> as it passes line terminators while skipping
+        /// white space or a slash-star comment.
+        /// </summary>
+        public int LineNumber
+        {
+            get
+            {
+                return lineNumber;
+            }
+        }
+
+        /// <summary>
+        /// Gets or sets whether word tokens are lower-cased before being stored in
+        /// <see cref="StringValue"/>. When <c>true</c>, <see cref="NextToken()"/>
+        /// stores the invariant lower-case form of each word token; when
+        /// <c>false</c>, the word is stored as read.
+        /// </summary>
+        public bool LowerCaseMode
+        {
+            get
+            {
+                return forceLowercase;
+            }
+            set
+            {
+                forceLowercase = value;
+            }
+        }
+
+        /// <summary>
+        /// Parses the next token from this tokenizer's source stream or reader. The
+        /// type of the token is stored in the <see cref="TokenType"/> field, additional
+        /// information may be stored in the <see cref="NumberValue"/> or <see cref="StringValue"/> fields.
+        /// </summary>
+        /// <remarks>
+        /// If <see cref="PushBack()"/> was called since the last token, the current
+        /// <see cref="TokenType"/> is returned again without reading any input
+        /// (unless it is <c>TT_UNKNOWN</c>).
+        /// </remarks>
+        /// <returns>The value of <see cref="TokenType"/>.</returns>
+        /// <exception cref="IOException">If an I/O error occurs while parsing the next token.</exception>
+        public int NextToken()
+        {
+            if (pushBackToken)
+            {
+                pushBackToken = false;
+                if (TokenType != TT_UNKNOWN)
+                {
+                    return TokenType;
+                }
+            }
+            StringValue = null; // Always reset sval to null
+            // peekChar == -2 means that no look-ahead character is buffered.
+            int currentChar = peekChar == -2 ? Read() : peekChar;
+
+            if (lastCr)
+            {
+                // The previous token was an EOL produced by '\r'. Only the character
+                // immediately following that '\r' can complete a "\r\n" pair, so the
+                // flag is cleared unconditionally; leaving it set would swallow a
+                // later, unrelated '\n'.
+                lastCr = false;
+                if (currentChar == '\n')
+                {
+                    currentChar = Read();
+                }
+            }
+            if (currentChar == -1)
+            {
+                return (TokenType = TT_EOF);
+            }
+
+            // Characters outside the 256-entry table are always word characters.
+            byte currentType = currentChar > 255 ? TOKEN_WORD
+                    : tokenTypes[currentChar];
+            while ((currentType & TOKEN_WHITE) != 0)
+            {
+                // Skip over white space until we hit a new line or a real token
+                if (currentChar == '\r')
+                {
+                    lineNumber++;
+                    if (isEOLSignificant)
+                    {
+                        // Remember the '\r' so that a directly following '\n'
+                        // is not reported as a second EOL.
+                        lastCr = true;
+                        peekChar = -2;
+                        return (TokenType = TT_EOL);
+                    }
+                    if ((currentChar = Read()) == '\n')
+                    {
+                        currentChar = Read();
+                    }
+                }
+                else if (currentChar == '\n')
+                {
+                    lineNumber++;
+                    if (isEOLSignificant)
+                    {
+                        peekChar = -2;
+                        return (TokenType = TT_EOL);
+                    }
+                    currentChar = Read();
+                }
+                else
+                {
+                    // Advance over this white space character and try again.
+                    currentChar = Read();
+                }
+                if (currentChar == -1)
+                {
+                    return (TokenType = TT_EOF);
+                }
+                currentType = currentChar > 255 ? TOKEN_WORD
+                        : tokenTypes[currentChar];
+            }
+
+            // Check for digits before checking for words since digits can be
+            // contained within words.
+            if ((currentType & TOKEN_DIGIT) != 0)
+            {
+                StringBuilder digits = new StringBuilder(20);
+                bool haveDecimal = false, checkJustNegative = currentChar == '-';
+                while (true)
+                {
+                    if (currentChar == '.')
+                    {
+                        haveDecimal = true;
+                    }
+                    digits.Append((char)currentChar);
+                    currentChar = Read();
+                    // Stop at the first character that is neither a decimal digit
+                    // nor the first '.' of the number.
+                    if ((currentChar < '0' || currentChar > '9')
+                            && (haveDecimal || currentChar != '.'))
+                    {
+                        break;
+                    }
+                }
+                peekChar = currentChar;
+                if (checkJustNegative && digits.Length == 1)
+                {
+                    // Didn't get any other digits other than '-'
+                    return (TokenType = '-');
+                }
+                try
+                {
+                    NumberValue = double.Parse(digits.ToString(), CultureInfo.InvariantCulture);
+                }
+                catch (FormatException)
+                {
+                    // Text such as "." or "-." is not a valid number; report it as 0.
+                    NumberValue = 0;
+                }
+                return (TokenType = TT_NUMBER);
+            }
+            // Check for words
+            if ((currentType & TOKEN_WORD) != 0)
+            {
+                StringBuilder word = new StringBuilder(20);
+                while (true)
+                {
+                    word.Append((char)currentChar);
+                    currentChar = Read();
+                    // A word continues over word and digit characters; anything
+                    // above 255 is treated as part of the word.
+                    if (currentChar == -1
+                            || (currentChar < 256 && (tokenTypes[currentChar] & (TOKEN_WORD | TOKEN_DIGIT)) == 0))
+                    {
+                        break;
+                    }
+                }
+                peekChar = currentChar;
+                StringValue = forceLowercase ? word.ToString().ToLowerInvariant() : word.ToString();
+                return (TokenType = TT_WORD);
+            }
+            // Check for quoted character
+            if (currentType == TOKEN_QUOTE)
+            {
+                int matchQuote = currentChar;
+                StringBuilder quoteString = new StringBuilder();
+                int peekOne = Read();
+                // The string ends at the matching quote, at a line terminator or at EOF.
+                while (peekOne >= 0 && peekOne != matchQuote && peekOne != '\r'
+                        && peekOne != '\n')
+                {
+                    bool readPeek = true;
+                    if (peekOne == '\\')
+                    {
+                        int c1 = Read();
+                        // Check for quoted octal IE: \377
+                        if (c1 <= '7' && c1 >= '0')
+                        {
+                            int digitValue = c1 - '0';
+                            c1 = Read();
+                            if (c1 > '7' || c1 < '0')
+                            {
+                                readPeek = false;
+                            }
+                            else
+                            {
+                                digitValue = digitValue * 8 + (c1 - '0');
+                                c1 = Read();
+                                // Limit the digit value to a byte: a third octal digit is
+                                // only accepted while the value so far is at most 31
+                                // (octal 37). C# has no octal literals, so the limit
+                                // must not be written as 037 (that is decimal 37).
+                                if (digitValue > 0x1F || c1 > '7' || c1 < '0')
+                                {
+                                    readPeek = false;
+                                }
+                                else
+                                {
+                                    digitValue = digitValue * 8 + (c1 - '0');
+                                }
+                            }
+                            if (!readPeek)
+                            {
+                                // We've consumed one too many
+                                quoteString.Append((char)digitValue);
+                                peekOne = c1;
+                            }
+                            else
+                            {
+                                peekOne = digitValue;
+                            }
+                        }
+                        else
+                        {
+                            switch (c1)
+                            {
+                                case 'a':
+                                    peekOne = 0x7;
+                                    break;
+                                case 'b':
+                                    peekOne = 0x8;
+                                    break;
+                                case 'f':
+                                    peekOne = 0xc;
+                                    break;
+                                case 'n':
+                                    peekOne = 0xA;
+                                    break;
+                                case 'r':
+                                    peekOne = 0xD;
+                                    break;
+                                case 't':
+                                    peekOne = 0x9;
+                                    break;
+                                case 'v':
+                                    peekOne = 0xB;
+                                    break;
+                                default:
+                                    peekOne = c1;
+                                    break;
+                            }
+                        }
+                    }
+                    if (readPeek)
+                    {
+                        quoteString.Append((char)peekOne);
+                        peekOne = Read();
+                    }
+                }
+                if (peekOne == matchQuote)
+                {
+                    // Consume the closing quote.
+                    peekOne = Read();
+                }
+                peekChar = peekOne;
+                TokenType = matchQuote;
+                StringValue = quoteString.ToString();
+                return TokenType;
+            }
+            // Do comments, both "//" and "/*stuff*/"
+            if (currentChar == '/' && (slashSlashComments || slashStarComments))
+            {
+                if ((currentChar = Read()) == '*' && slashStarComments)
+                {
+                    int peekOne = Read();
+                    while (true)
+                    {
+                        currentChar = peekOne;
+                        peekOne = Read();
+                        if (currentChar == -1)
+                        {
+                            // Unterminated comment: report EOF now and on later calls.
+                            peekChar = -1;
+                            return (TokenType = TT_EOF);
+                        }
+                        if (currentChar == '\r')
+                        {
+                            if (peekOne == '\n')
+                            {
+                                peekOne = Read();
+                            }
+                            lineNumber++;
+                        }
+                        else if (currentChar == '\n')
+                        {
+                            lineNumber++;
+                        }
+                        else if (currentChar == '*' && peekOne == '/')
+                        {
+                            peekChar = Read();
+                            return NextToken();
+                        }
+                    }
+                }
+                else if (currentChar == '/' && slashSlashComments)
+                {
+                    // Skip to EOF or new line then return the next token
+                    while ((currentChar = Read()) >= 0 && currentChar != '\r'
+                            && currentChar != '\n')
+                    {
+                        // Intentionally empty
+                    }
+                    peekChar = currentChar;
+                    return NextToken();
+                }
+                else if (currentType != TOKEN_COMMENT)
+                {
+                    // Was just a slash by itself
+                    peekChar = currentChar;
+                    return (TokenType = '/');
+                }
+            }
+            // Check for comment character
+            if (currentType == TOKEN_COMMENT)
+            {
+                // Skip to EOF or new line then return the next token
+                while ((currentChar = Read()) >= 0 && currentChar != '\r'
+                        && currentChar != '\n')
+                {
+                    // Intentionally empty
+                }
+                peekChar = currentChar;
+                return NextToken();
+            }
+
+            // Ordinary character: it is its own token type.
+            peekChar = Read();
+            return (TokenType = currentChar);
+        }
+
+        /// <summary>
+        /// Makes the character <paramref name="ch"/> an ordinary character: every
+        /// attribute it had (comment character, word component, white space,
+        /// string delimiter or number) is cleared. Values that fall outside the
+        /// character table are silently ignored.
+        /// </summary>
+        /// <param name="ch">The character to be considered an ordinary character.</param>
+        public void OrdinaryChar(int ch)
+        {
+            bool outsideTable = ch < 0 || ch >= tokenTypes.Length;
+            if (outsideTable)
+            {
+                return;
+            }
+            tokenTypes[ch] = 0;
+        }
+
+        /// <summary>
+        /// Specifies that the characters in the range from <paramref name="low"/> to <paramref name="hi"/>
+        /// (both inclusive) shall be treated as ordinary characters by this tokenizer.
+        /// That is, they have no special meaning as a comment character, word component,
+        /// white space, string delimiter or number. The range is clamped to the
+        /// character table, so out-of-range bounds never throw.
+        /// </summary>
+        /// <param name="low">The first character in the range of ordinary characters.</param>
+        /// <param name="hi">The last character in the range of ordinary characters.</param>
+        public void OrdinaryChars(int low, int hi)
+        {
+            if (low < 0)
+            {
+                low = 0;
+            }
+            // ">=" (not ">"): hi is inclusive, so hi == Length is already one past
+            // the last valid index.
+            if (hi >= tokenTypes.Length)
+            {
+                hi = tokenTypes.Length - 1;
+            }
+            for (int i = low; i <= hi; i++)
+            {
+                tokenTypes[i] = 0;
+            }
+        }
+
+        /// <summary>
+        /// Specifies that this tokenizer shall parse numbers: the digit attribute is
+        /// added to '0' through '9', '.' and '-' without removing any attribute
+        /// those characters already have.
+        /// </summary>
+        public void ParseNumbers()
+        {
+            foreach (char numberChar in "0123456789.-")
+            {
+                tokenTypes[numberChar] |= TOKEN_DIGIT;
+            }
+        }
+
+        /// <summary>
+        /// Indicates that the current token should be pushed back and 
returned again
+        /// the next time <see cref="NextToken()"/> is called.
+        /// </summary>
+        /// <remarks>
+        /// Only a flag is set; no input is re-read.
+        /// </remarks>
+        public void PushBack()
+        {
+            // Checked and cleared at the top of NextToken().
+            pushBackToken = true;
+        }
+
+        /// <summary>
+        /// Marks the character <paramref name="ch"/> as a string quote character,
+        /// replacing whatever attributes were previously assigned to it.
+        /// Values that fall outside the character table are silently ignored.
+        /// </summary>
+        /// <param name="ch">The character to be considered a quote character.</param>
+        public void QuoteChar(int ch)
+        {
+            bool outsideTable = ch < 0 || ch >= tokenTypes.Length;
+            if (outsideTable)
+            {
+                return;
+            }
+            tokenTypes[ch] = TOKEN_QUOTE;
+        }
+
+        /// <summary>
+        /// Reads the next unit of input from whichever source this tokenizer was
+        /// given: the byte stream when one is set, otherwise the text reader.
+        /// </summary>
+        /// <returns>The value returned by the underlying source; callers treat -1 as end of input.</returns>
+        private int Read()
+        {
+            return inStream != null ? inStream.ReadByte() : inReader.Read();
+        }
+
+        /// <summary>
+        /// Clears the attributes of the first 256 character table entries so that
+        /// all of those characters are treated as ordinary characters.
+        /// </summary>
+        public void ResetSyntax()
+        {
+            Array.Clear(tokenTypes, 0, 256);
+        }
+
+        /// <summary>
+        /// Gets or sets whether "slash-slash" (C++-style) comments are recognized.
+        /// Such a comment runs to the end of the line.
+        /// <c>true</c> if <c>//</c> starts a comment, <c>false</c> otherwise.
+        /// </summary>
+        public bool SlashSlashComments
+        {
+            get
+            {
+                return slashSlashComments;
+            }
+            set
+            {
+                slashSlashComments = value;
+            }
+        }
+
+        /// <summary>
+        /// Gets or sets whether "slash-star" (C-style) comments are recognized.
+        /// These comments do not nest; the first star-slash combination ends them.
+        /// <c>true</c> if <c>/*</c> starts a comment, <c>false</c> otherwise.
+        /// </summary>
+        public bool SlashStarComments
+        {
+            get
+            {
+                return slashStarComments;
+            }
+            set
+            {
+                slashStarComments = value;
+            }
+        }
+
+        /// <summary>
+        /// Returns the state of this tokenizer in a readable format:
+        /// <c>Token[...], line N</c>, where the bracketed part depends on the
+        /// current <see cref="TokenType"/>.
+        /// </summary>
+        /// <returns>The current state of this tokenizer.</returns>
+        public override string ToString()
+        {
+            // Values determined through experimentation
+            StringBuilder result = new StringBuilder();
+            result.Append("Token[");
+            switch (TokenType)
+            {
+                case TT_EOF:
+                    result.Append("EOF");
+                    break;
+                case TT_EOL:
+                    result.Append("EOL");
+                    break;
+                case TT_NUMBER:
+                    result.Append("n=");
+                    // Numbers are parsed with the invariant culture, so format them the
+                    // same way; otherwise the decimal separator would depend on the
+                    // culture of the current thread.
+                    result.Append(NumberValue.ToString(CultureInfo.InvariantCulture));
+                    break;
+                case TT_WORD:
+                    result.Append(StringValue);
+                    break;
+                default:
+                    // No token read yet, or a quoted string: show the string value.
+                    if (TokenType == TT_UNKNOWN || tokenTypes[TokenType] == TOKEN_QUOTE)
+                    {
+                        result.Append(StringValue);
+                    }
+                    else
+                    {
+                        // Ordinary character: show it between single quotes.
+                        result.Append('\'');
+                        result.Append((char)TokenType);
+                        result.Append('\'');
+                    }
+                    break;
+            }
+            result.Append("], line ");
+            result.Append(lineNumber);
+            return result.ToString();
+        }
+
+        /// <summary>
+        /// Specifies that the characters in the range from <paramref name="low"/> to <paramref name="hi"/>
+        /// (both inclusive) shall be treated as whitespace characters by this tokenizer.
+        /// Any attributes previously assigned to those characters are replaced. The
+        /// range is clamped to the character table, so out-of-range bounds never throw.
+        /// </summary>
+        /// <param name="low">The first character in the range of whitespace characters.</param>
+        /// <param name="hi">The last character in the range of whitespace characters.</param>
+        public void WhitespaceChars(int low, int hi)
+        {
+            if (low < 0)
+            {
+                low = 0;
+            }
+            // ">=" (not ">"): hi is inclusive, so hi == Length is already one past
+            // the last valid index.
+            if (hi >= tokenTypes.Length)
+            {
+                hi = tokenTypes.Length - 1;
+            }
+            for (int i = low; i <= hi; i++)
+            {
+                tokenTypes[i] = TOKEN_WHITE;
+            }
+        }
+
+        /// <summary>
+        /// Specifies that the characters in the range from <paramref name="low"/> to <paramref name="hi"/>
+        /// (both inclusive) shall be treated as word characters by this tokenizer. A word
+        /// consists of a word character followed by zero or more word or number characters.
+        /// The word attribute is added to whatever attributes the characters already
+        /// have. The range is clamped to the character table, so out-of-range bounds
+        /// never throw.
+        /// </summary>
+        /// <param name="low">The first character in the range of word characters.</param>
+        /// <param name="hi">The last character in the range of word characters.</param>
+        public void WordChars(int low, int hi)
+        {
+            if (low < 0)
+            {
+                low = 0;
+            }
+            // ">=" (not ">"): hi is inclusive, so hi == Length is already one past
+            // the last valid index.
+            if (hi >= tokenTypes.Length)
+            {
+                hi = tokenTypes.Length - 1;
+            }
+            for (int i = low; i <= hi; i++)
+            {
+                tokenTypes[i] |= TOKEN_WORD;
+            }
+        }
+    }
+}

[01/12] lucenenet git commit: Ported StreamTokenizer from Apache Harmony

Reply via email to