NightOwl888 commented on code in PR #1154: URL: https://github.com/apache/lucenenet/pull/1154#discussion_r2038138668
########## src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs: ########## @@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj) /// <summary> /// Load the datafile into this <see cref="WordDictionary"/> /// </summary> - /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> - /// <returns>number of words read</returns> + /// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param> + /// <returns>Number of words read</returns> /// <exception cref="IOException">If there is a low-level I/O error.</exception> private int LoadMainDataFromFile(string dctFilePath) { - int i, cnt, length, total = 0; - // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. + int total = 0; + + // The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760). // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET: Use BinaryReader to simplify endian conversion and stream reading. + using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read)) + using (var reader = new BinaryReader(dctFile)) { - // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - // if (i == 5231) Review Comment: Please leave upstream comments in place. 
########## src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs: ########## @@ -254,80 +254,83 @@ private void Load(string dictRoot) /// <summary> /// Load the datafile into this <see cref="BigramDictionary"/> /// </summary> - /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary (bigramdict.dct)</param> + /// <param name="dctFilePath">Path to the Bigramdictionary (bigramdict.dct)</param> /// <exception cref="IOException">If there is a low-level I/O error</exception> public virtual void LoadFromFile(string dctFilePath) { - int i, cnt, length, total = 0; // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. // The 3756th is used (as a header) to store information. - int[] Review Comment: Please add this temporary buffer back in, but allocate it on the stack. ```c# Span<int> buffer = stackalloc int[3]; ``` ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs: ########## @@ -254,80 +254,83 @@ private void Load(string dictRoot) /// <summary> /// Load the datafile into this <see cref="BigramDictionary"/> /// </summary> - /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary (bigramdict.dct)</param> + /// <param name="dctFilePath">Path to the Bigramdictionary (bigramdict.dct)</param> /// <exception cref="IOException">If there is a low-level I/O error</exception> public virtual void LoadFromFile(string dctFilePath) { - int i, cnt, length, total = 0; // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. 
+ // LUCENENET specific - refactored constants for clarity + const int HEADER_POSITION = 3755; + const int MAX_VALID_LENGTH = 1000; + //using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r")) using var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read); + using var reader = new BinaryReader(dctFile); // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - string currentStr = GetCCByGB2312Id(i); - // if (i == 5231) - // System.out.println(i); Review Comment: Please leave upstream comments in place. ########## src/Lucene.Net.Tests.Analysis.SmartCn/DictionaryTests.cs: ########## @@ -0,0 +1,72 @@ +using Lucene.Net.Util; +using Lucene.Net.Analysis.Cn.Smart.Hhmm; +using Lucene.Net.Attributes; +using NUnit.Framework; +using System; +using System.IO; +using System.Reflection; + Review Comment: Please use our custom `Assert` class, not the one from NUnit. ```c# using Assert = Lucene.Net.TestFramework.Assert; ``` ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs: ########## @@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj) /// <summary> /// Load the datafile into this <see cref="WordDictionary"/> /// </summary> - /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> - /// <returns>number of words read</returns> + /// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param> + /// <returns>Number of words read</returns> /// <exception cref="IOException">If there is a low-level I/O error.</exception> private int LoadMainDataFromFile(string dctFilePath) { - int i, cnt, length, total = 0; - // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. + int total = 0; + + // The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760). // The 3756th is used (as a header) to store information. 
- int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET: Use BinaryReader to simplify endian conversion and stream reading. + using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read)) + using (var reader = new BinaryReader(dctFile)) { - // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - // if (i == 5231) - // System.out.println(i); + int cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32(); if (cnt <= 0) { wordItem_charArrayTable[i] = null; wordItem_frequencyTable[i] = null; continue; } + wordItem_charArrayTable[i] = new char[cnt][]; wordItem_frequencyTable[i] = new int[cnt]; total += cnt; - int j = 0; - while (j < cnt) + + for (int j = 0; j < cnt; j++) { - // wordItemTable[i][j] = new WordItem(); - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// frequency - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// length - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[2] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// handle - - // wordItemTable[i][j].frequency = buffer[0]; - wordItem_frequencyTable[i][j] = buffer[0]; - - length = buffer[1]; + // LUCENENET: Use BinaryReader methods instead of ByteBuffer + int frequency = reader.ReadInt32(); + int length = 
reader.ReadInt32(); + reader.ReadInt32(); // Skip handle (unused) Review Comment: Please set this to `buffer[2]` as it was upstream. ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs: ########## @@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj) /// <summary> /// Load the datafile into this <see cref="WordDictionary"/> /// </summary> - /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> - /// <returns>number of words read</returns> + /// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param> + /// <returns>Number of words read</returns> /// <exception cref="IOException">If there is a low-level I/O error.</exception> private int LoadMainDataFromFile(string dctFilePath) { - int i, cnt, length, total = 0; - // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. + int total = 0; + + // The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760). // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET: Use BinaryReader to simplify endian conversion and stream reading. 
+ using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read)) + using (var reader = new BinaryReader(dctFile)) { - // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - // if (i == 5231) - // System.out.println(i); + int cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32(); if (cnt <= 0) { wordItem_charArrayTable[i] = null; wordItem_frequencyTable[i] = null; continue; } + wordItem_charArrayTable[i] = new char[cnt][]; wordItem_frequencyTable[i] = new int[cnt]; total += cnt; - int j = 0; - while (j < cnt) + + for (int j = 0; j < cnt; j++) { - // wordItemTable[i][j] = new WordItem(); - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// frequency - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// length - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[2] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// handle - - // wordItemTable[i][j].frequency = buffer[0]; - wordItem_frequencyTable[i][j] = buffer[0]; - - length = buffer[1]; + // LUCENENET: Use BinaryReader methods instead of ByteBuffer + int frequency = reader.ReadInt32(); + int length = reader.ReadInt32(); + reader.ReadInt32(); // Skip handle (unused) + + wordItem_frequencyTable[i][j] = frequency; + if (length > 0) { - byte[] lchBuffer = new byte[length]; - dctFile.Read(lchBuffer, 0, lchBuffer.Length); - tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance 
from base class + byte[] lchBuffer = reader.ReadBytes(length); + string tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET: Use cached encoding instance from base class wordItem_charArrayTable[i][j] = tmpword.ToCharArray(); } else { - // wordItemTable[i][j].charArray = null; Review Comment: Please leave upstream comments in place. ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs: ########## @@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj) /// <summary> /// Load the datafile into this <see cref="WordDictionary"/> /// </summary> - /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> - /// <returns>number of words read</returns> + /// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param> + /// <returns>Number of words read</returns> /// <exception cref="IOException">If there is a low-level I/O error.</exception> private int LoadMainDataFromFile(string dctFilePath) { - int i, cnt, length, total = 0; Review Comment: Please define the variables the same way as the upstream code. Someday we may need to merge upstream changes, so it helps a lot if we don't have to re-evaluate the business logic. ```c# int i, cnt, length, total = 0; ``` ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs: ########## @@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj) /// <summary> /// Load the datafile into this <see cref="WordDictionary"/> /// </summary> - /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> - /// <returns>number of words read</returns> + /// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param> + /// <returns>Number of words read</returns> /// <exception cref="IOException">If there is a low-level I/O error.</exception> private int LoadMainDataFromFile(string dctFilePath) { - int i, cnt, length, total = 0; - // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. 
+ int total = 0; + + // The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760). // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET: Use BinaryReader to simplify endian conversion and stream reading. + using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read)) + using (var reader = new BinaryReader(dctFile)) { - // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - // if (i == 5231) - // System.out.println(i); + int cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32(); if (cnt <= 0) { wordItem_charArrayTable[i] = null; wordItem_frequencyTable[i] = null; continue; } + wordItem_charArrayTable[i] = new char[cnt][]; wordItem_frequencyTable[i] = new int[cnt]; total += cnt; - int j = 0; - while (j < cnt) + + for (int j = 0; j < cnt; j++) { - // wordItemTable[i][j] = new WordItem(); - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// frequency - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// length - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[2] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// handle - - // wordItemTable[i][j].frequency = buffer[0]; Review Comment: Please leave 
upstream comments in place. ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs: ########## @@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj) /// <summary> /// Load the datafile into this <see cref="WordDictionary"/> /// </summary> - /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> - /// <returns>number of words read</returns> + /// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param> + /// <returns>Number of words read</returns> /// <exception cref="IOException">If there is a low-level I/O error.</exception> private int LoadMainDataFromFile(string dctFilePath) { - int i, cnt, length, total = 0; - // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. + int total = 0; + + // The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760). // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET: Use BinaryReader to simplify endian conversion and stream reading. 
+ using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read)) + using (var reader = new BinaryReader(dctFile)) { - // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - // if (i == 5231) - // System.out.println(i); + int cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32(); if (cnt <= 0) { wordItem_charArrayTable[i] = null; wordItem_frequencyTable[i] = null; continue; } + wordItem_charArrayTable[i] = new char[cnt][]; wordItem_frequencyTable[i] = new int[cnt]; total += cnt; - int j = 0; Review Comment: Please do not change the variable declarations from the upstream code. ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs: ########## @@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj) /// <summary> /// Load the datafile into this <see cref="WordDictionary"/> /// </summary> - /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> - /// <returns>number of words read</returns> + /// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param> + /// <returns>Number of words read</returns> /// <exception cref="IOException">If there is a low-level I/O error.</exception> private int LoadMainDataFromFile(string dctFilePath) { - int i, cnt, length, total = 0; - // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. + int total = 0; + + // The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760). // The 3756th is used (as a header) to store information. 
- int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET: Use BinaryReader to simplify endian conversion and stream reading. + using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read)) + using (var reader = new BinaryReader(dctFile)) { - // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - // if (i == 5231) - // System.out.println(i); Review Comment: Please leave upstream comments in place. ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs: ########## @@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj) /// <summary> /// Load the datafile into this <see cref="WordDictionary"/> /// </summary> - /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> - /// <returns>number of words read</returns> + /// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param> + /// <returns>Number of words read</returns> /// <exception cref="IOException">If there is a low-level I/O error.</exception> private int LoadMainDataFromFile(string dctFilePath) { - int i, cnt, length, total = 0; - // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. + int total = 0; + + // The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760). // The 3756th is used (as a header) to store information. - int[] Review Comment: Please add this temporary buffer back in, but allocate it on the stack. 
```c# Span<int> buffer = stackalloc int[3]; ``` ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs: ########## @@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj) /// <summary> /// Load the datafile into this <see cref="WordDictionary"/> /// </summary> - /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> - /// <returns>number of words read</returns> + /// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param> + /// <returns>Number of words read</returns> /// <exception cref="IOException">If there is a low-level I/O error.</exception> private int LoadMainDataFromFile(string dctFilePath) { - int i, cnt, length, total = 0; - // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. + int total = 0; + + // The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760). // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET: Use BinaryReader to simplify endian conversion and stream reading. 
+ using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read)) + using (var reader = new BinaryReader(dctFile)) { - // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - // if (i == 5231) - // System.out.println(i); + int cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32(); if (cnt <= 0) { wordItem_charArrayTable[i] = null; wordItem_frequencyTable[i] = null; continue; } + wordItem_charArrayTable[i] = new char[cnt][]; wordItem_frequencyTable[i] = new int[cnt]; total += cnt; - int j = 0; - while (j < cnt) + + for (int j = 0; j < cnt; j++) { - // wordItemTable[i][j] = new WordItem(); - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// frequency - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// length - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[2] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// handle - - // wordItemTable[i][j].frequency = buffer[0]; - wordItem_frequencyTable[i][j] = buffer[0]; - - length = buffer[1]; + // LUCENENET: Use BinaryReader methods instead of ByteBuffer + int frequency = reader.ReadInt32(); + int length = reader.ReadInt32(); Review Comment: Please leave upstream comments in place. 
########## src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs: ########## @@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj) /// <summary> /// Load the datafile into this <see cref="WordDictionary"/> /// </summary> - /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> - /// <returns>number of words read</returns> + /// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param> + /// <returns>Number of words read</returns> /// <exception cref="IOException">If there is a low-level I/O error.</exception> private int LoadMainDataFromFile(string dctFilePath) { - int i, cnt, length, total = 0; - // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. + int total = 0; + + // The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760). // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET: Use BinaryReader to simplify endian conversion and stream reading. 
+ using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read)) + using (var reader = new BinaryReader(dctFile)) { - // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - // if (i == 5231) - // System.out.println(i); + int cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32(); if (cnt <= 0) { wordItem_charArrayTable[i] = null; wordItem_frequencyTable[i] = null; continue; } + wordItem_charArrayTable[i] = new char[cnt][]; wordItem_frequencyTable[i] = new int[cnt]; total += cnt; - int j = 0; - while (j < cnt) Review Comment: Please do not change the style of loop from the upstream code. ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs: ########## @@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj) /// <summary> /// Load the datafile into this <see cref="WordDictionary"/> /// </summary> - /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> - /// <returns>number of words read</returns> + /// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param> + /// <returns>Number of words read</returns> /// <exception cref="IOException">If there is a low-level I/O error.</exception> private int LoadMainDataFromFile(string dctFilePath) { - int i, cnt, length, total = 0; - // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. + int total = 0; + + // The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760). // The 3756th is used (as a header) to store information. 
- int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET: Use BinaryReader to simplify endian conversion and stream reading. + using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read)) + using (var reader = new BinaryReader(dctFile)) { - // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - // if (i == 5231) - // System.out.println(i); + int cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32(); if (cnt <= 0) { wordItem_charArrayTable[i] = null; wordItem_frequencyTable[i] = null; continue; } + wordItem_charArrayTable[i] = new char[cnt][]; wordItem_frequencyTable[i] = new int[cnt]; total += cnt; - int j = 0; - while (j < cnt) + + for (int j = 0; j < cnt; j++) { - // wordItemTable[i][j] = new WordItem(); - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// frequency - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// length - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[2] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// handle - - // wordItemTable[i][j].frequency = buffer[0]; - wordItem_frequencyTable[i][j] = buffer[0]; - - length = buffer[1]; + // LUCENENET: Use BinaryReader methods instead of ByteBuffer + int frequency = reader.ReadInt32(); Review Comment: Please leave 
upstream comments in place. ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs: ########## @@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj) /// <summary> /// Load the datafile into this <see cref="WordDictionary"/> /// </summary> - /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> - /// <returns>number of words read</returns> + /// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param> + /// <returns>Number of words read</returns> /// <exception cref="IOException">If there is a low-level I/O error.</exception> private int LoadMainDataFromFile(string dctFilePath) { - int i, cnt, length, total = 0; - // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. + int total = 0; + + // The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760). // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET: Use BinaryReader to simplify endian conversion and stream reading. 
+ using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read)) + using (var reader = new BinaryReader(dctFile)) { - // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - // if (i == 5231) - // System.out.println(i); + int cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32(); if (cnt <= 0) { wordItem_charArrayTable[i] = null; wordItem_frequencyTable[i] = null; continue; } + wordItem_charArrayTable[i] = new char[cnt][]; wordItem_frequencyTable[i] = new int[cnt]; total += cnt; - int j = 0; - while (j < cnt) + + for (int j = 0; j < cnt; j++) { - // wordItemTable[i][j] = new WordItem(); - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// frequency - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// length - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[2] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// handle - - // wordItemTable[i][j].frequency = buffer[0]; - wordItem_frequencyTable[i][j] = buffer[0]; - - length = buffer[1]; + // LUCENENET: Use BinaryReader methods instead of ByteBuffer + int frequency = reader.ReadInt32(); + int length = reader.ReadInt32(); + reader.ReadInt32(); // Skip handle (unused) + + wordItem_frequencyTable[i][j] = frequency; + if (length > 0) { - byte[] lchBuffer = new byte[length]; - dctFile.Read(lchBuffer, 0, lchBuffer.Length); - tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance 
from base class + byte[] lchBuffer = reader.ReadBytes(length); + string tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET: Use cached encoding instance from base class wordItem_charArrayTable[i][j] = tmpword.ToCharArray(); } else { - // wordItemTable[i][j].charArray = null; wordItem_charArrayTable[i][j] = null; } - // System.out.println(indexTable[i].wordItems[j]); Review Comment: Please leave upstream comments in place. ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs: ########## @@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj) /// <summary> /// Load the datafile into this <see cref="WordDictionary"/> /// </summary> - /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> - /// <returns>number of words read</returns> + /// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param> + /// <returns>Number of words read</returns> /// <exception cref="IOException">If there is a low-level I/O error.</exception> private int LoadMainDataFromFile(string dctFilePath) { - int i, cnt, length, total = 0; - // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. + int total = 0; + + // The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760). // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET: Use BinaryReader to simplify endian conversion and stream reading. 
+ using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read)) + using (var reader = new BinaryReader(dctFile)) { - // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - // if (i == 5231) - // System.out.println(i); + int cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32(); if (cnt <= 0) { wordItem_charArrayTable[i] = null; wordItem_frequencyTable[i] = null; continue; } + wordItem_charArrayTable[i] = new char[cnt][]; wordItem_frequencyTable[i] = new int[cnt]; total += cnt; - int j = 0; - while (j < cnt) + + for (int j = 0; j < cnt; j++) { - // wordItemTable[i][j] = new WordItem(); Review Comment: Please leave upstream comments in place. ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs: ########## @@ -254,80 +254,83 @@ private void Load(string dictRoot) /// <summary> /// Load the datafile into this <see cref="BigramDictionary"/> /// </summary> - /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary (bigramdict.dct)</param> + /// <param name="dctFilePath">Path to the Bigramdictionary (bigramdict.dct)</param> /// <exception cref="IOException">If there is a low-level I/O error</exception> public virtual void LoadFromFile(string dctFilePath) { - int i, cnt, length, total = 0; // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; Review Comment: Please declare `tmpword` here as it was done upstream. 
########## src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs: ########## @@ -254,80 +254,83 @@ private void Load(string dictRoot) /// <summary> /// Load the datafile into this <see cref="BigramDictionary"/> /// </summary> - /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary (bigramdict.dct)</param> + /// <param name="dctFilePath">Path to the Bigramdictionary (bigramdict.dct)</param> /// <exception cref="IOException">If there is a low-level I/O error</exception> public virtual void LoadFromFile(string dctFilePath) { - int i, cnt, length, total = 0; // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET specific - refactored constants for clarity + const int HEADER_POSITION = 3755; + const int MAX_VALID_LENGTH = 1000; + //using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r")) using var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read); + using var reader = new BinaryReader(dctFile); // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - string currentStr = GetCCByGB2312Id(i); - // if (i == 5231) - // System.out.println(i); - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32(); + string currentStr = GetCCByGB2312Id(i); + int cnt; + try + { + cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer + } + catch (EndOfStreamException) + { + // Reached end of 
file + break; + } + if (cnt <= 0) { continue; } - total += cnt; - int j = 0; - while (j < cnt) + + for (int j = 0; j < cnt; j++) { - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// frequency - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// length - dctFile.Read(intBuffer, 0, intBuffer.Length); - // buffer[2] = ByteBuffer.wrap(intBuffer).order( - // ByteOrder.LITTLE_ENDIAN).getInt();// handle - - length = buffer[1]; - if (length > 0) + // LUCENENET: Use BinaryReader methods instead of ByteBuffer + int frequency = reader.ReadInt32(); + int length = reader.ReadInt32(); + reader.ReadInt32(); // Skip handle value (unused) + + if (length > 0 && length <= MAX_VALID_LENGTH && dctFile.Position + length <= dctFile.Length) { - byte[] lchBuffer = new byte[length]; - dctFile.Read(lchBuffer, 0, lchBuffer.Length); + byte[] lchBuffer = reader.ReadBytes(length); // LUCENENET: Use BinaryReader methods instead of ByteBuffer + //tmpword = new String(lchBuffer, "GB2312"); - tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance from base class + string tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance from base class //tmpword = Encoding.GetEncoding("hz-gb-2312").GetString(lchBuffer); - if (i != 3755 + GB2312_FIRST_CHAR) + + + if (i != HEADER_POSITION + GB2312_FIRST_CHAR) { tmpword = currentStr + tmpword; } - char[] carray = tmpword.ToCharArray(); + + ReadOnlySpan<char> carray = tmpword.AsSpan(); long hashId = Hash1(carray); int index = GetAvaliableIndex(hashId, carray); + if (index != -1) { if (bigramHashTable[index] == 0) { bigramHashTable[index] = hashId; // bigramStringTable[index] = tmpword; + } - frequencyTable[index] += buffer[0]; + frequencyTable[index] += frequency; } } - j++; } } - // log.info("load 
dictionary done! " + dctFilePath + " total:" + total); Review Comment: Please leave upstream comments in place. ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs: ########## @@ -254,80 +254,83 @@ private void Load(string dictRoot) /// <summary> /// Load the datafile into this <see cref="BigramDictionary"/> /// </summary> - /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary (bigramdict.dct)</param> + /// <param name="dctFilePath">Path to the Bigramdictionary (bigramdict.dct)</param> /// <exception cref="IOException">If there is a low-level I/O error</exception> public virtual void LoadFromFile(string dctFilePath) { - int i, cnt, length, total = 0; // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET specific - refactored constants for clarity + const int HEADER_POSITION = 3755; + const int MAX_VALID_LENGTH = 1000; + //using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r")) using var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read); + using var reader = new BinaryReader(dctFile); // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - string currentStr = GetCCByGB2312Id(i); - // if (i == 5231) Review Comment: Please leave upstream comments in place. 
########## src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs: ########## @@ -254,80 +254,83 @@ private void Load(string dictRoot) /// <summary> /// Load the datafile into this <see cref="BigramDictionary"/> /// </summary> - /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary (bigramdict.dct)</param> + /// <param name="dctFilePath">Path to the Bigramdictionary (bigramdict.dct)</param> /// <exception cref="IOException">If there is a low-level I/O error</exception> public virtual void LoadFromFile(string dctFilePath) { - int i, cnt, length, total = 0; // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET specific - refactored constants for clarity + const int HEADER_POSITION = 3755; + const int MAX_VALID_LENGTH = 1000; + //using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r")) using var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read); + using var reader = new BinaryReader(dctFile); // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - string currentStr = GetCCByGB2312Id(i); - // if (i == 5231) - // System.out.println(i); - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32(); + string currentStr = GetCCByGB2312Id(i); + int cnt; + try + { + cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer + } + catch (EndOfStreamException) Review Comment: Please do 
not swallow exceptions that would be helpful for debugging if the file format is incorrect. ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs: ########## @@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj) /// <summary> /// Load the datafile into this <see cref="WordDictionary"/> /// </summary> - /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> - /// <returns>number of words read</returns> + /// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param> + /// <returns>Number of words read</returns> /// <exception cref="IOException">If there is a low-level I/O error.</exception> private int LoadMainDataFromFile(string dctFilePath) { - int i, cnt, length, total = 0; - // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. + int total = 0; + + // The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760). // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET: Use BinaryReader to simplify endian conversion and stream reading. 
+ using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read)) + using (var reader = new BinaryReader(dctFile)) { - // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - // if (i == 5231) - // System.out.println(i); + int cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32(); if (cnt <= 0) { wordItem_charArrayTable[i] = null; wordItem_frequencyTable[i] = null; continue; } + wordItem_charArrayTable[i] = new char[cnt][]; wordItem_frequencyTable[i] = new int[cnt]; total += cnt; - int j = 0; - while (j < cnt) + + for (int j = 0; j < cnt; j++) { - // wordItemTable[i][j] = new WordItem(); - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// frequency - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// length - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[2] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// handle - - // wordItemTable[i][j].frequency = buffer[0]; - wordItem_frequencyTable[i][j] = buffer[0]; - - length = buffer[1]; + // LUCENENET: Use BinaryReader methods instead of ByteBuffer + int frequency = reader.ReadInt32(); + int length = reader.ReadInt32(); + reader.ReadInt32(); // Skip handle (unused) + + wordItem_frequencyTable[i][j] = frequency; + if (length > 0) { - byte[] lchBuffer = new byte[length]; - dctFile.Read(lchBuffer, 0, lchBuffer.Length); - tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance 
from base class + byte[] lchBuffer = reader.ReadBytes(length); + string tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET: Use cached encoding instance from base class wordItem_charArrayTable[i][j] = tmpword.ToCharArray(); } else { - // wordItemTable[i][j].charArray = null; wordItem_charArrayTable[i][j] = null; } - // System.out.println(indexTable[i].wordItems[j]); - j++; Review Comment: Please increment `j` here as it was done upstream. ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs: ########## @@ -162,7 +162,7 @@ public virtual long Hash1(char c) /// </summary> Review Comment: At the top of this file, please change the declaration to throw on invalid input, since all of the callers expect exceptions for the proper handling. ```c# protected static readonly Encoding gb2312Encoding = Encoding.GetEncoding("GB2312", EncoderFallback.ExceptionFallback, DecoderFallback.ExceptionFallback); ``` ########## src/Lucene.Net.Tests.Analysis.SmartCn/DictionaryTests.cs: ########## @@ -0,0 +1,72 @@ +using Lucene.Net.Util; +using Lucene.Net.Analysis.Cn.Smart.Hhmm; +using Lucene.Net.Attributes; +using NUnit.Framework; +using System; +using System.IO; +using System.Reflection; + + +[TestFixture] +[LuceneNetSpecific] +public class DictionaryTests : LuceneTestCase +{ + private const string BigramResourceName = "Lucene.Net.Tests.Analysis.SmartCn.Resources.bigramdict.dct"; + + [Test, Category("Dictionary")] + public void TestBigramDictionary() + { + using var resourceStream = GetResourceStream(BigramResourceName); + + FileInfo _tempFile = CreateTempFile("bigramdict", ".dct"); + CopyStreamToFile(resourceStream, _tempFile); + + Assert.IsTrue(_tempFile.Length > 0, "Temp file is empty."); + + BigramDictionary bigramDict = BigramDictionary.GetInstance(); + bigramDict.LoadFromFile(_tempFile.FullName); + + Assert.AreEqual(10, bigramDict.GetFrequency("啊hello".AsSpan()), "Frequency for '啊hello' is incorrect."); + Assert.AreEqual(20,
bigramDict.GetFrequency("阿world".AsSpan()), "Frequency for '阿world' is incorrect."); + } + + [Test, Category("Dictionary")] Review Comment: Please remove the Category attribute. ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs: ########## @@ -254,80 +254,83 @@ private void Load(string dictRoot) /// <summary> /// Load the datafile into this <see cref="BigramDictionary"/> /// </summary> - /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary (bigramdict.dct)</param> + /// <param name="dctFilePath">Path to the Bigramdictionary (bigramdict.dct)</param> /// <exception cref="IOException">If there is a low-level I/O error</exception> public virtual void LoadFromFile(string dctFilePath) { - int i, cnt, length, total = 0; // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. 
+ // LUCENENET specific - refactored constants for clarity + const int HEADER_POSITION = 3755; + const int MAX_VALID_LENGTH = 1000; + //using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r")) using var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read); + using var reader = new BinaryReader(dctFile); // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - string currentStr = GetCCByGB2312Id(i); - // if (i == 5231) - // System.out.println(i); - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32(); + string currentStr = GetCCByGB2312Id(i); + int cnt; + try + { + cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer + } + catch (EndOfStreamException) + { + // Reached end of file + break; + } + if (cnt <= 0) { continue; } - total += cnt; - int j = 0; - while (j < cnt) + + for (int j = 0; j < cnt; j++) { - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// frequency - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// length - dctFile.Read(intBuffer, 0, intBuffer.Length); - // buffer[2] = ByteBuffer.wrap(intBuffer).order( - // ByteOrder.LITTLE_ENDIAN).getInt();// handle - - length = buffer[1]; - if (length > 0) + // LUCENENET: Use BinaryReader methods instead of ByteBuffer + int frequency = reader.ReadInt32(); + int length = reader.ReadInt32(); + reader.ReadInt32(); // Skip handle value (unused) Review Comment: Please set the handle to `buffer[2]` as it was done upstream. 
########## src/Lucene.Net.Tests.Analysis.SmartCn/DictionaryTests.cs: ########## @@ -0,0 +1,72 @@ +using Lucene.Net.Util; +using Lucene.Net.Analysis.Cn.Smart.Hhmm; +using Lucene.Net.Attributes; +using NUnit.Framework; +using System; +using System.IO; +using System.Reflection; + + +[TestFixture] +[LuceneNetSpecific] +public class DictionaryTests : LuceneTestCase +{ + private const string BigramResourceName = "Lucene.Net.Tests.Analysis.SmartCn.Resources.bigramdict.dct"; + + [Test, Category("Dictionary")] Review Comment: Please remove the Category attribute. ########## src/Lucene.Net.Tests.Analysis.SmartCn/DictionaryTests.cs: ########## @@ -0,0 +1,72 @@ +using Lucene.Net.Util; +using Lucene.Net.Analysis.Cn.Smart.Hhmm; +using Lucene.Net.Attributes; +using NUnit.Framework; +using System; +using System.IO; +using System.Reflection; + + Review Comment: Please put these tests in the `Lucene.Net.Analysis.Cn.Smart.Hhmm` namespace. Since this file doesn't exist upstream, it should be moved into a subfolder named `Support/Hhmm`. ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs: ########## @@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj) /// <summary> /// Load the datafile into this <see cref="WordDictionary"/> /// </summary> - /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> - /// <returns>number of words read</returns> + /// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param> + /// <returns>Number of words read</returns> /// <exception cref="IOException">If there is a low-level I/O error.</exception> private int LoadMainDataFromFile(string dctFilePath) { - int i, cnt, length, total = 0; - // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. + int total = 0; + + // The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760). // The 3756th is used (as a header) to store information. 
- int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET: Use BinaryReader to simplify endian conversion and stream reading. + using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read)) + using (var reader = new BinaryReader(dctFile)) { - // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - // if (i == 5231) - // System.out.println(i); + int cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer Review Comment: Please include the reason why we changed to use BinaryReader. ```c# // LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET ``` ########## src/Lucene.Net.Tests.Analysis.SmartCn/DictionaryTests.cs: ########## @@ -0,0 +1,72 @@ +using Lucene.Net.Util; +using Lucene.Net.Analysis.Cn.Smart.Hhmm; +using Lucene.Net.Attributes; +using NUnit.Framework; +using System; +using System.IO; +using System.Reflection; + + +[TestFixture] +[LuceneNetSpecific] +public class DictionaryTests : LuceneTestCase +{ + private const string BigramResourceName = "Lucene.Net.Tests.Analysis.SmartCn.Resources.bigramdict.dct"; + + [Test, Category("Dictionary")] + public void TestBigramDictionary() + { + using var resourceStream = GetResourceStream(BigramResourceName); + + FileInfo _tempFile = CreateTempFile("bigramdict", ".dct"); + CopyStreamToFile(resourceStream, _tempFile); + + Assert.IsTrue(_tempFile.Length > 0, "Temp file is empty."); + + BigramDictionary bigramDict = BigramDictionary.GetInstance(); + bigramDict.LoadFromFile(_tempFile.FullName); + + Assert.AreEqual(10, bigramDict.GetFrequency("啊hello".AsSpan()), "Frequency for '啊hello' is incorrect."); + 
Assert.AreEqual(20, bigramDict.GetFrequency("阿world".AsSpan()), "Frequency for '阿world' is incorrect."); + } + + [Test, Category("Dictionary")] + public void TestWordDictionaryGetInstance() + { + WordDictionary wordDict = WordDictionary.GetInstance(); + + Assert.NotNull(wordDict, "WordDictionary.GetInstance() returned null."); Review Comment: Note that it is impossible for `WordDictionary.GetInstance()` to return `null`, so this assert is unnecessary. ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs: ########## @@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj) /// <summary> /// Load the datafile into this <see cref="WordDictionary"/> /// </summary> - /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> - /// <returns>number of words read</returns> + /// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param> + /// <returns>Number of words read</returns> /// <exception cref="IOException">If there is a low-level I/O error.</exception> private int LoadMainDataFromFile(string dctFilePath) { - int i, cnt, length, total = 0; - // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. + int total = 0; + + // The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760). // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET: Use BinaryReader to simplify endian conversion and stream reading. 
+ using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read)) + using (var reader = new BinaryReader(dctFile)) { - // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - // if (i == 5231) - // System.out.println(i); + int cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32(); if (cnt <= 0) { wordItem_charArrayTable[i] = null; wordItem_frequencyTable[i] = null; continue; } + wordItem_charArrayTable[i] = new char[cnt][]; wordItem_frequencyTable[i] = new int[cnt]; total += cnt; - int j = 0; - while (j < cnt) + + for (int j = 0; j < cnt; j++) { - // wordItemTable[i][j] = new WordItem(); - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// frequency - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// length - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[2] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// handle - - // wordItemTable[i][j].frequency = buffer[0]; - wordItem_frequencyTable[i][j] = buffer[0]; - - length = buffer[1]; + // LUCENENET: Use BinaryReader methods instead of ByteBuffer Review Comment: Please include the reason why we changed to use BinaryReader. 
```c# // LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET ``` ########## src/Lucene.Net.Tests.Analysis.SmartCn/DictionaryTests.cs: ########## @@ -0,0 +1,72 @@ +using Lucene.Net.Util; +using Lucene.Net.Analysis.Cn.Smart.Hhmm; +using Lucene.Net.Attributes; +using NUnit.Framework; +using System; +using System.IO; +using System.Reflection; + + +[TestFixture] +[LuceneNetSpecific] +public class DictionaryTests : LuceneTestCase +{ + private const string BigramResourceName = "Lucene.Net.Tests.Analysis.SmartCn.Resources.bigramdict.dct"; + + [Test, Category("Dictionary")] + public void TestBigramDictionary() + { + using var resourceStream = GetResourceStream(BigramResourceName); + + FileInfo _tempFile = CreateTempFile("bigramdict", ".dct"); + CopyStreamToFile(resourceStream, _tempFile); + + Assert.IsTrue(_tempFile.Length > 0, "Temp file is empty."); + + BigramDictionary bigramDict = BigramDictionary.GetInstance(); + bigramDict.LoadFromFile(_tempFile.FullName); + + Assert.AreEqual(10, bigramDict.GetFrequency("啊hello".AsSpan()), "Frequency for '啊hello' is incorrect."); + Assert.AreEqual(20, bigramDict.GetFrequency("阿world".AsSpan()), "Frequency for '阿world' is incorrect."); + } + + [Test, Category("Dictionary")] + public void TestWordDictionaryGetInstance() + { + WordDictionary wordDict = WordDictionary.GetInstance(); Review Comment: Please prepare the temp file for loading the same way that was done in the example for `TestBigramDictionary()`. 
########## src/Lucene.Net.Tests.Analysis.SmartCn/DictionaryTests.cs: ########## @@ -0,0 +1,72 @@ +using Lucene.Net.Util; +using Lucene.Net.Analysis.Cn.Smart.Hhmm; +using Lucene.Net.Attributes; +using NUnit.Framework; +using System; +using System.IO; +using System.Reflection; + + +[TestFixture] +[LuceneNetSpecific] +public class DictionaryTests : LuceneTestCase Review Comment: Please set up the directory and name the files with the correct conventions so they are loaded using the existing business logic. This gives us better test coverage than using a custom file name and loading it inside of the test. Also, it is important to consistently use the `CreateTempDir()` and `CreateTempFile()` methods of `LuceneTestCase`, since they are already set up to delete the temp files when the test is finished running. In this case, we should use `CreateTempDir()` so we can update the `AnalyzerProfile.ANALYSIS_DATA_DIR` at the beginning of the test with this temporary location. Then all we need to do is put temp files named `bigramdict.dct` and `coredict.dct` into that location for the corresponding classes to load them. To get the resource stream, please use `this.GetType().FindAndGetResourceStream("bigramdict.dct");`. For that to work, the `bigramdict.dct` file must be in the same directory as this file in the project. It must not be in a subdirectory named `Resources`. So, these files should be placed in the `Support/Hhmm` folder, along with `DictionaryTests.cs`. `FindAndGetResourceStream()` is an extension method in the `J2N` namespace that mimics the classpath functionality in Java for embedded resource files, which makes them relative to the class structure.
```c# private const string BigramFileName = "bigramdict.dct"; private DirectoryInfo tempDir; public override void OneTimeSetUp() { tempDir = CreateTempDir("smartcn-data"); AnalyzerProfile.ANALYSIS_DATA_DIR = tempDir.FullName; } [Test] public void TestBigramDictionary() { using var resourceStream = this.GetType().FindAndGetResourceStream(BigramFileName); string tempFile = Path.Combine(tempDir.FullName, BigramFileName); using (var tempStream = File.Create(tempFile)) { resourceStream.CopyTo(tempStream); Assert.IsTrue(tempStream.Length > 0, "Temp file is empty."); } BigramDictionary bigramDict = BigramDictionary.GetInstance(); Assert.AreEqual(10, bigramDict.GetFrequency("啊hello".AsSpan()), "Frequency for '啊hello' is incorrect."); Assert.AreEqual(20, bigramDict.GetFrequency("阿world".AsSpan()), "Frequency for '阿world' is incorrect."); } ``` ########## src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.csproj: ########## @@ -64,4 +64,9 @@ <PackageReference Include="System.Text.Encoding.CodePages" Version="$(SystemTextEncodingCodePagesPackageVersion)" /> </ItemGroup> + <ItemGroup> + <InternalsVisibleTo Include="Lucene.Net.Tests.Analysis.SmartCn" /> Review Comment: Please indent the ItemGroup and InternalsVisibleTo (using spaces) appropriately. 
########## src/Lucene.Net.Tests.Analysis.SmartCn/DictionaryTests.cs: ########## @@ -0,0 +1,72 @@ +using Lucene.Net.Util; +using Lucene.Net.Analysis.Cn.Smart.Hhmm; +using Lucene.Net.Attributes; +using NUnit.Framework; +using System; +using System.IO; +using System.Reflection; + + +[TestFixture] +[LuceneNetSpecific] +public class DictionaryTests : LuceneTestCase +{ + private const string BigramResourceName = "Lucene.Net.Tests.Analysis.SmartCn.Resources.bigramdict.dct"; + + [Test, Category("Dictionary")] + public void TestBigramDictionary() + { + using var resourceStream = GetResourceStream(BigramResourceName); + + FileInfo _tempFile = CreateTempFile("bigramdict", ".dct"); + CopyStreamToFile(resourceStream, _tempFile); + + Assert.IsTrue(_tempFile.Length > 0, "Temp file is empty."); + + BigramDictionary bigramDict = BigramDictionary.GetInstance(); + bigramDict.LoadFromFile(_tempFile.FullName); + + Assert.AreEqual(10, bigramDict.GetFrequency("啊hello".AsSpan()), "Frequency for '啊hello' is incorrect."); + Assert.AreEqual(20, bigramDict.GetFrequency("阿world".AsSpan()), "Frequency for '阿world' is incorrect."); + } + + [Test, Category("Dictionary")] + public void TestWordDictionaryGetInstance() + { + WordDictionary wordDict = WordDictionary.GetInstance(); + + Assert.NotNull(wordDict, "WordDictionary.GetInstance() returned null."); + Review Comment: Please add additional assertions to ensure the data loaded correctly. 
########## src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs: ########## @@ -254,80 +254,83 @@ private void Load(string dictRoot) /// <summary> /// Load the datafile into this <see cref="BigramDictionary"/> /// </summary> - /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary (bigramdict.dct)</param> + /// <param name="dctFilePath">Path to the Bigramdictionary (bigramdict.dct)</param> /// <exception cref="IOException">If there is a low-level I/O error</exception> public virtual void LoadFromFile(string dctFilePath) { - int i, cnt, length, total = 0; // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET specific - refactored constants for clarity + const int HEADER_POSITION = 3755; + const int MAX_VALID_LENGTH = 1000; + //using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r")) using var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read); + using var reader = new BinaryReader(dctFile); // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - string currentStr = GetCCByGB2312Id(i); - // if (i == 5231) - // System.out.println(i); - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32(); + string currentStr = GetCCByGB2312Id(i); + int cnt; + try + { + cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer Review Comment: Please include the reason why we changed to 
use BinaryReader. ```c# // LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET ``` ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs: ########## @@ -254,80 +254,83 @@ private void Load(string dictRoot) /// <summary> /// Load the datafile into this <see cref="BigramDictionary"/> /// </summary> - /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary (bigramdict.dct)</param> + /// <param name="dctFilePath">Path to the Bigramdictionary (bigramdict.dct)</param> /// <exception cref="IOException">If there is a low-level I/O error</exception> public virtual void LoadFromFile(string dctFilePath) { - int i, cnt, length, total = 0; // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. 
+ // LUCENENET specific - refactored constants for clarity + const int HEADER_POSITION = 3755; + const int MAX_VALID_LENGTH = 1000; + //using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r")) using var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read); + using var reader = new BinaryReader(dctFile); // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - string currentStr = GetCCByGB2312Id(i); - // if (i == 5231) - // System.out.println(i); - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32(); + string currentStr = GetCCByGB2312Id(i); + int cnt; + try + { + cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer + } + catch (EndOfStreamException) + { + // Reached end of file + break; + } + if (cnt <= 0) { continue; } - total += cnt; - int j = 0; - while (j < cnt) + + for (int j = 0; j < cnt; j++) { - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// frequency - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// length - dctFile.Read(intBuffer, 0, intBuffer.Length); - // buffer[2] = ByteBuffer.wrap(intBuffer).order( - // ByteOrder.LITTLE_ENDIAN).getInt();// handle - - length = buffer[1]; - if (length > 0) + // LUCENENET: Use BinaryReader methods instead of ByteBuffer + int frequency = reader.ReadInt32(); + int length = reader.ReadInt32(); + reader.ReadInt32(); // Skip handle value (unused) + + if (length > 0 && length <= MAX_VALID_LENGTH && dctFile.Position + length <= dctFile.Length) { - byte[] lchBuffer = new byte[length]; - 
dctFile.Read(lchBuffer, 0, lchBuffer.Length); + byte[] lchBuffer = reader.ReadBytes(length); // LUCENENET: Use BinaryReader methods instead of ByteBuffer Review Comment: Please include the reason why we changed to use BinaryReader. ```c# // LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET ``` ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs: ########## @@ -254,80 +254,83 @@ private void Load(string dictRoot) /// <summary> /// Load the datafile into this <see cref="BigramDictionary"/> /// </summary> - /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary (bigramdict.dct)</param> + /// <param name="dctFilePath">Path to the Bigramdictionary (bigramdict.dct)</param> /// <exception cref="IOException">If there is a low-level I/O error</exception> public virtual void LoadFromFile(string dctFilePath) { - int i, cnt, length, total = 0; // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET specific - refactored constants for clarity + const int HEADER_POSITION = 3755; Review Comment: It looks like there is a problem here, since the original code used 3756 instead of 3755. I could be wrong, though. This might be due to the change in loop format or it may be an incorrect comment in Java. We should ensure our format exactly matches Lucene, as these files should be portable between .NET and Java. Do note that there is a file in `analysis-data.zip` [here](https://issues.apache.org/jira/browse/LUCENE-1629) that can be used to check whether we still support the original format. 
Whatever the case, this line should be updated either with the correct number or a comment explaining why/how the upstream code was wrong. ########## src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.csproj: ########## @@ -62,5 +62,10 @@ <ItemGroup Condition=" '$(TargetFramework)' == 'net472' "> <PackageReference Include="System.Text.Encoding.CodePages" Version="$(SystemTextEncodingCodePagesPackageVersion)" /> </ItemGroup> + <ItemGroup> + <EmbeddedResource Include="Resources\bigramdict.dct" /> Review Comment: Please expand this to include all files with these extensions, as was done in the other analysis packages. Also, please place this element above the other `ItemGroup` elements as was done in the morfologik and kuromoji projects and ensure it is properly indented (with 2 spaces per level). ```xml <ItemGroup> <EmbeddedResource Include="**/*.dct" Exclude="bin/**/*;obj/**/*" Label="Dictionary Test Data" /> </ItemGroup> ``` > Note that these files will need to be moved into the `Support/Hhmm` folder along with the tests. ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs: ########## @@ -254,80 +254,83 @@ private void Load(string dictRoot) /// <summary> /// Load the datafile into this <see cref="BigramDictionary"/> /// </summary> - /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary (bigramdict.dct)</param> + /// <param name="dctFilePath">Path to the Bigramdictionary (bigramdict.dct)</param> /// <exception cref="IOException">If there is a low-level I/O error</exception> public virtual void LoadFromFile(string dctFilePath) { - int i, cnt, length, total = 0; // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. // The 3756th is used (as a header) to store information. 
- int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET specific - refactored constants for clarity + const int HEADER_POSITION = 3755; + const int MAX_VALID_LENGTH = 1000; + //using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r")) using var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read); + using var reader = new BinaryReader(dctFile); // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - string currentStr = GetCCByGB2312Id(i); - // if (i == 5231) - // System.out.println(i); - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32(); + string currentStr = GetCCByGB2312Id(i); + int cnt; + try + { + cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer + } + catch (EndOfStreamException) + { + // Reached end of file + break; + } + if (cnt <= 0) { continue; } - total += cnt; - int j = 0; - while (j < cnt) + + for (int j = 0; j < cnt; j++) { - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// frequency - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// length - dctFile.Read(intBuffer, 0, intBuffer.Length); - // buffer[2] = ByteBuffer.wrap(intBuffer).order( - // ByteOrder.LITTLE_ENDIAN).getInt();// handle - - length = buffer[1]; - if (length > 0) + // LUCENENET: Use BinaryReader methods instead of ByteBuffer Review Comment: Please include the reason 
why we changed to use BinaryReader. ```c# // LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: dev-unsubscr...@lucenenet.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org