NightOwl888 commented on code in PR #1154: URL: https://github.com/apache/lucenenet/pull/1154#discussion_r2038138668
########## src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs: ########## @@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj) /// <summary> /// Load the datafile into this <see cref="WordDictionary"/> /// </summary> - /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> - /// <returns>number of words read</returns> + /// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param> + /// <returns>Number of words read</returns> /// <exception cref="IOException">If there is a low-level I/O error.</exception> private int LoadMainDataFromFile(string dctFilePath) { - int i, cnt, length, total = 0; - // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. + int total = 0; + + // The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760). // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET: Use BinaryReader to simplify endian conversion and stream reading. + using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read)) + using (var reader = new BinaryReader(dctFile)) { - // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - // if (i == 5231) Review Comment: Please leave upstream comments in place. 
########## src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs: ########## @@ -254,80 +254,83 @@ private void Load(string dictRoot) /// <summary> /// Load the datafile into this <see cref="BigramDictionary"/> /// </summary> - /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary (bigramdict.dct)</param> + /// <param name="dctFilePath">Path to the Bigramdictionary (bigramdict.dct)</param> /// <exception cref="IOException">If there is a low-level I/O error</exception> public virtual void LoadFromFile(string dctFilePath) { - int i, cnt, length, total = 0; // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. // The 3756th is used (as a header) to store information. - int[] Review Comment: Please add this temporary buffer back in, but allocate it on the stack. ```c# Span<int> buffer = stackalloc int[3]; ``` ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs: ########## @@ -254,80 +254,83 @@ private void Load(string dictRoot) /// <summary> /// Load the datafile into this <see cref="BigramDictionary"/> /// </summary> - /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary (bigramdict.dct)</param> + /// <param name="dctFilePath">Path to the Bigramdictionary (bigramdict.dct)</param> /// <exception cref="IOException">If there is a low-level I/O error</exception> public virtual void LoadFromFile(string dctFilePath) { - int i, cnt, length, total = 0; // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. 
+ // LUCENENET specific - refactored constants for clarity + const int HEADER_POSITION = 3755; + const int MAX_VALID_LENGTH = 1000; + //using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r")) using var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read); + using var reader = new BinaryReader(dctFile); // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - string currentStr = GetCCByGB2312Id(i); - // if (i == 5231) - // System.out.println(i); Review Comment: Please leave upstream comments in place. ########## src/Lucene.Net.Tests.Analysis.SmartCn/DictionaryTests.cs: ########## @@ -0,0 +1,72 @@ +using Lucene.Net.Util; +using Lucene.Net.Analysis.Cn.Smart.Hhmm; +using Lucene.Net.Attributes; +using NUnit.Framework; +using System; +using System.IO; +using System.Reflection; + Review Comment: Please use our custom `Assert` class, not the one from NUnit. ```c# using Assert = Lucene.Net.TestFramework.Assert; ``` ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs: ########## @@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj) /// <summary> /// Load the datafile into this <see cref="WordDictionary"/> /// </summary> - /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> - /// <returns>number of words read</returns> + /// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param> + /// <returns>Number of words read</returns> /// <exception cref="IOException">If there is a low-level I/O error.</exception> private int LoadMainDataFromFile(string dctFilePath) { - int i, cnt, length, total = 0; - // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. + int total = 0; + + // The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760). // The 3756th is used (as a header) to store information. 
- int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET: Use BinaryReader to simplify endian conversion and stream reading. + using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read)) + using (var reader = new BinaryReader(dctFile)) { - // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - // if (i == 5231) - // System.out.println(i); + int cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32(); if (cnt <= 0) { wordItem_charArrayTable[i] = null; wordItem_frequencyTable[i] = null; continue; } + wordItem_charArrayTable[i] = new char[cnt][]; wordItem_frequencyTable[i] = new int[cnt]; total += cnt; - int j = 0; - while (j < cnt) + + for (int j = 0; j < cnt; j++) { - // wordItemTable[i][j] = new WordItem(); - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// frequency - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// length - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[2] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// handle - - // wordItemTable[i][j].frequency = buffer[0]; - wordItem_frequencyTable[i][j] = buffer[0]; - - length = buffer[1]; + // LUCENENET: Use BinaryReader methods instead of ByteBuffer + int frequency = reader.ReadInt32(); + int length = 
reader.ReadInt32(); + reader.ReadInt32(); // Skip handle (unused) Review Comment: Please set this to `buffer[2]` as it was upstream. ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs: ########## @@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj) /// <summary> /// Load the datafile into this <see cref="WordDictionary"/> /// </summary> - /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> - /// <returns>number of words read</returns> + /// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param> + /// <returns>Number of words read</returns> /// <exception cref="IOException">If there is a low-level I/O error.</exception> private int LoadMainDataFromFile(string dctFilePath) { - int i, cnt, length, total = 0; - // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. + int total = 0; + + // The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760). // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET: Use BinaryReader to simplify endian conversion and stream reading. 
+ using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read)) + using (var reader = new BinaryReader(dctFile)) { - // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - // if (i == 5231) - // System.out.println(i); + int cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32(); if (cnt <= 0) { wordItem_charArrayTable[i] = null; wordItem_frequencyTable[i] = null; continue; } + wordItem_charArrayTable[i] = new char[cnt][]; wordItem_frequencyTable[i] = new int[cnt]; total += cnt; - int j = 0; - while (j < cnt) + + for (int j = 0; j < cnt; j++) { - // wordItemTable[i][j] = new WordItem(); - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// frequency - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// length - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[2] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// handle - - // wordItemTable[i][j].frequency = buffer[0]; - wordItem_frequencyTable[i][j] = buffer[0]; - - length = buffer[1]; + // LUCENENET: Use BinaryReader methods instead of ByteBuffer + int frequency = reader.ReadInt32(); + int length = reader.ReadInt32(); + reader.ReadInt32(); // Skip handle (unused) + + wordItem_frequencyTable[i][j] = frequency; + if (length > 0) { - byte[] lchBuffer = new byte[length]; - dctFile.Read(lchBuffer, 0, lchBuffer.Length); - tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance 
from base class + byte[] lchBuffer = reader.ReadBytes(length); + string tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET: Use cached encoding instance from base class wordItem_charArrayTable[i][j] = tmpword.ToCharArray(); } else { - // wordItemTable[i][j].charArray = null; Review Comment: Please leave upstream comments in place. ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs: ########## @@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj) /// <summary> /// Load the datafile into this <see cref="WordDictionary"/> /// </summary> - /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> - /// <returns>number of words read</returns> + /// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param> + /// <returns>Number of words read</returns> /// <exception cref="IOException">If there is a low-level I/O error.</exception> private int LoadMainDataFromFile(string dctFilePath) { - int i, cnt, length, total = 0; Review Comment: Please define the variables the same way as the upstream code. Someday we may need to merge upstream changes, so it helps a lot if we don't have to re-evaluate the business logic. ```c# int i, cnt, length, total = 0; ``` ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs: ########## @@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj) /// <summary> /// Load the datafile into this <see cref="WordDictionary"/> /// </summary> - /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> - /// <returns>number of words read</returns> + /// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param> + /// <returns>Number of words read</returns> /// <exception cref="IOException">If there is a low-level I/O error.</exception> private int LoadMainDataFromFile(string dctFilePath) { - int i, cnt, length, total = 0; - // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. 
+ int total = 0; + + // The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760). // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET: Use BinaryReader to simplify endian conversion and stream reading. + using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read)) + using (var reader = new BinaryReader(dctFile)) { - // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - // if (i == 5231) - // System.out.println(i); + int cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32(); if (cnt <= 0) { wordItem_charArrayTable[i] = null; wordItem_frequencyTable[i] = null; continue; } + wordItem_charArrayTable[i] = new char[cnt][]; wordItem_frequencyTable[i] = new int[cnt]; total += cnt; - int j = 0; - while (j < cnt) + + for (int j = 0; j < cnt; j++) { - // wordItemTable[i][j] = new WordItem(); - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// frequency - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// length - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[2] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// handle - - // wordItemTable[i][j].frequency = buffer[0]; Review Comment: Please leave 
upstream comments in place. ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs: ########## @@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj) /// <summary> /// Load the datafile into this <see cref="WordDictionary"/> /// </summary> - /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> - /// <returns>number of words read</returns> + /// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param> + /// <returns>Number of words read</returns> /// <exception cref="IOException">If there is a low-level I/O error.</exception> private int LoadMainDataFromFile(string dctFilePath) { - int i, cnt, length, total = 0; - // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. + int total = 0; + + // The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760). // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET: Use BinaryReader to simplify endian conversion and stream reading. 
+ using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read)) + using (var reader = new BinaryReader(dctFile)) { - // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - // if (i == 5231) - // System.out.println(i); + int cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32(); if (cnt <= 0) { wordItem_charArrayTable[i] = null; wordItem_frequencyTable[i] = null; continue; } + wordItem_charArrayTable[i] = new char[cnt][]; wordItem_frequencyTable[i] = new int[cnt]; total += cnt; - int j = 0; Review Comment: Please do not change the variable declarations from the upstream code. ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs: ########## @@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj) /// <summary> /// Load the datafile into this <see cref="WordDictionary"/> /// </summary> - /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> - /// <returns>number of words read</returns> + /// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param> + /// <returns>Number of words read</returns> /// <exception cref="IOException">If there is a low-level I/O error.</exception> private int LoadMainDataFromFile(string dctFilePath) { - int i, cnt, length, total = 0; - // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. + int total = 0; + + // The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760). // The 3756th is used (as a header) to store information. 
- int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET: Use BinaryReader to simplify endian conversion and stream reading. + using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read)) + using (var reader = new BinaryReader(dctFile)) { - // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - // if (i == 5231) - // System.out.println(i); Review Comment: Please leave upstream comments in place. ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs: ########## @@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj) /// <summary> /// Load the datafile into this <see cref="WordDictionary"/> /// </summary> - /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> - /// <returns>number of words read</returns> + /// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param> + /// <returns>Number of words read</returns> /// <exception cref="IOException">If there is a low-level I/O error.</exception> private int LoadMainDataFromFile(string dctFilePath) { - int i, cnt, length, total = 0; - // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. + int total = 0; + + // The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760). // The 3756th is used (as a header) to store information. - int[] Review Comment: Please add this temporary buffer back in, but allocate it on the stack. 
```c# Span<int> buffer = stackalloc int[3]; ``` ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs: ########## @@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj) /// <summary> /// Load the datafile into this <see cref="WordDictionary"/> /// </summary> - /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> - /// <returns>number of words read</returns> + /// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param> + /// <returns>Number of words read</returns> /// <exception cref="IOException">If there is a low-level I/O error.</exception> private int LoadMainDataFromFile(string dctFilePath) { - int i, cnt, length, total = 0; - // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. + int total = 0; + + // The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760). // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET: Use BinaryReader to simplify endian conversion and stream reading. 
+ using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read)) + using (var reader = new BinaryReader(dctFile)) { - // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - // if (i == 5231) - // System.out.println(i); + int cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32(); if (cnt <= 0) { wordItem_charArrayTable[i] = null; wordItem_frequencyTable[i] = null; continue; } + wordItem_charArrayTable[i] = new char[cnt][]; wordItem_frequencyTable[i] = new int[cnt]; total += cnt; - int j = 0; - while (j < cnt) + + for (int j = 0; j < cnt; j++) { - // wordItemTable[i][j] = new WordItem(); - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// frequency - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// length - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[2] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// handle - - // wordItemTable[i][j].frequency = buffer[0]; - wordItem_frequencyTable[i][j] = buffer[0]; - - length = buffer[1]; + // LUCENENET: Use BinaryReader methods instead of ByteBuffer + int frequency = reader.ReadInt32(); + int length = reader.ReadInt32(); Review Comment: Please leave upstream comments in place. 
########## src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs: ########## @@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj) /// <summary> /// Load the datafile into this <see cref="WordDictionary"/> /// </summary> - /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> - /// <returns>number of words read</returns> + /// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param> + /// <returns>Number of words read</returns> /// <exception cref="IOException">If there is a low-level I/O error.</exception> private int LoadMainDataFromFile(string dctFilePath) { - int i, cnt, length, total = 0; - // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. + int total = 0; + + // The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760). // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET: Use BinaryReader to simplify endian conversion and stream reading. 
+ using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read)) + using (var reader = new BinaryReader(dctFile)) { - // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - // if (i == 5231) - // System.out.println(i); + int cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32(); if (cnt <= 0) { wordItem_charArrayTable[i] = null; wordItem_frequencyTable[i] = null; continue; } + wordItem_charArrayTable[i] = new char[cnt][]; wordItem_frequencyTable[i] = new int[cnt]; total += cnt; - int j = 0; - while (j < cnt) Review Comment: Please do not change the style of loop from the upstream code. ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs: ########## @@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj) /// <summary> /// Load the datafile into this <see cref="WordDictionary"/> /// </summary> - /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> - /// <returns>number of words read</returns> + /// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param> + /// <returns>Number of words read</returns> /// <exception cref="IOException">If there is a low-level I/O error.</exception> private int LoadMainDataFromFile(string dctFilePath) { - int i, cnt, length, total = 0; - // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. + int total = 0; + + // The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760). // The 3756th is used (as a header) to store information. 
- int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET: Use BinaryReader to simplify endian conversion and stream reading. + using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read)) + using (var reader = new BinaryReader(dctFile)) { - // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - // if (i == 5231) - // System.out.println(i); + int cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32(); if (cnt <= 0) { wordItem_charArrayTable[i] = null; wordItem_frequencyTable[i] = null; continue; } + wordItem_charArrayTable[i] = new char[cnt][]; wordItem_frequencyTable[i] = new int[cnt]; total += cnt; - int j = 0; - while (j < cnt) + + for (int j = 0; j < cnt; j++) { - // wordItemTable[i][j] = new WordItem(); - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// frequency - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// length - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[2] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// handle - - // wordItemTable[i][j].frequency = buffer[0]; - wordItem_frequencyTable[i][j] = buffer[0]; - - length = buffer[1]; + // LUCENENET: Use BinaryReader methods instead of ByteBuffer + int frequency = reader.ReadInt32(); Review Comment: Please leave 
upstream comments in place. ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs: ########## @@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj) /// <summary> /// Load the datafile into this <see cref="WordDictionary"/> /// </summary> - /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> - /// <returns>number of words read</returns> + /// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param> + /// <returns>Number of words read</returns> /// <exception cref="IOException">If there is a low-level I/O error.</exception> private int LoadMainDataFromFile(string dctFilePath) { - int i, cnt, length, total = 0; - // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. + int total = 0; + + // The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760). // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET: Use BinaryReader to simplify endian conversion and stream reading. 
+ using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read)) + using (var reader = new BinaryReader(dctFile)) { - // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - // if (i == 5231) - // System.out.println(i); + int cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32(); if (cnt <= 0) { wordItem_charArrayTable[i] = null; wordItem_frequencyTable[i] = null; continue; } + wordItem_charArrayTable[i] = new char[cnt][]; wordItem_frequencyTable[i] = new int[cnt]; total += cnt; - int j = 0; - while (j < cnt) + + for (int j = 0; j < cnt; j++) { - // wordItemTable[i][j] = new WordItem(); - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// frequency - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// length - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[2] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// handle - - // wordItemTable[i][j].frequency = buffer[0]; - wordItem_frequencyTable[i][j] = buffer[0]; - - length = buffer[1]; + // LUCENENET: Use BinaryReader methods instead of ByteBuffer + int frequency = reader.ReadInt32(); + int length = reader.ReadInt32(); + reader.ReadInt32(); // Skip handle (unused) + + wordItem_frequencyTable[i][j] = frequency; + if (length > 0) { - byte[] lchBuffer = new byte[length]; - dctFile.Read(lchBuffer, 0, lchBuffer.Length); - tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance 
from base class + byte[] lchBuffer = reader.ReadBytes(length); + string tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET: Use cached encoding instance from base class wordItem_charArrayTable[i][j] = tmpword.ToCharArray(); } else { - // wordItemTable[i][j].charArray = null; wordItem_charArrayTable[i][j] = null; } - // System.out.println(indexTable[i].wordItems[j]); Review Comment: Please leave upstream comments in place. ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs: ########## @@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj) /// <summary> /// Load the datafile into this <see cref="WordDictionary"/> /// </summary> - /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> - /// <returns>number of words read</returns> + /// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param> + /// <returns>Number of words read</returns> /// <exception cref="IOException">If there is a low-level I/O error.</exception> private int LoadMainDataFromFile(string dctFilePath) { - int i, cnt, length, total = 0; - // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. + int total = 0; + + // The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760). // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET: Use BinaryReader to simplify endian conversion and stream reading. 
+ using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read)) + using (var reader = new BinaryReader(dctFile)) { - // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - // if (i == 5231) - // System.out.println(i); + int cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32(); if (cnt <= 0) { wordItem_charArrayTable[i] = null; wordItem_frequencyTable[i] = null; continue; } + wordItem_charArrayTable[i] = new char[cnt][]; wordItem_frequencyTable[i] = new int[cnt]; total += cnt; - int j = 0; - while (j < cnt) + + for (int j = 0; j < cnt; j++) { - // wordItemTable[i][j] = new WordItem(); Review Comment: Please leave upstream comments in place. ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs: ########## @@ -254,80 +254,83 @@ private void Load(string dictRoot) /// <summary> /// Load the datafile into this <see cref="BigramDictionary"/> /// </summary> - /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary (bigramdict.dct)</param> + /// <param name="dctFilePath">Path to the Bigramdictionary (bigramdict.dct)</param> /// <exception cref="IOException">If there is a low-level I/O error</exception> public virtual void LoadFromFile(string dctFilePath) { - int i, cnt, length, total = 0; // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; Review Comment: Please declare `tmpword` here as it was done upstream. 
########## src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs: ########## @@ -254,80 +254,83 @@ private void Load(string dictRoot) /// <summary> /// Load the datafile into this <see cref="BigramDictionary"/> /// </summary> - /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary (bigramdict.dct)</param> + /// <param name="dctFilePath">Path to the Bigramdictionary (bigramdict.dct)</param> /// <exception cref="IOException">If there is a low-level I/O error</exception> public virtual void LoadFromFile(string dctFilePath) { - int i, cnt, length, total = 0; // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET specific - refactored constants for clarity + const int HEADER_POSITION = 3755; + const int MAX_VALID_LENGTH = 1000; + //using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r")) using var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read); + using var reader = new BinaryReader(dctFile); // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - string currentStr = GetCCByGB2312Id(i); - // if (i == 5231) - // System.out.println(i); - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32(); + string currentStr = GetCCByGB2312Id(i); + int cnt; + try + { + cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer + } + catch (EndOfStreamException) + { + // Reached end of 
file + break; + } + if (cnt <= 0) { continue; } - total += cnt; - int j = 0; - while (j < cnt) + + for (int j = 0; j < cnt; j++) { - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// frequency - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// length - dctFile.Read(intBuffer, 0, intBuffer.Length); - // buffer[2] = ByteBuffer.wrap(intBuffer).order( - // ByteOrder.LITTLE_ENDIAN).getInt();// handle - - length = buffer[1]; - if (length > 0) + // LUCENENET: Use BinaryReader methods instead of ByteBuffer + int frequency = reader.ReadInt32(); + int length = reader.ReadInt32(); + reader.ReadInt32(); // Skip handle value (unused) + + if (length > 0 && length <= MAX_VALID_LENGTH && dctFile.Position + length <= dctFile.Length) { - byte[] lchBuffer = new byte[length]; - dctFile.Read(lchBuffer, 0, lchBuffer.Length); + byte[] lchBuffer = reader.ReadBytes(length); // LUCENENET: Use BinaryReader methods instead of ByteBuffer + //tmpword = new String(lchBuffer, "GB2312"); - tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance from base class + string tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance from base class //tmpword = Encoding.GetEncoding("hz-gb-2312").GetString(lchBuffer); - if (i != 3755 + GB2312_FIRST_CHAR) + + + if (i != HEADER_POSITION + GB2312_FIRST_CHAR) { tmpword = currentStr + tmpword; } - char[] carray = tmpword.ToCharArray(); + + ReadOnlySpan<char> carray = tmpword.AsSpan(); long hashId = Hash1(carray); int index = GetAvaliableIndex(hashId, carray); + if (index != -1) { if (bigramHashTable[index] == 0) { bigramHashTable[index] = hashId; // bigramStringTable[index] = tmpword; + } - frequencyTable[index] += buffer[0]; + frequencyTable[index] += frequency; } } - j++; } } - // log.info("load 
dictionary done! " + dctFilePath + " total:" + total); Review Comment: Please leave upstream comments in place. ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs: ########## @@ -254,80 +254,83 @@ private void Load(string dictRoot) /// <summary> /// Load the datafile into this <see cref="BigramDictionary"/> /// </summary> - /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary (bigramdict.dct)</param> + /// <param name="dctFilePath">Path to the Bigramdictionary (bigramdict.dct)</param> /// <exception cref="IOException">If there is a low-level I/O error</exception> public virtual void LoadFromFile(string dctFilePath) { - int i, cnt, length, total = 0; // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET specific - refactored constants for clarity + const int HEADER_POSITION = 3755; + const int MAX_VALID_LENGTH = 1000; + //using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r")) using var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read); + using var reader = new BinaryReader(dctFile); // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - string currentStr = GetCCByGB2312Id(i); - // if (i == 5231) Review Comment: Please leave upstream comments in place. 
########## src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs: ########## @@ -254,80 +254,83 @@ private void Load(string dictRoot) /// <summary> /// Load the datafile into this <see cref="BigramDictionary"/> /// </summary> - /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary (bigramdict.dct)</param> + /// <param name="dctFilePath">Path to the Bigramdictionary (bigramdict.dct)</param> /// <exception cref="IOException">If there is a low-level I/O error</exception> public virtual void LoadFromFile(string dctFilePath) { - int i, cnt, length, total = 0; // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET specific - refactored constants for clarity + const int HEADER_POSITION = 3755; + const int MAX_VALID_LENGTH = 1000; + //using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r")) using var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read); + using var reader = new BinaryReader(dctFile); // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - string currentStr = GetCCByGB2312Id(i); - // if (i == 5231) - // System.out.println(i); - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32(); + string currentStr = GetCCByGB2312Id(i); + int cnt; + try + { + cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer + } + catch (EndOfStreamException) Review Comment: Please do 
not swallow exceptions that would be helpful for debugging if the file format is incorrect. ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs: ########## @@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj) /// <summary> /// Load the datafile into this <see cref="WordDictionary"/> /// </summary> - /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> - /// <returns>number of words read</returns> + /// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param> + /// <returns>Number of words read</returns> /// <exception cref="IOException">If there is a low-level I/O error.</exception> private int LoadMainDataFromFile(string dctFilePath) { - int i, cnt, length, total = 0; - // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. + int total = 0; + + // The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760). // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET: Use BinaryReader to simplify endian conversion and stream reading. 
+ using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read)) + using (var reader = new BinaryReader(dctFile)) { - // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - // if (i == 5231) - // System.out.println(i); + int cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32(); if (cnt <= 0) { wordItem_charArrayTable[i] = null; wordItem_frequencyTable[i] = null; continue; } + wordItem_charArrayTable[i] = new char[cnt][]; wordItem_frequencyTable[i] = new int[cnt]; total += cnt; - int j = 0; - while (j < cnt) + + for (int j = 0; j < cnt; j++) { - // wordItemTable[i][j] = new WordItem(); - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// frequency - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// length - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[2] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// handle - - // wordItemTable[i][j].frequency = buffer[0]; - wordItem_frequencyTable[i][j] = buffer[0]; - - length = buffer[1]; + // LUCENENET: Use BinaryReader methods instead of ByteBuffer + int frequency = reader.ReadInt32(); + int length = reader.ReadInt32(); + reader.ReadInt32(); // Skip handle (unused) + + wordItem_frequencyTable[i][j] = frequency; + if (length > 0) { - byte[] lchBuffer = new byte[length]; - dctFile.Read(lchBuffer, 0, lchBuffer.Length); - tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance 
from base class + byte[] lchBuffer = reader.ReadBytes(length); + string tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET: Use cached encoding instance from base class wordItem_charArrayTable[i][j] = tmpword.ToCharArray(); } else { - // wordItemTable[i][j].charArray = null; wordItem_charArrayTable[i][j] = null; } - // System.out.println(indexTable[i].wordItems[j]); - j++; Review Comment: Please increment `j` here as it was done upstream. ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs: ########## @@ -162,7 +162,7 @@ public virtual long Hash1(char c) /// </summary> Review Comment: At the top of this file, please change the declaration to throw on invalid input, since all of the callers expect exceptions for the proper handling. ```c# protected static readonly Encoding gb2312Encoding = Encoding.GetEncoding("GB2312", EncoderFallback.ExceptionFallback, DecoderFallback.ExceptionFallback); ``` ########## src/Lucene.Net.Tests.Analysis.SmartCn/DictionaryTests.cs: ########## @@ -0,0 +1,72 @@ +using Lucene.Net.Util; +using Lucene.Net.Analysis.Cn.Smart.Hhmm; +using Lucene.Net.Attributes; +using NUnit.Framework; +using System; +using System.IO; +using System.Reflection; + + +[TestFixture] +[LuceneNetSpecific] +public class DictionaryTests : LuceneTestCase +{ + private const string BigramResourceName = "Lucene.Net.Tests.Analysis.SmartCn.Resources.bigramdict.dct"; + + [Test, Category("Dictionary")] + public void TestBigramDictionary() + { + using var resourceStream = GetResourceStream(BigramResourceName); + + FileInfo _tempFile = CreateTempFile("bigramdict", ".dct"); + CopyStreamToFile(resourceStream, _tempFile); + + Assert.IsTrue(_tempFile.Length > 0, "Temp file is empty."); + + BigramDictionary bigramDict = BigramDictionary.GetInstance(); + bigramDict.LoadFromFile(_tempFile.FullName); + + Assert.AreEqual(10, bigramDict.GetFrequency("啊hello".AsSpan()), "Frequency for '啊hello' is incorrect."); + Assert.AreEqual(20,
bigramDict.GetFrequency("阿world".AsSpan()), "Frequency for '阿world' is incorrect."); + } + + [Test, Category("Dictionary")] Review Comment: Please remove the Category attribute. ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs: ########## @@ -254,80 +254,83 @@ private void Load(string dictRoot) /// <summary> /// Load the datafile into this <see cref="BigramDictionary"/> /// </summary> - /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary (bigramdict.dct)</param> + /// <param name="dctFilePath">Path to the Bigramdictionary (bigramdict.dct)</param> /// <exception cref="IOException">If there is a low-level I/O error</exception> public virtual void LoadFromFile(string dctFilePath) { - int i, cnt, length, total = 0; // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. 
+ // LUCENENET specific - refactored constants for clarity + const int HEADER_POSITION = 3755; + const int MAX_VALID_LENGTH = 1000; + //using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r")) using var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read); + using var reader = new BinaryReader(dctFile); // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - string currentStr = GetCCByGB2312Id(i); - // if (i == 5231) - // System.out.println(i); - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32(); + string currentStr = GetCCByGB2312Id(i); + int cnt; + try + { + cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer + } + catch (EndOfStreamException) + { + // Reached end of file + break; + } + if (cnt <= 0) { continue; } - total += cnt; - int j = 0; - while (j < cnt) + + for (int j = 0; j < cnt; j++) { - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// frequency - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// length - dctFile.Read(intBuffer, 0, intBuffer.Length); - // buffer[2] = ByteBuffer.wrap(intBuffer).order( - // ByteOrder.LITTLE_ENDIAN).getInt();// handle - - length = buffer[1]; - if (length > 0) + // LUCENENET: Use BinaryReader methods instead of ByteBuffer + int frequency = reader.ReadInt32(); + int length = reader.ReadInt32(); + reader.ReadInt32(); // Skip handle value (unused) Review Comment: Please set the handle to `buffer[2]` as it was done upstream. 
########## src/Lucene.Net.Tests.Analysis.SmartCn/DictionaryTests.cs: ########## @@ -0,0 +1,72 @@ +using Lucene.Net.Util; +using Lucene.Net.Analysis.Cn.Smart.Hhmm; +using Lucene.Net.Attributes; +using NUnit.Framework; +using System; +using System.IO; +using System.Reflection; + + +[TestFixture] +[LuceneNetSpecific] +public class DictionaryTests : LuceneTestCase +{ + private const string BigramResourceName = "Lucene.Net.Tests.Analysis.SmartCn.Resources.bigramdict.dct"; + + [Test, Category("Dictionary")] Review Comment: Please remove the Category attribute. ########## src/Lucene.Net.Tests.Analysis.SmartCn/DictionaryTests.cs: ########## @@ -0,0 +1,72 @@ +using Lucene.Net.Util; +using Lucene.Net.Analysis.Cn.Smart.Hhmm; +using Lucene.Net.Attributes; +using NUnit.Framework; +using System; +using System.IO; +using System.Reflection; + + Review Comment: Please put these tests in the `Lucene.Net.Analysis.Cn.Smart.Hhmm` namespace. Since this file doesn't exist upstream, it should be moved into a subfolder named `Support/Hhmm`. ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs: ########## @@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj) /// <summary> /// Load the datafile into this <see cref="WordDictionary"/> /// </summary> - /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> - /// <returns>number of words read</returns> + /// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param> + /// <returns>Number of words read</returns> /// <exception cref="IOException">If there is a low-level I/O error.</exception> private int LoadMainDataFromFile(string dctFilePath) { - int i, cnt, length, total = 0; - // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. + int total = 0; + + // The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760). // The 3756th is used (as a header) to store information. 
- int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET: Use BinaryReader to simplify endian conversion and stream reading. + using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read)) + using (var reader = new BinaryReader(dctFile)) { - // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - // if (i == 5231) - // System.out.println(i); + int cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer Review Comment: Please include the reason why we changed to use BinaryReader. ```c# // LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET ``` ########## src/Lucene.Net.Tests.Analysis.SmartCn/DictionaryTests.cs: ########## @@ -0,0 +1,72 @@ +using Lucene.Net.Util; +using Lucene.Net.Analysis.Cn.Smart.Hhmm; +using Lucene.Net.Attributes; +using NUnit.Framework; +using System; +using System.IO; +using System.Reflection; + + +[TestFixture] +[LuceneNetSpecific] +public class DictionaryTests : LuceneTestCase +{ + private const string BigramResourceName = "Lucene.Net.Tests.Analysis.SmartCn.Resources.bigramdict.dct"; + + [Test, Category("Dictionary")] + public void TestBigramDictionary() + { + using var resourceStream = GetResourceStream(BigramResourceName); + + FileInfo _tempFile = CreateTempFile("bigramdict", ".dct"); + CopyStreamToFile(resourceStream, _tempFile); + + Assert.IsTrue(_tempFile.Length > 0, "Temp file is empty."); + + BigramDictionary bigramDict = BigramDictionary.GetInstance(); + bigramDict.LoadFromFile(_tempFile.FullName); + + Assert.AreEqual(10, bigramDict.GetFrequency("啊hello".AsSpan()), "Frequency for '啊hello' is incorrect."); + 
Assert.AreEqual(20, bigramDict.GetFrequency("阿world".AsSpan()), "Frequency for '阿world' is incorrect."); + } + + [Test, Category("Dictionary")] + public void TestWordDictionaryGetInstance() + { + WordDictionary wordDict = WordDictionary.GetInstance(); + + Assert.NotNull(wordDict, "WordDictionary.GetInstance() returned null."); Review Comment: Note that it is impossible for `WordDictionary.GetInstance()` to return `null`, so this assert is unnecessary. ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs: ########## @@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj) /// <summary> /// Load the datafile into this <see cref="WordDictionary"/> /// </summary> - /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> - /// <returns>number of words read</returns> + /// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param> + /// <returns>Number of words read</returns> /// <exception cref="IOException">If there is a low-level I/O error.</exception> private int LoadMainDataFromFile(string dctFilePath) { - int i, cnt, length, total = 0; - // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. + int total = 0; + + // The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760). // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET: Use BinaryReader to simplify endian conversion and stream reading. 
+ using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read)) + using (var reader = new BinaryReader(dctFile)) { - // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - // if (i == 5231) - // System.out.println(i); + int cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32(); if (cnt <= 0) { wordItem_charArrayTable[i] = null; wordItem_frequencyTable[i] = null; continue; } + wordItem_charArrayTable[i] = new char[cnt][]; wordItem_frequencyTable[i] = new int[cnt]; total += cnt; - int j = 0; - while (j < cnt) + + for (int j = 0; j < cnt; j++) { - // wordItemTable[i][j] = new WordItem(); - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// frequency - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// length - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[2] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// handle - - // wordItemTable[i][j].frequency = buffer[0]; - wordItem_frequencyTable[i][j] = buffer[0]; - - length = buffer[1]; + // LUCENENET: Use BinaryReader methods instead of ByteBuffer Review Comment: Please include the reason why we changed to use BinaryReader. 
```c# // LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET ``` ########## src/Lucene.Net.Tests.Analysis.SmartCn/DictionaryTests.cs: ########## @@ -0,0 +1,72 @@ +using Lucene.Net.Util; +using Lucene.Net.Analysis.Cn.Smart.Hhmm; +using Lucene.Net.Attributes; +using NUnit.Framework; +using System; +using System.IO; +using System.Reflection; + + +[TestFixture] +[LuceneNetSpecific] +public class DictionaryTests : LuceneTestCase +{ + private const string BigramResourceName = "Lucene.Net.Tests.Analysis.SmartCn.Resources.bigramdict.dct"; + + [Test, Category("Dictionary")] + public void TestBigramDictionary() + { + using var resourceStream = GetResourceStream(BigramResourceName); + + FileInfo _tempFile = CreateTempFile("bigramdict", ".dct"); + CopyStreamToFile(resourceStream, _tempFile); + + Assert.IsTrue(_tempFile.Length > 0, "Temp file is empty."); + + BigramDictionary bigramDict = BigramDictionary.GetInstance(); + bigramDict.LoadFromFile(_tempFile.FullName); + + Assert.AreEqual(10, bigramDict.GetFrequency("啊hello".AsSpan()), "Frequency for '啊hello' is incorrect."); + Assert.AreEqual(20, bigramDict.GetFrequency("阿world".AsSpan()), "Frequency for '阿world' is incorrect."); + } + + [Test, Category("Dictionary")] + public void TestWordDictionaryGetInstance() + { + WordDictionary wordDict = WordDictionary.GetInstance(); Review Comment: Please prepare the temp file for loading the same way that was done in the example for `TestBigramDictionary()`. 
########## src/Lucene.Net.Tests.Analysis.SmartCn/DictionaryTests.cs: ########## @@ -0,0 +1,72 @@ +using Lucene.Net.Util; +using Lucene.Net.Analysis.Cn.Smart.Hhmm; +using Lucene.Net.Attributes; +using NUnit.Framework; +using System; +using System.IO; +using System.Reflection; + + +[TestFixture] +[LuceneNetSpecific] +public class DictionaryTests : LuceneTestCase Review Comment: Please set up the directory and name the files with the correct conventions so they are loaded using the existing business logic. This gives us better test coverage than using a custom file name and loading it inside of the test. Also, it is important to consistently use the `CreateTempDir()` and `CreateTempFile()` methods of `LuceneTestCase`, since they are already set up to delete the temp files when the test is finished running. In this case, we should use `CreateTempDir()` so we can update the `AnalyzerProfile.ANALYSIS_DATA_DIR` at the beginning of the test with this temporary location. Then all we need to do is put temp files named `bigramdict.dct` and `coredict.dct` into that location for the corresponding classes to load them. To get the resource stream, please use `this.GetType().FindAndGetResourceStream("bigramdict.dct");`. For that to work, the `bigramdict.dct` file must be in the same directory as this file in the project. It must not be in a subdirectory named `Resources`. So, these files should be placed in the `Support/Hhmm` folder, along with `DictionaryTests.cs`. `FindAndGetResourceStream()` is an extension method in the `J2N` namespace that mimics the classpath functionality in Java for embedded resource files, which makes them relative to the class structure.
```c# private const string BigramFileName = "bigramdict.dct"; private DirectoryInfo tempDir; public override void OneTimeSetUp() { tempDir = CreateTempDir("smartcn-data"); AnalyzerProfile.ANALYSIS_DATA_DIR = tempDir.FullName; } [Test] public void TestBigramDictionary() { using var resourceStream = this.GetType().FindAndGetResourceStream(BigramFileName); string tempFile = Path.Combine(tempDir.FullName, BigramFileName); using (var tempStream = File.Create(tempFile)) { resourceStream.CopyTo(tempStream); Assert.IsTrue(tempStream.Length > 0, "Temp file is empty."); } BigramDictionary bigramDict = BigramDictionary.GetInstance(); Assert.AreEqual(10, bigramDict.GetFrequency("啊hello".AsSpan()), "Frequency for '啊hello' is incorrect."); Assert.AreEqual(20, bigramDict.GetFrequency("阿world".AsSpan()), "Frequency for '阿world' is incorrect."); } ``` ########## src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.csproj: ########## @@ -64,4 +64,9 @@ <PackageReference Include="System.Text.Encoding.CodePages" Version="$(SystemTextEncodingCodePagesPackageVersion)" /> </ItemGroup> + <ItemGroup> + <InternalsVisibleTo Include="Lucene.Net.Tests.Analysis.SmartCn" /> Review Comment: Please indent the ItemGroup and InternalsVisibleTo (using spaces) appropriately. 
########## src/Lucene.Net.Tests.Analysis.SmartCn/DictionaryTests.cs: ########## @@ -0,0 +1,72 @@ +using Lucene.Net.Util; +using Lucene.Net.Analysis.Cn.Smart.Hhmm; +using Lucene.Net.Attributes; +using NUnit.Framework; +using System; +using System.IO; +using System.Reflection; + + +[TestFixture] +[LuceneNetSpecific] +public class DictionaryTests : LuceneTestCase +{ + private const string BigramResourceName = "Lucene.Net.Tests.Analysis.SmartCn.Resources.bigramdict.dct"; + + [Test, Category("Dictionary")] + public void TestBigramDictionary() + { + using var resourceStream = GetResourceStream(BigramResourceName); + + FileInfo _tempFile = CreateTempFile("bigramdict", ".dct"); + CopyStreamToFile(resourceStream, _tempFile); + + Assert.IsTrue(_tempFile.Length > 0, "Temp file is empty."); + + BigramDictionary bigramDict = BigramDictionary.GetInstance(); + bigramDict.LoadFromFile(_tempFile.FullName); + + Assert.AreEqual(10, bigramDict.GetFrequency("啊hello".AsSpan()), "Frequency for '啊hello' is incorrect."); + Assert.AreEqual(20, bigramDict.GetFrequency("阿world".AsSpan()), "Frequency for '阿world' is incorrect."); + } + + [Test, Category("Dictionary")] + public void TestWordDictionaryGetInstance() + { + WordDictionary wordDict = WordDictionary.GetInstance(); + + Assert.NotNull(wordDict, "WordDictionary.GetInstance() returned null."); + Review Comment: Please add additional assertions to ensure the data loaded correctly. 
########## src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs: ########## @@ -254,80 +254,83 @@ private void Load(string dictRoot) /// <summary> /// Load the datafile into this <see cref="BigramDictionary"/> /// </summary> - /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary (bigramdict.dct)</param> + /// <param name="dctFilePath">Path to the Bigramdictionary (bigramdict.dct)</param> /// <exception cref="IOException">If there is a low-level I/O error</exception> public virtual void LoadFromFile(string dctFilePath) { - int i, cnt, length, total = 0; // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET specific - refactored constants for clarity + const int HEADER_POSITION = 3755; + const int MAX_VALID_LENGTH = 1000; + //using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r")) using var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read); + using var reader = new BinaryReader(dctFile); // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - string currentStr = GetCCByGB2312Id(i); - // if (i == 5231) - // System.out.println(i); - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32(); + string currentStr = GetCCByGB2312Id(i); + int cnt; + try + { + cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer Review Comment: Please include the reason why we changed to 
use BinaryReader. ```c# // LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET ``` ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs: ########## @@ -254,80 +254,83 @@ private void Load(string dictRoot) /// <summary> /// Load the datafile into this <see cref="BigramDictionary"/> /// </summary> - /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary (bigramdict.dct)</param> + /// <param name="dctFilePath">Path to the Bigramdictionary (bigramdict.dct)</param> /// <exception cref="IOException">If there is a low-level I/O error</exception> public virtual void LoadFromFile(string dctFilePath) { - int i, cnt, length, total = 0; // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. 
+ // LUCENENET specific - refactored constants for clarity + const int HEADER_POSITION = 3755; + const int MAX_VALID_LENGTH = 1000; + //using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r")) using var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read); + using var reader = new BinaryReader(dctFile); // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - string currentStr = GetCCByGB2312Id(i); - // if (i == 5231) - // System.out.println(i); - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32(); + string currentStr = GetCCByGB2312Id(i); + int cnt; + try + { + cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer + } + catch (EndOfStreamException) + { + // Reached end of file + break; + } + if (cnt <= 0) { continue; } - total += cnt; - int j = 0; - while (j < cnt) + + for (int j = 0; j < cnt; j++) { - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// frequency - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// length - dctFile.Read(intBuffer, 0, intBuffer.Length); - // buffer[2] = ByteBuffer.wrap(intBuffer).order( - // ByteOrder.LITTLE_ENDIAN).getInt();// handle - - length = buffer[1]; - if (length > 0) + // LUCENENET: Use BinaryReader methods instead of ByteBuffer + int frequency = reader.ReadInt32(); + int length = reader.ReadInt32(); + reader.ReadInt32(); // Skip handle value (unused) + + if (length > 0 && length <= MAX_VALID_LENGTH && dctFile.Position + length <= dctFile.Length) { - byte[] lchBuffer = new byte[length]; - 
dctFile.Read(lchBuffer, 0, lchBuffer.Length); + byte[] lchBuffer = reader.ReadBytes(length); // LUCENENET: Use BinaryReader methods instead of ByteBuffer Review Comment: Please include the reason why we changed to use BinaryReader. ```c# // LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET ``` ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs: ########## @@ -254,80 +254,83 @@ private void Load(string dictRoot) /// <summary> /// Load the datafile into this <see cref="BigramDictionary"/> /// </summary> - /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary (bigramdict.dct)</param> + /// <param name="dctFilePath">Path to the Bigramdictionary (bigramdict.dct)</param> /// <exception cref="IOException">If there is a low-level I/O error</exception> public virtual void LoadFromFile(string dctFilePath) { - int i, cnt, length, total = 0; // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET specific - refactored constants for clarity + const int HEADER_POSITION = 3755; Review Comment: It looks like there is a problem here, since the original code used 3756 instead of 3755. I could be wrong, though. This might be due to the change in loop format or it may be an incorrect comment in Java. We should ensure our format exactly matches Lucene, as these files should be portable between .NET and Java. Do note that there is a file in `analysis-data.zip` [here](https://issues.apache.org/jira/browse/LUCENE-1629) that can be used to check whether we still support the original format. 
Whatever the case, this line should be updated either with the correct number or a comment explaining why/how the upstream code was wrong. ########## src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.csproj: ########## @@ -62,5 +62,10 @@ <ItemGroup Condition=" '$(TargetFramework)' == 'net472' "> <PackageReference Include="System.Text.Encoding.CodePages" Version="$(SystemTextEncodingCodePagesPackageVersion)" /> </ItemGroup> + <ItemGroup> + <EmbeddedResource Include="Resources\bigramdict.dct" /> Review Comment: Please expand this to include all files with these extensions, as was done in the other analysis packages. Also, please place this element above the other `ItemGroup` elements as was done in the morfologik and kuromoji projects and ensure it is properly indented (with 2 spaces per level). ```xml <ItemGroup> <EmbeddedResource Include="**/*.dct" Exclude="bin/**/*;obj/**/*" Label="Dictionary Test Data" /> </ItemGroup> ``` > Note that these files will need to be moved into the `Support/Hhmm` folder along with the tests. ########## src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs: ########## @@ -254,80 +254,83 @@ private void Load(string dictRoot) /// <summary> /// Load the datafile into this <see cref="BigramDictionary"/> /// </summary> - /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary (bigramdict.dct)</param> + /// <param name="dctFilePath">Path to the Bigramdictionary (bigramdict.dct)</param> /// <exception cref="IOException">If there is a low-level I/O error</exception> public virtual void LoadFromFile(string dctFilePath) { - int i, cnt, length, total = 0; // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. // The 3756th is used (as a header) to store information. 
- int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; + + // LUCENENET: Removed buffer and intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way. + // LUCENENET specific - refactored constants for clarity + const int HEADER_POSITION = 3755; + const int MAX_VALID_LENGTH = 1000; + //using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r")) using var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read); + using var reader = new BinaryReader(dctFile); // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { - string currentStr = GetCCByGB2312Id(i); - // if (i == 5231) - // System.out.println(i); - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32(); + string currentStr = GetCCByGB2312Id(i); + int cnt; + try + { + cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader methods instead of ByteBuffer + } + catch (EndOfStreamException) + { + // Reached end of file + break; + } + if (cnt <= 0) { continue; } - total += cnt; - int j = 0; - while (j < cnt) + + for (int j = 0; j < cnt; j++) { - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// frequency - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian) - .GetInt32();// length - dctFile.Read(intBuffer, 0, intBuffer.Length); - // buffer[2] = ByteBuffer.wrap(intBuffer).order( - // ByteOrder.LITTLE_ENDIAN).getInt();// handle - - length = buffer[1]; - if (length > 0) + // LUCENENET: Use BinaryReader methods instead of ByteBuffer Review Comment: Please include the reason 
why we changed to use BinaryReader. ```c# // LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: dev-unsubscr...@lucenenet.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org