e-mhui commented on code in PR #9535:
URL: https://github.com/apache/seatunnel/pull/9535#discussion_r2188365957


##########
seatunnel-engine/seatunnel-engine-client/src/test/resources/batch_fake_to_console_multi_table.conf:
##########
@@ -15,7 +15,7 @@
 # limitations under the License.
 #
 ######
-###### This config file is a demonstration of streaming processing in 
seatunnel config
+###### This config file is a demonstration of batch processing in seatunnel 
config

Review Comment:
   However, there are too many files that need to be modified, so I wrote a 
Python script to detect irregular configuration files. The detection rules are 
as follows:
   
   1. Only text files whose names start with either "batch" or "streaming" 
and end with ".conf" are checked.
   2. If it is a streaming job, the file name must start with "streaming", the 
comment in the file must be "This config file is a demonstration of streaming 
processing", and the configuration must be job.mode = "streaming".
   3. If it is a batch job, the file name must start with "batch", the comment 
in the file must be "This config file is a demonstration of batch processing", 
and the configuration must be job.mode = "batch".
   
   All files flagged by the scan have been fixed, except for two files that 
remain questionable. They are streaming jobs, but their file names start with 
"batch":
   
   1. 
https://github.com/apache/seatunnel/blob/dev/seatunnel-e2e/seatunnel-core-e2e/seatunnel-starter-e2e/src/test/resources/batch_cancel_task_1.conf
   2. 
https://github.com/apache/seatunnel/blob/dev/seatunnel-e2e/seatunnel-core-e2e/seatunnel-starter-e2e/src/test/resources/batch_cancel_task_2.conf
   
   The complete scanning script is as follows:
   
   
   ```python
   import os
   import re
   
   
   def is_valid_config_file(file_path):
       """
       Check if a SeaTunnel config file meets the specifications.

       The header comment decides which job type the file claims to be
       (streaming is looked for first, then batch). The file name prefix
       and the job.mode setting must then agree with that job type.

       :param file_path: Full path of the file
       :return: Tuple of (is_valid, error_message); error_message is
                None when the file is valid
       """
       marker = "This config file is a demonstration of {} processing"
       # (label used in messages, file name prefix, job.mode as shown in
       # the message); the job.mode match itself is case-insensitive.
       job_types = (
           ("Streaming", "streaming", "STREAMING"),
           ("Batch", "batch", "batch"),
       )
   
       # Only the read can fail for I/O reasons, so keep the try small.
       try:
           with open(file_path, 'r', encoding='utf-8') as f:
               content = f.read()
       except (OSError, ValueError) as e:
           # ValueError also covers UnicodeDecodeError (non UTF-8 file).
           return False, f"Failed to read file: {str(e)}"
   
       file_name = os.path.basename(file_path).lower()
       for label, prefix, mode in job_types:
           if marker.format(prefix) not in content:
               continue
           if not file_name.startswith(prefix):
               return False, (
                   f"{label} config file should start with '{prefix}'"
               )
           mode_pattern = (
               r'job\.mode\s*=\s*["\']' + re.escape(mode) + r'["\']'
           )
           if not re.search(mode_pattern, content, re.IGNORECASE):
               return False, (
                   f"{label} config file should contain "
                   f"job.mode = \"{mode}\""
               )
           return True, None
   
       return False, "Not a valid SeaTunnel config file"
   
   
   def scan_config_files(directory):
       """
       Recursively scan all SeaTunnel config files in a directory.

       Only files whose lower-cased name starts with "batch" or
       "streaming" and ends with ".conf" are checked; directories named
       "target" are not descended into.

       :param directory: Directory path to scan
       :return: List of invalid files, each element is a tuple of
                (file_path, error_message)
       """
       prefixes = ('batch', 'streaming')
       invalid_files = []
   
       for root, dirs, files in os.walk(directory):
           # Prune "target" in place so os.walk never descends into it;
           # sorting keeps the report order stable between runs.
           dirs[:] = sorted(d for d in dirs if d != 'target')
   
           for file_name in sorted(files):
               # Only check batch*/streaming* files ending with .conf
               lower_name = file_name.lower()
               if not (lower_name.startswith(prefixes)
                       and lower_name.endswith('.conf')):
                   continue
               file_path = os.path.join(root, file_name)
               is_valid, error_msg = is_valid_config_file(file_path)
               if not is_valid:
                   invalid_files.append((file_path, error_msg))
   
       return invalid_files
   
   
   def print_invalid_files(invalid_files):
       """
       Write the scan result to standard output.

       An empty list produces a single success line; otherwise a
       numbered entry (path and error message) is printed per file.

       :param invalid_files: List of invalid files, each element is a
                             tuple of (file_path, error_message)
       """
       if invalid_files:
           entry_separator = "-" * 80
           print("\nFound the following invalid config files:")
           print("=" * 80)
           number = 0
           for path, reason in invalid_files:
               number += 1
               print(f"{number}. File path: {path}")
               print(f"   Error message: {reason}")
               print(entry_separator)
       else:
           print("All config files meet the specifications")
   
   
   if __name__ == "__main__":
       # The directory to scan is read interactively. The prompt literal
       # sits on its own short line so that mail clients do not wrap it
       # and break the string.
       target_directory = input(
           "Please enter the directory path to scan: "
       ).strip()
   
       if os.path.isdir(target_directory):
           print(f"Start scanning directory: {target_directory}")
           invalid_files = scan_config_files(target_directory)
           print_invalid_files(invalid_files)
           print("Scan completed")
       else:
           print(f"Error: {target_directory} is not a valid directory")
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to