Copilot commented on code in PR #402:
URL: https://github.com/apache/fory-site/pull/402#discussion_r2778032272


##########
.github/scripts/detect-duplicates.py:
##########
@@ -0,0 +1,336 @@
+#!/usr/bin/env python3
+"""
+Duplicate Issue and Pull Request Detection Script
+Detects potential duplicate issues and PRs using text similarity analysis.
+"""
+
+import os
+import sys
+import argparse
+from typing import List, Tuple
+from github import Github, GithubException
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import yaml
+
+# Configuration defaults
+DEFAULT_SIMILARITY_THRESHOLD = 0.75
+DEFAULT_HIGH_SIMILARITY_THRESHOLD = 0.90
+DEFAULT_MAX_ISSUES_TO_CHECK = 200
+DEFAULT_AUTO_CLOSE_EXACT_MATCH = False
+DEFAULT_LABEL_POSSIBLE_DUPLICATE = "possible-duplicate"
+DEFAULT_LABEL_EXACT_DUPLICATE = "duplicate"
+DEFAULT_MAX_SIMILAR_TO_SHOW = 5
+
+
+class DuplicateDetector:
+    """Detects duplicate issues and pull requests."""
+    
+    def __init__(self, token: str, repo_name: str, config_path: str = None):
+        try:
+            self.github = Github(token)
+            self.repo = self.github.get_repo(repo_name)
+            self.config = self.load_config(config_path)
+        except Exception as e:
+            print(f"Error initializing GitHub connection: {e}")
+            sys.exit(1)
+        
+    def load_config(self, config_path: str = None) -> dict:
+        """Load configuration from YAML file or use defaults."""
+        default_config = {
+            'similarity_threshold': DEFAULT_SIMILARITY_THRESHOLD,
+            'high_similarity_threshold': DEFAULT_HIGH_SIMILARITY_THRESHOLD,
+            'max_issues_to_check': DEFAULT_MAX_ISSUES_TO_CHECK,
+            'auto_close_exact_match': DEFAULT_AUTO_CLOSE_EXACT_MATCH,
+            'label_possible_duplicate': DEFAULT_LABEL_POSSIBLE_DUPLICATE,
+            'label_exact_duplicate': DEFAULT_LABEL_EXACT_DUPLICATE,
+            'exclude_labels': ['wontfix', 'invalid'],
+            'min_text_length': 20,
+            'max_similar_to_show': DEFAULT_MAX_SIMILAR_TO_SHOW,
+        }
+        
+        if config_path and os.path.exists(config_path):
+            try:
+                with open(config_path, 'r') as f:
+                    user_config = yaml.safe_load(f)
+                    if user_config is None:
+                        user_config = {}
+                    elif not isinstance(user_config, dict):
+                        raise ValueError("Config file must contain a mapping at the top level")
+                    default_config.update(user_config)
+            except Exception as e:
+                print(f"Warning: Could not load config file: {e}")
+        
+        return default_config
+    
+    def preprocess_text(self, text: str) -> str:
+        """Preprocess text for comparison."""
+        if not text:
+            return ""
+        # Convert to lowercase and strip whitespace
+        text = text.lower().strip()
+        # Remove URLs
+        import re
+        text = re.sub(r'http\S+|www.\S+', '', text)
+        # Remove markdown code blocks
+        text = re.sub(r'```[\s\S]*?```', '', text)
+        # Remove special characters but keep spaces
+        text = re.sub(r'[^a-z0-9\s]', ' ', text)
+        # Remove extra whitespace
+        text = ' '.join(text.split())
+        return text
+    
+    def find_similar_issues(self, current_number: int, current_title: str,
+                           current_body: str, item_type: str = 'issue') -> List[Tuple[int, str, float]]:
+        """Find similar issues or PRs."""
+        current_text = self.preprocess_text(f"{current_title} {current_body}")
+        
+        if len(current_text) < self.config['min_text_length']:
+            print(f"Text too short for meaningful comparison: {len(current_text)} chars")
+            return []
+        
+        # Get existing items to compare against
+        if item_type == 'issue':
+            items = self.repo.get_issues(state='all')
+        else:
+            items = self.repo.get_pulls(state='all')
+        
+        # First pass: collect all candidate items and their texts
+        candidates = []  # List of (item_number, item_title, item_text)
+        checked_count = 0
+        
+        try:
+            for item in items:
+                if checked_count >= self.config['max_issues_to_check']:
+                    break
+                
+                # Skip the current item
+                if item.number == current_number:
+                    continue
+                
+                # Skip pull requests when checking issues (PRs are returned by get_issues API)
+                if item_type == 'issue' and hasattr(item, 'pull_request') and item.pull_request:
+                    continue
+                
+                try:
+                    # Skip items with excluded labels
+                    item_labels = [label.name for label in item.labels]
+                    if any(label in self.config['exclude_labels'] for label in item_labels):
+                        continue
+                    
+                    # Preprocess and store candidate text
+                    item_text = self.preprocess_text(f"{item.title} {item.body or ''}")
+                    if item_text:  # Only include non-empty texts
+                        candidates.append((item.number, item.title, item_text))
+                except Exception as e:
+                    print(f"Warning: Error processing item #{item.number}: {e}")
+                    continue
+                finally:
+                    checked_count += 1
+        except Exception as e:
+            print(f"Error fetching items from repository: {e}")
+            print("This might be due to API rate limits or permissions issues.")
+            return []
+        
+        if not candidates:
+            return []
+        
+        # Second pass: calculate all similarities at once
+        try:
+            # Build corpus: current text + all candidate texts
+            corpus = [current_text] + [text for _, _, text in candidates]
+            
+            # Fit vectorizer once on entire corpus
+            vectorizer = TfidfVectorizer(
+                min_df=1,
+                stop_words='english',
+                ngram_range=(1, 2)
+            )
+            tfidf_matrix = vectorizer.fit_transform(corpus)
+            
+            # Compute similarities between current item (index 0) and all candidates
+            similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]
+            
+            # Build results list with items meeting threshold
+            similar_items = []
+            for i, (num, title, _) in enumerate(candidates):
+                similarity = float(similarities[i])
+                if similarity >= self.config['similarity_threshold']:
+                    similar_items.append((num, title, similarity))
+            
+        except Exception as e:
+            print(f"Error calculating similarities: {e}")
+            return []
+        
+        # Sort by similarity (highest first)
+        similar_items.sort(key=lambda x: x[2], reverse=True)
+        return similar_items
+    
+    def add_label(self, item_number: int, label: str, item_type: str = 'issue'):
+        """Add a label to an issue or PR."""
+        try:
+            # Ensure label exists
+            try:
+                self.repo.get_label(label)
+            except GithubException:
+                # Create label if it doesn't exist (label not found)
+                if label == self.config['label_possible_duplicate']:
+                    self.repo.create_label(label, "FFA500", "Potential duplicate issue")
+                elif label == self.config['label_exact_duplicate']:
+                    self.repo.create_label(label, "FF0000", "Exact duplicate issue")
+            
+            if item_type == 'issue':
+                item = self.repo.get_issue(item_number)
+            else:
+                item = self.repo.get_pull(item_number)
+            
+            item.add_to_labels(label)
+            print(f"Added label '{label}' to {item_type} #{item_number}")
+        except Exception as e:
+            print(f"Error adding label: {e}")
+    
+    def add_comment(self, item_number: int, similar_items: List[Tuple[int, str, float]],
+                   item_type: str = 'issue'):
+        """Add a comment about potential duplicates."""
+        if not similar_items:
+            return
+        
+        item_type_name = "issue" if item_type == 'issue' else "pull request"
+        
+        # Build comment message
+        comment = f"👋 **Potential Duplicate Detected**\n\n"
+        comment += f"This {item_type_name} appears to be similar to existing {item_type_name}s:\n\n"
+        
+        max_to_show = self.config['max_similar_to_show']
+        for number, title, similarity in similar_items[:max_to_show]:
+            similarity_pct = int(similarity * 100)
+            comment += f"- #{number}: {title} (Similarity: {similarity_pct}%)\n"

Review Comment:
   The bot comment includes raw issue/PR titles in markdown (`- #{number}: 
{title} ...`). Titles can contain `@mentions`, which will trigger notifications 
when the bot posts and can be abused for spam. Consider sanitizing/escaping 
`@` (and other markdown) in titles before posting — or wrapping titles in inline 
code — to prevent unwanted mentions.
   ```suggestion
               sanitized_title = title.replace("@", "@\u200b")
               comment += f"- #{number}: {sanitized_title} (Similarity: {similarity_pct}%)\n"
   ```



##########
.github/scripts/detect-duplicates.py:
##########
@@ -0,0 +1,336 @@
+#!/usr/bin/env python3
+"""
+Duplicate Issue and Pull Request Detection Script
+Detects potential duplicate issues and PRs using text similarity analysis.
+"""
+
+import os
+import sys
+import argparse
+from typing import List, Tuple
+from github import Github, GithubException
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import yaml
+
+# Configuration defaults
+DEFAULT_SIMILARITY_THRESHOLD = 0.75
+DEFAULT_HIGH_SIMILARITY_THRESHOLD = 0.90
+DEFAULT_MAX_ISSUES_TO_CHECK = 200
+DEFAULT_AUTO_CLOSE_EXACT_MATCH = False
+DEFAULT_LABEL_POSSIBLE_DUPLICATE = "possible-duplicate"
+DEFAULT_LABEL_EXACT_DUPLICATE = "duplicate"
+DEFAULT_MAX_SIMILAR_TO_SHOW = 5
+
+
+class DuplicateDetector:
+    """Detects duplicate issues and pull requests."""
+    
+    def __init__(self, token: str, repo_name: str, config_path: str = None):
+        try:
+            self.github = Github(token)
+            self.repo = self.github.get_repo(repo_name)
+            self.config = self.load_config(config_path)
+        except Exception as e:
+            print(f"Error initializing GitHub connection: {e}")
+            sys.exit(1)
+        
+    def load_config(self, config_path: str = None) -> dict:
+        """Load configuration from YAML file or use defaults."""
+        default_config = {
+            'similarity_threshold': DEFAULT_SIMILARITY_THRESHOLD,
+            'high_similarity_threshold': DEFAULT_HIGH_SIMILARITY_THRESHOLD,
+            'max_issues_to_check': DEFAULT_MAX_ISSUES_TO_CHECK,
+            'auto_close_exact_match': DEFAULT_AUTO_CLOSE_EXACT_MATCH,
+            'label_possible_duplicate': DEFAULT_LABEL_POSSIBLE_DUPLICATE,
+            'label_exact_duplicate': DEFAULT_LABEL_EXACT_DUPLICATE,
+            'exclude_labels': ['wontfix', 'invalid'],
+            'min_text_length': 20,
+            'max_similar_to_show': DEFAULT_MAX_SIMILAR_TO_SHOW,
+        }
+        
+        if config_path and os.path.exists(config_path):
+            try:
+                with open(config_path, 'r') as f:
+                    user_config = yaml.safe_load(f)
+                    if user_config is None:
+                        user_config = {}
+                    elif not isinstance(user_config, dict):
+                        raise ValueError("Config file must contain a mapping at the top level")
+                    default_config.update(user_config)
+            except Exception as e:
+                print(f"Warning: Could not load config file: {e}")
+        
+        return default_config
+    
+    def preprocess_text(self, text: str) -> str:
+        """Preprocess text for comparison."""
+        if not text:
+            return ""
+        # Convert to lowercase and strip whitespace
+        text = text.lower().strip()
+        # Remove URLs
+        import re
+        text = re.sub(r'http\S+|www.\S+', '', text)
+        # Remove markdown code blocks
+        text = re.sub(r'```[\s\S]*?```', '', text)
+        # Remove special characters but keep spaces
+        text = re.sub(r'[^a-z0-9\s]', ' ', text)
+        # Remove extra whitespace

Review Comment:
   `preprocess_text` currently replaces anything outside `[a-z0-9\s]` with 
spaces, which strips all non-Latin characters (e.g., Chinese/Japanese), making 
similarity detection ineffective for non-English issues/PRs. Consider switching 
to Unicode-aware tokenization (e.g., keep `\w` in Unicode mode, or rely on 
`TfidfVectorizer` token patterns) so non-English text is preserved.



##########
.github/workflows/duplicate-detector.yml:
##########
@@ -0,0 +1,52 @@
+name: Duplicate Issue and PR Detection
+
+on:
+  issues:
+    types: [opened, reopened]

Review Comment:
   This PR is described as adding a duplicate issue/PR detector, but it also 
contains many unrelated website/i18n/formatting changes. Consider splitting 
those unrelated changes into a separate PR so the automation change can be 
reviewed and rolled out independently.



##########
.github/workflows/duplicate-detector.yml:
##########
@@ -0,0 +1,52 @@
+name: Duplicate Issue and PR Detection
+
+on:
+  issues:
+    types: [opened, reopened]
+  pull_request_target:
+    types: [opened, reopened]
+
+permissions:
+  issues: write
+  pull-requests: write
+  contents: read
+
+jobs:
+  detect-duplicates:
+    runs-on: ubuntu-latest
+    name: Check for Duplicate Issues and PRs
+    
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+      
+      - name: Install dependencies
+        run: |
+          pip install -r .github/scripts/requirements.txt

Review Comment:
   Installing `scikit-learn`/`numpy` during every run can be slow and flaky for 
an on-issue-open workflow. Consider enabling pip caching via 
`actions/setup-python` (e.g., `cache: pip`) so dependencies aren’t 
re-downloaded/built each time.



##########
.github/scripts/detect-duplicates.py:
##########
@@ -0,0 +1,336 @@
+#!/usr/bin/env python3
+"""
+Duplicate Issue and Pull Request Detection Script
+Detects potential duplicate issues and PRs using text similarity analysis.
+"""
+
+import os
+import sys
+import argparse
+from typing import List, Tuple
+from github import Github, GithubException
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import yaml
+
+# Configuration defaults
+DEFAULT_SIMILARITY_THRESHOLD = 0.75
+DEFAULT_HIGH_SIMILARITY_THRESHOLD = 0.90
+DEFAULT_MAX_ISSUES_TO_CHECK = 200
+DEFAULT_AUTO_CLOSE_EXACT_MATCH = False
+DEFAULT_LABEL_POSSIBLE_DUPLICATE = "possible-duplicate"
+DEFAULT_LABEL_EXACT_DUPLICATE = "duplicate"
+DEFAULT_MAX_SIMILAR_TO_SHOW = 5
+
+
+class DuplicateDetector:
+    """Detects duplicate issues and pull requests."""
+    
+    def __init__(self, token: str, repo_name: str, config_path: str = None):
+        try:
+            self.github = Github(token)
+            self.repo = self.github.get_repo(repo_name)
+            self.config = self.load_config(config_path)
+        except Exception as e:
+            print(f"Error initializing GitHub connection: {e}")
+            sys.exit(1)
+        
+    def load_config(self, config_path: str = None) -> dict:
+        """Load configuration from YAML file or use defaults."""
+        default_config = {
+            'similarity_threshold': DEFAULT_SIMILARITY_THRESHOLD,
+            'high_similarity_threshold': DEFAULT_HIGH_SIMILARITY_THRESHOLD,
+            'max_issues_to_check': DEFAULT_MAX_ISSUES_TO_CHECK,
+            'auto_close_exact_match': DEFAULT_AUTO_CLOSE_EXACT_MATCH,
+            'label_possible_duplicate': DEFAULT_LABEL_POSSIBLE_DUPLICATE,
+            'label_exact_duplicate': DEFAULT_LABEL_EXACT_DUPLICATE,
+            'exclude_labels': ['wontfix', 'invalid'],
+            'min_text_length': 20,
+            'max_similar_to_show': DEFAULT_MAX_SIMILAR_TO_SHOW,
+        }
+        
+        if config_path and os.path.exists(config_path):
+            try:
+                with open(config_path, 'r') as f:
+                    user_config = yaml.safe_load(f)
+                    if user_config is None:
+                        user_config = {}
+                    elif not isinstance(user_config, dict):
+                        raise ValueError("Config file must contain a mapping at the top level")
+                    default_config.update(user_config)
+            except Exception as e:
+                print(f"Warning: Could not load config file: {e}")
+        
+        return default_config
+    
+    def preprocess_text(self, text: str) -> str:
+        """Preprocess text for comparison."""
+        if not text:
+            return ""
+        # Convert to lowercase and strip whitespace
+        text = text.lower().strip()
+        # Remove URLs
+        import re
+        text = re.sub(r'http\S+|www.\S+', '', text)

Review Comment:
   In `preprocess_text`, the URL-stripping regex uses `www.\S+` where `.` 
matches any character. This will remove text starting with e.g. `wwwx...` 
unintentionally. Escape the dot (e.g. `www\.\S+`) or use a more robust URL 
pattern.
   ```suggestion
           text = re.sub(r'http\S+|www\.\S+', '', text)
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to