Copilot commented on code in PR #402:
URL: https://github.com/apache/fory-site/pull/402#discussion_r2778012819
##########
.github/scripts/detect-duplicates.py:
##########
@@ -0,0 +1,315 @@
+#!/usr/bin/env python3
+"""
+Duplicate Issue and Pull Request Detection Script
+Detects potential duplicate issues and PRs using text similarity analysis.
+"""
+
+import os
+import sys
+import argparse
+import json
+from typing import List, Dict, Tuple
+from github import Github, GithubException
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+import yaml
+
+# Configuration defaults
+DEFAULT_SIMILARITY_THRESHOLD = 0.75
+DEFAULT_HIGH_SIMILARITY_THRESHOLD = 0.90
+DEFAULT_MAX_ISSUES_TO_CHECK = 200
+DEFAULT_AUTO_CLOSE_EXACT_MATCH = False
+DEFAULT_LABEL_POSSIBLE_DUPLICATE = "possible-duplicate"
+DEFAULT_LABEL_EXACT_DUPLICATE = "duplicate"
+
+
+class DuplicateDetector:
+ """Detects duplicate issues and pull requests."""
+
+ def __init__(self, token: str, repo_name: str, config_path: str = None):
+ try:
+ self.github = Github(token)
+ self.repo = self.github.get_repo(repo_name)
+ self.config = self.load_config(config_path)
+ except Exception as e:
+ print(f"Error initializing GitHub connection: {e}")
+ sys.exit(1)
+
+ def load_config(self, config_path: str = None) -> Dict:
+ """Load configuration from YAML file or use defaults."""
+ default_config = {
+ 'similarity_threshold': DEFAULT_SIMILARITY_THRESHOLD,
+ 'high_similarity_threshold': DEFAULT_HIGH_SIMILARITY_THRESHOLD,
+ 'max_issues_to_check': DEFAULT_MAX_ISSUES_TO_CHECK,
+ 'auto_close_exact_match': DEFAULT_AUTO_CLOSE_EXACT_MATCH,
+ 'label_possible_duplicate': DEFAULT_LABEL_POSSIBLE_DUPLICATE,
+ 'label_exact_duplicate': DEFAULT_LABEL_EXACT_DUPLICATE,
+ 'exclude_labels': ['wontfix', 'invalid'],
+ 'min_text_length': 20,
+ }
+
+ if config_path and os.path.exists(config_path):
+ try:
+ with open(config_path, 'r') as f:
+ user_config = yaml.safe_load(f)
+ default_config.update(user_config)
+ except Exception as e:
+ print(f"Warning: Could not load config file: {e}")
+
+ return default_config
+
+ def preprocess_text(self, text: str) -> str:
+ """Preprocess text for comparison."""
+ if not text:
+ return ""
+ # Convert to lowercase and strip whitespace
+ text = text.lower().strip()
+ # Remove URLs
+ import re
+ text = re.sub(r'http\S+|www.\S+', '', text)
+ # Remove markdown code blocks
+ text = re.sub(r'```[\s\S]*?```', '', text)
+ # Remove special characters but keep spaces
+ text = re.sub(r'[^a-z0-9\s]', ' ', text)
+ # Remove extra whitespace
+ text = ' '.join(text.split())
+ return text
+
+ def calculate_similarity(self, text1: str, text2: str) -> float:
+ """Calculate cosine similarity between two texts."""
+ if not text1 or not text2:
+ return 0.0
+
+ try:
+ vectorizer = TfidfVectorizer(
+ min_df=1,
+ stop_words='english',
+ ngram_range=(1, 2)
+ )
+ tfidf_matrix = vectorizer.fit_transform([text1, text2])
+ similarity = cosine_similarity(tfidf_matrix[0:1],
tfidf_matrix[1:2])[0][0]
+ return float(similarity)
+ except Exception as e:
+ print(f"Error calculating similarity: {e}")
+ return 0.0
+
+ def find_similar_issues(self, current_number: int, current_title: str,
+ current_body: str, item_type: str = 'issue') ->
List[Tuple[int, str, float]]:
+ """Find similar issues or PRs."""
+ current_text = self.preprocess_text(f"{current_title} {current_body}")
+
+ if len(current_text) < self.config['min_text_length']:
+ print(f"Text too short for meaningful comparison:
{len(current_text)} chars")
+ return []
+
+ similar_items = []
+
+ # Get existing items to compare against
+ if item_type == 'issue':
+ items = self.repo.get_issues(state='all')
+ else:
+ items = self.repo.get_pulls(state='all')
+
+ checked_count = 0
+
+ try:
+ for item in items:
+ if checked_count >= self.config['max_issues_to_check']:
+ break
+
+ # Skip the current item
+ if item.number == current_number:
+ continue
+
+ try:
+ # Skip items with excluded labels
+ item_labels = [label.name for label in item.labels]
+ if any(label in self.config['exclude_labels'] for label in
item_labels):
+ continue
+
+ # Calculate similarity
+ item_text = self.preprocess_text(f"{item.title} {item.body
or ''}")
+ similarity = self.calculate_similarity(current_text,
item_text)
+
+ if similarity >= self.config['similarity_threshold']:
+ similar_items.append((item.number, item.title,
similarity))
+ except Exception as e:
+ print(f"Warning: Error processing item #{item.number}:
{e}")
+ continue
+
+ checked_count += 1
Review Comment:
`checked_count` is only incremented when an item is processed without
hitting the inner exception path. If processing repeatedly errors (e.g.,
unexpected API shape), `max_issues_to_check` may never be reached and the loop
can scan far more items than intended. Increment the counter in a `finally`
block (or increment before the inner `try`) so the cap is always enforced.
```suggestion
finally:
checked_count += 1
```
##########
.github/scripts/detect-duplicates.py:
##########
@@ -0,0 +1,315 @@
+#!/usr/bin/env python3
+"""
+Duplicate Issue and Pull Request Detection Script
+Detects potential duplicate issues and PRs using text similarity analysis.
+"""
+
+import os
+import sys
+import argparse
+import json
+from typing import List, Dict, Tuple
+from github import Github, GithubException
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+import yaml
Review Comment:
`json`, `numpy as np`, and `Dict` are imported but not used. Removing unused
imports reduces dependency footprint and avoids the appearance that `numpy` is
required if it isn't.
##########
.github/scripts/detect-duplicates.py:
##########
@@ -0,0 +1,315 @@
+#!/usr/bin/env python3
+"""
+Duplicate Issue and Pull Request Detection Script
+Detects potential duplicate issues and PRs using text similarity analysis.
+"""
+
+import os
+import sys
+import argparse
+import json
+from typing import List, Dict, Tuple
+from github import Github, GithubException
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+import yaml
+
+# Configuration defaults
+DEFAULT_SIMILARITY_THRESHOLD = 0.75
+DEFAULT_HIGH_SIMILARITY_THRESHOLD = 0.90
+DEFAULT_MAX_ISSUES_TO_CHECK = 200
+DEFAULT_AUTO_CLOSE_EXACT_MATCH = False
+DEFAULT_LABEL_POSSIBLE_DUPLICATE = "possible-duplicate"
+DEFAULT_LABEL_EXACT_DUPLICATE = "duplicate"
+
+
+class DuplicateDetector:
+ """Detects duplicate issues and pull requests."""
+
+ def __init__(self, token: str, repo_name: str, config_path: str = None):
+ try:
+ self.github = Github(token)
+ self.repo = self.github.get_repo(repo_name)
+ self.config = self.load_config(config_path)
+ except Exception as e:
+ print(f"Error initializing GitHub connection: {e}")
+ sys.exit(1)
+
+ def load_config(self, config_path: str = None) -> Dict:
+ """Load configuration from YAML file or use defaults."""
+ default_config = {
+ 'similarity_threshold': DEFAULT_SIMILARITY_THRESHOLD,
+ 'high_similarity_threshold': DEFAULT_HIGH_SIMILARITY_THRESHOLD,
+ 'max_issues_to_check': DEFAULT_MAX_ISSUES_TO_CHECK,
+ 'auto_close_exact_match': DEFAULT_AUTO_CLOSE_EXACT_MATCH,
+ 'label_possible_duplicate': DEFAULT_LABEL_POSSIBLE_DUPLICATE,
+ 'label_exact_duplicate': DEFAULT_LABEL_EXACT_DUPLICATE,
+ 'exclude_labels': ['wontfix', 'invalid'],
+ 'min_text_length': 20,
+ }
+
+ if config_path and os.path.exists(config_path):
+ try:
+ with open(config_path, 'r') as f:
+ user_config = yaml.safe_load(f)
+ default_config.update(user_config)
+ except Exception as e:
+ print(f"Warning: Could not load config file: {e}")
+
+ return default_config
+
+ def preprocess_text(self, text: str) -> str:
+ """Preprocess text for comparison."""
+ if not text:
+ return ""
+ # Convert to lowercase and strip whitespace
+ text = text.lower().strip()
+ # Remove URLs
+ import re
+ text = re.sub(r'http\S+|www.\S+', '', text)
+ # Remove markdown code blocks
+ text = re.sub(r'```[\s\S]*?```', '', text)
+ # Remove special characters but keep spaces
+ text = re.sub(r'[^a-z0-9\s]', ' ', text)
+ # Remove extra whitespace
+ text = ' '.join(text.split())
+ return text
+
+ def calculate_similarity(self, text1: str, text2: str) -> float:
+ """Calculate cosine similarity between two texts."""
+ if not text1 or not text2:
+ return 0.0
+
+ try:
+ vectorizer = TfidfVectorizer(
+ min_df=1,
+ stop_words='english',
+ ngram_range=(1, 2)
+ )
+ tfidf_matrix = vectorizer.fit_transform([text1, text2])
+ similarity = cosine_similarity(tfidf_matrix[0:1],
tfidf_matrix[1:2])[0][0]
+ return float(similarity)
Review Comment:
`calculate_similarity()` re-fits a new `TfidfVectorizer` for every pairwise
comparison, which is unnecessarily expensive (up to `max_issues_to_check` fits
per run). Consider building a single corpus (current item + candidates),
fitting once, and then computing cosine similarities against the current vector
to reduce runtime and API/job time.
##########
.github/duplicate-detector-config.yml:
##########
@@ -0,0 +1,44 @@
+# Duplicate Detection Configuration
+# This file configures the duplicate issue and PR detection behavior
+
+# Similarity threshold (0.0 to 1.0) - Issues with similarity above this will
be flagged
+# Default: 0.75 means 75% similarity
+similarity_threshold: 0.75
+
+# High similarity threshold for exact matches
+# Issues above this threshold will be marked as exact duplicates
+# Default: 0.90 means 90% similarity
+high_similarity_threshold: 0.90
+
+# Maximum number of past issues/PRs to check against
+# Higher numbers = more thorough but slower
+# Default: 200
+max_issues_to_check: 200
+
+# Automatically close issues that are exact matches (high similarity)
+# Set to true to enable auto-closing
+# Default: false (recommended to keep false for review)
+auto_close_exact_match: false
+
+# Label to add for possible duplicates
+label_possible_duplicate: "possible-duplicate"
+
+# Label to add for exact duplicates
+label_exact_duplicate: "duplicate"
+
+# Labels to exclude from duplicate checking
+# Issues with these labels won't be considered as potential duplicates
+exclude_labels:
+ - "wontfix"
+ - "invalid"
+ - "spam"
+
+# Minimum text length (in characters) required for comparison
+# Issues with less text will be skipped
+# Default: 20
+min_text_length: 20
+
+# Additional settings (optional)
+# Number of top similar issues to show in the comment
+# Default: 5
+max_similar_to_show: 5
Review Comment:
`max_similar_to_show` is defined here, but the detection script currently
always shows 5 results in the bot comment. Either wire this setting into the
script or remove it to avoid a misleading config option.
```suggestion
```
##########
.github/scripts/detect-duplicates.py:
##########
@@ -0,0 +1,315 @@
+#!/usr/bin/env python3
+"""
+Duplicate Issue and Pull Request Detection Script
+Detects potential duplicate issues and PRs using text similarity analysis.
+"""
+
+import os
+import sys
+import argparse
+import json
+from typing import List, Dict, Tuple
+from github import Github, GithubException
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+import yaml
+
+# Configuration defaults
+DEFAULT_SIMILARITY_THRESHOLD = 0.75
+DEFAULT_HIGH_SIMILARITY_THRESHOLD = 0.90
+DEFAULT_MAX_ISSUES_TO_CHECK = 200
+DEFAULT_AUTO_CLOSE_EXACT_MATCH = False
+DEFAULT_LABEL_POSSIBLE_DUPLICATE = "possible-duplicate"
+DEFAULT_LABEL_EXACT_DUPLICATE = "duplicate"
+
+
+class DuplicateDetector:
+ """Detects duplicate issues and pull requests."""
+
+ def __init__(self, token: str, repo_name: str, config_path: str = None):
+ try:
+ self.github = Github(token)
+ self.repo = self.github.get_repo(repo_name)
+ self.config = self.load_config(config_path)
+ except Exception as e:
+ print(f"Error initializing GitHub connection: {e}")
+ sys.exit(1)
+
+ def load_config(self, config_path: str = None) -> Dict:
+ """Load configuration from YAML file or use defaults."""
+ default_config = {
+ 'similarity_threshold': DEFAULT_SIMILARITY_THRESHOLD,
+ 'high_similarity_threshold': DEFAULT_HIGH_SIMILARITY_THRESHOLD,
+ 'max_issues_to_check': DEFAULT_MAX_ISSUES_TO_CHECK,
+ 'auto_close_exact_match': DEFAULT_AUTO_CLOSE_EXACT_MATCH,
+ 'label_possible_duplicate': DEFAULT_LABEL_POSSIBLE_DUPLICATE,
+ 'label_exact_duplicate': DEFAULT_LABEL_EXACT_DUPLICATE,
+ 'exclude_labels': ['wontfix', 'invalid'],
+ 'min_text_length': 20,
+ }
+
+ if config_path and os.path.exists(config_path):
+ try:
+ with open(config_path, 'r') as f:
+ user_config = yaml.safe_load(f)
+ default_config.update(user_config)
+ except Exception as e:
+ print(f"Warning: Could not load config file: {e}")
+
+ return default_config
+
+ def preprocess_text(self, text: str) -> str:
+ """Preprocess text for comparison."""
+ if not text:
+ return ""
+ # Convert to lowercase and strip whitespace
+ text = text.lower().strip()
+ # Remove URLs
+ import re
+ text = re.sub(r'http\S+|www.\S+', '', text)
+ # Remove markdown code blocks
+ text = re.sub(r'```[\s\S]*?```', '', text)
+ # Remove special characters but keep spaces
+ text = re.sub(r'[^a-z0-9\s]', ' ', text)
+ # Remove extra whitespace
+ text = ' '.join(text.split())
+ return text
+
+ def calculate_similarity(self, text1: str, text2: str) -> float:
+ """Calculate cosine similarity between two texts."""
+ if not text1 or not text2:
+ return 0.0
+
+ try:
+ vectorizer = TfidfVectorizer(
+ min_df=1,
+ stop_words='english',
+ ngram_range=(1, 2)
+ )
+ tfidf_matrix = vectorizer.fit_transform([text1, text2])
+ similarity = cosine_similarity(tfidf_matrix[0:1],
tfidf_matrix[1:2])[0][0]
+ return float(similarity)
+ except Exception as e:
+ print(f"Error calculating similarity: {e}")
+ return 0.0
+
+ def find_similar_issues(self, current_number: int, current_title: str,
+ current_body: str, item_type: str = 'issue') ->
List[Tuple[int, str, float]]:
+ """Find similar issues or PRs."""
+ current_text = self.preprocess_text(f"{current_title} {current_body}")
+
+ if len(current_text) < self.config['min_text_length']:
+ print(f"Text too short for meaningful comparison:
{len(current_text)} chars")
+ return []
+
+ similar_items = []
+
+ # Get existing items to compare against
+ if item_type == 'issue':
+ items = self.repo.get_issues(state='all')
+ else:
+ items = self.repo.get_pulls(state='all')
+
+ checked_count = 0
+
+ try:
+ for item in items:
+ if checked_count >= self.config['max_issues_to_check']:
+ break
+
+ # Skip the current item
+ if item.number == current_number:
+ continue
+
+ try:
+ # Skip items with excluded labels
+ item_labels = [label.name for label in item.labels]
+ if any(label in self.config['exclude_labels'] for label in
item_labels):
+ continue
+
+ # Calculate similarity
+ item_text = self.preprocess_text(f"{item.title} {item.body
or ''}")
+ similarity = self.calculate_similarity(current_text,
item_text)
+
+ if similarity >= self.config['similarity_threshold']:
+ similar_items.append((item.number, item.title,
similarity))
+ except Exception as e:
+ print(f"Warning: Error processing item #{item.number}:
{e}")
+ continue
+
+ checked_count += 1
+ except Exception as e:
+ print(f"Error fetching items from repository: {e}")
+ print("This might be due to API rate limits or permissions
issues.")
+
+ # Sort by similarity (highest first)
+ similar_items.sort(key=lambda x: x[2], reverse=True)
+ return similar_items
+
+ def add_label(self, item_number: int, label: str, item_type: str =
'issue'):
+ """Add a label to an issue or PR."""
+ try:
+ # Ensure label exists
+ try:
+ self.repo.get_label(label)
+ except GithubException:
+ # Create label if it doesn't exist (label not found)
+ if label == self.config['label_possible_duplicate']:
+ self.repo.create_label(label, "FFA500", "Potential
duplicate issue")
+ elif label == self.config['label_exact_duplicate']:
+ self.repo.create_label(label, "FF0000", "Exact duplicate
issue")
+
+ if item_type == 'issue':
+ item = self.repo.get_issue(item_number)
+ else:
+ item = self.repo.get_pull(item_number)
+
+ item.add_to_labels(label)
+ print(f"Added label '{label}' to {item_type} #{item_number}")
+ except Exception as e:
+ print(f"Error adding label: {e}")
+
+ def add_comment(self, item_number: int, similar_items: List[Tuple[int,
str, float]],
+ item_type: str = 'issue'):
+ """Add a comment about potential duplicates."""
+ if not similar_items:
+ return
+
+ item_type_name = "issue" if item_type == 'issue' else "pull request"
+
+ # Build comment message
+        comment = f"🔍 **Potential Duplicate Detected**\n\n"
+ comment += f"This {item_type_name} appears to be similar to existing
{item_type_name}s:\n\n"
+
+ for number, title, similarity in similar_items[:5]: # Show top 5
+ similarity_pct = int(similarity * 100)
+ comment += f"- #{number}: {title} (Similarity:
{similarity_pct}%)\n"
+
+ comment += f"\n---\n"
+ comment += f"Please review these {item_type_name}s to see if any of
them address your concern. "
+ comment += f"If this is indeed a duplicate, please close this
{item_type_name} and continue the discussion in the existing one.\n\n"
+ comment += f"If this is **not** a duplicate, please add more context
to help differentiate it.\n\n"
+ comment += f"*This is an automated message. If you believe this is
incorrect, please remove the label and mention a maintainer.*"
+
+ try:
+ if item_type == 'issue':
+ item = self.repo.get_issue(item_number)
+ else:
+ item = self.repo.get_pull(item_number)
+
+ item.create_comment(comment)
+ print(f"Added duplicate detection comment to {item_type}
#{item_number}")
+ except Exception as e:
+ print(f"Error adding comment: {e}")
+
+ def close_item(self, item_number: int, duplicate_of: int, item_type: str =
'issue'):
+ """Close an item as a duplicate."""
+ try:
+ if item_type == 'issue':
+ item = self.repo.get_issue(item_number)
+ else:
+ item = self.repo.get_pull(item_number)
+
+            comment = f"🔒 **Closing as Exact Duplicate**\n\n"
+ comment += f"This {item_type} is an exact duplicate of
#{duplicate_of}.\n\n"
+ comment += f"Please continue the discussion in #{duplicate_of}."
+
+ item.create_comment(comment)
+ item.edit(state='closed')
+ print(f"Closed {item_type} #{item_number} as duplicate of
#{duplicate_of}")
+ except Exception as e:
+ print(f"Error closing item: {e}")
+
+ def process_item(self, item_number: int, title: str, body: str, item_type:
str = 'issue'):
+ """Process an issue or PR for duplicate detection."""
+ print(f"\n{'='*60}")
+ print(f"Processing {item_type} #{item_number}: {title}")
+ print(f"{'='*60}\n")
+
+ # Find similar items
+ similar_items = self.find_similar_issues(item_number, title, body,
item_type)
+
+ if not similar_items:
+            print(f"✅ No duplicates found for {item_type} #{item_number}")
+ return
+
+        print(f"\n🔍 Found {len(similar_items)} similar {item_type}(s):")
+ for num, ttl, sim in similar_items:
+ print(f" - #{num}: {ttl[:60]}... (Similarity: {sim:.2%})")
+
+ # Get the highest similarity
+ highest_similarity = similar_items[0][2]
+ highest_similar_number = similar_items[0][0]
+
+ # Determine action based on similarity
+ if highest_similarity >= self.config['high_similarity_threshold']:
+            print(f"\n⚠️ High similarity detected ({highest_similarity:.2%})")
+ self.add_label(item_number, self.config['label_exact_duplicate'],
item_type)
+ self.add_comment(item_number, similar_items, item_type)
+
+ if self.config['auto_close_exact_match']:
Review Comment:
`auto_close_exact_match` is described/configured as auto-closing **issues**,
but `process_item()` will also close pull requests when running in PR mode. If
auto-closing PRs isn't intended, restrict auto-close to `item_type == 'issue'`
(or split into separate config flags).
```suggestion
# Only auto-close issues; PRs should not be auto-closed by this
flag
if item_type == 'issue' and
self.config['auto_close_exact_match']:
```
##########
.github/workflows/duplicate-detector.yml:
##########
@@ -0,0 +1,54 @@
+name: Duplicate Issue and PR Detection
+
+on:
+ issues:
+ types: [opened, reopened]
+ pull_request_target:
+ types: [opened, reopened]
+
+permissions:
+ issues: write
+ pull-requests: write
+ contents: read
+
+jobs:
+ detect-duplicates:
+ runs-on: ubuntu-latest
+ name: Check for Duplicate Issues and PRs
+
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
Review Comment:
This workflow checks out the full git history (`fetch-depth: 0`), but the
duplicate detection script only uses the GitHub API and repo files
(config/requirements). Using the default shallow checkout would reduce runtime
and bandwidth unless full history is needed for another step.
```suggestion
```
##########
.github/scripts/detect-duplicates.py:
##########
@@ -0,0 +1,315 @@
+#!/usr/bin/env python3
+"""
+Duplicate Issue and Pull Request Detection Script
+Detects potential duplicate issues and PRs using text similarity analysis.
+"""
+
+import os
+import sys
+import argparse
+import json
+from typing import List, Dict, Tuple
+from github import Github, GithubException
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+import yaml
+
+# Configuration defaults
+DEFAULT_SIMILARITY_THRESHOLD = 0.75
+DEFAULT_HIGH_SIMILARITY_THRESHOLD = 0.90
+DEFAULT_MAX_ISSUES_TO_CHECK = 200
+DEFAULT_AUTO_CLOSE_EXACT_MATCH = False
+DEFAULT_LABEL_POSSIBLE_DUPLICATE = "possible-duplicate"
+DEFAULT_LABEL_EXACT_DUPLICATE = "duplicate"
+
+
+class DuplicateDetector:
+ """Detects duplicate issues and pull requests."""
+
+ def __init__(self, token: str, repo_name: str, config_path: str = None):
+ try:
+ self.github = Github(token)
+ self.repo = self.github.get_repo(repo_name)
+ self.config = self.load_config(config_path)
+ except Exception as e:
+ print(f"Error initializing GitHub connection: {e}")
+ sys.exit(1)
+
+ def load_config(self, config_path: str = None) -> Dict:
+ """Load configuration from YAML file or use defaults."""
+ default_config = {
+ 'similarity_threshold': DEFAULT_SIMILARITY_THRESHOLD,
+ 'high_similarity_threshold': DEFAULT_HIGH_SIMILARITY_THRESHOLD,
+ 'max_issues_to_check': DEFAULT_MAX_ISSUES_TO_CHECK,
+ 'auto_close_exact_match': DEFAULT_AUTO_CLOSE_EXACT_MATCH,
+ 'label_possible_duplicate': DEFAULT_LABEL_POSSIBLE_DUPLICATE,
+ 'label_exact_duplicate': DEFAULT_LABEL_EXACT_DUPLICATE,
+ 'exclude_labels': ['wontfix', 'invalid'],
+ 'min_text_length': 20,
+ }
+
+ if config_path and os.path.exists(config_path):
+ try:
+ with open(config_path, 'r') as f:
+ user_config = yaml.safe_load(f)
+ default_config.update(user_config)
+ except Exception as e:
+ print(f"Warning: Could not load config file: {e}")
+
+ return default_config
+
+ def preprocess_text(self, text: str) -> str:
+ """Preprocess text for comparison."""
+ if not text:
+ return ""
+ # Convert to lowercase and strip whitespace
+ text = text.lower().strip()
+ # Remove URLs
+ import re
+ text = re.sub(r'http\S+|www.\S+', '', text)
+ # Remove markdown code blocks
+ text = re.sub(r'```[\s\S]*?```', '', text)
+ # Remove special characters but keep spaces
+ text = re.sub(r'[^a-z0-9\s]', ' ', text)
+ # Remove extra whitespace
+ text = ' '.join(text.split())
+ return text
+
+ def calculate_similarity(self, text1: str, text2: str) -> float:
+ """Calculate cosine similarity between two texts."""
+ if not text1 or not text2:
+ return 0.0
+
+ try:
+ vectorizer = TfidfVectorizer(
+ min_df=1,
+ stop_words='english',
+ ngram_range=(1, 2)
+ )
+ tfidf_matrix = vectorizer.fit_transform([text1, text2])
+ similarity = cosine_similarity(tfidf_matrix[0:1],
tfidf_matrix[1:2])[0][0]
+ return float(similarity)
+ except Exception as e:
+ print(f"Error calculating similarity: {e}")
+ return 0.0
+
+ def find_similar_issues(self, current_number: int, current_title: str,
+ current_body: str, item_type: str = 'issue') ->
List[Tuple[int, str, float]]:
+ """Find similar issues or PRs."""
+ current_text = self.preprocess_text(f"{current_title} {current_body}")
+
+ if len(current_text) < self.config['min_text_length']:
+ print(f"Text too short for meaningful comparison:
{len(current_text)} chars")
+ return []
+
+ similar_items = []
+
+ # Get existing items to compare against
+ if item_type == 'issue':
+ items = self.repo.get_issues(state='all')
+ else:
+ items = self.repo.get_pulls(state='all')
Review Comment:
For `item_type == 'issue'`, `repo.get_issues(state='all')` will also return
pull requests (PRs are issues in the API). This can cause issue-duplicate
detection to suggest/label PRs as duplicates of issues (and vice versa). Filter
out items where `item.pull_request` is set (or use separate queries) so issue
checks only compare against real issues.
##########
i18n/en-US/docusaurus-theme-classic/navbar.json:
##########
@@ -0,0 +1,78 @@
+{
+ "logo.alt": {
    "message": "Apache Fory™ Logo",
+ "description": "The alt text of navbar logo"
+ },
+ "item.label.Start": {
+ "message": "Start",
+ "description": "Navbar item with label Start"
Review Comment:
This PR introduces several unrelated website/i18n content additions (e.g.,
new navbar translation keys) that aren't mentioned in the PR description about
duplicate detection. Consider splitting these into a separate PR (or updating
the PR description) to keep review scope focused.
##########
.github/scripts/detect-duplicates.py:
##########
@@ -0,0 +1,315 @@
+#!/usr/bin/env python3
+"""
+Duplicate Issue and Pull Request Detection Script
+Detects potential duplicate issues and PRs using text similarity analysis.
+"""
+
+import os
+import sys
+import argparse
+import json
+from typing import List, Dict, Tuple
+from github import Github, GithubException
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+import yaml
+
+# Configuration defaults
+DEFAULT_SIMILARITY_THRESHOLD = 0.75
+DEFAULT_HIGH_SIMILARITY_THRESHOLD = 0.90
+DEFAULT_MAX_ISSUES_TO_CHECK = 200
+DEFAULT_AUTO_CLOSE_EXACT_MATCH = False
+DEFAULT_LABEL_POSSIBLE_DUPLICATE = "possible-duplicate"
+DEFAULT_LABEL_EXACT_DUPLICATE = "duplicate"
+
+
+class DuplicateDetector:
+ """Detects duplicate issues and pull requests."""
+
+ def __init__(self, token: str, repo_name: str, config_path: str = None):
+ try:
+ self.github = Github(token)
+ self.repo = self.github.get_repo(repo_name)
+ self.config = self.load_config(config_path)
+ except Exception as e:
+ print(f"Error initializing GitHub connection: {e}")
+ sys.exit(1)
+
+ def load_config(self, config_path: str = None) -> Dict:
+ """Load configuration from YAML file or use defaults."""
+ default_config = {
+ 'similarity_threshold': DEFAULT_SIMILARITY_THRESHOLD,
+ 'high_similarity_threshold': DEFAULT_HIGH_SIMILARITY_THRESHOLD,
+ 'max_issues_to_check': DEFAULT_MAX_ISSUES_TO_CHECK,
+ 'auto_close_exact_match': DEFAULT_AUTO_CLOSE_EXACT_MATCH,
+ 'label_possible_duplicate': DEFAULT_LABEL_POSSIBLE_DUPLICATE,
+ 'label_exact_duplicate': DEFAULT_LABEL_EXACT_DUPLICATE,
+ 'exclude_labels': ['wontfix', 'invalid'],
+ 'min_text_length': 20,
+ }
+
+ if config_path and os.path.exists(config_path):
+ try:
+ with open(config_path, 'r') as f:
+ user_config = yaml.safe_load(f)
+ default_config.update(user_config)
+ except Exception as e:
+ print(f"Warning: Could not load config file: {e}")
+
+ return default_config
+
+ def preprocess_text(self, text: str) -> str:
+ """Preprocess text for comparison."""
+ if not text:
+ return ""
+ # Convert to lowercase and strip whitespace
+ text = text.lower().strip()
+ # Remove URLs
+ import re
+ text = re.sub(r'http\S+|www.\S+', '', text)
+ # Remove markdown code blocks
+ text = re.sub(r'```[\s\S]*?```', '', text)
+ # Remove special characters but keep spaces
+ text = re.sub(r'[^a-z0-9\s]', ' ', text)
+ # Remove extra whitespace
+ text = ' '.join(text.split())
+ return text
+
+ def calculate_similarity(self, text1: str, text2: str) -> float:
+ """Calculate cosine similarity between two texts."""
+ if not text1 or not text2:
+ return 0.0
+
+ try:
+ vectorizer = TfidfVectorizer(
+ min_df=1,
+ stop_words='english',
+ ngram_range=(1, 2)
+ )
+ tfidf_matrix = vectorizer.fit_transform([text1, text2])
+ similarity = cosine_similarity(tfidf_matrix[0:1],
tfidf_matrix[1:2])[0][0]
+ return float(similarity)
+ except Exception as e:
+ print(f"Error calculating similarity: {e}")
+ return 0.0
+
+ def find_similar_issues(self, current_number: int, current_title: str,
+ current_body: str, item_type: str = 'issue') ->
List[Tuple[int, str, float]]:
+ """Find similar issues or PRs."""
+ current_text = self.preprocess_text(f"{current_title} {current_body}")
+
+ if len(current_text) < self.config['min_text_length']:
+ print(f"Text too short for meaningful comparison:
{len(current_text)} chars")
+ return []
+
+ similar_items = []
+
+ # Get existing items to compare against
+ if item_type == 'issue':
+ items = self.repo.get_issues(state='all')
+ else:
+ items = self.repo.get_pulls(state='all')
+
+ checked_count = 0
+
+ try:
+ for item in items:
+ if checked_count >= self.config['max_issues_to_check']:
+ break
+
+ # Skip the current item
+ if item.number == current_number:
+ continue
+
+ try:
+ # Skip items with excluded labels
+ item_labels = [label.name for label in item.labels]
+ if any(label in self.config['exclude_labels'] for label in
item_labels):
+ continue
+
+ # Calculate similarity
+ item_text = self.preprocess_text(f"{item.title} {item.body
or ''}")
+ similarity = self.calculate_similarity(current_text,
item_text)
+
+ if similarity >= self.config['similarity_threshold']:
+ similar_items.append((item.number, item.title,
similarity))
Review Comment:
The PR description states similarity is computed using title/body **and
labels**, but the implementation only compares title+body text; labels are only
used for exclusion. If labels are intended to influence similarity, incorporate
label names into the compared text (or compute a separate label similarity
component). Otherwise, update the documentation/PR description to match
behavior.
##########
.github/scripts/detect-duplicates.py:
##########
@@ -0,0 +1,315 @@
+#!/usr/bin/env python3
+"""
+Duplicate Issue and Pull Request Detection Script
+Detects potential duplicate issues and PRs using text similarity analysis.
+"""
+
+import os
+import sys
+import argparse
+import json
+from typing import List, Dict, Tuple
+from github import Github, GithubException
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+import yaml
+
+# Configuration defaults
+DEFAULT_SIMILARITY_THRESHOLD = 0.75
+DEFAULT_HIGH_SIMILARITY_THRESHOLD = 0.90
+DEFAULT_MAX_ISSUES_TO_CHECK = 200
+DEFAULT_AUTO_CLOSE_EXACT_MATCH = False
+DEFAULT_LABEL_POSSIBLE_DUPLICATE = "possible-duplicate"
+DEFAULT_LABEL_EXACT_DUPLICATE = "duplicate"
+
+
+class DuplicateDetector:
+ """Detects duplicate issues and pull requests."""
+
+ def __init__(self, token: str, repo_name: str, config_path: str = None):
+ try:
+ self.github = Github(token)
+ self.repo = self.github.get_repo(repo_name)
+ self.config = self.load_config(config_path)
+ except Exception as e:
+ print(f"Error initializing GitHub connection: {e}")
+ sys.exit(1)
+
+ def load_config(self, config_path: str = None) -> Dict:
+ """Load configuration from YAML file or use defaults."""
+ default_config = {
+ 'similarity_threshold': DEFAULT_SIMILARITY_THRESHOLD,
+ 'high_similarity_threshold': DEFAULT_HIGH_SIMILARITY_THRESHOLD,
+ 'max_issues_to_check': DEFAULT_MAX_ISSUES_TO_CHECK,
+ 'auto_close_exact_match': DEFAULT_AUTO_CLOSE_EXACT_MATCH,
+ 'label_possible_duplicate': DEFAULT_LABEL_POSSIBLE_DUPLICATE,
+ 'label_exact_duplicate': DEFAULT_LABEL_EXACT_DUPLICATE,
+ 'exclude_labels': ['wontfix', 'invalid'],
+ 'min_text_length': 20,
+ }
+
+ if config_path and os.path.exists(config_path):
+ try:
+ with open(config_path, 'r') as f:
+ user_config = yaml.safe_load(f)
+ default_config.update(user_config)
+ except Exception as e:
+ print(f"Warning: Could not load config file: {e}")
+
+ return default_config
+
+ def preprocess_text(self, text: str) -> str:
+ """Preprocess text for comparison."""
+ if not text:
+ return ""
+ # Convert to lowercase and strip whitespace
+ text = text.lower().strip()
+ # Remove URLs
+ import re
+ text = re.sub(r'http\S+|www.\S+', '', text)
+ # Remove markdown code blocks
+ text = re.sub(r'```[\s\S]*?```', '', text)
+ # Remove special characters but keep spaces
+ text = re.sub(r'[^a-z0-9\s]', ' ', text)
+ # Remove extra whitespace
+ text = ' '.join(text.split())
+ return text
+
+ def calculate_similarity(self, text1: str, text2: str) -> float:
+ """Calculate cosine similarity between two texts."""
+ if not text1 or not text2:
+ return 0.0
+
+ try:
+ vectorizer = TfidfVectorizer(
+ min_df=1,
+ stop_words='english',
+ ngram_range=(1, 2)
+ )
+ tfidf_matrix = vectorizer.fit_transform([text1, text2])
+ similarity = cosine_similarity(tfidf_matrix[0:1],
tfidf_matrix[1:2])[0][0]
+ return float(similarity)
+ except Exception as e:
+ print(f"Error calculating similarity: {e}")
+ return 0.0
+
+ def find_similar_issues(self, current_number: int, current_title: str,
+ current_body: str, item_type: str = 'issue') ->
List[Tuple[int, str, float]]:
+ """Find similar issues or PRs."""
+ current_text = self.preprocess_text(f"{current_title} {current_body}")
+
+ if len(current_text) < self.config['min_text_length']:
+ print(f"Text too short for meaningful comparison:
{len(current_text)} chars")
+ return []
+
+ similar_items = []
+
+ # Get existing items to compare against
+ if item_type == 'issue':
+ items = self.repo.get_issues(state='all')
+ else:
+ items = self.repo.get_pulls(state='all')
+
+ checked_count = 0
+
+ try:
+ for item in items:
+ if checked_count >= self.config['max_issues_to_check']:
+ break
+
+ # Skip the current item
+ if item.number == current_number:
+ continue
+
+ try:
+ # Skip items with excluded labels
+ item_labels = [label.name for label in item.labels]
+ if any(label in self.config['exclude_labels'] for label in
item_labels):
+ continue
+
+ # Calculate similarity
+ item_text = self.preprocess_text(f"{item.title} {item.body
or ''}")
+ similarity = self.calculate_similarity(current_text,
item_text)
+
+ if similarity >= self.config['similarity_threshold']:
+ similar_items.append((item.number, item.title,
similarity))
+ except Exception as e:
+ print(f"Warning: Error processing item #{item.number}:
{e}")
+ continue
+
+ checked_count += 1
+ except Exception as e:
+ print(f"Error fetching items from repository: {e}")
+ print("This might be due to API rate limits or permissions
issues.")
+
+ # Sort by similarity (highest first)
+ similar_items.sort(key=lambda x: x[2], reverse=True)
+ return similar_items
+
+ def add_label(self, item_number: int, label: str, item_type: str =
'issue'):
+ """Add a label to an issue or PR."""
+ try:
+ # Ensure label exists
+ try:
+ self.repo.get_label(label)
+ except GithubException:
+ # Create label if it doesn't exist (label not found)
+ if label == self.config['label_possible_duplicate']:
+ self.repo.create_label(label, "FFA500", "Potential
duplicate issue")
+ elif label == self.config['label_exact_duplicate']:
+ self.repo.create_label(label, "FF0000", "Exact duplicate
issue")
+
+ if item_type == 'issue':
+ item = self.repo.get_issue(item_number)
+ else:
+ item = self.repo.get_pull(item_number)
+
+ item.add_to_labels(label)
+ print(f"Added label '{label}' to {item_type} #{item_number}")
+ except Exception as e:
+ print(f"Error adding label: {e}")
+
+ def add_comment(self, item_number: int, similar_items: List[Tuple[int,
str, float]],
+ item_type: str = 'issue'):
+ """Add a comment about potential duplicates."""
+ if not similar_items:
+ return
+
+ item_type_name = "issue" if item_type == 'issue' else "pull request"
+
+ # Build comment message
+ comment = f"π **Potential Duplicate Detected**\n\n"
+ comment += f"This {item_type_name} appears to be similar to existing
{item_type_name}s:\n\n"
+
+ for number, title, similarity in similar_items[:5]: # Show top 5
+ similarity_pct = int(similarity * 100)
+ comment += f"- #{number}: {title} (Similarity:
{similarity_pct}%)\n"
Review Comment:
The config file defines `max_similar_to_show`, but the script hard-codes
showing the top 5 results. Either read the limit from config (with a default)
or drop the config option to avoid dead configuration.
##########
.github/scripts/detect-duplicates.py:
##########
@@ -0,0 +1,315 @@
+#!/usr/bin/env python3
+"""
+Duplicate Issue and Pull Request Detection Script
+Detects potential duplicate issues and PRs using text similarity analysis.
+"""
+
+import os
+import sys
+import argparse
+import json
+from typing import List, Dict, Tuple
+from github import Github, GithubException
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+import yaml
+
+# Configuration defaults
+DEFAULT_SIMILARITY_THRESHOLD = 0.75
+DEFAULT_HIGH_SIMILARITY_THRESHOLD = 0.90
+DEFAULT_MAX_ISSUES_TO_CHECK = 200
+DEFAULT_AUTO_CLOSE_EXACT_MATCH = False
+DEFAULT_LABEL_POSSIBLE_DUPLICATE = "possible-duplicate"
+DEFAULT_LABEL_EXACT_DUPLICATE = "duplicate"
+
+
+class DuplicateDetector:
+ """Detects duplicate issues and pull requests."""
+
+ def __init__(self, token: str, repo_name: str, config_path: str = None):
+ try:
+ self.github = Github(token)
+ self.repo = self.github.get_repo(repo_name)
+ self.config = self.load_config(config_path)
+ except Exception as e:
+ print(f"Error initializing GitHub connection: {e}")
+ sys.exit(1)
+
+ def load_config(self, config_path: str = None) -> Dict:
+ """Load configuration from YAML file or use defaults."""
+ default_config = {
+ 'similarity_threshold': DEFAULT_SIMILARITY_THRESHOLD,
+ 'high_similarity_threshold': DEFAULT_HIGH_SIMILARITY_THRESHOLD,
+ 'max_issues_to_check': DEFAULT_MAX_ISSUES_TO_CHECK,
+ 'auto_close_exact_match': DEFAULT_AUTO_CLOSE_EXACT_MATCH,
+ 'label_possible_duplicate': DEFAULT_LABEL_POSSIBLE_DUPLICATE,
+ 'label_exact_duplicate': DEFAULT_LABEL_EXACT_DUPLICATE,
+ 'exclude_labels': ['wontfix', 'invalid'],
+ 'min_text_length': 20,
+ }
+
+ if config_path and os.path.exists(config_path):
+ try:
+ with open(config_path, 'r') as f:
+ user_config = yaml.safe_load(f)
Review Comment:
`yaml.safe_load()` can return `None` (e.g., empty config file). Calling
`default_config.update(user_config)` would then raise a `TypeError` and break
the workflow. Guard against this by treating a `None` result as `{}` and
validating that the loaded config is a mapping before updating.
```suggestion
user_config = yaml.safe_load(f)
if user_config is None:
user_config = {}
elif not isinstance(user_config, dict):
raise ValueError("Config file must contain a mapping
at the top level")
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]