[Pywikipedia-bugs] [Maniphest] [Claimed] T390361: [Code Contribution] Develop the logic for detecting and sending URLs to Wikiwix

Elie243 Sat, 29 Mar 2025 09:25:41 -0700

Elie243 claimed this task.
Elie243 added a comment.


  import logging
  import time
  from datetime import datetime, timezone
  from typing import Optional, Dict, List, Tuple

  import pywikibot
  import requests
  from sqlalchemy import (
      create_engine, Column, String, DateTime, Boolean, Integer,
  )
  from sqlalchemy.ext.declarative import declarative_base
  from sqlalchemy.orm import sessionmaker

  from config import (
      WIKIWIX_API_URL, BATCH_SIZE, SLEEP_TIME, MAX_RETRIES, RETRY_DELAY,
  )
  from utils import (
      is_valid_url, setup_logger, extract_urls_from_text, sanitize_url,
  )
  
  # Declarative base class for the ORM models in this script (ArchivedURL).
  # ArchivingBot.__init__ uses Base.metadata to create the tables.
  Base = declarative_base()
  
  class ArchivedURL(Base):
      """ORM row recording the outcome of archiving one URL.

      One row per original URL (it is the primary key), so a URL can be
      stored only once regardless of how many items reference it.
      """

      __tablename__ = 'archived_urls'

      # The URL as found on the Wikidata item; primary key of the table.
      original_url = Column(String, primary_key=True)
      # The archive location returned for original_url; required.
      archived_url = Column(String, nullable=False)
      # Id of the item the URL was found on (callers pass item.id); required.
      wikidata_id = Column(String, nullable=False)
      # Naive UTC timestamp set when the row is inserted.  Produces the same
      # value as datetime.utcnow(), which is deprecated since Python 3.12.
      archived_at = Column(
          DateTime,
          default=lambda: datetime.now(timezone.utc).replace(tzinfo=None),
      )
      # Whether the archiving attempt succeeded.
      is_successful = Column(Boolean, default=True)
      # Retry counter; starts at 0 and is not updated by the code in this file.
      retry_count = Column(Integer, default=0)
  
  class ArchivingBot:
      """Archive URLs found on recently edited Wikidata items via Wikiwix.

      The bot walks recent changes, extracts URLs from the string values of
      each item's claims, qualifiers and references, submits them to the
      endpoint configured as WIKIWIX_API_URL, adds a P1065 claim carrying
      the archive URL and records the result in a local SQLite database.
      """

      def __init__(self):
          """Connect to Wikidata, set up logging and open archives.db."""
          self.site = pywikibot.Site("wikidata", "wikidata")
          self.repo = self.site.data_repository()
          self.logger = setup_logger()
          self.db_engine = create_engine('sqlite:///archives.db')
          # Make sure the archived_urls table exists before any session
          # is opened.
          Base.metadata.create_all(self.db_engine)
          self.Session = sessionmaker(bind=self.db_engine)

      def archive_url(self, url: str) -> Tuple[Optional[str], bool]:
          """
          Send URL to Wikiwix for archiving, retrying on failures.

          Up to MAX_RETRIES attempts are made, RETRY_DELAY seconds apart.
          A 429 response additionally waits for the period requested by
          the server before the next attempt.

          Args:
              url (str): The URL to archive

          Returns:
              Tuple[Optional[str], bool]: (archived_url, success_status).
              (None, False) is returned when sanitize_url rejects the URL,
              when the archive cannot be verified, or when every attempt
              fails.
          """
          sanitized_url = sanitize_url(url)
          if not sanitized_url:
              self.logger.warning(f"Invalid URL format: {url}")
              return None, False

          for attempt in range(1, MAX_RETRIES + 1):
              try:
                  response = requests.get(
                      WIKIWIX_API_URL,
                      params={"url": sanitized_url},
                      timeout=30,
                      headers={'User-Agent': 'ArchivingBot/1.0'},
                  )
              except requests.exceptions.Timeout:
                  self.logger.warning(
                      f"Timeout while archiving {url}. "
                      f"Attempt {attempt}/{MAX_RETRIES}")
              except requests.exceptions.RequestException as e:
                  self.logger.error(f"Error archiving URL {url}: {str(e)}")
              else:
                  if response.status_code == 200:
                      # The final request URL is used as the archive URL.
                      archived_url = response.url
                      if self._verify_archive(archived_url):
                          return archived_url, True
                      self.logger.error(
                          f"Archive verification failed for {url}")
                      return None, False

                  if response.status_code == 429:  # Rate limit
                      wait_time = self._retry_after_seconds(response)
                      self.logger.warning(
                          f"Rate limited. Waiting {wait_time} seconds")
                      time.sleep(wait_time)
                  else:
                      self.logger.error(
                          f"Failed to archive URL {url}. "
                          f"Status code: {response.status_code}")

              # No point in sleeping after the last attempt.
              if attempt < MAX_RETRIES:
                  time.sleep(RETRY_DELAY)

          return None, False

      @staticmethod
      def _retry_after_seconds(response) -> float:
          """Return the wait requested by a response's Retry-After header.

          Falls back to RETRY_DELAY when the header is missing or is not a
          plain number of seconds (Retry-After may also be an HTTP-date,
          which int() cannot parse).  Negative values are clamped to 0.
          """
          raw = response.headers.get('Retry-After', RETRY_DELAY)
          try:
              return max(0, int(raw))
          except (TypeError, ValueError):
              return RETRY_DELAY

      def _verify_archive(self, archived_url: str) -> bool:
          """
          Verify that the archived URL is accessible.

          Issues a HEAD request and returns True only for a 200 response;
          request failures are logged and reported as False.
          """
          try:
              response = requests.head(archived_url, timeout=10)
          except requests.exceptions.RequestException as e:
              self.logger.warning(
                  f"Could not verify archive {archived_url}: {str(e)}")
              return False
          return response.status_code == 200

      def save_archived_url(self, original_url: str, archived_url: str,
                            wikidata_id: str, is_successful: bool = True):
          """Save archived URL to database with status tracking.

          Database errors are logged and rolled back, never raised.
          """
          session = self.Session()
          try:
              # merge() rather than add(): original_url is the primary key,
              # so add() fails with an IntegrityError as soon as the same
              # URL is saved a second time.
              session.merge(ArchivedURL(
                  original_url=original_url,
                  archived_url=archived_url,
                  wikidata_id=wikidata_id,
                  is_successful=is_successful,
              ))
              session.commit()
          except Exception as e:
              self.logger.error(f"Error saving to database: {str(e)}")
              session.rollback()
          else:
              if is_successful:
                  self.logger.info(
                      f"Successfully archived {original_url} -> "
                      f"{archived_url}")
              else:
                  self.logger.warning(f"Failed to archive {original_url}")
          finally:
              session.close()

      def process_recent_changes(self, limit: int = BATCH_SIZE):
          """Archive the URLs found on recently edited main-namespace items.

          Args:
              limit (int): Maximum number of recent changes to inspect.

          A failure on one item is logged and does not stop the batch.
          """
          self.logger.info("Starting to process recent changes...")

          # pywikibot names the size limit `total`; namespace and change
          # type are filtered by the API instead of after the fact.
          changes = self.site.recentchanges(
              total=limit, namespaces=[0], changetype='edit')

          for change in changes:
              title = change.get('title')
              try:
                  # Defensive re-check; the API reports the namespace
                  # under the key 'ns'.
                  namespace = change.get('ns', change.get('namespace', 0))
                  if change.get('type') != 'edit' or namespace != 0:
                      continue

                  item = pywikibot.ItemPage(self.repo, title)
                  item.get()

                  for url in self._collect_urls(item):
                      self._process_url(url, item)

              except Exception as e:
                  self.logger.error(
                      f"Error processing item {title}: {str(e)}")
                  continue

      def _collect_urls(self, item: pywikibot.ItemPage) -> List[str]:
          """Return the distinct URLs found on an item, in first-seen order.

          URLs are gathered before any of them is processed because
          _add_archive_claim adds claims to the item, which would
          otherwise change item.claims while it is being iterated.
          """
          found = {}
          for claims in item.claims.values():
              for claim in claims:
                  for snak in self._iter_snaks(claim):
                      text = self._claim_text(snak)
                      if text:
                          found.update(
                              dict.fromkeys(extract_urls_from_text(text)))
          return list(found)

      @staticmethod
      def _iter_snaks(claim):
          """Yield a claim, then its qualifiers, then its reference claims."""
          yield claim
          for qualifiers in claim.qualifiers.values():
              yield from qualifiers
          for ref in claim.sources:
              for source_claims in ref.values():
                  yield from source_claims

      @staticmethod
      def _claim_text(claim) -> Optional[str]:
          """Return a claim's target when it is a plain string, else None.

          pywikibot claims expose their value through getTarget(); only
          string targets (e.g. URL or string datatypes) can hold a URL.
          """
          target = claim.getTarget()
          return target if isinstance(target, str) else None

      def _process_url(self, url: str, item: pywikibot.ItemPage):
          """Process a single URL for archiving."""
          if self._has_archive(item, url):
              return

          archived_url, success = self.archive_url(url)
          if not archived_url:
              return

          self._add_archive_claim(item, url, archived_url)
          self.save_archived_url(url, archived_url, item.id, success)
          # Brief pause between successful archive operations.
          time.sleep(1)

      def _has_archive(self, item: pywikibot.ItemPage, url: str) -> bool:
          """Check if URL already has an archive on the item.

          A URL counts as archived when one of the item's P1065 claims
          holds either the URL itself or the archive URL that this bot
          recorded for it in the local database.
          """
          archive_targets = {
              self._claim_text(claim)
              for claim in item.claims.get('P1065', [])
          }
          archive_targets.discard(None)
          if not archive_targets:
              return False
          if url in archive_targets:
              return True

          recorded = self._recorded_archive(url)
          return recorded is not None and recorded in archive_targets

      def _recorded_archive(self, url: str) -> Optional[str]:
          """Return the archive URL stored locally for url, or None."""
          session = self.Session()
          try:
              row = (session.query(ArchivedURL)
                     .filter_by(original_url=url, is_successful=True)
                     .first())
              return row.archived_url if row is not None else None
          except Exception as e:
              self.logger.error(f"Error reading from database: {str(e)}")
              return None
          finally:
              session.close()

      def _add_archive_claim(self, item: pywikibot.ItemPage,
                             original_url: str, archived_url: str):
          """Add a P1065 (archive URL) claim with references to the item.

          Errors are logged and swallowed so one failing edit does not
          abort the rest of the batch.
          """
          try:
              claim = pywikibot.Claim(self.repo, 'P1065')
              claim.setTarget(archived_url)

              # Reference: stated in (P248).
              # NOTE(review): confirm Q5058 is the intended source item.
              ref = pywikibot.Claim(self.repo, 'P248')
              ref.setTarget(pywikibot.ItemPage(self.repo, 'Q5058'))

              # WbTime has no today(); build today's UTC date explicitly.
              today = time.gmtime()
              timestamp = pywikibot.Claim(self.repo, 'P813')  # retrieved
              timestamp.setTarget(pywikibot.WbTime(
                  year=today.tm_year,
                  month=today.tm_mon,
                  day=today.tm_mday,
              ))

              claim.addSources([ref, timestamp])
              item.addClaim(claim)

              self.logger.info(
                  f"Added archive for {original_url} -> {archived_url}")

          except Exception as e:
              self.logger.error(f"Error adding archive claim: {str(e)}")
  
  if __name__ == "__main__":
      # Poll recent changes forever, pausing SLEEP_TIME seconds between
      # passes.  Ctrl-C during a pass or during that pause ends the loop;
      # any other error is logged and followed by a 60-second back-off.
      bot = ArchivingBot()
      interrupted = False
      while not interrupted:
          try:
              bot.process_recent_changes()
              time.sleep(SLEEP_TIME)
          except KeyboardInterrupt:
              interrupted = True
          except Exception as e:
              bot.logger.error(f"Main loop error: {str(e)}")
              time.sleep(60)

TASK DETAIL
  https://phabricator.wikimedia.org/T390361

EMAIL PREFERENCES
  https://phabricator.wikimedia.org/settings/panel/emailpreferences/

To: Elie243
Cc: MarcosAdo, KINDEHOUNME, Thalie30, Elie243, Wisdom-Hassan, Aklapper, 
pywikibot-bugs-list, Paul26, mevo, Danny_Benjafield_WMDE, S8321414, 
Astuthiodit_1, karapayneWMDE, Invadibot, PotsdamLamb, Jyoo1011, maantietaja, 
JohnsonLee01, SHEKH, Dijkstra, Khutuck, Nintendofan885, Akuckartz, Zkhalido, 
Aram, Viztor, Nandana, Wenyi, Lahi, Gq86, GoranSMilovanovic, QZanden, Tbscho, 
KimKelting, MayS, LawExplorer, Mdupont, JJMC89, Dvorapa, _jensen, rosalieper, 
Altostratus, Avicennasis, Hannolans, Scott_WUaS, mys_721tx, Wikidata-bugs, Xqt, 
Hydriz, aude, jayvdb, Nemo_bis, Lydia_Pintscher, Masti, Alchimista, Mbch331

_______________________________________________
pywikibot-bugs mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[Pywikipedia-bugs] [Maniphest] [Claimed] T390361: [Code Contribution] Develop the logic for detecting and sending URLs to Wikiwix

Reply via email to