Elie243 claimed this task.
Elie243 added a comment.
import pywikibot
import requests
from typing import Optional, Dict, List, Tuple
import logging
import time
from datetime import datetime
from sqlalchemy import create_engine, Column, String, DateTime, Boolean,
Integer
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from config import WIKIWIX_API_URL, BATCH_SIZE, SLEEP_TIME, MAX_RETRIES,
RETRY_DELAY
from utils import is_valid_url, setup_logger, extract_urls_from_text,
sanitize_url
# Declarative base class; the ORM model(s) in this module subclass it and
# its metadata is used to create the tables when the bot starts.
Base = declarative_base()
class ArchivedURL(Base):
    """ORM model for one row of the ``archived_urls`` table.

    Each row links a source URL to the archive URL created for it and to
    the Wikidata item on which the source URL was found.
    """

    __tablename__ = 'archived_urls'

    # The source URL is the primary key, so there is at most one row per URL.
    original_url = Column(String, primary_key=True)

    # Archive location and owning item are both mandatory.
    archived_url = Column(String, nullable=False)
    wikidata_id = Column(String, nullable=False)

    # Filled in with the current (naive) UTC time when no value is supplied.
    archived_at = Column(DateTime, default=datetime.utcnow)

    # Status bookkeeping; rows default to "successful" with zero retries.
    is_successful = Column(Boolean, default=True)
    retry_count = Column(Integer, default=0)
class ArchivingBot:
    """Archive URLs found on recently edited Wikidata items.

    For every recently edited main-namespace item the bot scans claim,
    qualifier and reference values for URLs, asks the archiving service at
    ``WIKIWIX_API_URL`` to archive each one, adds a ``P1065`` claim holding
    the archive URL to the item and records the result in a local SQLite
    database (``archives.db``).
    """

    def __init__(self):
        """Connect to Wikidata, configure logging and open the database."""
        self.site = pywikibot.Site("wikidata", "wikidata")
        self.repo = self.site.data_repository()
        self.logger = setup_logger()
        self.db_engine = create_engine('sqlite:///archives.db')
        # Create the tables on first run; a no-op when they already exist.
        Base.metadata.create_all(self.db_engine)
        self.Session = sessionmaker(bind=self.db_engine)

    def archive_url(self, url: str) -> Tuple[Optional[str], bool]:
        """Ask the archiving service to archive *url*, retrying on failure.

        Up to ``MAX_RETRIES`` requests are made. Non-200 responses,
        timeouts and other request errors are retried after a pause; a 200
        response whose archive fails verification is not retried.

        Args:
            url (str): The URL to archive.

        Returns:
            Tuple[Optional[str], bool]: ``(archived_url, True)`` on success,
            ``(None, False)`` otherwise.
        """
        sanitized_url = sanitize_url(url)
        if not sanitized_url:
            self.logger.warning(f"Invalid URL format: {url}")
            return None, False

        for attempt in range(1, MAX_RETRIES + 1):
            # Pause before the next attempt; a 429 response may override it.
            wait_time = RETRY_DELAY
            try:
                response = requests.get(
                    WIKIWIX_API_URL,
                    params={"url": sanitized_url},
                    timeout=30,
                    headers={'User-Agent': 'ArchivingBot/1.0'},
                )
                if response.status_code == 200:
                    # The final URL of the response is taken as the archive.
                    archived_url = response.url
                    if self._verify_archive(archived_url):
                        return archived_url, True
                    self.logger.error(
                        f"Archive verification failed for {url}")
                    return None, False
                if response.status_code == 429:  # Rate limit
                    wait_time = self._retry_after_seconds(response)
                    self.logger.warning(
                        f"Rate limited. Waiting {wait_time} seconds")
                else:
                    self.logger.error(
                        f"Failed to archive URL {url}. "
                        f"Status code: {response.status_code}")
            except requests.exceptions.Timeout:
                self.logger.warning(
                    f"Timeout while archiving {url}. "
                    f"Attempt {attempt}/{MAX_RETRIES}")
            except requests.exceptions.RequestException as e:
                self.logger.error(f"Error archiving URL {url}: {str(e)}")

            # Sleep once per failed attempt, and not after the last one:
            # there is nothing left to wait for.
            if attempt < MAX_RETRIES:
                time.sleep(wait_time)
        return None, False

    @staticmethod
    def _retry_after_seconds(response):
        """Return the pause requested by a 429 response, in seconds.

        ``Retry-After`` may be missing or may not be a plain integer (HTTP
        also allows a date); in those cases ``RETRY_DELAY`` is used instead
        of letting ``int()`` raise.
        """
        raw = response.headers.get('Retry-After', RETRY_DELAY)
        try:
            seconds = int(raw)
        except (TypeError, ValueError):
            return RETRY_DELAY
        # time.sleep() rejects negative values.
        return max(seconds, 0)

    def _verify_archive(self, archived_url: str) -> bool:
        """Return True when a HEAD request to *archived_url* answers 200.

        Only the status code is checked; the body is not downloaded.
        Request failures count as "not verified".
        """
        try:
            response = requests.head(archived_url, timeout=10)
        except requests.exceptions.RequestException:
            # Narrower than a bare ``except`` so KeyboardInterrupt and
            # SystemExit still propagate.
            return False
        return response.status_code == 200

    def save_archived_url(self, original_url: str, archived_url: str,
                          wikidata_id: str, is_successful: bool = True):
        """Insert or update the database row for *original_url*.

        ``original_url`` is the table's primary key, so the row is merged
        rather than added: archiving the same URL again updates the
        existing row instead of failing with an integrity error. Database
        errors are logged and rolled back, never raised.
        """
        session = self.Session()
        try:
            record = ArchivedURL(
                original_url=original_url,
                archived_url=archived_url,
                wikidata_id=wikidata_id,
                is_successful=is_successful,
            )
            session.merge(record)
            session.commit()
        except Exception as e:
            self.logger.error(f"Error saving to database: {str(e)}")
            session.rollback()
        else:
            if is_successful:
                self.logger.info(
                    f"Successfully archived {original_url} -> {archived_url}")
            else:
                self.logger.warning(f"Failed to archive {original_url}")
        finally:
            session.close()

    def process_recent_changes(self, limit: int = BATCH_SIZE):
        """Archive the URLs of recently edited main-namespace items.

        Args:
            limit (int): Maximum number of recent changes to inspect.

        Errors while handling one change are logged and the loop moves on
        to the next change.
        """
        self.logger.info("Starting to process recent changes...")
        # pywikibot's recentchanges() takes the result cap as ``total``.
        for change in self.site.recentchanges(total=limit):
            try:
                # API entries expose the namespace as 'ns'; 'namespace' is
                # still accepted in case the entry was normalised upstream.
                namespace = change.get('ns', change.get('namespace'))
                if change.get('type') != 'edit' or namespace != 0:
                    continue
                item = pywikibot.ItemPage(self.repo, change['title'])
                item.get()
                # Collect first, then process: _process_url() may add a
                # claim to the item, which must not happen while the
                # item's claims are being iterated.
                for url in self._collect_item_urls(item):
                    self._process_url(url, item)
            except Exception as e:
                self.logger.error(
                    f"Error processing item {change.get('title')}: {str(e)}")
                continue

    @staticmethod
    def _claim_text(claim) -> Optional[str]:
        """Return the string value of a claim/qualifier/reference, if any.

        Uses a ``target_text`` attribute when the object has one and
        otherwise falls back to ``getTarget()``. Non-string targets (items,
        quantities, dates, ...) yield ``None``.
        """
        text = getattr(claim, 'target_text', None)
        if text is None:
            text = claim.getTarget()
        return text if isinstance(text, str) else None

    def _collect_item_urls(self, item: pywikibot.ItemPage) -> List[str]:
        """Return the distinct URLs found on *item*, in first-seen order.

        Main claim values, qualifier values and reference values are all
        scanned with ``extract_urls_from_text``.
        """
        snaks = []
        for claims in item.claims.values():
            for claim in claims:
                snaks.append(claim)
                for qualifiers in claim.qualifiers.values():
                    snaks.extend(qualifiers)
                for ref in claim.sources:
                    for source_claims in ref.values():
                        snaks.extend(source_claims)

        # A dict keeps insertion order and drops duplicates, so a URL that
        # appears several times on the item is archived only once per pass.
        found = {}
        for snak in snaks:
            text = self._claim_text(snak)
            if not text:
                continue
            for url in extract_urls_from_text(text):
                found.setdefault(url, None)
        return list(found)

    def _process_url(self, url: str, item: pywikibot.ItemPage):
        """Archive *url* for *item* unless it already has an archive."""
        if self._has_archive(item, url):
            return
        archived_url, success = self.archive_url(url)
        if archived_url:
            self._add_archive_claim(item, url, archived_url)
            # Only successful attempts are stored: the archived_url column
            # is non-nullable, so a failed attempt (None) cannot be saved.
            self.save_archived_url(url, archived_url, item.id, success)
        # Short pause between archive requests.
        time.sleep(1)

    def _has_archive(self, item: pywikibot.ItemPage, url: str) -> bool:
        """Return True when a P1065 claim on *item* has *url* as its value.

        NOTE(review): the P1065 claims this bot adds hold the *archive*
        URL, so comparing against the original URL may never match for
        them -- confirm the intended check.
        """
        if 'P1065' not in item.claims:
            return False
        return any(
            self._claim_text(claim) == url for claim in item.claims['P1065']
        )

    def _add_archive_claim(self, item: pywikibot.ItemPage, original_url: str,
                           archived_url: str):
        """Add a sourced P1065 claim holding *archived_url* to *item*.

        The claim is referenced with P248 -> Q5058 (presumably the item
        for the archiving service -- TODO confirm) and P813 (retrieved)
        set to today's UTC date. Failures are logged, never raised.
        """
        try:
            claim = pywikibot.Claim(self.repo, 'P1065')
            claim.setTarget(archived_url)

            ref = pywikibot.Claim(self.repo, 'P248')
            ref.setTarget(pywikibot.ItemPage(self.repo, 'Q5058'))

            # WbTime has no today() constructor, so build the day-precision
            # value explicitly from the current UTC date.
            today = time.gmtime()
            timestamp = pywikibot.Claim(self.repo, 'P813')  # retrieved
            timestamp.setTarget(pywikibot.WbTime(
                year=today.tm_year,
                month=today.tm_mon,
                day=today.tm_mday,
            ))

            claim.addSources([ref, timestamp])
            item.addClaim(claim)
            self.logger.info(
                f"Added archive for {original_url} -> {archived_url}")
        except Exception as e:
            self.logger.error(f"Error adding archive claim: {str(e)}")
if __name__ == "__main__":
    # Poll recent changes forever: Ctrl-C during a pass or the regular
    # pause ends the loop, any other error is logged and retried a
    # minute later.
    archiver = ArchivingBot()
    keep_running = True
    while keep_running:
        try:
            archiver.process_recent_changes()
            time.sleep(SLEEP_TIME)
        except KeyboardInterrupt:
            keep_running = False
        except Exception as exc:
            archiver.logger.error(f"Main loop error: {str(exc)}")
            time.sleep(60)
TASK DETAIL
https://phabricator.wikimedia.org/T390361
EMAIL PREFERENCES
https://phabricator.wikimedia.org/settings/panel/emailpreferences/
To: Elie243
Cc: MarcosAdo, KINDEHOUNME, Thalie30, Elie243, Wisdom-Hassan, Aklapper,
pywikibot-bugs-list, Paul26, mevo, Danny_Benjafield_WMDE, S8321414,
Astuthiodit_1, karapayneWMDE, Invadibot, PotsdamLamb, Jyoo1011, maantietaja,
JohnsonLee01, SHEKH, Dijkstra, Khutuck, Nintendofan885, Akuckartz, Zkhalido,
Aram, Viztor, Nandana, Wenyi, Lahi, Gq86, GoranSMilovanovic, QZanden, Tbscho,
KimKelting, MayS, LawExplorer, Mdupont, JJMC89, Dvorapa, _jensen, rosalieper,
Altostratus, Avicennasis, Hannolans, Scott_WUaS, mys_721tx, Wikidata-bugs, Xqt,
Hydriz, aude, jayvdb, Nemo_bis, Lydia_Pintscher, Masti, Alchimista, Mbch331
_______________________________________________
pywikibot-bugs mailing list -- [email protected]
To unsubscribe send an email to [email protected]