jenkins-bot has submitted this change. (
https://gerrit.wikimedia.org/r/c/pywikibot/core/+/1224790?usp=email )
Change subject: Optimize pickle file storage with subdirectory structure
......................................................................
Optimize pickle file storage with subdirectory structure
Currently pickle files are stored in flat directories, which for enwiki
means over 7 million files in a single directory and causes significant
filesystem performance issues.
This patch introduces a subdirectory structure that buckets pickle files
into directories named floor(page_id / 1000) * 1000:
- page_id 100000 → en/100000/100000.p
- page_id 100002 → en/100000/100002.p
- page_id 200005 → en/200000/200005.p
This caps each leaf directory at 1000 pickle files instead of ~7M in a
single directory, improving filesystem performance. The implementation
adds a WikiWhoMixin class to _toolforge.py with a
_get_wikiwho_pickle_path() static method for calculating cache paths.
Includes comprehensive unit tests covering path calculation, different
languages, edge cases, and directory structure verification.
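For illustration, a minimal standalone sketch of the bucketing rule
(bucket_path and the 'cache' directory are placeholder names for this
example, not identifiers from the patch):

    from pathlib import Path

    def bucket_path(cache_dir: str, lang: str, page_id: int) -> Path:
        # Bucket page IDs into directories of at most 1000 files each:
        # IDs 100000-100999 all land in the '100000' subdirectory.
        subdir = (page_id // 1000) * 1000
        return Path(cache_dir) / lang / str(subdir) / f'{page_id}.p'

    assert bucket_path('cache', 'en', 100002) == \
        Path('cache/en/100000/100002.p')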
Bug: T414087
Change-Id: I5ccaf3ffa50af9c2b6a72d2baf27f54469b863d1
---
M pywikibot/page/_page.py
M pywikibot/page/_toolforge.py
M tests/wikiblame_tests.py
A tests/wikiwho_tests.py
4 files changed, 246 insertions(+), 50 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/page/_page.py b/pywikibot/page/_page.py
index 457ab1a..e48e511 100644
--- a/pywikibot/page/_page.py
+++ b/pywikibot/page/_page.py
@@ -26,7 +26,7 @@
UnknownExtensionError,
)
from pywikibot.page._basepage import BasePage
-from pywikibot.page._toolforge import WikiBlameMixin
+from pywikibot.page._toolforge import WikiBlameMixin, WikiWhoMixin
from pywikibot.site import Namespace
from pywikibot.tools import cached, deprecated_args
@@ -34,7 +34,7 @@
__all__ = ['Page']
-class Page(BasePage, WikiBlameMixin):
+class Page(BasePage, WikiBlameMixin, WikiWhoMixin):
"""Page: A MediaWiki page."""
diff --git a/pywikibot/page/_toolforge.py b/pywikibot/page/_toolforge.py
index d00d983..7e80c78 100644
--- a/pywikibot/page/_toolforge.py
+++ b/pywikibot/page/_toolforge.py
@@ -10,9 +10,11 @@
from __future__ import annotations
import collections
+import pickle
import re
import urllib.parse
from http import HTTPStatus
+from pathlib import Path
from typing import Any
from warnings import warn
@@ -30,12 +32,6 @@
#: Supported wikipedia site codes
WIKIBLAME_CODES = 'als', 'bar', 'de', 'en', 'it', 'nds', 'sco'
- #: Supported WikiWho API language codes
- WIKIWHO_CODES = (
- 'ar', 'de', 'en', 'es', 'eu', 'fr', 'hu', 'id', 'it', 'ja', 'nl', 'pl',
- 'pt', 'tr', 'zh'
- )
-
def _check_wh_supported(self) -> None:
"""Check if WikiHistory is supported."""
if self.site.family.name != 'wikipedia':
@@ -53,45 +49,6 @@
if not self.exists():
raise pywikibot.exceptions.NoPageError(self)
- def _check_wikiwho_supported(self) -> None:
- """Check if WikiWho API is supported.
-
- .. versionadded:: 11.0
-
- :raise NotImplementedError: unsupported site, language, or namespace
- :raise NoPageError: page does not exist
- """
- if self.site.family.name != 'wikipedia':
- raise NotImplementedError(
- 'WikiWho API is implemented for wikipedia family only')
-
- if (code := self.site.code) not in self.WIKIWHO_CODES:
- raise NotImplementedError(
- f'WikiWho API is not implemented for wikipedia:{code}')
-
- if (ns := self.namespace()) != 0:
- raise NotImplementedError(
- f'WikiWho API is not implemented for {ns} namespace')
-
- if not self.exists():
- raise pywikibot.exceptions.NoPageError(self)
-
- def _build_wikiwho_url(self, endpoint: str) -> str:
- """Build WikiWho API URL for the given endpoint.
-
- .. versionadded:: 11.0
-
- :param endpoint: API endpoint (all_content, rev_content,
- edit_persistence)
- :return: Complete API URL
- """
- article_title = self.title(with_ns=False, with_section=False)
- encoded_title = urllib.parse.quote(article_title, safe='')
- base_url = 'https://wikiwho-api.wmcloud.org'
- url = (f'{base_url}/{self.site.code}/api/v1.0.0-beta/{endpoint}/'
- f'{encoded_title}/')
- return url
-
@deprecated('authorship', since='9.3.0')
@deprecated_args(onlynew=None) # since 9.2.0
def main_authors(self) -> collections.Counter[str, int]:
@@ -255,11 +212,70 @@
return {user: (chars, percent) for user, chars, percent in result}
- def get_annotations(self) -> dict[str, Any]:
+
+class WikiWhoMixin:
+
+ """Page mixin for WikiWho authorship data with optimized pickle storage.
+
+ WikiWho provides token-level provenance and authorship information.
+ This implementation uses an optimized subdirectory structure for pickle
+ caching to avoid filesystem performance issues with millions of files.
+
+ .. versionadded:: 11.0
+ """
+
+ #: Supported WikiWho API language codes
+ WIKIWHO_CODES = (
+ 'ar', 'de', 'en', 'es', 'eu', 'fr', 'hu', 'id', 'it', 'ja', 'nl', 'pl',
+ 'pt', 'tr', 'zh'
+ )
+
+ def _check_wikiwho_supported(self) -> None:
+ """Check if WikiWho API is supported.
+
+ .. versionadded:: 11.0
+
+ :raise NotImplementedError: unsupported site, language, or namespace
+ :raise NoPageError: page does not exist
+ """
+ if self.site.family.name != 'wikipedia':
+ raise NotImplementedError(
+ 'WikiWho API is implemented for wikipedia family only')
+
+ if (code := self.site.code) not in self.WIKIWHO_CODES:
+ raise NotImplementedError(
+ f'WikiWho API is not implemented for wikipedia:{code}')
+
+ if (ns := self.namespace()) != 0:
+ raise NotImplementedError(
+ f'WikiWho API is not implemented for {ns} namespace')
+
+ if not self.exists():
+ raise pywikibot.exceptions.NoPageError(self)
+
+ def _build_wikiwho_url(self, endpoint: str) -> str:
+ """Build WikiWho API URL for the given endpoint.
+
+ .. versionadded:: 11.0
+
+ :param endpoint: API endpoint (all_content, rev_content,
+ edit_persistence)
+ :return: Complete API URL
+ """
+ article_title = self.title(with_ns=False, with_section=False)
+ encoded_title = urllib.parse.quote(article_title, safe='')
+ base_url = 'https://wikiwho-api.wmcloud.org'
+ url = (f'{base_url}/{self.site.code}/api/v1.0.0-beta/{endpoint}/'
+ f'{encoded_title}/')
+ return url
+
+ def get_annotations(self, *, use_cache: bool = True) -> dict[str, Any]:
"""Get WikiWho annotations for article revisions.
This method uses the public WikiWho API to get token-level
provenance annotations showing who added each token in the article.
+ Results are cached locally using pickle files with an optimized
+ subdirectory structure to avoid filesystem performance issues.
Sample:
@@ -277,6 +293,8 @@
- https://wikiwho-api.wmcloud.org
- https://www.mediawiki.org/wiki/WikiWho
+ :param use_cache: Whether to use and save cached data.
+ Set to False to force a fresh API request without caching.
:return: Dictionary containing article_title, page_id, and revisions
with token-level annotations
@@ -287,6 +305,13 @@
"""
self._check_wikiwho_supported()
+ # Check cache first
+ cache_path = self._get_wikiwho_pickle_path(
+ self.site.code, self.pageid)
+ if use_cache and cache_path.exists():
+ with open(cache_path, 'rb') as f:
+ return pickle.load(f)
+
url = self._build_wikiwho_url('all_content')
url = f'{url}?editor=true&o_rev_id=true'
@@ -306,4 +331,52 @@
raise pywikibot.exceptions.ServerError(
f'WikiWho API error: {error_msg}')
+ # Save to cache if caching is enabled
+ if use_cache:
+ cache_path.parent.mkdir(parents=True, exist_ok=True)
+ with open(cache_path, 'wb') as f:
+ pickle.dump(data, f, protocol=pywikibot.config.pickle_protocol)
+
return data
+
+ @staticmethod
+ def _get_wikiwho_pickle_path(lang: str, page_id: int,
+                              cache_dir: str | Path | None = None
+                              ) -> Path:
+ """Calculate pickle file path with subdirectory structure.
+
+ Uses subdirectories named floor(page_id / 1000) * 1000 to optimize
+ filesystem performance. This avoids having millions of pickle
+ files in a single directory.
+
+ Directory structure:
+ cache_dir/lang/subdirectory/page_id.p
+
+ Where subdirectory = floor(page_id / 1000) * 1000
+
+ Examples:
+ page_id 100000 → en/100000/100000.p
+ page_id 100002 → en/100000/100002.p
+ page_id 200005 → en/200000/200005.p
+
+ This caps each leaf directory at 1000 pickle files instead of ~7M
+ in a single directory for large wikis.
+
+ .. versionadded:: 11.0
+
+ :param lang: Language code (e.g., 'en', 'de', 'fi')
+ :param page_id: Wikipedia page ID
+ :param cache_dir: Custom cache directory (defaults to apicache/wikiwho)
+ :return: Path object for the pickle file
+ """
+ # Use provided cache_dir or default to apicache/wikiwho
+ if cache_dir is None:
+ cache_dir = (Path(pywikibot.config.base_dir)
+ / 'apicache' / 'wikiwho')
+ else:
+ cache_dir = Path(cache_dir)
+
+ # Calculate subdirectory as floor(page_id / 1000) * 1000
+ subdirectory = (page_id // 1000) * 1000
+
+ # Construct path: cache_dir/lang/subdirectory/page_id.p
+ pickle_path = cache_dir / lang / str(subdirectory) / f'{page_id}.p'
+
+ return pickle_path
diff --git a/tests/wikiblame_tests.py b/tests/wikiblame_tests.py
index b4d3e30..b69706b 100644
--- a/tests/wikiblame_tests.py
+++ b/tests/wikiblame_tests.py
@@ -99,8 +99,8 @@
def test_wikiwho_supported_languages(self) -> None:
"""Test that WIKIWHO_CODES contains expected languages."""
- from pywikibot.page._toolforge import WikiBlameMixin
- codes = WikiBlameMixin.WIKIWHO_CODES
+ from pywikibot.page._toolforge import WikiWhoMixin
+ codes = WikiWhoMixin.WIKIWHO_CODES
expected_langs = ['ar', 'de', 'en', 'es', 'eu', 'fr', 'hu', 'id',
'it', 'ja', 'nl', 'pl', 'pt', 'tr', 'zh']
for lang in expected_langs:
diff --git a/tests/wikiwho_tests.py b/tests/wikiwho_tests.py
new file mode 100644
index 0000000..54a898b
--- /dev/null
+++ b/tests/wikiwho_tests.py
@@ -0,0 +1,123 @@
+"""Tests for WikiWhoMixin pickle subdirectory structure."""
+#
+# (C) Pywikibot team, 2026
+#
+# Distributed under the terms of the MIT license.
+#
+from __future__ import annotations
+
+import unittest
+from pathlib import Path
+
+from pywikibot.page._toolforge import WikiWhoMixin
+from tests.aspects import TestCase
+
+
+class TestWikiWhoPicklePaths(TestCase):
+
+ """Test WikiWhoMixin pickle subdirectory path calculation."""
+
+ net = False
+
+ def test_pickle_path_basic(self) -> None:
+ """Test basic pickle path calculation."""
+ path = WikiWhoMixin._get_wikiwho_pickle_path(
+ 'en', 100000, '/tmp/cache')
+ expected = Path('/tmp/cache/en/100000/100000.p')
+ self.assertEqual(path, expected)
+
+ def test_pickle_path_subdirectory_calculation(self) -> None:
+ """Test subdirectory calculated as floor(page_id/1000)*1000."""
+ # page_id 100000 -> subdirectory 100000
+ path1 = WikiWhoMixin._get_wikiwho_pickle_path(
+ 'en', 100000, '/tmp/cache')
+ self.assertEqual(path1.parent.name, '100000')
+
+ # page_id 100002 -> subdirectory 100000
+ path2 = WikiWhoMixin._get_wikiwho_pickle_path(
+ 'en', 100002, '/tmp/cache')
+ self.assertEqual(path2.parent.name, '100000')
+
+ # page_id 100999 -> subdirectory 100000
+ path3 = WikiWhoMixin._get_wikiwho_pickle_path(
+ 'en', 100999, '/tmp/cache')
+ self.assertEqual(path3.parent.name, '100000')
+
+ # page_id 200005 -> subdirectory 200000
+ path4 = WikiWhoMixin._get_wikiwho_pickle_path(
+ 'en', 200005, '/tmp/cache')
+ self.assertEqual(path4.parent.name, '200000')
+
+ def test_pickle_path_different_languages(self) -> None:
+ """Test pickle paths for different language codes."""
+ path_en = WikiWhoMixin._get_wikiwho_pickle_path(
+ 'en', 100000, '/tmp/cache')
+ path_de = WikiWhoMixin._get_wikiwho_pickle_path(
+ 'de', 100000, '/tmp/cache')
+ path_fi = WikiWhoMixin._get_wikiwho_pickle_path(
+ 'fi', 100000, '/tmp/cache')
+
+ self.assertEqual(
+ path_en, Path('/tmp/cache/en/100000/100000.p'))
+ self.assertEqual(
+ path_de, Path('/tmp/cache/de/100000/100000.p'))
+ self.assertEqual(
+ path_fi, Path('/tmp/cache/fi/100000/100000.p'))
+
+ def test_pickle_path_filename(self) -> None:
+ """Test pickle filename is page_id.p."""
+ path1 = WikiWhoMixin._get_wikiwho_pickle_path(
+ 'en', 123456, '/tmp/cache')
+ self.assertEqual(path1.name, '123456.p')
+
+ path2 = WikiWhoMixin._get_wikiwho_pickle_path(
+ 'de', 999999, '/tmp/cache')
+ self.assertEqual(path2.name, '999999.p')
+
+ def test_pickle_path_full_structure(self) -> None:
+ """Test complete directory structure."""
+ # Test case from task description
+ path = WikiWhoMixin._get_wikiwho_pickle_path(
+ 'en', 100000, '/cache')
+ self.assertEqual(path, Path('/cache/en/100000/100000.p'))
+
+ # Additional examples from task
+ path = WikiWhoMixin._get_wikiwho_pickle_path(
+ 'en', 100002, '/cache')
+ self.assertEqual(path, Path('/cache/en/100000/100002.p'))
+
+ path = WikiWhoMixin._get_wikiwho_pickle_path(
+ 'en', 200005, '/cache')
+ self.assertEqual(path, Path('/cache/en/200000/200005.p'))
+
+ def test_pickle_path_edge_cases(self) -> None:
+ """Test edge cases for subdirectory calculation."""
+ # page_id 0 -> subdirectory 0
+ path = WikiWhoMixin._get_wikiwho_pickle_path(
+ 'en', 0, '/tmp/cache')
+ self.assertEqual(path.parent.name, '0')
+
+ # page_id 1 -> subdirectory 0
+ path = WikiWhoMixin._get_wikiwho_pickle_path(
+ 'en', 1, '/tmp/cache')
+ self.assertEqual(path.parent.name, '0')
+
+ # page_id 999 -> subdirectory 0
+ path = WikiWhoMixin._get_wikiwho_pickle_path(
+ 'en', 999, '/tmp/cache')
+ self.assertEqual(path.parent.name, '0')
+
+ # page_id 1000 -> subdirectory 1000
+ path = WikiWhoMixin._get_wikiwho_pickle_path(
+ 'en', 1000, '/tmp/cache')
+ self.assertEqual(path.parent.name, '1000')
+
+ def test_pickle_path_returns_pathlib_path(self) -> None:
+ """Test that method returns pathlib.Path object."""
+ path = WikiWhoMixin._get_wikiwho_pickle_path(
+ 'en', 100000, '/tmp/cache')
+ self.assertIsInstance(path, Path)
+
+
+if __name__ == '__main__':
+ unittest.main()
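
For reviewers, a usage sketch of the new caching switch (assumes a
configured user-config.py; the article title is only an example):

    import pywikibot

    site = pywikibot.Site('en', 'wikipedia')
    page = pywikibot.Page(site, 'Python (programming language)')

    # First call fetches from the WikiWho API and, because use_cache
    # defaults to True, writes the result to
    # apicache/wikiwho/en/<bucket>/<page_id>.p for reuse on later calls.
    data = page.get_annotations()

    # Force a fresh API request; nothing is read from or written to
    # the pickle cache.
    fresh = page.get_annotations(use_cache=False)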
--
To view, visit
https://gerrit.wikimedia.org/r/c/pywikibot/core/+/1224790?usp=email
To unsubscribe, or for help writing mail filters, visit
https://gerrit.wikimedia.org/r/settings?usp=email
Gerrit-MessageType: merged
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I5ccaf3ffa50af9c2b6a72d2baf27f54469b863d1
Gerrit-Change-Number: 1224790
Gerrit-PatchSet: 8
Gerrit-Owner: Xinacod <[email protected]>
Gerrit-Reviewer: Xqt <[email protected]>
Gerrit-Reviewer: jenkins-bot
_______________________________________________
Pywikibot-commits mailing list -- [email protected]
To unsubscribe send an email to [email protected]