jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/1224790?usp=email )

Change subject: Optimize pickle file storage with subdirectory structure
......................................................................

Optimize pickle file storage with subdirectory structure

Currently pickle files are stored in flat directories, which results in
over 7 million files in a single directory for enwiki and causes
significant filesystem performance issues.

This patch implements a subdirectory structure, bucketing pickle files
by floor(page_id / 1000) * 1000:
- page_id 100000 → en/100000/100000.p
- page_id 100002 → en/100000/100002.p
- page_id 200005 → en/200000/200005.p

This caps each bucket directory at 1,000 files (for enwiki, roughly 7K
bucket directories per language instead of ~7M files in one directory),
improving filesystem performance. The implementation adds a WikiWhoMixin
class to _toolforge.py with a _get_wikiwho_pickle_path() static method
for calculating paths.

Includes comprehensive unit tests covering path calculation, different
languages, edge cases, and directory structure verification.
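
For illustration, a minimal standalone sketch of the bucketing scheme
described above (names are illustrative; this is not the patched code
itself):

    from pathlib import Path

    def bucketed_pickle_path(lang: str, page_id: int, cache_dir: str) -> Path:
        """Compute cache_dir/lang/<bucket>/<page_id>.p per the scheme above."""
        bucket = (page_id // 1000) * 1000  # floor(page_id / 1000) * 1000
        return Path(cache_dir) / lang / str(bucket) / f'{page_id}.p'

    assert bucketed_pickle_path('en', 100002, 'cache').as_posix() \
        == 'cache/en/100000/100002.p'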

Bug: T414087
Change-Id: I5ccaf3ffa50af9c2b6a72d2baf27f54469b863d1
---
M pywikibot/page/_page.py
M pywikibot/page/_toolforge.py
M tests/wikiblame_tests.py
A tests/wikiwho_tests.py
4 files changed, 246 insertions(+), 50 deletions(-)

Approvals:
  Xqt: Looks good to me, approved
  jenkins-bot: Verified




diff --git a/pywikibot/page/_page.py b/pywikibot/page/_page.py
index 457ab1a..e48e511 100644
--- a/pywikibot/page/_page.py
+++ b/pywikibot/page/_page.py
@@ -26,7 +26,7 @@
     UnknownExtensionError,
 )
 from pywikibot.page._basepage import BasePage
-from pywikibot.page._toolforge import WikiBlameMixin
+from pywikibot.page._toolforge import WikiBlameMixin, WikiWhoMixin
 from pywikibot.site import Namespace
 from pywikibot.tools import cached, deprecated_args

@@ -34,7 +34,7 @@
 __all__ = ['Page']


-class Page(BasePage, WikiBlameMixin):
+class Page(BasePage, WikiBlameMixin, WikiWhoMixin):

     """Page: A MediaWiki page."""

diff --git a/pywikibot/page/_toolforge.py b/pywikibot/page/_toolforge.py
index d00d983..7e80c78 100644
--- a/pywikibot/page/_toolforge.py
+++ b/pywikibot/page/_toolforge.py
@@ -10,9 +10,11 @@
 from __future__ import annotations

 import collections
+import pickle
 import re
 import urllib.parse
 from http import HTTPStatus
+from pathlib import Path
 from typing import Any
 from warnings import warn

@@ -30,12 +32,6 @@
     #: Supported wikipedia site codes
     WIKIBLAME_CODES = 'als', 'bar', 'de', 'en', 'it', 'nds', 'sco'

-    #: Supported WikiWho API language codes
-    WIKIWHO_CODES = (
-        'ar', 'de', 'en', 'es', 'eu', 'fr', 'hu', 'id', 'it', 'ja', 'nl', 'pl',
-        'pt', 'tr', 'zh'
-    )
-
     def _check_wh_supported(self) -> None:
         """Check if WikiHistory is supported."""
         if self.site.family.name != 'wikipedia':
@@ -53,45 +49,6 @@
         if not self.exists():
             raise pywikibot.exceptions.NoPageError(self)

-    def _check_wikiwho_supported(self) -> None:
-        """Check if WikiWho API is supported.
-
-        .. versionadded:: 11.0
-
-        :raise NotImplementedError: unsupported site, language, or namespace
-        :raise NoPageError: page does not exist
-        """
-        if self.site.family.name != 'wikipedia':
-            raise NotImplementedError(
-                'WikiWho API is implemented for wikipedia family only')
-
-        if (code := self.site.code) not in self.WIKIWHO_CODES:
-            raise NotImplementedError(
-                f'WikiWho API is not implemented for wikipedia:{code}')
-
-        if (ns := self.namespace()) != 0:
-            raise NotImplementedError(
-                f'WikiWho API is not implemented for {ns} namespace')
-
-        if not self.exists():
-            raise pywikibot.exceptions.NoPageError(self)
-
-    def _build_wikiwho_url(self, endpoint: str) -> str:
-        """Build WikiWho API URL for the given endpoint.
-
-        .. versionadded:: 11.0
-
-        :param endpoint: API endpoint (all_content, rev_content,
-            edit_persistence)
-        :return: Complete API URL
-        """
-        article_title = self.title(with_ns=False, with_section=False)
-        encoded_title = urllib.parse.quote(article_title, safe='')
-        base_url = 'https://wikiwho-api.wmcloud.org'
-        url = (f'{base_url}/{self.site.code}/api/v1.0.0-beta/{endpoint}/'
-               f'{encoded_title}/')
-        return url
-
     @deprecated('authorship', since='9.3.0')
     @deprecated_args(onlynew=None)  # since 9.2.0
     def main_authors(self) -> collections.Counter[str]:
@@ -255,11 +212,70 @@

         return {user: (chars, percent) for user, chars, percent in result}

-    def get_annotations(self) -> dict[str, Any]:
+
+class WikiWhoMixin:
+
+    """Page mixin for WikiWho authorship data with optimized pickle storage.
+
+    WikiWho provides token-level provenance and authorship information.
+    This implementation uses an optimized subdirectory structure for pickle
+    caching to avoid filesystem performance issues with millions of files.
+
+    .. versionadded:: 11.0
+    """
+
+    #: Supported WikiWho API language codes
+    WIKIWHO_CODES = (
+        'ar', 'de', 'en', 'es', 'eu', 'fr', 'hu', 'id', 'it', 'ja', 'nl', 'pl',
+        'pt', 'tr', 'zh'
+    )
+
+    def _check_wikiwho_supported(self) -> None:
+        """Check if WikiWho API is supported.
+
+        .. versionadded:: 11.0
+
+        :raise NotImplementedError: unsupported site, language, or namespace
+        :raise NoPageError: page does not exist
+        """
+        if self.site.family.name != 'wikipedia':
+            raise NotImplementedError(
+                'WikiWho API is implemented for wikipedia family only')
+
+        if (code := self.site.code) not in self.WIKIWHO_CODES:
+            raise NotImplementedError(
+                f'WikiWho API is not implemented for wikipedia:{code}')
+
+        if (ns := self.namespace()) != 0:
+            raise NotImplementedError(
+                f'WikiWho API is not implemented for {ns} namespace')
+
+        if not self.exists():
+            raise pywikibot.exceptions.NoPageError(self)
+
+    def _build_wikiwho_url(self, endpoint: str) -> str:
+        """Build WikiWho API URL for the given endpoint.
+
+        .. versionadded:: 11.0
+
+        :param endpoint: API endpoint (all_content, rev_content,
+            edit_persistence)
+        :return: Complete API URL
+        """
+        article_title = self.title(with_ns=False, with_section=False)
+        encoded_title = urllib.parse.quote(article_title, safe='')
+        base_url = 'https://wikiwho-api.wmcloud.org'
+        url = (f'{base_url}/{self.site.code}/api/v1.0.0-beta/{endpoint}/'
+               f'{encoded_title}/')
+        return url
+
+    def get_annotations(self, *, use_cache: bool = True) -> dict[str, Any]:
         """Get WikiWho annotations for article revisions.

         This method uses the public WikiWho API to get token-level
         provenance annotations showing who added each token in the article.
+        Results are cached locally using pickle files with an optimized
+        subdirectory structure to avoid filesystem performance issues.

         Sample:

@@ -277,6 +293,8 @@
            - https://wikiwho-api.wmcloud.org
            - https://www.mediawiki.org/wiki/WikiWho

+        :param use_cache: Whether to use and save cached data.
+            Set to False to force a fresh API request without caching.
         :return: Dictionary containing article_title, page_id, and revisions
             with token-level annotations

@@ -287,6 +305,13 @@
         """
         self._check_wikiwho_supported()

+        # Check cache first
+        cache_path = self._get_wikiwho_pickle_path(
+            self.site.code, self.pageid)
+        if use_cache and cache_path.exists():
+            with open(cache_path, 'rb') as f:
+                return pickle.load(f)
+
         url = self._build_wikiwho_url('all_content')
         url = f'{url}?editor=true&o_rev_id=true'

@@ -306,4 +331,52 @@
             raise pywikibot.exceptions.ServerError(
                 f'WikiWho API error: {error_msg}')

+        # Save to cache if caching is enabled
+        if use_cache:
+            cache_path.parent.mkdir(parents=True, exist_ok=True)
+            with open(cache_path, 'wb') as f:
+                pickle.dump(data, f, protocol=pywikibot.config.pickle_protocol)
+
         return data
+
+    @staticmethod
+    def _get_wikiwho_pickle_path(lang: str, page_id: int, cache_dir=None):
+        """Calculate pickle file path with subdirectory structure.
+
+        Uses subdirectories based on floor(page_id/1000) to optimize
+        filesystem performance. This avoids having millions of pickle
+        files in a single directory.
+
+        Directory structure:
+            cache_dir/lang/subdirectory/page_id.p
+
+        Where subdirectory = floor(page_id / 1000) * 1000
+
+        Examples:
+            page_id 100000 → en/100000/100000.p
+            page_id 100002 → en/100000/100002.p
+            page_id 200005 → en/200000/200005.p
+
+        This caps each subdirectory at 1,000 files, even for large wikis.
+
+        .. versionadded:: 11.0
+
+        :param lang: Language code (e.g., 'en', 'de', 'fi')
+        :param page_id: Wikipedia page ID
+        :param cache_dir: Custom cache directory (defaults to apicache/wikiwho)
+        :return: Path object for the pickle file
+        """
+        # Use provided cache_dir or default to apicache/wikiwho
+        if cache_dir is None:
+            cache_dir = (Path(pywikibot.config.base_dir)
+                         / 'apicache' / 'wikiwho')
+        else:
+            cache_dir = Path(cache_dir)
+
+        # Calculate subdirectory as floor(page_id / 1000) * 1000
+        subdirectory = (page_id // 1000) * 1000
+
+        # Construct path: cache_dir/lang/subdirectory/page_id.p
+        pickle_path = cache_dir / lang / str(subdirectory) / f'{page_id}.p'
+
+        return pickle_path
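
For context, a hedged usage sketch of the caching flow added above; the
page title is illustrative, and a cache miss performs a live request to
the WikiWho API:

    import pywikibot

    site = pywikibot.Site('en', 'wikipedia')
    page = pywikibot.Page(site, 'Python (programming language)')  # example

    # On a cache miss this fetches from the WikiWho API and pickles the
    # result under apicache/wikiwho/en/<bucket>/<page_id>.p; subsequent
    # calls read the pickle back instead of hitting the API.
    data = page.get_annotations()

    # Bypass reading and writing the pickle cache entirely:
    fresh = page.get_annotations(use_cache=False)
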
diff --git a/tests/wikiblame_tests.py b/tests/wikiblame_tests.py
index b4d3e30..b69706b 100644
--- a/tests/wikiblame_tests.py
+++ b/tests/wikiblame_tests.py
@@ -99,8 +99,8 @@

     def test_wikiwho_supported_languages(self) -> None:
         """Test that WIKIWHO_CODES contains expected languages."""
-        from pywikibot.page._toolforge import WikiBlameMixin
-        codes = WikiBlameMixin.WIKIWHO_CODES
+        from pywikibot.page._toolforge import WikiWhoMixin
+        codes = WikiWhoMixin.WIKIWHO_CODES
         expected_langs = ['ar', 'de', 'en', 'es', 'eu', 'fr', 'hu', 'id',
                           'it', 'ja', 'nl', 'pl', 'pt', 'tr', 'zh']
         for lang in expected_langs:
diff --git a/tests/wikiwho_tests.py b/tests/wikiwho_tests.py
new file mode 100644
index 0000000..54a898b
--- /dev/null
+++ b/tests/wikiwho_tests.py
@@ -0,0 +1,123 @@
+"""Tests for WikiWhoMixin pickle subdirectory structure."""
+#
+# (C) Pywikibot team, 2026
+#
+# Distributed under the terms of the MIT license.
+#
+from __future__ import annotations
+
+import unittest
+from pathlib import Path
+
+from pywikibot.page._toolforge import WikiWhoMixin
+from tests.aspects import TestCase
+
+
+class TestWikiWhoPicklePaths(TestCase):
+
+    """Test WikiWhoMixin pickle subdirectory path calculation."""
+
+    net = False
+
+    def test_pickle_path_basic(self) -> None:
+        """Test basic pickle path calculation."""
+        path = WikiWhoMixin._get_wikiwho_pickle_path(
+            'en', 100000, '/tmp/cache')
+        expected = Path('/tmp/cache/en/100000/100000.p')
+        self.assertEqual(path, expected)
+
+    def test_pickle_path_subdirectory_calculation(self) -> None:
+        """Test subdirectory calculated as floor(page_id/1000)*1000."""
+        # page_id 100000 -> subdirectory 100000
+        path1 = WikiWhoMixin._get_wikiwho_pickle_path(
+            'en', 100000, '/tmp/cache')
+        self.assertEqual(path1.parent.name, '100000')
+
+        # page_id 100002 -> subdirectory 100000
+        path2 = WikiWhoMixin._get_wikiwho_pickle_path(
+            'en', 100002, '/tmp/cache')
+        self.assertEqual(path2.parent.name, '100000')
+
+        # page_id 100999 -> subdirectory 100000
+        path3 = WikiWhoMixin._get_wikiwho_pickle_path(
+            'en', 100999, '/tmp/cache')
+        self.assertEqual(path3.parent.name, '100000')
+
+        # page_id 200005 -> subdirectory 200000
+        path4 = WikiWhoMixin._get_wikiwho_pickle_path(
+            'en', 200005, '/tmp/cache')
+        self.assertEqual(path4.parent.name, '200000')
+
+    def test_pickle_path_different_languages(self) -> None:
+        """Test pickle paths for different language codes."""
+        path_en = WikiWhoMixin._get_wikiwho_pickle_path(
+            'en', 100000, '/tmp/cache')
+        path_de = WikiWhoMixin._get_wikiwho_pickle_path(
+            'de', 100000, '/tmp/cache')
+        path_fi = WikiWhoMixin._get_wikiwho_pickle_path(
+            'fi', 100000, '/tmp/cache')
+
+        self.assertEqual(
+            path_en, Path('/tmp/cache/en/100000/100000.p'))
+        self.assertEqual(
+            path_de, Path('/tmp/cache/de/100000/100000.p'))
+        self.assertEqual(
+            path_fi, Path('/tmp/cache/fi/100000/100000.p'))
+
+    def test_pickle_path_filename(self) -> None:
+        """Test pickle filename is page_id.p."""
+        path1 = WikiWhoMixin._get_wikiwho_pickle_path(
+            'en', 123456, '/tmp/cache')
+        self.assertEqual(path1.name, '123456.p')
+
+        path2 = WikiWhoMixin._get_wikiwho_pickle_path(
+            'de', 999999, '/tmp/cache')
+        self.assertEqual(path2.name, '999999.p')
+
+    def test_pickle_path_full_structure(self) -> None:
+        """Test complete directory structure."""
+        # Test case from task description
+        path = WikiWhoMixin._get_wikiwho_pickle_path(
+            'en', 100000, '/cache')
+        self.assertEqual(path, Path('/cache/en/100000/100000.p'))
+
+        # Additional examples from task
+        path = WikiWhoMixin._get_wikiwho_pickle_path(
+            'en', 100002, '/cache')
+        self.assertEqual(path, Path('/cache/en/100000/100002.p'))
+
+        path = WikiWhoMixin._get_wikiwho_pickle_path(
+            'en', 200005, '/cache')
+        self.assertEqual(path, Path('/cache/en/200000/200005.p'))
+
+    def test_pickle_path_edge_cases(self) -> None:
+        """Test edge cases for subdirectory calculation."""
+        # page_id 0 -> subdirectory 0
+        path = WikiWhoMixin._get_wikiwho_pickle_path(
+            'en', 0, '/tmp/cache')
+        self.assertEqual(path.parent.name, '0')
+
+        # page_id 1 -> subdirectory 0
+        path = WikiWhoMixin._get_wikiwho_pickle_path(
+            'en', 1, '/tmp/cache')
+        self.assertEqual(path.parent.name, '0')
+
+        # page_id 999 -> subdirectory 0
+        path = WikiWhoMixin._get_wikiwho_pickle_path(
+            'en', 999, '/tmp/cache')
+        self.assertEqual(path.parent.name, '0')
+
+        # page_id 1000 -> subdirectory 1000
+        path = WikiWhoMixin._get_wikiwho_pickle_path(
+            'en', 1000, '/tmp/cache')
+        self.assertEqual(path.parent.name, '1000')
+
+    def test_pickle_path_returns_pathlib_path(self) -> None:
+        """Test that method returns pathlib.Path object."""
+        path = WikiWhoMixin._get_wikiwho_pickle_path(
+            'en', 100000, '/tmp/cache')
+        self.assertIsInstance(path, Path)
+
+
+if __name__ == '__main__':
+    unittest.main()

--
To view, visit https://gerrit.wikimedia.org/r/c/pywikibot/core/+/1224790?usp=email
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings?usp=email

Gerrit-MessageType: merged
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I5ccaf3ffa50af9c2b6a72d2baf27f54469b863d1
Gerrit-Change-Number: 1224790
Gerrit-PatchSet: 8
Gerrit-Owner: Xinacod <[email protected]>
Gerrit-Reviewer: Xqt <[email protected]>
Gerrit-Reviewer: jenkins-bot