From: Mikolaj Lasota <[email protected]>

The bash script currently used takes too much time to calculate obsolete
sstate-cache files. Rewrite the necessary logic in Python and store
intermediate data in memory rather than in temporary files.
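
Example invocation (the paths are only illustrative); the resulting list can
then be fed to e.g. xargs for removal:

  ./scripts/sstate-cache-cleaner.py --cache-dir build/sstate-cache \
      --stamps-dir build/tmp/stamps -f obsolete-sstate.txt
  xargs rm < obsolete-sstate.txt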

Signed-off-by: Mikolaj Lasota <[email protected]>
Signed-off-by: Tomasz Dziendzielski <[email protected]>
---
 scripts/sstate-cache-cleaner.py | 166 ++++++++++++++++++++++++++++++++
 1 file changed, 166 insertions(+)
 create mode 100755 scripts/sstate-cache-cleaner.py

diff --git a/scripts/sstate-cache-cleaner.py b/scripts/sstate-cache-cleaner.py
new file mode 100755
index 0000000000..f01db35775
--- /dev/null
+++ b/scripts/sstate-cache-cleaner.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python3
+
+"""
+This script is a Python rewrite of poky's scripts/sstate-cache-management.sh.
+It has a subset of the original script's features - namely the ability to filter cache files by stamp file references.
+The output is a list of unreferenced sstate-cache files, which are obsolete and can be removed.
+
+To test the script against the original (shell) script, one might create a small test environment:
+ - create a local sstate-cache directory
+ - run two or more separate builds (different hashes/machines) using the above dir (SSTATE_DIR)
+ - run the original shell script using the stamp dir from one of the above builds and the common cache dir
+ - run this script with the same arguments (same stamp & cache dirs)
+"""
+
+import argparse
+import fnmatch
+import logging
+import os
+import re
+import time
+from functools import reduce
+
+formatter = logging.Formatter('%(asctime)s - %(funcName)s - %(levelname)s - %(message)s')
+logger = logging.getLogger('sstate-cache-cleaner')
+logger.setLevel(logging.DEBUG)
+fh = logging.FileHandler('sstate-cache-cleaner.log', 'w')
+fh.setLevel(logging.DEBUG)
+fh.setFormatter(formatter)
+ch = logging.StreamHandler()
+ch.setLevel(logging.INFO)
+ch.setFormatter(formatter)
+logger.addHandler(fh)
+logger.addHandler(ch)
+
+TIME = time.time()
+ONE_DAY_IN_SECONDS = 86400
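+# Cache files whose ctime falls within the last day are kept and never listed for removal.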
+
+def collect_sstate_cache_files(cache_dir):
+    """ Collect all sstate-cache files form cache_dir and figure out 
accelerated tasks for cleaning. """
+
+    logger.info('Collecting sstate-cache files...')
+
+    sstate_tasks = set()
+    cache_files = dict()
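+    # Cache archive names end in ':<hash>_<task>.tgz' (possibly followed by a suffix such as '.siginfo');
+    # capture the hash and the task name from each matching file.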
+    cache_file_regex = re.compile(r'sstate.*:([^_]*)_(.*)\.tgz.*')
+    for root, dirs, files in os.walk(cache_dir):
+        for filename in files:
+            if fnmatch.fnmatch(filename, 'sstate*'):
+                match = cache_file_regex.match(filename)
+                if match:
+                    _hash = match.group(1)
+                    _task = match.group(2)
+                    sstate_tasks.add(_task)
+                    f = os.path.join(root, filename)
+                    try:
+                        if os.stat(f).st_ctime < TIME - ONE_DAY_IN_SECONDS:
+                            if _hash in cache_files:
+                                cache_files[_hash].append(f)
+                            else:
+                                cache_files[_hash] = [f]
+                    except FileNotFoundError as err:
+                        logger.error(err)
+
+    num_of_files = reduce(lambda count, element: count + len(element), cache_files.values(), 0)
+    num_of_hashes = len(cache_files)
+    logger.info(f'Found {num_of_files} sstate files ({num_of_hashes} hashes)')
+    return cache_files, sstate_tasks
+
+def collect_stamps(stamps_dirs_list, tasks):
+    """ Collect hashes from the stamp files (only for tasks which were found 
in sstate-cache) """
+
+    logger.info('Collecting stamps...')
+
+    stamps = set()
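+    # Stamp file names embed the task signature after '.do_<task>.' or '.do_<task>_setscene.';
+    # collect those signatures so they can be compared against the cache file hashes.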
+    for stamps_dir in stamps_dirs_list:
+        logger.debug(f'Looking for stamps in {stamps_dir}')
+        for root, dirs, files in os.walk(stamps_dir):
+            for filename in files:
+                for task in tasks:
+                    if fnmatch.fnmatch(filename, f'*.do_{task}_setscene.*'):
+                        match = re.match(rf'.*\.do_{task}_setscene\.([^\.]*).*', filename)
+                        if match:
+                            stamps.add(match.group(1))
+                    elif fnmatch.fnmatch(filename, f'*.do_{task}.*'):
+                        match = re.match(rf'.*do_{task}(\.sigdata)?\.([^\.]*).*', filename)
+                        if match:
+                            stamps.add(match.group(2))
+
+    logger.info(f'Found {len(stamps)} stamps')
+    return stamps
+
+def compute_obsolete_sstate_cache_files(stamps, cache):
+    """ Figure out which cache files are obsolete.
+
+    Check if a cache file is referenced by a stamp file. If it is, the file is still needed and is therefore
+    filtered out of the processed cache mapping. Whatever remains maps to the files to be removed.
+    """
+
+    logger.info('Filtering sstate-cache list for unreferenced (obsolete) files...')
+
+    num_stamps = max(len(stamps) - 1, 1)  # avoid division by zero when only one stamp was found
+    progress = -1
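+    # Every hash referenced by a stamp is dropped from the cache dict; whatever remains is obsolete.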
+    for i, stamp in enumerate(stamps):
+        _progress = int(i / num_stamps * 100)
+        if _progress % 5 == 0 and _progress > progress:
+            progress = _progress
+            logger.debug(f'[{progress:3d}%] Cleaning stamp {i}/{num_stamps}')
+        if stamp in cache:
+            del cache[stamp]
+
+    num_of_files = reduce(lambda count, element: count + len(element), cache.values(), 0)
+    logger.info(f'Found {num_of_files} sstate files to be removed')
+    return cache
+
+def parse_arguments():
+    """ Parse arguments for cache & stamp directories and output file name """
+
+    parser = argparse.ArgumentParser(
+                        description='Sstate cache cleanup script. '
+                                    'Cache files which are not referenced by stamp files will be listed for removal.',
+                        epilog='This is a Python rewrite of the poky-provided sstate-cache-management.sh script. '
+                               'Only stamp-based cleaning is implemented.')
+    parser.add_argument('--cache-dir', required=True,
+                        help='Specify sstate-cache directory')
+    parser.add_argument('--stamps-dir', required=True, nargs='+',
+                        help='Specify stamps directories')
+    parser.add_argument('--output-file', '-f', required=True,
+                        help='Specify a file for script output - a list of obsolete sstate-cache files.')
+
+    logger.debug('Parsing arguments...')
+    return parser.parse_args()
+
+def main():
+    args = parse_arguments()
+
+    stamps_dirs_list = args.stamps_dir
+    for i, path in enumerate(stamps_dirs_list):
+        abs_path = os.path.abspath(path)
+        if not os.path.isdir(abs_path):
+            raise ValueError(f"Stamps directory doesn't exist: {abs_path}!")
+        stamps_dirs_list[i] = abs_path
+
+    cache_dir = os.path.abspath(args.cache_dir)
+    if not os.path.isdir(cache_dir):
+        raise ValueError(f"Cache directory doesn't exist: {cache_dir}!")
+
+    output_file_path = os.path.abspath(args.output_file)
+
+    cache, tasks = collect_sstate_cache_files(cache_dir)
+    stamps = collect_stamps(stamps_dirs_list, tasks)
+
+    obsolete_sstate = compute_obsolete_sstate_cache_files(stamps, cache)
+    obsolete_sstate_files = [item for sublist in obsolete_sstate.values() for item in sublist]
+
+    output_dir = os.path.dirname(output_file_path)
+    if not os.path.isdir(output_dir):
+        logger.warning(f"Output directory doesn't exist and will be created: {output_dir}")
+        os.makedirs(output_dir)
+
+    with open(output_file_path, 'w') as out:
+        out.write('\n'.join(obsolete_sstate_files))
+
+    logger.info(f'List of obsolete sstate-cache files saved: {output_file_path}')
+
+if __name__ == "__main__":
+    main()
-- 
2.38.0
