This is an automated email from the ASF dual-hosted git repository. machristie pushed a commit to branch develop in repository https://gitbox.apache.org/repos/asf/airavata-django-portal.git
# django_airavata/apps/admin/management/commands/archive_user_data.py
# (reconstructed from the commit diff; formatting normalized)
#
# NOTE: the same commit also adds empty management/__init__.py and
# management/commands/__init__.py package markers, and an auto-generated
# initial migration (migrations/0001_initial.py) that creates the
# UserDataArchive and UserDataArchiveEntry tables declared in models.py.
import datetime
import os
import shutil
import tarfile
import tempfile
from pathlib import Path
from typing import Iterator

from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
from django.db import transaction
from django.utils import timezone

from django_airavata.apps.admin import models


class Command(BaseCommand):
    """Archive old user data and delete it from the gateway storage.

    Scans the gateway user data directory for files/directories whose
    modification time is older than GATEWAY_USER_DATA_ARCHIVE_MAX_AGE,
    writes a listing file and a gzipped tarball of those paths into
    GATEWAY_USER_DATA_ARCHIVE_DIRECTORY, records the archive in the
    database (UserDataArchive + one UserDataArchiveEntry per path) and
    then deletes the archived paths.
    """

    help = "Create an archive of user data directories and optionally clean them up"

    def add_arguments(self, parser):
        parser.add_argument(
            '--dry-run',
            action='store_true',
            help="Print the list of files/directories that would be archived then exit",
            default=False)

    def handle(self, *args, **options):
        max_age_setting = getattr(settings, "GATEWAY_USER_DATA_ARCHIVE_MAX_AGE", None)
        if max_age_setting is None:
            raise CommandError("Setting GATEWAY_USER_DATA_ARCHIVE_MAX_AGE is not configured")

        # max_age_setting holds datetime.timedelta keyword arguments, e.g.
        # {'days': 90}; anything not modified since this cutoff is archived.
        max_age = timezone.now() - datetime.timedelta(**max_age_setting)
        entries_to_archive = self.get_archive_entries(older_than=max_age)
        gateway_id = settings.GATEWAY_ID

        archive_directory = Path(settings.GATEWAY_USER_DATA_ARCHIVE_DIRECTORY)
        # FIX: also create missing parent directories; the original
        # mkdir(exist_ok=True) raises FileNotFoundError if the parent of the
        # configured archive directory does not exist yet.
        archive_directory.mkdir(parents=True, exist_ok=True)

        # Stage everything in a temp dir inside the destination so the final
        # moves below are same-filesystem renames.
        with tempfile.TemporaryDirectory(dir=archive_directory) as tmpdir:
            archive_basename = f"archive_{gateway_id}_older_than_{max_age.strftime('%Y-%m-%d-%H-%M-%S')}"
            archive_list_filename = f"{archive_basename}.txt"
            archive_list_filepath = os.path.join(tmpdir, archive_list_filename)
            # Write the list of paths to be archived, one per line
            with open(archive_list_filepath, "wt") as archive_list_file:
                for entry in entries_to_archive:
                    archive_list_file.write(f"{entry.path}\n")

            # if dry run, just print file and exit
            if options['dry_run']:
                self.stdout.write(f"DRY RUN: printing {archive_list_filename}, then exiting")
                # FIX: reuse archive_list_filepath instead of re-joining
                # tmpdir + filename (original rebuilt the same path twice)
                with open(archive_list_filepath) as archive_list_file:
                    for line in archive_list_file:
                        self.stdout.write(line)
                self.stdout.write(self.style.SUCCESS("DRY RUN: exiting now"))
                return

            # otherwise, generate a tarball in tmpdir
            archive_tarball_filename = f"{archive_basename}.tgz"
            archive_tarball_filepath = os.path.join(tmpdir, archive_tarball_filename)
            with tarfile.open(archive_tarball_filepath, "w:gz") as tarball:
                with open(archive_list_filepath) as archive_list_file:
                    for line in archive_list_file:
                        tarball.add(line.strip())
            self.stdout.write(self.style.SUCCESS(f"Created tarball: {archive_tarball_filename}"))

            # Move the archive files into the final destination
            shutil.move(archive_list_filepath, archive_directory / archive_list_filename)
            shutil.move(archive_tarball_filepath, archive_directory / archive_tarball_filename)

        with transaction.atomic():
            # NOTE(review): archive_path records the *listing* file while
            # archive_name is the *tarball* filename — confirm this mismatch
            # is intentional (a rollback presumably needs the tarball path).
            user_data_archive = models.UserDataArchive(
                archive_name=archive_tarball_filename,
                archive_path=os.fspath(archive_directory / archive_list_filename),
                max_modification_time=max_age)
            user_data_archive.save()
            # delete archived entries, recording each one in the database
            with open(archive_directory / archive_list_filename) as archive_list_file:
                for archive_path in archive_list_file:
                    archive_path = archive_path.strip()
                    if os.path.isfile(archive_path):
                        os.remove(archive_path)
                    else:
                        shutil.rmtree(archive_path)
                    archive_entry = models.UserDataArchiveEntry(
                        user_data_archive=user_data_archive,
                        entry_path=archive_path)
                    archive_entry.save()

        self.stdout.write(self.style.SUCCESS("Successfully removed archived user data"))

    def get_archive_entries(self, older_than: datetime.datetime) -> Iterator[os.DirEntry]:
        """Yield DirEntry objects for user data eligible for archiving.

        Layout assumed (per the scandir nesting): <user dir>/<project dir>/
        <experiment dirs and files>. Shared directories are skipped at every
        level.
        """
        GATEWAY_USER_DIR = settings.USER_STORAGES['default']['OPTIONS']['directory']

        with os.scandir(GATEWAY_USER_DIR) as user_dirs:
            for user_dir_entry in user_dirs:
                # Skip over any files (shouldn't be any but who knows)
                if not user_dir_entry.is_dir():
                    continue
                # Skip over shared directories
                if self._is_shared_directory(user_dir_entry):
                    continue
                with os.scandir(user_dir_entry.path) as project_dirs:
                    for project_dir_entry in project_dirs:
                        yield from self._scan_project_dir_for_archive_entries(
                            project_dir_entry=project_dir_entry,
                            older_than=older_than)

    def _scan_project_dir_for_archive_entries(self, project_dir_entry: os.DirEntry,
                                              older_than: datetime.datetime) -> Iterator[os.DirEntry]:
        """Yield old files directly in a project dir plus old experiment dirs."""
        # archive files here but not directories
        if project_dir_entry.is_file() and project_dir_entry.stat().st_mtime < older_than.timestamp():
            yield project_dir_entry
        # Skip over shared directories
        if project_dir_entry.is_dir() and not self._is_shared_directory(project_dir_entry):
            with os.scandir(project_dir_entry.path) as experiment_dirs:
                for experiment_dir_entry in experiment_dirs:
                    if experiment_dir_entry.stat().st_mtime < older_than.timestamp():
                        yield experiment_dir_entry

    def _is_shared_directory(self, dir_entry: os.DirEntry) -> bool:
        """Return True if dir_entry is one of GATEWAY_DATA_SHARED_DIRECTORIES."""
        if not dir_entry.is_dir():
            return False
        shared_dirs = getattr(settings, "GATEWAY_DATA_SHARED_DIRECTORIES", {})
        for shared_dir in shared_dirs.values():
            # FIX: os.path.samefile raises FileNotFoundError if a configured
            # shared directory does not exist on disk; guard with exists()
            if os.path.exists(shared_dir["path"]) and os.path.samefile(dir_entry.path, shared_dir["path"]):
                return True
        return False
+ + +class UserDataArchive(models.Model): + created_date = models.DateTimeField(auto_now_add=True) + updated_date = models.DateTimeField(auto_now=True) + archive_name = models.CharField(max_length=255) + archive_path = models.TextField() + rolled_back = models.BooleanField(default=False) + max_modification_time = models.DateTimeField() + + +class UserDataArchiveEntry(models.Model): + user_data_archive = models.ForeignKey(UserDataArchive, on_delete=models.CASCADE) + entry_path = models.TextField() diff --git a/django_airavata/settings.py b/django_airavata/settings.py index 01c6b144..d041d43f 100644 --- a/django_airavata/settings.py +++ b/django_airavata/settings.py @@ -203,6 +203,10 @@ TUS_ENDPOINT = None # Override and set to the directory where tus uploads will be stored TUS_DATA_DIR = None +# TODO: document, timedelta arguments +GATEWAY_USER_DATA_ARCHIVE_MAX_AGE = None +GATEWAY_USER_DATA_ARCHIVE_DIRECTORY = "/tmp" + # Legacy (PGA) Portal link - provide a link to the legacy portal PGA_URL = None diff --git a/django_airavata/settings_local.py.sample b/django_airavata/settings_local.py.sample index ba7410ad..1245000c 100644 --- a/django_airavata/settings_local.py.sample +++ b/django_airavata/settings_local.py.sample @@ -115,6 +115,10 @@ PROFILE_SERVICE_HOST = AIRAVATA_API_HOST PROFILE_SERVICE_PORT = 8962 PROFILE_SERVICE_SECURE = False +# Gateway user data archive configuration. User data can be periodically +# archived and deleted to free up storage space. +# GATEWAY_USER_DATA_ARCHIVE_MAX_AGE = {'days': 90} +# GATEWAY_USER_DATA_ARCHIVE_DIRECTORY = "/path/dir/where/to/copy/archives" # Portal settings PORTAL_TITLE = 'Django Airavata Gateway'
