Filippo Giunchedi has uploaded a new change for review. https://gerrit.wikimedia.org/r/148997
Change subject: swift-thumb-stats: dump thumb stats from swift ...................................................................... swift-thumb-stats: dump thumb stats from swift basic script to process thumbs into JSON for later analysis Change-Id: Iec2e5e5dee17e2bb29d5d4c0e334f0c2defbc961 --- A thumbstats/swift-thumb-stats 1 file changed, 192 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/software refs/changes/97/148997/1 diff --git a/thumbstats/swift-thumb-stats b/thumbstats/swift-thumb-stats new file mode 100755 index 0000000..304ceb5 --- /dev/null +++ b/thumbstats/swift-thumb-stats @@ -0,0 +1,192 @@ +#!/usr/bin/python + +# this script will scan all containers with thumbnails for the given account +# and feed each thumbnail to a filter. At the end the result from each filter +# is printed on standard output in JSON in an object like: +# {'FooFilter': <result>, 'BarFilter': <result>} + +import argparse +import collections +import datetime +import json +import os +import pprint +import re +import sys +import threading + +import swiftclient + +CONTAINER_THUMB_RE = re.compile('-thumb(\.[a-f0-9][a-f0-9])?$') +THUMB_RE = re.compile('/(?P<size>\d+)px-(?P<name>.*)$') + + +class Thumb(object): + pass + + +class Filter(object): + def process(self, thumb): + pass + def result(self): + pass + + +class BytesPerSize(Filter): + _bytes = {} + def process(self, thumb): + self._bytes[thumb.thumbsize] = \ + self._bytes.setdefault(thumb.thumbsize, 0) + int(thumb.bytes) + def result(self): + return self._bytes + def str(self): + return "BytesPerSize" + + +class CountPerSize(Filter): + """Size vs count breakdown.""" + + _count = {} + def process(self, thumb): + self._count[thumb.thumbsize] = \ + self._count.setdefault(thumb.thumbsize, 0) + 1 + def result(self): + return self._count + def str(self): + return "CountPerSize" + + +class BytesByMonth(Filter): + """Year+month vs size vs bytes breakdown.""" + _month = {} + def process(self, thumb): + key = thumb.last_modified[:7] + size = thumb.thumbsize + self._month[key] = self._month.setdefault(key, {}) + self._month[key][size] = \ + self._month[key].setdefault(size, 0) + int(thumb.bytes) + def result(self): + return self._month + def str(self): + return "BytesPerMonth" + + +def iter_container(connection, name, limit=None): + """Iterate over the container contents.""" + + _, listing = connection.get_container(name, limit=limit) + while listing: + for container in listing: + yield container + marker = container['name'] + _, listing = connection.get_container(name, limit=limit, marker=marker) + + +def iter_thumbs(container): + """Iterate over the container contents and yield Thumb objects.""" + + for thumb in container: + m = THUMB_RE.search(thumb['name']) + if not m: + continue + t = Thumb() + t.name = m.group('name') + t.thumbsize = m.group('size') + t.filename = thumb['name'] + t.bytes = thumb['bytes'] + t.last_modified = thumb['last_modified'] + t.hash = thumb['hash'] + t.content_type = thumb['content_type'] + yield t + + +def _process_container(container, connection, filters): + """Iterate over the container thumbs and pass items to each filter.""" + + container_name = container['name'] + thumbs = iter_thumbs(iter_container(connection, container_name)) + start = datetime.datetime.utcnow() + for i, thumb in enumerate(thumbs): + thumb.container_name = container_name + for f in filters: + f.process(thumb) + if i and i % 10000 == 0: + now = datetime.datetime.utcnow() + elapsed = now - start + start = now + print >>sys.stderr, "%s: inserted 10000 records from %s (%s)" % ( + threading.current_thread().name, container_name, elapsed) + + +def process_container(in_queue, connection, filters): + while True: + try: + container = in_queue.popleft() + _process_container(container, connection, filters) + except IndexError: + break + + +def _join_threads(threads): + """Join the given threads while accepting KeyboardInterrupt.""" + + _threads = threads[:] + while _threads: + try: + for thread in _threads[:]: + if not thread.is_alive(): + _threads.remove(thread) + else: + thread.join(timeout=0.1) + except KeyboardInterrupt: + break + + +def thumb_containers(connection): + headers, containers = connection.get_account(full_listing=True) + for container in containers: + if CONTAINER_THUMB_RE.search(container['name']): + yield container + + +def main(): + parser = argparse.ArgumentParser(description="Print swift account statistics") + parser.add_argument('-A', '--auth', dest='auth', + default=os.environ.get('ST_AUTH', None), + help='URL for obtaining an auth token') + parser.add_argument('-U', '--user', dest='user', + default=os.environ.get('ST_USER', None), + help='User name for obtaining an auth token') + parser.add_argument('-K', '--key', dest='key', + default=os.environ.get('ST_KEY', None), + help='Key for obtaining an auth token') + parser.add_argument('-t', '--threads', dest='threads', + default=3, type=int, + help='How many threads to use (%default)') + args = parser.parse_args() + + if None in (args.auth, args.user, args.key): + parser.error("please provide auth, user and key") + return 1 + + connection = swiftclient.Connection(args.auth, args.user, args.key) + worker_queue = collections.deque(thumb_containers(connection)) + filters = [BytesPerSize(), CountPerSize(), BytesByMonth()] + threads = [] + + for i in range(args.threads): + thread_connection = swiftclient.Connection(args.auth, args.user, + args.key) + t = threading.Thread(target=process_container, args=(worker_queue, + thread_connection, filters)) + t.daemon = True + t.start() + threads.append(t) + + _join_threads(threads) + out = [(str(x), x.result()) for x in filters] + print json.dumps(dict(out)) + + +if __name__ == '__main__': + sys.exit(main()) -- To view, visit https://gerrit.wikimedia.org/r/148997 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Iec2e5e5dee17e2bb29d5d4c0e334f0c2defbc961 Gerrit-PatchSet: 1 Gerrit-Project: operations/software Gerrit-Branch: master Gerrit-Owner: Filippo Giunchedi <fgiunch...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits