Filippo Giunchedi has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/351793 )
Change subject: thumbstats: add Hive export
......................................................................
thumbstats: add Hive export
Bug: T162796
Change-Id: Ia6cd8e66ff7da58ca5e2fe164a220e9d40b5391b
---
M thumbstats/swift-thumb-stats
1 file changed, 40 insertions(+), 5 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/software
refs/changes/93/351793/1
diff --git a/thumbstats/swift-thumb-stats b/thumbstats/swift-thumb-stats
index 960fdef..67b21aa 100755
--- a/thumbstats/swift-thumb-stats
+++ b/thumbstats/swift-thumb-stats
@@ -34,6 +34,34 @@
pass
+class HiveExport(Filter):
+ """Print to stdout a dump of thumbnails found, ready for import into Hive
+ for further analysis.
+
+ The Hive statements below will create a suitable table and import the dump.
+ Note that the dump file must reside on the same host that the Hive server
+ runs on. The ORC conversion step is needed for efficient storage and
+ querying by Hive.
+
+ CREATE TABLE thumbstats_raw (name string, pixels int, container string, path string, bytes int, last_modified timestamp, content_type string);
+ CREATE TABLE thumbstats (name string, pixels int, container string, path string, bytes int, last_modified timestamp, content_type string) STORED AS ORC tblproperties ("orc.compress" = "SNAPPY");
+ LOAD DATA LOCAL INPATH "<dumpfile>" OVERWRITE INTO TABLE thumbstats_raw;
+ INSERT OVERWRITE TABLE thumbstats SELECT * FROM thumbstats_raw;
+ """
+
+ def process(self, thumb):
+ # Hive < 1.2.0 requires a fixed timestamp format on load, see HIVE-9298
+ thumb.last_modified = thumb.last_modified.replace('T', ' ')
+ thumb.last_modified += '000'
+ export_fields = (thumb.name, thumb.thumbsize, thumb.container_name,
+ thumb.filename, thumb.bytes, thumb.last_modified,
+ thumb.content_type)
+ print "\x01".join([unicode(x).encode('utf8') for x in export_fields])
+
+ def result(self):
+ return {}
+
+
class BytesPerSize(Filter):
"""Size vs bytes breakdown."""
_bytes = {}
@@ -94,7 +122,7 @@
_, listing = connection.get_container(name, limit=limit, marker=marker)
-def iter_thumbs(container):
+def iter_thumbs(container, container_name):
"""Iterate over the container contents and yield Thumb objects."""
for thumb in container:
@@ -102,6 +130,7 @@
if not m:
continue
t = Thumb()
+ t.container_name = container_name
t.name = m.group('name')
t.thumbsize = m.group('size')
t.filename = thumb['name']
@@ -116,7 +145,7 @@
"""Iterate over the container thumbs and pass items to each filter."""
container_name = container['name']
- thumbs = iter_thumbs(iter_container(connection, container_name))
+ thumbs = iter_thumbs(iter_container(connection, container_name), container_name)
start = datetime.datetime.utcnow()
for i, thumb in enumerate(thumbs):
thumb.container_name = container_name
@@ -166,16 +195,19 @@
description="Print swift thumb statistics")
parser.add_argument(
'-A', '--auth', dest='auth', default=os.environ.get('ST_AUTH', None),
- help='URL for obtaining an auth token (ST_AUTH)')
+ help='URL for obtaining an auth token')
parser.add_argument(
'-U', '--user', dest='user', default=os.environ.get('ST_USER', None),
- help='User name for obtaining an auth token (ST_USER)')
+ help='User name for obtaining an auth token')
parser.add_argument(
'-K', '--key', dest='key', default=os.environ.get('ST_KEY', None),
- help='Key for obtaining an auth token (ST_KEY)')
+ help='Key for obtaining an auth token')
parser.add_argument(
'-t', '--threads', dest='threads', default=3, type=int,
help='How many threads to use (%(default)s)')
+ parser.add_argument(
+ '--hive_export', dest='hive_export', default=False, action='store_true',
+ help='Export thumbnail data for Hive processing')
args = parser.parse_args()
if None in (args.auth, args.user, args.key):
@@ -188,6 +220,9 @@
filters = [BytesPerSize(), CountPerSize(), BytesByMonth()]
threads = []
+ if args.hive_export:
+ filters = [HiveExport()]
+
for i in range(args.threads):
thread_connection = swiftclient.Connection(
args.auth, args.user, args.key, retry_on_ratelimit=True)
--
To view, visit https://gerrit.wikimedia.org/r/351793
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ia6cd8e66ff7da58ca5e2fe164a220e9d40b5391b
Gerrit-PatchSet: 1
Gerrit-Project: operations/software
Gerrit-Branch: master
Gerrit-Owner: Filippo Giunchedi <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits